feat(search): partial-word matching via to_tsquery prefix queries
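Replace direct matching against websearch_to_tsquery with a CROSS JOIN LATERAL subquery that builds a prefix query by appending :* to each lexeme, so partial words match (e.g. "furchtb" finds "furchtbar"). websearch_to_tsquery still handles the safe tokenisation of user input (stop words, special chars, operators); regexp_replace then adds :* after each quoted lexeme in its text output before to_tsquery re-parses the result. When the input tokenises to an empty query (e.g. only stop words), the CASE yields NULL instead of passing an empty string to to_tsquery.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>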
This commit is contained in:
@@ -83,8 +83,17 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
|
|
||||||
@Query(nativeQuery = true, value = """
|
@Query(nativeQuery = true, value = """
|
||||||
SELECT d.id FROM documents d
|
SELECT d.id FROM documents d
|
||||||
WHERE d.search_vector @@ websearch_to_tsquery('german', :query)
|
CROSS JOIN LATERAL (
|
||||||
ORDER BY ts_rank(d.search_vector, websearch_to_tsquery('german', :query)) DESC,
|
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
|
||||||
|
THEN to_tsquery('german', regexp_replace(
|
||||||
|
websearch_to_tsquery('german', :query)::text,
|
||||||
|
'''([^'']+)''',
|
||||||
|
'''\\1'':*',
|
||||||
|
'g'))
|
||||||
|
END AS pq
|
||||||
|
) q
|
||||||
|
WHERE d.search_vector @@ q.pq
|
||||||
|
ORDER BY ts_rank(d.search_vector, q.pq) DESC,
|
||||||
d.meta_date DESC NULLS LAST
|
d.meta_date DESC NULLS LAST
|
||||||
""")
|
""")
|
||||||
List<UUID> findRankedIdsByFts(@Param("query") String query);
|
List<UUID> findRankedIdsByFts(@Param("query") String query);
|
||||||
@@ -99,44 +108,59 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
* <li>Boolean — whether the sender's name matched the query</li>
|
* <li>Boolean — whether the sender's name matched the query</li>
|
||||||
* <li>String — comma-separated matched receiver UUIDs, or null</li>
|
* <li>String — comma-separated matched receiver UUIDs, or null</li>
|
||||||
* <li>String — comma-separated matched tag UUIDs, or null</li>
|
* <li>String — comma-separated matched tag UUIDs, or null</li>
|
||||||
|
* <li>String — summary snippet with \x01/\x02 delimiters, or null if the summary is null, empty, or didn't match</li>
|
||||||
* </ol>
|
* </ol>
|
||||||
* Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank.
|
* Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank.
|
||||||
*/
|
*/
|
||||||
@Query(nativeQuery = true, value = """
|
@Query(nativeQuery = true, value = """
|
||||||
SELECT
|
SELECT
|
||||||
d.id,
|
d.id,
|
||||||
ts_headline('german', d.title, websearch_to_tsquery('german', :query),
|
ts_headline('german', d.title, q.pq,
|
||||||
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
|
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
|
||||||
AS title_headline,
|
AS title_headline,
|
||||||
CASE WHEN best_block.text IS NOT NULL THEN
|
CASE WHEN best_block.text IS NOT NULL THEN
|
||||||
ts_headline('german', best_block.text, websearch_to_tsquery('german', :query),
|
ts_headline('german', best_block.text, q.pq,
|
||||||
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
|
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
|
||||||
END AS transcription_snippet,
|
END AS transcription_snippet,
|
||||||
(s.id IS NOT NULL AND
|
(s.id IS NOT NULL AND
|
||||||
to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
|
to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
|
||||||
@@ websearch_to_tsquery('german', :query))
|
@@ q.pq)
|
||||||
AS sender_matched,
|
AS sender_matched,
|
||||||
(SELECT string_agg(r.id::text, ',')
|
(SELECT string_agg(r.id::text, ',')
|
||||||
FROM document_receivers dr
|
FROM document_receivers dr
|
||||||
JOIN persons r ON r.id = dr.person_id
|
JOIN persons r ON r.id = dr.person_id
|
||||||
WHERE dr.document_id = d.id
|
WHERE dr.document_id = d.id
|
||||||
AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || r.last_name)
|
AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || r.last_name)
|
||||||
@@ websearch_to_tsquery('german', :query)
|
@@ q.pq
|
||||||
) AS matched_receiver_ids,
|
) AS matched_receiver_ids,
|
||||||
(SELECT string_agg(t.id::text, ',')
|
(SELECT string_agg(t.id::text, ',')
|
||||||
FROM document_tags dt
|
FROM document_tags dt
|
||||||
JOIN tag t ON t.id = dt.tag_id
|
JOIN tag t ON t.id = dt.tag_id
|
||||||
WHERE dt.document_id = d.id
|
WHERE dt.document_id = d.id
|
||||||
AND to_tsvector('german', t.name) @@ websearch_to_tsquery('german', :query)
|
AND to_tsvector('german', t.name) @@ q.pq
|
||||||
) AS matched_tag_ids
|
) AS matched_tag_ids,
|
||||||
|
CASE WHEN d.summary IS NOT NULL AND d.summary <> ''
|
||||||
|
AND to_tsvector('german', d.summary) @@ q.pq
|
||||||
|
THEN ts_headline('german', d.summary, q.pq,
|
||||||
|
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
|
||||||
|
END AS summary_snippet
|
||||||
FROM documents d
|
FROM documents d
|
||||||
|
CROSS JOIN LATERAL (
|
||||||
|
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
|
||||||
|
THEN to_tsquery('german', regexp_replace(
|
||||||
|
websearch_to_tsquery('german', :query)::text,
|
||||||
|
'''([^'']+)''',
|
||||||
|
'''\\1'':*',
|
||||||
|
'g'))
|
||||||
|
END AS pq
|
||||||
|
) q
|
||||||
LEFT JOIN persons s ON s.id = d.sender_id
|
LEFT JOIN persons s ON s.id = d.sender_id
|
||||||
LEFT JOIN LATERAL (
|
LEFT JOIN LATERAL (
|
||||||
SELECT tb.text
|
SELECT tb.text
|
||||||
FROM transcription_blocks tb
|
FROM transcription_blocks tb
|
||||||
WHERE tb.document_id = d.id
|
WHERE tb.document_id = d.id
|
||||||
AND to_tsvector('german', tb.text) @@ websearch_to_tsquery('german', :query)
|
AND to_tsvector('german', tb.text) @@ q.pq
|
||||||
ORDER BY ts_rank(to_tsvector('german', tb.text), websearch_to_tsquery('german', :query)) DESC
|
ORDER BY ts_rank(to_tsvector('german', tb.text), q.pq) DESC
|
||||||
LIMIT 1
|
LIMIT 1
|
||||||
) best_block ON true
|
) best_block ON true
|
||||||
WHERE d.id IN :ids
|
WHERE d.id IN :ids
|
||||||
|
|||||||
@@ -79,6 +79,16 @@ class DocumentFtsTest {
|
|||||||
assertThat(ids).hasSize(1);
|
assertThat(ids).hasSize(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void should_find_document_by_partial_word_prefix() {
|
||||||
|
documentRepository.saveAndFlush(document("Ein furchtbarer Brief"));
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<UUID> ids = documentRepository.findRankedIdsByFts("furchtb");
|
||||||
|
|
||||||
|
assertThat(ids).hasSize(1);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void should_not_find_document_when_term_absent() {
|
void should_not_find_document_when_term_absent() {
|
||||||
documentRepository.saveAndFlush(document("Familienfoto"));
|
documentRepository.saveAndFlush(document("Familienfoto"));
|
||||||
|
|||||||
Reference in New Issue
Block a user