feat(search): partial-word matching via to_tsquery prefix queries

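Replace the direct websearch_to_tsquery match with a CROSS JOIN LATERAL
subquery that appends :* to each lexeme so prefix matches work (e.g.
"furchtb" finds "furchtbar"). websearch_to_tsquery still handles the safe
tokenisation of user input (stop words, special chars, operators);
regexp_replace then adds :* to each quoted lexeme in its text output, and
to_tsquery re-parses the result. When websearch_to_tsquery yields an empty
query, the subquery returns NULL instead of calling to_tsquery, so nothing
matches.

The match-data query reuses the same prefix query and additionally returns
a highlighted summary snippet when the document summary matches.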

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-15 21:32:37 +02:00
committed by marcel
parent 32f151ff31
commit 091f7e5d25
2 changed files with 44 additions and 10 deletions

View File

@@ -83,8 +83,17 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
@Query(nativeQuery = true, value = """ @Query(nativeQuery = true, value = """
SELECT d.id FROM documents d SELECT d.id FROM documents d
WHERE d.search_vector @@ websearch_to_tsquery('german', :query) CROSS JOIN LATERAL (
ORDER BY ts_rank(d.search_vector, websearch_to_tsquery('german', :query)) DESC, SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
THEN to_tsquery('german', regexp_replace(
websearch_to_tsquery('german', :query)::text,
'''([^'']+)''',
'''\\1'':*',
'g'))
END AS pq
) q
WHERE d.search_vector @@ q.pq
ORDER BY ts_rank(d.search_vector, q.pq) DESC,
d.meta_date DESC NULLS LAST d.meta_date DESC NULLS LAST
""") """)
List<UUID> findRankedIdsByFts(@Param("query") String query); List<UUID> findRankedIdsByFts(@Param("query") String query);
@@ -99,44 +108,59 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
* <li>Boolean — whether the sender's name matched the query</li> * <li>Boolean — whether the sender's name matched the query</li>
* <li>String — comma-separated matched receiver UUIDs, or null</li> * <li>String — comma-separated matched receiver UUIDs, or null</li>
* <li>String — comma-separated matched tag UUIDs, or null</li> * <li>String — comma-separated matched tag UUIDs, or null</li>
* <li>String — summary snippet with \x01/\x02 delimiters, or null if summary didn't match</li>
* </ol> * </ol>
* Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank. * Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank.
*/ */
@Query(nativeQuery = true, value = """ @Query(nativeQuery = true, value = """
SELECT SELECT
d.id, d.id,
ts_headline('german', d.title, websearch_to_tsquery('german', :query), ts_headline('german', d.title, q.pq,
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true') 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
AS title_headline, AS title_headline,
CASE WHEN best_block.text IS NOT NULL THEN CASE WHEN best_block.text IS NOT NULL THEN
ts_headline('german', best_block.text, websearch_to_tsquery('german', :query), ts_headline('german', best_block.text, q.pq,
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20') 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
END AS transcription_snippet, END AS transcription_snippet,
(s.id IS NOT NULL AND (s.id IS NOT NULL AND
to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, '')) to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
@@ websearch_to_tsquery('german', :query)) @@ q.pq)
AS sender_matched, AS sender_matched,
(SELECT string_agg(r.id::text, ',') (SELECT string_agg(r.id::text, ',')
FROM document_receivers dr FROM document_receivers dr
JOIN persons r ON r.id = dr.person_id JOIN persons r ON r.id = dr.person_id
WHERE dr.document_id = d.id WHERE dr.document_id = d.id
AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || r.last_name) AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || r.last_name)
@@ websearch_to_tsquery('german', :query) @@ q.pq
) AS matched_receiver_ids, ) AS matched_receiver_ids,
(SELECT string_agg(t.id::text, ',') (SELECT string_agg(t.id::text, ',')
FROM document_tags dt FROM document_tags dt
JOIN tag t ON t.id = dt.tag_id JOIN tag t ON t.id = dt.tag_id
WHERE dt.document_id = d.id WHERE dt.document_id = d.id
AND to_tsvector('german', t.name) @@ websearch_to_tsquery('german', :query) AND to_tsvector('german', t.name) @@ q.pq
) AS matched_tag_ids ) AS matched_tag_ids,
CASE WHEN d.summary IS NOT NULL AND d.summary <> ''
AND to_tsvector('german', d.summary) @@ q.pq
THEN ts_headline('german', d.summary, q.pq,
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
END AS summary_snippet
FROM documents d FROM documents d
CROSS JOIN LATERAL (
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
THEN to_tsquery('german', regexp_replace(
websearch_to_tsquery('german', :query)::text,
'''([^'']+)''',
'''\\1'':*',
'g'))
END AS pq
) q
LEFT JOIN persons s ON s.id = d.sender_id LEFT JOIN persons s ON s.id = d.sender_id
LEFT JOIN LATERAL ( LEFT JOIN LATERAL (
SELECT tb.text SELECT tb.text
FROM transcription_blocks tb FROM transcription_blocks tb
WHERE tb.document_id = d.id WHERE tb.document_id = d.id
AND to_tsvector('german', tb.text) @@ websearch_to_tsquery('german', :query) AND to_tsvector('german', tb.text) @@ q.pq
ORDER BY ts_rank(to_tsvector('german', tb.text), websearch_to_tsquery('german', :query)) DESC ORDER BY ts_rank(to_tsvector('german', tb.text), q.pq) DESC
LIMIT 1 LIMIT 1
) best_block ON true ) best_block ON true
WHERE d.id IN :ids WHERE d.id IN :ids

View File

@@ -79,6 +79,16 @@ class DocumentFtsTest {
assertThat(ids).hasSize(1); assertThat(ids).hasSize(1);
} }
@Test
void should_find_document_by_partial_word_prefix() {
documentRepository.saveAndFlush(document("Ein furchtbarer Brief"));
em.clear();
List<UUID> ids = documentRepository.findRankedIdsByFts("furchtb");
assertThat(ids).hasSize(1);
}
@Test @Test
void should_not_find_document_when_term_absent() { void should_not_find_document_when_term_absent() {
documentRepository.saveAndFlush(document("Familienfoto")); documentRepository.saveAndFlush(document("Familienfoto"));