From 091f7e5d258ade58f3b2cc67ce84517213ca9041 Mon Sep 17 00:00:00 2001 From: Marcel Date: Wed, 15 Apr 2026 21:32:37 +0200 Subject: [PATCH] feat(search): partial-word matching via to_tsquery prefix queries Replace websearch_to_tsquery with a CROSS JOIN LATERAL subquery that appends :* to each lexeme so prefix matches work (e.g. "furchtb" finds "furchtbar"). websearch_to_tsquery still handles the safe tokenisation of user input (stop words, special chars, operators); regexp_replace then adds :* before to_tsquery re-parses the result. Co-Authored-By: Claude Sonnet 4.6 --- .../repository/DocumentRepository.java | 44 ++++++++++++++----- .../repository/DocumentFtsTest.java | 10 +++++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java index 3e4f7150..022a2ebb 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java @@ -83,8 +83,17 @@ public interface DocumentRepository extends JpaRepository, JpaSp @Query(nativeQuery = true, value = """ SELECT d.id FROM documents d - WHERE d.search_vector @@ websearch_to_tsquery('german', :query) - ORDER BY ts_rank(d.search_vector, websearch_to_tsquery('german', :query)) DESC, + CROSS JOIN LATERAL ( + SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' + THEN to_tsquery('german', regexp_replace( + websearch_to_tsquery('german', :query)::text, + '''([^'']+)''', + '''\\1'':*', + 'g')) + END AS pq + ) q + WHERE d.search_vector @@ q.pq + ORDER BY ts_rank(d.search_vector, q.pq) DESC, d.meta_date DESC NULLS LAST """) List findRankedIdsByFts(@Param("query") String query); @@ -99,44 +108,59 @@ public interface DocumentRepository extends JpaRepository, JpaSp *
<li>Boolean — whether the sender's name matched the query</li>
 * <li>String — comma-separated matched receiver UUIDs, or null</li>
 * <li>String — comma-separated matched tag UUIDs, or null</li>
+ * <li>String — summary snippet with \x01/\x02 delimiters, or null if summary didn't match</li>
  • * * Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank. */ @Query(nativeQuery = true, value = """ SELECT d.id, - ts_headline('german', d.title, websearch_to_tsquery('german', :query), + ts_headline('german', d.title, q.pq, 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true') AS title_headline, CASE WHEN best_block.text IS NOT NULL THEN - ts_headline('german', best_block.text, websearch_to_tsquery('german', :query), + ts_headline('german', best_block.text, q.pq, 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20') END AS transcription_snippet, (s.id IS NOT NULL AND to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, '')) - @@ websearch_to_tsquery('german', :query)) + @@ q.pq) AS sender_matched, (SELECT string_agg(r.id::text, ',') FROM document_receivers dr JOIN persons r ON r.id = dr.person_id WHERE dr.document_id = d.id AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || r.last_name) - @@ websearch_to_tsquery('german', :query) + @@ q.pq ) AS matched_receiver_ids, (SELECT string_agg(t.id::text, ',') FROM document_tags dt JOIN tag t ON t.id = dt.tag_id WHERE dt.document_id = d.id - AND to_tsvector('german', t.name) @@ websearch_to_tsquery('german', :query) - ) AS matched_tag_ids + AND to_tsvector('german', t.name) @@ q.pq + ) AS matched_tag_ids, + CASE WHEN d.summary IS NOT NULL AND d.summary <> '' + AND to_tsvector('german', d.summary) @@ q.pq + THEN ts_headline('german', d.summary, q.pq, + 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20') + END AS summary_snippet FROM documents d + CROSS JOIN LATERAL ( + SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' + THEN to_tsquery('german', regexp_replace( + websearch_to_tsquery('german', :query)::text, + '''([^'']+)''', + '''\\1'':*', + 'g')) + END AS pq + ) q LEFT JOIN persons s ON s.id = d.sender_id LEFT JOIN LATERAL ( SELECT tb.text FROM 
transcription_blocks tb WHERE tb.document_id = d.id - AND to_tsvector('german', tb.text) @@ websearch_to_tsquery('german', :query) - ORDER BY ts_rank(to_tsvector('german', tb.text), websearch_to_tsquery('german', :query)) DESC + AND to_tsvector('german', tb.text) @@ q.pq + ORDER BY ts_rank(to_tsvector('german', tb.text), q.pq) DESC LIMIT 1 ) best_block ON true WHERE d.id IN :ids diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java index 581cb063..38f64bc9 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java @@ -79,6 +79,16 @@ class DocumentFtsTest { assertThat(ids).hasSize(1); } + @Test + void should_find_document_by_partial_word_prefix() { + documentRepository.saveAndFlush(document("Ein furchtbarer Brief")); + em.clear(); + + List ids = documentRepository.findRankedIdsByFts("furchtb"); + + assertThat(ids).hasSize(1); + } + @Test void should_not_find_document_when_term_absent() { documentRepository.saveAndFlush(document("Familienfoto"));