From d45739cb76d7e3df9e22838db157c4793287a2c8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 25 Apr 2026 21:40:07 +0200 Subject: [PATCH] fix(search): use to_tsquery('simple') for prefix transform to avoid German stop word collision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Words like "Wille" stem to "will" via the German Snowball stemmer, and "will" is itself a German stop word. The prefix-transform step (websearch_to_tsquery text → regexp_replace → to_tsquery) was passing already-stemmed lexemes back through the German dictionary, causing them to be silently dropped as stop words. Using the 'simple' configuration skips stop-word processing entirely while the tsvector @@ tsquery comparison still works because lexemes are matched by string value, not by configuration. Co-Authored-By: Claude Sonnet 4.6 --- .../repository/DocumentRepository.java | 4 ++-- .../repository/DocumentFtsTest.java | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java index 1f1af2cc..cc2fbfc1 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java @@ -87,7 +87,7 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp SELECT d.id FROM documents d CROSS JOIN LATERAL ( SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' - THEN to_tsquery('german', regexp_replace( + THEN to_tsquery('simple', regexp_replace( websearch_to_tsquery('german', :query)::text, '''([^'']+)''', '''\\1'':*', @@ -149,7 +149,7 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp FROM documents d CROSS JOIN LATERAL ( SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> '' - THEN to_tsquery('german',
regexp_replace( + THEN to_tsquery('simple', regexp_replace( websearch_to_tsquery('german', :query)::text, '''([^'']+)''', '''\\1'':*', diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java index 38f64bc9..3240ba03 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java @@ -179,6 +179,22 @@ class DocumentFtsTest { assertThat(ids).isEmpty(); } + @Test + void should_find_document_whose_transcription_contains_word_that_stems_to_german_stop_word() { + // "Wille" stems to "will" via the German Snowball stemmer. + // "will" is also a German stop word, so to_tsquery('german','will:*') drops it. + // The prefix-transform step must use to_tsquery('simple',...) to avoid this. + Document doc = documentRepository.saveAndFlush(document("Foto")); + UUID annotationId = annotation(doc.getId()); + blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Der Wille des Volkes", 0)); + em.flush(); + em.clear(); + + List<UUID> ids = documentRepository.findRankedIdsByFts("Wille"); + + assertThat(ids).contains(doc.getId()); + } + @Test void should_not_throw_when_query_contains_invalid_tsquery_syntax() { documentRepository.saveAndFlush(document("Brief"));