fix(search): use to_tsquery('simple') for prefix transform to avoid German stop word collision
Words like "Wille" are stemmed to "will" by the German Snowball stemmer, and "will" is itself a German stop word. The prefix-transform step (websearch_to_tsquery text → regexp_replace → to_tsquery) was passing the already-stemmed lexemes back through the German configuration, which silently dropped them as stop words. Using the 'simple' configuration skips stemming and stop-word processing entirely, and the tsvector @@ tsquery comparison still works because lexemes are matched by string value, not by configuration. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -87,7 +87,7 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
SELECT d.id FROM documents d
|
SELECT d.id FROM documents d
|
||||||
CROSS JOIN LATERAL (
|
CROSS JOIN LATERAL (
|
||||||
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
|
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
|
||||||
THEN to_tsquery('german', regexp_replace(
|
THEN to_tsquery('simple', regexp_replace(
|
||||||
websearch_to_tsquery('german', :query)::text,
|
websearch_to_tsquery('german', :query)::text,
|
||||||
'''([^'']+)''',
|
'''([^'']+)''',
|
||||||
'''\\1'':*',
|
'''\\1'':*',
|
||||||
@@ -149,7 +149,7 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
FROM documents d
|
FROM documents d
|
||||||
CROSS JOIN LATERAL (
|
CROSS JOIN LATERAL (
|
||||||
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
|
SELECT CASE WHEN websearch_to_tsquery('german', :query)::text <> ''
|
||||||
THEN to_tsquery('german', regexp_replace(
|
THEN to_tsquery('simple', regexp_replace(
|
||||||
websearch_to_tsquery('german', :query)::text,
|
websearch_to_tsquery('german', :query)::text,
|
||||||
'''([^'']+)''',
|
'''([^'']+)''',
|
||||||
'''\\1'':*',
|
'''\\1'':*',
|
||||||
|
|||||||
@@ -179,6 +179,22 @@ class DocumentFtsTest {
|
|||||||
assertThat(ids).isEmpty();
|
assertThat(ids).isEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void should_find_document_whose_transcription_contains_word_that_stems_to_german_stop_word() {
|
||||||
|
// Regression test: the German Snowball stemmer reduces "Wille" to the lexeme "will".
|
||||||
|
// "will" is also a German stop word, so to_tsquery('german','will:*') silently drops the term.
|
||||||
|
// The prefix-transform step must therefore use to_tsquery('simple',...), which skips stop-word processing.
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Foto"));
|
||||||
|
UUID annotationId = annotation(doc.getId());
|
||||||
|
blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Der Wille des Volkes", 0));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<UUID> ids = documentRepository.findRankedIdsByFts("Wille");
|
||||||
|
|
||||||
|
assertThat(ids).contains(doc.getId());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
|
void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
|
||||||
documentRepository.saveAndFlush(document("Brief"));
|
documentRepository.saveAndFlush(document("Brief"));
|
||||||
|
|||||||
Reference in New Issue
Block a user