From 24530cf85b9ec1c306b542e0eb9abcce5f197d5f Mon Sep 17 00:00:00 2001
From: Marcel
Date: Tue, 14 Apr 2026 23:38:12 +0200
Subject: [PATCH] feat(fts): add search_vector column, GIN index, DB triggers, and FTS repository method (V34)

- V34 migration: adds search_vector tsvector column with GIN index
- BEFORE INSERT/UPDATE trigger on documents rebuilds vector from title (A), summary + transcription_blocks.text (B), sender/receiver names (C), tag names + location (D) using german FTS config
- AFTER triggers on transcription_blocks, document_receivers, document_tags touch the parent document row to re-fire the BEFORE UPDATE trigger
- DocumentRepository.findRankedIdsByFts() native query using websearch_to_tsquery
- DocumentFtsTest: 12 integration tests covering stemming, trigger sync, ranking, stop words, malformed input, receiver and tag search

Co-Authored-By: Claude Sonnet 4.6
---
 .../repository/DocumentRepository.java        |   8 +
 .../migration/V34__add_fts_search_vector.sql  |  74 ++++++
 .../repository/DocumentFtsTest.java           | 244 ++++++++++++++++++
 3 files changed, 326 insertions(+)
 create mode 100644 backend/src/main/resources/db/migration/V34__add_fts_search_vector.sql
 create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
index ca5a88d4..3a183ded 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
@@ -81,4 +81,23 @@ public interface DocumentRepository extends JpaRepository, JpaSp
             @Param("to") LocalDate to,
             Sort sort);
 
+    /**
+     * Returns the ids of all documents whose {@code search_vector} matches the given
+     * full-text query, best match first.
+     *
+     * <p>The input is parsed by PostgreSQL's {@code websearch_to_tsquery} with the
+     * {@code german} configuration. Rows are ordered by {@code ts_rank} descending and
+     * then by {@code meta_date} descending, with {@code NULL} dates last.
+     *
+     * @param query the search text
+     * @return matching document ids in ranking order
+     */
+    @Query(nativeQuery = true, value = """
+            SELECT d.id FROM documents d
+            WHERE d.search_vector @@ websearch_to_tsquery('german', :query)
+            ORDER BY ts_rank(d.search_vector, websearch_to_tsquery('german', :query)) DESC,
+                     d.meta_date DESC NULLS LAST
+            """)
+    List<UUID> findRankedIdsByFts(@Param("query") String query);
+
 }
\ No newline at end of file
diff --git a/backend/src/main/resources/db/migration/V34__add_fts_search_vector.sql b/backend/src/main/resources/db/migration/V34__add_fts_search_vector.sql
new file mode 100644
index 00000000..a362037a
--- /dev/null
+++ b/backend/src/main/resources/db/migration/V34__add_fts_search_vector.sql
@@ -0,0 +1,74 @@
+-- ─── Full-Text Search: search_vector on documents ──────────────────────────────
+-- Adds a tsvector column that aggregates: title (A), summary + transcription
+-- block text (B), sender/receiver names (C), tag names + location (D).
+-- The column is maintained by DB triggers so the OCR pipeline (which writes
+-- transcription_blocks directly) stays in sync without JPA @PreUpdate hooks.
+
+-- 1. Column and GIN index
+ALTER TABLE documents ADD COLUMN search_vector tsvector;
+CREATE INDEX idx_documents_search ON documents USING GIN (search_vector);
+
+-- 2. Trigger function: rebuilds search_vector on documents INSERT or UPDATE.
+--    Runs BEFORE the write so NEW.search_vector is set inline.
+CREATE OR REPLACE FUNCTION fn_documents_fts_update() RETURNS trigger AS $$
+DECLARE
+    v_transcription text;
+    v_sender        text;
+    v_receivers     text;
+    v_tags          text;
+BEGIN
+    -- Weight B input: the text of every transcription block of this document.
+    -- string_agg skips NULL inputs and yields NULL when no row qualifies; the
+    -- coalesce calls further down turn that NULL into an empty string.
+    v_transcription := (
+        SELECT string_agg(tb.text, ' ')
+        FROM transcription_blocks tb
+        WHERE tb.document_id = NEW.id
+    );
+
+    -- Weight C inputs: sender and receiver names.
+    -- concat_ws ignores NULL arguments, so a person with only one name part set
+    -- is still indexed; "first_name || ' ' || last_name" would collapse to NULL
+    -- as soon as last_name is NULL and silently drop that person.
+    v_sender := (
+        SELECT concat_ws(' ', p.first_name, p.last_name)
+        FROM persons p
+        WHERE p.id = NEW.sender_id
+    );
+    v_receivers := (
+        SELECT string_agg(concat_ws(' ', p.first_name, p.last_name), ' ')
+        FROM document_receivers dr
+        JOIN persons p ON p.id = dr.person_id
+        WHERE dr.document_id = NEW.id
+    );
+
+    -- Weight D input: names of all tags attached to this document.
+    v_tags := (
+        SELECT string_agg(t.name, ' ')
+        FROM document_tags dt
+        JOIN tag t ON t.id = dt.tag_id
+        WHERE dt.document_id = NEW.id
+    );
+
+    NEW.search_vector :=
+        setweight(to_tsvector('german', coalesce(NEW.title, '')), 'A') ||
+        setweight(to_tsvector('german', coalesce(NEW.summary, '')), 'B') ||
+        setweight(to_tsvector('german', coalesce(v_transcription, '')), 'B') ||
+        setweight(to_tsvector('german', coalesce(v_sender, '')), 'C') ||
+        setweight(to_tsvector('german', coalesce(v_receivers, '')), 'C') ||
+        setweight(to_tsvector('german', coalesce(v_tags, '')), 'D') ||
+        setweight(to_tsvector('german', coalesce(NEW.meta_location, '')), 'D');
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Deliberately unconditional (no WHEN clause, no UPDATE OF column list): the
+-- triggers on transcription_blocks, document_receivers and document_tags rely
+-- on a no-op UPDATE of the document row re-running this function.
+CREATE TRIGGER trg_documents_fts
+    BEFORE INSERT OR UPDATE ON documents
+    FOR EACH ROW EXECUTE FUNCTION fn_documents_fts_update();
+
+-- 3. Rebuild trigger for join tables and transcription_blocks.
+--    These tables don't have a search_vector of their own; instead they
+--    touch the parent document row ("SET title = title") to re-fire the
+--    BEFORE UPDATE trigger above, which then recomputes the vector with
+--    the current state of all joined tables.
+CREATE OR REPLACE FUNCTION fn_rebuild_document_fts() RETURNS trigger AS $$
+BEGIN
+    -- Branch on TG_OP so that only the row variables defined for the current
+    -- operation are referenced (NEW for INSERT, OLD for DELETE, both for UPDATE).
+    IF TG_OP = 'INSERT' THEN
+        UPDATE documents SET title = title WHERE id = NEW.document_id;
+    ELSIF TG_OP = 'DELETE' THEN
+        UPDATE documents SET title = title WHERE id = OLD.document_id;
+    ELSE
+        -- UPDATE: when the row is re-parented, the previous document must be
+        -- rebuilt as well, otherwise it keeps matching text it no longer owns.
+        -- If document_id is unchanged the IN list collapses to a single row.
+        UPDATE documents SET title = title
+        WHERE id IN (OLD.document_id, NEW.document_id);
+    END IF;
+    -- The return value of a row-level AFTER trigger is ignored.
+    RETURN NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER trg_transcription_blocks_fts
+    AFTER INSERT OR UPDATE OR DELETE ON transcription_blocks
+    FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();
+
+CREATE TRIGGER trg_document_receivers_fts
+    AFTER INSERT OR DELETE ON document_receivers
+    FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();
+
+CREATE TRIGGER trg_document_tags_fts
+    AFTER INSERT OR DELETE ON document_tags
+    FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();
+
+-- NOTE(review): no trigger in this migration covers persons or tag, so renaming
+-- a person or a tag leaves the affected documents stale until they are touched.
+
+-- 4. Backfill: rows that existed before this migration still have a NULL
+--    search_vector and would never match a query. A no-op UPDATE runs the
+--    BEFORE UPDATE trigger once per document and populates the column.
+UPDATE documents SET title = title;
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java
new file mode 100644
index 00000000..3c634e1e
--- /dev/null
+++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java
@@ -0,0 +1,273 @@
+package org.raddatz.familienarchiv.repository;
+
+import jakarta.persistence.EntityManager;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.raddatz.familienarchiv.PostgresContainerConfig;
+import org.raddatz.familienarchiv.config.FlywayConfig;
+import org.raddatz.familienarchiv.model.Document;
+import org.raddatz.familienarchiv.model.DocumentAnnotation;
+import org.raddatz.familienarchiv.model.DocumentStatus;
+import org.raddatz.familienarchiv.model.Person;
+import org.raddatz.familienarchiv.model.Tag;
+import org.raddatz.familienarchiv.model.TranscriptionBlock;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
+import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
+import org.springframework.context.annotation.Import;
+
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatNoException;
+
+/**
+ * Integration tests for the {@code documents.search_vector} column, the database triggers
+ * that maintain it, and {@link DocumentRepository#findRankedIdsByFts(String)}.
+ */
+@DataJpaTest
+@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
+@Import({PostgresContainerConfig.class, FlywayConfig.class})
+class DocumentFtsTest {
+
+    @Autowired DocumentRepository documentRepository;
+    @Autowired PersonRepository personRepository;
+    @Autowired TagRepository tagRepository;
+    @Autowired AnnotationRepository annotationRepository;
+    @Autowired TranscriptionBlockRepository blockRepository;
+    @Autowired EntityManager em;
+
+    // Start from empty tables so the size and emptiness assertions below only see this test's rows.
+    @BeforeEach
+    void setUp() {
+        blockRepository.deleteAll();
+        documentRepository.deleteAll();
+        personRepository.deleteAll();
+        tagRepository.deleteAll();
+    }
+
+    // ─── Guard ─────────────────────────────────────────────────────────────────
+
+    @Test
+    void german_text_search_config_is_available() {
+        Number count = (Number) em
+                .createNativeQuery("SELECT count(*) FROM pg_ts_config WHERE cfgname = 'german'")
+                .getSingleResult();
+        assertThat(count.longValue()).isEqualTo(1L);
+    }
+
+    // ─── Basic FTS ─────────────────────────────────────────────────────────────
+
+    @Test
+    void should_find_document_by_exact_title_word() {
+        documentRepository.saveAndFlush(document("Alter Brief"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
+
+        assertThat(ids).hasSize(1);
+    }
+
+    @Test
+    void should_find_document_by_stemmed_inflected_form() {
+        documentRepository.saveAndFlush(document("Alter Brief"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Briefe");
+
+        assertThat(ids).hasSize(1);
+    }
+
+    @Test
+    void should_not_find_document_when_term_absent() {
+        documentRepository.saveAndFlush(document("Familienfoto"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
+
+        assertThat(ids).isEmpty();
+    }
+
+    // ─── Transcription blocks ───────────────────────────────────────────────────
+
+    @Test
+    void should_find_document_by_transcription_block_text() {
+        Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
+        UUID annotationId = annotation(doc.getId());
+
+        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Liebe Anna ich schreibe dir aus dem Krieg", 0));
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("schreiben");
+
+        assertThat(ids).contains(doc.getId());
+    }
+
+    @Test
+    void should_rebuild_vector_when_transcription_block_inserted_after_document() {
+        Document doc = documentRepository.saveAndFlush(document("Leeres Dokument"));
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty();
+
+        UUID annotationId = annotation(doc.getId());
+        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
+    }
+
+    @Test
+    void should_rebuild_vector_when_transcription_block_deleted() {
+        Document doc = documentRepository.saveAndFlush(document("Dokument mit Block"));
+        UUID annotationId = annotation(doc.getId());
+        TranscriptionBlock block = blockRepository.saveAndFlush(
+                block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
+
+        blockRepository.deleteById(block.getId());
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId());
+    }
+
+    @Test
+    void should_rebuild_both_vectors_when_transcription_block_moves_to_other_document() {
+        Document source = documentRepository.saveAndFlush(document("Quelle"));
+        Document target = documentRepository.saveAndFlush(document("Ziel"));
+        UUID annotationId = annotation(source.getId());
+        blockRepository.saveAndFlush(block(source.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.flush();
+        em.clear();
+
+        // Native UPDATE so the AFTER UPDATE trigger sees a changed document_id on the block row.
+        em.createNativeQuery(
+                        "UPDATE transcription_blocks SET document_id = :target WHERE document_id = :source")
+                .setParameter("target", target.getId())
+                .setParameter("source", source.getId())
+                .executeUpdate();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch"))
+                .containsExactly(target.getId());
+    }
+
+    // ─── Ranking ───────────────────────────────────────────────────────────────
+
+    @Test
+    void should_rank_title_match_above_transcription_match() {
+        // docA: "Grundbuch" only in title (weight A)
+        // docB: "Grundbuch" only in transcription block (weight B)
+        Document docA = documentRepository.saveAndFlush(document("Grundbuch 1923"));
+        Document docB = documentRepository.saveAndFlush(document("Anderes Dokument"));
+        UUID annotationId = annotation(docB.getId());
+        blockRepository.saveAndFlush(block(docB.getId(), annotationId, "Grundbuch steht darin", 0));
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Grundbuch");
+
+        assertThat(ids).hasSize(2);
+        assertThat(ids.get(0)).isEqualTo(docA.getId());
+    }
+
+    // ─── Edge cases ────────────────────────────────────────────────────────────
+
+    @Test
+    void should_return_empty_when_query_contains_only_stop_words() {
+        documentRepository.saveAndFlush(document("Ein Brief von der Oma"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("der die das und");
+
+        assertThat(ids).isEmpty();
+    }
+
+    @Test
+    void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
+        documentRepository.saveAndFlush(document("Brief"));
+        em.clear();
+
+        assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("((("));
+    }
+
+    // ─── Weight C: sender/receiver names ───────────────────────────────────────
+
+    @Test
+    void should_find_document_by_receiver_name() {
+        Person receiver = personRepository.saveAndFlush(
+                Person.builder().firstName("Anna").lastName("Schmidt").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .receivers(Set.of(receiver))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Schmidt");
+
+        assertThat(ids).contains(doc.getId());
+    }
+
+    // ─── Weight D: tag names ───────────────────────────────────────────────────
+
+    @Test
+    void should_find_document_by_tag_name() {
+        Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
+        documentRepository.saveAndFlush(Document.builder()
+                .title("Dokument")
+                .originalFilename("dokument.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .tags(Set.of(tag))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Familiengeschichte");
+
+        assertThat(ids).hasSize(1);
+    }
+
+    // ─── Helpers ───────────────────────────────────────────────────────────────
+
+    // Builds an unsaved UPLOADED document; the filename is the title with spaces replaced by underscores.
+    private Document document(String title) {
+        return Document.builder()
+                .title(title)
+                .originalFilename(title.replace(" ", "_") + ".pdf")
+                .status(DocumentStatus.UPLOADED)
+                .build();
+    }
+
+    // Saves and flushes an annotation for the document and returns its id for use by a block.
+    private UUID annotation(UUID documentId) {
+        DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
+                .documentId(documentId)
+                .pageNumber(1)
+                .x(0.1).y(0.2).width(0.3).height(0.4)
+                .color("#00C7B1")
+                .build());
+        em.flush();
+        return ann.getId();
+    }
+
+    // Builds an unsaved transcription block; the caller persists it through blockRepository.
+    private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
+        return TranscriptionBlock.builder()
+                .documentId(documentId)
+                .annotationId(annotationId)
+                .text(text)
+                .sortOrder(order)
+                .build();
+    }
+}