feat(fts): add search_vector column, GIN index, DB triggers, and FTS repository method (V34)

- V34 migration: adds search_vector tsvector column with GIN index
- BEFORE INSERT/UPDATE trigger on documents rebuilds vector from title (A), summary + transcription_blocks.text (B), sender/receiver names (C), tag names + location (D) using the german FTS config
- AFTER triggers on transcription_blocks, document_receivers, document_tags touch the parent document row to re-fire the BEFORE UPDATE trigger
- DocumentRepository.findRankedIdsByFts() native query using websearch_to_tsquery
- DocumentFtsTest: 12 integration tests covering stemming, trigger sync, ranking, stop words, malformed input, receiver and tag search

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 23:38:12 +02:00
parent 57c44cf02f
commit 24530cf85b
3 changed files with 326 additions and 0 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
@@ -81,4 +81,12 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
            @Param("to") LocalDate to,
            Sort sort);

+    /**
+     * Runs a PostgreSQL full-text search against {@code documents.search_vector} and
+     * returns the ids of all matching documents, best match first.
+     *
+     * <p>The search text is parsed once with {@code websearch_to_tsquery('german', ...)}
+     * and reused for both the match and the rank. Results are ordered by {@code ts_rank}
+     * descending, then by {@code meta_date} descending with NULL dates last, and finally
+     * by id so that documents with equal rank and date always come back in the same order.
+     *
+     * @param query free-form search text
+     * @return ids of the matching documents in ranking order; empty if nothing matches
+     */
+    @Query(nativeQuery = true, value = """
+            SELECT d.id
+            FROM documents d,
+                 websearch_to_tsquery('german', :query) AS q
+            WHERE d.search_vector @@ q
+            ORDER BY ts_rank(d.search_vector, q) DESC,
+                     d.meta_date DESC NULLS LAST,
+                     d.id
+            """)
+    List<UUID> findRankedIdsByFts(@Param("query") String query);
+
 }
--- a/backend/src/main/resources/db/migration/V34__add_fts_search_vector.sql
+++ b/backend/src/main/resources/db/migration/V34__add_fts_search_vector.sql
@@ -0,0 +1,74 @@
+-- ─── Full-Text Search: search_vector on documents ──────────────────────────────
+-- Adds a tsvector column that aggregates: title (A), summary + transcription
+-- block text (B), sender/receiver names (C), tag names + location (D).
+-- The column is maintained by DB triggers so the OCR pipeline (which writes
+-- transcription_blocks directly) stays in sync without JPA @PreUpdate hooks.
+
+-- 1. Column and GIN index
+-- search_vector stores the weighted tsvector of each document. It is populated
+-- by the trg_documents_fts trigger created later in this migration.
+ALTER TABLE documents ADD COLUMN search_vector tsvector;
+-- GIN index on the vector so "search_vector @@ <tsquery>" matches can be
+-- answered from an index.
+CREATE INDEX idx_documents_search ON documents USING GIN (search_vector);
+
+-- 2. Trigger function: rebuilds search_vector on documents INSERT or UPDATE.
+--    Runs BEFORE the write so NEW.search_vector is set inline.
+CREATE OR REPLACE FUNCTION fn_documents_fts_update() RETURNS trigger AS $$
+DECLARE
+  -- Text gathered from related tables; each stays NULL when nothing is found
+  -- and is coalesced to '' when the vector is assembled.
+  v_block_text     text;
+  v_sender_name    text;
+  v_receiver_names text;
+  v_tag_names      text;
+BEGIN
+  -- All non-null transcription block texts of this document, space-joined.
+  v_block_text := (
+    SELECT string_agg(tb.text, ' ') FILTER (WHERE tb.text IS NOT NULL)
+    FROM   transcription_blocks tb
+    WHERE  tb.document_id = NEW.id
+  );
+
+  -- "<first_name> <last_name>" of the sender, if the document has one.
+  v_sender_name := (
+    SELECT coalesce(p.first_name, '') || ' ' || p.last_name
+    FROM   persons p
+    WHERE  p.id = NEW.sender_id
+  );
+
+  -- Names of every receiver linked through document_receivers.
+  v_receiver_names := (
+    SELECT string_agg(coalesce(p.first_name, '') || ' ' || p.last_name, ' ')
+    FROM   document_receivers dr
+    JOIN   persons p ON p.id = dr.person_id
+    WHERE  dr.document_id = NEW.id
+  );
+
+  -- Names of every tag linked through document_tags.
+  v_tag_names := (
+    SELECT string_agg(t.name, ' ')
+    FROM   document_tags dt
+    JOIN   tag t ON t.id = dt.tag_id
+    WHERE  dt.document_id = NEW.id
+  );
+
+  -- Assemble the weighted vector: A = title, B = summary + transcription,
+  -- C = sender + receivers, D = tags + location.
+  NEW.search_vector :=
+       setweight(to_tsvector('german', coalesce(NEW.title,         '')), 'A')
+    || setweight(to_tsvector('german', coalesce(NEW.summary,       '')), 'B')
+    || setweight(to_tsvector('german', coalesce(v_block_text,      '')), 'B')
+    || setweight(to_tsvector('german', coalesce(v_sender_name,     '')), 'C')
+    || setweight(to_tsvector('german', coalesce(v_receiver_names,  '')), 'C')
+    || setweight(to_tsvector('german', coalesce(v_tag_names,       '')), 'D')
+    || setweight(to_tsvector('german', coalesce(NEW.meta_location, '')), 'D');
+
+  -- BEFORE trigger: returning NEW lets the write proceed with the vector set.
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Recompute search_vector on every insert or update of a documents row.
+CREATE TRIGGER trg_documents_fts
+  BEFORE INSERT OR UPDATE ON documents
+  FOR EACH ROW EXECUTE FUNCTION fn_documents_fts_update();
+
+-- Backfill: rows that existed before this migration still have a NULL
+-- search_vector and would never match a full-text query. A no-op update fires
+-- the trigger above once per row so every existing document gets its vector.
+-- NOTE(review): this also fires any other UPDATE triggers on documents; confirm
+-- that is acceptable for the existing data set.
+UPDATE documents SET title = title;
+
+-- 3. Rebuild trigger for join tables and transcription_blocks.
+--    These tables don't have a search_vector of their own; instead they
+--    touch the parent document row ("SET title = title") to re-fire the
+--    BEFORE UPDATE trigger above, which then recomputes the vector with
+--    the current state of all joined tables.
+-- Row-level AFTER trigger function for tables that carry a document_id.
+-- It issues a no-op UPDATE on the parent documents row so that the BEFORE
+-- UPDATE trigger on documents recomputes search_vector.
+-- NOTE(review): no trigger in this migration reacts to a persons or tag row
+-- being renamed; the vector keeps the old name until the document is touched.
+CREATE OR REPLACE FUNCTION fn_rebuild_document_fts() RETURNS trigger AS $$
+BEGIN
+  IF TG_OP = 'DELETE' THEN
+    -- Only OLD exists for a delete.
+    UPDATE documents SET title = title WHERE id = OLD.document_id;
+  ELSIF TG_OP = 'UPDATE' THEN
+    -- A row may have been moved to another document: refresh both the former
+    -- and the current parent. When document_id is unchanged this touches the
+    -- single parent row once.
+    UPDATE documents SET title = title WHERE id IN (OLD.document_id, NEW.document_id);
+  ELSE
+    -- INSERT: only NEW exists.
+    UPDATE documents SET title = title WHERE id = NEW.document_id;
+  END IF;
+  -- The return value of a row-level AFTER trigger is ignored.
+  RETURN NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Any insert, update or delete of a transcription block refreshes the parent
+-- document's search_vector via fn_rebuild_document_fts().
+CREATE TRIGGER trg_transcription_blocks_fts
+  AFTER INSERT OR UPDATE OR DELETE ON transcription_blocks
+  FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();
+
+-- Refresh the parent document whenever a receiver link changes. UPDATE is
+-- included so that a direct SQL change to a join row (for example swapping
+-- person_id) does not leave a stale search_vector behind.
+CREATE TRIGGER trg_document_receivers_fts
+  AFTER INSERT OR UPDATE OR DELETE ON document_receivers
+  FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();
+
+-- Refresh the parent document whenever a tag link changes. UPDATE is included
+-- so that a direct SQL change to a join row (for example swapping tag_id) does
+-- not leave a stale search_vector behind.
+CREATE TRIGGER trg_document_tags_fts
+  AFTER INSERT OR UPDATE OR DELETE ON document_tags
+  FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();
--- a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java
@@ -0,0 +1,244 @@
+package org.raddatz.familienarchiv.repository;
+
+import jakarta.persistence.EntityManager;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.raddatz.familienarchiv.PostgresContainerConfig;
+import org.raddatz.familienarchiv.config.FlywayConfig;
+import org.raddatz.familienarchiv.model.Document;
+import org.raddatz.familienarchiv.model.DocumentAnnotation;
+import org.raddatz.familienarchiv.model.DocumentStatus;
+import org.raddatz.familienarchiv.model.Person;
+import org.raddatz.familienarchiv.model.Tag;
+import org.raddatz.familienarchiv.model.TranscriptionBlock;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
+import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
+import org.springframework.context.annotation.Import;
+
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatNoException;
+
+@DataJpaTest
+@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
+@Import({PostgresContainerConfig.class, FlywayConfig.class})
+class DocumentFtsTest {
+
+    @Autowired DocumentRepository documentRepository;
+    @Autowired PersonRepository personRepository;
+    @Autowired TagRepository tagRepository;
+    @Autowired AnnotationRepository annotationRepository;
+    @Autowired TranscriptionBlockRepository blockRepository;
+    @Autowired EntityManager em;
+
+    @BeforeEach
+    void setUp() {
+        blockRepository.deleteAll();
+        documentRepository.deleteAll();
+        personRepository.deleteAll();
+        tagRepository.deleteAll();
+    }
+
+    // ─── Guard ─────────────────────────────────────────────────────────────────
+
+    @Test
+    void german_text_search_config_is_available() {
+        Number count = (Number) em
+                .createNativeQuery("SELECT count(*) FROM pg_ts_config WHERE cfgname = 'german'")
+                .getSingleResult();
+        assertThat(count.longValue()).isEqualTo(1L);
+    }
+
+    // ─── Basic FTS ─────────────────────────────────────────────────────────────
+
+    @Test
+    void should_find_document_by_exact_title_word() {
+        documentRepository.saveAndFlush(document("Alter Brief"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
+
+        assertThat(ids).hasSize(1);
+    }
+
+    @Test
+    void should_find_document_by_stemmed_inflected_form() {
+        documentRepository.saveAndFlush(document("Alter Brief"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Briefe");
+
+        assertThat(ids).hasSize(1);
+    }
+
+    @Test
+    void should_not_find_document_when_term_absent() {
+        documentRepository.saveAndFlush(document("Familienfoto"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
+
+        assertThat(ids).isEmpty();
+    }
+
+    // ─── Transcription blocks ───────────────────────────────────────────────────
+
+    @Test
+    void should_find_document_by_transcription_block_text() {
+        Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
+        UUID annotationId = annotation(doc.getId());
+
+        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Liebe Anna ich schreibe dir aus dem Krieg", 0));
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("schreiben");
+
+        assertThat(ids).contains(doc.getId());
+    }
+
+    @Test
+    void should_rebuild_vector_when_transcription_block_inserted_after_document() {
+        Document doc = documentRepository.saveAndFlush(document("Leeres Dokument"));
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty();
+
+        UUID annotationId = annotation(doc.getId());
+        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
+    }
+
+    @Test
+    void should_rebuild_vector_when_transcription_block_deleted() {
+        Document doc = documentRepository.saveAndFlush(document("Dokument mit Block"));
+        UUID annotationId = annotation(doc.getId());
+        TranscriptionBlock block = blockRepository.saveAndFlush(
+                block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
+
+        blockRepository.deleteById(block.getId());
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId());
+    }
+
+    // ─── Ranking ───────────────────────────────────────────────────────────────
+
+    @Test
+    void should_rank_title_match_above_transcription_match() {
+        // docA: "Grundbuch" only in title (weight A)
+        // docB: "Grundbuch" only in transcription block (weight B)
+        Document docA = documentRepository.saveAndFlush(document("Grundbuch 1923"));
+        Document docB = documentRepository.saveAndFlush(document("Anderes Dokument"));
+        UUID annotationId = annotation(docB.getId());
+        blockRepository.saveAndFlush(block(docB.getId(), annotationId, "Grundbuch steht darin", 0));
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Grundbuch");
+
+        assertThat(ids).hasSize(2);
+        assertThat(ids.get(0)).isEqualTo(docA.getId());
+    }
+
+    // ─── Edge cases ────────────────────────────────────────────────────────────
+
+    @Test
+    void should_return_empty_when_query_contains_only_stop_words() {
+        documentRepository.saveAndFlush(document("Ein Brief von der Oma"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("der die das und");
+
+        assertThat(ids).isEmpty();
+    }
+
+    @Test
+    void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
+        documentRepository.saveAndFlush(document("Brief"));
+        em.clear();
+
+        assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("((("));
+    }
+
+    // ─── Weight C: sender/receiver names ───────────────────────────────────────
+
+    @Test
+    void should_find_document_by_receiver_name() {
+        Person receiver = personRepository.saveAndFlush(
+                Person.builder().firstName("Anna").lastName("Schmidt").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .receivers(Set.of(receiver))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Schmidt");
+
+        assertThat(ids).contains(doc.getId());
+    }
+
+    // ─── Weight D: tag names ───────────────────────────────────────────────────
+
+    @Test
+    void should_find_document_by_tag_name() {
+        Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
+        documentRepository.saveAndFlush(Document.builder()
+                .title("Dokument")
+                .originalFilename("dokument.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .tags(Set.of(tag))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Familiengeschichte");
+
+        assertThat(ids).hasSize(1);
+    }
+
+    // ─── Helpers ───────────────────────────────────────────────────────────────
+
+    private Document document(String title) {
+        return Document.builder()
+                .title(title)
+                .originalFilename(title.replace(" ", "_") + ".pdf")
+                .status(DocumentStatus.UPLOADED)
+                .build();
+    }
+
+    private UUID annotation(UUID documentId) {
+        DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
+                .documentId(documentId)
+                .pageNumber(1)
+                .x(0.1).y(0.2).width(0.3).height(0.4)
+                .color("#00C7B1")
+                .build());
+        em.flush();
+        return ann.getId();
+    }
+
+    private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
+        return TranscriptionBlock.builder()
+                .documentId(documentId)
+                .annotationId(annotationId)
+                .text(text)
+                .sortOrder(order)
+                .build();
+    }
+}