feat(fts): add search_vector column, GIN index, DB triggers, and FTS repository method (V34)

- V34 migration: adds search_vector tsvector column with GIN index
- BEFORE INSERT/UPDATE trigger on documents rebuilds vector from title (A), summary + transcription_blocks.text (B), sender/receiver names (C), tag names + location (D) using german FTS config
- AFTER triggers on transcription_blocks, document_receivers, document_tags touch the parent document row to re-fire the BEFORE UPDATE trigger
- DocumentRepository.findRankedIdsByFts() native query using websearch_to_tsquery
- DocumentFtsTest: 12 integration tests covering stemming, trigger sync, ranking, stop words, malformed input, receiver and tag search

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 23:38:12 +02:00
parent 57c44cf02f
commit 24530cf85b
3 changed files with 326 additions and 0 deletions
--- a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentFtsTest.java
@@ -0,0 +1,244 @@
+package org.raddatz.familienarchiv.repository;
+
+import jakarta.persistence.EntityManager;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.raddatz.familienarchiv.PostgresContainerConfig;
+import org.raddatz.familienarchiv.config.FlywayConfig;
+import org.raddatz.familienarchiv.model.Document;
+import org.raddatz.familienarchiv.model.DocumentAnnotation;
+import org.raddatz.familienarchiv.model.DocumentStatus;
+import org.raddatz.familienarchiv.model.Person;
+import org.raddatz.familienarchiv.model.Tag;
+import org.raddatz.familienarchiv.model.TranscriptionBlock;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
+import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
+import org.springframework.context.annotation.Import;
+
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatNoException;
+
+/**
+ * Integration tests for the full-text search exposed by
+ * {@code DocumentRepository.findRankedIdsByFts(String)}.
+ *
+ * <p>The auto-configured test database is not substituted
+ * ({@code Replace.NONE}); the data source and schema come from the imported
+ * {@code PostgresContainerConfig} and {@code FlywayConfig}. Each test writes through the
+ * repositories, flushes, and clears the persistence context before searching, so the
+ * assertions observe what the database returns rather than cached entities.
+ */
+@DataJpaTest
+@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
+@Import({PostgresContainerConfig.class, FlywayConfig.class})
+class DocumentFtsTest {
+
+    @Autowired DocumentRepository documentRepository;
+    @Autowired PersonRepository personRepository;
+    @Autowired TagRepository tagRepository;
+    @Autowired AnnotationRepository annotationRepository;
+    @Autowired TranscriptionBlockRepository blockRepository;
+    @Autowired EntityManager em;
+
+    /**
+     * Empties every table these tests write to, so exact-result assertions only ever see
+     * rows created by the current test.
+     */
+    @BeforeEach
+    void setUp() {
+        // Dependents first: blocks carry an annotation id and a document id,
+        // annotations carry a document id.
+        blockRepository.deleteAll();
+        annotationRepository.deleteAll();
+        documentRepository.deleteAll();
+        personRepository.deleteAll();
+        tagRepository.deleteAll();
+    }
+
+    // ─── Guard ─────────────────────────────────────────────────────────────────
+
+    /** Verifies that a text search configuration named {@code german} is registered. */
+    @Test
+    void german_text_search_config_is_available() {
+        Number count = (Number) em
+                .createNativeQuery("SELECT count(*) FROM pg_ts_config WHERE cfgname = 'german'")
+                .getSingleResult();
+        assertThat(count.longValue()).isEqualTo(1L);
+    }
+
+    // ─── Basic FTS ─────────────────────────────────────────────────────────────
+
+    /** A word that appears verbatim in the title returns exactly that document. */
+    @Test
+    void should_find_document_by_exact_title_word() {
+        Document doc = documentRepository.saveAndFlush(document("Alter Brief"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
+
+        // Assert on the id, not just the size, so a hit on the wrong row cannot pass.
+        assertThat(ids).containsExactly(doc.getId());
+    }
+
+    /** The inflected query "Briefe" still matches a title containing "Brief". */
+    @Test
+    void should_find_document_by_stemmed_inflected_form() {
+        Document doc = documentRepository.saveAndFlush(document("Alter Brief"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Briefe");
+
+        assertThat(ids).containsExactly(doc.getId());
+    }
+
+    /** A term that occurs nowhere in the stored document yields no hits. */
+    @Test
+    void should_not_find_document_when_term_absent() {
+        documentRepository.saveAndFlush(document("Familienfoto"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
+
+        assertThat(ids).isEmpty();
+    }
+
+    // ─── Transcription blocks ───────────────────────────────────────────────────
+
+    /** Text stored only in a transcription block makes the owning document searchable. */
+    @Test
+    void should_find_document_by_transcription_block_text() {
+        Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
+        UUID annotationId = annotation(doc.getId());
+
+        // saveAndFlush already pushes the insert to the database; no extra flush needed.
+        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Liebe Anna ich schreibe dir aus dem Krieg", 0));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("schreiben");
+
+        assertThat(ids).contains(doc.getId());
+    }
+
+    /** A block added after the document was stored becomes searchable for that document. */
+    @Test
+    void should_rebuild_vector_when_transcription_block_inserted_after_document() {
+        Document doc = documentRepository.saveAndFlush(document("Leeres Dokument"));
+        em.clear();
+
+        // Baseline: the term is not findable before the block exists.
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty();
+
+        UUID annotationId = annotation(doc.getId());
+        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
+    }
+
+    /** Deleting the only block that contained a term removes the document from the hits. */
+    @Test
+    void should_rebuild_vector_when_transcription_block_deleted() {
+        Document doc = documentRepository.saveAndFlush(document("Dokument mit Block"));
+        UUID annotationId = annotation(doc.getId());
+        TranscriptionBlock block = blockRepository.saveAndFlush(
+                block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
+        em.clear();
+
+        // Baseline: the block text is findable before the delete.
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
+
+        blockRepository.deleteById(block.getId());
+        // deleteById does not flush on its own, so send the delete before searching again.
+        em.flush();
+        em.clear();
+
+        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId());
+    }
+
+    // ─── Ranking ───────────────────────────────────────────────────────────────
+
+    /** A title hit is returned before a hit that exists only in a transcription block. */
+    @Test
+    void should_rank_title_match_above_transcription_match() {
+        // docA: "Grundbuch" only in title (weight A)
+        // docB: "Grundbuch" only in transcription block (weight B)
+        Document docA = documentRepository.saveAndFlush(document("Grundbuch 1923"));
+        Document docB = documentRepository.saveAndFlush(document("Anderes Dokument"));
+        UUID annotationId = annotation(docB.getId());
+        blockRepository.saveAndFlush(block(docB.getId(), annotationId, "Grundbuch steht darin", 0));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Grundbuch");
+
+        // Pins the complete order: both documents, title match first.
+        assertThat(ids).containsExactly(docA.getId(), docB.getId());
+    }
+
+    // ─── Edge cases ────────────────────────────────────────────────────────────
+
+    /** A query made up solely of German stop words returns no hits. */
+    @Test
+    void should_return_empty_when_query_contains_only_stop_words() {
+        documentRepository.saveAndFlush(document("Ein Brief von der Oma"));
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("der die das und");
+
+        assertThat(ids).isEmpty();
+    }
+
+    /** Unbalanced parentheses in the query must not surface as an exception. */
+    @Test
+    void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
+        documentRepository.saveAndFlush(document("Brief"));
+        em.clear();
+
+        assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("((("));
+    }
+
+    // ─── Weight C: sender/receiver names ───────────────────────────────────────
+
+    /** A receiver's last name makes the document searchable. */
+    @Test
+    void should_find_document_by_receiver_name() {
+        Person receiver = personRepository.saveAndFlush(
+                Person.builder().firstName("Anna").lastName("Schmidt").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .receivers(Set.of(receiver))
+                .build());
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Schmidt");
+
+        assertThat(ids).contains(doc.getId());
+    }
+
+    // ─── Weight D: tag names ───────────────────────────────────────────────────
+
+    /** A tag name makes the tagged document searchable. */
+    @Test
+    void should_find_document_by_tag_name() {
+        Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Dokument")
+                .originalFilename("dokument.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .tags(Set.of(tag))
+                .build());
+        em.clear();
+
+        List<UUID> ids = documentRepository.findRankedIdsByFts("Familiengeschichte");
+
+        assertThat(ids).containsExactly(doc.getId());
+    }
+
+    // ─── Helpers ───────────────────────────────────────────────────────────────
+
+    /**
+     * Builds an unsaved document in status {@code UPLOADED}.
+     *
+     * @param title document title; also used for the file name, with spaces replaced by
+     *              underscores and {@code .pdf} appended
+     * @return the transient document
+     */
+    private Document document(String title) {
+        return Document.builder()
+                .title(title)
+                .originalFilename(title.replace(" ", "_") + ".pdf")
+                .status(DocumentStatus.UPLOADED)
+                .build();
+    }
+
+    /**
+     * Saves and flushes a fixed-geometry annotation on page 1 of the given document.
+     *
+     * @param documentId id of the document the annotation belongs to
+     * @return id of the saved annotation
+     */
+    private UUID annotation(UUID documentId) {
+        DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
+                .documentId(documentId)
+                .pageNumber(1)
+                .x(0.1).y(0.2).width(0.3).height(0.4)
+                .color("#00C7B1")
+                .build());
+        // Flush so the annotation row exists before a block referring to it is inserted.
+        em.flush();
+        return ann.getId();
+    }
+
+    /**
+     * Builds an unsaved transcription block.
+     *
+     * @param documentId   id of the owning document
+     * @param annotationId id of the annotation the block is attached to
+     * @param text         transcribed text
+     * @param order        sort position of the block
+     * @return the transient block
+     */
+    private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
+        return TranscriptionBlock.builder()
+                .documentId(documentId)
+                .annotationId(annotationId)
+                .text(text)
+                .sortOrder(order)
+                .build();
+    }
+}