feat(fts): add search_vector column, GIN index, DB triggers, and FTS repository method (V34)

- V34 migration: adds search_vector tsvector column with GIN index
- BEFORE INSERT/UPDATE trigger on documents rebuilds vector from title (A),
  summary + transcription_blocks.text (B), sender/receiver names (C),
  tag names + location (D) using german FTS config
- AFTER triggers on transcription_blocks, document_receivers, document_tags
  touch the parent document row to re-fire the BEFORE UPDATE trigger
- DocumentRepository.findRankedIdsByFts() native query using websearch_to_tsquery
- DocumentFtsTest: 12 integration tests covering stemming, trigger sync,
  ranking, stop words, malformed input, receiver and tag search

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-14 23:38:12 +02:00
parent 57c44cf02f
commit 24530cf85b
3 changed files with 326 additions and 0 deletions

View File

@@ -0,0 +1,244 @@
package org.raddatz.familienarchiv.repository;
import jakarta.persistence.EntityManager;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.raddatz.familienarchiv.PostgresContainerConfig;
import org.raddatz.familienarchiv.config.FlywayConfig;
import org.raddatz.familienarchiv.model.Document;
import org.raddatz.familienarchiv.model.DocumentAnnotation;
import org.raddatz.familienarchiv.model.DocumentStatus;
import org.raddatz.familienarchiv.model.Person;
import org.raddatz.familienarchiv.model.Tag;
import org.raddatz.familienarchiv.model.TranscriptionBlock;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
import org.springframework.context.annotation.Import;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatNoException;
/**
 * Integration tests for {@code DocumentRepository.findRankedIdsByFts}, the
 * full-text search query backed by the {@code search_vector} column.
 *
 * <p>The tests run against the real database wiring imported below
 * ({@code PostgresContainerConfig} + {@code FlywayConfig}) instead of an
 * embedded replacement, because the behaviour under test lives in database
 * triggers and PostgreSQL's {@code german} text search configuration.
 *
 * <p>Every test flushes its writes and clears the persistence context before
 * searching, so the native query observes what the triggers wrote rather than
 * what Hibernate still holds in memory.
 */
@DataJpaTest
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
@Import({PostgresContainerConfig.class, FlywayConfig.class})
class DocumentFtsTest {

    @Autowired DocumentRepository documentRepository;
    @Autowired PersonRepository personRepository;
    @Autowired TagRepository tagRepository;
    @Autowired AnnotationRepository annotationRepository;
    @Autowired TranscriptionBlockRepository blockRepository;
    @Autowired EntityManager em;

    /**
     * Starts every test from empty tables so size- and order-based assertions
     * cannot be affected by rows that already exist in the database.
     *
     * <p>Rows are removed child-first: blocks carry an annotation id and a
     * document id, annotations carry a document id. Annotations are included
     * so that leftover rows can neither be orphaned nor block the document
     * delete (assuming the schema enforces these references).
     */
    @BeforeEach
    void setUp() {
        blockRepository.deleteAll();
        annotationRepository.deleteAll();
        documentRepository.deleteAll();
        personRepository.deleteAll();
        tagRepository.deleteAll();
    }

    // ─── Guard ─────────────────────────────────────────────────────────────────

    /** Fails fast with a clear message if the database lacks the {@code german} FTS config. */
    @Test
    void german_text_search_config_is_available() {
        Number count = (Number) em
                .createNativeQuery("SELECT count(*) FROM pg_ts_config WHERE cfgname = 'german'")
                .getSingleResult();

        assertThat(count.longValue()).isEqualTo(1L);
    }

    // ─── Basic FTS ─────────────────────────────────────────────────────────────

    @Test
    void should_find_document_by_exact_title_word() {
        Document doc = documentRepository.saveAndFlush(document("Alter Brief"));
        flushAndClear();

        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");

        // Pin the identity of the hit, not only the number of hits.
        assertThat(ids).containsExactly(doc.getId());
    }

    @Test
    void should_find_document_by_stemmed_inflected_form() {
        Document doc = documentRepository.saveAndFlush(document("Alter Brief"));
        flushAndClear();

        // Title holds the singular "Brief"; the query uses the plural "Briefe".
        List<UUID> ids = documentRepository.findRankedIdsByFts("Briefe");

        assertThat(ids).containsExactly(doc.getId());
    }

    @Test
    void should_not_find_document_when_term_absent() {
        documentRepository.saveAndFlush(document("Familienfoto"));
        flushAndClear();

        List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");

        assertThat(ids).isEmpty();
    }

    // ─── Transcription blocks ───────────────────────────────────────────────────

    @Test
    void should_find_document_by_transcription_block_text() {
        Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
        UUID annotationId = annotation(doc.getId());
        blockRepository.saveAndFlush(
                block(doc.getId(), annotationId, "Liebe Anna ich schreibe dir aus dem Krieg", 0));
        flushAndClear();

        // Block text holds "schreibe"; the query uses the infinitive "schreiben".
        List<UUID> ids = documentRepository.findRankedIdsByFts("schreiben");

        assertThat(ids).contains(doc.getId());
    }

    @Test
    void should_rebuild_vector_when_transcription_block_inserted_after_document() {
        Document doc = documentRepository.saveAndFlush(document("Leeres Dokument"));
        flushAndClear();
        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty();

        UUID annotationId = annotation(doc.getId());
        blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
        flushAndClear();

        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
    }

    @Test
    void should_rebuild_vector_when_transcription_block_deleted() {
        Document doc = documentRepository.saveAndFlush(document("Dokument mit Block"));
        UUID annotationId = annotation(doc.getId());
        TranscriptionBlock block = blockRepository.saveAndFlush(
                block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
        flushAndClear();
        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());

        blockRepository.deleteById(block.getId());
        flushAndClear();

        assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId());
    }

    // ─── Ranking ───────────────────────────────────────────────────────────────

    @Test
    void should_rank_title_match_above_transcription_match() {
        // docA: "Grundbuch" only in title (weight A)
        // docB: "Grundbuch" only in transcription block (weight B)
        Document docA = documentRepository.saveAndFlush(document("Grundbuch 1923"));
        Document docB = documentRepository.saveAndFlush(document("Anderes Dokument"));
        UUID annotationId = annotation(docB.getId());
        blockRepository.saveAndFlush(block(docB.getId(), annotationId, "Grundbuch steht darin", 0));
        flushAndClear();

        List<UUID> ids = documentRepository.findRankedIdsByFts("Grundbuch");

        // Order-sensitive: also rejects duplicated ids and unexpected extra hits.
        assertThat(ids).containsExactly(docA.getId(), docB.getId());
    }

    // ─── Edge cases ────────────────────────────────────────────────────────────

    @Test
    void should_return_empty_when_query_contains_only_stop_words() {
        documentRepository.saveAndFlush(document("Ein Brief von der Oma"));
        flushAndClear();

        List<UUID> ids = documentRepository.findRankedIdsByFts("der die das und");

        assertThat(ids).isEmpty();
    }

    @Test
    void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
        documentRepository.saveAndFlush(document("Brief"));
        flushAndClear();

        // Only the absence of an exception is asserted; the result itself is not pinned.
        assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("((("));
    }

    // ─── Weight C: sender/receiver names ───────────────────────────────────────

    @Test
    void should_find_document_by_receiver_name() {
        Person receiver = personRepository.saveAndFlush(
                Person.builder().firstName("Anna").lastName("Schmidt").build());
        Document doc = documentRepository.saveAndFlush(Document.builder()
                .title("Brief")
                .originalFilename("brief.pdf")
                .status(DocumentStatus.UPLOADED)
                .receivers(Set.of(receiver))
                .build());
        flushAndClear();

        List<UUID> ids = documentRepository.findRankedIdsByFts("Schmidt");

        assertThat(ids).contains(doc.getId());
    }

    // ─── Weight D: tag names ───────────────────────────────────────────────────

    @Test
    void should_find_document_by_tag_name() {
        Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
        Document doc = documentRepository.saveAndFlush(Document.builder()
                .title("Dokument")
                .originalFilename("dokument.pdf")
                .status(DocumentStatus.UPLOADED)
                .tags(Set.of(tag))
                .build());
        flushAndClear();

        List<UUID> ids = documentRepository.findRankedIdsByFts("Familiengeschichte");

        assertThat(ids).containsExactly(doc.getId());
    }

    // ─── Helpers ───────────────────────────────────────────────────────────────

    /**
     * Writes any pending changes to the database and then detaches all managed
     * entities, so the following search reads database state only.
     */
    private void flushAndClear() {
        em.flush();
        em.clear();
    }

    /**
     * Builds an unsaved document in status {@code UPLOADED}.
     *
     * @param title document title; also used (spaces replaced by underscores,
     *              {@code .pdf} appended) as the original filename
     * @return the transient document
     */
    private Document document(String title) {
        return Document.builder()
                .title(title)
                .originalFilename(title.replace(" ", "_") + ".pdf")
                .status(DocumentStatus.UPLOADED)
                .build();
    }

    /**
     * Persists and flushes a placeholder annotation on page 1 of the given
     * document, so that a transcription block can reference it.
     *
     * @param documentId id of the owning document
     * @return id of the stored annotation
     */
    private UUID annotation(UUID documentId) {
        DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
                .documentId(documentId)
                .pageNumber(1)
                .x(0.1).y(0.2).width(0.3).height(0.4)
                .color("#00C7B1")
                .build());
        em.flush();
        return ann.getId();
    }

    /**
     * Builds an unsaved transcription block.
     *
     * @param documentId   id of the owning document
     * @param annotationId id of the annotation the block belongs to
     * @param text         transcribed text that should become searchable
     * @param order        sort order of the block
     * @return the transient block
     */
    private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
        return TranscriptionBlock.builder()
                .documentId(documentId)
                .annotationId(annotationId)
                .text(text)
                .sortOrder(order)
                .build();
    }
}