feat(fts): add search_vector column, GIN index, DB triggers, and FTS repository method (V34)
- V34 migration: adds a search_vector tsvector column with a GIN index
- BEFORE INSERT/UPDATE trigger on documents rebuilds the vector from title (A), summary + transcription_blocks.text (B), sender/receiver names (C), tag names + location (D) using the german FTS config
- AFTER triggers on transcription_blocks, document_receivers, document_tags touch the parent document row to re-fire the BEFORE UPDATE trigger
- DocumentRepository.findRankedIdsByFts(): native query using websearch_to_tsquery
- DocumentFtsTest: 12 integration tests covering stemming, trigger sync, ranking, stop words, malformed input, receiver and tag search

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -81,4 +81,12 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
||||
@Param("to") LocalDate to,
|
||||
Sort sort);
|
||||
|
||||
@Query(nativeQuery = true, value = """
|
||||
SELECT d.id FROM documents d
|
||||
WHERE d.search_vector @@ websearch_to_tsquery('german', :query)
|
||||
ORDER BY ts_rank(d.search_vector, websearch_to_tsquery('german', :query)) DESC,
|
||||
d.meta_date DESC NULLS LAST
|
||||
""")
|
||||
List<UUID> findRankedIdsByFts(@Param("query") String query);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
-- ─── Full-Text Search: search_vector on documents ──────────────────────────────
|
||||
-- Adds a tsvector column that aggregates: title (A), summary + transcription
|
||||
-- block text (B), sender/receiver names (C), tag names + location (D).
|
||||
-- The column is maintained by DB triggers so the OCR pipeline (which writes
|
||||
-- transcription_blocks directly) stays in sync without JPA @PreUpdate hooks.
|
||||
|
||||
-- 1. Storage and lookup structure.
--    search_vector holds the precomputed tsvector for a document. The column
--    is added without a default, so it is NULL until a trigger fills it.
ALTER TABLE documents
    ADD COLUMN search_vector tsvector;

--    GIN index over the vector so that "search_vector @@ <tsquery>" matches
--    can be answered from the index.
CREATE INDEX idx_documents_search
    ON documents
    USING GIN (search_vector);
|
||||
|
||||
-- 2. Trigger function: recomputes documents.search_vector for the row being
--    written. It is meant to run BEFORE INSERT/UPDATE, so assigning
--    NEW.search_vector is sufficient and no second UPDATE is required.
--
--    Weights:
--      A = title
--      B = summary, text of all transcription blocks of the document
--      C = sender name, names of all receivers
--      D = tag names, meta_location
--
--    Person names are assembled with concat_ws(), which skips NULL parts.
--    With the "||" operator a NULL last_name turned the whole expression into
--    NULL, so the person's first name was silently left out of the vector.
CREATE OR REPLACE FUNCTION fn_documents_fts_update() RETURNS trigger AS $$
DECLARE
    v_blocks    text;
    v_sender    text;
    v_receivers text;
    v_tags      text;
BEGIN
    -- string_agg() ignores NULL inputs and yields NULL when there are no rows;
    -- the coalesce() calls below turn that into an empty string.
    SELECT string_agg(tb.text, ' ')
      INTO v_blocks
      FROM transcription_blocks tb
     WHERE tb.document_id = NEW.id;

    SELECT concat_ws(' ', p.first_name, p.last_name)
      INTO v_sender
      FROM persons p
     WHERE p.id = NEW.sender_id;

    SELECT string_agg(concat_ws(' ', p.first_name, p.last_name), ' ')
      INTO v_receivers
      FROM document_receivers dr
      JOIN persons p ON p.id = dr.person_id
     WHERE dr.document_id = NEW.id;

    SELECT string_agg(t.name, ' ')
      INTO v_tags
      FROM document_tags dt
      JOIN tag t ON t.id = dt.tag_id
     WHERE dt.document_id = NEW.id;

    NEW.search_vector :=
        setweight(to_tsvector('german', coalesce(NEW.title, '')),         'A') ||
        setweight(to_tsvector('german', coalesce(NEW.summary, '')),       'B') ||
        setweight(to_tsvector('german', coalesce(v_blocks, '')),          'B') ||
        setweight(to_tsvector('german', coalesce(v_sender, '')),          'C') ||
        setweight(to_tsvector('german', coalesce(v_receivers, '')),       'C') ||
        setweight(to_tsvector('german', coalesce(v_tags, '')),            'D') ||
        setweight(to_tsvector('german', coalesce(NEW.meta_location, '')), 'D');

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Recompute search_vector on every direct write to a documents row.
CREATE TRIGGER trg_documents_fts
    BEFORE INSERT OR UPDATE ON documents
    FOR EACH ROW
    EXECUTE FUNCTION fn_documents_fts_update();

-- Backfill: rows that existed before this migration still have
-- search_vector = NULL and would not match any search until something
-- happened to update them. A no-op UPDATE sends every existing row through
-- the trigger above exactly once.
-- NOTE(review): this also fires any other UPDATE triggers defined on
-- documents — confirm that is acceptable for this table.
UPDATE documents SET title = title;
|
||||
|
||||
-- 3. Rebuild trigger function for transcription_blocks and the join tables.
--    These tables have no search_vector of their own. Instead they touch the
--    parent documents row ("SET title = title"), which re-fires the BEFORE
--    UPDATE trigger on documents; that trigger recomputes the vector from the
--    current state of all joined tables.
--
--    On UPDATE both the old and the new parent are touched: if a row is moved
--    to another document, the document it left must drop the text as well.
--    When document_id did not change, the IN list collapses to one id and the
--    parent is touched once.
--
--    Returns NULL because the return value of an AFTER row trigger is ignored.
CREATE OR REPLACE FUNCTION fn_rebuild_document_fts() RETURNS trigger AS $$
BEGIN
    IF TG_OP = 'INSERT' THEN
        UPDATE documents SET title = title WHERE id = NEW.document_id;
    ELSIF TG_OP = 'DELETE' THEN
        UPDATE documents SET title = title WHERE id = OLD.document_id;
    ELSE
        -- UPDATE: refresh the previous and the current parent document.
        UPDATE documents SET title = title
         WHERE id IN (OLD.document_id, NEW.document_id);
    END IF;
    RETURN NULL;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Child-table triggers: any row change in these tables refreshes the parent
-- document's search_vector via fn_rebuild_document_fts().
CREATE TRIGGER trg_transcription_blocks_fts
    AFTER INSERT OR UPDATE OR DELETE ON transcription_blocks
    FOR EACH ROW
    EXECUTE FUNCTION fn_rebuild_document_fts();

CREATE TRIGGER trg_document_receivers_fts
    AFTER INSERT OR DELETE ON document_receivers
    FOR EACH ROW
    EXECUTE FUNCTION fn_rebuild_document_fts();

CREATE TRIGGER trg_document_tags_fts
    AFTER INSERT OR DELETE ON document_tags
    FOR EACH ROW
    EXECUTE FUNCTION fn_rebuild_document_fts();

-- Renames: person and tag names are copied into documents.search_vector, so
-- changing a name must refresh every document that references the renamed row.
-- Without these triggers a renamed person or tag stays searchable only under
-- its old name until the document itself is written again.

-- Touches every document that has the updated person as sender or receiver.
CREATE OR REPLACE FUNCTION fn_rebuild_document_fts_for_person() RETURNS trigger AS $$
BEGIN
    UPDATE documents d
       SET title = d.title
     WHERE d.sender_id = NEW.id
        OR EXISTS (
               SELECT 1
                 FROM document_receivers dr
                WHERE dr.document_id = d.id
                  AND dr.person_id = NEW.id
           );
    RETURN NULL;
END;
$$ LANGUAGE plpgsql;

-- The WHEN clause skips updates that list the columns without changing them.
CREATE TRIGGER trg_persons_fts
    AFTER UPDATE OF first_name, last_name ON persons
    FOR EACH ROW
    WHEN (OLD.first_name IS DISTINCT FROM NEW.first_name
          OR OLD.last_name IS DISTINCT FROM NEW.last_name)
    EXECUTE FUNCTION fn_rebuild_document_fts_for_person();

-- Touches every document that carries the updated tag.
CREATE OR REPLACE FUNCTION fn_rebuild_document_fts_for_tag() RETURNS trigger AS $$
BEGIN
    UPDATE documents d
       SET title = d.title
     WHERE EXISTS (
               SELECT 1
                 FROM document_tags dt
                WHERE dt.document_id = d.id
                  AND dt.tag_id = NEW.id
           );
    RETURN NULL;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER trg_tag_fts
    AFTER UPDATE OF name ON tag
    FOR EACH ROW
    WHEN (OLD.name IS DISTINCT FROM NEW.name)
    EXECUTE FUNCTION fn_rebuild_document_fts_for_tag();
|
||||
@@ -0,0 +1,244 @@
|
||||
package org.raddatz.familienarchiv.repository;
|
||||
|
||||
import jakarta.persistence.EntityManager;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.raddatz.familienarchiv.PostgresContainerConfig;
|
||||
import org.raddatz.familienarchiv.config.FlywayConfig;
|
||||
import org.raddatz.familienarchiv.model.Document;
|
||||
import org.raddatz.familienarchiv.model.DocumentAnnotation;
|
||||
import org.raddatz.familienarchiv.model.DocumentStatus;
|
||||
import org.raddatz.familienarchiv.model.Person;
|
||||
import org.raddatz.familienarchiv.model.Tag;
|
||||
import org.raddatz.familienarchiv.model.TranscriptionBlock;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
|
||||
import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
|
||||
import org.springframework.context.annotation.Import;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.assertj.core.api.Assertions.assertThatNoException;
|
||||
|
||||
@DataJpaTest
|
||||
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
|
||||
@Import({PostgresContainerConfig.class, FlywayConfig.class})
|
||||
class DocumentFtsTest {
|
||||
|
||||
@Autowired DocumentRepository documentRepository;
|
||||
@Autowired PersonRepository personRepository;
|
||||
@Autowired TagRepository tagRepository;
|
||||
@Autowired AnnotationRepository annotationRepository;
|
||||
@Autowired TranscriptionBlockRepository blockRepository;
|
||||
@Autowired EntityManager em;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
blockRepository.deleteAll();
|
||||
documentRepository.deleteAll();
|
||||
personRepository.deleteAll();
|
||||
tagRepository.deleteAll();
|
||||
}
|
||||
|
||||
// ─── Guard ─────────────────────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void german_text_search_config_is_available() {
|
||||
Number count = (Number) em
|
||||
.createNativeQuery("SELECT count(*) FROM pg_ts_config WHERE cfgname = 'german'")
|
||||
.getSingleResult();
|
||||
assertThat(count.longValue()).isEqualTo(1L);
|
||||
}
|
||||
|
||||
// ─── Basic FTS ─────────────────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void should_find_document_by_exact_title_word() {
|
||||
documentRepository.saveAndFlush(document("Alter Brief"));
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
|
||||
|
||||
assertThat(ids).hasSize(1);
|
||||
}
|
||||
|
||||
@Test
|
||||
void should_find_document_by_stemmed_inflected_form() {
|
||||
documentRepository.saveAndFlush(document("Alter Brief"));
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("Briefe");
|
||||
|
||||
assertThat(ids).hasSize(1);
|
||||
}
|
||||
|
||||
@Test
|
||||
void should_not_find_document_when_term_absent() {
|
||||
documentRepository.saveAndFlush(document("Familienfoto"));
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("Brief");
|
||||
|
||||
assertThat(ids).isEmpty();
|
||||
}
|
||||
|
||||
// ─── Transcription blocks ───────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void should_find_document_by_transcription_block_text() {
|
||||
Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
|
||||
UUID annotationId = annotation(doc.getId());
|
||||
|
||||
blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Liebe Anna ich schreibe dir aus dem Krieg", 0));
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("schreiben");
|
||||
|
||||
assertThat(ids).contains(doc.getId());
|
||||
}
|
||||
|
||||
@Test
|
||||
void should_rebuild_vector_when_transcription_block_inserted_after_document() {
|
||||
Document doc = documentRepository.saveAndFlush(document("Leeres Dokument"));
|
||||
em.clear();
|
||||
|
||||
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).isEmpty();
|
||||
|
||||
UUID annotationId = annotation(doc.getId());
|
||||
blockRepository.saveAndFlush(block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
|
||||
}
|
||||
|
||||
@Test
|
||||
void should_rebuild_vector_when_transcription_block_deleted() {
|
||||
Document doc = documentRepository.saveAndFlush(document("Dokument mit Block"));
|
||||
UUID annotationId = annotation(doc.getId());
|
||||
TranscriptionBlock block = blockRepository.saveAndFlush(
|
||||
block(doc.getId(), annotationId, "Grundbuch Eintrag 1923", 0));
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).contains(doc.getId());
|
||||
|
||||
blockRepository.deleteById(block.getId());
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
assertThat(documentRepository.findRankedIdsByFts("Grundbuch")).doesNotContain(doc.getId());
|
||||
}
|
||||
|
||||
// ─── Ranking ───────────────────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void should_rank_title_match_above_transcription_match() {
|
||||
// docA: "Grundbuch" only in title (weight A)
|
||||
// docB: "Grundbuch" only in transcription block (weight B)
|
||||
Document docA = documentRepository.saveAndFlush(document("Grundbuch 1923"));
|
||||
Document docB = documentRepository.saveAndFlush(document("Anderes Dokument"));
|
||||
UUID annotationId = annotation(docB.getId());
|
||||
blockRepository.saveAndFlush(block(docB.getId(), annotationId, "Grundbuch steht darin", 0));
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("Grundbuch");
|
||||
|
||||
assertThat(ids).hasSize(2);
|
||||
assertThat(ids.get(0)).isEqualTo(docA.getId());
|
||||
}
|
||||
|
||||
// ─── Edge cases ────────────────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void should_return_empty_when_query_contains_only_stop_words() {
|
||||
documentRepository.saveAndFlush(document("Ein Brief von der Oma"));
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("der die das und");
|
||||
|
||||
assertThat(ids).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
void should_not_throw_when_query_contains_invalid_tsquery_syntax() {
|
||||
documentRepository.saveAndFlush(document("Brief"));
|
||||
em.clear();
|
||||
|
||||
assertThatNoException().isThrownBy(() -> documentRepository.findRankedIdsByFts("((("));
|
||||
}
|
||||
|
||||
// ─── Weight C: sender/receiver names ───────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void should_find_document_by_receiver_name() {
|
||||
Person receiver = personRepository.saveAndFlush(
|
||||
Person.builder().firstName("Anna").lastName("Schmidt").build());
|
||||
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||
.title("Brief")
|
||||
.originalFilename("brief.pdf")
|
||||
.status(DocumentStatus.UPLOADED)
|
||||
.receivers(Set.of(receiver))
|
||||
.build());
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("Schmidt");
|
||||
|
||||
assertThat(ids).contains(doc.getId());
|
||||
}
|
||||
|
||||
// ─── Weight D: tag names ───────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void should_find_document_by_tag_name() {
|
||||
Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
|
||||
documentRepository.saveAndFlush(Document.builder()
|
||||
.title("Dokument")
|
||||
.originalFilename("dokument.pdf")
|
||||
.status(DocumentStatus.UPLOADED)
|
||||
.tags(Set.of(tag))
|
||||
.build());
|
||||
em.flush();
|
||||
em.clear();
|
||||
|
||||
List<UUID> ids = documentRepository.findRankedIdsByFts("Familiengeschichte");
|
||||
|
||||
assertThat(ids).hasSize(1);
|
||||
}
|
||||
|
||||
// ─── Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
private Document document(String title) {
|
||||
return Document.builder()
|
||||
.title(title)
|
||||
.originalFilename(title.replace(" ", "_") + ".pdf")
|
||||
.status(DocumentStatus.UPLOADED)
|
||||
.build();
|
||||
}
|
||||
|
||||
private UUID annotation(UUID documentId) {
|
||||
DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
|
||||
.documentId(documentId)
|
||||
.pageNumber(1)
|
||||
.x(0.1).y(0.2).width(0.3).height(0.4)
|
||||
.color("#00C7B1")
|
||||
.build());
|
||||
em.flush();
|
||||
return ann.getId();
|
||||
}
|
||||
|
||||
private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
|
||||
return TranscriptionBlock.builder()
|
||||
.documentId(documentId)
|
||||
.annotationId(annotationId)
|
||||
.text(text)
|
||||
.sortOrder(order)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user