feat(fts): add search_vector column, GIN index, DB triggers, and FTS repository method (V34)

- V34 migration: adds search_vector tsvector column with GIN index
- BEFORE INSERT/UPDATE trigger on documents rebuilds vector from title (A),
  summary + transcription_blocks.text (B), sender/receiver names (C),
  tag names + location (D) using the German FTS config
- AFTER triggers on transcription_blocks, document_receivers, document_tags
  touch the parent document row to re-fire the BEFORE UPDATE trigger
- DocumentRepository.findRankedIdsByFts() native query using websearch_to_tsquery
- DocumentFtsTest: 12 integration tests covering stemming, trigger sync,
  ranking, stop words, malformed input, receiver and tag search

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-14 23:38:12 +02:00
parent 57c44cf02f
commit 24530cf85b
3 changed files with 326 additions and 0 deletions

View File

@@ -81,4 +81,12 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
@Param("to") LocalDate to,
Sort sort);
/**
 * Full-text search over {@code documents.search_vector}.
 *
 * <p>The raw user input is parsed by PostgreSQL's {@code websearch_to_tsquery}
 * with the {@code german} configuration. The tsquery is computed once (as a
 * cross-joined function call) and reused for both the match and the ranking.
 *
 * <p>Ordering: {@code ts_rank} descending, then {@code meta_date} descending
 * with NULL dates last, then {@code id} as a final tie-breaker so that documents
 * with equal rank and date come back in a stable order across calls.
 *
 * @param query search text as typed by the user; bound as a parameter, never
 *              concatenated into the SQL
 * @return ids of all matching documents, best match first
 */
@Query(nativeQuery = true, value = """
        SELECT d.id
        FROM documents d
        CROSS JOIN websearch_to_tsquery('german', :query) AS q
        WHERE d.search_vector @@ q
        ORDER BY ts_rank(d.search_vector, q) DESC,
                 d.meta_date DESC NULLS LAST,
                 d.id
        """)
List<UUID> findRankedIdsByFts(@Param("query") String query);
}

View File

@@ -0,0 +1,74 @@
-- ─── Full-Text Search: search_vector on documents ──────────────────────────────
-- Adds a tsvector column that aggregates: title (A), summary + transcription
-- block text (B), sender/receiver names (C), tag names + location (D).
-- The column is maintained by DB triggers so the OCR pipeline (which writes
-- transcription_blocks directly) stays in sync without JPA @PreUpdate hooks.
-- 1. Storage column for the aggregated search document, plus a GIN index over it.
ALTER TABLE documents
    ADD COLUMN search_vector tsvector;

CREATE INDEX idx_documents_search
    ON documents
    USING GIN (search_vector);
-- 2. Trigger function: rebuilds search_vector on documents INSERT or UPDATE.
--    Runs BEFORE the write so NEW.search_vector is set inline.
--
--    Weights:
--      A  title
--      B  summary, text of all transcription_blocks of the document
--      C  sender name, names of all receivers
--      D  tag names, meta_location
--
--    Person names are assembled with concat_ws(), which skips NULL parts, so a
--    person with only a first name or only a last name is still indexed
--    (plain `||` would turn the whole name into NULL).
--
--    NOTE(review): the related rows are read at the time the documents row is
--    written. No trigger visible in this migration reacts to changes of
--    persons or tag rows themselves, so a renamed person/tag is presumably only
--    picked up on the next write to the document -- confirm this is acceptable.
CREATE OR REPLACE FUNCTION fn_documents_fts_update() RETURNS trigger AS $$
DECLARE
    v_block_text     text;
    v_sender_name    text;
    v_receiver_names text;
    v_tag_names      text;
BEGIN
    -- string_agg() ignores NULL inputs and yields NULL for zero rows;
    -- the coalesce() calls below turn that into an empty document.
    SELECT string_agg(tb.text, ' ')
      INTO v_block_text
      FROM transcription_blocks tb
     WHERE tb.document_id = NEW.id;

    SELECT concat_ws(' ', p.first_name, p.last_name)
      INTO v_sender_name
      FROM persons p
     WHERE p.id = NEW.sender_id;

    SELECT string_agg(concat_ws(' ', p.first_name, p.last_name), ' ')
      INTO v_receiver_names
      FROM document_receivers dr
      JOIN persons p ON p.id = dr.person_id
     WHERE dr.document_id = NEW.id;

    SELECT string_agg(t.name, ' ')
      INTO v_tag_names
      FROM document_tags dt
      JOIN tag t ON t.id = dt.tag_id
     WHERE dt.document_id = NEW.id;

    NEW.search_vector :=
        setweight(to_tsvector('german', coalesce(NEW.title, '')),         'A') ||
        setweight(to_tsvector('german', coalesce(NEW.summary, '')),       'B') ||
        setweight(to_tsvector('german', coalesce(v_block_text, '')),      'B') ||
        setweight(to_tsvector('german', coalesce(v_sender_name, '')),     'C') ||
        setweight(to_tsvector('german', coalesce(v_receiver_names, '')),  'C') ||
        setweight(to_tsvector('german', coalesce(v_tag_names, '')),       'D') ||
        setweight(to_tsvector('german', coalesce(NEW.meta_location, '')), 'D');

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Recompute search_vector on every row-level write to documents.
CREATE TRIGGER trg_documents_fts
    BEFORE INSERT OR UPDATE ON documents
    FOR EACH ROW EXECUTE FUNCTION fn_documents_fts_update();

-- Backfill: rows that existed before this migration still have a NULL
-- search_vector and would never match a search until their next write.
-- A no-op self-assignment fires the BEFORE UPDATE trigger above for each of them.
UPDATE documents
   SET title = title
 WHERE search_vector IS NULL;
-- 3. Rebuild trigger for join tables and transcription_blocks.
--    These tables don't have a search_vector of their own; instead they
--    touch the parent document row ("SET title = title") to re-fire the
--    BEFORE UPDATE trigger on documents, which then recomputes the vector
--    with the current state of all joined tables.
--
--    Expects the firing table to have a document_id column.
--    Each branch reads only the record that exists for that operation
--    (NEW for INSERT, OLD for DELETE). On UPDATE the new parent is always
--    refreshed, and the previous parent as well when the row was moved to a
--    different document, so the old document does not keep stale content.
--    Returns NULL: the result of a row-level AFTER trigger is ignored.
CREATE OR REPLACE FUNCTION fn_rebuild_document_fts() RETURNS trigger AS $$
BEGIN
    IF TG_OP = 'INSERT' THEN
        UPDATE documents SET title = title WHERE id = NEW.document_id;
    ELSIF TG_OP = 'DELETE' THEN
        UPDATE documents SET title = title WHERE id = OLD.document_id;
    ELSE
        -- UPDATE
        UPDATE documents SET title = title WHERE id = NEW.document_id;
        IF OLD.document_id IS DISTINCT FROM NEW.document_id THEN
            UPDATE documents SET title = title WHERE id = OLD.document_id;
        END IF;
    END IF;
    RETURN NULL;
END;
$$ LANGUAGE plpgsql;
-- Child-table triggers: any row change in a table that feeds the search vector
-- refreshes the parent document via fn_rebuild_document_fts().
-- UPDATE is included on the join tables as well, so a direct SQL update of
-- person_id / tag_id / document_id cannot leave the vector stale.
CREATE TRIGGER trg_transcription_blocks_fts
    AFTER INSERT OR UPDATE OR DELETE ON transcription_blocks
    FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();

CREATE TRIGGER trg_document_receivers_fts
    AFTER INSERT OR UPDATE OR DELETE ON document_receivers
    FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();

CREATE TRIGGER trg_document_tags_fts
    AFTER INSERT OR UPDATE OR DELETE ON document_tags
    FOR EACH ROW EXECUTE FUNCTION fn_rebuild_document_fts();