From c23515107564ef6e1f1ddc3a2b4944f595ac6185 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Wed, 15 Apr 2026 17:40:47 +0200
Subject: [PATCH] test(search): add DocumentSearchEnrichmentTest for findEnrichmentData native query

Tests lateral join best-block selection, chr(1)/chr(2) headline delimiters,
sender/receiver/tag match flags, and null cases for missing relations.

Co-Authored-By: Claude Sonnet 4.6
---
 .../repository/DocumentRepository.java        |  51 +++
 .../DocumentSearchEnrichmentTest.java         | 303 ++++++++++++++++++
 2 files changed, 354 insertions(+)
 create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
index 3a183ded..f256025a 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
@@ -89,4 +89,55 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
             """)
     List<UUID> findRankedIdsByFts(@Param("query") String query);
 
+    /**
+     * Returns match-enrichment data for a set of documents identified by their IDs.
+     * Each row contains (in column order):
+     * <ol>
+     *   <li>UUID — document id</li>
+     *   <li>String — title headline with \x01/\x02 delimiters around matched terms</li>
+     *   <li>String — best-ranked matching transcription block text, or null</li>
+     *   <li>Boolean — whether the sender's name matched the query</li>
+     *   <li>String — comma-separated matched receiver UUIDs, or null</li>
+     *   <li>String — comma-separated matched tag UUIDs, or null</li>
+     * </ol>
+     * Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank.
+     */
+    @Query(nativeQuery = true, value = """
+            SELECT
+                d.id,
+                ts_headline('german', d.title, websearch_to_tsquery('german', :query),
+                    'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
+                    AS title_headline,
+                best_block.text AS transcription_snippet,
+                (s.id IS NOT NULL AND
+                    to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
+                    @@ websearch_to_tsquery('german', :query))
+                    AS sender_matched,
+                (SELECT string_agg(r.id::text, ',')
+                 FROM document_receivers dr
+                 JOIN persons r ON r.id = dr.person_id
+                 WHERE dr.document_id = d.id
+                   AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || COALESCE(r.last_name, ''))
+                       @@ websearch_to_tsquery('german', :query)
+                ) AS matched_receiver_ids,
+                (SELECT string_agg(t.id::text, ',')
+                 FROM document_tags dt
+                 JOIN tag t ON t.id = dt.tag_id
+                 WHERE dt.document_id = d.id
+                   AND to_tsvector('german', t.name) @@ websearch_to_tsquery('german', :query)
+                ) AS matched_tag_ids
+            FROM documents d
+            LEFT JOIN persons s ON s.id = d.sender_id
+            LEFT JOIN LATERAL (
+                SELECT tb.text
+                FROM transcription_blocks tb
+                WHERE tb.document_id = d.id
+                  AND to_tsvector('german', tb.text) @@ websearch_to_tsquery('german', :query)
+                ORDER BY ts_rank(to_tsvector('german', tb.text), websearch_to_tsquery('german', :query)) DESC
+                LIMIT 1
+            ) best_block ON true
+            WHERE d.id IN :ids
+            """)
+    List<Object[]> findEnrichmentData(@Param("ids") Collection<UUID> ids, @Param("query") String query);
+
 }
\ No newline at end of file
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java
new file mode 100644
index 00000000..4454c741
--- /dev/null
+++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java
@@ -0,0 +1,303 @@
+package org.raddatz.familienarchiv.repository;
+
+import jakarta.persistence.EntityManager;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.raddatz.familienarchiv.PostgresContainerConfig;
+import org.raddatz.familienarchiv.config.FlywayConfig;
+import org.raddatz.familienarchiv.model.Document;
+import org.raddatz.familienarchiv.model.DocumentAnnotation;
+import org.raddatz.familienarchiv.model.DocumentStatus;
+import org.raddatz.familienarchiv.model.Person;
+import org.raddatz.familienarchiv.model.Tag;
+import org.raddatz.familienarchiv.model.TranscriptionBlock;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
+import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
+import org.springframework.context.annotation.Import;
+
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+@DataJpaTest
+@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
+@Import({PostgresContainerConfig.class, FlywayConfig.class})
+class DocumentSearchEnrichmentTest {
+
+    @Autowired DocumentRepository documentRepository;
+    @Autowired PersonRepository personRepository;
+    @Autowired TagRepository tagRepository;
+    @Autowired AnnotationRepository annotationRepository;
+    @Autowired TranscriptionBlockRepository blockRepository;
+    @Autowired EntityManager em;
+
+    @BeforeEach
+    void setUp() {
+        blockRepository.deleteAll();
+        documentRepository.deleteAll();
+        personRepository.deleteAll();
+        tagRepository.deleteAll();
+    }
+
+    // ─── Lateral join: best transcription snippet ──────────────────────────────
+
+    @Test
+    void lateral_join_returns_highest_ranked_transcription_block() {
+        Document doc = documentRepository.saveAndFlush(document("Brief an Anna"));
+        UUID annotId = annotation(doc.getId());
+        // Three blocks — the one with two occurrences has highest rank
+        blockRepository.saveAndFlush(block(doc.getId(), annotId, "Das Wetter war schön", 0));
+        blockRepository.saveAndFlush(block(doc.getId(), annotId, "Brief Brief Brief", 1)); // highest rank for "Brief"
+        blockRepository.saveAndFlush(block(doc.getId(), annotId, "Ein Brief liegt vor", 2)); // one occurrence
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        String snippet = (String) rows.get(0)[2];
+        assertThat(snippet).isEqualTo("Brief Brief Brief");
+    }
+
+    @Test
+    void document_with_no_transcription_blocks_has_null_snippet() {
+        Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Foto");
+
+        assertThat(rows).hasSize(1);
+        Object snippet = rows.get(0)[2];
+        assertThat(snippet).isNull();
+    }
+
+    @Test
+    void document_with_non_matching_blocks_has_null_snippet() {
+        Document doc = documentRepository.saveAndFlush(document("Dok"));
+        UUID annotId = annotation(doc.getId());
+        blockRepository.saveAndFlush(block(doc.getId(), annotId, "Kein Match hier", 0));
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        assertThat(rows.get(0)[2]).isNull();
+    }
+
+    // ─── Title headline: delimiter-based offset detection ─────────────────────
+
+    @Test
+    void title_headline_contains_delimiters_when_title_matches() {
+        Document doc = documentRepository.saveAndFlush(document("Brief an die Familie"));
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        String headline = (String) rows.get(0)[1];
+        // chr(1) marks the start of the highlighted term
+        assertThat(headline).contains("\u0001");
+        assertThat(headline).contains("\u0002");
+    }
+
+    @Test
+    void title_headline_has_no_delimiters_when_title_does_not_match() {
+        Document doc = documentRepository.saveAndFlush(document("Familienfoto"));
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        String headline = (String) rows.get(0)[1];
+        assertThat(headline).doesNotContain("\u0001");
+        assertThat(headline).doesNotContain("\u0002");
+    }
+
+    @Test
+    void title_headline_matches_stemmed_form() {
+        // "Brief" (singular, query) should match "Briefe" (plural, in title) via German FTS stemming.
+        // Both reduce to the stem "brief" under the Snowball German algorithm — verified by the
+        // existing should_find_document_by_stemmed_inflected_form test in DocumentFtsTest.
+        Document doc = documentRepository.saveAndFlush(document("Alte Briefe aus Berlin"));
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        String headline = (String) rows.get(0)[1];
+        assertThat(headline).contains("\u0001");
+    }
+
+    // ─── Sender match ──────────────────────────────────────────────────────────
+
+    @Test
+    void sender_matched_is_true_when_sender_last_name_matches_query() {
+        Person sender = personRepository.saveAndFlush(
+                Person.builder().firstName("Walter").lastName("Raddatz").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .sender(sender)
+                .build());
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Raddatz");
+
+        assertThat(rows).hasSize(1);
+        Boolean senderMatched = (Boolean) rows.get(0)[3];
+        assertThat(senderMatched).isTrue();
+    }
+
+    @Test
+    void sender_matched_is_false_when_sender_name_does_not_match() {
+        Person sender = personRepository.saveAndFlush(
+                Person.builder().firstName("Walter").lastName("Raddatz").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .sender(sender)
+                .build());
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Schmidt");
+
+        assertThat(rows).hasSize(1);
+        Boolean senderMatched = (Boolean) rows.get(0)[3];
+        assertThat(senderMatched).isFalse();
+    }
+
+    @Test
+    void sender_matched_is_false_when_document_has_no_sender() {
+        Document doc = documentRepository.saveAndFlush(document("Brief von unbekannt"));
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        Boolean senderMatched = (Boolean) rows.get(0)[3];
+        assertThat(senderMatched).isFalse();
+    }
+
+    // ─── Receiver match ────────────────────────────────────────────────────────
+
+    @Test
+    void matched_receiver_ids_contains_uuid_of_matching_receiver() {
+        Person receiver = personRepository.saveAndFlush(
+                Person.builder().firstName("Anna").lastName("Schmidt").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .receivers(Set.of(receiver))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Schmidt");
+
+        assertThat(rows).hasSize(1);
+        String receiverIds = (String) rows.get(0)[4];
+        assertThat(receiverIds).contains(receiver.getId().toString());
+    }
+
+    @Test
+    void matched_receiver_ids_is_null_when_no_receiver_matches() {
+        Person receiver = personRepository.saveAndFlush(
+                Person.builder().firstName("Anna").lastName("Schmidt").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Brief")
+                .originalFilename("brief.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .receivers(Set.of(receiver))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Raddatz");
+
+        assertThat(rows).hasSize(1);
+        assertThat(rows.get(0)[4]).isNull();
+    }
+
+    // ─── Tag match ─────────────────────────────────────────────────────────────
+
+    @Test
+    void matched_tag_ids_contains_uuid_of_matching_tag() {
+        Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Dokument")
+                .originalFilename("dok.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .tags(Set.of(tag))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Familiengeschichte");
+
+        assertThat(rows).hasSize(1);
+        String tagIds = (String) rows.get(0)[5];
+        assertThat(tagIds).contains(tag.getId().toString());
+    }
+
+    @Test
+    void matched_tag_ids_is_null_when_no_tag_matches() {
+        Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
+        Document doc = documentRepository.saveAndFlush(Document.builder()
+                .title("Dokument")
+                .originalFilename("dok.pdf")
+                .status(DocumentStatus.UPLOADED)
+                .tags(Set.of(tag))
+                .build());
+        em.flush();
+        em.clear();
+
+        List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
+
+        assertThat(rows).hasSize(1);
+        assertThat(rows.get(0)[5]).isNull();
+    }
+
+    // ─── Helpers ───────────────────────────────────────────────────────────────
+
+    private Document document(String title) {
+        return Document.builder()
+                .title(title)
+                .originalFilename(title.replace(" ", "_") + ".pdf")
+                .status(DocumentStatus.UPLOADED)
+                .build();
+    }
+
+    private UUID annotation(UUID documentId) {
+        DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
+                .documentId(documentId)
+                .pageNumber(1)
+                .x(0.1).y(0.2).width(0.3).height(0.4)
+                .color("#00C7B1")
+                .build());
+        em.flush();
+        return ann.getId();
+    }
+
+    private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
+        return TranscriptionBlock.builder()
+                .documentId(documentId)
+                .annotationId(annotationId)
+                .text(text)
+                .sortOrder(order)
+                .build();
+    }
+}