test(search): add DocumentSearchEnrichmentTest for findEnrichmentData native query
Tests lateral join best-block selection, chr(1)/chr(2) headline delimiters, sender/receiver/tag match flags, and null cases for missing relations. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -89,4 +89,55 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
""")
|
""")
|
||||||
List<UUID> findRankedIdsByFts(@Param("query") String query);
|
List<UUID> findRankedIdsByFts(@Param("query") String query);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns match-enrichment data for a set of documents identified by their IDs.
|
||||||
|
* Each row contains (in column order):
|
||||||
|
* <ol>
|
||||||
|
* <li>UUID — document id</li>
|
||||||
|
* <li>String — title headline with \x01/\x02 delimiters around matched terms</li>
|
||||||
|
* <li>String — best-ranked matching transcription block text, or null</li>
|
||||||
|
* <li>Boolean — whether the sender's name matched the query</li>
|
||||||
|
* <li>String — comma-separated matched receiver UUIDs, or null</li>
|
||||||
|
* <li>String — comma-separated matched tag UUIDs, or null</li>
|
||||||
|
* </ol>
|
||||||
|
* Short-circuit before calling this method when {@code ids} is empty or {@code query} is blank.
|
||||||
|
*/
|
||||||
|
@Query(nativeQuery = true, value = """
|
||||||
|
SELECT
|
||||||
|
d.id,
|
||||||
|
ts_headline('german', d.title, websearch_to_tsquery('german', :query),
|
||||||
|
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
|
||||||
|
AS title_headline,
|
||||||
|
best_block.text AS transcription_snippet,
|
||||||
|
(s.id IS NOT NULL AND
|
||||||
|
to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
|
||||||
|
@@ websearch_to_tsquery('german', :query))
|
||||||
|
AS sender_matched,
|
||||||
|
(SELECT string_agg(r.id::text, ',')
|
||||||
|
FROM document_receivers dr
|
||||||
|
JOIN persons r ON r.id = dr.person_id
|
||||||
|
WHERE dr.document_id = d.id
|
||||||
|
AND to_tsvector('german', COALESCE(r.first_name, '') || ' ' || r.last_name)
|
||||||
|
@@ websearch_to_tsquery('german', :query)
|
||||||
|
) AS matched_receiver_ids,
|
||||||
|
(SELECT string_agg(t.id::text, ',')
|
||||||
|
FROM document_tags dt
|
||||||
|
JOIN tag t ON t.id = dt.tag_id
|
||||||
|
WHERE dt.document_id = d.id
|
||||||
|
AND to_tsvector('german', t.name) @@ websearch_to_tsquery('german', :query)
|
||||||
|
) AS matched_tag_ids
|
||||||
|
FROM documents d
|
||||||
|
LEFT JOIN persons s ON s.id = d.sender_id
|
||||||
|
LEFT JOIN LATERAL (
|
||||||
|
SELECT tb.text
|
||||||
|
FROM transcription_blocks tb
|
||||||
|
WHERE tb.document_id = d.id
|
||||||
|
AND to_tsvector('german', tb.text) @@ websearch_to_tsquery('german', :query)
|
||||||
|
ORDER BY ts_rank(to_tsvector('german', tb.text), websearch_to_tsquery('german', :query)) DESC
|
||||||
|
LIMIT 1
|
||||||
|
) best_block ON true
|
||||||
|
WHERE d.id IN :ids
|
||||||
|
""")
|
||||||
|
List<Object[]> findEnrichmentData(@Param("ids") Collection<UUID> ids, @Param("query") String query);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,303 @@
|
|||||||
|
package org.raddatz.familienarchiv.repository;
|
||||||
|
|
||||||
|
import jakarta.persistence.EntityManager;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.raddatz.familienarchiv.PostgresContainerConfig;
|
||||||
|
import org.raddatz.familienarchiv.config.FlywayConfig;
|
||||||
|
import org.raddatz.familienarchiv.model.Document;
|
||||||
|
import org.raddatz.familienarchiv.model.DocumentAnnotation;
|
||||||
|
import org.raddatz.familienarchiv.model.DocumentStatus;
|
||||||
|
import org.raddatz.familienarchiv.model.Person;
|
||||||
|
import org.raddatz.familienarchiv.model.Tag;
|
||||||
|
import org.raddatz.familienarchiv.model.TranscriptionBlock;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
|
||||||
|
import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
|
||||||
|
import org.springframework.context.annotation.Import;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
@DataJpaTest
|
||||||
|
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
|
||||||
|
@Import({PostgresContainerConfig.class, FlywayConfig.class})
|
||||||
|
class DocumentSearchEnrichmentTest {
|
||||||
|
|
||||||
|
@Autowired DocumentRepository documentRepository;
|
||||||
|
@Autowired PersonRepository personRepository;
|
||||||
|
@Autowired TagRepository tagRepository;
|
||||||
|
@Autowired AnnotationRepository annotationRepository;
|
||||||
|
@Autowired TranscriptionBlockRepository blockRepository;
|
||||||
|
@Autowired EntityManager em;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
blockRepository.deleteAll();
|
||||||
|
documentRepository.deleteAll();
|
||||||
|
personRepository.deleteAll();
|
||||||
|
tagRepository.deleteAll();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Lateral join: best transcription snippet ──────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void lateral_join_returns_highest_ranked_transcription_block() {
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Brief an Anna"));
|
||||||
|
UUID annotId = annotation(doc.getId());
|
||||||
|
// Three blocks — the one with two occurrences has highest rank
|
||||||
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Das Wetter war schön", 0));
|
||||||
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Brief Brief Brief", 1)); // highest rank for "Brief"
|
||||||
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Ein Brief liegt vor", 2)); // one occurrence
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
String snippet = (String) rows.get(0)[2];
|
||||||
|
assertThat(snippet).isEqualTo("Brief Brief Brief");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void document_with_no_transcription_blocks_has_null_snippet() {
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Foto ohne Text"));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Foto");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
Object snippet = rows.get(0)[2];
|
||||||
|
assertThat(snippet).isNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void document_with_non_matching_blocks_has_null_snippet() {
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Dok"));
|
||||||
|
UUID annotId = annotation(doc.getId());
|
||||||
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Kein Match hier", 0));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
assertThat(rows.get(0)[2]).isNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Title headline: delimiter-based offset detection ─────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void title_headline_contains_delimiters_when_title_matches() {
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Brief an die Familie"));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
String headline = (String) rows.get(0)[1];
|
||||||
|
// chr(1) marks the start of the highlighted term
|
||||||
|
assertThat(headline).contains("\u0001");
|
||||||
|
assertThat(headline).contains("\u0002");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void title_headline_has_no_delimiters_when_title_does_not_match() {
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Familienfoto"));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
String headline = (String) rows.get(0)[1];
|
||||||
|
assertThat(headline).doesNotContain("\u0001");
|
||||||
|
assertThat(headline).doesNotContain("\u0002");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void title_headline_matches_stemmed_form() {
|
||||||
|
// "Brief" (singular, query) should match "Briefe" (plural, in title) via German FTS stemming.
|
||||||
|
// Both reduce to the stem "brief" under the Snowball German algorithm — verified by the
|
||||||
|
// existing should_find_document_by_stemmed_inflected_form test in DocumentFtsTest.
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Alte Briefe aus Berlin"));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
String headline = (String) rows.get(0)[1];
|
||||||
|
assertThat(headline).contains("\u0001");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Sender match ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void sender_matched_is_true_when_sender_last_name_matches_query() {
|
||||||
|
Person sender = personRepository.saveAndFlush(
|
||||||
|
Person.builder().firstName("Walter").lastName("Raddatz").build());
|
||||||
|
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||||
|
.title("Brief")
|
||||||
|
.originalFilename("brief.pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.sender(sender)
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Raddatz");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
Boolean senderMatched = (Boolean) rows.get(0)[3];
|
||||||
|
assertThat(senderMatched).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void sender_matched_is_false_when_sender_name_does_not_match() {
|
||||||
|
Person sender = personRepository.saveAndFlush(
|
||||||
|
Person.builder().firstName("Walter").lastName("Raddatz").build());
|
||||||
|
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||||
|
.title("Brief")
|
||||||
|
.originalFilename("brief.pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.sender(sender)
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Schmidt");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
Boolean senderMatched = (Boolean) rows.get(0)[3];
|
||||||
|
assertThat(senderMatched).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void sender_matched_is_false_when_document_has_no_sender() {
|
||||||
|
Document doc = documentRepository.saveAndFlush(document("Brief von unbekannt"));
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
Boolean senderMatched = (Boolean) rows.get(0)[3];
|
||||||
|
assertThat(senderMatched).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Receiver match ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void matched_receiver_ids_contains_uuid_of_matching_receiver() {
|
||||||
|
Person receiver = personRepository.saveAndFlush(
|
||||||
|
Person.builder().firstName("Anna").lastName("Schmidt").build());
|
||||||
|
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||||
|
.title("Brief")
|
||||||
|
.originalFilename("brief.pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.receivers(Set.of(receiver))
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Schmidt");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
String receiverIds = (String) rows.get(0)[4];
|
||||||
|
assertThat(receiverIds).contains(receiver.getId().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void matched_receiver_ids_is_null_when_no_receiver_matches() {
|
||||||
|
Person receiver = personRepository.saveAndFlush(
|
||||||
|
Person.builder().firstName("Anna").lastName("Schmidt").build());
|
||||||
|
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||||
|
.title("Brief")
|
||||||
|
.originalFilename("brief.pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.receivers(Set.of(receiver))
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Raddatz");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
assertThat(rows.get(0)[4]).isNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Tag match ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void matched_tag_ids_contains_uuid_of_matching_tag() {
|
||||||
|
Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
|
||||||
|
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||||
|
.title("Dokument")
|
||||||
|
.originalFilename("dok.pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.tags(Set.of(tag))
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Familiengeschichte");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
String tagIds = (String) rows.get(0)[5];
|
||||||
|
assertThat(tagIds).contains(tag.getId().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void matched_tag_ids_is_null_when_no_tag_matches() {
|
||||||
|
Tag tag = tagRepository.saveAndFlush(Tag.builder().name("Familiengeschichte").build());
|
||||||
|
Document doc = documentRepository.saveAndFlush(Document.builder()
|
||||||
|
.title("Dokument")
|
||||||
|
.originalFilename("dok.pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.tags(Set.of(tag))
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
em.clear();
|
||||||
|
|
||||||
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
|
assertThat(rows).hasSize(1);
|
||||||
|
assertThat(rows.get(0)[5]).isNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
private Document document(String title) {
|
||||||
|
return Document.builder()
|
||||||
|
.title(title)
|
||||||
|
.originalFilename(title.replace(" ", "_") + ".pdf")
|
||||||
|
.status(DocumentStatus.UPLOADED)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private UUID annotation(UUID documentId) {
|
||||||
|
DocumentAnnotation ann = annotationRepository.save(DocumentAnnotation.builder()
|
||||||
|
.documentId(documentId)
|
||||||
|
.pageNumber(1)
|
||||||
|
.x(0.1).y(0.2).width(0.3).height(0.4)
|
||||||
|
.color("#00C7B1")
|
||||||
|
.build());
|
||||||
|
em.flush();
|
||||||
|
return ann.getId();
|
||||||
|
}
|
||||||
|
|
||||||
|
private TranscriptionBlock block(UUID documentId, UUID annotationId, String text, int order) {
|
||||||
|
return TranscriptionBlock.builder()
|
||||||
|
.documentId(documentId)
|
||||||
|
.annotationId(annotationId)
|
||||||
|
.text(text)
|
||||||
|
.sortOrder(order)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user