diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/SearchMatchData.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/SearchMatchData.java index 7cd06d85..8f76de03 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/dto/SearchMatchData.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/SearchMatchData.java @@ -39,10 +39,17 @@ public record SearchMatchData( * IDs of tags whose names matched the query. */ @Schema(requiredMode = Schema.RequiredMode.REQUIRED) - List matchedTagIds + List matchedTagIds, + + /** + * Character offsets of highlighted terms within the transcription snippet. + * Empty when no transcription block matched or the snippet has no highlights. + */ + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + List snippetOffsets ) { /** Canonical "no match data" value for a single document. */ public static SearchMatchData empty() { - return new SearchMatchData(null, List.of(), false, List.of(), List.of()); + return new SearchMatchData(null, List.of(), false, List.of(), List.of(), List.of()); } } diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java index f256025a..3e4f7150 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java @@ -95,7 +95,7 @@ public interface DocumentRepository extends JpaRepository, JpaSp *
    *
  1. UUID — document id
  2. *
  3. String — title headline with \x01/\x02 delimiters around matched terms
  4. - *
  5. String — best-ranked matching transcription block text, or null
  6. + *
  7. String — best-ranked transcription snippet with \x01/\x02 delimiters, or null
  8. *
  9. Boolean — whether the sender's name matched the query
  10. *
  11. String — comma-separated matched receiver UUIDs, or null
  12. *
  13. String — comma-separated matched tag UUIDs, or null
  14. @@ -108,7 +108,10 @@ public interface DocumentRepository extends JpaRepository, JpaSp ts_headline('german', d.title, websearch_to_tsquery('german', :query), 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true') AS title_headline, - best_block.text AS transcription_snippet, + CASE WHEN best_block.text IS NOT NULL THEN + ts_headline('german', best_block.text, websearch_to_tsquery('german', :query), + 'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20') + END AS transcription_snippet, (s.id IS NOT NULL AND to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, '')) @@ websearch_to_tsquery('german', :query)) diff --git a/backend/src/test/java/org/raddatz/familienarchiv/controller/DocumentControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/controller/DocumentControllerTest.java index 816c0b9e..97286b68 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/controller/DocumentControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/controller/DocumentControllerTest.java @@ -125,7 +125,7 @@ class DocumentControllerTest { .status(DocumentStatus.UPLOADED) .build(); var matchData = new org.raddatz.familienarchiv.dto.SearchMatchData( - "Er schrieb einen langen Brief", List.of(), false, List.of(), List.of()); + "Er schrieb einen langen Brief", List.of(), false, List.of(), List.of(), List.of()); when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) .thenReturn(DocumentSearchResult.withMatchData(List.of(doc), Map.of(docId, matchData))); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/dto/DocumentSearchResultTest.java b/backend/src/test/java/org/raddatz/familienarchiv/dto/DocumentSearchResultTest.java index 7f2292f9..3bca81a2 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/dto/DocumentSearchResultTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/dto/DocumentSearchResultTest.java @@ -36,7 +36,7 @@ class DocumentSearchResultTest { @Test void withMatchData_exposes_match_data_map() { UUID id = UUID.randomUUID(); - SearchMatchData data = new SearchMatchData("snippet", List.of(), false, List.of(), List.of()); + SearchMatchData data = new SearchMatchData("snippet", List.of(), false, List.of(), List.of(), List.of()); DocumentSearchResult result = DocumentSearchResult.withMatchData(List.of(doc(id)), Map.of(id, data)); assertThat(result.matchData()).containsKey(id); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/dto/SearchMatchDataTest.java b/backend/src/test/java/org/raddatz/familienarchiv/dto/SearchMatchDataTest.java index 8135aafa..27050480 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/dto/SearchMatchDataTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/dto/SearchMatchDataTest.java @@ -10,7 +10,7 @@ class SearchMatchDataTest { @Test void transcription_snippet_is_nullable() { - SearchMatchData data = new SearchMatchData(null, List.of(), false, List.of(), List.of()); + SearchMatchData data = new SearchMatchData(null, List.of(), false, List.of(), List.of(), List.of()); assertThat(data.transcriptionSnippet()).isNull(); } @@ -34,6 +34,7 @@ class SearchMatchDataTest { List.of(offset), true, List.of(), + List.of(), List.of() ); @@ -41,4 +42,24 @@ class SearchMatchDataTest { assertThat(data.titleOffsets()).containsExactly(offset); assertThat(data.senderMatched()).isTrue(); } + + @Test + void snippet_offsets_are_empty_in_empty_factory() { + SearchMatchData data = SearchMatchData.empty(); + assertThat(data.snippetOffsets()).isEmpty(); + } + + @Test + void snippet_offsets_carry_through_constructor() { + MatchOffset offset = new MatchOffset(5, 3); + SearchMatchData data = new SearchMatchData( + "Das ist ein furchtbares Bild", + List.of(), + false, + List.of(), + List.of(), + List.of(offset) + ); + assertThat(data.snippetOffsets()).containsExactly(offset); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java index 4454c741..b0b69ad7 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/DocumentSearchEnrichmentTest.java @@ -4,6 +4,7 @@ import jakarta.persistence.EntityManager; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.raddatz.familienarchiv.service.DocumentService; import org.raddatz.familienarchiv.config.FlywayConfig; import org.raddatz.familienarchiv.model.Document; import org.raddatz.familienarchiv.model.DocumentAnnotation; @@ -48,7 +49,7 @@ class DocumentSearchEnrichmentTest { void lateral_join_returns_highest_ranked_transcription_block() { Document doc = documentRepository.saveAndFlush(document("Brief an Anna")); UUID annotId = annotation(doc.getId()); - // Three blocks — the one with two occurrences has highest rank + // Three blocks — the one with three occurrences has highest rank blockRepository.saveAndFlush(block(doc.getId(), annotId, "Das Wetter war schön", 0)); blockRepository.saveAndFlush(block(doc.getId(), annotId, "Brief Brief Brief", 1)); // highest rank for "Brief" blockRepository.saveAndFlush(block(doc.getId(), annotId, "Ein Brief liegt vor", 2)); // one occurrence @@ -58,8 +59,11 @@ class DocumentSearchEnrichmentTest { List rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief"); assertThat(rows).hasSize(1); - String snippet = (String) rows.get(0)[2]; - assertThat(snippet).isEqualTo("Brief Brief Brief"); + // row[2] is now a ts_headline result with sentinel chars — parse it for clean text + DocumentService.ParsedHighlight parsed = DocumentService.parseHighlight((String) rows.get(0)[2]); + assertThat(parsed).isNotNull(); + assertThat(parsed.cleanText()).isEqualTo("Brief Brief Brief"); + assertThat(parsed.offsets()).isNotEmpty(); // at least one "Brief" is highlighted } @Test diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java index 9d6d666b..e4baab52 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/DocumentServiceTest.java @@ -1379,7 +1379,9 @@ class DocumentServiceTest { void searchDocuments_withTextQuery_includesTranscriptionSnippetWhenPresent() { UUID docId = UUID.randomUUID(); Document doc = Document.builder().id(docId).title("Dok").build(); - List rows = Collections.singletonList(new Object[]{docId, "Dok", "Hier ist der Brief aus Berlin", false, null, null}); + // Simulate ts_headline output with sentinel markers around the matched word + String snippetHeadline = "Hier ist der \u0001Brief\u0002 aus Berlin"; + List rows = Collections.singletonList(new Object[]{docId, "Dok", snippetHeadline, false, null, null}); when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId)); when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class))) @@ -1391,5 +1393,39 @@ class DocumentServiceTest { SearchMatchData md = result.matchData().get(docId); assertThat(md.transcriptionSnippet()).isEqualTo("Hier ist der Brief aus Berlin"); + assertThat(md.snippetOffsets()).containsExactly(new MatchOffset(13, 5)); // "Brief" at pos 13 + } + + // ─── parseHighlight unit tests ──────────────────────────────────────────── + + @Test + void parseHighlight_returnsNull_whenInputIsNull() { + assertThat(DocumentService.parseHighlight(null)).isNull(); + } + + @Test + void parseHighlight_returnsCleanTextAndEmptyOffsets_whenNoSentinels() { + DocumentService.ParsedHighlight result = DocumentService.parseHighlight("plain text"); + assertThat(result.cleanText()).isEqualTo("plain text"); + assertThat(result.offsets()).isEmpty(); + } + + @Test + void parseHighlight_extractsOffsetAndStripsDelimiters() { + // \u0001 = start sentinel, \u0002 = stop sentinel + DocumentService.ParsedHighlight result = DocumentService.parseHighlight("Das \u0001furchtbare\u0002 Wort"); + assertThat(result.cleanText()).isEqualTo("Das furchtbare Wort"); + assertThat(result.offsets()).containsExactly(new MatchOffset(4, 10)); // "furchtbare" at pos 4, len 10 + } + + @Test + void parseHighlight_handlesMultipleHighlightedTerms() { + DocumentService.ParsedHighlight result = + DocumentService.parseHighlight("\u0001Hallo\u0002 und \u0001Welt\u0002"); + assertThat(result.cleanText()).isEqualTo("Hallo und Welt"); + assertThat(result.offsets()).containsExactly( + new MatchOffset(0, 5), // "Hallo" + new MatchOffset(10, 4) // "Welt" + ); } }