feat(search): add snippetOffsets to SearchMatchData and use ts_headline for highlighted snippets
- SearchMatchData gains a 6th field snippetOffsets: List<MatchOffset> so the frontend
can render highlighted terms inside the transcription snippet without {@html}.
- DocumentRepository.findEnrichmentData now calls ts_headline() with chr(1)/chr(2)
sentinels instead of returning raw block text; parseHighlight() strips the sentinels
and produces clean text + MatchOffset list in one pass.
- DocumentService exposes ParsedHighlight and parseHighlight() as public so they can be
called from cross-package integration tests.
- All related tests are updated to use the new 6-argument SearchMatchData constructor and
to call parseHighlight() when asserting the snippet's clean text and offsets.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -39,10 +39,17 @@ public record SearchMatchData(
|
|||||||
* IDs of tags whose names matched the query.
|
* IDs of tags whose names matched the query.
|
||||||
*/
|
*/
|
||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
List<UUID> matchedTagIds
|
List<UUID> matchedTagIds,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Character offsets of highlighted terms within the transcription snippet.
|
||||||
|
* Empty when no transcription block matched or the snippet has no highlights.
|
||||||
|
*/
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
List<MatchOffset> snippetOffsets
|
||||||
) {
|
) {
|
||||||
/** Canonical "no match data" value for a single document. */
|
/** Canonical "no match data" value for a single document. */
|
||||||
public static SearchMatchData empty() {
|
public static SearchMatchData empty() {
|
||||||
return new SearchMatchData(null, List.of(), false, List.of(), List.of());
|
return new SearchMatchData(null, List.of(), false, List.of(), List.of(), List.of());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
* <ol>
|
* <ol>
|
||||||
* <li>UUID — document id</li>
|
* <li>UUID — document id</li>
|
||||||
* <li>String — title headline with \x01/\x02 delimiters around matched terms</li>
|
* <li>String — title headline with \x01/\x02 delimiters around matched terms</li>
|
||||||
* <li>String — best-ranked matching transcription block text, or null</li>
|
* <li>String — best-ranked transcription snippet with \x01/\x02 delimiters, or null</li>
|
||||||
* <li>Boolean — whether the sender's name matched the query</li>
|
* <li>Boolean — whether the sender's name matched the query</li>
|
||||||
* <li>String — comma-separated matched receiver UUIDs, or null</li>
|
* <li>String — comma-separated matched receiver UUIDs, or null</li>
|
||||||
* <li>String — comma-separated matched tag UUIDs, or null</li>
|
* <li>String — comma-separated matched tag UUIDs, or null</li>
|
||||||
@@ -108,7 +108,10 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
|
|||||||
ts_headline('german', d.title, websearch_to_tsquery('german', :query),
|
ts_headline('german', d.title, websearch_to_tsquery('german', :query),
|
||||||
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
|
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
|
||||||
AS title_headline,
|
AS title_headline,
|
||||||
best_block.text AS transcription_snippet,
|
CASE WHEN best_block.text IS NOT NULL THEN
|
||||||
|
ts_headline('german', best_block.text, websearch_to_tsquery('german', :query),
|
||||||
|
'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
|
||||||
|
END AS transcription_snippet,
|
||||||
(s.id IS NOT NULL AND
|
(s.id IS NOT NULL AND
|
||||||
to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
|
to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
|
||||||
@@ websearch_to_tsquery('german', :query))
|
@@ websearch_to_tsquery('german', :query))
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ class DocumentControllerTest {
|
|||||||
.status(DocumentStatus.UPLOADED)
|
.status(DocumentStatus.UPLOADED)
|
||||||
.build();
|
.build();
|
||||||
var matchData = new org.raddatz.familienarchiv.dto.SearchMatchData(
|
var matchData = new org.raddatz.familienarchiv.dto.SearchMatchData(
|
||||||
"Er schrieb einen langen Brief", List.of(), false, List.of(), List.of());
|
"Er schrieb einen langen Brief", List.of(), false, List.of(), List.of(), List.of());
|
||||||
when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any()))
|
when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any()))
|
||||||
.thenReturn(DocumentSearchResult.withMatchData(List.of(doc), Map.of(docId, matchData)));
|
.thenReturn(DocumentSearchResult.withMatchData(List.of(doc), Map.of(docId, matchData)));
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class DocumentSearchResultTest {
|
|||||||
@Test
|
@Test
|
||||||
void withMatchData_exposes_match_data_map() {
|
void withMatchData_exposes_match_data_map() {
|
||||||
UUID id = UUID.randomUUID();
|
UUID id = UUID.randomUUID();
|
||||||
SearchMatchData data = new SearchMatchData("snippet", List.of(), false, List.of(), List.of());
|
SearchMatchData data = new SearchMatchData("snippet", List.of(), false, List.of(), List.of(), List.of());
|
||||||
DocumentSearchResult result = DocumentSearchResult.withMatchData(List.of(doc(id)), Map.of(id, data));
|
DocumentSearchResult result = DocumentSearchResult.withMatchData(List.of(doc(id)), Map.of(id, data));
|
||||||
|
|
||||||
assertThat(result.matchData()).containsKey(id);
|
assertThat(result.matchData()).containsKey(id);
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ class SearchMatchDataTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void transcription_snippet_is_nullable() {
|
void transcription_snippet_is_nullable() {
|
||||||
SearchMatchData data = new SearchMatchData(null, List.of(), false, List.of(), List.of());
|
SearchMatchData data = new SearchMatchData(null, List.of(), false, List.of(), List.of(), List.of());
|
||||||
|
|
||||||
assertThat(data.transcriptionSnippet()).isNull();
|
assertThat(data.transcriptionSnippet()).isNull();
|
||||||
}
|
}
|
||||||
@@ -34,6 +34,7 @@ class SearchMatchDataTest {
|
|||||||
List.of(offset),
|
List.of(offset),
|
||||||
true,
|
true,
|
||||||
List.of(),
|
List.of(),
|
||||||
|
List.of(),
|
||||||
List.of()
|
List.of()
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -41,4 +42,24 @@ class SearchMatchDataTest {
|
|||||||
assertThat(data.titleOffsets()).containsExactly(offset);
|
assertThat(data.titleOffsets()).containsExactly(offset);
|
||||||
assertThat(data.senderMatched()).isTrue();
|
assertThat(data.senderMatched()).isTrue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void snippet_offsets_are_empty_in_empty_factory() {
|
||||||
|
SearchMatchData data = SearchMatchData.empty();
|
||||||
|
assertThat(data.snippetOffsets()).isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void snippet_offsets_carry_through_constructor() {
|
||||||
|
MatchOffset offset = new MatchOffset(5, 3);
|
||||||
|
SearchMatchData data = new SearchMatchData(
|
||||||
|
"Das ist ein furchtbares Bild",
|
||||||
|
List.of(),
|
||||||
|
false,
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of(offset)
|
||||||
|
);
|
||||||
|
assertThat(data.snippetOffsets()).containsExactly(offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import jakarta.persistence.EntityManager;
|
|||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.raddatz.familienarchiv.PostgresContainerConfig;
|
import org.raddatz.familienarchiv.PostgresContainerConfig;
|
||||||
|
import org.raddatz.familienarchiv.service.DocumentService;
|
||||||
import org.raddatz.familienarchiv.config.FlywayConfig;
|
import org.raddatz.familienarchiv.config.FlywayConfig;
|
||||||
import org.raddatz.familienarchiv.model.Document;
|
import org.raddatz.familienarchiv.model.Document;
|
||||||
import org.raddatz.familienarchiv.model.DocumentAnnotation;
|
import org.raddatz.familienarchiv.model.DocumentAnnotation;
|
||||||
@@ -48,7 +49,7 @@ class DocumentSearchEnrichmentTest {
|
|||||||
void lateral_join_returns_highest_ranked_transcription_block() {
|
void lateral_join_returns_highest_ranked_transcription_block() {
|
||||||
Document doc = documentRepository.saveAndFlush(document("Brief an Anna"));
|
Document doc = documentRepository.saveAndFlush(document("Brief an Anna"));
|
||||||
UUID annotId = annotation(doc.getId());
|
UUID annotId = annotation(doc.getId());
|
||||||
// Three blocks — the one with two occurrences has highest rank
|
// Three blocks — the one with three occurrences has highest rank
|
||||||
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Das Wetter war schön", 0));
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Das Wetter war schön", 0));
|
||||||
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Brief Brief Brief", 1)); // highest rank for "Brief"
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Brief Brief Brief", 1)); // highest rank for "Brief"
|
||||||
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Ein Brief liegt vor", 2)); // one occurrence
|
blockRepository.saveAndFlush(block(doc.getId(), annotId, "Ein Brief liegt vor", 2)); // one occurrence
|
||||||
@@ -58,8 +59,11 @@ class DocumentSearchEnrichmentTest {
|
|||||||
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
List<Object[]> rows = documentRepository.findEnrichmentData(List.of(doc.getId()), "Brief");
|
||||||
|
|
||||||
assertThat(rows).hasSize(1);
|
assertThat(rows).hasSize(1);
|
||||||
String snippet = (String) rows.get(0)[2];
|
// row[2] is now a ts_headline result with sentinel chars — parse it for clean text
|
||||||
assertThat(snippet).isEqualTo("Brief Brief Brief");
|
DocumentService.ParsedHighlight parsed = DocumentService.parseHighlight((String) rows.get(0)[2]);
|
||||||
|
assertThat(parsed).isNotNull();
|
||||||
|
assertThat(parsed.cleanText()).isEqualTo("Brief Brief Brief");
|
||||||
|
assertThat(parsed.offsets()).isNotEmpty(); // at least one "Brief" is highlighted
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
@@ -1379,7 +1379,9 @@ class DocumentServiceTest {
|
|||||||
void searchDocuments_withTextQuery_includesTranscriptionSnippetWhenPresent() {
|
void searchDocuments_withTextQuery_includesTranscriptionSnippetWhenPresent() {
|
||||||
UUID docId = UUID.randomUUID();
|
UUID docId = UUID.randomUUID();
|
||||||
Document doc = Document.builder().id(docId).title("Dok").build();
|
Document doc = Document.builder().id(docId).title("Dok").build();
|
||||||
List<Object[]> rows = Collections.singletonList(new Object[]{docId, "Dok", "Hier ist der Brief aus Berlin", false, null, null});
|
// Simulate ts_headline output with sentinel markers around the matched word
|
||||||
|
String snippetHeadline = "Hier ist der \u0001Brief\u0002 aus Berlin";
|
||||||
|
List<Object[]> rows = Collections.singletonList(new Object[]{docId, "Dok", snippetHeadline, false, null, null});
|
||||||
|
|
||||||
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId));
|
when(documentRepository.findRankedIdsByFts("Brief")).thenReturn(List.of(docId));
|
||||||
when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class)))
|
when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class)))
|
||||||
@@ -1391,5 +1393,39 @@ class DocumentServiceTest {
|
|||||||
|
|
||||||
SearchMatchData md = result.matchData().get(docId);
|
SearchMatchData md = result.matchData().get(docId);
|
||||||
assertThat(md.transcriptionSnippet()).isEqualTo("Hier ist der Brief aus Berlin");
|
assertThat(md.transcriptionSnippet()).isEqualTo("Hier ist der Brief aus Berlin");
|
||||||
|
assertThat(md.snippetOffsets()).containsExactly(new MatchOffset(13, 5)); // "Brief" at pos 13
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── parseHighlight unit tests ────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parseHighlight_returnsNull_whenInputIsNull() {
|
||||||
|
assertThat(DocumentService.parseHighlight(null)).isNull();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parseHighlight_returnsCleanTextAndEmptyOffsets_whenNoSentinels() {
|
||||||
|
DocumentService.ParsedHighlight result = DocumentService.parseHighlight("plain text");
|
||||||
|
assertThat(result.cleanText()).isEqualTo("plain text");
|
||||||
|
assertThat(result.offsets()).isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parseHighlight_extractsOffsetAndStripsDelimiters() {
|
||||||
|
// \u0001 = start sentinel, \u0002 = stop sentinel
|
||||||
|
DocumentService.ParsedHighlight result = DocumentService.parseHighlight("Das \u0001furchtbare\u0002 Wort");
|
||||||
|
assertThat(result.cleanText()).isEqualTo("Das furchtbare Wort");
|
||||||
|
assertThat(result.offsets()).containsExactly(new MatchOffset(4, 10)); // "furchtbare" at pos 4, len 10
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void parseHighlight_handlesMultipleHighlightedTerms() {
|
||||||
|
DocumentService.ParsedHighlight result =
|
||||||
|
DocumentService.parseHighlight("\u0001Hallo\u0002 und \u0001Welt\u0002");
|
||||||
|
assertThat(result.cleanText()).isEqualTo("Hallo und Welt");
|
||||||
|
assertThat(result.offsets()).containsExactly(
|
||||||
|
new MatchOffset(0, 5), // "Hallo"
|
||||||
|
new MatchOffset(10, 4) // "Welt"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user