feat(search): add snippetOffsets to SearchMatchData and use ts_headline for highlighted snippets

- SearchMatchData gains a 6th field, snippetOffsets: List<MatchOffset>, so the
  frontend can render highlighted terms inside the transcription snippet
  without {#html}.
- DocumentRepository.findEnrichmentData now calls ts_headline() with
  chr(1)/chr(2) sentinels instead of returning raw block text; parseHighlight()
  strips the sentinels and produces clean text + MatchOffset list in one pass.
- DocumentService exposes ParsedHighlight and parseHighlight() as public so
  they can be called from cross-package integration tests.
- All related tests are updated to the new 6-argument SearchMatchData
  constructor and call parseHighlight() to assert the snippet's clean text and
  offsets.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 20:14:20 +02:00
parent 9ff8423da6
commit 32f151ff31
7 changed files with 82 additions and 11 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/dto/SearchMatchData.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/SearchMatchData.java
@@ -39,10 +39,17 @@ public record SearchMatchData(
         * IDs of tags whose names matched the query.
         */
        @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
-        List<UUID> matchedTagIds
+        List<UUID> matchedTagIds,
+
+        /**
+         * Character offsets of highlighted terms within the transcription snippet.
+         * Empty when no transcription block matched or the snippet has no highlights.
+         */
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
+        List<MatchOffset> snippetOffsets
 ) {
    /** Canonical "no match data" value for a single document. */
    public static SearchMatchData empty() {
-        return new SearchMatchData(null, List.of(), false, List.of(), List.of());
+        return new SearchMatchData(null, List.of(), false, List.of(), List.of(), List.of());
    }
 }
--- a/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/DocumentRepository.java
@@ -95,7 +95,7 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
     * <ol>
     *   <li>UUID   — document id</li>
     *   <li>String — title headline with \x01/\x02 delimiters around matched terms</li>
-     *   <li>String — best-ranked matching transcription block text, or null</li>
+     *   <li>String — best-ranked transcription snippet with \x01/\x02 delimiters, or null</li>
     *   <li>Boolean — whether the sender's name matched the query</li>
     *   <li>String — comma-separated matched receiver UUIDs, or null</li>
     *   <li>String — comma-separated matched tag UUIDs, or null</li>
@@ -108,7 +108,10 @@ public interface DocumentRepository extends JpaRepository<Document, UUID>, JpaSp
                ts_headline('german', d.title, websearch_to_tsquery('german', :query),
                    'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',HighlightAll=true')
                    AS title_headline,
-                best_block.text AS transcription_snippet,
+                CASE WHEN best_block.text IS NOT NULL THEN
+                    ts_headline('german', best_block.text, websearch_to_tsquery('german', :query),
+                        'StartSel=' || chr(1) || ',StopSel=' || chr(2) || ',MaxWords=50,MinWords=20')
+                END AS transcription_snippet,
                (s.id IS NOT NULL AND
                 to_tsvector('german', COALESCE(s.first_name, '') || ' ' || COALESCE(s.last_name, ''))
                     @@ websearch_to_tsquery('german', :query))