From 641e91d5a3d23241efdfc85cec2e53a82202968c Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 13 Apr 2026 10:01:26 +0200
Subject: [PATCH] feat(ocr): add default streamBlocks method to OcrClient interface

The default method synthesizes Start/Page/Done events from
extractBlocks() results, providing backward compatibility for
implementations that don't support streaming natively.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
 * The default streamBlocks() calls extractBlocks() first and only then emits
   events, so nothing reaches the handler until the full result is collected.
 * Blocks are grouped by pageNumber() in first-seen order (LinkedHashMap);
   one Page event is emitted per distinct page number that has blocks.
 * Start carries (highest page number + 1), or 0 when there are no blocks.
   NOTE(review): this presumably relies on 0-based page numbers - confirm.
 * Done is always emitted with allBlocks.size() and 0.

 .../familienarchiv/service/OcrClient.java     | 26 +++++++++
 .../service/OcrClientDefaultStreamTest.java   | 55 +++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/service/OcrClientDefaultStreamTest.java

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java
index 3b33aaf2..9cf7c886 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java
@@ -2,8 +2,34 @@ package org.raddatz.familienarchiv.service;
 
 import org.raddatz.familienarchiv.model.ScriptType;
 
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.function.Consumer;
 
 public interface OcrClient {
     List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType);
+
+    /**
+     * Stream OCR results page-by-page via NDJSON. Implementations should override
+     * this method. The default exists only for backward compatibility during migration
+     * — it calls extractBlocks() and synthesizes events from the collected result.
+     */
+    default void streamBlocks(String pdfUrl, ScriptType scriptType, Consumer<OcrStreamEvent> handler) {
+        List<OcrBlockResult> allBlocks = extractBlocks(pdfUrl, scriptType);
+
+        LinkedHashMap<Integer, List<OcrBlockResult>> byPage = new LinkedHashMap<>();
+        for (OcrBlockResult block : allBlocks) {
+            byPage.computeIfAbsent(block.pageNumber(), k -> new ArrayList<>()).add(block);
+        }
+
+        int totalPages = byPage.isEmpty() ? 0 : byPage.keySet().stream().mapToInt(i -> i).max().orElse(0) + 1;
+        handler.accept(new OcrStreamEvent.Start(totalPages));
+
+        for (var entry : byPage.entrySet()) {
+            handler.accept(new OcrStreamEvent.Page(entry.getKey(), entry.getValue()));
+        }
+
+        handler.accept(new OcrStreamEvent.Done(allBlocks.size(), 0));
+    }
 }
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrClientDefaultStreamTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrClientDefaultStreamTest.java
new file mode 100644
index 00000000..42219299
--- /dev/null
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrClientDefaultStreamTest.java
@@ -0,0 +1,55 @@
+package org.raddatz.familienarchiv.service;
+
+import org.junit.jupiter.api.Test;
+import org.raddatz.familienarchiv.model.ScriptType;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+class OcrClientDefaultStreamTest {
+
+    @Test
+    void defaultStreamBlocksSynthesizesEventsFromExtractBlocks() {
+        OcrClient client = (pdfUrl, scriptType) -> List.of(
+                new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Line 1"),
+                new OcrBlockResult(0, 0.1, 0.2, 0.8, 0.04, null, "Line 2"),
+                new OcrBlockResult(1, 0.1, 0.1, 0.8, 0.04, null, "Line 3"));
+
+        List<OcrStreamEvent> events = new ArrayList<>();
+        client.streamBlocks("http://test", ScriptType.TYPEWRITER, events::add);
+
+        assertThat(events).hasSize(4);
+        assertThat(events.get(0)).isInstanceOf(OcrStreamEvent.Start.class);
+        assertThat(((OcrStreamEvent.Start) events.get(0)).totalPages()).isEqualTo(2);
+
+        assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Page.class);
+        var page0 = (OcrStreamEvent.Page) events.get(1);
+        assertThat(page0.pageNumber()).isEqualTo(0);
+        assertThat(page0.blocks()).hasSize(2);
+
+        assertThat(events.get(2)).isInstanceOf(OcrStreamEvent.Page.class);
+        var page1 = (OcrStreamEvent.Page) events.get(2);
+        assertThat(page1.pageNumber()).isEqualTo(1);
+        assertThat(page1.blocks()).hasSize(1);
+
+        assertThat(events.get(3)).isInstanceOf(OcrStreamEvent.Done.class);
+        var done = (OcrStreamEvent.Done) events.get(3);
+        assertThat(done.totalBlocks()).isEqualTo(3);
+        assertThat(done.skippedPages()).isEqualTo(0);
+    }
+
+    @Test
+    void defaultStreamBlocksHandlesEmptyResults() {
+        OcrClient client = (pdfUrl, scriptType) -> List.of();
+
+        List<OcrStreamEvent> events = new ArrayList<>();
+        client.streamBlocks("http://test", ScriptType.TYPEWRITER, events::add);
+
+        assertThat(events).hasSize(2);
+        assertThat(events.get(0)).isInstanceOf(OcrStreamEvent.Start.class);
+        assertThat(((OcrStreamEvent.Start) events.get(0)).totalPages()).isEqualTo(0);
+        assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Done.class);
+    }
+}