feat(ocr): add default streamBlocks method to OcrClient interface
The default method synthesizes Start/Page/Done events from extractBlocks() results, providing backward compatibility for implementations that don't support streaming natively.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,8 +2,34 @@ package org.raddatz.familienarchiv.service;
|
|||||||
|
|
||||||
import org.raddatz.familienarchiv.model.ScriptType;
|
import org.raddatz.familienarchiv.model.ScriptType;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
public interface OcrClient {
|
public interface OcrClient {
|
||||||
List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType);
|
List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stream OCR results page-by-page via NDJSON. Implementations should override
|
||||||
|
* this method. The default exists only for backward compatibility during migration
|
||||||
|
* — it calls extractBlocks() and synthesizes events from the collected result.
|
||||||
|
*/
|
||||||
|
default void streamBlocks(String pdfUrl, ScriptType scriptType, Consumer<OcrStreamEvent> handler) {
|
||||||
|
List<OcrBlockResult> allBlocks = extractBlocks(pdfUrl, scriptType);
|
||||||
|
|
||||||
|
LinkedHashMap<Integer, List<OcrBlockResult>> byPage = new LinkedHashMap<>();
|
||||||
|
for (OcrBlockResult block : allBlocks) {
|
||||||
|
byPage.computeIfAbsent(block.pageNumber(), k -> new ArrayList<>()).add(block);
|
||||||
|
}
|
||||||
|
|
||||||
|
int totalPages = byPage.isEmpty() ? 0 : byPage.keySet().stream().mapToInt(i -> i).max().orElse(0) + 1;
|
||||||
|
handler.accept(new OcrStreamEvent.Start(totalPages));
|
||||||
|
|
||||||
|
for (var entry : byPage.entrySet()) {
|
||||||
|
handler.accept(new OcrStreamEvent.Page(entry.getKey(), entry.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
handler.accept(new OcrStreamEvent.Done(allBlocks.size(), 0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,55 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.raddatz.familienarchiv.model.ScriptType;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
class OcrClientDefaultStreamTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void defaultStreamBlocksSynthesizesEventsFromExtractBlocks() {
|
||||||
|
OcrClient client = (pdfUrl, scriptType) -> List.of(
|
||||||
|
new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Line 1"),
|
||||||
|
new OcrBlockResult(0, 0.1, 0.2, 0.8, 0.04, null, "Line 2"),
|
||||||
|
new OcrBlockResult(1, 0.1, 0.1, 0.8, 0.04, null, "Line 3"));
|
||||||
|
|
||||||
|
List<OcrStreamEvent> events = new ArrayList<>();
|
||||||
|
client.streamBlocks("http://test", ScriptType.TYPEWRITER, events::add);
|
||||||
|
|
||||||
|
assertThat(events).hasSize(4);
|
||||||
|
assertThat(events.get(0)).isInstanceOf(OcrStreamEvent.Start.class);
|
||||||
|
assertThat(((OcrStreamEvent.Start) events.get(0)).totalPages()).isEqualTo(2);
|
||||||
|
|
||||||
|
assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Page.class);
|
||||||
|
var page0 = (OcrStreamEvent.Page) events.get(1);
|
||||||
|
assertThat(page0.pageNumber()).isEqualTo(0);
|
||||||
|
assertThat(page0.blocks()).hasSize(2);
|
||||||
|
|
||||||
|
assertThat(events.get(2)).isInstanceOf(OcrStreamEvent.Page.class);
|
||||||
|
var page1 = (OcrStreamEvent.Page) events.get(2);
|
||||||
|
assertThat(page1.pageNumber()).isEqualTo(1);
|
||||||
|
assertThat(page1.blocks()).hasSize(1);
|
||||||
|
|
||||||
|
assertThat(events.get(3)).isInstanceOf(OcrStreamEvent.Done.class);
|
||||||
|
var done = (OcrStreamEvent.Done) events.get(3);
|
||||||
|
assertThat(done.totalBlocks()).isEqualTo(3);
|
||||||
|
assertThat(done.skippedPages()).isEqualTo(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void defaultStreamBlocksHandlesEmptyResults() {
|
||||||
|
OcrClient client = (pdfUrl, scriptType) -> List.of();
|
||||||
|
|
||||||
|
List<OcrStreamEvent> events = new ArrayList<>();
|
||||||
|
client.streamBlocks("http://test", ScriptType.TYPEWRITER, events::add);
|
||||||
|
|
||||||
|
assertThat(events).hasSize(2);
|
||||||
|
assertThat(events.get(0)).isInstanceOf(OcrStreamEvent.Start.class);
|
||||||
|
assertThat(((OcrStreamEvent.Start) events.get(0)).totalPages()).isEqualTo(0);
|
||||||
|
assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Done.class);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user