feat(ocr): implement NDJSON streaming in RestClientOcrClient

Add streamBlocks() that POSTs to /ocr/stream and parses the NDJSON
response line by line with a dedicated ObjectMapper. Falls back to
the old /ocr endpoint via the default method when /ocr/stream returns
404. Uses a separate HttpClient with a 5-minute request timeout for
streaming.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 10:03:12 +02:00
parent 641e91d5a3
commit 93c3154b3c
2 changed files with 218 additions and 0 deletions

View File

@@ -0,0 +1,117 @@
package org.raddatz.familienarchiv.service;
import org.junit.jupiter.api.Test;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Unit tests for {@code RestClientOcrClient.parseNdjsonStream}.
 *
 * <p>Each test feeds an in-memory NDJSON document (one JSON object per line) to the
 * parser and inspects the {@link OcrStreamEvent}s handed to the callback. No HTTP
 * traffic is involved.
 */
class RestClientOcrClientStreamTest {

    /**
     * Runs the parser over {@code ndjson} (encoded as UTF-8) and returns every event
     * passed to the callback, in dispatch order.
     */
    private static List<OcrStreamEvent> parse(String ndjson) {
        InputStream stream = new ByteArrayInputStream(ndjson.getBytes(StandardCharsets.UTF_8));
        List<OcrStreamEvent> events = new ArrayList<>();
        RestClientOcrClient.parseNdjsonStream(stream, events::add);
        return events;
    }

    @Test
    void parseNdjsonStream_dispatchesStartPageDoneInOrder() {
        List<OcrStreamEvent> events = parse("""
                {"type":"start","totalPages":2}
                {"type":"page","pageNumber":0,"blocks":[{"pageNumber":0,"x":0.1,"y":0.2,"width":0.8,"height":0.1,"polygon":null,"text":"Line 1"}]}
                {"type":"page","pageNumber":1,"blocks":[{"pageNumber":1,"x":0.1,"y":0.3,"width":0.8,"height":0.1,"polygon":null,"text":"Line 2"}]}
                {"type":"done","totalBlocks":2,"skippedPages":0}
                """);

        assertThat(events).hasSize(4);

        assertThat(events.get(0)).isInstanceOf(OcrStreamEvent.Start.class);
        assertThat(((OcrStreamEvent.Start) events.get(0)).totalPages()).isEqualTo(2);

        assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Page.class);
        var page0 = (OcrStreamEvent.Page) events.get(1);
        assertThat(page0.pageNumber()).isEqualTo(0);
        assertThat(page0.blocks()).hasSize(1);
        assertThat(page0.blocks().get(0).text()).isEqualTo("Line 1");

        assertThat(events.get(2)).isInstanceOf(OcrStreamEvent.Page.class);
        var page1 = (OcrStreamEvent.Page) events.get(2);
        assertThat(page1.pageNumber()).isEqualTo(1);

        assertThat(events.get(3)).isInstanceOf(OcrStreamEvent.Done.class);
        var done = (OcrStreamEvent.Done) events.get(3);
        assertThat(done.totalBlocks()).isEqualTo(2);
        assertThat(done.skippedPages()).isEqualTo(0);
    }

    @Test
    void parseNdjsonStream_parsesErrorEvents() {
        List<OcrStreamEvent> events = parse("""
                {"type":"start","totalPages":3}
                {"type":"page","pageNumber":0,"blocks":[]}
                {"type":"error","pageNumber":1,"message":"OCR processing failed on page 1"}
                {"type":"page","pageNumber":2,"blocks":[]}
                {"type":"done","totalBlocks":0,"skippedPages":1}
                """);

        assertThat(events).hasSize(5);
        assertThat(events.get(2)).isInstanceOf(OcrStreamEvent.Error.class);
        var error = (OcrStreamEvent.Error) events.get(2);
        assertThat(error.pageNumber()).isEqualTo(1);
        assertThat(error.message()).contains("OCR processing failed");
    }

    @Test
    void parseNdjsonStream_skipsBlankLines() {
        // Each trailing \n escape adds a second line break after the record, i.e. an
        // empty line in the input. Escapes are used instead of literal empty lines so
        // the blank lines stay visible and cannot be lost to whitespace stripping.
        List<OcrStreamEvent> events = parse("""
                {"type":"start","totalPages":1}\n
                {"type":"page","pageNumber":0,"blocks":[]}\n
                {"type":"done","totalBlocks":0,"skippedPages":0}\n
                """);

        assertThat(events).hasSize(3);
    }

    @Test
    void parseNdjsonStream_ignoresUnknownEventTypes() {
        List<OcrStreamEvent> events = parse("""
                {"type":"start","totalPages":1}
                {"type":"unknown","foo":"bar"}
                {"type":"done","totalBlocks":0,"skippedPages":0}
                """);

        assertThat(events).hasSize(2);
        // Only the unrecognised line may be dropped; its neighbours must still arrive.
        assertThat(events.get(0)).isInstanceOf(OcrStreamEvent.Start.class);
        assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Done.class);
    }

    @Test
    void parseNdjsonStream_parsesPageWithPolygon() {
        List<OcrStreamEvent> events = parse("""
                {"type":"start","totalPages":1}
                {"type":"page","pageNumber":0,"blocks":[{"pageNumber":0,"x":0.1,"y":0.2,"width":0.8,"height":0.1,"polygon":[[0.1,0.2],[0.9,0.2],[0.9,0.3],[0.1,0.3]],"text":"With polygon"}]}
                {"type":"done","totalBlocks":1,"skippedPages":0}
                """);

        // Check shape first so a parser regression fails with a readable assertion
        // message instead of an IndexOutOfBoundsException or ClassCastException.
        assertThat(events).hasSize(3);
        assertThat(events.get(1)).isInstanceOf(OcrStreamEvent.Page.class);
        var page = (OcrStreamEvent.Page) events.get(1);
        assertThat(page.blocks()).hasSize(1);
        assertThat(page.blocks().get(0).polygon()).hasSize(4);
        assertThat(page.blocks().get(0).text()).isEqualTo("With polygon");
    }
}