feat(ocr): rewrite runSingleDocument to use streamBlocks with per-page progress

Replace the single extractBlocks() call with a streamBlocks() call
that processes pages incrementally. Each page's blocks are persisted
immediately via createSingleBlock(). Progress updates use the
format ANALYZING_PAGE:<current>:<total>:<blocks>. Per-page errors
are logged at WARN level and counted as skipped pages without
failing the entire job; the final progress message is
DONE:<blocks>:<skipped>. The batch path (processDocument) remains
on the old extractBlocks() path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 10:07:06 +02:00
parent 6823973429
commit 292dc66f3c
2 changed files with 182 additions and 6 deletions

View File

@@ -13,6 +13,7 @@ import org.springframework.stereotype.Component;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
@Component
@RequiredArgsConstructor
@@ -54,14 +55,56 @@ public class OcrAsyncRunner {
String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath());
updateProgress(job, "ANALYZING");
List<OcrBlockResult> blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
updateProgress(job, "CREATING_BLOCKS:" + blocks.size());
createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
AtomicInteger blockCounter = new AtomicInteger(0);
AtomicInteger currentPage = new AtomicInteger(0);
AtomicInteger skippedPages = new AtomicInteger(0);
AtomicInteger totalPages = new AtomicInteger(0);
ocrClient.streamBlocks(pdfUrl, doc.getScriptType(), event -> {
switch (event) {
case OcrStreamEvent.Start start -> {
totalPages.set(start.totalPages());
if (jobDoc != null) {
jobDoc.setTotalPages(start.totalPages());
ocrJobDocumentRepository.save(jobDoc);
}
}
case OcrStreamEvent.Page page -> {
for (OcrBlockResult block : page.blocks()) {
createSingleBlock(documentId, block, userId,
doc.getFileHash(), blockCounter.getAndIncrement());
}
currentPage.incrementAndGet();
if (jobDoc != null) {
jobDoc.setCurrentPage(currentPage.get());
ocrJobDocumentRepository.save(jobDoc);
}
updateProgress(job, "ANALYZING_PAGE:" + currentPage.get()
+ ":" + totalPages.get() + ":" + blockCounter.get());
}
case OcrStreamEvent.Error error -> {
log.warn("OCR page {} failed for document {}: {}",
error.pageNumber(), documentId, error.message());
skippedPages.incrementAndGet();
currentPage.incrementAndGet();
if (jobDoc != null) {
jobDoc.setCurrentPage(currentPage.get());
ocrJobDocumentRepository.save(jobDoc);
}
}
case OcrStreamEvent.Done done -> {
if (jobDoc != null) {
jobDoc.setCurrentPage(totalPages.get());
ocrJobDocumentRepository.save(jobDoc);
}
}
}
});
job.setStatus(OcrJobStatus.DONE);
job.setProcessedDocuments(1);
updateProgress(job, "DONE:" + blocks.size());
updateProgress(job, "DONE:" + blockCounter.get() + ":" + skippedPages.get());
if (jobDoc != null) {
jobDoc.setStatus(OcrDocumentStatus.DONE);
ocrJobDocumentRepository.save(jobDoc);