feat(ocr): rewrite runSingleDocument to use streamBlocks with per-page progress
Replace the single extractBlocks() call with streamBlocks(), which processes pages incrementally. Each page's blocks are persisted immediately via createSingleBlock(). Progress updates use the ANALYZING_PAGE:current:total:blocks format. Per-page errors are logged at WARN level without failing the entire job. The batch path (processDocument) remains on the old extractBlocks() path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,6 +13,7 @@ import org.springframework.stereotype.Component;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@@ -54,14 +55,56 @@ public class OcrAsyncRunner {
|
||||
String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath());
|
||||
|
||||
updateProgress(job, "ANALYZING");
|
||||
List<OcrBlockResult> blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
|
||||
|
||||
updateProgress(job, "CREATING_BLOCKS:" + blocks.size());
|
||||
createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
|
||||
AtomicInteger blockCounter = new AtomicInteger(0);
|
||||
AtomicInteger currentPage = new AtomicInteger(0);
|
||||
AtomicInteger skippedPages = new AtomicInteger(0);
|
||||
AtomicInteger totalPages = new AtomicInteger(0);
|
||||
|
||||
ocrClient.streamBlocks(pdfUrl, doc.getScriptType(), event -> {
|
||||
switch (event) {
|
||||
case OcrStreamEvent.Start start -> {
|
||||
totalPages.set(start.totalPages());
|
||||
if (jobDoc != null) {
|
||||
jobDoc.setTotalPages(start.totalPages());
|
||||
ocrJobDocumentRepository.save(jobDoc);
|
||||
}
|
||||
}
|
||||
case OcrStreamEvent.Page page -> {
|
||||
for (OcrBlockResult block : page.blocks()) {
|
||||
createSingleBlock(documentId, block, userId,
|
||||
doc.getFileHash(), blockCounter.getAndIncrement());
|
||||
}
|
||||
currentPage.incrementAndGet();
|
||||
if (jobDoc != null) {
|
||||
jobDoc.setCurrentPage(currentPage.get());
|
||||
ocrJobDocumentRepository.save(jobDoc);
|
||||
}
|
||||
updateProgress(job, "ANALYZING_PAGE:" + currentPage.get()
|
||||
+ ":" + totalPages.get() + ":" + blockCounter.get());
|
||||
}
|
||||
case OcrStreamEvent.Error error -> {
|
||||
log.warn("OCR page {} failed for document {}: {}",
|
||||
error.pageNumber(), documentId, error.message());
|
||||
skippedPages.incrementAndGet();
|
||||
currentPage.incrementAndGet();
|
||||
if (jobDoc != null) {
|
||||
jobDoc.setCurrentPage(currentPage.get());
|
||||
ocrJobDocumentRepository.save(jobDoc);
|
||||
}
|
||||
}
|
||||
case OcrStreamEvent.Done done -> {
|
||||
if (jobDoc != null) {
|
||||
jobDoc.setCurrentPage(totalPages.get());
|
||||
ocrJobDocumentRepository.save(jobDoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
job.setStatus(OcrJobStatus.DONE);
|
||||
job.setProcessedDocuments(1);
|
||||
updateProgress(job, "DONE:" + blocks.size());
|
||||
updateProgress(job, "DONE:" + blockCounter.get() + ":" + skippedPages.get());
|
||||
if (jobDoc != null) {
|
||||
jobDoc.setStatus(OcrDocumentStatus.DONE);
|
||||
ocrJobDocumentRepository.save(jobDoc);
|
||||
|
||||
Reference in New Issue
Block a user