feat(ocr): add NDJSON streaming endpoint POST /ocr/stream

Streams one JSON line per completed page instead of buffering the entire result. Emits start/page/error/done events. On per-page failure, logs the traceback but yields a generic error message and continues with the next page. Adds X-Accel-Buffering: no and Cache-Control: no-cache headers for reverse-proxy compatibility.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 09:57:57 +02:00
parent b7d5f71ef7
commit 97c6cf6a65
2 changed files with 236 additions and 0 deletions
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -2,12 +2,14 @@

 import asyncio
 import io
+import json
 import logging
 from contextlib import asynccontextmanager

 import httpx
 import pypdfium2 as pdfium
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
 from PIL import Image

 from confidence import apply_confidence_markers, get_threshold
@@ -82,6 +84,77 @@ async def run_ocr(request: OcrRequest):
    return [OcrBlock(**b) for b in blocks]


+@app.post("/ocr/stream")
+async def run_ocr_stream(request: OcrRequest):
+    """Run OCR on a PDF and stream NDJSON: one JSON line per event.
+
+    Events: ``start`` (totalPages); per page, ``page`` (0-based pageNumber, blocks)
+    or ``error`` (pageNumber, message); finally ``done`` (totalBlocks, skippedPages).
+
+    Raises:
+        HTTPException: 503 while models are not loaded; 400 when Kurrent is
+            requested but the Kraken engine is unavailable.
+    """
+    if not _models_ready:
+        raise HTTPException(status_code=503, detail="Models not loaded yet")
+
+    script_type = request.scriptType.upper()
+    use_kraken = script_type == "HANDWRITING_KURRENT"
+    # Reject an unusable engine before paying for the PDF download and rendering.
+    if use_kraken and not kraken_engine.is_available():
+        raise HTTPException(
+            status_code=400,
+            detail="Kraken model not available — cannot process Kurrent script",
+        )
+
+    threshold = get_threshold(script_type)
+    images = await _download_and_convert_pdf(request.pdfUrl)
+    engine = kraken_engine if use_kraken else surya_engine
+
+    def event(kind, **fields):
+        """Serialize one NDJSON line; ``type`` is always the first key."""
+        return json.dumps({"type": kind, **fields}) + "\n"
+
+    async def generate():
+        """Yield one event line per page; a failing page is reported and skipped."""
+        yield event("start", totalPages=len(images))
+        total_blocks = skipped_pages = 0
+
+        for page_idx, image in enumerate(images):
+            try:
+                blocks = await asyncio.to_thread(
+                    engine.extract_page_blocks, image, page_idx, request.language
+                )
+                for block in blocks:
+                    if block.get("words"):
+                        block["text"] = apply_confidence_markers(block["words"], threshold)
+                    block.pop("words", None)
+                # Encode before counting: an unserializable page must not inflate totalBlocks.
+                line = event("page", pageNumber=page_idx, blocks=blocks)
+            except Exception:
+                logger.exception("OCR failed on page %d", page_idx)
+                skipped_pages += 1
+                # Generic message only; the traceback stays in the server log.
+                message = f"OCR processing failed on page {page_idx}"
+                line = event("error", pageNumber=page_idx, message=message)
+            else:
+                total_blocks += len(blocks)
+            finally:
+                # `del image` alone frees nothing while the list still references the page.
+                images[page_idx] = None
+                del image
+            yield line
+
+        yield event("done", totalBlocks=total_blocks, skippedPages=skipped_pages)
+
+    return StreamingResponse(
+        generate(),
+        media_type="application/x-ndjson",
+        # Tell reverse proxies not to buffer or cache the stream.
+        headers={"X-Accel-Buffering": "no", "Cache-Control": "no-cache"},
+    )
+
+
 async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from a presigned URL and convert each page to a PIL Image."""
    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client: