feat(ocr): add NDJSON streaming endpoint POST /ocr/stream
Streams one JSON line per completed page instead of buffering the entire result. Emits start/page/error/done events. On a per-page failure, logs the traceback, yields a generic error message, and continues with the next page. Adds the `X-Accel-Buffering: no` and `Cache-Control: no-cache` headers for reverse-proxy compatibility.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,12 +2,14 @@
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import httpx
|
||||
import pypdfium2 as pdfium
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from PIL import Image
|
||||
|
||||
from confidence import apply_confidence_markers, get_threshold
|
||||
@@ -82,6 +84,77 @@ async def run_ocr(request: OcrRequest):
|
||||
return [OcrBlock(**b) for b in blocks]
|
||||
|
||||
|
||||
@app.post("/ocr/stream")
async def run_ocr_stream(request: OcrRequest):
    """Run OCR on a PDF and stream the results as NDJSON (one JSON object per line).

    Events are emitted in this order:

    * ``{"type": "start", "totalPages": N}`` — once, before any page is processed.
    * ``{"type": "page", "pageNumber": i, "blocks": [...]}`` — for every page
      that was processed successfully.
    * ``{"type": "error", "pageNumber": i, "message": "..."}`` — for every page
      that failed; processing continues with the next page.
    * ``{"type": "done", "totalBlocks": B, "skippedPages": S}`` — once, at the end.

    ``pageNumber`` is the zero-based index of the page in the converted PDF.

    Raises:
        HTTPException: 503 if the models are not loaded yet; 400 if the request
            asks for ``HANDWRITING_KURRENT`` but the Kraken engine is unavailable.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    script_type = request.scriptType.upper()
    threshold = get_threshold(script_type)

    # Validate the request before downloading anything: there is no point in
    # fetching and rasterising the PDF only to reject the request afterwards.
    use_kraken = script_type == "HANDWRITING_KURRENT"
    if use_kraken and not kraken_engine.is_available():
        raise HTTPException(
            status_code=400,
            detail="Kraken model not available — cannot process Kurrent script",
        )

    # The engine is the same for every page, so pick it once.
    engine = kraken_engine if use_kraken else surya_engine

    images = await _download_and_convert_pdf(request.pdfUrl)

    async def generate():
        yield json.dumps({"type": "start", "totalPages": len(images)}) + "\n"

        total_blocks = 0
        skipped_pages = 0

        for page_idx, image in enumerate(images):
            try:
                # The engine call is blocking; run it in a worker thread so the
                # event loop stays responsive while the page is processed.
                blocks = await asyncio.to_thread(
                    engine.extract_page_blocks, image, page_idx, request.language
                )

                for block in blocks:
                    if block.get("words"):
                        block["text"] = apply_confidence_markers(block["words"], threshold)
                    # Word-level data is only needed to build the marked text.
                    block.pop("words", None)

                # Serialise inside the try so that a non-serialisable block is
                # reported as a page error instead of aborting the stream.
                line = json.dumps({
                    "type": "page",
                    "pageNumber": page_idx,
                    "blocks": blocks,
                }) + "\n"
            except Exception:
                # Log the full traceback server-side, but only expose a generic
                # message to the client.
                logger.exception("OCR failed on page %d", page_idx)
                skipped_pages += 1
                line = json.dumps({
                    "type": "error",
                    "pageNumber": page_idx,
                    "message": f"OCR processing failed on page {page_idx}",
                }) + "\n"
            else:
                # Count blocks only for pages that were actually emitted.
                total_blocks += len(blocks)
            finally:
                # Unbinding the loop variable alone frees nothing because the
                # list still references the page; clear the list slot as well so
                # the image can be reclaimed before the next page is processed.
                images[page_idx] = None
                del image

            yield line

        yield json.dumps({
            "type": "done",
            "totalBlocks": total_blocks,
            "skippedPages": skipped_pages,
        }) + "\n"

    return StreamingResponse(
        generate(),
        media_type="application/x-ndjson",
        headers={
            # Ask reverse proxies not to buffer or cache the stream.
            "X-Accel-Buffering": "no",
            "Cache-Control": "no-cache",
        },
    )
|
||||
|
||||
|
||||
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
|
||||
"""Download a PDF from a presigned URL and convert each page to a PIL Image."""
|
||||
async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
|
||||
|
||||
Reference in New Issue
Block a user