Streams one JSON line per completed page instead of buffering the entire result. Emits start/page/error/done events. On per-page failure, logs the traceback but yields a generic error message and continues with the next page. Adds X-Accel-Buffering: no and Cache-Control: no-cache headers for reverse-proxy compatibility. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
176 lines
5.7 KiB
Python
176 lines
5.7 KiB
Python
"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
|
|
|
|
import asyncio
|
|
import io
|
|
import json
|
|
import logging
|
|
from contextlib import asynccontextmanager
|
|
|
|
import httpx
|
|
import pypdfium2 as pdfium
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.responses import StreamingResponse
|
|
from PIL import Image
|
|
|
|
from confidence import apply_confidence_markers, get_threshold
|
|
from engines import kraken as kraken_engine
|
|
from engines import surya as surya_engine
|
|
from models import OcrBlock, OcrRequest
|
|
|
|
# Configure root logging once at import time; everything at INFO and above is emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Readiness flag. Starts False and is set to True by lifespan() once
# kraken_engine.load_models() has returned; /health and the OCR endpoints
# answer 503 while it is False.
_models_ready = False
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage service startup and shutdown.

    Startup loads the Kraken model synchronously and then flips the
    module-level ``_models_ready`` flag so that ``/health`` and the OCR
    endpoints start accepting traffic. Surya is not loaded here; it loads
    lazily on the first OCR request.

    Shutdown clears the readiness flag again and logs, even when the
    application exits through an exception thrown into this context manager.
    """
    global _models_ready

    logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
    kraken_engine.load_models()
    _models_ready = True
    logger.info("Startup complete — ready to accept requests")

    try:
        yield
    finally:
        # Without try/finally the code after ``yield`` is skipped whenever an
        # exception is thrown into the generator, so shutdown would be silent
        # and the flag would keep claiming readiness.
        _models_ready = False
        logger.info("Shutting down OCR service")
|
|
|
|
|
|
# The ASGI application object; `lifespan` is passed in so FastAPI runs the
# startup model loading and the shutdown logging defined there.
app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
|
|
|
|
|
|
@app.get("/health")
def health():
    """Report service readiness.

    Returns a status payload once startup has finished. ``surya`` is always
    reported as ``True`` (it is a literal, not a probe), while ``kraken``
    reflects ``kraken_engine.is_available()``.

    Raises:
        HTTPException: 503 while the startup model loading has not completed.
    """
    if _models_ready:
        kraken_ok = kraken_engine.is_available()
        return {"status": "ok", "surya": True, "kraken": kraken_ok}

    raise HTTPException(status_code=503, detail="Models not loaded yet")
|
|
|
|
|
|
@app.post("/ocr", response_model=list[OcrBlock])
async def run_ocr(request: OcrRequest):
    """Run OCR on a PDF document and return all blocks in one response.

    Downloads the PDF from ``request.pdfUrl``, converts its pages to images,
    and runs the OCR engine selected by ``request.scriptType``
    (case-insensitive): ``HANDWRITING_KURRENT`` uses Kraken, every other
    value (TYPEWRITER, HANDWRITING_LATIN, UNKNOWN, ...) uses Surya.
    The engine runs in a worker thread so the event loop stays free for
    ``/health``.

    For every block that carries a non-empty ``words`` entry, ``text`` is
    replaced by the output of ``apply_confidence_markers``; the ``words``
    entry is always removed before the block is returned.

    Raises:
        HTTPException: 503 if models are not loaded yet; 400 if Kurrent
            script is requested but the Kraken model is unavailable.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    script_type = request.scriptType.upper()
    use_kraken = script_type == "HANDWRITING_KURRENT"

    # Validate engine availability BEFORE fetching and rendering the PDF, so an
    # unserviceable request is rejected without paying for the download and
    # for holding every rendered page in memory.
    if use_kraken and not kraken_engine.is_available():
        raise HTTPException(
            status_code=400,
            detail="Kraken model not available — cannot process Kurrent script",
        )

    images = await _download_and_convert_pdf(request.pdfUrl)

    engine = kraken_engine if use_kraken else surya_engine
    blocks = await asyncio.to_thread(engine.extract_blocks, images, request.language)

    threshold = get_threshold(script_type)
    for block in blocks:
        if block.get("words"):
            block["text"] = apply_confidence_markers(block["words"], threshold)
        # Word-level data is internal; it is never part of the response.
        block.pop("words", None)

    return [OcrBlock(**b) for b in blocks]
|
|
|
|
|
|
@app.post("/ocr/stream")
async def run_ocr_stream(request: OcrRequest):
    """Run OCR on a PDF with NDJSON streaming — one JSON line per completed page.

    Event sequence written to the response body (each a single JSON line):

    * ``{"type": "start", "totalPages": N}``
    * per page, either ``{"type": "page", "pageNumber": i, "blocks": [...]}``
      or ``{"type": "error", "pageNumber": i, "message": ...}``;
      ``pageNumber`` is the zero-based index of the page in the PDF
    * ``{"type": "done", "totalBlocks": ..., "skippedPages": ...}``

    A failure on one page is logged with its traceback, reported to the
    client with a generic message, and processing continues with the next
    page.

    Raises:
        HTTPException: 503 if models are not loaded yet; 400 if Kurrent
            script is requested but the Kraken model is unavailable. Both are
            raised before the streaming response starts.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    script_type = request.scriptType.upper()
    use_kraken = script_type == "HANDWRITING_KURRENT"

    # Reject an unserviceable request before downloading and rendering the PDF.
    if use_kraken and not kraken_engine.is_available():
        raise HTTPException(
            status_code=400,
            detail="Kraken model not available — cannot process Kurrent script",
        )

    threshold = get_threshold(script_type)
    images = await _download_and_convert_pdf(request.pdfUrl)
    engine = kraken_engine if use_kraken else surya_engine

    async def generate():
        total_pages = len(images)
        yield json.dumps({"type": "start", "totalPages": total_pages}) + "\n"

        total_blocks = 0
        skipped_pages = 0

        for page_idx in range(total_pages):
            # Take the page out of the list so that this local name is the
            # only reference left. A bare ``del image`` while the list still
            # holds the page would free nothing.
            image = images[page_idx]
            images[page_idx] = None

            try:
                blocks = await asyncio.to_thread(
                    engine.extract_page_blocks, image, page_idx, request.language
                )

                for block in blocks:
                    if block.get("words"):
                        block["text"] = apply_confidence_markers(block["words"], threshold)
                    block.pop("words", None)

                total_blocks += len(blocks)
                yield json.dumps({
                    "type": "page",
                    "pageNumber": page_idx,
                    "blocks": blocks,
                }) + "\n"

            except Exception:
                # Page-level boundary: keep details in the server log only and
                # send the client a generic message, then move on.
                logger.exception("OCR failed on page %d", page_idx)
                skipped_pages += 1
                yield json.dumps({
                    "type": "error",
                    "pageNumber": page_idx,
                    "message": f"OCR processing failed on page {page_idx}",
                }) + "\n"

            finally:
                # Drop the last reference so the rendered page can be
                # reclaimed before the next one is processed.
                del image

        yield json.dumps({
            "type": "done",
            "totalBlocks": total_blocks,
            "skippedPages": skipped_pages,
        }) + "\n"

    return StreamingResponse(
        generate(),
        media_type="application/x-ndjson",
        headers={
            # Ask reverse proxies not to buffer or cache the stream so events
            # reach the client as they are produced.
            "X-Accel-Buffering": "no",
            "Cache-Control": "no-cache",
        },
    )
|
|
|
|
|
|
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from a presigned URL and convert each page to a PIL Image.

    The whole response body is buffered in memory before parsing. The HTTP
    client uses a 300 second timeout.

    Args:
        url: Location of the PDF to fetch with a plain GET request.

    Returns:
        One rendered image per PDF page, in page order.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).

    NOTE(review): rendering runs synchronously inside this coroutine, so the
    event loop is blocked while pages are rasterised — confirm whether that is
    acceptable for /health latency on large documents.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
        response = await client.get(url)
        response.raise_for_status()

    pdf = pdfium.PdfDocument(io.BytesIO(response.content))
    images = []

    try:
        for page_idx in range(len(pdf)):
            page = pdf[page_idx]
            try:
                # Render at 200 DPI — balances OCR quality vs memory usage
                # (Surya 0.17 models use ~5GB idle; 300 DPI causes OOM on multi-page docs)
                bitmap = page.render(scale=200 / 72)
                images.append(bitmap.to_pil())
            finally:
                # Release the native page handle as soon as it is rendered;
                # the bitmap is deliberately left open because the PIL image
                # may share its pixel buffer.
                page.close()
    finally:
        # Always release the native document, even if a page fails to render.
        pdf.close()

    return images
|