"""OCR microservice — FastAPI app with Surya and Kraken engine support.""" import io import logging from contextlib import asynccontextmanager import httpx import pypdfium2 as pdfium from fastapi import FastAPI, HTTPException from PIL import Image from confidence import apply_confidence_markers from engines import kraken as kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) _models_ready = False @asynccontextmanager async def lifespan(app: FastAPI): """Load all OCR models at startup before accepting requests.""" global _models_ready logger.info("Loading OCR models at startup...") surya_engine.load_models() kraken_engine.load_models() _models_ready = True logger.info("All OCR models loaded — ready to accept requests") yield logger.info("Shutting down OCR service") app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan) @app.get("/health") def health(): """Health endpoint — returns 200 only after models are loaded.""" if not _models_ready: raise HTTPException(status_code=503, detail="Models not loaded yet") return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()} @app.post("/ocr", response_model=list[OcrBlock]) async def run_ocr(request: OcrRequest): """Run OCR on a PDF document. Downloads the PDF from the provided URL, converts pages to images, and runs the appropriate OCR engine based on scriptType. """ if not _models_ready: raise HTTPException(status_code=503, detail="Models not loaded yet") images = await _download_and_convert_pdf(request.pdf_url) script_type = request.script_type.upper() if script_type == "HANDWRITING_KURRENT": if not kraken_engine.is_available(): raise HTTPException( status_code=400, detail="Kraken model not available — cannot process Kurrent script", ) blocks = kraken_engine.extract_blocks(images, request.language) else: # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = surya_engine.extract_blocks(images, request.language) for block in blocks: if block.get("words"): block["text"] = apply_confidence_markers(block["words"]) block.pop("words", None) return [OcrBlock(**b) for b in blocks] async def _download_and_convert_pdf(url: str) -> list[Image.Image]: """Download a PDF from URL and convert each page to a PIL Image.""" async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client: response = await client.get(url) response.raise_for_status() pdf = pdfium.PdfDocument(io.BytesIO(response.content)) images = [] for page_idx in range(len(pdf)): page = pdf[page_idx] # Render at 300 DPI for good OCR quality bitmap = page.render(scale=300 / 72) pil_image = bitmap.to_pil() images.append(pil_image) return images