Files
familienarchiv/ocr-service/main.py
Marcel 70689b8f7b
Some checks failed
CI / Unit & Component Tests (pull_request) Failing after 1s
CI / Backend Unit Tests (pull_request) Failing after 1s
CI / Unit & Component Tests (push) Failing after 2s
CI / Backend Unit Tests (push) Failing after 0s
feat(ocr): add SSRF protection for PDF URL downloads
Validates PDF download URLs against an ALLOWED_PDF_HOSTS allowlist
(default: minio,localhost,127.0.0.1) and disables redirect following
to prevent redirect-based SSRF.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 12:29:42 +02:00

193 lines
6.3 KiB
Python

"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
import asyncio
import io
import json
import logging
import os
from contextlib import asynccontextmanager
from urllib.parse import urlparse
import httpx
import pypdfium2 as pdfium
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from PIL import Image
from confidence import apply_confidence_markers, get_threshold
from engines import kraken as kraken_engine
from engines import surya as surya_engine
from models import OcrBlock, OcrRequest
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
_models_ready = False
ALLOWED_PDF_HOSTS = set(
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
)
def _validate_url(url: str) -> None:
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
parsed = urlparse(url)
hostname = parsed.hostname or ""
if hostname not in ALLOWED_PDF_HOSTS:
raise HTTPException(status_code=400, detail=f"PDF host not allowed: {hostname}")
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Load lightweight models at startup. Surya loads lazily on first request."""
global _models_ready
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
kraken_engine.load_models()
_models_ready = True
logger.info("Startup complete — ready to accept requests")
yield
logger.info("Shutting down OCR service")
app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
@app.get("/health")
def health():
"""Health endpoint — returns 200 only after models are loaded."""
if not _models_ready:
raise HTTPException(status_code=503, detail="Models not loaded yet")
return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()}
@app.post("/ocr", response_model=list[OcrBlock])
async def run_ocr(request: OcrRequest):
"""Run OCR on a PDF document.
Downloads the PDF from the provided URL, converts pages to images,
and runs the appropriate OCR engine based on scriptType.
OCR engines run in a thread pool so the event loop stays free for /health.
"""
if not _models_ready:
raise HTTPException(status_code=503, detail="Models not loaded yet")
images = await _download_and_convert_pdf(request.pdfUrl)
script_type = request.scriptType.upper()
if script_type == "HANDWRITING_KURRENT":
if not kraken_engine.is_available():
raise HTTPException(
status_code=400,
detail="Kraken model not available — cannot process Kurrent script",
)
blocks = await asyncio.to_thread(kraken_engine.extract_blocks, images, request.language)
else:
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language)
threshold = get_threshold(script_type)
for block in blocks:
if block.get("words"):
block["text"] = apply_confidence_markers(block["words"], threshold)
block.pop("words", None)
return [OcrBlock(**b) for b in blocks]
@app.post("/ocr/stream")
async def run_ocr_stream(request: OcrRequest):
"""Run OCR on a PDF with NDJSON streaming — one JSON line per completed page."""
if not _models_ready:
raise HTTPException(status_code=503, detail="Models not loaded yet")
images = await _download_and_convert_pdf(request.pdfUrl)
script_type = request.scriptType.upper()
threshold = get_threshold(script_type)
use_kraken = script_type == "HANDWRITING_KURRENT"
if use_kraken and not kraken_engine.is_available():
raise HTTPException(
status_code=400,
detail="Kraken model not available — cannot process Kurrent script",
)
async def generate():
total_pages = len(images)
yield json.dumps({"type": "start", "totalPages": total_pages}) + "\n"
total_blocks = 0
skipped_pages = 0
for page_idx, image in enumerate(images, start=1):
try:
engine = kraken_engine if use_kraken else surya_engine
blocks = await asyncio.to_thread(
engine.extract_page_blocks, image, page_idx, request.language
)
for block in blocks:
if block.get("words"):
block["text"] = apply_confidence_markers(block["words"], threshold)
block.pop("words", None)
total_blocks += len(blocks)
yield json.dumps({
"type": "page",
"pageNumber": page_idx,
"blocks": blocks,
}) + "\n"
except Exception:
logger.exception("OCR failed on page %d", page_idx)
skipped_pages += 1
yield json.dumps({
"type": "error",
"pageNumber": page_idx,
"message": f"OCR processing failed on page {page_idx}",
}) + "\n"
finally:
del image
yield json.dumps({
"type": "done",
"totalBlocks": total_blocks,
"skippedPages": skipped_pages,
}) + "\n"
return StreamingResponse(
generate(),
media_type="application/x-ndjson",
headers={
"X-Accel-Buffering": "no",
"Cache-Control": "no-cache",
},
)
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
"""Download a PDF from a presigned URL and convert each page to a PIL Image."""
_validate_url(url)
async with httpx.AsyncClient(
timeout=httpx.Timeout(300.0), follow_redirects=False
) as client:
response = await client.get(url)
response.raise_for_status()
pdf = pdfium.PdfDocument(io.BytesIO(response.content))
images = []
for page_idx in range(len(pdf)):
page = pdf[page_idx]
# Render at 200 DPI — balances OCR quality vs memory usage
# (Surya 0.17 models use ~5GB idle; 300 DPI causes OOM on multi-page docs)
bitmap = page.render(scale=200 / 72)
pil_image = bitmap.to_pil()
images.append(pil_image)
return images