feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose
Python microservice (ocr-service/): - FastAPI app with /ocr and /health endpoints - Surya engine: transformer-based OCR for typewritten/modern handwriting - Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python polygon-to-quad approximation (gift wrapping + rotating calipers) - Eager model loading at startup via lifespan context manager - PDF download via httpx, page rendering via pypdfium2 at 300 DPI Java RestClientOcrClient: - Implements OcrClient + OcrHealthClient interfaces - Calls Python service via Spring RestClient - Health check with graceful fallback Docker Compose: - New ocr-service container (mem_limit 6g, no host ports) - Health check with start_period 60s for model loading - ocr_models volume for Kraken model files - Backend depends on ocr-service health Refs #226, #227 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
93
ocr-service/main.py
Normal file
93
ocr-service/main.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
|
||||
|
||||
import asyncio
import io
import logging
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager

import httpx
import pypdfium2 as pdfium
from fastapi import FastAPI, HTTPException
from PIL import Image

from engines import kraken as kraken_engine
from engines import surya as surya_engine
from models import OcrBlock, OcrRequest
|
||||
|
||||
# Configure root logging once at import time so INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Readiness flag: lifespan() sets it to True once both engines have loaded.
# While it is False, /health and /ocr answer with HTTP 503.
_models_ready = False
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load all OCR models at startup and track readiness for the app's lifetime.

    Both engines are loaded eagerly before the ``yield``; only then is the
    module-level ``_models_ready`` flag set, which the ``/health`` and ``/ocr``
    handlers check before doing any work.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. Control returns to the framework while the service is running.
    """
    global _models_ready

    logger.info("Loading OCR models at startup...")
    surya_engine.load_models()
    kraken_engine.load_models()
    _models_ready = True
    logger.info("All OCR models loaded — ready to accept requests")

    try:
        yield
    finally:
        # Run the shutdown steps even when the lifespan is left through an
        # exception or task cancellation, and stop advertising readiness.
        _models_ready = False
        logger.info("Shutting down OCR service")
|
||||
|
||||
|
||||
# ASGI application object; the lifespan hook loads the OCR models at startup.
app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Report service readiness.

    Once startup model loading has finished, responds with a status payload
    that includes whether the Kraken engine is available. Until then the
    request is rejected with HTTP 503.
    """
    if _models_ready:
        kraken_ok = kraken_engine.is_available()
        return {"status": "ok", "surya": True, "kraken": kraken_ok}

    raise HTTPException(status_code=503, detail="Models not loaded yet")
|
||||
|
||||
|
||||
# Single worker thread: inference runs off the event loop (so /health keeps
# answering during long OCR jobs) but still strictly one job at a time, as it
# did when the handler called the engine inline.
_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inference")


@app.post("/ocr", response_model=list[OcrBlock])
async def run_ocr(request: OcrRequest):
    """Run OCR on a PDF document.

    Picks the engine from ``request.script_type`` (compared case-insensitively),
    downloads the PDF from ``request.pdf_url``, renders its pages to images and
    runs the engine on them.

    Args:
        request: OCR job; this handler reads ``pdf_url``, ``script_type`` and
            ``language``.

    Returns:
        One ``OcrBlock`` built from each block dict the engine returns.

    Raises:
        HTTPException: 503 while models are still loading; 400 when Kurrent
            script is requested but the Kraken model is unavailable; 502 when
            the PDF cannot be downloaded.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    script_type = request.script_type.upper()

    if script_type == "HANDWRITING_KURRENT":
        # Fail fast: reject before paying for the download and page rendering.
        if not kraken_engine.is_available():
            raise HTTPException(
                status_code=400,
                detail="Kraken model not available — cannot process Kurrent script",
            )
        engine = kraken_engine
    else:
        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
        engine = surya_engine

    try:
        images = await _download_and_convert_pdf(request.pdf_url)
    except httpx.HTTPError as exc:
        # Covers transport errors and non-2xx responses from the PDF host;
        # report them as an upstream failure instead of an unhandled 500.
        logger.warning("PDF download failed: %s", exc)
        raise HTTPException(
            status_code=502,
            detail="Could not download PDF from the provided URL",
        ) from exc

    # extract_blocks is a synchronous call; awaiting it through the executor
    # keeps the event loop free to serve other requests in the meantime.
    loop = asyncio.get_running_loop()
    blocks = await loop.run_in_executor(
        _OCR_EXECUTOR, engine.extract_blocks, images, request.language
    )

    return [OcrBlock(**b) for b in blocks]
|
||||
|
||||
|
||||
def _render_pdf_pages(pdf_bytes: bytes) -> list[Image.Image]:
    """Render every page of an in-memory PDF to a PIL image at 300 DPI."""
    pdf = pdfium.PdfDocument(io.BytesIO(pdf_bytes))
    try:
        images = []
        for page_idx in range(len(pdf)):
            page = pdf[page_idx]
            try:
                # Render at 300 DPI for good OCR quality (PDF units are 1/72 inch)
                bitmap = page.render(scale=300 / 72)
                # NOTE(review): the bitmap is left to garbage collection on
                # purpose; the PIL image may still reference its buffer.
                images.append(bitmap.to_pil())
            finally:
                # Release the native page handle now instead of waiting for GC.
                page.close()
        return images
    finally:
        pdf.close()


async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from URL and convert each page to a PIL Image.

    Args:
        url: Location of the PDF; fetched with a 300 second timeout.

    Returns:
        One PIL image per page, in page order, rendered at 300 DPI.

    Raises:
        httpx.HTTPError: On transport failures or a non-2xx response.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
        response = await client.get(url)
        response.raise_for_status()

    # NOTE(review): rendering stays on the calling thread rather than a worker
    # pool. The pypdfium2 docs state PDFium is not thread-safe, so concurrent
    # rendering from several threads must be avoided.
    return _render_pdf_pages(response.content)
|
||||
Reference in New Issue
Block a user