Files
familienarchiv/ocr-service/main.py
Marcel c74539b04b
Some checks failed
CI / Unit & Component Tests (push) Failing after 2s
CI / Backend Unit Tests (push) Failing after 2s
CI / Unit & Component Tests (pull_request) Failing after 2s
CI / Backend Unit Tests (pull_request) Failing after 1s
feat(ocr): auto-insert [unleserlich] markers for low-confidence words
New confidence.py module with two functions:
- apply_confidence_markers(): replaces words below threshold with
  [unleserlich], collapses adjacent markers into one
- words_from_characters(): reconstructs word-level confidence from
  Kraken's character-level data

Surya 0.17 provides native word-level confidence via line.words.
Kraken 7.0 provides per-character confidences via record.confidences.
Both engines now pass word+confidence data through main.py, which
applies the marker post-processing before returning the API response.

Threshold configurable via OCR_CONFIDENCE_THRESHOLD env var (default 0.3).
Frontend already renders [unleserlich] markers via transcriptionMarkers.ts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 19:16:17 +02:00

100 lines
3.0 KiB
Python

"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
import io
import logging
from contextlib import asynccontextmanager
import httpx
import pypdfium2 as pdfium
from fastapi import FastAPI, HTTPException
from PIL import Image
from confidence import apply_confidence_markers
from engines import kraken as kraken_engine
from engines import surya as surya_engine
from models import OcrBlock, OcrRequest
# Configure logging once at import time; INFO level so the model-loading
# messages emitted during startup are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Readiness flag: flipped to True by lifespan() after both engines have
# loaded. Until then /health and /ocr answer with HTTP 503.
_models_ready = False
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load every OCR engine, then serve.

    Startup loads the Surya models first and the Kraken models second, and
    only afterwards sets the module-level ``_models_ready`` flag that the
    ``/health`` and ``/ocr`` endpoints check. The code after ``yield`` runs
    when the application shuts down.
    """
    global _models_ready

    logger.info("Loading OCR models at startup...")
    # Surya is loaded before Kraken.
    for engine in (surya_engine, kraken_engine):
        engine.load_models()
    _models_ready = True
    logger.info("All OCR models loaded — ready to accept requests")

    yield

    logger.info("Shutting down OCR service")
# The ASGI application. The lifespan hook loads the OCR models at startup.
app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
@app.get("/health")
def health():
    """Readiness probe.

    Returns:
        A status dict once the models have been loaded. ``kraken`` reflects
        ``kraken_engine.is_available()``; ``surya`` is always ``True``.

    Raises:
        HTTPException: 503 while the models are still loading.
    """
    if _models_ready:
        return {
            "status": "ok",
            "surya": True,
            "kraken": kraken_engine.is_available(),
        }
    raise HTTPException(status_code=503, detail="Models not loaded yet")
@app.post("/ocr", response_model=list[OcrBlock])
async def run_ocr(request: OcrRequest):
    """Run OCR on the PDF referenced by the request.

    The PDF at ``request.pdf_url`` is downloaded and rendered to page images,
    which are handed to the engine selected by ``request.script_type``
    (compared case-insensitively): ``HANDWRITING_KURRENT`` uses Kraken,
    every other value uses Surya. For each returned block that carries
    non-empty ``words`` data, its ``text`` is replaced by the result of
    ``apply_confidence_markers``; the ``words`` key is always removed before
    the block is converted to an ``OcrBlock``.

    Raises:
        HTTPException: 503 if the models are not loaded yet; 400 if Kurrent
            is requested but the Kraken model is not available.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    page_images = await _download_and_convert_pdf(request.pdf_url)

    wants_kurrent = request.script_type.upper() == "HANDWRITING_KURRENT"
    if wants_kurrent and not kraken_engine.is_available():
        raise HTTPException(
            status_code=400,
            detail="Kraken model not available — cannot process Kurrent script",
        )

    # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
    engine = kraken_engine if wants_kurrent else surya_engine
    # NOTE(review): extract_blocks is called synchronously inside this
    # coroutine; if it is long-running it will occupy the event loop for the
    # duration — confirm this is acceptable for /health probes.
    raw_blocks = engine.extract_blocks(page_images, request.language)

    for entry in raw_blocks:
        # "words" is internal engine output and is not passed to OcrBlock.
        word_data = entry.pop("words", None)
        if word_data:
            entry["text"] = apply_confidence_markers(word_data)

    return [OcrBlock(**entry) for entry in raw_blocks]
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from ``url`` and render every page to a PIL image.

    Args:
        url: Location of the PDF; fetched with a single GET request using a
            300-second timeout.

    Returns:
        One image per page, in page order, rendered at 300 DPI.

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
        Other download or PDF-parsing errors propagate unchanged.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
        response = await client.get(url)
        response.raise_for_status()

    # The body is fully read at this point, so the HTTP client can be
    # released before the (potentially slow) rendering starts.
    pdf = pdfium.PdfDocument(io.BytesIO(response.content))
    # PDF user space is 72 units per inch, so this factor yields 300 DPI,
    # which gives good OCR quality.
    scale = 300 / 72
    images = []
    try:
        for page_idx in range(len(pdf)):
            page = pdf[page_idx]
            try:
                bitmap = page.render(scale=scale)
                # The bitmap is deliberately NOT closed: depending on the
                # pixel format the PIL image may share the bitmap's buffer.
                images.append(bitmap.to_pil())
            finally:
                # Release the native page handle even if rendering fails.
                page.close()
    finally:
        # Release the native document handle deterministically instead of
        # waiting for garbage collection.
        pdf.close()
    return images