feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose

Python microservice (ocr-service/): - FastAPI app with /ocr and /health endpoints - Surya engine: transformer-based OCR for typewritten/modern handwriting - Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python polygon-to-quad approximation (gift wrapping + rotating calipers) - Eager model loading at startup via lifespan context manager - PDF download via httpx, page rendering via pypdfium2 at 300 DPI Java RestClientOcrClient: - Implements OcrClient + OcrHealthClient interfaces - Calls Python service via Spring RestClient - Health check with graceful fallback Docker Compose: - New ocr-service container (mem_limit 6g, no host ports) - Health check with start_period 60s for model loading - ocr_models volume for Kraken model files - Backend depends on ocr-service health Refs #226, #227 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 15:26:40 +02:00
parent aea46c5fd0
commit 6737bd6db5
9 changed files with 500 additions and 0 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java
@@ -0,0 +1,73 @@
 package org.raddatz.familienarchiv.service;
 import com.fasterxml.jackson.annotation.JsonProperty;
 import lombok.extern.slf4j.Slf4j;
 import org.raddatz.familienarchiv.model.ScriptType;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.core.ParameterizedTypeReference;
 import org.springframework.http.MediaType;
 import org.springframework.stereotype.Component;
 import org.springframework.web.client.RestClient;
 import java.util.List;
 import java.util.Map;
@Component
@Slf4j
 public class RestClientOcrClient implements OcrClient, OcrHealthClient {
    private final RestClient restClient;
    public RestClientOcrClient(@Value("${app.ocr.base-url:http://ocr-service:8000}") String baseUrl) {
        this.restClient = RestClient.builder().baseUrl(baseUrl).build();
    }
    @Override
    public List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType) {
        Map<String, String> body = Map.of(
                "pdfUrl", pdfUrl,
                "scriptType", scriptType.name(),
                "language", "de");
        List<OcrBlockJson> response = restClient.post()
                .uri("/ocr")
                .contentType(MediaType.APPLICATION_JSON)
                .body(body)
                .retrieve()
                .body(new ParameterizedTypeReference<>() {});
        if (response == null) return List.of();
        return response.stream()
                .map(OcrBlockJson::toResult)
                .toList();
    }
    @Override
    public boolean isHealthy() {
        try {
            restClient.get()
                    .uri("/health")
                    .retrieve()
                    .toBodilessEntity();
            return true;
        } catch (Exception e) {
            log.warn("OCR service health check failed: {}", e.getMessage());
            return false;
        }
    }
    record OcrBlockJson(
            @JsonProperty("pageNumber") int pageNumber,
            double x,
            double y,
            double width,
            double height,
            List<List<Double>> polygon,
            String text
    ) {
        OcrBlockResult toResult() {
            return new OcrBlockResult(pageNumber, x, y, width, height, polygon, text);
        }
    }
 }
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -71,6 +71,28 @@ services:
    networks:
      - archive-net
  # --- OCR: Python microservice (Surya + Kraken) ---
  # No `ports:` mapping is declared, so the service is only attached to archive-net.
  ocr-service:
    build:
      context: ./ocr-service
      dockerfile: Dockerfile
    container_name: archive-ocr
    restart: unless-stopped
    # memswap_limit is set to the same value as mem_limit
    # (NOTE(review): per the Docker docs this should leave no swap allowance — confirm).
    mem_limit: 6g
    memswap_limit: 6g
    volumes:
      # Named volume that holds the Kraken model file referenced below.
      - ocr_models:/app/models
    environment:
      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
    networks:
      - archive-net
    healthcheck:
      # /health answers 503 until startup model loading has finished, so `curl -f` fails until then.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 10s
      timeout: 5s
      retries: 12
      start_period: 60s
  # --- Backend: Spring Boot ---
  backend:
    build:
@@ -89,6 +111,8 @@ services:
        condition: service_healthy
      mailpit:
        condition: service_started
      ocr-service:
        condition: service_healthy
    environment:
      SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
      SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
@@ -109,6 +133,8 @@ services:
      # Mailpit needs no auth or STARTTLS; production SMTP overrides these via .env
      SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
      SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
      APP_OCR_BASE_URL: http://ocr-service:8000
      APP_S3_INTERNAL_URL: http://minio:9000
    ports:
      - "${PORT_BACKEND}:8080"
    networks:
@@ -155,3 +181,4 @@ networks:
 volumes:
  frontend_node_modules:
  maven_cache:
  ocr_models:
--- a/ocr-service/Dockerfile
+++ b/ocr-service/Dockerfile
@@ -0,0 +1,23 @@
 FROM python:3.11-slim
 WORKDIR /app
 # curl for healthcheck; libgomp1 for PyTorch CPU threading
 RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*
 # PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved)
 # NOTE(review): this only holds if nothing in requirements.txt forces pip to replace torch 2.5.1 — confirm.
 RUN pip install --no-cache-dir \
    torch==2.5.1 \
    --index-url https://download.pytorch.org/whl/cpu
 # Requirements are copied and installed before the application source.
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 # Same port that the uvicorn command below binds to.
 EXPOSE 8000
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/ocr-service/engines/__init__.py
+++ b/ocr-service/engines/__init__.py
--- a/ocr-service/engines/kraken.py
+++ b/ocr-service/engines/kraken.py
@@ -0,0 +1,192 @@
 """Kraken OCR engine wrapper — historical HTR model support for Kurrent/Suetterlin."""
 import logging
 import os
 logger = logging.getLogger(__name__)
 # Loaded Kraken model; stays None until load_models() finds and loads the model file.
 _model = None
 # Model file location; the KRAKEN_MODEL_PATH environment variable overrides the default.
 _model_path = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
 def load_models():
    """Load the Kraken model from ``_model_path`` into the module-level ``_model``.

    When the model file does not exist a warning is logged and ``_model`` is
    left untouched, so ``is_available()`` keeps reporting False.
    """
    global _model
    model_file_present = os.path.exists(_model_path)
    if model_file_present:
        logger.info("Loading Kraken model from %s...", _model_path)
        # Imported here so the kraken package is only needed when a model file exists.
        from kraken.lib import models as kraken_models
        _model = kraken_models.load_any(_model_path)
        logger.info("Kraken model loaded successfully")
    else:
        logger.warning("Kraken model not found at %s — Kurrent OCR will not be available", _model_path)
 def is_available() -> bool:
    """Return True once a Kraken model has been loaded into ``_model``."""
    model_loaded = _model is not None
    return model_loaded
 def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Segment and recognize every page image with Kraken.

    Args:
        images: Page images; each must expose ``.size`` as ``(width, height)``.
        language: Not used by this engine.

    Returns:
        One dict per recognized line with the keys ``pageNumber`` (zero-based
        index into ``images``), ``x``, ``y``, ``width``, ``height``,
        ``polygon`` and ``text``. Box and polygon values are pixel
        coordinates divided by the page width/height.

    Raises:
        RuntimeError: If no Kraken model has been loaded.
    """
    from kraken import blla, rpred

    if _model is None:
        raise RuntimeError("Kraken model is not loaded")

    blocks = []
    for page_number, page in enumerate(images):
        page_w, page_h = page.size
        segmentation = blla.segment(page)
        for record in rpred.rpred(_model, page, segmentation):
            outline = record.cuts if hasattr(record, "cuts") else []
            if outline:
                # NOTE(review): assumes record.cuts is a flat sequence of (x, y)
                # points outlining the line — TODO confirm against the Kraken
                # record API; if it holds one polygon per character, the
                # min/max below would compare points instead of numbers.
                xs = [pt[0] for pt in outline]
                ys = [pt[1] for pt in outline]
                left, top = min(xs), min(ys)
                right, bottom = max(xs), max(ys)
                quad = _approximate_to_quad(outline, page_w, page_h)
            else:
                # No outline: fall back to the baseline, padded by 5 px above and below.
                xs = [pt[0] for pt in record.line]
                ys = [pt[1] for pt in record.line]
                left, top = min(xs), min(ys) - 5
                right, bottom = max(xs), max(ys) + 5
                quad = None
            blocks.append({
                "pageNumber": page_number,
                "x": left / page_w,
                "y": top / page_h,
                "width": (right - left) / page_w,
                "height": (bottom - top) / page_h,
                "polygon": quad,
                "text": record.prediction,
            })
    return blocks
 def _approximate_to_quad(points: list[tuple], page_w: float, page_h: float) -> list[list[float]] | None:
    """Approximate a polygon to a 4-point quadrilateral using the minimum bounding rectangle.
    Uses gift-wrapping (Jarvis march) for convex hull, then rotating calipers
    for the minimum area bounding rectangle. Pure Python, no scipy/numpy.
    """
    if len(points) < 3:
        return None
    try:
        hull = _convex_hull(points)
        if len(hull) < 3:
            return None
        rect = _min_bounding_rect(hull)
        # Normalize to [0, 1]
        return [[p[0] / page_w, p[1] / page_h] for p in rect]
    except Exception:
        logger.debug("Failed to approximate polygon to quad, returning None")
        return None
 def _convex_hull(points: list[tuple]) -> list[tuple]:
    """Jarvis march (gift wrapping) algorithm for 2D convex hull."""
    pts = list(set(points))
    if len(pts) < 3:
        return pts
    # Start from leftmost point
    start = min(pts, key=lambda p: (p[0], p[1]))
    hull = []
    current = start
    while True:
        hull.append(current)
        candidate = pts[0]
        for p in pts[1:]:
            if candidate == current:
                candidate = p
                continue
            cross = _cross(current, candidate, p)
            if cross < 0:
                candidate = p
            elif cross == 0:
                # Collinear — pick the farther point
                if _dist_sq(current, p) > _dist_sq(current, candidate):
                    candidate = p
        current = candidate
        if current == start:
            break
    return hull
 def _min_bounding_rect(hull: list[tuple]) -> list[tuple]:
    """Find the minimum area bounding rectangle of a convex hull using rotating calipers."""
    n = len(hull)
    if n < 2:
        return hull
    min_area = float("inf")
    best_rect = None
    for i in range(n):
        # Edge vector
        edge_x = hull[(i + 1) % n][0] - hull[i][0]
        edge_y = hull[(i + 1) % n][1] - hull[i][1]
        edge_len = (edge_x ** 2 + edge_y ** 2) ** 0.5
        if edge_len == 0:
            continue
        # Unit vectors along and perpendicular to the edge
        ux, uy = edge_x / edge_len, edge_y / edge_len
        vx, vy = -uy, ux
        # Project all hull points onto the edge coordinate system
        projs_u = [p[0] * ux + p[1] * uy for p in hull]
        projs_v = [p[0] * vx + p[1] * vy for p in hull]
        min_u, max_u = min(projs_u), max(projs_u)
        min_v, max_v = min(projs_v), max(projs_v)
        area = (max_u - min_u) * (max_v - min_v)
        if area < min_area:
            min_area = area
            # Reconstruct 4 corners in original coordinates
            best_rect = [
                (min_u * ux + min_v * vx, min_u * uy + min_v * vy),
                (max_u * ux + min_v * vx, max_u * uy + min_v * vy),
                (max_u * ux + max_v * vx, max_u * uy + max_v * vy),
                (min_u * ux + max_v * vx, min_u * uy + max_v * vy),
            ]
    return best_rect if best_rect else hull[:4]
 def _cross(o: tuple, a: tuple, b: tuple) -> float:
    return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
 def _dist_sq(a: tuple, b: tuple) -> float:
    return (a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2
--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -0,0 +1,66 @@
 """Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
 import logging
 logger = logging.getLogger(__name__)
 # Populated by load_models(); every handle is None until that call has run.
 _recognition_model = None
 _recognition_processor = None
 _detection_model = None
 _detection_processor = None
 def load_models():
    """Load the Surya detection and recognition models and their processors.

    The four module-level handles are filled in this order: detection model,
    detection processor, recognition model, recognition processor.
    """
    global _recognition_model, _recognition_processor, _detection_model, _detection_processor

    logger.info("Loading Surya models...")

    # Imported inside the function so importing this module does not pull in Surya.
    from surya.model.detection.model import load_model as build_detector
    from surya.model.detection.model import load_processor as build_detector_processor
    from surya.model.recognition.model import load_model as build_recognizer
    from surya.model.recognition.processor import load_processor as build_recognizer_processor

    _detection_model = build_detector()
    _detection_processor = build_detector_processor()
    _recognition_model = build_recognizer()
    _recognition_processor = build_recognizer_processor()

    logger.info("Surya models loaded successfully")
 def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Each page goes through Surya's ``run_ocr`` pipeline on its own, so only
    one page is processed at a time.

    Args:
        images: Page images; each must expose ``.size`` as ``(width, height)``.
        language: Language code handed to Surya for every page.

    Returns:
        A flat list of block dicts with the keys ``pageNumber`` (zero-based
        index into ``images``), ``x``, ``y``, ``width``, ``height``,
        ``polygon`` (always None for this engine) and ``text``. Box values are
        pixel coordinates divided by the page width/height.

    Raises:
        RuntimeError: If ``load_models()`` has not populated the models yet.
    """
    from surya.ocr import run_ocr

    handles = (_detection_model, _detection_processor, _recognition_model, _recognition_processor)
    if any(handle is None for handle in handles):
        # Same contract as the Kraken engine: fail loudly instead of passing None into the library.
        raise RuntimeError("Surya models are not loaded")

    all_blocks = []
    for page_idx, image in enumerate(images):
        page_w, page_h = image.size
        # run_ocr takes one language list per image and performs detection,
        # line slicing and recognition in a single call.
        predictions = run_ocr(
            [image],
            [[language]],
            _detection_model,
            _detection_processor,
            _recognition_model,
            _recognition_processor,
        )
        for line in predictions[0].text_lines:
            x1, y1, x2, y2 = line.bbox  # [x1, y1, x2, y2] in pixel coordinates
            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": None,
                "text": line.text,
            })
    return all_blocks
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -0,0 +1,93 @@
 """OCR microservice — FastAPI app with Surya and Kraken engine support."""
 import asyncio
 import io
 import logging
 from contextlib import asynccontextmanager
 import httpx
 import pypdfium2 as pdfium
 from fastapi import FastAPI, HTTPException
 from PIL import Image
 from engines import kraken as kraken_engine
 from engines import surya as surya_engine
 from models import OcrBlock, OcrRequest
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Set to True by lifespan() after both engines' load_models() calls have returned;
 # /health and /ocr answer 503 while it is still False.
 _models_ready = False
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Load all OCR models at startup before accepting requests.

    Surya is loaded first, then Kraken; ``_models_ready`` is only set once
    both calls have returned. The code after ``yield`` runs at shutdown and
    only logs.
    """
    global _models_ready
    logger.info("Loading OCR models at startup...")
    # Both loaders are synchronous calls, so startup waits for them to finish.
    surya_engine.load_models()
    kraken_engine.load_models()
    _models_ready = True
    logger.info("All OCR models loaded — ready to accept requests")
    yield
    logger.info("Shutting down OCR service")
 # Application instance; lifespan() is registered as its startup/shutdown handler.
 app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
@app.get("/health")
def health():
    """Health endpoint — 200 with engine availability once models are loaded, 503 before."""
    if _models_ready:
        # Surya is reported as True unconditionally; Kraken depends on its model file.
        return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()}
    raise HTTPException(status_code=503, detail="Models not loaded yet")
# Serializes engine runs: inference now happens on a worker thread, and this
# lock keeps the previous one-job-at-a-time behavior so concurrent requests do
# not multiply memory use.
_ocr_lock = asyncio.Lock()


@app.post("/ocr", response_model=list[OcrBlock])
async def run_ocr(request: OcrRequest):
    """Run OCR on a PDF document.

    Downloads the PDF from the provided URL, converts pages to images,
    and runs the appropriate OCR engine based on scriptType.

    Raises:
        HTTPException: 503 while models are still loading; 400 when Kurrent
            script is requested but no Kraken model is available.
    """
    if not _models_ready:
        raise HTTPException(status_code=503, detail="Models not loaded yet")

    script_type = request.script_type.upper()
    if script_type == "HANDWRITING_KURRENT":
        # Checked before the download so an unusable request fails fast
        # instead of fetching and rendering the whole PDF first.
        if not kraken_engine.is_available():
            raise HTTPException(
                status_code=400,
                detail="Kraken model not available — cannot process Kurrent script",
            )
        engine = kraken_engine
    else:
        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
        engine = surya_engine

    images = await _download_and_convert_pdf(request.pdf_url)

    # extract_blocks is synchronous and CPU-bound; calling it directly would
    # block the event loop and leave /health unanswered for the whole job.
    async with _ocr_lock:
        blocks = await asyncio.to_thread(engine.extract_blocks, images, request.language)
    return [OcrBlock(**b) for b in blocks]
 async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
    """Download a PDF from URL and convert each page to a PIL Image.

    Args:
        url: Location of the PDF; fetched with a 300-second timeout.

    Returns:
        One image per page, in page order, rendered at 300 DPI.

    Raises:
        httpx.HTTPStatusError: If the download answers with an error status.
    """
    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
        response = await client.get(url)
        response.raise_for_status()

    pdf = pdfium.PdfDocument(io.BytesIO(response.content))
    try:
        images = []
        for page_idx in range(len(pdf)):
            page = pdf[page_idx]
            try:
                # Render at 300 DPI for good OCR quality (PDF user space is 72 units per inch)
                bitmap = page.render(scale=300 / 72)
                images.append(bitmap.to_pil())
            finally:
                # Release the page handle as soon as it is rendered, even on failure.
                page.close()
        return images
    finally:
        # The document was previously left for garbage collection; close it explicitly.
        pdf.close()
--- a/ocr-service/models.py
+++ b/ocr-service/models.py
@@ -0,0 +1,20 @@
 from pydantic import BaseModel, Field
 class OcrRequest(BaseModel):
    # Request body of POST /ocr. The JSON keys are the camelCase aliases.
    # Required: location of the PDF to download.
    pdf_url: str = Field(..., alias="pdfUrl")
    # Optional, defaults to "UNKNOWN"; the endpoint upper-cases it before choosing an engine.
    script_type: str = Field("UNKNOWN", alias="scriptType")
    # Language code forwarded to the selected engine.
    language: str = "de"
 class OcrBlock(BaseModel):
    # One recognized text block as returned by POST /ocr.
    # Page index of the block; the engines fill it from enumerate(), so it starts at 0.
    page_number: int = Field(..., alias="pageNumber")
    # Bounding box; the engines divide pixel values by the page width/height.
    x: float
    y: float
    width: float
    height: float
    # Optional list of [x, y] corner points; None when the engine supplies no polygon.
    polygon: list[list[float]] | None = None
    text: str
    # NOTE(review): `populate_by_name` is the pydantic v2 option name; v2 prefers
    # `model_config` over a nested Config class — confirm the installed version.
    class Config:
        populate_by_name = True
--- a/ocr-service/requirements.txt
+++ b/ocr-service/requirements.txt
@@ -0,0 +1,6 @@
 fastapi[standard]==0.115.6
 surya-ocr==0.6.3
 kraken==5.2.9
 pillow==11.1.0
 pypdfium2==4.30.0
 httpx==0.28.1