feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose

Python microservice (ocr-service/):
- FastAPI app with /ocr and /health endpoints
- Surya engine: transformer-based OCR for typewritten/modern handwriting
- Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python
  polygon-to-quad approximation (gift wrapping + rotating calipers)
- Eager model loading at startup via lifespan context manager
- PDF download via httpx, page rendering via pypdfium2 at 300 DPI

Java RestClientOcrClient:
- Implements OcrClient + OcrHealthClient interfaces
- Calls Python service via Spring RestClient
- Health check with graceful fallback

Docker Compose:
- New ocr-service container (mem_limit 6g, no host ports)
- Health check with start_period 60s for model loading
- ocr_models volume for Kraken model files
- Backend depends on ocr-service health

Refs #226, #227

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 15:26:40 +02:00
parent aea46c5fd0
commit 6737bd6db5
9 changed files with 500 additions and 0 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java
@@ -0,0 +1,73 @@
+package org.raddatz.familienarchiv.service;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.model.ScriptType;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.core.ParameterizedTypeReference;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Component;
+import org.springframework.web.client.RestClient;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * {@link OcrClient} and {@link OcrHealthClient} implementation that talks to the
+ * OCR HTTP service through a Spring {@link RestClient}.
+ *
+ * <p>NOTE(review): the client is built without explicit connect/read timeouts, so
+ * both the OCR call and the health probe wait as long as the underlying request
+ * factory allows — confirm this is acceptable for long-running OCR jobs.
+ */
+@Component
+@Slf4j
+public class RestClientOcrClient implements OcrClient, OcrHealthClient {
+
+    private final RestClient restClient;
+
+    /**
+     * Builds the client against the configured OCR service.
+     *
+     * @param baseUrl value of {@code app.ocr.base-url}; falls back to
+     *                {@code http://ocr-service:8000} when the property is not set
+     */
+    public RestClientOcrClient(@Value("${app.ocr.base-url:http://ocr-service:8000}") String baseUrl) {
+        this.restClient = RestClient.builder().baseUrl(baseUrl).build();
+    }
+
+    /**
+     * Posts an OCR request to {@code /ocr} and maps the returned JSON blocks to
+     * {@link OcrBlockResult} values.
+     *
+     * @param pdfUrl     URL the OCR service should fetch the PDF from
+     * @param scriptType script type; its enum name is sent as {@code scriptType}
+     * @return the mapped blocks, or an empty list when the response has no body
+     */
+    @Override
+    public List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType) {
+        // Map.of rejects null values, so a null pdfUrl fails here before any request is sent.
+        // The language is always sent as "de".
+        Map<String, String> body = Map.of(
+                "pdfUrl", pdfUrl,
+                "scriptType", scriptType.name(),
+                "language", "de");
+
+        List<OcrBlockJson> response = restClient.post()
+                .uri("/ocr")
+                .contentType(MediaType.APPLICATION_JSON)
+                .body(body)
+                .retrieve()
+                .body(new ParameterizedTypeReference<>() {});
+
+        if (response == null) return List.of();
+
+        return response.stream()
+                .map(OcrBlockJson::toResult)
+                .toList();
+    }
+
+    /**
+     * Probes {@code GET /health} on the OCR service.
+     *
+     * @return {@code true} when the call completes without throwing; {@code false}
+     *         (after logging a warning) when any exception is raised
+     */
+    @Override
+    public boolean isHealthy() {
+        try {
+            restClient.get()
+                    .uri("/health")
+                    .retrieve()
+                    .toBodilessEntity();
+            return true;
+        } catch (Exception e) {
+            // Deliberately broad: any failure of the probe is reported as "unhealthy" instead of propagating.
+            log.warn("OCR service health check failed: {}", e.getMessage());
+            return false;
+        }
+    }
+
+    /**
+     * Wire format of one OCR block as returned by the {@code /ocr} endpoint.
+     * {@code polygon} may be {@code null}; the Python service sets it to
+     * {@code None} for Surya results.
+     */
+    record OcrBlockJson(
+            @JsonProperty("pageNumber") int pageNumber,
+            double x,
+            double y,
+            double width,
+            double height,
+            List<List<Double>> polygon,
+            String text
+    ) {
+        /** Copies all fields, unchanged, into the service-level {@link OcrBlockResult}. */
+        OcrBlockResult toResult() {
+            return new OcrBlockResult(pageNumber, x, y, width, height, polygon, text);
+        }
+    }
+}
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -71,6 +71,28 @@ services:
    networks:
      - archive-net

+  # --- OCR: Python microservice (Surya + Kraken) ---
+  ocr-service:
+    build:
+      context: ./ocr-service
+      dockerfile: Dockerfile
+    container_name: archive-ocr
+    restart: unless-stopped
+    mem_limit: 6g
+    memswap_limit: 6g
+    volumes:
+      - ocr_models:/app/models
+    environment:
+      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
+    networks:
+      - archive-net
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 12
+      start_period: 60s
+
  # --- Backend: Spring Boot ---
  backend:
    build:
@@ -89,6 +111,8 @@ services:
        condition: service_healthy
      mailpit:
        condition: service_started
+      ocr-service:
+        condition: service_healthy
    environment:
      SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
      SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
@@ -109,6 +133,8 @@ services:
      # Mailpit needs no auth or STARTTLS; production SMTP overrides these via .env
      SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
      SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
+      APP_OCR_BASE_URL: http://ocr-service:8000
+      APP_S3_INTERNAL_URL: http://minio:9000
    ports:
      - "${PORT_BACKEND}:8080"
    networks:
@@ -155,3 +181,4 @@ networks:
 volumes:
  frontend_node_modules:
  maven_cache:
+  ocr_models:
--- a/ocr-service/Dockerfile
+++ b/ocr-service/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# curl for healthcheck; libgomp1 for PyTorch CPU threading
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved)
+RUN pip install --no-cache-dir \
+    torch==2.5.1 \
+    --index-url https://download.pytorch.org/whl/cpu
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 8000
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/ocr-service/engines/__init__.py
+++ b/ocr-service/engines/__init__.py
--- a/ocr-service/engines/kraken.py
+++ b/ocr-service/engines/kraken.py
@@ -0,0 +1,192 @@
+"""Kraken OCR engine wrapper — historical HTR model support for Kurrent/Suetterlin."""
+
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+_model = None
+_model_path = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
+
+
+def load_models():
+    """Load the Kraken recognition model into the module-level ``_model``.
+
+    The path comes from ``_model_path`` (the ``KRAKEN_MODEL_PATH`` environment
+    variable, read at import time). When no file exists at that path a warning
+    is logged and ``_model`` stays ``None``, so ``is_available()`` keeps
+    returning ``False``.
+    """
+    global _model
+
+    if not os.path.exists(_model_path):
+        logger.warning("Kraken model not found at %s — Kurrent OCR will not be available", _model_path)
+        return
+
+    logger.info("Loading Kraken model from %s...", _model_path)
+
+    # Imported here rather than at module top, so kraken is only imported once a model file exists.
+    from kraken.lib import models as kraken_models
+    _model = kraken_models.load_any(_model_path)
+
+    logger.info("Kraken model loaded successfully")
+
+
+def is_available() -> bool:
+    """Return True once ``load_models()`` has stored a model in ``_model``."""
+    return _model is not None
+
+
+def extract_blocks(images: list, language: str = "de") -> list[dict]:
+    """Run Kraken segmentation + recognition on a list of PIL images.
+
+    Returns block dicts with pageNumber, x, y, width, height, polygon, text.
+    Polygon is a 4-point quadrilateral approximation of the baseline polygon.
+    Coordinates are normalized to [0, 1].
+    """
+    from kraken import blla, rpred
+
+    if _model is None:
+        raise RuntimeError("Kraken model is not loaded")
+
+    all_blocks = []
+
+    for page_idx, image in enumerate(images):
+        page_w, page_h = image.size
+
+        baseline_seg = blla.segment(image)
+
+        pred_it = rpred.rpred(_model, image, baseline_seg)
+
+        for record in pred_it:
+            # record.prediction is the recognized text
+            # record.cuts contains polygon points
+            # record.line is the baseline polygon
+
+            polygon_pts = record.cuts if hasattr(record, "cuts") else []
+
+            # Compute AABB from the polygon
+            if polygon_pts:
+                xs = [p[0] for p in polygon_pts]
+                ys = [p[1] for p in polygon_pts]
+                x1, y1 = min(xs), min(ys)
+                x2, y2 = max(xs), max(ys)
+            else:
+                # Fallback to line baseline
+                xs = [p[0] for p in record.line]
+                ys = [p[1] for p in record.line]
+                x1, y1 = min(xs), min(ys) - 5
+                x2, y2 = max(xs), max(ys) + 5
+
+            # Approximate polygon to quadrilateral
+            quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
+
+            all_blocks.append({
+                "pageNumber": page_idx,
+                "x": x1 / page_w,
+                "y": y1 / page_h,
+                "width": (x2 - x1) / page_w,
+                "height": (y2 - y1) / page_h,
+                "polygon": quad,
+                "text": record.prediction,
+            })
+
+    return all_blocks
+
+
+def _approximate_to_quad(points: list[tuple], page_w: float, page_h: float) -> list[list[float]] | None:
+    """Approximate a polygon by the 4 corners of a small-area bounding rectangle.
+
+    Builds the convex hull with ``_convex_hull`` and passes it to
+    ``_min_bounding_rect``. Pure Python, no scipy/numpy.
+
+    Args:
+        points: Polygon vertices as (x, y) pairs in pixel coordinates.
+        page_w: Page width in pixels, used to normalize x.
+        page_h: Page height in pixels, used to normalize y.
+
+    Returns:
+        Four ``[x, y]`` corners normalized by the page size, or None when
+        fewer than 3 points are given, the hull has fewer than 3 vertices, or
+        any exception is raised while computing the rectangle.
+    """
+    if len(points) < 3:
+        return None
+
+    try:
+        hull = _convex_hull(points)
+        if len(hull) < 3:
+            return None
+
+        rect = _min_bounding_rect(hull)
+
+        # Normalize to [0, 1]
+        return [[p[0] / page_w, p[1] / page_h] for p in rect]
+    except Exception:
+        # Best effort: the caller treats None as "no polygon" and still emits the block.
+        logger.debug("Failed to approximate polygon to quad, returning None")
+        return None
+
+
+def _convex_hull(points: list[tuple]) -> list[tuple]:
+    """Jarvis march (gift wrapping) algorithm for 2D convex hull."""
+    pts = list(set(points))
+    if len(pts) < 3:
+        return pts
+
+    # Start from leftmost point
+    start = min(pts, key=lambda p: (p[0], p[1]))
+    hull = []
+    current = start
+
+    while True:
+        hull.append(current)
+        candidate = pts[0]
+        for p in pts[1:]:
+            if candidate == current:
+                candidate = p
+                continue
+            cross = _cross(current, candidate, p)
+            if cross < 0:
+                candidate = p
+            elif cross == 0:
+                # Collinear — pick the farther point
+                if _dist_sq(current, p) > _dist_sq(current, candidate):
+                    candidate = p
+
+        current = candidate
+        if current == start:
+            break
+
+    return hull
+
+
+def _min_bounding_rect(hull: list[tuple]) -> list[tuple]:
+    """Find the smallest-area bounding rectangle that is aligned with a hull edge.
+
+    Every hull edge is tried as the rectangle's orientation: all hull points
+    are projected onto the edge direction and its perpendicular, and the
+    orientation with the smallest extent product wins. Each edge re-projects
+    every point, so the work grows with the square of the hull size.
+
+    Args:
+        hull: Convex hull vertices as (x, y) pairs, in order.
+
+    Returns:
+        The 4 rectangle corners as (x, y) tuples in the original coordinate
+        system. If the hull has fewer than 2 points it is returned unchanged;
+        if every edge has zero length the first 4 hull points are returned.
+    """
+    n = len(hull)
+    if n < 2:
+        return hull
+
+    min_area = float("inf")
+    best_rect = None
+
+    for i in range(n):
+        # Edge vector
+        edge_x = hull[(i + 1) % n][0] - hull[i][0]
+        edge_y = hull[(i + 1) % n][1] - hull[i][1]
+        edge_len = (edge_x ** 2 + edge_y ** 2) ** 0.5
+        if edge_len == 0:
+            # Degenerate edge: it defines no direction
+            continue
+
+        # Unit vectors along and perpendicular to the edge
+        ux, uy = edge_x / edge_len, edge_y / edge_len
+        vx, vy = -uy, ux
+
+        # Project all hull points onto the edge coordinate system
+        projs_u = [p[0] * ux + p[1] * uy for p in hull]
+        projs_v = [p[0] * vx + p[1] * vy for p in hull]
+
+        min_u, max_u = min(projs_u), max(projs_u)
+        min_v, max_v = min(projs_v), max(projs_v)
+
+        area = (max_u - min_u) * (max_v - min_v)
+        if area < min_area:
+            min_area = area
+            # Reconstruct 4 corners in original coordinates
+            best_rect = [
+                (min_u * ux + min_v * vx, min_u * uy + min_v * vy),
+                (max_u * ux + min_v * vx, max_u * uy + min_v * vy),
+                (max_u * ux + max_v * vx, max_u * uy + max_v * vy),
+                (min_u * ux + max_v * vx, min_u * uy + max_v * vy),
+            ]
+
+    return best_rect if best_rect else hull[:4]
+
+
+def _cross(o: tuple, a: tuple, b: tuple) -> float:
+    """Return the z-component of the cross product (a - o) x (b - o); 0 means o, a, b are collinear."""
+    return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
+
+
+def _dist_sq(a: tuple, b: tuple) -> float:
+    """Return the squared Euclidean distance between 2D points a and b."""
+    return (a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2
--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -0,0 +1,66 @@
+"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Populated once at startup by load_models(); None until then
+_recognition_model = None
+_recognition_processor = None
+_detection_model = None
+_detection_processor = None
+
+
+def load_models():
+    """Eagerly load Surya models into memory. Called once at container startup.
+
+    Fills the four module-level globals (detection model/processor and
+    recognition model/processor) that ``extract_blocks`` reads.
+    """
+    global _recognition_model, _recognition_processor, _detection_model, _detection_processor
+
+    logger.info("Loading Surya models...")
+
+    # Imported inside the function, so importing this module does not import surya.
+    from surya.model.detection.model import load_model as load_det_model
+    from surya.model.detection.model import load_processor as load_det_processor
+    from surya.model.recognition.model import load_model as load_rec_model
+    from surya.model.recognition.processor import load_processor as load_rec_processor
+
+    _detection_model = load_det_model()
+    _detection_processor = load_det_processor()
+    _recognition_model = load_rec_model()
+    _recognition_processor = load_rec_processor()
+
+    logger.info("Surya models loaded successfully")
+
+
+def extract_blocks(images: list, language: str = "de") -> list[dict]:
+    """Run Surya OCR on a list of PIL images (one per page).
+
+    Returns a flat list of block dicts with pageNumber, x, y, width, height, text.
+    Coordinates are normalized to [0, 1] relative to page dimensions.
+    """
+    from surya.detection import batch_text_detection
+    from surya.recognition import batch_recognition
+
+    all_blocks = []
+
+    for page_idx, image in enumerate(images):
+        page_w, page_h = image.size
+
+        det_predictions = batch_text_detection([image], _detection_model, _detection_processor)
+        rec_predictions = batch_recognition(
+            [image], det_predictions, _recognition_model, _recognition_processor, [language]
+        )
+
+        for line in rec_predictions[0].text_lines:
+            bbox = line.bbox  # [x1, y1, x2, y2] in pixel coordinates
+            x1, y1, x2, y2 = bbox
+
+            all_blocks.append({
+                "pageNumber": page_idx,
+                "x": x1 / page_w,
+                "y": y1 / page_h,
+                "width": (x2 - x1) / page_w,
+                "height": (y2 - y1) / page_h,
+                "polygon": None,
+                "text": line.text,
+            })
+
+    return all_blocks
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -0,0 +1,93 @@
+"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
+
+import asyncio
+import io
+import logging
+from contextlib import asynccontextmanager
+
+import httpx
+import pypdfium2 as pdfium
+from fastapi import FastAPI, HTTPException
+from PIL import Image
+
+from engines import kraken as kraken_engine
+from engines import surya as surya_engine
+from models import OcrBlock, OcrRequest
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+_models_ready = False
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load all OCR models at startup before accepting requests.
+
+    Loads the Surya models, then the Kraken model, and only then sets
+    ``_models_ready``; ``/health`` and ``/ocr`` answer 503 until that flag is
+    set. Nothing is released on shutdown, only a log line is written.
+    """
+    global _models_ready
+
+    logger.info("Loading OCR models at startup...")
+    surya_engine.load_models()
+    # Kraken skips loading when its model file is missing; that does not stop startup.
+    kraken_engine.load_models()
+    _models_ready = True
+    logger.info("All OCR models loaded — ready to accept requests")
+
+    yield
+
+    logger.info("Shutting down OCR service")
+
+
+app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
+
+
+@app.get("/health")
+def health():
+    """Health endpoint — returns 200 only after models are loaded.
+
+    Raises a 503 HTTPException while ``_models_ready`` is still False.
+    The ``surya`` field is a constant True; only ``kraken`` reflects whether
+    a model was actually loaded.
+    """
+    if not _models_ready:
+        raise HTTPException(status_code=503, detail="Models not loaded yet")
+    return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()}
+
+
+@app.post("/ocr", response_model=list[OcrBlock])
+async def run_ocr(request: OcrRequest):
+    """Run OCR on a PDF document.
+
+    Downloads the PDF from the provided URL, converts pages to images,
+    and runs the appropriate OCR engine based on scriptType.
+    """
+    if not _models_ready:
+        raise HTTPException(status_code=503, detail="Models not loaded yet")
+
+    images = await _download_and_convert_pdf(request.pdf_url)
+
+    script_type = request.script_type.upper()
+
+    if script_type == "HANDWRITING_KURRENT":
+        if not kraken_engine.is_available():
+            raise HTTPException(
+                status_code=400,
+                detail="Kraken model not available — cannot process Kurrent script",
+            )
+        blocks = kraken_engine.extract_blocks(images, request.language)
+    else:
+        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
+        blocks = surya_engine.extract_blocks(images, request.language)
+
+    return [OcrBlock(**b) for b in blocks]
+
+
+async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
+    """Download a PDF from URL and convert each page to a PIL Image."""
+    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
+        response = await client.get(url)
+        response.raise_for_status()
+
+    pdf = pdfium.PdfDocument(io.BytesIO(response.content))
+    images = []
+
+    for page_idx in range(len(pdf)):
+        page = pdf[page_idx]
+        # Render at 300 DPI for good OCR quality
+        bitmap = page.render(scale=300 / 72)
+        pil_image = bitmap.to_pil()
+        images.append(pil_image)
+
+    return images
--- a/ocr-service/models.py
+++ b/ocr-service/models.py
@@ -0,0 +1,20 @@
+from pydantic import BaseModel, Field
+
+
+class OcrRequest(BaseModel):
+    """Request body of POST /ocr; JSON keys use the camelCase aliases."""
+
+    # URL the service downloads the PDF from (JSON key: pdfUrl)
+    pdf_url: str = Field(..., alias="pdfUrl")
+    # Selects the engine; the endpoint routes HANDWRITING_KURRENT to Kraken and
+    # every other value to Surya (JSON key: scriptType)
+    script_type: str = Field("UNKNOWN", alias="scriptType")
+    # Language code handed to the engine
+    language: str = "de"
+
+
+class OcrBlock(BaseModel):
+    """One recognized text line as returned by POST /ocr."""
+
+    # Page index as produced by the engines, which count from 0 (JSON key: pageNumber)
+    page_number: int = Field(..., alias="pageNumber")
+    # Bounding box; the engines in this service emit values normalized by page size
+    x: float
+    y: float
+    width: float
+    height: float
+    # Quadrilateral as [x, y] pairs; the Surya engine always sends None
+    polygon: list[list[float]] | None = None
+    text: str
+
+    # NOTE(review): pydantic v2 prefers ``model_config = ConfigDict(...)`` over
+    # a nested Config class — confirm the installed pydantic version.
+    class Config:
+        # Accept the field name (page_number) as well as the alias (pageNumber) on input
+        populate_by_name = True
--- a/ocr-service/requirements.txt
+++ b/ocr-service/requirements.txt
@@ -0,0 +1,6 @@
+fastapi[standard]==0.115.6
+surya-ocr==0.6.3
+kraken==5.2.9
+pillow==11.1.0
+pypdfium2==4.30.0
+httpx==0.28.1