diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java
new file mode 100644
index 00000000..0bbb533c
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java
@@ -0,0 +1,82 @@
+package org.raddatz.familienarchiv.service;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.model.ScriptType;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.core.ParameterizedTypeReference;
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Component;
+import org.springframework.web.client.RestClient;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Synchronous HTTP client for the Python OCR microservice.
+ * Talks to the FastAPI endpoints {@code POST /ocr} and {@code GET /health}.
+ */
+@Component
+@Slf4j
+public class RestClientOcrClient implements OcrClient, OcrHealthClient {
+
+    private final RestClient restClient;
+
+    public RestClientOcrClient(@Value("${app.ocr.base-url:http://ocr-service:8000}") String baseUrl) {
+        this.restClient = RestClient.builder().baseUrl(baseUrl).build();
+    }
+
+    /**
+     * Runs OCR on the PDF at {@code pdfUrl} and returns the recognized text blocks.
+     * Blocking call; the OCR service may take a long time for large documents.
+     */
+    @Override
+    public List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType) {
+        Map<String, String> body = Map.of(
+                "pdfUrl", pdfUrl,
+                "scriptType", scriptType.name(),
+                "language", "de");
+
+        List<OcrBlockJson> response = restClient.post()
+                .uri("/ocr")
+                .contentType(MediaType.APPLICATION_JSON)
+                .body(body)
+                .retrieve()
+                .body(new ParameterizedTypeReference<>() {});
+
+        if (response == null) return List.of();
+
+        return response.stream()
+                .map(OcrBlockJson::toResult)
+                .toList();
+    }
+
+    @Override
+    public boolean isHealthy() {
+        try {
+            restClient.get()
+                    .uri("/health")
+                    .retrieve()
+                    .toBodilessEntity();
+            return true;
+        } catch (Exception e) {
+            log.warn("OCR service health check failed: {}", e.getMessage());
+            return false;
+        }
+    }
+
+    /** JSON shape returned by the OCR service; mirrors the Python {@code OcrBlock} model. */
+    record OcrBlockJson(
+            @JsonProperty("pageNumber") int pageNumber,
+            double x,
+            double y,
+            double width,
+            double height,
+            List<List<Double>> polygon,
+            String text
+    ) {
+        OcrBlockResult toResult() {
+            return new OcrBlockResult(pageNumber, x, y, width, height, polygon, text);
+        }
+    }
+}
diff --git a/docker-compose.yml b/docker-compose.yml
index 7ceabc66..0dd05942 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -71,6 +71,28 @@ services:
     networks:
       - archive-net
 
+  # --- OCR: Python microservice (Surya + Kraken) ---
+  ocr-service:
+    build:
+      context: ./ocr-service
+      dockerfile: Dockerfile
+    container_name: archive-ocr
+    restart: unless-stopped
+    mem_limit: 6g
+    memswap_limit: 6g
+    volumes:
+      - ocr_models:/app/models
+    environment:
+      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
+    networks:
+      - archive-net
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 12
+      start_period: 60s
+
   # --- Backend: Spring Boot ---
   backend:
     build:
@@ -89,6 +111,8 @@
         condition: service_healthy
       mailpit:
         condition: service_started
+      ocr-service:
+        condition: service_healthy
     environment:
       SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
       SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
@@ -109,6 +133,8 @@
       # Mailpit needs no auth or STARTTLS; production SMTP overrides these via .env
       SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
       SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
+      APP_OCR_BASE_URL: http://ocr-service:8000
+      APP_S3_INTERNAL_URL: http://minio:9000
     ports:
       - "${PORT_BACKEND}:8080"
     networks:
@@ -155,3 +181,4 @@ networks:
 volumes:
   frontend_node_modules:
   maven_cache:
+  ocr_models:
diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile
new file mode 100644
index 00000000..24f74be0
--- /dev/null
+++ b/ocr-service/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# curl for healthcheck; libgomp1 for PyTorch CPU threading
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved)
+RUN pip install --no-cache-dir \
+    torch==2.5.1 \
+    --index-url https://download.pytorch.org/whl/cpu
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 8000
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/ocr-service/engines/__init__.py b/ocr-service/engines/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ocr-service/engines/kraken.py b/ocr-service/engines/kraken.py
new file mode 100644
index 00000000..16cb3d0b
--- /dev/null
+++ b/ocr-service/engines/kraken.py
@@ -0,0 +1,194 @@
+"""Kraken OCR engine wrapper — historical HTR model support for Kurrent/Suetterlin."""
+
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+_model = None
+_model_path = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
+
+
+def load_models():
+    """Load the Kraken model at startup. Skips if model file is not present."""
+    global _model
+
+    if not os.path.exists(_model_path):
+        logger.warning("Kraken model not found at %s — Kurrent OCR will not be available", _model_path)
+        return
+
+    logger.info("Loading Kraken model from %s...", _model_path)
+
+    from kraken.lib import models as kraken_models
+    _model = kraken_models.load_any(_model_path)
+
+    logger.info("Kraken model loaded successfully")
+
+
+def is_available() -> bool:
+    return _model is not None
+
+
+def extract_blocks(images: list, language: str = "de") -> list[dict]:
+    """Run Kraken segmentation + recognition on a list of PIL images.
+
+    Returns block dicts with pageNumber, x, y, width, height, polygon, text.
+    Polygon is a 4-point quadrilateral approximation of the baseline polygon.
+    Coordinates are normalized to [0, 1].
+    """
+    from kraken import blla, rpred
+
+    if _model is None:
+        raise RuntimeError("Kraken model is not loaded")
+
+    all_blocks = []
+
+    for page_idx, image in enumerate(images):
+        page_w, page_h = image.size
+
+        baseline_seg = blla.segment(image)
+
+        pred_it = rpred.rpred(_model, image, baseline_seg)
+
+        for record in pred_it:
+            # record.prediction is the recognized text
+            # record.cuts contains polygon points
+            # record.line is the baseline polygon
+
+            polygon_pts = record.cuts if hasattr(record, "cuts") else []
+
+            # Compute AABB from the polygon
+            if polygon_pts:
+                xs = [p[0] for p in polygon_pts]
+                ys = [p[1] for p in polygon_pts]
+                x1, y1 = min(xs), min(ys)
+                x2, y2 = max(xs), max(ys)
+            else:
+                # Fallback to line baseline
+                xs = [p[0] for p in record.line]
+                ys = [p[1] for p in record.line]
+                x1, y1 = min(xs), min(ys) - 5
+                x2, y2 = max(xs), max(ys) + 5
+
+            # Approximate polygon to quadrilateral
+            quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
+
+            all_blocks.append({
+                "pageNumber": page_idx,
+                "x": x1 / page_w,
+                "y": y1 / page_h,
+                "width": (x2 - x1) / page_w,
+                "height": (y2 - y1) / page_h,
+                "polygon": quad,
+                "text": record.prediction,
+            })
+
+    return all_blocks
+
+
+def _approximate_to_quad(points: list[tuple], page_w: float, page_h: float) -> list[list[float]] | None:
+    """Approximate a polygon to a 4-point quadrilateral using the minimum bounding rectangle.
+
+    Uses gift-wrapping (Jarvis march) for convex hull, then rotating calipers
+    for the minimum area bounding rectangle. Pure Python, no scipy/numpy.
+    """
+    if len(points) < 3:
+        return None
+
+    try:
+        hull = _convex_hull(points)
+        if len(hull) < 3:
+            return None
+
+        rect = _min_bounding_rect(hull)
+
+        # Normalize to [0, 1]
+        return [[p[0] / page_w, p[1] / page_h] for p in rect]
+    except Exception:
+        logger.debug("Failed to approximate polygon to quad, returning None", exc_info=True)
+        return None
+
+
+def _convex_hull(points: list[tuple]) -> list[tuple]:
+    """Jarvis march (gift wrapping) algorithm for 2D convex hull."""
+    # Coerce to hashable (x, y) tuples first — segmentation polygons may arrive as
+    # [x, y] lists, and set() would raise TypeError on unhashable elements.
+    pts = list({(p[0], p[1]) for p in points})
+    if len(pts) < 3:
+        return pts
+
+    # Start from leftmost point
+    start = min(pts, key=lambda p: (p[0], p[1]))
+    hull = []
+    current = start
+
+    while True:
+        hull.append(current)
+        candidate = pts[0]
+        for p in pts[1:]:
+            if candidate == current:
+                candidate = p
+                continue
+            cross = _cross(current, candidate, p)
+            if cross < 0:
+                candidate = p
+            elif cross == 0:
+                # Collinear — pick the farther point
+                if _dist_sq(current, p) > _dist_sq(current, candidate):
+                    candidate = p
+
+        current = candidate
+        if current == start:
+            break
+
+    return hull
+
+
+def _min_bounding_rect(hull: list[tuple]) -> list[tuple]:
+    """Find the minimum area bounding rectangle of a convex hull using rotating calipers."""
+    n = len(hull)
+    if n < 2:
+        return hull
+
+    min_area = float("inf")
+    best_rect = None
+
+    for i in range(n):
+        # Edge vector
+        edge_x = hull[(i + 1) % n][0] - hull[i][0]
+        edge_y = hull[(i + 1) % n][1] - hull[i][1]
+        edge_len = (edge_x ** 2 + edge_y ** 2) ** 0.5
+        if edge_len == 0:
+            continue
+
+        # Unit vectors along and perpendicular to the edge
+        ux, uy = edge_x / edge_len, edge_y / edge_len
+        vx, vy = -uy, ux
+
+        # Project all hull points onto the edge coordinate system
+        projs_u = [p[0] * ux + p[1] * uy for p in hull]
+        projs_v = [p[0] * vx + p[1] * vy for p in hull]
+
+        min_u, max_u = min(projs_u), max(projs_u)
+        min_v, max_v = min(projs_v), max(projs_v)
+
+        area = (max_u - min_u) * (max_v - min_v)
+        if area < min_area:
+            min_area = area
+            # Reconstruct 4 corners in original coordinates
+            best_rect = [
+                (min_u * ux + min_v * vx, min_u * uy + min_v * vy),
+                (max_u * ux + min_v * vx, max_u * uy + min_v * vy),
+                (max_u * ux + max_v * vx, max_u * uy + max_v * vy),
+                (min_u * ux + max_v * vx, min_u * uy + max_v * vy),
+            ]
+
+    return best_rect if best_rect else hull[:4]
+
+
+def _cross(o: tuple, a: tuple, b: tuple) -> float:
+    return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
+
+
+def _dist_sq(a: tuple, b: tuple) -> float:
+    return (a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2
diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py
new file mode 100644
index 00000000..c6cc7768
--- /dev/null
+++ b/ocr-service/engines/surya.py
@@ -0,0 +1,66 @@
+"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Lazy-loaded at startup via load_models()
+_recognition_model = None
+_recognition_processor = None
+_detection_model = None
+_detection_processor = None
+
+
+def load_models():
+    """Eagerly load Surya models into memory. Called once at container startup."""
+    global _recognition_model, _recognition_processor, _detection_model, _detection_processor
+
+    logger.info("Loading Surya models...")
+
+    from surya.model.detection.model import load_model as load_det_model
+    from surya.model.detection.model import load_processor as load_det_processor
+    from surya.model.recognition.model import load_model as load_rec_model
+    from surya.model.recognition.processor import load_processor as load_rec_processor
+
+    _detection_model = load_det_model()
+    _detection_processor = load_det_processor()
+    _recognition_model = load_rec_model()
+    _recognition_processor = load_rec_processor()
+
+    logger.info("Surya models loaded successfully")
+
+
+def extract_blocks(images: list, language: str = "de") -> list[dict]:
+    """Run Surya OCR on a list of PIL images (one per page).
+
+    Returns a flat list of block dicts with pageNumber, x, y, width, height, text.
+    Coordinates are normalized to [0, 1] relative to page dimensions.
+    """
+    from surya.detection import batch_text_detection
+    from surya.recognition import batch_recognition
+
+    all_blocks = []
+
+    for page_idx, image in enumerate(images):
+        page_w, page_h = image.size
+
+        det_predictions = batch_text_detection([image], _detection_model, _detection_processor)
+        rec_predictions = batch_recognition(
+            [image], det_predictions, _recognition_model, _recognition_processor, [language]
+        )
+
+        for line in rec_predictions[0].text_lines:
+            bbox = line.bbox  # [x1, y1, x2, y2] in pixel coordinates
+            x1, y1, x2, y2 = bbox
+
+            all_blocks.append({
+                "pageNumber": page_idx,
+                "x": x1 / page_w,
+                "y": y1 / page_h,
+                "width": (x2 - x1) / page_w,
+                "height": (y2 - y1) / page_h,
+                "polygon": None,
+                "text": line.text,
+            })
+
+    return all_blocks
diff --git a/ocr-service/main.py b/ocr-service/main.py
new file mode 100644
index 00000000..d4e3f957
--- /dev/null
+++ b/ocr-service/main.py
@@ -0,0 +1,94 @@
+"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
+
+import io
+import logging
+from contextlib import asynccontextmanager
+
+import httpx
+import pypdfium2 as pdfium
+from fastapi import FastAPI, HTTPException
+from PIL import Image
+
+from engines import kraken as kraken_engine
+from engines import surya as surya_engine
+from models import OcrBlock, OcrRequest
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+_models_ready = False
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load all OCR models at startup before accepting requests."""
+    global _models_ready
+
+    logger.info("Loading OCR models at startup...")
+    surya_engine.load_models()
+    kraken_engine.load_models()
+    _models_ready = True
+    logger.info("All OCR models loaded — ready to accept requests")
+
+    yield
+
+    logger.info("Shutting down OCR service")
+
+
+app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
+
+
+@app.get("/health")
+def health():
+    """Health endpoint — returns 200 only after models are loaded."""
+    if not _models_ready:
+        raise HTTPException(status_code=503, detail="Models not loaded yet")
+    return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()}
+
+
+@app.post("/ocr", response_model=list[OcrBlock])
+async def run_ocr(request: OcrRequest):
+    """Run OCR on a PDF document.
+
+    Downloads the PDF from the provided URL, converts pages to images,
+    and runs the appropriate OCR engine based on scriptType.
+    """
+    if not _models_ready:
+        raise HTTPException(status_code=503, detail="Models not loaded yet")
+
+    images = await _download_and_convert_pdf(request.pdf_url)
+
+    script_type = request.script_type.upper()
+
+    if script_type == "HANDWRITING_KURRENT":
+        if not kraken_engine.is_available():
+            raise HTTPException(
+                status_code=400,
+                detail="Kraken model not available — cannot process Kurrent script",
+            )
+        blocks = kraken_engine.extract_blocks(images, request.language)
+    else:
+        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
+        blocks = surya_engine.extract_blocks(images, request.language)
+
+    return [OcrBlock(**b) for b in blocks]
+
+
+async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
+    """Download a PDF from URL and convert each page to a PIL Image."""
+    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
+        response = await client.get(url)
+        response.raise_for_status()
+
+    pdf = pdfium.PdfDocument(io.BytesIO(response.content))
+    try:
+        images = []
+        for page_idx in range(len(pdf)):
+            page = pdf[page_idx]
+            # Render at 300 DPI for good OCR quality (pdfium's native unit is 72 DPI)
+            bitmap = page.render(scale=300 / 72)
+            images.append(bitmap.to_pil())
+        return images
+    finally:
+        # PdfDocument holds native pdfium memory — release it explicitly instead of
+        # relying on GC finalizers, which can accumulate under concurrent requests.
+        pdf.close()
diff --git a/ocr-service/models.py b/ocr-service/models.py
new file mode 100644
index 00000000..0d2c1590
--- /dev/null
+++ b/ocr-service/models.py
@@ -0,0 +1,20 @@
+from
pydantic import BaseModel, Field
+
+
+class OcrRequest(BaseModel):
+    """Request body for POST /ocr — aliases match the Java client's camelCase JSON."""
+    pdf_url: str = Field(..., alias="pdfUrl")
+    script_type: str = Field("UNKNOWN", alias="scriptType")
+    language: str = "de"
+
+
+class OcrBlock(BaseModel):
+    """One recognized text block; coordinates are normalized to [0, 1] of the page."""
+    page_number: int = Field(..., alias="pageNumber")
+    x: float
+    y: float
+    width: float
+    height: float
+    polygon: list[list[float]] | None = None
+    text: str
+    model_config = {"populate_by_name": True}  # Pydantic v2 style; replaces deprecated `class Config`
diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt
new file mode 100644
index 00000000..49bd00e9
--- /dev/null
+++ b/ocr-service/requirements.txt
@@ -0,0 +1,6 @@
+fastapi[standard]==0.115.6
+surya-ocr==0.6.3
+kraken==5.2.9
+pillow==11.1.0
+pypdfium2==4.30.0
+httpx==0.28.1