feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose

Python microservice (ocr-service/):
- FastAPI app with /ocr and /health endpoints
- Surya engine: transformer-based OCR for typewritten/modern handwriting
- Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python polygon-to-quad approximation (gift wrapping + rotating calipers)
- Eager model loading at startup via lifespan context manager
- PDF download via httpx, page rendering via pypdfium2 at 300 DPI

Java RestClientOcrClient:
- Implements OcrClient + OcrHealthClient interfaces
- Calls Python service via Spring RestClient
- Health check with graceful fallback

Docker Compose:
- New ocr-service container (mem_limit 6g, no host ports)
- Health check with start_period 60s for model loading
- ocr_models volume for Kraken model files
- Backend depends on ocr-service health

Refs #226, #227

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 15:26:40 +02:00
parent aea46c5fd0
commit 6737bd6db5
9 changed files with 500 additions and 0 deletions
--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -0,0 +1,66 @@
+"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Module-level model handles. They stay None until load_models() assigns them
+# (eagerly, at container startup) and are then read by extract_blocks().
+_recognition_model = None
+_recognition_processor = None
+_detection_model = None
+_detection_processor = None
+
+
+def load_models():
+    """Load the Surya detection and recognition models plus their processors.
+
+    The loaded objects are stored in this module's globals. Called once at
+    container startup. The surya imports are function-local, so importing this
+    module alone does not import surya.
+    """
+    global _detection_model, _detection_processor
+    global _recognition_model, _recognition_processor
+
+    logger.info("Loading Surya models...")
+
+    from surya.model.detection.model import (
+        load_model as build_detector,
+        load_processor as build_detector_processor,
+    )
+    from surya.model.recognition.model import load_model as build_recognizer
+    from surya.model.recognition.processor import load_processor as build_recognizer_processor
+
+    # Detection is loaded before recognition; each model is followed by its processor.
+    _detection_model = build_detector()
+    _detection_processor = build_detector_processor()
+    _recognition_model = build_recognizer()
+    _recognition_processor = build_recognizer_processor()
+
+    logger.info("Surya models loaded successfully")
+
+
+def extract_blocks(images: list, language: str = "de") -> list[dict]:
+    """Run Surya OCR on a list of PIL images (one per page).
+
+    Pages are processed one at a time, in order: text detection first, then
+    recognition on the detected lines.
+
+    Args:
+        images: Page images; each must expose ``.size`` as ``(width, height)``.
+        language: Language code handed to Surya recognition for every page.
+
+    Returns:
+        A flat list of block dicts, one per recognized text line, with the keys
+        ``pageNumber`` (zero-based index into ``images``), ``x``, ``y``,
+        ``width``, ``height``, ``polygon`` (always ``None`` for this engine)
+        and ``text``. Box values are the line's pixel bbox divided by the page
+        width/height, i.e. normalized to [0, 1] relative to page dimensions.
+
+    Raises:
+        RuntimeError: If load_models() has not populated the Surya models yet.
+    """
+    required = (
+        _detection_model,
+        _detection_processor,
+        _recognition_model,
+        _recognition_processor,
+    )
+    if any(component is None for component in required):
+        # Fail fast with a clear message rather than an opaque error from inside Surya.
+        raise RuntimeError("Surya models are not loaded; call load_models() first")
+
+    from surya.detection import batch_text_detection
+    from surya.recognition import batch_recognition
+
+    all_blocks = []
+
+    for page_idx, image in enumerate(images):
+        page_w, page_h = image.size
+
+        det_predictions = batch_text_detection([image], _detection_model, _detection_processor)
+        rec_predictions = batch_recognition(
+            [image], det_predictions, _recognition_model, _recognition_processor, [language]
+        )
+
+        # A single image was submitted, so index 0 holds this page's result.
+        for line in rec_predictions[0].text_lines:
+            x1, y1, x2, y2 = line.bbox  # [x1, y1, x2, y2] in pixel coordinates
+
+            all_blocks.append({
+                # NOTE(review): zero-based; confirm the consumer does not expect 1-based pages.
+                "pageNumber": page_idx,
+                "x": x1 / page_w,
+                "y": y1 / page_h,
+                "width": (x2 - x1) / page_w,
+                "height": (y2 - y1) / page_h,
+                "polygon": None,
+                "text": line.text,
+            })
+
+    return all_blocks