feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose
Python microservice (ocr-service/): - FastAPI app with /ocr and /health endpoints - Surya engine: transformer-based OCR for typewritten/modern handwriting - Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python polygon-to-quad approximation (gift wrapping + rotating calipers) - Eager model loading at startup via lifespan context manager - PDF download via httpx, page rendering via pypdfium2 at 300 DPI Java RestClientOcrClient: - Implements OcrClient + OcrHealthClient interfaces - Calls Python service via Spring RestClient - Health check with graceful fallback Docker Compose: - New ocr-service container (mem_limit 6g, no host ports) - Health check with start_period 60s for model loading - ocr_models volume for Kraken model files - Backend depends on ocr-service health Refs #226, #227 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
66
ocr-service/engines/surya.py
Normal file
66
ocr-service/engines/surya.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level Surya state. Every slot stays None until load_models() runs
# (once, at startup) and fills it in.
_detection_model = _detection_processor = None
_recognition_model = _recognition_processor = None
|
||||
|
||||
|
||||
def load_models():
    """Load the Surya detection and recognition components into module state.

    Meant to be called once at container startup so the models are already
    in memory before extract_blocks() is used.
    """
    global _detection_model, _detection_processor
    global _recognition_model, _recognition_processor

    logger.info("Loading Surya models...")

    # Surya is imported inside the function, after the log line above, rather
    # than at module import time.
    from surya.model.detection.model import (
        load_model as build_detector,
        load_processor as build_detector_processor,
    )
    from surya.model.recognition.model import load_model as build_recognizer
    from surya.model.recognition.processor import (
        load_processor as build_recognizer_processor,
    )

    # Detection components first, then recognition.
    _detection_model = build_detector()
    _detection_processor = build_detector_processor()
    _recognition_model = build_recognizer()
    _recognition_processor = build_recognizer_processor()

    logger.info("Surya models loaded successfully")
|
||||
|
||||
|
||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Args:
        images: Page images in document order. Each must expose ``.size`` as
            ``(width, height)`` in pixels.
        language: Language code passed to Surya's recognizer for every page.

    Returns:
        A flat list of block dicts, one per recognized text line, with keys
        ``pageNumber``, ``x``, ``y``, ``width``, ``height``, ``polygon`` and
        ``text``. ``pageNumber`` is the zero-based index of the page within
        ``images``; ``polygon`` is always None. Coordinates are normalized to
        [0, 1] relative to page dimensions.

    Raises:
        RuntimeError: If load_models() has not populated the models yet.
    """
    required = (
        _detection_model,
        _detection_processor,
        _recognition_model,
        _recognition_processor,
    )
    if any(component is None for component in required):
        # Fail with a clear message instead of handing None to Surya.
        raise RuntimeError("Surya models are not loaded; call load_models() first")

    from surya.ocr import run_ocr

    all_blocks = []

    for page_idx, image in enumerate(images):
        page_w, page_h = image.size

        # run_ocr performs detection followed by recognition and returns one
        # result per input image. Languages are supplied as one list per image.
        predictions = run_ocr(
            [image],
            [[language]],
            _detection_model,
            _detection_processor,
            _recognition_model,
            _recognition_processor,
        )

        for line in predictions[0].text_lines:
            # bbox is [x1, y1, x2, y2] in pixel coordinates
            x1, y1, x2, y2 = line.bbox

            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": None,
                "text": line.text,
            })

    return all_blocks
|
||||
Reference in New Issue
Block a user