familienarchiv/ocr-service/engines/surya.py
Marcel 1f7b712dd0
fix(ocr): accept sender_model_path in Surya engine so non-Kurrent OCR works
main.py unifies the call to both engines and always passes
`sender_model_path` (None for non-Kurrent scripts). Surya's
extract_region_text / extract_page_blocks accepted one fewer positional
arg than Kraken's, so every guided-OCR run on a TYPEWRITER or
HANDWRITING_LATIN document raised "takes 5 positional arguments but 6
were given" and the stream returned 0 blocks / 1 skipped page.

Add an ignored `sender_model_path` kwarg to both Surya functions so
their signatures match Kraken's, and guard against the regression with
two signature tests in test_engines.py that compare both engines'
parameter lists.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-23 09:28:25 +02:00
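
For reference, a minimal sketch of what the signature-parity tests could look like. The import paths and test names are assumptions for illustration; the actual tests live in test_engines.py and may differ.

```python
# Sketch of a regression guard comparing parameter lists; module paths and
# test names are assumptions, not the repository's actual test code.
import inspect

from engines import kraken, surya


def test_extract_page_blocks_signature_matches_kraken():
    # main.py calls both engines identically, so the parameter lists must
    # match (including the trailing sender_model_path accepted by both).
    surya_params = list(inspect.signature(surya.extract_page_blocks).parameters)
    kraken_params = list(inspect.signature(kraken.extract_page_blocks).parameters)
    assert surya_params == kraken_params


def test_extract_region_text_signature_matches_kraken():
    surya_params = list(inspect.signature(surya.extract_region_text).parameters)
    kraken_params = list(inspect.signature(kraken.extract_region_text).parameters)
    assert surya_params == kraken_params
```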


"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
import logging
logger = logging.getLogger(__name__)
_recognition_predictor = None
_detection_predictor = None
_loaded = False
def load_models():
"""Lazy-load Surya models on first use to save RAM at idle.
Called automatically by extract_blocks(). Can also be called explicitly
to pre-warm if desired.
"""
global _recognition_predictor, _detection_predictor, _loaded
if _loaded:
return
logger.info("Loading Surya models (lazy, first OCR request)...")
from surya.foundation import FoundationPredictor
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor
foundation_predictor = FoundationPredictor()
_recognition_predictor = RecognitionPredictor(foundation_predictor)
_detection_predictor = DetectionPredictor()
_loaded = True
logger.info("Surya models loaded successfully")
def extract_page_blocks(
image, page_idx: int, language: str = "de", sender_model_path: str | None = None
) -> list[dict]:
"""Run Surya OCR on a single PIL image and return block dicts for that page.
`sender_model_path` is accepted for signature parity with the Kraken engine
(which uses it to select a fine-tuned HTR model) and is ignored here.
Coordinates are normalized to [0, 1].
"""
del sender_model_path
load_models()
page_w, page_h = image.size
blocks = []
predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
page_pred = predictions[0]
for line in page_pred.text_lines:
bbox = line.bbox
x1, y1, x2, y2 = bbox
polygon = None
if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
polygon = [
[p[0] / page_w, p[1] / page_h]
for p in line.polygon
]
words = []
if hasattr(line, "words") and line.words:
for word in line.words:
words.append({
"text": word.text,
"confidence": word.confidence,
})
else:
words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
blocks.append({
"pageNumber": page_idx,
"x": x1 / page_w,
"y": y1 / page_h,
"width": (x2 - x1) / page_w,
"height": (y2 - y1) / page_h,
"polygon": polygon,
"text": line.text,
"words": words,
})
return blocks
def extract_region_text(
image,
x: float,
y: float,
w: float,
h: float,
sender_model_path: str | None = None,
) -> str:
"""Crop image to a normalized region and run Surya recognition on the crop.
Used for guided OCR — skips full-page layout detection and only processes
the given bounding box. `sender_model_path` is accepted for signature
parity with the Kraken engine and is ignored here. Coordinates are
normalized to [0, 1].
"""
del sender_model_path
load_models()
pw, ph = image.size
x1 = max(0, int(x * pw))
y1 = max(0, int(y * ph))
x2 = min(pw, int((x + w) * pw))
y2 = min(ph, int((y + h) * ph))
crop = image.crop((x1, y1, x2, y2))
predictions = _recognition_predictor([crop], det_predictor=_detection_predictor)
return " ".join(line.text for line in predictions[0].text_lines)
def extract_blocks(images: list, language: str = "de") -> list[dict]:
"""Run Surya OCR on a list of PIL images (one per page).
Processes one page at a time to limit peak memory usage.
Returns a flat list of block dicts with pageNumber, x, y, width, height,
polygon, text, words. Coordinates are normalized to [0, 1].
"""
all_blocks = []
for page_idx, image in enumerate(images, start=1):
all_blocks.extend(extract_page_blocks(image, page_idx, language))
del image
return all_blocks
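
For context, a minimal sketch of the unified call pattern the commit message describes in main.py, where `sender_model_path` is always passed and is None for non-Kurrent scripts. The dispatcher name and the Kurrent script-type constant are assumptions for illustration; main.py's actual structure is not shown here.

```python
# Hypothetical dispatcher illustrating the unified call; names other than the
# engine functions and the TYPEWRITER / HANDWRITING_LATIN types are assumptions.
from engines import kraken, surya


def ocr_region(image, x, y, w, h, script_type: str, sender_model_path: str | None = None) -> str:
    # Kurrent handwriting goes to Kraken, which uses sender_model_path to pick
    # a fine-tuned HTR model; TYPEWRITER and HANDWRITING_LATIN go to Surya,
    # which accepts the argument for signature parity and ignores it.
    engine = kraken if script_type == "HANDWRITING_KURRENT" else surya
    return engine.extract_region_text(image, x, y, w, h, sender_model_path)
```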