"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting.""" import logging logger = logging.getLogger(__name__) # Lazy-loaded at startup via load_models() _recognition_model = None _recognition_processor = None _detection_model = None _detection_processor = None def load_models(): """Eagerly load Surya models into memory. Called once at container startup.""" global _recognition_model, _recognition_processor, _detection_model, _detection_processor logger.info("Loading Surya models...") from surya.model.detection.model import load_model as load_det_model from surya.model.detection.model import load_processor as load_det_processor from surya.model.recognition.model import load_model as load_rec_model from surya.model.recognition.processor import load_processor as load_rec_processor _detection_model = load_det_model() _detection_processor = load_det_processor() _recognition_model = load_rec_model() _recognition_processor = load_rec_processor() logger.info("Surya models loaded successfully") def extract_blocks(images: list, language: str = "de") -> list[dict]: """Run Surya OCR on a list of PIL images (one per page). Returns a flat list of block dicts with pageNumber, x, y, width, height, text. Coordinates are normalized to [0, 1] relative to page dimensions. """ from surya.detection import batch_text_detection from surya.recognition import batch_recognition all_blocks = [] for page_idx, image in enumerate(images): page_w, page_h = image.size det_predictions = batch_text_detection([image], _detection_model, _detection_processor) rec_predictions = batch_recognition( [image], det_predictions, _recognition_model, _recognition_processor, [language] ) for line in rec_predictions[0].text_lines: bbox = line.bbox # [x1, y1, x2, y2] in pixel coordinates x1, y1, x2, y2 = bbox all_blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, "y": y1 / page_h, "width": (x2 - x1) / page_w, "height": (y2 - y1) / page_h, "polygon": None, "text": line.text, }) return all_blocks