"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""

import logging

logger = logging.getLogger(__name__)

# Module-level predictor singletons. They stay None until load_models() has
# completed successfully; extract_blocks() refuses to run before that.
_recognition_predictor = None
_detection_predictor = None


def load_models():
    """Eagerly load Surya models into memory. Called once at container startup.

    Builds a FoundationPredictor, a RecognitionPredictor on top of it, and a
    DetectionPredictor, then stores the latter two in the module-level
    globals used by extract_blocks().

    Any exception raised while importing surya or constructing a predictor
    propagates to the caller; in that case the module globals are left
    untouched, so the module never ends up with only one of the two
    predictors set.
    """
    global _recognition_predictor, _detection_predictor

    logger.info("Loading Surya models...")

    # Imported inside the function so that importing this module does not
    # itself import surya.
    from surya.foundation import FoundationPredictor
    from surya.recognition import RecognitionPredictor
    from surya.detection import DetectionPredictor

    foundation_predictor = FoundationPredictor()
    recognition_predictor = RecognitionPredictor(foundation_predictor)
    detection_predictor = DetectionPredictor()

    # Publish both predictors together, only after both were built.
    _recognition_predictor = recognition_predictor
    _detection_predictor = detection_predictor

    logger.info("Surya models loaded successfully")


def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Returns a flat list of block dicts with pageNumber, x, y, width, height,
    polygon, text, words. Coordinates are normalized to [0, 1] relative to
    page dimensions. Surya 0.17+ returns polygon (4-point) natively on each
    text line.

    Args:
        images: Page images; each must expose ``.size`` as ``(width, height)``.
        language: Accepted for interface compatibility; currently not passed
            on to the predictors.

    Returns:
        One dict per recognized text line, in page order:
            pageNumber: zero-based index of the page within ``images``.
            x, y, width, height: line bounding box, normalized to the page.
            polygon: list of four normalized ``[x, y]`` points, or None when
                the line has no 4-point polygon.
            text: recognized text of the line.
            words: list of ``{"text", "confidence"}`` dicts; falls back to a
                single entry for the whole line when no word-level results
                are available.
        An empty list when ``images`` is empty.

    Raises:
        RuntimeError: If load_models() has not been called successfully.
    """
    # Nothing to recognize: skip the model call entirely.
    if not images:
        return []

    if _recognition_predictor is None or _detection_predictor is None:
        raise RuntimeError(
            "Surya models are not loaded; call load_models() before extract_blocks()"
        )

    predictions = _recognition_predictor(images, det_predictor=_detection_predictor)

    all_blocks = []
    for page_idx, page_pred in enumerate(predictions):
        page_w, page_h = images[page_idx].size

        for line in page_pred.text_lines:
            # bbox is [x1, y1, x2, y2] in pixel coordinates.
            x1, y1, x2, y2 = line.bbox

            # Surya 0.17 provides polygon as list of (x, y) tuples
            # (4 points, clockwise); anything else is reported as None.
            polygon = None
            raw_polygon = getattr(line, "polygon", None)
            if raw_polygon and len(raw_polygon) == 4:
                polygon = [[p[0] / page_w, p[1] / page_h] for p in raw_polygon]

            # Word-level confidence is kept for "[unleserlich]" (illegible)
            # marking; without word results, use the line as a single word.
            line_words = getattr(line, "words", None)
            if line_words:
                words = [
                    {"text": word.text, "confidence": word.confidence}
                    for word in line_words
                ]
            else:
                words = [
                    {
                        "text": line.text,
                        "confidence": getattr(line, "confidence", 1.0),
                    }
                ]

            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": polygon,
                "text": line.text,
                "words": words,
            })

    return all_blocks