"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting.""" import logging logger = logging.getLogger(__name__) _recognition_predictor = None _detection_predictor = None _loaded = False def load_models(): """Lazy-load Surya models on first use to save RAM at idle. Called automatically by extract_blocks(). Can also be called explicitly to pre-warm if desired. """ global _recognition_predictor, _detection_predictor, _loaded if _loaded: return logger.info("Loading Surya models (lazy, first OCR request)...") from surya.foundation import FoundationPredictor from surya.recognition import RecognitionPredictor from surya.detection import DetectionPredictor foundation_predictor = FoundationPredictor() _recognition_predictor = RecognitionPredictor(foundation_predictor) _detection_predictor = DetectionPredictor() _loaded = True logger.info("Surya models loaded successfully") def extract_page_blocks( image, page_idx: int, language: str = "de", sender_model_path: str | None = None ) -> list[dict]: """Run Surya OCR on a single PIL image and return block dicts for that page. `sender_model_path` is accepted for signature parity with the Kraken engine (which uses it to select a fine-tuned HTR model) and is ignored here. Coordinates are normalized to [0, 1]. """ del sender_model_path load_models() page_w, page_h = image.size blocks = [] predictions = _recognition_predictor([image], det_predictor=_detection_predictor) page_pred = predictions[0] for line in page_pred.text_lines: bbox = line.bbox x1, y1, x2, y2 = bbox polygon = None if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4: polygon = [ [p[0] / page_w, p[1] / page_h] for p in line.polygon ] words = [] if hasattr(line, "words") and line.words: for word in line.words: words.append({ "text": word.text, "confidence": word.confidence, }) else: words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}] blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, "y": y1 / page_h, "width": (x2 - x1) / page_w, "height": (y2 - y1) / page_h, "polygon": polygon, "text": line.text, "words": words, }) return blocks def extract_region_text( image, x: float, y: float, w: float, h: float, sender_model_path: str | None = None, ) -> str: """Crop image to a normalized region and run Surya recognition on the crop. Used for guided OCR — skips full-page layout detection and only processes the given bounding box. `sender_model_path` is accepted for signature parity with the Kraken engine and is ignored here. Coordinates are normalized to [0, 1]. """ del sender_model_path load_models() pw, ph = image.size x1 = max(0, int(x * pw)) y1 = max(0, int(y * ph)) x2 = min(pw, int((x + w) * pw)) y2 = min(ph, int((y + h) * ph)) crop = image.crop((x1, y1, x2, y2)) predictions = _recognition_predictor([crop], det_predictor=_detection_predictor) return " ".join(line.text for line in predictions[0].text_lines) def extract_blocks(images: list, language: str = "de") -> list[dict]: """Run Surya OCR on a list of PIL images (one per page). Processes one page at a time to limit peak memory usage. Returns a flat list of block dicts with pageNumber, x, y, width, height, polygon, text, words. Coordinates are normalized to [0, 1]. """ all_blocks = [] for page_idx, image in enumerate(images, start=1): all_blocks.extend(extract_page_blocks(image, page_idx, language)) del image return all_blocks