- Surya models lazy-load on first OCR request instead of at startup (saves ~3-4GB idle RAM; Kraken stays eager at ~16MB)
- Process one page at a time in the Surya engine (limits peak memory)
- RECOGNITION_BATCH_SIZE=1, DETECTOR_BATCH_SIZE=1 (slower but fits in RAM)
- Revert mem_limit back to 6GB (sufficient with these optimizations)
- Render DPI stays at 200

Idle memory: ~2GB (Kraken only). Peak during OCR: ~5-6GB (Surya loaded).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
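RECOGNITION_BATCH_SIZE and DETECTOR_BATCH_SIZE are plain environment variables that Surya picks up from the process environment; a minimal sketch of pinning them from Python, assuming they are set before Surya is first imported (which the module below defers to the first OCR request). The compose-level mem_limit lives in docker-compose.yml and is not shown.

    import os

    # Batch sizes from the commit message; setdefault keeps any values
    # already provided by the container environment
    os.environ.setdefault("RECOGNITION_BATCH_SIZE", "1")
    os.environ.setdefault("DETECTOR_BATCH_SIZE", "1")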
"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""

import logging

logger = logging.getLogger(__name__)

_recognition_predictor = None
_detection_predictor = None
_loaded = False


def load_models():
    """Lazy-load Surya models on first use to save RAM at idle.

    Called automatically by extract_blocks(). Can also be called explicitly
    to pre-warm if desired.
    """
    global _recognition_predictor, _detection_predictor, _loaded
    if _loaded:
        return

    logger.info("Loading Surya models (lazy, first OCR request)...")

    # Import here rather than at module load so the heavy Surya/torch
    # dependencies are only pulled in on first use
    from surya.foundation import FoundationPredictor
    from surya.recognition import RecognitionPredictor
    from surya.detection import DetectionPredictor

    foundation_predictor = FoundationPredictor()
    _recognition_predictor = RecognitionPredictor(foundation_predictor)
    _detection_predictor = DetectionPredictor()
    _loaded = True

    logger.info("Surya models loaded successfully")
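

# Optional pre-warm sketch (an assumption, not part of the original module):
# a service that would rather pay the model-load cost at startup than on the
# first OCR request can call load_models() from its own startup hook, e.g.:
#
#     from ocr.engines import surya_engine  # hypothetical import path
#     surya_engine.load_models()  # blocks until the Surya weights are in memory

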
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Processes one page at a time to limit peak memory usage. The language
    hint is accepted for interface parity but is not currently passed to
    Surya. Returns a flat list of block dicts with pageNumber, x, y, width,
    height, polygon, text, words. Coordinates are normalized to [0, 1].
    """
    load_models()

    all_blocks = []

    for page_idx, image in enumerate(images):
        page_w, page_h = image.size

        # Process a single page per call to limit peak memory
        predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
        page_pred = predictions[0]

        for line in page_pred.text_lines:
            x1, y1, x2, y2 = line.bbox

            # Normalize the four-point polygon to [0, 1] when Surya provides one
            polygon = None
            if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
                polygon = [[p[0] / page_w, p[1] / page_h] for p in line.polygon]

            # Prefer per-word confidences; otherwise fall back to a single
            # pseudo-word covering the whole line
            words = []
            if hasattr(line, "words") and line.words:
                for word in line.words:
                    words.append({
                        "text": word.text,
                        "confidence": word.confidence,
                    })
            else:
                words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]

            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": polygon,
                "text": line.text,
                "words": words,
            })

        # Drop the local reference; note that the caller's `images` list still
        # holds the page, so this alone does not free it
        del image

    return all_blocks
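
A usage sketch for extract_blocks(), assuming pages are rasterized with pdf2image at the 200 DPI mentioned in the commit message; the PDF path and printed fields are illustrative:

    from pdf2image import convert_from_path

    pages = convert_from_path("scan.pdf", dpi=200)  # one PIL image per page
    blocks = extract_blocks(pages, language="de")
    for block in blocks:
        print(block["pageNumber"], block["text"])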