familienarchiv/ocr-service/engines/surya.py
Marcel 1f7b712dd0
fix(ocr): accept sender_model_path in Surya engine so non-Kurrent OCR works
main.py unifies the call to both engines and always passes
`sender_model_path` (None for non-Kurrent scripts). Surya's
extract_region_text / extract_page_blocks accepted one fewer positional
arg than Kraken's, so every guided-OCR run on a TYPEWRITER or
HANDWRITING_LATIN document raised "takes 5 positional arguments but 6
were given" and the stream returned 0 blocks / 1 skipped page.

Add an ignored `sender_model_path` kwarg to both Surya functions so
their signatures match Kraken's, and guard against the regression with
two signature tests in test_engines.py that compare both engines'
parameter lists.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-23 09:28:25 +02:00
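
For reference, a minimal sketch of what the signature-parity tests could look like. The import paths and test names are assumptions for illustration; the actual tests live in test_engines.py and may differ.

```python
# Sketch of a regression guard comparing parameter lists; module paths and
# test names are assumptions, not the repository's actual test code.
import inspect

from engines import kraken, surya


def test_extract_page_blocks_signature_matches_kraken():
    # main.py calls both engines identically, so the parameter lists must
    # match (including the trailing sender_model_path accepted by both).
    surya_params = list(inspect.signature(surya.extract_page_blocks).parameters)
    kraken_params = list(inspect.signature(kraken.extract_page_blocks).parameters)
    assert surya_params == kraken_params


def test_extract_region_text_signature_matches_kraken():
    surya_params = list(inspect.signature(surya.extract_region_text).parameters)
    kraken_params = list(inspect.signature(kraken.extract_region_text).parameters)
    assert surya_params == kraken_params
```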


"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
import logging
logger = logging.getLogger(__name__)
_recognition_predictor = None
_detection_predictor = None
_loaded = False
def load_models():
"""Lazy-load Surya models on first use to save RAM at idle.
Called automatically by extract_blocks(). Can also be called explicitly
to pre-warm if desired.
"""
global _recognition_predictor, _detection_predictor, _loaded
if _loaded:
return
logger.info("Loading Surya models (lazy, first OCR request)...")
from surya.foundation import FoundationPredictor
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor
foundation_predictor = FoundationPredictor()
_recognition_predictor = RecognitionPredictor(foundation_predictor)
_detection_predictor = DetectionPredictor()
_loaded = True
logger.info("Surya models loaded successfully")
def extract_page_blocks(
image, page_idx: int, language: str = "de", sender_model_path: str | None = None
) -> list[dict]:
"""Run Surya OCR on a single PIL image and return block dicts for that page.
`sender_model_path` is accepted for signature parity with the Kraken engine
(which uses it to select a fine-tuned HTR model) and is ignored here.
Coordinates are normalized to [0, 1].
"""
del sender_model_path
load_models()
page_w, page_h = image.size
blocks = []
predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
page_pred = predictions[0]
for line in page_pred.text_lines:
bbox = line.bbox
x1, y1, x2, y2 = bbox
polygon = None
if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
polygon = [
[p[0] / page_w, p[1] / page_h]
for p in line.polygon
]
words = []
if hasattr(line, "words") and line.words:
for word in line.words:
words.append({
"text": word.text,
"confidence": word.confidence,
})
else:
words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
blocks.append({
"pageNumber": page_idx,
"x": x1 / page_w,
"y": y1 / page_h,
"width": (x2 - x1) / page_w,
"height": (y2 - y1) / page_h,
"polygon": polygon,
"text": line.text,
"words": words,
})
return blocks
def extract_region_text(
image,
x: float,
y: float,
w: float,
h: float,
sender_model_path: str | None = None,
) -> str:
"""Crop image to a normalized region and run Surya recognition on the crop.
Used for guided OCR — skips full-page layout detection and only processes
the given bounding box. `sender_model_path` is accepted for signature
parity with the Kraken engine and is ignored here. Coordinates are
normalized to [0, 1].
"""
del sender_model_path
load_models()
pw, ph = image.size
x1 = max(0, int(x * pw))
y1 = max(0, int(y * ph))
x2 = min(pw, int((x + w) * pw))
y2 = min(ph, int((y + h) * ph))
crop = image.crop((x1, y1, x2, y2))
predictions = _recognition_predictor([crop], det_predictor=_detection_predictor)
return " ".join(line.text for line in predictions[0].text_lines)
def extract_blocks(images: list, language: str = "de") -> list[dict]:
"""Run Surya OCR on a list of PIL images (one per page).
Processes one page at a time to limit peak memory usage.
Returns a flat list of block dicts with pageNumber, x, y, width, height,
polygon, text, words. Coordinates are normalized to [0, 1].
"""
all_blocks = []
for page_idx, image in enumerate(images, start=1):
all_blocks.extend(extract_page_blocks(image, page_idx, language))
del image
return all_blocks
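
For context, a minimal sketch of the unified call pattern the commit message describes in main.py, where `sender_model_path` is always passed and is None for non-Kurrent scripts. The dispatcher name and the Kurrent script-type constant are assumptions for illustration; main.py's actual structure is not shown here.

```python
# Hypothetical dispatcher illustrating the unified call; names other than the
# engine functions and the TYPEWRITER / HANDWRITING_LATIN types are assumptions.
from engines import kraken, surya


def ocr_region(image, x, y, w, h, script_type: str, sender_model_path: str | None = None) -> str:
    # Kurrent handwriting goes to Kraken, which uses sender_model_path to pick
    # a fine-tuned HTR model; TYPEWRITER and HANDWRITING_LATIN go to Surya,
    # which accepts the argument for signature parity and ignores it.
    engine = kraken if script_type == "HANDWRITING_KURRENT" else surya
    return engine.extract_region_text(image, x, y, w, h, sender_model_path)
```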