From 1f7b712dd080f4698e1957c20ef3f0e371da0a85 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 23 Apr 2026 09:28:25 +0200 Subject: [PATCH] fix(ocr): accept sender_model_path in Surya engine so non-Kurrent OCR works main.py unifies the call to both engines and always passes `sender_model_path` (None for non-Kurrent scripts). Surya's extract_region_text / extract_page_blocks accepted one fewer positional arg than Kraken's, so every guided-OCR run on a TYPEWRITER or HANDWRITING_LATIN document raised "takes 5 positional arguments but 6 were given" and the stream returned 0 blocks / 1 skipped page. Add an ignored `sender_model_path` kwarg to both Surya functions so the signatures match Kraken's, and guard the regression with two signature tests in test_engines.py that compare both engines' parameter lists. Co-Authored-By: Claude Opus 4.7 --- ocr-service/engines/surya.py | 21 ++++++++++++++++++--- ocr-service/test_engines.py | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py index 59301865..42f21369 100644 --- a/ocr-service/engines/surya.py +++ b/ocr-service/engines/surya.py @@ -33,11 +33,16 @@ def load_models(): logger.info("Surya models loaded successfully") -def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]: +def extract_page_blocks( + image, page_idx: int, language: str = "de", sender_model_path: str | None = None +) -> list[dict]: """Run Surya OCR on a single PIL image and return block dicts for that page. + `sender_model_path` is accepted for signature parity with the Kraken engine + (which uses it to select a fine-tuned HTR model) and is ignored here. Coordinates are normalized to [0, 1]. """ + del sender_model_path load_models() page_w, page_h = image.size @@ -81,12 +86,22 @@ def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict return blocks -def extract_region_text(image, x: float, y: float, w: float, h: float) -> str: +def extract_region_text( + image, + x: float, + y: float, + w: float, + h: float, + sender_model_path: str | None = None, +) -> str: """Crop image to a normalized region and run Surya recognition on the crop. Used for guided OCR — skips full-page layout detection and only processes - the given bounding box. Coordinates are normalized to [0, 1]. + the given bounding box. `sender_model_path` is accepted for signature + parity with the Kraken engine and is ignored here. Coordinates are + normalized to [0, 1]. """ + del sender_model_path load_models() pw, ph = image.size diff --git a/ocr-service/test_engines.py b/ocr-service/test_engines.py index a7ceba27..6202c02a 100644 --- a/ocr-service/test_engines.py +++ b/ocr-service/test_engines.py @@ -1,5 +1,6 @@ """Tests for per-page block extraction in OCR engines.""" +import inspect from unittest.mock import MagicMock, patch from PIL import Image @@ -176,3 +177,24 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks(): assert len(blocks) == 2 assert blocks[0]["pageNumber"] == 1 assert blocks[1]["pageNumber"] == 2 + + +# ─── Engine signatures must match ───────────────────────────────────────────── +# +# main.py resolves `engine = kraken_engine if use_kraken else surya_engine` and +# then invokes the chosen engine with a uniform call pattern that always +# includes `sender_model_path` (None for non-Kurrent scripts). A signature +# drift between the two engines therefore breaks OCR at runtime — which is +# exactly the regression these tests guard against. + + +def test_extract_region_text_signatures_match(): + surya_params = list(inspect.signature(surya.extract_region_text).parameters) + kraken_params = list(inspect.signature(kraken.extract_region_text).parameters) + assert surya_params == kraken_params + + +def test_extract_page_blocks_signatures_match(): + surya_params = list(inspect.signature(surya.extract_page_blocks).parameters) + kraken_params = list(inspect.signature(kraken.extract_page_blocks).parameters) + assert surya_params == kraken_params