From 1f7b712dd080f4698e1957c20ef3f0e371da0a85 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Thu, 23 Apr 2026 09:28:25 +0200
Subject: [PATCH] fix(ocr): accept sender_model_path in Surya engine so
 non-Kurrent OCR works

main.py unifies the call to both engines and always passes
`sender_model_path` (None for non-Kurrent scripts). Surya's
extract_region_text / extract_page_blocks accepted one fewer positional
arg than Kraken's, so every guided-OCR run on a TYPEWRITER or
HANDWRITING_LATIN document raised a TypeError ("takes 5 positional
arguments but 6 were given", from extract_region_text) and the stream
returned 0 blocks / 1 skipped page.

Add an optional, ignored `sender_model_path` parameter (default None) as
the last parameter of both Surya functions so the signatures match
Kraken's, and guard the regression with two signature tests in
test_engines.py that compare both engines' parameter names and order.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 ocr-service/engines/surya.py | 21 ++++++++++++++++++---
 ocr-service/test_engines.py  | 22 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py
index 59301865..42f21369 100644
--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -33,11 +33,16 @@ def load_models():
     logger.info("Surya models loaded successfully")
 
 
-def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
+def extract_page_blocks(
+    image, page_idx: int, language: str = "de", sender_model_path: str | None = None
+) -> list[dict]:
     """Run Surya OCR on a single PIL image and return block dicts for that page.
 
+    `sender_model_path` is accepted for signature parity with the Kraken engine
+    (which uses it to select a fine-tuned HTR model) and is ignored here.
     Coordinates are normalized to [0, 1].
     """
+    del sender_model_path
     load_models()
 
     page_w, page_h = image.size
@@ -81,12 +86,22 @@ def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict
     return blocks
 
 
-def extract_region_text(image, x: float, y: float, w: float, h: float) -> str:
+def extract_region_text(
+    image,
+    x: float,
+    y: float,
+    w: float,
+    h: float,
+    sender_model_path: str | None = None,
+) -> str:
     """Crop image to a normalized region and run Surya recognition on the crop.
 
     Used for guided OCR — skips full-page layout detection and only processes
-    the given bounding box. Coordinates are normalized to [0, 1].
+    the given bounding box. `sender_model_path` is accepted for signature
+    parity with the Kraken engine and is ignored here. Coordinates are
+    normalized to [0, 1].
     """
+    del sender_model_path
     load_models()
 
     pw, ph = image.size
diff --git a/ocr-service/test_engines.py b/ocr-service/test_engines.py
index a7ceba27..6202c02a 100644
--- a/ocr-service/test_engines.py
+++ b/ocr-service/test_engines.py
@@ -1,5 +1,6 @@
 """Tests for per-page block extraction in OCR engines."""
 
+import inspect
 from unittest.mock import MagicMock, patch
 from PIL import Image
 
@@ -176,3 +177,24 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
     assert len(blocks) == 2
     assert blocks[0]["pageNumber"] == 1
     assert blocks[1]["pageNumber"] == 2
+
+
+# ─── Engine signatures must match ─────────────────────────────────────────────
+#
+# main.py resolves `engine = kraken_engine if use_kraken else surya_engine` and
+# then invokes the chosen engine with a uniform call pattern that always
+# includes `sender_model_path` (None for non-Kurrent scripts). A signature
+# drift between the two engines therefore breaks OCR at runtime — which is
+# exactly the regression these tests guard against.
+
+
+def test_extract_region_text_signatures_match():
+    surya_params = list(inspect.signature(surya.extract_region_text).parameters)
+    kraken_params = list(inspect.signature(kraken.extract_region_text).parameters)
+    assert surya_params == kraken_params
+
+
+def test_extract_page_blocks_signatures_match():
+    surya_params = list(inspect.signature(surya.extract_page_blocks).parameters)
+    kraken_params = list(inspect.signature(kraken.extract_page_blocks).parameters)
+    assert surya_params == kraken_params