Files
familienarchiv/ocr-service/test_engines.py
Marcel 23cf88856e
Some checks failed
CI / Unit & Component Tests (push) Failing after 2m37s
CI / OCR Service Tests (push) Successful in 32s
CI / Backend Unit Tests (push) Failing after 2m51s
fix(ocr): guard Kraken block extraction against missing boundary/baseline
extract_page_blocks() walked `record.boundary` and `record.baseline`
unconditionally, so a record that arrived without either (malformed
kraken output, or a MagicMock in tests that iterates to nothing)
crashed with "min() arg is an empty sequence".

Coerce both attributes through list(), require at least 3 points for
the polygon path, fall back to the baseline path when the polygon is
missing, and skip the record entirely when neither is usable —
emitting no block is safer than emitting one with garbage coordinates.

The test helper now sets `boundary` and `baseline` explicitly to
mirror real Kraken 7.0 records (and so the happy-path test exercises
the polygon branch). A new regression test covers the skip path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-23 09:33:03 +02:00

233 lines
8.2 KiB
Python

"""Tests for per-page block extraction in OCR engines."""
import inspect
from unittest.mock import MagicMock, patch
from PIL import Image
from engines import surya, kraken
# ─── Surya extract_page_blocks ───────────────────────────────────────────────
def _make_surya_line(text, bbox, polygon=None, words=None):
"""Create a mock Surya text line with the expected attributes."""
line = MagicMock()
line.text = text
line.bbox = bbox
line.polygon = polygon
line.words = words or []
if not words:
line.confidence = 0.95
# hasattr check needs words to be falsy
del line.words
return line
def test_surya_extract_page_blocks_returns_blocks_for_single_image():
    """One recognised line yields one block whose box is scaled by image size."""
    page = Image.new("RGB", (100, 200))
    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Hallo Welt", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=2, language="de")

    assert len(blocks) == 1
    block = blocks[0]
    assert block["pageNumber"] == 2
    assert block["text"] == "Hallo Welt"
    # Expected geometry is the pixel bbox divided by the 100x200 image size.
    assert block["x"] == 10 / 100
    assert block["y"] == 20 / 200
    assert block["width"] == 80 / 100
    assert block["height"] == 20 / 200
def test_surya_extract_page_blocks_extracts_polygon_when_present():
    """A line polygon is carried into the block, scaled by the image size."""
    page = Image.new("RGB", (100, 200))
    corners = [(10, 20), (90, 20), (90, 40), (10, 40)]
    prediction = MagicMock()
    prediction.text_lines = [
        _make_surya_line("Text", [10, 20, 90, 40], polygon=corners),
    ]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=1, language="de")

    polygon = blocks[0]["polygon"]
    assert polygon is not None
    assert len(polygon) == 4
    # First corner (10, 20) divided by the 100x200 image size.
    assert polygon[0] == [10 / 100, 20 / 200]
def test_surya_extract_page_blocks_extracts_word_level_confidence():
    """Word mocks attached to a line surface as per-word text and confidence."""
    page = Image.new("RGB", (100, 200))

    word_mocks = []
    for word_text, word_confidence in (("Hallo", 0.95), ("Welt", 0.3)):
        word = MagicMock()
        word.text = word_text
        word.confidence = word_confidence
        word_mocks.append(word)

    prediction = MagicMock()
    prediction.text_lines = [
        _make_surya_line("Hallo Welt", [10, 20, 90, 40], words=word_mocks),
    ]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=1, language="de")

    extracted = blocks[0]["words"]
    assert len(extracted) == 2
    assert extracted[0]["text"] == "Hallo"
    assert extracted[0]["confidence"] == 0.95
def test_surya_extract_blocks_delegates_to_extract_page_blocks():
    """After refactoring, extract_blocks should produce the same output.

    Two pages go in; one block per page comes back, numbered from 1.
    """
    pages = [Image.new("RGB", (100, 200)), Image.new("RGB", (100, 200))]
    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Test", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            # The same single-line prediction is returned on every call.
            recognizer.return_value = [prediction]
            blocks = surya.extract_blocks(pages)

    assert len(blocks) == 2
    first, second = blocks[0], blocks[1]
    assert first["pageNumber"] == 1
    assert second["pageNumber"] == 2
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
def _make_kraken_record(text, cuts, confidences=None, boundary=None, baseline=None):
record = MagicMock()
record.prediction = text
record.cuts = cuts
record.line = cuts
# Real kraken records expose `boundary` (polygon) and `baseline` lists.
# Mirror that here so the extract path doesn't take the "missing data"
# branch during normal tests.
record.boundary = boundary if boundary is not None else cuts
record.baseline = baseline if baseline is not None else cuts
record.confidences = confidences or [0.9] * len(text)
return record
def _run_kraken_with_mocks(fn, *args):
    """Call ``fn(*args)`` with stub kraken modules and a mocked model.

    ``kraken``, ``kraken.blla`` and ``kraken.rpred`` are replaced in
    ``sys.modules`` for the duration of the call. The ``rpred`` stub returns
    a single "Kurrent" record, and the engine module's ``_model`` is patched
    with a MagicMock. Whatever was registered under those three module names
    beforehand is put back afterwards, even if ``fn`` raises.

    Args:
        fn: The engine function under test.
        *args: Positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn(*args)`` returns.
    """
    import sys

    blla_stub = MagicMock()
    blla_stub.segment.return_value = MagicMock()
    rpred_stub = MagicMock()
    rpred_stub.rpred.return_value = [
        _make_kraken_record("Kurrent", [(10, 20), (90, 20), (90, 40), (10, 40)]),
    ]
    stubs = {
        "kraken": MagicMock(blla=blla_stub, rpred=rpred_stub),
        "kraken.blla": blla_stub,
        "kraken.rpred": rpred_stub,
    }

    # Snapshot the current entries before overwriting any of them.
    previous = {name: sys.modules.get(name) for name in stubs}
    sys.modules.update(stubs)
    try:
        with patch.object(kraken, "_model", MagicMock()):
            return fn(*args)
    finally:
        for name, module in previous.items():
            if module is None:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module
def test_kraken_extract_page_blocks_returns_blocks_for_single_image():
    """The stubbed single-record page yields one block with its text and page."""
    page = Image.new("RGB", (100, 200))

    result = _run_kraken_with_mocks(kraken.extract_page_blocks, page, 3, "de")

    assert len(result) == 1
    block = result[0]
    assert block["pageNumber"] == 3
    assert block["text"] == "Kurrent"
def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
    """Two pages in, one block per page out, with pages numbered from 1."""
    pages = [Image.new("RGB", (100, 200)), Image.new("RGB", (100, 200))]

    result = _run_kraken_with_mocks(kraken.extract_blocks, pages)

    assert len(result) == 2
    first, second = result[0], result[1]
    assert first["pageNumber"] == 1
    assert second["pageNumber"] == 2
def test_kraken_extract_page_blocks_skips_records_without_positional_data():
    """Records that arrive without a usable boundary polygon OR baseline must
    be dropped rather than crash min() with an empty sequence.

    The kraken modules are stubbed in ``sys.modules`` for the call only; any
    entries that existed beforehand are restored afterwards.
    """
    import sys

    image = Image.new("RGB", (100, 200))
    mock_blla = MagicMock()
    mock_blla.segment.return_value = MagicMock()
    mock_rpred = MagicMock()
    # Explicit empty lists: the helper would otherwise fall back to `cuts`.
    malformed = _make_kraken_record("Noise", [], boundary=[], baseline=[])
    mock_rpred.rpred.return_value = [malformed]

    stubs = {
        "kraken": MagicMock(blla=mock_blla, rpred=mock_rpred),
        "kraken.blla": mock_blla,
        "kraken.rpred": mock_rpred,
    }
    # Snapshot the prior entries so an already-imported real kraken is put
    # back instead of being evicted, which would leak into later tests.
    # Only these three keys are touched, so modules first imported during
    # the call stay registered.
    missing = object()
    saved = {name: sys.modules.get(name, missing) for name in stubs}
    sys.modules.update(stubs)
    try:
        with patch.object(kraken, "_model", MagicMock()):
            blocks = kraken.extract_page_blocks(image, page_idx=1, language="de")
    finally:
        for name, module in saved.items():
            if module is missing:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module

    assert blocks == []
# ─── Engine signatures must match ─────────────────────────────────────────────
#
# main.py resolves `engine = kraken_engine if use_kraken else surya_engine` and
# then invokes the chosen engine with a uniform call pattern that always
# includes `sender_model_path` (None for non-Kurrent scripts). A signature
# drift between the two engines therefore breaks OCR at runtime — which is
# exactly the regression these tests guard against.
def test_extract_region_text_signatures_match():
    """Both engines declare the same parameter names, in the same order."""
    surya_names, kraken_names = (
        list(inspect.signature(engine.extract_region_text).parameters)
        for engine in (surya, kraken)
    )
    assert surya_names == kraken_names
def test_extract_page_blocks_signatures_match():
    """Both engines declare the same parameter names, in the same order."""
    surya_names, kraken_names = (
        list(inspect.signature(engine.extract_page_blocks).parameters)
        for engine in (surya, kraken)
    )
    assert surya_names == kraken_names