Enable per-page processing by extracting the inner loop body of extract_blocks() into extract_page_blocks(image, page_idx, language). The original extract_blocks() now delegates to the new function, preserving backward compatibility for the batch path. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
133 lines
4.4 KiB
Python
133 lines
4.4 KiB
Python
"""Tests for per-page block extraction in OCR engines."""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
from PIL import Image
|
|
|
|
from engines import surya, kraken
|
|
|
|
|
|
# ─── Surya extract_page_blocks ───────────────────────────────────────────────
|
|
|
|
|
|
def _make_surya_line(text, bbox, polygon=None, words=None):
|
|
"""Create a mock Surya text line with the expected attributes."""
|
|
line = MagicMock()
|
|
line.text = text
|
|
line.bbox = bbox
|
|
line.polygon = polygon
|
|
line.words = words or []
|
|
if not words:
|
|
line.confidence = 0.95
|
|
# hasattr check needs words to be falsy
|
|
del line.words
|
|
return line
|
|
|
|
|
|
def test_surya_extract_page_blocks_returns_blocks_for_single_image():
    """One recognised line on a 100x200 page yields one size-relative block."""
    page = Image.new("RGB", (100, 200))
    prediction = MagicMock(
        text_lines=[_make_surya_line("Hallo Welt", [10, 20, 90, 40])],
    )

    # Stub the recognition predictor and mark the engine as already loaded.
    with patch.object(surya, "_recognition_predictor", return_value=[prediction]):
        with patch.object(surya, "_loaded", True):
            blocks = surya.extract_page_blocks(page, page_idx=2, language="de")

    assert len(blocks) == 1
    block = blocks[0]
    assert block["pageNumber"] == 2
    assert block["text"] == "Hallo Welt"

    # Geometry is expected as a fraction of the page width/height.
    expected_geometry = {
        "x": 10 / 100,
        "y": 20 / 200,
        "width": 80 / 100,
        "height": 20 / 200,
    }
    for key, value in expected_geometry.items():
        assert block[key] == value
|
|
|
|
|
|
def test_surya_extract_blocks_delegates_to_extract_page_blocks():
    """The batch entry point yields one block per page, numbered from zero."""
    pages = [Image.new("RGB", (100, 200)) for _ in range(2)]
    prediction = MagicMock(
        text_lines=[_make_surya_line("Test", [10, 20, 90, 40])],
    )

    # Every predictor call returns the same single-line prediction.
    with patch.object(surya, "_recognition_predictor", return_value=[prediction]):
        with patch.object(surya, "_loaded", True):
            blocks = surya.extract_blocks(pages)

    assert len(blocks) == 2
    first, second = blocks
    assert first["pageNumber"] == 0
    assert second["pageNumber"] == 1
|
|
|
|
|
|
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
|
|
|
|
|
def _make_kraken_record(text, cuts, confidences=None):
|
|
record = MagicMock()
|
|
record.prediction = text
|
|
record.cuts = cuts
|
|
record.line = cuts
|
|
record.confidences = confidences or [0.9] * len(text)
|
|
return record
|
|
|
|
|
|
def _run_kraken_with_mocks(fn, *args):
    """Call ``fn(*args)`` with ``kraken.blla``/``kraken.rpred`` faked in sys.modules.

    While ``fn`` runs, the ``sys.modules`` entries ``kraken``, ``kraken.blla``
    and ``kraken.rpred`` point at mocks, and ``_model`` on the engine module
    imported in this file as ``kraken`` is replaced by a ``MagicMock``. The
    fake ``rpred.rpred`` returns a single record with the text "Kurrent".
    The previous ``sys.modules`` entries are put back afterwards, even when
    ``fn`` raises.

    Args:
        fn: Callable under test.
        *args: Positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn(*args)`` returns.
    """
    import sys

    fake_record = _make_kraken_record(
        "Kurrent", [(10, 20), (90, 20), (90, 40), (10, 40)]
    )
    fake_blla = MagicMock()
    fake_blla.segment.return_value = MagicMock()
    fake_rpred = MagicMock()
    fake_rpred.rpred.return_value = [fake_record]

    replacements = {
        "kraken": MagicMock(blla=fake_blla, rpred=fake_rpred),
        "kraken.blla": fake_blla,
        "kraken.rpred": fake_rpred,
    }
    # Remember what was registered before (None means "not present").
    previous = {name: sys.modules.get(name) for name in replacements}
    sys.modules.update(replacements)

    try:
        with patch.object(kraken, "_model", MagicMock()):
            return fn(*args)
    finally:
        # Restore only the three touched keys; anything else imported while
        # ``fn`` ran stays registered.
        for name, module in previous.items():
            if module is None:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module
|
|
|
|
|
|
def test_kraken_extract_page_blocks_returns_blocks_for_single_image():
    """The single mocked record becomes one block tagged with the given page."""
    page = Image.new("RGB", (100, 200))

    blocks = _run_kraken_with_mocks(kraken.extract_page_blocks, page, 3, "de")

    assert len(blocks) == 1
    block = blocks[0]
    assert block["pageNumber"] == 3
    assert block["text"] == "Kurrent"
|
|
|
|
|
|
def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
    """The batch entry point yields one block per page, numbered from zero."""
    pages = [Image.new("RGB", (100, 200)) for _ in range(2)]

    blocks = _run_kraken_with_mocks(kraken.extract_blocks, pages)

    assert len(blocks) == 2
    first, second = blocks
    assert first["pageNumber"] == 0
    assert second["pageNumber"] == 1
|