"""Tests for per-page block extraction in OCR engines."""

import sys
from unittest.mock import MagicMock, patch

from PIL import Image

from engines import surya, kraken


# ─── Surya extract_page_blocks ───────────────────────────────────────────────


def _make_surya_line(text, bbox, polygon=None, words=None):
    """Build a mock Surya text line carrying the given text and geometry.

    When ``words`` is non-empty the line exposes it as ``line.words``.
    Otherwise the line gets a fixed line-level ``confidence`` of 0.95 and its
    ``words`` attribute is deleted.
    """
    line = MagicMock()
    line.text = text
    line.bbox = bbox
    line.polygon = polygon
    if words:
        line.words = words
    else:
        line.confidence = 0.95
        # A MagicMock answers hasattr() with True for any name, so the
        # attribute has to be deleted for a hasattr(line, "words") check
        # to come out false.
        del line.words
    return line


def test_surya_extract_page_blocks_returns_blocks_for_single_image():
    """One recognised line yields one block with page-relative geometry."""
    page = Image.new("RGB", (100, 200))
    page_w, page_h = page.size
    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Hallo Welt", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=2, language="de")

    assert len(blocks) == 1
    block = blocks[0]
    assert block["pageNumber"] == 2
    assert block["text"] == "Hallo Welt"
    assert block["x"] == 10 / page_w
    assert block["y"] == 20 / page_h
    assert block["width"] == 80 / page_w
    assert block["height"] == 20 / page_h


def test_surya_extract_page_blocks_extracts_polygon_when_present():
    """A line polygon is reported as four page-relative points."""
    page = Image.new("RGB", (100, 200))
    outline = [(10, 20), (90, 20), (90, 40), (10, 40)]
    prediction = MagicMock()
    prediction.text_lines = [
        _make_surya_line("Text", [10, 20, 90, 40], polygon=outline)
    ]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=1, language="de")

    polygon = blocks[0]["polygon"]
    assert polygon is not None
    assert len(polygon) == 4
    assert polygon[0] == [10 / 100, 20 / 200]


def test_surya_extract_page_blocks_extracts_word_level_confidence():
    """Per-word text and confidence are carried into the block."""
    page = Image.new("RGB", (100, 200))
    hallo = MagicMock(text="Hallo", confidence=0.95)
    welt = MagicMock(text="Welt", confidence=0.3)
    prediction = MagicMock()
    prediction.text_lines = [
        _make_surya_line("Hallo Welt", [10, 20, 90, 40], words=[hallo, welt])
    ]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=1, language="de")

    words = blocks[0]["words"]
    assert len(words) == 2
    assert words[0]["text"] == "Hallo"
    assert words[0]["confidence"] == 0.95


def test_surya_extract_blocks_delegates_to_extract_page_blocks():
    """extract_blocks over two pages numbers the resulting blocks 1 and 2."""
    pages = [Image.new("RGB", (100, 200)) for _ in range(2)]
    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Test", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_blocks(pages)

    assert len(blocks) == 2
    assert [block["pageNumber"] for block in blocks] == [1, 2]


# ─── Kraken extract_page_blocks ──────────────────────────────────────────────


def _make_kraken_record(text, cuts, confidences=None):
    """Build a mock Kraken record for ``text`` located at ``cuts``.

    ``cuts`` is used for both the ``cuts`` and ``line`` attributes. When no
    confidences are supplied, every character gets a confidence of 0.9.
    """
    record = MagicMock()
    record.prediction = text
    record.cuts = cuts
    record.line = cuts
    record.confidences = confidences if confidences else [0.9] * len(text)
    return record


def _run_kraken_with_mocks(fn, *args):
    """Call ``fn(*args)`` with kraken's blla/rpred stubbed in ``sys.modules``.

    The stubbed recogniser returns a single "Kurrent" record, and the engine's
    ``_model`` is patched with a mock for the duration of the call. Whatever
    was registered under the three module names beforehand is put back
    afterwards, or the name is removed if nothing was registered.
    """
    blla_stub = MagicMock()
    blla_stub.segment.return_value = MagicMock()
    rpred_stub = MagicMock()
    rpred_stub.rpred.return_value = [
        _make_kraken_record("Kurrent", [(10, 20), (90, 20), (90, 40), (10, 40)])
    ]

    stubs = {
        "kraken": MagicMock(blla=blla_stub, rpred=rpred_stub),
        "kraken.blla": blla_stub,
        "kraken.rpred": rpred_stub,
    }
    # Snapshot before installing the stubs so the finally clause can restore.
    previous = {name: sys.modules.get(name) for name in stubs}
    sys.modules.update(stubs)
    try:
        with patch.object(kraken, "_model", MagicMock()):
            return fn(*args)
    finally:
        for name, module in previous.items():
            if module is None:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module


def test_kraken_extract_page_blocks_returns_blocks_for_single_image():
    """One recognised record yields one block tagged with the page index."""
    page = Image.new("RGB", (100, 200))

    blocks = _run_kraken_with_mocks(kraken.extract_page_blocks, page, 3, "de")

    assert len(blocks) == 1
    assert blocks[0]["pageNumber"] == 3
    assert blocks[0]["text"] == "Kurrent"


def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
    """extract_blocks over two pages numbers the resulting blocks 1 and 2."""
    pages = [Image.new("RGB", (100, 200)) for _ in range(2)]

    blocks = _run_kraken_with_mocks(kraken.extract_blocks, pages)

    assert len(blocks) == 2
    assert [block["pageNumber"] for block in blocks] == [1, 2]