test(ocr): add business-logic tests for polygon extraction, Kraken routing, and confidence markers

Cover Surya polygon/word-level extraction, health endpoint states,
Kraken script-type routing, 503 when models not ready, 400 when
Kraken unavailable for Kurrent, and confidence marker application
during streaming. Production code coverage: 88%.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 10:34:23 +02:00
parent 97e5138934
commit 69768a104d
2 changed files with 154 additions and 0 deletions

View File

@@ -45,6 +45,52 @@ def test_surya_extract_page_blocks_returns_blocks_for_single_image():
assert blocks[0]["height"] == 20 / 200
def test_surya_extract_page_blocks_extracts_polygon_when_present():
    """A line polygon is expected back as four points scaled by the image size."""
    page_image = Image.new("RGB", (100, 200))
    line = _make_surya_line("Text", [10, 20, 90, 40])
    line.polygon = [(10, 20), (90, 20), (90, 40), (10, 40)]
    prediction = MagicMock()
    prediction.text_lines = [line]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page_image, page_idx=1, language="de")

    polygon = blocks[0]["polygon"]
    assert polygon is not None
    assert len(polygon) == 4
    # First corner (10, 20) divided by the 100x200 image dimensions.
    assert polygon[0] == [10 / 100, 20 / 200]
def test_surya_extract_page_blocks_extracts_word_level_confidence():
    """Each word of a recognised line is expected to carry its own confidence.

    Every word is checked, including the low-confidence one, so the test
    pins word order and verifies that a low score is reported as-is by the
    extraction step.
    """
    expected_words = [("Hallo", 0.95), ("Welt", 0.3)]

    image = Image.new("RGB", (100, 200))
    mock_words = []
    for text, confidence in expected_words:
        word = MagicMock()
        word.text = text
        word.confidence = confidence
        mock_words.append(word)
    mock_line = _make_surya_line("Hallo Welt", [10, 20, 90, 40], words=mock_words)
    mock_pred = MagicMock()
    mock_pred.text_lines = [mock_line]

    with patch.object(surya, "_recognition_predictor") as mock_rec, \
         patch.object(surya, "_loaded", True):
        mock_rec.return_value = [mock_pred]
        blocks = surya.extract_page_blocks(image, page_idx=1, language="de")

    words = blocks[0]["words"]
    assert len(words) == len(expected_words)
    # Compare every word, not just the first, so the low-confidence entry
    # ("Welt", 0.3) is covered as well.
    for actual, (text, confidence) in zip(words, expected_words):
        assert actual["text"] == text
        assert actual["confidence"] == confidence
def test_surya_extract_blocks_delegates_to_extract_page_blocks():
"""After refactoring, extract_blocks should produce the same output."""
image1 = Image.new("RGB", (100, 200))

View File

@@ -161,3 +161,111 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
assert isinstance(data, list)
assert len(data) == 2
assert data[0]["pageNumber"] == 1
# ─── Health and error handling ────────────────────────────────────────────────
@pytest.mark.asyncio
async def test_health_returns_ok_when_models_ready():
    """GET /health is expected to answer 200 with status "ok" and both engine flags set."""
    with patch("main._models_ready", True):
        with patch("main.kraken_engine") as kraken:
            kraken.is_available.return_value = True
            transport = ASGITransport(app=app)
            async with AsyncClient(transport=transport, base_url="http://test") as http:
                resp = await http.get("/health")

    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "ok"
    assert body["surya"] is True
    assert body["kraken"] is True
@pytest.mark.asyncio
async def test_health_returns_503_when_models_not_ready():
    """GET /health is expected to answer 503 while the models-ready flag is False."""
    with patch("main._models_ready", False):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http:
            resp = await http.get("/health")

    assert resp.status_code == 503
@pytest.mark.asyncio
async def test_ocr_stream_returns_503_when_models_not_ready():
    """POST /ocr/stream is expected to answer 503 while the models-ready flag is False."""
    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "TYPEWRITER",
    }

    with patch("main._models_ready", False):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http:
            resp = await http.post("/ocr/stream", json=request_body)

    assert resp.status_code == 503
@pytest.mark.asyncio
async def test_ocr_stream_uses_kraken_for_kurrent_script(mock_images):
    """A HANDWRITING_KURRENT stream request is expected to go through the Kraken engine."""
    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "HANDWRITING_KURRENT",
    }
    download = patch(
        "main._download_and_convert_pdf",
        new_callable=AsyncMock,
        return_value=mock_images,
    )

    with download:
        with patch("main._models_ready", True):
            with patch("main.kraken_engine") as kraken:
                kraken.is_available.return_value = True
                kraken.extract_page_blocks.return_value = [_make_block(1, "Kurrent text")]
                transport = ASGITransport(app=app)
                async with AsyncClient(transport=transport, base_url="http://test") as http:
                    resp = await http.post("/ocr/stream", json=request_body)

    # The response body is newline-delimited JSON; blank lines are skipped.
    events = []
    for raw_line in resp.text.strip().split("\n"):
        if raw_line.strip():
            events.append(json.loads(raw_line))

    page_events = [event for event in events if event["type"] == "page"]
    # Presumably one "page" event per image in the mock_images fixture
    # (three expected) -- confirm against the fixture definition.
    assert len(page_events) == 3
    kraken.extract_page_blocks.assert_called()
@pytest.mark.asyncio
async def test_ocr_stream_returns_400_when_kraken_unavailable_for_kurrent(mock_images):
    """A HANDWRITING_KURRENT request is expected to get 400 when Kraken reports itself unavailable."""
    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "HANDWRITING_KURRENT",
    }
    download = patch(
        "main._download_and_convert_pdf",
        new_callable=AsyncMock,
        return_value=mock_images,
    )

    with download:
        with patch("main._models_ready", True):
            with patch("main.kraken_engine") as kraken:
                kraken.is_available.return_value = False
                transport = ASGITransport(app=app)
                async with AsyncClient(transport=transport, base_url="http://test") as http:
                    resp = await http.post("/ocr/stream", json=request_body)

    assert resp.status_code == 400
@pytest.mark.asyncio
async def test_ocr_stream_applies_confidence_markers(mock_images):
    """Low-confidence words should be replaced with [unleserlich] in the stream output."""

    def fake_page_blocks(image, page_idx, language="de"):
        """Return one block with a high-confidence and a low-confidence word."""
        block = {
            "pageNumber": page_idx,
            "x": 0.1, "y": 0.2, "width": 0.8, "height": 0.1,
            "polygon": None,
            "text": "original text",
            "words": [
                {"text": "Lieber", "confidence": 0.95},
                {"text": "xkqz", "confidence": 0.1},
            ],
        }
        return [block]

    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "TYPEWRITER",
    }
    # Only the first mock image is served, so a single page is streamed.
    download = patch(
        "main._download_and_convert_pdf",
        new_callable=AsyncMock,
        return_value=mock_images[:1],
    )

    with download:
        with patch("main._models_ready", True):
            with patch("main.surya_engine") as surya_mock:
                surya_mock.extract_page_blocks.side_effect = fake_page_blocks
                transport = ASGITransport(app=app)
                async with AsyncClient(transport=transport, base_url="http://test") as http:
                    resp = await http.post("/ocr/stream", json=request_body)

    # The response body is newline-delimited JSON; blank lines are skipped.
    events = []
    for raw_line in resp.text.strip().split("\n"):
        if raw_line.strip():
            events.append(json.loads(raw_line))

    page_events = [event for event in events if event["type"] == "page"]
    first_block = page_events[0]["blocks"][0]
    assert first_block["text"] == "Lieber [unleserlich]"
    assert "words" not in first_block