From 69768a104d2cfd683d4f155f43237980345d19c6 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 13 Apr 2026 10:34:23 +0200
Subject: [PATCH] test(ocr): add business-logic tests for polygon extraction,
 Kraken routing, and confidence markers

Cover Surya polygon/word-level extraction, health endpoint states,
Kraken script-type routing, 503 when models not ready, 400 when
Kraken unavailable for Kurrent, and confidence marker application
during streaming. Production code coverage: 88%.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 ocr-service/test_engines.py |  50 ++++++++++++++++
 ocr-service/test_stream.py  | 115 ++++++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+)

diff --git a/ocr-service/test_engines.py b/ocr-service/test_engines.py
index 8218978e..a7ceba27 100644
--- a/ocr-service/test_engines.py
+++ b/ocr-service/test_engines.py
@@ -45,6 +45,56 @@ def test_surya_extract_page_blocks_returns_blocks_for_single_image():
     assert blocks[0]["height"] == 20 / 200
 
 
+def test_surya_extract_page_blocks_extracts_polygon_when_present():
+    """Polygon points should be normalised by the image width and height."""
+    image = Image.new("RGB", (100, 200))
+
+    mock_line = _make_surya_line("Text", [10, 20, 90, 40])
+    mock_line.polygon = [(10, 20), (90, 20), (90, 40), (10, 40)]
+
+    mock_pred = MagicMock()
+    mock_pred.text_lines = [mock_line]
+
+    with patch.object(surya, "_recognition_predictor") as mock_rec, \
+         patch.object(surya, "_loaded", True):
+        mock_rec.return_value = [mock_pred]
+
+        blocks = surya.extract_page_blocks(image, page_idx=1, language="de")
+
+    assert blocks[0]["polygon"] is not None
+    assert len(blocks[0]["polygon"]) == 4
+    assert blocks[0]["polygon"][0] == [10 / 100, 20 / 200]
+
+
+def test_surya_extract_page_blocks_extracts_word_level_confidence():
+    """Every word of a line should be exposed with its text and confidence."""
+    image = Image.new("RGB", (100, 200))
+
+    word1 = MagicMock()
+    word1.text = "Hallo"
+    word1.confidence = 0.95
+    word2 = MagicMock()
+    word2.text = "Welt"
+    word2.confidence = 0.3
+
+    mock_line = _make_surya_line("Hallo Welt", [10, 20, 90, 40], words=[word1, word2])
+
+    mock_pred = MagicMock()
+    mock_pred.text_lines = [mock_line]
+
+    with patch.object(surya, "_recognition_predictor") as mock_rec, \
+         patch.object(surya, "_loaded", True):
+        mock_rec.return_value = [mock_pred]
+
+        blocks = surya.extract_page_blocks(image, page_idx=1, language="de")
+
+    assert len(blocks[0]["words"]) == 2
+    assert blocks[0]["words"][0]["text"] == "Hallo"
+    assert blocks[0]["words"][0]["confidence"] == 0.95
+    assert blocks[0]["words"][1]["text"] == "Welt"
+    assert blocks[0]["words"][1]["confidence"] == 0.3
+
+
 def test_surya_extract_blocks_delegates_to_extract_page_blocks():
     """After refactoring, extract_blocks should produce the same output."""
     image1 = Image.new("RGB", (100, 200))
diff --git a/ocr-service/test_stream.py b/ocr-service/test_stream.py
index 8b070430..5b9a9332 100644
--- a/ocr-service/test_stream.py
+++ b/ocr-service/test_stream.py
@@ -161,3 +161,118 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
     assert isinstance(data, list)
     assert len(data) == 2
     assert data[0]["pageNumber"] == 1
+
+
+# ─── Health and error handling ────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_health_returns_ok_when_models_ready():
+    """/health should report ok and both engines when models are ready."""
+    with patch("main._models_ready", True), \
+         patch("main.kraken_engine") as mock_kraken:
+        mock_kraken.is_available.return_value = True
+
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.get("/health")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["status"] == "ok"
+    assert data["surya"] is True
+    assert data["kraken"] is True
+
+
+@pytest.mark.asyncio
+async def test_health_returns_503_when_models_not_ready():
+    """/health should answer 503 while the models are not ready."""
+    with patch("main._models_ready", False):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.get("/health")
+
+    assert response.status_code == 503
+
+
+@pytest.mark.asyncio
+async def test_ocr_stream_returns_503_when_models_not_ready():
+    """/ocr/stream should answer 503 while the models are not ready."""
+    with patch("main._models_ready", False):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.post("/ocr/stream", json={
+                "pdfUrl": "http://minio/test.pdf",
+                "scriptType": "TYPEWRITER",
+            })
+
+    assert response.status_code == 503
+
+
+@pytest.mark.asyncio
+async def test_ocr_stream_uses_kraken_for_kurrent_script(mock_images):
+    """HANDWRITING_KURRENT requests should be routed to the Kraken engine."""
+    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main._models_ready", True), \
+         patch("main.kraken_engine") as mock_kraken:
+        mock_kraken.is_available.return_value = True
+        mock_kraken.extract_page_blocks.return_value = [_make_block(1, "Kurrent text")]
+
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.post("/ocr/stream", json={
+                "pdfUrl": "http://minio/test.pdf",
+                "scriptType": "HANDWRITING_KURRENT",
+            })
+
+    assert response.status_code == 200
+    lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
+    page_events = [event for event in lines if event["type"] == "page"]
+    assert len(page_events) == 3
+    mock_kraken.extract_page_blocks.assert_called()
+
+
+@pytest.mark.asyncio
+async def test_ocr_stream_returns_400_when_kraken_unavailable_for_kurrent(mock_images):
+    """Kurrent requests should be rejected with 400 when Kraken is unavailable."""
+    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main._models_ready", True), \
+         patch("main.kraken_engine") as mock_kraken:
+        mock_kraken.is_available.return_value = False
+
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.post("/ocr/stream", json={
+                "pdfUrl": "http://minio/test.pdf",
+                "scriptType": "HANDWRITING_KURRENT",
+            })
+
+    assert response.status_code == 400
+
+
+@pytest.mark.asyncio
+async def test_ocr_stream_applies_confidence_markers(mock_images):
+    """Low-confidence words should be replaced with [unleserlich] in the stream output."""
+    def page_blocks(image, page_idx, language="de"):
+        return [{
+            "pageNumber": page_idx,
+            "x": 0.1, "y": 0.2, "width": 0.8, "height": 0.1,
+            "polygon": None,
+            "text": "original text",
+            "words": [
+                {"text": "Lieber", "confidence": 0.95},
+                {"text": "xkqz", "confidence": 0.1},
+            ],
+        }]
+
+    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images[:1]), \
+         patch("main._models_ready", True), \
+         patch("main.surya_engine") as mock_surya:
+        mock_surya.extract_page_blocks.side_effect = page_blocks
+
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            response = await client.post("/ocr/stream", json={
+                "pdfUrl": "http://minio/test.pdf",
+                "scriptType": "TYPEWRITER",
+            })
+
+    assert response.status_code == 200
+    lines = [json.loads(line) for line in response.text.strip().split("\n") if line.strip()]
+    page = [event for event in lines if event["type"] == "page"][0]
+    assert page["blocks"][0]["text"] == "Lieber [unleserlich]"
+    assert "words" not in page["blocks"][0]