test(ocr): add business-logic tests for polygon extraction, Kraken routing, and confidence markers
Cover Surya polygon and word-level extraction, health endpoint states, Kraken script-type routing, the 503 response when models are not ready, the 400 response when Kraken is unavailable for Kurrent, and confidence-marker application during streaming. Production code coverage: 88%.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -161,3 +161,111 @@ async def test_old_ocr_endpoint_still_returns_flat_list(mock_images):
|
||||
assert isinstance(data, list)
|
||||
assert len(data) == 2
|
||||
assert data[0]["pageNumber"] == 1
|
||||
|
||||
|
||||
# ─── Health and error handling ────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_health_returns_ok_when_models_ready():
    """GET /health answers 200 with status "ok" when models are ready.

    ``main._models_ready`` is patched to True and the patched
    ``main.kraken_engine`` reports itself as available; the payload must
    then flag both ``surya`` and ``kraken`` as True.
    """
    with patch("main._models_ready", True):
        with patch("main.kraken_engine") as kraken_stub:
            kraken_stub.is_available.return_value = True

            transport = ASGITransport(app=app)
            async with AsyncClient(transport=transport, base_url="http://test") as http:
                health = await http.get("/health")

            assert health.status_code == 200
            body = health.json()
            assert body["status"] == "ok"
            assert body["surya"] is True
            assert body["kraken"] is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_health_returns_503_when_models_not_ready():
    """GET /health answers 503 while ``main._models_ready`` is False."""
    models_not_ready = patch("main._models_ready", False)
    with models_not_ready:
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http:
            reply = await http.get("/health")

        assert reply.status_code == 503
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_stream_returns_503_when_models_not_ready():
    """POST /ocr/stream answers 503 while ``main._models_ready`` is False."""
    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "TYPEWRITER",
    }

    with patch("main._models_ready", False):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http:
            reply = await http.post("/ocr/stream", json=request_body)

        assert reply.status_code == 503
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_stream_uses_kraken_for_kurrent_script(mock_images):
    """A HANDWRITING_KURRENT request must be served by the Kraken engine.

    The PDF download/conversion is replaced by the ``mock_images`` fixture,
    models are marked ready, and ``main.kraken_engine`` is mocked as
    available. The stream must contain three ``page`` events (presumably one
    per image in ``mock_images`` -- verify against the fixture) and the
    Kraken mock's ``extract_page_blocks`` must have been called.
    """
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
            patch("main._models_ready", True), \
            patch("main.kraken_engine") as mock_kraken:
        mock_kraken.is_available.return_value = True
        mock_kraken.extract_page_blocks.return_value = [_make_block(1, "Kurrent text")]

        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            response = await client.post("/ocr/stream", json={
                "pdfUrl": "http://minio/test.pdf",
                "scriptType": "HANDWRITING_KURRENT",
            })

        # Check the status first: without this, an error response would only
        # surface as an opaque KeyError/JSONDecodeError while parsing below.
        assert response.status_code == 200, response.text

        # The body is newline-delimited JSON; skip blank lines.
        events = [
            json.loads(raw_line)
            for raw_line in response.text.strip().split("\n")
            if raw_line.strip()
        ]
        page_events = [event for event in events if event["type"] == "page"]
        assert len(page_events) == 3
        mock_kraken.extract_page_blocks.assert_called()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_stream_returns_400_when_kraken_unavailable_for_kurrent(mock_images):
    """POST /ocr/stream answers 400 for Kurrent when Kraken is unavailable.

    Models are marked ready and the PDF download is mocked, but the patched
    ``main.kraken_engine`` reports ``is_available() == False``.
    """
    request_body = {
        "pdfUrl": "http://minio/test.pdf",
        "scriptType": "HANDWRITING_KURRENT",
    }
    pdf_patch = patch(
        "main._download_and_convert_pdf",
        new_callable=AsyncMock,
        return_value=mock_images,
    )

    with pdf_patch, patch("main._models_ready", True), patch("main.kraken_engine") as kraken_stub:
        kraken_stub.is_available.return_value = False

        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http:
            reply = await http.post("/ocr/stream", json=request_body)

        assert reply.status_code == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_stream_applies_confidence_markers(mock_images):
    """Low-confidence words should be replaced with [unleserlich] in the stream output.

    The patched Surya engine returns one block whose ``words`` list holds a
    high-confidence word ("Lieber", 0.95) and a low-confidence one ("xkqz",
    0.1). The streamed block text must be rebuilt as "Lieber [unleserlich]"
    and the ``words`` key must not appear in the emitted block.
    """
    def page_blocks(image, page_idx, language="de"):
        # Stand-in for surya_engine.extract_page_blocks: a single block whose
        # raw "text" differs from what the word list should produce.
        return [{
            "pageNumber": page_idx,
            "x": 0.1, "y": 0.2, "width": 0.8, "height": 0.1,
            "polygon": None,
            "text": "original text",
            "words": [
                {"text": "Lieber", "confidence": 0.95},
                {"text": "xkqz", "confidence": 0.1},
            ],
        }]

    # Only the first mock image is used, so a single page is processed.
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images[:1]), \
            patch("main._models_ready", True), \
            patch("main.surya_engine") as mock_surya:
        mock_surya.extract_page_blocks.side_effect = page_blocks

        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            response = await client.post("/ocr/stream", json={
                "pdfUrl": "http://minio/test.pdf",
                "scriptType": "TYPEWRITER",
            })

        # Check the status first so an error response is reported as such
        # instead of as a parsing failure further down.
        assert response.status_code == 200, response.text

        # The body is newline-delimited JSON; skip blank lines.
        events = [
            json.loads(raw_line)
            for raw_line in response.text.strip().split("\n")
            if raw_line.strip()
        ]
        page_events = [event for event in events if event["type"] == "page"]
        assert page_events, "stream produced no page events"

        first_block = page_events[0]["blocks"][0]
        assert first_block["text"] == "Lieber [unleserlich]"
        assert "words" not in first_block
|
||||
|
||||
Reference in New Issue
Block a user