feat(ocr): increment ocr_jobs_total with engine and script_type labels

Pick engine="kraken" for HANDWRITING_KURRENT, engine="surya" otherwise,
then increment after the blocks have been extracted.

Refs #652 (AC2)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-21 16:03:37 +02:00
parent f3e3545d06
commit 696b71da5a
2 changed files with 41 additions and 0 deletions

View File

@@ -106,6 +106,7 @@ async def run_ocr(request: OcrRequest):
del img
script_type = request.scriptType.upper()
engine_name = "kraken" if script_type == "HANDWRITING_KURRENT" else "surya"
if script_type == "HANDWRITING_KURRENT":
if not kraken_engine.is_available():
@@ -119,6 +120,8 @@ async def run_ocr(request: OcrRequest):
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language)
metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc()
threshold = get_threshold(script_type)
for block in blocks:
if block.get("words"):

View File

@@ -91,3 +91,41 @@ def test_build_metrics_registers_all_custom_metrics_on_given_registry():
# A second registry yields a separate container — no shared state.
other_metrics = build_metrics(CollectorRegistry())
assert metrics is not other_metrics
async def _drive_ocr(client: AsyncClient, *, script_type: str) -> None:
    """POST one OCR request to ``/ocr`` and require an HTTP 200 answer.

    Args:
        client: HTTP client used to send the request.
        script_type: Value sent as the ``scriptType`` field of the payload.

    Raises:
        AssertionError: If the response status is not 200; the response
            body is used as the assertion message.
    """
    payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": script_type,
        "language": "de",
    }
    reply = await client.post("/ocr", json=payload)
    assert reply.status_code == 200, reply.text
@pytest.mark.asyncio
async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(fresh_metrics):
    """A /ocr call with HANDWRITING_KURRENT increments engine=kraken.

    The PDF download, page preprocessing, spell correction and the Kraken
    engine are all patched, so the request only exercises the endpoint's
    own control flow and the metric increment.
    """
    mock_images = [Image.new("RGB", (100, 100))]
    mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}]

    # The readiness flag is patched rather than assigned: patch() puts back
    # whatever value main._models_ready had before the test, instead of
    # forcing it to False and leaking that state into later tests.
    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"), \
         patch("main.correct_text", side_effect=lambda t: t), \
         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.kraken_engine.is_available", return_value=True), \
         patch("main.kraken_engine.extract_blocks", return_value=mock_blocks), \
         patch("main._models_ready", True):
        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            await _drive_ocr(client, script_type="HANDWRITING_KURRENT")

    # NOTE(review): assumes the fresh_metrics fixture supplies the container
    # that main increments, so the counter starts at zero — confirm.
    value = fresh_metrics.ocr_jobs_total.labels(
        engine="kraken", script_type="HANDWRITING_KURRENT"
    )._value.get()
    assert value == 1.0