From 696b71da5abe63915f823ac1a58740c51b89c79e Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:03:37 +0200 Subject: [PATCH] feat(ocr): increment ocr_jobs_total with engine and script_type labels Pick engine="kraken" for HANDWRITING_KURRENT, engine="surya" otherwise, then increment after the blocks have been extracted. Refs #652 (AC2) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 3 +++ ocr-service/test_metrics.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 0d068d15..83a092ee 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -106,6 +106,7 @@ async def run_ocr(request: OcrRequest): del img script_type = request.scriptType.upper() + engine_name = "kraken" if script_type == "HANDWRITING_KURRENT" else "surya" if script_type == "HANDWRITING_KURRENT": if not kraken_engine.is_available(): @@ -119,6 +120,8 @@ async def run_ocr(request: OcrRequest): # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language) + metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc() + threshold = get_threshold(script_type) for block in blocks: if block.get("words"): diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 0906beec..2f7b0db8 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -91,3 +91,41 @@ def test_build_metrics_registers_all_custom_metrics_on_given_registry(): # A second registry yields a separate container — no shared state. 
other_metrics = build_metrics(CollectorRegistry()) assert metrics is not other_metrics
+
+
+async def _drive_ocr(client: AsyncClient, *, script_type: str) -> None:
+    """Helper — POSTs a fixed /ocr request for ``script_type`` and asserts a 200 (callers set up mocks)."""
+    response = await client.post("/ocr", json={
+        "pdfUrl": "http://minio/doc.pdf",
+        "scriptType": script_type,
+        "language": "de",
+    })
+    assert response.status_code == 200, response.text
+
+
+@pytest.mark.asyncio
+async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(fresh_metrics):
+    """One successful HANDWRITING_KURRENT /ocr call sets ocr_jobs_total for engine=kraken to 1."""
+    mock_images = [Image.new("RGB", (100, 100))]
+    mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
+                    "polygon": None, "text": "hi", "words": []}]
+
+    with patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         patch("main.correct_text", side_effect=lambda t: t), \
+         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main.preprocess_page", side_effect=lambda img: img), \
+         patch("main.kraken_engine.is_available", return_value=True), \
+         patch("main.kraken_engine.extract_blocks", return_value=mock_blocks):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            import main as main_module
+            # patch.object puts the previous flag value back on exit (also when the
+            # request assertion fails) instead of unconditionally forcing it to False.
+            with patch.object(main_module, "_models_ready", True):
+                await _drive_ocr(client, script_type="HANDWRITING_KURRENT")
+
+    # NOTE(review): reads the labelled child through its private ``_value`` attribute.
+    value = fresh_metrics.ocr_jobs_total.labels(
+        engine="kraken", script_type="HANDWRITING_KURRENT"
+    )._value.get()
+    assert value == 1.0