From 79edb9455810dcfad65fb5bea05a690443ec2995 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:05:36 +0200 Subject: [PATCH] feat(ocr): increment ocr_pages_total per successful page in stream Bumps the counter inside both the standard and guided /ocr/stream generators after a page yields its blocks, before the per-page json line is emitted. Also moves the ocr_jobs_total increment for /ocr/stream right after engine selection so the counter still fires when a page later errors out. Refs #652 (AC3a) Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 5 +++++ ocr-service/test_metrics.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 83a092ee..08990e24 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -156,6 +156,9 @@ async def run_ocr_stream(request: OcrRequest): ) engine = kraken_engine if use_kraken else surya_engine + engine_name = "kraken" if use_kraken else "surya" + + metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc() if request.regions: # Guided mode: recognize only the user-drawn annotation regions @@ -206,6 +209,7 @@ async def run_ocr_stream(request: OcrRequest): }) total_blocks += len(blocks) + metrics.ocr_pages_total.labels(engine=engine_name).inc() yield json.dumps({ "type": "page", "pageNumber": page_idx, @@ -260,6 +264,7 @@ async def run_ocr_stream(request: OcrRequest): block["text"] = correct_text(block["text"]) total_blocks += len(blocks) + metrics.ocr_pages_total.labels(engine=engine_name).inc() yield json.dumps({ "type": "page", "pageNumber": page_idx, diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index f034a31c..f91bd0af 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -155,3 +155,35 @@ async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter engine="surya", script_type="TYPEWRITER" )._value.get() assert value == 1.0 + + 
+@pytest.mark.asyncio
+async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics):
+    """Streaming a three-page TYPEWRITER request through /ocr/stream leaves ocr_pages_total for engine "surya" at 3."""
+    mock_images = [Image.new("RGB", (100, 100)) for _ in range(3)]  # three 100x100 RGB pages -> three expected increments
+    mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
+                    "polygon": None, "text": "hi", "words": []}]  # the same one-block list is returned for every page
+
+    with patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main.preprocess_page", side_effect=lambda img: img), \
+         patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            import main as main_module
+            main_module._models_ready = True  # NOTE(review): presumably the endpoint rejects requests while this is False - confirm in main.py
+            try:
+                async with client.stream("POST", "/ocr/stream", json={
+                    "pdfUrl": "http://minio/doc.pdf",
+                    "scriptType": "TYPEWRITER",
+                    "language": "de",
+                }) as response:
+                    assert response.status_code == 200
+                    # Drain the stream so all per-page increments fire.
+                    async for _ in response.aiter_lines():
+                        pass
+            finally:
+                main_module._models_ready = False  # runs even if an assertion above fails; sets False rather than restoring the prior value
+
+    value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get()  # reads the labelled counter through its private _value attribute
+    assert value == 3.0