From 74ddf16b01cfdd727eb4465347b2f4bc0b92d609 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:53:04 +0200 Subject: [PATCH] feat(ocr): time only engine work in guided stream histogram Previously the guided generator's page_started timer wrapped the entire region loop including the synchronous correct_text() call, inflating ocr_processing_seconds with spell-check latency. Sum the per-region engine.extract_region_text durations instead so the histogram matches the unguided stream's "engine only" semantic. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 6 ++- ocr-service/test_metrics.py | 83 +++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index bf26ef35..31fd9cad 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -217,13 +217,15 @@ async def run_ocr_stream(request: OcrRequest): image = await asyncio.to_thread(preprocess_page, image) blocks = [] sender_path = request.senderModelPath if use_kraken else None - page_started = time.monotonic() + engine_seconds = 0.0 for region in page_regions: + region_started = time.monotonic() text = await asyncio.to_thread( engine.extract_region_text, image, region.x, region.y, region.width, region.height, sender_path, ) + engine_seconds += time.monotonic() - region_started if script_type in _SPELL_CHECK_SCRIPT_TYPES: text = correct_text(text) blocks.append({ @@ -238,7 +240,7 @@ async def run_ocr_stream(request: OcrRequest): }) metrics.ocr_processing_seconds.labels(engine=engine_name).observe( - time.monotonic() - page_started + engine_seconds ) total_blocks += len(blocks) metrics.ocr_pages_total.labels(engine=engine_name).inc() diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 783e1d25..3ed02c31 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -444,6 +444,89 @@ async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metric assert fresh_metrics.ocr_models_ready._value.get() == 1.0 +@pytest.mark.asyncio +async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stream(fresh_metrics): + """The guided streaming generator observes ocr_processing_seconds once per page.""" + mock_images = [Image.new("RGB", (100, 100)) for _ in range(2)] + regions = [ + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}, + {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"}, + ] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_region_text", return_value="text"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "TYPEWRITER", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass + finally: + main_module._models_ready = False + + sum_seconds, count = _histogram_count_sum( + fresh_metrics.ocr_processing_seconds, engine="surya" + ) + assert count == 2.0 + assert sum_seconds >= 0.0 + + +@pytest.mark.asyncio +async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics): + """The guided observation must time engine work only, not the spell-check pass.""" + mock_images = [Image.new("RGB", (100, 100))] + regions = [ + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"}, + {"pageNumber": 1, "x": 0.5, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a2"}, + ] + + def slow_correct(text): + import time as _time + _time.sleep(0.05) + return text + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.kraken_engine.is_available", return_value=True), \ + patch("main.kraken_engine.extract_region_text", return_value="text"), \ + patch("main.correct_text", side_effect=slow_correct): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + async with client.stream("POST", "/ocr/stream", json={ + "pdfUrl": "http://minio/doc.pdf", + "scriptType": "HANDWRITING_KURRENT", + "language": "de", + "regions": regions, + }) as response: + assert response.status_code == 200 + async for _ in response.aiter_lines(): + pass + finally: + main_module._models_ready = False + + sum_seconds, _ = _histogram_count_sum( + fresh_metrics.ocr_processing_seconds, engine="kraken" + ) + # Spell-check sleeps 0.05s per region × 2 regions = 0.1s; engine work is instantaneous. + # If timing included spell-check, sum_seconds would be >= 0.1s. Allow 30ms slack + # for scheduler overhead. + assert sum_seconds < 0.05, f"timing must exclude spell-check; got sum={sum_seconds}" + + @pytest.mark.asyncio async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics): """If `_download_and_convert_pdf` raises, ocr_jobs_total is NOT incremented.