From 74ddf16b01cfdd727eb4465347b2f4bc0b92d609 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Thu, 21 May 2026 16:53:04 +0200
Subject: [PATCH] feat(ocr): time only engine work in guided stream histogram

Previously the guided generator's page_started timer wrapped the entire
region loop including the synchronous correct_text() call, inflating
ocr_processing_seconds with spell-check latency. Sum the per-region
engine.extract_region_text durations instead so the histogram matches
the unguided stream's "engine only" semantic.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ocr-service/main.py         |  6 ++-
 ocr-service/test_metrics.py | 83 +++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/ocr-service/main.py b/ocr-service/main.py
index bf26ef35..31fd9cad 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -217,13 +217,15 @@ async def run_ocr_stream(request: OcrRequest):
                     image = await asyncio.to_thread(preprocess_page, image)
                     blocks = []
                     sender_path = request.senderModelPath if use_kraken else None
-                    page_started = time.monotonic()
+                    engine_seconds = 0.0
                     for region in page_regions:
+                        region_started = time.monotonic()
                         text = await asyncio.to_thread(
                             engine.extract_region_text, image,
                             region.x, region.y, region.width, region.height,
                             sender_path,
                         )
+                        engine_seconds += time.monotonic() - region_started
                         if script_type in _SPELL_CHECK_SCRIPT_TYPES:
                             text = correct_text(text)
                         blocks.append({
@@ -238,7 +240,7 @@ async def run_ocr_stream(request: OcrRequest):
                         })
 
                     metrics.ocr_processing_seconds.labels(engine=engine_name).observe(
-                        time.monotonic() - page_started
+                        engine_seconds
                     )
                     total_blocks += len(blocks)
                     metrics.ocr_pages_total.labels(engine=engine_name).inc()
diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py
index 783e1d25..3ed02c31 100644
--- a/ocr-service/test_metrics.py
+++ b/ocr-service/test_metrics.py
@@ -444,6 +444,89 @@ async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metric
             assert fresh_metrics.ocr_models_ready._value.get() == 1.0
 
 
+@pytest.mark.asyncio
+async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stream(fresh_metrics):
+    """The guided streaming generator observes ocr_processing_seconds once per page."""
+    mock_images = [Image.new("RGB", (100, 100)) for _ in range(2)]
+    regions = [
+        {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"},
+        {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"},
+    ]
+
+    with patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main.preprocess_page", side_effect=lambda img: img), \
+         patch("main.surya_engine.extract_region_text", return_value="text"):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            import main as main_module
+            main_module._models_ready = True
+            try:
+                async with client.stream("POST", "/ocr/stream", json={
+                    "pdfUrl": "http://minio/doc.pdf",
+                    "scriptType": "TYPEWRITER",
+                    "language": "de",
+                    "regions": regions,
+                }) as response:
+                    assert response.status_code == 200
+                    async for _ in response.aiter_lines():
+                        pass
+            finally:
+                main_module._models_ready = False
+
+    sum_seconds, count = _histogram_count_sum(
+        fresh_metrics.ocr_processing_seconds, engine="surya"
+    )
+    assert count == 2.0
+    assert sum_seconds >= 0.0
+
+
+@pytest.mark.asyncio
+async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics):
+    """The guided observation must time engine work only, not the spell-check pass."""
+    mock_images = [Image.new("RGB", (100, 100))]
+    regions = [
+        {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"},
+        {"pageNumber": 1, "x": 0.5, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a2"},
+    ]
+
+    def slow_correct(text):
+        import time as _time
+        _time.sleep(0.05)
+        return text
+
+    with patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main.preprocess_page", side_effect=lambda img: img), \
+         patch("main.kraken_engine.is_available", return_value=True), \
+         patch("main.kraken_engine.extract_region_text", return_value="text"), \
+         patch("main.correct_text", side_effect=slow_correct):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            import main as main_module
+            main_module._models_ready = True
+            try:
+                async with client.stream("POST", "/ocr/stream", json={
+                    "pdfUrl": "http://minio/doc.pdf",
+                    "scriptType": "HANDWRITING_KURRENT",
+                    "language": "de",
+                    "regions": regions,
+                }) as response:
+                    assert response.status_code == 200
+                    async for _ in response.aiter_lines():
+                        pass
+            finally:
+                main_module._models_ready = False
+
+    sum_seconds, _ = _histogram_count_sum(
+        fresh_metrics.ocr_processing_seconds, engine="kraken"
+    )
+    # Spell-check sleeps 0.05s per region × 2 regions = 0.1s; engine work is instantaneous.
+    # If timing included spell-check, sum_seconds would be >= 0.1s. Allow 30ms slack
+    # for scheduler overhead.
+    assert sum_seconds < 0.05, f"timing must exclude spell-check; got sum={sum_seconds}"
+
+
 @pytest.mark.asyncio
 async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics):
     """If `_download_and_convert_pdf` raises, ocr_jobs_total is NOT incremented.