feat(ocr): time only engine work in guided stream histogram
Previously the guided generator's page_started timer wrapped the entire region loop, including the synchronous correct_text() call, which inflated ocr_processing_seconds with spell-check latency. Sum the per-region engine.extract_region_text durations instead so that the histogram matches the unguided stream's "engine only" semantics.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -444,6 +444,89 @@ async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metric
|
||||
assert fresh_metrics.ocr_models_ready._value.get() == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stream(fresh_metrics):
    """Guided /ocr/stream records one ocr_processing_seconds sample per page.

    Two pages with one region each are streamed while the Surya region
    extractor is patched to return a constant string. After the stream is
    drained, the histogram series labelled engine="surya" must hold exactly
    two observations with a non-negative sum.
    """
    page_images = [Image.new("RGB", (100, 100)) for _ in range(2)]
    # One region on page 1 and one on page 2, so two pages are processed.
    request_body = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
        "regions": [
            {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"},
            {"pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2"},
        ],
    }

    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"), \
         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=page_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_region_text", return_value="text"):
        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            import main as main_module

            # The readiness flag is flipped by hand for the duration of the request.
            main_module._models_ready = True
            try:
                async with client.stream("POST", "/ocr/stream", json=request_body) as response:
                    assert response.status_code == 200
                    # Consume the whole stream so every page is processed.
                    async for _ in response.aiter_lines():
                        pass
            finally:
                main_module._models_ready = False

    observed_sum, observed_count = _histogram_count_sum(
        fresh_metrics.ocr_processing_seconds, engine="surya"
    )
    assert observed_count == 2.0
    assert observed_sum >= 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics):
    """The guided observation must time engine work only, not the spell-check pass.

    Both regions sit on the same page. ``correct_text`` is replaced with a
    stub that sleeps for a fixed delay per call, while the Kraken region
    extractor is patched to return immediately. The recorded histogram sum
    for engine="kraken" must therefore stay below a single spell-check delay.
    """
    # Single source of truth for the stub's sleep and the assertion threshold,
    # so the explanatory comment below cannot drift from the code.
    spell_check_delay = 0.05

    mock_images = [Image.new("RGB", (100, 100))]
    regions = [
        {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"},
        {"pageNumber": 1, "x": 0.5, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a2"},
    ]

    def slow_correct(text):
        """Stand-in for correct_text: block for the delay, return the text unchanged."""
        import time as _time

        _time.sleep(spell_check_delay)
        return text

    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"), \
         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.kraken_engine.is_available", return_value=True), \
         patch("main.kraken_engine.extract_region_text", return_value="text"), \
         patch("main.correct_text", side_effect=slow_correct):
        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            import main as main_module

            # The readiness flag is flipped by hand for the duration of the request.
            main_module._models_ready = True
            try:
                async with client.stream("POST", "/ocr/stream", json={
                    "pdfUrl": "http://minio/doc.pdf",
                    "scriptType": "HANDWRITING_KURRENT",
                    "language": "de",
                    "regions": regions,
                }) as response:
                    assert response.status_code == 200
                    # Consume the whole stream so every region is processed.
                    async for _ in response.aiter_lines():
                        pass
            finally:
                main_module._models_ready = False

    sum_seconds, _ = _histogram_count_sum(
        fresh_metrics.ocr_processing_seconds, engine="kraken"
    )
    # Spell-check sleeps 0.05s per region x 2 regions = 0.1s; the mocked engine call
    # returns immediately. If timing included spell-check, sum_seconds would be >= 0.1s.
    # The threshold is one spell-check delay (0.05s), i.e. 50ms of headroom for
    # scheduler overhead on the engine-only measurement.
    assert sum_seconds < spell_check_delay, f"timing must exclude spell-check; got sum={sum_seconds}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics):
|
||||
"""If `_download_and_convert_pdf` raises, ocr_jobs_total is NOT incremented.
|
||||
|
||||
Reference in New Issue
Block a user