feat(ocr): increment ocr_pages_total per successful page in stream
Bumps the ocr_pages_total counter inside both the standard and guided /ocr/stream generators after a page yields its blocks and before the per-page JSON line is emitted.

Also moves the ocr_jobs_total increment for /ocr/stream to right after engine selection, so the counter still fires when a page later errors out.

Refs #652 (AC3a)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -156,6 +156,9 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
)
|
)
|
||||||
|
|
||||||
engine = kraken_engine if use_kraken else surya_engine
|
engine = kraken_engine if use_kraken else surya_engine
|
||||||
|
engine_name = "kraken" if use_kraken else "surya"
|
||||||
|
|
||||||
|
metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc()
|
||||||
|
|
||||||
if request.regions:
|
if request.regions:
|
||||||
# Guided mode: recognize only the user-drawn annotation regions
|
# Guided mode: recognize only the user-drawn annotation regions
|
||||||
@@ -206,6 +209,7 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
})
|
})
|
||||||
|
|
||||||
total_blocks += len(blocks)
|
total_blocks += len(blocks)
|
||||||
|
metrics.ocr_pages_total.labels(engine=engine_name).inc()
|
||||||
yield json.dumps({
|
yield json.dumps({
|
||||||
"type": "page",
|
"type": "page",
|
||||||
"pageNumber": page_idx,
|
"pageNumber": page_idx,
|
||||||
@@ -260,6 +264,7 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
block["text"] = correct_text(block["text"])
|
block["text"] = correct_text(block["text"])
|
||||||
|
|
||||||
total_blocks += len(blocks)
|
total_blocks += len(blocks)
|
||||||
|
metrics.ocr_pages_total.labels(engine=engine_name).inc()
|
||||||
yield json.dumps({
|
yield json.dumps({
|
||||||
"type": "page",
|
"type": "page",
|
||||||
"pageNumber": page_idx,
|
"pageNumber": page_idx,
|
||||||
|
|||||||
@@ -155,3 +155,35 @@ async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter
|
|||||||
engine="surya", script_type="TYPEWRITER"
|
engine="surya", script_type="TYPEWRITER"
|
||||||
)._value.get()
|
)._value.get()
|
||||||
assert value == 1.0
|
assert value == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics):
    """The /ocr/stream generator increments ocr_pages_total per successful page.

    Streams a three-page TYPEWRITER request through a mocked Surya engine,
    drains the response, and checks that the ``engine="surya"`` sample of
    ``ocr_pages_total`` equals the number of pages.
    """
    page_count = 3
    mock_images = [Image.new("RGB", (100, 100)) for _ in range(page_count)]
    mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}]

    # The readiness flag is patched (not assigned) so its previous value is
    # restored on exit, even when the request or an assertion fails, instead
    # of being forced to False for whichever test runs next.
    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"), \
         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.surya_engine.extract_page_blocks", return_value=mock_blocks), \
         patch("main._models_ready", True):
        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            async with client.stream("POST", "/ocr/stream", json={
                "pdfUrl": "http://minio/doc.pdf",
                "scriptType": "TYPEWRITER",
                "language": "de",
            }) as response:
                assert response.status_code == 200
                # Drain the stream so all per-page increments fire.
                async for _ in response.aiter_lines():
                    pass

    value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get()
    assert value == float(page_count)
|
||||||
|
|||||||
Reference in New Issue
Block a user