feat(ocr): increment ocr_jobs_total with engine and script_type labels
Pick engine="kraken" for HANDWRITING_KURRENT and engine="surya" otherwise, then increment the ocr_jobs_total counter after the blocks have been extracted.

Refs #652 (AC2)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -106,6 +106,7 @@ async def run_ocr(request: OcrRequest):
|
||||
del img
|
||||
|
||||
script_type = request.scriptType.upper()
|
||||
engine_name = "kraken" if script_type == "HANDWRITING_KURRENT" else "surya"
|
||||
|
||||
if script_type == "HANDWRITING_KURRENT":
|
||||
if not kraken_engine.is_available():
|
||||
@@ -119,6 +120,8 @@ async def run_ocr(request: OcrRequest):
|
||||
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
|
||||
blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language)
|
||||
|
||||
metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc()
|
||||
|
||||
threshold = get_threshold(script_type)
|
||||
for block in blocks:
|
||||
if block.get("words"):
|
||||
|
||||
@@ -91,3 +91,41 @@ def test_build_metrics_registers_all_custom_metrics_on_given_registry():
|
||||
# A second registry yields a separate container — no shared state.
|
||||
other_metrics = build_metrics(CollectorRegistry())
|
||||
assert metrics is not other_metrics
|
||||
|
||||
|
||||
async def _drive_ocr(client: AsyncClient, *, script_type: str) -> None:
    """Post one OCR request to ``/ocr`` and require a 200 response.

    The request always targets the same PDF URL with language ``"de"``;
    only the script type varies. On a non-200 status the assertion fails
    with the response body as its message.

    Args:
        client: HTTP client used to send the request.
        script_type: Value sent as ``scriptType`` in the JSON payload.

    NOTE(review): callers are expected to have patched the PDF download
    and the OCR engines beforehand — nothing is mocked in this helper.
    """
    payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": script_type,
        "language": "de",
    }
    reply = await client.post("/ocr", json=payload)
    assert reply.status_code == 200, reply.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(fresh_metrics):
    """A /ocr call with HANDWRITING_KURRENT increments engine=kraken.

    Every external dependency of the endpoint that this test touches is
    patched: model loading, spell checking, PDF download, page
    preprocessing and the Kraken engine. One request is sent, then the
    ``ocr_jobs_total`` child labelled ``engine="kraken"`` and
    ``script_type="HANDWRITING_KURRENT"`` must read exactly 1.0.

    Args:
        fresh_metrics: Fixture exposing the ``ocr_jobs_total`` counter.
            Presumably backed by a per-test registry so counts start at
            zero — confirm against the fixture definition.
    """
    mock_images = [Image.new("RGB", (100, 100))]
    mock_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                    "polygon": None, "text": "hi", "words": []}]

    # `_models_ready` is swapped via patch() rather than assigned by hand so
    # the value it had before the test is restored on exit, instead of being
    # forced to False regardless of its prior state.
    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"), \
         patch("main.correct_text", side_effect=lambda t: t), \
         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.kraken_engine.is_available", return_value=True), \
         patch("main.kraken_engine.extract_blocks", return_value=mock_blocks), \
         patch("main._models_ready", True):
        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
            await _drive_ocr(client, script_type="HANDWRITING_KURRENT")

    # NOTE(review): `_value` is a private attribute of the counter child;
    # kept as in the original because the fixture's public surface is not
    # visible from this file.
    value = fresh_metrics.ocr_jobs_total.labels(
        engine="kraken", script_type="HANDWRITING_KURRENT"
    )._value.get()
    assert value == 1.0
|
||||
|
||||
Reference in New Issue
Block a user