NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. The main.py context indentation is a
best guess and the index blob ids predate the test edit; verify before applying.

diff --git a/ocr-service/main.py b/ocr-service/main.py
index 08990e24..9522d8ae 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -219,6 +219,7 @@ async def run_ocr_stream(request: OcrRequest):
             except Exception:
                 logger.exception("Guided OCR failed on page %d", page_idx)
                 skipped_pages += 1
+                metrics.ocr_skipped_pages_total.inc()
                 yield json.dumps({
                     "type": "error",
                     "pageNumber": page_idx,
@@ -274,6 +275,7 @@ async def run_ocr_stream(request: OcrRequest):
             except Exception:
                 logger.exception("OCR failed on page %d", page_idx)
                 skipped_pages += 1
+                metrics.ocr_skipped_pages_total.inc()
                 yield json.dumps({
                     "type": "error",
                     "pageNumber": page_idx,
diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py
index f91bd0af..627a78d7 100644
--- a/ocr-service/test_metrics.py
+++ b/ocr-service/test_metrics.py
@@ -187,3 +187,64 @@ async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics
 
     value = fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get()
     assert value == 3.0
+
+
+@pytest.mark.asyncio
+async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page(fresh_metrics):
+    """Verify a per-page engine failure bumps ocr_skipped_pages_total once.
+
+    The patched surya engine raises on its first call and returns one block on
+    every later call, so a two-image request must stream an error event, keep
+    going, and end with one skipped page and one counted page.
+    """
+    # Local import: streamed events are parsed rather than substring-matched,
+    # so the check does not depend on json.dumps' separator spacing.
+    import json
+
+    mock_images = [Image.new("RGB", (100, 100)) for _ in range(2)]
+    good_blocks = [{"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
+                    "polygon": None, "text": "ok", "words": []}]
+
+    call_count = {"n": 0}
+
+    def extract_side_effect(*args, **kwargs):
+        # Fail only the first engine call; every later call succeeds.
+        call_count["n"] += 1
+        if call_count["n"] == 1:
+            raise RuntimeError("synthetic engine failure")
+        return good_blocks
+
+    with patch("main.kraken_engine.load_models"), \
+         patch("main.load_spell_checker"), \
+         patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
+         patch("main.preprocess_page", side_effect=lambda img: img), \
+         patch("main.surya_engine.extract_page_blocks", side_effect=extract_side_effect):
+        async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+            import main as main_module
+            main_module._models_ready = True
+            try:
+                async with client.stream("POST", "/ocr/stream", json={
+                    "pdfUrl": "http://minio/doc.pdf",
+                    "scriptType": "TYPEWRITER",
+                    "language": "de",
+                }) as response:
+                    assert response.status_code == 200
+                    saw_error = False
+                    async for line in response.aiter_lines():
+                        if not line:
+                            continue
+                        try:
+                            event = json.loads(line)
+                        except ValueError:
+                            # Not a JSON event line; nothing to inspect.
+                            continue
+                        if isinstance(event, dict) and event.get("type") == "error":
+                            saw_error = True
+                    assert saw_error
+            finally:
+                # Always reset the flag so it cannot leak into other tests.
+                main_module._models_ready = False
+
+    assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0
+    # The second page still succeeds.
+    assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0