diff --git a/ocr-service/main.py b/ocr-service/main.py index 9522d8ae..12a77a45 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -124,8 +124,13 @@ async def run_ocr(request: OcrRequest): threshold = get_threshold(script_type) for block in blocks: - if block.get("words"): - block["text"] = apply_confidence_markers(block["words"], threshold) + words = block.get("words") or [] + if words: + metrics.ocr_words_total.inc(len(words)) + metrics.ocr_illegible_words_total.inc( + sum(1 for w in words if w["confidence"] < threshold) + ) + block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: block["text"] = correct_text(block["text"]) @@ -258,8 +263,13 @@ async def run_ocr_stream(request: OcrRequest): ) for block in blocks: - if block.get("words"): - block["text"] = apply_confidence_markers(block["words"], threshold) + words = block.get("words") or [] + if words: + metrics.ocr_words_total.inc(len(words)) + metrics.ocr_illegible_words_total.inc( + sum(1 for w in words if w["confidence"] < threshold) + ) + block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: block["text"] = correct_text(block["text"]) diff --git a/ocr-service/test_metrics.py b/ocr-service/test_metrics.py index 627a78d7..a13db209 100644 --- a/ocr-service/test_metrics.py +++ b/ocr-service/test_metrics.py @@ -230,3 +230,40 @@ async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0 # The second page still succeeds. assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0 + + +@pytest.mark.asyncio +async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metrics): + """Counters reflect totals summed over every block in the request. + + Threshold defaults to THRESHOLD_DEFAULT (0.3) for non-Kurrent scripts. Two + blocks: 3 words above + 2 words below threshold across blocks. + """ + mock_images = [Image.new("RGB", (100, 100))] + mock_blocks = [ + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "ignored", + "words": [{"text": "Lieber", "confidence": 0.9}, + {"text": "Freund", "confidence": 0.1}]}, + {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, + "polygon": None, "text": "ignored", + "words": [{"text": "Gruss", "confidence": 0.8}, + {"text": "verschmiert", "confidence": 0.05}, + {"text": "Karl", "confidence": 0.95}]}, + ] + + with patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \ + patch("main.preprocess_page", side_effect=lambda img: img), \ + patch("main.surya_engine.extract_blocks", return_value=mock_blocks): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + import main as main_module + main_module._models_ready = True + try: + await _drive_ocr(client, script_type="TYPEWRITER") + finally: + main_module._models_ready = False + + assert fresh_metrics.ocr_words_total._value.get() == 5.0 + assert fresh_metrics.ocr_illegible_words_total._value.get() == 2.0