feat(ocr): count words and illegible words at the OCR call sites
Walks block["words"] before apply_confidence_markers strips the list, then increments ocr_words_total by len(words) and ocr_illegible_words_total by the count of words below the threshold. The same pattern is used in both /ocr and /ocr/stream, so the illegible/words ratio is a faithful quality signal across endpoints.

Refs #652 (AC4)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -124,8 +124,13 @@ async def run_ocr(request: OcrRequest):
|
|||||||
|
|
||||||
threshold = get_threshold(script_type)
|
threshold = get_threshold(script_type)
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
if block.get("words"):
|
words = block.get("words") or []
|
||||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
if words:
|
||||||
|
metrics.ocr_words_total.inc(len(words))
|
||||||
|
metrics.ocr_illegible_words_total.inc(
|
||||||
|
sum(1 for w in words if w["confidence"] < threshold)
|
||||||
|
)
|
||||||
|
block["text"] = apply_confidence_markers(words, threshold)
|
||||||
block.pop("words", None)
|
block.pop("words", None)
|
||||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
block["text"] = correct_text(block["text"])
|
block["text"] = correct_text(block["text"])
|
||||||
@@ -258,8 +263,13 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
)
|
)
|
||||||
|
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
if block.get("words"):
|
words = block.get("words") or []
|
||||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
if words:
|
||||||
|
metrics.ocr_words_total.inc(len(words))
|
||||||
|
metrics.ocr_illegible_words_total.inc(
|
||||||
|
sum(1 for w in words if w["confidence"] < threshold)
|
||||||
|
)
|
||||||
|
block["text"] = apply_confidence_markers(words, threshold)
|
||||||
block.pop("words", None)
|
block.pop("words", None)
|
||||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
block["text"] = correct_text(block["text"])
|
block["text"] = correct_text(block["text"])
|
||||||
|
|||||||
@@ -230,3 +230,40 @@ async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page
|
|||||||
assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0
|
assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0
|
||||||
# The second page still succeeds.
|
# The second page still succeeds.
|
||||||
assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0
|
assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metrics):
    """Word counters accumulate over every block returned for one request.

    The request is driven with script_type="TYPEWRITER"; the threshold
    defaults to THRESHOLD_DEFAULT (0.3) for non-Kurrent scripts. The stubbed
    engine yields two blocks holding five words in total: three with a
    confidence above the threshold and two below it.
    """

    def _full_page_block(words):
        # One block covering the whole page; only "words" varies per block.
        return {
            "pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
            "polygon": None, "text": "ignored",
            "words": words,
        }

    page_images = [Image.new("RGB", (100, 100))]
    engine_blocks = [
        _full_page_block([
            {"text": "Lieber", "confidence": 0.9},
            {"text": "Freund", "confidence": 0.1},
        ]),
        _full_page_block([
            {"text": "Gruss", "confidence": 0.8},
            {"text": "verschmiert", "confidence": 0.05},
            {"text": "Karl", "confidence": 0.95},
        ]),
    ]

    # Build the stubs up front; nothing is patched until the `with` enters them.
    skip_kraken_models = patch("main.kraken_engine.load_models")
    skip_spell_checker = patch("main.load_spell_checker")
    fake_pdf_download = patch(
        "main._download_and_convert_pdf",
        new_callable=AsyncMock,
        return_value=page_images,
    )
    passthrough_preprocess = patch("main.preprocess_page", side_effect=lambda img: img)
    fake_surya_blocks = patch("main.surya_engine.extract_blocks", return_value=engine_blocks)

    with skip_kraken_models, skip_spell_checker, fake_pdf_download, \
            passthrough_preprocess, fake_surya_blocks:
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as http_client:
            import main as ocr_main

            # Flip the readiness flag for the duration of the request only.
            ocr_main._models_ready = True
            try:
                await _drive_ocr(http_client, script_type="TYPEWRITER")
            finally:
                ocr_main._models_ready = False

    words_counted = fresh_metrics.ocr_words_total._value.get()
    illegible_counted = fresh_metrics.ocr_illegible_words_total._value.get()
    assert words_counted == 5.0
    assert illegible_counted == 2.0
|
||||||
|
|||||||
Reference in New Issue
Block a user