feat(ocr): count words and illegible words at the OCR call sites

Walks block["words"] before apply_confidence_markers strips the list, then
increments ocr_words_total by len(words) and ocr_illegible_words_total by
the count below threshold. Same pattern in both /ocr and /ocr/stream so the
ratio illegible/words is a faithful quality signal across endpoints.

Refs #652 (AC4)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-21 16:07:59 +02:00
parent 3fa3460dbf
commit 131ed336bc
2 changed files with 51 additions and 4 deletions

View File

@@ -124,8 +124,13 @@ async def run_ocr(request: OcrRequest):
threshold = get_threshold(script_type)
for block in blocks:
if block.get("words"):
block["text"] = apply_confidence_markers(block["words"], threshold)
words = block.get("words") or []
if words:
metrics.ocr_words_total.inc(len(words))
metrics.ocr_illegible_words_total.inc(
sum(1 for w in words if w["confidence"] < threshold)
)
block["text"] = apply_confidence_markers(words, threshold)
block.pop("words", None)
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
block["text"] = correct_text(block["text"])
@@ -258,8 +263,13 @@ async def run_ocr_stream(request: OcrRequest):
)
for block in blocks:
if block.get("words"):
block["text"] = apply_confidence_markers(block["words"], threshold)
words = block.get("words") or []
if words:
metrics.ocr_words_total.inc(len(words))
metrics.ocr_illegible_words_total.inc(
sum(1 for w in words if w["confidence"] < threshold)
)
block["text"] = apply_confidence_markers(words, threshold)
block.pop("words", None)
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
block["text"] = correct_text(block["text"])