feat(ocr): count words and illegible words at the OCR call sites

Walks block["words"] before apply_confidence_markers runs and the list is popped from the block, then increments ocr_words_total by len(words) and ocr_illegible_words_total by the number of words whose confidence is below the threshold. The same pattern is applied in both /ocr and /ocr/stream, so the ratio ocr_illegible_words_total / ocr_words_total is a faithful quality signal across endpoints.

Refs #652 (AC4)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 16:07:59 +02:00
parent 3fa3460dbf
commit 131ed336bc
2 changed files with 51 additions and 4 deletions
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -124,8 +124,13 @@ async def run_ocr(request: OcrRequest):

    threshold = get_threshold(script_type)
    for block in blocks:
-        if block.get("words"):
-            block["text"] = apply_confidence_markers(block["words"], threshold)
+        words = block.get("words") or []
+        if words:
+            metrics.ocr_words_total.inc(len(words))
+            metrics.ocr_illegible_words_total.inc(
+                sum(1 for w in words if w["confidence"] < threshold)
+            )
+            block["text"] = apply_confidence_markers(words, threshold)
        block.pop("words", None)
        if script_type in _SPELL_CHECK_SCRIPT_TYPES:
            block["text"] = correct_text(block["text"])
@@ -258,8 +263,13 @@ async def run_ocr_stream(request: OcrRequest):
                )

                for block in blocks:
-                    if block.get("words"):
-                        block["text"] = apply_confidence_markers(block["words"], threshold)
+                    words = block.get("words") or []
+                    if words:
+                        metrics.ocr_words_total.inc(len(words))
+                        metrics.ocr_illegible_words_total.inc(
+                            sum(1 for w in words if w["confidence"] < threshold)
+                        )
+                        block["text"] = apply_confidence_markers(words, threshold)
                    block.pop("words", None)
                    if script_type in _SPELL_CHECK_SCRIPT_TYPES:
                        block["text"] = correct_text(block["text"])