feat(ocr): count words and illegible words at the OCR call sites
Walks block["words"] before the list is stripped from the block (apply_confidence_markers runs, then block.pop("words") removes it), then increments ocr_words_total by len(words) and ocr_illegible_words_total by the number of words whose confidence is below the threshold. The same pattern is used in both /ocr and /ocr/stream, so the illegible/words ratio is a faithful quality signal across endpoints.

Refs #652 (AC4)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -124,8 +124,13 @@ async def run_ocr(request: OcrRequest):
|
||||
|
||||
threshold = get_threshold(script_type)
|
||||
for block in blocks:
|
||||
if block.get("words"):
|
||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||
words = block.get("words") or []
|
||||
if words:
|
||||
metrics.ocr_words_total.inc(len(words))
|
||||
metrics.ocr_illegible_words_total.inc(
|
||||
sum(1 for w in words if w["confidence"] < threshold)
|
||||
)
|
||||
block["text"] = apply_confidence_markers(words, threshold)
|
||||
block.pop("words", None)
|
||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||
block["text"] = correct_text(block["text"])
|
||||
@@ -258,8 +263,13 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
)
|
||||
|
||||
for block in blocks:
|
||||
if block.get("words"):
|
||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||
words = block.get("words") or []
|
||||
if words:
|
||||
metrics.ocr_words_total.inc(len(words))
|
||||
metrics.ocr_illegible_words_total.inc(
|
||||
sum(1 for w in words if w["confidence"] < threshold)
|
||||
)
|
||||
block["text"] = apply_confidence_markers(words, threshold)
|
||||
block.pop("words", None)
|
||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||
block["text"] = correct_text(block["text"])
|
||||
|
||||
Reference in New Issue
Block a user