refactor(ocr): extract _observe_block_words for word counter sites
The two block-iteration loops (/ocr and /ocr/stream's standard generator) both ran the same word-total and illegible-word increments. Lift them into a single helper so each call site becomes one line and the counter intent reads cleanly. Pure refactor — no behavior change, tests stay green. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -50,6 +50,18 @@ ALLOWED_PDF_HOSTS = set(
|
|||||||
_SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
|
_SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
|
||||||
|
|
||||||
|
|
||||||
|
def _observe_block_words(words: list[dict], threshold: float) -> None:
    """Update the OCR word counters for a single block.

    Adds the number of entries in `words` to the total-words counter, then
    adds the number of entries whose "confidence" value is strictly below
    `threshold` to the illegible-words counter.

    Callers are expected to pass a non-empty `words` list; no emptiness
    check is done here so that each call site stays a single line.
    """
    # Bump the total first so it is recorded before the confidence scan runs.
    metrics.ocr_words_total.inc(len(words))

    low_confidence = [word for word in words if word["confidence"] < threshold]
    metrics.ocr_illegible_words_total.inc(len(low_confidence))
|
||||||
|
|
||||||
|
|
||||||
def _validate_url(url: str) -> None:
|
def _validate_url(url: str) -> None:
|
||||||
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
|
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
@@ -149,10 +161,7 @@ async def run_ocr(request: OcrRequest):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
words = block.get("words") or []
|
words = block.get("words") or []
|
||||||
if words:
|
if words:
|
||||||
metrics.ocr_words_total.inc(len(words))
|
_observe_block_words(words, threshold)
|
||||||
metrics.ocr_illegible_words_total.inc(
|
|
||||||
sum(1 for w in words if w["confidence"] < threshold)
|
|
||||||
)
|
|
||||||
block["text"] = apply_confidence_markers(words, threshold)
|
block["text"] = apply_confidence_markers(words, threshold)
|
||||||
block.pop("words", None)
|
block.pop("words", None)
|
||||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
@@ -298,10 +307,7 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
words = block.get("words") or []
|
words = block.get("words") or []
|
||||||
if words:
|
if words:
|
||||||
metrics.ocr_words_total.inc(len(words))
|
_observe_block_words(words, threshold)
|
||||||
metrics.ocr_illegible_words_total.inc(
|
|
||||||
sum(1 for w in words if w["confidence"] < threshold)
|
|
||||||
)
|
|
||||||
block["text"] = apply_confidence_markers(words, threshold)
|
block["text"] = apply_confidence_markers(words, threshold)
|
||||||
block.pop("words", None)
|
block.pop("words", None)
|
||||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
|
|||||||
Reference in New Issue
Block a user