refactor(ocr): extract _observe_block_words for word counter sites

The two block-iteration loops (/ocr and /ocr/stream's standard generator) both ran the same word-total and illegible-word increments. Lift them into a single helper so each call site becomes one line and the counter intent reads cleanly. Pure refactor — no behavior change, tests stay green. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 16:57:18 +02:00
parent 0179e93a4b
commit 22a5ee816a
1 changed files with 14 additions and 8 deletions
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -50,6 +50,18 @@ ALLOWED_PDF_HOSTS = set(
 _SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
 def _observe_block_words(words: list[dict], threshold: float) -> None:
    """Record per-block word counts and below-threshold word counts.
    Pre: `words` is non-empty. Caller checks for that — keeping the helper
    branch-free makes the call sites read as a single line.
    """
    metrics.ocr_words_total.inc(len(words))
    metrics.ocr_illegible_words_total.inc(
        sum(1 for w in words if w["confidence"] < threshold)
    )
 def _validate_url(url: str) -> None:
    """Validate that the PDF URL points to an allowed host (SSRF protection)."""
    parsed = urlparse(url)
@@ -149,10 +161,7 @@ async def run_ocr(request: OcrRequest):
    for block in blocks:
        words = block.get("words") or []
        if words:
-            metrics.ocr_words_total.inc(len(words))
+            _observe_block_words(words, threshold)
            metrics.ocr_illegible_words_total.inc(
                sum(1 for w in words if w["confidence"] < threshold)
            )
            block["text"] = apply_confidence_markers(words, threshold)
        block.pop("words", None)
        if script_type in _SPELL_CHECK_SCRIPT_TYPES:
@@ -298,10 +307,7 @@ async def run_ocr_stream(request: OcrRequest):
                for block in blocks:
                    words = block.get("words") or []
                    if words:
-                        metrics.ocr_words_total.inc(len(words))
+                        _observe_block_words(words, threshold)
                        metrics.ocr_illegible_words_total.inc(
                            sum(1 for w in words if w["confidence"] < threshold)
                        )
                        block["text"] = apply_confidence_markers(words, threshold)
                    block.pop("words", None)
                    if script_type in _SPELL_CHECK_SCRIPT_TYPES: