From 22a5ee816af65b5e9757b7fa9651483e673bbeb1 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 21 May 2026 16:57:18 +0200 Subject: [PATCH] refactor(ocr): extract _observe_block_words for word counter sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two block-iteration loops (/ocr and /ocr/stream's standard generator) both ran the same word-total and illegible-word increments. Lift them into a single helper so each call site becomes one line and the counter intent reads cleanly. Pure refactor — no behavior change, tests stay green. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index 31fd9cad..51cf737b 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -50,6 +50,18 @@ ALLOWED_PDF_HOSTS = set( _SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"} +def _observe_block_words(words: list[dict], threshold: float) -> None: + """Record per-block word counts and below-threshold word counts. + + Pre: `words` is non-empty. Caller checks for that — keeping the helper + branch-free makes the call sites read as a single line. + """ + metrics.ocr_words_total.inc(len(words)) + metrics.ocr_illegible_words_total.inc( + sum(1 for w in words if w["confidence"] < threshold) + ) + + def _validate_url(url: str) -> None: """Validate that the PDF URL points to an allowed host (SSRF protection).""" parsed = urlparse(url) @@ -149,10 +161,7 @@ async def run_ocr(request: OcrRequest): for block in blocks: words = block.get("words") or [] if words: - metrics.ocr_words_total.inc(len(words)) - metrics.ocr_illegible_words_total.inc( - sum(1 for w in words if w["confidence"] < threshold) - ) + _observe_block_words(words, threshold) block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: @@ -298,10 +307,7 @@ async def run_ocr_stream(request: OcrRequest): for block in blocks: words = block.get("words") or [] if words: - metrics.ocr_words_total.inc(len(words)) - metrics.ocr_illegible_words_total.inc( - sum(1 for w in words if w["confidence"] < threshold) - ) + _observe_block_words(words, threshold) block["text"] = apply_confidence_markers(words, threshold) block.pop("words", None) if script_type in _SPELL_CHECK_SCRIPT_TYPES: