From 22a5ee816af65b5e9757b7fa9651483e673bbeb1 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Thu, 21 May 2026 16:57:18 +0200
Subject: [PATCH] refactor(ocr): extract _observe_block_words for word counter
 sites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The two block-iteration loops (/ocr and /ocr/stream's standard generator)
both ran the same word-total and illegible-word increments. Lift them
into a single helper so each call site becomes one line and the counter
intent reads cleanly. Pure refactor — no behavior change, tests stay green.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ocr-service/main.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/ocr-service/main.py b/ocr-service/main.py
index 31fd9cad..51cf737b 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -50,6 +50,18 @@ ALLOWED_PDF_HOSTS = set(
 _SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
 
 
+def _observe_block_words(words: list[dict], threshold: float) -> None:
+    """Increment the word-total and illegible-word counters for one block.
+
+    Pre: `words` is non-empty and each word dict has a "confidence" key.
+    Callers guard the empty case themselves, so no branch is needed here.
+    """
+    metrics.ocr_words_total.inc(len(words))
+    metrics.ocr_illegible_words_total.inc(
+        sum(1 for w in words if w["confidence"] < threshold)
+    )
+
+
 def _validate_url(url: str) -> None:
     """Validate that the PDF URL points to an allowed host (SSRF protection)."""
     parsed = urlparse(url)
@@ -149,10 +161,7 @@ async def run_ocr(request: OcrRequest):
     for block in blocks:
         words = block.get("words") or []
         if words:
-            metrics.ocr_words_total.inc(len(words))
-            metrics.ocr_illegible_words_total.inc(
-                sum(1 for w in words if w["confidence"] < threshold)
-            )
+            _observe_block_words(words, threshold)
             block["text"] = apply_confidence_markers(words, threshold)
         block.pop("words", None)
         if script_type in _SPELL_CHECK_SCRIPT_TYPES:
@@ -298,10 +307,7 @@ async def run_ocr_stream(request: OcrRequest):
                 for block in blocks:
                     words = block.get("words") or []
                     if words:
-                        metrics.ocr_words_total.inc(len(words))
-                        metrics.ocr_illegible_words_total.inc(
-                            sum(1 for w in words if w["confidence"] < threshold)
-                        )
+                        _observe_block_words(words, threshold)
                         block["text"] = apply_confidence_markers(words, threshold)
                     block.pop("words", None)
                     if script_type in _SPELL_CHECK_SCRIPT_TYPES: