refactor(ocr): extract extract_page_blocks() from both OCR engines

Enable per-page processing by extracting the inner loop body of extract_blocks() into extract_page_blocks(image, page_idx, language). The original extract_blocks() now delegates to the new function, preserving backward compatibility for the batch path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 09:56:34 +02:00
parent d8dcba1a71
commit b7d5f71ef7
3 changed files with 231 additions and 89 deletions
--- a/ocr-service/engines/kraken.py
+++ b/ocr-service/engines/kraken.py
@@ -29,12 +29,10 @@ def is_available() -> bool:
    return _model is not None


-def extract_blocks(images: list, language: str = "de") -> list[dict]:
-    """Run Kraken segmentation + recognition on a list of PIL images.
+def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
+    """Run Kraken segmentation + recognition on a single PIL image.

-    Returns block dicts with pageNumber, x, y, width, height, polygon, text.
-    Polygon is a 4-point quadrilateral approximation of the baseline polygon.
-    Coordinates are normalized to [0, 1].
+    Returns block dicts for that page. Coordinates are normalized to [0, 1].
    """
    from kraken import blla, rpred
    from confidence import words_from_characters
@@ -42,52 +40,56 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
    if _model is None:
        raise RuntimeError("Kraken model is not loaded")

+    page_w, page_h = image.size
+    blocks = []
+
+    baseline_seg = blla.segment(image)
+    pred_it = rpred.rpred(_model, image, baseline_seg)
+
+    for record in pred_it:
+        polygon_pts = record.cuts if hasattr(record, "cuts") else []
+
+        if polygon_pts:
+            xs = [p[0] for p in polygon_pts]
+            ys = [p[1] for p in polygon_pts]
+            x1, y1 = min(xs), min(ys)
+            x2, y2 = max(xs), max(ys)
+        else:
+            xs = [p[0] for p in record.line]
+            ys = [p[1] for p in record.line]
+            x1, y1 = min(xs), min(ys) - 5
+            x2, y2 = max(xs), max(ys) + 5
+
+        quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
+
+        char_confidences = getattr(record, "confidences", [])
+        words = words_from_characters(record.prediction, char_confidences)
+
+        blocks.append({
+            "pageNumber": page_idx,
+            "x": x1 / page_w,
+            "y": y1 / page_h,
+            "width": (x2 - x1) / page_w,
+            "height": (y2 - y1) / page_h,
+            "polygon": quad,
+            "text": record.prediction,
+            "words": words,
+        })
+
+    return blocks
+
+
+def extract_blocks(images: list, language: str = "de") -> list[dict]:
+    """Run Kraken segmentation + recognition on a list of PIL images.
+
+    Returns block dicts with pageNumber, x, y, width, height, polygon, text.
+    Polygon is a 4-point quadrilateral approximation of the baseline polygon.
+    Coordinates are normalized to [0, 1].
+    """
    all_blocks = []

    for page_idx, image in enumerate(images):
-        page_w, page_h = image.size
-
-        baseline_seg = blla.segment(image)
-
-        pred_it = rpred.rpred(_model, image, baseline_seg)
-
-        for record in pred_it:
-            # record.prediction is the recognized text
-            # record.cuts contains polygon points
-            # record.line is the baseline polygon
-
-            polygon_pts = record.cuts if hasattr(record, "cuts") else []
-
-            # Compute AABB from the polygon
-            if polygon_pts:
-                xs = [p[0] for p in polygon_pts]
-                ys = [p[1] for p in polygon_pts]
-                x1, y1 = min(xs), min(ys)
-                x2, y2 = max(xs), max(ys)
-            else:
-                # Fallback to line baseline
-                xs = [p[0] for p in record.line]
-                ys = [p[1] for p in record.line]
-                x1, y1 = min(xs), min(ys) - 5
-                x2, y2 = max(xs), max(ys) + 5
-
-            # Approximate polygon to quadrilateral
-            quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
-
-            # Extract word-level confidence for [unleserlich] marking
-            char_confidences = getattr(record, "confidences", [])
-            words = words_from_characters(record.prediction, char_confidences)
-
-            all_blocks.append({
-                "pageNumber": page_idx,
-                "x": x1 / page_w,
-                "y": y1 / page_h,
-                "width": (x2 - x1) / page_w,
-                "height": (y2 - y1) / page_h,
-                "polygon": quad,
-                "text": record.prediction,
-                "words": words,
-            })
+        all_blocks.extend(extract_page_blocks(image, page_idx, language))

    return all_blocks

--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -33,6 +33,54 @@ def load_models():
    logger.info("Surya models loaded successfully")


+def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
+    """Run Surya OCR on a single PIL image and return block dicts for that page.
+
+    Coordinates are normalized to [0, 1].
+    """
+    load_models()
+
+    page_w, page_h = image.size
+    blocks = []
+
+    predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
+    page_pred = predictions[0]
+
+    for line in page_pred.text_lines:
+        bbox = line.bbox
+        x1, y1, x2, y2 = bbox
+
+        polygon = None
+        if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
+            polygon = [
+                [p[0] / page_w, p[1] / page_h]
+                for p in line.polygon
+            ]
+
+        words = []
+        if hasattr(line, "words") and line.words:
+            for word in line.words:
+                words.append({
+                    "text": word.text,
+                    "confidence": word.confidence,
+                })
+        else:
+            words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
+
+        blocks.append({
+            "pageNumber": page_idx,
+            "x": x1 / page_w,
+            "y": y1 / page_h,
+            "width": (x2 - x1) / page_w,
+            "height": (y2 - y1) / page_h,
+            "polygon": polygon,
+            "text": line.text,
+            "words": words,
+        })
+
+    return blocks
+
+
 def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

@@ -40,50 +88,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
    Returns a flat list of block dicts with pageNumber, x, y, width, height,
    polygon, text, words. Coordinates are normalized to [0, 1].
    """
-    load_models()
-
    all_blocks = []

    for page_idx, image in enumerate(images):
-        page_w, page_h = image.size
-
-        # Process single page to limit peak memory
-        predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
-        page_pred = predictions[0]
-
-        for line in page_pred.text_lines:
-            bbox = line.bbox
-            x1, y1, x2, y2 = bbox
-
-            polygon = None
-            if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
-                polygon = [
-                    [p[0] / page_w, p[1] / page_h]
-                    for p in line.polygon
-                ]
-
-            words = []
-            if hasattr(line, "words") and line.words:
-                for word in line.words:
-                    words.append({
-                        "text": word.text,
-                        "confidence": word.confidence,
-                    })
-            else:
-                words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
-
-            all_blocks.append({
-                "pageNumber": page_idx,
-                "x": x1 / page_w,
-                "y": y1 / page_h,
-                "width": (x2 - x1) / page_w,
-                "height": (y2 - y1) / page_h,
-                "polygon": polygon,
-                "text": line.text,
-                "words": words,
-            })
-
-        # Free page image after processing
+        all_blocks.extend(extract_page_blocks(image, page_idx, language))
        del image

    return all_blocks