refactor(ocr): extract extract_page_blocks() from both OCR engines

Enable per-page processing by extracting the inner loop body of extract_blocks() into extract_page_blocks(image, page_idx, language). The original extract_blocks() now delegates to the new function, preserving backward compatibility for the batch path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 09:56:34 +02:00
parent d8dcba1a71
commit b7d5f71ef7
3 changed files with 231 additions and 89 deletions
--- a/ocr-service/engines/kraken.py
+++ b/ocr-service/engines/kraken.py
@@ -29,12 +29,10 @@ def is_available() -> bool:
    return _model is not None
-def extract_blocks(images: list, language: str = "de") -> list[dict]:
+def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
-    """Run Kraken segmentation + recognition on a list of PIL images.
+    """Run Kraken segmentation + recognition on a single PIL image.
-    Returns block dicts with pageNumber, x, y, width, height, polygon, text.
+    Returns block dicts for that page. Coordinates are normalized to [0, 1].
    Polygon is a 4-point quadrilateral approximation of the baseline polygon.
    Coordinates are normalized to [0, 1].
    """
    from kraken import blla, rpred
    from confidence import words_from_characters
@@ -42,52 +40,56 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
    if _model is None:
        raise RuntimeError("Kraken model is not loaded")
    page_w, page_h = image.size
    blocks = []
    baseline_seg = blla.segment(image)
    pred_it = rpred.rpred(_model, image, baseline_seg)
    for record in pred_it:
        polygon_pts = record.cuts if hasattr(record, "cuts") else []
        if polygon_pts:
            xs = [p[0] for p in polygon_pts]
            ys = [p[1] for p in polygon_pts]
            x1, y1 = min(xs), min(ys)
            x2, y2 = max(xs), max(ys)
        else:
            xs = [p[0] for p in record.line]
            ys = [p[1] for p in record.line]
            x1, y1 = min(xs), min(ys) - 5
            x2, y2 = max(xs), max(ys) + 5
        quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
        char_confidences = getattr(record, "confidences", [])
        words = words_from_characters(record.prediction, char_confidences)
        blocks.append({
            "pageNumber": page_idx,
            "x": x1 / page_w,
            "y": y1 / page_h,
            "width": (x2 - x1) / page_w,
            "height": (y2 - y1) / page_h,
            "polygon": quad,
            "text": record.prediction,
            "words": words,
        })
    return blocks
 def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Kraken segmentation + recognition on a list of PIL images.
    Returns block dicts with pageNumber, x, y, width, height, polygon, text.
    Polygon is a 4-point quadrilateral approximation of the baseline polygon.
    Coordinates are normalized to [0, 1].
    """
    all_blocks = []
    for page_idx, image in enumerate(images):
-        page_w, page_h = image.size
+        all_blocks.extend(extract_page_blocks(image, page_idx, language))
        baseline_seg = blla.segment(image)
        pred_it = rpred.rpred(_model, image, baseline_seg)
        for record in pred_it:
            # record.prediction is the recognized text
            # record.cuts contains polygon points
            # record.line is the baseline polygon
            polygon_pts = record.cuts if hasattr(record, "cuts") else []
            # Compute AABB from the polygon
            if polygon_pts:
                xs = [p[0] for p in polygon_pts]
                ys = [p[1] for p in polygon_pts]
                x1, y1 = min(xs), min(ys)
                x2, y2 = max(xs), max(ys)
            else:
                # Fallback to line baseline
                xs = [p[0] for p in record.line]
                ys = [p[1] for p in record.line]
                x1, y1 = min(xs), min(ys) - 5
                x2, y2 = max(xs), max(ys) + 5
            # Approximate polygon to quadrilateral
            quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
            # Extract word-level confidence for [unleserlich] marking
            char_confidences = getattr(record, "confidences", [])
            words = words_from_characters(record.prediction, char_confidences)
            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": quad,
                "text": record.prediction,
                "words": words,
            })
    return all_blocks
--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -33,6 +33,54 @@ def load_models():
    logger.info("Surya models loaded successfully")
 def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Surya OCR on one PIL image and return its text lines as block dicts.

    Args:
        image: Page image; its ``size`` supplies the width and height used to
            normalize every coordinate.
        page_idx: Value stored under ``"pageNumber"`` in each returned block.
        language: Accepted for signature parity; not read inside this function.

    Returns:
        One dict per recognized text line with the keys ``pageNumber``, ``x``,
        ``y``, ``width``, ``height``, ``polygon``, ``text`` and ``words``.
        Coordinates are divided by the page width/height.
    """
    load_models()
    width, height = image.size
    page_blocks = []
    # The predictor takes a batch; a one-element batch yields one prediction.
    result = _recognition_predictor([image], det_predictor=_detection_predictor)[0]
    for text_line in result.text_lines:
        left, top, right, bottom = text_line.bbox

        # Only 4-point polygons are passed through; anything else becomes None.
        raw_polygon = getattr(text_line, "polygon", None)
        if raw_polygon and len(raw_polygon) == 4:
            normalized_polygon = [
                [point[0] / width, point[1] / height] for point in raw_polygon
            ]
        else:
            normalized_polygon = None

        # Without word-level results, the whole line stands in as one word.
        line_words = getattr(text_line, "words", None)
        if line_words:
            word_entries = [
                {"text": item.text, "confidence": item.confidence}
                for item in line_words
            ]
        else:
            word_entries = [
                {
                    "text": text_line.text,
                    "confidence": getattr(text_line, "confidence", 1.0),
                }
            ]

        page_blocks.append({
            "pageNumber": page_idx,
            "x": left / width,
            "y": top / height,
            "width": (right - left) / width,
            "height": (bottom - top) / height,
            "polygon": normalized_polygon,
            "text": text_line.text,
            "words": word_entries,
        })
    return page_blocks
 def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).
@@ -40,50 +88,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
    Returns a flat list of block dicts with pageNumber, x, y, width, height,
    polygon, text, words. Coordinates are normalized to [0, 1].
    """
    load_models()
    all_blocks = []
    for page_idx, image in enumerate(images):
-        page_w, page_h = image.size
+        all_blocks.extend(extract_page_blocks(image, page_idx, language))
        # Process single page to limit peak memory
        predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
        page_pred = predictions[0]
        for line in page_pred.text_lines:
            bbox = line.bbox
            x1, y1, x2, y2 = bbox
            polygon = None
            if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
                polygon = [
                    [p[0] / page_w, p[1] / page_h]
                    for p in line.polygon
                ]
            words = []
            if hasattr(line, "words") and line.words:
                for word in line.words:
                    words.append({
                        "text": word.text,
                        "confidence": word.confidence,
                    })
            else:
                words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
                "y": y1 / page_h,
                "width": (x2 - x1) / page_w,
                "height": (y2 - y1) / page_h,
                "polygon": polygon,
                "text": line.text,
                "words": words,
            })
        # Free page image after processing
        del image
    return all_blocks
--- a/ocr-service/test_engines.py
+++ b/ocr-service/test_engines.py
@@ -0,0 +1,132 @@
 """Tests for per-page block extraction in OCR engines."""
 from unittest.mock import MagicMock, patch
 from PIL import Image
 from engines import surya, kraken
 # ─── Surya extract_page_blocks ───────────────────────────────────────────────
 def _make_surya_line(text, bbox, polygon=None, words=None):
    """Create a mock Surya text line with the expected attributes."""
    line = MagicMock()
    line.text = text
    line.bbox = bbox
    line.polygon = polygon
    line.words = words or []
    if not words:
        line.confidence = 0.95
        # hasattr check needs words to be falsy
        del line.words
    return line
 def test_surya_extract_page_blocks_returns_blocks_for_single_image():
    """One mocked Surya line on a 100x200 page yields one normalized block."""
    page = Image.new("RGB", (100, 200))
    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Hallo Welt", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            result = surya.extract_page_blocks(page, page_idx=2, language="de")

    assert len(result) == 1
    block = result[0]
    # page_idx is passed through unchanged as the page number.
    assert block["pageNumber"] == 2
    assert block["text"] == "Hallo Welt"
    # Geometry is the bbox divided by the page width (100) / height (200).
    assert block["x"] == 10 / 100
    assert block["y"] == 20 / 200
    assert block["width"] == 80 / 100
    assert block["height"] == 20 / 200
 def test_surya_extract_blocks_delegates_to_extract_page_blocks():
    """Two input pages with one mocked line each yield blocks numbered 0 and 1."""
    pages = [Image.new("RGB", (100, 200)), Image.new("RGB", (100, 200))]
    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Test", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            # The same single-line prediction is returned for every call.
            recognizer.return_value = [prediction]
            result = surya.extract_blocks(pages)

    assert len(result) == 2
    first, second = result
    assert first["pageNumber"] == 0
    assert second["pageNumber"] == 1
 # ─── Kraken extract_page_blocks ──────────────────────────────────────────────
 def _make_kraken_record(text, cuts, confidences=None):
    record = MagicMock()
    record.prediction = text
    record.cuts = cuts
    record.line = cuts
    record.confidences = confidences or [0.9] * len(text)
    return record
 def _run_kraken_with_mocks(fn, *args):
    """Call ``fn(*args)`` with the kraken library and model replaced by mocks.

    While ``fn`` runs, ``sys.modules`` entries for ``kraken``, ``kraken.blla``
    and ``kraken.rpred`` point at MagicMocks, and the engine module's
    ``_model`` is patched with a MagicMock. The mocked ``rpred.rpred`` returns
    a single record with the text "Kurrent". Previous ``sys.modules`` entries
    are restored (or the stubs removed) afterwards, even if ``fn`` raises.

    Returns:
        Whatever ``fn(*args)`` returns.
    """
    import sys

    fake_blla = MagicMock()
    fake_blla.segment.return_value = MagicMock()
    fake_rpred = MagicMock()
    fake_rpred.rpred.return_value = [
        _make_kraken_record("Kurrent", [(10, 20), (90, 20), (90, 40), (10, 40)])
    ]
    stubs = {
        "kraken": MagicMock(blla=fake_blla, rpred=fake_rpred),
        "kraken.blla": fake_blla,
        "kraken.rpred": fake_rpred,
    }

    # Remember what was registered before so it can be put back afterwards.
    previous_entries = {name: sys.modules.get(name) for name in stubs}
    sys.modules.update(stubs)
    try:
        with patch.object(kraken, "_model", MagicMock()):
            return fn(*args)
    finally:
        for name, previous in previous_entries.items():
            if previous is None:
                # Nothing was registered under this name: drop the stub.
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = previous
 def test_kraken_extract_page_blocks_returns_blocks_for_single_image():
    """One mocked Kraken record yields one block carrying the given page index."""
    page = Image.new("RGB", (100, 200))
    result = _run_kraken_with_mocks(kraken.extract_page_blocks, page, 3, "de")
    assert len(result) == 1
    block = result[0]
    assert block["pageNumber"] == 3
    assert block["text"] == "Kurrent"
 def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
    """Two input pages with one mocked record each yield blocks numbered 0 and 1."""
    pages = [Image.new("RGB", (100, 200)), Image.new("RGB", (100, 200))]
    result = _run_kraken_with_mocks(kraken.extract_blocks, pages)
    assert len(result) == 2
    first, second = result
    assert first["pageNumber"] == 0
    assert second["pageNumber"] == 1