From 23cf88856e67d957d718f8b66264ec2990754f31 Mon Sep 17 00:00:00 2001 From: Marcel Date: Thu, 23 Apr 2026 09:33:03 +0200 Subject: [PATCH] fix(ocr): guard Kraken block extraction against missing boundary/baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extract_page_blocks() guarded `record.boundary` only by truthiness and walked `record.baseline` unconditionally whenever the polygon was absent, so a record that arrived with neither usable (malformed kraken output, or a MagicMock in tests that is truthy but iterates to nothing) crashed with "min() arg is an empty sequence". Coerce both attributes through list(), require at least 3 points for the polygon path, fall back to the baseline path when the polygon is missing or has fewer than 3 points, and skip the record entirely when neither is usable — emitting no block is safer than emitting one with garbage coordinates. The test helper now sets `boundary` and `baseline` explicitly to mirror real Kraken 7.0 records (and so the happy-path test exercises the polygon branch). A new regression test covers the skip path. Co-Authored-By: Claude Opus 4.7 --- ocr-service/engines/kraken.py | 19 +++++++++++++++---- ocr-service/test_engines.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/ocr-service/engines/kraken.py b/ocr-service/engines/kraken.py index 60dfd47e..7c811f48 100644 --- a/ocr-service/engines/kraken.py +++ b/ocr-service/engines/kraken.py @@ -122,16 +122,27 @@ def extract_page_blocks(image: Image, page_idx: int, language: str = "de", pred_it = rpred.rpred(active_model, image, baseline_seg) for record in pred_it: - polygon_pts = record.boundary if hasattr(record, "boundary") and record.boundary else [] + # Coerce via list() so unexpected shapes (None, truthy mocks that + # iterate to nothing, empty lists) collapse to [] and can't blow up + # the min/max calls below. 
+ boundary_attr = getattr(record, "boundary", None) + polygon_pts = list(boundary_attr) if boundary_attr else [] - if polygon_pts: + if len(polygon_pts) >= 3: xs = [p[0] for p in polygon_pts] ys = [p[1] for p in polygon_pts] x1, y1 = min(xs), min(ys) x2, y2 = max(xs), max(ys) else: - xs = [p[0] for p in record.baseline] - ys = [p[1] for p in record.baseline] + baseline_attr = getattr(record, "baseline", None) + baseline_pts = list(baseline_attr) if baseline_attr else [] + if not baseline_pts: + # No polygon and no baseline — we have no way to place the + # block on the page, so drop it rather than emit garbage + # coordinates. + continue + xs = [p[0] for p in baseline_pts] + ys = [p[1] for p in baseline_pts] x1, y1 = min(xs), min(ys) - 5 x2, y2 = max(xs), max(ys) + 5 diff --git a/ocr-service/test_engines.py b/ocr-service/test_engines.py index 6202c02a..9445cf73 100644 --- a/ocr-service/test_engines.py +++ b/ocr-service/test_engines.py @@ -115,11 +115,16 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks(): # ─── Kraken extract_page_blocks ────────────────────────────────────────────── -def _make_kraken_record(text, cuts, confidences=None): +def _make_kraken_record(text, cuts, confidences=None, boundary=None, baseline=None): record = MagicMock() record.prediction = text record.cuts = cuts record.line = cuts + # Real kraken records expose `boundary` (polygon) and `baseline` lists. + # Mirror that here so the extract path doesn't take the "missing data" + # branch during normal tests. 
+ record.boundary = boundary if boundary is not None else cuts + record.baseline = baseline if baseline is not None else cuts record.confidences = confidences or [0.9] * len(text) return record @@ -179,6 +184,33 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks(): assert blocks[1]["pageNumber"] == 2 +def test_kraken_extract_page_blocks_skips_records_without_positional_data(): + """Records that arrive without a usable boundary polygon OR baseline must + be dropped rather than crash min() with an empty sequence.""" + import sys + + image = Image.new("RGB", (100, 200)) + mock_blla = MagicMock() + mock_blla.segment.return_value = MagicMock() + mock_rpred = MagicMock() + + malformed = _make_kraken_record("Noise", [], boundary=[], baseline=[]) + mock_rpred.rpred.return_value = [malformed] + + sys.modules["kraken"] = MagicMock(blla=mock_blla, rpred=mock_rpred) + sys.modules["kraken.blla"] = mock_blla + sys.modules["kraken.rpred"] = mock_rpred + try: + with patch.object(kraken, "_model", MagicMock()): + blocks = kraken.extract_page_blocks(image, page_idx=1, language="de") + finally: + sys.modules.pop("kraken", None) + sys.modules.pop("kraken.blla", None) + sys.modules.pop("kraken.rpred", None) + + assert blocks == [] + + # ─── Engine signatures must match ───────────────────────────────────────────── # # main.py resolves `engine = kraken_engine if use_kraken else surya_engine` and