fix(ocr): guard Kraken block extraction against missing boundary/baseline
extract_page_blocks() walked `record.boundary` and `record.baseline` unconditionally, so a record that arrived without either (malformed kraken output, or a MagicMock in tests that iterates to nothing) crashed with "min() arg is an empty sequence". Coerce both attributes through list(), require at least 3 points for the polygon path, fall back to the baseline path when the polygon is missing, and skip the record entirely when neither is usable — emitting no block is safer than emitting one with garbage coordinates. The test helper now sets `boundary` and `baseline` explicitly to mirror real Kraken 7.0 records (and so the happy-path test exercises the polygon branch). A new regression test covers the skip path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -122,16 +122,27 @@ def extract_page_blocks(image: Image, page_idx: int, language: str = "de",
|
|||||||
pred_it = rpred.rpred(active_model, image, baseline_seg)
|
pred_it = rpred.rpred(active_model, image, baseline_seg)
|
||||||
|
|
||||||
for record in pred_it:
|
for record in pred_it:
|
||||||
polygon_pts = record.boundary if hasattr(record, "boundary") and record.boundary else []
|
# Coerce via list() so unexpected shapes (None, truthy mocks that
|
||||||
|
# iterate to nothing, empty lists) collapse to [] and can't blow up
|
||||||
|
# the min/max calls below.
|
||||||
|
boundary_attr = getattr(record, "boundary", None)
|
||||||
|
polygon_pts = list(boundary_attr) if boundary_attr else []
|
||||||
|
|
||||||
if polygon_pts:
|
if len(polygon_pts) >= 3:
|
||||||
xs = [p[0] for p in polygon_pts]
|
xs = [p[0] for p in polygon_pts]
|
||||||
ys = [p[1] for p in polygon_pts]
|
ys = [p[1] for p in polygon_pts]
|
||||||
x1, y1 = min(xs), min(ys)
|
x1, y1 = min(xs), min(ys)
|
||||||
x2, y2 = max(xs), max(ys)
|
x2, y2 = max(xs), max(ys)
|
||||||
else:
|
else:
|
||||||
xs = [p[0] for p in record.baseline]
|
baseline_attr = getattr(record, "baseline", None)
|
||||||
ys = [p[1] for p in record.baseline]
|
baseline_pts = list(baseline_attr) if baseline_attr else []
|
||||||
|
if not baseline_pts:
|
||||||
|
# No polygon and no baseline — we have no way to place the
|
||||||
|
# block on the page, so drop it rather than emit garbage
|
||||||
|
# coordinates.
|
||||||
|
continue
|
||||||
|
xs = [p[0] for p in baseline_pts]
|
||||||
|
ys = [p[1] for p in baseline_pts]
|
||||||
x1, y1 = min(xs), min(ys) - 5
|
x1, y1 = min(xs), min(ys) - 5
|
||||||
x2, y2 = max(xs), max(ys) + 5
|
x2, y2 = max(xs), max(ys) + 5
|
||||||
|
|
||||||
|
|||||||
@@ -115,11 +115,16 @@ def test_surya_extract_blocks_delegates_to_extract_page_blocks():
|
|||||||
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def _make_kraken_record(text, cuts, confidences=None):
|
def _make_kraken_record(text, cuts, confidences=None, boundary=None, baseline=None):
|
||||||
record = MagicMock()
|
record = MagicMock()
|
||||||
record.prediction = text
|
record.prediction = text
|
||||||
record.cuts = cuts
|
record.cuts = cuts
|
||||||
record.line = cuts
|
record.line = cuts
|
||||||
|
# Real kraken records expose `boundary` (polygon) and `baseline` lists.
|
||||||
|
# Mirror that here so the extract path doesn't take the "missing data"
|
||||||
|
# branch during normal tests.
|
||||||
|
record.boundary = boundary if boundary is not None else cuts
|
||||||
|
record.baseline = baseline if baseline is not None else cuts
|
||||||
record.confidences = confidences or [0.9] * len(text)
|
record.confidences = confidences or [0.9] * len(text)
|
||||||
return record
|
return record
|
||||||
|
|
||||||
@@ -179,6 +184,33 @@ def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
|
|||||||
assert blocks[1]["pageNumber"] == 2
|
assert blocks[1]["pageNumber"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_kraken_extract_page_blocks_skips_records_without_positional_data():
    """A record with neither a usable boundary polygon nor a baseline is dropped.

    Regression test: such a record must be skipped rather than crash
    ``min()`` with an empty sequence, so no block is emitted for it.
    """
    import sys

    image = Image.new("RGB", (100, 200))
    mock_blla = MagicMock()
    mock_blla.segment.return_value = MagicMock()
    mock_rpred = MagicMock()

    # Empty boundary AND empty baseline: nothing to place the block with.
    malformed = _make_kraken_record("Noise", [], boundary=[], baseline=[])
    mock_rpred.rpred.return_value = [malformed]

    fake_modules = {
        "kraken": MagicMock(blla=mock_blla, rpred=mock_rpred),
        "kraken.blla": mock_blla,
        "kraken.rpred": mock_rpred,
    }
    # patch.dict puts sys.modules back to its previous contents on exit, so
    # entries that existed before this test are restored instead of being
    # deleted (an unconditional pop would drop them for later tests).
    with patch.dict(sys.modules, fake_modules), patch.object(
        kraken, "_model", MagicMock()
    ):
        blocks = kraken.extract_page_blocks(image, page_idx=1, language="de")

    assert blocks == []
|
||||||
|
|
||||||
|
|
||||||
# ─── Engine signatures must match ─────────────────────────────────────────────
|
# ─── Engine signatures must match ─────────────────────────────────────────────
|
||||||
#
|
#
|
||||||
# main.py resolves `engine = kraken_engine if use_kraken else surya_engine` and
|
# main.py resolves `engine = kraken_engine if use_kraken else surya_engine` and
|
||||||
|
|||||||
Reference in New Issue
Block a user