diff --git a/ocr-service/engines/kraken.py b/ocr-service/engines/kraken.py index a0fec491..b2b75787 100644 --- a/ocr-service/engines/kraken.py +++ b/ocr-service/engines/kraken.py @@ -29,12 +29,10 @@ def is_available() -> bool: return _model is not None -def extract_blocks(images: list, language: str = "de") -> list[dict]: - """Run Kraken segmentation + recognition on a list of PIL images. +def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]: + """Run Kraken segmentation + recognition on a single PIL image. - Returns block dicts with pageNumber, x, y, width, height, polygon, text. - Polygon is a 4-point quadrilateral approximation of the baseline polygon. - Coordinates are normalized to [0, 1]. + Returns block dicts for that page. Coordinates are normalized to [0, 1]. """ from kraken import blla, rpred from confidence import words_from_characters @@ -42,52 +40,56 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: if _model is None: raise RuntimeError("Kraken model is not loaded") + page_w, page_h = image.size + blocks = [] + + baseline_seg = blla.segment(image) + pred_it = rpred.rpred(_model, image, baseline_seg) + + for record in pred_it: + polygon_pts = record.cuts if hasattr(record, "cuts") else [] + + if polygon_pts: + xs = [p[0] for p in polygon_pts] + ys = [p[1] for p in polygon_pts] + x1, y1 = min(xs), min(ys) + x2, y2 = max(xs), max(ys) + else: + xs = [p[0] for p in record.line] + ys = [p[1] for p in record.line] + x1, y1 = min(xs), min(ys) - 5 + x2, y2 = max(xs), max(ys) + 5 + + quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None + + char_confidences = getattr(record, "confidences", []) + words = words_from_characters(record.prediction, char_confidences) + + blocks.append({ + "pageNumber": page_idx, + "x": x1 / page_w, + "y": y1 / page_h, + "width": (x2 - x1) / page_w, + "height": (y2 - y1) / page_h, + "polygon": quad, + "text": record.prediction, + "words": words, + }) + + return blocks + + +def extract_blocks(images: list, language: str = "de") -> list[dict]: + """Run Kraken segmentation + recognition on a list of PIL images. + + Returns block dicts with pageNumber, x, y, width, height, polygon, text. + Polygon is a 4-point quadrilateral approximation of the baseline polygon. + Coordinates are normalized to [0, 1]. + """ all_blocks = [] for page_idx, image in enumerate(images): - page_w, page_h = image.size - - baseline_seg = blla.segment(image) - - pred_it = rpred.rpred(_model, image, baseline_seg) - - for record in pred_it: - # record.prediction is the recognized text - # record.cuts contains polygon points - # record.line is the baseline polygon - - polygon_pts = record.cuts if hasattr(record, "cuts") else [] - - # Compute AABB from the polygon - if polygon_pts: - xs = [p[0] for p in polygon_pts] - ys = [p[1] for p in polygon_pts] - x1, y1 = min(xs), min(ys) - x2, y2 = max(xs), max(ys) - else: - # Fallback to line baseline - xs = [p[0] for p in record.line] - ys = [p[1] for p in record.line] - x1, y1 = min(xs), min(ys) - 5 - x2, y2 = max(xs), max(ys) + 5 - - # Approximate polygon to quadrilateral - quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None - - # Extract word-level confidence for [unleserlich] marking - char_confidences = getattr(record, "confidences", []) - words = words_from_characters(record.prediction, char_confidences) - - all_blocks.append({ - "pageNumber": page_idx, - "x": x1 / page_w, - "y": y1 / page_h, - "width": (x2 - x1) / page_w, - "height": (y2 - y1) / page_h, - "polygon": quad, - "text": record.prediction, - "words": words, - }) + all_blocks.extend(extract_page_blocks(image, page_idx, language)) return all_blocks diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py index ea028d8c..a82968ce 100644 --- a/ocr-service/engines/surya.py +++ b/ocr-service/engines/surya.py @@ -33,6 +33,54 @@ def load_models(): logger.info("Surya models loaded successfully") +def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]: + """Run Surya OCR on a single PIL image and return block dicts for that page. + + Coordinates are normalized to [0, 1]. + """ + load_models() + + page_w, page_h = image.size + blocks = [] + + predictions = _recognition_predictor([image], det_predictor=_detection_predictor) + page_pred = predictions[0] + + for line in page_pred.text_lines: + bbox = line.bbox + x1, y1, x2, y2 = bbox + + polygon = None + if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4: + polygon = [ + [p[0] / page_w, p[1] / page_h] + for p in line.polygon + ] + + words = [] + if hasattr(line, "words") and line.words: + for word in line.words: + words.append({ + "text": word.text, + "confidence": word.confidence, + }) + else: + words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}] + + blocks.append({ + "pageNumber": page_idx, + "x": x1 / page_w, + "y": y1 / page_h, + "width": (x2 - x1) / page_w, + "height": (y2 - y1) / page_h, + "polygon": polygon, + "text": line.text, + "words": words, + }) + + return blocks + + def extract_blocks(images: list, language: str = "de") -> list[dict]: """Run Surya OCR on a list of PIL images (one per page). @@ -40,50 +88,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: Returns a flat list of block dicts with pageNumber, x, y, width, height, polygon, text, words. Coordinates are normalized to [0, 1]. """ - load_models() - all_blocks = [] for page_idx, image in enumerate(images): - page_w, page_h = image.size - - # Process single page to limit peak memory - predictions = _recognition_predictor([image], det_predictor=_detection_predictor) - page_pred = predictions[0] - - for line in page_pred.text_lines: - bbox = line.bbox - x1, y1, x2, y2 = bbox - - polygon = None - if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4: - polygon = [ - [p[0] / page_w, p[1] / page_h] - for p in line.polygon - ] - - words = [] - if hasattr(line, "words") and line.words: - for word in line.words: - words.append({ - "text": word.text, - "confidence": word.confidence, - }) - else: - words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}] - - all_blocks.append({ - "pageNumber": page_idx, - "x": x1 / page_w, - "y": y1 / page_h, - "width": (x2 - x1) / page_w, - "height": (y2 - y1) / page_h, - "polygon": polygon, - "text": line.text, - "words": words, - }) - - # Free page image after processing + all_blocks.extend(extract_page_blocks(image, page_idx, language)) del image return all_blocks diff --git a/ocr-service/test_engines.py b/ocr-service/test_engines.py new file mode 100644 index 00000000..a6966bd8 --- /dev/null +++ b/ocr-service/test_engines.py @@ -0,0 +1,132 @@ +"""Tests for per-page block extraction in OCR engines.""" + +from unittest.mock import MagicMock, patch +from PIL import Image + +from engines import surya, kraken + + +# ─── Surya extract_page_blocks ─────────────────────────────────────────────── + + +def _make_surya_line(text, bbox, polygon=None, words=None): + """Create a mock Surya text line with the expected attributes.""" + line = MagicMock() + line.text = text + line.bbox = bbox + line.polygon = polygon + line.words = words or [] + if not words: + line.confidence = 0.95 + # hasattr check needs words to be falsy + del line.words + return line + + +def test_surya_extract_page_blocks_returns_blocks_for_single_image(): + image = Image.new("RGB", (100, 200)) + + mock_line = _make_surya_line("Hallo Welt", [10, 20, 90, 40]) + mock_pred = MagicMock() + mock_pred.text_lines = [mock_line] + + with patch.object(surya, "_recognition_predictor") as mock_rec, \ + patch.object(surya, "_loaded", True): + mock_rec.return_value = [mock_pred] + + blocks = surya.extract_page_blocks(image, page_idx=2, language="de") + + assert len(blocks) == 1 + assert blocks[0]["pageNumber"] == 2 + assert blocks[0]["text"] == "Hallo Welt" + assert blocks[0]["x"] == 10 / 100 + assert blocks[0]["y"] == 20 / 200 + assert blocks[0]["width"] == 80 / 100 + assert blocks[0]["height"] == 20 / 200 + + +def test_surya_extract_blocks_delegates_to_extract_page_blocks(): + """After refactoring, extract_blocks should produce the same output.""" + image1 = Image.new("RGB", (100, 200)) + image2 = Image.new("RGB", (100, 200)) + + mock_line = _make_surya_line("Test", [10, 20, 90, 40]) + mock_pred = MagicMock() + mock_pred.text_lines = [mock_line] + + with patch.object(surya, "_recognition_predictor") as mock_rec, \ + patch.object(surya, "_loaded", True): + mock_rec.return_value = [mock_pred] + + blocks = surya.extract_blocks([image1, image2]) + + assert len(blocks) == 2 + assert blocks[0]["pageNumber"] == 0 + assert blocks[1]["pageNumber"] == 1 + + +# ─── Kraken extract_page_blocks ────────────────────────────────────────────── + + +def _make_kraken_record(text, cuts, confidences=None): + record = MagicMock() + record.prediction = text + record.cuts = cuts + record.line = cuts + record.confidences = confidences or [0.9] * len(text) + return record + + +def _run_kraken_with_mocks(fn, *args): + """Run a kraken function with blla/rpred mocked via sys.modules.""" + import sys + mock_blla = MagicMock() + mock_blla.segment.return_value = MagicMock() + mock_rpred = MagicMock() + + mock_record = _make_kraken_record("Kurrent", [(10, 20), (90, 20), (90, 40), (10, 40)]) + mock_rpred.rpred.return_value = [mock_record] + + saved_kraken = sys.modules.get("kraken") + saved_blla = sys.modules.get("kraken.blla") + saved_rpred = sys.modules.get("kraken.rpred") + + sys.modules["kraken"] = MagicMock(blla=mock_blla, rpred=mock_rpred) + sys.modules["kraken.blla"] = mock_blla + sys.modules["kraken.rpred"] = mock_rpred + + try: + with patch.object(kraken, "_model", MagicMock()): + return fn(*args) + finally: + if saved_kraken is not None: + sys.modules["kraken"] = saved_kraken + else: + sys.modules.pop("kraken", None) + if saved_blla is not None: + sys.modules["kraken.blla"] = saved_blla + else: + sys.modules.pop("kraken.blla", None) + if saved_rpred is not None: + sys.modules["kraken.rpred"] = saved_rpred + else: + sys.modules.pop("kraken.rpred", None) + + +def test_kraken_extract_page_blocks_returns_blocks_for_single_image(): + image = Image.new("RGB", (100, 200)) + blocks = _run_kraken_with_mocks(kraken.extract_page_blocks, image, 3, "de") + + assert len(blocks) == 1 + assert blocks[0]["pageNumber"] == 3 + assert blocks[0]["text"] == "Kurrent" + + +def test_kraken_extract_blocks_delegates_to_extract_page_blocks(): + image1 = Image.new("RGB", (100, 200)) + image2 = Image.new("RGB", (100, 200)) + blocks = _run_kraken_with_mocks(kraken.extract_blocks, [image1, image2]) + + assert len(blocks) == 2 + assert blocks[0]["pageNumber"] == 0 + assert blocks[1]["pageNumber"] == 1