refactor(ocr): extract extract_page_blocks() from both OCR engines
Enable per-page processing by extracting the inner loop body of extract_blocks() into extract_page_blocks(image, page_idx, language) in both OCR engines. The original extract_blocks() now delegates to the new function, preserving backward compatibility for the batch path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -29,12 +29,10 @@ def is_available() -> bool:
|
|||||||
return _model is not None
|
return _model is not None
|
||||||
|
|
||||||
|
|
||||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
|
||||||
"""Run Kraken segmentation + recognition on a list of PIL images.
|
"""Run Kraken segmentation + recognition on a single PIL image.
|
||||||
|
|
||||||
Returns block dicts with pageNumber, x, y, width, height, polygon, text.
|
Returns block dicts for that page. Coordinates are normalized to [0, 1].
|
||||||
Polygon is a 4-point quadrilateral approximation of the baseline polygon.
|
|
||||||
Coordinates are normalized to [0, 1].
|
|
||||||
"""
|
"""
|
||||||
from kraken import blla, rpred
|
from kraken import blla, rpred
|
||||||
from confidence import words_from_characters
|
from confidence import words_from_characters
|
||||||
@@ -42,52 +40,56 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
if _model is None:
|
if _model is None:
|
||||||
raise RuntimeError("Kraken model is not loaded")
|
raise RuntimeError("Kraken model is not loaded")
|
||||||
|
|
||||||
|
page_w, page_h = image.size
|
||||||
|
blocks = []
|
||||||
|
|
||||||
|
baseline_seg = blla.segment(image)
|
||||||
|
pred_it = rpred.rpred(_model, image, baseline_seg)
|
||||||
|
|
||||||
|
for record in pred_it:
|
||||||
|
polygon_pts = record.cuts if hasattr(record, "cuts") else []
|
||||||
|
|
||||||
|
if polygon_pts:
|
||||||
|
xs = [p[0] for p in polygon_pts]
|
||||||
|
ys = [p[1] for p in polygon_pts]
|
||||||
|
x1, y1 = min(xs), min(ys)
|
||||||
|
x2, y2 = max(xs), max(ys)
|
||||||
|
else:
|
||||||
|
xs = [p[0] for p in record.line]
|
||||||
|
ys = [p[1] for p in record.line]
|
||||||
|
x1, y1 = min(xs), min(ys) - 5
|
||||||
|
x2, y2 = max(xs), max(ys) + 5
|
||||||
|
|
||||||
|
quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
|
||||||
|
|
||||||
|
char_confidences = getattr(record, "confidences", [])
|
||||||
|
words = words_from_characters(record.prediction, char_confidences)
|
||||||
|
|
||||||
|
blocks.append({
|
||||||
|
"pageNumber": page_idx,
|
||||||
|
"x": x1 / page_w,
|
||||||
|
"y": y1 / page_h,
|
||||||
|
"width": (x2 - x1) / page_w,
|
||||||
|
"height": (y2 - y1) / page_h,
|
||||||
|
"polygon": quad,
|
||||||
|
"text": record.prediction,
|
||||||
|
"words": words,
|
||||||
|
})
|
||||||
|
|
||||||
|
return blocks
|
||||||
|
|
||||||
|
|
||||||
|
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||||
|
"""Run Kraken segmentation + recognition on a list of PIL images.
|
||||||
|
|
||||||
|
Returns block dicts with pageNumber, x, y, width, height, polygon, text.
|
||||||
|
Polygon is a 4-point quadrilateral approximation of the baseline polygon.
|
||||||
|
Coordinates are normalized to [0, 1].
|
||||||
|
"""
|
||||||
all_blocks = []
|
all_blocks = []
|
||||||
|
|
||||||
for page_idx, image in enumerate(images):
|
for page_idx, image in enumerate(images):
|
||||||
page_w, page_h = image.size
|
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
||||||
|
|
||||||
baseline_seg = blla.segment(image)
|
|
||||||
|
|
||||||
pred_it = rpred.rpred(_model, image, baseline_seg)
|
|
||||||
|
|
||||||
for record in pred_it:
|
|
||||||
# record.prediction is the recognized text
|
|
||||||
# record.cuts contains polygon points
|
|
||||||
# record.line is the baseline polygon
|
|
||||||
|
|
||||||
polygon_pts = record.cuts if hasattr(record, "cuts") else []
|
|
||||||
|
|
||||||
# Compute AABB from the polygon
|
|
||||||
if polygon_pts:
|
|
||||||
xs = [p[0] for p in polygon_pts]
|
|
||||||
ys = [p[1] for p in polygon_pts]
|
|
||||||
x1, y1 = min(xs), min(ys)
|
|
||||||
x2, y2 = max(xs), max(ys)
|
|
||||||
else:
|
|
||||||
# Fallback to line baseline
|
|
||||||
xs = [p[0] for p in record.line]
|
|
||||||
ys = [p[1] for p in record.line]
|
|
||||||
x1, y1 = min(xs), min(ys) - 5
|
|
||||||
x2, y2 = max(xs), max(ys) + 5
|
|
||||||
|
|
||||||
# Approximate polygon to quadrilateral
|
|
||||||
quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None
|
|
||||||
|
|
||||||
# Extract word-level confidence for [unleserlich] marking
|
|
||||||
char_confidences = getattr(record, "confidences", [])
|
|
||||||
words = words_from_characters(record.prediction, char_confidences)
|
|
||||||
|
|
||||||
all_blocks.append({
|
|
||||||
"pageNumber": page_idx,
|
|
||||||
"x": x1 / page_w,
|
|
||||||
"y": y1 / page_h,
|
|
||||||
"width": (x2 - x1) / page_w,
|
|
||||||
"height": (y2 - y1) / page_h,
|
|
||||||
"polygon": quad,
|
|
||||||
"text": record.prediction,
|
|
||||||
"words": words,
|
|
||||||
})
|
|
||||||
|
|
||||||
return all_blocks
|
return all_blocks
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,54 @@ def load_models():
|
|||||||
logger.info("Surya models loaded successfully")
|
logger.info("Surya models loaded successfully")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Surya OCR on one PIL image and return the block dicts for that page.

    Args:
        image: Page image. Only ``image.size`` is read here before the image
            is handed to the recognition predictor.
        page_idx: Stored unchanged as ``pageNumber`` on every returned block.
        language: Accepted for signature parity; not read by this function.

    Returns:
        One dict per recognized text line with the keys pageNumber, x, y,
        width, height, polygon, text and words. Box and polygon coordinates
        are divided by the image width/height.
    """
    load_models()

    img_w, img_h = image.size
    # The predictor takes a batch; pass a one-element batch and read its only result.
    result = _recognition_predictor([image], det_predictor=_detection_predictor)[0]

    page_blocks = []
    for text_line in result.text_lines:
        left, top, right, bottom = text_line.bbox

        # Only polygons with exactly four points are kept; anything else becomes None.
        corners = getattr(text_line, "polygon", None)
        if corners and len(corners) == 4:
            quad = [[pt[0] / img_w, pt[1] / img_h] for pt in corners]
        else:
            quad = None

        # Prefer per-word entries; otherwise fall back to a single entry for the whole line.
        line_words = getattr(text_line, "words", None)
        if line_words:
            word_entries = [
                {"text": item.text, "confidence": item.confidence}
                for item in line_words
            ]
        else:
            word_entries = [
                {
                    "text": text_line.text,
                    "confidence": getattr(text_line, "confidence", 1.0),
                }
            ]

        page_blocks.append({
            "pageNumber": page_idx,
            "x": left / img_w,
            "y": top / img_h,
            "width": (right - left) / img_w,
            "height": (bottom - top) / img_h,
            "polygon": quad,
            "text": text_line.text,
            "words": word_entries,
        })

    return page_blocks
|
||||||
|
|
||||||
|
|
||||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||||
"""Run Surya OCR on a list of PIL images (one per page).
|
"""Run Surya OCR on a list of PIL images (one per page).
|
||||||
|
|
||||||
@@ -40,50 +88,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
|||||||
Returns a flat list of block dicts with pageNumber, x, y, width, height,
|
Returns a flat list of block dicts with pageNumber, x, y, width, height,
|
||||||
polygon, text, words. Coordinates are normalized to [0, 1].
|
polygon, text, words. Coordinates are normalized to [0, 1].
|
||||||
"""
|
"""
|
||||||
load_models()
|
|
||||||
|
|
||||||
all_blocks = []
|
all_blocks = []
|
||||||
|
|
||||||
for page_idx, image in enumerate(images):
|
for page_idx, image in enumerate(images):
|
||||||
page_w, page_h = image.size
|
all_blocks.extend(extract_page_blocks(image, page_idx, language))
|
||||||
|
|
||||||
# Process single page to limit peak memory
|
|
||||||
predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
|
|
||||||
page_pred = predictions[0]
|
|
||||||
|
|
||||||
for line in page_pred.text_lines:
|
|
||||||
bbox = line.bbox
|
|
||||||
x1, y1, x2, y2 = bbox
|
|
||||||
|
|
||||||
polygon = None
|
|
||||||
if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
|
|
||||||
polygon = [
|
|
||||||
[p[0] / page_w, p[1] / page_h]
|
|
||||||
for p in line.polygon
|
|
||||||
]
|
|
||||||
|
|
||||||
words = []
|
|
||||||
if hasattr(line, "words") and line.words:
|
|
||||||
for word in line.words:
|
|
||||||
words.append({
|
|
||||||
"text": word.text,
|
|
||||||
"confidence": word.confidence,
|
|
||||||
})
|
|
||||||
else:
|
|
||||||
words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
|
|
||||||
|
|
||||||
all_blocks.append({
|
|
||||||
"pageNumber": page_idx,
|
|
||||||
"x": x1 / page_w,
|
|
||||||
"y": y1 / page_h,
|
|
||||||
"width": (x2 - x1) / page_w,
|
|
||||||
"height": (y2 - y1) / page_h,
|
|
||||||
"polygon": polygon,
|
|
||||||
"text": line.text,
|
|
||||||
"words": words,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Free page image after processing
|
|
||||||
del image
|
del image
|
||||||
|
|
||||||
return all_blocks
|
return all_blocks
|
||||||
|
|||||||
132
ocr-service/test_engines.py
Normal file
132
ocr-service/test_engines.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
"""Tests for per-page block extraction in OCR engines."""
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from engines import surya, kraken
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Surya extract_page_blocks ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _make_surya_line(text, bbox, polygon=None, words=None):
|
||||||
|
"""Create a mock Surya text line with the expected attributes."""
|
||||||
|
line = MagicMock()
|
||||||
|
line.text = text
|
||||||
|
line.bbox = bbox
|
||||||
|
line.polygon = polygon
|
||||||
|
line.words = words or []
|
||||||
|
if not words:
|
||||||
|
line.confidence = 0.95
|
||||||
|
# hasattr check needs words to be falsy
|
||||||
|
del line.words
|
||||||
|
return line
|
||||||
|
|
||||||
|
|
||||||
|
def test_surya_extract_page_blocks_returns_blocks_for_single_image():
    """A single mocked text line on a 100x200 page yields one normalized block."""
    page = Image.new("RGB", (100, 200))

    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Hallo Welt", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            recognizer.return_value = [prediction]
            blocks = surya.extract_page_blocks(page, page_idx=2, language="de")

    assert len(blocks) == 1
    block = blocks[0]
    assert block["pageNumber"] == 2
    assert block["text"] == "Hallo Welt"
    assert block["x"] == 10 / 100
    assert block["y"] == 20 / 200
    assert block["width"] == 80 / 100
    assert block["height"] == 20 / 200
|
||||||
|
|
||||||
|
|
||||||
|
def test_surya_extract_blocks_delegates_to_extract_page_blocks():
    """After the refactor, extract_blocks still yields one block per page in page order."""
    pages = [Image.new("RGB", (100, 200)), Image.new("RGB", (100, 200))]

    prediction = MagicMock()
    prediction.text_lines = [_make_surya_line("Test", [10, 20, 90, 40])]

    with patch.object(surya, "_recognition_predictor") as recognizer:
        with patch.object(surya, "_loaded", True):
            # The same single-line prediction is returned for every page.
            recognizer.return_value = [prediction]
            blocks = surya.extract_blocks(pages)

    assert len(blocks) == 2
    assert [entry["pageNumber"] for entry in blocks] == [0, 1]
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Kraken extract_page_blocks ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _make_kraken_record(text, cuts, confidences=None):
|
||||||
|
record = MagicMock()
|
||||||
|
record.prediction = text
|
||||||
|
record.cuts = cuts
|
||||||
|
record.line = cuts
|
||||||
|
record.confidences = confidences or [0.9] * len(text)
|
||||||
|
return record
|
||||||
|
|
||||||
|
|
||||||
|
def _run_kraken_with_mocks(fn, *args):
    """Call ``fn(*args)`` with kraken's blla/rpred replaced through sys.modules.

    The stub ``rpred.rpred`` returns a single "Kurrent" record, and the
    engine's ``_model`` is patched to a mock for the duration of the call.
    The previous ``sys.modules`` entries for the three kraken names are put
    back afterwards, or removed if they did not exist before.
    """
    import sys

    blla_stub = MagicMock()
    blla_stub.segment.return_value = MagicMock()

    rpred_stub = MagicMock()
    rpred_stub.rpred.return_value = [
        _make_kraken_record("Kurrent", [(10, 20), (90, 20), (90, 40), (10, 40)])
    ]

    stubs = {
        "kraken": MagicMock(blla=blla_stub, rpred=rpred_stub),
        "kraken.blla": blla_stub,
        "kraken.rpred": rpred_stub,
    }
    # Remember what was registered under each name before installing the stubs.
    originals = {name: sys.modules.get(name) for name in stubs}
    sys.modules.update(stubs)

    try:
        with patch.object(kraken, "_model", MagicMock()):
            return fn(*args)
    finally:
        for name, module in originals.items():
            if module is None:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module
|
||||||
|
|
||||||
|
|
||||||
|
def test_kraken_extract_page_blocks_returns_blocks_for_single_image():
    """One mocked Kraken record produces one block carrying the given page index."""
    page = Image.new("RGB", (100, 200))

    blocks = _run_kraken_with_mocks(kraken.extract_page_blocks, page, 3, "de")

    assert len(blocks) == 1
    block = blocks[0]
    assert block["pageNumber"] == 3
    assert block["text"] == "Kurrent"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kraken_extract_blocks_delegates_to_extract_page_blocks():
    """The batch entry point yields one block per page, numbered from zero."""
    pages = [Image.new("RGB", (100, 200)), Image.new("RGB", (100, 200))]

    blocks = _run_kraken_with_mocks(kraken.extract_blocks, pages)

    assert len(blocks) == 2
    assert [entry["pageNumber"] for entry in blocks] == [0, 1]
|
||||||
Reference in New Issue
Block a user