refactor(ocr): extract extract_page_blocks() from both OCR engines

Enable per-page processing by extracting the inner loop body of
extract_blocks() into extract_page_blocks(image, page_idx, language).
The original extract_blocks() now delegates to the new function,
preserving backward compatibility for the batch path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 09:56:34 +02:00
parent d8dcba1a71
commit b7d5f71ef7
3 changed files with 231 additions and 89 deletions

View File

@@ -29,12 +29,10 @@ def is_available() -> bool:
return _model is not None
def extract_blocks(images: list, language: str = "de") -> list[dict]:
"""Run Kraken segmentation + recognition on a list of PIL images.
def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Kraken segmentation + recognition on a single PIL image.

    Args:
        image: Page image; ``image.size`` must yield ``(width, height)``.
        page_idx: Value stored as ``pageNumber`` on every returned block.
        language: Not read by this function.

    Returns:
        One dict per recognized line with the keys ``pageNumber``, ``x``,
        ``y``, ``width``, ``height``, ``polygon``, ``text`` and ``words``.
        ``polygon`` is the quadrilateral returned by ``_approximate_to_quad``,
        or ``None`` when the record has no cuts. Box coordinates are divided
        by the page size, i.e. normalized to [0, 1].

    Raises:
        RuntimeError: If the Kraken model has not been loaded.
    """
    from kraken import blla, rpred
    from confidence import words_from_characters

    if _model is None:
        raise RuntimeError("Kraken model is not loaded")

    page_w, page_h = image.size
    blocks = []
    baseline_seg = blla.segment(image)

    for record in rpred.rpred(_model, image, baseline_seg):
        # NOTE(review): record.cuts is treated as a flat list of (x, y)
        # points -- confirm against the installed Kraken version.
        polygon_pts = getattr(record, "cuts", None) or []

        if polygon_pts:
            # Axis-aligned bounding box of the polygon.
            xs = [p[0] for p in polygon_pts]
            ys = [p[1] for p in polygon_pts]
            x1, y1 = min(xs), min(ys)
            x2, y2 = max(xs), max(ys)
            quad = _approximate_to_quad(polygon_pts, page_w, page_h)
        else:
            # No cuts: fall back to the baseline, padded by 5 image units
            # above and below so the box has some height. The padding is
            # clamped to the page so the normalized box stays inside [0, 1].
            xs = [p[0] for p in record.line]
            ys = [p[1] for p in record.line]
            x1, x2 = min(xs), max(xs)
            y1 = max(min(ys) - 5, 0)
            y2 = min(max(ys) + 5, page_h)
            quad = None

        # Word-level entries derived from the per-character confidences.
        char_confidences = getattr(record, "confidences", [])
        words = words_from_characters(record.prediction, char_confidences)

        blocks.append({
            "pageNumber": page_idx,
            "x": x1 / page_w,
            "y": y1 / page_h,
            "width": (x2 - x1) / page_w,
            "height": (y2 - y1) / page_h,
            "polygon": quad,
            "text": record.prediction,
            "words": words,
        })

    return blocks
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Kraken segmentation + recognition on a list of PIL images.

    Each image is passed to ``extract_page_blocks`` together with its 0-based
    position in ``images``; the per-page results are concatenated in page
    order. All segmentation and recognition work happens in
    ``extract_page_blocks`` -- this function only aggregates.

    Args:
        images: Page images, one per page.
        language: Forwarded unchanged to ``extract_page_blocks``.

    Returns:
        A flat list of the block dicts returned by ``extract_page_blocks``
        for every page; empty when ``images`` is empty.
    """
    all_blocks = []
    for page_idx, image in enumerate(images):
        all_blocks.extend(extract_page_blocks(image, page_idx, language))
    return all_blocks

View File

@@ -33,6 +33,54 @@ def load_models():
logger.info("Surya models loaded successfully")
def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Surya OCR on one PIL image and build the block dicts for that page.

    Args:
        image: Page image; ``image.size`` must yield ``(width, height)``.
        page_idx: Value stored as ``pageNumber`` on every returned block.
        language: Not read by this function.

    Returns:
        One dict per detected text line with the keys ``pageNumber``, ``x``,
        ``y``, ``width``, ``height``, ``polygon``, ``text`` and ``words``.
        Box and polygon coordinates are divided by the page size, i.e.
        normalized to [0, 1].
    """
    load_models()
    width, height = image.size

    # The predictor takes a batch; send the single page and take its result.
    page_pred = _recognition_predictor(
        [image], det_predictor=_detection_predictor
    )[0]

    def _to_block(line) -> dict:
        """Convert one predicted text line into a normalized block dict."""
        left, top, right, bottom = line.bbox

        # Only 4-point polygons are kept; anything else is reported as None.
        corners = getattr(line, "polygon", None)
        if corners and len(corners) == 4:
            normalized = [[pt[0] / width, pt[1] / height] for pt in corners]
        else:
            normalized = None

        # Prefer per-word results; otherwise report the whole line as a
        # single entry, defaulting its confidence to 1.0 when absent.
        line_words = getattr(line, "words", None)
        if line_words:
            word_entries = [
                {"text": w.text, "confidence": w.confidence}
                for w in line_words
            ]
        else:
            word_entries = [{
                "text": line.text,
                "confidence": getattr(line, "confidence", 1.0),
            }]

        return {
            "pageNumber": page_idx,
            "x": left / width,
            "y": top / height,
            "width": (right - left) / width,
            "height": (bottom - top) / height,
            "polygon": normalized,
            "text": line.text,
            "words": word_entries,
        }

    return [_to_block(line) for line in page_pred.text_lines]
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Each image is passed to ``extract_page_blocks`` together with its 0-based
    position in ``images``, so pages are processed one at a time, and the
    per-page results are concatenated in page order.

    Args:
        images: Page images, one per page.
        language: Forwarded unchanged to ``extract_page_blocks``.

    Returns:
        A flat list of block dicts with pageNumber, x, y, width, height,
        polygon, text, words. Coordinates are normalized to [0, 1].
    """
    # extract_page_blocks() calls load_models() as well; calling it here keeps
    # the models loaded up front even when ``images`` is empty.
    load_models()

    all_blocks = []
    for page_idx, image in enumerate(images):
        all_blocks.extend(extract_page_blocks(image, page_idx, language))
        # Drop this loop's reference to the page. This releases the image
        # only if nothing else (such as the caller's list) still refers to it.
        del image
    return all_blocks