refactor(ocr): extract extract_page_blocks() from both OCR engines
Enable per-page processing by extracting the inner loop body of extract_blocks() into extract_page_blocks(image, page_idx, language). The original extract_blocks() now delegates to the new function, preserving backward compatibility for the batch path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -29,12 +29,10 @@ def is_available() -> bool:
|
||||
return _model is not None
|
||||
|
||||
|
||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||
"""Run Kraken segmentation + recognition on a list of PIL images.
|
||||
def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Kraken segmentation + recognition on a single PIL image.

    Args:
        image: Page image. Only ``image.size`` is read here; the image itself
            is handed to Kraken's segmenter and recognizer.
        page_idx: Value stored as ``pageNumber`` on every returned block.
        language: Accepted so the signature mirrors ``extract_blocks``; this
            function does not use it.

    Returns:
        One dict per recognized line with the keys ``pageNumber``, ``x``,
        ``y``, ``width``, ``height``, ``polygon``, ``text`` and ``words``.
        ``x``/``width`` are divided by the page width and ``y``/``height`` by
        the page height. ``polygon`` is the result of ``_approximate_to_quad``
        or ``None`` when the record carries no polygon points.

    Raises:
        RuntimeError: If the Kraken model has not been loaded.
    """
    from kraken import blla, rpred
    from confidence import words_from_characters

    if _model is None:
        raise RuntimeError("Kraken model is not loaded")

    # Vertical padding in pixels added around a bare baseline, which has
    # (almost) no height of its own.
    baseline_pad = 5

    page_w, page_h = image.size
    blocks = []

    baseline_seg = blla.segment(image)
    pred_it = rpred.rpred(_model, image, baseline_seg)

    for record in pred_it:
        # Same lookup style as ``confidences`` below: a record without the
        # attribute is treated as having no polygon points.
        polygon_pts = getattr(record, "cuts", [])

        if polygon_pts:
            # Axis-aligned bounding box of the polygon points.
            xs = [p[0] for p in polygon_pts]
            ys = [p[1] for p in polygon_pts]
            x1, y1 = min(xs), min(ys)
            x2, y2 = max(xs), max(ys)
            quad = _approximate_to_quad(polygon_pts, page_w, page_h)
        else:
            # Fall back to the line baseline. The padded box is clamped to the
            # page so a baseline close to the top or bottom edge cannot yield
            # normalized coordinates outside [0, 1].
            xs = [p[0] for p in record.line]
            ys = [p[1] for p in record.line]
            x1, x2 = min(xs), max(xs)
            y1 = max(min(ys) - baseline_pad, 0)
            y2 = min(max(ys) + baseline_pad, page_h)
            quad = None

        # Per-character confidences are grouped into words by the helper;
        # records without confidences pass an empty list.
        char_confidences = getattr(record, "confidences", [])
        words = words_from_characters(record.prediction, char_confidences)

        blocks.append({
            "pageNumber": page_idx,
            "x": x1 / page_w,
            "y": y1 / page_h,
            "width": (x2 - x1) / page_w,
            "height": (y2 - y1) / page_h,
            "polygon": quad,
            "text": record.prediction,
            "words": words,
        })

    return blocks
|
||||
|
||||
|
||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Kraken segmentation + recognition on a list of PIL images.

    Batch entry point: every image is passed to ``extract_page_blocks``
    together with its position in ``images``, and the per-page results are
    concatenated in page order.

    Args:
        images: Page images, in page order.
        language: Forwarded unchanged to ``extract_page_blocks``.

    Returns:
        Block dicts with pageNumber, x, y, width, height, polygon, text and
        words. ``pageNumber`` is the zero-based index of the image in
        ``images``. An empty input yields an empty list.
    """
    all_blocks: list[dict] = []

    # The per-page work lives in extract_page_blocks only; repeating it here
    # would emit every block twice.
    for page_idx, image in enumerate(images):
        all_blocks.extend(extract_page_blocks(image, page_idx, language))

    return all_blocks
|
||||
|
||||
|
||||
Reference in New Issue
Block a user