refactor(ocr): extract extract_page_blocks() from both OCR engines
Enable per-page processing by extracting the inner loop body of extract_blocks() into extract_page_blocks(image, page_idx, language). The original extract_blocks() now delegates to the new function, preserving backward compatibility for the batch path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,54 @@ def load_models():
|
||||
logger.info("Surya models loaded successfully")
|
||||
|
||||
|
||||
def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Surya OCR on a single PIL image and return block dicts for that page.

    Args:
        image: Page image. Its ``size`` is unpacked as ``(width, height)`` and
            the image itself is passed to the recognition predictor.
        page_idx: Stored unchanged under ``"pageNumber"`` in every block.
        language: Accepted as part of the signature; this function does not
            read it.

    Returns:
        One dict per recognized text line with the keys ``pageNumber``, ``x``,
        ``y``, ``width``, ``height``, ``polygon``, ``text`` and ``words``.
        Box and polygon coordinates are divided by the page width/height,
        i.e. normalized to [0, 1].
    """
    load_models()

    width, height = image.size

    # The predictor is called with a one-element batch; only the first
    # prediction is used.
    result = _recognition_predictor([image], det_predictor=_detection_predictor)[0]

    page_blocks = []
    for text_line in result.text_lines:
        left, top, right, bottom = text_line.bbox

        # A polygon is kept only when it is present, non-empty and has
        # exactly four points; otherwise the block carries ``None``.
        quad = getattr(text_line, "polygon", None)
        if quad and len(quad) == 4:
            norm_quad = [[pt[0] / width, pt[1] / height] for pt in quad]
        else:
            norm_quad = None

        # Use per-word results when the line has them; otherwise fall back to
        # a single entry built from the whole line (confidence defaults to 1.0).
        line_words = getattr(text_line, "words", None)
        if line_words:
            word_entries = [
                {"text": w.text, "confidence": w.confidence}
                for w in line_words
            ]
        else:
            word_entries = [
                {
                    "text": text_line.text,
                    "confidence": getattr(text_line, "confidence", 1.0),
                }
            ]

        page_blocks.append({
            "pageNumber": page_idx,
            "x": left / width,
            "y": top / height,
            "width": (right - left) / width,
            "height": (bottom - top) / height,
            "polygon": norm_quad,
            "text": text_line.text,
            "words": word_entries,
        })

    return page_blocks
|
||||
|
||||
|
||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Every page is handed to ``extract_page_blocks`` on its own, so the OCR
    call always receives a single image rather than the whole batch.

    Args:
        images: Page images in document order. The position of an image in
            this list becomes the ``pageNumber`` of its blocks.
        language: Forwarded unchanged to ``extract_page_blocks``.

    Returns:
        A flat list of block dicts with pageNumber, x, y, width, height,
        polygon, text, words. Coordinates are normalized to [0, 1].
        An empty ``images`` list yields an empty list.
    """
    all_blocks = []

    for page_idx, image in enumerate(images):
        # Delegate the per-page work exactly once; the page must not also be
        # processed inline here, or every block would be emitted twice.
        all_blocks.extend(extract_page_blocks(image, page_idx, language))
        # Drops only this loop's reference to the page. ``images`` still
        # refers to it, so the memory is released only once the caller lets
        # go of the list (or the entry) as well.
        del image

    return all_blocks
|
||||
|
||||
Reference in New Issue
Block a user