feat(ocr): full OCR pipeline with polygon annotations, training, and guided mode #232

Merged
marcel merged 40 commits from feat/issue-226-227-ocr-pipeline-polygon into main 2026-04-14 10:31:35 +02:00
Showing only changes of commit 051c43f088 - Show all commits

View File

@@ -82,10 +82,13 @@ def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict
def extract_region_text(image, x: float, y: float, w: float, h: float) -> str:
"""Crop image to a normalized region and run Kraken recognition on the crop.
Used for guided OCR — skips full-page layout detection and only processes
the given bounding box. Coordinates are normalized to [0, 1].
Used for guided OCR — skips full-page layout detection entirely.
A single synthetic baseline spanning the full crop width is used so that
blla.segment() (which crashes on small crops) is never called.
Coordinates are normalized to [0, 1].
"""
from kraken import blla, rpred
from kraken import rpred
from kraken.containers import Segmentation, BaselineLine
if _model is None:
raise RuntimeError("Kraken model is not loaded")
@@ -97,8 +100,28 @@ def extract_region_text(image, x: float, y: float, w: float, h: float) -> str:
y2 = min(ph, int((y + h) * ph))
crop = image.crop((x1, y1, x2, y2))
baseline_seg = blla.segment(crop)
pred_it = rpred.rpred(_model, crop, baseline_seg)
cw, ch = crop.size
if cw == 0 or ch == 0:
return ""
# Single synthetic baseline at vertical midpoint, spanning full crop width
mid_y = ch // 2
synthetic_seg = Segmentation(
type="baselines",
imagename="",
text_direction="horizontal-lr",
script_detection=False,
lines=[
BaselineLine(
id="line0",
baseline=[(0, mid_y), (cw, mid_y)],
boundary=[(0, 0), (cw, 0), (cw, ch), (0, ch)],
)
],
regions={},
line_orders=[],
)
pred_it = rpred.rpred(_model, crop, synthetic_seg)
return " ".join(r.prediction for r in pred_it)