feat(ocr): full OCR pipeline with polygon annotations, training, and guided mode #232
@@ -82,10 +82,13 @@ def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict
|
||||
def extract_region_text(image, x: float, y: float, w: float, h: float) -> str:
|
||||
"""Crop image to a normalized region and run Kraken recognition on the crop.
|
||||
|
||||
Used for guided OCR — skips full-page layout detection and only processes
|
||||
the given bounding box. Coordinates are normalized to [0, 1].
|
||||
Used for guided OCR — skips full-page layout detection entirely.
|
||||
A single synthetic baseline spanning the full crop width is used so that
|
||||
blla.segment() (which crashes on small crops) is never called.
|
||||
Coordinates are normalized to [0, 1].
|
||||
"""
|
||||
from kraken import blla, rpred
|
||||
from kraken import rpred
|
||||
from kraken.containers import Segmentation, BaselineLine
|
||||
|
||||
if _model is None:
|
||||
raise RuntimeError("Kraken model is not loaded")
|
||||
@@ -97,8 +100,28 @@ def extract_region_text(image, x: float, y: float, w: float, h: float) -> str:
|
||||
y2 = min(ph, int((y + h) * ph))
|
||||
crop = image.crop((x1, y1, x2, y2))
|
||||
|
||||
baseline_seg = blla.segment(crop)
|
||||
pred_it = rpred.rpred(_model, crop, baseline_seg)
|
||||
cw, ch = crop.size
|
||||
if cw == 0 or ch == 0:
|
||||
return ""
|
||||
|
||||
# Single synthetic baseline at vertical midpoint, spanning full crop width
|
||||
mid_y = ch // 2
|
||||
synthetic_seg = Segmentation(
|
||||
type="baselines",
|
||||
imagename="",
|
||||
text_direction="horizontal-lr",
|
||||
script_detection=False,
|
||||
lines=[
|
||||
BaselineLine(
|
||||
id="line0",
|
||||
baseline=[(0, mid_y), (cw, mid_y)],
|
||||
boundary=[(0, 0), (cw, 0), (cw, ch), (0, ch)],
|
||||
)
|
||||
],
|
||||
regions={},
|
||||
line_orders=[],
|
||||
)
|
||||
pred_it = rpred.rpred(_model, crop, synthetic_seg)
|
||||
return " ".join(r.prediction for r in pred_it)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user