feat(ocr): bump to latest surya 0.17.1, kraken 7.0, torch 2.7.1
- surya-ocr 0.6.3 → 0.17.1: new predictor API (FoundationPredictor, RecognitionPredictor, DetectionPredictor), native polygon output on text lines (4-point clockwise)
- kraken 6.0.3 → 7.0: wider torch range (>=2.4,<=2.10), unpinned numpy
- torch 2.5.1 → 2.7.1: satisfies surya's >=2.7.0 requirement
- Rewrite engines/surya.py for the 0.17 predictor class API
- Surya now outputs polygons natively — no longer rectangle-only

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
|
|
||||||
# PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved)
|
# PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved)
|
||||||
RUN pip install --no-cache-dir \
|
RUN pip install --no-cache-dir \
|
||||||
torch==2.5.1 \
|
torch==2.7.1 \
|
||||||
--index-url https://download.pytorch.org/whl/cpu
|
--index-url https://download.pytorch.org/whl/cpu
|
||||||
|
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
|
|||||||
@@ -4,28 +4,23 @@ import logging
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Lazy-loaded at startup via load_models()
|
_recognition_predictor = None
|
||||||
_recognition_model = None
|
_detection_predictor = None
|
||||||
_recognition_processor = None
|
|
||||||
_detection_model = None
|
|
||||||
_detection_processor = None
|
|
||||||
|
|
||||||
|
|
||||||
def load_models():
|
def load_models():
|
||||||
"""Eagerly load Surya models into memory. Called once at container startup."""
|
"""Eagerly load Surya models into memory. Called once at container startup."""
|
||||||
global _recognition_model, _recognition_processor, _detection_model, _detection_processor
|
global _recognition_predictor, _detection_predictor
|
||||||
|
|
||||||
logger.info("Loading Surya models...")
|
logger.info("Loading Surya models...")
|
||||||
|
|
||||||
from surya.model.detection.model import load_model as load_det_model
|
from surya.foundation import FoundationPredictor
|
||||||
from surya.model.detection.model import load_processor as load_det_processor
|
from surya.recognition import RecognitionPredictor
|
||||||
from surya.model.recognition.model import load_model as load_rec_model
|
from surya.detection import DetectionPredictor
|
||||||
from surya.model.recognition.processor import load_processor as load_rec_processor
|
|
||||||
|
|
||||||
_detection_model = load_det_model()
|
foundation_predictor = FoundationPredictor()
|
||||||
_detection_processor = load_det_processor()
|
_recognition_predictor = RecognitionPredictor(foundation_predictor)
|
||||||
_recognition_model = load_rec_model()
|
_detection_predictor = DetectionPredictor()
|
||||||
_recognition_processor = load_rec_processor()
|
|
||||||
|
|
||||||
logger.info("Surya models loaded successfully")
|
logger.info("Surya models loaded successfully")
|
||||||
|
|
||||||
@@ -33,33 +28,36 @@ def load_models():
|
|||||||
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
def extract_blocks(images: list, language: str = "de") -> list[dict]:
|
||||||
"""Run Surya OCR on a list of PIL images (one per page).
|
"""Run Surya OCR on a list of PIL images (one per page).
|
||||||
|
|
||||||
Returns a flat list of block dicts with pageNumber, x, y, width, height, text.
|
Returns a flat list of block dicts with pageNumber, x, y, width, height,
|
||||||
Coordinates are normalized to [0, 1] relative to page dimensions.
|
polygon, text. Coordinates are normalized to [0, 1] relative to page dimensions.
|
||||||
|
Surya 0.17+ returns polygon (4-point) natively on each text line.
|
||||||
"""
|
"""
|
||||||
from surya.detection import batch_text_detection
|
|
||||||
from surya.recognition import batch_recognition
|
|
||||||
|
|
||||||
all_blocks = []
|
all_blocks = []
|
||||||
|
|
||||||
for page_idx, image in enumerate(images):
|
predictions = _recognition_predictor(images, det_predictor=_detection_predictor)
|
||||||
page_w, page_h = image.size
|
|
||||||
|
|
||||||
det_predictions = batch_text_detection([image], _detection_model, _detection_processor)
|
for page_idx, page_pred in enumerate(predictions):
|
||||||
rec_predictions = batch_recognition(
|
page_w, page_h = images[page_idx].size
|
||||||
[image], det_predictions, _recognition_model, _recognition_processor, [language]
|
|
||||||
)
|
|
||||||
|
|
||||||
for line in rec_predictions[0].text_lines:
|
for line in page_pred.text_lines:
|
||||||
bbox = line.bbox # [x1, y1, x2, y2] in pixel coordinates
|
bbox = line.bbox # [x1, y1, x2, y2] in pixel coordinates
|
||||||
x1, y1, x2, y2 = bbox
|
x1, y1, x2, y2 = bbox
|
||||||
|
|
||||||
|
# Surya 0.17 provides polygon as list of (x, y) tuples (4 points, clockwise)
|
||||||
|
polygon = None
|
||||||
|
if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
|
||||||
|
polygon = [
|
||||||
|
[p[0] / page_w, p[1] / page_h]
|
||||||
|
for p in line.polygon
|
||||||
|
]
|
||||||
|
|
||||||
all_blocks.append({
|
all_blocks.append({
|
||||||
"pageNumber": page_idx,
|
"pageNumber": page_idx,
|
||||||
"x": x1 / page_w,
|
"x": x1 / page_w,
|
||||||
"y": y1 / page_h,
|
"y": y1 / page_h,
|
||||||
"width": (x2 - x1) / page_w,
|
"width": (x2 - x1) / page_w,
|
||||||
"height": (y2 - y1) / page_h,
|
"height": (y2 - y1) / page_h,
|
||||||
"polygon": None,
|
"polygon": polygon,
|
||||||
"text": line.text,
|
"text": line.text,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
fastapi[standard]==0.115.6
|
fastapi[standard]==0.115.6
|
||||||
surya-ocr==0.6.3
|
surya-ocr==0.17.1
|
||||||
kraken==6.0.3
|
kraken==7.0
|
||||||
pillow>=10.2.0,<11.0.0
|
pillow>=10.2.0,<11.0.0
|
||||||
pypdfium2==4.30.0
|
pypdfium2==4.30.0
|
||||||
httpx==0.28.1
|
httpx==0.28.1
|
||||||
|
|||||||
Reference in New Issue
Block a user