feat: OCR pipeline with NDJSON streaming and real-time progress (#226, #227, #231) #229

Merged
marcel merged 74 commits from feat/issue-226-227-ocr-pipeline-polygon into main 2026-04-13 12:39:04 +02:00
3 changed files with 31 additions and 17 deletions
Showing only changes of commit 902d423f3c - Show all commits

View File

@@ -78,14 +78,16 @@ services:
dockerfile: Dockerfile
container_name: archive-ocr
restart: unless-stopped
mem_limit: 10g
memswap_limit: 10g
mem_limit: 6g
memswap_limit: 6g
volumes:
- ocr_models:/app/models
environment:
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
OCR_CONFIDENCE_THRESHOLD: "0.3"
OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5"
RECOGNITION_BATCH_SIZE: "1"
DETECTOR_BATCH_SIZE: "1"
networks:
- archive-net
healthcheck:

View File

@@ -6,13 +6,20 @@ logger = logging.getLogger(__name__)
_recognition_predictor = None
_detection_predictor = None
_loaded = False
def load_models():
"""Eagerly load Surya models into memory. Called once at container startup."""
global _recognition_predictor, _detection_predictor
"""Lazy-load Surya models on first use to save RAM at idle.
logger.info("Loading Surya models...")
Called automatically by extract_blocks(). Can also be called explicitly
to pre-warm if desired.
"""
global _recognition_predictor, _detection_predictor, _loaded
if _loaded:
return
logger.info("Loading Surya models (lazy, first OCR request)...")
from surya.foundation import FoundationPredictor
from surya.recognition import RecognitionPredictor
@@ -21,6 +28,7 @@ def load_models():
foundation_predictor = FoundationPredictor()
_recognition_predictor = RecognitionPredictor(foundation_predictor)
_detection_predictor = DetectionPredictor()
_loaded = True
logger.info("Surya models loaded successfully")
@@ -28,22 +36,25 @@ def load_models():
def extract_blocks(images: list, language: str = "de") -> list[dict]:
"""Run Surya OCR on a list of PIL images (one per page).
Processes one page at a time to limit peak memory usage.
Returns a flat list of block dicts with pageNumber, x, y, width, height,
polygon, text. Coordinates are normalized to [0, 1] relative to page dimensions.
Surya 0.17+ returns polygon (4-point) natively on each text line.
polygon, text, words. Coordinates are normalized to [0, 1].
"""
load_models()
all_blocks = []
predictions = _recognition_predictor(images, det_predictor=_detection_predictor)
for page_idx, image in enumerate(images):
page_w, page_h = image.size
for page_idx, page_pred in enumerate(predictions):
page_w, page_h = images[page_idx].size
# Process single page to limit peak memory
predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
page_pred = predictions[0]
for line in page_pred.text_lines:
bbox = line.bbox # [x1, y1, x2, y2] in pixel coordinates
bbox = line.bbox
x1, y1, x2, y2 = bbox
# Surya 0.17 provides polygon as list of (x, y) tuples (4 points, clockwise)
polygon = None
if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
polygon = [
@@ -51,7 +62,6 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
for p in line.polygon
]
# Extract word-level confidence for [unleserlich] marking
words = []
if hasattr(line, "words") and line.words:
for word in line.words:
@@ -73,4 +83,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
"words": words,
})
# Drop the loop variable's reference to the page image (the `images` list still references it, so this alone frees no memory)
del image
return all_blocks

View File

@@ -22,14 +22,13 @@ _models_ready = False
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Load all OCR models at startup before accepting requests."""
"""Load lightweight models at startup. Surya loads lazily on first request."""
global _models_ready
logger.info("Loading OCR models at startup...")
surya_engine.load_models()
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
kraken_engine.load_models()
_models_ready = True
logger.info("All OCR models loaded — ready to accept requests")
logger.info("Startup complete — ready to accept requests")
yield