Files
familienarchiv/ocr-service/engines/surya.py
Marcel ee58b63517 feat(ocr): add guided OCR mode using existing annotation regions
When a document has manually drawn annotation boxes, the user can now
enable "Nur annotierte Bereiche" in the OCR trigger panel. The engine
skips layout detection entirely and runs recognition only within the
pre-drawn bounding boxes, preserving manual transcription blocks.

- Python: adds OcrRegion model, extends OcrRequest/OcrBlock; guided
  branch in /ocr/stream groups regions by page and crops each one
  (see the sketch after this list)
- Engines: add extract_region_text() to both Kraken and Surya
- Java: adds OcrBlockResult.annotationId, OcrClient.OcrRegion,
  TriggerOcrDTO.useExistingAnnotations; OcrAsyncRunner dispatches to
  upsertGuidedBlock when annotationId is present; OcrService threads
  the flag through to runSingleDocument
- TranscriptionService: adds upsertGuidedBlock (creates new blocks,
  updates existing OCR blocks, or preserves MANUAL blocks)
- Frontend: guided OCR toggle in OcrTrigger shown when blocks exist;
  skips destructive-replace confirmation in guided mode
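
For reference, a minimal sketch of the guided branch described above, in the
service's Python. The OcrRegion field names (pageNumber, x, y, width, height,
annotationId) and the run_guided_ocr helper are assumptions for illustration,
not the actual /ocr/stream code:

    from collections import defaultdict

    def run_guided_ocr(pages, regions, engine):
        """pages: list of PIL images (1-based page numbers); regions: OcrRegion-like dicts."""
        by_page = defaultdict(list)
        for r in regions:
            by_page[r["pageNumber"]].append(r)

        blocks = []
        for page_no, page_regions in sorted(by_page.items()):
            image = pages[page_no - 1]
            for r in page_regions:
                # No layout detection: recognize only inside the pre-drawn box.
                text = engine.extract_region_text(
                    image, r["x"], r["y"], r["width"], r["height"]
                )
                blocks.append({
                    "pageNumber": page_no,
                    "x": r["x"], "y": r["y"],
                    "width": r["width"], "height": r["height"],
                    "text": text,
                    # annotationId lets the Java side upsert the matching block
                    # instead of replacing the whole transcription.
                    "annotationId": r["annotationId"],
                })
        return blocks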

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 15:57:54 +02:00


"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting."""
import logging

logger = logging.getLogger(__name__)

_recognition_predictor = None
_detection_predictor = None
_loaded = False


def load_models():
    """Lazy-load Surya models on first use to save RAM at idle.

    Called automatically by extract_blocks(). Can also be called explicitly
    to pre-warm if desired.
    """
    global _recognition_predictor, _detection_predictor, _loaded
    if _loaded:
        return
    logger.info("Loading Surya models (lazy, first OCR request)...")
    # Import here so the heavy Surya dependencies are only pulled in on the
    # first OCR request, not at service startup.
    from surya.foundation import FoundationPredictor
    from surya.recognition import RecognitionPredictor
    from surya.detection import DetectionPredictor

    foundation_predictor = FoundationPredictor()
    _recognition_predictor = RecognitionPredictor(foundation_predictor)
    _detection_predictor = DetectionPredictor()
    _loaded = True
    logger.info("Surya models loaded successfully")


def extract_page_blocks(image, page_idx: int, language: str = "de") -> list[dict]:
    """Run Surya OCR on a single PIL image and return block dicts for that page.

    Coordinates are normalized to [0, 1].
    """
    load_models()
    page_w, page_h = image.size
    blocks = []
    predictions = _recognition_predictor([image], det_predictor=_detection_predictor)
    page_pred = predictions[0]
    for line in page_pred.text_lines:
        x1, y1, x2, y2 = line.bbox
        polygon = None
        if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4:
            polygon = [
                [p[0] / page_w, p[1] / page_h]
                for p in line.polygon
            ]
        words = []
        if hasattr(line, "words") and line.words:
            for word in line.words:
                words.append({
                    "text": word.text,
                    "confidence": word.confidence,
                })
        else:
            # No word-level output: fall back to one pseudo-word spanning the line.
            words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
        blocks.append({
            "pageNumber": page_idx,
            "x": x1 / page_w,
            "y": y1 / page_h,
            "width": (x2 - x1) / page_w,
            "height": (y2 - y1) / page_h,
            "polygon": polygon,
            "text": line.text,
            "words": words,
        })
    return blocks


def extract_region_text(image, x: float, y: float, w: float, h: float) -> str:
    """Crop image to a normalized region and run Surya recognition on the crop.

    Used for guided OCR — skips full-page layout detection and only processes
    the given bounding box. Coordinates are normalized to [0, 1].
    """
    load_models()
    pw, ph = image.size
    # Convert normalized coordinates to pixels and clamp to the image bounds.
    x1 = max(0, int(x * pw))
    y1 = max(0, int(y * ph))
    x2 = min(pw, int((x + w) * pw))
    y2 = min(ph, int((y + h) * ph))
    crop = image.crop((x1, y1, x2, y2))
    predictions = _recognition_predictor([crop], det_predictor=_detection_predictor)
    return " ".join(line.text for line in predictions[0].text_lines)


def extract_blocks(images: list, language: str = "de") -> list[dict]:
    """Run Surya OCR on a list of PIL images (one per page).

    Processes one page at a time to limit peak memory usage.
    Returns a flat list of block dicts with pageNumber, x, y, width, height,
    polygon, text, words. Coordinates are normalized to [0, 1].
    """
    all_blocks = []
    for page_idx, image in enumerate(images, start=1):
        all_blocks.extend(extract_page_blocks(image, page_idx, language))
        # Drop the reference promptly so the decoded page image can be freed.
        del image
    return all_blocks
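

if __name__ == "__main__":
    # Hypothetical smoke test (not part of the original module): exercises the
    # full-page path and the guided, region-only path. The image path and the
    # region coordinates below are made-up illustration values.
    from PIL import Image

    page = Image.open("sample_scan.png").convert("RGB")

    # Full page: layout detection + recognition, blocks with normalized coords.
    blocks = extract_blocks([page])
    print(f"{len(blocks)} blocks detected on page 1")

    # Guided: recognition only, inside one pre-drawn normalized bounding box.
    print(extract_region_text(page, x=0.1, y=0.2, w=0.6, h=0.08))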