feat(ocr): integrate spell-check post-processing for handwriting script types

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-17 16:54:17 +02:00
parent 092131930c
commit 77100ab1e6

View File

@@ -22,6 +22,7 @@ from fastapi.responses import StreamingResponse
from PIL import Image
from confidence import apply_confidence_markers, get_threshold
from spell_check import correct_text, load_spell_checker
from engines import kraken as kraken_engine
from engines import surya as surya_engine
from models import OcrBlock, OcrRequest
@@ -39,6 +40,8 @@ ALLOWED_PDF_HOSTS = set(
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
)
# Script types whose OCR text is passed through correct_text() before being returned.
_SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
def _validate_url(url: str) -> None:
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
@@ -55,6 +58,7 @@ async def lifespan(app: FastAPI):
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
kraken_engine.load_models()
load_spell_checker()
_models_ready = True
logger.info("Startup complete — ready to accept requests")
@@ -109,6 +113,8 @@ async def run_ocr(request: OcrRequest):
if block.get("words"):
block["text"] = apply_confidence_markers(block["words"], threshold)
block.pop("words", None)
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
block["text"] = correct_text(block["text"])
return [OcrBlock(**b) for b in blocks]
@@ -170,6 +176,8 @@ async def run_ocr_stream(request: OcrRequest):
engine.extract_region_text, image,
region.x, region.y, region.width, region.height,
)
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
text = correct_text(text)
blocks.append({
"pageNumber": page_idx,
"x": region.x,
@@ -231,6 +239,8 @@ async def run_ocr_stream(request: OcrRequest):
if block.get("words"):
block["text"] = apply_confidence_markers(block["words"], threshold)
block.pop("words", None)
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
block["text"] = correct_text(block["text"])
total_blocks += len(blocks)
yield json.dumps({