From 77100ab1e65ace1311f31db70ca54b53596c3317 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 17 Apr 2026 16:54:17 +0200 Subject: [PATCH] feat(ocr): integrate spell-check post-processing for handwriting script types Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index 370dd999..77c5c33f 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -22,6 +22,7 @@ from fastapi.responses import StreamingResponse from PIL import Image from confidence import apply_confidence_markers, get_threshold +from spell_check import correct_text, load_spell_checker from engines import kraken as kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest @@ -39,6 +40,8 @@ ALLOWED_PDF_HOSTS = set( h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",") ) +_SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"} + def _validate_url(url: str) -> None: """Validate that the PDF URL points to an allowed host (SSRF protection).""" @@ -55,6 +58,7 @@ async def lifespan(app: FastAPI): logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...") kraken_engine.load_models() + load_spell_checker() _models_ready = True logger.info("Startup complete — ready to accept requests") @@ -109,6 +113,8 @@ async def run_ocr(request: OcrRequest): if block.get("words"): block["text"] = apply_confidence_markers(block["words"], threshold) block.pop("words", None) + if script_type in _SPELL_CHECK_SCRIPT_TYPES: + block["text"] = correct_text(block["text"]) return [OcrBlock(**b) for b in blocks] @@ -170,6 +176,8 @@ async def run_ocr_stream(request: OcrRequest): engine.extract_region_text, image, region.x, region.y, region.width, region.height, ) + if script_type in _SPELL_CHECK_SCRIPT_TYPES: + text = correct_text(text) blocks.append({ "pageNumber": page_idx, "x": region.x, @@ -231,6 +239,8 @@ async def run_ocr_stream(request: OcrRequest): if block.get("words"): block["text"] = apply_confidence_markers(block["words"], threshold) block.pop("words", None) + if script_type in _SPELL_CHECK_SCRIPT_TYPES: + block["text"] = correct_text(block["text"]) total_blocks += len(blocks) yield json.dumps({