feat(ocr): integrate spell-check post-processing for handwriting script types
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,7 @@ from fastapi.responses import StreamingResponse
|
||||
from PIL import Image
|
||||
|
||||
from confidence import apply_confidence_markers, get_threshold
|
||||
from spell_check import correct_text, load_spell_checker
|
||||
from engines import kraken as kraken_engine
|
||||
from engines import surya as surya_engine
|
||||
from models import OcrBlock, OcrRequest
|
||||
@@ -39,6 +40,8 @@ ALLOWED_PDF_HOSTS = set(
|
||||
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
|
||||
)
|
||||
|
||||
# Script types whose OCR text is post-processed with correct_text();
# blocks of any other script type are left as the engine produced them.
_SPELL_CHECK_SCRIPT_TYPES = {
    "HANDWRITING_KURRENT",
    "HANDWRITING_LATIN",
}
|
||||
|
||||
|
||||
def _validate_url(url: str) -> None:
|
||||
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
|
||||
@@ -55,6 +58,7 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
|
||||
kraken_engine.load_models()
|
||||
load_spell_checker()
|
||||
_models_ready = True
|
||||
logger.info("Startup complete — ready to accept requests")
|
||||
|
||||
@@ -109,6 +113,8 @@ async def run_ocr(request: OcrRequest):
|
||||
if block.get("words"):
|
||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||
block.pop("words", None)
|
||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||
block["text"] = correct_text(block["text"])
|
||||
|
||||
return [OcrBlock(**b) for b in blocks]
|
||||
|
||||
@@ -170,6 +176,8 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
engine.extract_region_text, image,
|
||||
region.x, region.y, region.width, region.height,
|
||||
)
|
||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||
text = correct_text(text)
|
||||
blocks.append({
|
||||
"pageNumber": page_idx,
|
||||
"x": region.x,
|
||||
@@ -231,6 +239,8 @@ async def run_ocr_stream(request: OcrRequest):
|
||||
if block.get("words"):
|
||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||
block.pop("words", None)
|
||||
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||
block["text"] = correct_text(block["text"])
|
||||
|
||||
total_blocks += len(blocks)
|
||||
yield json.dumps({
|
||||
|
||||
Reference in New Issue
Block a user