feat(ocr): integrate spell-check post-processing for handwriting script types
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,7 @@ from fastapi.responses import StreamingResponse
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from confidence import apply_confidence_markers, get_threshold
|
from confidence import apply_confidence_markers, get_threshold
|
||||||
|
from spell_check import correct_text, load_spell_checker
|
||||||
from engines import kraken as kraken_engine
|
from engines import kraken as kraken_engine
|
||||||
from engines import surya as surya_engine
|
from engines import surya as surya_engine
|
||||||
from models import OcrBlock, OcrRequest
|
from models import OcrBlock, OcrRequest
|
||||||
@@ -39,6 +40,8 @@ ALLOWED_PDF_HOSTS = set(
|
|||||||
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
|
h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Script types whose recognised text is passed through ``correct_text`` after
# OCR. Only used for membership tests, so it is a frozenset: the module-level
# constant cannot be mutated by accident from request-handling code.
_SPELL_CHECK_SCRIPT_TYPES = frozenset({"HANDWRITING_KURRENT", "HANDWRITING_LATIN"})
|
||||||
|
|
||||||
|
|
||||||
def _validate_url(url: str) -> None:
|
def _validate_url(url: str) -> None:
|
||||||
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
|
"""Validate that the PDF URL points to an allowed host (SSRF protection)."""
|
||||||
@@ -55,6 +58,7 @@ async def lifespan(app: FastAPI):
|
|||||||
|
|
||||||
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
|
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
|
||||||
kraken_engine.load_models()
|
kraken_engine.load_models()
|
||||||
|
load_spell_checker()
|
||||||
_models_ready = True
|
_models_ready = True
|
||||||
logger.info("Startup complete — ready to accept requests")
|
logger.info("Startup complete — ready to accept requests")
|
||||||
|
|
||||||
@@ -109,6 +113,8 @@ async def run_ocr(request: OcrRequest):
|
|||||||
if block.get("words"):
|
if block.get("words"):
|
||||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||||
block.pop("words", None)
|
block.pop("words", None)
|
||||||
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
|
block["text"] = correct_text(block["text"])
|
||||||
|
|
||||||
return [OcrBlock(**b) for b in blocks]
|
return [OcrBlock(**b) for b in blocks]
|
||||||
|
|
||||||
@@ -170,6 +176,8 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
engine.extract_region_text, image,
|
engine.extract_region_text, image,
|
||||||
region.x, region.y, region.width, region.height,
|
region.x, region.y, region.width, region.height,
|
||||||
)
|
)
|
||||||
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
|
text = correct_text(text)
|
||||||
blocks.append({
|
blocks.append({
|
||||||
"pageNumber": page_idx,
|
"pageNumber": page_idx,
|
||||||
"x": region.x,
|
"x": region.x,
|
||||||
@@ -231,6 +239,8 @@ async def run_ocr_stream(request: OcrRequest):
|
|||||||
if block.get("words"):
|
if block.get("words"):
|
||||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||||
block.pop("words", None)
|
block.pop("words", None)
|
||||||
|
if script_type in _SPELL_CHECK_SCRIPT_TYPES:
|
||||||
|
block["text"] = correct_text(block["text"])
|
||||||
|
|
||||||
total_blocks += len(blocks)
|
total_blocks += len(blocks)
|
||||||
yield json.dumps({
|
yield json.dumps({
|
||||||
|
|||||||
Reference in New Issue
Block a user