feat(ocr): per-script-type confidence thresholds
Kurrent OCR produces much lower confidence than typewriter/Latin. Separate thresholds allow aggressive filtering for Kurrent (0.5) while keeping typewriter lenient (0.3). - OCR_CONFIDENCE_THRESHOLD: default for Surya paths (0.3) - OCR_CONFIDENCE_THRESHOLD_KURRENT: Kraken Kurrent path (0.5) - apply_confidence_markers() now accepts threshold parameter - get_threshold(script_type) selects the right threshold Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,7 +9,7 @@ import pypdfium2 as pdfium
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from PIL import Image
|
||||
|
||||
from confidence import apply_confidence_markers
|
||||
from confidence import apply_confidence_markers, get_threshold
|
||||
from engines import kraken as kraken_engine
|
||||
from engines import surya as surya_engine
|
||||
from models import OcrBlock, OcrRequest
|
||||
@@ -72,9 +72,10 @@ async def run_ocr(request: OcrRequest):
|
||||
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
|
||||
blocks = surya_engine.extract_blocks(images, request.language)
|
||||
|
||||
threshold = get_threshold(script_type)
|
||||
for block in blocks:
|
||||
if block.get("words"):
|
||||
block["text"] = apply_confidence_markers(block["words"])
|
||||
block["text"] = apply_confidence_markers(block["words"], threshold)
|
||||
block.pop("words", None)
|
||||
|
||||
return [OcrBlock(**b) for b in blocks]
|
||||
|
||||
Reference in New Issue
Block a user