feat(ocr): per-script-type confidence thresholds
Kurrent OCR produces much lower confidence scores than typewriter/Latin OCR. Separate thresholds allow aggressive filtering for Kurrent (0.5) while keeping typewriter filtering lenient (0.3).

- OCR_CONFIDENCE_THRESHOLD: default threshold for the Surya paths (0.3)
- OCR_CONFIDENCE_THRESHOLD_KURRENT: threshold for the Kraken Kurrent path (0.5)
- apply_confidence_markers() now accepts an optional threshold parameter
- get_threshold(script_type) selects the right threshold for a script type

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,16 +2,24 @@
|
||||
|
||||
import os
|
||||
|
||||
CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
|
||||
THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
|
||||
THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))
|
||||
|
||||
ILLEGIBLE_MARKER = "[unleserlich]"
|
||||
|
||||
|
||||
def apply_confidence_markers(words: list[dict]) -> str:
|
||||
def get_threshold(script_type: str) -> float:
    """Pick the confidence threshold that applies to *script_type*.

    Args:
        script_type: script-type label; compared case-insensitively
            against "HANDWRITING_KURRENT".

    Returns:
        THRESHOLD_KURRENT when the label names Kurrent handwriting,
        otherwise THRESHOLD_DEFAULT. Falsy labels (e.g. None or an empty
        string) also yield THRESHOLD_DEFAULT.
    """
    # Guard first so a falsy label never reaches .upper().
    if not script_type:
        return THRESHOLD_DEFAULT

    is_kurrent = script_type.upper() == "HANDWRITING_KURRENT"
    return THRESHOLD_KURRENT if is_kurrent else THRESHOLD_DEFAULT
|
||||
|
||||
|
||||
def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str:
|
||||
"""Replace low-confidence words with [unleserlich], collapsing adjacent markers.
|
||||
|
||||
Args:
|
||||
words: list of {"text": str, "confidence": float} dicts
|
||||
threshold: confidence threshold (uses THRESHOLD_DEFAULT if None)
|
||||
|
||||
Returns:
|
||||
Reconstructed text string with [unleserlich] substitutions.
|
||||
@@ -19,11 +27,14 @@ def apply_confidence_markers(words: list[dict]) -> str:
|
||||
if not words:
|
||||
return ""
|
||||
|
||||
if threshold is None:
|
||||
threshold = THRESHOLD_DEFAULT
|
||||
|
||||
result: list[str] = []
|
||||
prev_was_marker = False
|
||||
|
||||
for word in words:
|
||||
if word["confidence"] < CONFIDENCE_THRESHOLD:
|
||||
if word["confidence"] < threshold:
|
||||
if not prev_was_marker:
|
||||
result.append(ILLEGIBLE_MARKER)
|
||||
prev_was_marker = True
|
||||
|
||||
Reference in New Issue
Block a user