Drop underscore prefix — the helper is part of confidence.py's effective public API since spell_check.py imports and calls it directly. Fixes reviewer concern: importing a _-prefixed name across module boundaries contradicts Python's private-by-convention signal. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
101 lines
3.0 KiB
Python
101 lines
3.0 KiB
Python
"""Confidence-based [unleserlich] marker insertion for OCR output."""
|
|
|
|
import os
|
|
|
|
# Confidence cut-offs, read once at import time. Each can be overridden via
# the named environment variable; the value must parse as a float.
THRESHOLD_DEFAULT = float(os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.3"))
THRESHOLD_KURRENT = float(os.getenv("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))

# Marker strings used in the reconstructed text.
ILLEGIBLE_MARKER = "[unleserlich]"
CORRECTION_MARKER = "[?]"
|
|
|
|
|
|
def collapse_adjacent_markers(tokens: list[str]) -> list[str]:
    """Return a copy of ``tokens`` with runs of ILLEGIBLE_MARKER reduced to one.

    Only directly consecutive markers are merged; markers separated by any
    other token are all kept. The input list is not modified.
    """
    output: list[str] = []
    for item in tokens:
        # A marker is redundant when the token emitted just before it is
        # already a marker.
        if item == ILLEGIBLE_MARKER and output and output[-1] == ILLEGIBLE_MARKER:
            continue
        output.append(item)
    return output
|
|
|
|
|
|
def get_threshold(script_type: str) -> float:
    """Return the confidence threshold that applies to ``script_type``.

    "HANDWRITING_KURRENT" (compared case-insensitively) selects
    THRESHOLD_KURRENT; every other value, including an empty or falsy one,
    selects THRESHOLD_DEFAULT.
    """
    is_kurrent = bool(script_type) and script_type.upper() == "HANDWRITING_KURRENT"
    return THRESHOLD_KURRENT if is_kurrent else THRESHOLD_DEFAULT
|
|
|
|
|
|
def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str:
    """Build the output text, masking words whose confidence is too low.

    Every word with a confidence strictly below the threshold is replaced by
    ILLEGIBLE_MARKER; consecutive markers are then merged into one and the
    tokens are joined with single spaces.

    Args:
        words: list of {"text": str, "confidence": float} dicts
        threshold: confidence cut-off; THRESHOLD_DEFAULT is used when None

    Returns:
        The reconstructed text, or "" when ``words`` is empty.
    """
    if not words:
        return ""

    cutoff = THRESHOLD_DEFAULT if threshold is None else threshold
    masked = [
        ILLEGIBLE_MARKER if entry["confidence"] < cutoff else entry["text"]
        for entry in words
    ]
    merged = collapse_adjacent_markers(masked)
    return " ".join(merged)
|
|
|
|
|
|
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits the prediction into words at whitespace and assigns each word the
    arithmetic mean of its characters' confidences. Any character for which
    ``str.isspace()`` is true acts as a separator (not only ``" "``), which
    matches the ``str.strip()`` check used to detect blank input. Separator
    characters and their confidences are discarded.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts in text order.
        Empty list for empty or whitespace-only input. If the lengths of
        ``prediction`` and ``confidences`` differ, characters cannot be
        paired with confidences, so the whole prediction is returned as a
        single entry with confidence 1.0.
    """
    from itertools import groupby

    if not prediction or not prediction.strip():
        return []

    if len(confidences) != len(prediction):
        return [{"text": prediction, "confidence": 1.0}]

    words: list[dict] = []
    pairs = zip(prediction, confidences)
    # groupby yields alternating runs of separator / non-separator characters;
    # each non-separator run is exactly one word.
    for is_separator, run in groupby(pairs, key=lambda pair: pair[0].isspace()):
        if is_separator:
            continue
        chars, confs = zip(*run)
        words.append({
            "text": "".join(chars),
            "confidence": sum(confs) / len(confs),
        })
    return words
|