"""Confidence-based [unleserlich] marker insertion for OCR output.""" import os THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3")) THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5")) ILLEGIBLE_MARKER = "[unleserlich]" def get_threshold(script_type: str) -> float: if script_type and script_type.upper() == "HANDWRITING_KURRENT": return THRESHOLD_KURRENT return THRESHOLD_DEFAULT def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str: """Replace low-confidence words with [unleserlich], collapsing adjacent markers. Args: words: list of {"text": str, "confidence": float} dicts threshold: confidence threshold (uses THRESHOLD_DEFAULT if None) Returns: Reconstructed text string with [unleserlich] substitutions. """ if not words: return "" if threshold is None: threshold = THRESHOLD_DEFAULT result: list[str] = [] prev_was_marker = False for word in words: if word["confidence"] < threshold: if not prev_was_marker: result.append(ILLEGIBLE_MARKER) prev_was_marker = True else: result.append(word["text"]) prev_was_marker = False return " ".join(result) def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]: """Reconstruct word-level confidence from character-level data. Splits prediction on whitespace, maps characters to their confidences, computes mean confidence per word. Args: prediction: full line text from Kraken confidences: per-character confidence list (same length as prediction) Returns: list of {"text": str, "confidence": float} dicts """ if not prediction or not prediction.strip(): return [] if len(confidences) != len(prediction): return [{"text": prediction, "confidence": 1.0}] result: list[dict] = [] current_word: list[str] = [] current_confs: list[float] = [] for char, conf in zip(prediction, confidences): if char == " ": if current_word: result.append({ "text": "".join(current_word), "confidence": sum(current_confs) / len(current_confs), }) current_word = [] current_confs = [] else: current_word.append(char) current_confs.append(conf) if current_word: result.append({ "text": "".join(current_word), "confidence": sum(current_confs) / len(current_confs), }) return result