"""Confidence-based [unleserlich] marker insertion for OCR output."""

import os
from typing import Optional

# Words whose confidence is strictly below this value are replaced by the
# marker. Read once at import time; override with OCR_CONFIDENCE_THRESHOLD.
CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
ILLEGIBLE_MARKER = "[unleserlich]"


def apply_confidence_markers(
    words: list[dict], *, threshold: Optional[float] = None
) -> str:
    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.

    Args:
        words: list of {"text": str, "confidence": float} dicts
        threshold: optional per-call cutoff; words with a confidence strictly
            below it are replaced. Defaults to the module-level
            CONFIDENCE_THRESHOLD when omitted.

    Returns:
        Reconstructed text string with [unleserlich] substitutions, words
        joined by single spaces. Empty string for empty input.
    """
    if not words:
        return ""

    # Resolve the module constant at call time so later changes to it are
    # still honoured by callers that do not pass an explicit threshold.
    limit = CONFIDENCE_THRESHOLD if threshold is None else threshold

    result: list[str] = []
    prev_was_marker = False
    for word in words:
        if word["confidence"] < limit:
            # A run of consecutive illegible words yields a single marker.
            if not prev_was_marker:
                result.append(ILLEGIBLE_MARKER)
            prev_was_marker = True
        else:
            result.append(word["text"])
            prev_was_marker = False
    return " ".join(result)


def _make_word(chars: list[str], confs: list[float]) -> dict:
    """Build one word dict whose confidence is the mean of its characters'."""
    return {"text": "".join(chars), "confidence": sum(confs) / len(confs)}


def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits prediction on whitespace, maps characters to their confidences,
    computes mean confidence per word. Confidences of the whitespace
    characters themselves are ignored.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts. Empty list when
        prediction is empty or whitespace-only. If the two inputs differ in
        length, the characters cannot be aligned, so the whole prediction is
        returned as a single word with confidence 1.0.
    """
    if not prediction or not prediction.strip():
        return []
    if len(confidences) != len(prediction):
        return [{"text": prediction, "confidence": 1.0}]

    result: list[dict] = []
    current_word: list[str] = []
    current_confs: list[float] = []
    for char, conf in zip(prediction, confidences):
        # Any whitespace (space, tab, newline, ...) ends a word, matching the
        # documented contract and the strip()-based emptiness check above.
        if char.isspace():
            if current_word:
                result.append(_make_word(current_word, current_confs))
                current_word = []
                current_confs = []
        else:
            current_word.append(char)
            current_confs.append(conf)
    # Flush the trailing word when the line does not end in whitespace.
    if current_word:
        result.append(_make_word(current_word, current_confs))
    return result