feat(ocr): auto-insert [unleserlich] markers for low-confidence words
New confidence.py module with two functions:

- apply_confidence_markers(): replaces words below threshold with
  [unleserlich], collapses adjacent markers into one
- words_from_characters(): reconstructs word-level confidence from
  Kraken's character-level data

Surya 0.17 provides native word-level confidence via line.words.
Kraken 7.0 provides per-character confidences via record.confidences.
Both engines now pass word+confidence data through main.py, which
applies the marker post-processing before returning the API response.

Threshold configurable via OCR_CONFIDENCE_THRESHOLD env var (default 0.3).
Frontend already renders [unleserlich] markers via transcriptionMarkers.ts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
79
ocr-service/confidence.py
Normal file
79
ocr-service/confidence.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Confidence-based [unleserlich] marker insertion for OCR output."""

import os

# Confidence cutoff below which a word is treated as illegible.
# Overridable via the OCR_CONFIDENCE_THRESHOLD env var (default 0.3);
# read once at import time, so env changes after import have no effect.
CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))

# Marker substituted for low-confidence words ("unleserlich" = German for
# "illegible"). Per the commit notes, the frontend renders this token
# specially via transcriptionMarkers.ts — keep the exact spelling.
ILLEGIBLE_MARKER = "[unleserlich]"
|
||||
|
||||
|
||||
def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str:
    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.

    Args:
        words: list of {"text": str, "confidence": float} dicts
        threshold: confidence cutoff below which a word is replaced.
            Defaults to the module-level CONFIDENCE_THRESHOLD (configured
            via the OCR_CONFIDENCE_THRESHOLD env var, 0.3 if unset).

    Returns:
        Reconstructed text string with [unleserlich] substitutions.
    """
    # Resolve the default lazily so callers can override per call without
    # touching the environment, while existing callers keep old behavior.
    if threshold is None:
        threshold = CONFIDENCE_THRESHOLD

    if not words:
        return ""

    result: list[str] = []
    prev_was_marker = False

    for word in words:
        if word["confidence"] < threshold:
            # Collapse a run of consecutive low-confidence words into a
            # single marker so the output stays readable.
            if not prev_was_marker:
                result.append(ILLEGIBLE_MARKER)
            prev_was_marker = True
        else:
            result.append(word["text"])
            prev_was_marker = False

    return " ".join(result)
|
||||
|
||||
|
||||
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits prediction on whitespace, maps characters to their confidences,
    computes mean confidence per word.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts
    """
    if not prediction or not prediction.strip():
        return []

    # Defensive fallback: if Kraken's confidence list does not line up with
    # the text we cannot attribute confidences to characters — return the
    # whole line as a single fully-confident "word" rather than guessing.
    if len(confidences) != len(prediction):
        return [{"text": prediction, "confidence": 1.0}]

    result: list[dict] = []
    current_word: list[str] = []
    current_confs: list[float] = []

    def _flush() -> None:
        # Emit the accumulated word (if any) with its mean confidence.
        if current_word:
            result.append({
                "text": "".join(current_word),
                "confidence": sum(current_confs) / len(current_confs),
            })
            current_word.clear()
            current_confs.clear()

    for char, conf in zip(prediction, confidences):
        # Split on ANY whitespace (space, tab, NBSP, ...), matching the
        # docstring; the previous implementation only handled plain " ",
        # so tab-joined words were silently merged into one.
        if char.isspace():
            _flush()
        else:
            current_word.append(char)
            current_confs.append(conf)

    _flush()  # trailing word (no closing whitespace)
    return result
|
||||
Reference in New Issue
Block a user