feat(ocr): auto-insert [unleserlich] markers for low-confidence words

New confidence.py module with two functions:

- apply_confidence_markers(): replaces words below threshold with
  [unleserlich], collapses adjacent markers into one
- words_from_characters(): reconstructs word-level confidence from
  Kraken's character-level data

Surya 0.17 provides native word-level confidence via line.words.
Kraken 7.0 provides per-character confidences via record.confidences.
Both engines now pass word+confidence data through main.py, which
applies the marker post-processing before returning the API response.

Threshold configurable via OCR_CONFIDENCE_THRESHOLD env var (default 0.3).
Frontend already renders [unleserlich] markers via transcriptionMarkers.ts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 19:16:17 +02:00
parent 49975154d9
commit c74539b04b
6 changed files with 257 additions and 0 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -84,6 +84,7 @@ services:
      - ocr_models:/app/models
    environment:
      KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
+      OCR_CONFIDENCE_THRESHOLD: "0.3"
    networks:
      - archive-net
    healthcheck:
--- a/ocr-service/confidence.py
+++ b/ocr-service/confidence.py
@@ -0,0 +1,79 @@
+"""Confidence-based [unleserlich] marker insertion for OCR output."""
+
+import os
+
+# Words whose confidence is strictly below this value are replaced by the marker.
+# Read once at import time from OCR_CONFIDENCE_THRESHOLD (falls back to "0.3").
+CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
+
+# Placeholder text emitted in place of a run of low-confidence words.
+ILLEGIBLE_MARKER = "[unleserlich]"
+
+
+def apply_confidence_markers(words: list[dict]) -> str:
+    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.
+
+    A word is replaced when its confidence is strictly below
+    CONFIDENCE_THRESHOLD. A run of consecutive low-confidence words yields a
+    single marker. Words whose confidence is missing or None are kept, in line
+    with the 1.0 fallback used elsewhere when no confidence is available.
+
+    Args:
+        words: list of {"text": str, "confidence": float | None} dicts
+
+    Returns:
+        Space-joined text with [unleserlich] substitutions; "" for no words.
+    """
+    if not words:
+        return ""
+
+    result: list[str] = []
+    prev_was_marker = False
+
+    for word in words:
+        confidence = word.get("confidence")
+        # None cannot be ordered against a float; treat "unknown" as trusted
+        # instead of failing the whole request with a TypeError.
+        if confidence is not None and confidence < CONFIDENCE_THRESHOLD:
+            if not prev_was_marker:
+                result.append(ILLEGIBLE_MARKER)
+            prev_was_marker = True
+        else:
+            result.append(word["text"])
+            prev_was_marker = False
+
+    return " ".join(result)
+
+
+def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
+    """Reconstruct word-level confidence from character-level data.
+
+    Splits prediction on any whitespace character (space, tab, ...), pairs
+    every remaining character with its confidence, and reports the mean
+    confidence per word. Confidences of whitespace characters are ignored.
+
+    Args:
+        prediction: full line text from Kraken
+        confidences: per-character confidence list (same length as prediction)
+
+    Returns:
+        list of {"text": str, "confidence": float} dicts. Empty for an empty
+        or whitespace-only prediction. If the lengths differ, the whole
+        prediction is returned as one word with confidence 1.0 so that text
+        is never marked on the basis of misaligned data.
+    """
+    if not prediction or not prediction.strip():
+        return []
+
+    if len(confidences) != len(prediction):
+        return [{"text": prediction, "confidence": 1.0}]
+
+    result: list[dict] = []
+    current_word: list[str] = []
+    current_confs: list[float] = []
+
+    # A trailing sentinel space flushes the last word through the same branch
+    # as every other word, so the flush logic exists only once.
+    for char, conf in zip(prediction + " ", [*confidences, 0.0]):
+        if not char.isspace():
+            current_word.append(char)
+            current_confs.append(conf)
+        elif current_word:
+            result.append({
+                "text": "".join(current_word),
+                "confidence": sum(current_confs) / len(current_confs),
+            })
+            current_word, current_confs = [], []
+
+    return result
--- a/ocr-service/engines/kraken.py
+++ b/ocr-service/engines/kraken.py
@@ -37,6 +37,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
    Coordinates are normalized to [0, 1].
    """
    from kraken import blla, rpred
+    from confidence import words_from_characters

    if _model is None:
        raise RuntimeError("Kraken model is not loaded")
@@ -73,6 +74,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
            # Approximate polygon to quadrilateral
            quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None

+            # Extract word-level confidence for [unleserlich] marking
+            char_confidences = getattr(record, "confidences", [])
+            words = words_from_characters(record.prediction, char_confidences)
+
            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
@@ -81,6 +86,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
                "height": (y2 - y1) / page_h,
                "polygon": quad,
                "text": record.prediction,
+                "words": words,
            })

    return all_blocks
--- a/ocr-service/engines/surya.py
+++ b/ocr-service/engines/surya.py
@@ -51,6 +51,17 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
                    for p in line.polygon
                ]

+            # Extract word-level confidence for [unleserlich] marking
+            words = []
+            if hasattr(line, "words") and line.words:
+                for word in line.words:
+                    words.append({
+                        "text": word.text,
+                        "confidence": word.confidence,
+                    })
+            else:
+                words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}]
+
            all_blocks.append({
                "pageNumber": page_idx,
                "x": x1 / page_w,
@@ -59,6 +70,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]:
                "height": (y2 - y1) / page_h,
                "polygon": polygon,
                "text": line.text,
+                "words": words,
            })

    return all_blocks
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -9,6 +9,7 @@ import pypdfium2 as pdfium
 from fastapi import FastAPI, HTTPException
 from PIL import Image

+from confidence import apply_confidence_markers
 from engines import kraken as kraken_engine
 from engines import surya as surya_engine
 from models import OcrBlock, OcrRequest
@@ -71,6 +72,11 @@ async def run_ocr(request: OcrRequest):
        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
        blocks = surya_engine.extract_blocks(images, request.language)

+    for block in blocks:
+        if block.get("words"):
+            block["text"] = apply_confidence_markers(block["words"])
+        block.pop("words", None)
+
    return [OcrBlock(**b) for b in blocks]


--- a/ocr-service/test_confidence.py
+++ b/ocr-service/test_confidence.py
@@ -0,0 +1,153 @@
+"""Tests for confidence-based [unleserlich] marker insertion."""
+
+import os
+import pytest
+from confidence import apply_confidence_markers, words_from_characters
+
+
+# ─── apply_confidence_markers ─────────────────────────────────────────────────
+
+
+def test_all_words_above_threshold_passes_through():
+    """Words that all clear the threshold are joined unchanged."""
+    pairs = [("Lieber", 0.95), ("Freund", 0.88)]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    assert apply_confidence_markers(words) == "Lieber Freund"
+
+
+def test_single_low_confidence_word_replaced():
+    """One low-confidence word between confident ones becomes a marker."""
+    pairs = [("Lieber", 0.95), ("xkqz", 0.1), ("Freund", 0.88)]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund"
+
+
+def test_adjacent_low_confidence_words_collapsed():
+    """A run of consecutive low-confidence words yields exactly one marker."""
+    pairs = [
+        ("Lieber", 0.95),
+        ("xkqz", 0.1),
+        ("abc", 0.05),
+        ("yyy", 0.2),
+        ("Freund", 0.88),
+    ]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund"
+
+
+def test_mixed_high_low_each_group_gets_marker():
+    """Low-confidence runs separated by a confident word get one marker each."""
+    pairs = [
+        ("Lieber", 0.95),
+        ("xkqz", 0.1),
+        ("wie", 0.9),
+        ("abc", 0.05),
+        ("dir", 0.88),
+    ]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    expected = "Lieber [unleserlich] wie [unleserlich] dir"
+    assert apply_confidence_markers(words) == expected
+
+
+def test_all_below_threshold_returns_single_marker():
+    """When every word is low-confidence the output is a single marker."""
+    pairs = [("xkq", 0.1), ("zzz", 0.05)]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    assert apply_confidence_markers(words) == "[unleserlich]"
+
+
+def test_empty_list_returns_empty_string():
+    """No words in means an empty string out."""
+    rendered = apply_confidence_markers([])
+    assert rendered == ""
+
+
+def test_single_word_above_threshold():
+    """A lone confident word is returned as-is."""
+    lone_word = {"text": "Hallo", "confidence": 0.9}
+    assert apply_confidence_markers([lone_word]) == "Hallo"
+
+
+def test_exact_threshold_passes_through():
+    """A confidence equal to the threshold is kept (the comparison is strict <).
+
+    Assumes the default threshold of 0.3.
+    """
+    at_threshold = {"text": "Wort", "confidence": 0.3}
+    assert apply_confidence_markers([at_threshold]) == "Wort"
+
+
+def test_just_below_threshold_replaced():
+    """A confidence slightly under the threshold is replaced by the marker.
+
+    Assumes the default threshold of 0.3.
+    """
+    below_threshold = {"text": "Wort", "confidence": 0.29}
+    assert apply_confidence_markers([below_threshold]) == "[unleserlich]"
+
+
+def test_custom_threshold_via_env(monkeypatch):
+    """The threshold is re-read from OCR_CONFIDENCE_THRESHOLD on module reload."""
+    import importlib
+    import confidence
+
+    words = [
+        {"text": "Lieber", "confidence": 0.95},
+        {"text": "Freund", "confidence": 0.5},
+    ]
+
+    try:
+        # The scoped patch restores the environment on exit, even on failure.
+        with monkeypatch.context() as scoped:
+            scoped.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8")
+            # The threshold is read at import time, so reload to pick it up.
+            importlib.reload(confidence)
+            assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]"
+    finally:
+        # Always reload against the restored environment so a failing
+        # assertion cannot leak the 0.8 threshold into later tests.
+        importlib.reload(confidence)
+
+
+def test_low_confidence_at_start():
+    """A low-confidence first word is replaced without a leading space."""
+    pairs = [("xkq", 0.1), ("Freund", 0.88)]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    assert apply_confidence_markers(words) == "[unleserlich] Freund"
+
+
+def test_low_confidence_at_end():
+    """A low-confidence last word is replaced without a trailing space."""
+    pairs = [("Lieber", 0.95), ("xkq", 0.1)]
+    words = [{"text": text, "confidence": conf} for text, conf in pairs]
+    assert apply_confidence_markers(words) == "Lieber [unleserlich]"
+
+
+# ─── words_from_characters ────────────────────────────────────────────────────
+
+
+def test_single_word_matching_confidences():
+    """A space-free prediction becomes one word carrying the mean confidence."""
+    per_char = [0.9, 0.8, 0.85, 0.7, 0.95]
+    words = words_from_characters("Hallo", per_char)
+    assert len(words) == 1
+    only = words[0]
+    assert only["text"] == "Hallo"
+    assert abs(only["confidence"] - 0.84) < 0.01
+
+
+def test_multi_word_with_spaces():
+    """A single space splits the prediction into two words."""
+    words = words_from_characters(
+        "Sehr geehrter",
+        [0.9, 0.8, 0.7, 0.6, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2],
+    )
+    assert len(words) == 2
+    first, second = words
+    assert first["text"] == "Sehr"
+    assert second["text"] == "geehrter"
+
+
+def test_length_mismatch_falls_back_safely():
+    """Mismatched lengths yield the whole line as one fully trusted word."""
+    too_few_confidences = [0.9, 0.8]
+    words = words_from_characters("Hallo Welt", too_few_confidences)
+    assert len(words) == 1
+    fallback = words[0]
+    assert fallback["text"] == "Hallo Welt"
+    assert fallback["confidence"] == 1.0
+
+
+def test_empty_prediction_returns_empty():
+    """An empty prediction produces no words."""
+    words = words_from_characters("", [])
+    assert words == []
+
+
+def test_single_character_word():
+    """One-character words keep their own confidence; the space's value is unused."""
+    words = words_from_characters("A B", [0.9, 0.5, 0.3])
+    assert len(words) == 2
+    first, second = words
+    assert first["text"] == "A"
+    assert first["confidence"] == 0.9
+    assert second["text"] == "B"
+    assert second["confidence"] == 0.3
+
+
+def test_whitespace_only_prediction():
+    """A prediction made only of spaces produces no words."""
+    only_spaces = "   "
+    assert words_from_characters(only_spaces, [0.5, 0.5, 0.5]) == []