# familienarchiv/ocr-service/confidence.py

"""Confidence-based [unleserlich] marker insertion for OCR output."""

import os

# Confidence cut-offs, read once at import time. A non-numeric value in
# either environment variable makes float() raise ValueError on import.
#
# THRESHOLD_DEFAULT applies to every script type except Kurrent handwriting;
# THRESHOLD_KURRENT is returned by get_threshold() for "HANDWRITING_KURRENT".
THRESHOLD_DEFAULT = float(os.getenv("OCR_CONFIDENCE_THRESHOLD", "0.3"))
THRESHOLD_KURRENT = float(os.getenv("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))

# Placeholder substituted for words whose confidence is below the threshold.
ILLEGIBLE_MARKER = "[unleserlich]"
# NOTE(review): not referenced by the functions visible in this module;
# presumably consumed elsewhere -- confirm before removing.
CORRECTION_MARKER = "[?]"


def collapse_adjacent_markers(tokens: list[str]) -> list[str]:
    """Squash every run of consecutive illegible markers into one marker.

    Args:
        tokens: word tokens, some of which may equal ILLEGIBLE_MARKER

    Returns:
        A new list in the same order; the input list is not modified.
    """
    squashed: list[str] = []
    for tok in tokens:
        # A marker directly following an already-emitted marker is redundant.
        if tok == ILLEGIBLE_MARKER and squashed and squashed[-1] == ILLEGIBLE_MARKER:
            continue
        squashed.append(tok)
    return squashed


def get_threshold(script_type: str) -> float:
    """Pick the confidence threshold that applies to a script type.

    Args:
        script_type: script identifier, compared case-insensitively; an
            empty or None value selects the default threshold

    Returns:
        THRESHOLD_KURRENT for "HANDWRITING_KURRENT", else THRESHOLD_DEFAULT.
    """
    # bool() guards the .upper() call against None / empty input.
    is_kurrent = bool(script_type) and script_type.upper() == "HANDWRITING_KURRENT"
    return THRESHOLD_KURRENT if is_kurrent else THRESHOLD_DEFAULT


def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str:
    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.

    Args:
        words: list of {"text": str, "confidence": float} dicts
        threshold: confidence threshold (uses THRESHOLD_DEFAULT if None)

    Returns:
        Reconstructed text string with [unleserlich] substitutions.
    """
    if not words:
        return ""

    if threshold is None:
        threshold = THRESHOLD_DEFAULT

    tokens: list[str] = []
    for word in words:
        if word["confidence"] < threshold:
            tokens.append(ILLEGIBLE_MARKER)
        else:
            tokens.append(word["text"])

    return " ".join(collapse_adjacent_markers(tokens))


def _word_entry(chars: list[str], confs: list[float]) -> dict:
    """Build one word dict from its characters and their confidences."""
    return {
        "text": "".join(chars),
        "confidence": sum(confs) / len(confs),
    }


def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits prediction on whitespace, maps characters to their confidences,
    and computes the mean confidence per word. Whitespace characters and
    their confidences are discarded; runs of whitespace never produce
    empty words.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts. An empty or
        whitespace-only prediction yields []. If the two inputs differ in
        length, the characters cannot be aligned, so the whole prediction
        is returned as a single word with confidence 1.0.
    """
    if not prediction or not prediction.strip():
        return []

    if len(confidences) != len(prediction):
        # Best-effort fallback: keep the text rather than mis-assign scores.
        return [{"text": prediction, "confidence": 1.0}]

    result: list[dict] = []
    word_chars: list[str] = []
    word_confs: list[float] = []

    for char, conf in zip(prediction, confidences):
        # Split on any whitespace (tab, NBSP, ...), not just U+0020, so the
        # behavior matches the docstring and the strip() guard above.
        if char.isspace():
            if word_chars:
                result.append(_word_entry(word_chars, word_confs))
                word_chars = []
                word_confs = []
        else:
            word_chars.append(char)
            word_confs.append(conf)

    # Flush the final word when the line does not end in whitespace.
    if word_chars:
        result.append(_word_entry(word_chars, word_confs))

    return result