def apply_confidence_markers(
    words: list[dict], *, threshold: "float | None" = None
) -> str:
    """Replace low-confidence words with [unleserlich], collapsing adjacent markers.

    Args:
        words: list of {"text": str, "confidence": float} dicts in reading
            order. A missing or ``None`` confidence means the engine gave no
            score; such a word is kept unchanged rather than raising.
        threshold: words scoring strictly below this value are replaced.
            Defaults to the module-level ``CONFIDENCE_THRESHOLD``; passing it
            explicitly avoids having to reload the module to try another value.

    Returns:
        The words joined by single spaces, with every run of consecutive
        low-confidence words replaced by one marker. Empty string when
        ``words`` is empty.
    """
    if not words:
        return ""

    if threshold is None:
        threshold = CONFIDENCE_THRESHOLD

    result: list[str] = []
    prev_was_marker = False

    for word in words:
        confidence = word.get("confidence")
        # Comparing None with a float raises TypeError, so an absent score is
        # handled explicitly and treated as "keep the text".
        if confidence is not None and confidence < threshold:
            # Only the first word of a low-confidence run emits a marker.
            if not prev_was_marker:
                result.append(ILLEGIBLE_MARKER)
                prev_was_marker = True
        else:
            result.append(word["text"])
            prev_was_marker = False

    return " ".join(result)
def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]:
    """Reconstruct word-level confidence from character-level data.

    Splits prediction on whitespace, pairs every character with its
    confidence, and computes the mean confidence per word. Separator
    characters and their confidences are discarded.

    Args:
        prediction: full line text from Kraken
        confidences: per-character confidence list (same length as prediction)

    Returns:
        list of {"text": str, "confidence": float} dicts, one per word.
        Empty list when the prediction is empty or whitespace-only. When
        ``confidences`` is ``None`` or its length differs from the prediction,
        the characters cannot be aligned with scores, so the whole line is
        returned as a single entry with confidence 1.0 (i.e. left untouched).
    """
    from itertools import groupby

    if not prediction or not prediction.strip():
        return []

    if confidences is None or len(confidences) != len(prediction):
        return [{"text": prediction, "confidence": 1.0}]

    result: list[dict] = []
    scored_chars = zip(prediction, confidences)

    # Group into alternating runs of separator / non-separator characters.
    # str.isspace() matches the whitespace set used by the strip() check
    # above, so tabs and non-breaking spaces also end a word.
    for is_separator, run in groupby(scored_chars, key=lambda pair: pair[0].isspace()):
        if is_separator:
            continue
        chars, confs = zip(*run)
        result.append({
            "text": "".join(chars),
            "confidence": sum(confs) / len(confs),
        })

    return result
""" from kraken import blla, rpred + from confidence import words_from_characters if _model is None: raise RuntimeError("Kraken model is not loaded") @@ -73,6 +74,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: # Approximate polygon to quadrilateral quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None + # Extract word-level confidence for [unleserlich] marking + char_confidences = getattr(record, "confidences", []) + words = words_from_characters(record.prediction, char_confidences) + all_blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, @@ -81,6 +86,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: "height": (y2 - y1) / page_h, "polygon": quad, "text": record.prediction, + "words": words, }) return all_blocks diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py index 77a895d9..94fc330b 100644 --- a/ocr-service/engines/surya.py +++ b/ocr-service/engines/surya.py @@ -51,6 +51,17 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: for p in line.polygon ] + # Extract word-level confidence for [unleserlich] marking + words = [] + if hasattr(line, "words") and line.words: + for word in line.words: + words.append({ + "text": word.text, + "confidence": word.confidence, + }) + else: + words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}] + all_blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, @@ -59,6 +70,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: "height": (y2 - y1) / page_h, "polygon": polygon, "text": line.text, + "words": words, }) return all_blocks diff --git a/ocr-service/main.py b/ocr-service/main.py index d4e3f957..f87985e6 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -9,6 +9,7 @@ import pypdfium2 as pdfium from fastapi import FastAPI, HTTPException from PIL import Image +from confidence import apply_confidence_markers from engines import kraken as 
kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest @@ -71,6 +72,11 @@ async def run_ocr(request: OcrRequest): # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = surya_engine.extract_blocks(images, request.language) + for block in blocks: + if block.get("words"): + block["text"] = apply_confidence_markers(block["words"]) + block.pop("words", None) + return [OcrBlock(**b) for b in blocks] diff --git a/ocr-service/test_confidence.py b/ocr-service/test_confidence.py new file mode 100644 index 00000000..e1359eb1 --- /dev/null +++ b/ocr-service/test_confidence.py @@ -0,0 +1,153 @@ +"""Tests for confidence-based [unleserlich] marker insertion.""" + +import os +import pytest +from confidence import apply_confidence_markers, words_from_characters + + +# ─── apply_confidence_markers ───────────────────────────────────────────────── + + +def test_all_words_above_threshold_passes_through(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "Lieber Freund" + + +def test_single_low_confidence_word_replaced(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkqz", "confidence": 0.1}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund" + + +def test_adjacent_low_confidence_words_collapsed(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkqz", "confidence": 0.1}, + {"text": "abc", "confidence": 0.05}, + {"text": "yyy", "confidence": 0.2}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund" + + +def test_mixed_high_low_each_group_gets_marker(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkqz", "confidence": 0.1}, + {"text": "wie", "confidence": 0.9}, + {"text": "abc", "confidence": 0.05}, + {"text": "dir", "confidence": 0.88}, + ] + 
def test_custom_threshold_via_env(monkeypatch):
    """OCR_CONFIDENCE_THRESHOLD is honoured when the module is (re)imported."""
    # The threshold is a module-level constant, so the module has to be
    # re-executed to pick up a changed environment variable.
    import importlib
    import confidence

    words = [
        {"text": "Lieber", "confidence": 0.95},
        {"text": "Freund", "confidence": 0.5},
    ]

    try:
        with monkeypatch.context() as patched_env:
            patched_env.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8")
            importlib.reload(confidence)
            assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]"
    finally:
        # reload() re-executes the module in its existing namespace, which the
        # functions imported at the top of this file share. Reload once more
        # after the environment has been restored, even when the assertion
        # fails, so the other tests do not run with the 0.8 threshold.
        importlib.reload(confidence)
words_from_characters("Hallo", [0.9, 0.8, 0.85, 0.7, 0.95]) + assert len(words) == 1 + assert words[0]["text"] == "Hallo" + assert abs(words[0]["confidence"] - 0.84) < 0.01 + + +def test_multi_word_with_spaces(): + prediction = "Sehr geehrter" + confidences = [0.9, 0.8, 0.7, 0.6, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + words = words_from_characters(prediction, confidences) + assert len(words) == 2 + assert words[0]["text"] == "Sehr" + assert words[1]["text"] == "geehrter" + + +def test_length_mismatch_falls_back_safely(): + words = words_from_characters("Hallo Welt", [0.9, 0.8]) + assert len(words) == 1 + assert words[0]["text"] == "Hallo Welt" + assert words[0]["confidence"] == 1.0 + + +def test_empty_prediction_returns_empty(): + assert words_from_characters("", []) == [] + + +def test_single_character_word(): + words = words_from_characters("A B", [0.9, 0.5, 0.3]) + assert len(words) == 2 + assert words[0]["text"] == "A" + assert words[0]["confidence"] == 0.9 + assert words[1]["text"] == "B" + assert words[1]["confidence"] == 0.3 + + +def test_whitespace_only_prediction(): + words = words_from_characters(" ", [0.5, 0.5, 0.5]) + assert words == []