feat(ocr): per-script-type confidence thresholds
Some checks failed
CI / Unit & Component Tests (push) Failing after 2s
CI / Backend Unit Tests (push) Failing after 1s
CI / Unit & Component Tests (pull_request) Failing after 1s
CI / Backend Unit Tests (pull_request) Failing after 1s

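Kurrent OCR (Kraken) produces much lower per-word confidence than
typewriter/Latin OCR (Surya). Words whose confidence falls below the
threshold are replaced with [unleserlich], so separate thresholds allow
aggressive filtering for Kurrent (0.5) while keeping typewriter lenient (0.3).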

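- OCR_CONFIDENCE_THRESHOLD: default threshold for the Surya paths (0.3 if unset)
- OCR_CONFIDENCE_THRESHOLD_KURRENT: threshold for the Kraken Kurrent path (0.5 if unset)
- apply_confidence_markers() now accepts an optional threshold parameter (falls back to the default threshold when None)
- get_threshold(script_type) selects the right threshold for the given script type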

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 20:50:59 +02:00
parent dd078d50da
commit f064b27439
4 changed files with 37 additions and 17 deletions

View File

@@ -85,6 +85,7 @@ services:
environment: environment:
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
OCR_CONFIDENCE_THRESHOLD: "0.3" OCR_CONFIDENCE_THRESHOLD: "0.3"
OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5"
networks: networks:
- archive-net - archive-net
healthcheck: healthcheck:

View File

@@ -2,16 +2,24 @@
import os import os
CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3")) THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3"))
THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5"))
ILLEGIBLE_MARKER = "[unleserlich]" ILLEGIBLE_MARKER = "[unleserlich]"
def apply_confidence_markers(words: list[dict]) -> str: def get_threshold(script_type: str) -> float:
if script_type and script_type.upper() == "HANDWRITING_KURRENT":
return THRESHOLD_KURRENT
return THRESHOLD_DEFAULT
def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str:
"""Replace low-confidence words with [unleserlich], collapsing adjacent markers. """Replace low-confidence words with [unleserlich], collapsing adjacent markers.
Args: Args:
words: list of {"text": str, "confidence": float} dicts words: list of {"text": str, "confidence": float} dicts
threshold: confidence threshold (uses THRESHOLD_DEFAULT if None)
Returns: Returns:
Reconstructed text string with [unleserlich] substitutions. Reconstructed text string with [unleserlich] substitutions.
@@ -19,11 +27,14 @@ def apply_confidence_markers(words: list[dict]) -> str:
if not words: if not words:
return "" return ""
if threshold is None:
threshold = THRESHOLD_DEFAULT
result: list[str] = [] result: list[str] = []
prev_was_marker = False prev_was_marker = False
for word in words: for word in words:
if word["confidence"] < CONFIDENCE_THRESHOLD: if word["confidence"] < threshold:
if not prev_was_marker: if not prev_was_marker:
result.append(ILLEGIBLE_MARKER) result.append(ILLEGIBLE_MARKER)
prev_was_marker = True prev_was_marker = True

View File

@@ -9,7 +9,7 @@ import pypdfium2 as pdfium
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from PIL import Image from PIL import Image
from confidence import apply_confidence_markers from confidence import apply_confidence_markers, get_threshold
from engines import kraken as kraken_engine from engines import kraken as kraken_engine
from engines import surya as surya_engine from engines import surya as surya_engine
from models import OcrBlock, OcrRequest from models import OcrBlock, OcrRequest
@@ -72,9 +72,10 @@ async def run_ocr(request: OcrRequest):
# TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
blocks = surya_engine.extract_blocks(images, request.language) blocks = surya_engine.extract_blocks(images, request.language)
threshold = get_threshold(script_type)
for block in blocks: for block in blocks:
if block.get("words"): if block.get("words"):
block["text"] = apply_confidence_markers(block["words"]) block["text"] = apply_confidence_markers(block["words"], threshold)
block.pop("words", None) block.pop("words", None)
return [OcrBlock(**b) for b in blocks] return [OcrBlock(**b) for b in blocks]

View File

@@ -2,7 +2,7 @@
import os import os
import pytest import pytest
from confidence import apply_confidence_markers, words_from_characters from confidence import apply_confidence_markers, words_from_characters, get_threshold
# ─── apply_confidence_markers ───────────────────────────────────────────────── # ─── apply_confidence_markers ─────────────────────────────────────────────────
@@ -75,22 +75,29 @@ def test_just_below_threshold_replaced():
assert apply_confidence_markers(words) == "[unleserlich]" assert apply_confidence_markers(words) == "[unleserlich]"
def test_custom_threshold_via_env(monkeypatch): def test_custom_threshold_via_parameter():
monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8")
# Need to reload the module to pick up the new env var
import importlib
import confidence
importlib.reload(confidence)
words = [ words = [
{"text": "Lieber", "confidence": 0.95}, {"text": "Lieber", "confidence": 0.95},
{"text": "Freund", "confidence": 0.5}, {"text": "Freund", "confidence": 0.5},
] ]
assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]" assert apply_confidence_markers(words, threshold=0.8) == "Lieber [unleserlich]"
assert apply_confidence_markers(words, threshold=0.3) == "Lieber Freund"
# Reset
monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.3") def test_kurrent_threshold_is_higher_than_default():
importlib.reload(confidence) default = get_threshold("TYPEWRITER")
kurrent = get_threshold("HANDWRITING_KURRENT")
assert kurrent > default
def test_get_threshold_kurrent():
    """Kurrent handwriting resolves to the dedicated Kurrent threshold.

    The expected 0.5 is the fallback used when
    OCR_CONFIDENCE_THRESHOLD_KURRENT is not set in the environment.
    """
    kurrent_threshold = get_threshold("HANDWRITING_KURRENT")
    assert kurrent_threshold == 0.5
def test_get_threshold_default():
    """Every non-Kurrent script type resolves to the default threshold.

    The expected 0.3 is the fallback used when OCR_CONFIDENCE_THRESHOLD
    is not set in the environment.
    """
    for script_type in ("TYPEWRITER", "HANDWRITING_LATIN", "UNKNOWN"):
        assert get_threshold(script_type) == 0.3
def test_low_confidence_at_start(): def test_low_confidence_at_start():