feat(ocr): per-script-type confidence thresholds
Some checks failed
CI / Unit & Component Tests (push) Failing after 2s
CI / Backend Unit Tests (push) Failing after 1s
CI / Unit & Component Tests (pull_request) Failing after 1s
CI / Backend Unit Tests (pull_request) Failing after 1s

Kurrent OCR produces much lower confidence scores than OCR of typewriter or Latin-script text.
Separate thresholds allow aggressive filtering for Kurrent (0.5)
while keeping the typewriter threshold lenient (0.3).

- OCR_CONFIDENCE_THRESHOLD: default for Surya paths (0.3)
- OCR_CONFIDENCE_THRESHOLD_KURRENT: Kraken Kurrent path (0.5)
- apply_confidence_markers() now accepts threshold parameter
- get_threshold(script_type) selects the right threshold

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 20:50:59 +02:00
parent dd078d50da
commit f064b27439
4 changed files with 37 additions and 17 deletions

View File

@@ -2,7 +2,7 @@
import os
import pytest
from confidence import apply_confidence_markers, words_from_characters
from confidence import apply_confidence_markers, words_from_characters, get_threshold
# ─── apply_confidence_markers ─────────────────────────────────────────────────
@@ -75,22 +75,29 @@ def test_just_below_threshold_replaced():
assert apply_confidence_markers(words) == "[unleserlich]"
def test_custom_threshold_via_env(monkeypatch):
monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8")
# Need to reload the module to pick up the new env var
import importlib
import confidence
importlib.reload(confidence)
def test_custom_threshold_via_parameter():
words = [
{"text": "Lieber", "confidence": 0.95},
{"text": "Freund", "confidence": 0.5},
]
assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]"
assert apply_confidence_markers(words, threshold=0.8) == "Lieber [unleserlich]"
assert apply_confidence_markers(words, threshold=0.3) == "Lieber Freund"
# Reset
monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.3")
importlib.reload(confidence)
def test_kurrent_threshold_is_higher_than_default():
    """The HANDWRITING_KURRENT threshold must be strictly above the TYPEWRITER one."""
    typewriter_threshold = get_threshold("TYPEWRITER")
    kurrent_threshold = get_threshold("HANDWRITING_KURRENT")
    # Only the relative ordering is checked here, not the concrete values.
    assert kurrent_threshold > typewriter_threshold
def test_get_threshold_kurrent():
    """get_threshold returns 0.5 for the HANDWRITING_KURRENT script type."""
    kurrent_threshold = get_threshold("HANDWRITING_KURRENT")
    assert kurrent_threshold == 0.5
def test_get_threshold_default():
    """TYPEWRITER, HANDWRITING_LATIN and UNKNOWN all resolve to a threshold of 0.3."""
    # Checked in the same order as before; the first mismatch fails the test.
    for script_type in ("TYPEWRITER", "HANDWRITING_LATIN", "UNKNOWN"):
        assert get_threshold(script_type) == 0.3
def test_low_confidence_at_start():