"""Tests for confidence-based [unleserlich] marker insertion.""" import os import pytest from confidence import apply_confidence_markers, words_from_characters, get_threshold # ─── apply_confidence_markers ───────────────────────────────────────────────── def test_all_words_above_threshold_passes_through(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "Freund", "confidence": 0.88}, ] assert apply_confidence_markers(words) == "Lieber Freund" def test_single_low_confidence_word_replaced(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "xkqz", "confidence": 0.1}, {"text": "Freund", "confidence": 0.88}, ] assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund" def test_adjacent_low_confidence_words_collapsed(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "xkqz", "confidence": 0.1}, {"text": "abc", "confidence": 0.05}, {"text": "yyy", "confidence": 0.2}, {"text": "Freund", "confidence": 0.88}, ] assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund" def test_mixed_high_low_each_group_gets_marker(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "xkqz", "confidence": 0.1}, {"text": "wie", "confidence": 0.9}, {"text": "abc", "confidence": 0.05}, {"text": "dir", "confidence": 0.88}, ] assert apply_confidence_markers(words) == "Lieber [unleserlich] wie [unleserlich] dir" def test_all_below_threshold_returns_single_marker(): words = [ {"text": "xkq", "confidence": 0.1}, {"text": "zzz", "confidence": 0.05}, ] assert apply_confidence_markers(words) == "[unleserlich]" def test_empty_list_returns_empty_string(): assert apply_confidence_markers([]) == "" def test_single_word_above_threshold(): words = [{"text": "Hallo", "confidence": 0.9}] assert apply_confidence_markers(words) == "Hallo" def test_exact_threshold_passes_through(): """Confidence exactly at threshold should NOT be replaced (strict <).""" words = [{"text": "Wort", "confidence": 0.3}] assert apply_confidence_markers(words) == "Wort" def test_just_below_threshold_replaced(): words = [{"text": "Wort", "confidence": 0.29}] assert apply_confidence_markers(words) == "[unleserlich]" def test_custom_threshold_via_parameter(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "Freund", "confidence": 0.5}, ] assert apply_confidence_markers(words, threshold=0.8) == "Lieber [unleserlich]" assert apply_confidence_markers(words, threshold=0.3) == "Lieber Freund" def test_kurrent_threshold_is_higher_than_default(): default = get_threshold("TYPEWRITER") kurrent = get_threshold("HANDWRITING_KURRENT") assert kurrent > default def test_get_threshold_kurrent(): assert get_threshold("HANDWRITING_KURRENT") == 0.5 def test_get_threshold_default(): assert get_threshold("TYPEWRITER") == 0.3 assert get_threshold("HANDWRITING_LATIN") == 0.3 assert get_threshold("UNKNOWN") == 0.3 def test_low_confidence_at_start(): words = [ {"text": "xkq", "confidence": 0.1}, {"text": "Freund", "confidence": 0.88}, ] assert apply_confidence_markers(words) == "[unleserlich] Freund" def test_low_confidence_at_end(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "xkq", "confidence": 0.1}, ] assert apply_confidence_markers(words) == "Lieber [unleserlich]" # ─── words_from_characters ──────────────────────────────────────────────────── def test_single_word_matching_confidences(): words = words_from_characters("Hallo", [0.9, 0.8, 0.85, 0.7, 0.95]) assert len(words) == 1 assert words[0]["text"] == "Hallo" assert abs(words[0]["confidence"] - 0.84) < 0.01 def test_multi_word_with_spaces(): prediction = "Sehr geehrter" confidences = [0.9, 0.8, 0.7, 0.6, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] words = words_from_characters(prediction, confidences) assert len(words) == 2 assert words[0]["text"] == "Sehr" assert words[1]["text"] == "geehrter" def test_length_mismatch_falls_back_safely(): words = words_from_characters("Hallo Welt", [0.9, 0.8]) assert len(words) == 1 assert words[0]["text"] == "Hallo Welt" assert words[0]["confidence"] == 1.0 def test_empty_prediction_returns_empty(): assert words_from_characters("", []) == [] def test_single_character_word(): words = words_from_characters("A B", [0.9, 0.5, 0.3]) assert len(words) == 2 assert words[0]["text"] == "A" assert words[0]["confidence"] == 0.9 assert words[1]["text"] == "B" assert words[1]["confidence"] == 0.3 def test_whitespace_only_prediction(): words = words_from_characters(" ", [0.5, 0.5, 0.5]) assert words == []