"""Tests for OCR spell-check post-processing."""

import pytest

from spell_check import correct_text, load_spell_checker


@pytest.fixture(autouse=True)
def ensure_loaded():
    """Call ``load_spell_checker()`` before every test in this module."""
    load_spell_checker()


def test_known_german_word_passes_through():
    """``"Haus"`` is returned unchanged."""
    assert correct_text("Haus") == "Haus"


def test_obvious_gibberish_replaced_with_marker():
    """A gibberish token is replaced by the ``[unleserlich]`` marker."""
    assert correct_text("xqzwrpvmk") == "[unleserlich]"


def test_short_word_exempt_from_check():
    """Two- and three-letter words come back unchanged."""
    assert correct_text("im") == "im"
    assert correct_text("der") == "der"
    assert correct_text("zu") == "zu"


def test_unleserlich_marker_preserved():
    """An input that is already the marker is returned as-is."""
    assert correct_text("[unleserlich]") == "[unleserlich]"


def test_mixed_text_correct_and_gibberish():
    """Only the gibberish token of a mixed sentence becomes the marker."""
    result = correct_text("Haus xqzwrpvmk Garten")
    assert result == "Haus [unleserlich] Garten"


def test_adjacent_gibberish_words_collapsed_to_one_marker():
    """A marker followed by a gibberish token yields a single marker."""
    result = correct_text("[unleserlich] xqzwrpvmk Haus")
    assert result == "[unleserlich] Haus"


def test_empty_string_returns_empty():
    """The empty string maps to the empty string."""
    assert correct_text("") == ""


def test_whitespace_only_returns_unchanged():
    """Whitespace-only input is returned unchanged."""
    # NOTE(review): this file was recovered from text whose whitespace runs
    # had been collapsed, so the original literal may have held more than
    # one space -- confirm against the repository copy.
    assert correct_text(" ") == " "


def test_existing_marker_not_doubled():
    """Markers separated by a regular word both survive, undoubled."""
    result = correct_text("[unleserlich] Haus [unleserlich]")
    assert result == "[unleserlich] Haus [unleserlich]"


def test_historical_word_passes_through():
    """The historical spelling ``"Thür"`` is returned unchanged."""
    assert correct_text("Thür") == "Thür"


def test_correctable_ocr_error_gets_corrected():
    """``"Hauus"`` is corrected to ``"Haus"`` and flagged with ``[?]``."""
    result = correct_text("Hauus")
    assert result == "Haus[?]"


def test_sentence_with_multiple_corrections():
    """Pass-through, correction and marker replacement work in one sentence."""
    result = correct_text("Thür Hauus xqzwrpvmk Garten")
    assert result == "Thür Haus[?] [unleserlich] Garten"


def test_capitalization_preserved_on_correction():
    """A corrected capitalised word still starts with an upper-case letter."""
    # "Freunnd" (doubled n) — OCR duplication error for "Freund"
    result = correct_text("Freunnd")
    # The exact suggestion is deliberately not pinned: the token only has to
    # change, must not become the marker, and must keep its leading capital.
    assert result != "Freunnd"
    assert result != "[unleserlich]"
    assert result[0].isupper()


def test_raises_runtime_error_when_not_loaded(monkeypatch):
    """``correct_text`` raises ``RuntimeError`` when ``_spell`` is ``None``."""
    import spell_check as sc

    # The autouse fixture has already run, so reset the module-level
    # ``_spell`` attribute; monkeypatch restores it after the test.
    monkeypatch.setattr(sc, "_spell", None)
    with pytest.raises(RuntimeError, match="not loaded"):
        correct_text("test")


def test_punctuation_attached_token_passes_through():
    """Words with trailing punctuation are returned unchanged."""
    assert correct_text("Haus,") == "Haus,"
    assert correct_text("Garten.") == "Garten."


def test_numeric_token_passes_through():
    """Tokens that start with digits are returned unchanged."""
    assert correct_text("1870er") == "1870er"
    assert correct_text("18.") == "18."