"""Tests for OCR spell-check post-processing.""" import pytest from spell_check import correct_text, load_spell_checker @pytest.fixture(autouse=True) def ensure_loaded(): load_spell_checker() def test_known_german_word_passes_through(): assert correct_text("Haus") == "Haus" def test_obvious_gibberish_replaced_with_marker(): assert correct_text("xqzwrpvmk") == "[unleserlich]" def test_short_word_exempt_from_check(): assert correct_text("im") == "im" assert correct_text("der") == "der" assert correct_text("zu") == "zu" def test_unleserlich_marker_preserved(): assert correct_text("[unleserlich]") == "[unleserlich]" def test_mixed_text_correct_and_gibberish(): result = correct_text("Haus xqzwrpvmk Garten") assert result == "Haus [unleserlich] Garten" def test_adjacent_gibberish_words_collapsed_to_one_marker(): result = correct_text("[unleserlich] xqzwrpvmk Haus") assert result == "[unleserlich] Haus" def test_empty_string_returns_empty(): assert correct_text("") == "" def test_whitespace_only_returns_unchanged(): assert correct_text(" ") == " " def test_existing_marker_not_doubled(): result = correct_text("[unleserlich] Haus [unleserlich]") assert result == "[unleserlich] Haus [unleserlich]" def test_historical_word_passes_through(): assert correct_text("Thür") == "Thür" def test_correctable_ocr_error_gets_corrected(): result = correct_text("Hauus") assert result != "Hauus" assert result != "[unleserlich]" assert "[?]" in result assert result.startswith("Haus") def test_sentence_with_multiple_corrections(): result = correct_text("Thür Hauus xqzwrpvmk Garten") tokens = result.split() assert tokens[0] == "Thür" assert "[?]" in tokens[1] and tokens[1].startswith("Haus") assert tokens[2] == "[unleserlich]" assert tokens[3] == "Garten" def test_capitalization_preserved_on_correction(): # "Freunnd" (doubled n) — OCR duplication error for "Freund" result = correct_text("Freunnd") assert result != "Freunnd" assert result != "[unleserlich]" assert result[0].isupper() def test_raises_runtime_error_when_not_loaded(monkeypatch): import spell_check as sc monkeypatch.setattr(sc, "_spell", None) with pytest.raises(RuntimeError, match="not loaded"): correct_text("test") def test_punctuation_attached_token_passes_through(): assert correct_text("Haus,") == "Haus," assert correct_text("Garten.") == "Garten." def test_numeric_token_passes_through(): assert correct_text("1870er") == "1870er" assert correct_text("18.") == "18."