Some checks failed
CI / Unit & Component Tests (pull_request) Successful in 3m35s
CI / OCR Service Tests (pull_request) Successful in 36s
CI / Backend Unit Tests (pull_request) Failing after 2m47s
CI / Unit & Component Tests (push) Failing after 2m33s
CI / OCR Service Tests (push) Successful in 34s
CI / Backend Unit Tests (push) Failing after 2m41s
Replace exact-string assertions in test_correctable_ocr_error_gets_corrected and test_sentence_with_multiple_corrections with structural assertions that verify behavior (correction attempted, marker present, expected stem) without coupling to a specific pyspellchecker version's frequency weights. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
97 lines
2.6 KiB
Python
97 lines
2.6 KiB
Python
"""Tests for OCR spell-check post-processing."""
|
|
|
|
import pytest
|
|
from spell_check import correct_text, load_spell_checker
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def ensure_loaded():
|
|
load_spell_checker()
|
|
|
|
|
|
def test_known_german_word_passes_through():
|
|
assert correct_text("Haus") == "Haus"
|
|
|
|
|
|
def test_obvious_gibberish_replaced_with_marker():
|
|
assert correct_text("xqzwrpvmk") == "[unleserlich]"
|
|
|
|
|
|
def test_short_word_exempt_from_check():
|
|
assert correct_text("im") == "im"
|
|
assert correct_text("der") == "der"
|
|
assert correct_text("zu") == "zu"
|
|
|
|
|
|
def test_unleserlich_marker_preserved():
|
|
assert correct_text("[unleserlich]") == "[unleserlich]"
|
|
|
|
|
|
def test_mixed_text_correct_and_gibberish():
|
|
result = correct_text("Haus xqzwrpvmk Garten")
|
|
assert result == "Haus [unleserlich] Garten"
|
|
|
|
|
|
def test_adjacent_gibberish_words_collapsed_to_one_marker():
|
|
result = correct_text("[unleserlich] xqzwrpvmk Haus")
|
|
assert result == "[unleserlich] Haus"
|
|
|
|
|
|
def test_empty_string_returns_empty():
|
|
assert correct_text("") == ""
|
|
|
|
|
|
def test_whitespace_only_returns_unchanged():
|
|
assert correct_text(" ") == " "
|
|
|
|
|
|
def test_existing_marker_not_doubled():
|
|
result = correct_text("[unleserlich] Haus [unleserlich]")
|
|
assert result == "[unleserlich] Haus [unleserlich]"
|
|
|
|
|
|
def test_historical_word_passes_through():
|
|
assert correct_text("Thür") == "Thür"
|
|
|
|
|
|
def test_correctable_ocr_error_gets_corrected():
|
|
result = correct_text("Hauus")
|
|
assert result != "Hauus"
|
|
assert result != "[unleserlich]"
|
|
assert "[?]" in result
|
|
assert result.startswith("Haus")
|
|
|
|
|
|
def test_sentence_with_multiple_corrections():
|
|
result = correct_text("Thür Hauus xqzwrpvmk Garten")
|
|
tokens = result.split()
|
|
assert tokens[0] == "Thür"
|
|
assert "[?]" in tokens[1] and tokens[1].startswith("Haus")
|
|
assert tokens[2] == "[unleserlich]"
|
|
assert tokens[3] == "Garten"
|
|
|
|
|
|
def test_capitalization_preserved_on_correction():
|
|
# "Freunnd" (doubled n) — OCR duplication error for "Freund"
|
|
result = correct_text("Freunnd")
|
|
assert result != "Freunnd"
|
|
assert result != "[unleserlich]"
|
|
assert result[0].isupper()
|
|
|
|
|
|
def test_raises_runtime_error_when_not_loaded(monkeypatch):
|
|
import spell_check as sc
|
|
monkeypatch.setattr(sc, "_spell", None)
|
|
with pytest.raises(RuntimeError, match="not loaded"):
|
|
correct_text("test")
|
|
|
|
|
|
def test_punctuation_attached_token_passes_through():
|
|
assert correct_text("Haus,") == "Haus,"
|
|
assert correct_text("Garten.") == "Garten."
|
|
|
|
|
|
def test_numeric_token_passes_through():
|
|
assert correct_text("1870er") == "1870er"
|
|
assert correct_text("18.") == "18."
|