Files
familienarchiv/ocr-service/spell_check.py
Marcel ec85f228c1 refactor(ocr): document > 50 frequency threshold rationale
Strict greater-than avoids non-determinism: if multiple candidates share
the minimum frequency value, pyspellchecker's ranking is undefined.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 17:21:37 +02:00

118 lines
3.6 KiB
Python

"""German spell-check post-processing for OCR output."""
import logging
import os
from spellchecker import SpellChecker
from confidence import CORRECTION_MARKER, ILLEGIBLE_MARKER, collapse_adjacent_markers
logger = logging.getLogger(__name__)
_MIN_SPELL_CHECK_LEN = 4
_spell: SpellChecker | None = None
def load_spell_checker() -> None:
    """Initialize the module-level German spell checker.

    Augments pyspellchecker's base German dictionary with a supplementary
    historical wordlist when one is present on disk. Idempotent: a second
    call after a successful load is a no-op.
    """
    global _spell
    if _spell is not None:
        return
    logger.info("Loading German spell checker...")
    _spell = SpellChecker(language="de")
    wordlist_path = os.path.join(os.path.dirname(__file__), "dictionaries", "de_historical.txt")
    if not os.path.exists(wordlist_path):
        logger.warning("Historical German wordlist not found at %s", wordlist_path)
    else:
        entries: list[str] = []
        with open(wordlist_path, encoding="utf-8") as handle:
            for raw in handle:
                stripped = raw.strip()
                # Skip blank lines and comment lines (leading '#').
                if stripped and not raw.startswith("#"):
                    entries.append(stripped)
        _spell.word_frequency.load_words(entries)
        logger.info("Loaded %d historical German words", len(entries))
    logger.info("German spell checker ready")
def _strip_punctuation(token: str) -> tuple[str, str, str]:
"""Split token into (leading_punct, word, trailing_punct).
'Word' characters are letters (including German umlauts) and digits.
Everything else is treated as punctuation.
"""
start = 0
while start < len(token) and not (token[start].isalpha() or token[start].isdigit()):
start += 1
end = len(token)
while end > start and not (token[end - 1].isalpha() or token[end - 1].isdigit()):
end -= 1
return token[:start], token[start:end], token[end:]
def _is_numeric(word: str) -> bool:
return any(c.isdigit() for c in word)
def correct_text(text: str) -> str:
    """Spell-check OCR text, correcting errors and marking gibberish as [unleserlich].

    Per token: pre-existing [unleserlich] markers, short words (fewer than
    4 characters — particles, abbreviations), and digit-bearing tokens pass
    through untouched. Unknown words are either replaced by a dictionary
    correction tagged with [?] (e.g. "Hauus" -> "Haus[?]") or, when no
    sufficiently frequent correction exists, by [unleserlich]. Runs of
    adjacent [unleserlich] markers are collapsed into one.

    Args:
        text: OCR output, possibly already containing [unleserlich] from
            confidence filtering.

    Returns:
        Corrected text with unresolvable words replaced by [unleserlich].

    Raises:
        RuntimeError: if load_spell_checker() has not been called yet.
    """
    if _spell is None:
        raise RuntimeError("Spell checker not loaded — call load_spell_checker() first")
    if not text.strip():
        return text

    def _process(token: str) -> str:
        # One-token spell-check decision; returns the replacement token.
        if token == ILLEGIBLE_MARKER:
            return token
        prefix, core, suffix = _strip_punctuation(token)
        if len(core) < _MIN_SPELL_CHECK_LEN or _is_numeric(core):
            return token
        if _spell.known([core]):
            return token
        candidate = _spell.correction(core)
        # Strict > avoids non-determinism when candidates tie at the frequency floor.
        if not candidate or _spell.word_frequency[candidate] <= 50:
            return ILLEGIBLE_MARKER
        # Preserve the original capitalization (OCR of proper nouns etc.).
        if core[0].isupper() and not candidate[0].isupper():
            candidate = candidate.capitalize()
        return prefix + candidate + CORRECTION_MARKER + suffix

    return " ".join(collapse_adjacent_markers([_process(t) for t in text.split()]))