"""Prepare historical German wordlist from Deutsches Textarchiv (DTA) corpora. Downloads two 19th-century DTA ZIPs (original spelling, not normalised), tokenises all texts, counts frequencies, filters to alphabetic tokens of length > 3 with at least --min-freq occurrences, and writes the result to ocr-service/dictionaries/de_historical.txt sorted by frequency descending. Usage: python scripts/prepare_historical_dict.py python scripts/prepare_historical_dict.py --min-freq 10 --out path/to/output.txt """ import argparse import io import re import zipfile from collections import Counter from pathlib import Path import urllib.request DTA_URLS = [ "https://www.deutschestextarchiv.de/media/download/dtak/2020-10-23/original/1800-1899.zip", "https://www.deutschestextarchiv.de/media/download/dtae/2020-10-23/original/1800-1899.zip", ] _ALPHA_RE = re.compile(r"[a-zA-ZäöüÄÖÜß]+") _MIN_WORD_LEN = 4 def _download(url: str) -> bytes: print(f"Downloading {url} ...") with urllib.request.urlopen(url, timeout=300) as resp: data = resp.read() print(f" Downloaded {len(data) // 1024 // 1024} MB") return data def _tokenise_zip(data: bytes) -> Counter: counts: Counter = Counter() with zipfile.ZipFile(io.BytesIO(data)) as zf: names = [n for n in zf.namelist() if n.endswith(".txt")] print(f" Tokenising {len(names)} text files ...") for name in names: try: text = zf.read(name).decode("utf-8", errors="replace") except Exception: continue for token in _ALPHA_RE.findall(text): if len(token) >= _MIN_WORD_LEN: counts[token] += 1 return counts def main(min_freq: int, out_path: Path) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) total: Counter = Counter() for url in DTA_URLS: try: data = _download(url) total += _tokenise_zip(data) except Exception as exc: print(f" WARNING: could not process {url}: {exc}") filtered = {word: freq for word, freq in total.items() if freq >= min_freq} print(f"Vocabulary after filtering (min_freq={min_freq}): {len(filtered):,} words") header = ( "# Historical German wordlist derived from Deutsches Textarchiv (DTA)\n" "# Corpora: dtak + dtae 1800-1899 original spelling\n" f"# Filter: alphabetic, length >= {_MIN_WORD_LEN}, min_freq >= {min_freq}\n" "# Sorted by frequency descending\n" ) lines = [f"{word}" for word, _ in sorted(filtered.items(), key=lambda x: -x[1])] out_path.write_text(header + "\n".join(lines) + "\n", encoding="utf-8") print(f"Written {len(lines):,} words to {out_path}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Prepare DTA historical German wordlist") parser.add_argument("--min-freq", type=int, default=5, help="Minimum token frequency") parser.add_argument( "--out", type=Path, default=Path(__file__).parent.parent / "ocr-service" / "dictionaries" / "de_historical.txt", help="Output path", ) args = parser.parse_args() main(args.min_freq, args.out)