feat(ocr): add DTA-derived historical German wordlist and generation script

153K words from the dtak+dtae 1800-1899 corpora (min_freq=20),
covering pre-reform spellings common in Kurrent/Sütterlin documents.
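
To regenerate the shipped list, run the script below with --min-freq raised
from its default of 5 to the value used here:

    python scripts/prepare_historical_dict.py --min-freq 20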

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Marcel committed 2026-04-17 16:48:26 +02:00
commit 30a6cbeb7f (parent 6faaa3b7d6)
2 changed files with 153641 additions and 0 deletions

ocr-service/dictionaries/de_historical.txt: diff suppressed because it is too large
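
Per the generation script below, the suppressed wordlist starts with this
header (assuming --min-freq 20 as stated in the commit message), followed by
one word per line, most frequent first:

    # Historical German wordlist derived from Deutsches Textarchiv (DTA)
    # Corpora: dtak + dtae 1800-1899 original spelling
    # Filter: alphabetic, length >= 4, min_freq >= 20
    # Sorted by frequency descending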

scripts/prepare_historical_dict.py

@@ -0,0 +1,90 @@
"""Prepare historical German wordlist from Deutsches Textarchiv (DTA) corpora.
Downloads two 19th-century DTA ZIPs (original spelling, not normalised),
tokenises all texts, counts frequencies, filters to alphabetic tokens of
length > 3 with at least --min-freq occurrences, and writes the result to
ocr-service/dictionaries/de_historical.txt sorted by frequency descending.
Usage:
python scripts/prepare_historical_dict.py
python scripts/prepare_historical_dict.py --min-freq 10 --out path/to/output.txt
"""

import argparse
import io
import re
import urllib.request
import zipfile
from collections import Counter
from pathlib import Path

DTA_URLS = [
    "https://www.deutschestextarchiv.de/media/download/dtak/2020-10-23/original/1800-1899.zip",
    "https://www.deutschestextarchiv.de/media/download/dtae/2020-10-23/original/1800-1899.zip",
]

# A token is a run of ASCII letters, German umlauts, or ß; any other character
# (including the historical long s, ſ) splits the token.
_ALPHA_RE = re.compile(r"[a-zA-ZäöüÄÖÜß]+")
_MIN_WORD_LEN = 4


def _download(url: str) -> bytes:
    print(f"Downloading {url} ...")
    with urllib.request.urlopen(url, timeout=300) as resp:
        data = resp.read()
    print(f" Downloaded {len(data) // 1024 // 1024} MB")
    return data


def _tokenise_zip(data: bytes) -> Counter:
    counts: Counter = Counter()
    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        names = [n for n in zf.namelist() if n.endswith(".txt")]
        print(f" Tokenising {len(names)} text files ...")
        for name in names:
            try:
                # Decode leniently so one malformed file cannot abort the run.
                text = zf.read(name).decode("utf-8", errors="replace")
            except Exception:
                continue
            for token in _ALPHA_RE.findall(text):
                if len(token) >= _MIN_WORD_LEN:
                    counts[token] += 1
    return counts


def main(min_freq: int, out_path: Path) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    total: Counter = Counter()
    for url in DTA_URLS:
        try:
            data = _download(url)
            total += _tokenise_zip(data)
        except Exception as exc:
            print(f" WARNING: could not process {url}: {exc}")
    filtered = {word: freq for word, freq in total.items() if freq >= min_freq}
    print(f"Vocabulary after filtering (min_freq={min_freq}): {len(filtered):,} words")
    header = (
        "# Historical German wordlist derived from Deutsches Textarchiv (DTA)\n"
        "# Corpora: dtak + dtae 1800-1899 original spelling\n"
        f"# Filter: alphabetic, length >= {_MIN_WORD_LEN}, min_freq >= {min_freq}\n"
        "# Sorted by frequency descending\n"
    )
    # One word per line, most frequent first; frequencies are not written out.
    lines = [word for word, _ in sorted(filtered.items(), key=lambda x: -x[1])]
    out_path.write_text(header + "\n".join(lines) + "\n", encoding="utf-8")
    print(f"Written {len(lines):,} words to {out_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare DTA historical German wordlist")
    parser.add_argument("--min-freq", type=int, default=5, help="Minimum token frequency")
    parser.add_argument(
        "--out",
        type=Path,
        default=Path(__file__).parent.parent / "ocr-service" / "dictionaries" / "de_historical.txt",
        help="Output path",
    )
    args = parser.parse_args()
    main(args.min_freq, args.out)
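
A minimal sketch of loading the generated list into a lookup set; the path is
the script's default output, and load_wordlist is a hypothetical helper (this
commit does not show how ocr-service consumes the file):

from pathlib import Path

def load_wordlist(path: Path) -> set[str]:
    """Read de_historical.txt, skipping the '#' header lines written by main()."""
    words: set[str] = set()
    for line in path.read_text(encoding="utf-8").splitlines():
        word = line.strip()
        if word and not word.startswith("#"):
            words.add(word)
    return words

vocab = load_wordlist(Path("ocr-service/dictionaries/de_historical.txt"))
print(f"{len(vocab):,} historical forms loaded")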