feat(ocr): add DTA-derived historical German wordlist and generation script
153K words from dtak+dtae 1800-1899 corpora (min_freq=20), covering pre-reform spellings common in Kurrent/Süterlin documents. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
153551
ocr-service/dictionaries/de_historical.txt
Normal file
153551
ocr-service/dictionaries/de_historical.txt
Normal file
File diff suppressed because it is too large
Load Diff
90
scripts/prepare_historical_dict.py
Normal file
90
scripts/prepare_historical_dict.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""Prepare historical German wordlist from Deutsches Textarchiv (DTA) corpora.
|
||||
|
||||
Downloads two 19th-century DTA ZIPs (original spelling, not normalised),
|
||||
tokenises all texts, counts frequencies, filters to alphabetic tokens of
|
||||
length > 3 with at least --min-freq occurrences, and writes the result to
|
||||
ocr-service/dictionaries/de_historical.txt sorted by frequency descending.
|
||||
|
||||
Usage:
|
||||
python scripts/prepare_historical_dict.py
|
||||
python scripts/prepare_historical_dict.py --min-freq 10 --out path/to/output.txt
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import re
|
||||
import zipfile
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
import urllib.request
|
||||
|
||||
DTA_URLS = [
|
||||
"https://www.deutschestextarchiv.de/media/download/dtak/2020-10-23/original/1800-1899.zip",
|
||||
"https://www.deutschestextarchiv.de/media/download/dtae/2020-10-23/original/1800-1899.zip",
|
||||
]
|
||||
|
||||
_ALPHA_RE = re.compile(r"[a-zA-ZäöüÄÖÜß]+")
|
||||
_MIN_WORD_LEN = 4
|
||||
|
||||
|
||||
def _download(url: str) -> bytes:
    """Fetch *url* and return the raw response body as bytes.

    Prints simple progress messages; a 300-second timeout guards against
    a stalled connection.
    """
    print(f"Downloading {url} ...")
    with urllib.request.urlopen(url, timeout=300) as response:
        payload = response.read()
    size_mb = len(payload) // 1024 // 1024
    print(f"  Downloaded {size_mb} MB")
    return payload
|
||||
|
||||
|
||||
def _tokenise_zip(data: bytes) -> Counter:
    """Tokenise every ``.txt`` member of a ZIP archive given as raw bytes.

    Decodes each text member as UTF-8 (replacing undecodable bytes) and
    counts every token matching ``_ALPHA_RE`` with length >= ``_MIN_WORD_LEN``.

    A member that cannot be read is skipped with a warning so one corrupt
    entry does not abort the whole corpus; previously such failures were
    swallowed silently.

    Returns a Counter mapping token -> occurrence count.
    """
    counts: Counter = Counter()
    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        names = [n for n in zf.namelist() if n.endswith(".txt")]
        print(f"  Tokenising {len(names)} text files ...")
        for name in names:
            try:
                text = zf.read(name).decode("utf-8", errors="replace")
            except Exception as exc:
                # Best-effort: report the bad member instead of dropping it silently.
                print(f"  WARNING: skipping {name}: {exc}")
                continue
            counts.update(
                token
                for token in _ALPHA_RE.findall(text)
                if len(token) >= _MIN_WORD_LEN
            )
    return counts
|
||||
|
||||
|
||||
def main(min_freq: int, out_path: Path) -> None:
    """Build the historical German wordlist and write it to *out_path*.

    Downloads each corpus ZIP in ``DTA_URLS``, merges token frequencies,
    keeps words occurring at least *min_freq* times, and writes one word
    per line (after a commented header) sorted by frequency descending.
    Ties are broken alphabetically so repeated runs over the same corpora
    produce byte-identical output (the original sort left equal-frequency
    words in dict-insertion order).

    A corpus that fails to download or parse is reported and skipped
    rather than aborting the whole run.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    total: Counter = Counter()

    for url in DTA_URLS:
        try:
            data = _download(url)
            total += _tokenise_zip(data)
        except Exception as exc:
            # Best-effort: one unreachable corpus should not kill the run.
            print(f"  WARNING: could not process {url}: {exc}")

    filtered = {word: freq for word, freq in total.items() if freq >= min_freq}
    print(f"Vocabulary after filtering (min_freq={min_freq}): {len(filtered):,} words")

    header = (
        "# Historical German wordlist derived from Deutsches Textarchiv (DTA)\n"
        "# Corpora: dtak + dtae 1800-1899 original spelling\n"
        f"# Filter: alphabetic, length >= {_MIN_WORD_LEN}, min_freq >= {min_freq}\n"
        "# Sorted by frequency descending\n"
    )
    # Deterministic ordering: frequency descending, then alphabetical.
    ordered = sorted(filtered.items(), key=lambda kv: (-kv[1], kv[0]))
    lines = [word for word, _ in ordered]
    out_path.write_text(header + "\n".join(lines) + "\n", encoding="utf-8")
    print(f"Written {len(lines):,} words to {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default output lives inside the repo, relative to this script's location.
    default_out = (
        Path(__file__).parents[1] / "ocr-service" / "dictionaries" / "de_historical.txt"
    )

    parser = argparse.ArgumentParser(description="Prepare DTA historical German wordlist")
    parser.add_argument("--min-freq", type=int, default=5, help="Minimum token frequency")
    parser.add_argument("--out", type=Path, default=default_out, help="Output path")
    args = parser.parse_args()

    main(args.min_freq, args.out)
|
||||
Reference in New Issue
Block a user