feat(normalizer): scaffold tool + config tables
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -26,3 +26,7 @@ node_modules/
|
|||||||
|
|
||||||
# Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift.
|
# Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift.
|
||||||
frontend/yarn.lock
|
frontend/yarn.lock
|
||||||
|
|
||||||
|
**/.venv/
|
||||||
|
**/__pycache__/
|
||||||
|
*.pyc
|
||||||
|
|||||||
5
tools/import-normalizer/.gitignore
vendored
Normal file
5
tools/import-normalizer/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
.venv/
|
||||||
|
out/
|
||||||
|
review/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
100
tools/import-normalizer/config.py
Normal file
100
tools/import-normalizer/config.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
"""Tunables for the import normalizer. No logic here — only data tables."""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# --- Paths ---
# Everything is anchored on this file's resolved location, so the values do
# not depend on the current working directory.
BASE_DIR = Path(__file__).resolve().parent
REPO_ROOT = BASE_DIR.parent.parent  # two directory levels above BASE_DIR
IMPORT_DIR = REPO_ROOT.joinpath("import")

# Source workbooks and the sheet name paired with each.
DOCUMENT_WORKBOOK = IMPORT_DIR.joinpath(
    "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx"
)
DOCUMENT_SHEET = "Familienarchiv"
PERSON_WORKBOOK = IMPORT_DIR.joinpath("Personendatei 2.xlsx")
PERSON_SHEET = "Tabelle1"

# Working directories that live next to this file.
OUT_DIR = BASE_DIR.joinpath("out")
REVIEW_DIR = BASE_DIR.joinpath("review")
OVERRIDES_DIR = BASE_DIR.joinpath("overrides")
|
||||||
|
|
||||||
|
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
# Keys are the German column headers; values are the canonical field names.
DOCUMENT_HEADER_MAP = {
    "index": "index",
    "datei": "file",
    "box": "box",
    "mappe": "folder",
    "briefeschreiberin": "sender",  # German: letter writer
    "empfängerin": "receivers",  # German: recipient
    "datum des briefes": "date",  # German: date of the letter
    "ort": "location",
    "schlagwort": "tags",  # German: keyword
    "inhalt": "summary",  # German: content
}

# Canonical document fields listed as required.
DOCUMENT_REQUIRED_FIELDS = {"index"}
|
||||||
|
|
||||||
|
# German person-sheet column headers -> canonical field names.
PERSON_HEADER_MAP = {
    "generation": "generation",
    "familienname": "last_name",
    "vorname": "first_name",
    "geb als": "maiden_name",  # German: "born as"
    "geburtsdatum": "birth_date",
    "geburtsort": "birth_place",
    "todesdatum": "death_date",
    "sterbeort": "death_place",
    "verheiratet mit": "spouse",  # German: "married to"
    "bemerkung": "notes",  # German: remark
}

# Canonical person fields listed as required.
PERSON_REQUIRED_FIELDS = {"last_name"}
|
||||||
|
|
||||||
|
# --- Century rule (archive 1873–1957) ---
# Two-digit years map onto a century as follows:
#   00..57 -> 1900+yy
#   73..99 -> 1800+yy
#   58..72 -> ambiguous -> UNKNOWN
TWO_DIGIT_19XX_MAX = 57
TWO_DIGIT_18XX_MIN = 73
|
||||||
|
|
||||||
|
# --- Seasons -> representative month (day = 1) ---
SEASON_MONTHS = {
    # Spring, including the ASCII transliterations of the umlaut spellings.
    "frühling": 4,
    "fruehling": 4,
    "frühjahr": 4,
    "fruehjahr": 4,
    # Summer, autumn, winter.
    "sommer": 7,
    "herbst": 10,
    "winter": 1,
}
|
||||||
|
|
||||||
|
# --- Fixed feasts -> (month, day) ---
FIXED_FEASTS = {
    # New Year's Day
    "neujahr": (1, 1),
    # Christmas Eve
    "heiligabend": (12, 24),
    "heiliger abend": (12, 24),
    # Christmas Day
    "weihnachten": (12, 25),
    "weihnacht": (12, 25),
    "1. weihnachtstag": (12, 25),
    # New Year's Eve (both spellings)
    "silvester": (12, 31),
    "sylvester": (12, 31),
}
|
||||||
|
|
||||||
|
# --- Movable feasts -> day offset from Easter Sunday ---
MOVABLE_FEASTS = {
    "karfreitag": -2,  # Good Friday
    "ostern": 0,  # Easter
    "ostersonntag": 0,  # Easter Sunday
    "ostermontag": 1,  # Easter Monday
    "himmelfahrt": 39,  # Ascension
    "christi himmelfahrt": 39,
    "pfingsten": 49,  # Pentecost
    "pfingstsonntag": 49,
    "pfingstmontag": 50,  # Whit Monday
    "fronleichnam": 60,  # Corpus Christi
}
|
||||||
|
|
||||||
|
# --- Month names -> number (German + English, full + abbreviations) ---
# One tuple of accepted spellings per month, in calendar order; the tuple's
# position (1-based) is the month number.
_MONTH_ALIASES = (
    ("januar", "jan", "january"),
    ("februar", "feb", "febr", "february"),
    ("märz", "maerz", "mär", "mar", "march"),
    ("april", "apr"),
    ("mai", "may"),
    ("juni", "jun", "june"),
    ("juli", "jul", "july"),
    ("august", "aug"),
    ("september", "sep", "sept"),
    ("oktober", "okt", "oct", "october"),
    ("november", "nov"),
    ("dezember", "dez", "dec", "december"),
)
MONTHS = {
    alias: number
    for number, aliases in enumerate(_MONTH_ALIASES, start=1)
    for alias in aliases
}
|
||||||
|
|
||||||
|
# Lowercase Roman numerals i..xii -> month number 1..12.
ROMAN_MONTHS = {
    numeral: number
    for number, numeral in enumerate(
        ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"),
        start=1,
    )
}
|
||||||
|
|
||||||
|
# --- Person matching ---
# Order is kept exactly as originally listed.
KNOWN_LAST_NAMES = [
    "von der Heide",
    "von Massenbach",
    "von Geldern",
    "von Gelden",
    "von Staa",
    "de Gruyter",
    "Dieckmann",
    "Gruber",
    "Müller",
    "Wolff",
    "Cram",
]

# difflib ratio; suggestions only, never auto-applied
FUZZY_SUGGEST_THRESHOLD = 0.82
|
||||||
2
tools/import-normalizer/requirements.txt
Normal file
2
tools/import-normalizer/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
openpyxl==3.1.5
|
||||||
|
pytest==8.3.4
|
||||||
0
tools/import-normalizer/tests/__init__.py
Normal file
0
tools/import-normalizer/tests/__init__.py
Normal file
13
tools/import-normalizer/tests/test_config.py
Normal file
13
tools/import-normalizer/tests/test_config.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
import config
|
||||||
|
|
||||||
|
def test_century_boundaries():
    """Pin the two-digit-year century cutoffs defined in config."""
    latest_19xx = config.TWO_DIGIT_19XX_MAX
    assert latest_19xx == 57

    earliest_18xx = config.TWO_DIGIT_18XX_MIN
    assert earliest_18xx == 73
|
||||||
|
|
||||||
|
def test_header_maps_cover_required_fields():
    """Check that every required canonical field is produced by a header map.

    The two pinned checks keep the original expectations. The subset checks
    read the required-field sets from config, so a field added to either set
    without a matching header mapping fails here as well.
    """
    document_fields = set(config.DOCUMENT_HEADER_MAP.values())
    person_fields = set(config.PERSON_HEADER_MAP.values())

    # Original, explicitly pinned expectations.
    assert "index" in document_fields
    assert "last_name" in person_fields

    # Every declared required field must be a value of its header map.
    assert config.DOCUMENT_REQUIRED_FIELDS <= document_fields
    assert config.PERSON_REQUIRED_FIELDS <= person_fields
|
||||||
|
|
||||||
|
def test_feast_tables_present():
    """Spot-check one entry each from the movable-feast and season tables."""
    pentecost_offset = config.MOVABLE_FEASTS["pfingsten"]
    assert pentecost_offset == 49

    autumn_month = config.SEASON_MONTHS["herbst"]
    assert autumn_month == 10
|
||||||
Reference in New Issue
Block a user