# Patch overview (commentary only; everything from the first "diff --git"
# header onward is the patch itself, laid out one record per line).
#
# Files touched:
#   .gitignore
#       Appends ignore rules for .venv/, __pycache__/ and *.pyc at any depth.
#   tools/import-normalizer/.gitignore            (new)
#       Ignores .venv/, out/, review/, __pycache__/ and *.pyc for the tool.
#   tools/import-normalizer/config.py             (new)
#       Data tables only: workbook/sheet paths, spreadsheet-header -> field
#       maps, the two-digit-year century rule, season / fixed-feast /
#       movable-feast / month lookup tables, and person-matching settings.
#   tools/import-normalizer/requirements.txt      (new)
#       Pins openpyxl==3.1.5 and pytest==8.3.4.
#   tools/import-normalizer/tests/__init__.py     (new, empty)
#   tools/import-normalizer/tests/test_config.py  (new)
#       Three tests that pin selected values from config.py.
#
# NOTE(review): in the first hunk, "node_modules/" is read as the hunk's
#   section heading (text after the closing "@@"), followed by one blank
#   context line -- confirm against the real .gitignore.
# NOTE(review): test_header_maps_cover_required_fields checks the literals
#   "index" and "last_name" rather than DOCUMENT_REQUIRED_FIELDS /
#   PERSON_REQUIRED_FIELDS; a newly added required field would not be covered.
# NOTE(review): config.py defines OVERRIDES_DIR, but overrides/ is not listed
#   in the tool's .gitignore (out/ and review/ are) -- presumably overrides
#   are meant to be committed; confirm.
diff --git a/.gitignore b/.gitignore
index 60d3f1e8..2866e304 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,7 @@ node_modules/
 
 # Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift.
 frontend/yarn.lock
+
+**/.venv/
+**/__pycache__/
+*.pyc
diff --git a/tools/import-normalizer/.gitignore b/tools/import-normalizer/.gitignore
new file mode 100644
index 00000000..d48fb3f8
--- /dev/null
+++ b/tools/import-normalizer/.gitignore
@@ -0,0 +1,5 @@
+.venv/
+out/
+review/
+__pycache__/
+*.pyc
diff --git a/tools/import-normalizer/config.py b/tools/import-normalizer/config.py
new file mode 100644
index 00000000..180fe06c
--- /dev/null
+++ b/tools/import-normalizer/config.py
@@ -0,0 +1,100 @@
+"""Tunables for the import normalizer. No logic here — only data tables."""
+from pathlib import Path
+
+# --- Paths ---
+BASE_DIR = Path(__file__).resolve().parent
+REPO_ROOT = BASE_DIR.parent.parent
+IMPORT_DIR = REPO_ROOT / "import"
+
+DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx"
+DOCUMENT_SHEET = "Familienarchiv"
+PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx"
+PERSON_SHEET = "Tabelle1"
+
+OUT_DIR = BASE_DIR / "out"
+REVIEW_DIR = BASE_DIR / "review"
+OVERRIDES_DIR = BASE_DIR / "overrides"
+
+# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
+DOCUMENT_HEADER_MAP = {
+    "index": "index",
+    "datei": "file",
+    "box": "box",
+    "mappe": "folder",
+    "briefeschreiberin": "sender",
+    "empfängerin": "receivers",
+    "datum des briefes": "date",
+    "ort": "location",
+    "schlagwort": "tags",
+    "inhalt": "summary",
+}
+DOCUMENT_REQUIRED_FIELDS = {"index"}
+
+PERSON_HEADER_MAP = {
+    "generation": "generation",
+    "familienname": "last_name",
+    "vorname": "first_name",
+    "geb als": "maiden_name",
+    "geburtsdatum": "birth_date",
+    "geburtsort": "birth_place",
+    "todesdatum": "death_date",
+    "sterbeort": "death_place",
+    "verheiratet mit": "spouse",
+    "bemerkung": "notes",
+}
+PERSON_REQUIRED_FIELDS = {"last_name"}
+
+# --- Century rule (archive 1873–1957) ---
+TWO_DIGIT_19XX_MAX = 57  # 00..57 -> 1900+yy
+TWO_DIGIT_18XX_MIN = 73  # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN
+
+# --- Seasons -> representative month (day = 1) ---
+SEASON_MONTHS = {
+    "frühling": 4, "fruehling": 4, "frühjahr": 4, "fruehjahr": 4,
+    "sommer": 7, "herbst": 10, "winter": 1,
+}
+
+# --- Fixed feasts -> (month, day) ---
+FIXED_FEASTS = {
+    "neujahr": (1, 1),
+    "heiligabend": (12, 24), "heiliger abend": (12, 24),
+    "weihnachten": (12, 25), "weihnacht": (12, 25), "1. weihnachtstag": (12, 25),
+    "silvester": (12, 31), "sylvester": (12, 31),
+}
+
+# --- Movable feasts -> day offset from Easter Sunday ---
+MOVABLE_FEASTS = {
+    "karfreitag": -2,
+    "ostern": 0, "ostersonntag": 0, "ostermontag": 1,
+    "himmelfahrt": 39, "christi himmelfahrt": 39,
+    "pfingsten": 49, "pfingstsonntag": 49, "pfingstmontag": 50,
+    "fronleichnam": 60,
+}
+
+# --- Month names -> number (German + English, full + abbreviations) ---
+MONTHS = {
+    "januar": 1, "jan": 1, "january": 1,
+    "februar": 2, "feb": 2, "febr": 2, "february": 2,
+    "märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3,
+    "april": 4, "apr": 4,
+    "mai": 5, "may": 5,
+    "juni": 6, "jun": 6, "june": 6,
+    "juli": 7, "jul": 7, "july": 7,
+    "august": 8, "aug": 8,
+    "september": 9, "sep": 9, "sept": 9,
+    "oktober": 10, "okt": 10, "oct": 10, "october": 10,
+    "november": 11, "nov": 11,
+    "dezember": 12, "dez": 12, "dec": 12, "december": 12,
+}
+
+ROMAN_MONTHS = {
+    "i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6,
+    "vii": 7, "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12,
+}
+
+# --- Person matching ---
+KNOWN_LAST_NAMES = [
+    "von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa",
+    "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram",
+]
+FUZZY_SUGGEST_THRESHOLD = 0.82  # difflib ratio; suggestions only, never auto-applied
diff --git a/tools/import-normalizer/requirements.txt b/tools/import-normalizer/requirements.txt
new file mode 100644
index 00000000..886c2074
--- /dev/null
+++ b/tools/import-normalizer/requirements.txt
@@ -0,0 +1,2 @@
+openpyxl==3.1.5
+pytest==8.3.4
diff --git a/tools/import-normalizer/tests/__init__.py b/tools/import-normalizer/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/import-normalizer/tests/test_config.py b/tools/import-normalizer/tests/test_config.py
new file mode 100644
index 00000000..6384df41
--- /dev/null
+++ b/tools/import-normalizer/tests/test_config.py
@@ -0,0 +1,13 @@
+import config
+
+def test_century_boundaries():
+    assert config.TWO_DIGIT_19XX_MAX == 57
+    assert config.TWO_DIGIT_18XX_MIN == 73
+
+def test_header_maps_cover_required_fields():
+    assert "index" in config.DOCUMENT_HEADER_MAP.values()
+    assert "last_name" in config.PERSON_HEADER_MAP.values()
+
+def test_feast_tables_present():
+    assert config.MOVABLE_FEASTS["pfingsten"] == 49
+    assert config.SEASON_MONTHS["herbst"] == 10