"""Tunables for the import normalizer. No logic here — only data tables.""" from pathlib import Path # --- Paths --- BASE_DIR = Path(__file__).resolve().parent REPO_ROOT = BASE_DIR.parent.parent IMPORT_DIR = REPO_ROOT / "import" DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx" DOCUMENT_SHEET = "Familienarchiv" PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx" PERSON_SHEET = "Tabelle1" OUT_DIR = BASE_DIR / "out" REVIEW_DIR = BASE_DIR / "review" OVERRIDES_DIR = BASE_DIR / "overrides" # --- Header text (lowercased, whitespace-collapsed) -> canonical field --- DOCUMENT_HEADER_MAP = { "index": "index", "datei": "file", "box": "box", "mappe": "folder", "briefeschreiberin": "sender", "empfängerin": "receivers", "datum des briefes": "date", "ort": "location", "schlagwort": "tags", "inhalt": "summary", } DOCUMENT_REQUIRED_FIELDS = {"index"} PERSON_HEADER_MAP = { "generation": "generation", "familienname": "last_name", "vorname": "first_name", "geb als": "maiden_name", "geburtsdatum": "birth_date", "geburtsort": "birth_place", "todesdatum": "death_date", "sterbeort": "death_place", "verheiratet mit": "spouse", "bemerkung": "notes", } PERSON_REQUIRED_FIELDS = {"last_name"} # --- Century rule (archive 1873–1957) --- TWO_DIGIT_19XX_MAX = 57 # 00..57 -> 1900+yy TWO_DIGIT_18XX_MIN = 73 # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN # --- Seasons -> representative month (day = 1) --- SEASON_MONTHS = { "frühling": 4, "fruehling": 4, "frühjahr": 4, "fruehjahr": 4, "sommer": 7, "herbst": 10, "winter": 1, } # --- Fixed feasts -> (month, day) --- FIXED_FEASTS = { "neujahr": (1, 1), "heiligabend": (12, 24), "heiliger abend": (12, 24), "weihnachten": (12, 25), "weihnacht": (12, 25), "1. weihnachtstag": (12, 25), "silvester": (12, 31), "sylvester": (12, 31), } # --- Movable feasts -> day offset from Easter Sunday --- MOVABLE_FEASTS = { "karfreitag": -2, "ostern": 0, "ostersonntag": 0, "ostermontag": 1, "himmelfahrt": 39, "christi himmelfahrt": 39, "pfingsten": 49, "pfingstsonntag": 49, "pfingstmontag": 50, "fronleichnam": 60, } # --- Month names -> number (German + English, full + abbreviations) --- MONTHS = { "januar": 1, "jan": 1, "january": 1, "februar": 2, "feb": 2, "febr": 2, "february": 2, "märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3, "april": 4, "apr": 4, "mai": 5, "may": 5, "juni": 6, "jun": 6, "june": 6, "juli": 7, "jul": 7, "july": 7, "august": 8, "aug": 8, "september": 9, "sep": 9, "sept": 9, "oktober": 10, "okt": 10, "oct": 10, "october": 10, "november": 11, "nov": 11, "dezember": 12, "dez": 12, "dec": 12, "december": 12, # Spanish (Mexican-branch correspondence) "enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6, "julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10, "noviembre": 11, "diciembre": 12, } ROMAN_MONTHS = { "i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6, "vii": 7, "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12, } # --- Person matching --- KNOWN_LAST_NAMES = [ "von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa", "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram", ] FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied # --- Name classification (unresolved-name review) --- # Relational reference terms — a sender/receiver named by relation, not a proper name. RELATIONAL_TERMS = { "tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter", "großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine", "neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter", "schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer", } # Collective/group terms — not a single person. Matched against alpha-only word tokens # (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes. COLLECTIVE_TERMS = { "familie", "fam", "kinder", "eltern", "geschwister", "großeltern", "grosseltern", "alle", "diverse", "div", "gebrüder", "gebr", # Plural/group relational terms — added for tag generation heuristic "söhne", "töchter", "brüder", "schwestern", "schwiegereltern", "vettern", "kusinen", "cousinen", "nichten", "neffen", "tanten", "freunde", "bekannte", "geschw", "enkelkinder", "jungens", "verwandten", } # Markers of an unknown/illegible name (the literal "?" is handled separately in code). # All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn" # (it occurs inside real names: Hanni, Johanna, Anna). UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"} # A name-column value longer than this (chars) is treated as prose/description, not a name. PROSE_MAX_LEN = 40 # Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not # in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more. EXTRA_GIVEN_NAMES = { "ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara", "margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta", }