Files
familienarchiv/tools/import-normalizer/config.py
2026-05-25 15:43:31 +02:00

128 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tunables for the import normalizer. No logic here — only data tables."""
from pathlib import Path
# --- Filesystem locations ---
# Directory that holds this config module; every other path is derived from it.
BASE_DIR = Path(__file__).resolve().parent
# Two directory levels above BASE_DIR.
REPO_ROOT = BASE_DIR.parent.parent
IMPORT_DIR = REPO_ROOT.joinpath("import")

# Workbook paths (inside IMPORT_DIR) and the sheet name paired with each.
DOCUMENT_WORKBOOK = IMPORT_DIR.joinpath(
    "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx"
)
DOCUMENT_SHEET = "Familienarchiv"
PERSON_WORKBOOK = IMPORT_DIR.joinpath("Personendatei 2.xlsx")
PERSON_SHEET = "Tabelle1"

# Directories located directly beside this module.
OUT_DIR = BASE_DIR.joinpath("out")
REVIEW_DIR = BASE_DIR.joinpath("review")
OVERRIDES_DIR = BASE_DIR.joinpath("overrides")
# --- Document sheet: header text -> canonical field name ---
# Keys are header strings in their lowercased, whitespace-collapsed form.
DOCUMENT_HEADER_MAP = {
    "index": "index",
    "datei": "file",
    "box": "box",
    "mappe": "folder",
    "briefeschreiberin": "sender",
    "empfängerin": "receivers",
    "datum des briefes": "date",
    "ort": "location",
    "schlagwort": "tags",
    "inhalt": "summary",
}

# Canonical document fields marked as required.
DOCUMENT_REQUIRED_FIELDS = {"index"}
# Person sheet: header text -> canonical field name.
PERSON_HEADER_MAP = {
    "generation": "generation",
    "familienname": "last_name",
    "vorname": "first_name",
    "geb als": "maiden_name",
    "geburtsdatum": "birth_date",
    "geburtsort": "birth_place",
    "todesdatum": "death_date",
    "sterbeort": "death_place",
    "verheiratet mit": "spouse",
    "bemerkung": "notes",
}

# Canonical person fields marked as required.
PERSON_REQUIRED_FIELDS = {"last_name"}
# --- Century rule (archive 1873–1957) ---
# Two-digit years 00..57 (this value inclusive) are read as 1900 + yy.
TWO_DIGIT_19XX_MAX = 57
# Two-digit years 73..99 (from this value upward) are read as 1800 + yy.
# The gap in between, 58..72, is ambiguous and resolves to UNKNOWN.
TWO_DIGIT_18XX_MIN = 73
# --- Season name -> month that stands in for it (the day is taken as 1) ---
SEASON_MONTHS = {
    # Spring, in both umlaut and ASCII spellings.
    "frühling": 4,
    "fruehling": 4,
    "frühjahr": 4,
    "fruehjahr": 4,
    # Remaining seasons.
    "sommer": 7,
    "herbst": 10,
    "winter": 1,
}
# --- Feasts with a fixed calendar date: name -> (month, day) ---
FIXED_FEASTS = {
    "neujahr": (1, 1),
    # Christmas Eve.
    "heiligabend": (12, 24),
    "heiliger abend": (12, 24),
    # Christmas Day.
    "weihnachten": (12, 25),
    "weihnacht": (12, 25),
    "1. weihnachtstag": (12, 25),
    # New Year's Eve, both spellings.
    "silvester": (12, 31),
    "sylvester": (12, 31),
}
# --- Feasts that move with Easter: name -> offset in days from Easter Sunday ---
MOVABLE_FEASTS = {
    # Before Easter Sunday.
    "karfreitag": -2,
    # Easter itself.
    "ostern": 0,
    "ostersonntag": 0,
    "ostermontag": 1,
    # Ascension.
    "himmelfahrt": 39,
    "christi himmelfahrt": 39,
    # Pentecost.
    "pfingsten": 49,
    "pfingstsonntag": 49,
    "pfingstmontag": 50,
    # Corpus Christi.
    "fronleichnam": 60,
}
# --- Month name -> month number ---
# Covers German and English spellings, full names as well as abbreviations.
MONTHS = {
    "januar": 1,    "jan": 1,   "january": 1,
    "februar": 2,   "feb": 2,   "febr": 2,  "february": 2,
    "märz": 3,      "maerz": 3, "mär": 3,   "mar": 3,       "march": 3,
    "april": 4,     "apr": 4,
    "mai": 5,       "may": 5,
    "juni": 6,      "jun": 6,   "june": 6,
    "juli": 7,      "jul": 7,   "july": 7,
    "august": 8,    "aug": 8,
    "september": 9, "sep": 9,   "sept": 9,
    "oktober": 10,  "okt": 10,  "oct": 10,  "october": 10,
    "november": 11, "nov": 11,
    "dezember": 12, "dez": 12,  "dec": 12,  "december": 12,
}
# Lowercase Roman numeral -> month number, i through xii.
ROMAN_MONTHS = {
    "i": 1,
    "ii": 2,
    "iii": 3,
    "iv": 4,
    "v": 5,
    "vi": 6,
    "vii": 7,
    "viii": 8,
    "ix": 9,
    "x": 10,
    "xi": 11,
    "xii": 12,
}
# --- Person matching ---
# Family names known in advance. This is a list, so the order below is kept
# exactly as written.
KNOWN_LAST_NAMES = [
    "von der Heide",
    "von Massenbach",
    "von Geldern",
    "von Gelden",
    "von Staa",
    "de Gruyter",
    "Dieckmann",
    "Gruber",
    "Müller",
    "Wolff",
    "Cram",
]

# Threshold on the difflib similarity ratio. It only drives suggestions;
# a fuzzy match is never applied automatically.
FUZZY_SUGGEST_THRESHOLD = 0.82
# --- Name classification (review of unresolved names) ---
# Kinship/relationship words: the sender or receiver is referred to by
# relation rather than by a proper name.
RELATIONAL_TERMS = {
    "tante",
    "onkel",
    "mutter",
    "vater",
    "oma",
    "opa",
    "großmutter",
    "grossmutter",
    "großvater",
    "grossvater",
    "schwester",
    "bruder",
    "cousin",
    "cousine",
    "kusine",
    "neffe",
    "nichte",
    "tochter",
    "sohn",
    "schwager",
    "schwägerin",
    "schwiegermutter",
    "schwiegervater",
    "enkel",
    "enkelin",
    "vetter",
    "base",
    "witwe",
    "witwer",
}
# Words denoting a group rather than one individual. They are compared with
# alpha-only word tokens (e.g. "Fam.Cram" -> ["fam", "cram"] hits "fam") and
# are NOT meant to be used as substring or prefix matches.
COLLECTIVE_TERMS = {
    "familie",
    "fam",
    "kinder",
    "eltern",
    "geschwister",
    "großeltern",
    "grosseltern",
    "alle",
    "diverse",
    "div",
    "gebrüder",
    "gebr",
}
# Strings signalling an unknown or illegible name. A literal "?" is not listed
# here because the code deals with it separately.
# Every entry is long enough to be matched safely as a SUBSTRING. Do NOT add
# short tokens such as "nn": that one appears inside real names
# (Hanni, Johanna, Anna).
UNKNOWN_NAME_MARKERS = {
    "unbekannt",
    "unbek",
    "unleserlich",
    "unklar",
    "unsicher",
}
# Length limit in characters: a value in a name column that is longer than
# this counts as prose/description instead of a name.
PROSE_MAX_LEN = 40
# Frequent given names that are absent from the family register but can show
# up as the two halves of a given-name pair (e.g. "Ella Anita"). Their only
# use is detecting AMBIGUOUS_PAIR; add more as review turns them up.
EXTRA_GIVEN_NAMES = {
    "ella",
    "anita",
    "kurt",
    "georg",
    "hanni",
    "mieze",
    "ellen",
    "leni",
    "klara",
    "margret",
    "gustava",
    "emmy",
    "minna",
    "sophie",
    "helga",
    "raymonde",
    "augusta",
}