All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m31s
CI / OCR Service Tests (pull_request) Successful in 22s
CI / Backend Unit Tests (pull_request) Successful in 3m42s
CI / fail2ban Regex (pull_request) Successful in 45s
CI / Semgrep Security Scan (pull_request) Successful in 20s
CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s
Add Spanish month names (Mexican-branch letters) to config.MONTHS and let
the month-first matcher accept a hyphen (not just a dot) before the year, so
"Mayo 18-1929"/"Junio 7-904" parse without manual overrides. Also bound
4-digit years to 1700-2100 so gross typos ("23-9003") stay in review instead
of producing a bogus year. Cuts unknown-date rate 9.2% -> 7.9%.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
132 lines
5.0 KiB
Python
132 lines
5.0 KiB
Python
"""Tunables for the import normalizer. No logic here — only data tables."""
|
||
from pathlib import Path
|
||
|
||
# --- Paths ---
# Everything is anchored on this module's own location, so none of these
# values depend on the current working directory.
BASE_DIR = Path(__file__).resolve().parent  # directory containing this file
REPO_ROOT = BASE_DIR.parent.parent  # two levels above BASE_DIR
IMPORT_DIR = REPO_ROOT / "import"

# Source workbooks (.xlsx) inside IMPORT_DIR, each paired with a sheet name.
DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx"
DOCUMENT_SHEET = "Familienarchiv"
PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx"
PERSON_SHEET = "Tabelle1"

# Working directories located next to this module.
# NOTE(review): presumably normalizer output, review reports and manual
# overrides respectively — confirm against the code that consumes them.
OUT_DIR = BASE_DIR / "out"
REVIEW_DIR = BASE_DIR / "review"
OVERRIDES_DIR = BASE_DIR / "overrides"
|
||
|
||
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
# Keys are the column headers after lowercasing and whitespace collapsing;
# values are the canonical field names used in place of the raw header.
DOCUMENT_HEADER_MAP = {
    "index": "index",
    "datei": "file",
    "box": "box",
    "mappe": "folder",
    "briefeschreiberin": "sender",
    "empfängerin": "receivers",
    "datum des briefes": "date",
    "ort": "location",
    "schlagwort": "tags",
    "inhalt": "summary",
}
# NOTE(review): presumably the canonical fields that must be mapped for the
# document sheet to be accepted — confirm in the loader.
DOCUMENT_REQUIRED_FIELDS = {"index"}
|
||
|
||
# Person-sheet counterpart of the document header map: header text
# (lowercased, whitespace-collapsed) -> canonical field name.
PERSON_HEADER_MAP = {
    "generation": "generation",
    "familienname": "last_name",
    "vorname": "first_name",
    "geb als": "maiden_name",
    "geburtsdatum": "birth_date",
    "geburtsort": "birth_place",
    "todesdatum": "death_date",
    "sterbeort": "death_place",
    "verheiratet mit": "spouse",
    "bemerkung": "notes",
}
# NOTE(review): presumably the canonical fields that must be mapped for the
# person sheet to be accepted — confirm in the loader.
PERSON_REQUIRED_FIELDS = {"last_name"}
|
||
|
||
# --- Century rule (archive 1873–1957) ---
# Two-digit years are expanded according to the archive's date range; the gap
# between the two bounds is deliberately left unresolved.
TWO_DIGIT_19XX_MAX = 57  # 00..57 -> 1900+yy
TWO_DIGIT_18XX_MIN = 73  # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN
|
||
|
||
# --- Seasons -> representative month (day = 1) ---
# Lowercase German season names, including "ue" transliterations of "ü".
SEASON_MONTHS = {
    "frühling": 4,
    "fruehling": 4,
    "frühjahr": 4,
    "fruehjahr": 4,
    "sommer": 7,
    "herbst": 10,
    "winter": 1,
}
|
||
|
||
# --- Fixed feasts -> (month, day) ---
# Lowercase feast names (with spelling variants) that fall on the same
# calendar date every year.
FIXED_FEASTS = {
    "neujahr": (1, 1),
    "heiligabend": (12, 24),
    "heiliger abend": (12, 24),
    "weihnachten": (12, 25),
    "weihnacht": (12, 25),
    "1. weihnachtstag": (12, 25),
    "silvester": (12, 31),
    "sylvester": (12, 31),
}
|
||
|
||
# --- Movable feasts -> day offset from Easter Sunday ---
# Negative offsets fall before Easter Sunday, positive ones after it; the
# year-specific Easter date itself has to be computed by the caller.
MOVABLE_FEASTS = {
    "karfreitag": -2,
    "ostern": 0,
    "ostersonntag": 0,
    "ostermontag": 1,
    "himmelfahrt": 39,
    "christi himmelfahrt": 39,
    "pfingsten": 49,
    "pfingstsonntag": 49,
    "pfingstmontag": 50,
    "fronleichnam": 60,
}
|
||
|
||
# --- Month names -> number ---
# German + English (full names and abbreviations), plus Spanish full names.
# All keys are lowercase.
MONTHS = {
    "januar": 1, "jan": 1, "january": 1,
    "februar": 2, "feb": 2, "febr": 2, "february": 2,
    "märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3,
    "april": 4, "apr": 4,
    "mai": 5, "may": 5,
    "juni": 6, "jun": 6, "june": 6,
    "juli": 7, "jul": 7, "july": 7,
    "august": 8, "aug": 8,
    "september": 9, "sep": 9, "sept": 9,
    "oktober": 10, "okt": 10, "oct": 10, "october": 10,
    "november": 11, "nov": 11,
    "dezember": 12, "dez": 12, "dec": 12, "december": 12,
    # Spanish (Mexican-branch correspondence)
    "enero": 1,
    "febrero": 2,
    "marzo": 3,
    "abril": 4,
    "mayo": 5,
    "junio": 6,
    "julio": 7,
    "agosto": 8,
    "septiembre": 9,
    "setiembre": 9,
    "octubre": 10,
    "noviembre": 11,
    "diciembre": 12,
}
|
||
|
||
# Lowercase Roman numerals i..xii -> month number 1..12.
ROMAN_MONTHS = {
    "i": 1,
    "ii": 2,
    "iii": 3,
    "iv": 4,
    "v": 5,
    "vi": 6,
    "vii": 7,
    "viii": 8,
    "ix": 9,
    "x": 10,
    "xi": 11,
    "xii": 12,
}
|
||
|
||
# --- Person matching ---
# Family names known to the archive, including multi-word names with a
# particle ("von der ...", "de ...").
# NOTE(review): order is kept exactly as authored in case the matcher relies
# on it (e.g. multi-word names before single-word ones) — confirm before sorting.
KNOWN_LAST_NAMES = [
    "von der Heide",
    "von Massenbach",
    "von Geldern",
    "von Gelden",
    "von Staa",
    "de Gruyter",
    "Dieckmann",
    "Gruber",
    "Müller",
    "Wolff",
    "Cram",
]
FUZZY_SUGGEST_THRESHOLD = 0.82  # difflib ratio; suggestions only, never auto-applied
|
||
|
||
# --- Name classification (unresolved-name review) ---
# Relational reference terms — a sender/receiver named by relation, not a proper name.
# All lowercase; "ß" spellings are paired with their "ss" variants.
RELATIONAL_TERMS = {
    "tante",
    "onkel",
    "mutter",
    "vater",
    "oma",
    "opa",
    "großmutter",
    "grossmutter",
    "großvater",
    "grossvater",
    "schwester",
    "bruder",
    "cousin",
    "cousine",
    "kusine",
    "neffe",
    "nichte",
    "tochter",
    "sohn",
    "schwager",
    "schwägerin",
    "schwiegermutter",
    "schwiegervater",
    "enkel",
    "enkelin",
    "vetter",
    "base",
    "witwe",
    "witwer",
}
|
||
# Collective/group terms — not a single person. Matched against alpha-only word tokens
# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
COLLECTIVE_TERMS = {
    "familie",
    "fam",
    "kinder",
    "eltern",
    "geschwister",
    "großeltern",
    "grosseltern",
    "alle",
    "diverse",
    "div",
    "gebrüder",
    "gebr",
}
|
||
# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
# (it occurs inside real names: Hanni, Johanna, Anna).
UNKNOWN_NAME_MARKERS = {
    "unbekannt",
    "unbek",
    "unleserlich",
    "unklar",
    "unsicher",
}
|
||
# A name-column value longer than this (chars) is treated as prose/description, not a name.
PROSE_MAX_LEN = 40
|
||
# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
# All entries are lowercase.
EXTRA_GIVEN_NAMES = {
    "ella",
    "anita",
    "kurt",
    "georg",
    "hanni",
    "mieze",
    "ellen",
    "leni",
    "klara",
    "margret",
    "gustava",
    "emmy",
    "minna",
    "sophie",
    "helga",
    "raymonde",
    "augusta",
}
|