Files
familienarchiv/tools/import-normalizer/config.py
Marcel 94a40237f4 feat(normalizer): generate structured tags from Schlagwort + Inhalt fields
Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>

Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").

COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.

Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.

Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 19:47:36 +02:00

136 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tunables for the import normalizer. No logic here — only data tables."""
from pathlib import Path
# --- Paths ---
# Directory that holds this module; the tool-local folders below live inside it.
BASE_DIR = Path(__file__).resolve().parent
# Two directory levels above BASE_DIR.
REPO_ROOT = BASE_DIR.parent.parent

# Input workbooks and their sheet names.
IMPORT_DIR = REPO_ROOT.joinpath("import")
DOCUMENT_WORKBOOK = IMPORT_DIR.joinpath(
    "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx"
)
DOCUMENT_SHEET = "Familienarchiv"
PERSON_WORKBOOK = IMPORT_DIR.joinpath("Personendatei 2.xlsx")
PERSON_SHEET = "Tabelle1"

# Tool-local directories, all direct children of BASE_DIR.
OUT_DIR = BASE_DIR.joinpath("out")
REVIEW_DIR = BASE_DIR.joinpath("review")
OVERRIDES_DIR = BASE_DIR.joinpath("overrides")
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
# Each pair is (document workbook header text, canonical field name).
DOCUMENT_HEADER_MAP = dict(
    [
        ("index", "index"),
        ("datei", "file"),
        ("box", "box"),
        ("mappe", "folder"),
        ("briefeschreiberin", "sender"),
        ("empfängerin", "receivers"),
        ("datum des briefes", "date"),
        ("ort", "location"),
        ("schlagwort", "tags"),
        ("inhalt", "summary"),
    ]
)
# Canonical document fields marked as required.
DOCUMENT_REQUIRED_FIELDS = {"index"}
# Person workbook: each pair is (header text, canonical field name).
PERSON_HEADER_MAP = dict(
    [
        ("generation", "generation"),
        ("familienname", "last_name"),
        ("vorname", "first_name"),
        ("geb als", "maiden_name"),
        ("geburtsdatum", "birth_date"),
        ("geburtsort", "birth_place"),
        ("todesdatum", "death_date"),
        ("sterbeort", "death_place"),
        ("verheiratet mit", "spouse"),
        ("bemerkung", "notes"),
    ]
)
# Canonical person fields marked as required.
PERSON_REQUIRED_FIELDS = {"last_name"}
# --- Century rule (archive 1873-1957) ---
# Two-digit years 00..57 -> 1900+yy.
TWO_DIGIT_19XX_MAX = 57
# Two-digit years 73..99 -> 1800+yy; 58..72 are ambiguous -> UNKNOWN.
TWO_DIGIT_18XX_MIN = 73
# --- Seasons -> representative month (day = 1) ---
SEASON_MONTHS = {
    # All spring spellings (with and without umlaut) share one month.
    **dict.fromkeys(("frühling", "fruehling", "frühjahr", "fruehjahr"), 4),
    "sommer": 7,
    "herbst": 10,
    "winter": 1,
}
# --- Fixed feasts -> (month, day) ---
# Alternative spellings of the same feast are grouped onto one date.
FIXED_FEASTS = {
    "neujahr": (1, 1),
    **dict.fromkeys(("heiligabend", "heiliger abend"), (12, 24)),
    **dict.fromkeys(("weihnachten", "weihnacht", "1. weihnachtstag"), (12, 25)),
    **dict.fromkeys(("silvester", "sylvester"), (12, 31)),
}
# --- Movable feasts -> day offset from Easter Sunday ---
# Each group is (offset in days, feast names sharing that offset).
MOVABLE_FEASTS = {
    feast: offset
    for offset, feasts in (
        (-2, ("karfreitag",)),
        (0, ("ostern", "ostersonntag")),
        (1, ("ostermontag",)),
        (39, ("himmelfahrt", "christi himmelfahrt")),
        (49, ("pfingsten", "pfingstsonntag")),
        (50, ("pfingstmontag",)),
        (60, ("fronleichnam",)),
    )
    for feast in feasts
}
# --- Month names -> number (German + English, full + abbreviations) ---
# Each inner tuple lists the spellings of one month; its 1-based position
# within its table is the month number.
MONTHS = {
    spelling: number
    for table in (
        (
            ("januar", "jan", "january"),
            ("februar", "feb", "febr", "february"),
            ("märz", "maerz", "mär", "mar", "march"),
            ("april", "apr"),
            ("mai", "may"),
            ("juni", "jun", "june"),
            ("juli", "jul", "july"),
            ("august", "aug"),
            ("september", "sep", "sept"),
            ("oktober", "okt", "oct", "october"),
            ("november", "nov"),
            ("dezember", "dez", "dec", "december"),
        ),
        # Spanish (Mexican-branch correspondence)
        (
            ("enero",),
            ("febrero",),
            ("marzo",),
            ("abril",),
            ("mayo",),
            ("junio",),
            ("julio",),
            ("agosto",),
            ("septiembre", "setiembre"),
            ("octubre",),
            ("noviembre",),
            ("diciembre",),
        ),
    )
    for number, spellings in enumerate(table, start=1)
    for spelling in spellings
}
# Lowercase Roman numerals i..xii -> month number 1..12.
ROMAN_MONTHS = {
    numeral: number
    for number, numeral in enumerate(
        ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii"),
        start=1,
    )
}
# --- Person matching ---
KNOWN_LAST_NAMES = [
    # Multi-word surnames ("von ...", "de ...").
    "von der Heide",
    "von Massenbach",
    "von Geldern",
    "von Gelden",  # NOTE(review): presumably a spelling variant of "von Geldern" — confirm
    "von Staa",
    "de Gruyter",
    # Single-word surnames.
    "Dieckmann",
    "Gruber",
    "Müller",
    "Wolff",
    "Cram",
]
# difflib ratio; suggestions only, never auto-applied
FUZZY_SUGGEST_THRESHOLD = 0.82
# --- Name classification (unresolved-name review) ---
# Relational reference terms — a sender/receiver named by relation, not a proper name.
RELATIONAL_TERMS = {
    # Parents and grandparents (ß and ss spellings both listed).
    "mutter", "vater",
    "oma", "opa",
    "großmutter", "grossmutter",
    "großvater", "grossvater",
    # Siblings, children, grandchildren.
    "schwester", "bruder",
    "tochter", "sohn",
    "enkel", "enkelin",
    # Aunts, uncles, cousins, nieces, nephews.
    "tante", "onkel",
    "cousin", "cousine", "kusine", "vetter", "base",
    "neffe", "nichte",
    # In-laws.
    "schwager", "schwägerin",
    "schwiegermutter", "schwiegervater",
    # Widowed.
    "witwe", "witwer",
}
# Collective/group terms — not a single person. Matched against alpha-only word tokens
# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
COLLECTIVE_TERMS = {
    "familie", "fam",
    "kinder", "eltern", "geschwister",
    "großeltern", "grosseltern",
    "alle", "diverse", "div",
    "gebrüder", "gebr",
} | {
    # Plural/group relational terms — added for tag generation heuristic
    "söhne", "töchter", "brüder", "schwestern",
    "schwiegereltern", "enkelkinder",
    "vettern", "kusinen", "cousinen",
    "nichten", "neffen", "tanten",
    "freunde", "bekannte", "verwandten",
    "geschw", "jungens",
}
# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
# Every marker is long enough to be safe as a SUBSTRING match — do NOT add short tokens
# like "nn" (it occurs inside real names: Hanni, Johanna, Anna).
UNKNOWN_NAME_MARKERS = {
    "unbekannt",
    "unbek",
    "unleserlich",
    "unklar",
    "unsicher",
}

# Length limit in characters: a longer name-column value is treated as
# prose/description, not a name.
PROSE_MAX_LEN = 40
# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
# Kept alphabetical so additions are easy to spot.
EXTRA_GIVEN_NAMES = {
    "anita",
    "augusta",
    "ella",
    "ellen",
    "emmy",
    "georg",
    "gustava",
    "hanni",
    "helga",
    "klara",
    "kurt",
    "leni",
    "margret",
    "mieze",
    "minna",
    "raymonde",
    "sophie",
}