feat(normalizer): config tables for name classification

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 15:43:31 +02:00
parent 5ff0c25e10
commit a7c45b3a0e
2 changed files with 34 additions and 0 deletions

View File

@@ -98,3 +98,30 @@ KNOWN_LAST_NAMES = [
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram", "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram",
] ]
FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied
# --- Name classification (unresolved-name review) ---
# Relational reference terms — a sender/receiver named by relation, not a proper name.
RELATIONAL_TERMS = {
"tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",
"großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
"neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
"schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",
}
# Collective/group terms — not a single person. Matched against alpha-only word tokens
# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
COLLECTIVE_TERMS = {
"familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
"grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
}
# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
# (it occurs inside real names: Hanni, Johanna, Anna).
UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}
# A name-column value longer than this (chars) is treated as prose/description, not a name.
PROSE_MAX_LEN = 40
# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
EXTRA_GIVEN_NAMES = {
"ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
"margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
}

View File

@@ -11,3 +11,10 @@ def test_header_maps_cover_required_fields():
def test_feast_tables_present(): def test_feast_tables_present():
assert config.MOVABLE_FEASTS["pfingsten"] == 49 assert config.MOVABLE_FEASTS["pfingsten"] == 49
assert config.SEASON_MONTHS["herbst"] == 10 assert config.SEASON_MONTHS["herbst"] == 10
def test_name_classification_tables():
assert "tante" in config.RELATIONAL_TERMS
assert "familie" in config.COLLECTIVE_TERMS
assert "unbekannt" in config.UNKNOWN_NAME_MARKERS
assert config.PROSE_MAX_LEN >= 30
assert "anita" in config.EXTRA_GIVEN_NAMES