feat(normalizer): config tables for name classification
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -98,3 +98,30 @@ KNOWN_LAST_NAMES = [
|
||||
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram",
|
||||
]
|
||||
FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied
|
||||
|
||||
# --- Name classification (unresolved-name review) ---
|
||||
# Relational reference terms — a sender/receiver named by relation, not a proper name.
|
||||
RELATIONAL_TERMS = {
|
||||
"tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",
|
||||
"großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
|
||||
"neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
|
||||
"schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",
|
||||
}
|
||||
# Collective/group terms — not a single person. Matched against alpha-only word tokens
|
||||
# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
|
||||
COLLECTIVE_TERMS = {
|
||||
"familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
|
||||
"grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
|
||||
}
|
||||
# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
|
||||
# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
|
||||
# (it occurs inside real names: Hanni, Johanna, Anna).
|
||||
UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}
|
||||
# A name-column value longer than this (chars) is treated as prose/description, not a name.
|
||||
PROSE_MAX_LEN = 40
|
||||
# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
|
||||
# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
|
||||
EXTRA_GIVEN_NAMES = {
|
||||
"ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
|
||||
"margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
|
||||
}
|
||||
|
||||
@@ -11,3 +11,10 @@ def test_header_maps_cover_required_fields():
|
||||
def test_feast_tables_present():
|
||||
assert config.MOVABLE_FEASTS["pfingsten"] == 49
|
||||
assert config.SEASON_MONTHS["herbst"] == 10
|
||||
|
||||
def test_name_classification_tables():
|
||||
assert "tante" in config.RELATIONAL_TERMS
|
||||
assert "familie" in config.COLLECTIVE_TERMS
|
||||
assert "unbekannt" in config.UNKNOWN_NAME_MARKERS
|
||||
assert config.PROSE_MAX_LEN >= 30
|
||||
assert "anita" in config.EXTRA_GIVEN_NAMES
|
||||
|
||||
Reference in New Issue
Block a user