diff --git a/tools/import-normalizer/config.py b/tools/import-normalizer/config.py
index 180fe06c..d789a2af 100644
--- a/tools/import-normalizer/config.py
+++ b/tools/import-normalizer/config.py
@@ -98,3 +98,30 @@ KNOWN_LAST_NAMES = [
     "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram",
 ]
 FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied
+
+# --- Name classification (unresolved-name review) ---
+# Relational reference terms — a sender/receiver named by relation, not a proper name.
+RELATIONAL_TERMS = {  # entries are all lowercase; presumably matched against lowercased input — TODO confirm in the classifier
+    "tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",  # both ß and ss spellings are listed
+    "großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
+    "neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
+    "schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",  # NOTE(review): "base" is presumably the dated German kinship term, not English "base" — confirm
+}
+# Collective/group terms — not a single person. Matched against alpha-only word tokens
+# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
+COLLECTIVE_TERMS = {  # abbreviations are stored without the trailing dot ("fam", "div", "gebr")
+    "familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
+    "grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
+}
+# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
+# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
+# (it occurs inside real names: Hanni, Johanna, Anna).
+UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}  # NOTE: "unbek" is a prefix of "unbekannt", so a substring match on it already covers the longer form
+# A name-column value longer than this (chars) is treated as prose/description, not a name.
+PROSE_MAX_LEN = 40  # as worded above, a value of exactly 40 chars still counts as a name; consumer code is not shown here — TODO confirm it compares with ">"
+# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
+# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
+EXTRA_GIVEN_NAMES = {  # entries are all lowercase; presumably matched against lowercased input — TODO confirm in the classifier
+    "ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
+    "margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
+}
diff --git a/tools/import-normalizer/tests/test_config.py b/tools/import-normalizer/tests/test_config.py
index 6384df41..a88917d9 100644
--- a/tools/import-normalizer/tests/test_config.py
+++ b/tools/import-normalizer/tests/test_config.py
@@ -11,3 +11,10 @@ def test_header_maps_cover_required_fields():
 def test_feast_tables_present():
     assert config.MOVABLE_FEASTS["pfingsten"] == 49
     assert config.SEASON_MONTHS["herbst"] == 10
+
+def test_name_classification_tables():  # smoke test: checks one representative entry per name-classification table
+    assert "tante" in config.RELATIONAL_TERMS
+    assert "familie" in config.COLLECTIVE_TERMS
+    assert "unbekannt" in config.UNKNOWN_NAME_MARKERS
+    assert config.PROSE_MAX_LEN >= 30  # asserts a lower bound only, not the exact configured value
+    assert "anita" in config.EXTRA_GIVEN_NAMES