feat(normalizer): classify_name + NameClass

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 15:47:40 +02:00
parent a7c45b3a0e
commit 6478cc58ae
2 changed files with 86 additions and 0 deletions
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -4,6 +4,7 @@ import re
 import unicodedata
 from collections import Counter
 from dataclasses import dataclass, field
+from enum import StrEnum

 import config
 import dates
@@ -148,6 +149,54 @@ def _norm(name: str) -> str:
    return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip()


+class NameClass(StrEnum):
+    """Category that ``classify_name`` assigns to a sender/receiver string."""
+
+    # Fallback when no other category matched; also returned for blank input.
+    RESOLVABLE = "resolvable"
+    # Contains "?" or one of config.UNKNOWN_NAME_MARKERS.
+    UNKNOWN = "unknown"
+    # Exactly one whitespace-separated token.
+    SINGLE_TOKEN = "single_token"
+    # Contains a word listed in config.RELATIONAL_TERMS.
+    RELATIONAL = "relational"
+    # Contains a word listed in config.COLLECTIVE_TERMS.
+    COLLECTIVE = "collective"
+    # Longer than config.PROSE_MAX_LEN, contains a digit or quote mark, or has more
+    # than three tokens.
+    PROSE = "prose"
+    # Exactly two tokens, both of which are known given names.
+    AMBIGUOUS_PAIR = "ambiguous_pair"
+
+
+# ASCII double and single quotes plus the typographic quotes U+201C, U+201D, U+201E,
+# U+201A, U+2018 and U+2019; classify_name treats any of them as a PROSE signal.
+_QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019"
+
+
+def classify_name(raw: str, given_names: set[str]) -> NameClass:
+    """Return the NameClass describing why a post-split sender/receiver string may not resolve.
+
+    Blank input is reported as RESOLVABLE. Otherwise the checks run in this order and
+    the first hit wins: UNKNOWN, PROSE, COLLECTIVE, RELATIONAL, SINGLE_TOKEN,
+    AMBIGUOUS_PAIR. Anything that survives all of them is RESOLVABLE.
+
+    Each token is passed through ``_norm`` before it is looked up in ``given_names``.
+    """
+    text = raw.strip()
+    if not text:
+        return NameClass.RESOLVABLE
+
+    lowered = text.lower()
+    if "?" in text:
+        return NameClass.UNKNOWN
+    for marker in config.UNKNOWN_NAME_MARKERS:
+        if marker in lowered:
+            return NameClass.UNKNOWN
+
+    parts = text.split()
+    if len(text) > config.PROSE_MAX_LEN:
+        return NameClass.PROSE
+    if any(ch.isdigit() for ch in text):
+        return NameClass.PROSE
+    if any(quote in text for quote in _QUOTE_CHARS):
+        return NameClass.PROSE
+    if len(parts) > 3:
+        return NameClass.PROSE
+
+    # Split on everything that is not a (German) letter so "Fam.Cram" yields "fam" and
+    # "cram"; terms are then compared as whole words, which keeps e.g. "Allerton" from
+    # matching by substring or prefix.
+    words = re.findall(r"[a-zäöüß]+", lowered)
+    vocabularies = (
+        (config.COLLECTIVE_TERMS, NameClass.COLLECTIVE),
+        (config.RELATIONAL_TERMS, NameClass.RELATIONAL),
+    )
+    for vocabulary, verdict in vocabularies:
+        if any(word in vocabulary for word in words):
+            return verdict
+
+    if len(parts) == 1:
+        return NameClass.SINGLE_TOKEN
+    both_given = len(parts) == 2 and all(_norm(part) in given_names for part in parts)
+    return NameClass.AMBIGUOUS_PAIR if both_given else NameClass.RESOLVABLE
+
+
+# Known limitation: a name of four or more tokens with no digits or quotes (e.g.
+# "Anna von der Heide") is classified as PROSE. Such multi-particle names are rare here
+# and usually resolve via the register; if they surface in review, treat them as lower
+# priority than the real prose entries.
+
+
 class AliasIndex:
    def __init__(self, people: list[Person]):
        self._by_alias: dict[str, str] = {}
--- a/tools/import-normalizer/tests/test_persons.py
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -1,5 +1,6 @@
 import config
 import persons
+from persons import NameClass

 def test_slugify():
    assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie"
@@ -82,3 +83,39 @@ def test_alias_index_first_name_only_when_unambiguous():
    assert idx.resolve("Clara") == people[0].person_id   # unique first name resolves
    assert idx.resolve("Walter") is None                 # ambiguous first name does NOT resolve
    assert idx.display(people[0].person_id) == "Clara Cram"
+
+
+# Lower-case given names passed as ``given_names`` to classify_name in the tests below.
+GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
+
+def test_classify_unknown():
+    """Inputs that must be classified as UNKNOWN."""
+    for raw in ("?", "A. Kredell?", "unbekannt"):
+        assert persons.classify_name(raw, GIVEN) is NameClass.UNKNOWN
+
+def test_classify_prose():
+    """Inputs that must be classified as PROSE."""
+    prose_inputs = (
+        "Adressenliste v Clara Cram zur Kondolenz",  # six tokens
+        "Clara de Gruyter(*1871)",  # digit
+        '"Cramiade" Gedicht',  # quote
+    )
+    for raw in prose_inputs:
+        assert persons.classify_name(raw, GIVEN) is NameClass.PROSE
+
+def test_classify_collective():
+    """Inputs that must be classified as COLLECTIVE."""
+    for raw in ("Familie", "Fam.Cram", "Eltern Cram", "seine Kinder"):
+        assert persons.classify_name(raw, GIVEN) is NameClass.COLLECTIVE
+
+def test_classify_relational():
+    """Inputs that must be classified as RELATIONAL."""
+    for raw in ("Cousine Emmy Haniel", "Schwester Hanni"):
+        assert persons.classify_name(raw, GIVEN) is NameClass.RELATIONAL
+
+def test_classify_single_token():
+    """One whitespace-separated token, including dotted initials, is SINGLE_TOKEN."""
+    for raw in ("Agnes", "A.B."):
+        assert persons.classify_name(raw, GIVEN) is NameClass.SINGLE_TOKEN
+
+def test_classify_ambiguous_pair():
+    """Two tokens that are both in GIVEN form an AMBIGUOUS_PAIR."""
+    for raw in ("Ella Anita", "Kurt Georg"):
+        assert persons.classify_name(raw, GIVEN) is NameClass.AMBIGUOUS_PAIR
+
+def test_classify_resolvable_single_person():
+    """A first name followed by a surname that is not in GIVEN stays RESOLVABLE."""
+    # Neither token pair consists solely of given names, so each input is read as one
+    # real person rather than an ambiguous pair.
+    for raw in ("Mieze Schefold", "Adolf Butenandt"):
+        assert persons.classify_name(raw, GIVEN) is NameClass.RESOLVABLE