From 6478cc58ae3ca4ab998dee5b537e13b0eb2d8315 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 15:47:40 +0200
Subject: [PATCH] feat(normalizer): classify_name + NameClass

Co-Authored-By: Claude Opus 4.7
---
 tools/import-normalizer/persons.py            | 53 ++++++++++++++++++++
 tools/import-normalizer/tests/test_persons.py | 37 ++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py
index 986f58b1..b9106245 100644
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -4,6 +4,7 @@ import re
 import unicodedata
 from collections import Counter
 from dataclasses import dataclass, field
+from enum import StrEnum
 
 import config
 import dates
@@ -148,6 +149,58 @@ def _norm(name: str) -> str:
     return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip()
 
 
+class NameClass(StrEnum):
+    RESOLVABLE = "resolvable"
+    UNKNOWN = "unknown"
+    SINGLE_TOKEN = "single_token"
+    RELATIONAL = "relational"
+    COLLECTIVE = "collective"
+    PROSE = "prose"
+    AMBIGUOUS_PAIR = "ambiguous_pair"
+
+
+_QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019"
+
+
+def classify_name(raw: str, given_names: set[str]) -> NameClass:
+    """Classify a (post-split) sender/receiver string by why it may be unresolvable.
+
+    ``given_names`` must hold ``_norm``-normalised given names (lowercase, accents
+    stripped); both tokens of a two-token string are looked up in it.
+
+    Empty or whitespace-only input returns RESOLVABLE before any rule is applied.
+    Otherwise precedence is (first match wins): UNKNOWN -> PROSE -> COLLECTIVE ->
+    RELATIONAL -> SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE.
+    """
+    s = raw.strip()
+    if not s:
+        return NameClass.RESOLVABLE
+    low = s.lower()
+    tokens = s.split()
+    # alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms
+    # are matched as whole words (no substring/prefix false positives like "Allerton").
+    alpha_words = re.findall(r"[a-zäöüß]+", low)
+    if "?" in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS):
+        return NameClass.UNKNOWN
+    if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s)
+            or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3):
+        return NameClass.PROSE
+    if any(w in config.COLLECTIVE_TERMS for w in alpha_words):
+        return NameClass.COLLECTIVE
+    if any(w in config.RELATIONAL_TERMS for w in alpha_words):
+        return NameClass.RELATIONAL
+    if len(tokens) == 1:
+        return NameClass.SINGLE_TOKEN
+    if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens):
+        return NameClass.AMBIGUOUS_PAIR
+    return NameClass.RESOLVABLE
+
+
+# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is
+# classified PROSE. Such multi-particle names are rare here and usually resolve via the
+# register; if they surface in review, lower-priority than the real prose entries.
+
+
 class AliasIndex:
     def __init__(self, people: list[Person]):
         self._by_alias: dict[str, str] = {}
diff --git a/tools/import-normalizer/tests/test_persons.py b/tools/import-normalizer/tests/test_persons.py
index e680f0d4..53ed62df 100644
--- a/tools/import-normalizer/tests/test_persons.py
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -1,5 +1,6 @@
 import config
 import persons
+from persons import NameClass
 
 def test_slugify():
     assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie"
@@ -82,3 +83,39 @@ def test_alias_index_first_name_only_when_unambiguous():
     assert idx.resolve("Clara") == people[0].person_id  # unique first name resolves
     assert idx.resolve("Walter") is None  # ambiguous first name does NOT resolve
     assert idx.display(people[0].person_id) == "Clara Cram"
+
+
+GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
+
+def test_classify_unknown():
+    assert persons.classify_name("?", GIVEN) is NameClass.UNKNOWN
+    assert persons.classify_name("A. Kredell?", GIVEN) is NameClass.UNKNOWN
+    assert persons.classify_name("unbekannt", GIVEN) is NameClass.UNKNOWN
+
+def test_classify_prose():
+    assert persons.classify_name("Adressenliste v Clara Cram zur Kondolenz", GIVEN) is NameClass.PROSE
+    assert persons.classify_name("Clara de Gruyter(*1871)", GIVEN) is NameClass.PROSE  # digit
+    assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE  # quote
+
+def test_classify_collective():
+    assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE
+    assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE
+    assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE
+    assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE
+
+def test_classify_relational():
+    assert persons.classify_name("Cousine Emmy Haniel", GIVEN) is NameClass.RELATIONAL
+    assert persons.classify_name("Schwester Hanni", GIVEN) is NameClass.RELATIONAL
+
+def test_classify_single_token():
+    assert persons.classify_name("Agnes", GIVEN) is NameClass.SINGLE_TOKEN
+    assert persons.classify_name("A.B.", GIVEN) is NameClass.SINGLE_TOKEN
+
+def test_classify_ambiguous_pair():
+    assert persons.classify_name("Ella Anita", GIVEN) is NameClass.AMBIGUOUS_PAIR
+    assert persons.classify_name("Kurt Georg", GIVEN) is NameClass.AMBIGUOUS_PAIR
+
+def test_classify_resolvable_single_person():
+    # first + surname (surname not a given name) -> one real person, NOT ambiguous
+    assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
+    assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE