feat(normalizer): classify_name + NameClass

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 15:47:40 +02:00
parent a7c45b3a0e
commit 6478cc58ae
2 changed files with 86 additions and 0 deletions

View File

@@ -4,6 +4,7 @@ import re
import unicodedata import unicodedata
from collections import Counter from collections import Counter
from dataclasses import dataclass, field from dataclasses import dataclass, field
from enum import StrEnum
import config import config
import dates import dates
@@ -148,6 +149,54 @@ def _norm(name: str) -> str:
return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip() return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip()
class NameClass(StrEnum):
RESOLVABLE = "resolvable"
UNKNOWN = "unknown"
SINGLE_TOKEN = "single_token"
RELATIONAL = "relational"
COLLECTIVE = "collective"
PROSE = "prose"
AMBIGUOUS_PAIR = "ambiguous_pair"
_QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019"
def classify_name(raw: str, given_names: set[str]) -> NameClass:
"""Classify a (post-split) sender/receiver string by why it may be unresolvable.
Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL ->
SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE.
"""
s = raw.strip()
if not s:
return NameClass.RESOLVABLE
low = s.lower()
tokens = s.split()
# alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms
# are matched as whole words (no substring/prefix false positives like "Allerton").
alpha_words = re.findall(r"[a-zäöüß]+", low)
if "?" in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS):
return NameClass.UNKNOWN
if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s)
or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3):
return NameClass.PROSE
if any(w in config.COLLECTIVE_TERMS for w in alpha_words):
return NameClass.COLLECTIVE
if any(w in config.RELATIONAL_TERMS for w in alpha_words):
return NameClass.RELATIONAL
if len(tokens) == 1:
return NameClass.SINGLE_TOKEN
if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens):
return NameClass.AMBIGUOUS_PAIR
return NameClass.RESOLVABLE
# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is
# classified PROSE. Such multi-particle names are rare here and usually resolve via the
# register; if they surface in review, lower-priority than the real prose entries.
class AliasIndex: class AliasIndex:
def __init__(self, people: list[Person]): def __init__(self, people: list[Person]):
self._by_alias: dict[str, str] = {} self._by_alias: dict[str, str] = {}

View File

@@ -1,5 +1,6 @@
import config import config
import persons import persons
from persons import NameClass
def test_slugify(): def test_slugify():
assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie" assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie"
@@ -82,3 +83,39 @@ def test_alias_index_first_name_only_when_unambiguous():
assert idx.resolve("Clara") == people[0].person_id # unique first name resolves assert idx.resolve("Clara") == people[0].person_id # unique first name resolves
assert idx.resolve("Walter") is None # ambiguous first name does NOT resolve assert idx.resolve("Walter") is None # ambiguous first name does NOT resolve
assert idx.display(people[0].person_id) == "Clara Cram" assert idx.display(people[0].person_id) == "Clara Cram"
GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
def test_classify_unknown():
assert persons.classify_name("?", GIVEN) is NameClass.UNKNOWN
assert persons.classify_name("A. Kredell?", GIVEN) is NameClass.UNKNOWN
assert persons.classify_name("unbekannt", GIVEN) is NameClass.UNKNOWN
def test_classify_prose():
assert persons.classify_name("Adressenliste v Clara Cram zur Kondolenz", GIVEN) is NameClass.PROSE
assert persons.classify_name("Clara de Gruyter(*1871)", GIVEN) is NameClass.PROSE # digit
assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE # quote
def test_classify_collective():
assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE
assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE
assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE
assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE
def test_classify_relational():
assert persons.classify_name("Cousine Emmy Haniel", GIVEN) is NameClass.RELATIONAL
assert persons.classify_name("Schwester Hanni", GIVEN) is NameClass.RELATIONAL
def test_classify_single_token():
assert persons.classify_name("Agnes", GIVEN) is NameClass.SINGLE_TOKEN
assert persons.classify_name("A.B.", GIVEN) is NameClass.SINGLE_TOKEN
def test_classify_ambiguous_pair():
assert persons.classify_name("Ella Anita", GIVEN) is NameClass.AMBIGUOUS_PAIR
assert persons.classify_name("Kurt Georg", GIVEN) is NameClass.AMBIGUOUS_PAIR
def test_classify_resolvable_single_person():
# first + surname (surname not a given name) -> one real person, NOT ambiguous
assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE