feat(normalizer): build_given_names from register + supplement

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 15:51:23 +02:00
parent 6478cc58ae
commit f10b80a03f
2 changed files with 27 additions and 0 deletions

View File

@@ -197,6 +197,22 @@ def classify_name(raw: str, given_names: set[str]) -> NameClass:
# register; if they surface in review, lower-priority than the real prose entries.
def build_given_names(register: list[Person], extra: set[str]) -> set[str]:
    """Collect the normalized given names known from the register and a supplement.

    For each person in ``register`` the first name (when non-empty) and every
    entry of ``extra_given_names`` are included; the entries of ``extra`` are
    added on top. Each name is passed through ``_norm`` before being stored.

    classify_name consults this set to tell a two-given-name pair (two people)
    from a first+surname.
    """
    given: set[str] = set()
    for person in register:
        first = person.first_name
        if first:  # skip people without a recorded first name
            given.add(_norm(first))
        given.update(_norm(name) for name in person.extra_given_names)
    given.update(_norm(name) for name in extra)
    return given
class AliasIndex:
def __init__(self, people: list[Person]):
self._by_alias: dict[str, str] = {}

View File

@@ -119,3 +119,14 @@ def test_classify_resolvable_single_person():
# first + surname (surname not a given name) -> one real person, NOT ambiguous
assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE
def test_build_given_names():
    """build_given_names merges register first/extra given names with the supplement set."""
    people = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Eugenie"},
        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
    ])
    g = persons.build_given_names(people, {"Anita"})
    assert "eugenie" in g
    assert "charlotte" in g and "meta" in g  # primary + extra given names
    assert "anita" in g  # from the extra set, normalized
    assert "schefold" not in g
    # "schefold" appears nowhere in this fixture, so the check above can never fail.
    # Also require that a surname that IS in the register stays out of the set.
    assert "cram" not in g