diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py
index b9106245..41f66458 100644
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -197,6 +197,23 @@ def classify_name(raw: str, given_names: set[str]) -> NameClass:
 # register; if they surface in review, lower-priority than the real prose entries.
 
 
+def build_given_names(register: list[Person], extra: set[str]) -> set[str]:
+    """Collect the normalized given names known from the register plus a supplement.
+
+    Every person's first name (when set) and extra given names are passed through
+    ``_norm``, as is each entry of ``extra``, and the results are merged into one set.
+    classify_name uses that set to tell a two-given-name pair (two people) from a
+    first+surname.
+    """
+    collected: set[str] = set()
+    for person in register:
+        if person.first_name:
+            collected.add(_norm(person.first_name))
+        collected.update(_norm(given) for given in person.extra_given_names)
+    collected.update(_norm(supplement) for supplement in extra)
+    return collected
+
+
 class AliasIndex:
     def __init__(self, people: list[Person]):
         self._by_alias: dict[str, str] = {}
diff --git a/tools/import-normalizer/tests/test_persons.py b/tools/import-normalizer/tests/test_persons.py
index 53ed62df..5d26ecfa 100644
--- a/tools/import-normalizer/tests/test_persons.py
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -119,3 +119,14 @@ def test_classify_resolvable_single_person():
     # first + surname (surname not a given name) -> one real person, NOT ambiguous
     assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
     assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE
+
+def test_build_given_names():
+    people = persons.parse_register([
+        {"last_name": "de Gruyter", "first_name": "Eugenie"},
+        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
+    ])
+    g = persons.build_given_names(people, {"Anita"})
+    assert "eugenie" in g
+    assert "charlotte" in g and "meta" in g  # primary + extra given names
+    assert "anita" in g  # from the extra set, normalized
+    assert "schefold" not in g