Files
familienarchiv/tools/import-normalizer/tests/test_persons.py
2026-05-25 15:51:23 +02:00

133 lines
6.4 KiB
Python

import config
import persons
from persons import NameClass
def test_slugify():
    """Check the slugs expected for a multi-word surname and for an umlaut surname."""
    expectations = [
        (("de Gruyter", "Eugenie"), "de-gruyter-eugenie"),
        (("Müller", "Karl Erhard"), "mueller-karl-erhard"),
    ]
    for name_parts, slug in expectations:
        assert persons.slugify(*name_parts) == slug
def test_parse_register_basic():
    """Parse one fully populated register row and one sparse row, then check the fields.

    The first row is expected to yield a normalized ISO birth date, a primary
    first name plus extra given names from the comma-separated cell, and a
    nickname taken from the quoted spouse cell. The second row is expected to
    keep its plain spouse value and to be non-provisional.
    """
    blomquist_row = {
        "generation": "G 1",
        "last_name": "Blomquist",
        "first_name": "Charlotte,Meta,Jacobi",
        "maiden_name": "Ruge",
        "birth_date": "30.8.1862",
        "birth_place": "Schülperneusiel",
        "death_date": "1934-07-23",
        "death_place": "Göteborg",
        "spouse": '"Tante Lolly"',
        "notes": "Schwester v Marie Cram",
    }
    bohrmann_row = {
        "generation": "G 2",
        "last_name": "Bohrmann",
        "first_name": "Else",
        "maiden_name": "Cram",
        "birth_date": "28.11.1888",
        "spouse": "Ludwig Bohrmann",
        "notes": "Schwester v Herbert",
    }
    people = persons.parse_register([blomquist_row, bohrmann_row])

    charlotte = people[0]
    # Checked in insertion order, attribute by attribute.
    expected_fields = {
        "person_id": "blomquist-charlotte",
        "first_name": "Charlotte",
        "maiden_name": "Ruge",
        "birth_date": "1862-08-30",
        "nickname": "Tante Lolly",  # quoted spouse field is a nickname, not a spouse
        "spouse": "",
    }
    for attribute, expected in expected_fields.items():
        assert getattr(charlotte, attribute) == expected
    assert all(given in charlotte.extra_given_names for given in ("Meta", "Jacobi"))

    else_bohrmann = people[1]
    assert else_bohrmann.maiden_name == "Cram"
    assert else_bohrmann.spouse == "Ludwig Bohrmann"
    assert else_bohrmann.provisional is False
def test_parse_register_dedups_colliding_ids():
    """Two rows with the same first and last name must end up with distinct ids."""
    # BOTH people get a numeric suffix, so no ambiguous bare base id remains.
    template = {"last_name": "Cram", "first_name": "Hans"}
    duplicate_rows = [dict(template) for _ in range(2)]
    people = persons.parse_register(duplicate_rows)
    ids = [person.person_id for person in people]
    assert ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(ids)) == 2
def test_split_receivers():
    """Run split_receivers over a table of receiver cells and their expected name lists."""
    cases = [
        ("Eugenie Müller", ["Eugenie Müller"]),
        ("Walter und Eugenie de Gruyter", ["Walter de Gruyter", "Eugenie de Gruyter"]),
        ("Hedi und Tutu (Gruber)", ["Hedi Gruber", "Tutu Gruber"]),
        ("Clara u Familie", ["Clara"]),
        ("Eugenie de Gruyter geb. Müller", ["Eugenie de Gruyter"]),
        ("Herbert u Clara", ["Herbert", "Clara"]),
        ("", []),
        ("geb. Müller", []),  # maiden-only cell -> no person
        ("Herbert//Clara", ["Herbert", "Clara"]),  # // separator
    ]
    for cell, expected_names in cases:
        assert persons.split_receivers(cell) == expected_names
def test_find_known_last_name():
    """A name with the surname "de Gruyter" yields that surname; a bare first name yields None."""
    with_surname = persons.find_known_last_name("Eugenie de Gruyter")
    assert with_surname == "de Gruyter"
    first_name_only = persons.find_known_last_name("Clara")
    assert first_name_only is None
def test_alias_index_resolves_maiden_and_married():
    """Married, maiden and lower-cased spellings must all resolve to the same person id."""
    register = [
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ]
    people = persons.parse_register(register)
    idx = persons.AliasIndex(people)
    eugenie = people[0].person_id
    spellings = (
        "Eugenie de Gruyter",  # canonical
        "Eugenie Müller",  # maiden alias
        "eugenie müller",  # normalized
    )
    for spelling in spellings:
        assert idx.resolve(spelling) == eugenie
    # A name that is not in the register does not resolve at all.
    assert idx.resolve("Nobody Unknown") is None
def test_alias_index_suggestion():
    """A misspelled name must be suggested with a score at or above the configured threshold."""
    register = [{"last_name": "Wittkopf", "first_name": "Hans"}]
    people = persons.parse_register(register)
    idx = persons.AliasIndex(people)
    suggestion = idx.suggest("Hans Wittkop")  # typo
    sid, score = suggestion
    assert sid == people[0].person_id
    assert score >= config.FUZZY_SUGGEST_THRESHOLD
def test_alias_index_first_name_only_when_unambiguous():
    """A bare first name resolves only when exactly one registered person carries it."""
    register = [
        {"last_name": "Cram", "first_name": "Clara"},
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "Cram", "first_name": "Walter"},  # 2nd "Walter" -> first name ambiguous
    ]
    people = persons.parse_register(register)
    idx = persons.AliasIndex(people)

    unique_hit = idx.resolve("Clara")
    assert unique_hit == people[0].person_id  # unique first name resolves

    ambiguous_hit = idx.resolve("Walter")
    assert ambiguous_hit is None  # ambiguous first name does NOT resolve

    shown = idx.display(people[0].person_id)
    assert shown == "Clara Cram"
# Lower-case given names handed to persons.classify_name as its second
# argument by the classification tests in this module.
GIVEN = {
    "ella",
    "anita",
    "kurt",
    "georg",
    "clara",
    "eugenie",
}
def test_classify_unknown():
    """A lone "?", a name ending in "?" and "unbekannt" are expected to be UNKNOWN."""
    for raw in ("?", "A. Kredell?", "unbekannt"):
        assert persons.classify_name(raw, GIVEN) is NameClass.UNKNOWN
def test_classify_prose():
    """Descriptive text, a cell containing a digit and a quoted title are expected to be PROSE."""
    samples = (
        "Adressenliste v Clara Cram zur Kondolenz",
        "Clara de Gruyter(*1871)",  # digit
        '"Cramiade" Gedicht',  # quote
    )
    for raw in samples:
        assert persons.classify_name(raw, GIVEN) is NameClass.PROSE
def test_classify_collective():
    """Family and group designations are expected to be COLLECTIVE."""
    samples = ("Familie", "Fam.Cram", "Eltern Cram", "seine Kinder")
    for raw in samples:
        assert persons.classify_name(raw, GIVEN) is NameClass.COLLECTIVE
def test_classify_relational():
    """Names prefixed with a kinship word are expected to be RELATIONAL."""
    for raw in ("Cousine Emmy Haniel", "Schwester Hanni"):
        assert persons.classify_name(raw, GIVEN) is NameClass.RELATIONAL
def test_classify_single_token():
    """A lone first name and a bare pair of initials are expected to be SINGLE_TOKEN."""
    for raw in ("Agnes", "A.B."):
        assert persons.classify_name(raw, GIVEN) is NameClass.SINGLE_TOKEN
def test_classify_ambiguous_pair():
    """Two tokens that are both in GIVEN are expected to be AMBIGUOUS_PAIR."""
    for raw in ("Ella Anita", "Kurt Georg"):
        assert persons.classify_name(raw, GIVEN) is NameClass.AMBIGUOUS_PAIR
def test_classify_resolvable_single_person():
    """A first name followed by a surname that is not in GIVEN is expected to be RESOLVABLE."""
    # first + surname (surname not a given name) -> one real person, NOT ambiguous
    for raw in ("Mieze Schefold", "Adolf Butenandt"):
        assert persons.classify_name(raw, GIVEN) is NameClass.RESOLVABLE
def test_build_given_names():
    """build_given_names must collect register first names plus the extra set, lower-cased."""
    register = [
        {"last_name": "de Gruyter", "first_name": "Eugenie"},
        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
    ]
    people = persons.parse_register(register)
    extra_names = {"Anita"}
    g = persons.build_given_names(people, extra_names)
    assert "eugenie" in g
    # primary + extra given names
    assert all(given in g for given in ("charlotte", "meta"))
    assert "anita" in g  # from the extra set, normalized
    # A surname that never appears as a first name must not be included.
    assert "schefold" not in g