feat(normalizer): add name normalization + lookup index to persons_tree

This commit is contained in:
Marcel
2026-05-25 20:56:47 +02:00
parent 47a0770758
commit 306f3b6fe6
2 changed files with 142 additions and 0 deletions

View File

@@ -83,3 +83,91 @@ def test_parse_generation_empty():
def test_parse_generation_none():
assert persons_tree._parse_generation(None) is None
def test_norm_tree_basic():
assert persons_tree._norm_tree("Werner Allemeyer") == "werner allemeyer"
def test_norm_tree_diacritics():
assert persons_tree._norm_tree("Wöhler") == "woehler"
def test_norm_tree_strips_parens():
assert persons_tree._norm_tree("Otto (Herbert)") == "otto"
def test_norm_tree_strips_quotes():
assert persons_tree._norm_tree('"Tante Lolly"') == "tante lolly"
def test_norm_tree_strips_geographic_suffix():
assert persons_tree._norm_tree("Walter Cram Aachen") == "walter cram"
def test_norm_tree_strips_mexiko():
assert persons_tree._norm_tree("Hans Cram Mexiko") == "hans cram"
def test_norm_tree_collapses_whitespace():
assert persons_tree._norm_tree(" Clara de Gruyter ") == "clara de gruyter"
def test_build_index_forward_lookup():
persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
idx = persons_tree._build_index(persons)
assert "werner allemeyer" in idx
assert idx["werner allemeyer"] == ["row_002"]
def test_build_index_reversed_lookup():
persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
idx = persons_tree._build_index(persons)
assert idx.get("allemeyer werner") == ["row_002"]
def test_build_index_maiden_name_lookup():
persons = [{"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}]
idx = persons_tree._build_index(persons)
assert idx.get("elsgard woehler") == ["row_002"]
def test_build_index_single_token_fallback():
persons = [{"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}]
idx = persons_tree._build_index(persons)
assert idx.get("cram") == ["row_028"]
def test_build_index_ambiguous_single_token():
persons = [
{"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
{"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
]
idx = persons_tree._build_index(persons)
assert set(idx["cram"]) == {"row_028", "row_019"}
def test_resolve_one_found():
persons = [{"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
idx = persons_tree._build_index(persons)
row_id, reason = persons_tree._resolve_one("Allemeyer Werner", idx)
assert row_id == "row_003"
assert reason is None
def test_resolve_one_not_found():
idx = {}
row_id, reason = persons_tree._resolve_one("Nobody Unknown", idx)
assert row_id is None
assert reason == "not_found"
def test_resolve_one_ambiguous():
persons = [
{"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
{"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
]
idx = persons_tree._build_index(persons)
row_id, reason = persons_tree._resolve_one("Cram", idx)
assert row_id is None
assert reason == "ambiguous"