feat(normalizer): add name normalization + lookup index to persons_tree
This commit is contained in:
@@ -83,3 +83,91 @@ def test_parse_generation_empty():
|
||||
|
||||
def test_parse_generation_none():
    """Passing None to `_parse_generation` is expected to yield None."""
    parsed = persons_tree._parse_generation(None)
    assert parsed is None
|
||||
|
||||
|
||||
def test_norm_tree_basic():
    """A plain "First Last" name is expected to come back lower-cased."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
|
||||
|
||||
|
||||
def test_norm_tree_diacritics():
    """The umlaut in "Wöhler" is expected to be spelled out as "oe"."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
|
||||
|
||||
|
||||
def test_norm_tree_strips_parens():
    """A parenthesised addition after the name is expected to be dropped."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
|
||||
|
||||
|
||||
def test_norm_tree_strips_quotes():
    """Surrounding double quotes are expected to be removed from the name."""
    quoted_name = '"Tante Lolly"'
    assert persons_tree._norm_tree(quoted_name) == "tante lolly"
|
||||
|
||||
|
||||
def test_norm_tree_strips_geographic_suffix():
    """A trailing place name ("Aachen") is expected to be cut off."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
|
||||
|
||||
|
||||
def test_norm_tree_strips_mexiko():
    """A trailing "Mexiko" token is expected to be cut off as well."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
|
||||
|
||||
|
||||
def test_norm_tree_collapses_whitespace():
    """Leading and trailing blanks are expected to vanish from the result.

    NOTE(review): the input below contains only single inner spaces, so
    collapsing of *repeated* whitespace is not exercised as written --
    confirm the literal matches the intended test data.
    """
    padded_name = " Clara de Gruyter "
    assert persons_tree._norm_tree(padded_name) == "clara de gruyter"
|
||||
|
||||
|
||||
def test_build_index_forward_lookup():
    """The "first last" key is expected to exist and map to the row id."""
    people = [
        {
            "rowId": "row_002",
            "firstName": "Werner",
            "lastName": "Allemeyer",
            "maidenName": None,
        },
    ]
    index = persons_tree._build_index(people)
    forward_key = "werner allemeyer"
    assert forward_key in index
    assert index[forward_key] == ["row_002"]
|
||||
|
||||
|
||||
def test_build_index_reversed_lookup():
    """The "last first" key is expected to map to the same row id."""
    people = [
        {
            "rowId": "row_002",
            "firstName": "Werner",
            "lastName": "Allemeyer",
            "maidenName": None,
        },
    ]
    index = persons_tree._build_index(people)
    hits = index.get("allemeyer werner")
    assert hits == ["row_002"]
|
||||
|
||||
|
||||
def test_build_index_maiden_name_lookup():
    """First name plus normalized maiden name is expected to be a key."""
    people = [
        {
            "rowId": "row_002",
            "firstName": "Elsgard",
            "lastName": "Allemeyer",
            "maidenName": "Wöhler",
        },
    ]
    index = persons_tree._build_index(people)
    hits = index.get("elsgard woehler")
    assert hits == ["row_002"]
|
||||
|
||||
|
||||
def test_build_index_single_token_fallback():
    """The bare last name alone is expected to be a key for the row."""
    people = [
        {
            "rowId": "row_028",
            "firstName": "Herbert",
            "lastName": "Cram",
            "maidenName": None,
        },
    ]
    index = persons_tree._build_index(people)
    hits = index.get("cram")
    assert hits == ["row_028"]
|
||||
|
||||
|
||||
def test_build_index_ambiguous_single_token():
    """Two people sharing a last name are expected under the same key."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    clara = {
        "rowId": "row_019",
        "firstName": "Clara",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert, clara])
    # Compared as a set: the order of the row ids is not part of the check.
    shared_rows = set(index["cram"])
    assert shared_rows == {"row_028", "row_019"}
|
||||
|
||||
|
||||
def test_resolve_one_found():
    """A "Last First" query is expected to resolve with no failure reason."""
    people = [
        {
            "rowId": "row_003",
            "firstName": "Werner",
            "lastName": "Allemeyer",
            "maidenName": None,
        },
    ]
    index = persons_tree._build_index(people)
    matched_id, failure = persons_tree._resolve_one("Allemeyer Werner", index)
    assert matched_id == "row_003"
    assert failure is None
|
||||
|
||||
|
||||
def test_resolve_one_not_found():
    """Against an empty index the result is expected to be "not_found"."""
    empty_index = {}
    matched_id, failure = persons_tree._resolve_one(
        "Nobody Unknown", empty_index
    )
    assert matched_id is None
    assert failure == "not_found"
|
||||
|
||||
|
||||
def test_resolve_one_ambiguous():
    """A last name shared by two rows is expected to report "ambiguous"."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    clara = {
        "rowId": "row_019",
        "firstName": "Clara",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert, clara])
    matched_id, failure = persons_tree._resolve_one("Cram", index)
    assert matched_id is None
    assert failure == "ambiguous"
|
||||
|
||||
Reference in New Issue
Block a user