feat(normalizer): add name normalization + lookup index to persons_tree
This commit is contained in:
@@ -74,3 +74,57 @@ def _parse_generation(raw: str | None) -> int | None:
|
||||
return None
|
||||
m = re.search(r"\d+", str(raw))
|
||||
return int(m.group()) if m else None
|
||||
|
||||
|
||||
# Tokens that _norm_tree drops from a name wherever they occur.
# Entries must be lowercase: names are lowercased before the membership test.
# NOTE(review): "aachen"/"mex"/"mexiko" look like place names and
# "sen"/"jun"/"jr" like senior/junior markers -- confirm against the data.
_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}
|
||||
|
||||
|
||||
def _norm_tree(s: str | None) -> str:
    """Normalize a name string into a key for tree matching.

    Steps, in order:

    1. Drop parenthetical substrings such as ``(Herbert)``.
    2. Trim whitespace and surrounding single/double quotes.
    3. Transliterate via ``_strip_accents`` (e.g. ``Wöhler`` -> ``woehler``),
       lowercase, and treat dots as token separators.
    4. Trim commas from each token, then discard empty tokens and tokens
       listed in ``_GEO_SUFFIXES``. The filter applies at every position,
       not only at the end of the name.
    5. Join the remaining tokens with single spaces.

    Args:
        s: Raw name; ``None`` and empty strings are accepted.

    Returns:
        The normalized key, or ``""`` when nothing usable remains.
    """
    text = re.sub(r"\([^)]*\)", "", s or "")
    # Quotes are trimmed only after the parenthetical is gone; otherwise a
    # closing quote followed by "(...)" survives, as in '"Lolly" (Tante)'.
    text = text.strip().strip("\"'")
    text = _strip_accents(text).lower().replace(".", " ")
    # Commas are trimmed per token so that "Cram, Herbert" and "Cram Aachen,"
    # produce the same keys as their comma-free spellings.
    tokens = (tok.strip(",") for tok in text.split())
    return " ".join(tok for tok in tokens if tok and tok not in _GEO_SUFFIXES)
|
||||
|
||||
|
||||
def _build_index(persons: list[dict]) -> dict[str, list[str]]:
    """Build a normalized-name -> [rowId, ...] lookup index.

    Up to four keys are registered per person, each passed through
    ``_norm_tree``:

    * ``"<first> <last>"``
    * ``"<last> <first>"``
    * ``"<first> <maiden>"`` (only when a maiden name is present)
    * ``"<last>"`` alone, as a single-token fallback

    Keys that normalize to the empty string are skipped.

    Args:
        persons: Person records. Each must contain ``"rowId"``;
            ``"firstName"``, ``"lastName"`` and ``"maidenName"`` are
            optional and may be ``None``.

    Returns:
        Mapping from normalized key to the row ids registered under it, in
        input order. A key shared by several persons lists all of them.

    Raises:
        KeyError: If a record has no ``"rowId"`` entry.
    """
    index: dict[str, list[str]] = {}

    for person in persons:
        row_id = person["rowId"]
        first = person.get("firstName") or ""
        last = person.get("lastName") or ""
        maiden = person.get("maidenName") or ""

        variants = [f"{first} {last}", f"{last} {first}"]
        if maiden:
            variants.append(f"{first} {maiden}")
        variants.append(last)

        # Several variants of one person can normalize to the same key (with
        # no first name, "first last", "last first" and "last" all coincide).
        # Registering the row id once per key keeps a unique person from
        # looking ambiguous to callers that count hits. dict.fromkeys
        # de-duplicates while preserving the variant order.
        for key in dict.fromkeys(_norm_tree(variant) for variant in variants):
            if key:
                index.setdefault(key, []).append(row_id)

    return index
|
||||
|
||||
|
||||
def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
    """Look up a raw name in the index and report the outcome.

    Returns:
        ``(row_id, None)`` when exactly one row id is registered for the
        normalized name. Otherwise ``(None, reason)`` where ``reason`` is
        ``"empty"`` (name normalizes to nothing), ``"not_found"`` (no entry)
        or ``"ambiguous"`` (more than one entry).
    """
    normalized = _norm_tree(raw)
    if not normalized:
        return None, "empty"

    candidates = index.get(normalized)
    if not candidates:
        # Covers both a missing key and an empty hit list.
        return None, "not_found"
    if len(candidates) > 1:
        return None, "ambiguous"
    return candidates[0], None
|
||||
|
||||
@@ -83,3 +83,91 @@ def test_parse_generation_empty():
|
||||
|
||||
def test_parse_generation_none():
    """``None`` input yields ``None``."""
    result = persons_tree._parse_generation(None)
    assert result is None
|
||||
|
||||
|
||||
def test_norm_tree_basic():
    """A plain two-token name is lowercased and otherwise unchanged."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
|
||||
|
||||
|
||||
def test_norm_tree_diacritics():
    """An umlaut is transliterated to its two-letter ASCII form."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
|
||||
|
||||
|
||||
def test_norm_tree_strips_parens():
    """A parenthetical substring is removed from the key."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
|
||||
|
||||
|
||||
def test_norm_tree_strips_quotes():
    """Surrounding double quotes do not end up in the key."""
    normalized = persons_tree._norm_tree('"Tante Lolly"')
    assert normalized == "tante lolly"
|
||||
|
||||
|
||||
def test_norm_tree_strips_geographic_suffix():
    """The trailing token "Aachen" is dropped from the key."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
|
||||
|
||||
|
||||
def test_norm_tree_strips_mexiko():
    """The trailing token "Mexiko" is dropped from the key."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
|
||||
|
||||
|
||||
def test_norm_tree_collapses_whitespace():
    """Leading and trailing whitespace is removed from the key."""
    normalized = persons_tree._norm_tree(" Clara de Gruyter ")
    assert normalized == "clara de gruyter"
|
||||
|
||||
|
||||
def test_build_index_forward_lookup():
    """The "first last" key exists and maps to the person's row id."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    idx = persons_tree._build_index([werner])
    assert "werner allemeyer" in idx
    assert idx["werner allemeyer"] == ["row_002"]
|
||||
|
||||
|
||||
def test_build_index_reversed_lookup():
    """The "last first" key maps to the person's row id."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    idx = persons_tree._build_index([werner])
    assert idx.get("allemeyer werner") == ["row_002"]
|
||||
|
||||
|
||||
def test_build_index_maiden_name_lookup():
    """The "first maiden" key maps to the person's row id."""
    elsgard = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
    }
    idx = persons_tree._build_index([elsgard])
    assert idx.get("elsgard woehler") == ["row_002"]
|
||||
|
||||
|
||||
def test_build_index_single_token_fallback():
    """The last name alone is registered as a key for the person."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    idx = persons_tree._build_index([herbert])
    assert idx.get("cram") == ["row_028"]
|
||||
|
||||
|
||||
def test_build_index_ambiguous_single_token():
    """Two persons sharing a last name are both listed under that key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    idx = persons_tree._build_index([herbert, clara])
    assert set(idx["cram"]) == {"row_028", "row_019"}
|
||||
|
||||
|
||||
def test_resolve_one_found():
    """A "last first" query resolves to the single matching row id."""
    werner = {
        "rowId": "row_003",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    idx = persons_tree._build_index([werner])
    matched_id, failure = persons_tree._resolve_one("Allemeyer Werner", idx)
    assert matched_id == "row_003"
    assert failure is None
|
||||
|
||||
|
||||
def test_resolve_one_not_found():
    """A query against an empty index reports "not_found"."""
    matched_id, failure = persons_tree._resolve_one("Nobody Unknown", {})
    assert matched_id is None
    assert failure == "not_found"
|
||||
|
||||
|
||||
def test_resolve_one_ambiguous():
    """A last-name-only query shared by two persons reports "ambiguous"."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    idx = persons_tree._build_index([herbert, clara])
    matched_id, failure = persons_tree._resolve_one("Cram", idx)
    assert matched_id is None
    assert failure == "ambiguous"
|
||||
|
||||
Reference in New Issue
Block a user