feat(normalizer): add name normalization + lookup index to persons_tree

This commit is contained in:
Marcel
2026-05-25 20:56:47 +02:00
parent 47a0770758
commit 306f3b6fe6
2 changed files with 142 additions and 0 deletions

View File

@@ -74,3 +74,57 @@ def _parse_generation(raw: str | None) -> int | None:
return None
m = re.search(r"\d+", str(raw))
return int(m.group()) if m else None
_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}
def _norm_tree(s: str) -> str:
    """Return the canonical form of a name used as a tree lookup key.

    Steps, in order:
    - trim whitespace, then surrounding single/double quotes
    - delete every "(...)" group
    - run the text through ``_strip_accents``, lowercase it, turn dots into spaces
    - drop every token listed in ``_GEO_SUFFIXES``
    - join the remaining tokens with single spaces and trim stray ".", "," and " "

    A falsy input (``None`` or an empty string) yields an empty string.
    """
    text = s.strip() if s else ""
    text = text.strip("\"'")
    text = re.sub(r"\([^)]*\)", "", text)
    text = _strip_accents(text).lower().replace(".", " ")

    kept = []
    # str.split() with no argument never yields empty tokens, so only the
    # suffix filter is needed here.
    for token in text.split():
        if token in _GEO_SUFFIXES:
            continue
        kept.append(token)

    return " ".join(kept).strip("., ")
def _build_index(persons: list[dict]) -> dict[str, list[str]]:
    """Build a normalized-name → [rowId, …] lookup index.

    Each person contributes up to four keys, all passed through
    ``_norm_tree``: "first last", "last first", "first maiden" (only when
    ``maidenName`` is set) and the bare last name.

    A row id is recorded at most once per key.  Several of one person's keys
    can normalize to the same string (a missing first name makes
    "first last", "last first" and the bare last name identical; a maiden
    name equal to the last name duplicates "first last").  Listing the same
    row id repeatedly would make ``_resolve_one`` see more than one hit and
    report a unique person as "ambiguous".

    Args:
        persons: Person records.  Each must have a ``rowId`` key and may have
            ``firstName``, ``lastName`` and ``maidenName``; missing or
            ``None`` values are treated as empty strings.

    Returns:
        Mapping from normalized name to the distinct row ids registered
        under it, in first-seen order.

    Raises:
        KeyError: If a record has no ``rowId`` key.
    """
    index: dict[str, list[str]] = {}

    def _add(key: str, row_id: str) -> None:
        # Names that normalize to nothing are not indexed.
        if not key:
            return
        rows = index.setdefault(key, [])
        # Keep one entry per person so len(rows) counts distinct people.
        if row_id not in rows:
            rows.append(row_id)

    for p in persons:
        row_id = p["rowId"]
        first = p.get("firstName") or ""
        last = p.get("lastName") or ""
        maiden = p.get("maidenName") or ""

        _add(_norm_tree(f"{first} {last}"), row_id)
        _add(_norm_tree(f"{last} {first}"), row_id)
        if maiden:
            _add(_norm_tree(f"{first} {maiden}"), row_id)
        _add(_norm_tree(last), row_id)

    return index
def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
    """Look up a raw name in the index built by ``_build_index``.

    Returns ``(row_id, None)`` when the normalized name has exactly one
    entry in the index.  Otherwise returns ``(None, reason)`` where reason is
    "empty" (name normalizes to nothing), "not_found" (no entry) or
    "ambiguous" (more than one entry).
    """
    normalized = _norm_tree(raw)
    if not normalized:
        return None, "empty"

    candidates = index.get(normalized, [])
    count = len(candidates)
    if count == 0:
        return None, "not_found"
    if count > 1:
        return None, "ambiguous"
    return candidates[0], None

View File

@@ -83,3 +83,91 @@ def test_parse_generation_empty():
def test_parse_generation_none():
    """A ``None`` input is passed through as ``None``."""
    result = persons_tree._parse_generation(None)
    assert result is None
def test_norm_tree_basic():
    """A plain two-word name comes back lowercased."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
def test_norm_tree_diacritics():
    """An umlaut is transliterated to its two-letter ASCII form."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
def test_norm_tree_strips_parens():
    """A parenthesised part of the name is removed entirely."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
def test_norm_tree_strips_quotes():
    """Double quotes wrapped around the whole name are stripped."""
    normalized = persons_tree._norm_tree('"Tante Lolly"')
    assert normalized == "tante lolly"
def test_norm_tree_strips_geographic_suffix():
    """The trailing token 'Aachen' is dropped from the name."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
def test_norm_tree_strips_mexiko():
    """The trailing token 'Mexiko' is dropped from the name."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
def test_norm_tree_collapses_whitespace():
    """Whitespace around the name does not survive normalization."""
    normalized = persons_tree._norm_tree(" Clara de Gruyter ")
    assert normalized == "clara de gruyter"
def test_build_index_forward_lookup():
    """The 'first last' key exists and maps to the person's row id."""
    werner = {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    assert "werner allemeyer" in index
    assert index["werner allemeyer"] == ["row_002"]
def test_build_index_reversed_lookup():
    """The 'last first' key maps to the person's row id as well."""
    werner = {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    assert index.get("allemeyer werner") == ["row_002"]
def test_build_index_maiden_name_lookup():
    """A person with a maiden name is also reachable via 'first maiden'."""
    elsgard = {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}
    index = persons_tree._build_index([elsgard])
    assert index.get("elsgard woehler") == ["row_002"]
def test_build_index_single_token_fallback():
    """The bare last name is registered as a key of its own."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert])
    assert index.get("cram") == ["row_028"]
def test_build_index_ambiguous_single_token():
    """Two people sharing a last name are both listed under that key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    assert set(index["cram"]) == {"row_028", "row_019"}
def test_resolve_one_found():
    """A 'last first' query resolves to the single matching row id."""
    werner = {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    matched, why = persons_tree._resolve_one("Allemeyer Werner", index)
    assert matched == "row_003"
    assert why is None
def test_resolve_one_not_found():
    """Looking up a name in an empty index reports 'not_found'."""
    empty_index = {}
    matched, why = persons_tree._resolve_one("Nobody Unknown", empty_index)
    assert matched is None
    assert why == "not_found"
def test_resolve_one_ambiguous():
    """A bare last name shared by two people resolves to 'ambiguous'."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    matched, why = persons_tree._resolve_one("Cram", index)
    assert matched is None
    assert why == "ambiguous"