feat(normalizer): add name normalization + lookup index to persons_tree

This commit is contained in:
Marcel
2026-05-25 20:56:47 +02:00
parent 47a0770758
commit 306f3b6fe6
2 changed files with 142 additions and 0 deletions

View File

@@ -74,3 +74,57 @@ def _parse_generation(raw: str | None) -> int | None:
return None
m = re.search(r"\d+", str(raw))
return int(m.group()) if m else None
# Lowercase tokens that _norm_tree discards from a name before matching.
# NOTE(review): despite the "GEO" name, the set mixes what look like place
# markers ("aachen", "mex", "mexiko") with generational markers
# ("sen", "jun", "jr") — confirm the intended scope before extending it.
_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}
def _norm_tree(s: str) -> str:
    """Return the canonical form of a name used as a tree-matching key.

    Steps, in order:

    1. Treat a falsy input as "", trim surrounding whitespace, then trim
       surrounding double/single quote characters.
    2. Delete every "(...)" group.
    3. Pass the text through ``_strip_accents`` (defined elsewhere in this
       module; per the original note it maps diacritics to ASCII, e.g.
       ä→ae — confirm there), lowercase it, and turn dots into spaces.
    4. Split on whitespace and drop every token found in ``_GEO_SUFFIXES``,
       wherever it occurs in the name, not only at the end.
    5. Re-join with single spaces and trim leading/trailing ".", "," and
       " " characters.

    NOTE(review): only leading/trailing commas are trimmed.  A comma inside
    the string stays attached to its token, so "jun," is not recognised as
    a suffix and "last, first" keeps its comma.
    """
    text = (s or "").strip()
    text = text.strip("\"'")
    text = re.sub(r"\([^)]*\)", "", text)
    text = _strip_accents(text)
    text = text.lower()
    text = text.replace(".", " ")

    kept = []
    for token in text.split():
        if token in _GEO_SUFFIXES:
            continue
        kept.append(token)

    return " ".join(kept).strip("., ")
def _build_index(persons: list[dict]) -> dict[str, list[str]]:
    """Build a normalized-name → [rowId, …] lookup index.

    Each person is registered under up to four keys, all passed through
    ``_norm_tree``: "first last", "last first", "first maiden" (only when a
    maiden name is present) and the last name alone.  Keys that normalize
    to an empty string are skipped.

    A person is listed at most once per key.  Several of the keys can
    collapse to the same string (for example when the first name is empty,
    or when first and last name are equal); listing the row id repeatedly
    would make a single person look like multiple hits to a lookup that
    counts the entries.

    Args:
        persons: Records with a required "rowId" entry and optional
            "firstName", "lastName" and "maidenName" entries.  Missing or
            falsy name parts are treated as "".

    Returns:
        Mapping from normalized key to the row ids registered under it, in
        input order.

    Raises:
        KeyError: If a record has no "rowId" entry.
    """
    index: dict[str, list[str]] = {}

    for person in persons:
        row_id = person["rowId"]
        first = person.get("firstName") or ""
        last = person.get("lastName") or ""
        maiden = person.get("maidenName") or ""

        keys = [_norm_tree(f"{first} {last}"), _norm_tree(f"{last} {first}")]
        if maiden:
            keys.append(_norm_tree(f"{first} {maiden}"))
        keys.append(_norm_tree(last))

        # dict.fromkeys drops repeated keys while keeping first-seen order,
        # so this person is appended only once per distinct key.
        for key in dict.fromkeys(keys):
            if key:
                index.setdefault(key, []).append(row_id)

    return index
def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
    """Resolve a raw name to a single row id via the lookup index.

    The name is normalized with ``_norm_tree`` and looked up in ``index``.
    Matching is decided on the number of *distinct* row ids stored under
    the key: an index that lists the same row more than once for a key
    still yields a unique match rather than a spurious "ambiguous".

    Args:
        raw: Name as it appears in the source data.
        index: Mapping from normalized name to row ids.

    Returns:
        ``(row_id, None)`` when exactly one distinct row matches, otherwise
        ``(None, reason)`` with reason "empty" (nothing left after
        normalization), "not_found" (no row under the key) or "ambiguous"
        (more than one distinct row under the key).
    """
    key = _norm_tree(raw)
    if not key:
        return None, "empty"

    # Collapse repeated ids (order-preserving) before counting matches.
    candidates = list(dict.fromkeys(index.get(key, ())))
    if not candidates:
        return None, "not_found"
    if len(candidates) > 1:
        return None, "ambiguous"
    return candidates[0], None