feat(normalizer): alias index with maiden/married/nickname resolution

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 14:04:11 +02:00
parent 2d97595e9c
commit 53457d9319
2 changed files with 73 additions and 0 deletions

View File

@@ -1,4 +1,5 @@
"""Person register parsing, name splitting, alias resolution."""
import difflib
import re
import unicodedata
from collections import Counter
@@ -141,3 +142,56 @@ def split_receivers(raw: str) -> list[str]:
result.append(last_seg)
return result
return parts
def _norm(name: str) -> str:
    """Return the canonical lookup key for *name*.

    The name is passed through ``_strip_accents`` (presumably removing
    diacritics — confirm against that helper), lower-cased, has every "."
    replaced by a space, and finally has whitespace runs collapsed to a
    single space with leading/trailing whitespace removed.
    """
    unaccented = _strip_accents(name)
    lowered = unaccented.lower()
    # Dots become spaces so that "J.R. Smith" and "J R Smith" share a key.
    without_dots = lowered.replace(".", " ")
    collapsed = re.sub(r"\s+", " ", without_dots)
    return collapsed.strip()
class AliasIndex:
    """Lookup table from normalised name forms to person ids.

    For every person the index registers, in this order: "first last",
    "first maiden" (if a maiden name is set), one "extra last" form per
    additional given name, and the nickname (if set).  The first person to
    claim a normalised key keeps it.  After all people are processed, a bare
    first name is registered as an alias only when exactly one person has it
    and the key is not already taken.

    Attributes:
        known_ids: the set of ``person_id`` values the index was built from.
    """

    def __init__(self, people: list[Person]):
        """Build the index from *people*.

        Side effect: every form that is newly registered for a person is
        also appended to that person's ``aliases`` list (unless the list
        already contains it, so rebuilding an index over the same objects
        does not accumulate duplicates).
        """
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {p.person_id for p in people}
        first_name_ids: dict[str, list[str]] = {}
        for p in people:
            full_name = f"{p.first_name} {p.last_name}".strip()
            self._display[p.person_id] = full_name
            # Ordered forms (NOT a set) so alias order is deterministic — NFR-IDEM-01.
            forms = [full_name]
            if p.maiden_name:
                forms.append(f"{p.first_name} {p.maiden_name}".strip())
            forms.extend(
                f"{extra} {p.last_name}".strip() for extra in p.extra_given_names
            )
            if p.nickname:
                forms.append(p.nickname)
            # dict.fromkeys de-duplicates while preserving insertion order.
            for form in dict.fromkeys(forms):
                key = _norm(form)
                # First claimant wins; empty keys are never registered.
                if key and key not in self._by_alias:
                    self._by_alias[key] = p.person_id
                    if form not in p.aliases:
                        p.aliases.append(form)
            if p.first_name:
                fname = _norm(p.first_name)
                # A first name such as "." normalises to "" — skip it so the
                # empty string can never become a resolvable alias.
                if fname:
                    ids = first_name_ids.setdefault(fname, [])
                    if p.person_id not in ids:
                        ids.append(p.person_id)
        # first-name-only alias, only when unambiguous
        for fname, ids in first_name_ids.items():
            if len(ids) == 1 and fname not in self._by_alias:
                self._by_alias[fname] = ids[0]

    def resolve(self, name: str):
        """Return the person id registered for *name*, or ``None`` if unknown.

        Matching is exact on the normalised form of *name*.
        """
        return self._by_alias.get(_norm(name))

    def display(self, person_id: str) -> str:
        """Return the "first last" display name for *person_id*, or "" if unknown."""
        return self._display.get(person_id, "")

    def suggest(self, name: str):
        """Return ``(person_id, score)`` for the closest known alias.

        Uses ``difflib.get_close_matches`` with
        ``config.FUZZY_SUGGEST_THRESHOLD`` as the cutoff.  Returns
        ``(None, 0.0)`` when no alias reaches the cutoff.
        """
        query = _norm(name)
        match = difflib.get_close_matches(
            query, list(self._by_alias), n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD
        )
        if not match:
            return None, 0.0
        best = match[0]
        # SequenceMatcher.ratio() depends on argument order; get_close_matches
        # compares with the candidate as seq1 and the query as seq2, so use the
        # same order to report exactly the score that passed the cutoff.
        score = difflib.SequenceMatcher(None, best, query).ratio()
        return self._by_alias[best], score