feat(normalizer): alias index with maiden/married/nickname resolution
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
"""Person register parsing, name splitting, alias resolution."""
|
||||
import difflib
|
||||
import re
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
@@ -141,3 +142,56 @@ def split_receivers(raw: str) -> list[str]:
|
||||
result.append(last_seg)
|
||||
return result
|
||||
return parts
|
||||
|
||||
|
||||
def _norm(name: str) -> str:
    """Return the normalized lookup key for *name*.

    The name is passed through ``_strip_accents``, lower-cased, has every
    "." replaced by a space, and then has each whitespace run collapsed to a
    single space with leading/trailing whitespace removed.
    """
    lowered = _strip_accents(name).lower()
    # Dots become spaces so they separate tokens instead of sticking to them.
    undotted = lowered.replace(".", " ")
    collapsed = re.sub(r"\s+", " ", undotted)
    return collapsed.strip()
|
||||
|
||||
|
||||
class AliasIndex:
    """Map normalized name variants to person ids.

    Keys are produced by ``_norm``. Besides exact lookup (``resolve``) the
    index offers a display name per person id (``display``) and a fuzzy
    best-match suggestion (``suggest``).
    """

    def __init__(self, people: list[Person]):
        """Index every person in *people* under each of their name forms.

        Forms per person, in this order: "first last", "first maiden" (when a
        maiden name is set), "<extra given name> last" for every extra given
        name, and the nickname (when set). The first person to claim a
        normalized key keeps it; later collisions are ignored. Afterwards a
        first-name-only alias is added for each first name that belongs to
        exactly one person, unless that key is already taken.

        Side effect: every form accepted for a person is recorded in that
        person's ``aliases`` list.
        """
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {p.person_id for p in people}
        first_name_ids: dict[str, list[str]] = {}

        for p in people:
            full_name = f"{p.first_name} {p.last_name}".strip()
            self._display[p.person_id] = full_name

            # Ordered, de-duplicated forms (NOT a set) so alias order is deterministic — NFR-IDEM-01.
            forms = [full_name]
            if p.maiden_name:
                forms.append(f"{p.first_name} {p.maiden_name}".strip())
            for extra in p.extra_given_names:
                forms.append(f"{extra} {p.last_name}".strip())
            if p.nickname:
                forms.append(p.nickname)

            # dict.fromkeys drops repeated forms while keeping first-seen order.
            for form in dict.fromkeys(forms):
                key = _norm(form)
                if key and key not in self._by_alias:
                    self._by_alias[key] = p.person_id
                    # Building a second index over the same Person objects
                    # must not append the same alias again.
                    if form not in p.aliases:
                        p.aliases.append(form)

            if p.first_name:
                first_key = _norm(p.first_name)
                # A first name that normalizes to "" (e.g. ".") must not
                # become an empty-string alias.
                if first_key:
                    ids = first_name_ids.setdefault(first_key, [])
                    if p.person_id not in ids:
                        ids.append(p.person_id)

        # first-name-only alias, only when unambiguous
        for fname, ids in first_name_ids.items():
            if len(ids) == 1 and fname not in self._by_alias:
                self._by_alias[fname] = ids[0]

    def resolve(self, name: str):
        """Return the person id registered for *name*, or ``None`` if unknown.

        The lookup is exact on the normalized form of *name*.
        """
        return self._by_alias.get(_norm(name))

    def display(self, person_id: str) -> str:
        """Return the "first last" display name for *person_id*, or "" if unknown."""
        return self._display.get(person_id, "")

    def suggest(self, name: str):
        """Return ``(person_id, score)`` for the closest known alias.

        The candidate is chosen by ``difflib.get_close_matches`` using
        ``config.FUZZY_SUGGEST_THRESHOLD`` as cutoff. Returns ``(None, 0.0)``
        when no alias reaches the cutoff.
        """
        query = _norm(name)
        match = difflib.get_close_matches(
            query,
            list(self._by_alias),
            n=1,
            cutoff=config.FUZZY_SUGGEST_THRESHOLD,
        )
        if not match:
            return None, 0.0
        best = match[0]
        # SequenceMatcher.ratio() depends on argument order. get_close_matches
        # rates with the candidate as seq1 and the query as seq2, so use the
        # same order here; otherwise the reported score could fall below the
        # cutoff that selected this match.
        score = difflib.SequenceMatcher(None, best, query).ratio()
        return self._by_alias[best], score
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import config
|
||||
import persons
|
||||
|
||||
def test_slugify():
|
||||
@@ -52,3 +53,21 @@ def test_split_receivers():
|
||||
def test_find_known_last_name():
    """Check find_known_last_name on a full name and on a lone first name."""
    from_full_name = persons.find_known_last_name("Eugenie de Gruyter")
    assert from_full_name == "de Gruyter"

    from_first_name_only = persons.find_known_last_name("Clara")
    assert from_first_name_only is None
|
||||
|
||||
def test_alias_index_resolves_maiden_and_married():
    """Married, maiden and case-variant spellings resolve to the same person."""
    register = [
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ]
    people = persons.parse_register(register)
    index = persons.AliasIndex(people)
    expected_id = people[0].person_id

    spellings = (
        "Eugenie de Gruyter",  # canonical
        "Eugenie Müller",  # maiden alias
        "eugenie müller",  # normalized
    )
    for spelling in spellings:
        assert index.resolve(spelling) == expected_id

    # A name that is not in the register resolves to nothing.
    assert index.resolve("Nobody Unknown") is None
|
||||
|
||||
def test_alias_index_suggestion():
    """A misspelled name yields the right person with a score at or above the threshold."""
    register = [{"last_name": "Wittkopf", "first_name": "Hans"}]
    people = persons.parse_register(register)
    index = persons.AliasIndex(people)

    suggested_id, score = index.suggest("Hans Wittkop")  # typo

    assert suggested_id == people[0].person_id
    assert score >= config.FUZZY_SUGGEST_THRESHOLD
|
||||
|
||||
Reference in New Issue
Block a user