feat(normalizer): alias index with maiden/married/nickname resolution
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
"""Person register parsing, name splitting, alias resolution."""
|
"""Person register parsing, name splitting, alias resolution."""
|
||||||
|
import difflib
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
@@ -141,3 +142,56 @@ def split_receivers(raw: str) -> list[str]:
|
|||||||
result.append(last_seg)
|
result.append(last_seg)
|
||||||
return result
|
return result
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
|
|
||||||
|
def _norm(name: str) -> str:
    """Return the lookup key used for alias matching of *name*.

    The name is passed through ``_strip_accents``, lower-cased, has every
    "." replaced by a space, and finally has each run of whitespace
    collapsed to one space with leading/trailing whitespace removed.
    """
    lowered = _strip_accents(name).lower()
    undotted = lowered.replace(".", " ")
    collapsed = re.sub(r"\s+", " ", undotted)
    return collapsed.strip()
|
||||||
|
|
||||||
|
|
||||||
|
class AliasIndex:
    """Map normalized name spellings (aliases) to person ids.

    Keys are produced by ``_norm``; lookups normalize the query the same way.
    ``known_ids`` holds the ``person_id`` of every person passed to the
    constructor.
    """

    def __init__(self, people: list[Person]):
        """Build the index from *people*.

        For each person these forms are registered, in this order:
        "first last", "first maiden" (when a maiden name is set), one
        "extra last" per extra given name, and the nickname (when set).
        A form is registered only if its normalized key is non-empty and not
        already taken, so the earliest person/form wins a collision. Each
        registered form is also recorded on ``p.aliases`` (the person object
        is mutated). Finally, a bare first name becomes an alias when exactly
        one person carries it and that key is still free.
        """
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {p.person_id for p in people}
        first_name_ids: dict[str, list[str]] = {}
        for p in people:
            full_name = f"{p.first_name} {p.last_name}".strip()
            self._display[p.person_id] = full_name
            # Ordered, de-duplicated forms (NOT a set) so alias order is deterministic — NFR-IDEM-01.
            forms = [full_name]
            if p.maiden_name:
                forms.append(f"{p.first_name} {p.maiden_name}".strip())
            for extra in p.extra_given_names:
                forms.append(f"{extra} {p.last_name}".strip())
            if p.nickname:
                forms.append(p.nickname)
            # dict.fromkeys drops repeated forms while keeping first-seen order.
            for form in dict.fromkeys(forms):
                key = _norm(form)
                if key and key not in self._by_alias:
                    self._by_alias[key] = p.person_id
                    # Guard the append so building a second index over the
                    # same people does not duplicate entries in p.aliases.
                    if form not in p.aliases:
                        p.aliases.append(form)
            if p.first_name:
                ids = first_name_ids.setdefault(_norm(p.first_name), [])
                if p.person_id not in ids:
                    ids.append(p.person_id)
        # first-name-only alias, only when unambiguous
        for fname, ids in first_name_ids.items():
            if len(ids) == 1 and fname not in self._by_alias:
                self._by_alias[fname] = ids[0]

    def resolve(self, name: str) -> "str | None":
        """Return the person id registered for *name*, or ``None`` if unknown."""
        return self._by_alias.get(_norm(name))

    def display(self, person_id: str) -> str:
        """Return the "first last" display name for *person_id*, or "" if unknown."""
        return self._display.get(person_id, "")

    def suggest(self, name: str) -> "tuple[str | None, float]":
        """Return ``(person_id, score)`` for the closest known alias to *name*.

        Candidates are the normalized alias keys; the cutoff is
        ``config.FUZZY_SUGGEST_THRESHOLD``. When no alias reaches the cutoff
        the result is ``(None, 0.0)``.
        """
        query = _norm(name)
        matches = difflib.get_close_matches(
            query,
            list(self._by_alias),
            n=1,
            cutoff=config.FUZZY_SUGGEST_THRESHOLD,
        )
        if not matches:
            return None, 0.0
        best = matches[0]
        # SequenceMatcher.ratio() can depend on argument order. Use the same
        # orientation as get_close_matches (candidate first, query second) so
        # the reported score is the one that passed the cutoff.
        score = difflib.SequenceMatcher(None, best, query).ratio()
        return self._by_alias[best], score
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import config
|
||||||
import persons
|
import persons
|
||||||
|
|
||||||
def test_slugify():
|
def test_slugify():
|
||||||
@@ -52,3 +53,21 @@ def test_split_receivers():
|
|||||||
def test_find_known_last_name():
    """A known multi-word surname is returned; a bare first name yields None."""
    lookup = persons.find_known_last_name
    assert lookup("Eugenie de Gruyter") == "de Gruyter"
    assert lookup("Clara") is None
|
||||||
|
|
||||||
|
def test_alias_index_resolves_maiden_and_married():
    """Canonical, maiden-name and lower-cased spellings resolve to the same id."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ]
    people = persons.parse_register(register_rows)
    index = persons.AliasIndex(people)
    expected_id = people[0].person_id

    spellings = (
        "Eugenie de Gruyter",  # canonical
        "Eugenie Müller",  # maiden alias
        "eugenie müller",  # normalized
    )
    for spelling in spellings:
        assert index.resolve(spelling) == expected_id
    assert index.resolve("Nobody Unknown") is None
|
||||||
|
|
||||||
|
def test_alias_index_suggestion():
    """A misspelled name yields the right person with a score at or above the threshold."""
    register = persons.parse_register([{"last_name": "Wittkopf", "first_name": "Hans"}])
    index = persons.AliasIndex(register)

    suggested_id, ratio = index.suggest("Hans Wittkop")  # typo

    assert suggested_id == register[0].person_id
    assert ratio >= config.FUZZY_SUGGEST_THRESHOLD
|
||||||
|
|||||||
Reference in New Issue
Block a user