feat(normalizer): alias index with maiden/married/nickname resolution

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 14:04:11 +02:00
parent 2d97595e9c
commit 53457d9319
2 changed files with 73 additions and 0 deletions

View File

@@ -1,4 +1,5 @@
"""Person register parsing, name splitting, alias resolution."""
import difflib
import re
import unicodedata
from collections import Counter
@@ -141,3 +142,56 @@ def split_receivers(raw: str) -> list[str]:
result.append(last_seg)
return result
return parts
def _norm(name: str) -> str:
    """Return the comparison key for *name*.

    The name is passed through ``_strip_accents``, lowercased, has every
    "." replaced by a space, and finally has each run of whitespace collapsed
    to a single space with leading/trailing whitespace removed.
    """
    lowered = _strip_accents(name).lower()
    # Dots become spaces first so "J.P. Smith" and "J P Smith" share one key
    # once the whitespace runs are collapsed below.
    undotted = lowered.replace(".", " ")
    collapsed = re.sub(r"\s+", " ", undotted)
    return collapsed.strip()
class AliasIndex:
    """Map normalized name spellings to person ids.

    For each person the following forms are registered, in this order:
    "first last", "first maiden" (only when a maiden name is set), one
    "extra last" per additional given name, and the nickname (when set).
    The first person to claim a normalized spelling keeps it; later people
    with the same spelling do not override it. A bare first name is added as
    an alias only when exactly one person carries it and that spelling has
    not already been claimed by a full form or nickname.
    """

    def __init__(self, people: list[Person]):
        """Build the index from *people*.

        Side effect: each form that gets indexed for a person is also
        recorded in that person's ``aliases`` list (once per distinct form).
        """
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {p.person_id for p in people}
        first_name_ids: dict[str, list[str]] = {}

        for p in people:
            canonical = f"{p.first_name} {p.last_name}".strip()
            self._display[p.person_id] = canonical

            # Ordered list (NOT a set) so alias order is deterministic — NFR-IDEM-01.
            forms = [canonical]
            if p.maiden_name:
                forms.append(f"{p.first_name} {p.maiden_name}".strip())
            for extra in p.extra_given_names:
                forms.append(f"{extra} {p.last_name}".strip())
            if p.nickname:
                forms.append(p.nickname)

            # dict.fromkeys drops repeated forms while keeping first-seen order.
            for form in dict.fromkeys(forms):
                key = _norm(form)
                if key and key not in self._by_alias:
                    self._by_alias[key] = p.person_id
                    # NOTE(review): alias recording is assumed to be paired
                    # with the index insertion — confirm against the original
                    # indentation. The membership check keeps the list free
                    # of duplicates when an index is rebuilt over the same
                    # Person objects.
                    if form not in p.aliases:
                        p.aliases.append(form)

            if p.first_name:
                fname_key = _norm(p.first_name)
                # A first name that normalizes to "" (e.g. "." or " ") must
                # not become an alias; otherwise resolve("") would match.
                if fname_key:
                    ids = first_name_ids.setdefault(fname_key, [])
                    if p.person_id not in ids:
                        ids.append(p.person_id)

        # First-name-only alias, only when unambiguous.
        for fname, ids in first_name_ids.items():
            if len(ids) == 1 and fname not in self._by_alias:
                self._by_alias[fname] = ids[0]

    def resolve(self, name: str):
        """Return the person id registered for *name*, or ``None`` if unknown."""
        return self._by_alias.get(_norm(name))

    def display(self, person_id: str) -> str:
        """Return the "first last" display name for *person_id*, or "" if unknown."""
        return self._display.get(person_id, "")

    def suggest(self, name: str):
        """Return ``(person_id, score)`` for the closest known alias.

        Returns ``(None, 0.0)`` when no alias reaches
        ``config.FUZZY_SUGGEST_THRESHOLD``.
        """
        query = _norm(name)
        candidates = list(self._by_alias)
        match = difflib.get_close_matches(
            query, candidates, n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD
        )
        if not match:
            return None, 0.0
        best = match[0]
        # SequenceMatcher.ratio() can depend on argument order. Use the same
        # order get_close_matches uses internally (candidate first, query
        # second) so the reported score is the one that passed the cutoff.
        score = difflib.SequenceMatcher(None, best, query).ratio()
        return self._by_alias[best], score

View File

@@ -1,3 +1,4 @@
import config
import persons
def test_slugify():
@@ -52,3 +53,21 @@ def test_split_receivers():
def test_find_known_last_name():
    """A known surname is extracted; a bare first name yields None."""
    with_surname = persons.find_known_last_name("Eugenie de Gruyter")
    assert with_surname == "de Gruyter"

    first_name_only = persons.find_known_last_name("Clara")
    assert first_name_only is None
def test_alias_index_resolves_maiden_and_married():
    """Canonical, maiden-name and differently-cased spellings resolve to one person."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ]
    people = persons.parse_register(register_rows)
    index = persons.AliasIndex(people)
    expected_id = people[0].person_id

    # Canonical form, maiden alias, and normalized (lowercase) spelling.
    for spelling in ("Eugenie de Gruyter", "Eugenie Müller", "eugenie müller"):
        assert index.resolve(spelling) == expected_id

    assert index.resolve("Nobody Unknown") is None
def test_alias_index_suggestion():
    """A one-letter typo still yields the right person at or above the threshold."""
    register_rows = [{"last_name": "Wittkopf", "first_name": "Hans"}]
    people = persons.parse_register(register_rows)
    index = persons.AliasIndex(people)

    suggested_id, score = index.suggest("Hans Wittkop")  # typo: trailing "f" missing

    assert suggested_id == people[0].person_id
    assert score >= config.FUZZY_SUGGEST_THRESHOLD