feat(normalizer): alias index with maiden/married/nickname resolution

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 14:04:11 +02:00
parent 2d97595e9c
commit 53457d9319
2 changed files with 73 additions and 0 deletions

View File

@@ -1,4 +1,5 @@
"""Person register parsing, name splitting, alias resolution.""" """Person register parsing, name splitting, alias resolution."""
import difflib
import re import re
import unicodedata import unicodedata
from collections import Counter from collections import Counter
@@ -141,3 +142,56 @@ def split_receivers(raw: str) -> list[str]:
result.append(last_seg) result.append(last_seg)
return result return result
return parts return parts
def _norm(name: str) -> str:
    """Return the comparison key used to look up *name* in the alias index.

    The name is passed through ``_strip_accents``, lower-cased, has every
    "." replaced by a space (so "H. Meier" and "H Meier" compare equal),
    and finally has whitespace runs collapsed to one space with both ends
    trimmed.
    """
    lowered = _strip_accents(name).lower()
    without_dots = lowered.replace(".", " ")
    collapsed = re.sub(r"\s+", " ", without_dots)
    return collapsed.strip()
class AliasIndex:
    """Map normalized name forms to person ids.

    For every person the following forms are registered, in this order:
    the canonical "first last" form, "first maiden" when a maiden name is
    set, one "extra last" form per extra given name, and the nickname.
    Keys are produced by ``_norm``. When two forms normalize to the same
    key, the one registered first keeps it. A bare first name is added as
    an alias only when exactly one person carries it and no other alias
    already owns that key.
    """

    def __init__(self, people: list[Person]):
        """Build the index from *people*.

        Side effect: every form that claims a new key is recorded on the
        owning person's ``aliases`` list. A form already present in that
        list is not appended again, so indexing the same people twice does
        not grow ``aliases``.
        """
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {p.person_id for p in people}
        first_name_ids: dict[str, list[str]] = {}

        for p in people:
            canonical = f"{p.first_name} {p.last_name}".strip()
            self._display[p.person_id] = canonical

            # Ordered, de-duplicated forms (NOT a set) so alias order is
            # deterministic — NFR-IDEM-01.
            forms = [canonical]
            if p.maiden_name:
                forms.append(f"{p.first_name} {p.maiden_name}".strip())
            for extra in p.extra_given_names:
                forms.append(f"{extra} {p.last_name}".strip())
            if p.nickname:
                forms.append(p.nickname)

            # dict.fromkeys drops repeated forms while keeping first-seen order.
            for form in dict.fromkeys(forms):
                key = _norm(form)
                if not key or key in self._by_alias:
                    continue
                self._by_alias[key] = p.person_id
                # Guard against duplicates when the same Person objects are
                # indexed more than once.
                if form not in p.aliases:
                    p.aliases.append(form)

            # Skip first names that normalize to nothing (e.g. "."), so the
            # empty string never becomes an alias key.
            first_key = _norm(p.first_name) if p.first_name else ""
            if first_key:
                ids = first_name_ids.setdefault(first_key, [])
                if p.person_id not in ids:
                    ids.append(p.person_id)

        # first-name-only alias, only when unambiguous
        for fname, ids in first_name_ids.items():
            if len(ids) == 1 and fname not in self._by_alias:
                self._by_alias[fname] = ids[0]

    def resolve(self, name: str) -> "str | None":
        """Return the person id registered for *name*, or ``None`` if unknown."""
        return self._by_alias.get(_norm(name))

    def display(self, person_id: str) -> str:
        """Return the "first last" display name for *person_id*, or "" if unknown."""
        return self._display.get(person_id, "")

    def suggest(self, name: str) -> "tuple[str | None, float]":
        """Return the closest fuzzy match for *name* as ``(person_id, score)``.

        Returns ``(None, 0.0)`` when *name* normalizes to an empty string or
        no alias reaches ``config.FUZZY_SUGGEST_THRESHOLD``.
        """
        query = _norm(name)
        if not query:
            return None, 0.0
        match = difflib.get_close_matches(
            query, list(self._by_alias), n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD
        )
        if not match:
            return None, 0.0
        best = match[0]
        # SequenceMatcher.ratio() depends on argument order. get_close_matches
        # scores with the candidate as the first sequence and the query as the
        # second, so use the same order here; otherwise the reported score can
        # differ from the one that passed the cutoff.
        score = difflib.SequenceMatcher(None, best, query).ratio()
        return self._by_alias[best], score

View File

@@ -1,3 +1,4 @@
import config
import persons import persons
def test_slugify(): def test_slugify():
@@ -52,3 +53,21 @@ def test_split_receivers():
def test_find_known_last_name():
    """A multi-word known surname is found; a lone first name yields None."""
    with_surname = "Eugenie de Gruyter"
    first_name_only = "Clara"
    assert persons.find_known_last_name(with_surname) == "de Gruyter"
    assert persons.find_known_last_name(first_name_only) is None
def test_alias_index_resolves_maiden_and_married():
    """Canonical and maiden-name spellings resolve to one id; unknown names do not."""
    register = [
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ]
    people = persons.parse_register(register)
    index = persons.AliasIndex(people)
    expected_id = people[0].person_id

    # canonical form, maiden alias, then the maiden alias in lower case
    for spelling in ("Eugenie de Gruyter", "Eugenie Müller", "eugenie müller"):
        assert index.resolve(spelling) == expected_id

    assert index.resolve("Nobody Unknown") is None
def test_alias_index_suggestion():
    """A near-miss spelling is suggested at or above the configured threshold."""
    register = [{"last_name": "Wittkopf", "first_name": "Hans"}]
    people = persons.parse_register(register)
    index = persons.AliasIndex(people)

    suggested_id, ratio = index.suggest("Hans Wittkop")  # typo: final "f" missing

    assert suggested_id == people[0].person_id
    assert ratio >= config.FUZZY_SUGGEST_THRESHOLD