diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py
index 968cd7bb..b92823a8 100644
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -1,4 +1,5 @@
 """Person register parsing, name splitting, alias resolution."""
+import difflib
 import re
 import unicodedata
 from collections import Counter
@@ -141,3 +142,86 @@ def split_receivers(raw: str) -> list[str]:
             result.append(last_seg)
         return result
     return parts
+
+
+def _norm(name: str) -> str:
+    """Normalize *name* into an alias lookup key.
+
+    The name goes through ``_strip_accents``, is lower-cased, has dots replaced
+    by spaces, and has whitespace runs collapsed to a single space.
+    """
+    return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip()
+
+
+class AliasIndex:
+    """Map normalized name forms (aliases) to person ids.
+
+    Aliases are claimed first-come-first-served in the order of the people
+    list, so identical input always produces an identical mapping.
+    """
+
+    def __init__(self, people: list[Person]):
+        """Index every name form of *people*.
+
+        Each form that claims a key is also recorded on ``Person.aliases``.
+        """
+        self._by_alias: dict[str, str] = {}
+        self._display: dict[str, str] = {}
+        self.known_ids: set[str] = {p.person_id for p in people}
+        first_name_ids: dict[str, list[str]] = {}
+        for p in people:
+            self._display[p.person_id] = f"{p.first_name} {p.last_name}".strip()
+            # Ordered, de-duplicated forms (NOT a set) so alias order is deterministic — NFR-IDEM-01.
+            forms = [f"{p.first_name} {p.last_name}".strip()]
+            if p.maiden_name:
+                forms.append(f"{p.first_name} {p.maiden_name}".strip())
+            for extra in p.extra_given_names:
+                forms.append(f"{extra} {p.last_name}".strip())
+            if p.nickname:
+                forms.append(p.nickname)
+            # dict.fromkeys drops repeated forms while keeping first-seen order.
+            for form in dict.fromkeys(forms):
+                key = _norm(form)
+                if key and key not in self._by_alias:
+                    self._by_alias[key] = p.person_id
+                    # Guarded so that re-indexing the same Person objects
+                    # does not append duplicate aliases (NFR-IDEM-01).
+                    if form not in p.aliases:
+                        p.aliases.append(form)
+            if p.first_name:
+                ids = first_name_ids.setdefault(_norm(p.first_name), [])
+                if p.person_id not in ids:
+                    ids.append(p.person_id)
+        # first-name-only alias, only when unambiguous
+        for fname, ids in first_name_ids.items():
+            # Skip empty keys, matching the full-form registration above.
+            if fname and len(ids) == 1 and fname not in self._by_alias:
+                self._by_alias[fname] = ids[0]
+
+    def resolve(self, name: str) -> "str | None":
+        """Return the person id registered for *name*, or ``None`` if unknown."""
+        return self._by_alias.get(_norm(name))
+
+    def display(self, person_id: str) -> str:
+        """Return the display name for *person_id*, or an empty string if the id is unknown."""
+        return self._display.get(person_id, "")
+
+    def suggest(self, name: str) -> "tuple[str | None, float]":
+        """Return ``(person_id, score)`` for the closest alias to *name*.
+
+        Returns ``(None, 0.0)`` when *name* normalizes to an empty key or no
+        alias reaches ``config.FUZZY_SUGGEST_THRESHOLD``.
+        """
+        query = _norm(name)
+        if not query:
+            return None, 0.0
+        match = difflib.get_close_matches(
+            query, list(self._by_alias), n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD
+        )
+        if not match:
+            return None, 0.0
+        # SequenceMatcher.ratio() can depend on argument order; use the same
+        # order as get_close_matches (candidate first) so the reported score
+        # is the one that passed the cutoff.
+        score = difflib.SequenceMatcher(None, match[0], query).ratio()
+        return self._by_alias[match[0]], score
diff --git a/tools/import-normalizer/tests/test_persons.py b/tools/import-normalizer/tests/test_persons.py
index 2137509f..a5c9b0cf 100644
--- a/tools/import-normalizer/tests/test_persons.py
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -1,3 +1,4 @@
+import config
 import persons
 
 def test_slugify():
@@ -52,3 +53,29 @@ def test_split_receivers():
 def test_find_known_last_name():
     assert persons.find_known_last_name("Eugenie de Gruyter") == "de Gruyter"
     assert persons.find_known_last_name("Clara") is None
+
+def test_alias_index_resolves_maiden_and_married():
+    people = persons.parse_register([
+        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
+        {"last_name": "Cram", "first_name": "Clara"},
+    ])
+    idx = persons.AliasIndex(people)
+    eugenie = people[0].person_id
+    assert idx.resolve("Eugenie de Gruyter") == eugenie  # canonical
+    assert idx.resolve("Eugenie Müller") == eugenie  # maiden alias
+    assert idx.resolve("eugenie müller") == eugenie  # normalized
+    assert idx.resolve("Nobody Unknown") is None
+
+def test_alias_index_suggestion():
+    people = persons.parse_register([{"last_name": "Wittkopf", "first_name": "Hans"}])
+    idx = persons.AliasIndex(people)
+    sid, score = idx.suggest("Hans Wittkop")  # typo
+    assert sid == people[0].person_id
+    assert score >= config.FUZZY_SUGGEST_THRESHOLD
+
+def test_alias_index_rebuild_does_not_duplicate_aliases():
+    people = persons.parse_register([{"last_name": "Cram", "first_name": "Clara"}])
+    persons.AliasIndex(people)
+    aliases_after_first_build = list(people[0].aliases)
+    persons.AliasIndex(people)
+    assert people[0].aliases == aliases_after_first_build