diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py index 2c965f2e..a26c100c 100644 --- a/tools/import-normalizer/persons.py +++ b/tools/import-normalizer/persons.py @@ -1,6 +1,7 @@ """Person register parsing, name splitting, alias resolution.""" import re import unicodedata +from collections import Counter from dataclasses import dataclass, field import config @@ -31,7 +32,7 @@ class Person: maiden_name: str = "" title: str = "" nickname: str = "" - extra_given_names: list = field(default_factory=list) + extra_given_names: list[str] = field(default_factory=list) birth_date: str | None = None birth_date_raw: str = "" birth_place: str = "" @@ -41,7 +42,7 @@ class Person: spouse: str = "" generation: str = "" notes: str = "" - aliases: list = field(default_factory=list) + aliases: list[str] = field(default_factory=list) provisional: bool = False @@ -77,12 +78,12 @@ def parse_register(rows: list[dict]) -> list[Person]: spouse=spouse_raw, generation=(r.get("generation") or "").strip(), notes=(r.get("notes") or "").strip(), provisional=False, )) - # De-duplicate colliding ids with numeric suffix - seen = {} + # De-duplicate colliding ids: every member of a colliding group gets a numeric suffix + # (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched. + counts = Counter(p.person_id for p in people) + seen: dict[str, int] = {} for p in people: - if p.person_id in seen: - seen[p.person_id] += 1 + if counts[p.person_id] > 1: + seen[p.person_id] = seen.get(p.person_id, 0) + 1 p.person_id = f"{p.person_id}-{seen[p.person_id]}" - else: - seen[p.person_id] = 1 return people diff --git a/tools/import-normalizer/tests/test_persons.py b/tools/import-normalizer/tests/test_persons.py index a035dc26..3f1b0649 100644 --- a/tools/import-normalizer/tests/test_persons.py +++ b/tools/import-normalizer/tests/test_persons.py @@ -27,3 +27,13 @@ def test_parse_register_basic(): assert p2.maiden_name == "Cram" assert p2.spouse == "Ludwig Bohrmann" assert p2.provisional is False + +def test_parse_register_dedups_colliding_ids(): + # Two people with the same first+last name: BOTH get a numeric suffix (no ambiguous base id). + people = persons.parse_register([ + {"last_name": "Cram", "first_name": "Hans"}, + {"last_name": "Cram", "first_name": "Hans"}, + ]) + ids = [p.person_id for p in people] + assert ids == ["cram-hans-1", "cram-hans-2"] + assert len(set(ids)) == 2