fix(normalizer): suffix all members of a colliding person-id group

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 13:58:35 +02:00
parent 1da1a8d223
commit b7a2332861
2 changed files with 19 additions and 8 deletions

View File

@@ -1,6 +1,7 @@
"""Person register parsing, name splitting, alias resolution."""
import re
import unicodedata
from collections import Counter
from dataclasses import dataclass, field
import config
@@ -31,7 +32,7 @@ class Person:
maiden_name: str = ""
title: str = ""
nickname: str = ""
extra_given_names: list = field(default_factory=list)
extra_given_names: list[str] = field(default_factory=list)
birth_date: str | None = None
birth_date_raw: str = ""
birth_place: str = ""
@@ -41,7 +42,7 @@ class Person:
spouse: str = ""
generation: str = ""
notes: str = ""
aliases: list = field(default_factory=list)
aliases: list[str] = field(default_factory=list)
provisional: bool = False
@@ -77,12 +78,12 @@ def parse_register(rows: list[dict]) -> list[Person]:
spouse=spouse_raw, generation=(r.get("generation") or "").strip(),
notes=(r.get("notes") or "").strip(), provisional=False,
))
# De-duplicate colliding ids with numeric suffix
seen = {}
# De-duplicate colliding ids: every member of a colliding group gets a numeric suffix
# (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched.
counts = Counter(p.person_id for p in people)
seen: dict[str, int] = {}
for p in people:
if p.person_id in seen:
seen[p.person_id] += 1
if counts[p.person_id] > 1:
seen[p.person_id] = seen.get(p.person_id, 0) + 1
p.person_id = f"{p.person_id}-{seen[p.person_id]}"
else:
seen[p.person_id] = 1
return people

View File

@@ -27,3 +27,13 @@ def test_parse_register_basic():
assert p2.maiden_name == "Cram"
assert p2.spouse == "Ludwig Bohrmann"
assert p2.provisional is False
def test_parse_register_dedups_colliding_ids():
    """Two register rows with the same name must both receive a numeric id suffix."""
    # Identical first+last name twice: neither row may keep the bare, ambiguous base id.
    duplicate_row = {"last_name": "Cram", "first_name": "Hans"}
    people = persons.parse_register([dict(duplicate_row), dict(duplicate_row)])
    ids = [person.person_id for person in people]
    # Suffixes are expected in input order, and the resulting ids must be distinct.
    assert ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(ids)) == 2