fix(normalizer): suffix all members of a colliding person-id group

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 13:58:35 +02:00
parent 1da1a8d223
commit b7a2332861
2 changed files with 19 additions and 8 deletions

View File

@@ -1,6 +1,7 @@
"""Person register parsing, name splitting, alias resolution.""" """Person register parsing, name splitting, alias resolution."""
import re import re
import unicodedata import unicodedata
from collections import Counter
from dataclasses import dataclass, field from dataclasses import dataclass, field
import config import config
@@ -31,7 +32,7 @@ class Person:
maiden_name: str = "" maiden_name: str = ""
title: str = "" title: str = ""
nickname: str = "" nickname: str = ""
extra_given_names: list = field(default_factory=list) extra_given_names: list[str] = field(default_factory=list)
birth_date: str | None = None birth_date: str | None = None
birth_date_raw: str = "" birth_date_raw: str = ""
birth_place: str = "" birth_place: str = ""
@@ -41,7 +42,7 @@ class Person:
spouse: str = "" spouse: str = ""
generation: str = "" generation: str = ""
notes: str = "" notes: str = ""
aliases: list = field(default_factory=list) aliases: list[str] = field(default_factory=list)
provisional: bool = False provisional: bool = False
@@ -77,12 +78,12 @@ def parse_register(rows: list[dict]) -> list[Person]:
spouse=spouse_raw, generation=(r.get("generation") or "").strip(), spouse=spouse_raw, generation=(r.get("generation") or "").strip(),
notes=(r.get("notes") or "").strip(), provisional=False, notes=(r.get("notes") or "").strip(), provisional=False,
)) ))
# De-duplicate colliding ids with numeric suffix # De-duplicate colliding ids: every member of a colliding group gets a numeric suffix
seen = {} # (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched.
counts = Counter(p.person_id for p in people)
seen: dict[str, int] = {}
for p in people: for p in people:
if p.person_id in seen: if counts[p.person_id] > 1:
seen[p.person_id] += 1 seen[p.person_id] = seen.get(p.person_id, 0) + 1
p.person_id = f"{p.person_id}-{seen[p.person_id]}" p.person_id = f"{p.person_id}-{seen[p.person_id]}"
else:
seen[p.person_id] = 1
return people return people

View File

@@ -27,3 +27,13 @@ def test_parse_register_basic():
assert p2.maiden_name == "Cram" assert p2.maiden_name == "Cram"
assert p2.spouse == "Ludwig Bohrmann" assert p2.spouse == "Ludwig Bohrmann"
assert p2.provisional is False assert p2.provisional is False
def test_parse_register_dedups_colliding_ids():
    """Every member of a colliding person-id group gets a numeric suffix."""
    # Two register rows sharing first and last name; the assertions below
    # expect neither of them to keep an unsuffixed id.
    duplicate_row = {"last_name": "Cram", "first_name": "Hans"}
    rows = [dict(duplicate_row), dict(duplicate_row)]

    people = persons.parse_register(rows)
    ids = [p.person_id for p in people]

    # Suffixes are expected in input order, and the resulting ids are distinct.
    assert ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(ids)) == 2