fix(normalizer): suffix all members of a colliding person-id group
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"""Person register parsing, name splitting, alias resolution."""
|
||||
import re
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import config
|
||||
@@ -31,7 +32,7 @@ class Person:
|
||||
maiden_name: str = ""
|
||||
title: str = ""
|
||||
nickname: str = ""
|
||||
extra_given_names: list = field(default_factory=list)
|
||||
extra_given_names: list[str] = field(default_factory=list)
|
||||
birth_date: str | None = None
|
||||
birth_date_raw: str = ""
|
||||
birth_place: str = ""
|
||||
@@ -41,7 +42,7 @@ class Person:
|
||||
spouse: str = ""
|
||||
generation: str = ""
|
||||
notes: str = ""
|
||||
aliases: list = field(default_factory=list)
|
||||
aliases: list[str] = field(default_factory=list)
|
||||
provisional: bool = False
|
||||
|
||||
|
||||
@@ -77,12 +78,12 @@ def parse_register(rows: list[dict]) -> list[Person]:
|
||||
spouse=spouse_raw, generation=(r.get("generation") or "").strip(),
|
||||
notes=(r.get("notes") or "").strip(), provisional=False,
|
||||
))
|
||||
# De-duplicate colliding ids with numeric suffix
|
||||
seen = {}
|
||||
# De-duplicate colliding ids: every member of a colliding group gets a numeric suffix
|
||||
# (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched.
|
||||
counts = Counter(p.person_id for p in people)
|
||||
seen: dict[str, int] = {}
|
||||
for p in people:
|
||||
if p.person_id in seen:
|
||||
seen[p.person_id] += 1
|
||||
if counts[p.person_id] > 1:
|
||||
seen[p.person_id] = seen.get(p.person_id, 0) + 1
|
||||
p.person_id = f"{p.person_id}-{seen[p.person_id]}"
|
||||
else:
|
||||
seen[p.person_id] = 1
|
||||
return people
|
||||
|
||||
@@ -27,3 +27,13 @@ def test_parse_register_basic():
|
||||
assert p2.maiden_name == "Cram"
|
||||
assert p2.spouse == "Ludwig Bohrmann"
|
||||
assert p2.provisional is False
|
||||
|
||||
def test_parse_register_dedups_colliding_ids():
    """Every member of a colliding id group is suffixed; none keeps a bare id."""
    # Two register rows with identical first and last names.
    rows = [{"last_name": "Cram", "first_name": "Hans"} for _ in range(2)]
    parsed = persons.parse_register(rows)
    ids = [person.person_id for person in parsed]
    # Both entries carry a numeric suffix, assigned in input order.
    assert ids == ["cram-hans-1", "cram-hans-2"]
    # The resulting ids are distinct from one another.
    assert len(set(ids)) == 2
|
||||
|
||||
Reference in New Issue
Block a user