fix(normalizer): suffix all members of a colliding person-id group
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
"""Person register parsing, name splitting, alias resolution."""
|
"""Person register parsing, name splitting, alias resolution."""
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
from collections import Counter
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
import config
|
import config
|
||||||
@@ -31,7 +32,7 @@ class Person:
|
|||||||
maiden_name: str = ""
|
maiden_name: str = ""
|
||||||
title: str = ""
|
title: str = ""
|
||||||
nickname: str = ""
|
nickname: str = ""
|
||||||
extra_given_names: list = field(default_factory=list)
|
extra_given_names: list[str] = field(default_factory=list)
|
||||||
birth_date: str | None = None
|
birth_date: str | None = None
|
||||||
birth_date_raw: str = ""
|
birth_date_raw: str = ""
|
||||||
birth_place: str = ""
|
birth_place: str = ""
|
||||||
@@ -41,7 +42,7 @@ class Person:
|
|||||||
spouse: str = ""
|
spouse: str = ""
|
||||||
generation: str = ""
|
generation: str = ""
|
||||||
notes: str = ""
|
notes: str = ""
|
||||||
aliases: list = field(default_factory=list)
|
aliases: list[str] = field(default_factory=list)
|
||||||
provisional: bool = False
|
provisional: bool = False
|
||||||
|
|
||||||
|
|
||||||
@@ -77,12 +78,12 @@ def parse_register(rows: list[dict]) -> list[Person]:
|
|||||||
spouse=spouse_raw, generation=(r.get("generation") or "").strip(),
|
spouse=spouse_raw, generation=(r.get("generation") or "").strip(),
|
||||||
notes=(r.get("notes") or "").strip(), provisional=False,
|
notes=(r.get("notes") or "").strip(), provisional=False,
|
||||||
))
|
))
|
||||||
# De-duplicate colliding ids with numeric suffix
|
# De-duplicate colliding ids: every member of a colliding group gets a numeric suffix
|
||||||
seen = {}
|
# (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched.
|
||||||
|
counts = Counter(p.person_id for p in people)
|
||||||
|
seen: dict[str, int] = {}
|
||||||
for p in people:
|
for p in people:
|
||||||
if p.person_id in seen:
|
if counts[p.person_id] > 1:
|
||||||
seen[p.person_id] += 1
|
seen[p.person_id] = seen.get(p.person_id, 0) + 1
|
||||||
p.person_id = f"{p.person_id}-{seen[p.person_id]}"
|
p.person_id = f"{p.person_id}-{seen[p.person_id]}"
|
||||||
else:
|
|
||||||
seen[p.person_id] = 1
|
|
||||||
return people
|
return people
|
||||||
|
|||||||
@@ -27,3 +27,13 @@ def test_parse_register_basic():
|
|||||||
assert p2.maiden_name == "Cram"
|
assert p2.maiden_name == "Cram"
|
||||||
assert p2.spouse == "Ludwig Bohrmann"
|
assert p2.spouse == "Ludwig Bohrmann"
|
||||||
assert p2.provisional is False
|
assert p2.provisional is False
|
||||||
|
|
||||||
|
def test_parse_register_dedups_colliding_ids():
|
||||||
|
# Two people with the same first+last name: BOTH get a numeric suffix (no ambiguous base id).
|
||||||
|
people = persons.parse_register([
|
||||||
|
{"last_name": "Cram", "first_name": "Hans"},
|
||||||
|
{"last_name": "Cram", "first_name": "Hans"},
|
||||||
|
])
|
||||||
|
ids = [p.person_id for p in people]
|
||||||
|
assert ids == ["cram-hans-1", "cram-hans-2"]
|
||||||
|
assert len(set(ids)) == 2
|
||||||
|
|||||||
Reference in New Issue
Block a user