Import normalizer: offline tool to normalize the raw archive spreadsheets #663

Merged
marcel merged 172 commits from docs/import-migration into main 2026-05-28 15:05:51 +02:00
2 changed files with 19 additions and 8 deletions
Showing only changes of commit b7a2332861 - Show all commits

View File

@@ -1,6 +1,7 @@
"""Person register parsing, name splitting, alias resolution.""" """Person register parsing, name splitting, alias resolution."""
import re import re
import unicodedata import unicodedata
from collections import Counter
from dataclasses import dataclass, field from dataclasses import dataclass, field
import config import config
@@ -31,7 +32,7 @@ class Person:
maiden_name: str = "" maiden_name: str = ""
title: str = "" title: str = ""
nickname: str = "" nickname: str = ""
extra_given_names: list = field(default_factory=list) extra_given_names: list[str] = field(default_factory=list)
birth_date: str | None = None birth_date: str | None = None
birth_date_raw: str = "" birth_date_raw: str = ""
birth_place: str = "" birth_place: str = ""
@@ -41,7 +42,7 @@ class Person:
spouse: str = "" spouse: str = ""
generation: str = "" generation: str = ""
notes: str = "" notes: str = ""
aliases: list = field(default_factory=list) aliases: list[str] = field(default_factory=list)
provisional: bool = False provisional: bool = False
@@ -77,12 +78,12 @@ def parse_register(rows: list[dict]) -> list[Person]:
spouse=spouse_raw, generation=(r.get("generation") or "").strip(), spouse=spouse_raw, generation=(r.get("generation") or "").strip(),
notes=(r.get("notes") or "").strip(), provisional=False, notes=(r.get("notes") or "").strip(), provisional=False,
)) ))
# De-duplicate colliding ids with numeric suffix # De-duplicate colliding ids: every member of a colliding group gets a numeric suffix
seen = {} # (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched.
counts = Counter(p.person_id for p in people)
seen: dict[str, int] = {}
for p in people: for p in people:
if p.person_id in seen: if counts[p.person_id] > 1:
seen[p.person_id] += 1 seen[p.person_id] = seen.get(p.person_id, 0) + 1
p.person_id = f"{p.person_id}-{seen[p.person_id]}" p.person_id = f"{p.person_id}-{seen[p.person_id]}"
else:
seen[p.person_id] = 1
return people return people

View File

@@ -27,3 +27,13 @@ def test_parse_register_basic():
assert p2.maiden_name == "Cram" assert p2.maiden_name == "Cram"
assert p2.spouse == "Ludwig Bohrmann" assert p2.spouse == "Ludwig Bohrmann"
assert p2.provisional is False assert p2.provisional is False
def test_parse_register_dedups_colliding_ids():
    """Two rows with the same first+last name must BOTH receive a numeric suffix.

    No member of the colliding group may keep the bare, ambiguous base id.
    """
    duplicate_row = {"last_name": "Cram", "first_name": "Hans"}
    # Pass two independent dicts so each row is a distinct object.
    people = persons.parse_register([dict(duplicate_row), dict(duplicate_row)])
    ids = [person.person_id for person in people]
    assert ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(ids)) == 2