feat(normalizer): person register parsing

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 13:54:37 +02:00
parent 59715bdccd
commit 1da1a8d223
2 changed files with 117 additions and 0 deletions
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -0,0 +1,88 @@
+"""Person register parsing, name splitting, alias resolution."""
+import re
+import unicodedata
+from dataclasses import dataclass, field
+
+import config
+import dates
+
+# Translation table applied before generic accent stripping: German umlauts
+# and eszett become their two-letter spellings. Upper-case umlauts map to
+# lower-case replacements.
+_DIACRITIC_MAP = str.maketrans(
+    dict(zip("äöüßÄÖÜ", ("ae", "oe", "ue", "ss", "ae", "oe", "ue")))
+)
+
+
+def _strip_accents(s: str) -> str:
+    """Return *s* with umlauts/eszett transliterated and other accents removed.
+
+    Characters listed in ``_DIACRITIC_MAP`` are replaced by their two-letter
+    spellings; every remaining character is decomposed (NFKD) and its
+    combining marks are dropped.
+
+    Args:
+        s: Text to fold; may be in composed or decomposed Unicode form.
+
+    Returns:
+        The folded text. Case is not changed here except where the map
+        itself substitutes lower-case letters.
+    """
+    # Compose first: the map only has precomposed keys, so an umlaut stored
+    # as base letter + combining diaeresis would otherwise lose the mark and
+    # come out as plain "u" instead of "ue".
+    s = unicodedata.normalize("NFC", s).translate(_DIACRITIC_MAP)
+    decomposed = unicodedata.normalize("NFKD", s)
+    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+
+
+def slugify(last: str, first: str) -> str:
+    """Build a lower-case, hyphen-separated ASCII id from last and first name.
+
+    The two names are joined with a space, accent-folded and lower-cased;
+    every run of characters outside ``a-z0-9`` becomes a single hyphen, and
+    leading/trailing hyphens are removed.
+
+    Returns:
+        The slug, or ``"unknown"`` when nothing usable remains.
+    """
+    folded = _strip_accents(f"{last} {first}".strip()).lower()
+    slug = re.sub(r"[^a-z0-9]+", "-", folded).strip("-")
+    if not slug:
+        return "unknown"
+    return slug
+
+
+@dataclass
+class Person:
+    """One entry of the person register.
+
+    Only ``person_id`` is required; every other field defaults to an empty
+    value. Field comments describe how ``parse_register`` in this module
+    fills them.
+    """
+    person_id: str  # slug of last + first name; may carry a numeric suffix on collision
+    last_name: str = ""
+    first_name: str = ""  # first of the comma-separated given names
+    maiden_name: str = ""
+    title: str = ""  # not populated by parse_register
+    nickname: str = ""  # taken from a spouse cell that is entirely quoted
+    extra_given_names: list = field(default_factory=list)  # given names after the first
+    birth_date: str | None = None  # dates.parse_date(...).iso; presumably ISO 8601 or None — confirm in dates
+    birth_date_raw: str = ""  # stripped register text the date was parsed from
+    birth_place: str = ""
+    death_date: str | None = None  # dates.parse_date(...).iso; presumably ISO 8601 or None — confirm in dates
+    death_date_raw: str = ""  # stripped register text the date was parsed from
+    death_place: str = ""
+    spouse: str = ""  # left empty when the spouse cell was read as a nickname
+    generation: str = ""
+    notes: str = ""
+    aliases: list = field(default_factory=list)  # not populated by parse_register
+    provisional: bool = False  # parse_register always sets False
+
+
+# Matches a value that is wrapped in quotation marks from start to end and
+# captures the inner text without surrounding whitespace. Besides ASCII " and
+# ' the class also accepts typographic quotes („ “ ” ‚ ‘ ’ » «), which word
+# processors and spreadsheets substitute automatically.
+_QUOTED_RE = re.compile(r'^["\'„“”‚‘’»«]\s*(.+?)\s*["\'„“”‚‘’»«]$')
+
+
+def parse_register(rows: list[dict]) -> list[Person]:
+    """Turn raw register rows into ``Person`` records with distinct ids.
+
+    Rows without a ``last_name`` are skipped. ``first_name`` may hold several
+    comma-separated given names: the first becomes ``first_name``, the rest
+    go to ``extra_given_names``. A ``spouse`` value that is entirely wrapped
+    in quotes is read as a nickname and the spouse is left empty.
+
+    Ids come from ``slugify(last, first)``. The first holder of an id keeps
+    it; later holders get ``-2``, ``-3``, ... appended, skipping any value
+    that is already in use.
+
+    Args:
+        rows: One mapping per register line. Missing or ``None`` cells are
+            read as empty strings.
+
+    Returns:
+        The parsed people in input order.
+    """
+    people = []
+    for row in rows:
+        last = _cell(row, "last_name")
+        if not last:
+            continue
+        givens = [g.strip() for g in _cell(row, "first_name").split(",") if g.strip()]
+        first = givens[0] if givens else ""
+
+        spouse = _cell(row, "spouse")
+        nickname = ""
+        quoted = _QUOTED_RE.match(spouse)
+        if quoted:
+            nickname = quoted.group(1)
+            spouse = ""
+
+        # The date parser receives the cell text unstripped; only the *_raw
+        # copies stored on the record are stripped.
+        birth = dates.parse_date(row.get("birth_date") or "")
+        death = dates.parse_date(row.get("death_date") or "")
+        people.append(Person(
+            person_id=slugify(last, first),
+            last_name=last,
+            first_name=first,
+            maiden_name=_cell(row, "maiden_name"),
+            nickname=nickname,
+            extra_given_names=givens[1:],
+            birth_date=birth.iso,
+            birth_date_raw=_cell(row, "birth_date"),
+            birth_place=_cell(row, "birth_place"),
+            death_date=death.iso,
+            death_date_raw=_cell(row, "death_date"),
+            death_place=_cell(row, "death_place"),
+            spouse=spouse,
+            generation=_cell(row, "generation"),
+            notes=_cell(row, "notes"),
+            provisional=False,
+        ))
+    _assign_unique_ids(people)
+    return people
+
+
+def _cell(row: dict, key: str) -> str:
+    """Return the stripped text of ``row[key]``, or "" when absent or falsy."""
+    return (row.get(key) or "").strip()
+
+
+def _assign_unique_ids(people: list) -> None:
+    """Rename repeated ``person_id`` values in place so all ids are distinct."""
+    # Reserve every slug as generated, so a suffixed id such as "x-2" can
+    # never be handed out when another person already owns "x-2" naturally.
+    taken = {p.person_id for p in people}
+    occurrences = {}
+    for p in people:
+        base = p.person_id
+        n = occurrences.get(base, 0) + 1
+        if n > 1:
+            candidate = f"{base}-{n}"
+            while candidate in taken:
+                n += 1
+                candidate = f"{base}-{n}"
+            p.person_id = candidate
+            taken.add(candidate)
+        occurrences[base] = n
--- a/tools/import-normalizer/tests/test_persons.py
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -0,0 +1,29 @@
+import persons
+
+def test_slugify():
+    """slugify lower-cases, transliterates umlauts and hyphenates name parts."""
+    expected = {
+        ("de Gruyter", "Eugenie"): "de-gruyter-eugenie",
+        ("Müller", "Karl Erhard"): "mueller-karl-erhard",
+    }
+    for (last, first), slug in expected.items():
+        assert persons.slugify(last, first) == slug
+
+def test_parse_register_basic():
+    """Parse two rows: one with a quoted nickname in the spouse cell, one with a real spouse."""
+    nickname_row = {
+        "generation": "G 1",
+        "last_name": "Blomquist",
+        "first_name": "Charlotte,Meta,Jacobi",
+        "maiden_name": "Ruge",
+        "birth_date": "30.8.1862",
+        "birth_place": "Schülperneusiel",
+        "death_date": "1934-07-23",
+        "death_place": "Göteborg",
+        "spouse": '"Tante Lolly"',
+        "notes": "Schwester v Marie Cram",
+    }
+    spouse_row = {
+        "generation": "G 2",
+        "last_name": "Bohrmann",
+        "first_name": "Else",
+        "maiden_name": "Cram",
+        "birth_date": "28.11.1888",
+        "spouse": "Ludwig Bohrmann",
+        "notes": "Schwester v Herbert",
+    }
+    parsed = persons.parse_register([nickname_row, spouse_row])
+
+    charlotte = parsed[0]
+    assert charlotte.person_id == "blomquist-charlotte"
+    assert charlotte.first_name == "Charlotte"
+    assert charlotte.maiden_name == "Ruge"
+    assert charlotte.birth_date == "1862-08-30"
+    # A fully quoted spouse cell is a nickname, not a spouse.
+    assert charlotte.nickname == "Tante Lolly"
+    assert charlotte.spouse == ""
+    assert "Meta" in charlotte.extra_given_names
+    assert "Jacobi" in charlotte.extra_given_names
+
+    else_ = parsed[1]
+    assert else_.maiden_name == "Cram"
+    assert else_.spouse == "Ludwig Bohrmann"
+    assert else_.provisional is False