feat(normalizer): person register parsing

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 13:54:37 +02:00
parent 59715bdccd
commit 1da1a8d223
2 changed files with 117 additions and 0 deletions

View File

@@ -0,0 +1,88 @@
"""Person register parsing, name splitting, alias resolution."""
import re
import unicodedata
from dataclasses import dataclass, field
import config
import dates
# German umlauts and sharp s are transliterated to their conventional digraphs
# before generic accent stripping: NFKD alone would reduce "ü" to "u" and leave
# "ß"/"ẞ" untouched. Note that upper-case letters map to LOWER-case digraphs,
# so the result is not case-preserving (slugify() lower-cases it anyway).
_DIACRITIC_MAP = str.maketrans({
    "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
    "Ä": "ae", "Ö": "oe", "Ü": "ue", "ẞ": "ss",
})


def _strip_accents(s: str) -> str:
    """Return *s* with German umlauts transliterated and other accents removed.

    Args:
        s: Arbitrary text, in any Unicode normalisation form.

    Returns:
        The text with the characters in ``_DIACRITIC_MAP`` replaced by their
        digraphs and all remaining combining marks dropped.
    """
    # Compose first so that decomposed input ("u" + U+0308) is seen as the
    # single code point "ü" and hits the translation table instead of
    # silently degrading to a bare "u" in the combining-mark filter below.
    composed = unicodedata.normalize("NFC", s)
    transliterated = composed.translate(_DIACRITIC_MAP)
    # Decompose what is left and discard the combining marks.
    decomposed = unicodedata.normalize("NFKD", transliterated)
    return "".join(c for c in decomposed if not unicodedata.combining(c))
def slugify(last: str, first: str) -> str:
    """Build a lower-case, hyphen-separated ASCII slug from a name.

    Args:
        last: Family name.
        first: Given name; may be empty.

    Returns:
        ``"<last>-<first>"`` with accents stripped and every run of characters
        outside ``a-z0-9`` collapsed to a single hyphen, or ``"unknown"`` when
        nothing usable remains.
    """
    combined = f"{last} {first}".strip()
    ascii_lower = _strip_accents(combined).lower()
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_lower).strip("-")
    if not slug:
        return "unknown"
    return slug
@dataclass
class Person:
    """One person entry from the register.

    Only ``person_id`` is required; every other field defaults to an empty
    value (``""``, ``None``, an empty list or ``False``).
    """

    # Slug identifier. parse_register() builds it with slugify(last, first)
    # and may append a numeric suffix when ids collide.
    person_id: str
    last_name: str = ""
    # First of the comma-separated given names found in the source cell.
    first_name: str = ""
    maiden_name: str = ""
    # Not set by parse_register(); stays at its default there.
    title: str = ""
    # parse_register() fills this from a spouse cell that is wrapped in quotes.
    nickname: str = ""
    # Remaining given names after the first one, in source order.
    extra_given_names: list = field(default_factory=list)
    # Value of the ``.iso`` attribute returned by dates.parse_date();
    # presumably an ISO date string, None when unparseable - TODO confirm.
    birth_date: str | None = None
    # The birth date cell as given, whitespace-trimmed.
    birth_date_raw: str = ""
    birth_place: str = ""
    # Same conventions as birth_date / birth_date_raw.
    death_date: str | None = None
    death_date_raw: str = ""
    death_place: str = ""
    # Spouse cell; parse_register() empties it when the cell held a quoted
    # nickname instead.
    spouse: str = ""
    generation: str = ""
    notes: str = ""
    # Not set by parse_register(); stays at its default there.
    aliases: list = field(default_factory=list)
    # parse_register() always passes False. NOTE(review): presumably True
    # marks entries created outside the register - confirm with callers.
    provisional: bool = False
# Matches a value that is wrapped, start to end, in quote characters (" or ')
# and captures the inner text, normally without its surrounding whitespace.
# The opening and closing quote are not required to be the same character.
# NOTE(review): '"' is listed twice in each character class, which is
# redundant; possibly typographic quotes were intended - confirm.
_QUOTED_RE = re.compile(r'^[""\']\s*(.+?)\s*[""\']$')
def parse_register(rows: list[dict]) -> list[Person]:
    """Build ``Person`` records from raw register rows.

    Rows whose ``last_name`` cell is empty or missing are skipped. The
    ``first_name`` cell may hold several comma-separated given names: the
    first becomes ``first_name``, the rest go to ``extra_given_names``. A
    ``spouse`` cell that is entirely wrapped in quotes is treated as a
    nickname, and the spouse is then left empty.

    Args:
        rows: Mappings keyed by column name. Keys read: ``last_name``,
            ``first_name``, ``maiden_name``, ``spouse``, ``birth_date``,
            ``birth_place``, ``death_date``, ``death_place``, ``generation``
            and ``notes``. Missing keys and ``None`` values count as empty.

    Returns:
        One ``Person`` per kept row, in input order, each with a
        ``person_id`` that is unique within the returned list.
    """

    def cell(row: dict, key: str) -> str:
        # Missing keys and None/empty values all normalise to "".
        return (row.get(key) or "").strip()

    people = []
    for r in rows:
        last = cell(r, "last_name")
        if not last:
            continue

        givens = [g.strip() for g in cell(r, "first_name").split(",") if g.strip()]
        first = givens[0] if givens else ""

        spouse = cell(r, "spouse")
        nickname = ""
        m = _QUOTED_RE.match(spouse)
        if m:
            # A fully quoted spouse cell carries a nickname, not a spouse.
            nickname = m.group(1)
            spouse = ""

        # parse_date is handed the cell untrimmed; only the stored *_raw
        # copies below are whitespace-trimmed.
        birth = dates.parse_date(r.get("birth_date") or "")
        death = dates.parse_date(r.get("death_date") or "")

        people.append(Person(
            person_id=slugify(last, first),
            last_name=last,
            first_name=first,
            maiden_name=cell(r, "maiden_name"),
            nickname=nickname,
            extra_given_names=givens[1:],
            birth_date=birth.iso,
            birth_date_raw=cell(r, "birth_date"),
            birth_place=cell(r, "birth_place"),
            death_date=death.iso,
            death_date_raw=cell(r, "death_date"),
            death_place=cell(r, "death_place"),
            spouse=spouse,
            generation=cell(r, "generation"),
            notes=cell(r, "notes"),
            provisional=False,
        ))

    # De-duplicate colliding ids with a numeric suffix ("x", "x-2", "x-3", ...).
    # Every id handed out - natural or suffixed - is recorded, so a generated
    # "x-2" can never clash with a row whose own slug already is "x-2".
    used: set[str] = set()
    last_suffix: dict[str, int] = {}
    for p in people:
        base = p.person_id
        if base not in used:
            used.add(base)
            continue
        n = last_suffix.get(base, 1)
        while True:
            n += 1
            candidate = f"{base}-{n}"
            if candidate not in used:
                break
        last_suffix[base] = n
        p.person_id = candidate
        used.add(candidate)
    return people