"""Person register parsing, name splitting, alias resolution.""" import re import unicodedata from collections import Counter from dataclasses import dataclass, field import config import dates _DIACRITIC_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss", "Ä": "ae", "Ö": "oe", "Ü": "ue"}) def _strip_accents(s: str) -> str: s = s.translate(_DIACRITIC_MAP) s = unicodedata.normalize("NFKD", s) return "".join(c for c in s if not unicodedata.combining(c)) def slugify(last: str, first: str) -> str: raw = f"{last} {first}".strip() raw = _strip_accents(raw).lower() raw = re.sub(r"[^a-z0-9]+", "-", raw).strip("-") return raw or "unknown" @dataclass class Person: person_id: str last_name: str = "" first_name: str = "" maiden_name: str = "" title: str = "" nickname: str = "" extra_given_names: list[str] = field(default_factory=list) birth_date: str | None = None birth_date_raw: str = "" birth_place: str = "" death_date: str | None = None death_date_raw: str = "" death_place: str = "" spouse: str = "" generation: str = "" notes: str = "" aliases: list[str] = field(default_factory=list) provisional: bool = False _QUOTED_RE = re.compile(r'^[""\']\s*(.+?)\s*[""\']$') def parse_register(rows: list[dict]) -> list[Person]: people = [] for r in rows: last = (r.get("last_name") or "").strip() if not last: continue given_raw = (r.get("first_name") or "").strip() givens = [g.strip() for g in given_raw.split(",") if g.strip()] first = givens[0] if givens else "" extra = givens[1:] spouse_raw = (r.get("spouse") or "").strip() nickname = "" m = _QUOTED_RE.match(spouse_raw) if m: nickname = m.group(1) spouse_raw = "" birth = dates.parse_date(r.get("birth_date") or "") death = dates.parse_date(r.get("death_date") or "") people.append(Person( person_id=slugify(last, first), last_name=last, first_name=first, maiden_name=(r.get("maiden_name") or "").strip(), nickname=nickname, extra_given_names=extra, birth_date=birth.iso, birth_date_raw=(r.get("birth_date") or "").strip(), birth_place=(r.get("birth_place") or "").strip(), death_date=death.iso, death_date_raw=(r.get("death_date") or "").strip(), death_place=(r.get("death_place") or "").strip(), spouse=spouse_raw, generation=(r.get("generation") or "").strip(), notes=(r.get("notes") or "").strip(), provisional=False, )) # De-duplicate colliding ids: every member of a colliding group gets a numeric suffix # (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched. counts = Counter(p.person_id for p in people) seen: dict[str, int] = {} for p in people: if counts[p.person_id] > 1: seen[p.person_id] = seen.get(p.person_id, 0) + 1 p.person_id = f"{p.person_id}-{seen[p.person_id]}" return people