diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py
new file mode 100644
index 00000000..2c965f2e
--- /dev/null
+++ b/tools/import-normalizer/persons.py
@@ -0,0 +1,148 @@
+"""Person register parsing, name splitting, alias resolution."""
+import re
+import unicodedata
+from dataclasses import dataclass, field
+
+import config
+import dates
+
+# German umlauts and sharp s are transliterated to their usual digraphs before
+# the generic accent stripping; otherwise "ü" would collapse to a bare "u".
+# Upper-case letters keep their case so _strip_accents() never lower-cases its
+# input on its own (slugify() lower-cases afterwards).
+_DIACRITIC_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
+                                "Ä": "Ae", "Ö": "Oe", "Ü": "Ue"})
+
+
+def _strip_accents(s: str) -> str:
+    """Return *s* with umlauts transliterated and combining marks removed."""
+    # Compose first: a decomposed "u" + U+0308 would otherwise miss the umlaut
+    # table and come out as "u" instead of "ue".
+    s = unicodedata.normalize("NFC", s)
+    s = s.translate(_DIACRITIC_MAP)
+    s = unicodedata.normalize("NFKD", s)
+    return "".join(c for c in s if not unicodedata.combining(c))
+
+
+def slugify(last: str, first: str) -> str:
+    """Build a lower-case ASCII id of the form ``last-first``.
+
+    Every run of characters other than ``a-z`` and ``0-9`` becomes one hyphen.
+    Returns ``"unknown"`` when nothing usable is left.
+    """
+    raw = f"{last} {first}".strip()
+    raw = _strip_accents(raw).lower()
+    raw = re.sub(r"[^a-z0-9]+", "-", raw).strip("-")
+    return raw or "unknown"
+
+
+@dataclass
+class Person:
+    """One entry of the person register."""
+
+    # Slug from slugify(); parse_register() makes it unique per register.
+    person_id: str
+    last_name: str = ""
+    # First of the comma-separated given names; the rest are in
+    # extra_given_names.
+    first_name: str = ""
+    maiden_name: str = ""
+    # Not filled in by parse_register().
+    title: str = ""
+    # Taken from a "spouse" cell that is wrapped in quotation marks.
+    nickname: str = ""
+    extra_given_names: list[str] = field(default_factory=list)
+    # *_date holds the ``iso`` attribute of dates.parse_date(); *_date_raw
+    # keeps the stripped source text of the same cell.
+    birth_date: str | None = None
+    birth_date_raw: str = ""
+    birth_place: str = ""
+    death_date: str | None = None
+    death_date_raw: str = ""
+    death_place: str = ""
+    spouse: str = ""
+    generation: str = ""
+    notes: str = ""
+    # Not filled in by parse_register().
+    aliases: list[str] = field(default_factory=list)
+    provisional: bool = False
+
+
+# A cell wrapped in quotation marks from end to end; group 1 is the text
+# between them. Besides the ASCII quotes, the typographic double quotes
+# U+201C, U+201D and U+201E are accepted.
+_QUOTED_RE = re.compile(
+    r'^["\'\u201c\u201d\u201e]\s*(.+?)\s*["\'\u201c\u201d\u201e]$')
+
+
+def _cell(row: dict, key: str) -> str:
+    """Return the stripped text of ``row[key]``, or ``""`` if missing/empty."""
+    return (row.get(key) or "").strip()
+
+
+def _assign_unique_ids(people: list[Person]) -> None:
+    """Make each ``person_id`` unique in place with ``-2``, ``-3``, ... suffixes."""
+    used = set()
+    counters = {}
+    for p in people:
+        base = p.person_id
+        candidate = base
+        n = counters.get(base, 1)
+        # Compare against every id handed out so far, not only equal slugs, so
+        # a generated "x-2" cannot clash with a natural slug "x-2".
+        while candidate in used:
+            n += 1
+            candidate = f"{base}-{n}"
+        counters[base] = n
+        used.add(candidate)
+        p.person_id = candidate
+
+
+def parse_register(rows: list[dict]) -> list[Person]:
+    """Turn register rows into ``Person`` records with unique ids.
+
+    Rows without a ``last_name`` are skipped. ``first_name`` may hold several
+    comma-separated given names: the first becomes ``first_name``, the rest
+    go to ``extra_given_names``. A ``spouse`` cell that is entirely quoted is
+    stored as ``nickname`` and ``spouse`` is left empty.
+    """
+    people = []
+    for r in rows:
+        last = _cell(r, "last_name")
+        if not last:
+            continue
+        given_raw = _cell(r, "first_name")
+        givens = [g.strip() for g in given_raw.split(",") if g.strip()]
+        first = givens[0] if givens else ""
+
+        spouse = _cell(r, "spouse")
+        nickname = ""
+        m = _QUOTED_RE.match(spouse)
+        if m:
+            nickname = m.group(1)
+            spouse = ""
+
+        birth_raw = _cell(r, "birth_date")
+        death_raw = _cell(r, "death_date")
+        birth = dates.parse_date(birth_raw)
+        death = dates.parse_date(death_raw)
+        people.append(Person(
+            person_id=slugify(last, first),
+            last_name=last,
+            first_name=first,
+            maiden_name=_cell(r, "maiden_name"),
+            nickname=nickname,
+            extra_given_names=givens[1:],
+            birth_date=birth.iso,
+            birth_date_raw=birth_raw,
+            birth_place=_cell(r, "birth_place"),
+            death_date=death.iso,
+            death_date_raw=death_raw,
+            death_place=_cell(r, "death_place"),
+            spouse=spouse,
+            generation=_cell(r, "generation"),
+            notes=_cell(r, "notes"),
+            provisional=False,
+        ))
+    _assign_unique_ids(people)
+    return people
diff --git a/tools/import-normalizer/tests/test_persons.py b/tools/import-normalizer/tests/test_persons.py
new file mode 100644
index 00000000..a035dc26
--- /dev/null
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -0,0 +1,50 @@
+import persons
+
+def test_slugify():
+    assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie"
+    assert persons.slugify("Müller", "Karl Erhard") == "mueller-karl-erhard"
+
+def test_slugify_decomposed_umlaut():
+    # "u" + U+0308 (combining diaeresis) must slug like the precomposed "ü"
+    assert persons.slugify("Mu\u0308ller", "Karl") == "mueller-karl"
+
+def test_parse_register_basic():
+    rows = [
+        {"generation": "G 1", "last_name": "Blomquist", "first_name": "Charlotte,Meta,Jacobi",
+         "maiden_name": "Ruge", "birth_date": "30.8.1862", "birth_place": "Schülperneusiel",
+         "death_date": "1934-07-23", "death_place": "Göteborg", "spouse": '"Tante Lolly"',
+         "notes": "Schwester v Marie Cram"},
+        {"generation": "G 2", "last_name": "Bohrmann", "first_name": "Else",
+         "maiden_name": "Cram", "birth_date": "28.11.1888", "spouse": "Ludwig Bohrmann",
+         "notes": "Schwester v Herbert"},
+    ]
+    people = persons.parse_register(rows)
+    p = people[0]
+    assert p.person_id == "blomquist-charlotte"
+    assert p.first_name == "Charlotte"
+    assert p.maiden_name == "Ruge"
+    assert p.birth_date == "1862-08-30"
+    assert p.nickname == "Tante Lolly"  # quoted spouse field is a nickname, not a spouse
+    assert p.spouse == ""
+    assert "Meta" in p.extra_given_names and "Jacobi" in p.extra_given_names
+    p2 = people[1]
+    assert p2.maiden_name == "Cram"
+    assert p2.spouse == "Ludwig Bohrmann"
+    assert p2.provisional is False
+
+def test_parse_register_typographic_quotes():
+    rows = [{"last_name": "Blomquist", "first_name": "Charlotte",
+             "spouse": "\u201eTante Lolly\u201c"}]
+    p = persons.parse_register(rows)[0]
+    assert p.nickname == "Tante Lolly"
+    assert p.spouse == ""
+
+def test_parse_register_unique_ids():
+    # The second row's natural slug equals the suffix a duplicate would get.
+    rows = [
+        {"last_name": "Cram", "first_name": "Herbert"},
+        {"last_name": "Cram", "first_name": "Herbert 2"},
+        {"last_name": "Cram", "first_name": "Herbert"},
+    ]
+    ids = [p.person_id for p in persons.parse_register(rows)]
+    assert ids == ["cram-herbert", "cram-herbert-2", "cram-herbert-3"]