feat(normalizer): person register parsing
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
88
tools/import-normalizer/persons.py
Normal file
88
tools/import-normalizer/persons.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""Person register parsing, name splitting, alias resolution."""
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import config
|
||||
import dates
|
||||
|
||||
# German-specific transliterations, applied before the generic accent stripping
# so that umlauts become "ae"/"oe"/"ue" rather than a bare base letter.
# Upper-case letters map to lower-case digraphs; slugify() lower-cases its
# result anyway.
_DIACRITIC_MAP = str.maketrans({
    "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
    "Ä": "ae", "Ö": "oe", "Ü": "ue",
    "\u1e9e": "ss",  # capital sharp s; has no Unicode decomposition
})


def _strip_accents(s: str) -> str:
    """Return *s* with German umlauts transliterated and other accents removed.

    Umlauts and sharp s are replaced via ``_DIACRITIC_MAP``; every remaining
    combining mark is dropped after NFKD decomposition (``é`` -> ``e``).
    """
    # Compose first: text in decomposed form ("u" + U+0308) would otherwise
    # miss the single-code-point keys of _DIACRITIC_MAP and collapse to "u",
    # producing a different result than the same name in composed form.
    s = unicodedata.normalize("NFC", s)
    s = s.translate(_DIACRITIC_MAP)
    s = unicodedata.normalize("NFKD", s)
    return "".join(c for c in s if not unicodedata.combining(c))


def slugify(last: str, first: str) -> str:
    """Build a lower-case, hyphen-separated ASCII id from a person's name.

    Args:
        last: Last name; placed first in the slug.
        first: First name(s); placed after the last name.

    Returns:
        The slug, e.g. ``"mueller-karl-erhard"``, or ``"unknown"`` when no
        ASCII letter or digit survives.
    """
    raw = f"{last} {first}".strip()
    raw = _strip_accents(raw).lower()
    # Collapse every run of non-alphanumerics into a single hyphen and drop
    # hyphens left dangling at either end.
    raw = re.sub(r"[^a-z0-9]+", "-", raw).strip("-")
    return raw or "unknown"
|
||||
|
||||
|
||||
@dataclass
|
||||
class Person:
|
||||
person_id: str
|
||||
last_name: str = ""
|
||||
first_name: str = ""
|
||||
maiden_name: str = ""
|
||||
title: str = ""
|
||||
nickname: str = ""
|
||||
extra_given_names: list = field(default_factory=list)
|
||||
birth_date: str | None = None
|
||||
birth_date_raw: str = ""
|
||||
birth_place: str = ""
|
||||
death_date: str | None = None
|
||||
death_date_raw: str = ""
|
||||
death_place: str = ""
|
||||
spouse: str = ""
|
||||
generation: str = ""
|
||||
notes: str = ""
|
||||
aliases: list = field(default_factory=list)
|
||||
provisional: bool = False
|
||||
|
||||
|
||||
# Matches a value that is wrapped, start to end, in a quote character (" or ')
# and captures the inner text without surrounding whitespace as group 1. The
# opening and closing quote are not required to be the same character.
# parse_register() uses this to recognise a nickname stored in the spouse column.
# NOTE(review): the character class lists `"` twice; typographic quotes may
# have been intended for the second entry - confirm against the source data.
_QUOTED_RE = re.compile(r'^[""\']\s*(.+?)\s*[""\']$')
|
||||
|
||||
|
||||
def _cell(row: dict, key: str) -> str:
    """Return ``row[key]`` as stripped text; missing, None or empty gives ""."""
    return (row.get(key) or "").strip()


def parse_register(rows: list[dict]) -> list[Person]:
    """Turn raw register rows into ``Person`` records with unique ids.

    Rows without a last name are skipped. The ``first_name`` cell may hold
    several comma-separated given names: the first becomes ``first_name``, the
    rest go to ``extra_given_names``. A ``spouse`` cell that is entirely
    wrapped in quotes is treated as a nickname and the spouse is left empty.

    Args:
        rows: One dict per register row, keyed by column name.

    Returns:
        The parsed people in row order. The first row producing a given slug
        keeps it; later rows with the same slug get a numeric suffix
        (``-2``, ``-3``, ...) that is guaranteed not to clash with any other
        id in the result.
    """
    people = []
    for row in rows:
        last = _cell(row, "last_name")
        if not last:
            continue

        givens = [g.strip() for g in _cell(row, "first_name").split(",") if g.strip()]
        first = givens[0] if givens else ""

        spouse = _cell(row, "spouse")
        nickname = ""
        if m := _QUOTED_RE.match(spouse):
            # A fully quoted spouse cell carries a nickname, not a spouse.
            nickname = m.group(1)
            spouse = ""

        birth = dates.parse_date(row.get("birth_date") or "")
        death = dates.parse_date(row.get("death_date") or "")

        people.append(Person(
            person_id=slugify(last, first),
            last_name=last,
            first_name=first,
            maiden_name=_cell(row, "maiden_name"),
            nickname=nickname,
            extra_given_names=givens[1:],
            birth_date=birth.iso,
            birth_date_raw=_cell(row, "birth_date"),
            birth_place=_cell(row, "birth_place"),
            death_date=death.iso,
            death_date_raw=_cell(row, "death_date"),
            death_place=_cell(row, "death_place"),
            spouse=spouse,
            generation=_cell(row, "generation"),
            notes=_cell(row, "notes"),
            provisional=False,
        ))

    # De-duplicate colliding ids with a numeric suffix. Every natural slug is
    # reserved up front so a generated "base-2" can never land on another
    # person whose own slug already is "base-2".
    taken = {p.person_id for p in people}
    occurrences: dict[str, int] = {}
    for p in people:
        base = p.person_id
        n = occurrences.get(base, 0) + 1
        if n > 1:
            candidate = f"{base}-{n}"
            while candidate in taken:
                n += 1
                candidate = f"{base}-{n}"
            p.person_id = candidate
            taken.add(candidate)
        occurrences[base] = n
    return people
|
||||
29
tools/import-normalizer/tests/test_persons.py
Normal file
29
tools/import-normalizer/tests/test_persons.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import persons
|
||||
|
||||
def test_slugify():
    """slugify joins last and first name into a lower-case hyphenated id."""
    expected = {
        ("de Gruyter", "Eugenie"): "de-gruyter-eugenie",
        ("Müller", "Karl Erhard"): "mueller-karl-erhard",
    }
    for (last, first), slug in expected.items():
        assert persons.slugify(last, first) == slug
|
||||
|
||||
def test_parse_register_basic():
    """Two typical rows: a quoted nickname in the spouse column, and a real spouse."""
    charlotte_row = {
        "generation": "G 1",
        "last_name": "Blomquist",
        "first_name": "Charlotte,Meta,Jacobi",
        "maiden_name": "Ruge",
        "birth_date": "30.8.1862",
        "birth_place": "Schülperneusiel",
        "death_date": "1934-07-23",
        "death_place": "Göteborg",
        "spouse": '"Tante Lolly"',
        "notes": "Schwester v Marie Cram",
    }
    else_row = {
        "generation": "G 2",
        "last_name": "Bohrmann",
        "first_name": "Else",
        "maiden_name": "Cram",
        "birth_date": "28.11.1888",
        "spouse": "Ludwig Bohrmann",
        "notes": "Schwester v Herbert",
    }

    parsed = persons.parse_register([charlotte_row, else_row])

    charlotte = parsed[0]
    assert charlotte.person_id == "blomquist-charlotte"
    assert charlotte.first_name == "Charlotte"
    assert charlotte.maiden_name == "Ruge"
    assert charlotte.birth_date == "1862-08-30"
    # quoted spouse field is a nickname, not a spouse
    assert charlotte.nickname == "Tante Lolly"
    assert charlotte.spouse == ""
    for given in ("Meta", "Jacobi"):
        assert given in charlotte.extra_given_names

    else_person = parsed[1]
    assert else_person.maiden_name == "Cram"
    assert else_person.spouse == "Ludwig Bohrmann"
    assert else_person.provisional is False
|
||||
Reference in New Issue
Block a user