feat(normalizer): add row parser to persons_tree

This commit is contained in:
Marcel
2026-05-25 20:59:49 +02:00
parent 306f3b6fe6
commit 7012234e6a
2 changed files with 103 additions and 0 deletions

View File

@@ -128,3 +128,48 @@ def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str
if len(hits) == 0: if len(hits) == 0:
return None, "not_found" return None, "not_found"
return None, "ambiguous" return None, "ambiguous"
def _parse_row(row_num: int, fields: dict) -> dict:
    """Build a single person record from one header-mapped spreadsheet row.

    Every cell is read as stripped text; a missing or falsy cell counts as
    empty. Date cells that contain text but yield no year are preserved by
    prepending them, in brackets, to the free-text notes.

    Keys starting with '_' are internal and are stripped before JSON output
    in main().
    """

    def text(key: str) -> str:
        # Missing / None / empty cells all collapse to "".
        value = fields.get(key)
        return value.strip() if value else ""

    def optional(key: str) -> str | None:
        # Same as text(), but empty becomes None for the output record.
        return text(key) or None

    born_text = text("birth_date")
    died_text = text("death_date")
    born_year = _parse_year(born_text)
    died_year = _parse_year(died_text)

    # Unparseable dates come first, then the row's own remark.
    fragments = []
    if born_year is None and born_text:
        fragments.append(f"[Geburtsdatum: {born_text}]")
    if died_year is None and died_text:
        fragments.append(f"[Todesdatum: {died_text}]")

    remark = text("notes")
    if remark:
        fragments.append(remark)

    maiden_name = optional("maiden_name")
    spouse_name = optional("spouse")

    record = {
        "rowId": f"row_{row_num:03d}",
        "firstName": text("first_name"),
        "lastName": text("last_name"),
        "maidenName": maiden_name,
        "alias": None,
        "notes": " ".join(fragments) or None,
        "birthYear": born_year,
        "deathYear": died_year,
        "birthPlace": optional("birth_place"),
        "deathPlace": optional("death_place"),
        "generation": _parse_generation(text("generation")),
        "familyMember": True,
        "_spouse_raw": spouse_name,
        "_bemerkung_raw": remark or None,
    }
    return record

View File

@@ -171,3 +171,61 @@ def test_resolve_one_ambiguous():
row_id, reason = persons_tree._resolve_one("Cram", idx) row_id, reason = persons_tree._resolve_one("Cram", idx)
assert row_id is None assert row_id is None
assert reason == "ambiguous" assert reason == "ambiguous"
def test_parse_row_serial_dates():
    """A fully populated row with digit-only date cells maps onto the record.

    The digit-only dates are expected to resolve to years, so no bracketed
    birth-date fragment may appear in the notes.
    """
    row = {
        "generation": "G 3",
        "last_name": "Allemeyer",
        "first_name": "Elsgard",
        "maiden_name": "Wöhler",
        "birth_date": "7568",
        "birth_place": "Garz",
        "death_date": "36222",
        "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner",
        "notes": "Nichte von Herbert",
    }

    person = persons_tree._parse_row(2, row)

    expected = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
        "birthYear": 1920,
        "deathYear": 1999,
        "birthPlace": "Garz",
        "deathPlace": "Espelkamp",
        "generation": 3,
        "_spouse_raw": "Allemeyer Werner",
        "_bemerkung_raw": "Nichte von Herbert",
    }
    for key, want in expected.items():
        assert person[key] == want
    assert person["familyMember"] is True
    assert "[Geburtsdatum" not in (person["notes"] or "")
def test_parse_row_string_birth_date():
    """A dotted day.month.year birth date yields its year; blanks stay empty."""
    row = {
        "generation": "G 2",
        "last_name": "Cram",
        "first_name": "Herbert",
        "maiden_name": "",
        "birth_date": "25.6.1890",
        "birth_place": "Texas",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }

    person = persons_tree._parse_row(28, row)

    assert person["birthYear"] == 1890
    assert person["deathYear"] is None
    # Nothing was unparseable and no remark was given, so notes carry no text.
    notes = person["notes"]
    assert notes is None or notes == ""
def test_parse_row_unresolvable_date_goes_to_notes():
    """A birth date without a year is kept as a bracketed fragment in notes."""
    row = {
        "generation": "G 3",
        "last_name": "Heydrich",
        "first_name": "Dieter",
        "maiden_name": "",
        "birth_date": "28.9.",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "Bruder v Ingrid",
    }

    person = persons_tree._parse_row(96, row)

    assert person["birthYear"] is None
    # Both the preserved raw date and the original remark must survive.
    notes = person["notes"]
    for fragment in ("[Geburtsdatum: 28.9.]", "Bruder v Ingrid"):
        assert fragment in notes
def test_parse_row_empty_spouse_and_notes():
    """Blank spouse and notes cells come back as None in the internal keys."""
    row = {
        "generation": "G 4",
        "last_name": "Allemeyer",
        "first_name": "Jürgen",
        "maiden_name": "",
        "birth_date": "",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }

    person = persons_tree._parse_row(4, row)

    for internal_key in ("_spouse_raw", "_bemerkung_raw"):
        assert person[internal_key] is None