# Added to tools/import-normalizer/persons_tree.py (directly after
# _deduplicate) by patch fa4b6b5f, "feat(normalizer): add SPOUSE_OF
# resolution to persons_tree".
def _resolve_spouses(
    persons: list[dict], index: dict[str, list[str]]
) -> tuple[list[dict], list[dict]]:
    """Emit SPOUSE_OF edges from each person's ``_spouse_raw`` field.

    Every non-blank ``_spouse_raw`` value is handed to ``_resolve_one``
    together with ``index``.  A successful match becomes one relationship
    dict; a failed match becomes one entry in the unresolved list.

    Args:
        persons: Person dicts. ``rowId`` is required on every person that
            has a non-blank ``_spouse_raw``.
        index: Lookup index passed to ``_resolve_one`` unchanged.

    Returns:
        A ``(relationships, unresolved)`` tuple.

        * ``relationships`` holds dicts with ``personId``,
          ``relatedPersonId``, ``type`` (always ``"SPOUSE_OF"``) and
          ``source`` (always ``"verheiratet_mit"``).  A couple is emitted
          once, in the direction of whichever partner is seen first.
        * ``unresolved`` holds dicts with ``rowId``, ``field``, ``raw`` and
          ``reason``.  ``reason`` is whatever ``_resolve_one`` reported, or
          ``"self_reference"`` when the value resolved to the person's own
          row.

    Raises:
        KeyError: If a person with a non-blank ``_spouse_raw`` has no
            ``rowId``.
    """
    relationships: list[dict] = []
    unresolved: list[dict] = []
    # Unordered pairs already emitted, so A->B and B->A yield a single edge.
    emitted: set[frozenset[str]] = set()

    for person in persons:
        raw = (person.get("_spouse_raw") or "").strip()
        if not raw:
            # No spouse recorded: nothing to resolve and nothing to report.
            continue

        row_id = person["rowId"]
        matched_id, reason = _resolve_one(raw, index)

        if not matched_id:
            unresolved.append({
                "rowId": row_id,
                "field": "verheiratet_mit",
                "raw": raw,
                "reason": reason,
            })
            continue

        if matched_id == row_id:
            # _resolve_one does not know which row is asking, so it can hand
            # back the person's own id.  A self-loop is never a valid spouse
            # edge; surface it for manual review instead of emitting it.
            unresolved.append({
                "rowId": row_id,
                "field": "verheiratet_mit",
                "raw": raw,
                "reason": "self_reference",
            })
            continue

        edge = frozenset((row_id, matched_id))
        if edge in emitted:
            # The partner's row already produced this couple.
            continue
        emitted.add(edge)
        relationships.append({
            "personId": row_id,
            "relatedPersonId": matched_id,
            "type": "SPOUSE_OF",
            "source": "verheiratet_mit",
        })

    return relationships, unresolved
# Tests for persons_tree._resolve_spouses, appended to
# tools/import-normalizer/tests/test_persons_tree.py after
# test_deduplicate_both_none_birth_year_kept.
def _make_persons(*args):
    """Build person dicts from 5-tuples.

    Each positional argument is a
    ``(rowId, firstName, lastName, maidenName, spouse_raw)`` tuple.  All
    remaining keys get fixed placeholder values: ``familyMember`` is True,
    everything else is None.
    """
    placeholders = {
        "_bemerkung_raw": None,
        "birthYear": None,
        "deathYear": None,
        "birthPlace": None,
        "deathPlace": None,
        "generation": None,
        "familyMember": True,
        "alias": None,
        "notes": None,
    }
    return [
        {
            "rowId": spec[0],
            "firstName": spec[1],
            "lastName": spec[2],
            "maidenName": spec[3],
            "_spouse_raw": spec[4],
            **placeholders,
        }
        for spec in args
    ]


def test_resolve_spouses_success():
    """Two partners naming each other yield exactly one SPOUSE_OF edge."""
    couple = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    index = persons_tree._build_index(couple)

    edges, unresolved = persons_tree._resolve_spouses(couple, index)

    assert len(edges) == 1
    edge = edges[0]
    assert edge["type"] == "SPOUSE_OF"
    # Direction is not pinned, only the pair of endpoints.
    assert {edge["personId"], edge["relatedPersonId"]} == {"row_002", "row_003"}
    assert unresolved == []


def test_resolve_spouses_not_found():
    """A spouse value matching nobody is reported with reason ``not_found``."""
    people = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    index = persons_tree._build_index(people)

    edges, unresolved = persons_tree._resolve_spouses(people, index)

    assert edges == []
    assert len(unresolved) == 1
    entry = unresolved[0]
    assert entry["rowId"] == "row_007"
    assert entry["reason"] == "not_found"


def test_resolve_spouses_empty_spouse_field():
    """A person without a spouse value produces no edge and no report."""
    people = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    index = persons_tree._build_index(people)

    edges, unresolved = persons_tree._resolve_spouses(people, index)

    assert edges == []
    assert unresolved == []