From 6f55489ec26e5715df723139dda2657bc8454a16 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 21:06:24 +0200
Subject: [PATCH] feat(normalizer): add PARENT_OF Bemerkung extraction to persons_tree

---
 tools/import-normalizer/persons_tree.py       |  75 +++++++++++
 .../tests/test_persons_tree.py                | 125 ++++++++++++++++++
 2 files changed, 200 insertions(+)

diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py
index 9b7f6095..0866fba4 100644
--- a/tools/import-normalizer/persons_tree.py
+++ b/tools/import-normalizer/persons_tree.py
@@ -113,6 +113,7 @@ def _build_index(persons: list[dict]) -> dict[str, list[str]]:
         if maiden:
             _add(_norm_tree(f"{first} {maiden}"), row_id)
         _add(_norm_tree(last), row_id)
+        _add(_norm_tree(first), row_id)
 
     return index
 
@@ -242,3 +243,77 @@ def _resolve_spouses(
             })
 
     return relationships, unresolved
+
+
+# Kinship notes abbreviate freely: "v", "v." and "von" all mean "of";
+# "u", "u." and "und" all mean "and".
+_CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on|\.)?\s+(.+)", re.I)
+_PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on|\.)?\s+(.+)", re.I)
+_AND_RE = re.compile(r"\s+u(?:nd|\.)?\s+", re.I)
+
+
+def _parse_bemerkung(
+    row_id: str, bemerkung: str, index: dict[str, list[str]]
+) -> tuple[list[dict], list[dict], str]:
+    """Extract PARENT_OF edges from a Bemerkung cell.
+
+    Recognised forms are "Sohn/Tochter v[on] <names>" (the row is the child)
+    and "Vater/Mutter v[on] <names>" (the row is the parent). Several names
+    may be joined with "u"/"und"; each one is looked up with _resolve_one.
+
+    Returns (relationships, unresolved, remaining_notes).
+    Text that doesn't match a parent pattern goes to remaining_notes with
+    only the surrounding whitespace stripped. A name that resolves to row_id
+    itself is reported as unresolved with reason "self_reference" instead of
+    producing a self-loop edge.
+    """
+    if not bemerkung or not bemerkung.strip():
+        return [], [], ""
+
+    s = bemerkung.strip()
+
+    for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")):
+        m = pattern.match(s)
+        if not m:
+            continue
+
+        name_part = m.group(1).strip().rstrip("!., ")
+        parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
+        rels: list[dict] = []
+        unres: list[dict] = []
+
+        for part in parts:
+            part = part.rstrip("!., ")
+            matched_id, reason = _resolve_one(part, index)
+            if matched_id and matched_id == row_id:
+                # First-name-only index keys can resolve a namesake back to
+                # this very row; nobody is their own parent or child.
+                matched_id, reason = None, "self_reference"
+            if matched_id:
+                # PARENT_OF always points from the parent to the child.
+                if direction == "child":
+                    parent_id, child_id = matched_id, row_id
+                else:
+                    parent_id, child_id = row_id, matched_id
+                rels.append({
+                    "personId": parent_id,
+                    "relatedPersonId": child_id,
+                    "type": "PARENT_OF",
+                    "source": "bemerkung",
+                    "rawBemerkung": bemerkung,
+                })
+            else:
+                unres.append({
+                    "rowId": row_id,
+                    "field": "bemerkung",
+                    "raw": bemerkung,
+                    "reason": reason,
+                })
+
+        # "." never matches a newline, so the match ends with the first line;
+        # any following lines are handed back as free-text notes.
+        remainder = s[m.end():].strip().lstrip(".,! ")
+        return rels, unres, remainder
+
+    # No pattern matched — full text goes to notes, nothing to unresolved
+    return [], [], s
diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py
index 97bbc8bd..d08d2029 100644
--- a/tools/import-normalizer/tests/test_persons_tree.py
+++ b/tools/import-normalizer/tests/test_persons_tree.py
@@ -318,3 +318,128 @@ def test_resolve_spouses_empty_spouse_field():
     idx = persons_tree._build_index(persons)
     rels, unres = persons_tree._resolve_spouses(persons, idx)
     assert rels == [] and unres == []
+
+
+def _register(*args):
+    """Build index from (rowId, first, last, maiden) tuples."""
+    persons = [
+        {"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3]}
+        for a in args
+    ]
+    return persons, persons_tree._build_index(persons)
+
+
+def test_parse_bemerkung_sohn_two_parents():
+    _, idx = _register(
+        ("row_019", "Clara", "Cram", "de Gruyter"),
+        ("row_028", "Herbert", "Cram", None),
+    )
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_021", "Sohn v Clara u Herbert", idx
+    )
+    assert len(rels) == 2
+    assert all(r["type"] == "PARENT_OF" for r in rels)
+    child_ids = {r["relatedPersonId"] for r in rels}
+    parent_ids = {r["personId"] for r in rels}
+    assert child_ids == {"row_021"}
+    assert "row_019" in parent_ids and "row_028" in parent_ids
+    assert unres == []
+    assert notes == ""
+
+
+def test_parse_bemerkung_tochter_von():
+    _, idx = _register(("row_019", "Clara", "Cram", None))
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_036", "Tochter von Clara Cram", idx
+    )
+    assert len(rels) == 1
+    assert rels[0] == {
+        "personId": "row_019",
+        "relatedPersonId": "row_036",
+        "type": "PARENT_OF",
+        "source": "bemerkung",
+        "rawBemerkung": "Tochter von Clara Cram",
+    }
+    assert notes == ""
+
+
+def test_parse_bemerkung_vater():
+    _, idx = _register(("row_028", "Herbert", "Cram", None))
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_031", "Vater v Herbert", idx
+    )
+    assert len(rels) == 1
+    assert rels[0]["personId"] == "row_031"
+    assert rels[0]["relatedPersonId"] == "row_028"
+    assert rels[0]["type"] == "PARENT_OF"
+
+
+def test_parse_bemerkung_unmatched_parent_name():
+    _, idx = _register()  # empty index
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_004", "Sohn v Elsgard A.", idx
+    )
+    assert rels == []
+    assert len(unres) == 1
+    assert unres[0]["reason"] == "not_found"
+    assert notes == ""
+
+
+def test_parse_bemerkung_skip_nichte():
+    _, idx = _register(("row_028", "Herbert", "Cram", None))
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_002", "Nichte von Herbert", idx
+    )
+    assert rels == []
+    assert unres == []
+    assert notes == "Nichte von Herbert"
+
+
+def test_parse_bemerkung_skip_bruder():
+    _, idx = _register(("row_028", "Herbert", "Cram", None))
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_033", "Bruder v Herbert", idx
+    )
+    assert rels == []
+    assert unres == []
+    assert notes == "Bruder v Herbert"
+
+
+def test_parse_bemerkung_empty():
+    _, idx = _register()
+    rels, unres, notes = persons_tree._parse_bemerkung("row_004", "", idx)
+    assert rels == [] and unres == [] and notes == ""
+
+
+def test_parse_bemerkung_plain_remark():
+    _, idx = _register()
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_029", "Verfasserin der Cram-Chronik !!", idx
+    )
+    assert rels == [] and unres == []
+    assert notes == "Verfasserin der Cram-Chronik !!"
+
+
+def test_parse_bemerkung_abbreviations_with_period():
+    _, idx = _register(
+        ("row_019", "Clara", "Cram", "de Gruyter"),
+        ("row_028", "Herbert", "Cram", None),
+    )
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_021", "Sohn v. Clara u. Herbert", idx
+    )
+    assert {r["personId"] for r in rels} == {"row_019", "row_028"}
+    assert {r["relatedPersonId"] for r in rels} == {"row_021"}
+    assert unres == []
+    assert notes == ""
+
+
+def test_parse_bemerkung_self_reference():
+    _, idx = _register(("row_028", "Herbert", "Cram", None))
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_028", "Vater v Herbert", idx
+    )
+    assert rels == []
+    assert len(unres) == 1
+    assert unres[0]["reason"] == "self_reference"
+    assert notes == ""