feat(normalizer): add PARENT_OF Bemerkung extraction to persons_tree

This commit is contained in:
Marcel
2026-05-25 21:06:24 +02:00
parent fa4b6b5fc2
commit 6f55489ec2
2 changed files with 164 additions and 0 deletions

View File

@@ -113,6 +113,7 @@ def _build_index(persons: list[dict]) -> dict[str, list[str]]:
if maiden:
_add(_norm_tree(f"{first} {maiden}"), row_id)
_add(_norm_tree(last), row_id)
_add(_norm_tree(first), row_id)
return index
@@ -242,3 +243,66 @@ def _resolve_spouses(
})
return relationships, unresolved
_CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on)?\s+(.+)", re.I)
_PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on)?\s+(.+)", re.I)
_AND_RE = re.compile(r"\s+u(?:nd)?\s+", re.I)
def _parse_bemerkung(
row_id: str, bemerkung: str, index: dict[str, list[str]]
) -> tuple[list[dict], list[dict], str]:
"""Extract PARENT_OF edges from a Bemerkung cell.
Returns (relationships, unresolved, remaining_notes).
Text that doesn't match a parent pattern goes to remaining_notes unchanged.
"""
if not bemerkung or not bemerkung.strip():
return [], [], ""
s = bemerkung.strip()
for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")):
m = pattern.match(s)
if not m:
continue
name_part = m.group(1).strip().rstrip("!., ")
parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
rels: list[dict] = []
unres: list[dict] = []
for part in parts:
part = part.rstrip("!., ")
matched_id, reason = _resolve_one(part, index)
if matched_id:
if direction == "child":
rels.append({
"personId": matched_id,
"relatedPersonId": row_id,
"type": "PARENT_OF",
"source": "bemerkung",
"rawBemerkung": bemerkung,
})
else:
rels.append({
"personId": row_id,
"relatedPersonId": matched_id,
"type": "PARENT_OF",
"source": "bemerkung",
"rawBemerkung": bemerkung,
})
else:
unres.append({
"rowId": row_id,
"field": "bemerkung",
"raw": bemerkung,
"reason": reason,
})
remainder = s[m.end():].strip().lstrip(".,! ")
return rels, unres, remainder
# No pattern matched — full text goes to notes, nothing to unresolved
return [], [], s

View File

@@ -318,3 +318,103 @@ def test_resolve_spouses_empty_spouse_field():
idx = persons_tree._build_index(persons)
rels, unres = persons_tree._resolve_spouses(persons, idx)
assert rels == [] and unres == []
def _register(*args):
    """Turn (rowId, first, last, maiden) tuples into (persons, index).

    Each tuple becomes a person dict; the index is built from those dicts by
    ``persons_tree._build_index``.
    """
    field_names = ("rowId", "firstName", "lastName", "maidenName")
    people = []
    for entry in args:
        people.append({key: entry[pos] for pos, key in enumerate(field_names)})
    name_index = persons_tree._build_index(people)
    return people, name_index
def test_parse_bemerkung_sohn_two_parents():
    """'Sohn v A u B' produces one PARENT_OF edge per named parent."""
    index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )[1]
    edges, unresolved, remaining = persons_tree._parse_bemerkung(
        "row_021", "Sohn v Clara u Herbert", index
    )
    assert len(edges) == 2
    for edge in edges:
        assert edge["type"] == "PARENT_OF"
    # The row carrying the remark is the child on every edge.
    assert {edge["relatedPersonId"] for edge in edges} == {"row_021"}
    parents = {edge["personId"] for edge in edges}
    assert "row_019" in parents
    assert "row_028" in parents
    assert unresolved == []
    assert remaining == ""
def test_parse_bemerkung_tochter_von():
    """'Tochter von <full name>' yields exactly one fully populated edge."""
    remark = "Tochter von Clara Cram"
    index = _register(("row_019", "Clara", "Cram", None))[1]
    edges, _unresolved, remaining = persons_tree._parse_bemerkung(
        "row_036", remark, index
    )
    expected_edge = {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert len(edges) == 1
    assert edges[0] == expected_edge
    assert remaining == ""
def test_parse_bemerkung_vater():
    """'Vater v X' makes the row itself the parent of X."""
    index = _register(("row_028", "Herbert", "Cram", None))[1]
    edges, _unresolved, _remaining = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert", index
    )
    assert len(edges) == 1
    only_edge = edges[0]
    assert only_edge["personId"] == "row_031"
    assert only_edge["relatedPersonId"] == "row_028"
    assert only_edge["type"] == "PARENT_OF"
def test_parse_bemerkung_unmatched_parent_name():
    """A parent name missing from the index is reported as unresolved."""
    empty_index = _register()[1]  # no persons registered
    edges, unresolved, remaining = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", empty_index
    )
    assert edges == []
    assert len(unresolved) == 1
    assert unresolved[0]["reason"] == "not_found"
    assert remaining == ""
def test_parse_bemerkung_skip_nichte():
    """'Nichte von ...' is not a parent pattern and stays a plain note."""
    index = _register(("row_028", "Herbert", "Cram", None))[1]
    edges, unresolved, remaining = persons_tree._parse_bemerkung(
        "row_002", "Nichte von Herbert", index
    )
    assert edges == []
    assert unresolved == []
    assert remaining == "Nichte von Herbert"
def test_parse_bemerkung_skip_bruder():
    """'Bruder v ...' is not a parent pattern and stays a plain note."""
    index = _register(("row_028", "Herbert", "Cram", None))[1]
    edges, unresolved, remaining = persons_tree._parse_bemerkung(
        "row_033", "Bruder v Herbert", index
    )
    assert edges == []
    assert unresolved == []
    assert remaining == "Bruder v Herbert"
def test_parse_bemerkung_empty():
    """An empty remark yields no edges, no unresolved entries and no notes."""
    empty_index = _register()[1]
    edges, unresolved, remaining = persons_tree._parse_bemerkung(
        "row_004", "", empty_index
    )
    assert edges == []
    assert unresolved == []
    assert remaining == ""
def test_parse_bemerkung_plain_remark():
    """Free text without a relationship keyword is passed through as a note."""
    empty_index = _register()[1]
    edges, unresolved, remaining = persons_tree._parse_bemerkung(
        "row_029", "Verfasserin der Cram-Chronik !!", empty_index
    )
    assert edges == []
    assert unresolved == []
    assert remaining == "Verfasserin der Cram-Chronik !!"