feat(normalizer): add PARENT_OF Bemerkung extraction to persons_tree
This commit is contained in:
@@ -113,6 +113,7 @@ def _build_index(persons: list[dict]) -> dict[str, list[str]]:
|
||||
if maiden:
|
||||
_add(_norm_tree(f"{first} {maiden}"), row_id)
|
||||
_add(_norm_tree(last), row_id)
|
||||
_add(_norm_tree(first), row_id)
|
||||
|
||||
return index
|
||||
|
||||
@@ -242,3 +243,66 @@ def _resolve_spouses(
|
||||
})
|
||||
|
||||
return relationships, unresolved
|
||||
|
||||
|
||||
# Bemerkung patterns for parent/child remarks.  The linking word may be
# written out ("von") or abbreviated with or without a period ("v", "v.");
# everything after it on the same line is captured as the name list.
# "Sohn/Tochter v ..."  -> the current row is the child of the named person(s).
_CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on|\.)?\s+(.+)", re.I)
# "Vater/Mutter v ..."  -> the current row is the parent of the named person(s).
_PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on|\.)?\s+(.+)", re.I)
# Separator between several names: "u" or "und" surrounded by whitespace.
# "u." is intentionally NOT accepted here: it would also split a name that
# carries a middle initial "U.", producing two wrong lookups.
_AND_RE = re.compile(r"\s+u(?:nd)?\s+", re.I)
|
||||
|
||||
|
||||
def _parse_bemerkung(
    row_id: str, bemerkung: str, index: dict[str, list[str]]
) -> tuple[list[dict], list[dict], str]:
    """Extract PARENT_OF edges from a Bemerkung cell.

    Args:
        row_id: Row id of the person whose Bemerkung cell is being parsed.
        bemerkung: Raw cell text; empty, whitespace-only or ``None`` is
            treated as "nothing to parse".
        index: Name index that is handed through to ``_resolve_one``.

    Returns:
        ``(relationships, unresolved, remaining_notes)``.

        * ``relationships`` holds one PARENT_OF dict per name that
          ``_resolve_one`` resolved; ``personId`` is always the parent and
          ``relatedPersonId`` the child.
        * ``unresolved`` holds one dict per name that could not be resolved,
          carrying the reason reported by ``_resolve_one``.
        * ``remaining_notes`` is whatever follows the matched first line.
          Text that doesn't match a parent pattern, or that matches a
          keyword but names nobody (e.g. ``"Sohn v ."``), is returned here
          unchanged (only outer whitespace stripped) so it is never lost.
    """
    if not bemerkung or not bemerkung.strip():
        return [], [], ""

    s = bemerkung.strip()

    for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")):
        m = pattern.match(s)
        if not m:
            continue

        name_part = m.group(1).strip().rstrip("!., ")
        parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
        if not parts:
            # The keyword matched but only punctuation followed it.  There is
            # nothing to resolve, so keep the text as a plain note instead of
            # silently dropping it.
            break

        rels: list[dict] = []
        unres: list[dict] = []

        for part in parts:
            # Each name may carry its own trailing punctuation ("Elsgard A.").
            part = part.rstrip("!., ")
            matched_id, reason = _resolve_one(part, index)
            if not matched_id:
                unres.append({
                    "rowId": row_id,
                    "field": "bemerkung",
                    "raw": bemerkung,
                    "reason": reason,
                })
                continue

            # "Sohn/Tochter v X": X is the parent of this row.
            # "Vater/Mutter v X": this row is the parent of X.
            if direction == "child":
                parent_id, child_id = matched_id, row_id
            else:
                parent_id, child_id = row_id, matched_id
            rels.append({
                "personId": parent_id,
                "relatedPersonId": child_id,
                "type": "PARENT_OF",
                "source": "bemerkung",
                "rawBemerkung": bemerkung,
            })

        # The name capture stops at the end of the first line; anything on
        # later lines is handed back as free-text notes.
        remainder = s[m.end():].strip().lstrip(".,! ")
        return rels, unres, remainder

    # No usable parent pattern — full text goes to notes, nothing to unresolved
    return [], [], s
|
||||
|
||||
@@ -318,3 +318,103 @@ def test_resolve_spouses_empty_spouse_field():
|
||||
idx = persons_tree._build_index(persons)
|
||||
rels, unres = persons_tree._resolve_spouses(persons, idx)
|
||||
assert rels == [] and unres == []
|
||||
|
||||
|
||||
def _register(*args):
    """Build index from (rowId, first, last, maiden) tuples.

    Returns the list of person dicts together with the index that
    ``persons_tree._build_index`` produces for them.
    """
    people = []
    for entry in args:
        people.append(
            {
                "rowId": entry[0],
                "firstName": entry[1],
                "lastName": entry[2],
                "maidenName": entry[3],
            }
        )
    index = persons_tree._build_index(people)
    return people, index
|
||||
|
||||
|
||||
def test_parse_bemerkung_sohn_two_parents():
    """'Sohn v A u B' yields one PARENT_OF edge per named parent."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )

    edges, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_021", "Sohn v Clara u Herbert", index
    )

    assert len(edges) == 2
    for edge in edges:
        assert edge["type"] == "PARENT_OF"
    # The parsed row is the child on every edge ...
    assert {edge["relatedPersonId"] for edge in edges} == {"row_021"}
    # ... and both registered persons show up as parents.
    parents = {edge["personId"] for edge in edges}
    assert "row_019" in parents
    assert "row_028" in parents
    assert unresolved == []
    assert leftover == ""
|
||||
|
||||
|
||||
def test_parse_bemerkung_tochter_von():
    """'Tochter von <full name>' produces exactly one fully populated edge."""
    _, idx = _register(("row_019", "Clara", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_036", "Tochter von Clara Cram", idx
    )
    assert len(rels) == 1
    assert rels[0] == {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    # The single name resolved, so nothing may be reported as unresolved.
    assert unres == []
    assert notes == ""
|
||||
|
||||
|
||||
def test_parse_bemerkung_vater():
    """'Vater v X' makes the parsed row the parent of X."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert", idx
    )
    assert len(rels) == 1
    assert rels[0]["personId"] == "row_031"
    assert rels[0]["relatedPersonId"] == "row_028"
    assert rels[0]["type"] == "PARENT_OF"
    # The whole remark was consumed: nothing unresolved, nothing left as notes.
    assert unres == []
    assert notes == ""
|
||||
|
||||
|
||||
def test_parse_bemerkung_unmatched_parent_name():
    """A parent name missing from the index is reported as unresolved."""
    # No persons registered, so the index is empty.
    _, empty_index = _register()

    edges, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", empty_index
    )

    assert edges == []
    assert len(unresolved) == 1
    only_entry = unresolved[0]
    assert only_entry["reason"] == "not_found"
    assert leftover == ""
|
||||
|
||||
|
||||
def test_parse_bemerkung_skip_nichte():
    """'Nichte von ...' is not a parent pattern; the text is kept as notes."""
    remark = "Nichte von Herbert"
    _, index = _register(("row_028", "Herbert", "Cram", None))

    edges, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_002", remark, index
    )

    assert edges == []
    assert unresolved == []
    assert leftover == "Nichte von Herbert"
|
||||
|
||||
|
||||
def test_parse_bemerkung_skip_bruder():
    """'Bruder v ...' is not a parent pattern; the text is kept as notes."""
    remark = "Bruder v Herbert"
    _, index = _register(("row_028", "Herbert", "Cram", None))

    edges, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_033", remark, index
    )

    assert edges == []
    assert unresolved == []
    assert leftover == "Bruder v Herbert"
|
||||
|
||||
|
||||
def test_parse_bemerkung_empty():
    """An empty Bemerkung cell yields no edges, no unresolved, no notes."""
    _, empty_index = _register()
    outcome = persons_tree._parse_bemerkung("row_004", "", empty_index)
    edges, unresolved, leftover = outcome
    assert edges == []
    assert unresolved == []
    assert leftover == ""
|
||||
|
||||
|
||||
def test_parse_bemerkung_plain_remark():
    """A free-text remark passes through untouched, punctuation included."""
    _, empty_index = _register()

    edges, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_029", "Verfasserin der Cram-Chronik !!", empty_index
    )

    assert edges == []
    assert unresolved == []
    assert leftover == "Verfasserin der Cram-Chronik !!"
|
||||
|
||||
Reference in New Issue
Block a user