fix(normalizer): preserve trailing Bemerkung text after parent pattern

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 21:12:45 +02:00
parent ace41ad209
commit 34c40cb0ee
2 changed files with 20 additions and 2 deletions

View File

@@ -267,7 +267,13 @@ def _parse_bemerkung(
if not m: if not m:
continue continue
name_part = m.group(1).strip().rstrip("!., ") # Split the captured group on the first comma or semicolon to separate
# the name part from any trailing description (e.g. ", nach Mexiko emigriert")
raw_names, _, trailing = m.group(1).strip().partition(",")
if not trailing:
raw_names, _, trailing = raw_names.partition(";")
name_part = raw_names.strip().rstrip("!., ")
remainder = trailing.strip().lstrip(".,! ")
parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()] parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
rels: list[dict] = [] rels: list[dict] = []
unres: list[dict] = [] unres: list[dict] = []
@@ -300,7 +306,6 @@ def _parse_bemerkung(
"reason": reason, "reason": reason,
}) })
remainder = s[m.end():].strip().lstrip(".,! ")
return rels, unres, remainder return rels, unres, remainder
# No pattern matched — full text goes to notes, nothing to unresolved # No pattern matched — full text goes to notes, nothing to unresolved

View File

@@ -418,3 +418,16 @@ def test_parse_bemerkung_plain_remark():
) )
assert rels == [] and unres == [] assert rels == [] and unres == []
assert notes == "Verfasserin der Cram-Chronik !!" assert notes == "Verfasserin der Cram-Chronik !!"
def test_parse_bemerkung_sohn_with_trailing_remark():
    """Check that text after the parent pattern is preserved as notes.

    The remark names two registered persons and then continues, after a
    comma, with free text. The parser is expected to return two
    relationships, no unresolved entries, and exactly that free text as
    the remaining notes.
    """
    registered_rows = (
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    _, index = _register(*registered_rows)

    remark = "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_021", remark, index
    )

    # Both named parents must resolve against the registered rows.
    assert len(relations) == 2
    assert unresolved == []
    # Only the part after the comma may survive as free-text notes.
    assert leftover == "nach Mexiko emigriert"