fix(normalizer): preserve trailing Bemerkung text after parent pattern
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -267,7 +267,13 @@ def _parse_bemerkung(
|
||||
if not m:
|
||||
continue
|
||||
|
||||
name_part = m.group(1).strip().rstrip("!., ")
|
||||
# Split the captured group on the first comma (or, when no text follows a comma, on the first semicolon) to separate
|
||||
# the name part from any trailing description (e.g. ", nach Mexiko emigriert")
|
||||
raw_names, _, trailing = m.group(1).strip().partition(",")
|
||||
if not trailing:
|
||||
raw_names, _, trailing = raw_names.partition(";")
|
||||
name_part = raw_names.strip().rstrip("!., ")
|
||||
remainder = trailing.strip().lstrip(".,! ")
|
||||
parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
|
||||
rels: list[dict] = []
|
||||
unres: list[dict] = []
|
||||
@@ -300,7 +306,6 @@ def _parse_bemerkung(
|
||||
"reason": reason,
|
||||
})
|
||||
|
||||
remainder = s[m.end():].strip().lstrip(".,! ")
|
||||
return rels, unres, remainder
|
||||
|
||||
# No pattern matched — full text goes to notes, nothing to unresolved
|
||||
|
||||
@@ -418,3 +418,16 @@ def test_parse_bemerkung_plain_remark():
|
||||
)
|
||||
assert rels == [] and unres == []
|
||||
assert notes == "Verfasserin der Cram-Chronik !!"
|
||||
|
||||
|
||||
def test_parse_bemerkung_sohn_with_trailing_remark():
    """Check that a remark following the names in a "Sohn v ..." note is kept.

    The note names two registered persons and then continues after a comma;
    the parser is expected to return two relations, no unresolved entries,
    and the text after the comma as the notes value.
    """
    known_people = (
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    _, person_index = _register(*known_people)

    bemerkung = "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_021", bemerkung, person_index
    )

    # Two names are joined by "u", so two relations are expected.
    assert len(relations) == 2
    assert unresolved == []
    # Only the text after the comma should survive as free-form notes.
    assert leftover == "nach Mexiko emigriert"
|
||||
|
||||
Reference in New Issue
Block a user