diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py
index 6d10fc97..74bf01da 100644
--- a/tools/import-normalizer/persons_tree.py
+++ b/tools/import-normalizer/persons_tree.py
@@ -267,7 +267,20 @@ def _parse_bemerkung(
         if not m:
             continue
 
-        name_part = m.group(1).strip().rstrip("!., ")
+        # Cut the captured group at whichever of "," or ";" comes first so a
+        # trailing description (e.g. ", nach Mexiko emigriert") is kept out of
+        # the name part.
+        captured = m.group(1)
+        hits = [i for i in (captured.find(","), captured.find(";")) if i != -1]
+        if hits:
+            cut = min(hits)
+            name_part = captured[:cut].strip().rstrip("!., ")
+            # Everything after the separator is the note, including any text
+            # that follows the match itself, so nothing from the remark is lost.
+            remainder = s[m.start(1) + cut + 1:].strip().lstrip(".,! ")
+        else:
+            name_part = captured.strip().rstrip("!., ")
+            remainder = s[m.end():].strip().lstrip(".,! ")
         parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
         rels: list[dict] = []
         unres: list[dict] = []
@@ -300,7 +313,6 @@ def _parse_bemerkung(
                     "reason": reason,
                 })
 
-        remainder = s[m.end():].strip().lstrip(".,! ")
         return rels, unres, remainder
 
     # No pattern matched — full text goes to notes, nothing to unresolved
diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py
index d73eee94..7970a172 100644
--- a/tools/import-normalizer/tests/test_persons_tree.py
+++ b/tools/import-normalizer/tests/test_persons_tree.py
@@ -418,3 +418,16 @@ def test_parse_bemerkung_plain_remark():
     )
     assert rels == [] and unres == []
     assert notes == "Verfasserin der Cram-Chronik !!"
+
+
+def test_parse_bemerkung_sohn_with_trailing_remark():
+    _, idx = _register(
+        ("row_019", "Clara", "Cram", "de Gruyter"),
+        ("row_028", "Herbert", "Cram", None),
+    )
+    rels, unres, notes = persons_tree._parse_bemerkung(
+        "row_021", "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert", idx
+    )
+    assert len(rels) == 2
+    assert unres == []
+    assert notes == "nach Mexiko emigriert"