From 34c40cb0ee2637319bc1d60955fabcfb8c10dd28 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 25 May 2026 21:12:45 +0200 Subject: [PATCH] fix(normalizer): preserve trailing Bemerkung text after parent pattern Co-Authored-By: Claude Sonnet 4.6 --- tools/import-normalizer/persons_tree.py | 9 +++++++-- tools/import-normalizer/tests/test_persons_tree.py | 13 +++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py index 6d10fc97..74bf01da 100644 --- a/tools/import-normalizer/persons_tree.py +++ b/tools/import-normalizer/persons_tree.py @@ -267,7 +267,13 @@ def _parse_bemerkung( if not m: continue - name_part = m.group(1).strip().rstrip("!., ") + # Split the captured group on the first comma or semicolon to separate + # the name part from any trailing description (e.g. ", nach Mexiko emigriert") + raw_names, _, trailing = m.group(1).strip().partition(",") + if not trailing: + raw_names, _, trailing = raw_names.partition(";") + name_part = raw_names.strip().rstrip("!., ") + remainder = trailing.strip().lstrip(".,! ") parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()] rels: list[dict] = [] unres: list[dict] = [] @@ -300,7 +306,6 @@ def _parse_bemerkung( "reason": reason, }) - remainder = s[m.end():].strip().lstrip(".,! ") return rels, unres, remainder # No pattern matched — full text goes to notes, nothing to unresolved diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py index d73eee94..7970a172 100644 --- a/tools/import-normalizer/tests/test_persons_tree.py +++ b/tools/import-normalizer/tests/test_persons_tree.py @@ -418,3 +418,16 @@ def test_parse_bemerkung_plain_remark(): ) assert rels == [] and unres == [] assert notes == "Verfasserin der Cram-Chronik !!" 
+
+
+def test_parse_bemerkung_sohn_with_trailing_remark():
+    """Text after the comma that follows the names is returned as the note."""
+    _, index = _register(
+        ("row_019", "Clara", "Cram", "de Gruyter"),
+        ("row_028", "Herbert", "Cram", None),
+    )
+    remark = "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert"
+    found, open_refs, note = persons_tree._parse_bemerkung("row_021", remark, index)
+    assert len(found) == 2
+    assert open_refs == []
+    assert note == "nach Mexiko emigriert"