# Receiver-splitting helpers that the patch "feat(normalizer): receiver
# splitting" appends to tools/import-normalizer/persons.py.
# NOTE(review): the patch adds no imports, so `re` and the project `config`
# module are presumably already imported at the top of persons.py - confirm.

# Maiden-name clause such as ", geb. Müller". Everything from "geb" to the end
# of the string is removed.
# NOTE(review): because the match runs to the end of the string, receivers
# listed after a maiden-name clause ("A geb. X und B") are dropped as well.
_GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", re.I)
# Parenthesised shared surname at the very end: "Hedi und Tutu (Gruber)".
_PAREN_RE = re.compile(r"\(([^)]+)\)\s*$")
# Conjunction between receivers: "und", or its abbreviation "u" / "u.".
_MULTI_RE = re.compile(r"\s+(?:und|u\.?)\s+", re.I)


def find_known_last_name(segment: str):
    """Return the known surname that *segment* ends with, or ``None``.

    A surname from ``config.KNOWN_LAST_NAMES`` matches when the stripped
    segment is equal to it or ends with a space followed by it. The
    comparison is case-sensitive and the first match in list order wins.

    Examples (taken from the patch's tests, which imply that "de Gruyter"
    is a configured surname):
        "Eugenie de Gruyter" -> "de Gruyter"
        "Clara"              -> None
    """
    seg = segment.strip()
    # The original comment states the config list is ordered longest-first,
    # which lets a multi-word surname win over a shorter suffix of itself.
    for last_name in config.KNOWN_LAST_NAMES:
        if seg == last_name or seg.endswith(f" {last_name}"):
            return last_name
    return None


def split_receivers(raw: str) -> list[str]:
    """Split a raw receiver field into one name string per receiver.

    Processing steps:
      1. Empty or whitespace-only input yields ``[]``.
      2. ``"//"`` separates independent groups; each group is split on its
         own and the results are concatenated in order.
      3. A maiden-name clause ("geb. ...") is removed.
      4. If no "und" / "u" / "u." conjunction remains, the cleaned text is
         returned as a single receiver.
      5. Otherwise the text is split on the conjunction and the placeholder
         "Familie" is discarded. A trailing "(Surname)" is appended to every
         part that has no space in it. Without parentheses, a known surname
         detected on the last part is appended to earlier single-word parts.

    Examples (from the patch's tests):
        "Eugenie Müller"                 -> ["Eugenie Müller"]
        "Walter und Eugenie de Gruyter"  -> ["Walter de Gruyter",
                                             "Eugenie de Gruyter"]
        "Hedi und Tutu (Gruber)"         -> ["Hedi Gruber", "Tutu Gruber"]
        "Clara u Familie"                -> ["Clara"]
        "Eugenie de Gruyter geb. Müller" -> ["Eugenie de Gruyter"]
        "Herbert u Clara"                -> ["Herbert", "Clara"]

    Args:
        raw: The receiver text as found in the import data.

    Returns:
        The receiver names in input order; ``[]`` when nothing is left.
    """
    if not raw or not raw.strip():
        return []

    # Step 2: "//" separates independent receiver groups.
    if "//" in raw:
        names: list[str] = []
        for group in raw.split("//"):
            names.extend(split_receivers(group))
        return names

    cleaned = _GEB_RE.sub("", raw).strip()
    if not cleaned:
        # The field held nothing but a maiden-name clause; do not report an
        # empty string as a receiver.
        return []
    if not _MULTI_RE.search(cleaned):
        return [cleaned]

    shared_last = None
    paren = _PAREN_RE.search(cleaned)
    if paren:
        shared_last = paren.group(1).strip()
        cleaned = cleaned[:paren.start()].strip()

    parts = [part.strip() for part in _MULTI_RE.split(cleaned)]
    parts = [part for part in parts if part and part.lower() != "familie"]
    if not parts:
        return []

    if shared_last:
        # Applied before the single-part shortcut so that
        # "Clara u Familie (Gruber)" keeps its surname.
        return [
            part if " " in part else f"{part} {shared_last}"
            for part in parts
        ]
    if len(parts) == 1:
        return parts

    *leading, last_seg = parts
    detected = find_known_last_name(last_seg)
    if not detected:
        return parts

    # Propagate the surname of the last receiver to earlier bare first names,
    # but leave alone a part that is itself a known surname.
    result = [
        f"{part} {detected}"
        if " " not in part and find_known_last_name(part) is None
        else part
        for part in leading
    ]
    result.append(last_seg)
    return result