feat(normalizer): receiver splitting

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 13:59:51 +02:00
parent b7a2332861
commit a177077b40
2 changed files with 65 additions and 0 deletions

View File

@@ -87,3 +87,55 @@ def parse_register(rows: list[dict]) -> list[Person]:
seen[p.person_id] = seen.get(p.person_id, 0) + 1 seen[p.person_id] = seen.get(p.person_id, 0) + 1
p.person_id = f"{p.person_id}-{seen[p.person_id]}" p.person_id = f"{p.person_id}-{seen[p.person_id]}"
return people return people
# Trailing birth-name suffix: optional comma, optional whitespace, "geb" with an
# optional period, then everything up to the end of the string
# (e.g. "... geb. Müller"). Case-insensitive.
_GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", re.I)
# Parenthesised group at the very end of the string; group 1 is the text inside
# the parentheses. split_receivers treats it as a last name shared by all
# receivers in the segment.
_PAREN_RE = re.compile(r"\(([^)]+)\)\s*$")
# Connector between several receivers: "und" or the abbreviation "u",
# surrounded by whitespace on both sides. Case-insensitive.
# NOTE(review): "u." (with a period) is not matched by this pattern — confirm
# whether that spelling occurs in the data.
_MULTI_RE = re.compile(r"\s+(?:und|u)\s+", re.I)
def find_known_last_name(segment: str):
    """Return the known last name that *segment* ends with, or ``None``.

    A name from ``config.KNOWN_LAST_NAMES`` matches when the stripped segment
    is exactly that name or ends with a space followed by that name.  When
    several names match (e.g. "Gruyter" and "de Gruyter"), the longest one is
    returned.
    """
    seg = segment.strip()
    matches = [
        ln
        for ln in config.KNOWN_LAST_NAMES
        if seg == ln or seg.endswith(" " + ln)
    ]
    # Every match is a suffix of ``seg``, so the longest one is unique.  Picking
    # it explicitly gives the same answer as "first hit" on a longest-first
    # config, but no longer depends on that ordering.
    return max(matches, key=len, default=None)
def split_receivers(raw: str) -> list[str]:
    """Split a raw receiver string into individual receiver names.

    Processing steps:

    * Blank or empty input yields ``[]``.
    * ``//`` separates independent segments; each is processed on its own and
      the results are concatenated in order.
    * A trailing ``geb. ...`` suffix is removed.
    * Without an "und"/"u" connector the cleaned text is returned as a single
      receiver.
    * With a connector the text is split into parts; the placeholder
      "Familie" (any case) is dropped.
    * A trailing ``(Last)`` group supplies a last name that is appended to
      every part that has no space in it.
    * Otherwise, if the last part ends with a known last name, that name is
      appended to every earlier part that is a single word and not itself a
      known last name.

    Args:
        raw: The unprocessed receiver text.

    Returns:
        The receiver names in their original order; ``[]`` if nothing usable
        remains.
    """
    if not raw or not raw.strip():
        return []

    # 0. "//" separates fully independent receiver groups.
    if "//" in raw:
        return [
            name
            for segment in raw.split("//")
            for name in split_receivers(segment)
        ]

    cleaned = _GEB_RE.sub("", raw).strip()
    if not cleaned:
        # The whole input was a "geb. ..." suffix; do not emit an empty name.
        return []
    if not _MULTI_RE.search(cleaned):
        return [cleaned]

    shared_last = None
    paren = _PAREN_RE.search(cleaned)
    if paren:
        shared_last = paren.group(1).strip()
        cleaned = cleaned[:paren.start()].strip()

    parts = [p.strip() for p in _MULTI_RE.split(cleaned)]
    parts = [p for p in parts if p and p.lower() != "familie"]
    if not parts:
        return []

    # Apply the parenthesised last name before the single-part shortcut:
    # the parentheses were already cut from ``cleaned``, so returning early
    # would silently lose the name (e.g. "Clara u Familie (Gruber)").
    if shared_last:
        return [p if " " in p else f"{p} {shared_last}" for p in parts]
    if len(parts) == 1:
        return parts

    last_seg = parts[-1]
    detected = find_known_last_name(last_seg)
    if not detected:
        return parts

    # Only bare first names inherit the last name of the final part; parts that
    # already contain a space or are themselves a known last name stay as-is.
    leading = [
        f"{p} {detected}"
        if " " not in p and find_known_last_name(p) is None
        else p
        for p in parts[:-1]
    ]
    return [*leading, last_seg]

View File

@@ -37,3 +37,16 @@ def test_parse_register_dedups_colliding_ids():
ids = [p.person_id for p in people] ids = [p.person_id for p in people]
assert ids == ["cram-hans-1", "cram-hans-2"] assert ids == ["cram-hans-1", "cram-hans-2"]
assert len(set(ids)) == 2 assert len(set(ids)) == 2
def test_split_receivers():
    """Exercise split_receivers on single, joined, and '//'-separated input."""
    # Single receiver, nothing to split.
    assert persons.split_receivers("Eugenie Müller") == ["Eugenie Müller"]
    # Known last name of the final part is propagated to the bare first name.
    assert persons.split_receivers("Walter und Eugenie de Gruyter") == ["Walter de Gruyter", "Eugenie de Gruyter"]
    # Parenthesised last name is shared by all parts.
    assert persons.split_receivers("Hedi und Tutu (Gruber)") == ["Hedi Gruber", "Tutu Gruber"]
    # The "Familie" placeholder is dropped.
    assert persons.split_receivers("Clara u Familie") == ["Clara"]
    # Trailing "geb. ..." suffix is removed.
    assert persons.split_receivers("Eugenie de Gruyter geb. Müller") == ["Eugenie de Gruyter"]
    # No known last name: parts are returned unchanged.
    assert persons.split_receivers("Herbert u Clara") == ["Herbert", "Clara"]
    # Empty and whitespace-only input.
    assert persons.split_receivers("") == []
    assert persons.split_receivers("   ") == []
    # "//" separates independent groups; each group is processed on its own.
    assert persons.split_receivers("Hans Müller // Clara Gruber") == ["Hans Müller", "Clara Gruber"]
    assert persons.split_receivers("Hedi und Tutu (Gruber) // Eugenie Müller") == ["Hedi Gruber", "Tutu Gruber", "Eugenie Müller"]
def test_find_known_last_name():
    """Check suffix match, exact match, whitespace handling, and the miss case."""
    # Known last name as the suffix of a full name.
    assert persons.find_known_last_name("Eugenie de Gruyter") == "de Gruyter"
    # Segment that is exactly the known last name.
    assert persons.find_known_last_name("de Gruyter") == "de Gruyter"
    # Surrounding whitespace is ignored.
    assert persons.find_known_last_name("  Eugenie de Gruyter ") == "de Gruyter"
    # No known last name present.
    assert persons.find_known_last_name("Clara") is None