feat(normalizer): receiver splitting
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -87,3 +87,55 @@ def parse_register(rows: list[dict]) -> list[Person]:
|
||||
seen[p.person_id] = seen.get(p.person_id, 0) + 1
|
||||
p.person_id = f"{p.person_id}-{seen[p.person_id]}"
|
||||
return people
|
||||
|
||||
|
||||
# Trailing "geb"/"geb." note: an optional comma, the marker itself, and
# everything that follows it up to the end of the string (case-insensitive).
_GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", flags=re.IGNORECASE)
# Parenthesised group at the very end of the string; group(1) is its content.
_PAREN_RE = re.compile(r"\(([^)]+)\)\s*$")
# Whitespace-delimited conjunction "und" or its short form "u"
# (case-insensitive); used to detect and split multi-person entries.
_MULTI_RE = re.compile(r"\s+(?:und|u)\s+", flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def find_known_last_name(segment: str):
    """Return the configured last name that *segment* is or ends with.

    The segment is stripped of surrounding whitespace and checked against
    each entry of ``config.KNOWN_LAST_NAMES`` in list order. An entry matches
    when it equals the whole segment, or when the segment ends with a space
    followed by that entry.

    Returns:
        The first matching entry, or ``None`` when no entry matches.
    """
    text = segment.strip()
    candidates = (
        name
        for name in config.KNOWN_LAST_NAMES  # config lists longest-first
        if text == name or text.endswith(f" {name}")
    )
    # First hit wins, so the list order decides between overlapping names.
    return next(candidates, None)
|
||||
|
||||
|
||||
def split_receivers(raw: str) -> list[str]:
    """Split a raw receiver entry into one name string per person.

    Processing steps:

    0. ``//`` separates independent entries; each side is split recursively
       and the results are concatenated in order.
    1. A trailing "geb ..." note (presumably a maiden name) is removed via
       ``_GEB_RE``.
    2. Without an "und"/"u" conjunction the cleaned entry is returned as a
       single name.
    3. Otherwise the entry is split on the conjunction. Parts equal to
       "familie" (case-insensitive) and empty parts are dropped.
    4. A trailing parenthesised group is treated as a shared last name and
       appended to every part that consists of a single word.
    5. Failing that, if the last part ends with a name from
       ``config.KNOWN_LAST_NAMES``, that name is appended to every earlier
       single-word part that is not itself a known last name.

    Args:
        raw: The unprocessed receiver text; may be empty or ``None``.

    Returns:
        The individual receiver names, or an empty list when nothing usable
        remains. The list never contains empty strings.
    """
    if not raw or not raw.strip():
        return []

    # 0. split on "//"
    if "//" in raw:
        return [
            name
            for segment in raw.split("//")
            for name in split_receivers(segment)
        ]

    # NOTE(review): _GEB_RE is greedy to the end of the string, so a later
    # "und ..." receiver following a "geb. ..." note is removed as well --
    # confirm this is intended for the data being parsed.
    cleaned = _GEB_RE.sub("", raw).strip()
    if not cleaned:
        # The entry consisted solely of a "geb. ..." note. Returning [""]
        # here would contradict the empty-input contract above.
        return []
    if not _MULTI_RE.search(cleaned):
        return [cleaned]

    shared_last = None
    paren_match = _PAREN_RE.search(cleaned)
    if paren_match:
        shared_last = paren_match.group(1).strip()
        cleaned = cleaned[:paren_match.start()].strip()

    stripped = (piece.strip() for piece in _MULTI_RE.split(cleaned))
    parts = [p for p in stripped if p and p.lower() != "familie"]
    if not parts:
        return []
    if len(parts) == 1:
        # NOTE(review): a parenthesised shared last name is not applied on
        # this path -- verify whether that is intended.
        return parts

    if shared_last:
        return [p if " " in p else f"{p} {shared_last}" for p in parts]

    last_seg = parts[-1]
    detected = find_known_last_name(last_seg)
    if not detected:
        return parts

    # Earlier single-word parts inherit the last name found on the final
    # part, unless that single word is itself a known last name.
    result = [
        f"{p} {detected}"
        if " " not in p and find_known_last_name(p) is None
        else p
        for p in parts[:-1]
    ]
    result.append(last_seg)
    return result
|
||||
|
||||
Reference in New Issue
Block a user