feat(normalizer): person resolution context + to_canonical
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -195,3 +195,78 @@ class AliasIndex:
|
||||
return None, 0.0
|
||||
score = difflib.SequenceMatcher(None, _norm(name), match[0]).ratio()
|
||||
return self._by_alias[match[0]], score
|
||||
|
||||
|
||||
class ResolutionContext:
    """Resolves raw name strings to person ids; accumulates provisional persons and review data.

    State collected while resolving (all readable by the caller afterwards):

    * ``provisional``   -- generated id -> provisional ``Person`` created for an unmatched name
    * ``unmatched``     -- stripped unmatched name -> list of every source row it appeared on
    * ``ambiguous``     -- ``(raw_field, part, source_row)`` tuples flagged by ``resolve_receivers``
    * ``override_hits`` -- how many lookups were answered by ``name_overrides``
    """

    def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str]):
        """Store the alias index and the exact-name -> person-id override map; start empty."""
        self.index = alias_index
        self.name_overrides = name_overrides
        self.provisional: dict[str, Person] = {}
        self.unmatched: dict[str, list] = {}
        self.ambiguous: list[tuple] = []
        # Stripped raw name -> provisional id, so a repeated unmatched name reuses one person.
        self._raw_to_pid: dict[str, str] = {}
        self.override_hits = 0

    def _unique_id(self, base: str) -> str:
        """Return ``base``, or ``base-2``, ``base-3``, ... -- the first candidate not yet taken.

        A provisional id must never collide with an id in ``index.known_ids`` or with
        another provisional id already created by this context.
        """
        # Test membership directly instead of building a merged set on every call;
        # the outcome is the same, but the cost no longer grows with the id count.
        known_ids = self.index.known_ids
        provisional = self.provisional
        pid, n = base, 1
        while pid in known_ids or pid in provisional:
            n += 1
            pid = f"{base}-{n}"
        return pid

    def resolve_one(self, raw_name: str, source_row: int) -> tuple[str, str, bool]:
        """Return (person_id, display_name, matched: bool). '' name -> ('', '', True).

        Lookup order: exact hit in ``name_overrides`` (counted in ``override_hits``), then
        ``index.resolve``. For both, the display name is ``index.display(pid)`` and falls
        back to the stripped input when that is falsy.

        Anything else is unmatched: ``source_row`` is recorded under the name in
        ``unmatched`` (on every occurrence), and a provisional ``Person`` is created the
        first time the name is seen and reused afterwards. Unmatched results return the
        stripped input as display name and ``matched=False``.
        """
        name = (raw_name or "").strip()
        if not name:
            return "", "", True

        if name in self.name_overrides:
            self.override_hits += 1
            pid = self.name_overrides[name]
            return pid, self.index.display(pid) or name, True

        pid = self.index.resolve(name)
        if pid:
            return pid, self.index.display(pid) or name, True

        # provisional person (unmatched) -- never reuse a register id
        self.unmatched.setdefault(name, []).append(source_row)
        if name in self._raw_to_pid:
            return self._raw_to_pid[name], name, False

        last, first = _last_first(name)
        pid = self._unique_id(slugify(last, first))
        self.provisional[pid] = Person(
            person_id=pid, last_name=last, first_name=first, provisional=True
        )
        self._raw_to_pid[name] = pid
        return pid, name, False

    def resolve_sender(self, raw: str, source_row: int) -> tuple[str, str, bool, bool]:
        """Senders are split like receivers (REQ-PERS-01). Primary = first part; multi flagged.

        Returns ``(person_id, display_name, matched, multi)``, where ``multi`` is True when
        ``split_receivers`` produced more than one part. When it produces no parts the
        result is ``('', '', True, False)``.
        """
        parts = split_receivers(raw)
        if not parts:
            return "", "", True, False

        pid, name, matched = self.resolve_one(parts[0], source_row)
        for extra in parts[1:]:
            # Result is discarded on purpose: register the others as persons too.
            self.resolve_one(extra, source_row)
        return pid, name, matched, len(parts) > 1

    def resolve_receivers(self, raw: str, source_row: int) -> list[tuple[str, str, bool]]:
        """Resolve every part of a receiver field; return one (pid, name, matched) per part.

        An unmatched part is additionally appended to ``ambiguous`` as
        ``(raw, part, source_row)`` when it contains a space, ``find_known_last_name``
        finds nothing in it, and it splits into exactly two tokens.
        """
        results = []
        for part in split_receivers(raw):
            pid, name, matched = self.resolve_one(part, source_row)
            if (
                not matched
                and " " in part
                and find_known_last_name(part) is None
                and len(part.split()) == 2
            ):
                self.ambiguous.append((raw, part, source_row))
            results.append((pid, name, matched))
        return results
|
||||
|
||||
|
||||
def _last_first(name: str):
    """Split a free-form name into ``(last, first)`` for slug/provisional building (best effort).

    If ``find_known_last_name`` recognises a last name, that value is returned as the
    last name and the first name is the input with that many trailing characters cut off.
    Otherwise the final whitespace-separated token is the last name and the rest is the
    first name; an input with fewer than two tokens is returned whole with an empty
    first name.
    """
    cleaned = name.strip()

    known_last = find_known_last_name(cleaned)
    if known_last:
        # NOTE(review): this cuts len(known_last) characters off the END of the input,
        # i.e. it assumes the recognised last name is a suffix -- confirm against
        # find_known_last_name.
        return known_last, cleaned[: -len(known_last)].strip()

    words = cleaned.split()
    if len(words) < 2:
        return cleaned, ""

    *given, family = words
    return family, " ".join(given)
|
||||
|
||||
Reference in New Issue
Block a user