feat(normalizer): classify_name + NameClass
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ import re
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from enum import StrEnum
|
||||
|
||||
import config
|
||||
import dates
|
||||
@@ -148,6 +149,54 @@ def _norm(name: str) -> str:
|
||||
return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip()
|
||||
|
||||
|
||||
class NameClass(StrEnum):
|
||||
RESOLVABLE = "resolvable"
|
||||
UNKNOWN = "unknown"
|
||||
SINGLE_TOKEN = "single_token"
|
||||
RELATIONAL = "relational"
|
||||
COLLECTIVE = "collective"
|
||||
PROSE = "prose"
|
||||
AMBIGUOUS_PAIR = "ambiguous_pair"
|
||||
|
||||
|
||||
_QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019"
|
||||
|
||||
|
||||
def classify_name(raw: str, given_names: set[str]) -> NameClass:
|
||||
"""Classify a (post-split) sender/receiver string by why it may be unresolvable.
|
||||
|
||||
Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL ->
|
||||
SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE.
|
||||
"""
|
||||
s = raw.strip()
|
||||
if not s:
|
||||
return NameClass.RESOLVABLE
|
||||
low = s.lower()
|
||||
tokens = s.split()
|
||||
# alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms
|
||||
# are matched as whole words (no substring/prefix false positives like "Allerton").
|
||||
alpha_words = re.findall(r"[a-zäöüß]+", low)
|
||||
if "?" in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS):
|
||||
return NameClass.UNKNOWN
|
||||
if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s)
|
||||
or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3):
|
||||
return NameClass.PROSE
|
||||
if any(w in config.COLLECTIVE_TERMS for w in alpha_words):
|
||||
return NameClass.COLLECTIVE
|
||||
if any(w in config.RELATIONAL_TERMS for w in alpha_words):
|
||||
return NameClass.RELATIONAL
|
||||
if len(tokens) == 1:
|
||||
return NameClass.SINGLE_TOKEN
|
||||
if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens):
|
||||
return NameClass.AMBIGUOUS_PAIR
|
||||
return NameClass.RESOLVABLE
|
||||
|
||||
|
||||
# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is
|
||||
# classified PROSE. Such multi-particle names are rare here and usually resolve via the
|
||||
# register; if they surface in review, lower-priority than the real prose entries.
|
||||
|
||||
|
||||
class AliasIndex:
|
||||
def __init__(self, people: list[Person]):
|
||||
self._by_alias: dict[str, str] = {}
|
||||
|
||||
Reference in New Issue
Block a user