feat(normalizer): person resolution context + to_canonical

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 14:18:09 +02:00
parent 3066d3d3ff
commit 88c8063227
3 changed files with 166 additions and 0 deletions
--- a/tools/import-normalizer/documents.py
+++ b/tools/import-normalizer/documents.py
@@ -2,6 +2,8 @@
 from dataclasses import dataclass, field
 from enum import Enum, auto

+import dates as _dates
+

 class Triage(Enum):
    OK = auto()
@@ -84,3 +86,33 @@ def index_file_mismatch(index: str, file_path: str) -> bool:
    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    stem = basename.rsplit(".", 1)[0]
    return stem != index
+
+
+def to_canonical(raw, ctx, date_overrides: dict) -> CanonicalDocument:
+    """Build a CanonicalDocument from one raw row, collecting review flags.
+
+    The date is parsed with ``_dates.parse_date``; sender and receivers are
+    resolved through ``ctx`` (sender first, then receivers). Flags land in
+    ``needs_review`` in a fixed order: ``unmatched_sender``, ``multi_sender``,
+    ``unmatched_receiver``, ``unparsed_date``, ``index_file_mismatch``.
+    """
+    parsed = _dates.parse_date(raw.date, date_overrides)
+
+    sender_id, sender_name, sender_ok, sender_is_multi = ctx.resolve_sender(
+        raw.sender, raw.source_row
+    )
+    # A blank sender is not an "unmatched" one; only a non-blank string counts.
+    sender_unmatched = bool(raw.sender.strip()) and not sender_ok
+
+    resolved = ctx.resolve_receivers(raw.receivers, raw.source_row)
+    receiver_unmatched = not all(ok for _, _, ok in resolved)
+
+    # Likewise, a blank date is not reported as unparsed.
+    date_unparsed = bool(raw.date.strip()) and parsed.precision == _dates.Precision.UNKNOWN
+    file_mismatch = index_file_mismatch(raw.index, raw.file)
+
+    checks = [
+        ("unmatched_sender", sender_unmatched),
+        ("multi_sender", sender_is_multi),
+        ("unmatched_receiver", receiver_unmatched),
+        ("unparsed_date", date_unparsed),
+        ("index_file_mismatch", file_mismatch),
+    ]
+    review_flags = [label for label, raised in checks if raised]
+
+    return CanonicalDocument(
+        index=raw.index,
+        box=raw.box,
+        folder=raw.folder,
+        sender_person_id=sender_id,
+        sender_name=sender_name,
+        receiver_person_ids=[pid for pid, _, _ in resolved],
+        receiver_names=[shown for _, shown, _ in resolved],
+        date_iso=parsed.iso or "",
+        date_raw=raw.date,
+        # NOTE(review): str() of a plain Enum member renders as "Precision.DAY",
+        # not "DAY" — confirm Precision defines __str__ or is a StrEnum.
+        date_precision=str(parsed.precision),
+        location=raw.location,
+        tags=[raw.tags] if raw.tags else [],
+        summary=raw.summary,
+        source_row=raw.source_row,
+        needs_review=review_flags,
+    )
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -195,3 +195,78 @@ class AliasIndex:
            return None, 0.0
        score = difflib.SequenceMatcher(None, _norm(name), match[0]).ratio()
        return self._by_alias[match[0]], score
+
+
+class ResolutionContext:
+    """Maps raw name strings to person ids and records what could not be matched.
+
+    Besides resolving names, an instance accumulates:
+    provisional persons created for unmatched names (``provisional``), the
+    source rows on which each unmatched name occurred (``unmatched``),
+    receiver parts flagged as ambiguous (``ambiguous``) and the number of
+    times a manual override was applied (``override_hits``).
+    """
+
+    def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str]):
+        self.index = alias_index
+        self.name_overrides = name_overrides
+        self.override_hits = 0
+        # provisional person id -> Person created for an unmatched name
+        self.provisional: dict[str, Person] = {}
+        # unmatched (stripped) name -> source rows where it appeared
+        self.unmatched: dict[str, list] = {}
+        # (raw receivers string, part, source row) triples
+        self.ambiguous: list[tuple] = []
+        # unmatched (stripped) name -> provisional id already assigned to it
+        self._raw_to_pid: dict[str, str] = {}
+
+    def _unique_id(self, base: str) -> str:
+        """Return ``base`` or, if taken, the first free ``base-2``, ``base-3``, ...
+
+        "Taken" means present in the alias index's known ids or already used
+        by another provisional person.
+        """
+        taken = self.index.known_ids | set(self.provisional)
+        candidate, suffix = base, 2
+        while candidate in taken:
+            candidate = f"{base}-{suffix}"
+            suffix += 1
+        return candidate
+
+    def _as_matched(self, pid: str, fallback_name: str):
+        """Build the result triple for a matched id, preferring the index's display name."""
+        return pid, self.index.display(pid) or fallback_name, True
+
+    def resolve_one(self, raw_name: str, source_row: int):
+        """Resolve a single name to ``(person_id, display_name, matched)``.
+
+        A blank or ``None`` name yields ``("", "", True)``. Overrides are
+        consulted before the alias index. A name neither of them knows is
+        recorded under ``unmatched`` and given a provisional person, which is
+        reused for later occurrences of the same stripped string; in that
+        case ``matched`` is ``False`` and the display name is the input.
+        """
+        name = (raw_name or "").strip()
+        if not name:
+            return "", "", True
+
+        if name in self.name_overrides:
+            self.override_hits += 1
+            return self._as_matched(self.name_overrides[name], name)
+
+        indexed_pid = self.index.resolve(name)
+        if indexed_pid:
+            return self._as_matched(indexed_pid, name)
+
+        # Unmatched from here on: log the row, then reuse or create a provisional.
+        self.unmatched.setdefault(name, []).append(source_row)
+        if name not in self._raw_to_pid:
+            last, first = _last_first(name)
+            new_pid = self._unique_id(slugify(last, first))
+            self.provisional[new_pid] = Person(
+                person_id=new_pid, last_name=last, first_name=first, provisional=True
+            )
+            self._raw_to_pid[name] = new_pid
+        return self._raw_to_pid[name], name, False
+
+    def resolve_sender(self, raw: str, source_row: int):
+        """Resolve a sender string to ``(person_id, display_name, matched, multi)``.
+
+        The string is split with ``split_receivers`` (REQ-PERS-01). The first
+        part is the primary sender whose resolution is returned; remaining
+        parts are resolved too so they get registered, and ``multi`` is true
+        when there was more than one part. No parts -> ``("", "", True, False)``.
+        """
+        parts = split_receivers(raw)
+        if not parts:
+            return "", "", True, False
+        primary, others = parts[0], parts[1:]
+        resolved_primary = self.resolve_one(primary, source_row)
+        for other in others:
+            self.resolve_one(other, source_row)  # result unused; registers the person
+        return (*resolved_primary, len(parts) > 1)
+
+    def resolve_receivers(self, raw: str, source_row: int):
+        """Resolve every part of a receivers string to a ``(person_id, name, matched)`` list.
+
+        An unmatched part that contains a space, has no known last name and
+        consists of exactly two whitespace-separated tokens is additionally
+        recorded in ``ambiguous`` as ``(raw, part, source_row)``.
+        """
+        resolved = []
+        for piece in split_receivers(raw):
+            pid, shown, ok = self.resolve_one(piece, source_row)
+            looks_ambiguous = (
+                not ok
+                and " " in piece
+                and find_known_last_name(piece) is None
+                and len(piece.split()) == 2
+            )
+            if looks_ambiguous:
+                self.ambiguous.append((raw, piece, source_row))
+            resolved.append((pid, shown, ok))
+        return resolved
+
+
+def _last_first(name: str):
+    """Split a free-form name into ``(last, first)`` on a best-effort basis.
+
+    A last name recognised by ``find_known_last_name`` wins; otherwise the
+    final whitespace-separated token is the last name and everything before
+    it the first name. A single token (or an empty string) is returned as the
+    last name with an empty first name.
+    """
+    cleaned = name.strip()
+    known = find_known_last_name(cleaned)
+    if known:
+        # NOTE(review): the slice drops len(known) trailing characters, which
+        # presumes the known last name is a same-length suffix of the input —
+        # confirm against find_known_last_name.
+        return known, cleaned[: -len(known)].strip()
+
+    pieces = cleaned.split()
+    if len(pieces) < 2:
+        return cleaned, ""
+    *given, family = pieces
+    return family, " ".join(given)
--- a/tools/import-normalizer/tests/test_documents.py
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -1,3 +1,4 @@
+import persons
 import documents
 from documents import Triage

@@ -31,3 +32,61 @@ def test_index_file_mismatch():
    assert documents.index_file_mismatch("W-0001", "") is False
    assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False  # unix path
    assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False         # no dir
+
+
+def _ctx():
+    """Fresh ResolutionContext over a two-person register, with no name overrides."""
+    register_rows = [
+        {"last_name": "de Gruyter", "first_name": "Walter"},
+        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
+    ]
+    alias_index = persons.AliasIndex(persons.parse_register(register_rows))
+    return persons.ResolutionContext(alias_index, name_overrides={})
+
+def test_to_canonical_resolves_and_flags():
+    # Fully resolvable row: both persons are in the register, the date parses,
+    # and the file stem equals the index, so nothing is flagged for review.
+    row = documents.RawRow(
+        source_row=3,
+        index="W-0001",
+        box="V",
+        folder="1",
+        sender="Walter de Gruyter",
+        receivers="Eugenie Müller",
+        date="15.2.1888",
+        location="Rotterdam",
+        tags="Brautbriefe",
+        summary="Geschäftsreise",
+        file=r"..\__scan\W-0001.pdf",
+    )
+    result = documents.to_canonical(row, _ctx(), date_overrides={})
+    assert result.sender_person_id == "de-gruyter-walter"
+    assert result.receiver_person_ids == ["de-gruyter-eugenie"]   # matched via maiden alias
+    assert result.date_iso == "1888-02-15"
+    assert result.date_precision == "DAY"
+    assert result.tags == ["Brautbriefe"]
+    assert result.needs_review == []
+
+def test_to_canonical_unmatched_and_unparsed():
+    # Unknown sender plus an unparseable date: both get flagged, and the
+    # sender becomes a provisional person recorded against its source row.
+    context = _ctx()
+    row = documents.RawRow(
+        source_row=9,
+        index="C-0001",
+        sender="Hans Wittkopf",
+        receivers="",
+        date="Freitag 1919",
+    )
+    result = documents.to_canonical(row, context, date_overrides={})
+    assert result.sender_person_id == "wittkopf-hans"            # provisional
+    assert {"unmatched_sender", "unparsed_date"} <= set(result.needs_review)
+    assert context.unmatched["Hans Wittkopf"] == [9]
+    assert any(person.provisional for person in context.provisional.values())
+
+def test_to_canonical_splits_multi_sender():
+    # REQ-PERS-01 / IMP-11: a sender naming several people is split; the first
+    # part stays the primary sender and the row is flagged as multi_sender.
+    row = documents.RawRow(
+        source_row=5,
+        index="C-0100",
+        sender="Walter und Eugenie de Gruyter",
+        receivers="",
+    )
+    result = documents.to_canonical(row, _ctx(), date_overrides={})
+    assert result.sender_person_id == "de-gruyter-walter"   # first part is primary
+    assert "multi_sender" in result.needs_review
+
+def test_provisional_id_never_collides_with_register():
+    # A provisional built from an unmatched string must not steal a register person_id.
+    people = persons.parse_register([{"last_name": "Cram", "first_name": "Clara"}])
+    ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={})
+    # Use a string the alias index is expected not to resolve but whose slug is
+    # meant to land on the register id "cram-clara".
+    # NOTE(review): whether the natural slug really equals "cram-clara" depends
+    # on slugify/_last_first — confirm, otherwise this never exercises the suffixing.
+    pid, _, matched = ctx.resolve_one("Clara Cram (unsicher)", source_row=1)
+    assert matched is False
+    # The former `or pid.endswith("-2")` clause was dead: any id ending in "-2"
+    # already differs from "cram-clara". State the invariants directly instead.
+    assert pid != "cram-clara"
+    assert pid not in ctx.index.known_ids   # never reuses a register id
+    assert pid in ctx.provisional           # and is registered as a provisional person
+
+def test_ambiguous_space_pair_flagged_not_split():
+    # US-PERS-02 AC4: "Ella Anita" stays a single provisional receiver and is
+    # recorded as ambiguous rather than being guessed into two people.
+    context = _ctx()
+    row = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")
+    result = documents.to_canonical(row, context, date_overrides={})
+    assert len(result.receiver_person_ids) == 1          # not split
+    flagged_parts = [part for _, part, _ in context.ambiguous]
+    assert "Ella Anita" in flagged_parts