From 88c8063227a36c8bad6cc4b2c38b701cbac239fc Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 14:18:09 +0200
Subject: [PATCH] feat(normalizer): person resolution context + to_canonical

Co-Authored-By: Claude Opus 4.7
---
 tools/import-normalizer/documents.py          | 40 ++++++++
 tools/import-normalizer/persons.py            | 93 +++++++++++++++++++
 .../import-normalizer/tests/test_documents.py | 66 +++++++++++++
 3 files changed, 199 insertions(+)

diff --git a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py
index f9b56e72..4edb124e 100644
--- a/tools/import-normalizer/documents.py
+++ b/tools/import-normalizer/documents.py
@@ -2,6 +2,8 @@
 from dataclasses import dataclass, field
 from enum import Enum, auto
 
+import dates as _dates
+
 
 class Triage(Enum):
     OK = auto()
@@ -84,3 +86,41 @@ def index_file_mismatch(index: str, file_path: str) -> bool:
     basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
     stem = basename.rsplit(".", 1)[0]
     return stem != index
+
+
+def to_canonical(raw, ctx, date_overrides: dict) -> CanonicalDocument:
+    """Turn one raw import row into a CanonicalDocument plus its review flags.
+
+    ``raw`` is the parsed row (a ``RawRow``), ``ctx`` the shared person
+    ``ResolutionContext`` (mutated: unmatched names become provisional persons)
+    and ``date_overrides`` is handed through to ``dates.parse_date``.
+    ``needs_review`` lists every flag raised; an empty list means a clean row.
+    """
+    parsed = _dates.parse_date(raw.date, date_overrides)
+    flags: list[str] = []
+
+    sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(raw.sender, raw.source_row)
+    if raw.sender.strip() and not sender_matched:
+        flags.append("unmatched_sender")
+    if sender_multi:
+        flags.append("multi_sender")
+
+    receivers = ctx.resolve_receivers(raw.receivers, raw.source_row)
+    if any(not matched for _, _, matched in receivers):
+        flags.append("unmatched_receiver")
+
+    # Only a non-empty date that failed to parse is suspicious; a blank date is just missing.
+    if raw.date.strip() and parsed.precision == _dates.Precision.UNKNOWN:
+        flags.append("unparsed_date")
+    if index_file_mismatch(raw.index, raw.file):
+        flags.append("index_file_mismatch")
+
+    return CanonicalDocument(
+        index=raw.index, box=raw.box, folder=raw.folder,
+        sender_person_id=sender_id, sender_name=sender_name,
+        receiver_person_ids=[r[0] for r in receivers],
+        receiver_names=[r[1] for r in receivers],
+        date_iso=parsed.iso or "", date_raw=raw.date, date_precision=str(parsed.precision),
+        location=raw.location, tags=[raw.tags] if raw.tags else [], summary=raw.summary,
+        source_row=raw.source_row, needs_review=flags,
+    )
diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py
index b92823a8..986f58b1 100644
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -195,3 +195,96 @@ class AliasIndex:
             return None, 0.0
         score = difflib.SequenceMatcher(None, _norm(name), match[0]).ratio()
         return self._by_alias[match[0]], score
+
+
+class ResolutionContext:
+    """Resolves raw name strings to person ids; accumulates provisional persons and review data."""
+
+    def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str]):
+        self.index = alias_index
+        self.name_overrides = name_overrides  # exact raw name -> person id, checked before the index
+        self.provisional: dict[str, Person] = {}  # provisional id -> person created here
+        self.unmatched: dict[str, list[int]] = {}  # raw name -> source rows where it was seen
+        self.ambiguous: list[tuple[str, str, int]] = []  # (raw cell, part, source row)
+        self._raw_to_pid: dict[str, str] = {}  # raw name -> provisional id already issued
+        self.override_hits = 0
+
+    def _unique_id(self, base: str) -> str:
+        """A provisional id must never collide with a register id or another provisional."""
+        known = self.index.known_ids
+        pid, n = base, 1
+        # Probe both containers directly instead of building a merged set on every call.
+        while pid in known or pid in self.provisional:
+            n += 1
+            pid = f"{base}-{n}"
+        return pid
+
+    def resolve_one(self, raw_name: str, source_row: int) -> tuple[str, str, bool]:
+        """Return (person_id, display_name, matched: bool). '' name -> ('', '', True).
+
+        Lookup order: exact name override, then the alias index.  Anything else becomes a
+        provisional person (one per distinct raw name) and the row is logged in ``unmatched``.
+        """
+        name = (raw_name or "").strip()
+        if not name:
+            return "", "", True
+        if name in self.name_overrides:
+            self.override_hits += 1
+            pid = self.name_overrides[name]
+            return pid, self.index.display(pid) or name, True
+        pid = self.index.resolve(name)
+        if pid:
+            return pid, self.index.display(pid) or name, True
+        # provisional person (unmatched) — never reuse a register id
+        self.unmatched.setdefault(name, []).append(source_row)
+        if name in self._raw_to_pid:
+            return self._raw_to_pid[name], name, False
+        last, first = _last_first(name)
+        pid = self._unique_id(slugify(last, first))
+        self.provisional[pid] = Person(person_id=pid, last_name=last, first_name=first, provisional=True)
+        self._raw_to_pid[name] = pid
+        return pid, name, False
+
+    def resolve_sender(self, raw: str, source_row: int) -> tuple[str, str, bool, bool]:
+        """Senders are split like receivers (REQ-PERS-01). Primary = first part; multi flagged.
+
+        Returns (person_id, display_name, matched, is_multi) for the primary sender.
+        """
+        parts = split_receivers(raw)
+        if not parts:
+            return "", "", True, False
+        pid, name, matched = self.resolve_one(parts[0], source_row)
+        for extra in parts[1:]:
+            self.resolve_one(extra, source_row)  # register the others as persons too
+        return pid, name, matched, len(parts) > 1
+
+    def resolve_receivers(self, raw: str, source_row: int) -> list[tuple[str, str, bool]]:
+        """Resolve every receiver in ``raw``; returns one (person_id, display_name, matched) each.
+
+        An unmatched two-word part with no known last name (e.g. "Ella Anita") may be one
+        person or two, so it is kept whole and recorded in ``ambiguous`` for review.
+        """
+        results = []
+        for part in split_receivers(raw):
+            pid, name, matched = self.resolve_one(part, source_row)
+            if not matched and " " in part and find_known_last_name(part) is None and len(part.split()) == 2:
+                self.ambiguous.append((raw, part, source_row))
+            results.append((pid, name, matched))
+        return results
+
+
+def _last_first(name: str) -> tuple[str, str]:
+    """Best-effort split of a free name string into (last, first) for slug/provisional building."""
+    name = name.strip()
+    ln = find_known_last_name(name)
+    if ln:
+        head, sep, tail = name.rpartition(ln)
+        if sep and tail.strip():
+            # Known last name sits mid-string ("Clara Cram (unsicher)"): cut it out in place
+            # instead of blindly chopping len(ln) characters off the end of the name.
+            return ln, " ".join(f"{head} {tail}".split())
+        return ln, name[: -len(ln)].strip()
+    tokens = name.split()
+    if len(tokens) >= 2:
+        return tokens[-1], " ".join(tokens[:-1])
+    return name, ""
diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py
index ec3066d6..f2b39bcb 100644
--- a/tools/import-normalizer/tests/test_documents.py
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -1,3 +1,4 @@
+import persons
 import documents
 from documents import Triage
 
@@ -31,3 +32,68 @@ def test_index_file_mismatch():
     assert documents.index_file_mismatch("W-0001", "") is False
     assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
     assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
+
+
+def _ctx():
+    people = persons.parse_register([
+        {"last_name": "de Gruyter", "first_name": "Walter"},
+        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
+    ])
+    return persons.ResolutionContext(persons.AliasIndex(people), name_overrides={})
+
+
+def test_to_canonical_resolves_and_flags():
+    ctx = _ctx()
+    raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
+                           sender="Walter de Gruyter", receivers="Eugenie Müller",
+                           date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
+                           summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf")
+    doc = documents.to_canonical(raw, ctx, date_overrides={})
+    assert doc.sender_person_id == "de-gruyter-walter"
+    assert doc.receiver_person_ids == ["de-gruyter-eugenie"]  # matched via maiden alias
+    assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
+    assert doc.tags == ["Brautbriefe"]
+    assert doc.needs_review == []
+
+
+def test_to_canonical_unmatched_and_unparsed():
+    ctx = _ctx()
+    raw = documents.RawRow(source_row=9, index="C-0001",
+                           sender="Hans Wittkopf", receivers="", date="Freitag 1919")
+    doc = documents.to_canonical(raw, ctx, date_overrides={})
+    assert doc.sender_person_id == "wittkopf-hans"  # provisional
+    assert "unmatched_sender" in doc.needs_review
+    assert "unparsed_date" in doc.needs_review
+    assert ctx.unmatched["Hans Wittkopf"] == [9]
+    assert any(p.provisional for p in ctx.provisional.values())
+
+
+def test_to_canonical_splits_multi_sender():
+    # REQ-PERS-01 / IMP-11: a multi-person sender is parsed, primary kept, flagged.
+    ctx = _ctx()
+    raw = documents.RawRow(source_row=5, index="C-0100", sender="Walter und Eugenie de Gruyter", receivers="")
+    doc = documents.to_canonical(raw, ctx, date_overrides={})
+    assert doc.sender_person_id == "de-gruyter-walter"  # first part is primary
+    assert "multi_sender" in doc.needs_review
+
+
+def test_provisional_id_never_collides_with_register():
+    # A provisional built from an unmatched string must not steal a register person_id.
+    people = persons.parse_register([{"last_name": "Cram", "first_name": "Clara"}])
+    ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={})
+    # Forced collision: a slug equal to the register id is suffixed away from it.
+    assert ctx._unique_id("cram-clara") == "cram-clara-2"
+    # End to end: a string the alias index will not resolve but that names the same person.
+    pid, _, matched = ctx.resolve_one("Clara Cram (unsicher)", source_row=1)
+    assert matched is False
+    assert pid != "cram-clara"  # never the register id itself
+    assert pid in ctx.provisional
+
+
+def test_ambiguous_space_pair_flagged_not_split():
+    # US-PERS-02 AC4: "Ella Anita" is kept as one provisional + flagged, never guessed into two.
+    ctx = _ctx()
+    raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")
+    doc = documents.to_canonical(raw, ctx, date_overrides={})
+    assert len(doc.receiver_person_ids) == 1  # not split
+    assert any(part == "Ella Anita" for _, part, _ in ctx.ambiguous)