feat(normalizer): person resolution context + to_canonical
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import persons
|
||||
import documents
|
||||
from documents import Triage
|
||||
|
||||
@@ -31,3 +32,61 @@ def test_index_file_mismatch():
|
||||
assert documents.index_file_mismatch("W-0001", "") is False
|
||||
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
|
||||
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
|
||||
|
||||
|
||||
def _ctx():
    """Return a fresh ResolutionContext over a small two-person register.

    The register holds Walter de Gruyter and Eugenie de Gruyter (the latter
    with a maiden name); no name overrides are configured.
    """
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
    ]
    alias_index = persons.AliasIndex(persons.parse_register(register_rows))
    return persons.ResolutionContext(alias_index, name_overrides={})
|
||||
|
||||
def test_to_canonical_resolves_and_flags():
    """A fully resolvable row maps to register ids, a parsed date, and no review flags."""
    context = _ctx()
    row = documents.RawRow(
        source_row=3,
        index="W-0001",
        box="V",
        folder="1",
        sender="Walter de Gruyter",
        receivers="Eugenie Müller",
        date="15.2.1888",
        location="Rotterdam",
        tags="Brautbriefe",
        summary="Geschäftsreise",
        file=r"..\__scan\W-0001.pdf",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.sender_person_id == "de-gruyter-walter"
    # The receiver is written with her maiden name and must match via that alias.
    assert canonical.receiver_person_ids == ["de-gruyter-eugenie"]
    assert canonical.date_iso == "1888-02-15"
    assert canonical.date_precision == "DAY"
    assert canonical.tags == ["Brautbriefe"]
    assert canonical.needs_review == []
|
||||
|
||||
def test_to_canonical_unmatched_and_unparsed():
    """An unknown sender and an unparseable date are both flagged for review."""
    context = _ctx()
    row = documents.RawRow(
        source_row=9,
        index="C-0001",
        sender="Hans Wittkopf",
        receivers="",
        date="Freitag 1919",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    # The sender is not in the register, so a provisional id is expected.
    assert canonical.sender_person_id == "wittkopf-hans"
    review_flags = canonical.needs_review
    assert "unmatched_sender" in review_flags
    assert "unparsed_date" in review_flags

    # The context records the unmatched string together with its source row.
    assert context.unmatched["Hans Wittkopf"] == [9]
    assert any(person.provisional for person in context.provisional.values())
|
||||
|
||||
def test_to_canonical_splits_multi_sender():
    """REQ-PERS-01 / IMP-11: a multi-person sender is parsed, the primary kept, and the row flagged."""
    context = _ctx()
    row = documents.RawRow(
        source_row=5,
        index="C-0100",
        sender="Walter und Eugenie de Gruyter",
        receivers="",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    # The first named person is treated as the primary sender.
    assert canonical.sender_person_id == "de-gruyter-walter"
    assert "multi_sender" in canonical.needs_review
|
||||
|
||||
def test_provisional_id_never_collides_with_register():
    """A provisional built from an unmatched string must not steal a register person_id.

    The register contains only Clara Cram. The probe string
    "Clara Cram (unsicher)" is expected not to resolve through the alias
    index, while its natural slug would equal the register id "cram-clara".
    """
    people = persons.parse_register([{"last_name": "Cram", "first_name": "Clara"}])
    ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={})

    pid, _, matched = ctx.resolve_one("Clara Cram (unsicher)", source_row=1)

    assert matched is False
    # The previous form, `pid not in {"cram-clara"} or pid.endswith("-2")`, had
    # an unreachable second disjunct: it was only evaluated when pid equalled
    # "cram-clara", which can never end in "-2". State the real check directly;
    # the exact suffix chosen for the provisional id is not asserted here.
    assert pid != "cram-clara"
|
||||
|
||||
def test_ambiguous_space_pair_flagged_not_split():
    """US-PERS-02 AC4: "Ella Anita" stays one provisional and is flagged, never guessed into two."""
    context = _ctx()
    row = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")

    canonical = documents.to_canonical(row, context, date_overrides={})

    # Exactly one receiver id: the space-separated pair was not split.
    assert len(canonical.receiver_person_ids) == 1
    # Each ambiguity record is a 3-tuple; the middle element is the raw name part.
    flagged_parts = [part for _, part, _ in context.ambiguous]
    assert "Ella Anita" in flagged_parts
|
||||
|
||||
Reference in New Issue
Block a user