import persons
import documents
from documents import Triage


def test_extract_row():
    """extract_row picks cells by header position and records the source row."""
    column_names = ["index", "file", "box", "folder", "sender",
                    "receivers", "date", "location", "tags", "summary"]
    columns = {name: position for position, name in enumerate(column_names)}
    row = [
        "W-0001",
        r"..\__scan\W-0001.pdf",
        "V",
        "1",
        "Walter de Gruyter",
        "Eugenie Müller",
        "15.2.1888",
        "Rotterdam",
        "Brautbriefe",
        "Geschäftsreise",
    ]

    extracted = documents.extract_row(row, columns, source_row=3)

    assert extracted.index == "W-0001"
    assert extracted.sender == "Walter de Gruyter"
    assert extracted.date == "15.2.1888"
    assert extracted.source_row == 3


def test_triage():
    """triage distinguishes empty, blank-index, x-suffixed and ordinary rows."""
    cases = [
        (["", "", ""], Triage.EMPTY),
        (["", "", "Walter"], Triage.BLANK_INDEX),  # data but no index
        (["W-0001x", "x"], Triage.X_SUFFIX),
        (["W-0001", "x"], Triage.OK),
    ]
    for row, expected in cases:
        assert documents.triage(row) == expected


def test_classify_blank_index():
    """classify_blank_index returns "section_banner" or "data_no_index" for the two sample rows."""
    columns = {"sender": 4, "receivers": 5}
    banner_row = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""]
    data_row = ["", "", "V", "1", "", "Eugenie"]

    banner_kind = documents.classify_blank_index(banner_row, columns)
    assert banner_kind == "section_banner"

    data_kind = documents.classify_blank_index(data_row, columns)
    assert data_kind == "data_no_index"


def test_index_file_mismatch():
    """index_file_mismatch is True only when the file name disagrees with the index."""
    cases = [
        ("W-0010x", r"..\__scan\W-0011x.pdf", True),
        ("W-0001", r"..\__scan\W-0001.pdf", False),
        ("W-0001", "", False),
        ("W-0001", "scans/W-0001.pdf", False),  # unix path
        ("W-0001", "W-0001.pdf", False),  # no dir
    ]
    for index, file_path, expected in cases:
        assert documents.index_file_mismatch(index, file_path) is expected


def _ctx():
    """Build a ResolutionContext over a two-person register with no name overrides."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
    ]
    register = persons.parse_register(register_rows)
    alias_index = persons.AliasIndex(register)
    return persons.ResolutionContext(alias_index, name_overrides={})


def test_to_canonical_resolves_and_flags():
    """A fully resolvable row yields person ids, ISO date, tags, file and no review flags."""
    context = _ctx()
    row = documents.RawRow(
        source_row=3,
        index="W-0001",
        box="V",
        folder="1",
        sender="Walter de Gruyter",
        receivers="Eugenie Müller",
        date="15.2.1888",
        location="Rotterdam",
        tags="Brautbriefe",
        summary="Geschäftsreise",
        file=r"..\__scan\W-0001.pdf",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.sender_person_id == "de-gruyter-walter"
    # matched via maiden alias
    assert canonical.receiver_person_ids == ["de-gruyter-eugenie"]
    assert canonical.date_iso == "1888-02-15"
    assert canonical.date_precision == "DAY"
    assert canonical.tags == ["Themen/Brautbriefe"]
    # file name carried through for the importer
    assert canonical.file == r"..\__scan\W-0001.pdf"
    assert canonical.needs_review == []


def test_to_canonical_carries_file_name():
    """The file value of a raw row is copied unchanged onto the canonical document."""
    context = _ctx()
    row = documents.RawRow(
        source_row=4,
        index="H-0730",
        sender="",
        receivers="",
        file="H-0730.pdf",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.file == "H-0730.pdf"


def test_to_canonical_unmatched_and_unparsed():
    """An unknown sender and an unparseable date are flagged and tracked on the context."""
    context = _ctx()
    row = documents.RawRow(
        source_row=9,
        index="C-0001",
        sender="Hans Wittkopf",
        receivers="",
        date="Freitag 1919",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.sender_person_id == "wittkopf-hans"  # provisional
    flags = canonical.needs_review
    assert "unmatched_sender" in flags
    assert "unparsed_date" in flags
    assert context.unmatched["Hans Wittkopf"] == [9]
    assert any(person.provisional for person in context.provisional.values())


def test_to_canonical_splits_multi_sender():
    """REQ-PERS-01 / IMP-11: a multi-person sender is parsed, primary kept, flagged."""
    context = _ctx()
    row = documents.RawRow(
        source_row=5,
        index="C-0100",
        sender="Walter und Eugenie de Gruyter",
        receivers="",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.sender_person_id == "de-gruyter-walter"  # first part is primary
    assert "multi_sender" in canonical.needs_review


def test_provisional_id_never_collides_with_register():
    """A provisional built from an unmatched string must not steal a register person_id."""
    register = persons.parse_register(
        [{"last_name": "Xyz", "first_name": "Abc"}]
    )  # id "xyz-abc"
    context = persons.ResolutionContext(persons.AliasIndex(register), name_overrides={})

    # "Abc, Xyz" misses the alias index (the comma changes the normalized key) but its
    # provisional slug is "xyz-abc" — already the register person's id, so it MUST be suffixed.
    person_id, _, was_matched = context.resolve_one("Abc, Xyz", source_row=1)

    assert was_matched is False
    assert "xyz-abc" in context.index.known_ids
    assert person_id == "xyz-abc-2"  # suffixed away from the register id, not reused


def test_resolve_one_override_increments_hits():
    """Resolving a name listed in name_overrides returns the mapped id and counts one hit."""
    register = persons.parse_register(
        [{"last_name": "de Gruyter", "first_name": "Eugenie"}]
    )
    context = persons.ResolutionContext(
        persons.AliasIndex(register),
        name_overrides={"Genie": "de-gruyter-eugenie"},
    )

    person_id, display_name, was_matched = context.resolve_one("Genie", source_row=1)

    assert person_id == "de-gruyter-eugenie"
    assert was_matched is True
    assert display_name == "Eugenie de Gruyter"  # display comes from the alias index
    assert context.override_hits == 1


def test_ambiguous_pair_recorded_in_unresolved():
    """Two given names as a receiver stay one person and are logged as "ambiguous_pair"."""
    register = persons.parse_register(
        [{"last_name": "de Gruyter", "first_name": "Walter"}]
    )
    context = persons.ResolutionContext(
        persons.AliasIndex(register),
        name_overrides={},
        given_names={"ella", "anita"},
    )
    row = documents.RawRow(
        source_row=7,
        index="C-0200",
        sender="",
        receivers="Ella Anita",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert len(canonical.receiver_person_ids) == 1  # not split — one provisional
    assert any(
        raw_name == "Ella Anita" and category == "ambiguous_pair"
        for raw_name, category, _ in context.unresolved
    )


def test_resolvable_first_surname_pair_not_unresolved():
    """A first-name/surname pair whose surname is not a given name leaves unresolved empty."""
    empty_index = persons.AliasIndex([])
    context = persons.ResolutionContext(
        empty_index,
        name_overrides={},
        given_names={"ella", "anita"},
    )

    context.resolve_one("Mieze Schefold", source_row=1)  # surname is not a given name

    assert context.unresolved == []  # RESOLVABLE -> not recorded