Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>
Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").
COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.
Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.
Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
110 lines
5.9 KiB
Python
110 lines
5.9 KiB
Python
import persons
|
|
import documents
|
|
from documents import Triage
|
|
|
|
def test_extract_row():
    """extract_row picks cells by header position and records the source row."""
    column_names = ("index", "file", "box", "folder", "sender",
                    "receivers", "date", "location", "tags", "summary")
    # Column name -> zero-based cell position, in spreadsheet order.
    header = {name: position for position, name in enumerate(column_names)}
    row_cells = [
        "W-0001",
        r"..\__scan\W-0001.pdf",
        "V",
        "1",
        "Walter de Gruyter",
        "Eugenie Müller",
        "15.2.1888",
        "Rotterdam",
        "Brautbriefe",
        "Geschäftsreise",
    ]

    extracted = documents.extract_row(row_cells, header, source_row=3)

    assert extracted.index == "W-0001"
    assert extracted.sender == "Walter de Gruyter"
    assert extracted.date == "15.2.1888"
    assert extracted.source_row == 3
|
|
|
|
def test_triage():
    """documents.triage returns the expected Triage member for each sample row."""
    expectations = [
        (["", "", ""], Triage.EMPTY),
        (["", "", "Walter"], Triage.BLANK_INDEX),  # data but no index
        (["W-0001x", "x"], Triage.X_SUFFIX),
        (["W-0001", "x"], Triage.OK),
    ]
    for cells, expected in expectations:
        assert documents.triage(cells) == expected
|
|
|
|
def test_classify_blank_index():
    """Index-less rows are labelled either "section_banner" or "data_no_index"."""
    header = {"sender": 4, "receivers": 5}

    # Only the sender column carries text: expected to be classified as a banner.
    banner_row = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""]
    banner_label = documents.classify_blank_index(banner_row, header)
    assert banner_label == "section_banner"

    # Box, folder and receiver are filled, sender is empty: expected to be data.
    data_row = ["", "", "V", "1", "", "Eugenie"]
    data_label = documents.classify_blank_index(data_row, header)
    assert data_label == "data_no_index"
|
|
|
|
def test_index_file_mismatch():
    """index_file_mismatch is True only when the file name disagrees with the index."""
    cases = [
        ("W-0010x", r"..\__scan\W-0011x.pdf", True),
        ("W-0001", r"..\__scan\W-0001.pdf", False),
        ("W-0001", "", False),
        ("W-0001", "scans/W-0001.pdf", False),  # unix path
        ("W-0001", "W-0001.pdf", False),  # no dir
    ]
    for index, file_path, expected in cases:
        # Identity check: the result must be the bool itself, not just truthy/falsy.
        assert documents.index_file_mismatch(index, file_path) is expected
|
|
|
|
|
|
def _ctx():
    """Build a ResolutionContext over a two-person register with no name overrides."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
    ]
    alias_index = persons.AliasIndex(persons.parse_register(register_rows))
    return persons.ResolutionContext(alias_index, name_overrides={})
|
|
|
|
def test_to_canonical_resolves_and_flags():
    """A fully resolvable row yields person ids, an ISO date, a tag path and no flags."""
    resolution = _ctx()
    row_fields = {
        "source_row": 3,
        "index": "W-0001",
        "box": "V",
        "folder": "1",
        "sender": "Walter de Gruyter",
        "receivers": "Eugenie Müller",
        "date": "15.2.1888",
        "location": "Rotterdam",
        "tags": "Brautbriefe",
        "summary": "Geschäftsreise",
        "file": r"..\__scan\W-0001.pdf",
    }
    raw_row = documents.RawRow(**row_fields)

    canonical = documents.to_canonical(raw_row, resolution, date_overrides={})

    assert canonical.sender_person_id == "de-gruyter-walter"
    # "Eugenie Müller" is matched via the maiden-name alias from the register.
    assert canonical.receiver_person_ids == ["de-gruyter-eugenie"]
    assert canonical.date_iso == "1888-02-15"
    assert canonical.date_precision == "DAY"
    assert canonical.tags == ["Themen/Brautbriefe"]
    assert canonical.needs_review == []
|
|
|
|
def test_to_canonical_unmatched_and_unparsed():
    """An unknown sender and an unparseable date are both flagged and tracked."""
    resolution = _ctx()
    raw_row = documents.RawRow(
        source_row=9,
        index="C-0001",
        sender="Hans Wittkopf",
        receivers="",
        date="Freitag 1919",
    )

    canonical = documents.to_canonical(raw_row, resolution, date_overrides={})

    # The sender is not in the register, so a provisional id is expected.
    assert canonical.sender_person_id == "wittkopf-hans"
    for flag in ("unmatched_sender", "unparsed_date"):
        assert flag in canonical.needs_review
    # The context records which source rows mentioned the unmatched name.
    assert resolution.unmatched["Hans Wittkopf"] == [9]
    assert any(person.provisional for person in resolution.provisional.values())
|
|
|
|
def test_to_canonical_splits_multi_sender():
    """REQ-PERS-01 / IMP-11: a multi-person sender keeps its primary and is flagged."""
    resolution = _ctx()
    raw_row = documents.RawRow(
        source_row=5,
        index="C-0100",
        sender="Walter und Eugenie de Gruyter",
        receivers="",
    )

    canonical = documents.to_canonical(raw_row, resolution, date_overrides={})

    # The first named person is kept as the primary sender.
    assert canonical.sender_person_id == "de-gruyter-walter"
    assert "multi_sender" in canonical.needs_review
|
|
|
|
def test_provisional_id_never_collides_with_register():
    """A provisional id derived from an unmatched string must not reuse a register id."""
    # The single register person gets the id "xyz-abc".
    register = persons.parse_register([{"last_name": "Xyz", "first_name": "Abc"}])
    resolution = persons.ResolutionContext(persons.AliasIndex(register), name_overrides={})

    # "Abc, Xyz" misses the alias index (the comma changes the normalized key), yet
    # its provisional slug would be "xyz-abc" -- the register person's id -- so the
    # resolver has to suffix it instead of reusing it.
    provisional_id, _display, was_matched = resolution.resolve_one("Abc, Xyz", source_row=1)

    assert was_matched is False
    assert "xyz-abc" in resolution.index.known_ids
    assert provisional_id == "xyz-abc-2"  # suffixed away from the register id
|
|
|
|
def test_resolve_one_override_increments_hits():
    """Resolving through a name override returns the mapped person and counts the hit."""
    register = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Eugenie"}])
    overrides = {"Genie": "de-gruyter-eugenie"}
    resolution = persons.ResolutionContext(persons.AliasIndex(register), name_overrides=overrides)

    person_id, display_name, was_matched = resolution.resolve_one("Genie", source_row=1)

    assert person_id == "de-gruyter-eugenie"
    assert was_matched is True
    # The display name comes from the alias index, not from the raw override key.
    assert display_name == "Eugenie de Gruyter"
    assert resolution.override_hits == 1
|
|
|
|
def test_ambiguous_pair_recorded_in_unresolved():
    """Two given names in a row stay one provisional person and are logged as ambiguous."""
    register = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}])
    resolution = persons.ResolutionContext(
        persons.AliasIndex(register),
        name_overrides={},
        given_names={"ella", "anita"},
    )
    raw_row = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")

    canonical = documents.to_canonical(raw_row, resolution, date_overrides={})

    # Not split into two receivers: exactly one provisional id is produced.
    assert len(canonical.receiver_person_ids) == 1
    assert any(
        name == "Ella Anita" and category == "ambiguous_pair"
        for name, category, _ in resolution.unresolved
    )
|
|
|
|
def test_resolvable_first_surname_pair_not_unresolved():
    """A first-name + surname pair is resolvable and leaves `unresolved` empty."""
    empty_index = persons.AliasIndex([])
    resolution = persons.ResolutionContext(
        empty_index,
        name_overrides={},
        given_names={"ella", "anita"},
    )

    # "Schefold" is not among the given names, so it is treated as a surname.
    resolution.resolve_one("Mieze Schefold", source_row=1)

    assert resolution.unresolved == []  # RESOLVABLE -> not recorded
|