refactor(normalizer): drop file column now that PDFs resolve by index
The import corpus is uniform: every PDF is named <index>.pdf, so the file column (the spreadsheet's datei value) is redundant. Remove file from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS, plus the now-moot index_file_mismatch review flag/CSV/stat and the datei header mapping. date_end and the tree person_id are kept.

Refs #686
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -3,9 +3,9 @@ import documents
|
||||
from documents import Triage
|
||||
|
||||
def test_extract_row():
|
||||
header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4,
|
||||
"receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9}
|
||||
cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
|
||||
header = {"index": 0, "box": 1, "folder": 2, "sender": 3,
|
||||
"receivers": 4, "date": 5, "location": 6, "tags": 7, "summary": 8}
|
||||
cells = ["W-0001", "V", "1", "Walter de Gruyter",
|
||||
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
|
||||
raw = documents.extract_row(cells, header, source_row=3)
|
||||
assert raw.index == "W-0001"
|
||||
@@ -26,14 +26,6 @@ def test_classify_blank_index():
|
||||
assert documents.classify_blank_index(banner, header) == "section_banner"
|
||||
assert documents.classify_blank_index(data, header) == "data_no_index"
|
||||
|
||||
def test_index_file_mismatch():
|
||||
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
|
||||
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
|
||||
assert documents.index_file_mismatch("W-0001", "") is False
|
||||
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
|
||||
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
|
||||
|
||||
|
||||
def _ctx():
|
||||
people = persons.parse_register([
|
||||
{"last_name": "de Gruyter", "first_name": "Walter"},
|
||||
@@ -46,22 +38,19 @@ def test_to_canonical_resolves_and_flags():
|
||||
raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
|
||||
sender="Walter de Gruyter", receivers="Eugenie Müller",
|
||||
date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
|
||||
summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf")
|
||||
summary="Geschäftsreise")
|
||||
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||
assert doc.sender_person_id == "de-gruyter-walter"
|
||||
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
|
||||
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
|
||||
assert doc.tags == ["Themen/Brautbriefe"]
|
||||
assert doc.file == r"..\__scan\W-0001.pdf" # file name carried through for the importer
|
||||
assert doc.needs_review == []
|
||||
|
||||
|
||||
def test_to_canonical_carries_file_name():
|
||||
ctx = _ctx()
|
||||
raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="",
|
||||
file="H-0730.pdf")
|
||||
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||
assert doc.file == "H-0730.pdf"
|
||||
def test_canonical_document_has_no_file_field():
|
||||
# #686: PDFs resolve by index (<index>.pdf) in the importer; the file field is gone.
|
||||
doc = documents.CanonicalDocument(index="W-0001")
|
||||
assert not hasattr(doc, "file")
|
||||
|
||||
|
||||
def test_to_canonical_range_carries_date_end():
|
||||
|
||||
@@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path):
|
||||
assert row["needs_review"] == "unparsed_date"
|
||||
|
||||
|
||||
def test_write_documents_xlsx_carries_file_and_date_end(tmp_path):
|
||||
def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path):
|
||||
# #686: PDFs resolve by index (<index>.pdf), so the redundant "file" column is dropped.
|
||||
doc = documents.CanonicalDocument(
|
||||
index="H-0730", file="H-0730.pdf", date_iso="1917-01-10",
|
||||
index="H-0730", date_iso="1917-01-10",
|
||||
date_precision="RANGE", date_end="1917-01-11")
|
||||
out = tmp_path / "docs.xlsx"
|
||||
writers.write_documents_xlsx([doc], out)
|
||||
wb = openpyxl.load_workbook(out)
|
||||
ws = wb.active
|
||||
header = [c.value for c in ws[1]]
|
||||
assert "file" in header and "date_end" in header
|
||||
assert "file" not in header
|
||||
assert "date_end" in header
|
||||
row = {h: c.value for h, c in zip(header, ws[2])}
|
||||
assert row["file"] == "H-0730.pdf"
|
||||
assert row["date_end"] == "1917-01-11"
|
||||
|
||||
def test_write_documents_xlsx_pins_timestamp(tmp_path):
|
||||
|
||||
Reference in New Issue
Block a user