refactor(normalizer): drop file column now that PDFs resolve by index

The import corpus is uniform: every PDF is named <index>.pdf, so the
`file` column (the spreadsheet's `datei` value) is redundant. Remove
`file` from CanonicalDocument, RawRow, _FIELDS, to_canonical, and
DOC_COLUMNS, plus the now-moot index_file_mismatch review flag/CSV/stat
and the `datei` header mapping. date_end and the tree person_id are kept.

Refs #686

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-27 20:54:37 +02:00
parent 929acf6964
commit 09ba7e74e3
7 changed files with 17 additions and 46 deletions

View File

@@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path):
assert row["needs_review"] == "unparsed_date"
def test_write_documents_xlsx_carries_file_and_date_end(tmp_path):
def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path):
# #686: PDFs resolve by index (<index>.pdf), so the redundant "file" column is dropped.
doc = documents.CanonicalDocument(
index="H-0730", file="H-0730.pdf", date_iso="1917-01-10",
index="H-0730", date_iso="1917-01-10",
date_precision="RANGE", date_end="1917-01-11")
out = tmp_path / "docs.xlsx"
writers.write_documents_xlsx([doc], out)
wb = openpyxl.load_workbook(out)
ws = wb.active
header = [c.value for c in ws[1]]
assert "file" in header and "date_end" in header
assert "file" not in header
assert "date_end" in header
row = {h: c.value for h, c in zip(header, ws[2])}
assert row["file"] == "H-0730.pdf"
assert row["date_end"] == "1917-01-11"
def test_write_documents_xlsx_pins_timestamp(tmp_path):