refactor(normalizer): drop file column now PDFs resolve by index

The import corpus is uniform: every PDF is named `<index>.pdf`, so the `file` column (the spreadsheet's `datei` value) is redundant. Remove `file` from `CanonicalDocument`, `RawRow`, `_FIELDS`, `to_canonical`, and `DOC_COLUMNS`, plus the now-moot `index_file_mismatch` review flag/CSV/stat and the `datei` header mapping. `date_end` and the tree `person_id` are kept.

Refs #686
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 20:54:37 +02:00
parent 929acf6964
commit 09ba7e74e3
7 changed files with 17 additions and 46 deletions
--- a/tools/import-normalizer/tests/test_writers.py
+++ b/tools/import-normalizer/tests/test_writers.py
@@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path):
    assert row["needs_review"] == "unparsed_date"


-def test_write_documents_xlsx_carries_file_and_date_end(tmp_path):
+def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path):
+    # #686: PDFs resolve by index (<index>.pdf), so the redundant "file" column is dropped.
    doc = documents.CanonicalDocument(
-        index="H-0730", file="H-0730.pdf", date_iso="1917-01-10",
+        index="H-0730", date_iso="1917-01-10",
        date_precision="RANGE", date_end="1917-01-11")
    out = tmp_path / "docs.xlsx"
    writers.write_documents_xlsx([doc], out)
    wb = openpyxl.load_workbook(out)
    ws = wb.active
    header = [c.value for c in ws[1]]
-    assert "file" in header and "date_end" in header
+    assert "file" not in header
+    assert "date_end" in header
    row = {h: c.value for h, c in zip(header, ws[2])}
-    assert row["file"] == "H-0730.pdf"
    assert row["date_end"] == "1917-01-11"

 def test_write_documents_xlsx_pins_timestamp(tmp_path):