refactor(normalizer): drop file column now that PDFs resolve by index

The import corpus is uniform: every PDF is named <index>.pdf, so the `file` column (the spreadsheet's `datei` value) is redundant.

Remove the `file` field from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS, along with the now-moot index_file_mismatch review flag, CSV, and stat, and the `datei` header mapping. date_end and the tree person_id are kept.

Refs #686
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 20:54:37 +02:00
parent 929acf6964
commit 09ba7e74e3
7 changed files with 17 additions and 46 deletions
--- a/tools/import-normalizer/tests/test_documents.py
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -3,9 +3,9 @@ import documents
 from documents import Triage

 def test_extract_row():
-    header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4,
-              "receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9}
-    cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
+    header = {"index": 0, "box": 1, "folder": 2, "sender": 3,
+              "receivers": 4, "date": 5, "location": 6, "tags": 7, "summary": 8}
+    cells = ["W-0001", "V", "1", "Walter de Gruyter",
             "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
    raw = documents.extract_row(cells, header, source_row=3)
    assert raw.index == "W-0001"
@@ -26,14 +26,6 @@ def test_classify_blank_index():
    assert documents.classify_blank_index(banner, header) == "section_banner"
    assert documents.classify_blank_index(data, header) == "data_no_index"

-def test_index_file_mismatch():
-    assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
-    assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
-    assert documents.index_file_mismatch("W-0001", "") is False
-    assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False  # unix path
-    assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False         # no dir
-
-
 def _ctx():
    people = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Walter"},
@@ -46,22 +38,19 @@ def test_to_canonical_resolves_and_flags():
    raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
                           sender="Walter de Gruyter", receivers="Eugenie Müller",
                           date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
-                           summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf")
+                           summary="Geschäftsreise")
    doc = documents.to_canonical(raw, ctx, date_overrides={})
    assert doc.sender_person_id == "de-gruyter-walter"
    assert doc.receiver_person_ids == ["de-gruyter-eugenie"]   # matched via maiden alias
    assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
    assert doc.tags == ["Themen/Brautbriefe"]
-    assert doc.file == r"..\__scan\W-0001.pdf"   # file name carried through for the importer
    assert doc.needs_review == []


-def test_to_canonical_carries_file_name():
-    ctx = _ctx()
-    raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="",
-                           file="H-0730.pdf")
-    doc = documents.to_canonical(raw, ctx, date_overrides={})
-    assert doc.file == "H-0730.pdf"
+def test_canonical_document_has_no_file_field():
+    # #686: PDFs resolve by index (<index>.pdf) in the importer; the file field is gone.
+    doc = documents.CanonicalDocument(index="W-0001")
+    assert not hasattr(doc, "file")


 def test_to_canonical_range_carries_date_end():
--- a/tools/import-normalizer/tests/test_writers.py
+++ b/tools/import-normalizer/tests/test_writers.py
@@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path):
    assert row["needs_review"] == "unparsed_date"


-def test_write_documents_xlsx_carries_file_and_date_end(tmp_path):
+def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path):
+    # #686: PDFs resolve by index (<index>.pdf), so the redundant "file" column is dropped.
    doc = documents.CanonicalDocument(
-        index="H-0730", file="H-0730.pdf", date_iso="1917-01-10",
+        index="H-0730", date_iso="1917-01-10",
        date_precision="RANGE", date_end="1917-01-11")
    out = tmp_path / "docs.xlsx"
    writers.write_documents_xlsx([doc], out)
    wb = openpyxl.load_workbook(out)
    ws = wb.active
    header = [c.value for c in ws[1]]
-    assert "file" in header and "date_end" in header
+    assert "file" not in header
+    assert "date_end" in header
    row = {h: c.value for h, c in zip(header, ws[2])}
-    assert row["file"] == "H-0730.pdf"
    assert row["date_end"] == "1917-01-11"

 def test_write_documents_xlsx_pins_timestamp(tmp_path):