diff --git a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py index 3ebac821..c8060719 100644 --- a/tools/import-normalizer/documents.py +++ b/tools/import-normalizer/documents.py @@ -31,6 +31,7 @@ class RawRow: @dataclass class CanonicalDocument: index: str + file: str = "" box: str = "" folder: str = "" sender_person_id: str = "" @@ -40,6 +41,7 @@ class CanonicalDocument: date_iso: str = "" date_raw: str = "" date_precision: str = "" + date_end: str = "" location: str = "" tags: list = field(default_factory=list) summary: str = "" @@ -109,7 +111,7 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr flags.append("index_file_mismatch") return CanonicalDocument( - index=raw.index, box=raw.box, folder=raw.folder, + index=raw.index, file=raw.file, box=raw.box, folder=raw.folder, sender_person_id=sender_id, sender_name=sender_name, receiver_person_ids=[r[0] for r in receivers], receiver_names=[r[1] for r in receivers], diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py index 52f5025f..3395275b 100644 --- a/tools/import-normalizer/tests/test_documents.py +++ b/tools/import-normalizer/tests/test_documents.py @@ -52,8 +52,17 @@ def test_to_canonical_resolves_and_flags(): assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY" assert doc.tags == ["Themen/Brautbriefe"] + assert doc.file == r"..\__scan\W-0001.pdf" # file name carried through for the importer assert doc.needs_review == [] + +def test_to_canonical_carries_file_name(): + ctx = _ctx() + raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="", + file="H-0730.pdf") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.file == "H-0730.pdf" + def test_to_canonical_unmatched_and_unparsed(): ctx = _ctx() raw = documents.RawRow(source_row=9, index="C-0001", diff --git a/tools/import-normalizer/tests/test_writers.py b/tools/import-normalizer/tests/test_writers.py index 37c4e199..9f20d501 100644 --- a/tools/import-normalizer/tests/test_writers.py +++ b/tools/import-normalizer/tests/test_writers.py @@ -31,6 +31,21 @@ def test_write_documents_xlsx_joins_lists(tmp_path): assert row["receiver_person_ids"] == "a|b" assert row["needs_review"] == "unparsed_date" + +def test_write_documents_xlsx_carries_file_and_date_end(tmp_path): + doc = documents.CanonicalDocument( + index="H-0730", file="H-0730.pdf", date_iso="1917-01-10", + date_precision="RANGE", date_end="1917-01-11") + out = tmp_path / "docs.xlsx" + writers.write_documents_xlsx([doc], out) + wb = openpyxl.load_workbook(out) + ws = wb.active + header = [c.value for c in ws[1]] + assert "file" in header and "date_end" in header + row = {h: c.value for h, c in zip(header, ws[2])} + assert row["file"] == "H-0730.pdf" + assert row["date_end"] == "1917-01-11" + def test_write_documents_xlsx_pins_timestamp(tmp_path): # determinism (NFR-IDEM-01): workbook created/modified are pinned, not the current time doc = documents.CanonicalDocument(index="W-0001") diff --git a/tools/import-normalizer/writers.py b/tools/import-normalizer/writers.py index 05b4d52e..5b9799e1 100644 --- a/tools/import-normalizer/writers.py +++ b/tools/import-normalizer/writers.py @@ -22,9 +22,10 @@ def _csv_safe(value): return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s -DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name", +DOC_COLUMNS = ["index", "file", "box", "folder", "sender_person_id", "sender_name", "receiver_person_ids", "receiver_names", "date_iso", "date_raw", - "date_precision", "location", "tags", "summary", "source_row", "needs_review"] + "date_precision", "date_end", "location", "tags", "summary", + "source_row", "needs_review"] PERSON_COLUMNS = ["person_id", "last_name", "first_name", "maiden_name", "title", "nickname", "birth_date", "birth_date_raw", "birth_place", "death_date", "death_date_raw",