diff --git a/tools/import-normalizer/README.md b/tools/import-normalizer/README.md index ca820519..4500db5c 100644 --- a/tools/import-normalizer/README.md +++ b/tools/import-normalizer/README.md @@ -26,7 +26,6 @@ Outputs: | --- | --- | | `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). | | `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). | -| `index-file-mismatch.csv` | The `Datei` path disagrees with the index-derived filename — reconcile when the PDFs arrive. | | `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. | > `unresolved-names.csv` is the focused "names that need a human" list. Non-family diff --git a/tools/import-normalizer/config.py b/tools/import-normalizer/config.py index 66261d06..afea47c9 100644 --- a/tools/import-normalizer/config.py +++ b/tools/import-normalizer/config.py @@ -18,7 +18,6 @@ OVERRIDES_DIR = BASE_DIR / "overrides" # --- Header text (lowercased, whitespace-collapsed) -> canonical field --- DOCUMENT_HEADER_MAP = { "index": "index", - "datei": "file", "box": "box", "mappe": "folder", "briefeschreiberin": "sender", diff --git a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py index 94381acf..08bb1c31 100644 --- a/tools/import-normalizer/documents.py +++ b/tools/import-normalizer/documents.py @@ -17,7 +17,6 @@ class Triage(Enum): class RawRow: source_row: int index: str = "" - file: str = "" box: str = "" folder: str = "" sender: str = "" @@ -31,7 +30,6 @@ class RawRow: @dataclass class CanonicalDocument: index: str - file: str = "" box: str = "" folder: str = "" sender_person_id: str = "" @@ -49,7 +47,7 @@ class CanonicalDocument: needs_review: list = field(default_factory=list) -_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"] +_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"] def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow: @@ -82,15 +80,6 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str: return "data_no_index" -def index_file_mismatch(index: str, file_path: str) -> bool: - # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf). - if not file_path.strip(): - return False - basename = file_path.replace("\\", "/").rsplit("/", 1)[-1] - stem = basename.rsplit(".", 1)[0] - return stem != index - - def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument: pd = _dates.parse_date(raw.date, date_overrides) flags = [] @@ -109,11 +98,9 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr flags.append("unparsed_date") if pd.needs_review: flags.append("range_end_unparsed") - if index_file_mismatch(raw.index, raw.file): - flags.append("index_file_mismatch") return CanonicalDocument( - index=raw.index, file=raw.file, box=raw.box, folder=raw.folder, + index=raw.index, box=raw.box, folder=raw.folder, sender_person_id=sender_id, sender_name=sender_name, receiver_person_ids=[r[0] for r in receivers], receiver_names=[r[1] for r in receivers], diff --git a/tools/import-normalizer/normalize.py b/tools/import-normalizer/normalize.py index 2e4fd98d..5bde0246 100644 --- a/tools/import-normalizer/normalize.py +++ b/tools/import-normalizer/normalize.py @@ -33,7 +33,7 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet, d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS) index_col = d_fields["index"] - canon_docs, blank_index, skipped_x, mismatches = [], [], [], [] + canon_docs, blank_index, skipped_x = [], [], [] unparsed_by_raw: dict[str, list] = {} dates_by_override = 0 empty_count = 0 @@ -59,8 +59,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet, doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes)) if "unparsed_date" in doc.needs_review: unparsed_by_raw.setdefault(raw.date, []).append(source_row) - if "index_file_mismatch" in doc.needs_review: - mismatches.append([source_row, raw.index, raw.file]) canon_docs.append(doc) # REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them. @@ -102,7 +100,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet, key=lambda r: (r[0], -r[2], r[1])) writers.write_review_csv(review_dir / "unresolved-names.csv", ["category", "raw", "count", "example_rows"], unresolved_rows) - writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches) all_summaries = [doc.summary for doc in canon_docs if doc.summary] candidates = _tags.mine_summary_candidates(all_summaries) @@ -140,7 +137,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet, "blank_index_rows": len(blank_index), "skipped_x_suffix": len(skipped_x), "duplicate_index_rows": len(duplicates), - "index_file_mismatches": len(mismatches), "# OVERRIDES": "", "date_overrides_loaded": len(date_overrides), "name_overrides_loaded": len(name_overrides), diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py index fe07f40d..bbf8c4c1 100644 --- a/tools/import-normalizer/tests/test_documents.py +++ b/tools/import-normalizer/tests/test_documents.py @@ -3,9 +3,9 @@ import documents from documents import Triage def test_extract_row(): - header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4, - "receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9} - cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter", + header = {"index": 0, "box": 1, "folder": 2, "sender": 3, + "receivers": 4, "date": 5, "location": 6, "tags": 7, "summary": 8} + cells = ["W-0001", "V", "1", "Walter de Gruyter", "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"] raw = documents.extract_row(cells, header, source_row=3) assert raw.index == "W-0001" @@ -26,14 +26,6 @@ def test_classify_blank_index(): assert documents.classify_blank_index(banner, header) == "section_banner" assert documents.classify_blank_index(data, header) == "data_no_index" -def test_index_file_mismatch(): - assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True - assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False - assert documents.index_file_mismatch("W-0001", "") is False - assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path - assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir - - def _ctx(): people = persons.parse_register([ {"last_name": "de Gruyter", "first_name": "Walter"}, @@ -46,22 +38,19 @@ def test_to_canonical_resolves_and_flags(): raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1", sender="Walter de Gruyter", receivers="Eugenie Müller", date="15.2.1888", location="Rotterdam", tags="Brautbriefe", - summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf") + summary="Geschäftsreise") doc = documents.to_canonical(raw, ctx, date_overrides={}) assert doc.sender_person_id == "de-gruyter-walter" assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY" assert doc.tags == ["Themen/Brautbriefe"] - assert doc.file == r"..\__scan\W-0001.pdf" # file name carried through for the importer assert doc.needs_review == [] -def test_to_canonical_carries_file_name(): - ctx = _ctx() - raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="", - file="H-0730.pdf") - doc = documents.to_canonical(raw, ctx, date_overrides={}) - assert doc.file == "H-0730.pdf" +def test_canonical_document_has_no_file_field(): + # #686: PDFs resolve by index (.pdf) in the importer; the file field is gone. + doc = documents.CanonicalDocument(index="W-0001") + assert not hasattr(doc, "file") def test_to_canonical_range_carries_date_end(): diff --git a/tools/import-normalizer/tests/test_writers.py b/tools/import-normalizer/tests/test_writers.py index 9f20d501..fe6dfbe9 100644 --- a/tools/import-normalizer/tests/test_writers.py +++ b/tools/import-normalizer/tests/test_writers.py @@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path): assert row["needs_review"] == "unparsed_date" -def test_write_documents_xlsx_carries_file_and_date_end(tmp_path): +def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path): + # #686: PDFs resolve by index (.pdf), so the redundant "file" column is dropped. doc = documents.CanonicalDocument( - index="H-0730", file="H-0730.pdf", date_iso="1917-01-10", + index="H-0730", date_iso="1917-01-10", date_precision="RANGE", date_end="1917-01-11") out = tmp_path / "docs.xlsx" writers.write_documents_xlsx([doc], out) wb = openpyxl.load_workbook(out) ws = wb.active header = [c.value for c in ws[1]] - assert "file" in header and "date_end" in header + assert "file" not in header + assert "date_end" in header row = {h: c.value for h, c in zip(header, ws[2])} - assert row["file"] == "H-0730.pdf" assert row["date_end"] == "1917-01-11" def test_write_documents_xlsx_pins_timestamp(tmp_path): diff --git a/tools/import-normalizer/writers.py b/tools/import-normalizer/writers.py index 5b9799e1..b7c3e816 100644 --- a/tools/import-normalizer/writers.py +++ b/tools/import-normalizer/writers.py @@ -22,7 +22,7 @@ def _csv_safe(value): return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s -DOC_COLUMNS = ["index", "file", "box", "folder", "sender_person_id", "sender_name", +DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name", "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary", "source_row", "needs_review"]