refactor(normalizer): drop file column now PDFs resolve by index

The import corpus is uniform: every PDF is named <index>.pdf, so the
file column (the spreadsheet's datei value) is redundant. Remove file
from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS,
plus the now-moot index_file_mismatch review flag/CSV/stat and the
datei header mapping. date_end and the tree person_id are kept.

Refs #686

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-27 20:54:37 +02:00
parent 929acf6964
commit 09ba7e74e3
7 changed files with 17 additions and 46 deletions

View File

@@ -33,7 +33,7 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
index_col = d_fields["index"]
canon_docs, blank_index, skipped_x, mismatches = [], [], [], []
canon_docs, blank_index, skipped_x = [], [], []
unparsed_by_raw: dict[str, list] = {}
dates_by_override = 0
empty_count = 0
@@ -59,8 +59,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
if "unparsed_date" in doc.needs_review:
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
if "index_file_mismatch" in doc.needs_review:
mismatches.append([source_row, raw.index, raw.file])
canon_docs.append(doc)
# REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
@@ -102,7 +100,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
key=lambda r: (r[0], -r[2], r[1]))
writers.write_review_csv(review_dir / "unresolved-names.csv",
["category", "raw", "count", "example_rows"], unresolved_rows)
writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
candidates = _tags.mine_summary_candidates(all_summaries)
@@ -140,7 +137,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
"blank_index_rows": len(blank_index),
"skipped_x_suffix": len(skipped_x),
"duplicate_index_rows": len(duplicates),
"index_file_mismatches": len(mismatches),
"# OVERRIDES": "",
"date_overrides_loaded": len(date_overrides),
"name_overrides_loaded": len(name_overrides),