refactor(normalizer): drop file column now that PDFs resolve by index
The import corpus is uniform: every PDF is named <index>.pdf, so the file column (the spreadsheet's datei value) is redundant.

Remove file from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS, plus the now-moot index_file_mismatch review flag/CSV/stat and the datei header mapping. date_end and the tree person_id are kept.

Refs #686
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -33,7 +33,7 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
|
||||
index_col = d_fields["index"]
|
||||
|
||||
canon_docs, blank_index, skipped_x, mismatches = [], [], [], []
|
||||
canon_docs, blank_index, skipped_x = [], [], []
|
||||
unparsed_by_raw: dict[str, list] = {}
|
||||
dates_by_override = 0
|
||||
empty_count = 0
|
||||
@@ -59,8 +59,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
|
||||
if "unparsed_date" in doc.needs_review:
|
||||
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
|
||||
if "index_file_mismatch" in doc.needs_review:
|
||||
mismatches.append([source_row, raw.index, raw.file])
|
||||
canon_docs.append(doc)
|
||||
|
||||
# REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
|
||||
@@ -102,7 +100,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
key=lambda r: (r[0], -r[2], r[1]))
|
||||
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
||||
["category", "raw", "count", "example_rows"], unresolved_rows)
|
||||
writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
|
||||
|
||||
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
|
||||
candidates = _tags.mine_summary_candidates(all_summaries)
|
||||
@@ -140,7 +137,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
"blank_index_rows": len(blank_index),
|
||||
"skipped_x_suffix": len(skipped_x),
|
||||
"duplicate_index_rows": len(duplicates),
|
||||
"index_file_mismatches": len(mismatches),
|
||||
"# OVERRIDES": "",
|
||||
"date_overrides_loaded": len(date_overrides),
|
||||
"name_overrides_loaded": len(name_overrides),
|
||||
|
||||
Reference in New Issue
Block a user