refactor(normalizer): drop file column now that PDFs resolve by index
The import corpus is uniform: every PDF is named <index>.pdf, so the file column (the spreadsheet's datei value) is redundant. Remove file from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS, along with the now-moot index_file_mismatch review flag, review CSV, and stat, and the datei header mapping. date_end and the tree person_id are kept.

Refs #686
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -26,7 +26,6 @@ Outputs:
|
|||||||
| --- | --- |
|
| --- | --- |
|
||||||
| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). |
|
| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). |
|
||||||
| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). |
|
| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). |
|
||||||
| `index-file-mismatch.csv` | The `Datei` path disagrees with the index-derived filename — reconcile when the PDFs arrive. |
|
|
||||||
| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. |
|
| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. |
|
||||||
|
|
||||||
> `unresolved-names.csv` is the focused "names that need a human" list. Non-family
|
> `unresolved-names.csv` is the focused "names that need a human" list. Non-family
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ OVERRIDES_DIR = BASE_DIR / "overrides"
|
|||||||
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
|
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
|
||||||
DOCUMENT_HEADER_MAP = {
|
DOCUMENT_HEADER_MAP = {
|
||||||
"index": "index",
|
"index": "index",
|
||||||
"datei": "file",
|
|
||||||
"box": "box",
|
"box": "box",
|
||||||
"mappe": "folder",
|
"mappe": "folder",
|
||||||
"briefeschreiberin": "sender",
|
"briefeschreiberin": "sender",
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ class Triage(Enum):
|
|||||||
class RawRow:
|
class RawRow:
|
||||||
source_row: int
|
source_row: int
|
||||||
index: str = ""
|
index: str = ""
|
||||||
file: str = ""
|
|
||||||
box: str = ""
|
box: str = ""
|
||||||
folder: str = ""
|
folder: str = ""
|
||||||
sender: str = ""
|
sender: str = ""
|
||||||
@@ -31,7 +30,6 @@ class RawRow:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class CanonicalDocument:
|
class CanonicalDocument:
|
||||||
index: str
|
index: str
|
||||||
file: str = ""
|
|
||||||
box: str = ""
|
box: str = ""
|
||||||
folder: str = ""
|
folder: str = ""
|
||||||
sender_person_id: str = ""
|
sender_person_id: str = ""
|
||||||
@@ -49,7 +47,7 @@ class CanonicalDocument:
|
|||||||
needs_review: list = field(default_factory=list)
|
needs_review: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
||||||
|
|
||||||
|
|
||||||
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
|
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
|
||||||
@@ -82,15 +80,6 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
|
|||||||
return "data_no_index"
|
return "data_no_index"
|
||||||
|
|
||||||
|
|
||||||
def index_file_mismatch(index: str, file_path: str) -> bool:
|
|
||||||
# Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
|
|
||||||
if not file_path.strip():
|
|
||||||
return False
|
|
||||||
basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
|
|
||||||
stem = basename.rsplit(".", 1)[0]
|
|
||||||
return stem != index
|
|
||||||
|
|
||||||
|
|
||||||
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
|
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
|
||||||
pd = _dates.parse_date(raw.date, date_overrides)
|
pd = _dates.parse_date(raw.date, date_overrides)
|
||||||
flags = []
|
flags = []
|
||||||
@@ -109,11 +98,9 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
|
|||||||
flags.append("unparsed_date")
|
flags.append("unparsed_date")
|
||||||
if pd.needs_review:
|
if pd.needs_review:
|
||||||
flags.append("range_end_unparsed")
|
flags.append("range_end_unparsed")
|
||||||
if index_file_mismatch(raw.index, raw.file):
|
|
||||||
flags.append("index_file_mismatch")
|
|
||||||
|
|
||||||
return CanonicalDocument(
|
return CanonicalDocument(
|
||||||
index=raw.index, file=raw.file, box=raw.box, folder=raw.folder,
|
index=raw.index, box=raw.box, folder=raw.folder,
|
||||||
sender_person_id=sender_id, sender_name=sender_name,
|
sender_person_id=sender_id, sender_name=sender_name,
|
||||||
receiver_person_ids=[r[0] for r in receivers],
|
receiver_person_ids=[r[0] for r in receivers],
|
||||||
receiver_names=[r[1] for r in receivers],
|
receiver_names=[r[1] for r in receivers],
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
|
d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
|
||||||
index_col = d_fields["index"]
|
index_col = d_fields["index"]
|
||||||
|
|
||||||
canon_docs, blank_index, skipped_x, mismatches = [], [], [], []
|
canon_docs, blank_index, skipped_x = [], [], []
|
||||||
unparsed_by_raw: dict[str, list] = {}
|
unparsed_by_raw: dict[str, list] = {}
|
||||||
dates_by_override = 0
|
dates_by_override = 0
|
||||||
empty_count = 0
|
empty_count = 0
|
||||||
@@ -59,8 +59,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
|
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
|
||||||
if "unparsed_date" in doc.needs_review:
|
if "unparsed_date" in doc.needs_review:
|
||||||
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
|
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
|
||||||
if "index_file_mismatch" in doc.needs_review:
|
|
||||||
mismatches.append([source_row, raw.index, raw.file])
|
|
||||||
canon_docs.append(doc)
|
canon_docs.append(doc)
|
||||||
|
|
||||||
# REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
|
# REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
|
||||||
@@ -102,7 +100,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
key=lambda r: (r[0], -r[2], r[1]))
|
key=lambda r: (r[0], -r[2], r[1]))
|
||||||
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
||||||
["category", "raw", "count", "example_rows"], unresolved_rows)
|
["category", "raw", "count", "example_rows"], unresolved_rows)
|
||||||
writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
|
|
||||||
|
|
||||||
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
|
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
|
||||||
candidates = _tags.mine_summary_candidates(all_summaries)
|
candidates = _tags.mine_summary_candidates(all_summaries)
|
||||||
@@ -140,7 +137,6 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
|||||||
"blank_index_rows": len(blank_index),
|
"blank_index_rows": len(blank_index),
|
||||||
"skipped_x_suffix": len(skipped_x),
|
"skipped_x_suffix": len(skipped_x),
|
||||||
"duplicate_index_rows": len(duplicates),
|
"duplicate_index_rows": len(duplicates),
|
||||||
"index_file_mismatches": len(mismatches),
|
|
||||||
"# OVERRIDES": "",
|
"# OVERRIDES": "",
|
||||||
"date_overrides_loaded": len(date_overrides),
|
"date_overrides_loaded": len(date_overrides),
|
||||||
"name_overrides_loaded": len(name_overrides),
|
"name_overrides_loaded": len(name_overrides),
|
||||||
|
|||||||
@@ -3,9 +3,9 @@ import documents
|
|||||||
from documents import Triage
|
from documents import Triage
|
||||||
|
|
||||||
def test_extract_row():
|
def test_extract_row():
|
||||||
header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4,
|
header = {"index": 0, "box": 1, "folder": 2, "sender": 3,
|
||||||
"receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9}
|
"receivers": 4, "date": 5, "location": 6, "tags": 7, "summary": 8}
|
||||||
cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
|
cells = ["W-0001", "V", "1", "Walter de Gruyter",
|
||||||
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
|
"Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
|
||||||
raw = documents.extract_row(cells, header, source_row=3)
|
raw = documents.extract_row(cells, header, source_row=3)
|
||||||
assert raw.index == "W-0001"
|
assert raw.index == "W-0001"
|
||||||
@@ -26,14 +26,6 @@ def test_classify_blank_index():
|
|||||||
assert documents.classify_blank_index(banner, header) == "section_banner"
|
assert documents.classify_blank_index(banner, header) == "section_banner"
|
||||||
assert documents.classify_blank_index(data, header) == "data_no_index"
|
assert documents.classify_blank_index(data, header) == "data_no_index"
|
||||||
|
|
||||||
def test_index_file_mismatch():
|
|
||||||
assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
|
|
||||||
assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
|
|
||||||
assert documents.index_file_mismatch("W-0001", "") is False
|
|
||||||
assert documents.index_file_mismatch("W-0001", "scans/W-0001.pdf") is False # unix path
|
|
||||||
assert documents.index_file_mismatch("W-0001", "W-0001.pdf") is False # no dir
|
|
||||||
|
|
||||||
|
|
||||||
def _ctx():
|
def _ctx():
|
||||||
people = persons.parse_register([
|
people = persons.parse_register([
|
||||||
{"last_name": "de Gruyter", "first_name": "Walter"},
|
{"last_name": "de Gruyter", "first_name": "Walter"},
|
||||||
@@ -46,22 +38,19 @@ def test_to_canonical_resolves_and_flags():
|
|||||||
raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
|
raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1",
|
||||||
sender="Walter de Gruyter", receivers="Eugenie Müller",
|
sender="Walter de Gruyter", receivers="Eugenie Müller",
|
||||||
date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
|
date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
|
||||||
summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf")
|
summary="Geschäftsreise")
|
||||||
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||||
assert doc.sender_person_id == "de-gruyter-walter"
|
assert doc.sender_person_id == "de-gruyter-walter"
|
||||||
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
|
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
|
||||||
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
|
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
|
||||||
assert doc.tags == ["Themen/Brautbriefe"]
|
assert doc.tags == ["Themen/Brautbriefe"]
|
||||||
assert doc.file == r"..\__scan\W-0001.pdf" # file name carried through for the importer
|
|
||||||
assert doc.needs_review == []
|
assert doc.needs_review == []
|
||||||
|
|
||||||
|
|
||||||
def test_to_canonical_carries_file_name():
|
def test_canonical_document_has_no_file_field():
|
||||||
ctx = _ctx()
|
# #686: PDFs resolve by index (<index>.pdf) in the importer; the file field is gone.
|
||||||
raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="",
|
doc = documents.CanonicalDocument(index="W-0001")
|
||||||
file="H-0730.pdf")
|
assert not hasattr(doc, "file")
|
||||||
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
|
||||||
assert doc.file == "H-0730.pdf"
|
|
||||||
|
|
||||||
|
|
||||||
def test_to_canonical_range_carries_date_end():
|
def test_to_canonical_range_carries_date_end():
|
||||||
|
|||||||
@@ -32,18 +32,19 @@ def test_write_documents_xlsx_joins_lists(tmp_path):
|
|||||||
assert row["needs_review"] == "unparsed_date"
|
assert row["needs_review"] == "unparsed_date"
|
||||||
|
|
||||||
|
|
||||||
def test_write_documents_xlsx_carries_file_and_date_end(tmp_path):
|
def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path):
|
||||||
|
# #686: PDFs resolve by index (<index>.pdf), so the redundant "file" column is dropped.
|
||||||
doc = documents.CanonicalDocument(
|
doc = documents.CanonicalDocument(
|
||||||
index="H-0730", file="H-0730.pdf", date_iso="1917-01-10",
|
index="H-0730", date_iso="1917-01-10",
|
||||||
date_precision="RANGE", date_end="1917-01-11")
|
date_precision="RANGE", date_end="1917-01-11")
|
||||||
out = tmp_path / "docs.xlsx"
|
out = tmp_path / "docs.xlsx"
|
||||||
writers.write_documents_xlsx([doc], out)
|
writers.write_documents_xlsx([doc], out)
|
||||||
wb = openpyxl.load_workbook(out)
|
wb = openpyxl.load_workbook(out)
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
header = [c.value for c in ws[1]]
|
header = [c.value for c in ws[1]]
|
||||||
assert "file" in header and "date_end" in header
|
assert "file" not in header
|
||||||
|
assert "date_end" in header
|
||||||
row = {h: c.value for h, c in zip(header, ws[2])}
|
row = {h: c.value for h, c in zip(header, ws[2])}
|
||||||
assert row["file"] == "H-0730.pdf"
|
|
||||||
assert row["date_end"] == "1917-01-11"
|
assert row["date_end"] == "1917-01-11"
|
||||||
|
|
||||||
def test_write_documents_xlsx_pins_timestamp(tmp_path):
|
def test_write_documents_xlsx_pins_timestamp(tmp_path):
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ def _csv_safe(value):
|
|||||||
return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s
|
return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s
|
||||||
|
|
||||||
|
|
||||||
DOC_COLUMNS = ["index", "file", "box", "folder", "sender_person_id", "sender_name",
|
DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name",
|
||||||
"receiver_person_ids", "receiver_names", "date_iso", "date_raw",
|
"receiver_person_ids", "receiver_names", "date_iso", "date_raw",
|
||||||
"date_precision", "date_end", "location", "tags", "summary",
|
"date_precision", "date_end", "location", "tags", "summary",
|
||||||
"source_row", "needs_review"]
|
"source_row", "needs_review"]
|
||||||
|
|||||||
Reference in New Issue
Block a user