refactor(normalizer): drop file column now PDFs resolve by index

The import corpus is uniform: every PDF is named <index>.pdf, so the
file column (the spreadsheet's datei value) is redundant. Remove file
from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS,
plus the now-moot index_file_mismatch review flag/CSV/stat and the
datei header mapping. date_end and the tree person_id are kept.

Refs #686

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-27 20:54:37 +02:00
committed by marcel
parent 7183d15fe5
commit 227116fe2d
7 changed files with 17 additions and 46 deletions

View File

@@ -17,7 +17,6 @@ class Triage(Enum):
class RawRow:
source_row: int
index: str = ""
file: str = ""
box: str = ""
folder: str = ""
sender: str = ""
@@ -31,7 +30,6 @@ class RawRow:
@dataclass
class CanonicalDocument:
index: str
file: str = ""
box: str = ""
folder: str = ""
sender_person_id: str = ""
@@ -49,7 +47,7 @@ class CanonicalDocument:
needs_review: list = field(default_factory=list)
_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
@@ -82,15 +80,6 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
return "data_no_index"
def index_file_mismatch(index: str, file_path: str) -> bool:
    """Return True when the file name in ``file_path`` does not match ``index``.

    The final path component is taken (both ``/`` and ``\\`` count as
    separators), everything from its last ``.`` onward is dropped, and the
    remainder is compared to ``index`` for exact equality.

    A blank or whitespace-only ``file_path`` is never reported as a mismatch.

    Assumes the Datei value is a filename with an extension (all corpus
    paths are *.pdf); a name without any ``.`` is compared as-is.
    """
    if file_path.strip() == "":
        return False

    # Treat Windows-style separators like POSIX ones before isolating the name.
    _, _, filename = file_path.replace("\\", "/").rpartition("/")

    # Drop the extension: keep what precedes the last dot, if there is one.
    head, dot, _ = filename.rpartition(".")
    name_without_ext = head if dot else filename

    return name_without_ext != index
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
pd = _dates.parse_date(raw.date, date_overrides)
flags = []
@@ -109,11 +98,9 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
flags.append("unparsed_date")
if pd.needs_review:
flags.append("range_end_unparsed")
if index_file_mismatch(raw.index, raw.file):
flags.append("index_file_mismatch")
return CanonicalDocument(
index=raw.index, file=raw.file, box=raw.box, folder=raw.folder,
index=raw.index, box=raw.box, folder=raw.folder,
sender_person_id=sender_id, sender_name=sender_name,
receiver_person_ids=[r[0] for r in receivers],
receiver_names=[r[1] for r in receivers],