refactor(normalizer): drop file column now PDFs resolve by index

The import corpus is uniform: every PDF is named <index>.pdf, so the file column (the spreadsheet's datei value) is redundant. Remove the file field from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS, along with the now-moot index_file_mismatch review flag, CSV, and stat, and the datei header mapping. date_end and the tree person_id are kept. Refs #686 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 20:54:37 +02:00
parent 7183d15fe5
commit 227116fe2d
7 changed files with 17 additions and 46 deletions
--- a/tools/import-normalizer/documents.py
+++ b/tools/import-normalizer/documents.py
@@ -17,7 +17,6 @@ class Triage(Enum):
 class RawRow:
    source_row: int
    index: str = ""
-    file: str = ""
    box: str = ""
    folder: str = ""
    sender: str = ""
@@ -31,7 +30,6 @@ class RawRow:
@dataclass
 class CanonicalDocument:
    index: str
-    file: str = ""
    box: str = ""
    folder: str = ""
    sender_person_id: str = ""
@@ -49,7 +47,7 @@ class CanonicalDocument:
    needs_review: list = field(default_factory=list)


-_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
+_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]


 def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
@@ -82,15 +80,6 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    return "data_no_index"


-def index_file_mismatch(index: str, file_path: str) -> bool:
-    # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
-    if not file_path.strip():
-        return False
-    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
-    stem = basename.rsplit(".", 1)[0]
-    return stem != index
-
-
 def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
    pd = _dates.parse_date(raw.date, date_overrides)
    flags = []
@@ -109,11 +98,9 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
        flags.append("unparsed_date")
    if pd.needs_review:
        flags.append("range_end_unparsed")
-    if index_file_mismatch(raw.index, raw.file):
-        flags.append("index_file_mismatch")

    return CanonicalDocument(
-        index=raw.index, file=raw.file, box=raw.box, folder=raw.folder,
+        index=raw.index, box=raw.box, folder=raw.folder,
        sender_person_id=sender_id, sender_name=sender_name,
        receiver_person_ids=[r[0] for r in receivers],
        receiver_names=[r[1] for r in receivers],