refactor(normalizer): drop file column now PDFs resolve by index
The import corpus is uniform: every PDF is named `<index>.pdf`, so the `file` column (the spreadsheet's `datei` value) is redundant.

Remove `file` from `CanonicalDocument`, `RawRow`, `_FIELDS`, `to_canonical`, and `DOC_COLUMNS`, plus the now-moot `index_file_mismatch` review flag/CSV/stat and the `datei` header mapping. `date_end` and the tree `person_id` are kept.

Refs #686

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -17,7 +17,6 @@ class Triage(Enum):
|
||||
class RawRow:
|
||||
source_row: int
|
||||
index: str = ""
|
||||
file: str = ""
|
||||
box: str = ""
|
||||
folder: str = ""
|
||||
sender: str = ""
|
||||
@@ -31,7 +30,6 @@ class RawRow:
|
||||
@dataclass
|
||||
class CanonicalDocument:
|
||||
index: str
|
||||
file: str = ""
|
||||
box: str = ""
|
||||
folder: str = ""
|
||||
sender_person_id: str = ""
|
||||
@@ -49,7 +47,7 @@ class CanonicalDocument:
|
||||
needs_review: list = field(default_factory=list)
|
||||
|
||||
|
||||
_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
||||
_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
||||
|
||||
|
||||
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
|
||||
@@ -82,15 +80,6 @@ def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
|
||||
return "data_no_index"
|
||||
|
||||
|
||||
def index_file_mismatch(index: str, file_path: str) -> bool:
    """Return True when the file name in *file_path* disagrees with *index*.

    The file name is the last path component (both ``/`` and ``\\`` are
    accepted as separators) with everything after its final ``.`` removed.

    Args:
        index: The document index the file name is expected to equal.
        file_path: The raw file cell value; may be blank or carry stray
            surrounding whitespace.

    Returns:
        False for a blank *file_path* (nothing to compare), otherwise
        whether the extracted stem differs from *index*.
    """
    # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
    # Strip once and reuse the cleaned value: the blank check already ignores
    # surrounding whitespace, so the comparison must ignore it too, otherwise
    # a cell like " 0001.pdf" is reported as a mismatch.
    cleaned = file_path.strip()
    if not cleaned:
        return False
    # Normalise Windows separators so a single split finds the last component.
    basename = cleaned.replace("\\", "/").rsplit("/", 1)[-1]
    # Drop only the final extension; "a.b.pdf" keeps the stem "a.b".
    stem = basename.rsplit(".", 1)[0]
    return stem != index
|
||||
|
||||
|
||||
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
|
||||
pd = _dates.parse_date(raw.date, date_overrides)
|
||||
flags = []
|
||||
@@ -109,11 +98,9 @@ def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = fr
|
||||
flags.append("unparsed_date")
|
||||
if pd.needs_review:
|
||||
flags.append("range_end_unparsed")
|
||||
if index_file_mismatch(raw.index, raw.file):
|
||||
flags.append("index_file_mismatch")
|
||||
|
||||
return CanonicalDocument(
|
||||
index=raw.index, file=raw.file, box=raw.box, folder=raw.folder,
|
||||
index=raw.index, box=raw.box, folder=raw.folder,
|
||||
sender_person_id=sender_id, sender_name=sender_name,
|
||||
receiver_person_ids=[r[0] for r in receivers],
|
||||
receiver_names=[r[1] for r in receivers],
|
||||
|
||||
Reference in New Issue
Block a user