"""Document row extraction, triage, and the canonical document record.""" from dataclasses import dataclass, field from enum import Enum, auto import dates as _dates import tags as _tags class Triage(Enum): OK = auto() EMPTY = auto() BLANK_INDEX = auto() X_SUFFIX = auto() @dataclass class RawRow: source_row: int index: str = "" file: str = "" box: str = "" folder: str = "" sender: str = "" receivers: str = "" date: str = "" location: str = "" tags: str = "" summary: str = "" @dataclass class CanonicalDocument: index: str file: str = "" box: str = "" folder: str = "" sender_person_id: str = "" sender_name: str = "" receiver_person_ids: list = field(default_factory=list) receiver_names: list = field(default_factory=list) date_iso: str = "" date_raw: str = "" date_precision: str = "" date_end: str = "" location: str = "" tags: list = field(default_factory=list) summary: str = "" source_row: int = 0 needs_review: list = field(default_factory=list) _FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"] def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow: def get(field_name): idx = header.get(field_name) if idx is None or idx >= len(cells): return "" return (cells[idx] or "").strip() return RawRow(source_row=source_row, **{f: get(f) for f in _FIELDS}) def triage(cells: list[str], index_col: int = 0) -> Triage: nonempty = [c for c in cells if c and str(c).strip()] if not nonempty: return Triage.EMPTY index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) else "" if not index: return Triage.BLANK_INDEX if index.endswith("x"): return Triage.X_SUFFIX return Triage.OK def classify_blank_index(cells: list[str], header: dict[str, int]) -> str: """REQ-TRIAGE-02: 'section_banner' if only name columns are populated, else 'data_no_index'.""" name_cols = {header.get("sender"), header.get("receivers")} - {None} populated = {i for i, c in enumerate(cells) if c and str(c).strip()} if populated and populated <= name_cols: return "section_banner" return "data_no_index" def index_file_mismatch(index: str, file_path: str) -> bool: # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf). if not file_path.strip(): return False basename = file_path.replace("\\", "/").rsplit("/", 1)[-1] stem = basename.rsplit(".", 1)[0] return stem != index def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument: pd = _dates.parse_date(raw.date, date_overrides) flags = [] sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(raw.sender, raw.source_row) if raw.sender.strip() and not sender_matched: flags.append("unmatched_sender") if sender_multi: flags.append("multi_sender") receivers = ctx.resolve_receivers(raw.receivers, raw.source_row) if any(not matched for _, _, matched in receivers): flags.append("unmatched_receiver") if raw.date.strip() and pd.precision == _dates.Precision.UNKNOWN: flags.append("unparsed_date") if pd.needs_review: flags.append("range_end_unparsed") if index_file_mismatch(raw.index, raw.file): flags.append("index_file_mismatch") return CanonicalDocument( index=raw.index, file=raw.file, box=raw.box, folder=raw.folder, sender_person_id=sender_id, sender_name=sender_name, receiver_person_ids=[r[0] for r in receivers], receiver_names=[r[1] for r in receivers], date_iso=pd.iso or "", date_raw=raw.date, date_precision=str(pd.precision), date_end=pd.end or "", location=raw.location, tags=_tags.generate_tags(raw.tags, raw.summary, approved_themes), summary=raw.summary, source_row=raw.source_row, needs_review=flags, )