"""Document row extraction, triage, and the canonical document record."""
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum, auto
|
|
|
|
import dates as _dates
|
|
|
|
|
|
class Triage(Enum):
    """Result of the first-pass check that ``triage`` performs on a row."""

    # Values are written out explicitly; they are the same integers that
    # ``auto()`` hands out for the first four members (1, 2, 3, 4).
    OK = 1           # row has a usable index
    EMPTY = 2        # no cell holds non-whitespace content
    BLANK_INDEX = 3  # row has content but the index cell is blank
    X_SUFFIX = 4     # index ends with a lowercase "x"
|
|
|
|
|
|
@dataclass
class RawRow:
    """One row's cell text, keyed by field name, before any interpretation.

    Only ``source_row`` is required; every text field defaults to ``""``.
    """

    # Identifier of the originating row, carried through unchanged.
    source_row: int

    # Identification and filing.
    index: str = ""
    file: str = ""
    box: str = ""
    folder: str = ""

    # Correspondents, still as raw text.
    sender: str = ""
    receivers: str = ""

    # Descriptive columns, still as raw text.
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""
|
|
|
|
|
|
@dataclass
class CanonicalDocument:
    """Normalised record for a single document.

    Only ``index`` is required. Text fields default to ``""``, list fields to
    a fresh empty list per instance, and ``source_row`` to ``0``.
    """

    # Identification and filing.
    index: str
    box: str = ""
    folder: str = ""

    # Resolved sender.
    sender_person_id: str = ""
    sender_name: str = ""

    # Resolved receivers, kept as two parallel lists.
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)

    # Date: parsed ISO form, the original text, and the parser's precision.
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""

    # Descriptive content.
    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""

    # Provenance and the names of any review flags raised for this record.
    source_row: int = 0
    needs_review: list = field(default_factory=list)
|
|
|
|
|
|
# Header keys that extract_row looks up; each one is also a RawRow field name.
_FIELDS = [
    "index",
    "file",
    "box",
    "folder",
    "sender",
    "receivers",
    "date",
    "location",
    "tags",
    "summary",
]
|
|
|
|
|
|
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a ``RawRow`` from one row of cells using a column-name lookup.

    Args:
        cells: The row's cell values in column order. Entries may be ``None``
            or non-string values; they are normalised to stripped text.
        header: Maps field names (the entries of ``_FIELDS``) to the position
            of the corresponding column in ``cells``.
        source_row: Row identifier, stored unchanged on the result.

    Returns:
        A ``RawRow`` whose text fields are whitespace-stripped. A field
        becomes ``""`` when its name is missing from ``header``, when its
        position lies past the end of ``cells``, or when the cell is falsy.
    """

    def get(field_name):
        idx = header.get(field_name)
        if idx is None or idx >= len(cells):
            return ""
        # str() keeps a non-string cell (e.g. a number) from failing on
        # .strip(); falsy cells such as None still collapse to "".
        return str(cells[idx] or "").strip()

    return RawRow(source_row=source_row, **{f: get(f) for f in _FIELDS})
|
|
|
|
|
|
def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a row by its overall content and its index cell.

    The checks run in this order and the first match wins:

    1. ``Triage.EMPTY`` - no cell holds non-whitespace content.
    2. ``Triage.BLANK_INDEX`` - the index cell is blank, or ``index_col``
       does not address a cell in this row.
    3. ``Triage.X_SUFFIX`` - the stripped index ends with a lowercase ``"x"``.
    4. ``Triage.OK`` - anything else.

    Args:
        cells: The row's cell values in column order. Entries may be ``None``
            or non-string values.
        index_col: Position of the index column within ``cells``.

    Returns:
        The matching ``Triage`` member.
    """
    if not any(c and str(c).strip() for c in cells):
        return Triage.EMPTY

    in_range = 0 <= index_col < len(cells)
    # Coerce through str() so a non-string index cell is handled the same way
    # as in the emptiness check above instead of failing on .strip().
    index = str(cells[index_col] or "").strip() if in_range else ""
    if not index:
        return Triage.BLANK_INDEX
    if index.endswith("x"):
        return Triage.X_SUFFIX
    return Triage.OK
|
|
|
|
|
|
def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: 'section_banner' if only name columns are populated, else 'data_no_index'.

    The name columns are the positions ``header`` gives for ``"sender"`` and
    ``"receivers"``. A cell counts as populated when it is truthy and is not
    whitespace-only. A row with no populated cell is ``'data_no_index'``.
    """
    allowed = set()
    for key in ("sender", "receivers"):
        position = header.get(key)
        if position is not None:
            allowed.add(position)

    filled = [pos for pos, cell in enumerate(cells) if cell and str(cell).strip()]

    if filled and all(pos in allowed for pos in filled):
        return "section_banner"
    return "data_no_index"
|
|
|
|
|
|
def index_file_mismatch(index: str, file_path: str) -> bool:
    """Report whether the file name's stem differs from ``index``.

    Args:
        index: The document index to compare against.
        file_path: Path or bare file name; ``/`` and ``\\`` both count as
            directory separators.

    Returns:
        ``False`` when ``file_path`` is empty or whitespace-only. Otherwise
        ``True`` exactly when the last path component, with everything from
        its final ``.`` onwards removed, is not equal to ``index``.
    """
    # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
    if file_path.strip() == "":
        return False

    leaf = file_path.replace("\\", "/").rpartition("/")[2]
    head, dot, _ext = leaf.rpartition(".")
    # With no dot at all, the whole component is the stem.
    stem = head if dot else leaf
    return index != stem
|
|
|
|
|
|
def to_canonical(raw, ctx, date_overrides: dict) -> CanonicalDocument:
    """Turn an extracted row into a ``CanonicalDocument`` with review flags.

    Args:
        raw: The extracted row (presumably a ``RawRow``); its ``index``,
            ``file``, ``box``, ``folder``, ``sender``, ``receivers``,
            ``date``, ``location``, ``tags``, ``summary`` and ``source_row``
            attributes are read.
        ctx: Resolver object. ``resolve_sender`` must return a 4-tuple
            ``(id, name, matched, multi)`` and ``resolve_receivers`` a
            sequence of ``(id, name, matched)`` triples.
        date_overrides: Passed straight through to ``parse_date``.

    Returns:
        The populated record. ``needs_review`` holds, in this order, whichever
        of these apply: ``unmatched_sender``, ``multi_sender``,
        ``unmatched_receiver``, ``unparsed_date``, ``index_file_mismatch``.
    """
    parsed = _dates.parse_date(raw.date, date_overrides)

    resolved_sender = ctx.resolve_sender(raw.sender, raw.source_row)
    sender_id, sender_name, sender_matched, sender_multi = resolved_sender
    # A blank sender cell is never reported as unmatched.
    sender_unresolved = bool(raw.sender.strip()) and not sender_matched

    receivers = ctx.resolve_receivers(raw.receivers, raw.source_row)
    receiver_unresolved = not all(matched for _, _, matched in receivers)

    # Only a non-blank date that the parser could not place is flagged.
    date_unparsed = (
        bool(raw.date.strip())
        and parsed.precision == _dates.Precision.UNKNOWN
    )

    # Insertion order fixes the order of the emitted flag names.
    findings = {
        "unmatched_sender": sender_unresolved,
        "multi_sender": sender_multi,
        "unmatched_receiver": receiver_unresolved,
        "unparsed_date": date_unparsed,
        "index_file_mismatch": index_file_mismatch(raw.index, raw.file),
    }
    flags = [name for name, raised in findings.items() if raised]

    receiver_ids = [entry[0] for entry in receivers]
    receiver_names = [entry[1] for entry in receivers]
    # The tags cell is kept whole as a single list element, not split.
    tag_list = [raw.tags] if raw.tags else []

    return CanonicalDocument(
        index=raw.index,
        box=raw.box,
        folder=raw.folder,
        sender_person_id=sender_id,
        sender_name=sender_name,
        receiver_person_ids=receiver_ids,
        receiver_names=receiver_names,
        date_iso=parsed.iso or "",
        date_raw=raw.date,
        date_precision=str(parsed.precision),
        location=raw.location,
        tags=tag_list,
        summary=raw.summary,
        source_row=raw.source_row,
        needs_review=flags,
    )
|