familienarchiv/tools/import-normalizer/documents.py

"""Document row extraction, triage, and the canonical document record."""
from dataclasses import dataclass, field
from enum import Enum, auto


class Triage(Enum):
    """First-pass classification of a raw row, as returned by `triage`."""

    # Row has a non-blank index that does not end in "x".
    OK = auto()
    # Every cell in the row is None, empty, or whitespace-only.
    EMPTY = auto()
    # Row has some content, but the index cell is blank or out of range.
    BLANK_INDEX = auto()
    # Index value ends with a lowercase "x".
    X_SUFFIX = auto()


@dataclass
class RawRow:
    """One source row with its cell values captured as plain strings.

    `extract_row` fills every string field with the stripped cell text, or
    with "" when the column is missing from the header or the row is too
    short to contain it.
    """

    # Caller-supplied row identifier; presumably the row's position in the
    # source sheet -- confirm against the caller.
    source_row: int
    # The remaining fields mirror the names listed in `_FIELDS`.
    index: str = ""
    file: str = ""
    box: str = ""
    folder: str = ""
    sender: str = ""
    receivers: str = ""
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""


@dataclass
class CanonicalDocument:
    """Canonical (normalized) record for a single document.

    Only `index` is required; every other field has an empty default. The
    list-valued fields use `default_factory=list`, so each instance gets its
    own fresh list rather than sharing one.

    NOTE(review): nothing in this part of the module constructs this record,
    so the per-field notes below are inferred from the field names and should
    be confirmed against the code that populates them.
    """

    index: str
    box: str = ""
    folder: str = ""
    # Presumably the resolved person identifier and display name of the sender.
    sender_person_id: str = ""
    sender_name: str = ""
    # Presumably parallel lists (same length and order) -- TODO confirm.
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)
    # Presumably: normalized date, the original cell text, and how precise the
    # normalized date is -- TODO confirm the allowed precision values.
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""
    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""
    source_row: int = 0
    # Presumably review flags collected during normalization -- TODO confirm.
    needs_review: list = field(default_factory=list)


# Names of the string fields of RawRow. extract_row uses each name both as the
# key to look up a column position in the header mapping and as the keyword
# argument passed to RawRow.
_FIELDS = [
    "index",
    "file",
    "box",
    "folder",
    "sender",
    "receivers",
    "date",
    "location",
    "tags",
    "summary",
]


def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a RawRow from one row of cells.

    Args:
        cells: The row's cell values; a falsy cell (None or "") is read as "".
        header: Maps field names (see `_FIELDS`) to column positions.
        source_row: Row identifier stored unchanged on the result.

    Returns:
        A RawRow whose string fields hold the stripped cell text. A field is
        "" when its name is missing from `header` or its column position lies
        beyond the end of `cells`.
    """
    values = {}
    for name in _FIELDS:
        position = header.get(name)
        if position is not None and position < len(cells):
            values[name] = (cells[position] or "").strip()
        else:
            # Unknown column, or a short row that stops before this column.
            values[name] = ""
    return RawRow(source_row=source_row, **values)


def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a raw row before any field extraction.

    Args:
        cells: The row's cell values.
        index_col: Position of the index column within `cells`.

    Returns:
        Triage.EMPTY if no cell has non-whitespace content; otherwise
        Triage.BLANK_INDEX if the index cell is blank or `index_col` is out of
        range; otherwise Triage.X_SUFFIX if the index ends with "x";
        otherwise Triage.OK.
    """
    has_content = any(cell and str(cell).strip() for cell in cells)
    if not has_content:
        return Triage.EMPTY

    index = ""
    if 0 <= index_col < len(cells):
        index = (cells[index_col] or "").strip()

    if not index:
        return Triage.BLANK_INDEX
    return Triage.X_SUFFIX if index.endswith("x") else Triage.OK


def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """Classify a row whose index cell is blank (REQ-TRIAGE-02).

    Args:
        cells: The row's cell values.
        header: Maps field names to column positions; only the "sender" and
            "receivers" entries are consulted.

    Returns:
        "section_banner" if at least one cell is populated and every populated
        cell sits in a name column (sender or receivers); otherwise
        "data_no_index".
    """
    name_columns = set()
    for key in ("sender", "receivers"):
        column = header.get(key)
        if column is not None:
            name_columns.add(column)

    populated = set()
    for position, cell in enumerate(cells):
        if cell and str(cell).strip():
            populated.add(position)

    # A completely blank row is not a banner, hence the non-empty requirement.
    only_names = bool(populated) and populated.issubset(name_columns)
    return "section_banner" if only_names else "data_no_index"


def index_file_mismatch(index: str, file_path: str) -> bool:
    """Report whether the file name's stem differs from the document index.

    Assumes the file ("Datei") value is a filename with an extension (all
    corpus paths are *.pdf): everything after the last "." in the file name is
    dropped before comparing.

    Args:
        index: The document index to compare against.
        file_path: The file path or name; both "/" and "\\" count as
            directory separators.

    Returns:
        False when `file_path` is empty or whitespace-only (nothing to
        compare); otherwise True exactly when the stem is not equal to
        `index`.
    """
    if file_path.strip() == "":
        return False

    normalized = file_path.replace("\\", "/")
    basename = normalized.rpartition("/")[2]

    # Cut at the last dot; a name without any dot is taken whole.
    last_dot = basename.rfind(".")
    stem = basename if last_dot == -1 else basename[:last_dot]
    return index != stem