Files
familienarchiv/tools/import-normalizer/documents.py
2026-05-25 14:18:09 +02:00

119 lines
3.8 KiB
Python

"""Document row extraction, triage, and the canonical document record."""
from dataclasses import dataclass, field
from enum import Enum, auto
import dates as _dates
class Triage(Enum):
    """First-pass classification of a row of cells, as returned by ``triage``."""

    # The row has a non-blank index that needs no special handling.
    OK = auto()
    # No cell in the row holds any non-whitespace content.
    EMPTY = auto()
    # The row has content, but its index cell is blank or out of range.
    BLANK_INDEX = auto()
    # The index value ends with a lowercase "x".
    X_SUFFIX = auto()
@dataclass
class RawRow:
    """Text values of one input row, keyed by column name.

    Only ``source_row`` is required; every column value is a string that
    defaults to the empty string when not supplied.
    """

    # Row identifier supplied by the caller; also passed on to the resolvers.
    source_row: int

    # Document identity and file reference.
    index: str = ""
    file: str = ""

    # Storage position.
    box: str = ""
    folder: str = ""

    # People, still as free text.
    sender: str = ""
    receivers: str = ""

    # Remaining descriptive columns, still as free text.
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""
@dataclass
class CanonicalDocument:
    """Canonical document record built from a raw row.

    Only ``index`` is required. String fields default to the empty string,
    list fields to a fresh empty list per instance, and ``source_row`` to 0.
    """

    index: str

    # Storage position.
    box: str = ""
    folder: str = ""

    # Sender as resolved id plus display name.
    sender_person_id: str = ""
    sender_name: str = ""

    # Receivers as two parallel lists (ids and names, same order).
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)

    # Parsed date, the original text it came from, and the parse precision.
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""

    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""

    # Row identifier of the originating raw row.
    source_row: int = 0
    # Flag strings marking aspects of the record that need manual review.
    needs_review: list = field(default_factory=list)
# Column keys looked up in the header mapping by extract_row; each key is
# passed to RawRow as a keyword argument of the same name.
_FIELDS = [
    "index",
    "file",
    "box",
    "folder",
    "sender",
    "receivers",
    "date",
    "location",
    "tags",
    "summary",
]
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a ``RawRow`` from one row of cells.

    Args:
        cells: The row's cell values, addressed by position.
        header: Mapping from column key to cell position.
        source_row: Row identifier stored on the result unchanged.

    Returns:
        A ``RawRow`` with one stripped string per key in ``_FIELDS``. A key
        that is missing from ``header``, or whose position lies past the end
        of ``cells``, yields the empty string, as does a falsy cell.
    """

    def cell_text(field_name):
        position = header.get(field_name)
        if position is not None and position < len(cells):
            # Falsy cells (None, "") are treated as empty text.
            return (cells[position] or "").strip()
        return ""

    values = {}
    for name in _FIELDS:
        values[name] = cell_text(name)
    return RawRow(source_row=source_row, **values)
def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a row of cells before extraction.

    Args:
        cells: The row's cell values.
        index_col: Position of the index cell within ``cells``.

    Returns:
        ``Triage.EMPTY`` when no cell has non-whitespace content,
        ``Triage.BLANK_INDEX`` when the index cell is blank or ``index_col``
        is outside the row, ``Triage.X_SUFFIX`` when the index ends with a
        lowercase "x", and ``Triage.OK`` otherwise.
    """
    has_content = any(cell and str(cell).strip() for cell in cells)
    if not has_content:
        return Triage.EMPTY

    index_value = ""
    if 0 <= index_col < len(cells):
        index_value = (cells[index_col] or "").strip()

    if index_value == "":
        return Triage.BLANK_INDEX
    return Triage.X_SUFFIX if index_value.endswith("x") else Triage.OK
def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """Tell a section banner apart from a data row that lacks an index.

    REQ-TRIAGE-02: the row is a 'section_banner' when at least one cell is
    populated and every populated cell sits in the sender or receivers
    column; any other row is 'data_no_index'.

    Args:
        cells: The row's cell values.
        header: Mapping from column key to cell position.

    Returns:
        Either "section_banner" or "data_no_index".
    """
    # Positions of the name columns that the header actually defines.
    name_positions = {
        header[key] for key in ("sender", "receivers") if header.get(key) is not None
    }
    # Positions of all cells with non-whitespace content.
    filled_positions = {
        pos for pos, cell in enumerate(cells) if cell and str(cell).strip()
    }

    only_names_filled = bool(filled_positions) and filled_positions.issubset(name_positions)
    return "section_banner" if only_names_filled else "data_no_index"
def index_file_mismatch(index: str, file_path: str) -> bool:
    """Report whether the file name in *file_path* disagrees with *index*.

    The file name is the last path component (both "/" and "\\" count as
    separators) with everything after its final "." removed. Surrounding
    whitespace on the path, the file name stem and the index is ignored, so
    padding alone never counts as a mismatch.

    Args:
        index: The document index to compare against.
        file_path: The file reference; may be empty or whitespace only.

    Returns:
        False when *file_path* is blank (nothing to compare) or the stem
        equals *index*; True otherwise.
    """
    # Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
    path = file_path.strip()
    if not path:
        return False
    # Normalise Windows separators so one split handles both path styles.
    basename = path.replace("\\", "/").rsplit("/", 1)[-1]
    stem = basename.rsplit(".", 1)[0]
    return stem.strip() != index.strip()
def to_canonical(raw, ctx, date_overrides: dict) -> CanonicalDocument:
    """Convert a raw row into a ``CanonicalDocument`` and collect review flags.

    Args:
        raw: The raw row; its ``index``, ``file``, ``box``, ``folder``,
            ``sender``, ``receivers``, ``date``, ``location``, ``tags``,
            ``summary`` and ``source_row`` attributes are read.
        ctx: Object providing ``resolve_sender`` and ``resolve_receivers``.
        date_overrides: Passed through to ``_dates.parse_date``.

    Returns:
        The canonical record. ``needs_review`` holds, in this order, any of
        "unmatched_sender", "multi_sender", "unmatched_receiver",
        "unparsed_date" and "index_file_mismatch" that apply.
    """
    parsed = _dates.parse_date(raw.date, date_overrides)
    review_flags = []

    sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(
        raw.sender, raw.source_row
    )
    # A blank sender cell is not an unmatched sender.
    sender_given = bool(raw.sender.strip())
    if sender_given and not sender_matched:
        review_flags.append("unmatched_sender")
    if sender_multi:
        review_flags.append("multi_sender")

    # Each receiver entry is an (id, name, matched) triple.
    receivers = ctx.resolve_receivers(raw.receivers, raw.source_row)
    if not all(matched for _, _, matched in receivers):
        review_flags.append("unmatched_receiver")

    # A blank date cell is not an unparsed date.
    if raw.date.strip() and parsed.precision == _dates.Precision.UNKNOWN:
        review_flags.append("unparsed_date")

    if index_file_mismatch(raw.index, raw.file):
        review_flags.append("index_file_mismatch")

    receiver_ids = [entry[0] for entry in receivers]
    receiver_names = [entry[1] for entry in receivers]

    # The tags cell is kept whole as a single list element.
    tag_list = []
    if raw.tags:
        tag_list.append(raw.tags)

    return CanonicalDocument(
        index=raw.index,
        box=raw.box,
        folder=raw.folder,
        sender_person_id=sender_id,
        sender_name=sender_name,
        receiver_person_ids=receiver_ids,
        receiver_names=receiver_names,
        date_iso=parsed.iso or "",
        date_raw=raw.date,
        date_precision=str(parsed.precision),
        location=raw.location,
        tags=tag_list,
        summary=raw.summary,
        source_row=raw.source_row,
        needs_review=review_flags,
    )