Files
familienarchiv/tools/import-normalizer/documents.py
Marcel 09ba7e74e3 refactor(normalizer): drop file column now PDFs resolve by index
The import corpus is uniform: every PDF is named <index>.pdf, so the
file column (the spreadsheet's datei value) is redundant. Remove file
from CanonicalDocument, RawRow, _FIELDS, to_canonical, and DOC_COLUMNS,
plus the now-moot index_file_mismatch review flag/CSV/stat and the
datei header mapping. date_end and the tree person_id are kept.

Refs #686

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 20:54:37 +02:00

112 lines
3.5 KiB
Python

"""Document row extraction, triage, and the canonical document record."""
from dataclasses import dataclass, field
from enum import Enum, auto
import dates as _dates
import tags as _tags
class Triage(Enum):
    """Outcome of the first-pass check that `triage` performs on a row's cells."""

    # The row passed every check below.
    OK = auto()
    # No cell holds anything other than whitespace.
    EMPTY = auto()
    # At least one cell has content, but the index cell is missing or blank.
    BLANK_INDEX = auto()
    # The stripped index value ends with a lowercase "x".
    X_SUFFIX = auto()
@dataclass
class RawRow:
    """Cell text of one row, addressed by logical column name.

    `extract_row` fills the text attributes with stripped cell text; any
    attribute it has no value for stays at the empty-string default.
    """

    # Row identifier supplied by the caller; the only required attribute.
    source_row: int

    # Text columns. Each defaults to "" so a partially mapped row is valid.
    index: str = ""
    box: str = ""
    folder: str = ""
    sender: str = ""
    receivers: str = ""
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""
@dataclass
class CanonicalDocument:
    """Normalized document record, as assembled by `to_canonical`.

    Only `index` is required. Text attributes default to "", list attributes
    to a fresh empty list per instance, and `source_row` to 0.
    """

    index: str
    box: str = ""
    folder: str = ""

    # Sender as resolved through the context object passed to to_canonical.
    sender_person_id: str = ""
    sender_name: str = ""

    # Parallel lists: entry i of each describes the same resolved receiver.
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)

    # date_iso / date_end hold the parsed values ("" when absent), date_raw
    # the unmodified input text, date_precision the str() of the precision.
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""
    date_end: str = ""

    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""
    source_row: int = 0

    # Names of the review flags raised while building this record.
    needs_review: list = field(default_factory=list)
# Logical column names that extract_row copies from a row into RawRow.
# Each entry is passed as a keyword argument to RawRow, so it must match a
# RawRow attribute name exactly.
_FIELDS = [
    "index",
    "box",
    "folder",
    "sender",
    "receivers",
    "date",
    "location",
    "tags",
    "summary",
]
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a RawRow from one row of cells using a name-to-position mapping.

    Args:
        cells: The row's cell values in positional order.
        header: Maps logical field names (the entries of _FIELDS) to
            positions in ``cells``. Names missing from the mapping are
            treated as absent columns.
        source_row: Row identifier stored unchanged on the result.

    Returns:
        A RawRow whose text attributes hold the stripped cell text, or ""
        where the column is unmapped, lies outside the row, or the cell is
        empty/None.
    """

    def cell_text(field_name: str) -> str:
        column = header.get(field_name)
        # Reject negative positions as well as positions past the end: a
        # negative value would otherwise wrap around and silently read a
        # cell from the end of the row. Same bounds as the index lookup in
        # triage().
        if column is None or not 0 <= column < len(cells):
            return ""
        # Coerce through str() (as the emptiness checks elsewhere in this
        # module do) so a truthy non-string cell yields its text instead of
        # raising AttributeError on .strip().
        return str(cells[column] or "").strip()

    return RawRow(source_row=source_row, **{name: cell_text(name) for name in _FIELDS})
def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a row by its cells and its index value.

    The checks run in this order and the first match wins:

    1. ``Triage.EMPTY`` - no cell contains anything but whitespace.
    2. ``Triage.BLANK_INDEX`` - the index cell is out of range, None, or
       whitespace-only.
    3. ``Triage.X_SUFFIX`` - the stripped index ends with a lowercase "x".
    4. ``Triage.OK`` - none of the above.

    Args:
        cells: The row's cell values in positional order.
        index_col: Position of the index cell within ``cells``.

    Returns:
        The Triage member describing the row.
    """
    if not any(cell and str(cell).strip() for cell in cells):
        return Triage.EMPTY

    if 0 <= index_col < len(cells):
        # Coerce through str() exactly like the emptiness scan above, so a
        # truthy non-string index cell is classified instead of raising
        # AttributeError on .strip().
        index = str(cells[index_col] or "").strip()
    else:
        index = ""

    if not index:
        return Triage.BLANK_INDEX
    if index.endswith("x"):
        return Triage.X_SUFFIX
    return Triage.OK
def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: tell a section banner apart from a data row lacking an index.

    Args:
        cells: The row's cell values in positional order.
        header: Maps logical field names to positions in ``cells``; only the
            "sender" and "receivers" entries are consulted.

    Returns:
        "section_banner" when at least one cell is populated and every
        populated cell sits in the sender or receivers column; otherwise
        "data_no_index".
    """
    name_columns = set()
    for key in ("sender", "receivers"):
        column = header.get(key)
        if column is not None:
            name_columns.add(column)

    filled = set()
    for position, cell in enumerate(cells):
        if cell and str(cell).strip():
            filled.add(position)

    # A row with nothing populated is never a banner.
    if not filled:
        return "data_no_index"
    return "section_banner" if filled.issubset(name_columns) else "data_no_index"
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
    """Convert a raw row into a CanonicalDocument and collect review flags.

    Args:
        raw: Row record exposing the attributes read below (index, box,
            folder, sender, receivers, date, location, tags, summary,
            source_row).
        ctx: Object providing ``resolve_sender`` and ``resolve_receivers``.
        date_overrides: Forwarded unchanged to ``dates.parse_date``.
        approved_themes: Forwarded unchanged to ``tags.generate_tags``.

    Returns:
        The populated CanonicalDocument. Its ``needs_review`` list holds, in
        this order, whichever of these flags apply:

        * "unmatched_sender" - sender text is non-blank but was not matched
        * "multi_sender" - the sender resolution reported multiple senders
        * "unmatched_receiver" - at least one receiver was not matched
        * "unparsed_date" - date text is non-blank but precision is UNKNOWN
        * "range_end_unparsed" - the parsed date is marked as needing review
    """
    parsed = _dates.parse_date(raw.date, date_overrides)
    review_flags = []

    sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(
        raw.sender, raw.source_row
    )
    sender_text = raw.sender.strip()
    if sender_text and not sender_matched:
        review_flags.append("unmatched_sender")
    if sender_multi:
        review_flags.append("multi_sender")

    # Each resolved receiver is a 3-item entry: (person id, name, matched).
    resolved = ctx.resolve_receivers(raw.receivers, raw.source_row)
    if not all(matched for _, _, matched in resolved):
        review_flags.append("unmatched_receiver")

    date_text = raw.date.strip()
    if date_text and parsed.precision == _dates.Precision.UNKNOWN:
        review_flags.append("unparsed_date")
    if parsed.needs_review:
        review_flags.append("range_end_unparsed")

    return CanonicalDocument(
        index=raw.index,
        box=raw.box,
        folder=raw.folder,
        sender_person_id=sender_id,
        sender_name=sender_name,
        receiver_person_ids=[entry[0] for entry in resolved],
        receiver_names=[entry[1] for entry in resolved],
        date_iso=parsed.iso or "",
        date_raw=raw.date,
        date_precision=str(parsed.precision),
        date_end=parsed.end or "",
        location=raw.location,
        tags=_tags.generate_tags(raw.tags, raw.summary, approved_themes),
        summary=raw.summary,
        source_row=raw.source_row,
        needs_review=review_flags,
    )