feat(normalizer): row extraction, triage, canonical record
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
85
tools/import-normalizer/documents.py
Normal file
85
tools/import-normalizer/documents.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Document row extraction, triage, and the canonical document record."""
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, auto
|
||||
|
||||
|
||||
class Triage(Enum):
    """First-pass verdict on a row, as returned by ``triage()``.

    The members are listed in the order ``triage()`` tests for them; the
    numeric values come from ``auto()`` and carry no meaning of their own.
    """

    # Row has a non-blank index that does not end in "x".
    OK = auto()
    # Row has no non-blank cell at all.
    EMPTY = auto()
    # Row has content, but its index cell is blank or missing.
    BLANK_INDEX = auto()
    # Row's index ends with a lowercase "x".
    X_SUFFIX = auto()
|
||||
|
||||
|
||||
@dataclass
class RawRow:
    """One input row as plain text, before any normalization.

    Only ``source_row`` is required. Every column field defaults to the
    empty string, and the column field names mirror the entries of
    ``_FIELDS`` so that ``extract_row()`` can fill them by keyword.
    """

    # Row identifier supplied by the caller; stored as given.
    source_row: int

    # Document identity and backing file.
    index: str = ""
    file: str = ""

    # Physical filing position.
    box: str = ""
    folder: str = ""

    # Correspondents, still as unparsed cell text.
    sender: str = ""
    receivers: str = ""

    # Remaining descriptive columns, still as unparsed cell text.
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""
|
||||
|
||||
|
||||
@dataclass
class CanonicalDocument:
    """The canonical (normalized) record for one document.

    Only ``index`` is required. String fields default to ``""``, list
    fields default to a fresh empty list per instance, and ``source_row``
    defaults to ``0``.
    """

    # Document identifier; the only field without a default.
    index: str

    # Physical filing position.
    box: str = ""
    folder: str = ""

    # Sender, as an id plus a display name.
    # NOTE(review): presumably the id refers to a resolved person record --
    # confirm against the code that populates it.
    sender_person_id: str = ""
    sender_name: str = ""

    # Receivers, as two lists.
    # NOTE(review): presumably index-aligned with each other -- confirm.
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)

    # Date in three forms.
    # NOTE(review): presumably normalized value, original cell text and a
    # precision label -- confirm the exact formats with the date parser.
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""

    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""

    # Row identifier of the originating input row (0 when not set).
    source_row: int = 0

    # NOTE(review): presumably the reasons this record was flagged for
    # manual review -- confirm the element type with the producer.
    needs_review: list = field(default_factory=list)
|
||||
|
||||
|
||||
_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
||||
|
||||
|
||||
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a ``RawRow`` from one row of cells.

    Args:
        cells: The row's cell values in column order. Entries may be
            ``None``; non-string entries are converted with ``str()``.
        header: Maps a field name from ``_FIELDS`` to its column position
            in ``cells``. Fields missing from the map are left empty.
        source_row: Row identifier stored unchanged on the result.

    Returns:
        A ``RawRow`` in which every field named in ``_FIELDS`` holds the
        whitespace-stripped cell text, or ``""`` when the column is not in
        ``header``, lies past the end of ``cells``, or the cell is falsy.
    """

    def cell_text(field_name: str) -> str:
        column = header.get(field_name)
        # Short rows are tolerated: a column past the end reads as empty.
        if column is None or column >= len(cells):
            return ""
        # str() keeps a non-string cell (e.g. a number) from raising on
        # .strip(), matching the str(c).strip() coercion used by triage().
        return str(cells[column] or "").strip()

    values = {name: cell_text(name) for name in _FIELDS}
    return RawRow(source_row=source_row, **values)
|
||||
|
||||
|
||||
def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a row by its overall emptiness and its index cell.

    The checks run in this order, and the first match wins:

    1. no cell has non-whitespace content -> ``Triage.EMPTY``
    2. the index cell is missing, falsy or whitespace -> ``Triage.BLANK_INDEX``
    3. the stripped index ends with a lowercase ``"x"`` -> ``Triage.X_SUFFIX``
    4. otherwise -> ``Triage.OK``

    Args:
        cells: The row's cell values in column order. Entries may be
            ``None``; non-string entries are converted with ``str()``.
        index_col: Position of the index column in ``cells``.

    Returns:
        The matching ``Triage`` member.
    """
    if not any(cell and str(cell).strip() for cell in cells):
        return Triage.EMPTY

    # A row shorter than index_col has no index cell; treat it as blank.
    raw_index = cells[index_col] if index_col < len(cells) else ""
    # Coerce with str() exactly like the emptiness scan above, so that a
    # non-string index cell is classified instead of raising on .strip().
    index = str(raw_index or "").strip()

    if not index:
        return Triage.BLANK_INDEX
    if index.endswith("x"):
        return Triage.X_SUFFIX
    return Triage.OK
|
||||
|
||||
|
||||
def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: tell a section banner apart from a data row without an index.

    Args:
        cells: The row's cell values in column order.
        header: Maps field names to column positions; only the ``"sender"``
            and ``"receivers"`` entries are consulted.

    Returns:
        ``"section_banner"`` when at least one cell is populated and every
        populated cell sits in the sender or receivers column; otherwise
        ``"data_no_index"`` (including when the row has no populated cell).
    """
    # Positions of the name columns that the header actually defines.
    name_columns = set()
    for key in ("sender", "receivers"):
        column = header.get(key)
        if column is not None:
            name_columns.add(column)

    # Positions of the cells that carry non-whitespace content.
    filled = set()
    for position, cell in enumerate(cells):
        if cell and str(cell).strip():
            filled.add(position)

    if not filled:
        return "data_no_index"
    if filled.issubset(name_columns):
        return "section_banner"
    return "data_no_index"
|
||||
|
||||
|
||||
def index_file_mismatch(index: str, file_path: str) -> bool:
    """Report whether a file's base name (minus extension) differs from the index.

    Both ``/`` and ``\\`` are accepted as path separators. Only the text
    after the last ``.`` of the base name is treated as the extension.
    Surrounding whitespace on either argument is ignored.

    Args:
        index: The document index the file name is expected to match.
        file_path: Path or bare file name; may be blank.

    Returns:
        ``False`` when ``file_path`` is blank (nothing to compare) or when
        the file stem equals ``index``; ``True`` otherwise.
    """
    path = file_path.strip()
    if not path:
        return False

    # Normalize Windows separators so one split handles both path styles.
    basename = path.replace("\\", "/").rsplit("/", 1)[-1]
    stem = basename.rsplit(".", 1)[0]

    # Compare stripped values: the blank check above already treats padding
    # as insignificant, so padding alone must not count as a mismatch.
    return stem != index.strip()
|
||||
Reference in New Issue
Block a user