"""Document row extraction, triage, and the canonical document record."""
from dataclasses import dataclass, field
from enum import Enum, auto


class Triage(Enum):
    """First-pass classification of a raw row, as returned by ``triage``."""

    OK = auto()           # index cell holds text that does not end in "x"
    EMPTY = auto()        # no cell in the row holds any text
    BLANK_INDEX = auto()  # some cell holds text, but the index cell does not
    X_SUFFIX = auto()     # index text ends with a lowercase "x"


@dataclass
class RawRow:
    """One source row with every known column as stripped text.

    Built by ``extract_row``; a column that is missing from the header or
    from the row is stored as the empty string.
    """

    source_row: int
    index: str = ""
    file: str = ""
    box: str = ""
    folder: str = ""
    sender: str = ""
    receivers: str = ""
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""


@dataclass
class CanonicalDocument:
    """Normalized document record.

    Only ``index`` is required. Every list field gets its own fresh list per
    instance (``default_factory``), so instances never share list state.
    """

    index: str
    box: str = ""
    folder: str = ""
    sender_person_id: str = ""
    sender_name: str = ""
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""
    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""
    source_row: int = 0
    # NOTE(review): presumably a list of flags for manual review; the element
    # type is not visible in this module -- confirm against the producer.
    needs_review: list = field(default_factory=list)


# Header names copied into RawRow by extract_row; each must be a RawRow field.
_FIELDS = [
    "index",
    "file",
    "box",
    "folder",
    "sender",
    "receivers",
    "date",
    "location",
    "tags",
    "summary",
]


def _cell_text(cell: object) -> str:
    """Return *cell* as stripped text; ``None`` becomes the empty string.

    Non-string cells (for example numbers) are converted with ``str`` so that
    every function in this module agrees on what a populated cell is.
    """
    if cell is None:
        return ""
    return str(cell).strip()


def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a ``RawRow`` from one row of cells.

    Args:
        cells: The row's cell values. ``None`` and non-string values are
            tolerated and converted to text.
        header: Maps a field name (see ``_FIELDS``) to its column position.
        source_row: Row number to record on the result.

    Returns:
        A ``RawRow`` whose text fields are stripped. A field whose column is
        absent from *header*, or lies beyond the end of *cells*, is ``""``.
    """
    def get(field_name):
        idx = header.get(field_name)
        # Short rows are expected: a column past the end reads as empty.
        if idx is None or idx >= len(cells):
            return ""
        return _cell_text(cells[idx])

    return RawRow(source_row=source_row, **{name: get(name) for name in _FIELDS})


def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a row by whether it has content and what its index looks like.

    Args:
        cells: The row's cell values. ``None`` and non-string values are
            tolerated and converted to text.
        index_col: Position of the index column.

    Returns:
        ``Triage.EMPTY`` if no cell holds text; ``Triage.BLANK_INDEX`` if the
        index cell is blank or beyond the end of the row; ``Triage.X_SUFFIX``
        if the index ends with a lowercase ``"x"``; otherwise ``Triage.OK``.
    """
    if not any(_cell_text(cell) for cell in cells):
        return Triage.EMPTY
    index = _cell_text(cells[index_col]) if index_col < len(cells) else ""
    if not index:
        return Triage.BLANK_INDEX
    if index.endswith("x"):
        return Triage.X_SUFFIX
    return Triage.OK


def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: classify a row whose index cell is blank.

    Args:
        cells: The row's cell values.
        header: Maps field names to column positions; only ``"sender"`` and
            ``"receivers"`` are consulted.

    Returns:
        ``"section_banner"`` if at least one cell is populated and every
        populated cell sits in a name column, else ``"data_no_index"``.
    """
    # header.get yields None for an unmapped name; drop it from the set.
    name_cols = {header.get("sender"), header.get("receivers")} - {None}
    populated = {i for i, cell in enumerate(cells) if _cell_text(cell)}
    if populated and populated <= name_cols:
        return "section_banner"
    return "data_no_index"


def index_file_mismatch(index: str, file_path: str) -> bool:
    """Report whether the file name's stem differs from *index*.

    Args:
        index: The document index to compare against.
        file_path: Path using ``/`` or ``\\`` separators. Surrounding
            whitespace is ignored.

    Returns:
        ``False`` when *file_path* is blank (nothing to compare). Otherwise
        ``True`` exactly when the last path component, with its final
        ``.extension`` removed, is not equal to *index*.
    """
    # Compare the same stripped text that was used to decide blankness, so
    # stray surrounding whitespace is not reported as a mismatch.
    file_path = file_path.strip()
    if not file_path:
        return False
    basename = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    stem = basename.rsplit(".", 1)[0]
    return stem != index


# ---------------------------------------------------------------------------
# Tests (tools/import-normalizer/tests/test_documents.py in the original
# patch; they call the functions above directly).
# ---------------------------------------------------------------------------

def test_extract_row():
    header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4,
              "receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9}
    cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
             "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
    raw = extract_row(cells, header, source_row=3)
    assert raw.index == "W-0001"
    assert raw.sender == "Walter de Gruyter"
    assert raw.date == "15.2.1888"
    assert raw.source_row == 3


def test_triage():
    assert triage(["", "", ""]) == Triage.EMPTY
    assert triage(["", "", "Walter"]) == Triage.BLANK_INDEX  # data but no index
    assert triage(["W-0001x", "x"]) == Triage.X_SUFFIX
    assert triage(["W-0001", "x"]) == Triage.OK


def test_classify_blank_index():
    header = {"sender": 4, "receivers": 5}
    banner = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""]
    data = ["", "", "V", "1", "", "Eugenie"]
    assert classify_blank_index(banner, header) == "section_banner"
    assert classify_blank_index(data, header) == "data_no_index"


def test_index_file_mismatch():
    assert index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True
    assert index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False
    assert index_file_mismatch("W-0001", "") is False