diff --git a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py
new file mode 100644
index 00000000..e33f2901
--- /dev/null
+++ b/tools/import-normalizer/documents.py
@@ -0,0 +1,85 @@
+"""Document row extraction, triage, and the canonical document record."""
+from dataclasses import dataclass, field
+from enum import Enum, auto
+
+
+class Triage(Enum):  # row classification returned by triage()
+    OK = auto()  # index present and not ending in "x"
+    EMPTY = auto()  # no cell has non-whitespace content
+    BLANK_INDEX = auto()  # row has content but the index cell is blank or missing
+    X_SUFFIX = auto()  # index ends with a lowercase "x"
+
+
+@dataclass
+class RawRow:  # one input row as stripped cell text; extract_row() fills the text fields by header name
+    source_row: int  # passed through from extract_row()'s caller; presumably the source row number — confirm
+    index: str = ""  # text fields default to "" — the same value extract_row() uses for absent or blank cells
+    file: str = ""  # NOTE(review): presumably the path that index_file_mismatch() checks against index — confirm
+    box: str = ""
+    folder: str = ""
+    sender: str = ""  # "sender" and "receivers" are the name columns inspected by classify_blank_index()
+    receivers: str = ""
+    date: str = ""
+    location: str = ""
+    tags: str = ""
+    summary: str = ""
+
+
+@dataclass
+class CanonicalDocument:  # the canonical document record named in the module docstring; only `index` is required
+    index: str
+    box: str = ""
+    folder: str = ""
+    sender_person_id: str = ""  # NOTE(review): presumably a resolved person key for sender_name — confirm
+    sender_name: str = ""
+    receiver_person_ids: list = field(default_factory=list)  # default_factory: each instance gets its own list
+    receiver_names: list = field(default_factory=list)  # NOTE(review): presumably parallel to receiver_person_ids — confirm
+    date_iso: str = ""  # NOTE(review): presumably the parsed form of date_raw — confirm
+    date_raw: str = ""
+    date_precision: str = ""  # NOTE(review): allowed values are not defined in this module
+    location: str = ""
+    tags: list = field(default_factory=list)
+    summary: str = ""
+    source_row: int = 0  # NOTE(review): presumably RawRow.source_row of the originating row — confirm
+    needs_review: list = field(default_factory=list)  # NOTE(review): element type and meaning are not defined here
+
+
+_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]  # header keys extract_row() reads; each is passed to RawRow as a keyword
+
+
+def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
+    """Build a RawRow from `cells`; `header` maps each field name to its column position.
+    Absent, out-of-range, or falsy cells become ""; other values are str()-coerced, then stripped."""
+    def get(field_name):
+        idx = header.get(field_name)
+        return str(cells[idx] or "").strip() if idx is not None and idx < len(cells) else ""  # str(): non-text cells must not raise
+    return RawRow(source_row=source_row, **{f: get(f) for f in _FIELDS})
+
+
+def triage(cells: list[str], index_col: int = 0) -> Triage:
+    """Classify a row: EMPTY if no cell has non-whitespace content, BLANK_INDEX if the cell at
+    `index_col` is missing or blank, X_SUFFIX if that index ends with "x", otherwise OK.
+    The index cell is str()-coerced like every other cell, so a numeric index does not raise."""
+    if not any(c and str(c).strip() for c in cells):
+        return Triage.EMPTY
+    index = str(cells[index_col] or "").strip() if index_col < len(cells) else ""
+    if not index:
+        return Triage.BLANK_INDEX
+    return Triage.X_SUFFIX if index.endswith("x") else Triage.OK
+
+
+def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
+    """REQ-TRIAGE-02: return 'section_banner' when every populated cell sits in the sender or
+    receivers column (and at least one is populated); otherwise return 'data_no_index'."""
+    name_positions = {header[key] for key in ("sender", "receivers") if header.get(key) is not None}
+    filled = [pos for pos, cell in enumerate(cells) if cell and str(cell).strip()]
+    only_names = bool(filled) and all(pos in name_positions for pos in filled)
+    return "section_banner" if only_names else "data_no_index"
+
+
+def index_file_mismatch(index: str, file_path: str) -> bool:
+    """Return True when the file name in `file_path`, minus its last extension, differs from `index`.
+    A blank path is never a mismatch. Both "/" and "\\" separate directories; outer whitespace is ignored."""
+    path = file_path.strip()  # strip once so the blank check and the comparison see the same text
+    stem = path.replace("\\", "/").rsplit("/", 1)[-1].rsplit(".", 1)[0]
+    return bool(path) and stem != index
diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py
new file mode 100644
index 00000000..4c4f76a4
--- /dev/null
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -0,0 +1,31 @@
+import documents
+from documents import Triage
+
+def test_extract_row():
+    """extract_row maps a complete ten-column row onto RawRow fields by header position."""
+    columns = ("index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary")
+    header = {name: position for position, name in enumerate(columns)}
+    row = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
+           "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
+    record = documents.extract_row(row, header, source_row=3)
+    assert (record.index, record.sender) == ("W-0001", "Walter de Gruyter")
+    assert record.date == "15.2.1888"
+    assert record.source_row == 3
+
+def test_triage():
+    """Each Triage outcome is produced by one representative row."""
+    cases = [(["", "", ""], Triage.EMPTY), (["", "", "Walter"], Triage.BLANK_INDEX),  # data but no index
+             (["W-0001x", "x"], Triage.X_SUFFIX), (["W-0001", "x"], Triage.OK)]
+    assert [documents.triage(cells) for cells, _ in cases] == [expected for _, expected in cases]
+
+def test_classify_blank_index():
+    """Text only in a name column means a section banner; any other populated cell means data."""
+    columns = {"sender": 4, "receivers": 5}
+    for expected, row in (("section_banner", ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""]),
+                          ("data_no_index", ["", "", "V", "1", "", "Eugenie"])):
+        assert documents.classify_blank_index(row, columns) == expected
+
+def test_index_file_mismatch():
+    """A mismatch is reported only when a non-blank path's file stem differs from the index."""
+    cases = [("W-0010x", r"..\__scan\W-0011x.pdf", True), ("W-0001", r"..\__scan\W-0001.pdf", False), ("W-0001", "", False)]
+    assert all(documents.index_file_mismatch(index, path) is expected for index, path, expected in cases)