From 74c4c390fcb1ea0ba21a01e95596bf873d402d81 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Mon, 25 May 2026 14:08:30 +0200
Subject: [PATCH] feat(normalizer): xlsx ingest + header mapping

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tools/import-normalizer/ingest.py            | 48 ++++++++++++++++++++
 tools/import-normalizer/tests/test_ingest.py | 36 +++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 tools/import-normalizer/ingest.py
 create mode 100644 tools/import-normalizer/tests/test_ingest.py

diff --git a/tools/import-normalizer/ingest.py b/tools/import-normalizer/ingest.py
new file mode 100644
index 00000000..171a1a24
--- /dev/null
+++ b/tools/import-normalizer/ingest.py
@@ -0,0 +1,92 @@
+"""Read .xlsx sheets into neutral list[list[str]] and map headers to fields."""
+import datetime
+from pathlib import Path
+
+import openpyxl
+
+
+def _cell_to_str(value) -> str:
+    """Convert one worksheet cell value to its string form.
+
+    ``None`` becomes ``""``; dates and datetimes become an ISO ``YYYY-MM-DD``
+    date (a datetime's time-of-day part is dropped); floats holding a whole
+    number lose their ``.0``; anything else is ``str()``-ed and stripped of
+    surrounding whitespace.
+    """
+    if value is None:
+        return ""
+    # datetime.datetime subclasses datetime.date, so it must be tested first.
+    if isinstance(value, datetime.datetime):
+        return value.date().isoformat()
+    if isinstance(value, datetime.date):
+        return value.isoformat()
+    if isinstance(value, float) and value.is_integer():
+        return str(int(value))
+    if isinstance(value, int):
+        return str(value)
+    return str(value).strip()
+
+
+def read_sheet(path: Path, sheet_name: str) -> list[list[str]]:
+    """Read every row of one worksheet as a list of strings.
+
+    Args:
+        path: Location of the ``.xlsx`` workbook.
+        sheet_name: Exact title of the worksheet to read.
+
+    Returns:
+        One list per worksheet row, each cell converted by ``_cell_to_str``.
+
+    Raises:
+        ValueError: If the workbook has no sheet named ``sheet_name``.
+    """
+    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
+    try:
+        if sheet_name not in wb.sheetnames:
+            raise ValueError(f"Sheet '{sheet_name}' not found in {path.name}; sheets: {wb.sheetnames}")
+        ws = wb[sheet_name]
+        return [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)]
+    finally:
+        # Read-only workbooks keep the file open until closed explicitly, so
+        # release it on the error paths too, not only after a successful read.
+        wb.close()
+
+
+def _norm_header(text: str) -> str:
+    """Lower-case ``text`` and collapse whitespace runs to single spaces."""
+    return " ".join(text.lower().split())
+
+
+def build_header_map(
+    header_row: list[str], field_map: dict[str, str], required: set[str]
+) -> tuple[dict[str, int], list[str]]:
+    """Map a sheet's header row onto field names.
+
+    Args:
+        header_row: Header cells in column order.
+        field_map: Normalized header text (as produced by ``_norm_header``)
+            -> field name.
+        required: Field names that must be present in the result.
+
+    Returns:
+        ``(fields, unknown)``: ``fields`` maps each recognised field name to
+        its column index (when several headers map to the same field, the
+        right-most column wins); ``unknown`` lists the non-blank headers that
+        have no entry in ``field_map``, in their original spelling.
+
+    Raises:
+        ValueError: If any field in ``required`` was not found.
+    """
+    fields: dict[str, int] = {}
+    unknown: list[str] = []
+    for idx, raw in enumerate(header_row):
+        key = _norm_header(raw)
+        if key in field_map:
+            fields[field_map[key]] = idx
+        elif raw.strip():
+            # Blank header cells are skipped rather than reported as unknown.
+            unknown.append(raw)
+    missing = required - set(fields)
+    if missing:
+        raise ValueError(f"Required header(s) missing: {sorted(missing)} (found headers: {header_row})")
+    return fields, unknown
diff --git a/tools/import-normalizer/tests/test_ingest.py b/tools/import-normalizer/tests/test_ingest.py
new file mode 100644
index 00000000..ba745c88
--- /dev/null
+++ b/tools/import-normalizer/tests/test_ingest.py
@@ -0,0 +1,46 @@
+import datetime
+import openpyxl
+import pytest
+import ingest
+
+
+def _make_workbook(tmp_path, sheet_name, rows):
+    """Save ``rows`` into a one-sheet workbook under ``tmp_path`` and return its path."""
+    workbook = openpyxl.Workbook()
+    sheet = workbook.active
+    sheet.title = sheet_name
+    for row in rows:
+        sheet.append(row)
+    target = tmp_path / "wb.xlsx"
+    workbook.save(target)
+    return target
+
+
+def test_read_sheet_converts_cells(tmp_path):
+    """Cells come back as strings: datetimes as ISO dates, ints as plain digits."""
+    content = [
+        ["Index", "Datum"],
+        ["W-0001", datetime.datetime(1888, 2, 15)],
+        ["W-0002", 1],
+    ]
+    rows = ingest.read_sheet(_make_workbook(tmp_path, "S", content), "S")
+    assert rows[:3] == [
+        ["Index", "Datum"],
+        ["W-0001", "1888-02-15"],  # Excel date -> ISO string
+        ["W-0002", "1"],  # integer -> plain string
+    ]
+
+
+def test_build_header_map_collapses_whitespace_and_case():
+    """Header matching ignores case and repeated spaces; unmatched headers are reported."""
+    aliases = {"index": "index", "datum des briefes": "date", "empfängerin": "receivers"}
+    columns = ["Index", "Datum  des Briefes", "EmpfängerIn", "Mystery"]
+    result = ingest.build_header_map(columns, aliases, required={"index"})
+    assert result == ({"index": 0, "date": 1, "receivers": 2}, ["Mystery"])
+
+
+def test_build_header_map_missing_required_raises():
+    """A header row lacking a required field is rejected with a ValueError naming it."""
+    aliases = {"box": "box", "ort": "location"}
+    with pytest.raises(ValueError, match="index"):
+        ingest.build_header_map(["Box", "Ort"], aliases, required={"index"})