From 74c4c390fcb1ea0ba21a01e95596bf873d402d81 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 14:08:30 +0200
Subject: [PATCH] feat(normalizer): xlsx ingest + header mapping

Co-Authored-By: Claude Opus 4.7
---
 tools/import-normalizer/ingest.py            | 70 ++++++++++++++++++++
 tools/import-normalizer/tests/test_ingest.py | 36 ++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 tools/import-normalizer/ingest.py
 create mode 100644 tools/import-normalizer/tests/test_ingest.py

diff --git a/tools/import-normalizer/ingest.py b/tools/import-normalizer/ingest.py
new file mode 100644
index 00000000..171a1a24
--- /dev/null
+++ b/tools/import-normalizer/ingest.py
@@ -0,0 +1,70 @@
+"""Read .xlsx sheets into neutral list[list[str]] and map headers to fields."""
+import datetime
+from pathlib import Path
+import openpyxl
+
+
+def _cell_to_str(value) -> str:
+    """Convert one cell value to the string form used in the neutral rows.
+
+    None becomes ""; datetimes are reduced to their ISO date (the time part
+    is dropped); dates become ISO strings; integer-valued floats lose the
+    trailing ".0"; anything else is str()-converted and stripped.
+    """
+    if value is None:
+        return ""
+    # datetime.datetime is a subclass of datetime.date, so test it first.
+    if isinstance(value, datetime.datetime):
+        return value.date().isoformat()
+    if isinstance(value, datetime.date):
+        return value.isoformat()
+    if isinstance(value, float) and value.is_integer():
+        return str(int(value))
+    if isinstance(value, int):
+        return str(value)
+    return str(value).strip()
+
+
+def read_sheet(path: Path, sheet_name: str) -> list[list[str]]:
+    """Return all rows of sheet *sheet_name* in workbook *path* as strings.
+
+    Raise ValueError if the workbook has no sheet with that name.
+    """
+    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
+    try:
+        if sheet_name not in wb.sheetnames:
+            raise ValueError(f"Sheet '{sheet_name}' not found in {path.name}; sheets: {wb.sheetnames}")
+        ws = wb[sheet_name]
+        # Build the full list before closing: read-only sheets load lazily.
+        return [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)]
+    finally:
+        # A read-only workbook holds the file open until close() is called,
+        # so close it on the error path as well.
+        wb.close()
+
+
+def _norm_header(text: str) -> str:
+    """Lower-case *text* and collapse every whitespace run to one space."""
+    return " ".join(text.lower().split())
+
+
+def build_header_map(header_row: list[str], field_map: dict[str, str], required: set[str]):
+    """Return (field->col_index, unknown_headers).
+
+    Headers are normalised with _norm_header before being looked up in
+    field_map; if several headers map to the same field, the last column
+    wins. Non-blank headers absent from field_map are returned as unknown.
+    Raise ValueError if a required field is missing.
+    """
+    fields: dict[str, int] = {}
+    unknown: list[str] = []
+    for idx, raw in enumerate(header_row):
+        key = _norm_header(raw)
+        if key in field_map:
+            fields[field_map[key]] = idx
+        elif raw.strip():
+            unknown.append(raw)
+    missing = required - set(fields)
+    if missing:
+        raise ValueError(f"Required header(s) missing: {sorted(missing)} (found headers: {header_row})")
+    return fields, unknown
diff --git a/tools/import-normalizer/tests/test_ingest.py b/tools/import-normalizer/tests/test_ingest.py
new file mode 100644
index 00000000..ba745c88
--- /dev/null
+++ b/tools/import-normalizer/tests/test_ingest.py
@@ -0,0 +1,36 @@
+import datetime
+import openpyxl
+import pytest
+import ingest
+
+def _make_workbook(tmp_path, sheet_name, rows):
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = sheet_name
+    for r in rows:
+        ws.append(r)
+    path = tmp_path / "wb.xlsx"
+    wb.save(path)
+    return path
+
+def test_read_sheet_converts_cells(tmp_path):
+    path = _make_workbook(tmp_path, "S", [
+        ["Index", "Datum"],
+        ["W-0001", datetime.datetime(1888, 2, 15)],
+        ["W-0002", 1],
+    ])
+    rows = ingest.read_sheet(path, "S")
+    assert rows[0] == ["Index", "Datum"]
+    assert rows[1] == ["W-0001", "1888-02-15"]  # Excel date -> ISO string
+    assert rows[2] == ["W-0002", "1"]  # integer -> plain string
+
+def test_build_header_map_collapses_whitespace_and_case():
+    header = ["Index", "Datum des Briefes", "EmpfängerIn", "Mystery"]
+    field_map = {"index": "index", "datum des briefes": "date", "empfängerin": "receivers"}
+    fields, unknown = ingest.build_header_map(header, field_map, required={"index"})
+    assert fields == {"index": 0, "date": 1, "receivers": 2}
+    assert unknown == ["Mystery"]
+
+def test_build_header_map_missing_required_raises():
+    with pytest.raises(ValueError, match="index"):
+        ingest.build_header_map(["Box", "Ort"], {"box": "box", "ort": "location"}, required={"index"})