feat(normalizer): xlsx ingest + header mapping
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
48
tools/import-normalizer/ingest.py
Normal file
48
tools/import-normalizer/ingest.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Read .xlsx sheets into neutral list[list[str]] and map headers to fields."""
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
import openpyxl
|
||||
|
||||
|
||||
def _cell_to_str(value) -> str:
    """Render a single cell value as a plain string.

    * ``None`` becomes the empty string.
    * ``datetime`` and ``date`` values become ISO ``YYYY-MM-DD``; any
      time-of-day component of a ``datetime`` is discarded.
    * Floats holding a whole number are printed without a fractional part.
    * Integers are printed as-is.
    * Anything else is ``str(value)`` with surrounding whitespace removed.
    """
    if value is None:
        return ""

    # datetime.datetime is a subclass of datetime.date, so a single check
    # covers both; a datetime is first reduced to its calendar date.
    if isinstance(value, datetime.date):
        day = value.date() if isinstance(value, datetime.datetime) else value
        return day.isoformat()

    if isinstance(value, float) and value.is_integer():
        # 3.0 -> "3" rather than "3.0".
        text = str(int(value))
    elif isinstance(value, int):
        text = str(value)
    else:
        text = str(value).strip()
    return text
|
||||
|
||||
|
||||
def read_sheet(path: Path, sheet_name: str) -> list[list[str]]:
    """Load one worksheet of an .xlsx workbook as rows of strings.

    The workbook is opened read-only with ``data_only=True`` and every cell
    is passed through ``_cell_to_str``, so the result contains only ``str``
    values (empty cells become ``""``).

    Args:
        path: Location of the .xlsx file.
        sheet_name: Exact name of the worksheet to read.

    Returns:
        One inner list per worksheet row, in sheet order.

    Raises:
        ValueError: If the workbook has no sheet called ``sheet_name``.
    """
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook keeps the underlying file open until close() is
    # called, so release it on every exit path, including the ValueError below
    # and any failure while iterating rows.
    try:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"Sheet '{sheet_name}' not found in {path.name}; sheets: {wb.sheetnames}")
        ws = wb[sheet_name]
        return [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)]
    finally:
        wb.close()
|
||||
|
||||
|
||||
def _norm_header(text: str) -> str:
    """Return *text* lower-cased, trimmed, with each whitespace run collapsed to one space."""
    words = text.lower().split()
    return " ".join(words)
|
||||
|
||||
|
||||
def build_header_map(header_row: list[str], field_map: dict[str, str], required: set[str]):
    """Resolve a sheet's header row to field names and column positions.

    Each header cell is normalised with ``_norm_header`` and looked up in
    ``field_map`` (normalised header text -> field name).

    Args:
        header_row: Header cells in column order.
        field_map: Maps normalised header text to the field it represents.
        required: Field names that must be present in the result.

    Returns:
        A ``(fields, unknown)`` tuple. ``fields`` maps field name to the
        0-based column index; if several columns resolve to the same field,
        the right-most one is kept. ``unknown`` lists, in column order, the
        raw text of non-blank headers that are not in ``field_map``.

    Raises:
        ValueError: If any field in ``required`` was not found.
    """
    column_of: dict[str, int] = {}
    unrecognised: list[str] = []

    for position, label in enumerate(header_row):
        normalised = _norm_header(label)
        if normalised not in field_map:
            # Blank header cells are ignored rather than reported.
            if label.strip():
                unrecognised.append(label)
            continue
        column_of[field_map[normalised]] = position

    absent = required - set(column_of)
    if not absent:
        return column_of, unrecognised
    raise ValueError(f"Required header(s) missing: {sorted(absent)} (found headers: {header_row})")
|
||||
36
tools/import-normalizer/tests/test_ingest.py
Normal file
36
tools/import-normalizer/tests/test_ingest.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import datetime
|
||||
import openpyxl
|
||||
import pytest
|
||||
import ingest
|
||||
|
||||
def _make_workbook(tmp_path, sheet_name, rows):
    """Save *rows* into a workbook whose active sheet is titled *sheet_name*.

    The file is written as ``wb.xlsx`` inside *tmp_path*; its path is returned.
    """
    target = tmp_path / "wb.xlsx"
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    for row in rows:
        sheet.append(row)
    workbook.save(target)
    return target
|
||||
|
||||
def test_read_sheet_converts_cells(tmp_path):
    """read_sheet returns every cell as a string: dates as ISO, integers as digits."""
    source_rows = [
        ["Index", "Datum"],
        ["W-0001", datetime.datetime(1888, 2, 15)],
        ["W-0002", 1],
    ]
    workbook_path = _make_workbook(tmp_path, "S", source_rows)

    result = ingest.read_sheet(workbook_path, "S")

    assert result[:3] == [
        ["Index", "Datum"],           # header passes through unchanged
        ["W-0001", "1888-02-15"],     # datetime cell -> ISO date string
        ["W-0002", "1"],              # integer cell -> plain string
    ]
|
||||
|
||||
def test_build_header_map_collapses_whitespace_and_case():
    """Headers match field_map keys regardless of case, padding, or internal whitespace runs."""
    # Deliberately messy headers: leading/trailing padding, repeated spaces
    # and a tab between words, and mixed case, so that both the
    # whitespace-collapsing and case-folding paths are actually exercised.
    header = [" Index ", "Datum   des\tBriefes", "EmpfängerIn", "Mystery"]
    field_map = {"index": "index", "datum des briefes": "date", "empfängerin": "receivers"}
    fields, unknown = ingest.build_header_map(header, field_map, required={"index"})
    assert fields == {"index": 0, "date": 1, "receivers": 2}
    # Unmapped headers are reported with their raw text.
    assert unknown == ["Mystery"]
|
||||
|
||||
def test_build_header_map_missing_required_raises():
    """A header row lacking a required field is rejected with a ValueError naming it."""
    header = ["Box", "Ort"]
    field_map = {"box": "box", "ort": "location"}
    with pytest.raises(ValueError, match="index"):
        ingest.build_header_map(header, field_map, required={"index"})
|
||||
Reference in New Issue
Block a user