# ===========================================================================
# tools/import-normalizer/overrides.py  (new file)
# ===========================================================================
"""Load human-supplied corrections. Missing files are not an error."""
import csv
from pathlib import Path


def _read_override_rows(path: Path) -> list[dict]:
    """Return every CSV row of *path* as a dict, or ``[]`` if the file is absent.

    The file is decoded as ``utf-8-sig`` so that a byte-order mark written by a
    spreadsheet export does not end up glued to the first header name (which
    would make the ``raw`` column unfindable and silently drop every override).
    Files without a BOM decode exactly as plain UTF-8.
    """
    try:
        with open(path, encoding="utf-8-sig", newline="") as f:
            return list(csv.DictReader(f))
    except (FileNotFoundError, NotADirectoryError):
        # An override file that was never created simply means "no corrections".
        return []


def _cell(row: dict, column: str) -> str:
    """Return the stripped text of *column*; a missing or short column reads as ""."""
    return (row.get(column) or "").strip()


def load_overrides(
    dates_path: Path, names_path: Path
) -> tuple[dict[str, tuple[str, str]], dict[str, str]]:
    """Load the date and name override tables.

    Args:
        dates_path: CSV with columns ``raw``, ``iso``, ``precision``.
        names_path: CSV with columns ``raw``, ``person_id``.

    Returns:
        ``(date_overrides, name_overrides)`` where ``date_overrides`` maps a raw
        date string to ``(iso, precision)`` and ``name_overrides`` maps a raw
        name to a person id. Rows whose ``raw`` cell is blank are skipped; a
        blank ``precision`` falls back to ``"UNKNOWN"``. A missing file
        contributes an empty mapping.
    """
    date_overrides: dict[str, tuple[str, str]] = {}
    name_overrides: dict[str, str] = {}

    for row in _read_override_rows(dates_path):
        raw = _cell(row, "raw")
        if raw:
            # Strip first, then default, so a whitespace-only cell is "UNKNOWN" too.
            date_overrides[raw] = (_cell(row, "iso"), _cell(row, "precision") or "UNKNOWN")

    for row in _read_override_rows(names_path):
        raw = _cell(row, "raw")
        if raw:
            name_overrides[raw] = _cell(row, "person_id")

    return date_overrides, name_overrides


# ===========================================================================
# tools/import-normalizer/tests/test_writers.py  (new file)
# ===========================================================================
import csv

import openpyxl

import documents
import overrides
import writers


def _read_csv(path):
    """Read *path* back as a list of rows, closing the file deterministically."""
    with path.open(encoding="utf-8", newline="") as f:
        return list(csv.reader(f))


def test_load_overrides_missing_files(tmp_path):
    d, n = overrides.load_overrides(tmp_path / "dates.csv", tmp_path / "names.csv")
    assert d == {} and n == {}


def test_load_overrides_parsed(tmp_path):
    dates_csv = tmp_path / "dates.csv"
    dates_csv.write_text("raw,iso,precision\n13.5.65,1965-05-13,DAY\n", encoding="utf-8")
    names_csv = tmp_path / "names.csv"
    names_csv.write_text("raw,person_id\nEugenie Müller,de-gruyter-eugenie\n", encoding="utf-8")
    d, n = overrides.load_overrides(dates_csv, names_csv)
    assert d["13.5.65"] == ("1965-05-13", "DAY")
    assert n["Eugenie Müller"] == "de-gruyter-eugenie"


def test_load_overrides_tolerates_bom(tmp_path):
    # Spreadsheet "CSV UTF-8" exports start with a BOM; the header must still match.
    dates_csv = tmp_path / "dates.csv"
    dates_csv.write_text("raw,iso,precision\n13.5.65,1965-05-13,DAY\n", encoding="utf-8-sig")
    names_csv = tmp_path / "names.csv"
    names_csv.write_text("raw,person_id\nEugenie Müller,de-gruyter-eugenie\n", encoding="utf-8-sig")
    d, n = overrides.load_overrides(dates_csv, names_csv)
    assert d == {"13.5.65": ("1965-05-13", "DAY")}
    assert n == {"Eugenie Müller": "de-gruyter-eugenie"}


def test_load_overrides_blank_precision_defaults_to_unknown(tmp_path):
    dates_csv = tmp_path / "dates.csv"
    dates_csv.write_text("raw,iso,precision\n1965,1965,  \n1966,1966\n", encoding="utf-8")
    d, _ = overrides.load_overrides(dates_csv, tmp_path / "names.csv")
    assert d["1965"] == ("1965", "UNKNOWN")
    assert d["1966"] == ("1966", "UNKNOWN")


def test_write_documents_xlsx_joins_lists(tmp_path):
    doc = documents.CanonicalDocument(
        index="W-0001", receiver_person_ids=["a", "b"], receiver_names=["A", "B"],
        tags=["Brautbriefe"], date_precision="DAY", needs_review=["unparsed_date"])
    out = tmp_path / "docs.xlsx"
    writers.write_documents_xlsx([doc], out)
    wb = openpyxl.load_workbook(out)
    ws = wb.active
    header = [c.value for c in ws[1]]
    assert "receiver_person_ids" in header and "needs_review" in header
    row = {h: c.value for h, c in zip(header, ws[2])}
    assert row["receiver_person_ids"] == "a|b"
    assert row["needs_review"] == "unparsed_date"


def test_write_review_csv(tmp_path):
    out = tmp_path / "r.csv"
    writers.write_review_csv(out, ["raw", "count"], [["?", 3], ["x", 1]])
    rows = _read_csv(out)
    assert rows[0] == ["raw", "count"]
    assert rows[1] == ["?", "3"]


def test_write_review_csv_defangs_formula_injection(tmp_path):
    out = tmp_path / "r.csv"
    writers.write_review_csv(out, ["raw", "count"], [["=cmd|'/C calc'!A0", 1], ["-2+3", 2]])
    rows = _read_csv(out)
    assert rows[1][0].startswith("'=")  # leading '=' neutralised
    assert rows[2][0].startswith("'-")


def test_write_summary_sections(tmp_path):
    out = tmp_path / "s.txt"
    writers.write_summary(out, {"# INPUTS": "", "rows": 10, "# DATES": "", "unknown_date_rate": "3.2%"})
    text = out.read_text(encoding="utf-8")
    assert "INPUTS:" in text and "DATES:" in text and " rows: 10" in text


# ===========================================================================
# tools/import-normalizer/writers.py  (new file)
# ===========================================================================
"""Write canonical .xlsx outputs and review .csv files."""
import csv
import datetime
from pathlib import Path

import openpyxl

# Separator used when a list-valued field is flattened into one cell.
_PIPE = "|"
# Pinned workbook metadata so reruns are content-deterministic (NFR-IDEM-01); openpyxl
# otherwise stamps docProps with the current time on every save.
_FIXED_TS = datetime.datetime(2020, 1, 1, 0, 0, 0)

# Leading characters that make a spreadsheet application evaluate a cell
# instead of displaying it.
_FORMULA_TRIGGERS = ("=", "+", "-", "@", "\t", "\r")


def _join(value):
    """Flatten one record field into the string stored in a cell.

    Lists are joined with ``_PIPE``, ``None`` becomes the empty string and any
    other value is converted with ``str()``.
    """
    if isinstance(value, list):
        return _PIPE.join(str(v) for v in value)
    return "" if value is None else str(value)


def _csv_safe(value):
    """Neutralise spreadsheet formula injection (CWE-1236) in human-opened review CSVs.

    Text starting with a formula trigger character is prefixed with an
    apostrophe so the spreadsheet shows it literally. Genuine ``int``/``float``
    values are returned unprefixed: their string form cannot carry a payload,
    and quoting them would turn negative counts such as ``-4`` into text.

    Returns:
        The cell content as a string (``""`` for ``None``).
    """
    text = "" if value is None else str(value)
    if isinstance(value, (int, float)):
        return text
    return "'" + text if text.startswith(_FORMULA_TRIGGERS) else text


DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name",
               "receiver_person_ids", "receiver_names", "date_iso", "date_raw",
               "date_precision", "location", "tags", "summary", "source_row", "needs_review"]

PERSON_COLUMNS = ["person_id", "last_name", "first_name", "maiden_name", "title", "nickname",
                  "birth_date", "birth_date_raw", "birth_place", "death_date", "death_date_raw",
                  "death_place", "spouse", "generation", "notes", "aliases", "provisional"]


def _write_xlsx(records, columns, path: Path):
    """Write *records* to a single-sheet workbook at *path*.

    Args:
        records: objects exposing one attribute per name in *columns*.
        columns: attribute names; written as the header row, in this order.
        path: destination ``.xlsx`` file; parent directories are created.

    Every data cell is stored as a plain string (see ``_join``).
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(columns)
    # Row 1 is the header, so records start at row 2.
    for row_idx, rec in enumerate(records, start=2):
        for col_idx, col in enumerate(columns, start=1):
            cell = ws.cell(row=row_idx, column=col_idx, value=_join(getattr(rec, col)))
            # openpyxl types any string beginning with "=" as a formula; imported
            # free text must never become executable, so force the string type.
            cell.data_type = "s"
    wb.properties.created = _FIXED_TS
    wb.properties.modified = _FIXED_TS
    # NOTE(review): openpyxl appears to refresh `properties.modified` inside save(),
    # and zip member timestamps are not pinned here -- confirm that output is really
    # byte-identical across runs before relying on it.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    wb.save(path)


def write_documents_xlsx(docs, path: Path):
    """Write document records to *path* using the ``DOC_COLUMNS`` layout."""
    _write_xlsx(docs, DOC_COLUMNS, path)


def write_persons_xlsx(people, path: Path):
    """Write person records to *path* using the ``PERSON_COLUMNS`` layout."""
    _write_xlsx(people, PERSON_COLUMNS, path)


def write_review_csv(path: Path, header: list[str], rows: list[list]):
    """Write a review CSV meant to be opened by a human in a spreadsheet.

    Args:
        path: destination file; parent directories are created.
        header: column titles, written verbatim as the first row.
        rows: data rows; every cell is passed through ``_csv_safe``.
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(header)
        w.writerows([_csv_safe(c) for c in row] for row in rows)


def write_summary(path: Path, stats: dict):
    """Render a grouped, scannable summary. Keys beginning with '#' are section headers.

    Args:
        path: destination text file; parent directories are created.
        stats: ordered mapping. A key starting with ``#`` starts a new section
            (its value is ignored); every other key is written as an indented
            ``key: value`` line.

    Sections are separated by one blank line. The file ends with exactly one
    newline, and the indent of the first entry is kept even when *stats* does
    not begin with a section header.
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    lines = []
    for key, value in stats.items():
        if key.startswith("#"):
            if lines:
                # Blank separator between sections, but never as the first line.
                lines.append("")
            lines.append(key[1:].strip() + ":")
        else:
            lines.append(f" {key}: {value}")
    # rstrip (not strip): trailing whitespace goes, the leading indent stays.
    out_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")