# File: tools/import-normalizer/persons_tree.py
# (appended after the end of _parse_bemerkung)


def main() -> None:
    """Normalize the person workbook into ``canonical-persons-tree.json``.

    Command-line options:
        --input    Path to the workbook (default: ``config.PERSON_WORKBOOK``).
        --output   Path of the JSON file to write (default:
                   ``config.OUT_DIR / "canonical-persons-tree.json"``).
        --dry-run  Print the statistics and the first five unresolved
                   entries, then return without writing anything.

    The run has two passes: rows are parsed and de-duplicated first, then
    spouse and remark ("Bemerkung") relationships are resolved against an
    index of the surviving persons.

    Exits with status 1 (via ``sys.exit``) when the sheet has no rows.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Personendatei 2.xlsx → canonical-persons-tree.json"
    )
    parser.add_argument(
        "--input",
        default=str(config.PERSON_WORKBOOK),
        help="Path to Personendatei 2.xlsx",
    )
    parser.add_argument(
        "--output",
        default=str(config.OUT_DIR / "canonical-persons-tree.json"),
        help="Path for output JSON",
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="Print stats, skip write"
    )
    args = parser.parse_args()

    # Imported here rather than at module level, as in the original change.
    from ingest import read_sheet, build_header_map

    rows = read_sheet(Path(args.input), config.PERSON_SHEET)
    if not rows:
        print("ERROR: sheet is empty", file=sys.stderr)
        sys.exit(1)

    header_row = [str(v) for v in rows[0]]
    fields_map, _ = build_header_map(
        header_row, config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS
    )

    # --- Pass 1: parse rows ---
    persons_raw: list[dict] = []
    # Row numbers start at 2: row 1 of the sheet is the header.
    for row_num, row in enumerate(rows[1:], start=2):
        # Short rows are padded with "" for the columns they lack.
        field_dict = {
            field: (row[col] if col < len(row) else "")
            for field, col in fields_map.items()
        }
        # NOTE(review): the cell types returned by read_sheet are not visible
        # here; coerce before stripping so a None / non-string cell skips the
        # row instead of raising AttributeError.
        last_name = str(field_dict.get("last_name") or "").strip()
        if not last_name:
            continue
        persons_raw.append(_parse_row(row_num, field_dict))

    persons, skipped_msgs = _deduplicate(persons_raw)
    for msg in skipped_msgs:
        print(f"  SKIP {msg}", file=sys.stderr)

    index = _build_index(persons)

    # --- Pass 2: resolve relationships ---
    all_rels: list[dict] = []
    all_unresolved: list[dict] = []

    spouse_rels, spouse_unres = _resolve_spouses(persons, index)
    all_rels.extend(spouse_rels)
    all_unresolved.extend(spouse_unres)

    for p in persons:
        # The raw helper fields are removed so they do not end up in the
        # emitted person records.
        bemerkung = p.pop("_bemerkung_raw", None) or ""
        p.pop("_spouse_raw", None)

        rels, unres, remaining = _parse_bemerkung(p["rowId"], bemerkung, index)
        all_rels.extend(rels)
        all_unresolved.extend(unres)

        if remaining:
            existing = p.get("notes") or ""
            # Append the leftover remark text unless the notes already
            # contain it.
            if remaining not in existing:
                p["notes"] = (
                    (existing + " " + remaining).strip() if existing else remaining
                )

    # --- Stats output ---
    spouse_count = sum(1 for r in all_rels if r["type"] == "SPOUSE_OF")
    parent_count = sum(1 for r in all_rels if r["type"] == "PARENT_OF")
    print(f"✓ {len(persons)} persons parsed")
    print(
        f"✓ {len(all_rels)} relationships emitted "
        f"({spouse_count} SPOUSE_OF, {parent_count} PARENT_OF)"
    )
    if all_unresolved:
        print(f"⚠ {len(all_unresolved)} unresolved (see unresolved[] in output)")

    if args.dry_run:
        print("\n--- dry-run: first 5 unresolved ---")
        for u in all_unresolved[:5]:
            print(f"  {u}")
        return

    output = {
        # Naive local timestamp (no UTC offset), kept as in the original.
        "generated_at": datetime.datetime.now().isoformat(),
        "source": Path(args.input).name,
        "stats": {
            "persons": len(persons),
            "relationships": len(all_rels),
            "unresolved": len(all_unresolved),
        },
        "persons": persons,
        "relationships": all_rels,
        "unresolved": all_unresolved,
    }

    out_path = Path(args.output)
    # parents=True: a nested output directory that does not exist yet is
    # created instead of raising FileNotFoundError.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"→ {args.output}")


if __name__ == "__main__":
    main()


# File: tools/import-normalizer/tests/test_persons_tree.py
# (appended after test_parse_bemerkung_sohn_with_trailing_remark)

import subprocess


def test_dry_run_exits_zero(tmp_path):
    """dry-run should complete without writing any file and exit 0."""
    here = Path(__file__).resolve()
    # parents[0] is the tests directory; parents[3] is four levels above
    # this file, where the "import" directory with the workbook is expected.
    input_path = here.parents[3] / "import" / "Personendatei 2.xlsx"
    if not input_path.exists():
        import pytest
        pytest.skip("source Excel file not present")

    script_path = here.parents[1] / "persons_tree.py"
    output_path = tmp_path / "out.json"

    result = subprocess.run(
        [
            sys.executable, str(script_path),
            "--input", str(input_path),
            "--output", str(output_path),
            "--dry-run",
        ],
        capture_output=True,
        text=True,
        check=False,  # the exit code is asserted explicitly below
    )
    assert result.returncode == 0, result.stderr
    assert not output_path.exists()
    assert "persons parsed" in result.stdout