feat(normalizer): add main() CLI to persons_tree
Wires the two-pass pipeline (parse → deduplicate → index → resolve) into a runnable CLI with --input, --output, and --dry-run flags. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -310,3 +310,100 @@ def _parse_bemerkung(
|
|||||||
|
|
||||||
# No pattern matched — full text goes to notes, nothing to unresolved
|
# No pattern matched — full text goes to notes, nothing to unresolved
|
||||||
return [], [], s
|
return [], [], s
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: "list[str] | None" = None) -> None:
    """Run the persons-tree normalizer as a command-line tool.

    Pipeline: read the person sheet, parse every data row, de-duplicate the
    parsed persons, build the lookup index, then resolve spouse references
    and remark-derived relationships. Summary statistics go to stdout;
    skipped-row messages and errors go to stderr. Unless ``--dry-run`` is
    given, the result is written as JSON to ``--output``.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: With status 1 when the input file does not exist or the
            sheet contains no rows; also raised by argparse on bad arguments.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Personendatei 2.xlsx → canonical-persons-tree.json"
    )
    parser.add_argument(
        "--input", default=str(config.PERSON_WORKBOOK),
        help="Path to Personendatei 2.xlsx"
    )
    parser.add_argument(
        "--output", default=str(config.OUT_DIR / "canonical-persons-tree.json"),
        help="Path for output JSON"
    )
    parser.add_argument("--dry-run", action="store_true", help="Print stats, skip write")
    args = parser.parse_args(argv)

    # Imported here (as in the original) rather than at module top level.
    from ingest import read_sheet, build_header_map

    in_path = Path(args.input)
    if not in_path.exists():
        # Fail with a readable message instead of a traceback from the reader.
        print(f"ERROR: input file not found: {in_path}", file=sys.stderr)
        sys.exit(1)

    rows = read_sheet(in_path, config.PERSON_SHEET)
    if not rows:
        print("ERROR: sheet is empty", file=sys.stderr)
        sys.exit(1)

    # First row is the header; map canonical field names to column indices.
    header_row = [str(v) for v in rows[0]]
    fields_map, _ = build_header_map(header_row, config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS)

    # --- Pass 1: parse rows ---
    persons_raw: list[dict] = []
    # start=2: row 1 is the header, so the first data row is spreadsheet row 2.
    for row_num, row in enumerate(rows[1:], start=2):
        # Short rows are padded with "" for columns they do not reach.
        field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()}
        # Coerce before stripping: the cell may be None or non-string
        # (presumably depends on read_sheet — verify against ingest).
        if not str(field_dict.get("last_name") or "").strip():
            continue
        persons_raw.append(_parse_row(row_num, field_dict))

    persons, skipped_msgs = _deduplicate(persons_raw)
    for msg in skipped_msgs:
        print(f"  SKIP {msg}", file=sys.stderr)

    index = _build_index(persons)

    # --- Pass 2: resolve relationships ---
    all_rels: list[dict] = []
    all_unresolved: list[dict] = []

    spouse_rels, spouse_unres = _resolve_spouses(persons, index)
    all_rels.extend(spouse_rels)
    all_unresolved.extend(spouse_unres)

    for p in persons:
        # Strip the private scratch fields so they never reach the output.
        bemerkung = p.pop("_bemerkung_raw", None) or ""
        p.pop("_spouse_raw", None)

        rels, unres, remaining = _parse_bemerkung(p["rowId"], bemerkung, index)
        all_rels.extend(rels)
        all_unresolved.extend(unres)

        if remaining:
            existing = p.get("notes") or ""
            # Append leftover remark text unless the notes already contain it.
            if remaining not in existing:
                p["notes"] = (existing + " " + remaining).strip() if existing else remaining

    # --- Stats output ---
    spouse_count = sum(1 for r in all_rels if r["type"] == "SPOUSE_OF")
    parent_count = sum(1 for r in all_rels if r["type"] == "PARENT_OF")
    print(f"✓ {len(persons)} persons parsed")
    print(f"✓ {len(all_rels)} relationships emitted ({spouse_count} SPOUSE_OF, {parent_count} PARENT_OF)")
    if all_unresolved:
        print(f"⚠ {len(all_unresolved)} unresolved (see unresolved[] in output)")

    if args.dry_run:
        print("\n--- dry-run: first 5 unresolved ---")
        for u in all_unresolved[:5]:
            print(f"  {u}")
        return

    output = {
        # NOTE(review): naive local timestamp without UTC offset — confirm
        # whether consumers need a timezone-aware value.
        "generated_at": datetime.datetime.now().isoformat(),
        "source": in_path.name,
        "stats": {
            "persons": len(persons),
            "relationships": len(all_rels),
            "unresolved": len(all_unresolved),
        },
        "persons": persons,
        "relationships": all_rels,
        "unresolved": all_unresolved,
    }

    out_path = Path(args.output)
    # parents=True: --output may point into a directory tree that does not exist yet.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"→ {args.output}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
|
|||||||
@@ -431,3 +431,27 @@ def test_parse_bemerkung_sohn_with_trailing_remark():
|
|||||||
assert len(rels) == 2
|
assert len(rels) == 2
|
||||||
assert unres == []
|
assert unres == []
|
||||||
assert notes == "nach Mexiko emigriert"
|
assert notes == "nach Mexiko emigriert"
|
||||||
|
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def test_dry_run_exits_zero(tmp_path):
    """A --dry-run invocation exits 0, prints stats and writes no output file."""
    this_file = Path(__file__)

    # Walk four directory levels up from this test file to find the workbook.
    base_dir = this_file
    for _ in range(4):
        base_dir = base_dir.parent
    workbook = base_dir / "import" / "Personendatei 2.xlsx"

    if not workbook.exists():
        import pytest
        pytest.skip("source Excel file not present")

    script = this_file.parent.parent / "persons_tree.py"
    target = tmp_path / "out.json"
    command = [
        sys.executable, str(script),
        "--input", str(workbook),
        "--output", str(target),
        "--dry-run",
    ]

    # Run the CLI in a fresh interpreter so the real entry point is exercised.
    proc = subprocess.run(command, capture_output=True, text=True)

    assert proc.returncode == 0, proc.stderr
    assert not (tmp_path / "out.json").exists()
    assert "persons parsed" in proc.stdout
|
||||||
|
|||||||
Reference in New Issue
Block a user