2026-05-28 15:05:51 +02:00
1 changed files with 58 additions and 0 deletions
--- a/tools/import-normalizer/tests/test_normalize.py
+++ b/tools/import-normalizer/tests/test_normalize.py
@@ -1,3 +1,8 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
 import openpyxl
 import normalize

@@ -119,3 +124,56 @@ def test_approved_themes_applied(tmp_path):
    tag_values = [ws.cell(row=r, column=tag_col + 1).value for r in range(2, ws.max_row + 1)]
    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
    assert any(v and "Themen/geschäftsreise" in v for v in tag_values)
+
+
+def _person_wb_with_collision(tmp_path):
+    """Save a workbook whose two "Hans Cram" rows make the register suffix the slug (-1/-2)."""
+    book = openpyxl.Workbook()
+    book.active.title = "Tabelle1"
+    book.active.append(["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
+                        "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"])
+    for lead in (["G 1", "de Gruyter", "Walter"], ["G 1", "de Gruyter", "Eugenie", "Müller"],
+                 ["G 2", "Cram", "Hans", "", "1890"], ["G 3", "Cram", "Hans", "", "1925"]):
+        book.active.append(lead + [""] * (10 - len(lead)))  # blank-pad to the 10 header columns
+    book.save(tmp_path / "persons.xlsx")
+    return tmp_path / "persons.xlsx"
+
+
+def _generate_tree(person_wb, out_path):
+    """Run persons_tree.py on person_wb as a subprocess and return the JSON written to out_path."""
+    tool_dir = Path(__file__).parent.parent  # persons_tree.py sits one level above tests/
+    argv = [sys.executable, str(tool_dir / "persons_tree.py"),
+            "--input", str(person_wb), "--output", str(out_path)]
+    proc = subprocess.run(argv, capture_output=True, text=True)
+    assert proc.returncode == 0, proc.stderr  # surface the script's stderr on failure
+    return json.loads(out_path.read_text(encoding="utf-8"))
+
+
+def test_tree_person_ids_reconcile_with_persons_xlsx(tmp_path):
+    """Check the #669 contract: tree personIds join 1:1 onto register person_ids.
+
+    canonical-persons.xlsx and the tree JSON are both generated from the same
+    person workbook (slug collision included), so no tree id may be orphaned or
+    duplicated in the register.
+    """
+    persons_path = _person_wb_with_collision(tmp_path)
+    out_dir = tmp_path / "out"
+    normalize.run(
+        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
+        person_workbook=persons_path, person_sheet="Tabelle1",
+        out_dir=out_dir, review_dir=tmp_path / "review", date_overrides={}, name_overrides={})
+
+    tree = _generate_tree(persons_path, tmp_path / "tree.json")
+    tree_ids = [person["personId"] for person in tree["persons"]]
+
+    sheet = openpyxl.load_workbook(out_dir / "canonical-persons.xlsx").active
+    id_col = [cell.value for cell in sheet[1]].index("person_id") + 1
+    register_ids = [sheet.cell(row=r, column=id_col).value for r in range(2, sheet.max_row + 1)]
+
+    # A duplicated tree id would make the join key ambiguous.
+    assert len(set(tree_ids)) == len(tree_ids)
+    # Both suffixed collision ids must have made it into the tree.
+    assert {"cram-hans-1", "cram-hans-2"} <= set(tree_ids)
+    # Each tree id must match exactly one register row (total, 1:1 join).
+    register_counts = {pid: register_ids.count(pid) for pid in tree_ids}
+    assert all(n == 1 for n in register_counts.values()), register_counts