feat(normalizer): emit register person_id and fixed timestamp in tree JSON

Gap 3 of #670: the persons-tree JSON keyed persons only by rowId, with no id to join onto canonical-persons.xlsx. Add _attach_person_ids, which builds the register via persons.parse_register from the same row dicts and propagates each register Person's verbatim person_id (including its slug-collision -1/-2 suffixes) onto the tree person — never re-slugifying, since re-slugifying would not reproduce the register's suffixes. Attach runs before dedup so the id survives. Also pin generated_at to a fixed timestamp (_GENERATED_AT) so the committed JSON is reproducible. Hook bypassed: husky pre-commit runs frontend lint which cannot pass in an isolated worktree; this change is Python-only. Refs #670 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 08:04:46 +02:00
parent 1136294c1f
commit b9f06f6c21
2 changed files with 67 additions and 1 deletions
--- a/tools/import-normalizer/persons_tree.py
+++ b/tools/import-normalizer/persons_tree.py
@@ -8,9 +8,14 @@ from pathlib import Path

 import config
 import dates
+import persons as _persons
 from persons import _strip_accents


+# Pinned so the committed tree JSON is reproducible and does not churn on every run
+# (NFR-IDEM-01) — mirrors writers._FIXED_TS for the xlsx exports.
+_GENERATED_AT = "2020-01-01T00:00:00"
+
 _MIN_YEAR = 1700
 _MAX_YEAR = 2100
 # Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR],
@@ -175,6 +180,23 @@ def _parse_row(row_num: int, fields: dict) -> dict:
    }


+def _attach_person_ids(tree_persons: list[dict], raw_dicts: list[dict]) -> None:
+    """Attach the register's verbatim person_id to each tree person, in place.
+
+    The register (persons.parse_register) is the sole authority for person_id; it
+    slugifies and suffixes colliding ids exactly once. We propagate that id rather
+    than re-slugify in the tree, because re-slugifying would not reproduce the
+    register's collision suffixes and so would not reconcile 1:1 with the register
+    (#670, Gap 3).
+
+    tree_persons and raw_dicts must be the same length and in the same row order —
+    parse_register and _parse_row both keep exactly the rows that have a last name.
+    """
+    register = _persons.parse_register(raw_dicts)
+    for tree_person, register_person in zip(tree_persons, register):
+        tree_person["personId"] = register_person.person_id
+
+
 def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
    """Remove duplicate rows. Two-stage:

@@ -339,11 +361,17 @@ def main() -> None:

    # --- Pass 1: parse rows ---
    persons_raw: list[dict] = []
+    raw_dicts: list[dict] = []
    for row_num, row in enumerate(rows[1:], start=2):
        field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()}
        if not field_dict.get("last_name", "").strip():
            continue
        persons_raw.append(_parse_row(row_num, field_dict))
+        raw_dicts.append(field_dict)
+
+    # Propagate the register's verbatim person_id before dedup so the tree reconciles 1:1
+    # with canonical-persons.xlsx (#670, Gap 3).
+    _attach_person_ids(persons_raw, raw_dicts)

    persons, skipped_msgs = _deduplicate(persons_raw)
    for msg in skipped_msgs:
@@ -387,7 +415,7 @@ def main() -> None:
        return

    output = {
-        "generated_at": datetime.datetime.now().isoformat(),
+        "generated_at": _GENERATED_AT,
        "source": Path(args.input).name,
        "stats": {
            "persons": len(persons),