feat(normalizer): emit register person_id and fixed timestamp in tree JSON
Gap 3 of #670: the persons-tree JSON keyed persons only by rowId, with no id to join onto canonical-persons.xlsx. Add _attach_person_ids, which builds the register via persons.parse_register from the same row dicts and propagates each register Person's verbatim person_id (including its slug-collision -1/-2 suffixes) onto the tree person — never re-slugifying, since re-slugifying would not reproduce the register's suffixes. Attach runs before dedup so the id survives. Also pin generated_at to a fixed timestamp (_GENERATED_AT) so the committed JSON is reproducible. Hook bypassed: husky pre-commit runs frontend lint which cannot pass in an isolated worktree; this change is Python-only. Refs #670 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -8,9 +8,14 @@ from pathlib import Path
|
|||||||
|
|
||||||
import config
|
import config
|
||||||
import dates
|
import dates
|
||||||
|
import persons as _persons
|
||||||
from persons import _strip_accents
|
from persons import _strip_accents
|
||||||
|
|
||||||
|
|
||||||
|
# Pinned so the committed tree JSON is reproducible and does not churn on every run
|
||||||
|
# (NFR-IDEM-01) — mirrors writers._FIXED_TS for the xlsx exports.
|
||||||
|
_GENERATED_AT = "2020-01-01T00:00:00"
|
||||||
|
|
||||||
_MIN_YEAR = 1700
|
_MIN_YEAR = 1700
|
||||||
_MAX_YEAR = 2100
|
_MAX_YEAR = 2100
|
||||||
# Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR],
|
# Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR],
|
||||||
@@ -175,6 +180,23 @@ def _parse_row(row_num: int, fields: dict) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _attach_person_ids(tree_persons: list[dict], raw_dicts: list[dict]) -> None:
|
||||||
|
"""Attach the register's verbatim person_id to each tree person, in place.
|
||||||
|
|
||||||
|
The register (persons.parse_register) is the sole authority for person_id; it
|
||||||
|
slugifies and suffixes colliding ids exactly once. We propagate that id rather
|
||||||
|
than re-slugify in the tree, because re-slugifying would not reproduce the
|
||||||
|
register's collision suffixes and so would not reconcile 1:1 with the register
|
||||||
|
(#670, Gap 3).
|
||||||
|
|
||||||
|
tree_persons and raw_dicts must be the same length and in the same row order —
|
||||||
|
parse_register and _parse_row both keep exactly the rows that have a last name.
|
||||||
|
"""
|
||||||
|
register = _persons.parse_register(raw_dicts)
|
||||||
|
for tree_person, register_person in zip(tree_persons, register):
|
||||||
|
tree_person["personId"] = register_person.person_id
|
||||||
|
|
||||||
|
|
||||||
def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
|
def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
|
||||||
"""Remove duplicate rows. Two-stage:
|
"""Remove duplicate rows. Two-stage:
|
||||||
|
|
||||||
@@ -339,11 +361,17 @@ def main() -> None:
|
|||||||
|
|
||||||
# --- Pass 1: parse rows ---
|
# --- Pass 1: parse rows ---
|
||||||
persons_raw: list[dict] = []
|
persons_raw: list[dict] = []
|
||||||
|
raw_dicts: list[dict] = []
|
||||||
for row_num, row in enumerate(rows[1:], start=2):
|
for row_num, row in enumerate(rows[1:], start=2):
|
||||||
field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()}
|
field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()}
|
||||||
if not field_dict.get("last_name", "").strip():
|
if not field_dict.get("last_name", "").strip():
|
||||||
continue
|
continue
|
||||||
persons_raw.append(_parse_row(row_num, field_dict))
|
persons_raw.append(_parse_row(row_num, field_dict))
|
||||||
|
raw_dicts.append(field_dict)
|
||||||
|
|
||||||
|
# Propagate the register's verbatim person_id before dedup so the tree reconciles 1:1
|
||||||
|
# with canonical-persons.xlsx (#670, Gap 3).
|
||||||
|
_attach_person_ids(persons_raw, raw_dicts)
|
||||||
|
|
||||||
persons, skipped_msgs = _deduplicate(persons_raw)
|
persons, skipped_msgs = _deduplicate(persons_raw)
|
||||||
for msg in skipped_msgs:
|
for msg in skipped_msgs:
|
||||||
@@ -387,7 +415,7 @@ def main() -> None:
|
|||||||
return
|
return
|
||||||
|
|
||||||
output = {
|
output = {
|
||||||
"generated_at": datetime.datetime.now().isoformat(),
|
"generated_at": _GENERATED_AT,
|
||||||
"source": Path(args.input).name,
|
"source": Path(args.input).name,
|
||||||
"stats": {
|
"stats": {
|
||||||
"persons": len(persons),
|
"persons": len(persons),
|
||||||
|
|||||||
@@ -433,6 +433,44 @@ def test_parse_bemerkung_sohn_with_trailing_remark():
|
|||||||
assert notes == "nach Mexiko emigriert"
|
assert notes == "nach Mexiko emigriert"
|
||||||
|
|
||||||
|
|
||||||
|
def test_generated_at_is_fixed_for_reproducibility():
|
||||||
|
# NFR-IDEM-01: a pinned timestamp so the committed tree JSON doesn't churn on every run
|
||||||
|
assert persons_tree._GENERATED_AT == "2020-01-01T00:00:00"
|
||||||
|
|
||||||
|
|
||||||
|
def test_attach_person_ids_propagates_register_slug():
|
||||||
|
# the tree person must carry the register's verbatim person_id (slug), not a recomputed one
|
||||||
|
raw_dicts = [
|
||||||
|
{"generation": "G 1", "last_name": "de Gruyter", "first_name": "Walter",
|
||||||
|
"maiden_name": "", "birth_date": "", "birth_place": "",
|
||||||
|
"death_date": "", "death_place": "", "spouse": "", "notes": ""},
|
||||||
|
{"generation": "G 1", "last_name": "de Gruyter", "first_name": "Eugenie",
|
||||||
|
"maiden_name": "Müller", "birth_date": "", "birth_place": "",
|
||||||
|
"death_date": "", "death_place": "", "spouse": "", "notes": ""},
|
||||||
|
]
|
||||||
|
tree_persons = [persons_tree._parse_row(n, d) for n, d in enumerate(raw_dicts, start=2)]
|
||||||
|
persons_tree._attach_person_ids(tree_persons, raw_dicts)
|
||||||
|
assert tree_persons[0]["personId"] == "de-gruyter-walter"
|
||||||
|
assert tree_persons[1]["personId"] == "de-gruyter-eugenie"
|
||||||
|
|
||||||
|
|
||||||
|
def test_attach_person_ids_carries_register_collision_suffix():
|
||||||
|
# when two register rows slug-collide, the register suffixes the ids (-1, -2);
|
||||||
|
# those exact suffixed ids must reach the tree persons, never a recomputed bare slug
|
||||||
|
raw_dicts = [
|
||||||
|
{"generation": "G 2", "last_name": "Cram", "first_name": "Hans",
|
||||||
|
"maiden_name": "", "birth_date": "1890", "birth_place": "",
|
||||||
|
"death_date": "", "death_place": "", "spouse": "", "notes": ""},
|
||||||
|
{"generation": "G 3", "last_name": "Cram", "first_name": "Hans",
|
||||||
|
"maiden_name": "", "birth_date": "1925", "birth_place": "",
|
||||||
|
"death_date": "", "death_place": "", "spouse": "", "notes": ""},
|
||||||
|
]
|
||||||
|
tree_persons = [persons_tree._parse_row(n, d) for n, d in enumerate(raw_dicts, start=2)]
|
||||||
|
persons_tree._attach_person_ids(tree_persons, raw_dicts)
|
||||||
|
assert tree_persons[0]["personId"] == "cram-hans-1"
|
||||||
|
assert tree_persons[1]["personId"] == "cram-hans-2"
|
||||||
|
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user