feat(normalizer): add _deduplicate() to persons_tree

This commit is contained in:
Marcel
2026-05-25 21:02:02 +02:00
parent 7012234e6a
commit 1f2351e3c0
2 changed files with 78 additions and 0 deletions

View File

@@ -173,3 +173,37 @@ def _parse_row(row_num: int, fields: dict) -> dict:
"_spouse_raw": spouse,
"_bemerkung_raw": bemerkung_out,
}
def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
"""Remove duplicate rows. Two-stage:
1. Exact (firstName, lastName, birthYear) match.
2. (firstName, lastName) where the later entry has birthYear=None and an earlier
entry already has a known birthYear.
"""
seen_full: dict[tuple, str] = {} # (first, last, year) -> rowId
seen_name: dict[tuple, str] = {} # (first, last) -> rowId of first entry with a year
result: list[dict] = []
skipped: list[str] = []
for p in persons:
first, last, year = p["firstName"], p["lastName"], p["birthYear"]
key_full = (first, last, year)
key_name = (first, last)
if key_full in seen_full:
skipped.append(f"{p['rowId']} duplicates {seen_full[key_full]} ({first} {last}, year={year})")
continue
if year is None and key_name in seen_name:
skipped.append(f"{p['rowId']} duplicates {seen_name[key_name]} ({first} {last}, no birth year)")
continue
seen_full[key_full] = p["rowId"]
if year is not None:
seen_name[key_name] = p["rowId"]
result.append(p)
return result, skipped