def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
    """Remove duplicate person rows, keeping the first occurrence of each.

    Rows are scanned in input order and a row is dropped in two situations:

    1. Exact match: an earlier kept row has the same
       ``(firstName, lastName, birthYear)``. ``birthYear=None`` takes part in
       this comparison, so repeated rows without a year collapse to the first.
    2. Missing year: the row has ``birthYear=None`` and an earlier kept row
       with the same ``(firstName, lastName)`` has a known birth year.

    The check is order dependent: a row without a birth year that appears
    *before* its dated namesake is kept, and same-named rows with different
    known birth years are all kept.

    Args:
        persons: Row dicts providing the keys ``rowId``, ``firstName``,
            ``lastName`` and ``birthYear``. Neither the list nor the dicts
            are modified.

    Returns:
        A ``(result, skipped)`` tuple: the surviving rows in their original
        order, and one human-readable message per dropped row naming the row
        it duplicates.

    Raises:
        KeyError: If a row lacks one of the required keys.
    """
    seen_full: dict[tuple, str] = {}  # (first, last, year) -> rowId of the kept row
    seen_name: dict[tuple, str] = {}  # (first, last) -> rowId of FIRST kept row with a year
    result: list[dict] = []
    skipped: list[str] = []

    for p in persons:
        first, last, year = p["firstName"], p["lastName"], p["birthYear"]
        row_id = p["rowId"]
        key_full = (first, last, year)
        key_name = (first, last)

        # Stage 1: identical name and birth year (including year=None).
        if key_full in seen_full:
            skipped.append(
                f"{row_id} duplicates {seen_full[key_full]} "
                f"({first} {last}, year={year})"
            )
            continue

        # Stage 2: undated row whose name already appeared with a known year.
        if year is None and key_name in seen_name:
            skipped.append(
                f"{row_id} duplicates {seen_name[key_name]} "
                f"({first} {last}, no birth year)"
            )
            continue

        seen_full[key_full] = row_id
        if year is not None:
            # setdefault keeps the first dated row as the reference; a plain
            # assignment would let a later namesake with a different year
            # overwrite it and be reported as the duplicate target instead.
            seen_name.setdefault(key_name, row_id)

        result.append(p)

    return result, skipped
"Allemeyer", "birthYear": 1920}, + {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert len(result) == 2 + assert skipped == [] + + +def test_deduplicate_exact_match(): + # rows 127/138: same firstName, lastName, birthYear + persons = [ + {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}, + {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert [p["rowId"] for p in result] == ["row_127"] + assert len(skipped) == 1 + assert "row_138" in skipped[0] + + +def test_deduplicate_none_birth_year_after_known(): + # rows 129/139: row 129 has birthYear=1964, row 139 has birthYear=None + persons = [ + {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964}, + {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert [p["rowId"] for p in result] == ["row_129"] + assert len(skipped) == 1 + + +def test_deduplicate_both_none_birth_year_kept(): + # Two people with no birth year but same name: keep first only + persons = [ + {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None}, + {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert [p["rowId"] for p in result] == ["row_A"] + assert len(skipped) == 1