feat(normalizer): add _deduplicate() to persons_tree
This commit is contained in:
@@ -229,3 +229,47 @@ def test_parse_row_empty_spouse_and_notes():
|
||||
p = persons_tree._parse_row(4, fields)
|
||||
assert p["_spouse_raw"] is None
|
||||
assert p["_bemerkung_raw"] is None
|
||||
|
||||
|
||||
def test_deduplicate_no_duplicates():
    """Two persons who differ in first name and birth year are both retained.

    With no duplicate present, ``_deduplicate`` must return both input
    records and an empty list of skipped entries.
    """
    persons = [
        {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920},
        {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert len(result) == 2
    assert skipped == []
|
||||
|
||||
|
||||
def test_deduplicate_exact_match():
    """Records with identical firstName, lastName and birthYear collapse to one.

    Mirrors rows 127/138 of the source data: the first occurrence
    (``row_127``) is kept, and the single skipped entry mentions the
    dropped row (``row_138``).
    """
    persons = [
        {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951},
        {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_127"]
    assert len(skipped) == 1
    assert "row_138" in skipped[0]
|
||||
|
||||
|
||||
def test_deduplicate_none_birth_year_after_known():
    """A same-name record without a birth year is dropped after a dated one.

    Mirrors rows 129/139 of the source data: row 129 has ``birthYear=1964``
    and row 139 has ``birthYear=None``. Only the earlier, dated record is
    kept and exactly one entry is reported as skipped.
    """
    persons = [
        {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964},
        {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_129"]
    assert len(skipped) == 1
|
||||
|
||||
|
||||
def test_deduplicate_both_none_birth_year_kept():
    """Two same-name records that both lack a birth year: only the first is kept.

    The first record (``row_A``) is kept, the second is dropped, and
    exactly one entry is reported as skipped.
    """
    persons = [
        {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None},
        {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_A"]
    assert len(skipped) == 1
|
||||
|
||||
Reference in New Issue
Block a user