Remove the unauthorized 5th index key (_norm_tree(first)) from _build_index. The spec requires exactly 4 keys per person: (1) forward (first last); (2) reversed (last first); (3) maiden name (first maiden), if a maiden name is set; (4) lastName only (last). Update the test data to use full names in Bemerkung fields (e.g., 'Clara Cram' instead of 'Clara'), since a single first name alone is no longer resolvable. All 52 tests pass.
421 lines
14 KiB
Python
421 lines
14 KiB
Python
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
import persons_tree
|
|
|
|
|
|
def test_parse_year_iso_string():
    """An ISO ``YYYY-MM-DD`` string yields its year component."""
    year = persons_tree._parse_year("1920-09-20")
    assert year == 1920
|
|
|
|
|
|
def test_parse_year_excel_serial_birth():
    """The serial-style value ``"7568"`` is expected to resolve to 1920."""
    # Counting 7568 days from 1899-12-30 lands on 1920-09-19 or -20,
    # depending on how the leap day is counted; the year is 1920 either way.
    year = persons_tree._parse_year("7568")
    assert year == 1920
|
|
|
|
|
|
def test_parse_year_excel_serial_death():
    """The serial-style value ``"36222"`` is expected to resolve to 1999."""
    # 36222 days after 1899-12-30 falls in roughly 1999.
    year = persons_tree._parse_year("36222")
    assert year == 1999
|
|
|
|
|
|
def test_parse_year_excel_serial_small():
    """A small serial-style value (``"177"``) is expected to resolve to 1900."""
    # 177 days after 1899-12-30 is 1900-06-25.
    year = persons_tree._parse_year("177")
    assert year == 1900
|
|
|
|
|
|
def test_parse_year_german_date_string():
    """A German ``D.M.YYYY`` date string yields its year."""
    year = persons_tree._parse_year("30.8.1862")
    assert year == 1862
|
|
|
|
|
|
def test_parse_year_year_only():
    """A bare four-digit year string is returned as that year."""
    year = persons_tree._parse_year("1930")
    assert year == 1930
|
|
|
|
|
|
def test_parse_year_free_text():
    """A year embedded in free text (``"August 1941"``) is picked out."""
    year = persons_tree._parse_year("August 1941")
    assert year == 1941
|
|
|
|
|
|
def test_parse_year_none():
    """``None`` input produces ``None``."""
    year = persons_tree._parse_year(None)
    assert year is None
|
|
|
|
|
|
def test_parse_year_empty():
    """An empty string produces ``None``."""
    year = persons_tree._parse_year("")
    assert year is None
|
|
|
|
|
|
def test_parse_year_unresolvable_truncated():
    """A truncated date without a full year cannot be resolved."""
    # "2.9.196" contains no valid 4-digit year, so None is expected.
    year = persons_tree._parse_year("2.9.196")
    assert year is None
|
|
|
|
|
|
def test_parse_year_typo_year():
    """A date whose year lies outside the accepted range is rejected."""
    # In "4.3.1023" the year 1023 is outside the 1700-2100 guard,
    # so None is expected.
    year = persons_tree._parse_year("4.3.1023")
    assert year is None
|
|
|
|
|
|
def test_parse_year_bare_out_of_range_year_is_none():
    """A bare out-of-range year must not be re-read as an Excel serial."""
    # "1023" is a plausible typo for "1923" but is NOT an Excel serial.
    # parse_date("1023") parses it as year 1023, which is outside the
    # 1700-2100 guard; the serial branch must NOT re-interpret it.
    year = persons_tree._parse_year("1023")
    assert year is None
|
|
|
|
|
|
def test_parse_generation_space():
    """``"G 3"`` (letter, space, digit) resolves to generation 3."""
    generation = persons_tree._parse_generation("G 3")
    assert generation == 3
|
|
|
|
|
|
def test_parse_generation_no_space():
    """``"G3"`` (no separator) resolves to generation 3."""
    generation = persons_tree._parse_generation("G3")
    assert generation == 3
|
|
|
|
|
|
def test_parse_generation_extra_spaces():
    """``"G 0"`` resolves to generation 0."""
    # NOTE(review): the test name mentions extra spaces, yet the literal below
    # holds a single one -- confirm the intended input against the repository.
    generation = persons_tree._parse_generation("G 0")
    assert generation == 0
|
|
|
|
|
|
def test_parse_generation_trailing_garbage():
    """Text following the generation marker does not disturb the result."""
    generation = persons_tree._parse_generation("G 2 de Gruyter")
    assert generation == 2
|
|
|
|
|
|
def test_parse_generation_empty():
    """An empty string produces ``None``."""
    generation = persons_tree._parse_generation("")
    assert generation is None
|
|
|
|
|
|
def test_parse_generation_none():
    """``None`` input produces ``None``."""
    generation = persons_tree._parse_generation(None)
    assert generation is None
|
|
|
|
|
|
def test_norm_tree_basic():
    """A plain two-part name is lower-cased."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
|
|
|
|
|
|
def test_norm_tree_diacritics():
    """The umlaut in ``"Wöhler"`` is transliterated, giving ``"woehler"``."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
|
|
|
|
|
|
def test_norm_tree_strips_parens():
    """A parenthesised addition is removed from the name."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
|
|
|
|
|
|
def test_norm_tree_strips_quotes():
    """Surrounding double quotes are removed from the name."""
    normalized = persons_tree._norm_tree('"Tante Lolly"')
    assert normalized == "tante lolly"
|
|
|
|
|
|
def test_norm_tree_strips_geographic_suffix():
    """The trailing place name ``Aachen`` is dropped from the name."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
|
|
|
|
|
|
def test_norm_tree_strips_mexiko():
    """The trailing place name ``Mexiko`` is dropped from the name."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
|
|
|
|
|
|
def test_norm_tree_collapses_whitespace():
    """Leading and trailing blanks do not survive normalisation."""
    # NOTE(review): this literal only carries single inner spaces; confirm
    # whether repeated spaces were intended, as the test name suggests.
    normalized = persons_tree._norm_tree(" Clara de Gruyter ")
    assert normalized == "clara de gruyter"
|
|
|
|
|
|
def test_build_index_forward_lookup():
    """The index contains a "first last" key mapping to the person's row id."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])
    forward_key = "werner allemeyer"
    assert forward_key in index
    assert index[forward_key] == ["row_002"]
|
|
|
|
|
|
def test_build_index_reversed_lookup():
    """The index contains a "last first" key mapping to the person's row id."""
    werner = {
        "rowId": "row_002",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])
    hits = index.get("allemeyer werner")
    assert hits == ["row_002"]
|
|
|
|
|
|
def test_build_index_maiden_name_lookup():
    """A person with a maiden name is also indexed under "first maiden"."""
    elsgard = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
    }
    index = persons_tree._build_index([elsgard])
    hits = index.get("elsgard woehler")
    assert hits == ["row_002"]
|
|
|
|
|
|
def test_build_index_single_token_fallback():
    """The last name alone is an index key for the person."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert])
    hits = index.get("cram")
    assert hits == ["row_028"]
|
|
|
|
|
|
def test_build_index_ambiguous_single_token():
    """Two people sharing a last name are both listed under that key."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    clara = {
        "rowId": "row_019",
        "firstName": "Clara",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert, clara])
    # Order inside the bucket is irrelevant here, hence the set comparison.
    row_ids = set(index["cram"])
    assert row_ids == {"row_028", "row_019"}
|
|
|
|
|
|
def test_resolve_one_found():
    """A "last first" query resolves to the single matching row, without a reason."""
    werner = {
        "rowId": "row_003",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "maidenName": None,
    }
    index = persons_tree._build_index([werner])
    outcome = persons_tree._resolve_one("Allemeyer Werner", index)
    matched_id, failure = outcome
    assert matched_id == "row_003"
    assert failure is None
|
|
|
|
|
|
def test_resolve_one_not_found():
    """Looking a name up in an empty index reports ``"not_found"``."""
    empty_index = {}
    outcome = persons_tree._resolve_one("Nobody Unknown", empty_index)
    matched_id, failure = outcome
    assert matched_id is None
    assert failure == "not_found"
|
|
|
|
|
|
def test_resolve_one_ambiguous():
    """A last-name-only query matching two people reports ``"ambiguous"``."""
    herbert = {
        "rowId": "row_028",
        "firstName": "Herbert",
        "lastName": "Cram",
        "maidenName": None,
    }
    clara = {
        "rowId": "row_019",
        "firstName": "Clara",
        "lastName": "Cram",
        "maidenName": None,
    }
    index = persons_tree._build_index([herbert, clara])
    outcome = persons_tree._resolve_one("Cram", index)
    matched_id, failure = outcome
    assert matched_id is None
    assert failure == "ambiguous"
|
|
|
|
|
|
def test_parse_row_serial_dates():
    """A row with serial-style birth/death dates yields a complete person record."""
    raw_row = {
        "generation": "G 3",
        "last_name": "Allemeyer",
        "first_name": "Elsgard",
        "maiden_name": "Wöhler",
        "birth_date": "7568",
        "birth_place": "Garz",
        "death_date": "36222",
        "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner",
        "notes": "Nichte von Herbert",
    }
    person = persons_tree._parse_row(2, raw_row)

    expected_values = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
        "birthYear": 1920,
        "deathYear": 1999,
        "birthPlace": "Garz",
        "deathPlace": "Espelkamp",
        "generation": 3,
        "_spouse_raw": "Allemeyer Werner",
        "_bemerkung_raw": "Nichte von Herbert",
    }
    for field, wanted in expected_values.items():
        assert person[field] == wanted, field

    # familyMember must be the boolean True itself, not merely truthy.
    assert person["familyMember"] is True
    # The notes must not carry a "[Geburtsdatum" marker for this row.
    assert "[Geburtsdatum" not in (person["notes"] or "")
|
|
|
|
|
|
def test_parse_row_string_birth_date():
    """A German-style birth date is parsed; absent death date and notes stay empty."""
    raw_row = {
        "generation": "G 2",
        "last_name": "Cram",
        "first_name": "Herbert",
        "maiden_name": "",
        "birth_date": "25.6.1890",
        "birth_place": "Texas",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    person = persons_tree._parse_row(28, raw_row)
    assert person["birthYear"] == 1890
    assert person["deathYear"] is None
    # Either representation of "no notes" is accepted.
    assert person["notes"] is None or person["notes"] == ""
|
|
|
|
|
|
def test_parse_row_unresolvable_date_goes_to_notes():
    """A birth date without a year is preserved verbatim inside the notes."""
    raw_row = {
        "generation": "G 3",
        "last_name": "Heydrich",
        "first_name": "Dieter",
        "maiden_name": "",
        "birth_date": "28.9.",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "Bruder v Ingrid",
    }
    person = persons_tree._parse_row(96, raw_row)
    assert person["birthYear"] is None
    # Both the raw-date marker and the original remark must be present.
    for fragment in ("[Geburtsdatum: 28.9.]", "Bruder v Ingrid"):
        assert fragment in person["notes"]
|
|
|
|
|
|
def test_parse_row_empty_spouse_and_notes():
    """Empty spouse and notes cells surface as ``None`` in the raw fields."""
    raw_row = {
        "generation": "G 4",
        "last_name": "Allemeyer",
        "first_name": "Jürgen",
        "maiden_name": "",
        "birth_date": "",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    person = persons_tree._parse_row(4, raw_row)
    for raw_field in ("_spouse_raw", "_bemerkung_raw"):
        assert person[raw_field] is None
|
|
|
|
|
|
def test_deduplicate_no_duplicates():
    """Distinct people all survive and nothing is reported as skipped."""
    elsgard = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "birthYear": 1920,
    }
    werner = {
        "rowId": "row_003",
        "firstName": "Werner",
        "lastName": "Allemeyer",
        "birthYear": 1923,
    }
    kept, dropped = persons_tree._deduplicate([elsgard, werner])
    assert len(kept) == 2
    assert dropped == []
|
|
|
|
|
|
def test_deduplicate_exact_match():
    """Identical first name, last name and birth year: only the first row is kept."""
    # Mirrors rows 127/138: same firstName, lastName, birthYear.
    first_row = {
        "rowId": "row_127",
        "firstName": "Christa",
        "lastName": "Schütz",
        "birthYear": 1951,
    }
    second_row = {
        "rowId": "row_138",
        "firstName": "Christa",
        "lastName": "Schütz",
        "birthYear": 1951,
    }
    kept, dropped = persons_tree._deduplicate([first_row, second_row])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_127"]
    assert len(dropped) == 1
    # The skip entry must mention the discarded row.
    assert "row_138" in dropped[0]
|
|
|
|
|
|
def test_deduplicate_none_birth_year_after_known():
    """A same-name row lacking a birth year is dropped when a dated row precedes it."""
    # Mirrors rows 129/139: row 129 has birthYear=1964, row 139 has none.
    dated = {
        "rowId": "row_129",
        "firstName": "Christoph",
        "lastName": "Seils",
        "birthYear": 1964,
    }
    undated = {
        "rowId": "row_139",
        "firstName": "Christoph",
        "lastName": "Seils",
        "birthYear": None,
    }
    kept, dropped = persons_tree._deduplicate([dated, undated])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_129"]
    assert len(dropped) == 1
|
|
|
|
|
|
def test_deduplicate_both_none_birth_year_kept():
    """Two same-name rows without birth years: only the first one is kept."""
    first_row = {
        "rowId": "row_A",
        "firstName": "Hans",
        "lastName": "Cram",
        "birthYear": None,
    }
    second_row = {
        "rowId": "row_B",
        "firstName": "Hans",
        "lastName": "Cram",
        "birthYear": None,
    }
    kept, dropped = persons_tree._deduplicate([first_row, second_row])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_A"]
    assert len(dropped) == 1
|
|
|
|
|
|
def _make_persons(*args):
|
|
"""Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples."""
|
|
return [
|
|
{"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3],
|
|
"_spouse_raw": a[4], "_bemerkung_raw": None,
|
|
"birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None,
|
|
"generation": None, "familyMember": True, "alias": None, "notes": None}
|
|
for a in args
|
|
]
|
|
|
|
|
|
def test_resolve_spouses_success():
    """Two people naming each other as spouse produce a single SPOUSE_OF relation."""
    couple = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    index = persons_tree._build_index(couple)
    relations, unresolved = persons_tree._resolve_spouses(couple, index)
    assert len(relations) == 1
    marriage = relations[0]
    assert marriage["type"] == "SPOUSE_OF"
    # Which partner sits on which side of the relation is not pinned down.
    partners = {marriage["personId"], marriage["relatedPersonId"]}
    assert partners == {"row_002", "row_003"}
    assert unresolved == []
|
|
|
|
|
|
def test_resolve_spouses_not_found():
    """A spouse text matching nobody is reported as unresolved with ``"not_found"``."""
    people = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    index = persons_tree._build_index(people)
    relations, unresolved = persons_tree._resolve_spouses(people, index)
    assert relations == []
    assert len(unresolved) == 1
    failure = unresolved[0]
    assert failure["rowId"] == "row_007"
    assert failure["reason"] == "not_found"
|
|
|
|
|
|
def test_resolve_spouses_empty_spouse_field():
    """A person without spouse text yields neither relations nor unresolved entries."""
    people = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    index = persons_tree._build_index(people)
    relations, unresolved = persons_tree._resolve_spouses(people, index)
    assert relations == []
    assert unresolved == []
|
|
|
|
|
|
def _register(*args):
    """Build index from (rowId, first, last, maiden) tuples.

    Returns:
        A ``(persons, index)`` pair: the person dicts created from the
        tuples, and the result of ``persons_tree._build_index`` for them.
    """
    persons = []
    for entry in args:
        persons.append(
            {
                "rowId": entry[0],
                "firstName": entry[1],
                "lastName": entry[2],
                "maidenName": entry[3],
            }
        )
    index = persons_tree._build_index(persons)
    return persons, index
|
|
|
|
|
|
def test_parse_bemerkung_sohn_two_parents():
    """"Sohn v A u B" yields one PARENT_OF relation per named parent."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_021", remark, index
    )
    assert len(relations) == 2
    for relation in relations:
        assert relation["type"] == "PARENT_OF"
    # The remark's owner is the child in every relation ...
    children = {relation["relatedPersonId"] for relation in relations}
    assert children == {"row_021"}
    # ... and both named people appear on the parent side.
    parents = {relation["personId"] for relation in relations}
    assert {"row_019", "row_028"} <= parents
    assert unresolved == []
    assert leftover == ""
|
|
|
|
|
|
def test_parse_bemerkung_tochter_von():
    """"Tochter von X" yields exactly one fully populated PARENT_OF relation."""
    _, index = _register(("row_019", "Clara", "Cram", None))
    remark = "Tochter von Clara Cram"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_036", remark, index
    )
    expected_relation = {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert len(relations) == 1
    assert relations[0] == expected_relation
    assert leftover == ""
|
|
|
|
|
|
def test_parse_bemerkung_vater():
    """"Vater v X" makes the remark's owner the parent of X."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", index
    )
    assert len(relations) == 1
    relation = relations[0]
    assert relation["personId"] == "row_031"
    assert relation["relatedPersonId"] == "row_028"
    assert relation["type"] == "PARENT_OF"
|
|
|
|
|
|
def test_parse_bemerkung_unmatched_parent_name():
    """A parent name absent from the index is reported as ``"not_found"``."""
    _, index = _register()  # empty index
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", index
    )
    assert relations == []
    assert len(unresolved) == 1
    assert unresolved[0]["reason"] == "not_found"
    assert leftover == ""
|
|
|
|
|
|
def test_parse_bemerkung_skip_nichte():
    """A "Nichte von" remark creates no relation and is returned unchanged as notes."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Nichte von Herbert"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_002", remark, index
    )
    assert relations == []
    assert unresolved == []
    assert leftover == "Nichte von Herbert"
|
|
|
|
|
|
def test_parse_bemerkung_skip_bruder():
    """A "Bruder v" remark creates no relation and is returned unchanged as notes."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Bruder v Herbert"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_033", remark, index
    )
    assert relations == []
    assert unresolved == []
    assert leftover == "Bruder v Herbert"
|
|
|
|
|
|
def test_parse_bemerkung_empty():
    """An empty remark yields no relations, no unresolved entries and empty notes."""
    _, index = _register()
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_004", "", index
    )
    assert relations == []
    assert unresolved == []
    assert leftover == ""
|
|
|
|
|
|
def test_parse_bemerkung_plain_remark():
    """A remark without any kinship phrase is passed through verbatim as notes."""
    _, index = _register()
    remark = "Verfasserin der Cram-Chronik !!"
    relations, unresolved, leftover = persons_tree._parse_bemerkung(
        "row_029", remark, index
    )
    assert relations == []
    assert unresolved == []
    assert leftover == "Verfasserin der Cram-Chronik !!"
|