From 306f3b6fe6dffea2bbc730c4d768e994dcddf483 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Mon, 25 May 2026 20:56:47 +0200
Subject: [PATCH] feat(normalizer): add name normalization + lookup index to persons_tree

---
 tools/import-normalizer/persons_tree.py       |  72 +++++++++++++
 .../tests/test_persons_tree.py                | 102 ++++++++++++++++++
 2 files changed, 174 insertions(+)

diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py
index e346d8ab..f6b1b3c8 100644
--- a/tools/import-normalizer/persons_tree.py
+++ b/tools/import-normalizer/persons_tree.py
@@ -74,3 +74,75 @@ def _parse_generation(raw: str | None) -> int | None:
         return None
     m = re.search(r"\d+", str(raw))
     return int(m.group()) if m else None
+
+
+# Lowercase tokens that _norm_tree drops from a name; the check runs on every
+# token, not only on the trailing one.
+_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}
+
+
+def _norm_tree(s: str) -> str:
+    """Normalize a name string for tree matching.
+
+    - Strip surrounding quotes, remove parenthetical substrings
+    - Diacritic → ASCII (ä→ae etc.), lowercase, dots → spaces
+    - Remove known geographic/honorific suffix tokens (at any position)
+    - Collapse whitespace
+
+    A falsy input (``None`` or ``""``) yields ``""``.
+    """
+    s = (s or "").strip().strip("\"'")
+    s = re.sub(r"\([^)]*\)", "", s)
+    s = _strip_accents(s).lower().replace(".", " ")
+    tokens = [t for t in s.split() if t and t not in _GEO_SUFFIXES]
+    return " ".join(tokens).strip("., ")
+
+
+def _build_index(persons: list[dict]) -> dict[str, list[str]]:
+    """Build a name → [rowId, …] lookup index with up to four keys per person.
+
+    Keys are the normalized forms of "first last", "last first",
+    "first maiden" (only when a maiden name is set) and "last" alone.
+    Each rowId is stored at most once per key, so a person whose keys
+    coincide (e.g. no first name) is not reported as ambiguous with itself.
+    """
+    index: dict[str, list[str]] = {}
+
+    def _add(key: str, row_id: str) -> None:
+        if not key:
+            return
+        rows = index.setdefault(key, [])
+        if row_id not in rows:
+            rows.append(row_id)
+
+    for p in persons:
+        row_id = p["rowId"]
+        first = p.get("firstName") or ""
+        last = p.get("lastName") or ""
+        maiden = p.get("maidenName") or ""
+
+        _add(_norm_tree(f"{first} {last}"), row_id)
+        _add(_norm_tree(f"{last} {first}"), row_id)
+        if maiden:
+            _add(_norm_tree(f"{first} {maiden}"), row_id)
+        _add(_norm_tree(last), row_id)
+
+    return index
+
+
+def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
+    """Return (row_id, None) on unique match, (None, reason) otherwise.
+
+    ``reason`` is "empty" when *raw* normalizes to nothing, "not_found" when
+    the key is absent from *index*, and "ambiguous" when the key maps to more
+    than one entry.
+    """
+    key = _norm_tree(raw)
+    if not key:
+        return None, "empty"
+    hits = index.get(key, [])
+    if len(hits) == 1:
+        return hits[0], None
+    if len(hits) == 0:
+        return None, "not_found"
+    return None, "ambiguous"
diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py
index bfb7d908..8b040e1d 100644
--- a/tools/import-normalizer/tests/test_persons_tree.py
+++ b/tools/import-normalizer/tests/test_persons_tree.py
@@ -83,3 +83,105 @@ def test_parse_generation_empty():
 
 def test_parse_generation_none():
     assert persons_tree._parse_generation(None) is None
+
+
+def test_norm_tree_basic():
+    assert persons_tree._norm_tree("Werner Allemeyer") == "werner allemeyer"
+
+
+def test_norm_tree_diacritics():
+    assert persons_tree._norm_tree("Wöhler") == "woehler"
+
+
+def test_norm_tree_strips_parens():
+    assert persons_tree._norm_tree("Otto (Herbert)") == "otto"
+
+
+def test_norm_tree_strips_quotes():
+    assert persons_tree._norm_tree('"Tante Lolly"') == "tante lolly"
+
+
+def test_norm_tree_strips_geographic_suffix():
+    assert persons_tree._norm_tree("Walter Cram Aachen") == "walter cram"
+
+
+def test_norm_tree_strips_mexiko():
+    assert persons_tree._norm_tree("Hans Cram Mexiko") == "hans cram"
+
+
+def test_norm_tree_collapses_whitespace():
+    assert persons_tree._norm_tree("  Clara   de  Gruyter  ") == "clara de gruyter"
+
+
+def test_build_index_forward_lookup():
+    persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
+    idx = persons_tree._build_index(persons)
+    assert "werner allemeyer" in idx
+    assert idx["werner allemeyer"] == ["row_002"]
+
+
+def test_build_index_reversed_lookup():
+    persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
+    idx = persons_tree._build_index(persons)
+    assert idx.get("allemeyer werner") == ["row_002"]
+
+
+def test_build_index_maiden_name_lookup():
+    persons = [{"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}]
+    idx = persons_tree._build_index(persons)
+    assert idx.get("elsgard woehler") == ["row_002"]
+
+
+def test_build_index_single_token_fallback():
+    persons = [{"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}]
+    idx = persons_tree._build_index(persons)
+    assert idx.get("cram") == ["row_028"]
+
+
+def test_build_index_ambiguous_single_token():
+    persons = [
+        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
+        {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
+    ]
+    idx = persons_tree._build_index(persons)
+    assert set(idx["cram"]) == {"row_028", "row_019"}
+
+
+def test_build_index_dedupes_row_within_key():
+    persons = [{"rowId": "row_040", "firstName": None, "lastName": "Cram", "maidenName": None}]
+    idx = persons_tree._build_index(persons)
+    assert idx == {"cram": ["row_040"]}
+
+
+def test_resolve_one_found():
+    persons = [{"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
+    idx = persons_tree._build_index(persons)
+    row_id, reason = persons_tree._resolve_one("Allemeyer Werner", idx)
+    assert row_id == "row_003"
+    assert reason is None
+
+
+def test_resolve_one_not_found():
+    idx = {}
+    row_id, reason = persons_tree._resolve_one("Nobody Unknown", idx)
+    assert row_id is None
+    assert reason == "not_found"
+
+
+def test_resolve_one_ambiguous():
+    persons = [
+        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
+        {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
+    ]
+    idx = persons_tree._build_index(persons)
+    row_id, reason = persons_tree._resolve_one("Cram", idx)
+    assert row_id is None
+    assert reason == "ambiguous"
+
+
+def test_resolve_one_last_name_only_person():
+    persons = [{"rowId": "row_040", "firstName": None, "lastName": "Cram", "maidenName": None}]
+    idx = persons_tree._build_index(persons)
+    row_id, reason = persons_tree._resolve_one("Cram", idx)
+    assert row_id == "row_040"
+    assert reason is None