diff --git a/tools/import-normalizer/normalize.py b/tools/import-normalizer/normalize.py index cabbe45c..e9840c34 100644 --- a/tools/import-normalizer/normalize.py +++ b/tools/import-normalizer/normalize.py @@ -21,7 +21,8 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet, person_dicts = [{f: (row[i] if i < len(row) else "") for f, i in p_fields.items()} for row in person_rows[1:]] register = persons.parse_register(person_dicts) alias_index = persons.AliasIndex(register) - ctx = persons.ResolutionContext(alias_index, name_overrides) + given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES) + ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names) # --- documents --- doc_rows = ingest.read_sheet(document_workbook, document_sheet) @@ -93,7 +94,15 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet, writers.write_review_csv(review_dir / "duplicate-index.csv", ["source_row", "index"], duplicates) writers.write_review_csv(review_dir / "blank-index-rows.csv", ["source_row", "kind", "content"], blank_index) writers.write_review_csv(review_dir / "skipped-x-suffix.csv", ["source_row", "index", "base_index"], skipped_x) - writers.write_review_csv(review_dir / "ambiguous-receivers.csv", ["raw", "part", "source_row"], ctx.ambiguous) + unresolved_agg: dict[tuple, list] = {} + for name, category, row in ctx.unresolved: + unresolved_agg.setdefault((category, name), []).append(row) + unresolved_rows = sorted( + ([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))] + for (cat, name), rows in unresolved_agg.items()), + key=lambda r: (r[0], -r[2], r[1])) + writers.write_review_csv(review_dir / "unresolved-names.csv", + ["category", "raw", "count", "example_rows"], unresolved_rows) writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches) dated = sum(1 for d in canon_docs if d.date_raw.strip()) @@ -115,7 +124,13 @@ def 
run(*, document_workbook, document_sheet, person_workbook, person_sheet, "distinct_unparsed_formats": len(unparsed_by_raw), "# NAMES": "", "unmatched_name_strings": len(ctx.unmatched), - "ambiguous_receivers": len(ctx.ambiguous), + "unresolved_name_occurrences": len(ctx.unresolved), + "unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"), + "unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"), + "unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"), + "unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"), + "unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"), + "unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"), "# ANOMALIES": "", "empty_rows": empty_count, "blank_index_rows": len(blank_index), diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py index 41f66458..fa257510 100644 --- a/tools/import-normalizer/persons.py +++ b/tools/import-normalizer/persons.py @@ -264,12 +264,14 @@ class AliasIndex: class ResolutionContext: """Resolves raw name strings to person ids; accumulates provisional persons and review data.""" - def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str]): + def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str], + given_names: set[str] | None = None): self.index = alias_index self.name_overrides = name_overrides + self.given_names = given_names or set() self.provisional: dict[str, Person] = {} self.unmatched: dict[str, list] = {} - self.ambiguous: list[tuple] = [] + self.unresolved: list[tuple] = [] # (raw_name, category, source_row) for non-RESOLVABLE names self._raw_to_pid: dict[str, str] = {} self.override_hits = 0 @@ -296,6 +298,9 @@ class ResolutionContext: return pid, self.index.display(pid) or name, True # provisional person (unmatched) — never reuse a register id self.unmatched.setdefault(name, 
[]).append(source_row) + category = classify_name(name, self.given_names) + if category is not NameClass.RESOLVABLE: + self.unresolved.append((name, str(category), source_row)) if name in self._raw_to_pid: return self._raw_to_pid[name], name, False last, first = _last_first(name) @@ -315,13 +320,7 @@ class ResolutionContext: return pid, name, matched, len(parts) > 1 def resolve_receivers(self, raw: str, source_row: int): - results = [] - for part in split_receivers(raw): - pid, name, matched = self.resolve_one(part, source_row) - if not matched and " " in part and find_known_last_name(part) is None and len(part.split()) == 2: - self.ambiguous.append((raw, part, source_row)) - results.append((pid, name, matched)) - return results + return [self.resolve_one(part, source_row) for part in split_receivers(raw)] def _last_first(name: str): diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py index fd866554..139eb427 100644 --- a/tools/import-normalizer/tests/test_documents.py +++ b/tools/import-normalizer/tests/test_documents.py @@ -93,10 +93,17 @@ def test_resolve_one_override_increments_hits(): assert name == "Eugenie de Gruyter" # display comes from the alias index assert ctx.override_hits == 1 -def test_ambiguous_space_pair_flagged_not_split(): - # US-PERS-02 AC4: "Ella Anita" is kept as one provisional + flagged, never guessed into two. 
- ctx = _ctx() +def test_ambiguous_pair_recorded_in_unresolved(): + people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}]) + ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}, + given_names={"ella", "anita"}) raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita") doc = documents.to_canonical(raw, ctx, date_overrides={}) - assert len(doc.receiver_person_ids) == 1 # not split - assert any(part == "Ella Anita" for _, part, _ in ctx.ambiguous) + assert len(doc.receiver_person_ids) == 1 # not split — one provisional + assert any(name == "Ella Anita" and cat == "ambiguous_pair" for name, cat, _ in ctx.unresolved) + +def test_resolvable_first_surname_pair_not_unresolved(): + ctx = persons.ResolutionContext(persons.AliasIndex([]), name_overrides={}, + given_names={"ella", "anita"}) + ctx.resolve_one("Mieze Schefold", source_row=1) # surname is not a given name + assert ctx.unresolved == [] # RESOLVABLE -> not recorded diff --git a/tools/import-normalizer/tests/test_normalize.py b/tools/import-normalizer/tests/test_normalize.py index 2fd26f29..d32cee90 100644 --- a/tools/import-normalizer/tests/test_normalize.py +++ b/tools/import-normalizer/tests/test_normalize.py @@ -10,7 +10,7 @@ def _doc_wb(tmp_path): "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]) ws.append(["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter", "Eugenie Müller", "", "", "", ""]) ws.append(["", "", "", "", "Section banner row", "", "", "", "", ""]) - ws.append(["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""]) + ws.append(["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""]) ws.append(["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter", "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"]) p = tmp_path / "docs.xlsx"; wb.save(p); return p @@ -42,6 +42,11 @@ def test_run_end_to_end(tmp_path): assert 
(review_dir / "unparsed-dates.csv").exists() # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01) assert "Freitag 1919" in (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") + assert (out_dir / "canonical-documents.xlsx").exists() # canonical workbook is still written + assert (review_dir / "unresolved-names.csv").exists() + unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8") + assert "unknown" in unresolved_text and "?" in unresolved_text # the "?" receiver of C-0001 is expected under category "unknown" + assert not (review_dir / "ambiguous-receivers.csv").exists() # replaced by unresolved-names.csv # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files def _matrix(p):