feat(normalizer): unresolved-names report + fix ambiguous-pair over-flagging
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -21,7 +21,8 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
person_dicts = [{f: (row[i] if i < len(row) else "") for f, i in p_fields.items()} for row in person_rows[1:]]
|
||||
register = persons.parse_register(person_dicts)
|
||||
alias_index = persons.AliasIndex(register)
|
||||
ctx = persons.ResolutionContext(alias_index, name_overrides)
|
||||
given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES)
|
||||
ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names)
|
||||
|
||||
# --- documents ---
|
||||
doc_rows = ingest.read_sheet(document_workbook, document_sheet)
|
||||
@@ -93,7 +94,15 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
writers.write_review_csv(review_dir / "duplicate-index.csv", ["source_row", "index"], duplicates)
|
||||
writers.write_review_csv(review_dir / "blank-index-rows.csv", ["source_row", "kind", "content"], blank_index)
|
||||
writers.write_review_csv(review_dir / "skipped-x-suffix.csv", ["source_row", "index", "base_index"], skipped_x)
|
||||
writers.write_review_csv(review_dir / "ambiguous-receivers.csv", ["raw", "part", "source_row"], ctx.ambiguous)
|
||||
unresolved_agg: dict[tuple, list] = {}
|
||||
for name, category, row in ctx.unresolved:
|
||||
unresolved_agg.setdefault((category, name), []).append(row)
|
||||
unresolved_rows = sorted(
|
||||
([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))]
|
||||
for (cat, name), rows in unresolved_agg.items()),
|
||||
key=lambda r: (r[0], -r[2], r[1]))
|
||||
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
||||
["category", "raw", "count", "example_rows"], unresolved_rows)
|
||||
writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
|
||||
|
||||
dated = sum(1 for d in canon_docs if d.date_raw.strip())
|
||||
@@ -115,7 +124,13 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
"distinct_unparsed_formats": len(unparsed_by_raw),
|
||||
"# NAMES": "",
|
||||
"unmatched_name_strings": len(ctx.unmatched),
|
||||
"ambiguous_receivers": len(ctx.ambiguous),
|
||||
"unresolved_name_occurrences": len(ctx.unresolved),
|
||||
"unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"),
|
||||
"unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"),
|
||||
"unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"),
|
||||
"unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"),
|
||||
"unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"),
|
||||
"unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"),
|
||||
"# ANOMALIES": "",
|
||||
"empty_rows": empty_count,
|
||||
"blank_index_rows": len(blank_index),
|
||||
|
||||
Reference in New Issue
Block a user