feat(normalizer): generate structured tags from Schlagwort + Inhalt fields
Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>
Three correspondence patterns are detected: " an " between two names ("Clara an Herbert"),
a leading "an ", and the abbreviated-sender form with no space before "an" ("Maria W.an Clara").
COLLECTIVE_TERMS in config.py is extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.), all confirmed against the full Excel workbook.
Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.
Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,13 +8,17 @@ import ingest
|
||||
import persons
|
||||
import documents
|
||||
import overrides as overrides_mod
|
||||
import tags as _tags
|
||||
import writers
|
||||
|
||||
|
||||
def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
out_dir, review_dir, date_overrides, name_overrides) -> dict:
|
||||
out_dir, review_dir, date_overrides, name_overrides,
|
||||
approved_themes_path=None) -> dict:
|
||||
out_dir, review_dir = Path(out_dir), Path(review_dir)
|
||||
|
||||
approved_themes = _tags.load_approved_themes(Path(approved_themes_path)) if approved_themes_path else set()
|
||||
|
||||
# --- persons ---
|
||||
person_rows = ingest.read_sheet(person_workbook, person_sheet)
|
||||
p_fields, _ = ingest.build_header_map(person_rows[0], config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS)
|
||||
@@ -52,7 +56,7 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
seen_index[raw.index] += 1
|
||||
if raw.date.strip() and raw.date.strip() in date_overrides:
|
||||
dates_by_override += 1
|
||||
doc = documents.to_canonical(raw, ctx, date_overrides)
|
||||
doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
|
||||
if "unparsed_date" in doc.needs_review:
|
||||
unparsed_by_raw.setdefault(raw.date, []).append(source_row)
|
||||
if "index_file_mismatch" in doc.needs_review:
|
||||
@@ -74,6 +78,9 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
writers.write_documents_xlsx(canon_docs, out_dir / "canonical-documents.xlsx")
|
||||
writers.write_persons_xlsx(all_people, out_dir / "canonical-persons.xlsx")
|
||||
|
||||
all_tag_paths = [path for doc in canon_docs for path in doc.tags]
|
||||
writers.write_tag_tree_xlsx(_tags.build_tag_tree(all_tag_paths), out_dir / "canonical-tag-tree.xlsx")
|
||||
|
||||
# --- review files ---
|
||||
# unparsed dates: most-frequent first, with example source rows + blank override cells so a
|
||||
# corrected row can be pasted straight into overrides/dates.csv (same raw,iso,precision shape).
|
||||
@@ -97,6 +104,11 @@ def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
|
||||
["category", "raw", "count", "example_rows"], unresolved_rows)
|
||||
writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
|
||||
|
||||
all_summaries = [doc.summary for doc in canon_docs if doc.summary]
|
||||
candidates = _tags.mine_summary_candidates(all_summaries)
|
||||
writers.write_review_csv(review_dir / "tag-candidates.csv", ["candidate", "count"],
|
||||
[[c, n] for c, n in candidates])
|
||||
|
||||
dated = sum(1 for d in canon_docs if d.date_raw.strip())
|
||||
unknown = sum(1 for d in canon_docs if d.date_raw.strip() and d.date_precision == "UNKNOWN")
|
||||
unknown_rate = f"{(100 * unknown / dated):.1f}%" if dated else "0.0%"
|
||||
@@ -148,7 +160,8 @@ def main():
|
||||
document_workbook=config.DOCUMENT_WORKBOOK, document_sheet=config.DOCUMENT_SHEET,
|
||||
person_workbook=config.PERSON_WORKBOOK, person_sheet=config.PERSON_SHEET,
|
||||
out_dir=config.OUT_DIR, review_dir=config.REVIEW_DIR,
|
||||
date_overrides=date_overrides, name_overrides=name_overrides)
|
||||
date_overrides=date_overrides, name_overrides=name_overrides,
|
||||
approved_themes_path=config.OVERRIDES_DIR / "approved-themes.csv")
|
||||
print("Normalization complete:")
|
||||
for k, v in stats.items():
|
||||
print(f" {k}: {v}")
|
||||
|
||||
Reference in New Issue
Block a user