feat(normalizer): generate structured tags from Schlagwort + Inhalt fields
Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>
Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").
COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.
Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.
Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -62,3 +62,60 @@ def test_run_end_to_end(tmp_path):
|
||||
assert _matrix(out_dir / "canonical-persons.xlsx") == persons1
|
||||
assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1
|
||||
assert len(docs1) == 4 # header + 3 docs
|
||||
|
||||
|
||||
def test_tag_tree_output_emitted(tmp_path):
|
||||
out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
|
||||
normalize.run(
|
||||
document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
|
||||
person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
|
||||
out_dir=out_dir, review_dir=review_dir,
|
||||
date_overrides={}, name_overrides={})
|
||||
assert (out_dir / "canonical-tag-tree.xlsx").exists()
|
||||
|
||||
|
||||
def test_tag_candidates_review_emitted(tmp_path):
|
||||
out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
|
||||
normalize.run(
|
||||
document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
|
||||
person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
|
||||
out_dir=out_dir, review_dir=review_dir,
|
||||
date_overrides={}, name_overrides={})
|
||||
assert (review_dir / "tag-candidates.csv").exists()
|
||||
text = (review_dir / "tag-candidates.csv").read_text(encoding="utf-8")
|
||||
assert "candidate" in text and "count" in text
|
||||
|
||||
|
||||
def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
|
||||
out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
|
||||
normalize.run(
|
||||
document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
|
||||
person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
|
||||
out_dir=out_dir, review_dir=review_dir,
|
||||
date_overrides={}, name_overrides={})
|
||||
wb = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
|
||||
ws = wb.active
|
||||
header = [c.value for c in ws[1]]
|
||||
tag_col = header.index("tags")
|
||||
tag_values = [ws.cell(row=r, column=tag_col + 1).value for r in range(2, ws.max_row + 1)]
|
||||
assert any(v and "Themen/Brautbriefe" in v for v in tag_values)
|
||||
assert not any(v and v.strip() == "Brautbriefe" for v in tag_values)
|
||||
|
||||
|
||||
def test_approved_themes_applied(tmp_path):
|
||||
themes_file = tmp_path / "approved-themes.csv"
|
||||
themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
|
||||
out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
|
||||
normalize.run(
|
||||
document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
|
||||
person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
|
||||
out_dir=out_dir, review_dir=review_dir,
|
||||
date_overrides={}, name_overrides={},
|
||||
approved_themes_path=themes_file)
|
||||
wb = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
|
||||
ws = wb.active
|
||||
header = [c.value for c in ws[1]]
|
||||
tag_col = header.index("tags")
|
||||
tag_values = [ws.cell(row=r, column=tag_col + 1).value for r in range(2, ws.max_row + 1)]
|
||||
# W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
|
||||
assert any(v and "Themen/geschäftsreise" in v for v in tag_values)
|
||||
|
||||
Reference in New Issue
Block a user