feat(normalizer): generate structured tags from Schlagwort + Inhalt fields

Adds tags.py module implementing a three-outcome heuristic:

- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>

Three correspondence patterns are detected: space-an-space, starts-with-"an ", and abbreviated-sender form ("Maria W.an Clara").

COLLECTIVE_TERMS in config.py is extended with 17 plural/group relational terms (söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.

Also adds two-phase summary mining: every run emits review/tag-candidates.csv; subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.

Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths; canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 19:47:36 +02:00
parent 5efe3b8a7c
commit 94a40237f4
9 changed files with 405 additions and 6 deletions
--- a/tools/import-normalizer/tests/test_documents.py
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -51,7 +51,7 @@ def test_to_canonical_resolves_and_flags():
    assert doc.sender_person_id == "de-gruyter-walter"
    assert doc.receiver_person_ids == ["de-gruyter-eugenie"]   # matched via maiden alias
    assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
-    assert doc.tags == ["Brautbriefe"]
+    assert doc.tags == ["Themen/Brautbriefe"]
    assert doc.needs_review == []

 def test_to_canonical_unmatched_and_unparsed():
--- a/tools/import-normalizer/tests/test_normalize.py
+++ b/tools/import-normalizer/tests/test_normalize.py
@@ -62,3 +62,60 @@ def test_run_end_to_end(tmp_path):
    assert _matrix(out_dir / "canonical-persons.xlsx") == persons1
    assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1
    assert len(docs1) == 4  # header + 3 docs
+
+
+def test_tag_tree_output_emitted(tmp_path):
+    out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
+    normalize.run(
+        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
+        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
+        out_dir=out_dir, review_dir=review_dir,
+        date_overrides={}, name_overrides={})
+    assert (out_dir / "canonical-tag-tree.xlsx").exists()
+
+
+def test_tag_candidates_review_emitted(tmp_path):
+    out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
+    normalize.run(
+        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
+        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
+        out_dir=out_dir, review_dir=review_dir,
+        date_overrides={}, name_overrides={})
+    assert (review_dir / "tag-candidates.csv").exists()
+    text = (review_dir / "tag-candidates.csv").read_text(encoding="utf-8")
+    assert "candidate" in text and "count" in text
+
+
+def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
+    out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
+    normalize.run(
+        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
+        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
+        out_dir=out_dir, review_dir=review_dir,
+        date_overrides={}, name_overrides={})
+    wb = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
+    ws = wb.active
+    header = [c.value for c in ws[1]]
+    tag_col = header.index("tags")
+    tag_values = [ws.cell(row=r, column=tag_col + 1).value for r in range(2, ws.max_row + 1)]
+    assert any(v and "Themen/Brautbriefe" in v for v in tag_values)
+    assert not any(v and v.strip() == "Brautbriefe" for v in tag_values)
+
+
+def test_approved_themes_applied(tmp_path):
+    themes_file = tmp_path / "approved-themes.csv"
+    themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
+    out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
+    normalize.run(
+        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
+        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
+        out_dir=out_dir, review_dir=review_dir,
+        date_overrides={}, name_overrides={},
+        approved_themes_path=themes_file)
+    wb = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
+    ws = wb.active
+    header = [c.value for c in ws[1]]
+    tag_col = header.index("tags")
+    tag_values = [ws.cell(row=r, column=tag_col + 1).value for r in range(2, ws.max_row + 1)]
+    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
+    assert any(v and "Themen/geschäftsreise" in v for v in tag_values)
--- a/tools/import-normalizer/tests/test_tags.py
+++ b/tools/import-normalizer/tests/test_tags.py
@@ -0,0 +1,191 @@
+import tags
+
+
+# --- classify_schlagwort ---
+
+def test_semantic_tag_kept_as_themen():
+    assert tags.classify_schlagwort("Brautbriefe") == ["Themen/Brautbriefe"]
+
+def test_everyday_tag_kept_as_themen():
+    assert tags.classify_schlagwort("Alltag in Ruhrort") == ["Themen/Alltag in Ruhrort"]
+
+def test_event_tag_kept_as_themen():
+    assert tags.classify_schlagwort("zur Hochzeit") == ["Themen/zur Hochzeit"]
+
+def test_individual_correspondence_dropped():
+    assert tags.classify_schlagwort("Clara an Herbert") == []
+
+def test_individual_correspondence_with_year_dropped():
+    assert tags.classify_schlagwort("Herbert an Clara 1918") == []
+
+def test_individual_with_role_dropped():
+    assert tags.classify_schlagwort("Vater Juan an Herbert") == []
+
+def test_relational_receiver_dropped():
+    assert tags.classify_schlagwort("Clara an ihre Mutter") == []
+
+def test_group_receiver_kinder_kept_as_briefwechsel():
+    assert tags.classify_schlagwort("Clara an Kinder") == ["Briefwechsel/Clara an Kinder"]
+
+def test_group_receiver_eltern_kept():
+    assert tags.classify_schlagwort("Herbert an seine Eltern") == ["Briefwechsel/Herbert an seine Eltern"]
+
+def test_group_receiver_geschwister_kept():
+    assert tags.classify_schlagwort("Walter an Geschwister") == ["Briefwechsel/Walter an Geschwister"]
+
+def test_group_receiver_schwiegereltern_kept():
+    assert tags.classify_schlagwort("Clara an Schwiegereltern") == ["Briefwechsel/Clara an Schwiegereltern"]
+
+def test_group_receiver_soehne_kept():
+    assert tags.classify_schlagwort("Mutter Cram an ihre Söhne") == ["Briefwechsel/Mutter Cram an ihre Söhne"]
+
+def test_group_receiver_brueder_kept():
+    assert tags.classify_schlagwort("Hans an Brüder") == ["Briefwechsel/Hans an Brüder"]
+
+def test_group_receiver_cousinen_kept():
+    assert tags.classify_schlagwort("Clara an Cousinen in Göttingen") == ["Briefwechsel/Clara an Cousinen in Göttingen"]
+
+def test_group_receiver_freunde_kept():
+    assert tags.classify_schlagwort("Freunde an Herbert") == ["Briefwechsel/Freunde an Herbert"]
+
+def test_group_sender_geschwister_kept():
+    # collective on the LEFT side of "an"
+    assert tags.classify_schlagwort("Geschwister Cram an Herbert") == ["Briefwechsel/Geschwister Cram an Herbert"]
+
+def test_receiver_only_individual_dropped():
+    # starts with "an " — single individual receiver
+    assert tags.classify_schlagwort("an Walter de Gruyter") == []
+
+def test_receiver_only_group_kept():
+    # starts with "an " — collective receiver
+    assert tags.classify_schlagwort("an ihre Geschwister") == ["Briefwechsel/an ihre Geschwister"]
+
+def test_abbreviated_sender_individual_dropped():
+    # "Maria W.an Clara" — abbreviated name + ".an"
+    assert tags.classify_schlagwort("Maria W.an Clara") == []
+
+def test_abbreviated_sender_group_kept():
+    assert tags.classify_schlagwort("Eugenie sen.an Kinder") == ["Briefwechsel/Eugenie sen.an Kinder"]
+
+def test_empty_schlagwort_returns_empty():
+    assert tags.classify_schlagwort("") == []
+
+def test_einzelkinder_kept():  # NOTE(review): name says "einzelkinder", but the input below is "Enkelkinder"
+    assert tags.classify_schlagwort("Enkelkinder an Clara") == ["Briefwechsel/Enkelkinder an Clara"]
+
+def test_geschw_abbreviation_kept():
+    # "Geschw." abbreviation for Geschwister — appears after "u" in receiver side
+    assert tags.classify_schlagwort("Bruder Hans an Herbert u Geschw.") == ["Briefwechsel/Bruder Hans an Herbert u Geschw."]
+
+
+# --- mine_summary_candidates ---
+
+def test_mine_candidates_counts_words():
+    summaries = ["Reise, Hochzeit", "Reise", "Krieg"]
+    candidates = dict(tags.mine_summary_candidates(summaries))
+    assert candidates["reise"] == 2
+    assert candidates["hochzeit"] == 1
+    assert candidates["krieg"] == 1
+
+def test_mine_candidates_filters_stop_words():
+    summaries = ["und die Reise", "das ist eine Reise"]
+    candidates = dict(tags.mine_summary_candidates(summaries))
+    assert "reise" in candidates
+    assert "und" not in candidates
+    assert "die" not in candidates
+    assert "das" not in candidates
+    assert "ist" not in candidates
+    assert "eine" not in candidates
+
+def test_mine_candidates_filters_contracted_prepositions():
+    # im=in+dem, zum=zu+dem, zur=zu+der, vom=von+dem, sich, am, beim (NOTE: "ans" is asserted below but never occurs in the sample, so that check is vacuous)
+    summaries = ["im Sommer zum Besuch, zur Hochzeit vom Vater, sich gefreut am Morgen beim Fest"]
+    candidates = dict(tags.mine_summary_candidates(summaries))
+    for stop in ("im", "zum", "zur", "vom", "sich", "am", "beim", "ans"):
+        assert stop not in candidates, f"stop word '{stop}' leaked through"
+    assert "besuch" in candidates
+    assert "hochzeit" in candidates
+
+def test_mine_candidates_filters_single_chars():
+    summaries = ["x Reise y"]
+    candidates = dict(tags.mine_summary_candidates(summaries))
+    assert "x" not in candidates
+    assert "y" not in candidates
+
+def test_mine_candidates_sorted_descending():
+    summaries = ["Reise", "Reise", "Hochzeit", "Reise", "Hochzeit", "Krieg"]
+    result = tags.mine_summary_candidates(summaries)
+    counts = [count for _, count in result]
+    assert counts == sorted(counts, reverse=True)
+
+def test_mine_candidates_empty_summaries():
+    assert tags.mine_summary_candidates([]) == []
+    assert tags.mine_summary_candidates([""]) == []
+
+
+# --- apply_approved_themes (load_approved_themes has no direct test in this file) ---
+
+def test_apply_themes_match_found(tmp_path):  # NOTE(review): tmp_path is requested but never used here
+    themes = {"reise", "hochzeit"}
+    result = tags.apply_approved_themes("Reise nach Berlin", themes)
+    assert "Themen/reise" in result
+
+def test_apply_themes_case_insensitive(tmp_path):  # NOTE(review): tmp_path is requested but never used here
+    themes = {"reise"}
+    result = tags.apply_approved_themes("REISE", themes)
+    assert "Themen/reise" in result
+
+def test_apply_themes_no_match(tmp_path):  # NOTE(review): tmp_path is requested but never used here
+    themes = {"krieg"}
+    result = tags.apply_approved_themes("Alltag in Ruhrort", themes)
+    assert result == []
+
+def test_apply_themes_multiple_matches():
+    themes = {"reise", "hochzeit"}
+    result = tags.apply_approved_themes("Reise zur Hochzeit", themes)
+    assert len(result) == 2
+    assert "Themen/reise" in result
+    assert "Themen/hochzeit" in result
+
+
+# --- encode_tags ---
+
+def test_encode_tags_single():
+    assert tags.encode_tags(["Themen/Brautbriefe"]) == "Themen/Brautbriefe"
+
+def test_encode_tags_multiple():
+    result = tags.encode_tags(["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"])
+    assert result == "Themen/Brautbriefe|Briefwechsel/Clara an Kinder"
+
+def test_encode_tags_empty():
+    assert tags.encode_tags([]) == ""
+
+
+# --- build_tag_tree ---
+
+def test_build_tag_tree_includes_roots():
+    paths = ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
+    tree = tags.build_tag_tree(paths)
+    tag_paths = [row["tag_path"] for row in tree]
+    assert "Themen" in tag_paths
+    assert "Briefwechsel" in tag_paths
+
+def test_build_tag_tree_includes_children():
+    paths = ["Themen/Brautbriefe"]
+    tree = tags.build_tag_tree(paths)
+    child = next(r for r in tree if r["tag_path"] == "Themen/Brautbriefe")
+    assert child["parent_name"] == "Themen"
+    assert child["tag_name"] == "Brautbriefe"
+
+def test_build_tag_tree_root_has_empty_parent():
+    paths = ["Themen/Brautbriefe"]
+    tree = tags.build_tag_tree(paths)
+    root = next(r for r in tree if r["tag_path"] == "Themen")
+    assert root["parent_name"] == ""
+    assert root["tag_name"] == "Themen"
+
+def test_build_tag_tree_no_duplicates():
+    paths = ["Themen/Brautbriefe", "Themen/Alltag", "Themen/Brautbriefe"]
+    tree = tags.build_tag_tree(paths)
+    tag_paths = [row["tag_path"] for row in tree]
+    assert len(tag_paths) == len(set(tag_paths))