feat(normalizer): generate structured tags from Schlagwort + Inhalt fields

Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>

Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").

COLLECTIVE_TERMS in config.py is extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.), confirmed against the full Excel workbook.

Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.

Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-25 19:47:36 +02:00
parent 5efe3b8a7c
commit 94a40237f4
9 changed files with 405 additions and 6 deletions

View File

@@ -51,7 +51,7 @@ def test_to_canonical_resolves_and_flags():
assert doc.sender_person_id == "de-gruyter-walter"
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
assert doc.tags == ["Brautbriefe"]
assert doc.tags == ["Themen/Brautbriefe"]
assert doc.needs_review == []
def test_to_canonical_unmatched_and_unparsed():

View File

@@ -62,3 +62,60 @@ def test_run_end_to_end(tmp_path):
assert _matrix(out_dir / "canonical-persons.xlsx") == persons1
assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1
assert len(docs1) == 4 # header + 3 docs
def test_tag_tree_output_emitted(tmp_path):
    """A normalizer run writes ``canonical-tag-tree.xlsx`` into ``out_dir``."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )
    tag_tree_file = out_dir / "canonical-tag-tree.xlsx"
    assert tag_tree_file.exists()
def test_tag_candidates_review_emitted(tmp_path):
    """A normalizer run writes ``tag-candidates.csv`` into ``review_dir``.

    The file's text must mention both "candidate" and "count".
    """
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )
    candidates_file = review_dir / "tag-candidates.csv"
    assert candidates_file.exists()
    text = candidates_file.read_text(encoding="utf-8")
    assert "candidate" in text
    assert "count" in text
def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
    """Schlagwort values reach canonical-documents.xlsx as ``Themen/...`` paths.

    Runs the normalizer on the fixture workbooks, reads the ``tags`` column of
    the documents output and checks that "Brautbriefe" shows up with its
    ``Themen/`` parent and never as a bare, un-prefixed tag.
    """
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )
    wb = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
    ws = wb.active
    header = [c.value for c in ws[1]]
    tag_col = header.index("tags")
    # openpyxl columns are 1-based, list indexes are 0-based.
    tag_values = [
        ws.cell(row=r, column=tag_col + 1).value
        for r in range(2, ws.max_row + 1)
    ]
    assert any(v and "Themen/Brautbriefe" in v for v in tag_values)
    # A cell may hold several pipe-separated paths, so the bare-tag check has
    # to look at each path; comparing the whole cell would miss a bare
    # "Brautbriefe" that sits next to another tag.
    tag_paths = [
        path.strip()
        for v in tag_values
        if v
        for path in v.split("|")
    ]
    assert "Brautbriefe" not in tag_paths
def test_approved_themes_applied(tmp_path):
    """Keywords from an approved-themes CSV become extra ``Themen/`` tags.

    Writes a one-entry themes file, passes it to the normalizer via
    ``approved_themes_path`` and looks for the resulting tag in the ``tags``
    column of canonical-documents.xlsx.
    """
    themes_file = tmp_path / "approved-themes.csv"
    themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
        approved_themes_path=themes_file,
    )
    workbook = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
    sheet = workbook.active
    column_names = [cell.value for cell in sheet[1]]
    # openpyxl columns are 1-based, list indexes are 0-based.
    tag_column = column_names.index("tags") + 1
    tag_values = [
        sheet.cell(row=row_no, column=tag_column).value
        for row_no in range(2, sheet.max_row + 1)
    ]
    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
    assert any("Themen/geschäftsreise" in value for value in tag_values if value)

View File

@@ -0,0 +1,191 @@
import tags
# --- classify_schlagwort ---
def test_semantic_tag_kept_as_themen():
    """A plain keyword ("Brautbriefe") is returned under the ``Themen`` root."""
    result = tags.classify_schlagwort("Brautbriefe")
    assert result == ["Themen/Brautbriefe"]
def test_everyday_tag_kept_as_themen():
    """A multi-word keyword is kept verbatim under the ``Themen`` root."""
    schlagwort = "Alltag in Ruhrort"
    assert tags.classify_schlagwort(schlagwort) == ["Themen/Alltag in Ruhrort"]
def test_event_tag_kept_as_themen():
    """An event phrase ("zur Hochzeit") is returned under the ``Themen`` root."""
    expected = ["Themen/zur Hochzeit"]
    assert tags.classify_schlagwort("zur Hochzeit") == expected
def test_individual_correspondence_dropped():
    """A person-to-person "X an Y" Schlagwort produces no tags."""
    result = tags.classify_schlagwort("Clara an Herbert")
    assert result == []
def test_individual_correspondence_with_year_dropped():
    """A trailing year does not rescue a person-to-person Schlagwort."""
    result = tags.classify_schlagwort("Herbert an Clara 1918")
    assert result == []
def test_individual_with_role_dropped():
    """A role word before the sender name ("Vater Juan") still yields no tags."""
    result = tags.classify_schlagwort("Vater Juan an Herbert")
    assert result == []
def test_relational_receiver_dropped():
    """A singular relational receiver ("ihre Mutter") yields no tags."""
    result = tags.classify_schlagwort("Clara an ihre Mutter")
    assert result == []
def test_group_receiver_kinder_kept_as_briefwechsel():
    """A "Kinder" receiver is kept verbatim under the ``Briefwechsel`` root."""
    expected = ["Briefwechsel/Clara an Kinder"]
    assert tags.classify_schlagwort("Clara an Kinder") == expected
def test_group_receiver_eltern_kept():
    """An "Eltern" receiver preceded by a possessive is kept as Briefwechsel."""
    schlagwort = "Herbert an seine Eltern"
    result = tags.classify_schlagwort(schlagwort)
    assert result == ["Briefwechsel/Herbert an seine Eltern"]
def test_group_receiver_geschwister_kept():
    """A "Geschwister" receiver is kept under the ``Briefwechsel`` root."""
    result = tags.classify_schlagwort("Walter an Geschwister")
    assert result == ["Briefwechsel/Walter an Geschwister"]
def test_group_receiver_schwiegereltern_kept():
    """A "Schwiegereltern" receiver is kept under the ``Briefwechsel`` root."""
    expected = ["Briefwechsel/Clara an Schwiegereltern"]
    assert tags.classify_schlagwort("Clara an Schwiegereltern") == expected
def test_group_receiver_soehne_kept():
    """A "Söhne" receiver (with umlaut) is kept under ``Briefwechsel``."""
    schlagwort = "Mutter Cram an ihre Söhne"
    result = tags.classify_schlagwort(schlagwort)
    assert result == ["Briefwechsel/Mutter Cram an ihre Söhne"]
def test_group_receiver_brueder_kept():
    """A "Brüder" receiver (with umlaut) is kept under ``Briefwechsel``."""
    result = tags.classify_schlagwort("Hans an Brüder")
    assert result == ["Briefwechsel/Hans an Brüder"]
def test_group_receiver_cousinen_kept():
    """A "Cousinen" receiver followed by a place is kept in full."""
    schlagwort = "Clara an Cousinen in Göttingen"
    result = tags.classify_schlagwort(schlagwort)
    assert result == ["Briefwechsel/Clara an Cousinen in Göttingen"]
def test_group_receiver_freunde_kept():
    """"Freunde an Herbert" is kept under ``Briefwechsel``.

    In this input the collective term "Freunde" stands before "an", i.e. on
    the sender side, despite the "receiver" in the test name.
    """
    expected = ["Briefwechsel/Freunde an Herbert"]
    assert tags.classify_schlagwort("Freunde an Herbert") == expected
def test_group_sender_geschwister_kept():
    """A collective term on the sender side is enough to keep the tag."""
    # collective on the LEFT side of "an"
    schlagwort = "Geschwister Cram an Herbert"
    result = tags.classify_schlagwort(schlagwort)
    assert result == ["Briefwechsel/Geschwister Cram an Herbert"]
def test_receiver_only_individual_dropped():
    """A sender-less "an <person>" Schlagwort yields no tags."""
    # starts with "an " — single individual receiver
    result = tags.classify_schlagwort("an Walter de Gruyter")
    assert result == []
def test_receiver_only_group_kept():
    """A sender-less "an <group>" Schlagwort is kept under ``Briefwechsel``."""
    # starts with "an " — collective receiver
    expected = ["Briefwechsel/an ihre Geschwister"]
    assert tags.classify_schlagwort("an ihre Geschwister") == expected
def test_abbreviated_sender_individual_dropped():
    """The ".an" form between two individuals yields no tags."""
    # "Maria W.an Clara" — abbreviated name + ".an"
    result = tags.classify_schlagwort("Maria W.an Clara")
    assert result == []
def test_abbreviated_sender_group_kept():
    """The ".an" form with a collective receiver is kept as Briefwechsel."""
    schlagwort = "Eugenie sen.an Kinder"
    result = tags.classify_schlagwort(schlagwort)
    assert result == ["Briefwechsel/Eugenie sen.an Kinder"]
def test_empty_schlagwort_returns_empty():
    """An empty Schlagwort string produces an empty tag list."""
    result = tags.classify_schlagwort("")
    assert result == []
def test_einzelkinder_kept():
    """"Enkelkinder an Clara" is kept under the ``Briefwechsel`` root."""
    # NOTE(review): the test name says "einzelkinder" but the input exercised
    # here is "Enkelkinder" — presumably a typo in the name; confirm and rename.
    expected = ["Briefwechsel/Enkelkinder an Clara"]
    assert tags.classify_schlagwort("Enkelkinder an Clara") == expected
def test_geschw_abbreviation_kept():
    """The abbreviation "Geschw." on the receiver side keeps the tag."""
    # "Geschw." abbreviation for Geschwister — appears after "u" in receiver side
    schlagwort = "Bruder Hans an Herbert u Geschw."
    result = tags.classify_schlagwort(schlagwort)
    assert result == ["Briefwechsel/Bruder Hans an Herbert u Geschw."]
# --- mine_summary_candidates ---
def test_mine_candidates_counts_words():
    """Words are tallied across all summaries under their lower-cased form."""
    summaries = ["Reise, Hochzeit", "Reise", "Krieg"]
    counts_by_word = dict(tags.mine_summary_candidates(summaries))
    expected_counts = (("reise", 2), ("hochzeit", 1), ("krieg", 1))
    for word, expected in expected_counts:
        assert counts_by_word[word] == expected
def test_mine_candidates_filters_stop_words():
    """Common German function words never appear among the candidates."""
    summaries = ["und die Reise", "das ist eine Reise"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    assert "reise" in candidates
    for stop_word in ("und", "die", "das", "ist", "eine"):
        assert stop_word not in candidates
def test_mine_candidates_filters_contracted_prepositions():
    """Contracted prepositions and "sich" are filtered; content nouns remain."""
    # im=in+dem, zum=zu+dem, zur=zu+der, vom=von+dem, sich, am, beim
    summaries = ["im Sommer zum Besuch, zur Hochzeit vom Vater, sich gefreut am Morgen beim Fest"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    # NOTE(review): "ans" is checked below but does not occur in the summary
    # above, so that one check cannot fail — confirm whether the input should
    # contain it.
    stop_words = ("im", "zum", "zur", "vom", "sich", "am", "beim", "ans")
    for stop in stop_words:
        assert stop not in candidates, f"stop word '{stop}' leaked through"
    for kept in ("besuch", "hochzeit"):
        assert kept in candidates
def test_mine_candidates_filters_single_chars():
    """One-character tokens are not reported as candidates."""
    candidates = dict(tags.mine_summary_candidates(["x Reise y"]))
    for single_char in ("x", "y"):
        assert single_char not in candidates
def test_mine_candidates_sorted_descending():
    """Candidates come back ordered from highest to lowest count."""
    summaries = ["Reise", "Reise", "Hochzeit", "Reise", "Hochzeit", "Krieg"]
    ranked = tags.mine_summary_candidates(summaries)
    counts = [pair[1] for pair in ranked]
    descending = sorted(counts, reverse=True)
    assert counts == descending
def test_mine_candidates_empty_summaries():
    """No summaries, or only an empty summary, give an empty candidate list."""
    for empty_input in ([], [""]):
        assert tags.mine_summary_candidates(empty_input) == []
# --- load_approved_themes and apply_approved_themes ---
def test_apply_themes_match_found():
    """An approved keyword found in the text yields its ``Themen/`` tag.

    The test touches no files, so it does not request the ``tmp_path``
    fixture.
    """
    themes = {"reise", "hochzeit"}
    result = tags.apply_approved_themes("Reise nach Berlin", themes)
    assert "Themen/reise" in result
def test_apply_themes_case_insensitive():
    """An upper-case text still matches a lower-case approved keyword.

    The test touches no files, so it does not request the ``tmp_path``
    fixture.
    """
    themes = {"reise"}
    result = tags.apply_approved_themes("REISE", themes)
    assert "Themen/reise" in result
def test_apply_themes_no_match():
    """A text containing no approved keyword yields an empty list.

    The test touches no files, so it does not request the ``tmp_path``
    fixture.
    """
    themes = {"krieg"}
    result = tags.apply_approved_themes("Alltag in Ruhrort", themes)
    assert result == []
def test_apply_themes_multiple_matches():
    """Every approved keyword present in the text yields exactly one tag."""
    approved = {"reise", "hochzeit"}
    found = tags.apply_approved_themes("Reise zur Hochzeit", approved)
    assert len(found) == 2
    for expected_tag in ("Themen/reise", "Themen/hochzeit"):
        assert expected_tag in found
# --- encode_tags ---
def test_encode_tags_single():
    """A single tag path is encoded as itself, without a separator."""
    encoded = tags.encode_tags(["Themen/Brautbriefe"])
    assert encoded == "Themen/Brautbriefe"
def test_encode_tags_multiple():
    """Several tag paths are joined with "|" in their given order."""
    tag_paths = ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
    expected = "Themen/Brautbriefe|Briefwechsel/Clara an Kinder"
    assert tags.encode_tags(tag_paths) == expected
def test_encode_tags_empty():
    """An empty tag list is encoded as the empty string."""
    encoded = tags.encode_tags([])
    assert encoded == ""
# --- build_tag_tree ---
def test_build_tag_tree_includes_roots():
    """Each root segment of the input paths gets a row of its own."""
    tree = tags.build_tag_tree(
        ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
    )
    tag_paths = [entry["tag_path"] for entry in tree]
    for root in ("Themen", "Briefwechsel"):
        assert root in tag_paths
def test_build_tag_tree_includes_children():
    """A child row records its parent's name and its own leaf name."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe"])
    child_row = next(
        entry for entry in tree if entry["tag_path"] == "Themen/Brautbriefe"
    )
    assert child_row["parent_name"] == "Themen"
    assert child_row["tag_name"] == "Brautbriefe"
def test_build_tag_tree_root_has_empty_parent():
    """A root row has an empty ``parent_name`` and its own name as ``tag_name``."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe"])
    root_row = next(entry for entry in tree if entry["tag_path"] == "Themen")
    assert root_row["parent_name"] == ""
    assert root_row["tag_name"] == "Themen"
def test_build_tag_tree_no_duplicates():
    """Repeated input paths do not produce repeated ``tag_path`` rows."""
    paths = ["Themen/Brautbriefe", "Themen/Alltag", "Themen/Brautbriefe"]
    tag_paths = [entry["tag_path"] for entry in tags.build_tag_tree(paths)]
    unique_paths = set(tag_paths)
    assert len(tag_paths) == len(unique_paths)