feat(normalizer): generate structured tags from Schlagwort + Inhalt fields
Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>
Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").
COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.
Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.
Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -51,7 +51,7 @@ def test_to_canonical_resolves_and_flags():
|
||||
assert doc.sender_person_id == "de-gruyter-walter"
|
||||
assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias
|
||||
assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY"
|
||||
assert doc.tags == ["Brautbriefe"]
|
||||
assert doc.tags == ["Themen/Brautbriefe"]
|
||||
assert doc.needs_review == []
|
||||
|
||||
def test_to_canonical_unmatched_and_unparsed():
|
||||
|
||||
@@ -62,3 +62,60 @@ def test_run_end_to_end(tmp_path):
|
||||
assert _matrix(out_dir / "canonical-persons.xlsx") == persons1
|
||||
assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1
|
||||
assert len(docs1) == 4 # header + 3 docs
|
||||
|
||||
|
||||
def test_tag_tree_output_emitted(tmp_path):
    """A normalizer run must leave canonical-tag-tree.xlsx in the output directory."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )
    tag_tree_file = out_dir / "canonical-tag-tree.xlsx"
    assert tag_tree_file.exists()
|
||||
|
||||
|
||||
def test_tag_candidates_review_emitted(tmp_path):
    """A normalizer run must write review/tag-candidates.csv mentioning 'candidate' and 'count'."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )
    candidates_csv = review_dir / "tag-candidates.csv"
    assert candidates_csv.exists()
    contents = candidates_csv.read_text(encoding="utf-8")
    assert "candidate" in contents
    assert "count" in contents
|
||||
|
||||
|
||||
def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
    """The documents workbook stores Schlagwort tags as 'Themen/...' paths.

    Checks the "tags" column of canonical-documents.xlsx: at least one cell
    must contain "Themen/Brautbriefe", and no individual tag may be the bare,
    un-prefixed "Brautbriefe".
    """
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
    )
    wb = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
    ws = wb.active
    header = [c.value for c in ws[1]]
    # openpyxl columns are 1-based, list indexes are 0-based.
    tag_column = header.index("tags") + 1
    tag_values = [
        ws.cell(row=r, column=tag_column).value
        for r in range(2, ws.max_row + 1)
    ]
    assert any(v and "Themen/Brautbriefe" in v for v in tag_values)
    # Cells hold pipe-separated tag paths, so inspect every individual tag;
    # comparing the whole cell would miss a bare tag next to other tags.
    individual_tags = [
        part.strip() for v in tag_values if v for part in v.split("|")
    ]
    assert "Brautbriefe" not in individual_tags
|
||||
|
||||
|
||||
def test_approved_themes_applied(tmp_path):
    """Keywords from an approved-themes CSV appear as Themen tags in the documents workbook."""
    approved_csv = tmp_path / "approved-themes.csv"
    approved_csv.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path),
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
        approved_themes_path=approved_csv,
    )
    sheet = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx").active
    column_names = [cell.value for cell in sheet[1]]
    tags_index = column_names.index("tags")
    cell_values = []
    for row_number in range(2, sheet.max_row + 1):
        cell_values.append(sheet.cell(row=row_number, column=tags_index + 1).value)
    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
    assert any(value and "Themen/geschäftsreise" in value for value in cell_values)
|
||||
|
||||
191
tools/import-normalizer/tests/test_tags.py
Normal file
191
tools/import-normalizer/tests/test_tags.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import tags
|
||||
|
||||
|
||||
# --- classify_schlagwort ---
|
||||
|
||||
def test_semantic_tag_kept_as_themen():
    """A plain semantic keyword is expected back under the Themen root."""
    result = tags.classify_schlagwort("Brautbriefe")
    assert result == ["Themen/Brautbriefe"]
|
||||
|
||||
def test_everyday_tag_kept_as_themen():
    """A multi-word everyday keyword is expected back under the Themen root."""
    result = tags.classify_schlagwort("Alltag in Ruhrort")
    assert result == ["Themen/Alltag in Ruhrort"]
|
||||
|
||||
def test_event_tag_kept_as_themen():
    """An event keyword ("zur Hochzeit") is expected back under the Themen root."""
    result = tags.classify_schlagwort("zur Hochzeit")
    assert result == ["Themen/zur Hochzeit"]
|
||||
|
||||
def test_individual_correspondence_dropped():
    """Person-to-person correspondence is expected to yield no tags."""
    result = tags.classify_schlagwort("Clara an Herbert")
    assert result == []
|
||||
|
||||
def test_individual_correspondence_with_year_dropped():
    """Person-to-person correspondence with a trailing year is expected to yield no tags."""
    result = tags.classify_schlagwort("Herbert an Clara 1918")
    assert result == []
|
||||
|
||||
def test_individual_with_role_dropped():
    """A sender given with a role word ("Vater Juan") is expected to yield no tags."""
    result = tags.classify_schlagwort("Vater Juan an Herbert")
    assert result == []
|
||||
|
||||
def test_relational_receiver_dropped():
    """A singular relational receiver ("ihre Mutter") is expected to yield no tags."""
    result = tags.classify_schlagwort("Clara an ihre Mutter")
    assert result == []
|
||||
|
||||
def test_group_receiver_kinder_kept_as_briefwechsel():
    """A letter addressed to "Kinder" is expected back under the Briefwechsel root."""
    result = tags.classify_schlagwort("Clara an Kinder")
    assert result == ["Briefwechsel/Clara an Kinder"]
|
||||
|
||||
def test_group_receiver_eltern_kept():
    """A letter addressed to "seine Eltern" is expected back under the Briefwechsel root."""
    result = tags.classify_schlagwort("Herbert an seine Eltern")
    assert result == ["Briefwechsel/Herbert an seine Eltern"]
|
||||
|
||||
def test_group_receiver_geschwister_kept():
    """A letter addressed to "Geschwister" is expected back under the Briefwechsel root."""
    result = tags.classify_schlagwort("Walter an Geschwister")
    assert result == ["Briefwechsel/Walter an Geschwister"]
|
||||
|
||||
def test_group_receiver_schwiegereltern_kept():
    """A letter addressed to "Schwiegereltern" is expected back under the Briefwechsel root."""
    result = tags.classify_schlagwort("Clara an Schwiegereltern")
    assert result == ["Briefwechsel/Clara an Schwiegereltern"]
|
||||
|
||||
def test_group_receiver_soehne_kept():
    """A letter addressed to "ihre Söhne" is expected back under the Briefwechsel root."""
    result = tags.classify_schlagwort("Mutter Cram an ihre Söhne")
    assert result == ["Briefwechsel/Mutter Cram an ihre Söhne"]
|
||||
|
||||
def test_group_receiver_brueder_kept():
    """A letter addressed to "Brüder" is expected back under the Briefwechsel root."""
    result = tags.classify_schlagwort("Hans an Brüder")
    assert result == ["Briefwechsel/Hans an Brüder"]
|
||||
|
||||
def test_group_receiver_cousinen_kept():
    """A collective receiver followed by a place suffix is expected back under Briefwechsel."""
    result = tags.classify_schlagwort("Clara an Cousinen in Göttingen")
    assert result == ["Briefwechsel/Clara an Cousinen in Göttingen"]
|
||||
|
||||
def test_group_receiver_freunde_kept():
    """A tag containing "Freunde" is expected back under the Briefwechsel root.

    Note: in this input "Freunde" stands on the left of "an", i.e. on the
    sender side, even though the test name says "receiver".
    """
    result = tags.classify_schlagwort("Freunde an Herbert")
    assert result == ["Briefwechsel/Freunde an Herbert"]
|
||||
|
||||
def test_group_sender_geschwister_kept():
    """A collective term on the sender side is expected back under the Briefwechsel root."""
    # collective on the LEFT side of "an"
    result = tags.classify_schlagwort("Geschwister Cram an Herbert")
    assert result == ["Briefwechsel/Geschwister Cram an Herbert"]
|
||||
|
||||
def test_receiver_only_individual_dropped():
    """A receiver-only tag naming a single person is expected to yield no tags."""
    # starts with "an " — single individual receiver
    result = tags.classify_schlagwort("an Walter de Gruyter")
    assert result == []
|
||||
|
||||
def test_receiver_only_group_kept():
    """A receiver-only tag naming a collective is expected back under the Briefwechsel root."""
    # starts with "an " — collective receiver
    result = tags.classify_schlagwort("an ihre Geschwister")
    assert result == ["Briefwechsel/an ihre Geschwister"]
|
||||
|
||||
def test_abbreviated_sender_individual_dropped():
    """An abbreviated sender joined by ".an" to a single person is expected to yield no tags."""
    # "Maria W.an Clara" — abbreviated name + ".an"
    result = tags.classify_schlagwort("Maria W.an Clara")
    assert result == []
|
||||
|
||||
def test_abbreviated_sender_group_kept():
    """An abbreviated sender joined by ".an" to a collective is expected back under Briefwechsel."""
    result = tags.classify_schlagwort("Eugenie sen.an Kinder")
    assert result == ["Briefwechsel/Eugenie sen.an Kinder"]
|
||||
|
||||
def test_empty_schlagwort_returns_empty():
    """An empty Schlagwort string is expected to yield an empty tag list."""
    result = tags.classify_schlagwort("")
    assert result == []
|
||||
|
||||
def test_einzelkinder_kept():
    """A tag with "Enkelkinder" as sender is expected back under the Briefwechsel root.

    Note: the test name says "einzelkinder", but the input exercised here is
    "Enkelkinder".
    """
    result = tags.classify_schlagwort("Enkelkinder an Clara")
    assert result == ["Briefwechsel/Enkelkinder an Clara"]
|
||||
|
||||
def test_geschw_abbreviation_kept():
    """The abbreviation "Geschw." on the receiver side is expected to keep the tag under Briefwechsel."""
    # "Geschw." abbreviation for Geschwister — appears after "u" in receiver side
    result = tags.classify_schlagwort("Bruder Hans an Herbert u Geschw.")
    assert result == ["Briefwechsel/Bruder Hans an Herbert u Geschw."]
|
||||
|
||||
|
||||
# --- mine_summary_candidates ---
|
||||
|
||||
def test_mine_candidates_counts_words():
    """Mined candidates are expected lower-cased and counted across all summaries."""
    mined = dict(tags.mine_summary_candidates(["Reise, Hochzeit", "Reise", "Krieg"]))
    for word, expected_count in (("reise", 2), ("hochzeit", 1), ("krieg", 1)):
        assert mined[word] == expected_count
|
||||
|
||||
def test_mine_candidates_filters_stop_words():
    """Common German function words must not be reported, while content words are."""
    mined = dict(
        tags.mine_summary_candidates(["und die Reise", "das ist eine Reise"])
    )
    assert "reise" in mined
    for stop_word in ("und", "die", "das", "ist", "eine"):
        assert stop_word not in mined
|
||||
|
||||
def test_mine_candidates_filters_contracted_prepositions():
    """Contracted prepositions and similar short words must not be reported as candidates."""
    # im=in+dem, zum=zu+dem, zur=zu+der, vom=von+dem, sich, am, beim
    sample = "im Sommer zum Besuch, zur Hochzeit vom Vater, sich gefreut am Morgen beim Fest"
    mined = dict(tags.mine_summary_candidates([sample]))
    # NOTE(review): "ans" is checked below but never occurs in the sample text,
    # so its check cannot fail as written.
    stop_words = ("im", "zum", "zur", "vom", "sich", "am", "beim", "ans")
    for stop in stop_words:
        assert stop not in mined, f"stop word '{stop}' leaked through"
    assert "besuch" in mined
    assert "hochzeit" in mined
|
||||
|
||||
def test_mine_candidates_filters_single_chars():
    """Single-character tokens must be dropped while real words are kept."""
    summaries = ["x Reise y"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    # Positive control: without it the test would pass even if mining
    # returned nothing at all.
    assert "reise" in candidates
    assert "x" not in candidates
    assert "y" not in candidates
|
||||
|
||||
def test_mine_candidates_sorted_descending():
    """Candidates are expected in order of non-increasing count."""
    mined = tags.mine_summary_candidates(
        ["Reise", "Reise", "Hochzeit", "Reise", "Hochzeit", "Krieg"]
    )
    frequencies = [pair[1] for pair in mined]
    # Every count must be at least as large as the one that follows it.
    assert all(
        earlier >= later
        for earlier, later in zip(frequencies, frequencies[1:])
    )
|
||||
|
||||
def test_mine_candidates_empty_summaries():
    """No summaries, or only an empty summary, must yield an empty candidate list."""
    for summaries in ([], [""]):
        assert tags.mine_summary_candidates(summaries) == []
|
||||
|
||||
|
||||
# --- load_approved_themes and apply_approved_themes ---
|
||||
|
||||
def test_apply_themes_match_found(tmp_path):
    """An approved keyword occurring in the text is expected back as a Themen tag."""
    # The tmp_path fixture is requested but not used by this test.
    approved = {"reise", "hochzeit"}
    applied = tags.apply_approved_themes("Reise nach Berlin", approved)
    assert "Themen/reise" in applied
|
||||
|
||||
def test_apply_themes_case_insensitive(tmp_path):
    """An upper-case text must still match a lower-case approved keyword."""
    # The tmp_path fixture is requested but not used by this test.
    approved = {"reise"}
    applied = tags.apply_approved_themes("REISE", approved)
    assert "Themen/reise" in applied
|
||||
|
||||
def test_apply_themes_no_match(tmp_path):
    """A text containing no approved keyword must yield an empty list."""
    # The tmp_path fixture is requested but not used by this test.
    approved = {"krieg"}
    applied = tags.apply_approved_themes("Alltag in Ruhrort", approved)
    assert applied == []
|
||||
|
||||
def test_apply_themes_multiple_matches():
    """Each approved keyword found in the text is expected to contribute one Themen tag."""
    approved = {"reise", "hochzeit"}
    applied = tags.apply_approved_themes("Reise zur Hochzeit", approved)
    assert len(applied) == 2
    for expected_path in ("Themen/reise", "Themen/hochzeit"):
        assert expected_path in applied
|
||||
|
||||
|
||||
# --- encode_tags ---
|
||||
|
||||
def test_encode_tags_single():
    """A single tag path is expected to encode to itself, without a separator."""
    encoded = tags.encode_tags(["Themen/Brautbriefe"])
    assert encoded == "Themen/Brautbriefe"
|
||||
|
||||
def test_encode_tags_multiple():
    """Several tag paths are expected joined by "|" in their given order."""
    tag_paths = ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
    encoded = tags.encode_tags(tag_paths)
    assert encoded == "Themen/Brautbriefe|Briefwechsel/Clara an Kinder"
|
||||
|
||||
def test_encode_tags_empty():
    """An empty tag list is expected to encode to the empty string."""
    encoded = tags.encode_tags([])
    assert encoded == ""
|
||||
|
||||
|
||||
# --- build_tag_tree ---
|
||||
|
||||
def test_build_tag_tree_includes_roots():
    """The tag tree must contain a row for every root category used by the paths."""
    rows = tags.build_tag_tree(
        ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
    )
    listed_paths = [entry["tag_path"] for entry in rows]
    for root in ("Themen", "Briefwechsel"):
        assert root in listed_paths
|
||||
|
||||
def test_build_tag_tree_includes_children():
    """A child row must carry its parent's name and its own leaf name."""
    rows = tags.build_tag_tree(["Themen/Brautbriefe"])
    child_row = next(
        entry for entry in rows if entry["tag_path"] == "Themen/Brautbriefe"
    )
    assert child_row["parent_name"] == "Themen"
    assert child_row["tag_name"] == "Brautbriefe"
|
||||
|
||||
def test_build_tag_tree_root_has_empty_parent():
    """A root row must have an empty parent name and its own name as tag name."""
    rows = tags.build_tag_tree(["Themen/Brautbriefe"])
    root_row = next(entry for entry in rows if entry["tag_path"] == "Themen")
    assert root_row["parent_name"] == ""
    assert root_row["tag_name"] == "Themen"
|
||||
|
||||
def test_build_tag_tree_no_duplicates():
    """Repeated input paths must not produce repeated rows in the tag tree."""
    rows = tags.build_tag_tree(
        ["Themen/Brautbriefe", "Themen/Alltag", "Themen/Brautbriefe"]
    )
    listed_paths = [entry["tag_path"] for entry in rows]
    unique_paths = set(listed_paths)
    assert len(unique_paths) == len(listed_paths)
|
||||
Reference in New Issue
Block a user