Adds tags.py module implementing a three-outcome heuristic:
- Individual-to-individual correspondence tags ("Clara an Herbert") → dropped
- Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value>
- Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value>
Three correspondence patterns detected: space-an-space, starts-with-"an ",
and abbreviated-sender form ("Maria W.an Clara").
COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms
(söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel.
Also adds two-phase summary mining: every run emits review/tag-candidates.csv;
subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags.
Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths;
canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
87 lines
2.9 KiB
Python
87 lines
2.9 KiB
Python
"""Write canonical .xlsx outputs and review .csv files."""
|
|
import csv
|
|
import datetime
|
|
from pathlib import Path
|
|
import openpyxl
|
|
|
|
# Separator inserted between list items when a list is flattened into one cell.
_PIPE = "|"

# openpyxl stamps docProps with the current time on every save; pinning the
# workbook metadata to a fixed instant keeps reruns content-deterministic
# (NFR-IDEM-01).
_FIXED_TS = datetime.datetime(2020, 1, 1)
|
|
|
|
|
|
def _join(value):
    """Flatten *value* into the string stored in a single spreadsheet cell.

    ``None`` becomes the empty string, a ``list`` becomes its items
    (each passed through ``str``) joined with ``_PIPE``, and anything else
    is converted with ``str``.
    """
    if value is None:
        return ""
    if isinstance(value, list):
        return _PIPE.join(map(str, value))
    return str(value)
|
|
|
|
|
|
def _csv_safe(value):
    """Neutralise spreadsheet formula injection (CWE-1236) in human-opened review CSVs.

    ``None`` becomes the empty string; every other value is converted with
    ``str``. If the resulting text begins with one of ``=``, ``+``, ``-``,
    ``@``, tab, carriage return or newline, a single quote is prepended.
    """
    if value is None:
        return ""
    text = str(value)
    if text.startswith(("=", "+", "-", "@", "\t", "\r", "\n")):
        return "'" + text
    return text
|
|
|
|
|
|
# Column order of the documents workbook; each name is also the attribute
# read from every document record when the sheet is written.
DOC_COLUMNS = [
    "index",
    "box",
    "folder",
    "sender_person_id",
    "sender_name",
    "receiver_person_ids",
    "receiver_names",
    "date_iso",
    "date_raw",
    "date_precision",
    "location",
    "tags",
    "summary",
    "source_row",
    "needs_review",
]
|
|
|
|
# Column order of the persons workbook; each name is also the attribute
# read from every person record when the sheet is written.
PERSON_COLUMNS = [
    "person_id",
    "last_name",
    "first_name",
    "maiden_name",
    "title",
    "nickname",
    "birth_date",
    "birth_date_raw",
    "birth_place",
    "death_date",
    "death_date_raw",
    "death_place",
    "spouse",
    "generation",
    "notes",
    "aliases",
    "provisional",
]
|
|
|
|
|
|
def _write_xlsx(records, columns, path: Path):
    """Write *records* to a single-sheet workbook saved at *path*.

    The first row holds *columns*. Each following row holds, for one record,
    the attribute named by each column, flattened to a string by ``_join``.
    The workbook's created/modified properties are pinned to ``_FIXED_TS``
    and the parent directory of *path* is created if it does not exist.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    sheet.append(columns)
    # NOTE(review): every cell value is a plain string here; openpyxl appears
    # to store strings starting with "=" as formulas — confirm whether that
    # matters for downstream consumers of this workbook.
    for record in records:
        sheet.append([_join(getattr(record, name)) for name in columns])

    props = workbook.properties
    props.created = _FIXED_TS
    props.modified = _FIXED_TS

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    workbook.save(path)
|
|
|
|
|
|
def write_documents_xlsx(docs, path: Path):
    """Save *docs* as an .xlsx workbook at *path*, one row per record, laid out per ``DOC_COLUMNS``."""
    _write_xlsx(records=docs, columns=DOC_COLUMNS, path=path)
|
|
|
|
|
|
def write_tag_tree_xlsx(tree: list[dict], path: Path):
    """Save the tag hierarchy *tree* as an .xlsx workbook at *path*.

    Each dict in *tree* becomes one row with the cells ``tag_path``,
    ``parent_name`` and ``tag_name``; a missing key yields an empty string.
    The workbook's created/modified properties are pinned to ``_FIXED_TS``
    and the parent directory of *path* is created if it does not exist.
    """
    header = ["tag_path", "parent_name", "tag_name"]

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(header)
    for node in tree:
        sheet.append([node.get(key, "") for key in header])

    props = workbook.properties
    props.created = _FIXED_TS
    props.modified = _FIXED_TS

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    workbook.save(path)
|
|
|
|
|
|
def write_persons_xlsx(people, path: Path):
    """Save *people* as an .xlsx workbook at *path*, one row per record, laid out per ``PERSON_COLUMNS``."""
    _write_xlsx(records=people, columns=PERSON_COLUMNS, path=path)
|
|
|
|
|
|
def write_review_csv(path: Path, header: list[str], rows: list[list]):
    """Write a UTF-8 review CSV to *path*: *header* first, then *rows*.

    The parent directory is created if needed. *header* is written as given;
    every cell of every data row is passed through ``_csv_safe`` before
    being written.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with target.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows([_csv_safe(cell) for cell in row] for row in rows)
|
|
|
|
|
|
def write_summary(path: Path, stats: dict):
    """Render a grouped, scannable summary. Keys beginning with '#' are section headers.

    *stats* is walked in insertion order and its keys must be strings. A key
    starting with ``#`` emits a blank separator line followed by the rest of
    the key (stripped) and a colon; its value is ignored. Any other key emits
    an indented ``key: value`` line. The result is written to *path* as UTF-8
    with exactly one trailing newline, and the parent directory is created
    if needed.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    lines = []
    for key, value in stats.items():
        if key.startswith("#"):
            lines.append("")
            lines.append(key[1:].strip() + ":")
        else:
            lines.append(f" {key}: {value}")

    # Drop only the leading separator newline(s): a blanket strip() would also
    # eat the indentation of the first entry when stats does not open with a
    # section header. Trailing whitespace is still trimmed as before.
    body = "\n".join(lines).lstrip("\n").rstrip()
    target.write_text(body + "\n", encoding="utf-8")
|