chore(normalizer): unignore canonical-persons-tree.json from out/ exclusion

feat(normalizer): generate canonical-persons-tree.json from Personendatei 2.xlsx
157 persons, 43 relationships (29 SPOUSE_OF + 14 PARENT_OF), 89 unresolved references. 6 duplicate rows skipped (Seils family block + Christa Schütz). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 21:19:02 +02:00 · 2026-05-25 21:18:24 +02:00 · 2026-05-25 21:16:21 +02:00 · 2026-05-25 21:12:45 +02:00 · 2026-05-25 21:08:49 +02:00 · 2026-05-25 21:06:24 +02:00
63 changed files with 11573 additions and 594 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,7 @@ node_modules/
 # Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift.
 frontend/yarn.lock
 **/.venv/
 **/__pycache__/
 *.pyc
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -197,7 +197,6 @@ frontend/src/routes/
 ├── aktivitaeten/           Unified activity feed (Chronik)
 ├── geschichten/            Stories — list, [id], [id]/edit, new
 ├── stammbaum/              Family tree (Stammbaum)
 ├── themen/                 Topics directory — browsable tag index
 ├── enrich/                 Enrichment workflow — [id], done
 ├── admin/                  User, group, tag, OCR, system management
 ├── hilfe/transkription/    Transcription help page
--- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentListItem.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentListItem.java
@@ -6,7 +6,6 @@ import org.raddatz.familienarchiv.person.Person;
 import org.raddatz.familienarchiv.tag.Tag;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
 import java.util.List;
 import java.util.UUID;
@@ -33,9 +32,5 @@ public record DocumentListItem(
        @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
        List<ActivityActorDTO> contributors,
        @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
-        SearchMatchData matchData,
+        SearchMatchData matchData
        @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
        LocalDateTime createdAt,
        @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
        LocalDateTime updatedAt
 ) {}
--- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java
@@ -767,9 +767,7 @@ public class DocumentService {
                doc.getSummary(),
                completionPct,
                contributors,
-                match,
+                match
                doc.getCreatedAt(),
                doc.getUpdatedAt()
        );
    }
--- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentControllerTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentControllerTest.java
@@ -135,8 +135,7 @@ class DocumentControllerTest {
                .thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem(
                        docId, "Brief an Anna", "brief.pdf", null, null, null,
                        List.of(), List.of(), null, null, null, null,
-                        0, List.of(), matchData,
+                        0, List.of(), matchData))));
                        LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0)))));
        mockMvc.perform(get("/api/documents/search").param("q", "Brief"))
                .andExpect(status().isOk())
@@ -154,8 +153,7 @@ class DocumentControllerTest {
                .thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem(
                        docId, "Brief an Anna", "brief.pdf", null, null, null,
                        List.of(), List.of(), null, null, null, null,
-                        0, List.of(), matchData,
+                        0, List.of(), matchData))));
                        LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0)))));
        mockMvc.perform(get("/api/documents/search"))
                .andExpect(status().isOk())
--- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchResultTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchResultTest.java
@@ -5,7 +5,6 @@ import org.junit.jupiter.api.Test;
 import org.raddatz.familienarchiv.audit.ActivityActorDTO;
 import org.springframework.data.domain.PageRequest;
 import java.time.LocalDateTime;
 import java.util.List;
 import java.util.UUID;
@@ -17,8 +16,7 @@ class DocumentSearchResultTest {
        return new DocumentListItem(
                docId, "Test", "test.pdf", null, null, null,
                List.of(), List.of(), null, null, null, null,
-                0, List.of(), SearchMatchData.empty(),
+                0, List.of(), SearchMatchData.empty());
                LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0));
    }
    @Test
@@ -68,8 +66,7 @@ class DocumentSearchResultTest {
        DocumentListItem item = new DocumentListItem(
                id, "T", "t.pdf", null, null, null,
                List.of(), List.of(), null, null, null, null,
-                75, List.of(actor), SearchMatchData.empty(),
+                75, List.of(actor), SearchMatchData.empty());
                LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0));
        DocumentSearchResult result = DocumentSearchResult.of(List.of(item));
--- a/docs/architecture/c4/l3-frontend-3c-people-stories.puml
+++ b/docs/architecture/c4/l3-frontend-3c-people-stories.puml
@@ -14,7 +14,6 @@ System_Boundary(frontend, "Web Frontend (SvelteKit / SSR)") {
    Component(geschichten, "/geschichten and /geschichten/[id]", "SvelteKit Routes", "Story list and detail pages. Loader: GET /api/geschichten?status=PUBLISHED.")
    Component(geschichtenEdit, "/geschichten/[id]/edit and /geschichten/new", "SvelteKit Routes", "Story editor with rich text, person and document linking. Actions: PUT/POST /api/geschichten. Requires BLOG_WRITE permission.")
    Component(stammbaum, "/stammbaum", "SvelteKit Route", "Family tree visualisation. Loader: GET /api/network (nodes + edges). Renders interactive family tree from network graph data.")
    Component(themen, "/themen", "SvelteKit Route", "Browsable topic index. Shows all root tags as cards with color bars and child rows. ThemenWidget also embedded in the home dashboard (reader + editor sidebar). Loader: GET /api/tags/tree.")
    Component(profilePage, "/profile", "SvelteKit Route", "Current user profile settings. Loader: GET /api/users/me/notification-preferences. Actions: update name/password and notification preferences.")
    Component(userProfile, "/users/[id]", "SvelteKit Route", "Public user profile view. Loader: GET /api/users/{id}.")
 }
@@ -27,7 +26,6 @@ Rel(aktivitaeten, backend, "GET /api/dashboard/activity, GET /api/notifications"
 Rel(geschichten, backend, "GET /api/geschichten", "HTTP / JSON")
 Rel(geschichtenEdit, backend, "GET/PUT/POST /api/geschichten", "HTTP / JSON")
 Rel(stammbaum, backend, "GET /api/network", "HTTP / JSON")
 Rel(themen, backend, "GET /api/tags/tree", "HTTP / JSON")
 Rel(profilePage, backend, "GET/PUT /api/users/me, notification-preferences", "HTTP / JSON")
 Rel(userProfile, backend, "GET /api/users/{id}", "HTTP / JSON")
--- a/docs/import-migration/01-findings-spreadsheet-analysis.md
+++ b/docs/import-migration/01-findings-spreadsheet-analysis.md
@@ -0,0 +1,313 @@
 # Spreadsheet Analysis — Findings (2026-05-25)
 Analysis of the **real raw archive** spreadsheets against the current `MassImportService`
 (`backend/.../importing/MassImportService.java`). Goal: import ~7,600 letter rows + a
 163-person register, with PDFs to follow.
 Every issue has an ID (`IMP-NN`), severity, evidence, and a proposed approach.
 ---
 ## 0. Context: how the importer reads a row today
 `MassImportService` reads **sheet index 0** and maps columns by configurable indices
 (`app.import.col.*`, defaults in the source):
 | Property | Default col | Meaning |
 | --- | --- | --- |
 | `colIndex` | 0 | Index (→ filename `<index>.pdf`) |
 | `colBox` | 1 | Box |
 | `colFolder` | 2 | Mappe |
 | `colSender` | 3 | Sender (raw) |
 | `colReceivers` | 5 | Receivers (raw) |
 | `colDate` | 7 | Date |
 | `colLocation` | 9 | Location |
 | `colTags` | 10 | Tag (single) |
 | `colSummary` | 11 | Summary |
 | `colTranscription` | 13 | Transcription |
 These defaults match the **ODS** file exactly (`Index, Box, Mappe, Von, BriefeschreiberIn,
 An, EmpfängerIn, Datum, Datum Originalformat, Ort, Schlagwort, Inhalt, Zeitlicher Kontext,
 Transkript` = 14 cols). The ODS was the development target. The new xlsx is a different beast.
 Per-row pipeline: skip if Index blank → derive filename from Index → validate filename →
 look for file on disk (recursive; metadata-only if absent) → check PDF magic bytes →
 `importSingleDocument` (upsert by `originalFilename`, dedupe non-placeholders as
 `ALREADY_EXISTS`). Date parsing is **ISO-only** (`LocalDate.parse`).
 ---
 ## IMP-01 — New xlsx column layout ≠ importer defaults 🔴 BLOCKER
 The new `…aktuell…xlsx` (sheet `Familienarchiv`, 7,943 rows × 12 cols) has a **denser,
 different** layout. There is an extra `Datei` column at index 1, and the normalized
 `Von`/`An`/ISO-`Datum` columns from the ODS **do not exist**.
 | col | New xlsx header | Importer default expects | Result with defaults |
 | --- | --- | --- | --- |
 | 0 | Index | Index | ✅ ok |
 | 1 | **Datei** (path) | Box | ❌ Box ← `..\__scan\W-0001.pdf` |
 | 2 | Box | Mappe | ❌ Mappe ← `V` |
 | 3 | Mappe | Sender | ❌ Sender ← `1` |
 | 4 | BriefeschreiberIn (sender) | — (unused) | ❌ sender ignored |
 | 5 | EmpfängerIn (receiver) | Receivers | ✅ coincidentally ok |
 | 6 | Datum des Briefes | — (unused) | ❌ date ignored |
 | 7 | Ort (location) | Date | ❌ Date ← `Rotterdam` → null |
 | 8 | Schlagwort (tag) | — (unused) | ❌ tag ignored |
 | 9 | Inhalt (summary) | Location | ❌ Location ← summary text |
 | 10 | — | Tag | ❌ empty |
 | 11 | — | Summary | ❌ empty |
 | 13 | — | Transcription | ❌ column doesn't exist |
 **Impact:** importing as-is produces almost entirely garbage metadata.
 **Proposed approach (decide with Marcel):**
 - (a) Re-map via the existing `app.import.col.*` properties — fast, no code. New mapping:
  `index=0, box=2, folder=3, sender=4, receivers=5, date=6, location=7, tags=8, summary=9`,
  and there is **no** transcription column (point it past the end or add a "missing column"
  convention). Caveat: tags land in `colTags` but the real per-letter keywords are in
  `Inhalt` (col 9) — see IMP-08 note on tags vs summary.
 - (b) Make the importer **header-driven** (map by header name, not index) so it survives
  layout drift across files. More robust, needs a code change (→ Gitea issue).
 Recommendation: (b) is the durable fix given we have ≥3 different layouts already.
 ---
 ## IMP-02 — 90% of dates are free-text the parser can't read 🔴 BLOCKER
 The dates are written **as in the letter**. `parseDate()` only does `LocalDate.parse()`
 (ISO `yyyy-MM-dd`), so anything non-ISO becomes `null`.
 Of **7,319** rows with a date value (col 6):
 | kind | count | parses today? |
 | --- | --- | --- |
 | Real Excel date cells (→ ISO via POI) | 748 | ✅ |
 | Free-text date strings | 6,571 | ❌ → null |
 → **90% of dated rows lose their date.** (623 rows have no date at all.)
 Observed free-text formats (counts approximate, from col 6):
 | Format | Count | Examples |
 | --- | --- | --- |
 | `D.M.YY` | 1,338 | `11.10.08`, `13.5.09` |
 | `D.RomanMonth.YY/YYYY` | ~1,527 | `22.III.18`, `19.XII.1954`, `1.III.27` |
 | `D.Month YYYY` | 950 | `6.März 1888`, `9.März 1888` (note: **no space** after the dot) |
 | `D.M.YYYY` | 358 | `15.2.1888`, `7.3.1888` |
 | Approximate / unknown | 146 | `?`, `13.7.18?`, `17.Nov (?) 1887`, `13.Januar ? 1907` |
 | `Month YYYY` / season / holiday | 41+27 | `Mai 1895`, `Herbst 1913`, `Pfingsten 1922`, `Ostern 1890` |
 | `YYYY` only | 17 | `1905`, `1949` |
 | `D.M.` no year | 10 | `8.9.`, `14.3.` |
 | Ranges | 5+ | `8.1.1916 - 15.3.1916`, `1881/82`, `1945/46?` |
 | Abbrev/English months, no space | many | `29.Sept.1891`, `10.Oct.95`, `9.December1889`, `18.Dez.1916` |
 | Slash separator | ~315 | `2/2. 18`, `17/6. 1916`, `10/4. 1917` |
 | English `Month D. YYYY` | several | `April 12. 1922`, `Oct.5. 1916`, `Mai 23. 1917` |
 | Trailing notes | 5+ | `26.4.1888, 2. Brief`, `31.8.1888,2.Brief` |
 | 3-digit year (typo) | 107 | `30.1.889` (→ 1889); also mistyped 4-digit years such as `4.3.1023` (in person file → 1923) |
 | Day-range within month | several | `7./8. Sept.1923` |
 **Proposed approach:** build a tolerant German/historical date parser (→ Gitea issue, it's
 a code change). Requirements:
 - Numeric `D.M.YY[YY]` and `D/M. YY[YY]` (slash = dot).
 - Roman-numeral months (`I`–`XII`).
 - German + English month names, full + abbreviated, with/without separating space
  (`März`, `Sept.`, `Dez`, `December`, `Oct.`).
 - 2-digit and 3-digit year normalization (`08`→1908? needs a century rule; `889`→1889).
 - Partial dates → store what's known. The schema only has a single `documentDate
  LocalDate`; **decide** whether to (i) store first-of-month/year, (ii) add a
  `datePrecision` enum + `dateOriginal` text column, or (iii) keep raw text in a new
  `documentDateRaw` field and leave `documentDate` null when imprecise. Recommendation:
  preserve the **original string** always (new column) + best-effort parsed date +
  precision flag, so nothing is lost and the UI can show "ca. 1916".
 - Unparseable/approximate (`?`, `Herbst 1913`) → keep raw, leave parsed date null, **do
  not drop the row**.
 **Cross-check:** even after IMP-01 is fixed so the date column is read, IMP-02 still bites.
 Both must be solved before a real import.
 ---
 ## IMP-03 — New xlsx has no normalized/ISO date or name columns 🔴 BLOCKER
 The ODS had helper columns the importer relied on: `Von`/`An` (normalized names) and
 `Datum` (ISO) alongside `Datum Originalformat`. The new xlsx has **only the raw**
 `BriefeschreiberIn` / `EmpfängerIn` / `Datum des Briefes`. So:
 - Names must be parsed from raw strings (PersonNameParser already does receivers; **sender
  is taken raw, never split** — fine for senders, which are single, but no normalization).
 - Dates must be parsed from raw (IMP-02).
 This is the root reason IMP-01/02 exist: the new file is the *uncurated* source, not the
 hand-normalized ODS. Tie any importer redesign to this reality — we will not get clean
 helper columns in the 7k-row file.
 ---
 ## IMP-04 — Person register not imported at all 🟠 MAJOR
 `Personendatei 2.xlsx` → sheet `Tabelle1`, **163 people**, columns:
 `Generation, Familienname, Vorname, geb als (maiden), Geburtsdatum, Geburtsort,
 Todesdatum, Sterbeort, verheiratet mit, Bemerkung`.
 Today `MassImportService` has **no person-register import**. Persons are only
 auto-created as bare aliases from the document sender/receiver strings
 (`personService.findOrCreateByAlias`). All this rich genealogical data is unused:
 - birth/death dates + places,
 - maiden names (the key to dedup — see IMP-05),
 - `verheiratet mit` (marriage links → `PersonRelationship` domain),
 - `Bemerkung` relationship hints (`"Schwester v Marie Cram"`, `"Nichte von Herbert"`),
 - `Generation` (G 1–G 4),
 - nicknames in quotes (`"Tante Lolly"`).
 Data-quality notes in this file too: multi-value `Vorname` (`Charlotte,Meta,Jacobi`);
 mixed Excel-date vs text dates; typos (`4.3.1023`); missing-day dates (`.12.1955`);
 trailing spaces (`30.8.1862 `).
 **Proposed approach:** a separate **Person import** (→ Gitea issue). Order matters: import
 persons *first* so documents can link to real people instead of creating alias stubs.
 Use `geb als` + `verheiratet mit` to pre-build the alias/relationship graph.
 ---
 ## IMP-05 — Name variations create duplicate Persons 🟠 MAJOR
 The same person appears under several surface forms across the document sheet:
 - `Eugenie Müller` (151) vs `Eugenie de Gruyter` (452) — maiden vs married.
 - `Clara Cram` (sender 1,284) vs `Clara de Gruyter` (455) vs `Clara de Gruyter sen.` (66).
 - `Walter de Gruyter` (589) vs bare `Walter` (78).
 `findOrCreateByAlias` keys on the raw string, so each variant becomes (or matches) a
 distinct alias and likely a **distinct Person**. Result: fragmented person records,
 broken Briefwechsel pairing, wrong stats.
 **Proposed approach:** drive dedup from the register's `geb als` column (IMP-04) —
 `Eugenie de Gruyter geb Müller` tells us the two strings are one person. Build an alias
 map (married ↔ maiden ↔ nickname) before/while importing documents. This is partly data
 (an alias mapping table/sheet) and partly code (consume it). Likely a Gitea issue once the
 mapping format is decided.
 945 distinct sender strings / 274 distinct receiver strings — expect a long-tail of
 variants to reconcile. Don't try to be perfect on the first pass; get the high-frequency
 names right.
 ---
 ## IMP-06 — 93 data rows with blank Index are silently dropped 🟠 MAJOR
 `processRows` does `if (index.isBlank()) continue;`. **93 rows** have a blank Index but
 carry other data (sender/receiver/date/etc.). These are silently skipped — they don't even
 appear in the `skippedFiles` report (that list only covers rows that *had* an index but
 failed file checks).
 **Proposed approach:** before import, triage these 93 rows — are they continuation rows,
 section markers, or genuine letters missing an ID? At minimum, surface a count/warning so
 nothing vanishes unnoticed. Possibly a small importer change to report blank-index skips.
 ---
 ## IMP-07 — 43 duplicate Index values 🟡 MINOR
 43 Index values repeat (e.g. `W-0388`, `Eu-0332`, `C-0234`, `C-0235`, `C-0236`, `J-0175`).
 Since the filename is derived from Index, the importer's upsert keys both rows on the same
 `originalFilename`: the second occurrence is treated as `ALREADY_EXISTS` (if the first
 isn't a placeholder) and **its metadata is lost**, or it overwrites a placeholder.
 **Proposed approach:** list the 43 duplicates, check whether they're true duplicates or
 two distinct letters that share an ID by mistake. Fix in the source data, or extend the ID
 scheme. Data task first; software only if the ID scheme must change.
 ---
 ## IMP-08 — Section/title rows interleaved with data 🟡 MINOR
 Row 2 of the sheet is a section header sitting only in the sender column
 (`Brautbriefe von Walter der Gruyter an Eugenie Müller`) with a blank Index — caught by the
 blank-Index skip (overlaps IMP-06). There may be more such banners scattered through 7,943
 rows. Also relevant: the per-letter **keywords live in `Inhalt` (col 9)** as comma-joined
 values (`Tilburg,Verwandschaft`, `poetisch,Reise nach Breda`), while `Schlagwort` (col 8)
 holds a single broad tag (`Brautbriefe`). The importer only takes **one** tag column —
 decide which column feeds tags vs summary, and whether to split comma-lists into multiple
 tags.
 **Proposed approach:** scan for rows where Index is blank but other cells are set (a count
 already exists — see the 93 rows in IMP-06). Confirm tag vs summary column choice with
 Marcel.
 ---
 ## IMP-09 — Index ↔ Datei filename mismatches 🟡 MINOR
 The `Datei` column (col 1) holds explicit relative paths (`..\__scan\W-0001.pdf`) but they
 don't always agree with the Index. Example: row 20 has Index `W-0010x` but Datei
 `..\__scan\W-0011x.pdf`. The importer derives the filename from **Index**, so it will look
 for `W-0010x.pdf` and may miss the actual scan. (Note: the `Datei` paths themselves are
 Windows-style with `\` and `..` and would be **rejected** by `isValidImportFilename` if anyone
 tried to use that column directly — 7,623 rows use backslashes, 7,455 contain `..`.)
 **Proposed approach:** when the PDFs arrive, reconcile Index-derived names against actual
 filenames; produce a mismatch report. Keep deriving from Index (stable IDs) but flag
 disagreements. Mostly a data/QA task.
 ---
 ## IMP-10 — `x`-suffix rows (letter backsides / enclosures) 🟡 MINOR
 **42 rows** have an `x`-suffixed Index (`W-0001x`, `W-0002x`, …). They're sparse — typically
 only Index + Datei + sender + receiver, no box/folder/date. They appear to be the reverse
 side or an enclosure of the preceding letter. The importer treats each as an independent
 Document, and the `metadataComplete` heuristic flags them complete as soon as a sender is
 present — even though date, box, and folder are all missing.
 **Proposed approach:** decide whether `x` rows should be (a) separate documents, (b) extra
 pages/files attached to their parent, or (c) skipped. Affects both the data model and the
 `metadataComplete` heuristic. Discuss with Marcel.
 ---
 ## IMP-11 — Multi-receiver separators include bare `u` / `u.` 🟡 MINOR
 `PersonNameParser.parseReceivers` already handles ` und `, ` u `, `//`, `geb.`,
 parenthesised shared surnames, and `Familie` filtering — good. But the real data also uses
 the `u` / `u.` abbreviation in forms that the top-receivers list shows are common:
 `Eugenie u Walter de Gruyter` (230), `Herbert u Clara` (94), `Juan u Marie Cram` (75),
 and space-joined pairs like `Ella Anita` (79) that may be two people.
 Raw separator tally on receivers: ` und ` ×70, `,` ×11, `;` ×2, `/` ×1 — plus the many ` u `
 cases above. Senders are **not** parsed at all (taken raw), which is fine unless a sender
 cell ever holds two names.
 **Proposed approach:** add `MassImportServiceTest` cases for the real-world strings above;
 extend the parser only where it actually fails. `Ella Anita`-style space-joined pairs are
 ambiguous — likely leave as one person unless the register says otherwise (ties to IMP-05).
 ---
 ## IMP-12 — Importer reads only the first sheet, no validation 🟡 MINOR
 `readXlsx` does `workbook.getSheetAt(0)`. For the new xlsx that's `Familienarchiv` (✅), but
 the file also contains `Inhaltsverzeichnis grob`, `Inhaltsverzeichnis WdG`, `Tabelle4`.
 There is no header validation: if the wrong file/sheet is dropped in `/import`, the importer
 will happily map columns positionally and import nonsense. Also `findSpreadsheetFile()` picks
 the **first** spreadsheet found in `/import` — with three spreadsheets present there today,
 which one wins is filesystem-order-dependent.
 **Proposed approach:** (a) validate the header row against expected names before importing;
 (b) make the target sheet/file explicit (config or header match) rather than "first found".
 Ties into the header-driven mapping in IMP-01(b).
 ---
 ## Summary of recommended sequencing
 1. **Decide the importer mapping strategy** (IMP-01): positional re-config vs header-driven.
   Header-driven is the durable choice and unblocks IMP-03/12.
 2. **Build the tolerant date parser** (IMP-02) with original-string preservation + precision.
 3. **Import the Person register first** (IMP-04) and build the alias/marriage graph,
   which feeds person dedup (IMP-05).
 4. **Then import documents**, with reporting for blank-index (IMP-06), duplicates (IMP-07),
   and section rows (IMP-08).
 5. **Reconcile files** when the ~7,000 PDFs arrive (IMP-09), and decide `x`-row semantics
   (IMP-10).
 Code-change items (→ Gitea issues when we get there): IMP-01(b), IMP-02, IMP-04, IMP-05
 (consume side), IMP-06 reporting, IMP-12. Pure-data items stay in this folder.
--- a/docs/import-migration/02-normalization-spec.md
+++ b/docs/import-migration/02-normalization-spec.md
@@ -0,0 +1,386 @@
 # Spec — Import Normalizer
 > Authored in the voice of **"Elicit"**, requirements engineer (see
 > `.claude/personas/req_engineer.md`). This is a requirements artifact: it states
 > *what* the normalizer must do and *how we'll know it's done*, in problem/behaviour
 > language. Technology choices already made during brainstorming (Python, openpyxl,
 > overrides-and-rerun) are recorded as **constraints**, not re-litigated here.
 - **Status:** Draft for review
 - **Date:** 2026-05-25
 - **Related:** [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) (issues `IMP-01..12`), [`README.md`](./README.md)
 - **Scope boundary:** This spec covers the **offline normalizer** that turns the raw
  spreadsheets into a clean, canonical dataset + review artifacts. Wiring the canonical
  contract into the Java `MassImportService` and the `Document`/`Person` model is **Phase 2**
  and gets its own spec. This spec only *defines the contract* Phase 2 must satisfy.
 ---
 ## 1. Project Brief
 **Vision.** Turn the family's human-curated, free-form archive spreadsheets into a clean,
 canonical dataset that imports deterministically — without hand-editing thousands of rows
 and without losing the historical nuance of how things were originally written.
 **Problem.** The real archive (`…aktuell…xlsx`, 7,943 rows) and the person register
 (`Personendatei 2.xlsx`, 163 people) were authored for humans to read, not machines to
 import. Dates are written as they appeared in each letter (≈90% unparseable by the current
 importer), the column layout differs from what the importer expects, and the same person
 appears under many names. Importing as-is produces garbage (see `IMP-01..12`).
 **Goal (measurable).**
 - G1 — After the automated pass, **≤ 5%** of dated rows remain `UNKNOWN`; after the
  overrides-iteration loop, **≤ 0.5%**.
 - G2 — **100%** of source rows are represented in the canonical output or in a review file —
  *zero silent drops*.
 - G3 — **100%** of original values (raw date string, raw name string, source row number)
  are preserved.
 - G4 — A full run over the current inputs completes in **< 60 s** on the dev laptop and is
  **content-deterministic** when re-run with unchanged inputs+overrides: identical canonical
  cell matrices and identical review-file contents. (Workbook metadata is pinned; literal xlsx
  byte-identity is not guaranteed because the zip container stores entry metadata.)
 **Primary actor.** Marcel — solo owner & data steward (tech comfort 4/5). Also: a future
 agent re-running the pipeline; and the `MassImportService` as the downstream consumer.
 **Non-Goals (explicitly out of scope).**
 - NG1 — Changing `MassImportService` or the DB schema (that is Phase 2).
 - NG2 — Uploading/attaching the ~7,000 PDFs (they arrive later; import matches by `index`).
 - NG3 — A GUI. The interface is spreadsheets in, CSVs out, an overrides file hand-edited.
 - NG4 — Perfect genealogical reconstruction. We resolve confidently-matchable people; the
  long tail stays as provisional persons.
 - NG5 — OCR/transcription content (the new xlsx has no transcription column).
 **Key assumptions.** (A1) Sheet `Familienarchiv` is the document source of truth.
 (A2) Archive date range is **1873–1957** (drives the 2-digit-year century rule).
 (A3) `index` is the stable document key and the basis for future PDF matching.
 (A4) `Schlagwort` is a broad tag; `Inhalt` is a short summary/topic.
 **Risks.** (R1) 2-digit/partial dates are genuinely ambiguous → mitigated by precision flag
 + overrides. (R2) Name matching false-positives merge distinct people → mitigated by
 conservative matching + review before merge. (R3) Source spreadsheet may be re-exported with
 layout drift → mitigated by header-name-based mapping, not fixed indices.
 ---
 ## 2. Personas
 **Marcel — Data Steward.** Role: solo owner of Familienarchiv. Context: holds the complete
 raw archive; PDFs follow. Tech comfort: 4/5 (semi-technical, reads CSV/spreadsheets fluently,
 not keen to hand-edit 7,600 rows). Primary goal: a clean, importable dataset he trusts.
 Frustrations: dates in ~20 formats; one ancestor under 4 name variants. **JTBD:** *"When I
 have raw, human-curated archive spreadsheets, I want to transform them into a clean importable
 dataset without losing how things were originally written, so I can load the archive and keep
 correcting edge cases as they surface."*
 **The Returning Agent.** Role: a future assistant session resuming the work. Goal: re-run the
 pipeline deterministically and understand exactly what still needs human input. **JTBD:**
 *"When I pick this up cold, I want one command and a clear residue report, so I can continue
 without re-deriving context."*
 ---
 ## 3. Constraints & Decisions Already Made
 These were settled during brainstorming and are fixed inputs to the requirements below.
 | # | Decision | Rationale |
 | --- | --- | --- |
 | C1 | **New canonical layout** with explicit headers (not the old positional ODS shape). | Fits the new data; importer becomes header-driven in Phase 2. |
 | C2 | Dates stored as **parsed (nullable) + raw + precision**. | Historical archive; never lose the original; enable "ca. 1916". |
 | C3 | **Include person resolution** (register + alias/marriage map → canonical persons) in this effort. | Maiden-name dedup needs the register. |
 | C4 | **Overrides-file + re-run** loop for residue. | Deterministic, diffable, repeatable. |
 | C5 | Implementation: **Python 3.12 + openpyxl**, standalone tool at `tools/import-normalizer/`. | Fast iteration; no Spring rebuild / coverage gate on transform code. |
 | C6 | Century rule for archive **1873–1957**: 2-digit `00–57`→`19YY`, `73–99`→`18YY`, `58–72`→**flag**; 3-digit `DDD`→`1DDD`; never 20xx. | Stated by Marcel. Boundaries live in config. |
 | C7 | `Schlagwort`→tag, `Inhalt`→summary. | Matches importer's existing semantics. |
 | C8 | Non-register correspondents become **provisional persons**. | ~945 distinct sender strings vs 163 register people. |
 ---
 ## 4. Functional Requirements
 Each requirement has a stable ID. User stories use Connextra + Given-When-Then; system rules
 use EARS. Traceability to findings in §8.
 ### 4.1 Ingest & layout (`FR-INGEST`, `FR-MAP`)
 **US-MAP-01** — *As the data steward, I want each source column mapped to a named canonical
 field regardless of its position, so a re-exported spreadsheet with shifted columns still
 imports correctly.*
 - AC1 — Given the `Familienarchiv` sheet, when the normalizer reads the header row, then it
  maps columns by **header name** (not fixed index) to the canonical fields.
 - AC2 — Given a header the normalizer does not recognise, when it runs, then it records the
  unknown header in `review/summary.txt` and continues (does not crash).
 - AC3 — Given a required source header is **absent**, when it runs, then it aborts with a
  clear message naming the missing header (fail loud, before producing partial output).
 - **REQ-INGEST-01** — The normalizer shall read only the `Familienarchiv` sheet of the
  document workbook and the `Tabelle1` sheet of the person workbook.
 - **REQ-MAP-01** — Header matching shall be case-insensitive and tolerant of runs of
  multiple internal spaces (e.g. `"Datum  des Briefes"`).
 ### 4.2 Row triage (`FR-TRIAGE`) — resolves IMP-06, IMP-07, IMP-08
 **US-TRIAGE-01** — *As the data steward, I want rows that have data but no index surfaced
 rather than dropped, so I never lose a letter silently.*
 - AC1 — Given a row whose `index` is blank but which has any other non-empty cell, when the
  normalizer runs, then that row is written to `review/blank-index-rows.csv` with its source
  row number and is **not** emitted as a canonical document.
 - AC2 — Given a fully empty row, when it runs, then the row is skipped and counted (not
  reported as an anomaly).
 - **REQ-TRIAGE-01** — If two or more rows resolve to the same `index`, then the normalizer
  shall emit all of them to `review/duplicate-index.csv` and mark each canonical row
  `needs_review = duplicate_index` (it shall **not** silently drop any of them).
 - **REQ-TRIAGE-02** — Where a row is identified as a section/banner row (blank index, text
  only in a name column), the normalizer shall classify it as such in the blank-index report.
 - **REQ-TRIAGE-03** — Rows whose `index` ends in `x` (a transcription/back-side of the base
  letter, not yet independently mappable) shall be **skipped** — not emitted as a canonical
  document — and written to `review/skipped-x-suffix.csv` with their source row and base index
  (`index` minus the trailing `x`), so they can be linked in a later pass. (Resolves IMP-10.)
 ### 4.3 Date normalization (`FR-DATE`) — resolves IMP-02, IMP-03
 **US-DATE-01** — *As the data steward, I want every date interpreted as precisely as the
 source allows, with the original always kept, so I can sort the archive and still see what the
 letter actually said.*
 - AC1 — Given a parseable date, when normalized, then `date_iso` holds the best-effort ISO
  date, `date_raw` holds the verbatim source string, and `date_precision` ∈
  `{DAY, MONTH, SEASON, YEAR, RANGE, APPROX, UNKNOWN}`.
 - AC2 — Given an unparseable date, when normalized, then `date_iso` is empty,
  `date_precision = UNKNOWN`, `date_raw` is preserved, and the value appears in
  `review/unparsed-dates.csv`.
 - AC3 — Given the same `date_raw` appears in `overrides/dates.csv`, when normalized, then the
  override's `(iso, precision)` wins over the automatic parse.
 - **REQ-DATE-01** — The parser shall accept, at minimum, these forms (see §10 examples):
  Excel/ISO; `D.M.YYYY`/`D.M.YY`; `D/M. YY[YY]` (slash treated as dot); Roman-numeral months
  `I–XII`; German + English month names, full and abbreviated, with or without a separating
  space; `Month YYYY`; season/holiday + year; bare `YYYY`; and start-anchored ranges.
 - **REQ-DATE-02** — Precision shall be assigned by what is known: full day → `DAY`; month+year
  → `MONTH` (day = 1); a **named feast/holiday + year** → resolved to its **actual calendar
  date for that year** → `DAY`; a **season + year** → representative mid-season month (day = 1)
  → `SEASON`; year only → `YEAR` (month = Jan, day = 1); a range → start date + `RANGE`; a
  value carrying an uncertainty marker (`?`, `um`, `ca`, `circa`) → `APPROX` with best-effort date.
 - **REQ-DATE-03** — Two-digit and three-digit years shall be expanded per **C6**; a 2-digit
  year in `58–72` shall yield `UNKNOWN` + a review entry rather than a guess.
 - **REQ-DATE-04** — Trailing editorial notes (e.g. `", 2. Brief"`) shall be stripped before
  parsing but preserved verbatim in `date_raw`; they shall not be interpreted as part of the date.
 - **REQ-DATE-05** — The parser shall be pure and side-effect-free so it can be unit-tested in
  isolation (see NFR-TEST-01).
 - **REQ-DATE-06** — **Movable feasts are never mapped to a fixed month**; they shall be
  computed per year from Easter (Gauss/Butcher computus): Karfreitag = Easter−2, Ostern =
  Easter Sunday, Himmelfahrt = Easter+39, Pfingst(sonntag) = Easter+49, Pfingstmontag =
  Easter+50, Fronleichnam = Easter+60, 1.–4. Advent = the 4th…1st Sunday before 25 Dec. Fixed
  feasts use a lookup table (Neujahr=01-01, Heiligabend=12-24, Weihnachten=12-25,
  Silvester=12-31, …). Seasons map to representative months: Frühling/Frühjahr=Apr, Sommer=Jul,
  Herbst=Oct, Winter=Jan. The feast/season tables and Easter algorithm live in `config.py`
  (NFR-MAINT-01).
 ### 4.4 Person resolution & dedup (`FR-PERS`, `FR-DEDUP`) — resolves IMP-04, IMP-05, IMP-11
 **US-PERS-01** — *As the data steward, I want the genealogical register turned into canonical
 people with all their known facts, so documents can link to real persons.*
 - AC1 — Given a register row, when parsed, then a canonical person is produced with
  `person_id`, name parts, `maiden_name`, birth/death (parsed + raw + place), spouse,
  generation, nickname, notes — applying the same date rules as §4.3 to birth/death dates.
 - AC2 — Given multi-value given names (`"Charlotte,Meta,Jacobi"`), when parsed, then the
  primary given name is the first; the remainder are retained as additional names/aliases.
 **US-PERS-02** — *As the data steward, I want each sender/receiver string matched to a
 canonical person where possible and never dropped otherwise, so the correspondence graph is
 complete.*
 - AC1 — Given a sender/receiver string, when resolved, then it maps to a register
  `person_id` via the alias index (exact → normalized/casefold → conservative fuzzy).
 - AC2 — Given no confident match, when resolved, then a **provisional person** is created from
  the cleaned string, linked, and listed in `review/unmatched-names.csv` (occurrence count +
  example source rows).
 - AC3 — Given the string appears in `overrides/names.csv`, when resolved, then it maps to the
  specified `person_id` (override wins).
 - AC4 — Given a multi-person receiver cell (`"Eugenie u Walter de Gruyter"`, `"Herbert u
  Clara"`, `"…//…"`, `"Hedi und Tutu (Gruber)"`), when resolved, then it is split into
  individual people, each resolved independently; ambiguous space-joined pairs
  (`"Ella Anita"`) are emitted to `review/ambiguous-receivers.csv` rather than guessed.
 - **REQ-DEDUP-01** — The alias index shall be derived from the register: canonical
  "First Last", maiden form (`geb als`), spouse-surname married form, nickname, and
  first-name-only **only when unambiguous** across the register.
 - **REQ-DEDUP-02** — The normalizer shall not merge two distinct strings into one person on
  fuzzy similarity alone (a score above the configured threshold) unless the match is
  reported; every merge must be auditable.
 - **REQ-PERS-01** — Sender cells shall be parsed for multi-person content using the same rules
  as receiver cells (today the importer parses only receivers — IMP-11).
 ### 4.5 Overrides & idempotency (`FR-OVR`) — supports the iteration loop
 - **REQ-OVR-01** — When the normalizer runs, then it shall load `overrides/dates.csv` and
  `overrides/names.csv` if present and apply them; absence of either file shall not be an error.
 - **REQ-OVR-02** — While overrides are unchanged and inputs are unchanged, re-running shall
  produce **content-identical** canonical outputs and review files — identical cell matrices
  and review-file contents; xlsx byte-identity is not required (NFR-IDEM-01).
 - **REQ-OVR-03** — Each override application shall be counted in `review/summary.txt` (how many
  dates/names were resolved by override vs automatically).
 ### 4.6 Canonical output & provenance (`FR-OUT`, `FR-PROV`) — resolves IMP-01, IMP-09, IMP-12
 - **REQ-OUT-01** — The normalizer shall write `out/canonical-documents.xlsx` and
  `out/canonical-persons.xlsx` with the headered schemas in §6.
 - **REQ-PROV-01** — Every canonical document row shall carry `source_row` (1-based row number
  in the source sheet) so any value can be traced back to the original.
 - **REQ-PROV-02** — Every canonical row shall carry a `needs_review` field listing zero or more
  flags (`duplicate_index`, `unparsed_date`, `unmatched_sender`, `unmatched_receiver`,
  `index_file_mismatch`, …) so the import and the UI can foreground uncertain data.
 - **REQ-OUT-02** — Where the source `Datei` path disagrees with the index-derived filename
  (IMP-09), the normalizer shall record the discrepancy in `review/index-file-mismatch.csv`
  and flag the row; it shall **not** alter the `index` (the stable key).
 ---
 ## 5. Non-Functional Requirements
 | ID | Category | Requirement (measurable) |
 | --- | --- | --- |
 | NFR-DATA-01 | Data integrity | 100% of source rows are accounted for in output **or** a review file; 100% of original date/name strings preserved verbatim. |
 | NFR-IDEM-01 | Determinism | Identical inputs + overrides ⇒ identical *logical* output across runs/machines: identical canonical cell matrices and review-file contents. Workbook `created`/`modified` metadata is pinned to a constant; ordering of all generated rows/aliases is stable (no set-iteration leakage). xlsx byte-identity is explicitly not required — determinism is asserted on content. |
 | NFR-PERF-01 | Performance | Full run over 7,943 doc rows + 163 person rows completes in < 60 s on the dev laptop. |
 | NFR-ACCUR-01 | Date accuracy | After automated pass, `UNKNOWN` dates ≤ 5% of dated rows; after overrides iteration, ≤ 0.5%. |
 | NFR-ACCUR-02 | Name coverage | Every sender/receiver occurrence yields a linked person (register or provisional); 0 dropped. |
 | NFR-I18N-01 | Encoding | UTF-8 end-to-end; German diacritics and ß round-trip with no mojibake in any output. |
 | NFR-TEST-01 | Testability | `dates.py` and `persons.py` have pytest tests covering every format/alias category in §10 with real examples from the archive. |
 | NFR-MAINT-01 | Maintainability | Column-name map, century boundaries, feast and season→month tables, the Easter algorithm, and fuzzy threshold live in `config.py`, not inline in logic. |
 | NFR-OBSERV-01 | Observability | `review/summary.txt` reports per-run stats: rows in, documents out, dates by precision, names matched vs provisional, overrides applied, anomalies by type. |
 | NFR-SAFETY-01 | Source safety | Source workbooks are opened read-only and never written. |
 ---
 ## 6. Data Dictionary (canonical contract)
 This is the contract Phase 2 (the importer) must consume. Field-level, format-level — not a
 DB schema.
 ### 6.1 `canonical-documents.xlsx`
 | Field | Required | Format / values | Notes |
 | --- | --- | --- | --- |
 | `index` | yes | string | Stable key; basis for PDF matching. |
 | `box` | no | string | from `Box`. |
 | `folder` | no | string | from `Mappe`. |
 | `sender_person_id` | no | person_id | resolved; empty if no sender. |
 | `sender_name` | no | string | canonical display name (or cleaned raw if provisional). |
 | `receiver_person_ids` | no | `id\|id\|…` | pipe-separated. |
 | `receiver_names` | no | `name\|name\|…` | pipe-separated, aligned with ids. |
 | `date_iso` | no | `YYYY-MM-DD` | best-effort; empty if `UNKNOWN`. |
 | `date_raw` | no | string | verbatim source date. |
 | `date_precision` | yes | enum | `DAY\|MONTH\|SEASON\|YEAR\|RANGE\|APPROX\|UNKNOWN`. |
 | `location` | no | string | from `Ort`. |
 | `tags` | no | `tag\|tag` | from `Schlagwort`. |
 | `summary` | no | string | from `Inhalt`. |
 | `source_row` | yes | int | provenance: 1-based source-sheet row (REQ-PROV-01, NFR-DATA-01). |
 | `needs_review` | yes | `flag\|flag` or empty | review flags (REQ-PROV-02). |
 ### 6.2 `canonical-persons.xlsx`
 | Field | Required | Format | Notes |
 | --- | --- | --- | --- |
 | `person_id` | yes | slug | stable id (e.g. `de-gruyter-eugenie`); collisions suffixed. |
 | `last_name` | yes | string | from `Familienname`. |
 | `first_name` | no | string | primary given name. |
 | `maiden_name` | no | string | from `geb als` — drives dedup. |
 | `title` | no | string | e.g. honorifics if present. |
 | `nickname` | no | string | from quoted `Bemerkung`/spouse field. |
 | `birth_date` / `birth_date_raw` / `birth_place` | no | ISO / string / string | §4.3 rules. |
 | `death_date` / `death_date_raw` / `death_place` | no | ISO / string / string | §4.3 rules. |
 | `spouse` | no | person_id or name | from `verheiratet mit`. |
 | `generation` | no | string | `G 1`..`G 4`. |
 | `notes` | no | string | from `Bemerkung`. |
 | `aliases` | no | `a\|b\|c` | every surface form that maps here. |
 | `provisional` | yes | bool | true if created from a document string, not the register. |
 ---
 ## 7. Prioritized Backlog (MoSCoW)
 | ID | Item | MoSCoW | Effort | Depends on |
 | --- | --- | --- | --- | --- |
 | B1 | Project scaffolding + read both workbooks (`FR-INGEST`, header map `FR-MAP`) | Must | S | — |
 | B2 | Row triage + blank/duplicate/empty reports (`FR-TRIAGE`) | Must | S | B1 |
 | B3 | Date parser + precision + century rule + Easter/feast computus + season map + tests (`FR-DATE`) | Must | L | B1 |
 | B4 | Person register parser → canonical persons (`FR-PERS` US-PERS-01) | Must | M | B1 |
 | B5 | Alias index + name resolution + multi-person split (`FR-DEDUP`, US-PERS-02) | Must | L | B4 |
 | B6 | Overrides load + apply + idempotency (`FR-OVR`) | Must | S | B3,B5 |
 | B7 | Canonical writers + provenance + review summary (`FR-OUT`, `FR-PROV`) | Must | M | B2,B3,B5 |
 | B8 | Index↔Datei mismatch report (`REQ-OUT-02`) | Should | XS | B1 |
 | B9 | Ambiguous-receiver review path (US-PERS-02 AC4) | Should | S | B5 |
 | B10 | Comma-split `Inhalt` into extra tags | Could | XS | B7 |
 | B11 | Phase-2 importer wiring (separate spec) | Won't (this spec) | — | B7 |
 ---
 ## 8. Traceability — Findings → Requirements
 | Finding | Severity | Addressed by |
 | --- | --- | --- |
 | IMP-01 layout mismatch | blocker | C1, FR-MAP, REQ-OUT-01 |
 | IMP-02 free-text dates | blocker | FR-DATE (all), C2, C6 |
 | IMP-03 no ISO/normalized cols | blocker | FR-DATE, FR-PERS |
 | IMP-04 register unimported | major | C3, US-PERS-01, §6.2 |
 | IMP-05 name variants → dupes | major | C3, FR-DEDUP |
 | IMP-06 blank-index dropped | major | US-TRIAGE-01 |
 | IMP-07 duplicate indices | minor | REQ-TRIAGE-01 |
 | IMP-08 section rows / tags vs summary | minor | REQ-TRIAGE-02, C7 |
 | IMP-09 index↔file mismatch | minor | REQ-OUT-02, B8 |
 | IMP-10 `x`-suffix rows | minor | REQ-TRIAGE-03 (skip + log this pass) |
 | IMP-11 sender not split / ` u ` sep | minor | REQ-PERS-01, US-PERS-02 AC4 |
 | IMP-12 first-sheet, no validation | minor | REQ-INGEST-01, US-MAP-01 AC2/AC3 |
 ---
 ## 9. Open Questions / TBD Register
 | ID | Question | Why it matters | Ref | Resolution |
 | --- | --- | --- | --- | --- |
 | OQ-01 ✅ | Season/holiday → date. | Accuracy of ~70 SEASON/feast rows. | REQ-DATE-06 | **Resolved (2026-05-25):** movable feasts (Ostern, Pfingsten, Himmelfahrt, Advent, …) **computed per year from Easter — never a fixed month**; fixed feasts looked up (Weihnachten=12-25, Neujahr=01-01, …); seasons = mid-season month (Frühling=Apr, Sommer=Jul, Herbst=Oct, Winter=Jan). |
 | OQ-02 ✅ | Date ranges: start only, or start+end? | Sorting/display of ~315 range values. | REQ-DATE-02 | **Confirmed:** store **start** in `date_iso`, precision `RANGE`, full text in `date_raw`. |
 | OQ-03 ✅ | `person_id` format. | Stability across re-runs; diffability. | §6 | **Confirmed:** readable slug `lastname-firstname`, numeric suffix on collision. |
 | OQ-04 ✅ | `x`-suffix row handling. | 42 rows. | REQ-TRIAGE-03 | **Resolved (2026-05-25):** `x` rows are transcriptions of the base letter but not yet mappable → **skip this pass**, log to `review/skipped-x-suffix.csv` for later linking. |
 | OQ-05 ✅ | Importer output format. | Phase-2 reader. | B11 | **Confirmed:** `.xlsx` (openpyxl-native, headered). |
 | OQ-06 ✅ | Fuzzy-match policy. | False-positive person merges (R2). | REQ-DEDUP-02 | **Confirmed:** conservative — report all fuzzy matches; no silent merge. |
 *All open questions resolved as of 2026-05-25. New ambiguities discovered during build go here.*
 ---
 ## 10. Glossary & Worked Examples
 **Precision** — how exactly a date is known (`DAY` … `UNKNOWN`). **Provisional person** — a
 person created from a document name string with no register match. **Alias index** — map from
 every known surface form of a name to a canonical `person_id`. **Override** — a
 human-supplied correction applied deterministically on each run.
 **Date examples → expected outcome:**
 | `date_raw` | `date_iso` | `date_precision` |
 | --- | --- | --- |
 | `15.2.1888` | 1888-02-15 | DAY |
 | `6.März 1888` | 1888-03-06 | DAY |
 | `22.III.18` | 1918-03-22 | DAY |
 | `13.5.09` | 1909-05-13 | DAY |
 | `10.Oct.95` | 1895-10-10 | DAY |
 | `17/6. 1916` | 1916-06-17 | DAY |
 | `Mai 1895` | 1895-05-01 | MONTH |
 | `Pfingsten 1922` | 1922-06-04 | DAY (computed: Easter 1922 = Apr 16, +49 days) |
 | `Herbst 1913` | 1913-10-01 | SEASON |
 | `1905` | 1905-01-01 | YEAR |
 | `8.1.1916 - 15.3.1916` | 1916-01-08 | RANGE |
 | `17.Nov (?) 1887` | 1887-11-17 | APPROX |
 | `?` | *(empty)* | UNKNOWN |
 **Name examples → expected outcome:**
 | raw cell | resolves to |
 | --- | --- |
 | `Eugenie Müller` (+ register `geb Müller`) | `de-gruyter-eugenie` (matched via maiden alias) |
 | `Eugenie de Gruyter` | `de-gruyter-eugenie` |
 | `Herbert u Clara` | `cram-herbert` + `cram-clara` (split, surname distributed) |
 | `Hedi und Tutu (Gruber)` | `gruber-hedi` + `gruber-tutu` |
 | `Ella Anita` | → `review/ambiguous-receivers.csv` (not auto-split) |
 | `Hans Wittkopf` (not in register) | provisional `wittkopf-hans` |
--- a/docs/import-migration/03-normalizer-implementation-plan.md
+++ b/docs/import-migration/03-normalizer-implementation-plan.md
--- a/docs/import-migration/04-unresolved-names-plan.md
+++ b/docs/import-migration/04-unresolved-names-plan.md
@@ -0,0 +1,502 @@
 # Unresolved-Name Classification Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Add a focused `review/unresolved-names.csv` that isolates sender/receiver strings whose *name itself* is problematic (unknown/illegible, single-token, relational-only, collective/group, prose-in-name-column, or a genuine two-given-name pair), and fix the ambiguous-pair heuristic so a plain `First Surname` external person (e.g. `Mieze Schefold`) is no longer falsely flagged.
 **Architecture:** A pure `classify_name(raw, given_names)` function in `persons.py` returns a `NameClass`. `ResolutionContext` classifies every *unmatched* name and records the non-`RESOLVABLE` ones in `self.unresolved`. A runtime-built given-name set (register first names + a small config supplement) lets the classifier distinguish a two-given-name pair (`Ella Anita` → two people) from a first+surname single person (`Mieze Schefold`). The orchestrator writes the aggregated report and per-category stats, replacing the noisy `ambiguous-receivers.csv`.
 **Tech Stack:** Python 3.12, openpyxl, pytest — extends the existing `tools/import-normalizer/`.
 **Context:** This builds on the completed normalizer (PR #663). Run all tests with CWD = the tool dir, e.g. `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_X.py -v`. Reuse the existing venv at `tools/import-normalizer/.venv` (do NOT recreate it). Commit on the current branch `docs/import-migration` (never main, never push). Each commit message ends with a trailing `Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>` line.
 ---
 ## File Structure
 ```
 tools/import-normalizer/
 ├── config.py        # + RELATIONAL_TERMS, COLLECTIVE_TERMS, UNKNOWN_NAME_MARKERS, PROSE_MAX_LEN, EXTRA_GIVEN_NAMES
 ├── persons.py       # + NameClass, classify_name(), build_given_names(); ResolutionContext gains given_names + self.unresolved
 ├── normalize.py     # writes unresolved-names.csv (replaces ambiguous-receivers.csv) + per-category stats
 ├── README.md        # + unresolved-names.csv row in the review-file table
 └── tests/
    ├── test_config.py     # + name-table presence test
    ├── test_persons.py    # + classify_name + build_given_names tests
    ├── test_documents.py  # ambiguous test → unresolved test (+ resolvable-pair test)
    └── test_normalize.py  # integration asserts unresolved-names.csv
 ```
 ---
 ### Task 1: Config — name-classification tables
 **Files:**
 - Modify: `tools/import-normalizer/config.py`
 - Modify: `tools/import-normalizer/tests/test_config.py`
 - [ ] **Step 1: Add the failing test** to `tests/test_config.py`
 ```python
 def test_name_classification_tables():
    assert "tante" in config.RELATIONAL_TERMS
    assert "familie" in config.COLLECTIVE_TERMS
    assert "unbekannt" in config.UNKNOWN_NAME_MARKERS
    assert config.PROSE_MAX_LEN >= 30
    assert "anita" in config.EXTRA_GIVEN_NAMES
 ```
 - [ ] **Step 2: Run to verify it fails**
 Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py::test_name_classification_tables -v)` — the subshell restores the working directory even though the test run is expected to fail.
 Expected: FAIL — `AttributeError: module 'config' has no attribute 'RELATIONAL_TERMS'`.
 - [ ] **Step 3: Implement** — append to `config.py` at module level (anywhere after the existing tables; the position relative to `KNOWN_LAST_NAMES` does not matter)
 ```python
 # --- Name classification (unresolved-name review) ---
 # Relational reference terms — a sender/receiver named by relation, not a proper name.
 RELATIONAL_TERMS = {
    "tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",
    "großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
    "neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
    "schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",
 }
 # Collective/group terms — not a single person. Matched against alpha-only word tokens
 # (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
 COLLECTIVE_TERMS = {
    "familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
    "grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
 }
 # Markers of an unknown/illegible name (the literal "?" is handled separately in code).
 # All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
 # (it occurs inside real names: Hanni, Johanna, Anna).
 UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}
 # A name-column value longer than this (chars) is treated as prose/description, not a name.
 PROSE_MAX_LEN = 40
 # Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
 # in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
 EXTRA_GIVEN_NAMES = {
    "ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
    "margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
 }
 ```
 - [ ] **Step 4: Run to verify it passes**
 Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py -v && cd -`
 Expected: PASS (all config tests).
 - [ ] **Step 5: Commit**
 ```bash
 git add tools/import-normalizer/config.py tools/import-normalizer/tests/test_config.py
 git commit -m "feat(normalizer): config tables for name classification"
 ```
 ---
 ### Task 2: `classify_name` + `NameClass`
 **Files:**
 - Modify: `tools/import-normalizer/persons.py`
 - Modify: `tools/import-normalizer/tests/test_persons.py`
 - [ ] **Step 1: Add failing tests** to `tests/test_persons.py`
 ```python
 from persons import NameClass
 GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
 def test_classify_unknown():
    assert persons.classify_name("?", GIVEN) is NameClass.UNKNOWN
    assert persons.classify_name("A. Kredell?", GIVEN) is NameClass.UNKNOWN
    assert persons.classify_name("unbekannt", GIVEN) is NameClass.UNKNOWN
 def test_classify_prose():
    assert persons.classify_name("Adressenliste v Clara Cram zur Kondolenz", GIVEN) is NameClass.PROSE
    assert persons.classify_name("Clara de Gruyter(*1871)", GIVEN) is NameClass.PROSE  # digit
    assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE        # quote
 def test_classify_collective():
    assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE
 def test_classify_relational():
    assert persons.classify_name("Cousine Emmy Haniel", GIVEN) is NameClass.RELATIONAL
    assert persons.classify_name("Schwester Hanni", GIVEN) is NameClass.RELATIONAL
 def test_classify_single_token():
    assert persons.classify_name("Agnes", GIVEN) is NameClass.SINGLE_TOKEN
    assert persons.classify_name("A.B.", GIVEN) is NameClass.SINGLE_TOKEN
 def test_classify_ambiguous_pair():
    assert persons.classify_name("Ella Anita", GIVEN) is NameClass.AMBIGUOUS_PAIR
    assert persons.classify_name("Kurt Georg", GIVEN) is NameClass.AMBIGUOUS_PAIR
 def test_classify_resolvable_single_person():
    # first + surname (surname not a given name) -> one real person, NOT ambiguous
    assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
    assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE
 ```
 - [ ] **Step 2: Run to verify it fails**
 Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -k classify -v)` — the subshell restores the working directory even though the test run is expected to fail.
 Expected: FAIL — `NameClass` / `classify_name` not defined.
 - [ ] **Step 3: Implement** — add to `persons.py`. Add `from enum import StrEnum` to the imports if not present, then add:
 ```python
 class NameClass(StrEnum):
    RESOLVABLE = "resolvable"
    UNKNOWN = "unknown"
    SINGLE_TOKEN = "single_token"
    RELATIONAL = "relational"
    COLLECTIVE = "collective"
    PROSE = "prose"
    AMBIGUOUS_PAIR = "ambiguous_pair"
 _QUOTE_CHARS = "\"'“”„‚‘’"
 def classify_name(raw: str, given_names: set[str]) -> NameClass:
    """Classify a (post-split) sender/receiver string by why it may be unresolvable.
    Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL ->
    SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE.
    """
    s = raw.strip()
    if not s:
        return NameClass.RESOLVABLE
    low = s.lower()
    tokens = s.split()
    # alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms
    # are matched as whole words (no substring/prefix false positives like "Allerton").
    alpha_words = re.findall(r"[a-zäöüß]+", low)
    if "?" in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS):
        return NameClass.UNKNOWN
    if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s)
            or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3):
        return NameClass.PROSE
    if any(w in config.COLLECTIVE_TERMS for w in alpha_words):
        return NameClass.COLLECTIVE
    if any(w in config.RELATIONAL_TERMS for w in alpha_words):
        return NameClass.RELATIONAL
    if len(tokens) == 1:
        return NameClass.SINGLE_TOKEN
    if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens):
        return NameClass.AMBIGUOUS_PAIR
    return NameClass.RESOLVABLE
 # Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is
 # classified PROSE. Such multi-particle names are rare here and usually resolve via the
 # register; if they surface in review, treat them as lower priority than the real prose entries.
 ```
 > Note: `_norm` already exists in `persons.py` (added in the alias-index task) and strips accents + lowercases. `classify_name` uses it so given-name matching is accent-insensitive.
 - [ ] **Step 4: Run to verify it passes**
 Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -`
 Expected: PASS (all persons tests, including the 7 new classify tests).
 - [ ] **Step 5: Commit**
 ```bash
 git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py
 git commit -m "feat(normalizer): classify_name + NameClass"
 ```
 ---
 ### Task 3: `build_given_names`
 **Files:**
 - Modify: `tools/import-normalizer/persons.py`
 - Modify: `tools/import-normalizer/tests/test_persons.py`
 - [ ] **Step 1: Add failing test** to `tests/test_persons.py`
 ```python
 def test_build_given_names():
    people = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Eugenie"},
        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
    ])
    g = persons.build_given_names(people, {"Anita"})
    assert "eugenie" in g
    assert "charlotte" in g and "meta" in g   # primary + extra given names
    assert "anita" in g                        # from the extra set, normalized
    assert "schefold" not in g
 ```
 - [ ] **Step 2: Run to verify it fails**
 Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py::test_build_given_names -v)` — the subshell restores the working directory even though the test run is expected to fail.
 Expected: FAIL — `build_given_names` not defined.
 - [ ] **Step 3: Implement** — add to `persons.py`
 ```python
 def build_given_names(register: list[Person], extra: set[str]) -> set[str]:
    """Set of normalized given names from the register (first + extra given) plus a supplement.
    Used by classify_name to tell a two-given-name pair (two people) from a first+surname.
    """
    names: set[str] = set()
    for p in register:
        if p.first_name:
            names.add(_norm(p.first_name))
        for g in p.extra_given_names:
            names.add(_norm(g))
    for e in extra:
        names.add(_norm(e))
    return names
 ```
 - [ ] **Step 4: Run to verify it passes**
 Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -`
 Expected: PASS.
 - [ ] **Step 5: Commit**
 ```bash
 git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py
 git commit -m "feat(normalizer): build_given_names from register + supplement"
 ```
 ---
 ### Task 4: Integrate — ResolutionContext records unresolved; orchestrator writes the report
 This task touches `persons.py`, `normalize.py`, and two test files together so the whole suite stays green in one commit (removing `ctx.ambiguous` requires updating its only consumer, `normalize.py`, in the same change).
 **Files:**
 - Modify: `tools/import-normalizer/persons.py` (ResolutionContext)
 - Modify: `tools/import-normalizer/normalize.py`
 - Modify: `tools/import-normalizer/tests/test_documents.py`
 - Modify: `tools/import-normalizer/tests/test_normalize.py`
 - [ ] **Step 1: Update the failing tests first**
 In `tests/test_documents.py`, **replace** the existing `test_ambiguous_space_pair_flagged_not_split` function entirely with these two functions:
 ```python
 def test_ambiguous_pair_recorded_in_unresolved():
    people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}])
    ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={},
                                    given_names={"ella", "anita"})
    raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")
    doc = documents.to_canonical(raw, ctx, date_overrides={})
    assert len(doc.receiver_person_ids) == 1   # not split — one provisional
    assert any(name == "Ella Anita" and cat == "ambiguous_pair" for name, cat, _ in ctx.unresolved)
 def test_resolvable_first_surname_pair_not_unresolved():
    ctx = persons.ResolutionContext(persons.AliasIndex([]), name_overrides={},
                                    given_names={"ella", "anita"})
    ctx.resolve_one("Mieze Schefold", source_row=1)   # surname is not a given name
    assert ctx.unresolved == []                        # RESOLVABLE -> not recorded
 ```
 In `tests/test_normalize.py`, in the `_doc_wb` fixture, change the `C-0001` row's receiver from empty to `"?"` so the run produces an unresolved entry. Find the line that appends the `C-0001` row and set its `EmpfängerIn` cell to `"?"`. For example the row currently reads:
 ```python
    ws.append(["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""])
 ```
 change the 6th cell (EmpfängerIn) from `""` to `"?"`:
 ```python
    ws.append(["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""])
 ```
 Then add these assertions inside `test_run_end_to_end`, right after the existing `assert (review_dir / "unparsed-dates.csv").exists()` line:
 ```python
    assert (out_dir / "canonical-documents.xlsx").exists()  # (keep existing asserts above)
    assert (review_dir / "unresolved-names.csv").exists()
    unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8")
    assert "unknown" in unresolved_text and "?" in unresolved_text   # the "?" receiver
    assert not (review_dir / "ambiguous-receivers.csv").exists()      # replaced
 ```
 - [ ] **Step 2: Run to verify they fail**
 Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py tests/test_normalize.py -v)` — the subshell restores the working directory even though the test run is expected to fail.
 Expected: FAIL — `ResolutionContext` has no `given_names`/`unresolved`; `unresolved-names.csv` not written.
 - [ ] **Step 3a: Implement — `ResolutionContext` in `persons.py`**
 In `ResolutionContext.__init__`, drop the `self.ambiguous` attribute, add the `given_names` parameter together with the `self.given_names` and `self.unresolved` attributes, and update the relevant methods as shown below. The new `__init__`:
 ```python
    def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str],
                 given_names: set[str] | None = None):
        self.index = alias_index
        self.name_overrides = name_overrides
        self.given_names = given_names or set()
        self.provisional: dict[str, Person] = {}
        self.unmatched: dict[str, list] = {}
        self.unresolved: list[tuple] = []   # (raw_name, category, source_row) for non-RESOLVABLE names
        self._raw_to_pid: dict[str, str] = {}
        self.override_hits = 0
 ```
 In `resolve_one`, the provisional branch must classify the name. Replace this existing block:
 ```python
        # provisional person (unmatched) — never reuse a register id
        self.unmatched.setdefault(name, []).append(source_row)
        if name in self._raw_to_pid:
            return self._raw_to_pid[name], name, False
 ```
 with:
 ```python
        # provisional person (unmatched) — never reuse a register id
        self.unmatched.setdefault(name, []).append(source_row)
        category = classify_name(name, self.given_names)
        if category is not NameClass.RESOLVABLE:
            self.unresolved.append((name, str(category), source_row))
        if name in self._raw_to_pid:
            return self._raw_to_pid[name], name, False
 ```
 Replace the entire `resolve_receivers` method (the ambiguous detection now lives in `resolve_one` via `classify_name`):
 ```python
    def resolve_receivers(self, raw: str, source_row: int):
        return [self.resolve_one(part, source_row) for part in split_receivers(raw)]
 ```
 - [ ] **Step 3b: Implement — `normalize.py`**
 Find the line that builds the context:
 ```python
    ctx = persons.ResolutionContext(alias_index, name_overrides)
 ```
 replace it with (build the given-name set from the register + config supplement):
 ```python
    given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES)
    ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names)
 ```
 Replace the `ambiguous-receivers.csv` write line:
 ```python
    writers.write_review_csv(review_dir / "ambiguous-receivers.csv", ["raw", "part", "source_row"], ctx.ambiguous)
 ```
 with an aggregated unresolved-names report:
 ```python
    unresolved_agg: dict[tuple, list] = {}
    for name, category, row in ctx.unresolved:
        unresolved_agg.setdefault((category, name), []).append(row)
    unresolved_rows = sorted(
        ([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))]
         for (cat, name), rows in unresolved_agg.items()),
        key=lambda r: (r[0], -r[2], r[1]))
    writers.write_review_csv(review_dir / "unresolved-names.csv",
                             ["category", "raw", "count", "example_rows"], unresolved_rows)
 ```
 In the `stats` dict, replace the `"ambiguous_receivers"` line:
 ```python
        "ambiguous_receivers": len(ctx.ambiguous),
 ```
 with a per-category breakdown:
 ```python
        "unresolved_name_occurrences": len(ctx.unresolved),
        "unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"),
        "unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"),
        "unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"),
        "unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"),
        "unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"),
        "unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"),
 ```
 - [ ] **Step 4: Run the whole suite to verify green**
 Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/ -q && cd -`
 Expected: PASS (all tests, no `ambiguous` references remain).
 Also grep to confirm no dangling references:
 Run: `grep -rn "ctx.ambiguous\|ambiguous-receivers\|ambiguous_receivers\|self.ambiguous" tools/import-normalizer/*.py`
 Expected: no matches.
 - [ ] **Step 5: Commit**
 ```bash
 git add tools/import-normalizer/persons.py tools/import-normalizer/normalize.py tools/import-normalizer/tests/test_documents.py tools/import-normalizer/tests/test_normalize.py
 git commit -m "feat(normalizer): unresolved-names report + fix ambiguous-pair over-flagging"
 ```
 ---
 ### Task 5: README — document the new report
 **Files:**
 - Modify: `tools/import-normalizer/README.md`
 - [ ] **Step 1: Update the review-file table** in `README.md`. Replace the `ambiguous-receivers.csv` row with an `unresolved-names.csv` row. Find the table row referencing `ambiguous-receivers.csv` and replace it with:
 ```markdown
 | `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv`. |
 ```
 If the README has no such row (older version), add the row above to the review-file table.
 - [ ] **Step 2: Add a note** to the iteration-loop section of `README.md` (after the table):
 ```markdown
 > `unresolved-names.csv` is the focused "names that need a human" list — distinct from
 > `unmatched-names.csv` (which is just non-family correspondents that got provisional persons).
 > The given-name set that drives `ambiguous_pair` detection is the register's first names plus
 > `config.EXTRA_GIVEN_NAMES` — add names there if a real two-person cell isn't being flagged.
 ```
 - [ ] **Step 3: Verify the suite is still green** (README-only change, but confirm nothing references the old file)
 Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/ -q && cd -`
 Expected: PASS.
 - [ ] **Step 4: Commit**
 ```bash
 git add tools/import-normalizer/README.md
 git commit -m "docs(normalizer): document unresolved-names.csv review report"
 ```
 ---
 ## Self-Review
 **Spec coverage** (against the agreed proposal):
 - Focused report isolating problem name classes → Task 4 writes `review/unresolved-names.csv` with a `category` column; categories defined in Task 2 `classify_name`. ✓
 - Fix ambiguous over-flagging of `First Surname` → Task 2 `AMBIGUOUS_PAIR` requires *both* tokens in the given-name set; `Mieze Schefold` → `RESOLVABLE` (tested). ✓
 - Distinguish "not fully known" (unknown/single-token/relational/collective/prose) from "can't split cleanly" (ambiguous_pair) → all are `NameClass` values, each its own category column value. ✓
 - Per-category counts in summary → Task 4 stats. ✓
 - Senders covered too (not just receivers) → classification happens in `resolve_one`, which both `resolve_sender` and `resolve_receivers` call. ✓
 **Placeholder scan:** No TBD/TODO; every code step has complete code. The README replacement gives the exact row text.
 **Type consistency:** `NameClass` (StrEnum) defined Task 2; `classify_name(raw, given_names)` and `build_given_names(register, extra)` signatures used consistently in Task 4; `ResolutionContext(alias_index, name_overrides, given_names=…)` matches the new `__init__`; `self.unresolved` is `list[tuple]` of `(raw, category, source_row)` and read with that shape in both the report and the stats. `str(category)` yields the StrEnum value (e.g. `"ambiguous_pair"`), matching the stat comparisons and the test assertions.
 **Cross-task green:** Task 4 deliberately bundles the `persons.py` + `normalize.py` + test changes into one commit because removing `ctx.ambiguous` breaks its consumer otherwise — no red commit is left behind (lesson from the prior build).
 **Out of scope (future):** Spanish month names + `Mon DD-YYYY` date form (separate date-parser enhancement); promoting `unresolved` rows into a document-level `needs_review` flag; auto-splitting confirmed `ambiguous_pair` entries via overrides.
--- a/docs/import-migration/README.md
+++ b/docs/import-migration/README.md
@@ -0,0 +1,62 @@
 # Import Migration — Working Folder
 This folder tracks the iterative work of mass-importing the **real, raw family archive**
 spreadsheets (≈7,600 letter rows + ~7,000 PDFs that arrive later) into Familienarchiv.
 It is intentionally **local docs, not Gitea issues**. We only open a Gitea issue when a
 finding requires a *software* change (e.g. a new date parser). Pure data observations and
 the running plan live here so any agent can pick the work up cold.
 ## Source files (in `/import`)
 | File | What it is | Importer support today |
 | --- | --- | --- |
 | `zzfamilienarchiv aktuell  2 - Kopie 2025-07-05.xlsx` | The **real raw archive** — 7,943 rows, sheet `Familienarchiv`. Human-readable, dates as written in the letters. | ❌ layout does **not** match importer defaults |
 | `Personendatei 2.xlsx` | Genealogical **person register** — 163 people, sheet `Tabelle1` (maiden names, birth/death, marriages, relationships). | ❌ no importer at all |
 | `zzfamilienarchiv Walter und Eugenie 2025-04-10.ods` | A small, **already-normalized** subset (Walter & Eugenie brautbriefe). 14 clean columns incl. ISO dates. | ✅ this is what `MassImportService` was built for |
 The PDFs (~7,000) will follow later. The importer matches files by the **Index** column
 (e.g. `W-0001` → `W-0001.pdf`), and already imports metadata-only when a file is missing —
 so we can import all metadata now and the PDFs will attach on a re-run.
 ## How to inspect the spreadsheets
 `openpyxl` is installed in the OCR service venv:
 ```bash
 /home/marcel/Desktop/familienarchiv/ocr-service/.venv/bin/python3 -c "import openpyxl; print(openpyxl.__version__)"
 ```
 ## Documents in this folder
 - [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) — full analysis of every data-quality / importer issue found (2026-05-25). Each issue has an ID `IMP-NN`.
 - [`02-normalization-spec.md`](./02-normalization-spec.md) — requirements spec for the offline **import normalizer** (the agreed strategy: normalize the raw sheets into a clean canonical dataset before import). Requirements `FR-*`/`NFR-*`, traceable to the `IMP-NN` findings.
 - `WORKLOG.md` — running log of what each session did and what's next. **Start here when resuming.**
 ## Strategy (decided 2026-05-25)
 Normalize **before** import. A standalone Python tool (`tools/import-normalizer/`, since built —
 see `WORKLOG.md`) transforms the raw xlsx + person register into a clean canonical dataset
 (`canonical-documents.xlsx`, `canonical-persons.xlsx`) plus review CSVs. Residual cases
 (unparseable dates, unmatched names) are fixed via a version-controlled overrides file and
 re-run. The Java importer is adjusted to consume the canonical contract in a later **Phase 2**.
 See the spec for the full contract.
 ## Status board
 | ID | Issue | Severity | Status |
 | --- | --- | --- | --- |
 | IMP-01 | New xlsx column layout ≠ importer defaults | 🔴 blocker | open |
 | IMP-02 | 90% of dates are free-text the parser can't read | 🔴 blocker | open |
 | IMP-03 | No ISO/normalized date column in the new xlsx | 🔴 blocker | open |
 | IMP-04 | Person register (`Personendatei 2.xlsx`) not imported | 🟠 major | open |
 | IMP-05 | Name variations = duplicate Persons (maiden vs married) | 🟠 major | open |
 | IMP-06 | 93 data rows with blank Index are silently dropped | 🟠 major | open |
 | IMP-07 | 43 duplicate Index values | 🟡 minor | open |
 | IMP-08 | Section/title rows interleaved in data | 🟡 minor | open |
 | IMP-09 | Index↔Datei filename mismatches | 🟡 minor | open |
 | IMP-10 | `x`-suffix rows (letter backsides/enclosures) | 🟡 minor | open |
 | IMP-11 | Multi-receiver separators incl. bare `u`/`u.` | 🟡 minor | open |
 | IMP-12 | Importer reads only the first sheet, no validation | 🟡 minor | open |
 See the findings doc for detail and proposed approach per issue.
--- a/docs/import-migration/WORKLOG.md
+++ b/docs/import-migration/WORKLOG.md
@@ -0,0 +1,147 @@
 # Import Migration — Worklog
 Running log of each working session. **Resume here.** Newest entry on top.
 ---
 ## 2026-05-25 (session 5) — Unresolved-name classification
 **Did:** Implemented [`04-unresolved-names-plan.md`](./04-unresolved-names-plan.md) subagent-driven
 (5 tasks, TDD, per-task spec + code-quality review; 67 tests pass). Added `classify_name` +
 `NameClass` + `build_given_names` in `persons.py`; `ResolutionContext` now records non-RESOLVABLE
 names in `self.unresolved`; orchestrator writes `review/unresolved-names.csv` (replaces the noisy
 `ambiguous-receivers.csv`) with per-category stats.
 **Why:** `unmatched-names.csv` mixes boring non-family correspondents (expected) with genuinely
 unresolvable entries. The new report isolates the latter so review focuses on ~440 real cases.
 **Real-run result:** unresolved-names.csv = single_token 191 / prose 103 / unknown 74 /
 collective 46 / relational 21 / ambiguous_pair **5** (distinct). The ambiguous over-flagging fix
 cut `ambiguous_pair` from 303 → 5 (genuine two-given-name pairs only; `Mieze Schefold` etc. now
 correctly RESOLVABLE). given-name set = register first names ∪ `config.EXTRA_GIVEN_NAMES`.
 **Next:** populate `overrides/names.csv` from unresolved-names.csv (highest-count first); extend
 `EXTRA_GIVEN_NAMES` if a real pair isn't flagged; still-open date work (Spanish months, 58–72 band).
 ---
 ## 2026-05-25 (session 4) — Built the normalizer (subagent-driven, all 17 tasks)
 **Did:** Executed the plan subagent-driven (implementer + spec review + code-quality review per
 task). The tool `tools/import-normalizer/` is **complete and passing (57 tests)**. Final
 opus review: **READY** — determinism verified on the real corpus (two runs → identical cell
 matrices + byte-identical review files), zero silent drops.
 **Per-task code review caught & fixed real issues** (all in the committed code): leading
 qualifiers `nach/vor/…` now → APPROX; English month-first matcher hardened to structurally
 not shadow `Mai 1895`; person-id collision de-dup suffixes *all* members; `split_receivers`
 returns `[]` for a `geb.`-only cell; boolean cells no longer coerced to `1/0`; duplicate-index
 flags every occurrence; provisional ids never steal a register id; CSV-injection defanged.
 **REAL DRY-RUN** (`python normalize.py` over the actual archive — outputs are gitignored):
 - documents_emitted **7,582** (+225 empty +93 blank-index +42 x-suffix = 7,942 rows read, 0 dropped)
 - register_persons **163**, provisional_persons **942**
 - dates: DAY 6,509 / MONTH 36 / RANGE 36 / APPROX 28 / YEAR 17 / SEASON 1 / UNKNOWN 955
 - **unknown_date_rate 9.2%** (of dated rows; target ≤5% pre-override, ≤0.5% after overrides)
 - duplicate_index 85, index_file_mismatches 550, ambiguous_receivers 303
 **⚠️ Concurrency incident:** a parallel Claude session committed reader-dashboard work to this
 branch and hard-reset it mid-execution, deleting the Task 15 files and orphaning a commit.
 Recovered via reflog (`reset --hard 366b4848` + `checkout 401160e3 -- <task15 files>`); no code
 lost. Casualty: my *during-execution* edits to the plan/spec docs (02/03) for Tasks 5–14 were
 discarded — **the committed code + tests are the source of truth**, not the plan doc, which now
 reflects the pre-execution + persona-review version.
 **Next steps (iterative refinement — the overrides loop, as designed):**
 1. Shave the 9.2% UNKNOWN cheaply: add **Spanish month names** (Enero…Diciembre) and the
   `Mon DD-YYYY` dash form to `config.MONTHS`/the parser (Mexican-branch correspondence);
   revisit the 58–72 two-digit-year band (real `…58/59/60` dates = 1958–1960, just past the
   1873–1957 window — decide whether to extend the upper bound in `config`).
 2. `?` (99×) is genuinely "date unknown" — leave UNKNOWN or add a convention.
 3. Populate `overrides/dates.csv` + `overrides/names.csv` from the review CSVs and re-run.
 4. README note: a leading `'`/`!` in a `review/*.csv` `raw` cell may be a CSV-defang artifact —
   match against the true source value when writing overrides.
 5. Phase 2 (separate spec): wire the canonical contract into the Java `MassImportService`.
 ---
 ## 2026-05-25 (session 3) — Implementation plan + persona review
 **Did:**
 - Wrote [`03-normalizer-implementation-plan.md`](./03-normalizer-implementation-plan.md): 17
  bite-sized TDD tasks for `tools/import-normalizer/` (Python, openpyxl), bottom-up — date
  parser w/ Easter computus first, then persons/alias, ingest, mapping, orchestrator, writers.
 - Ran a 6-persona inline review (architect, developer, tester, req-engineer, security, devops;
  ui-expert too) via parallel agents. Acted on all material findings.
 **Key fixes from review (see plan §"Review feedback incorporated"):**
 - Idempotency redefined byte-identical → **content-deterministic** (spec G4/NFR-IDEM-01);
  pinned workbook timestamps + deterministic alias ordering + a real two-run equality test.
 - Real bug: duplicate-index only reported repeats → now flags/reports every occurrence.
 - Provisional `person_id` could overwrite a register id → now suffixed.
 - Date parser gaps: invalid-calendar-date → UNKNOWN, intra-month day-range (`7./8. Sept.1923`).
 - Multi-person sender now split + flagged (REQ-PERS-01); CSV-injection defanged in review files;
  pinned deps + hardened root `.gitignore`.
 **Next:**
 - Marcel reviews the plan. Then execute it (subagent-driven or inline) — the date parser
  (Task 3/8 + Easter computus) is the meatiest piece.
 ---
 ## 2026-05-25 (session 2) — Strategy + normalizer spec
 **Did:**
 - Decided strategy with Marcel: **normalize the raw sheets first**, then import (higher
  leverage than making the Java importer tolerate every mess).
 - Locked design decisions (see spec §3): new canonical layout; dates = parsed + raw +
  precision; include person register + dedup in this effort; overrides-file + re-run loop;
  Python tool at `tools/import-normalizer/`.
 - Century rule fixed by Marcel: archive spans **1873–1957**; 2-digit `00–57`→19YY,
  `73–99`→18YY, `58–72`→flag; 3-digit→1DDD; never 20xx.
 - Wrote [`02-normalization-spec.md`](./02-normalization-spec.md) in the requirements-engineer
  persona (FR/NFR, Given-When-Then ACs, traceability to IMP-NN, TBD register).
 **All 6 open questions resolved (spec §9):** OQ-01 — movable feasts (Ostern, Pfingsten, …)
 **computed per year from Easter**, never a fixed month; seasons → mid-season month
 (Sommer=Jul, Herbst=Oct). OQ-02 ranges → start+RANGE. OQ-03 slug ids. OQ-04 — `x`-suffix rows
 **skipped + logged** this pass (they're transcriptions of the base letter, not yet mappable).
 OQ-05 → `.xlsx`. OQ-06 → conservative, no silent merge.
 **Git:** moved off the unrelated `feat/issue-356-…` branch; pulled `main`; created clean
 branch **`docs/import-migration`** and committed these docs there. (The dirty `.venv`
 pycache + `skills/implement/SKILL.md` in the tree are pre-existing/environmental noise — left
 uncommitted, not ours.)
 **Next:**
 - Marcel reviews the spec.
 - Then writing-plans → build the normalizer at `tools/import-normalizer/` (backlog B1–B7 are
  the Musts; B3 date parser incl. Easter computus is the big one).
 ---
 ## 2026-05-25 (session 1) — Initial analysis
 **Did:**
 - Got the real raw archive xlsx (7,943 rows) + person register (163 people). PDFs to follow.
 - Compared the new xlsx layout against `MassImportService` defaults and the old ODS.
 - Full statistical scan of all rows: dates, indices, senders/receivers, file column.
 - Wrote [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md)
  with 12 issues (IMP-01..IMP-12) + recommended sequencing.
 - Installed `openpyxl` into the OCR service venv for inspection.
 **Key facts established:**
 - Importer defaults match the **ODS**, not the new xlsx → wrong column mapping (IMP-01).
 - **90%** of dated rows (6,571 / 7,319) are free-text dates the ISO-only parser drops (IMP-02).
 - Person register is rich but **unimported**; holds the maiden-name dedup key (IMP-04/05).
 **Decisions pending from Marcel (blockers for any code work):**
 1. IMP-01: positional re-config of `app.import.col.*` vs header-driven mapping rewrite?
 2. IMP-02: how to store imprecise dates — new `dateOriginal` + `precision` columns, or lossy?
 3. IMP-04/05: format for the person/alias mapping; import persons before documents?
 4. IMP-10: are `x`-suffix rows separate documents, attachments, or skipped?
 **Next:**
 - Get Marcel's calls on the 4 decisions above.
 - Then split the code-change items into Gitea issues (IMP-01b, IMP-02, IMP-04, IMP-06, IMP-12).
 - Pure-data tasks (IMP-07 dup list, IMP-09 file reconcile) stay here.
--- a/docs/superpowers/plans/2026-05-25-personendatei-importer.md
+++ b/docs/superpowers/plans/2026-05-25-personendatei-importer.md
--- a/docs/superpowers/specs/2026-05-25-personendatei-importer-design.md
+++ b/docs/superpowers/specs/2026-05-25-personendatei-importer-design.md
@@ -0,0 +1,292 @@
 # Personendatei Importer — Design Spec
 **Date:** 2026-05-25
 **Source file:** `import/Personendatei 2.xlsx`
 **Output:** `tools/import-normalizer/out/canonical-persons-tree.json`
 **Tool location:** `tools/import-normalizer/persons_tree.py`
 ---
 ## 1. Purpose
 Normalize the 163-person family register in `Personendatei 2.xlsx` into a machine-readable JSON file that a future backend importer can consume to seed the `persons` and `person_relationships` tables. The tool is offline (no backend required) and produces a reviewable artifact with an explicit `unresolved[]` list for manual follow-up.
 ---
 ## 2. Source Data — Column Map
 Sheet: `Tabelle1` (rows 2–164; row 1 is the header).
 | Col | Header | Content | Notes |
 |-----|--------|---------|-------|
 | A | Generation | `G 0`–`G 5` | Generation relative to Herbert & Clara Cram (G 2). Inconsistent formatting: `"G3"`, `"G  0"`, `"G 2         de Gruyter"` — strip non-digit chars and parse the integer. |
 | B | Familienname | Last name | Sometimes compound: `"de Gruyter"`, `"Cram Heydrich"`, `"Burkhard-  Meier"` |
 | C | Vorname | First name | Sometimes multiple: `"Charlotte,Meta,Jacobi"`, nicknames in parens: `"Otto (Herbert)"` |
 | D | geb als | Maiden name | Used as a name alias for matching |
 | E | Geburtsdatum | Birth date | **Mixed types** — see §4 |
 | F | Geburtsort | Birth place | Free-text string, stored verbatim |
 | G | Todesdatum | Death date | Same mixed types as col E |
 | H | Sterbeort | Death place | Free-text string, stored verbatim |
 | I | verheiratet mit | Spouse name | Partial name in either `"Firstname Lastname"` or `"Lastname Firstname"` order |
 | J | Bemerkung | German relationship notes | `"Sohn v Clara u Herbert"`, `"Nichte v Herbert"`, free text |
 ---
 ## 3. Two-Pass Architecture
 ### Pass 1 — Parse & Normalize (rows → person records)
 For each row:
 1. Read all 10 columns.
 2. Assign a stable `rowId`: `"row_{i:03d}"` where `i` is the 1-based row number (e.g. `row_002`).
 3. Normalize fields per §4 and §5.
 4. Build the **name-lookup index** (see §6).
 5. Emit a person record.
 ### Pass 2 — Resolve Relationships
 Walk every person record:
 1. Resolve col I (spouse) → emit `SPOUSE_OF` edge or `unresolved` entry.
 2. Parse col J (Bemerkung) for parent/child patterns → emit `PARENT_OF` edges or `unresolved` entries.
 3. Append unmatched Bemerkung text to `person.notes`.
 ---
 ## 4. Date Parsing
 Both col E (birth) and col G (death) arrive as either an Excel numeric serial or a string.
 ### Excel serial conversion
 When the cell value is numeric (an `int` or a `float`, i.e. not a string):
 ```
 date = datetime(1899, 12, 30) + timedelta(days=int(value))
 year = date.year
 ```
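 The base date 1899-12-30 (rather than 1899-12-31) compensates for Excel's phantom 1900-02-29, inherited from the Lotus 1-2-3 leap-year bug. The conversion is therefore exact for serials ≥ 61 (dates from 1900-03-01 onward); smaller serials come out one day early.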
 ### String fallback — reuse existing `dates.parse_date()`
 Pass the raw string to the existing `tools/import-normalizer/dates.parse_date()`. It already handles:
 - `DD.MM.YYYY` and `D.M.YY`
 - Year-only (`1930`)
 - Month + year (`August 1941`, `Sept. 1913`)
 - Partial/approximate markers
 Extract `.year` from the returned `ParsedDate.iso` if `iso` is not `None`.
 ### Unresolvable dates
 If both paths yield `None` (e.g. `"2.9.196"`, `"4.3.1023"`, `".12.1955"`):
 - Set `birthYear`/`deathYear` to `null`.
 - Append the raw value to `person.notes` as `"[Geburtsdatum: <raw>]"` or `"[Todesdatum: <raw>]"` for human review.
 ---
 ## 5. Person Record Normalization
 ### Name fields
 - **lastName** = col B, stripped.
 - **firstName** = col C. Keep as-is (including multi-name strings and parenthetical nicknames) — the backend can split later.
 - **maidenName** = col D, stripped. Stored in the JSON; the backend maps this to a `PersonNameAlias` of type `BIRTH_NAME`.
 - **alias** = `null` (the tool does not invent aliases; maiden name is the alias).
 ### Generation
 Extract the first digit sequence from col A:
 ```python
 import re
 m = re.search(r"\d+", raw_generation)
 generation = int(m.group()) if m else None
 ```
 Handles all observed variants: `"G 3"`, `"G3"`, `"G  0"`, `"G 2         de Gruyter"`.
 Stored as `generation: int | null` in the JSON (informational; not mapped to a backend field directly).
 ### familyMember
 Set `true` for all records. Every person in this register is part of the family network. The backend can refine this.
 ### notes
 Constructed by concatenation:
 1. Unmatched Bemerkung text (after relationship pattern is stripped).
 2. Unresolvable date raw values (prefixed with field name).
 ---
 ## 6. Name Lookup Index
 After pass 1, build a `dict[str, list[str]]` mapping normalized name keys → list of `rowId`s.
 ### Normalization function `_norm(s) -> str`
 1. Lowercase.
 2. Strip surrounding `"` and `'`.
 3. Remove parenthetical substrings: `r"\([^)]*\)"`.
 4. Collapse internal whitespace.
 5. Strip geographic/generational suffixes: `aachen`, `mex.`, `mexiko`, `sen`, `jun`, `jr`.
 6. Strip trailing commas, dots.
 ### Keys indexed per person
 For a person with firstName `F`, lastName `L`, maidenName `M`:
 - `_norm(f"{F} {L}")` — canonical order
 - `_norm(f"{L} {F}")` — reversed order (col I uses this heavily)
 - `_norm(f"{F} {M}")` if maidenName is set — maiden-name reference
 - `_norm(L)` alone — single-token fallback
 ### Match resolution
 Given a raw name string from col I or col J:
 1. `_norm(raw)` → look up in index.
 2. **Exactly one hit** → match confirmed, use that `rowId`.
 3. **Zero hits** → `reason: "not_found"` → `unresolved[]`.
 4. **Multiple hits** → `reason: "ambiguous"` → `unresolved[]`.
 ---
 ## 7. Relationship Extraction
 ### 7.1 SPOUSE_OF (col I — `verheiratet mit`)
 1. Normalize col I value.
 2. Resolve via name index (§6).
 3. If matched: emit one edge `{ personId, relatedPersonId, type: "SPOUSE_OF", source: "verheiratet_mit" }`.
   - Skip if an identical edge (regardless of direction) already exists in the relationship list.
 4. If unresolved: add to `unresolved[]`.
 ### 7.2 PARENT_OF (col J — `Bemerkung`)
 Apply these regex patterns in order, case-insensitive, with optional whitespace:
 | Pattern | Direction | Note |
 |---------|-----------|------|
 | `(Sohn\|Tochter)\s+v(?:on)?\s+(.+)` | Named person(s) → this person | "Sohn v Clara u Herbert" |
 | `(Vater\|Mutter)\s+v(?:on)?\s+(.+)` | This person → named person(s) | "Vater v Herbert" |
 **Multi-parent extraction:** The parent string may contain two parents joined by `\s+u(?:nd)?\s+`. Split on this pattern, resolve each part independently.
 **Emit** one `PARENT_OF` edge per resolved parent:
 ```json
 {
  "personId": "<parent_rowId>",
  "relatedPersonId": "<child_rowId>",
  "type": "PARENT_OF",
  "source": "bemerkung",
  "rawBemerkung": "<original col J value>"
 }
 ```
 **Skip** (do not emit, do not add to `unresolved[]`, leave in notes):
 - Patterns starting with `Neffe`, `Nichte`, `Enkel`, `Enkelin`, `Urenkel`, `Urenkelin` — too indirect.
 - Patterns starting with `Bruder`, `Schwester` — SIBLING_OF is out of scope for this tool.
 - Any other Bemerkung text that does not match the parent patterns.
 **After extraction:** the matched portion of the Bemerkung is removed; the remainder goes into `person.notes`.
 ---
 ## 8. Output JSON Schema
 File: `tools/import-normalizer/out/canonical-persons-tree.json`
 ```json
 {
  "generated_at": "<ISO-8601 timestamp>",
  "source": "Personendatei 2.xlsx",
  "stats": {
    "persons": 163,
    "relationships": 87,
    "unresolved": 12
  },
  "persons": [
    {
      "rowId": "row_002",
      "firstName": "Elsgard",
      "lastName": "Allemeyer",
      "maidenName": "Wöhler",
      "alias": null,
      "notes": "Nichte von Herbert",
      "birthYear": 1920,
      "deathYear": 1999,
      "birthPlace": "Garz",
      "deathPlace": "Espelkamp",
      "generation": 3,
      "familyMember": true
    }
  ],
  "relationships": [
    {
      "personId": "row_002",
      "relatedPersonId": "row_003",
      "type": "SPOUSE_OF",
      "source": "verheiratet_mit"
    },
    {
      "personId": "row_019",
      "relatedPersonId": "row_021",
      "type": "PARENT_OF",
      "source": "bemerkung",
      "rawBemerkung": "Tochter v Clara u Herbert"
    }
  ],
  "unresolved": [
    {
      "rowId": "row_007",
      "field": "verheiratet_mit",
      "raw": "\"Tante Lolly\"",
      "reason": "not_found"
    },
    {
      "rowId": "row_042",
      "field": "bemerkung",
      "raw": "Sohn v Lolly",
      "reason": "not_found"
    }
  ]
 }
 ```
 ---
 ## 9. CLI Interface
 ```
 python3 persons_tree.py [--input PATH] [--output PATH] [--dry-run]
 ```
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--input` | `../../import/Personendatei 2.xlsx` | Source Excel file |
 | `--output` | `out/canonical-persons-tree.json` | Output JSON file |
 | `--dry-run` | off | Print stats + first 5 unresolved entries; do not write file |
 On success, print:
 ```
 ✓ 163 persons parsed
 ✓ 87 relationships emitted (52 SPOUSE_OF, 35 PARENT_OF)
 ⚠  12 unresolved (see unresolved[] in output)
 →  out/canonical-persons-tree.json
 ```
 ---
 ## 10. Module Reuse
 | Existing module | What we reuse |
 |-----------------|---------------|
 | `dates.parse_date()` | String date parsing — handles DD.MM.YYYY, year-only, month+year, approximate markers |
 | `config.MONTHS` | Month name → integer mapping (German + Spanish month names already present) |
 The Excel serial conversion is new logic added directly in `persons_tree.py` (3 lines).
 ---
 ## 11. What This Tool Does NOT Do
 - Does not call the backend API or touch the database.
 - Does not create `PersonNameAlias` records — it emits `maidenName` as a field; the future backend importer maps it.
 - Does not infer SIBLING_OF edges (requires symmetric lookup across multiple rows — deferred).
 - Does not deduplicate persons that appear in both this file and `canonical-persons.xlsx` — deduplication is the backend importer's responsibility.
 - Does not map `birthPlace` / `deathPlace` to backend columns. It emits them as top-level fields in the JSON (see §8) — they are free-text strings and informational only. The `Person` entity has no corresponding columns; the future backend importer decides whether to add columns or fold the values into `notes`.
 ---
 ## 12. Resolved Decisions
 | OQ | Question | Decision |
 |----|----------|----------|
 | OQ-01 | Duplicate rows (127/138 — Christa Schütz; 129/139 — Christoph Seils). | **Tool deduplicates.** On pass 1, after building the person list, detect rows with identical `(firstName, lastName, birthYear)` and keep only the first occurrence. Log skipped row ids to stdout. |
 | OQ-02 | `birthPlace` / `deathPlace` absent from `Person` entity. | **Keep as separate top-level fields** in the JSON (`birthPlace`, `deathPlace`). The future backend importer may add columns to the `persons` table; the field is preserved here to avoid data loss. |
 | OQ-03 | `firstName` = `"Charlotte,Meta,Jacobi"` (multi-name comma string). | **Store verbatim as `firstName`.** No splitting. |
--- a/frontend/messages/de.json
+++ b/frontend/messages/de.json
@@ -1084,10 +1084,5 @@
 	"timeline_dragging_aria_live": "Zeitraum {from} bis {to} ausgewählt",
 	"error_page_id_label": "Fehler-ID",
 	"error_copy_id_label": "ID kopieren",
-	"error_copied": "Kopiert!",
+	"error_copied": "Kopiert!"
 	"themen_widget_title": "Themen",
 	"themen_alle": "Alle Themen",
 	"themen_leer": "Noch keine Themen vergeben.",
 	"themen_weitere": "+ {count} weitere",
 	"themen_dokumente": "{count} Dokumente"
 }
--- a/frontend/messages/en.json
+++ b/frontend/messages/en.json
@@ -1084,10 +1084,5 @@
 	"timeline_dragging_aria_live": "Range {from} to {to} selected",
 	"error_page_id_label": "Error ID",
 	"error_copy_id_label": "Copy ID",
-	"error_copied": "Copied!",
+	"error_copied": "Copied!"
 	"themen_widget_title": "Topics",
 	"themen_alle": "All Topics",
 	"themen_leer": "No topics assigned yet.",
 	"themen_weitere": "+ {count} more",
 	"themen_dokumente": "{count} documents"
 }
--- a/frontend/messages/es.json
+++ b/frontend/messages/es.json
@@ -1084,10 +1084,5 @@
 	"timeline_dragging_aria_live": "Rango {from} a {to} seleccionado",
 	"error_page_id_label": "ID de error",
 	"error_copy_id_label": "Copiar ID",
-	"error_copied": "¡Copiado!",
+	"error_copied": "¡Copiado!"
 	"themen_widget_title": "Temas",
 	"themen_alle": "Todos los temas",
 	"themen_leer": "Aún no hay temas.",
 	"themen_weitere": "+ {count} más",
 	"themen_dokumente": "{count} documentos"
 }
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -23,9 +23,9 @@
 				"@eslint/compat": "^1.4.0",
 				"@eslint/js": "^9.39.1",
 				"@inlang/paraglide-js": "^2.5.0",
-				"@playwright/test": "^1.60.0",
+				"@playwright/test": "^1.58.2",
-				"@sveltejs/adapter-node": "^5.5.4",
+				"@sveltejs/adapter-node": "^5.4.0",
-				"@sveltejs/kit": "^2.60.1",
+				"@sveltejs/kit": "^2.48.5",
 				"@sveltejs/vite-plugin-svelte": "^6.2.1",
 				"@tailwindcss/forms": "^0.5.10",
 				"@tailwindcss/typography": "^0.5.19",
@@ -43,7 +43,7 @@
 				"globals": "^16.5.0",
 				"openapi-typescript": "^7.8.0",
 				"patch-package": "^8.0.0",
-				"playwright": "^1.60.0",
+				"playwright": "^1.56.1",
 				"prettier": "^3.6.2",
 				"prettier-plugin-svelte": "^3.4.0",
 				"prettier-plugin-tailwindcss": "^0.7.1",
@@ -52,7 +52,7 @@
 				"tailwindcss": "^4.1.17",
 				"typescript": "^5.9.3",
 				"typescript-eslint": "^8.47.0",
-				"vite": "^7.3.3",
+				"vite": "^7.2.2",
 				"vite-plugin-devtools-json": "^1.0.0",
 				"vitest": "^4.0.10",
 				"vitest-browser-svelte": "^2.0.1"
--- a/frontend/src/lib/generated/api.ts
+++ b/frontend/src/lib/generated/api.ts
@@ -2205,10 +2205,10 @@ export interface components {
            totalStories: number;
        };
        PersonSummaryDTO: {
            title?: string;
            /** Format: uuid */
            id?: string;
            displayName?: string;
            title?: string;
            firstName?: string;
            lastName?: string;
            /** Format: int64 */
@@ -2315,6 +2315,8 @@ export interface components {
            /** Format: int32 */
            totalPages?: number;
            pageable?: components["schemas"]["PageableObject"];
            first?: boolean;
            last?: boolean;
            /** Format: int32 */
            size?: number;
            content?: components["schemas"]["NotificationDTO"][];
@@ -2323,8 +2325,6 @@ export interface components {
            sort?: components["schemas"]["SortObject"];
            /** Format: int32 */
            numberOfElements?: number;
            first?: boolean;
            last?: boolean;
            empty?: boolean;
        };
        PageableObject: {
@@ -2407,10 +2407,6 @@ export interface components {
            completionPercentage: number;
            contributors: components["schemas"]["ActivityActorDTO"][];
            matchData: components["schemas"]["SearchMatchData"];
            /** Format: date-time */
            createdAt: string;
            /** Format: date-time */
            updatedAt: string;
        };
        DocumentSearchResult: {
            items: components["schemas"]["DocumentListItem"][];
--- a/frontend/src/lib/shared/dashboard/ReaderRecentDocs.svelte
+++ b/frontend/src/lib/shared/dashboard/ReaderRecentDocs.svelte
@@ -3,16 +3,16 @@ import * as m from '$lib/paraglide/messages.js';
 import { relativeTimeDe } from '$lib/shared/relativeTime';
 import type { components } from '$lib/generated/api';
-type DocumentListItem = components['schemas']['DocumentListItem'];
+type Document = components['schemas']['Document'];
 interface Props {
-	documents: DocumentListItem[];
+	documents: Document[];
 }
 const { documents }: Props = $props();
-function isNew(doc: DocumentListItem): boolean {
+function isNew(doc: Document): boolean {
-	return new Date(doc.createdAt).getTime() > Date.now() - 7 * 24 * 60 * 60 * 1000;
+	return new Date(doc.createdAt).getTime() === new Date(doc.updatedAt).getTime();
 }
 </script>
--- a/frontend/src/lib/shared/dashboard/ReaderRecentDocs.svelte.spec.ts
+++ b/frontend/src/lib/shared/dashboard/ReaderRecentDocs.svelte.spec.ts
@@ -5,33 +5,24 @@ import { page } from 'vitest/browser';
 import ReaderRecentDocs from './ReaderRecentDocs.svelte';
 import type { components } from '$lib/generated/api';
-type DocumentListItem = components['schemas']['DocumentListItem'];
+type Document = components['schemas']['Document'];
 afterEach(() => {
 	cleanup();
 });
-const baseDoc: DocumentListItem = {
+const baseDoc: Document = {
 	id: 'doc1',
 	title: 'Brief an Hans',
 	originalFilename: 'brief.pdf',
-	completionPercentage: 0,
+	status: 'UPLOADED',
-	receivers: [],
+	metadataComplete: true,
-	tags: [],
+	scriptType: 'HANDWRITING_KURRENT',
 	contributors: [],
 	matchData: {
 		titleOffsets: [],
 		senderMatched: false,
 		matchedReceiverIds: [],
 		matchedTagIds: [],
 		snippetOffsets: [],
 		summaryOffsets: []
 	},
 	createdAt: '2025-01-01T12:00:00Z',
 	updatedAt: '2025-01-01T12:00:00Z'
 };
-const updatedDoc: DocumentListItem = {
+const updatedDoc: Document = {
 	...baseDoc,
 	id: 'doc2',
 	title: 'Urkunde 1920',
@@ -97,14 +88,8 @@ describe('ReaderRecentDocs', () => {
 		expect(thumb!.className).toMatch(/rounded-/);
 	});
-	it('shows "Neu" accent-pill badge when document was created within the last 7 days', async () => {
+	it('shows "Neu" accent-pill badge when createdAt equals updatedAt', async () => {
-		const recentDoc: DocumentListItem = {
+		render(ReaderRecentDocs, { documents: [baseDoc] });
 			...baseDoc,
 			id: 'doc-recent',
 			createdAt: new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString(),
 			updatedAt: new Date(Date.now() - 1 * 24 * 60 * 60 * 1000).toISOString()
 		};
 		render(ReaderRecentDocs, { documents: [recentDoc] });
 		const badge = page.getByText(/^Neu$/i);
 		await expect.element(badge).toBeInTheDocument();
 		const cls = ((await badge.element()) as HTMLElement).className;
@@ -113,7 +98,7 @@ describe('ReaderRecentDocs', () => {
 		expect(cls).toMatch(/\btext-ink\b/);
 	});
-	it('shows no badge when document was created more than 7 days ago', async () => {
+	it('shows no badge when updatedAt differs from createdAt', async () => {
 		render(ReaderRecentDocs, { documents: [updatedDoc] });
 		const badge = page.getByText(/^Neu$/i);
 		await expect.element(badge).not.toBeInTheDocument();
@@ -121,20 +106,20 @@ describe('ReaderRecentDocs', () => {
 		await expect.element(updatedBadge).not.toBeInTheDocument();
 	});
-	it('shows "Neu" badge when document was created 6 days ago', async () => {
+	it('shows "Neu" badge when createdAt and updatedAt represent the same instant in different ISO formats', async () => {
-		const almostOldDoc: DocumentListItem = {
+		const sameInstantDoc: Document = {
 			...baseDoc,
-			id: 'doc-almost-old',
+			id: 'doc-same-instant',
-			createdAt: new Date(Date.now() - 6 * 24 * 60 * 60 * 1000).toISOString(),
+			createdAt: '2025-01-01T12:00:00Z',
-			updatedAt: new Date(Date.now() - 5 * 24 * 60 * 60 * 1000).toISOString()
+			updatedAt: '2025-01-01T12:00:00.000Z'
 		};
-		render(ReaderRecentDocs, { documents: [almostOldDoc] });
+		render(ReaderRecentDocs, { documents: [sameInstantDoc] });
 		const badge = page.getByText(/^Neu$/i);
 		await expect.element(badge).toBeInTheDocument();
 	});
 	it('renders sender name text when sender is present', async () => {
-		const docWithSender: DocumentListItem = {
+		const docWithSender: Document = {
 			...baseDoc,
 			sender: {
 				id: 'p1',
--- a/frontend/src/lib/shared/dashboard/ReaderRecentDocs.svelte.test.ts
+++ b/frontend/src/lib/shared/dashboard/ReaderRecentDocs.svelte.test.ts
@@ -31,25 +31,25 @@ describe('ReaderRecentDocs', () => {
 			.toHaveAttribute('href', '/documents');
 	});
-	it('renders the New badge when document was created within the last 7 days', async () => {
+	it('renders the New badge when createdAt equals updatedAt', async () => {
 		const recentDate = new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString();
 		const laterUpdate = new Date(Date.now() - 1 * 24 * 60 * 60 * 1000).toISOString();
 		render(ReaderRecentDocs, {
 			props: {
-				documents: [makeDoc({ createdAt: recentDate, updatedAt: laterUpdate })]
+				documents: [
 					makeDoc({ createdAt: '2026-04-15T10:00:00Z', updatedAt: '2026-04-15T10:00:00Z' })
 				]
 			}
 		});
 		await expect.element(page.getByText('Neu')).toBeVisible();
 	});
-	it('hides the New badge when document was created more than 7 days ago', async () => {
+	it('hides the New badge when document was updated after creation', async () => {
 		render(ReaderRecentDocs, {
 			props: {
 				documents: [
 					makeDoc({
 						createdAt: '2026-04-15T10:00:00Z',
-						updatedAt: '2026-04-15T10:00:00Z'
+						updatedAt: '2026-04-15T11:00:00Z'
 					})
 				]
 			}
--- a/frontend/src/lib/shared/dashboard/ThemenWidget.svelte
+++ b/frontend/src/lib/shared/dashboard/ThemenWidget.svelte
@@ -1,67 +0,0 @@
 <script lang="ts">
 import * as m from '$lib/paraglide/messages.js';
 import type { components } from '$lib/generated/api';
 import { hasAnyDocuments } from '$lib/shared/utils/tagUtils';
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 interface Props {
 	tags: TagTreeNodeDTO[];
 	compact?: boolean;
 }
 const MAX_VISIBLE_TAGS = 6;
 const { tags, compact = false }: Props = $props();
 const visibleTags = $derived.by(() => tags.filter(hasAnyDocuments));
 const shownTags = $derived(visibleTags.slice(0, MAX_VISIBLE_TAGS));
 </script>
 <section class="rounded-sm border border-line bg-surface p-5 shadow-sm">
 	<div class="mb-4 flex items-center justify-between">
 		<h2 class="font-sans text-xs font-bold tracking-widest text-ink-3 uppercase">
 			{m.themen_widget_title()}
 		</h2>
 		<a
 			href="/themen"
 			class="flex min-h-[44px] items-center text-[11px] font-semibold text-ink-2 no-underline focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none"
 		>
 			{m.themen_alle()} →
 		</a>
 	</div>
 	{#if visibleTags.length === 0}
 		<p class="font-sans text-sm text-ink-3">{m.themen_leer()}</p>
 	{:else}
 		<div
 			class="grid gap-2 {compact ? 'grid-cols-1' : 'grid-cols-1 sm:grid-cols-2'}"
 			data-compact={compact}
 		>
 			{#each shownTags as tag (tag.id)}
 				<a
 					href="/documents?tag={encodeURIComponent(tag.name)}"
 					aria-label="{tag.name}{tag.documentCount > 0
 						? ', ' + m.themen_dokumente({ count: tag.documentCount })
 						: ''}"
 					class="flex cursor-pointer items-stretch overflow-hidden rounded-sm border border-line bg-canvas hover:bg-surface focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none"
 					style="min-height: 56px"
 				>
 					<span
 						class="w-1 flex-shrink-0 self-stretch"
 						aria-hidden="true"
 						style="background: var(--c-tag-{tag.color ?? 'slate'})"
 					></span>
 					<span class="flex min-w-0 flex-1 flex-col justify-center gap-0.5 px-3 py-3">
 						<span class="truncate font-serif text-sm font-semibold text-ink">{tag.name}</span>
 						{#if tag.documentCount > 0}
 							<span class="font-sans text-xs text-ink-3 tabular-nums">
 								{m.themen_dokumente({ count: tag.documentCount })}
 							</span>
 						{/if}
 					</span>
 				</a>
 			{/each}
 		</div>
 	{/if}
 </section>
--- a/frontend/src/lib/shared/dashboard/ThemenWidget.svelte.spec.ts
+++ b/frontend/src/lib/shared/dashboard/ThemenWidget.svelte.spec.ts
@@ -1,58 +0,0 @@
 import { describe, it, expect, afterEach } from 'vitest';
 import { cleanup, render } from 'vitest-browser-svelte';
 import ThemenWidget from './ThemenWidget.svelte';
 import type { components } from '$lib/generated/api';
 afterEach(() => {
 	cleanup();
 });
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 /** Test fixture: builds a tag-tree node whose id is derived from its name. */
 function makeTag(
 	name: string,
 	documentCount: number,
 	children: TagTreeNodeDTO[] = []
 ): TagTreeNodeDTO {
 	const node: TagTreeNodeDTO = {
 		id: `id-${name}`,
 		name,
 		documentCount,
 		children
 	};
 	return node;
 }
 // Browser-rendered component tests for the dashboard tag widget.
 describe('ThemenWidget', () => {
 	it('renders a card link per visible tag', async () => {
 		const tags = [makeTag('Briefe', 5), makeTag('Fotos', 3)];
 		const { getByRole } = render(ThemenWidget, { tags });
 		await expect.element(getByRole('link', { name: /Briefe/ })).toBeInTheDocument();
 		await expect.element(getByRole('link', { name: /Fotos/ })).toBeInTheDocument();
 	});
 	it('hides tags where no document exists in the subtree', async () => {
 		// 'Leer' has documentCount 0 and no children, so it must not be rendered.
 		const tags = [makeTag('Briefe', 5), makeTag('Leer', 0)];
 		render(ThemenWidget, { tags });
 		expect(document.body.textContent).toContain('Briefe');
 		expect(document.body.textContent).not.toContain('Leer');
 	});
 	it('shows the empty state text when all tags are filtered out', async () => {
 		render(ThemenWidget, { tags: [makeTag('Leer', 0)] });
 		// Matches the German empty-state message shown instead of the grid.
 		expect(document.body.textContent).toMatch(/Noch keine Themen/);
 	});
 	it('shows empty state when tags array is empty', async () => {
 		render(ThemenWidget, { tags: [] });
 		expect(document.body.textContent).toMatch(/Noch keine Themen/);
 	});
 	it('renders in compact single-column mode when compact prop is true', async () => {
 		const tags = [makeTag('Briefe', 5)];
 		const { container } = render(ThemenWidget, { tags, compact: true });
 		// The grid mirrors the prop in a data-compact attribute, which is what we probe.
 		const grid = container.querySelector('[data-compact="true"]');
 		expect(grid).not.toBeNull();
 	});
 	it('links to "Alle Themen" page', async () => {
 		const tags = [makeTag('Briefe', 5)];
 		const { getByRole } = render(ThemenWidget, { tags });
 		const link = getByRole('link', { name: /Alle Themen/ });
 		await expect.element(link).toHaveAttribute('href', '/themen');
 	});
 });
--- a/frontend/src/lib/shared/discussion/PersonMentionEditor.svelte.spec.ts
+++ b/frontend/src/lib/shared/discussion/PersonMentionEditor.svelte.spec.ts
@@ -409,24 +409,19 @@ describe('PersonMentionEditor — onExit cancels pending debounce', () => {
 		await new Promise((r) => setTimeout(r, SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS));
 		const fetchesBeforeEscape = fetchMock.mock.calls.length;
-		// Freeze setTimeout so the 150 ms debounce cannot fire before Escape
+		// Trigger a new debounced search (queues runSearch after 150 ms), then
-		// triggers onExit. We install fake timers only now — after the setup
+		// immediately Escape *while focus is back in the editor* so Tiptap's
-		// above — so that vi.waitFor()'s real-timer polling still worked.
+		// suggestion-plugin Escape handler fires onExit before the debounce.
-		vi.useFakeTimers();
+		// Without onExit cancelling the pending debounce, runSearch executes
-		try {
+		// against the now-unmounted dropdown's state.
 			// fill() dispatches the input event synchronously via CDP; by the
 			// time the await resolves, onSearch('Walter') has run and the fake
 			// debounce timer is set.
 		await page.getByRole('searchbox').fill('Walter');
 		// Focus the editor so the Escape lands on Tiptap's suggestion handler.
 		(page.getByRole('textbox').element() as HTMLElement).focus();
 		await userEvent.keyboard('{Escape}');
-			// onExit has now called debouncedSearch.cancel(). Advance past the
+
-			// debounce window — the cancelled timer must not fire.
+		// Wait past the debounce window. If onExit did not cancel the pending
-			await vi.advanceTimersByTimeAsync(SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS);
+		// debounce, a fetch with q=Walter would still fire here.
-		} finally {
+		await new Promise((r) => setTimeout(r, SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS));
 			vi.useRealTimers();
 		}
 		const newFetches = fetchMock.mock.calls.slice(fetchesBeforeEscape);
 		const walterFetches = newFetches.filter(
--- a/frontend/src/lib/shared/utils/tagUtils.test.ts
+++ b/frontend/src/lib/shared/utils/tagUtils.test.ts
@@ -1,29 +0,0 @@
 import { describe, it, expect } from 'vitest';
 import { hasAnyDocuments } from './tagUtils';
 import type { components } from '$lib/generated/api';
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 /** Test fixture: a node with fixed id and name, so only the count and nesting vary. */
 function makeNode(documentCount: number, children: TagTreeNodeDTO[] = []): TagTreeNodeDTO {
 	const node: TagTreeNodeDTO = { id: 'id', name: 'name', documentCount, children };
 	return node;
 }
 // Covers the node's own count as well as the recursive search through its children.
 describe('hasAnyDocuments', () => {
 	it('returns false for a leaf node with documentCount=0', () => {
 		expect(hasAnyDocuments(makeNode(0))).toBe(false);
 	});
 	it('returns true for a leaf node with documentCount=3', () => {
 		expect(hasAnyDocuments(makeNode(3))).toBe(true);
 	});
 	it('returns true for a root with documentCount=0 but a child with documentCount=5', () => {
 		// The root itself is empty; the positive result must come from the child.
 		const node = makeNode(0, [makeNode(5)]);
 		expect(hasAnyDocuments(node)).toBe(true);
 	});
 	it('returns false for a root with documentCount=0 and all children also 0', () => {
 		const node = makeNode(0, [makeNode(0), makeNode(0)]);
 		expect(hasAnyDocuments(node)).toBe(false);
 	});
 });
--- a/frontend/src/lib/shared/utils/tagUtils.ts
+++ b/frontend/src/lib/shared/utils/tagUtils.ts
@@ -1,7 +0,0 @@
 import type { components } from '$lib/generated/api';
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 /**
  * Reports whether a tag node, or any node below it, has at least one document.
  *
  * A missing `documentCount` is treated as 0 and a missing `children` list as empty.
  */
 export function hasAnyDocuments(node: TagTreeNodeDTO): boolean {
 	const ownCount = node.documentCount ?? 0;
 	if (ownCount > 0) {
 		return true;
 	}
 	// Depth-first search: stop at the first descendant that carries documents.
 	for (const child of node.children ?? []) {
 		if (hasAnyDocuments(child)) {
 			return true;
 		}
 	}
 	return false;
 }
--- a/frontend/src/routes/+page.server.ts
+++ b/frontend/src/routes/+page.server.ts
@@ -10,9 +10,8 @@ type DashboardPulseDTO = components['schemas']['DashboardPulseDTO'];
 type ActivityFeedItemDTO = components['schemas']['ActivityFeedItemDTO'];
 type IncompleteDocumentDTO = components['schemas']['IncompleteDocumentDTO'];
 type PersonSummaryDTO = components['schemas']['PersonSummaryDTO'];
-type DocumentListItem = components['schemas']['DocumentListItem'];
+type Document = components['schemas']['Document'];
 type Geschichte = components['schemas']['Geschichte'];
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 function settled<T>(res: PromiseSettledResult<unknown> | undefined): T | null {
 	if (res?.status !== 'fulfilled') return null;
@@ -41,8 +40,7 @@ export async function load({ fetch, parent }) {
 				api.GET('/api/documents/search', {
 					params: { query: { sort: 'UPDATED_AT', dir: 'DESC', size: 5 } }
 				}),
-				api.GET('/api/geschichten', { params: { query: { status: 'PUBLISHED', limit: 3 } } }),
+				api.GET('/api/geschichten', { params: { query: { status: 'PUBLISHED', limit: 3 } } })
 				api.GET('/api/tags/tree')
 			];
 			if (canBlogWrite) {
 				readerFetches.push(
@@ -50,15 +48,14 @@ export async function load({ fetch, parent }) {
 				);
 			}
-			const [statsRes, topPersonsRes, recentDocsRes, recentStoriesRes, tagTreeRes, draftsRes] =
+			const [statsRes, topPersonsRes, recentDocsRes, recentStoriesRes, draftsRes] =
 				await Promise.allSettled(readerFetches);
 			const readerStats = settled<StatsDTO>(statsRes);
 			const topPersons = settled<PersonSummaryDTO[]>(topPersonsRes) ?? [];
-			const searchData = settled<{ items: DocumentListItem[] }>(recentDocsRes);
+			const searchData = settled<{ items: { document: Document }[] }>(recentDocsRes);
-			const recentDocs = searchData?.items ?? [];
+			const recentDocs = searchData?.items.map((i) => i.document) ?? [];
 			const recentStories = settled<Geschichte[]>(recentStoriesRes) ?? [];
 			const tagTree = settled<TagTreeNodeDTO[]>(tagTreeRes) ?? [];
 			const drafts = settled<Geschichte[]>(draftsRes) ?? [];
 			return {
@@ -68,7 +65,6 @@ export async function load({ fetch, parent }) {
 				topPersons,
 				recentDocs,
 				recentStories,
 				tagTree,
 				drafts,
 				error: null as string | null
 			};
@@ -84,8 +80,7 @@ export async function load({ fetch, parent }) {
 			readyResult,
 			weeklyStatsResult,
 			incompleteResult,
-			incompleteCountResult,
+			incompleteCountResult
 			tagTreeResult
 		] = await Promise.allSettled([
 			api.GET('/api/stats'),
 			api.GET('/api/dashboard/resume'),
@@ -96,8 +91,7 @@ export async function load({ fetch, parent }) {
 			api.GET('/api/transcription/ready-to-read'),
 			api.GET('/api/transcription/weekly-stats'),
 			api.GET('/api/documents/incomplete', { params: { query: { size: 5 } } }),
-			api.GET('/api/documents/incomplete-count'),
+			api.GET('/api/documents/incomplete-count')
 			api.GET('/api/tags/tree')
 		]);
 		let stats: StatsDTO | null = null;
@@ -110,7 +104,6 @@ export async function load({ fetch, parent }) {
 		let weeklyStats: TranscriptionWeeklyStatsDTO | null = null;
 		let incompleteDocs: IncompleteDocumentDTO[] = [];
 		let incompleteTotal = 0;
 		let tagTree: TagTreeNodeDTO[] = [];
 		if (statsResult.status === 'fulfilled' && statsResult.value.response.ok) {
 			stats = statsResult.value.data ?? null;
@@ -142,9 +135,6 @@ export async function load({ fetch, parent }) {
 		if (incompleteCountResult.status === 'fulfilled' && incompleteCountResult.value.response.ok) {
 			incompleteTotal = (incompleteCountResult.value.data?.count as number | undefined) ?? 0;
 		}
 		if (tagTreeResult.status === 'fulfilled' && tagTreeResult.value.response.ok) {
 			tagTree = (tagTreeResult.value.data as TagTreeNodeDTO[]) ?? [];
 		}
 		return {
 			isReader: false as const,
@@ -158,7 +148,6 @@ export async function load({ fetch, parent }) {
 			weeklyStats,
 			incompleteDocs,
 			incompleteTotal,
 			tagTree,
 			error: null as string | null
 		};
 	} catch (e) {
@@ -178,9 +167,8 @@ export async function load({ fetch, parent }) {
 			incompleteTotal: 0,
 			readerStats: null,
 			topPersons: [] as PersonSummaryDTO[],
-			recentDocs: [] as DocumentListItem[],
+			recentDocs: [] as Document[],
 			recentStories: [] as Geschichte[],
 			tagTree: [] as TagTreeNodeDTO[],
 			drafts: [] as Geschichte[],
 			error: 'Daten konnten nicht geladen werden.' as string | null
 		};
--- a/frontend/src/routes/+page.svelte
+++ b/frontend/src/routes/+page.svelte
@@ -10,7 +10,6 @@ import ReaderPersonChips from '$lib/shared/dashboard/ReaderPersonChips.svelte';
 import ReaderDraftsModule from '$lib/shared/dashboard/ReaderDraftsModule.svelte';
 import ReaderRecentDocs from '$lib/shared/dashboard/ReaderRecentDocs.svelte';
 import ReaderRecentStories from '$lib/shared/dashboard/ReaderRecentStories.svelte';
 import ThemenWidget from '$lib/shared/dashboard/ThemenWidget.svelte';
 import { m } from '$lib/paraglide/messages.js';
 let { data } = $props();
@@ -46,8 +45,6 @@ const greetingText = $derived.by(() => {
 			<ReaderPersonChips persons={data.topPersons ?? []} />
 			<ThemenWidget tags={data.tagTree ?? []} />
 			<div class="grid grid-cols-1 gap-1.5 sm:grid-cols-2">
 				<ReaderRecentDocs documents={data.recentDocs ?? []} />
 				<ReaderRecentStories stories={data.recentStories ?? []} />
@@ -59,13 +56,10 @@ const greetingText = $derived.by(() => {
 				<h1 class="font-serif text-[2rem] text-ink">{greetingText}</h1>
 			</div>
 		{/if}
 		<div class="grid grid-cols-1 gap-5 lg:grid-cols-[1fr_320px] lg:items-start">
 			<div class="flex flex-col gap-5">
 				<DashboardResumeStrip resumeDoc={data.resumeDoc ?? null} />
 			<ThemenWidget tags={data.tagTree ?? []} />
 			<div class="grid grid-cols-1 gap-5 lg:grid-cols-[1fr_320px] lg:items-start">
 				<div class="flex flex-col gap-5">
 				<EnrichmentBlock
 					topDocs={data.incompleteDocs ?? []}
 					totalCount={data.incompleteTotal ?? 0}
@@ -94,6 +88,5 @@ const greetingText = $derived.by(() => {
 				{/if}
 			</div>
 		</div>
 		</div>
 	{/if}
 </main>
--- a/frontend/src/routes/page.server.spec.ts
+++ b/frontend/src/routes/page.server.spec.ts
@@ -108,8 +108,7 @@ describe('home page load — dashboard', () => {
 				data: { segmentationCount: 0, transcriptionCount: 0, readyCount: 0 }
 			}) // weekly-stats
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // incomplete
-			.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }) // incomplete-count
+			.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }); // incomplete-count
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }); // tags/tree
 		vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
 			typeof createApiClient
 		>);
@@ -147,8 +146,7 @@ describe('home page load — dashboard', () => {
 				data: { segmentationCount: 0, transcriptionCount: 0, readyCount: 0 }
 			}) // weekly-stats
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // incomplete
-			.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }) // incomplete-count
+			.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }); // incomplete-count
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }); // tags/tree
 		vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
 			typeof createApiClient
 		>);
@@ -396,56 +394,6 @@ describe('home page load — reader branch (isReader = !canWrite && !canAnnotate
 		expect(result.isReader).toBe(false);
 	});
 	it('maps search result items directly to recentDocs without wrapping in a .document property', async () => {
 		const searchItem = {
 			id: 'd1',
 			title: 'Liebesbrief',
 			originalFilename: 'letter.pdf',
 			completionPercentage: 80,
 			receivers: [],
 			tags: [],
 			contributors: [],
 			matchData: { titleOffsets: [], senderMatched: false },
 			createdAt: '2026-05-01T10:00:00Z',
 			updatedAt: '2026-05-10T08:00:00Z'
 		};
 		const mockGet = vi
 			.fn()
 			.mockResolvedValueOnce({ response: { ok: true, status: 200 }, data: [] }) // initial persons
 			.mockResolvedValueOnce({
 				response: { ok: true },
 				data: { totalDocuments: 1, totalPersons: 1 }
 			}) // stats
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // topPersons
 			.mockResolvedValueOnce({
 				response: { ok: true },
 				data: { items: [searchItem], totalElements: 1, pageNumber: 0, pageSize: 5, totalPages: 1 }
 			}) // search
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // stories
 			.mockResolvedValueOnce({ response: { ok: true }, data: [] }); // tags/tree
 		vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
 			typeof createApiClient
 		>);
 		const result = await load({
 			url: makeUrl(),
 			request: new Request('http://localhost/'),
 			fetch: vi.fn() as unknown as typeof fetch,
 			parent: vi
 				.fn()
 				.mockResolvedValue({ canWrite: false, canAnnotate: false, canBlogWrite: false })
 		} as Parameters<typeof load>[0]);
 		expect(result.isReader).toBe(true);
 		if (result.isReader) {
 			expect(result.recentDocs).toHaveLength(1);
 			expect(result.recentDocs[0]).toBeDefined();
 			expect(result.recentDocs[0].id).toBe('d1');
 			expect(result.recentDocs[0].createdAt).toBe('2026-05-01T10:00:00Z');
 			expect(result.recentDocs[0].updatedAt).toBe('2026-05-10T08:00:00Z');
 		}
 	});
 	it('returns topPersons=[] when topPersons fetch fails, rest of data still loads', async () => {
 		const okStats = {
 			response: { ok: true, status: 200 },
@@ -461,8 +409,7 @@ describe('home page load — reader branch (isReader = !canWrite && !canAnnotate
 			.mockResolvedValueOnce(okStats)
 			.mockReturnValueOnce(failPersons)
 			.mockResolvedValueOnce(okSearch)
-			.mockResolvedValueOnce(okStories)
+			.mockResolvedValueOnce(okStories);
 			.mockResolvedValueOnce({ response: { ok: true, status: 200 }, data: [] }); // tags/tree
 		vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
 			typeof createApiClient
 		>);
--- a/frontend/src/routes/themen/+page.server.ts
+++ b/frontend/src/routes/themen/+page.server.ts
@@ -1,12 +0,0 @@
 import { error } from '@sveltejs/kit';
 import { createApiClient } from '$lib/shared/api.server';
 import type { components } from '$lib/generated/api';
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 /**
  * Server load for the /themen page: fetches the tag tree from `/api/tags/tree`.
  *
  * Returns `{ tree }`, using an empty array when the response carries no data.
  * Throws a 500 error when the response is not ok.
  */
 export async function load({ fetch }: Parameters<import('./$types').PageServerLoad>[0]) {
 	const { response, data } = await createApiClient(fetch).GET('/api/tags/tree');
 	if (!response.ok) {
 		throw error(500, 'Themen konnten nicht geladen werden.');
 	}
 	const tree = (data ?? []) as TagTreeNodeDTO[];
 	return { tree };
 }
--- a/frontend/src/routes/themen/+page.svelte
+++ b/frontend/src/routes/themen/+page.svelte
@@ -1,85 +0,0 @@
 <script lang="ts">
 import * as m from '$lib/paraglide/messages.js';
 import BackButton from '$lib/shared/primitives/BackButton.svelte';
 import { hasAnyDocuments } from '$lib/shared/utils/tagUtils';
 import type { components } from '$lib/generated/api';
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 // Upper bound on child tags listed per card; any remainder is summarised in a "more" link.
 const MAX_VISIBLE_CHILDREN = 5;
 let { data }: { data: { tree: TagTreeNodeDTO[] } } = $props();
 // Root tags that have at least one document somewhere in their subtree.
 const visibleTree = $derived.by(() => data.tree.filter(hasAnyDocuments));
 </script>
 <svelte:head>
 	<title>{m.themen_widget_title()}</title>
 </svelte:head>
 <main class="mx-auto max-w-7xl px-4 py-8 sm:px-6 lg:px-8">
 	<div class="mb-6 flex items-center gap-3">
 		<BackButton />
 		<h1 class="font-serif text-2xl font-semibold text-ink">{m.themen_widget_title()}</h1>
 	</div>
 	{#if visibleTree.length === 0}
 		<p class="font-sans text-sm text-ink-3">{m.themen_leer()}</p>
 	{:else}
 		<div class="grid grid-cols-1 gap-4 sm:grid-cols-2 lg:grid-cols-3">
 			{#each visibleTree as tag (tag.id)}
 				{@const visibleChildren = (tag.children ?? []).filter(hasAnyDocuments)}
 				{@const shownChildren = visibleChildren.slice(0, MAX_VISIBLE_CHILDREN)}
 				{@const hiddenCount = visibleChildren.length - shownChildren.length}
 				<div class="overflow-hidden rounded-sm border border-line bg-surface shadow-sm">
 					<!-- Colour bar; uses the 'slate' tag colour when the tag has none. -->
 					<div
 						class="h-1.5 w-full flex-shrink-0"
 						aria-hidden="true"
 						style="background: var(--c-tag-{tag.color ?? 'slate'})"
 					></div>
 					<a
 						href="/documents?tag={encodeURIComponent(tag.name)}"
 						aria-label="{tag.name}{tag.documentCount > 0
 							? ', ' + m.themen_dokumente({ count: tag.documentCount })
 							: ''}"
 						class="flex min-h-[56px] items-center justify-between px-4 pt-4 pb-3 hover:bg-canvas focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none focus-visible:ring-inset"
 					>
 						<span class="font-serif text-base font-semibold text-ink">{tag.name}</span>
 						<span class="mr-1 ml-auto font-sans text-sm text-ink-3 tabular-nums">
 							{#if tag.documentCount > 0}{tag.documentCount}{/if}
 						</span>
 						<span aria-hidden="true" class="h-3.5 w-3.5 flex-shrink-0 text-brand-mint">›</span>
 					</a>
 					{#if shownChildren.length > 0}
 						<div class="mx-4 border-t border-line"></div>
 						{#each shownChildren as child (child.id)}
 							<a
 								href="/documents?tag={encodeURIComponent(child.name)}"
 								class="flex min-h-[44px] items-center justify-between px-4 py-2.5 hover:bg-canvas focus-visible:bg-canvas focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none focus-visible:ring-inset"
 							>
 								<span class="font-sans text-sm text-ink">{child.name}</span>
 								<span class="mr-1 ml-auto font-sans text-xs text-ink-3 tabular-nums">
 									{#if child.documentCount > 0}{child.documentCount}{/if}
 								</span>
 								<span aria-hidden="true" class="h-3 w-3 flex-shrink-0 text-brand-mint">›</span>
 							</a>
 						{/each}
 						<!-- Children beyond MAX_VISIBLE_CHILDREN collapse into one link to the parent tag. -->
 						{#if hiddenCount > 0}
 							<a
 								href="/documents?tag={encodeURIComponent(tag.name)}"
 								class="block min-h-[44px] px-4 py-2.5 font-sans text-sm text-ink-3 hover:bg-canvas hover:text-ink focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none focus-visible:ring-inset"
 							>
 								{m.themen_weitere({ count: hiddenCount })} →
 							</a>
 						{/if}
 					{/if}
 				</div>
 			{/each}
 		</div>
 	{/if}
 </main>
--- a/frontend/src/routes/themen/page.server.spec.ts
+++ b/frontend/src/routes/themen/page.server.spec.ts
@@ -1,60 +0,0 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { createApiClient } from '$lib/shared/api.server';
 beforeEach(() => vi.clearAllMocks());
 /** Makes `createApiClient` return a client whose `GET` resolves to the given ok flag and data. */
 function mockApiGet(ok: boolean, data: unknown) {
 	const GET = vi.fn().mockResolvedValue({ response: { ok }, data });
 	const client = { GET } as ReturnType<typeof createApiClient>;
 	vi.mocked(createApiClient).mockReturnValue(client);
 }
 /** Test fixture: a childless tag node; `documentCount` defaults to 0. */
 const makeTag = (name: string, documentCount = 0) => {
 	const id = 'id-' + name;
 	return { id, name, documentCount, children: [] };
 };
 // Exercises the /themen server load against a mocked API client.
 describe('/themen +page.server load', () => {
 	// Builds the load-event object passed to `load` in each test.
 	function makeLoadEvent() {
 		return {
 			fetch: vi.fn() as unknown as typeof fetch,
 			request: new Request('http://localhost/themen'),
 			url: new URL('http://localhost/themen')
 		};
 	}
 	it('returns tag tree when API succeeds', async () => {
 		const tree = [makeTag('Briefe', 5), makeTag('Fotos', 3)];
 		mockApiGet(true, tree);
 		// Imported only after the mock has been configured for this test.
 		const { load } = await import('./+page.server');
 		const result = await load(makeLoadEvent());
 		expect(result.tree).toEqual(tree);
 	});
 	it('returns empty array when API returns empty list', async () => {
 		mockApiGet(true, []);
 		const { load } = await import('./+page.server');
 		const result = await load(makeLoadEvent());
 		expect(result.tree).toEqual([]);
 	});
 	it('throws 500 when API call fails', async () => {
 		// A non-ok response must surface as an error with status 500.
 		mockApiGet(false, null);
 		const { load } = await import('./+page.server');
 		await expect(load(makeLoadEvent())).rejects.toMatchObject({ status: 500 });
 	});
 });
--- a/frontend/src/routes/themen/page.svelte.spec.ts
+++ b/frontend/src/routes/themen/page.svelte.spec.ts
@@ -1,57 +0,0 @@
 import { describe, it, expect, afterEach } from 'vitest';
 import { cleanup, render } from 'vitest-browser-svelte';
 import ThemenPage from './+page.svelte';
 import type { components } from '$lib/generated/api';
 afterEach(() => {
 	cleanup();
 });
 type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
 /** Test fixture: a tag-tree node with id `id-<name>` and optional children. */
 function makeTag(
 	name: string,
 	documentCount: number,
 	children: TagTreeNodeDTO[] = []
 ): TagTreeNodeDTO {
 	const id = `id-${name}`;
 	const node: TagTreeNodeDTO = { id, name, documentCount, children };
 	return node;
 }
 // Browser-rendered tests for the /themen page; assertions read the rendered body text.
 describe('/themen +page', () => {
 	it('renders one card per visible root tag', async () => {
 		const tree = [makeTag('Briefe', 5), makeTag('Fotos', 3)];
 		render(ThemenPage, { data: { tree } });
 		expect(document.body.textContent).toContain('Briefe');
 		expect(document.body.textContent).toContain('Fotos');
 	});
 	it('does not render a tag with no documents in its subtree', async () => {
 		const tree = [makeTag('Briefe', 5), makeTag('Leer', 0)];
 		render(ThemenPage, { data: { tree } });
 		expect(document.body.textContent).not.toContain('Leer');
 	});
 	it('shows empty state when all tags filtered out', async () => {
 		render(ThemenPage, { data: { tree: [makeTag('Leer', 0)] } });
 		// Matches the German empty-state message shown instead of the card grid.
 		expect(document.body.textContent).toMatch(/Noch keine Themen/);
 	});
 	it('shows empty state when tree is empty', async () => {
 		render(ThemenPage, { data: { tree: [] } });
 		expect(document.body.textContent).toMatch(/Noch keine Themen/);
 	});
 	it('renders child tags for a root tag', async () => {
 		const tree = [makeTag('Briefe', 5, [makeTag('Brautbriefe', 3), makeTag('Kriegsbriefe', 2)])];
 		render(ThemenPage, { data: { tree } });
 		expect(document.body.textContent).toContain('Brautbriefe');
 		expect(document.body.textContent).toContain('Kriegsbriefe');
 	});
 	it('shows "+ N weitere" when a root tag has more than 5 children', async () => {
 		// Seven children, each with at least one document; the test name puts the cap at 5, leaving 2 hidden.
 		const children = Array.from({ length: 7 }, (_, i) => makeTag(`Kind${i}`, i + 1));
 		const tree = [makeTag('Briefe', 10, children)];
 		render(ThemenPage, { data: { tree } });
 		expect(document.body.textContent).toMatch(/\+\s*2\s*weitere/);
 	});
 });
--- a/tools/import-normalizer/.gitignore
+++ b/tools/import-normalizer/.gitignore
@@ -0,0 +1,6 @@
 .venv/
 # Ignore the *contents* of out/, not the directory itself: git cannot re-include
 # a file whose parent directory is excluded, so a bare `out/` would turn the
 # negation on the next line into a no-op.
 out/*
 !out/canonical-persons-tree.json
 review/
 __pycache__/
 *.pyc
--- a/tools/import-normalizer/README.md
+++ b/tools/import-normalizer/README.md
@@ -0,0 +1,44 @@
 # Import Normalizer
 Transforms the raw family-archive spreadsheets in `../../import/` into a clean canonical
 dataset (`out/`) plus review reports (`review/`). See the spec:
 `../../docs/import-migration/02-normalization-spec.md`.
 ## Setup
 Requires **Python 3.12** (the `enum.StrEnum` used in `dates.py` exists only in Python 3.11 and later).
 ```bash
 python3 -m venv .venv && .venv/bin/pip install -r requirements.txt
 ```
 ## Run
 ```bash
 .venv/bin/python normalize.py
 ```
 Outputs:
 - `out/canonical-documents.xlsx`, `out/canonical-persons.xlsx`
 - `review/*.csv` (residue to fix), `review/summary.txt` (grouped run stats incl. unknown-date rate)
 ## Iteration loop
 1. **Run.** Read `review/summary.txt` for the health snapshot.
 2. **Fix the residue** by editing the version-controlled overrides files, then re-run. Repeat.
 | Review file | What to do |
 | --- | --- |
 | `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). |
 | `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). |
 | `index-file-mismatch.csv` | The `Datei` path disagrees with the index-derived filename — reconcile when the PDFs arrive. |
 | `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. |
 > `unresolved-names.csv` is the focused "names that need a human" list. Non-family
 > correspondents that simply aren't in the register are NOT reported — they just become
 > provisional persons in `out/canonical-persons.xlsx` (the `unmatched_name_strings` count in
 > `summary.txt` tracks how many). The given-name set that drives `ambiguous_pair` detection is
 > the register's first names plus `config.EXTRA_GIVEN_NAMES` — add names there if a real
 > two-person cell isn't being flagged.
 **Valid `person_id` values** all come from the `person_id` column of `out/canonical-persons.xlsx`.
 ## Tests
 ```bash
 .venv/bin/python -m pytest tests/test_dates.py -v   # run files individually (never the whole suite at once)
 ```
--- a/tools/import-normalizer/config.py
+++ b/tools/import-normalizer/config.py
@@ -0,0 +1,135 @@
 """Tunables for the import normalizer. No logic here — only data tables."""
 from pathlib import Path
 # --- Paths ---
 # Directory containing this file; every other path is derived from it.
 BASE_DIR = Path(__file__).resolve().parent
 # Two levels above BASE_DIR.
 REPO_ROOT = BASE_DIR.parent.parent
 # Location of the raw source spreadsheets.
 IMPORT_DIR = REPO_ROOT / "import"
 # Workbook and sheet names must match the source files exactly (including the
 # double space in the document workbook's filename).
 DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell  2 - Kopie 2025-07-05.xlsx"
 DOCUMENT_SHEET = "Familienarchiv"
 PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx"
 PERSON_SHEET = "Tabelle1"
 # Sibling directories of this file, named for canonical output, review reports and overrides.
 OUT_DIR = BASE_DIR / "out"
 REVIEW_DIR = BASE_DIR / "review"
 OVERRIDES_DIR = BASE_DIR / "overrides"
 # --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
 DOCUMENT_HEADER_MAP = {
    "index": "index",
    "datei": "file",
    "box": "box",
    "mappe": "folder",
    "briefeschreiberin": "sender",
    "empfängerin": "receivers",
    "datum des briefes": "date",
    "ort": "location",
    "schlagwort": "tags",
    "inhalt": "summary",
 }
 DOCUMENT_REQUIRED_FIELDS = {"index"}
 PERSON_HEADER_MAP = {
    "generation": "generation",
    "familienname": "last_name",
    "vorname": "first_name",
    "geb als": "maiden_name",
    "geburtsdatum": "birth_date",
    "geburtsort": "birth_place",
    "todesdatum": "death_date",
    "sterbeort": "death_place",
    "verheiratet mit": "spouse",
    "bemerkung": "notes",
 }
 PERSON_REQUIRED_FIELDS = {"last_name"}
 # --- Century rule (archive 1873–1957) ---
 # Two-digit years are expanded into whichever century places them inside the
 # archive's span; values that fit neither bound are ambiguous.
 TWO_DIGIT_19XX_MAX = 57   # 00..57 -> 1900+yy
 TWO_DIGIT_18XX_MIN = 73   # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN
 # --- Seasons -> representative month (day = 1) ---
 SEASON_MONTHS = {
    "frühling": 4, "fruehling": 4, "frühjahr": 4, "fruehjahr": 4,
    "sommer": 7, "herbst": 10, "winter": 1,
 }
 # --- Fixed feasts -> (month, day) ---
 FIXED_FEASTS = {
    "neujahr": (1, 1),
    "heiligabend": (12, 24), "heiliger abend": (12, 24),
    "weihnachten": (12, 25), "weihnacht": (12, 25), "1. weihnachtstag": (12, 25),
    "silvester": (12, 31), "sylvester": (12, 31),
 }
 # --- Movable feasts -> day offset from Easter Sunday ---
 MOVABLE_FEASTS = {
    "karfreitag": -2,
    "ostern": 0, "ostersonntag": 0, "ostermontag": 1,
    "himmelfahrt": 39, "christi himmelfahrt": 39,
    "pfingsten": 49, "pfingstsonntag": 49, "pfingstmontag": 50,
    "fronleichnam": 60,
 }
 # --- Month names -> number (German + English, full + abbreviations) ---
 MONTHS = {
    "januar": 1, "jan": 1, "january": 1,
    "februar": 2, "feb": 2, "febr": 2, "february": 2,
    "märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3,
    "april": 4, "apr": 4,
    "mai": 5, "may": 5,
    "juni": 6, "jun": 6, "june": 6,
    "juli": 7, "jul": 7, "july": 7,
    "august": 8, "aug": 8,
    "september": 9, "sep": 9, "sept": 9,
    "oktober": 10, "okt": 10, "oct": 10, "october": 10,
    "november": 11, "nov": 11,
    "dezember": 12, "dez": 12, "dec": 12, "december": 12,
    # Spanish (Mexican-branch correspondence)
    "enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6,
    "julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10,
    "noviembre": 11, "diciembre": 12,
 }
 ROMAN_MONTHS = {
    "i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6,
    "vii": 7, "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12,
 }
 # --- Person matching ---
 KNOWN_LAST_NAMES = [
    "von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa",
    "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram",
 ]
 FUZZY_SUGGEST_THRESHOLD = 0.82  # difflib ratio; suggestions only, never auto-applied
 # --- Name classification (unresolved-name review) ---
 # Relational reference terms — a sender/receiver named by relation, not a proper name.
 RELATIONAL_TERMS = {
    "tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",
    "großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
    "neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
    "schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",
 }
 # Collective/group terms — not a single person. Matched against alpha-only word tokens
 # (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
 COLLECTIVE_TERMS = {
    "familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
    "grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
    # Plural/group relational terms — added for tag generation heuristic
    "söhne", "töchter", "brüder", "schwestern", "schwiegereltern",
    "vettern", "kusinen", "cousinen", "nichten", "neffen", "tanten",
    "freunde", "bekannte", "geschw", "enkelkinder", "jungens", "verwandten",
 }
 # Markers of an unknown/illegible name (the literal "?" is handled separately in code).
 # All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
 # (it occurs inside real names: Hanni, Johanna, Anna).
 UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}
 # A name-column value longer than this (chars) is treated as prose/description, not a name.
 PROSE_MAX_LEN = 40
 # Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
 # in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
 EXTRA_GIVEN_NAMES = {
    "ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
    "margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
 }
--- a/tools/import-normalizer/dates.py
+++ b/tools/import-normalizer/dates.py
@@ -0,0 +1,279 @@
 """Tolerant historical date parsing for the family archive."""
 import datetime
 import re
 from dataclasses import dataclass
 from enum import StrEnum
 import config
 class Precision(StrEnum):
    """How exactly a parsed date is known. Each member's value equals its name."""
    # A full calendar day was resolved.
    DAY = "DAY"
    # Month and year only; the matchers in this module anchor the ISO value to the 1st of the month.
    MONTH = "MONTH"
    # A season name plus year; anchored to the 1st of the month given by config.SEASON_MONTHS.
    SEASON = "SEASON"
    # Year only; anchored to January 1.
    YEAR = "YEAR"
    # A date range; the ISO value is the start of the range.
    RANGE = "RANGE"
    # parse_date() substitutes this for the matcher's precision when _preprocess() saw an
    # uncertainty/qualifier marker.
    APPROX = "APPROX"
    # Empty input, or no matcher recognised the string.
    UNKNOWN = "UNKNOWN"
 def _advent_sunday(year: int, n: int) -> datetime.date:
    """n-th Advent (1..4). 4th Advent = last Sunday on/before Dec 24."""
    dec24 = datetime.date(year, 12, 24)
    back_to_sunday = (dec24.weekday() - 6) % 7  # Mon=0..Sun=6
    fourth = dec24 - datetime.timedelta(days=back_to_sunday)
    return fourth - datetime.timedelta(days=(4 - n) * 7)
 def resolve_feast_or_season(token: str, year: int):
    """Resolve a feast or season name within ``year`` to ``(iso_date, Precision)``.

    Lookup order: movable feasts (a day offset from Easter Sunday), fixed feasts
    (month/day pair), Advent Sundays, then seasons (1st of the configured month).
    Returns None when the token is none of these.
    """
    # Normalise: lower-case, collapse whitespace runs, trim surrounding spaces and dots.
    name = " ".join(token.lower().split()).strip(" .")

    if name in config.MOVABLE_FEASTS:
        feast_day = easter(year) + datetime.timedelta(days=config.MOVABLE_FEASTS[name])
        return feast_day.isoformat(), Precision.DAY

    if name in config.FIXED_FEASTS:
        feast_month, feast_dom = config.FIXED_FEASTS[name]
        return datetime.date(year, feast_month, feast_dom).isoformat(), Precision.DAY

    # A bare "advent" is treated as the 1st Advent.
    advent_numbers = {
        "1. advent": 1,
        "2. advent": 2,
        "3. advent": 3,
        "4. advent": 4,
        "advent": 1,
    }
    advent_number = advent_numbers.get(name)
    if advent_number is not None:
        return _advent_sunday(year, advent_number).isoformat(), Precision.DAY

    if name in config.SEASON_MONTHS:
        season_start = datetime.date(year, config.SEASON_MONTHS[name], 1)
        return season_start.isoformat(), Precision.SEASON

    return None
 def expand_year(token: str):
    """Expand a 2/3/4-digit year string per the 1873–1957 century rule. None if ambiguous.

    * 4 digits: taken literally, but only inside the plausibility window 1700–2100.
    * 3 digits: read as a year whose leading "1" was dropped ("929" -> 1929) and checked
      against the same window.
    * 2 digits: 19xx up to ``config.TWO_DIGIT_19XX_MAX``, 18xx from
      ``config.TWO_DIGIT_18XX_MIN``; values in between are ambiguous.

    Returns the year as an int, or None for non-numeric, implausible or ambiguous input.
    """
    min_year, max_year = 1700, 2100
    token = token.strip()
    # isdecimal(), not isdigit(): isdigit() also accepts characters such as "²" that make
    # int() raise ValueError.
    if not token.isdecimal():
        return None
    n, v = len(token), int(token)
    if n == 4:
        # reject gross typos (e.g. "9003") so they go to review instead of a bogus year
        return v if min_year <= v <= max_year else None
    if n == 3:
        # same typo guard as for 4 digits: "123" must not silently become the year 1123
        year = 1000 + v
        return year if min_year <= year <= max_year else None
    if n == 2:
        if v <= config.TWO_DIGIT_19XX_MAX:
            return 1900 + v
        if v >= config.TWO_DIGIT_18XX_MIN:
            return 1800 + v
        return None
    return None
@dataclass(frozen=True)
 class ParsedDate:
    """Immutable result of parse_date() for one raw date string."""
    # ISO-format date string produced by a matcher (or taken from an override);
    # None when nothing could be resolved.
    iso: str | None
    # How exact ``iso`` is; UNKNOWN when nothing could be resolved.
    precision: Precision
    # The input exactly as it was handed to parse_date().
    raw: str
 _LEADING_MARKERS = re.compile(
    r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)
 def _preprocess(raw: str):
    """Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx."""
    s = (raw or "").strip()
    if not s:
        return "", False
    low = s.lower()
    approx = ("?" in s) or any(
        m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich"))
    s = re.sub(r"\(\s*\?\s*\)", " ", s)   # remove "(?)"
    s = s.replace("?", " ")
    s = re.sub(r",.*$", "", s)            # drop trailing editorial note (", 2. Brief")
    stripped = _LEADING_MARKERS.sub("", s)
    if stripped != s:                     # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation
        approx = True
    s = re.sub(r"\s+", " ", stripped).strip(" .,")
    return s, approx
 # All-numeric date: 1–2 digit day, "." or "/", 1–2 digit month, optional ".",
 # optional whitespace, 2–4 digit year (e.g. "3.5.1920", "17/6. 1916").
 _NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")
 def _match_iso(s):
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
        try:
            datetime.date.fromisoformat(s)
            return s, Precision.DAY
        except ValueError:
            return None
    return None
 def _match_numeric(s):
    """Parse an all-numeric ``D.M.Y`` / ``D/M.Y`` date.

    Returns ``(iso, Precision.DAY)``, or None when the shape does not match, the
    year cannot be expanded, the month is not 1..12, or the day does not exist.
    """
    found = _NUM_RE.fullmatch(s)
    if found is None:
        return None
    day_text, month_text, year_text = found.groups()
    # The shared builder performs the year/month checks and the calendar validation.
    return _build_day_month_year(int(day_text), int(month_text), expand_year(year_text))
 # Day, ".", Roman-numeral month (matched case-insensitively), optional ".", 2–4 digit year.
 _ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I)
 def _match_roman(s):
    """Parse a date whose month is a Roman numeral, looked up in ``config.ROMAN_MONTHS``.

    Returns ``(iso, Precision.DAY)``, or None when the shape does not match, the
    numeral is not in the table, the year cannot be expanded, or the day does not exist.
    """
    found = _ROMAN_RE.fullmatch(s)
    if found is None:
        return None
    day_text, numeral, year_text = found.groups()
    day = int(day_text)
    month = config.ROMAN_MONTHS.get(numeral.lower())
    year = expand_year(year_text)
    # The shared builder rejects a missing month/year and validates the calendar date.
    return _build_day_month_year(day, month, year)
 # Day first: 1–2 digit day, optional dots/whitespace, alphabetic month token,
 # optional ".", optional whitespace, 2–4 digit year.
 _MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})")
 def _lookup_month(token: str):
    """Look a month token up in ``config.MONTHS``; None when it is not listed.

    The token is lower-cased and stripped of surrounding spaces and dots first.
    """
    normalized = token.lower().strip(" .")
    return config.MONTHS.get(normalized)
 def _build_day_month_year(day, month, year):
    if not month or year is None or not (1 <= month <= 12):
        return None
    try:
        return datetime.date(year, month, day).isoformat(), Precision.DAY
    except ValueError:
        return None
 def _match_monthname_a(s):
    """Parse a day-first date with a month name (shape of ``_MONTH_A_RE``, e.g. "7.Sept.1923").

    Returns ``(iso, Precision.DAY)`` or None.
    """
    found = _MONTH_A_RE.fullmatch(s)
    if found is None:
        return None
    day_text, month_token, year_text = found.groups()
    day = int(day_text)
    month = _lookup_month(month_token)
    year = expand_year(year_text)
    return _build_day_month_year(day, month, year)
 # A separator (dot OR hyphen/en-dash) after the day is REQUIRED so this can't match
 # "Mai 1895" (MONTH YYYY) as day=18; the hyphen form also covers Spanish "Mayo 18-1929".
 _MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-–]\s*(\d{2,4})")
 def _match_monthname_b(s):
    """Parse a month-first date (shape of ``_MONTH_B_RE``: month name, day, separator, year).

    Returns ``(iso, Precision.DAY)`` or None.
    """
    found = _MONTH_B_RE.fullmatch(s)
    if found is None:
        return None
    month_token, day_text, year_text = found.groups()
    day = int(day_text)
    month = _lookup_month(month_token)
    year = expand_year(year_text)
    return _build_day_month_year(day, month, year)
 # Alphabetic month token, optional ".", whitespace, 2–4 digit year (e.g. "Mai 1895").
 _MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})")
 # Any (non-greedy) token followed by a 2–4 digit year; used for feast/season names.
 _TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})")
 # Exactly four digits when used with fullmatch().
 _YEAR_ONLY_RE = re.compile(r"\d{4}")
 # "YYYY/YY" year range (e.g. "1945/46"); only the first year is captured.
 _RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}")
 # "<start> - <end>" with a hyphen or en dash; group 1 is the start, which must end in a digit.
 _RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*")
 # Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it
 # does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/").
 _RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)")
 def _match_month_year(s):
    """Parse "<month name> <year>" to the 1st of that month.

    Returns ``(iso, Precision.MONTH)``, or None when the shape does not match, the
    month name is unknown, or the year cannot be expanded.
    """
    found = _MONTH_YEAR_RE.fullmatch(s)
    if found is None:
        return None
    month_token, year_text = found.groups()
    month = _lookup_month(month_token)
    year = expand_year(year_text)
    if year is None or not month:
        return None
    first_of_month = datetime.date(year, month, 1)
    return first_of_month.isoformat(), Precision.MONTH
 def _match_feast_season(s):
    """Parse "<feast or season> <year>" via resolve_feast_or_season().

    Returns whatever resolve_feast_or_season() yields for the token, or None when
    the shape does not match or the year cannot be expanded.
    """
    found = _TOKEN_YEAR_RE.fullmatch(s)
    if found is None:
        return None
    name, year_text = found.groups()
    year = expand_year(year_text)
    return None if year is None else resolve_feast_or_season(name, year)
 def _match_year_only(s):
    """Parse a bare 4-digit year to January 1 of that year.

    Returns ``(iso, Precision.YEAR)``, or None when ``s`` is not exactly four digits
    or the year is rejected by expand_year().

    The year goes through expand_year() rather than a plain int(): that applies the same
    plausibility check as the other matchers (a typo like "9003" goes to review instead of
    becoming a bogus year) and avoids the ValueError datetime.date raises for "0000".
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    year = expand_year(s)
    if year is None:
        return None
    return datetime.date(year, 1, 1).isoformat(), Precision.YEAR
 def _match_range(s):
    """Parse a date range; return ``(iso_of_range_start, Precision.RANGE)`` or None.

    Forms, tried in this order:
    1. ``YYYY/YY`` (e.g. "1945/46") -> January 1 of the first year.
    2. ``D./D. <rest>`` intra-month day range (e.g. "7./8. Sept.1923") -> the first day.
    3. ``<start> - <end>`` with a hyphen or en dash -> whatever ``<start>`` parses to.
    """
    m = _RANGE_YY_RE.fullmatch(s)
    if m:
        # Validate through expand_year() instead of a plain int(): it rejects implausible
        # years ("9003/04") and avoids the ValueError datetime.date raises for "0000/01".
        year = expand_year(m.group(1))
        if year is not None:
            return datetime.date(year, 1, 1).isoformat(), Precision.RANGE
    m = _RANGE_DAY_RE.fullmatch(s)
    if m:
        first = f"{m.group(1)}.{m.group(3)}"  # "7." + "Sept.1923" -> "7.Sept.1923"
        for matcher in (_match_numeric, _match_monthname_a):
            r = matcher(first)
            if r:
                return r[0], Precision.RANGE
    m = _RANGE_HYPHEN_RE.fullmatch(s)
    if m:
        start = m.group(1).strip()
        for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only):
            r = matcher(start)
            if r:
                return r[0], Precision.RANGE
    return None
 # Tried in this order by parse_date(); the first matcher that returns a result wins.
 _MATCHERS = [
    _match_iso,
    _match_range,
    _match_numeric,
    _match_roman,
    _match_monthname_a,
    _match_month_year,
    _match_monthname_b,
    _match_feast_season,
    _match_year_only,
 ]
 def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse one raw date cell into a ParsedDate.

    ``date_overrides`` maps a trimmed raw string to ``(iso, precision_name)``; a hit wins
    over automatic parsing (an empty iso becomes None). Otherwise the text is cleaned by
    _preprocess() and handed to each matcher in ``_MATCHERS`` order. When _preprocess()
    flagged uncertainty, the reported precision is APPROX regardless of the matcher.
    Empty or unrecognised input yields ``ParsedDate(None, Precision.UNKNOWN, raw)``.
    """
    if date_overrides:
        lookup = (raw or "").strip()
        if lookup in date_overrides:
            override_iso, precision_name = date_overrides[lookup]
            return ParsedDate(override_iso or None, Precision(precision_name), raw)

    cleaned, approx = _preprocess(raw)
    if cleaned:
        for matcher in _MATCHERS:
            hit = matcher(cleaned)
            if not hit:
                continue
            iso, precision = hit
            return ParsedDate(iso, Precision.APPROX if approx else precision, raw)

    return ParsedDate(None, Precision.UNKNOWN, raw)
 def easter(year: int) -> datetime.date:
    """Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm."""
    golden = year % 19
    century, year_in_century = divmod(year, 100)
    d, e = divmod(century, 4)
    f = (century + 8) // 25
    g = (century - f + 1) // 3
    h = (19 * golden + century - d - g + 15) % 30
    i, k = divmod(year_in_century, 4)
    shift = (32 + 2 * e + 2 * i - h - k) % 7
    m = (golden + 11 * h + 22 * shift) // 451
    # One divmod yields both the month and the zero-based day of the month.
    month, day_zero_based = divmod(h + shift - 7 * m + 114, 31)
    return datetime.date(year, month, day_zero_based + 1)
--- a/tools/import-normalizer/documents.py
+++ b/tools/import-normalizer/documents.py
@@ -0,0 +1,119 @@
 """Document row extraction, triage, and the canonical document record."""
 from dataclasses import dataclass, field
 from enum import Enum, auto
 import dates as _dates
 import tags as _tags
 class Triage(Enum):
    """Row classification returned by triage()."""
    # Row has an index value that does not end in "x".
    OK = auto()
    # No cell in the row contains non-whitespace content.
    EMPTY = auto()
    # Row has content, but the index cell is empty (or the index column is out of range).
    BLANK_INDEX = auto()
    # The index value ends with "x".
    X_SUFFIX = auto()
@dataclass
 class RawRow:
    """One document row as produced by extract_row().

    Each text field holds the stripped cell text of the mapped column, or "" when the
    column is not mapped or the row is too short.
    """
    # Row number supplied by the caller (normalize.run() counts from 2 for the first data row).
    source_row: int
    index: str = ""
    # File path cell; compared against ``index`` by index_file_mismatch().
    file: str = ""
    box: str = ""
    folder: str = ""
    sender: str = ""
    receivers: str = ""
    # Raw date text; parsed later by dates.parse_date().
    date: str = ""
    location: str = ""
    tags: str = ""
    summary: str = ""
@dataclass
 class CanonicalDocument:
    """Normalized document record built by to_canonical()."""
    index: str
    box: str = ""
    folder: str = ""
    # Sender id and name as returned by ctx.resolve_sender().
    sender_person_id: str = ""
    sender_name: str = ""
    # Receiver ids and names from ctx.resolve_receivers(); the two lists are built from the
    # same result in the same order.
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)
    # ISO date from dates.parse_date(), or "" when none could be resolved.
    date_iso: str = ""
    # The date cell as read from the row.
    date_raw: str = ""
    # str() of the dates.Precision member.
    date_precision: str = ""
    location: str = ""
    # Output of tags.generate_tags().
    tags: list = field(default_factory=list)
    summary: str = ""
    source_row: int = 0
    # Review flags. to_canonical() adds "unmatched_sender", "multi_sender",
    # "unmatched_receiver", "unparsed_date" and "index_file_mismatch"; normalize.run()
    # adds "duplicate_index".
    needs_review: list = field(default_factory=list)
 # RawRow text fields, in the order they are filled by extract_row().
 _FIELDS = [
    "index", "file", "box", "folder", "sender",
    "receivers", "date", "location", "tags", "summary",
 ]
 def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a RawRow from one sheet row.

    ``header`` maps field names to column indexes. A field whose column is not mapped,
    or lies beyond the end of ``cells``, becomes "". Cell text is stripped.
    """
    values = {}
    for field_name in _FIELDS:
        col = header.get(field_name)
        usable = col is not None and col < len(cells)
        values[field_name] = (cells[col] or "").strip() if usable else ""
    return RawRow(source_row=source_row, **values)
 def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify one sheet row.

    EMPTY: no cell has non-whitespace content. BLANK_INDEX: the index cell is empty or
    ``index_col`` is out of range. X_SUFFIX: the index ends with "x". OK: anything else.
    """
    if not any(cell and str(cell).strip() for cell in cells):
        return Triage.EMPTY

    index = ""
    if 0 <= index_col < len(cells):
        index = (cells[index_col] or "").strip()

    if not index:
        return Triage.BLANK_INDEX
    return Triage.X_SUFFIX if index.endswith("x") else Triage.OK
 def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: label a row that has no index value.

    Returns 'section_banner' when at least one cell is populated and every populated cell
    sits in the sender or receivers column; otherwise 'data_no_index'.
    """
    name_cols = set()
    for field_name in ("sender", "receivers"):
        col = header.get(field_name)
        if col is not None:
            name_cols.add(col)

    filled = {pos for pos, cell in enumerate(cells) if cell and str(cell).strip()}
    only_names = bool(filled) and filled.issubset(name_cols)
    return "section_banner" if only_names else "data_no_index"
 def index_file_mismatch(index: str, file_path: str) -> bool:
    """Report whether the file name (without its extension) differs from ``index``.

    Assumes the Datei value is a filename with an extension (all corpus paths are *.pdf).
    Both "/" and "\\" are treated as path separators. A blank path is never a mismatch.
    """
    if file_path.strip() == "":
        return False
    _, _, basename = file_path.replace("\\", "/").rpartition("/")
    # Cut at the LAST dot only; a name without any dot is kept whole.
    stem = basename.rpartition(".")[0] if "." in basename else basename
    return stem != index
 def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
    """Turn one RawRow into a CanonicalDocument, collecting review flags.

    The date is parsed with ``date_overrides``, sender and receivers are resolved through
    ``ctx``, and tags come from tags.generate_tags(). ``needs_review`` receives, in this
    order, any of: "unmatched_sender", "multi_sender", "unmatched_receiver",
    "unparsed_date", "index_file_mismatch".
    """
    parsed = _dates.parse_date(raw.date, date_overrides)
    # The sender is resolved before the receivers, as both calls go through the same ctx.
    sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(raw.sender, raw.source_row)
    receivers = ctx.resolve_receivers(raw.receivers, raw.source_row)

    # (flag, condition) pairs, listed in the order the flags are reported.
    checks = [
        ("unmatched_sender", bool(raw.sender.strip()) and not sender_matched),
        ("multi_sender", sender_multi),
        ("unmatched_receiver", any(not matched for _, _, matched in receivers)),
        ("unparsed_date", bool(raw.date.strip()) and parsed.precision == _dates.Precision.UNKNOWN),
        ("index_file_mismatch", index_file_mismatch(raw.index, raw.file)),
    ]
    flags = [flag for flag, raised in checks if raised]

    receiver_ids = [entry[0] for entry in receivers]
    receiver_names = [entry[1] for entry in receivers]
    tag_paths = _tags.generate_tags(raw.tags, raw.summary, approved_themes)

    return CanonicalDocument(
        index=raw.index,
        box=raw.box,
        folder=raw.folder,
        sender_person_id=sender_id,
        sender_name=sender_name,
        receiver_person_ids=receiver_ids,
        receiver_names=receiver_names,
        date_iso=parsed.iso or "",
        date_raw=raw.date,
        date_precision=str(parsed.precision),
        location=raw.location,
        tags=tag_paths,
        summary=raw.summary,
        source_row=raw.source_row,
        needs_review=flags,
    )
--- a/tools/import-normalizer/ingest.py
+++ b/tools/import-normalizer/ingest.py
@@ -0,0 +1,50 @@
 """Read .xlsx sheets into neutral list[list[str]] and map headers to fields."""
 import datetime
 from pathlib import Path
 import openpyxl
 def _cell_to_str(value) -> str:
    if value is None:
        return ""
    if isinstance(value, bool):   # bool is a subclass of int — handle before the int branch
        return str(value)
    if isinstance(value, datetime.datetime):
        return value.date().isoformat()
    if isinstance(value, datetime.date):
        return value.isoformat()
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    if isinstance(value, int):
        return str(value)
    return str(value).strip()
 def read_sheet(path: Path, sheet_name: str) -> list[list[str]]:
    """Read one worksheet into a list of rows, each a list of cell strings.

    The workbook is opened read-only with ``data_only=True`` and every cell value is
    converted with _cell_to_str().

    Raises:
        ValueError: if ``sheet_name`` is not a sheet of the workbook.
    """
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook has to be closed explicitly; try/finally also closes it when the
    # sheet is missing or reading a row fails.
    try:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"Sheet '{sheet_name}' not found in {path.name}; sheets: {wb.sheetnames}")
        ws = wb[sheet_name]
        return [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)]
    finally:
        wb.close()
 def _norm_header(text: str) -> str:
    return " ".join(text.lower().split())
 def build_header_map(header_row: list[str], field_map: dict[str, str], required: set[str]):
    """Return (field->col_index, unknown_headers). Raise ValueError if a required field is missing.

    Header cells are normalized with _norm_header() before the ``field_map`` lookup.
    Non-blank headers that are not in ``field_map`` are collected, in sheet order, as
    unknown. If two columns map to the same field, the later column wins.
    """
    fields: dict[str, int] = {}
    unknown: list[str] = []
    for col, label in enumerate(header_row):
        normalized = _norm_header(label)
        if normalized not in field_map:
            if label.strip():
                unknown.append(label)
            continue
        fields[field_map[normalized]] = col

    missing = required.difference(fields)
    if missing:
        raise ValueError(f"Required header(s) missing: {sorted(missing)} (found headers: {header_row})")
    return fields, unknown
--- a/tools/import-normalizer/normalize.py
+++ b/tools/import-normalizer/normalize.py
@@ -0,0 +1,171 @@
 """Orchestrator: read raw workbooks -> canonical outputs + review reports."""
 import argparse
 from collections import Counter
 from pathlib import Path
 import config
 import ingest
 import persons
 import documents
 import overrides as overrides_mod
 import tags as _tags
 import writers
 def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
        out_dir, review_dir, date_overrides, name_overrides,
        approved_themes_path=None) -> dict:
    """Run the whole normalization: read both workbooks, write outputs, return statistics.

    Steps, in order:
    1. Read the person sheet, build the register, alias index and name-resolution context.
    2. Read the document sheet, triage every data row and convert the OK rows to canonical
       documents (empty rows, blank-index rows and "x"-suffixed rows are set aside).
    3. Flag every document whose index occurs more than once.
    4. Write the canonical workbooks to ``out_dir`` and the review CSVs plus
       ``summary.txt`` to ``review_dir``.

    All parameters are keyword-only. ``date_overrides`` and ``name_overrides`` are the
    mappings produced by overrides.load_overrides(); ``approved_themes_path`` is optional.

    Returns the statistics dict that is also written to ``summary.txt``.
    """
    out_dir, review_dir = Path(out_dir), Path(review_dir)
    approved_themes = _tags.load_approved_themes(Path(approved_themes_path)) if approved_themes_path else set()
    # --- persons ---
    person_rows = ingest.read_sheet(person_workbook, person_sheet)
    # The first sheet row is the header; unknown person headers are ignored here.
    p_fields, _ = ingest.build_header_map(person_rows[0], config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS)
    # One dict per data row, keyed by field name; rows shorter than a mapped column yield "".
    person_dicts = [{f: (row[i] if i < len(row) else "") for f, i in p_fields.items()} for row in person_rows[1:]]
    register = persons.parse_register(person_dicts)
    alias_index = persons.AliasIndex(register)
    given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES)
    ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names)
    # --- documents ---
    doc_rows = ingest.read_sheet(document_workbook, document_sheet)
    d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
    index_col = d_fields["index"]
    canon_docs, blank_index, skipped_x, mismatches = [], [], [], []
    # raw date string -> source rows on which it failed to parse
    unparsed_by_raw: dict[str, list] = {}
    dates_by_override = 0
    empty_count = 0
    seen_index = Counter()
    # start=2: the header is sheet row 1, so the first data row is row 2.
    for source_row, cells in enumerate(doc_rows[1:], start=2):
        t = documents.triage(cells, index_col)
        if t is documents.Triage.EMPTY:
            empty_count += 1
            continue
        if t is documents.Triage.BLANK_INDEX:
            blank_index.append([source_row, documents.classify_blank_index(cells, d_fields),
                                " | ".join(c for c in cells if c)])
            continue
        if t is documents.Triage.X_SUFFIX:
            idx = (cells[index_col] or "").strip()
            # Third column is the index without its trailing "x".
            skipped_x.append([source_row, idx, idx[:-1]])
            continue
        raw = documents.extract_row(cells, d_fields, source_row)
        seen_index[raw.index] += 1
        # Same key as dates.parse_date() uses for its override lookup (the trimmed raw string).
        if raw.date.strip() and raw.date.strip() in date_overrides:
            dates_by_override += 1
        doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes))
        if "unparsed_date" in doc.needs_review:
            unparsed_by_raw.setdefault(raw.date, []).append(source_row)
        if "index_file_mismatch" in doc.needs_review:
            mismatches.append([source_row, raw.index, raw.file])
        canon_docs.append(doc)
    # REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
    dup_indexes = {idx for idx, n in seen_index.items() if n > 1}
    duplicates = []
    for doc in canon_docs:
        if doc.index in dup_indexes:
            if "duplicate_index" not in doc.needs_review:
                doc.needs_review.append("duplicate_index")
            duplicates.append([doc.source_row, doc.index])
    # Register persons first, then the provisional persons collected in ctx during resolution.
    all_people = register + list(ctx.provisional.values())
    # --- write canonical outputs ---
    writers.write_documents_xlsx(canon_docs, out_dir / "canonical-documents.xlsx")
    writers.write_persons_xlsx(all_people, out_dir / "canonical-persons.xlsx")
    all_tag_paths = [path for doc in canon_docs for path in doc.tags]
    writers.write_tag_tree_xlsx(_tags.build_tag_tree(all_tag_paths), out_dir / "canonical-tag-tree.xlsx")
    # --- review files ---
    # unparsed dates: most-frequent first, with example source rows + blank override cells so a
    # corrected row can be pasted straight into overrides/dates.csv (same raw,iso,precision shape).
    unparsed_rows = sorted(
        ([raw, len(rows), " ".join(map(str, rows[:5])), "", ""] for raw, rows in unparsed_by_raw.items()),
        key=lambda r: (-r[1], r[0]))
    writers.write_review_csv(review_dir / "unparsed-dates.csv",
                             ["raw", "count", "example_rows", "suggested_iso", "suggested_precision"], unparsed_rows)
    writers.write_review_csv(review_dir / "duplicate-index.csv", ["source_row", "index"], duplicates)
    writers.write_review_csv(review_dir / "blank-index-rows.csv", ["source_row", "kind", "content"], blank_index)
    writers.write_review_csv(review_dir / "skipped-x-suffix.csv", ["source_row", "index", "base_index"], skipped_x)
    # Group unresolved names by (category, name); sorted by category, then count descending, then name.
    unresolved_agg: dict[tuple, list] = {}
    for name, category, row in ctx.unresolved:
        unresolved_agg.setdefault((category, name), []).append(row)
    unresolved_rows = sorted(
        ([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))]
         for (cat, name), rows in unresolved_agg.items()),
        key=lambda r: (r[0], -r[2], r[1]))
    writers.write_review_csv(review_dir / "unresolved-names.csv",
                             ["category", "raw", "count", "example_rows"], unresolved_rows)
    writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)
    all_summaries = [doc.summary for doc in canon_docs if doc.summary]
    candidates = _tags.mine_summary_candidates(all_summaries)
    writers.write_review_csv(review_dir / "tag-candidates.csv", ["candidate", "count"],
                             [[c, n] for c, n in candidates])
    # Unknown-date rate = documents with a non-blank date cell that stayed UNKNOWN,
    # relative to all documents with a non-blank date cell.
    dated = sum(1 for d in canon_docs if d.date_raw.strip())
    unknown = sum(1 for d in canon_docs if d.date_raw.strip() and d.date_precision == "UNKNOWN")
    unknown_rate = f"{(100 * unknown / dated):.1f}%" if dated else "0.0%"
    # NOTE(review): the "# ..." keys with empty values look like section headings for
    # writers.write_summary() — confirm against that function.
    stats = {
        "# INPUTS": "",
        "document_rows_read": len(doc_rows) - 1,
        "register_persons": len(register),
        "unknown_headers": ", ".join(unknown_headers) or "(none)",
        "# OUTPUTS": "",
        "documents_emitted": len(canon_docs),
        "provisional_persons": len(ctx.provisional),
        "# DATES": "",
        "dated_rows": dated,
        "unparsed_dates": unknown,
        "unknown_date_rate": f"{unknown_rate} (target <=5%)",
        "distinct_unparsed_formats": len(unparsed_by_raw),
        "# NAMES": "",
        "unmatched_name_strings": len(ctx.unmatched),
        "unresolved_name_occurrences": len(ctx.unresolved),
        "unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"),
        "unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"),
        "unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"),
        "unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"),
        "unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"),
        "unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"),
        "# ANOMALIES": "",
        "empty_rows": empty_count,
        "blank_index_rows": len(blank_index),
        "skipped_x_suffix": len(skipped_x),
        "duplicate_index_rows": len(duplicates),
        "index_file_mismatches": len(mismatches),
        "# OVERRIDES": "",
        "date_overrides_loaded": len(date_overrides),
        "name_overrides_loaded": len(name_overrides),
        "dates_resolved_by_override": dates_by_override,
        "names_resolved_by_override": ctx.override_hits,
    }
    writers.write_summary(review_dir / "summary.txt", stats)
    return stats
 def main():
    """Command-line entry point: load the overrides, run the normalization, print the statistics.

    The tool takes no options; argument parsing only provides ``--help`` and rejects
    unexpected arguments. All paths and sheet names come from ``config``.
    """
    argparse.ArgumentParser(description="Normalize the family archive spreadsheets.").parse_args()

    overrides_dir = config.OVERRIDES_DIR
    date_overrides, name_overrides = overrides_mod.load_overrides(
        overrides_dir / "dates.csv", overrides_dir / "names.csv")

    stats = run(
        document_workbook=config.DOCUMENT_WORKBOOK,
        document_sheet=config.DOCUMENT_SHEET,
        person_workbook=config.PERSON_WORKBOOK,
        person_sheet=config.PERSON_SHEET,
        out_dir=config.OUT_DIR,
        review_dir=config.REVIEW_DIR,
        date_overrides=date_overrides,
        name_overrides=name_overrides,
        approved_themes_path=overrides_dir / "approved-themes.csv",
    )

    print("Normalization complete:")
    for label, value in stats.items():
        print(f"  {label}: {value}")
 if __name__ == "__main__":
    main()
--- a/tools/import-normalizer/out/canonical-persons-tree.json
+++ b/tools/import-normalizer/out/canonical-persons-tree.json
--- a/tools/import-normalizer/overrides.py
+++ b/tools/import-normalizer/overrides.py
@@ -0,0 +1,21 @@
 """Load human-supplied corrections. Missing files are not an error."""
 import csv
 from pathlib import Path
 def _read_override_rows(path: Path):
    """Yield ``(raw, row)`` for each CSV row with a non-empty ``raw`` cell; nothing if the file is missing."""
    path = Path(path)
    if not path.exists():
        return
    # utf-8-sig reads plain UTF-8 as well as files that start with a byte-order mark (as
    # written by Excel's "CSV UTF-8" export). With plain utf-8 the BOM would become part of
    # the first header ("\ufeffraw") and every row would be dropped silently.
    with open(path, encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            raw = (row.get("raw") or "").strip()
            if raw:
                yield raw, row
 def load_overrides(dates_path: Path, names_path: Path):
    """Load the human-supplied date and name corrections.

    Args:
        dates_path: CSV with header ``raw,iso,precision``.
        names_path: CSV with header ``raw,person_id``.

    Returns:
        ``(date_overrides, name_overrides)`` where ``date_overrides`` maps the trimmed raw
        string to ``(iso, precision)`` (a blank precision becomes "UNKNOWN") and
        ``name_overrides`` maps the trimmed raw string to the person id. A missing file
        contributes an empty mapping; rows with a blank ``raw`` cell are skipped; on
        repeated ``raw`` values the last row wins.
    """
    date_overrides: dict[str, tuple[str, str]] = {
        raw: ((row.get("iso") or "").strip(), (row.get("precision") or "UNKNOWN").strip())
        for raw, row in _read_override_rows(dates_path)
    }
    name_overrides: dict[str, str] = {
        raw: (row.get("person_id") or "").strip()
        for raw, row in _read_override_rows(names_path)
    }
    return date_overrides, name_overrides
--- a/tools/import-normalizer/overrides/README.md
+++ b/tools/import-normalizer/overrides/README.md
@@ -0,0 +1,81 @@
 # Overrides
 Human corrections applied **deterministically on every run**. An override **wins** over the
 automatic date parser / name matcher, so this is how you fix the residue the tool can't resolve
 on its own. Two CSV files live here; both are read by `overrides.load_overrides()`.
 - Missing or header-only files are fine — they just contribute zero overrides.
 - Keep these files committed to git (they're your curated corrections); the generated `out/`
   and `review/` folders are *not* committed, with one exception:
   `out/canonical-persons-tree.json` is tracked.
 - Matching is **exact** on the `raw` value after trimming surrounding whitespace. Copy the
  `raw` value verbatim from the matching `review/*.csv`.
 ## The iteration loop
 1. Run `python normalize.py`.
 2. Open `review/unparsed-dates.csv` and `review/unresolved-names.csv` (sorted by frequency).
 3. Add correction rows here, then re-run. Repeat until the residue is acceptable.
 ---
 ## `dates.csv` — fix unparseable dates
 Header: `raw,iso,precision`
 | column | meaning |
 | --- | --- |
 | `raw` | the date string exactly as written in the spreadsheet (= the `raw` column in `review/unparsed-dates.csv`). |
 | `iso` | the corrected date as `YYYY-MM-DD`. For partial dates use the 1st: month-only → `YYYY-MM-01`, year-only → `YYYY-01-01`. Leave **empty** if truly unknown. |
 | `precision` | one of `DAY`, `MONTH`, `SEASON`, `YEAR`, `RANGE`, `APPROX`, `UNKNOWN`. |
 ### Example
 ```csv
 raw,iso,precision
 23.Juni 58,1958-06-23,DAY
 8.März 60,1960-03-08,DAY
 Mayo 18-1929,1929-05-18,DAY
 Abril 10-929,1929-04-10,DAY
 30.April,1909-04-30,DAY
 Mai 1895,1895-05-01,MONTH
 Herbst 1913,1913-10-01,SEASON
 1945/46,1945-01-01,RANGE
 um 1920,1920-01-01,APPROX
 ?,,UNKNOWN
 ```
 Notes:
 - `23.Juni 58` / `8.März 60` — two-digit years `58`/`60` fall in the parser's ambiguous
  `58–72` band (just past the 1873–1957 window), so they aren't auto-parsed; here you assert 1958/1960.
 - `Mayo`/`Abril` — Spanish month names (Mexican-branch letters) the parser doesn't know yet.
 - `30.April` — month+day with no year; pick the year from the letter's context.
 - Empty `iso` + `UNKNOWN` records a deliberate "unknown date" (stops it showing up as residue).
 ---
 ## `names.csv` — map a name string to a canonical person
 Header: `raw,person_id`
 | column | meaning |
 | --- | --- |
 | `raw` | the sender/receiver name string exactly as written (= the `raw` column in `review/unresolved-names.csv`). For a multi-name cell that was split (e.g. `"Walter und Eugenie"`), use the **individual** name part. |
 | `person_id` | the canonical id to map it to. **Must be a real id** from the `person_id` column of `out/canonical-persons.xlsx` (a register person or an already-created provisional). |
 ### Example
 ```csv
 raw,person_id
 A.Klucke,klucke-anna
 ? Hans de Gruyter,de-gruyter-hans
 Eltern Cram,cram-john-james
 Tante Lolly,blomquist-charlotte
 ```
 Notes:
 - Use this for partial / misspelled / illegible / aliased names that should point at a known person.
 - It maps one string → **one** person. It does **not** split a two-person cell: for genuine
  pairs like `Ella Anita` (flagged `ambiguous_pair`), there is no split-via-override yet — leave
  them, or add both given names to `config.EXTRA_GIVEN_NAMES` so they keep getting flagged.
 - Look up valid `person_id` values in `out/canonical-persons.xlsx`. An id that doesn't exist
  there will create a dangling reference (no validation yet).
--- a/tools/import-normalizer/overrides/approved-themes.csv
+++ b/tools/import-normalizer/overrides/approved-themes.csv
@@ -0,0 +1 @@
 candidate
--- a/tools/import-normalizer/overrides/dates.csv
+++ b/tools/import-normalizer/overrides/dates.csv
@@ -0,0 +1 @@
 raw,iso,precision
--- a/tools/import-normalizer/overrides/names.csv
+++ b/tools/import-normalizer/overrides/names.csv
@@ -0,0 +1 @@
 raw,person_id
--- a/tools/import-normalizer/persons.py
+++ b/tools/import-normalizer/persons.py
@@ -0,0 +1,336 @@
 """Person register parsing, name splitting, alias resolution."""
 import difflib
 import re
 import unicodedata
 from collections import Counter
 from dataclasses import dataclass, field
 from enum import StrEnum
 import config
 import dates
 _DIACRITIC_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
                                "Ä": "ae", "Ö": "oe", "Ü": "ue"})
 def _strip_accents(s: str) -> str:
    s = s.translate(_DIACRITIC_MAP)
    s = unicodedata.normalize("NFKD", s)
    return "".join(c for c in s if not unicodedata.combining(c))
 def slugify(last: str, first: str) -> str:
    """Build a lower-case ASCII slug from "<last> <first>".

    Accents are removed with _strip_accents(), each run of characters outside ``a-z0-9``
    becomes a single "-", and leading/trailing "-" are trimmed. Returns "unknown" when
    nothing is left.
    """
    ascii_name = _strip_accents(f"{last} {first}".strip()).lower()
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_name).strip("-")
    return slug if slug else "unknown"
@dataclass
 class Person:
    """One person of the register, or a provisional person created for an unmatched name."""
    # Slug id; register ids come from slugify(last, first), possibly with a numeric suffix.
    person_id: str
    last_name: str = ""
    # First entry of the comma-separated given-name cell.
    first_name: str = ""
    maiden_name: str = ""
    title: str = ""
    # Filled by parse_register when the spouse cell is a quoted string.
    nickname: str = ""
    # Remaining entries of the comma-separated given-name cell.
    extra_given_names: list[str] = field(default_factory=list)
    # `.iso` of the parsed birth date (None when nothing could be parsed), next to the raw cell text.
    birth_date: str | None = None
    birth_date_raw: str = ""
    birth_place: str = ""
    # `.iso` of the parsed death date (None when nothing could be parsed), next to the raw cell text.
    death_date: str | None = None
    death_date_raw: str = ""
    death_place: str = ""
    # Raw spouse cell; emptied by parse_register when the cell was recognised as a nickname.
    spouse: str = ""
    generation: str = ""
    notes: str = ""
    # Name forms registered for this person by AliasIndex.
    aliases: list[str] = field(default_factory=list)
    # True for persons created by ResolutionContext for names that matched nobody.
    provisional: bool = False
 _QUOTED_RE = re.compile(r'^[""\']\s*(.+?)\s*[""\']$')
 def parse_register(rows: list[dict]) -> list[Person]:
    """Turn header-mapped register rows into ``Person`` records with unique ids.

    Rows without a last name are skipped. The given-name cell is split on commas: the
    first entry becomes ``first_name``, the rest ``extra_given_names``. A spouse cell
    that is a single quoted string is treated as a nickname instead of a spouse.

    Ids start as ``slugify(last, first)``. Every member of a colliding group receives a
    numeric suffix (-1, -2, …); ids that are unique from the start stay untouched, and a
    suffix is skipped when the resulting id is already taken by another person.
    """
    def cell(row: dict, key: str) -> str:
        return (row.get(key) or "").strip()

    people: list[Person] = []
    for r in rows:
        last = cell(r, "last_name")
        if not last:
            continue
        givens = [g.strip() for g in cell(r, "first_name").split(",") if g.strip()]
        first = givens[0] if givens else ""
        spouse_raw = cell(r, "spouse")
        nickname = ""
        m = _QUOTED_RE.match(spouse_raw)
        if m:
            # A fully quoted spouse cell carries a nickname, not a spouse reference.
            nickname = m.group(1)
            spouse_raw = ""
        birth = dates.parse_date(r.get("birth_date") or "")
        death = dates.parse_date(r.get("death_date") or "")
        people.append(Person(
            person_id=slugify(last, first),
            last_name=last, first_name=first, maiden_name=cell(r, "maiden_name"),
            nickname=nickname, extra_given_names=givens[1:],
            birth_date=birth.iso, birth_date_raw=cell(r, "birth_date"),
            birth_place=cell(r, "birth_place"),
            death_date=death.iso, death_date_raw=cell(r, "death_date"),
            death_place=cell(r, "death_place"),
            spouse=spouse_raw, generation=cell(r, "generation"),
            notes=cell(r, "notes"), provisional=False,
        ))
    # De-duplicate colliding ids. `taken` starts with the ids that are unique as-is, so a
    # generated "base-1" can never clash with a row that naturally slugs to "base-1".
    counts = Counter(p.person_id for p in people)
    taken = {pid for pid, n in counts.items() if n == 1}
    last_suffix: dict[str, int] = {}
    for p in people:
        base = p.person_id
        if counts[base] <= 1:
            continue
        n = last_suffix.get(base, 0) + 1
        while f"{base}-{n}" in taken:
            n += 1
        last_suffix[base] = n
        p.person_id = f"{base}-{n}"
        taken.add(p.person_id)
    return people
 # Maiden-name clause: an optional comma, "geb" / "geb." and everything up to the end of the string.
 _GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", re.I)
 # Parenthesised text at the very end of the string; group 1 is the text inside the parentheses.
 _PAREN_RE = re.compile(r"\(([^)]+)\)\s*$")
 # Conjunction between two names: "und" or the bare abbreviation "u", surrounded by whitespace.
 _MULTI_RE = re.compile(r"\s+(?:und|u)\s+", re.I)
 def find_known_last_name(segment: str) -> str | None:
    """Return the first configured last name that ``segment`` equals or ends with as a word.

    Candidates are tried in the order of ``config.KNOWN_LAST_NAMES`` (listed longest-first
    in config); ``None`` is returned when no candidate matches.
    """
    candidate = segment.strip()
    return next(
        (known for known in config.KNOWN_LAST_NAMES
         if candidate == known or candidate.endswith(" " + known)),
        None,
    )
 def split_receivers(raw: str) -> list[str]:
    """Split one sender/receiver cell into individual name strings.

    Steps, in order:
    - "//" separates independent groups, each of which is split recursively;
    - a "geb. …" clause is cut off;
    - without an "und"/"u" conjunction the cleaned text is returned as a single name;
    - a trailing "(Surname)" is shared with every part that has no space in it;
    - otherwise a known last name found on the final part is appended to earlier
      single-token parts that are not themselves a known last name.
    The literal part "Familie" is dropped.
    """
    if not raw or not raw.strip():
        return []
    if "//" in raw:
        return [name for chunk in raw.split("//") for name in split_receivers(chunk)]

    text = _GEB_RE.sub("", raw).strip()
    if not text:            # e.g. a "geb. Müller"-only cell strips to empty
        return []
    if _MULTI_RE.search(text) is None:
        return [text]

    family_name = None
    paren = _PAREN_RE.search(text)
    if paren is not None:
        family_name = paren.group(1).strip()
        text = text[:paren.start()].strip()

    names = []
    for piece in _MULTI_RE.split(text):
        piece = piece.strip()
        if piece and piece.lower() != "familie":
            names.append(piece)
    if len(names) < 2:
        return names

    if family_name:
        return [n if " " in n else f"{n} {family_name}" for n in names]

    *leading, final = names
    surname = find_known_last_name(final)
    if not surname:
        return names
    completed = [
        f"{n} {surname}" if " " not in n and find_known_last_name(n) is None else n
        for n in leading
    ]
    completed.append(final)
    return completed
 def _norm(name: str) -> str:
    """Normalize a name for lookups: ASCII-folded, lower-case, dots as spaces, single-spaced."""
    lowered = _strip_accents(name).lower()
    spaced = lowered.replace(".", " ")
    return re.sub(r"\s+", " ", spaced).strip()
 class NameClass(StrEnum):
    """Categories assigned by classify_name to a single (post-split) name string."""
    # No rule matched; also returned for an empty string.
    RESOLVABLE = "resolvable"
    # Contains "?" or one of config.UNKNOWN_NAME_MARKERS.
    UNKNOWN = "unknown"
    # Exactly one whitespace-separated token.
    SINGLE_TOKEN = "single_token"
    # Contains a word from config.RELATIONAL_TERMS.
    RELATIONAL = "relational"
    # Contains a word from config.COLLECTIVE_TERMS.
    COLLECTIVE = "collective"
    # Too long, contains a digit or a quote character, or has more than three tokens.
    PROSE = "prose"
    # Two tokens that are both known given names.
    AMBIGUOUS_PAIR = "ambiguous_pair"
 # ASCII and typographic quote characters; any of them marks a string as prose.
 _QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019"
 def classify_name(raw: str, given_names: set[str]) -> NameClass:
    """Say why a (post-split) sender/receiver string may be unresolvable.

    The first matching rule wins, in this order: UNKNOWN, PROSE, COLLECTIVE, RELATIONAL,
    SINGLE_TOKEN, AMBIGUOUS_PAIR; everything else (including an empty string) is RESOLVABLE.
    """
    text = raw.strip()
    if not text:
        return NameClass.RESOLVABLE
    lowered = text.lower()
    tokens = text.split()
    # Alphabetic words only ("Fam.Cram" -> ["fam", "cram"]) so collective/relational terms
    # match whole words and never a prefix or substring such as "Allerton".
    words = re.findall(r"[a-zäöüß]+", lowered)

    def mentions(terms) -> bool:
        return any(word in terms for word in words)

    if "?" in text or any(marker in lowered for marker in config.UNKNOWN_NAME_MARKERS):
        return NameClass.UNKNOWN

    looks_like_prose = (
        len(text) > config.PROSE_MAX_LEN
        or any(ch.isdigit() for ch in text)
        or any(quote in text for quote in _QUOTE_CHARS)
        or len(tokens) > 3
    )
    if looks_like_prose:
        return NameClass.PROSE
    if mentions(config.COLLECTIVE_TERMS):
        return NameClass.COLLECTIVE
    if mentions(config.RELATIONAL_TERMS):
        return NameClass.RELATIONAL
    if len(tokens) == 1:
        return NameClass.SINGLE_TOKEN
    if len(tokens) == 2 and all(_norm(tok) in given_names for tok in tokens):
        return NameClass.AMBIGUOUS_PAIR
    return NameClass.RESOLVABLE
 # Known limitation: a name of four or more tokens without digits or quotes (e.g. "Anna von
 # der Heide") is classified PROSE. Such multi-particle names are rare here and usually resolve
 # via the register; if they surface in review, treat them as lower priority than genuine prose.
 def build_given_names(register: list[Person], extra: set[str]) -> set[str]:
    """Collect the normalized given names known to the register, plus a supplement.

    Takes each person's first name (when present) and all extra given names, then adds
    the entries of ``extra``. classify_name uses the result to tell a pair of two given
    names (two people) from a first name followed by a surname.
    """
    from_register = {
        _norm(given)
        for person in register
        for given in ([person.first_name] if person.first_name else []) + list(person.extra_given_names)
    }
    supplement = {_norm(entry) for entry in extra}
    return from_register | supplement
 class AliasIndex:
    """Lookup from normalized name forms to person ids, with display names and fuzzy suggestions.

    Building the index also records on each ``Person.aliases`` the forms that person won;
    a form already present there is not appended again, so rebuilding an index over the
    same Person objects leaves their alias lists unchanged.
    """
    def __init__(self, people: list[Person]):
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {p.person_id for p in people}
        first_name_ids: dict[str, list] = {}
        for p in people:
            full_name = f"{p.first_name} {p.last_name}".strip()
            self._display[p.person_id] = full_name
            # Ordered, de-duplicated forms (NOT a set) so alias order is deterministic — NFR-IDEM-01.
            forms = [full_name]
            if p.maiden_name:
                forms.append(f"{p.first_name} {p.maiden_name}".strip())
            for extra in p.extra_given_names:
                forms.append(f"{extra} {p.last_name}".strip())
            if p.nickname:
                forms.append(p.nickname)
            seen = set()
            for form in forms:
                if form in seen:
                    continue
                seen.add(form)
                key = _norm(form)
                # First person to claim a normalized form keeps it.
                if key and key not in self._by_alias:
                    self._by_alias[key] = p.person_id
                    if form not in p.aliases:
                        p.aliases.append(form)
            if p.first_name:
                ids = first_name_ids.setdefault(_norm(p.first_name), [])
                if p.person_id not in ids:
                    ids.append(p.person_id)
        # first-name-only alias, only when unambiguous
        for fname, ids in first_name_ids.items():
            if len(ids) == 1 and fname not in self._by_alias:
                self._by_alias[fname] = ids[0]
    def resolve(self, name: str) -> str | None:
        """Return the person id registered for ``name`` after normalization, or None."""
        return self._by_alias.get(_norm(name))
    def display(self, person_id: str) -> str:
        """Return "first last" for a known id, or an empty string for an unknown one."""
        return self._display.get(person_id, "")
    def suggest(self, name: str) -> tuple[str | None, float]:
        """Return (person_id, similarity) of the closest alias, or (None, 0.0) if none is close.

        Candidates must reach ``config.FUZZY_SUGGEST_THRESHOLD``; the returned score is the
        ``difflib.SequenceMatcher`` ratio between the normalized name and the matched alias.
        """
        key = _norm(name)
        match = difflib.get_close_matches(
            key, list(self._by_alias.keys()), n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD
        )
        if not match:
            return None, 0.0
        score = difflib.SequenceMatcher(None, key, match[0]).ratio()
        return self._by_alias[match[0]], score
 class ResolutionContext:
    """Maps raw name strings to person ids and collects everything that needs review.

    Unmatched names become provisional persons; along the way the context records which
    source rows mentioned them and how classify_name categorised them.
    """
    def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str],
                 given_names: set[str] | None = None):
        self.index = alias_index
        self.name_overrides = name_overrides
        self.given_names = given_names if given_names else set()
        self.provisional: dict[str, Person] = {}
        self.unmatched: dict[str, list] = {}
        self.unresolved: list[tuple] = []   # (raw_name, category, source_row) for non-RESOLVABLE names
        self._raw_to_pid: dict[str, str] = {}
        self.override_hits = 0
    def _unique_id(self, base: str) -> str:
        """Return ``base``, or ``base-2``, ``base-3``, … — whichever is first unused.

        "Used" covers both register ids and provisional ids created so far.
        """
        def taken(candidate: str) -> bool:
            return candidate in self.index.known_ids or candidate in self.provisional

        candidate, suffix = base, 1
        while taken(candidate):
            suffix += 1
            candidate = f"{base}-{suffix}"
        return candidate
    def resolve_one(self, raw_name: str, source_row: int):
        """Return (person_id, display_name, matched: bool). '' name -> ('', '', True)."""
        name = (raw_name or "").strip()
        if not name:
            return "", "", True
        # 1. explicit override, keyed by the exact (stripped) raw string
        if name in self.name_overrides:
            self.override_hits += 1
            target = self.name_overrides[name]
            return target, self.index.display(target) or name, True
        # 2. alias index
        known = self.index.resolve(name)
        if known:
            return known, self.index.display(known) or name, True
        # 3. unmatched: record the occurrence, then reuse or create a provisional person
        self.unmatched.setdefault(name, []).append(source_row)
        kind = classify_name(name, self.given_names)
        if kind is not NameClass.RESOLVABLE:
            self.unresolved.append((name, str(kind), source_row))
        if name not in self._raw_to_pid:
            last, first = _last_first(name)
            new_id = self._unique_id(slugify(last, first))   # never reuse a register id
            self.provisional[new_id] = Person(
                person_id=new_id, last_name=last, first_name=first, provisional=True
            )
            self._raw_to_pid[name] = new_id
        return self._raw_to_pid[name], name, False
    def resolve_sender(self, raw: str, source_row: int):
        """Senders are split like receivers (REQ-PERS-01). Primary = first part; multi flagged.

        Returns (person_id, display_name, matched, has_multiple_parts).
        """
        parts = split_receivers(raw)
        if not parts:
            return "", "", True, False
        primary, *others = parts
        pid, name, matched = self.resolve_one(primary, source_row)
        for other in others:
            self.resolve_one(other, source_row)  # register the others as persons too
        return pid, name, matched, bool(others)
    def resolve_receivers(self, raw: str, source_row: int):
        """Resolve every name of a receiver cell; one resolve_one result tuple per name."""
        results = []
        for part in split_receivers(raw):
            results.append(self.resolve_one(part, source_row))
        return results
 def _last_first(name: str):
    """Best-effort split of a free name string into (last, first).

    A configured last name at the end of the string wins; otherwise the final token is
    taken as the last name. A single token is returned as the last name with no first name.
    """
    name = name.strip()
    surname = find_known_last_name(name)
    if surname:
        return surname, name[: -len(surname)].strip()
    tokens = name.split()
    if len(tokens) < 2:
        return name, ""
    *given, family = tokens
    return family, " ".join(given)
--- a/tools/import-normalizer/persons_tree.py
+++ b/tools/import-normalizer/persons_tree.py
@@ -0,0 +1,409 @@
 """Normalize Personendatei 2.xlsx into canonical-persons-tree.json."""
 import argparse
 import datetime
 import json
 import re
 import sys
 from pathlib import Path
 import config
 import dates
 from persons import _strip_accents
 # Accepted range for birth/death years.
 _MIN_YEAR = 1700
 _MAX_YEAR = 2100
 # A pure-digit cell that parses to a year outside the accepted range but inside this band
 # (e.g. "1023" for "1923") is treated as a typo and is NOT converted as an Excel serial.
 # Years outside the band (e.g. 7568) are implausible as years, so the serial reading is tried.
 _PLAUSIBLE_TYPO_MIN = 1000
 _PLAUSIBLE_TYPO_MAX = 3000
 def _serial_to_year(digits: str) -> int | None:
    """Read a pure-digit string as an Excel date serial; return its year if it is acceptable."""
    serial = int(digits)
    if not 1 <= serial <= 80_000:
        return None
    converted = datetime.date(1899, 12, 30) + datetime.timedelta(days=serial)
    if _MIN_YEAR <= converted.year <= _MAX_YEAR:
        return converted.year
    return None
 def _parse_year(raw: str | None) -> int | None:
    """Extract a birth/death year from an Excel cell string.

    1. Anything dates.parse_date() understands yields its year when that year lies in
       [_MIN_YEAR, _MAX_YEAR].
    2. A pure-digit string that parse_date() rejects, or parses to an implausible year,
       is retried as an Excel serial number.
    3. Everything else, including plausible typo years such as "1023", yields None.
    """
    if raw is None:
        return None
    text = str(raw).strip()
    if not text:
        return None
    digits_only = re.fullmatch(r"\d+", text) is not None
    parsed = dates.parse_date(text)
    if not parsed.iso:
        # Nothing recognised: only a bare number can still be a serial.
        return _serial_to_year(text) if digits_only else None
    year = int(parsed.iso[:4])
    if _MIN_YEAR <= year <= _MAX_YEAR:
        return year
    plausible_typo = _PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX
    if digits_only and not plausible_typo:
        return _serial_to_year(text)
    return None
 def _parse_generation(raw: str | None) -> int | None:
    """Extract the generation integer from column A values like 'G 3', 'G3', 'G  0'."""
    if not raw:
        return None
    m = re.search(r"\d+", str(raw))
    return int(m.group()) if m else None
 # Tokens dropped from a name before matching (place names and seniority markers).
 _GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}
 def _norm_tree(s: str) -> str:
    """Normalize a name string for tree matching.

    In order: trim and strip surrounding ASCII quotes, delete parenthesised substrings,
    fold diacritics to ASCII (ä→ae etc.), lower-case, turn dots into spaces, drop the
    tokens listed in _GEO_SUFFIXES, and join the rest with single spaces.
    """
    text = (s or "").strip().strip("\"'")
    text = re.sub(r"\([^)]*\)", "", text)
    text = _strip_accents(text).lower().replace(".", " ")
    kept = filter(lambda token: token not in _GEO_SUFFIXES, text.split())
    return " ".join(kept).strip("., ")
 def _build_index(persons: list[dict]) -> dict[str, list[str]]:
    """Build a name → [rowId, …] lookup index with up to four keys per person.

    Keys (all normalized with _norm_tree): "first last", "last first", "first maiden"
    (only when a maiden name exists) and the bare last name. Each rowId is stored at most
    once per key: when several keys of one person normalize to the same string (empty
    first name, maiden name equal to last name) the person must still count as a single
    hit, otherwise a unique person would look ambiguous to the resolver.
    """
    index: dict[str, list[str]] = {}
    def _add(key: str, row_id: str) -> None:
        if not key:
            return
        hits = index.setdefault(key, [])
        if row_id not in hits:
            hits.append(row_id)
    for p in persons:
        row_id = p["rowId"]
        first = p.get("firstName") or ""
        last = p.get("lastName") or ""
        maiden = p.get("maidenName") or ""
        _add(_norm_tree(f"{first} {last}"), row_id)
        _add(_norm_tree(f"{last} {first}"), row_id)
        if maiden:
            _add(_norm_tree(f"{first} {maiden}"), row_id)
        _add(_norm_tree(last), row_id)
    return index
 def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
    """Return (row_id, None) on a unique match, (None, reason) otherwise.

    ``reason`` is "empty" when the name normalizes to nothing, "not_found" when the index
    has no entry, and "ambiguous" when the entry names more than one distinct person.
    Repeated ids within one entry are counted once.
    """
    key = _norm_tree(raw)
    if not key:
        return None, "empty"
    # dict.fromkeys de-duplicates while keeping the original order.
    hits = list(dict.fromkeys(index.get(key, [])))
    if not hits:
        return None, "not_found"
    if len(hits) == 1:
        return hits[0], None
    return None, "ambiguous"
 def _parse_row(row_num: int, fields: dict) -> dict:
    """Produce one person record from a header-mapped row dict.

    Dates that cannot be reduced to a year are preserved in ``notes`` as
    "[Geburtsdatum: …]" / "[Todesdatum: …]", followed by the free-text remark.
    Internal keys prefixed with '_' are stripped before JSON output in main().
    """
    def s(key: str) -> str:
        # Cells may hold non-string values (numbers, dates); coerce instead of crashing on
        # .strip(). Falsy values (None, "", 0) still become the empty string.
        value = fields.get(key)
        if not value:
            return ""
        return str(value).strip()
    birth_raw = s("birth_date")
    death_raw = s("death_date")
    birth_year = _parse_year(birth_raw)
    death_year = _parse_year(death_raw)
    notes_parts = []
    if birth_raw and birth_year is None:
        notes_parts.append(f"[Geburtsdatum: {birth_raw}]")
    if death_raw and death_year is None:
        notes_parts.append(f"[Todesdatum: {death_raw}]")
    bemerkung = s("notes")
    if bemerkung:
        notes_parts.append(bemerkung)
    return {
        "rowId": f"row_{row_num:03d}",
        "firstName": s("first_name"),
        "lastName": s("last_name"),
        "maidenName": s("maiden_name") or None,
        "alias": None,
        "notes": " ".join(notes_parts) or None,
        "birthYear": birth_year,
        "deathYear": death_year,
        "birthPlace": s("birth_place") or None,
        "deathPlace": s("death_place") or None,
        "generation": _parse_generation(s("generation")),
        "familyMember": True,
        "_spouse_raw": s("spouse") or None,
        "_bemerkung_raw": bemerkung or None,
    }
 def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
    """Remove duplicate rows. Two-stage:
    1. Exact (firstName, lastName, birthYear) match.
    2. (firstName, lastName) where the later entry has birthYear=None and an earlier
       entry already has a known birthYear.
    """
    seen_full: dict[tuple, str] = {}   # (first, last, year) -> rowId
    seen_name: dict[tuple, str] = {}   # (first, last) -> rowId of first entry with a year
    result: list[dict] = []
    skipped: list[str] = []
    for p in persons:
        first, last, year = p["firstName"], p["lastName"], p["birthYear"]
        key_full = (first, last, year)
        key_name = (first, last)
        if key_full in seen_full:
            skipped.append(f"{p['rowId']} duplicates {seen_full[key_full]} ({first} {last}, year={year})")
            continue
        if year is None and key_name in seen_name:
            skipped.append(f"{p['rowId']} duplicates {seen_name[key_name]} ({first} {last}, no birth year)")
            continue
        seen_full[key_full] = p["rowId"]
        if year is not None:
            seen_name[key_name] = p["rowId"]
        result.append(p)
    return result, skipped
 def _resolve_spouses(
    persons: list[dict], index: dict[str, list[str]]
 ) -> tuple[list[dict], list[dict]]:
    """Emit SPOUSE_OF edges from each person's _spouse_raw field.

    Returns (relationships, unresolved). Each couple is emitted once, from the row that
    mentions it first. A reference that cannot be resolved, or that resolves to the
    person's own row (reason "self_reference"), is reported in ``unresolved`` instead of
    producing an edge.
    """
    relationships: list[dict] = []
    unresolved: list[dict] = []
    emitted: set[frozenset] = set()
    for p in persons:
        raw = (p.get("_spouse_raw") or "").strip()
        if not raw:
            continue
        row_id = p["rowId"]
        matched_id, reason = _resolve_one(raw, index)
        if matched_id == row_id:
            # A person cannot be their own spouse; this happens when the cell only matches
            # one of the person's own index keys.
            matched_id, reason = None, "self_reference"
        if not matched_id:
            unresolved.append({
                "rowId": row_id,
                "field": "verheiratet_mit",
                "raw": raw,
                "reason": reason,
            })
            continue
        edge = frozenset((row_id, matched_id))
        if edge in emitted:
            continue
        emitted.add(edge)
        relationships.append({
            "personId": row_id,
            "relatedPersonId": matched_id,
            "type": "SPOUSE_OF",
            "source": "verheiratet_mit",
        })
    return relationships, unresolved
 _CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on)?\s+(.+)", re.I)
 _PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on)?\s+(.+)", re.I)
 _AND_RE = re.compile(r"\s+u(?:nd)?\s+", re.I)
 def _parse_bemerkung(
    row_id: str, bemerkung: str, index: dict[str, list[str]]
 ) -> tuple[list[dict], list[dict], str]:
    """Extract PARENT_OF edges from a Bemerkung cell.

    Returns (relationships, unresolved, remaining_notes).
    - "Sohn/Tochter von A und B" makes A and B parents of ``row_id``.
    - "Vater/Mutter von A und B" makes ``row_id`` the parent of A and B.
    The name list ends at the first comma or semicolon, whichever comes first; the text
    after it is returned as remaining_notes. A cell matching neither pattern is returned
    unchanged (stripped) as remaining_notes, with no edges and nothing unresolved.
    """
    if not bemerkung or not bemerkung.strip():
        return [], [], ""
    s = bemerkung.strip()
    for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")):
        m = pattern.match(s)
        if not m:
            continue
        # Cut at the FIRST separator of either kind so that "A; text, more" yields "A" and
        # not "A; text" (e.g. trailing description ", nach Mexiko emigriert").
        head, *tail = re.split(r"[,;]", m.group(1).strip(), maxsplit=1)
        name_part = head.strip().rstrip("!., ")
        remainder = (tail[0] if tail else "").strip().lstrip(".,! ")
        parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
        rels: list[dict] = []
        unres: list[dict] = []
        for part in parts:
            part = part.rstrip("!., ")
            matched_id, reason = _resolve_one(part, index)
            if not matched_id:
                unres.append({
                    "rowId": row_id,
                    "field": "bemerkung",
                    "raw": bemerkung,
                    "reason": reason,
                })
                continue
            # The edge always points parent -> child.
            if direction == "child":
                parent_id, child_id = matched_id, row_id
            else:
                parent_id, child_id = row_id, matched_id
            rels.append({
                "personId": parent_id,
                "relatedPersonId": child_id,
                "type": "PARENT_OF",
                "source": "bemerkung",
                "rawBemerkung": bemerkung,
            })
        return rels, unres, remainder
    # No pattern matched — full text goes to notes, nothing to unresolved
    return [], [], s
 def main() -> None:
    """CLI entry point: read the person workbook, derive persons and relationships, write JSON.

    Options: --input (workbook path), --output (JSON path), --dry-run (print statistics
    and the first unresolved references without writing). Exits with status 1 when the
    sheet is empty.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Personendatei 2.xlsx → canonical-persons-tree.json"
    )
    parser.add_argument(
        "--input", default=str(config.PERSON_WORKBOOK),
        help="Path to Personendatei 2.xlsx"
    )
    parser.add_argument(
        "--output", default=str(config.OUT_DIR / "canonical-persons-tree.json"),
        help="Path for output JSON"
    )
    parser.add_argument("--dry-run", action="store_true", help="Print stats, skip write")
    args = parser.parse_args()
    from ingest import read_sheet, build_header_map
    rows = read_sheet(Path(args.input), config.PERSON_SHEET)
    if not rows:
        print("ERROR: sheet is empty", file=sys.stderr)
        sys.exit(1)
    header_row = [str(v) for v in rows[0]]
    fields_map, _ = build_header_map(header_row, config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS)
    # --- Pass 1: parse rows ---
    persons_raw: list[dict] = []
    for row_num, row in enumerate(rows[1:], start=2):
        field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()}
        # `or ""` also covers cells that are present but None; a default passed to .get()
        # would not, and .strip() would then raise.
        if not str(field_dict.get("last_name") or "").strip():
            continue
        persons_raw.append(_parse_row(row_num, field_dict))
    persons, skipped_msgs = _deduplicate(persons_raw)
    for msg in skipped_msgs:
        print(f"  SKIP {msg}", file=sys.stderr)
    index = _build_index(persons)
    # --- Pass 2: resolve relationships ---
    all_rels: list[dict] = []
    all_unresolved: list[dict] = []
    spouse_rels, spouse_unres = _resolve_spouses(persons, index)
    all_rels.extend(spouse_rels)
    all_unresolved.extend(spouse_unres)
    for p in persons:
        # Internal helper keys must not reach the JSON output.
        bemerkung = p.pop("_bemerkung_raw", None) or ""
        p.pop("_spouse_raw", None)
        rels, unres, remaining = _parse_bemerkung(p["rowId"], bemerkung, index)
        all_rels.extend(rels)
        all_unresolved.extend(unres)
        if remaining:
            existing = p.get("notes") or ""
            if remaining not in existing:
                p["notes"] = (existing + " " + remaining).strip() if existing else remaining
    # --- Stats output ---
    spouse_count = sum(1 for r in all_rels if r["type"] == "SPOUSE_OF")
    parent_count = sum(1 for r in all_rels if r["type"] == "PARENT_OF")
    print(f"✓ {len(persons)} persons parsed")
    print(f"✓ {len(all_rels)} relationships emitted ({spouse_count} SPOUSE_OF, {parent_count} PARENT_OF)")
    if all_unresolved:
        print(f"⚠  {len(all_unresolved)} unresolved (see unresolved[] in output)")
    if args.dry_run:
        print("\n--- dry-run: first 5 unresolved ---")
        for u in all_unresolved[:5]:
            print(f"  {u}")
        return
    output = {
        "generated_at": datetime.datetime.now().isoformat(),
        "source": Path(args.input).name,
        "stats": {
            "persons": len(persons),
            "relationships": len(all_rels),
            "unresolved": len(all_unresolved),
        },
        "persons": persons,
        "relationships": all_rels,
        "unresolved": all_unresolved,
    }
    out_path = Path(args.output)
    # parents=True: a custom --output may point into a directory tree that does not exist yet.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"→  {args.output}")
 if __name__ == "__main__":
    main()
--- a/tools/import-normalizer/requirements.txt
+++ b/tools/import-normalizer/requirements.txt
@@ -0,0 +1,2 @@
 openpyxl==3.1.5
 pytest==8.3.4
--- a/tools/import-normalizer/tags.py
+++ b/tools/import-normalizer/tags.py
@@ -0,0 +1,119 @@
 import csv
 import re
 from collections import Counter
 from pathlib import Path
 import config
 _COLLECTIVE = config.COLLECTIVE_TERMS
 _GERMAN_STOP_WORDS = {
    "der", "die", "das", "ein", "eine", "einer", "einen", "einem", "eines",
    "und", "oder", "aber", "an", "in", "auf", "für", "mit", "von", "zu",
    "bei", "nach", "vor", "aus", "ist", "sind", "war", "waren", "hat",
    "haben", "wird", "werden", "ich", "du", "er", "sie", "es", "wir",
    "ihr", "ihn", "ihm", "ihnen", "mich", "mir", "dich", "dir",
    "ihre", "ihren", "seinem", "seinen", "seiner", "seine",
    "auch", "nicht", "noch", "dann", "durch", "dem", "den",
    "des", "als", "wie", "dass", "um", "über", "unter", "zwischen",
    "all", "alle", "was", "wer", "wo", "wann", "welche", "welcher",
    "mehr", "sehr", "nur", "schon", "dabei", "dazu",
    "bis", "seit", "gegen", "ohne", "doch", "wenn", "weil",
    "ob", "so", "da", "dort", "hier", "nun", "ja", "nein",
    "ihrer", "ihrem",
    # Contracted prepositions common in German Inhalt summaries
    "im", "am", "ans", "ins", "zum", "zur", "vom", "beim", "sich",
    "hat", "hatte", "wird", "wurde", "wurden", "worden",
    "kann", "konnte", "soll", "sollte", "will", "wollte",
    "ihm", "dieses", "dieser", "diesem", "diesen",
 }
 def _is_correspondence(raw: str) -> bool:
    lower = raw.lower()
    return " an " in lower or lower.startswith("an ") or ".an " in lower
 def _tokenize(text: str) -> list[str]:
    return [t.lower() for t in re.findall(r"[a-zA-ZäöüÄÖÜß]+", text)]
 def _has_collective(tokens: list[str]) -> bool:
    """True when at least one token is one of the configured collective terms."""
    for token in tokens:
        if token in _COLLECTIVE:
            return True
    return False
 def classify_schlagwort(raw: str) -> list[str]:
    """Map a Schlagwort cell to at most one tag path.

    - blank cell: no tag;
    - not a correspondence ("… an …") phrase: ``Themen/<raw>``;
    - correspondence phrase mentioning a collective term: ``Briefwechsel/<raw>``;
    - any other correspondence phrase: no tag.
    """
    if not raw or not raw.strip():
        return []
    if _is_correspondence(raw):
        return [f"Briefwechsel/{raw}"] if _has_collective(_tokenize(raw)) else []
    return [f"Themen/{raw}"]
 def mine_summary_candidates(summaries: list[str]) -> list[tuple[str, int]]:
    """Count candidate theme words across all summaries, most frequent first.

    Summaries are lower-cased and split on commas, semicolons and whitespace; every piece
    is reduced to its letters. Words shorter than two letters and German stop words are
    ignored.
    """
    def candidate_words():
        for summary in summaries:
            for piece in re.split(r"[,;\s]+", summary.lower()):
                word = re.sub(r"[^a-zA-ZäöüÄÖÜß]", "", piece)
                if len(word) < 2 or word in _GERMAN_STOP_WORDS:
                    continue
                yield word

    return Counter(candidate_words()).most_common()
 def load_approved_themes(path: Path) -> set[str]:
    """Read the approved theme words from the ``candidate`` column of a CSV file.

    Returns an empty set when the file does not exist. Values are stripped and
    lower-cased; cells that are empty after stripping are ignored, so the empty string
    never becomes a theme.
    """
    if not path.exists():
        return set()
    themes: set[str] = set()
    # utf-8-sig reads plain UTF-8 as well as files saved with a byte-order mark; with plain
    # "utf-8" a BOM would become part of the header name and every row would be skipped.
    with open(path, newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            candidate = (row.get("candidate") or "").strip().lower()
            if candidate:
                themes.add(candidate)
    return themes
 def apply_approved_themes(summary: str, themes: set[str]) -> list[str]:
    """Return ``Themen/<theme>`` for every approved theme that occurs in ``summary`` as a whole word.

    Matching is case-insensitive on the summary side (themes are expected in lower case).
    Themes are visited in sorted order so the output does not depend on set iteration
    order, and empty themes are skipped because they would match every summary.
    """
    lower = summary.lower()
    return [
        f"Themen/{theme}"
        for theme in sorted(themes)
        if theme and re.search(r"\b" + re.escape(theme) + r"\b", lower)
    ]
 def generate_tags(schlagwort: str, summary: str, themes: set[str]) -> list[str]:
    """Combine the Schlagwort tag (if any) with the approved-theme tags found in the summary.

    Theme tags are only looked up when both a summary and a non-empty theme set are given.
    """
    tags = classify_schlagwort(schlagwort or "")
    if not (summary and themes):
        return tags
    return tags + apply_approved_themes(summary, themes)
 def encode_tags(tag_list: list[str]) -> str:
    """Serialize tag paths into one pipe-separated cell value."""
    separator = "|"
    return separator.join(tag_list)
 def build_tag_tree(all_tag_paths: list[str]) -> list[dict]:
    """Flatten tag paths into rows: all root tags first, then all child tags.

    A path is split at its first "/" into parent and child; the parent is implicitly
    added as a root. Duplicates are dropped and both groups keep first-appearance order.
    Every row has the keys ``tag_path``, ``parent_name`` and ``tag_name``.
    """
    root_names: list[str] = []
    known_roots: set[str] = set()
    child_rows: list[dict] = []
    for path in dict.fromkeys(all_tag_paths):
        parent, slash, child = path.partition("/")
        root = parent if slash else path
        if root not in known_roots:
            known_roots.add(root)
            root_names.append(root)
        if slash:
            child_rows.append({"tag_path": path, "parent_name": parent, "tag_name": child})
    root_rows = [{"tag_path": name, "parent_name": "", "tag_name": name} for name in root_names]
    return root_rows + child_rows
--- a/tools/import-normalizer/tests/init.py
+++ b/tools/import-normalizer/tests/init.py
--- a/tools/import-normalizer/tests/test_config.py
+++ b/tools/import-normalizer/tests/test_config.py
@@ -0,0 +1,20 @@
 import config
 def test_century_boundaries():
    """The two-digit-year cut-offs keep their expected values."""
    expected = {"TWO_DIGIT_19XX_MAX": 57, "TWO_DIGIT_18XX_MIN": 73}
    for attribute, value in expected.items():
        assert getattr(config, attribute) == value
 def test_header_maps_cover_required_fields():
    """Each header map produces the canonical field its importer cannot work without."""
    required = (
        (config.DOCUMENT_HEADER_MAP, "index"),
        (config.PERSON_HEADER_MAP, "last_name"),
    )
    for header_map, canonical_field in required:
        assert canonical_field in header_map.values()
 def test_feast_tables_present():
    """Spot-check one entry of the movable-feast table and one of the season table."""
    pentecost_offset = config.MOVABLE_FEASTS["pfingsten"]
    autumn_month = config.SEASON_MONTHS["herbst"]
    assert pentecost_offset == 49
    assert autumn_month == 10
 def test_name_classification_tables():
    """The name-classification tables contain their representative entries."""
    representative_entries = (
        ("tante", config.RELATIONAL_TERMS),
        ("familie", config.COLLECTIVE_TERMS),
        ("unbekannt", config.UNKNOWN_NAME_MARKERS),
        ("anita", config.EXTRA_GIVEN_NAMES),
    )
    for entry, table in representative_entries:
        assert entry in table
    assert config.PROSE_MAX_LEN >= 30
--- a/tools/import-normalizer/tests/test_dates.py
+++ b/tools/import-normalizer/tests/test_dates.py
@@ -0,0 +1,148 @@
 import datetime
 import dates
 from dates import Precision
 def test_easter_known_years():
    """dates.easter must reproduce four known Easter Sundays."""
    # Anonymous Gregorian algorithm — verified against published tables
    known_easters = {
        2024: (3, 31),
        2000: (4, 23),
        1922: (4, 16),
        1888: (4, 1),
    }
    for year, (month, day) in known_easters.items():
        assert dates.easter(year) == datetime.date(year, month, day)
 def test_resolve_feast_movable():
    """Movable feasts resolve to a concrete day with DAY precision."""
    cases = [
        ("Pfingsten", 1922, "1922-06-04"),
        ("Ostern", 2024, "2024-03-31"),
        ("Pfingstmontag", 1922, "1922-06-05"),
    ]
    for token, year, iso in cases:
        assert dates.resolve_feast_or_season(token, year) == (iso, Precision.DAY)
 def test_resolve_feast_fixed():
    """Fixed-date feasts resolve to their calendar day with DAY precision."""
    cases = [
        ("Weihnachten", 1900, "1900-12-25"),
        ("Neujahr", 1910, "1910-01-01"),
    ]
    for token, year, iso in cases:
        assert dates.resolve_feast_or_season(token, year) == (iso, Precision.DAY)
 def test_resolve_season():
    """Season names resolve to the first day of a month with SEASON precision."""
    cases = [
        ("Herbst", 1913, "1913-10-01"),
        ("Sommer", 1910, "1910-07-01"),
    ]
    for token, year, iso in cases:
        assert dates.resolve_feast_or_season(token, year) == (iso, Precision.SEASON)
 def test_resolve_unknown_token_returns_none():
    """A token that is neither feast nor season (here a weekday) resolves to None."""
    resolved = dates.resolve_feast_or_season("Freitag", 1919)
    assert resolved is None
 def test_expand_year():
    """expand_year maps 2/3/4-digit year strings to a full year, or None when rejected."""
    expectations = [
        ("1888", 1888),
        ("889", 1889),     # 3-digit -> 1DDD
        ("923", 1923),
        ("08", 1908),      # 00..57 -> 19xx
        ("17", 1917),
        ("57", 1957),
        ("73", 1873),      # 73..99 -> 18xx
        ("99", 1899),
        ("65", None),      # 58..72 ambiguous
        ("9003", None),    # implausible 4-digit year -> reject (typo)
        ("x", None),
    ]
    for text, expected_year in expectations:
        expanded = dates.expand_year(text)
        if expected_year is None:
            assert expanded is None
        else:
            assert expanded == expected_year
 def test_parse_iso_and_empty():
    """ISO input passes through as DAY; empty and "?" input become UNKNOWN with no ISO date."""
    cases = (
        ("1910-04-23", "1910-04-23", Precision.DAY),
        ("", None, Precision.UNKNOWN),
        ("?", None, Precision.UNKNOWN),
    )
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
 def test_parse_numeric_forms():
    """Numeric day/month/year spellings (dot, slash, short years) parse to the expected ISO date."""
    iso_by_raw = {
        "15.2.1888": "1888-02-15",
        "13.5.09": "1909-05-13",
        "17/6. 1916": "1916-06-17",
        "11.10.08": "1908-10-11",
        "30.1.889": "1889-01-30",
    }
    for raw, iso in iso_by_raw.items():
        assert dates.parse_date(raw).iso == iso
    full_date = dates.parse_date("15.2.1888")
    assert full_date.precision == Precision.DAY
 def test_parse_numeric_unparseable():
    """Numeric dates lacking a usable year end up with UNKNOWN precision."""
    unparseable = (
        "8.9.",      # no year
        "13.5.65",   # ambiguous 2-digit year
    )
    for raw in unparseable:
        assert dates.parse_date(raw).precision == Precision.UNKNOWN
 def test_parse_approx_marker_upgrades_precision():
    """A "(?)" marker inside an otherwise parseable date yields APPROX and keeps the raw text."""
    raw = "17.Nov (?) 1887"  # month-name matcher now active; (?) marks approx
    parsed = dates.parse_date(raw)
    assert parsed.raw == raw
    # month-name matcher parses date; (?) upgrades to APPROX
    assert parsed.precision == Precision.APPROX
 def test_parse_leading_qualifier_is_approx():
    """A leading qualifier ("nach") is stripped; the date is salvaged with APPROX precision."""
    # qualifier stripped, numeric date salvaged, precision APPROX
    parsed = dates.parse_date("nach 1.5.1900")
    expected_iso = "1900-05-01"
    assert parsed.iso == expected_iso
    assert parsed.precision == Precision.APPROX
 def test_parse_roman_months():
    """Roman-numeral months parse to DAY-precision dates."""
    iso_by_raw = {
        "22.III.18": "1918-03-22",
        "19.XII.1954": "1954-12-19",
        "1.III.27": "1927-03-01",
    }
    for raw, iso in iso_by_raw.items():
        assert dates.parse_date(raw).iso == iso
    short_year = dates.parse_date("22.III.18")
    assert short_year.precision == Precision.DAY
 def test_parse_monthname_day_first():
    """Day-first dates with spelled-out or abbreviated month names parse to the right ISO date."""
    iso_by_raw = {
        "6.März 1888": "1888-03-06",
        "29.Sept.1891": "1891-09-29",
        "10.Oct.95": "1895-10-10",
        "9.December1889": "1889-12-09",
        "18.Dez.1916": "1916-12-18",
        "4Dezember 1936": "1936-12-04",
        "25 August 1968": "1968-08-25",
    }
    for raw, iso in iso_by_raw.items():
        assert dates.parse_date(raw).iso == iso
 def test_parse_month_year_year_only():
    """Month+year input gets MONTH precision, a bare year gets YEAR precision; both anchor to day 1."""
    cases = (
        ("Mai 1895", "1895-05-01", Precision.MONTH),
        ("October 1903", "1903-10-01", Precision.MONTH),
        ("1905", "1905-01-01", Precision.YEAR),
    )
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
 def test_parse_feast_and_season_via_parse_date():
    """parse_date handles "<feast|season> <year>" strings end to end."""
    cases = (
        ("Pfingsten 1922", "1922-06-04", Precision.DAY),
        ("Herbst 1913", "1913-10-01", Precision.SEASON),
    )
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
    whit_sunday = dates.parse_date("Pfingstsonntag 1915")
    assert whit_sunday.precision == Precision.DAY
 def test_parse_ranges():
    """Date ranges resolve to their start date with RANGE precision; a trailing "?" makes it APPROX."""
    full_range = "8.1.1916 - 15.3.1916"
    assert dates.parse_date(full_range) == dates.ParsedDate("1916-01-08", Precision.RANGE, full_range)
    year_pair = "1881/82"
    assert dates.parse_date(year_pair) == dates.ParsedDate("1881-01-01", Precision.RANGE, year_pair)
    uncertain_pair = "1945/46?"  # '?' stripped -> RANGE, then APPROX
    assert dates.parse_date(uncertain_pair).iso == "1945-01-01"
    assert dates.parse_date(uncertain_pair).precision == Precision.APPROX
 def test_parse_approx_full():
    """A date with an embedded "(?)" still yields its ISO date, marked APPROX."""
    parsed = dates.parse_date("17.Nov (?) 1887")
    expected_iso = "1887-11-17"
    assert parsed.iso == expected_iso
    assert parsed.precision == Precision.APPROX
 def test_parse_english_month_first_now_works():
    """Month-first "April 12. 1922" parses, without hijacking plain month+year input."""
    iso_by_raw = {
        "April 12. 1922": "1922-04-12",
        "Mai 1895": "1895-05-01",   # not shadowed by month-first matcher
    }
    for raw, iso in iso_by_raw.items():
        assert dates.parse_date(raw).iso == iso
 def test_parse_unparseable_examples():
    """A weekday plus year is not a date and must come back as UNKNOWN."""
    parsed = dates.parse_date("Freitag 1919")
    assert parsed.precision == Precision.UNKNOWN
 def test_parse_invalid_calendar_date_is_unknown():
    """Calendar-impossible dates (30 Feb, 31 Apr) must be UNKNOWN rather than clamped."""
    # try/except ValueError in the matchers must route impossible dates to UNKNOWN (-> review),
    # never silently clamp. This is the most likely real-data bug class at 7,600 rows.
    impossible_dates = ("30.2.1888", "31.4.1916")
    for raw in impossible_dates:
        assert dates.parse_date(raw).precision == Precision.UNKNOWN
 def test_parse_intra_month_day_range():
    """A "7./8." day range is a RANGE from the first day; a "17/6." slash date stays a DAY."""
    # "7./8. Sept.1923" -> start day, RANGE. Must NOT be confused with slash-date "17/6. 1916".
    cases = (
        ("7./8. Sept.1923", "1923-09-07", Precision.RANGE),
        ("17/6. 1916", "1916-06-17", Precision.DAY),
    )
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
 def test_parse_trailing_note_stripped_but_raw_preserved():
    """A trailing note after the date does not block parsing and survives in .raw."""
    parsed = dates.parse_date("17.Nov 1887, 2. Brief")  # REQ-DATE-04
    expected_iso = "1887-11-17"
    assert parsed.iso == expected_iso
    # original string preserved verbatim
    assert "2. Brief" in parsed.raw
 def test_parse_date_override_wins():
    """An override entry for the raw string decides the result, even for an ambiguous date."""
    raw = "13.5.65"  # ambiguous without override
    overrides = {raw: ("1965-05-13", "DAY")}
    parsed = dates.parse_date(raw, overrides)
    expected = dates.ParsedDate("1965-05-13", Precision.DAY, raw)
    assert parsed == expected
 def test_parse_spanish_months():
    """Spanish month names parse in day-first and month-first (dot or hyphen) layouts."""
    # Mexican-branch letters: Spanish month names, day-first and month-first (hyphen/dot before year)
    cases = [
        ("21.Enero 1911", "1911-01-21"),    # day-first
        ("Junio 17.929", "1929-06-17"),     # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),     # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),     # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),    # hyphen, 2-digit year
    ]
    for raw, iso in cases:
        assert dates.parse_date(raw).iso == iso
    hyphenated = dates.parse_date("Mayo 18-1929")
    assert hyphenated.precision == Precision.DAY
 def test_implausible_year_goes_to_review():
    """A typo year such as 9003 must leave the date UNKNOWN instead of parsing."""
    # a source typo like "October 23-9003" must NOT parse to a bogus year 9003 — stays UNKNOWN
    parsed = dates.parse_date("October 23-9003")
    assert parsed.precision == Precision.UNKNOWN
 def test_hyphen_month_first_does_not_shadow_month_year():
    """Plain "Mai 1895" must stay a MONTH-precision date."""
    # the hyphen-separator generalization must NOT make "Mai 1895" parse as day=18
    raw = "Mai 1895"
    expected = dates.ParsedDate("1895-05-01", Precision.MONTH, raw)
    assert dates.parse_date(raw) == expected
--- a/tools/import-normalizer/tests/test_documents.py
+++ b/tools/import-normalizer/tests/test_documents.py
@@ -0,0 +1,109 @@
 import persons
 import documents
 from documents import Triage
 def test_extract_row():
    """extract_row maps positional cells onto named fields and records the source row."""
    field_order = ["index", "file", "box", "folder", "sender",
                   "receivers", "date", "location", "tags", "summary"]
    header = {field: position for position, field in enumerate(field_order)}
    cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
             "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]
    raw = documents.extract_row(cells, header, source_row=3)
    expected_fields = (
        ("index", "W-0001"),
        ("sender", "Walter de Gruyter"),
        ("date", "15.2.1888"),
        ("source_row", 3),
    )
    for attribute, value in expected_fields:
        assert getattr(raw, attribute) == value
 def test_triage():
    """triage classifies a row as EMPTY, BLANK_INDEX, X_SUFFIX or OK from its cells."""
    cases = [
        (["", "", ""], Triage.EMPTY),
        (["", "", "Walter"], Triage.BLANK_INDEX),   # data but no index
        (["W-0001x", "x"], Triage.X_SUFFIX),
        (["W-0001", "x"], Triage.OK),
    ]
    for cells, verdict in cases:
        assert documents.triage(cells) == verdict
 def test_classify_blank_index():
    """Index-less rows are told apart as a section banner or as data missing its index."""
    header = {"sender": 4, "receivers": 5}
    rows_by_label = {
        "section_banner": ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""],
        "data_no_index": ["", "", "V", "1", "", "Eugenie"],
    }
    for label, cells in rows_by_label.items():
        assert documents.classify_blank_index(cells, header) == label
 def test_index_file_mismatch():
    """index_file_mismatch is True only when the file name disagrees with the index."""
    cases = [
        ("W-0010x", r"..\__scan\W-0011x.pdf", True),
        ("W-0001", r"..\__scan\W-0001.pdf", False),
        ("W-0001", "", False),
        ("W-0001", "scans/W-0001.pdf", False),   # unix path
        ("W-0001", "W-0001.pdf", False),          # no dir
    ]
    for index, file_ref, mismatch in cases:
        assert documents.index_file_mismatch(index, file_ref) is mismatch
 def _ctx():
    """Return a ResolutionContext over a two-person register and no name overrides."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
    ]
    alias_index = persons.AliasIndex(persons.parse_register(register_rows))
    return persons.ResolutionContext(alias_index, name_overrides={})
 def test_to_canonical_resolves_and_flags():
    """A fully resolvable row becomes a canonical document with no review flags."""
    ctx = _ctx()
    raw = documents.RawRow(
        source_row=3, index="W-0001", box="V", folder="1",
        sender="Walter de Gruyter", receivers="Eugenie Müller",
        date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
        summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf",
    )
    doc = documents.to_canonical(raw, ctx, date_overrides={})
    assert doc.sender_person_id == "de-gruyter-walter"
    # matched via maiden alias
    assert doc.receiver_person_ids == ["de-gruyter-eugenie"]
    assert doc.date_iso == "1888-02-15"
    assert doc.date_precision == "DAY"
    assert doc.tags == ["Themen/Brautbriefe"]
    assert doc.needs_review == []
 def test_to_canonical_unmatched_and_unparsed():
    """An unknown sender gets a provisional id and both problems are flagged for review."""
    ctx = _ctx()
    raw = documents.RawRow(
        source_row=9, index="C-0001",
        sender="Hans Wittkopf", receivers="", date="Freitag 1919",
    )
    doc = documents.to_canonical(raw, ctx, date_overrides={})
    assert doc.sender_person_id == "wittkopf-hans"            # provisional
    for flag in ("unmatched_sender", "unparsed_date"):
        assert flag in doc.needs_review
    assert ctx.unmatched["Hans Wittkopf"] == [9]
    assert any(person.provisional for person in ctx.provisional.values())
 def test_to_canonical_splits_multi_sender():
    """A sender cell naming two people keeps the first as sender and flags multi_sender."""
    # REQ-PERS-01 / IMP-11: a multi-person sender is parsed, primary kept, flagged.
    ctx = _ctx()
    raw = documents.RawRow(
        source_row=5, index="C-0100",
        sender="Walter und Eugenie de Gruyter", receivers="",
    )
    doc = documents.to_canonical(raw, ctx, date_overrides={})
    # first part is primary
    assert doc.sender_person_id == "de-gruyter-walter"
    assert "multi_sender" in doc.needs_review
 def test_provisional_id_never_collides_with_register():
    """A provisional person whose slug equals a register id must receive a suffixed id."""
    # A provisional built from an unmatched string must not steal a register person_id.
    register_rows = [{"last_name": "Xyz", "first_name": "Abc"}]  # id "xyz-abc"
    people = persons.parse_register(register_rows)
    alias_index = persons.AliasIndex(people)
    ctx = persons.ResolutionContext(alias_index, name_overrides={})
    # "Abc, Xyz" misses the alias index (the comma changes the normalized key) but its
    # provisional slug is "xyz-abc" — already the register person's id, so it MUST be suffixed.
    resolved_id, _, was_matched = ctx.resolve_one("Abc, Xyz", source_row=1)
    assert was_matched is False
    assert "xyz-abc" in ctx.index.known_ids
    # suffixed away from the register id, not reused
    assert resolved_id == "xyz-abc-2"
 def test_resolve_one_override_increments_hits():
    """A name override resolves to its target person and is counted in override_hits."""
    people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Eugenie"}])
    overrides = {"Genie": "de-gruyter-eugenie"}
    ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides=overrides)
    resolved_id, display_name, was_matched = ctx.resolve_one("Genie", source_row=1)
    assert resolved_id == "de-gruyter-eugenie"
    assert was_matched is True
    # display comes from the alias index
    assert display_name == "Eugenie de Gruyter"
    assert ctx.override_hits == 1
 def test_ambiguous_pair_recorded_in_unresolved():
    """Two given names in a row stay one provisional person, logged as ambiguous_pair."""
    people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}])
    ctx = persons.ResolutionContext(
        persons.AliasIndex(people),
        name_overrides={},
        given_names={"ella", "anita"},
    )
    raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")
    doc = documents.to_canonical(raw, ctx, date_overrides={})
    # not split — one provisional
    assert len(doc.receiver_person_ids) == 1
    assert any(
        entry_name == "Ella Anita" and category == "ambiguous_pair"
        for entry_name, category, _ in ctx.unresolved
    )
 def test_resolvable_first_surname_pair_not_unresolved():
    """A first-name + surname string is RESOLVABLE and must not land in ctx.unresolved."""
    empty_index = persons.AliasIndex([])
    ctx = persons.ResolutionContext(empty_index, name_overrides={},
                                    given_names={"ella", "anita"})
    # surname is not a given name
    ctx.resolve_one("Mieze Schefold", source_row=1)
    # RESOLVABLE -> not recorded
    assert ctx.unresolved == []
--- a/tools/import-normalizer/tests/test_ingest.py
+++ b/tools/import-normalizer/tests/test_ingest.py
@@ -0,0 +1,46 @@
 import datetime
 import openpyxl
 import pytest
 import ingest
 def _make_workbook(tmp_path, sheet_name, rows):
    """Write `rows` to a single-sheet workbook "wb.xlsx" under tmp_path and return its path."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    for row_values in rows:
        sheet.append(row_values)
    target = tmp_path / "wb.xlsx"
    workbook.save(target)
    return target
 def test_read_sheet_converts_cells(tmp_path):
    """read_sheet returns every cell as a string: dates as ISO text, integers as plain digits."""
    source_rows = [
        ["Index", "Datum"],
        ["W-0001", datetime.datetime(1888, 2, 15)],
        ["W-0002", 1],
    ]
    path = _make_workbook(tmp_path, "S", source_rows)
    rows = ingest.read_sheet(path, "S")
    expected_rows = [
        ["Index", "Datum"],
        ["W-0001", "1888-02-15"],   # Excel date -> ISO string
        ["W-0002", "1"],            # integer -> plain string
    ]
    for position, expected in enumerate(expected_rows):
        assert rows[position] == expected
 def test_build_header_map_collapses_whitespace_and_case():
    """Header cells match the field map ignoring case and repeated spaces; others are reported."""
    header = ["Index", "Datum  des Briefes", "EmpfängerIn", "Mystery"]
    field_map = {
        "index": "index",
        "datum des briefes": "date",
        "empfängerin": "receivers",
    }
    fields, unknown = ingest.build_header_map(header, field_map, required={"index"})
    expected_fields = {"index": 0, "date": 1, "receivers": 2}
    assert fields == expected_fields
    assert unknown == ["Mystery"]
 def test_build_header_map_missing_required_raises():
    """A header lacking a required field raises ValueError that names the field."""
    header = ["Box", "Ort"]
    field_map = {"box": "box", "ort": "location"}
    with pytest.raises(ValueError, match="index"):
        ingest.build_header_map(header, field_map, required={"index"})
 def test_read_sheet_bool_not_coerced_to_int(tmp_path):
    """Boolean cells come back as "True"/"False", never as "1"/"0"."""
    path = _make_workbook(tmp_path, "S", [["Flag"], [True], [False]])
    rows = ingest.read_sheet(path, "S")
    # not "1"/"0"
    assert rows[1] == ["True"]
    assert rows[2] == ["False"]
 def test_read_sheet_missing_sheet_raises(tmp_path):
    """Asking for a sheet the workbook lacks raises ValueError mentioning "not found"."""
    workbook_path = _make_workbook(tmp_path, "S", [["A"]])
    with pytest.raises(ValueError, match="not found"):
        ingest.read_sheet(workbook_path, "Nope")
--- a/tools/import-normalizer/tests/test_normalize.py
+++ b/tools/import-normalizer/tests/test_normalize.py
@@ -0,0 +1,121 @@
 import openpyxl
 import normalize
 def _doc_wb(tmp_path):
    """Write the fixture document workbook (sheet "Familienarchiv") and return its path.

    Rows after the header, in order: a normal document, an x-suffixed index,
    a row with no index, a document with an unparseable date and a "?"
    receiver, and a repeat of index W-0001.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Familienarchiv"
    fixture_rows = [
        ["Index", "Datei", "Box", "Mappe", "BriefeschreiberIn", "EmpfängerIn",
         "Datum des Briefes", "Ort", "Schlagwort", "Inhalt"],
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"],
        ["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter", "Eugenie Müller", "", "", "", ""],
        ["", "", "", "", "Section banner row", "", "", "", "", ""],
        ["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""],
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"],
    ]
    for row_values in fixture_rows:
        sheet.append(row_values)
    target = tmp_path / "docs.xlsx"
    workbook.save(target)
    return target
 def _person_wb(tmp_path):
    """Write the fixture person workbook (sheet "Tabelle1", two persons) and return its path."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Tabelle1"
    fixture_rows = [
        ["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
         "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"],
        ["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""],
        ["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""],
    ]
    for row_values in fixture_rows:
        sheet.append(row_values)
    target = tmp_path / "persons.xlsx"
    workbook.save(target)
    return target
 def test_run_end_to_end(tmp_path):
    """Run the pipeline on the fixture workbooks and check outputs, counters and determinism.

    Covers, in order: the canonical workbooks exist, the stats counters match
    the fixture rows, the expected review CSVs exist with the expected
    content, and a second run over the same inputs reproduces the canonical
    workbooks and the unparsed-dates review file exactly.
    """
    out_dir = tmp_path / "out"; review_dir = tmp_path / "review"
    stats = normalize.run(
        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={})
    assert (out_dir / "canonical-documents.xlsx").exists()
    assert (out_dir / "canonical-persons.xlsx").exists()
    assert stats["documents_emitted"] == 3        # W-0001, C-0001, W-0001 (dup) — x and blank excluded
    assert stats["skipped_x_suffix"] == 1
    assert stats["blank_index_rows"] == 1
    assert stats["duplicate_index_rows"] == 2
    assert stats["unresolved_unknown"] >= 1   # the "?" receiver is an UNKNOWN-class name
    assert (review_dir / "skipped-x-suffix.csv").exists()
    assert (review_dir / "unparsed-dates.csv").exists()
    # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01)
    assert "Freitag 1919" in (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8")
    assert (review_dir / "unresolved-names.csv").exists()
    unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8")
    assert "unknown" in unresolved_text and "?" in unresolved_text   # the "?" receiver
    assert not (review_dir / "ambiguous-receivers.csv").exists()      # replaced
    # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files
    def _matrix(p):
        # Load the workbook at p and return the active sheet as a list of rows of cell values.
        wb = openpyxl.load_workbook(p)
        return [[c.value for c in row] for row in wb.active.iter_rows()]
    # Snapshot the first run's outputs BEFORE re-running into the same directories.
    docs1 = _matrix(out_dir / "canonical-documents.xlsx")
    persons1 = _matrix(out_dir / "canonical-persons.xlsx")
    unparsed1 = (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8")
    normalize.run(document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
                  person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
                  out_dir=out_dir, review_dir=review_dir, date_overrides={}, name_overrides={})
    assert _matrix(out_dir / "canonical-documents.xlsx") == docs1
    assert _matrix(out_dir / "canonical-persons.xlsx") == persons1
    assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1
    assert len(docs1) == 4  # header + 3 docs
 def test_tag_tree_output_emitted(tmp_path):
    """A pipeline run writes canonical-tag-tree.xlsx into the output directory."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    run_arguments = dict(
        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={},
    )
    normalize.run(**run_arguments)
    tag_tree_file = out_dir / "canonical-tag-tree.xlsx"
    assert tag_tree_file.exists()
 def test_tag_candidates_review_emitted(tmp_path):
    """A pipeline run writes tag-candidates.csv containing "candidate" and "count"."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    run_arguments = dict(
        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={},
    )
    normalize.run(**run_arguments)
    candidates_file = review_dir / "tag-candidates.csv"
    assert candidates_file.exists()
    text = (review_dir / "tag-candidates.csv").read_text(encoding="utf-8")
    assert "candidate" in text
    assert "count" in text
 def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
    """The Schlagwort "Brautbriefe" appears in the tags column only as "Themen/Brautbriefe"."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    run_arguments = dict(
        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={},
    )
    normalize.run(**run_arguments)
    workbook = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
    sheet = workbook.active
    column_names = [cell.value for cell in sheet[1]]
    tags_column = column_names.index("tags") + 1   # openpyxl columns are 1-based
    tag_values = [
        sheet.cell(row=row_number, column=tags_column).value
        for row_number in range(2, sheet.max_row + 1)
    ]
    assert any(value and "Themen/Brautbriefe" in value for value in tag_values)
    assert not any(value and value.strip() == "Brautbriefe" for value in tag_values)
 def test_approved_themes_applied(tmp_path):
    """An approved theme found in a document's Inhalt is added as a Themen/<theme> tag."""
    themes_file = tmp_path / "approved-themes.csv"
    themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    run_arguments = dict(
        document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={},
        approved_themes_path=themes_file,
    )
    normalize.run(**run_arguments)
    workbook = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx")
    sheet = workbook.active
    column_names = [cell.value for cell in sheet[1]]
    tags_column = column_names.index("tags") + 1   # openpyxl columns are 1-based
    tag_values = [
        sheet.cell(row=row_number, column=tags_column).value
        for row_number in range(2, sheet.max_row + 1)
    ]
    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
    assert any(value and "Themen/geschäftsreise" in value for value in tag_values)
--- a/tools/import-normalizer/tests/test_persons.py
+++ b/tools/import-normalizer/tests/test_persons.py
@@ -0,0 +1,132 @@
 import config
 import persons
 from persons import NameClass
 def test_slugify():
    """slugify builds a lowercase, hyphenated "<last>-<first>" id and transliterates umlauts."""
    cases = [
        ("de Gruyter", "Eugenie", "de-gruyter-eugenie"),
        ("Müller", "Karl Erhard", "mueller-karl-erhard"),
    ]
    for last_name, first_name, slug in cases:
        assert persons.slugify(last_name, first_name) == slug
 def test_parse_register_basic():
    """parse_register normalizes names, dates, nickname vs spouse, and extra given names."""
    blomquist_row = {
        "generation": "G 1", "last_name": "Blomquist", "first_name": "Charlotte,Meta,Jacobi",
        "maiden_name": "Ruge", "birth_date": "30.8.1862", "birth_place": "Schülperneusiel",
        "death_date": "1934-07-23", "death_place": "Göteborg", "spouse": '"Tante Lolly"',
        "notes": "Schwester v Marie Cram",
    }
    bohrmann_row = {
        "generation": "G 2", "last_name": "Bohrmann", "first_name": "Else",
        "maiden_name": "Cram", "birth_date": "28.11.1888", "spouse": "Ludwig Bohrmann",
        "notes": "Schwester v Herbert",
    }
    people = persons.parse_register([blomquist_row, bohrmann_row])
    charlotte = people[0]
    assert charlotte.person_id == "blomquist-charlotte"
    assert charlotte.first_name == "Charlotte"
    assert charlotte.maiden_name == "Ruge"
    assert charlotte.birth_date == "1862-08-30"
    # quoted spouse field is a nickname, not a spouse
    assert charlotte.nickname == "Tante Lolly"
    assert charlotte.spouse == ""
    assert "Meta" in charlotte.extra_given_names
    assert "Jacobi" in charlotte.extra_given_names
    else_person = people[1]
    assert else_person.maiden_name == "Cram"
    assert else_person.spouse == "Ludwig Bohrmann"
    assert else_person.provisional is False
 def test_parse_register_dedups_colliding_ids():
    """Two register rows with the same name both get numbered ids; no bare base id remains."""
    # Two people with the same first+last name: BOTH get a numeric suffix (no ambiguous base id).
    same_name = {"last_name": "Cram", "first_name": "Hans"}
    people = persons.parse_register([dict(same_name), dict(same_name)])
    ids = [person.person_id for person in people]
    assert ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(ids)) == 2
 def test_split_receivers():
    """split_receivers breaks a receiver cell into individual person names."""
    cases = [
        ("Eugenie Müller", ["Eugenie Müller"]),
        ("Walter und Eugenie de Gruyter", ["Walter de Gruyter", "Eugenie de Gruyter"]),
        ("Hedi und Tutu (Gruber)", ["Hedi Gruber", "Tutu Gruber"]),
        ("Clara u Familie", ["Clara"]),
        ("Eugenie de Gruyter geb. Müller", ["Eugenie de Gruyter"]),
        ("Herbert u Clara", ["Herbert", "Clara"]),
        ("", []),
        ("geb. Müller", []),                          # maiden-only cell -> no person
        ("Herbert//Clara", ["Herbert", "Clara"]),     # // separator
    ]
    for cell, names in cases:
        assert persons.split_receivers(cell) == names
 def test_find_known_last_name():
    """A known surname is extracted from a full name; a lone first name yields None."""
    surname = persons.find_known_last_name("Eugenie de Gruyter")
    assert surname == "de Gruyter"
    no_surname = persons.find_known_last_name("Clara")
    assert no_surname is None
 def test_alias_index_resolves_maiden_and_married():
    """The alias index resolves married, maiden and loosely formatted spellings to one person."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ]
    people = persons.parse_register(register_rows)
    idx = persons.AliasIndex(people)
    eugenie = people[0].person_id
    spellings = (
        "Eugenie de Gruyter",   # canonical
        "Eugenie Müller",       # maiden alias
        "eugenie  müller",      # normalized
    )
    for spelling in spellings:
        assert idx.resolve(spelling) == eugenie
    assert idx.resolve("Nobody Unknown") is None
 def test_alias_index_suggestion():
    """A near-miss spelling is suggested with a score at or above the fuzzy threshold."""
    people = persons.parse_register([{"last_name": "Wittkopf", "first_name": "Hans"}])
    idx = persons.AliasIndex(people)
    suggested_id, score = idx.suggest("Hans Wittkop")  # typo
    assert suggested_id == people[0].person_id
    assert score >= config.FUZZY_SUGGEST_THRESHOLD
 def test_alias_index_first_name_only_when_unambiguous():
    """A bare first name resolves only when exactly one register person carries it."""
    register_rows = [
        {"last_name": "Cram", "first_name": "Clara"},
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "Cram", "first_name": "Walter"},  # 2nd "Walter" -> first name ambiguous
    ]
    people = persons.parse_register(register_rows)
    idx = persons.AliasIndex(people)
    clara_id = people[0].person_id
    assert idx.resolve("Clara") == clara_id      # unique first name resolves
    assert idx.resolve("Walter") is None         # ambiguous first name does NOT resolve
    assert idx.display(people[0].person_id) == "Clara Cram"
 # Lowercase given names shared by the classify_name tests (passed as its second argument).
 GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
 def test_classify_unknown():
    """Question marks and "unbekannt" classify as UNKNOWN."""
    for text in ("?", "A. Kredell?", "unbekannt"):
        assert persons.classify_name(text, GIVEN) is NameClass.UNKNOWN
 def test_classify_prose():
    """Descriptive text, or names carrying digits or quotes, classify as PROSE."""
    prose_examples = (
        "Adressenliste v Clara Cram zur Kondolenz",
        "Clara de Gruyter(*1871)",    # digit
        '"Cramiade" Gedicht',         # quote
    )
    for text in prose_examples:
        assert persons.classify_name(text, GIVEN) is NameClass.PROSE
    assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE        # quote
 def test_classify_collective():
    assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE
 def test_classify_relational():
    """Names prefixed with a kinship term classify as RELATIONAL."""
    for text in ("Cousine Emmy Haniel", "Schwester Hanni"):
        assert persons.classify_name(text, GIVEN) is NameClass.RELATIONAL
 def test_classify_single_token():
    """A lone name or bare initials classify as SINGLE_TOKEN."""
    for text in ("Agnes", "A.B."):
        assert persons.classify_name(text, GIVEN) is NameClass.SINGLE_TOKEN
 def test_classify_ambiguous_pair():
    """Two tokens that are both known given names classify as AMBIGUOUS_PAIR."""
    for text in ("Ella Anita", "Kurt Georg"):
        assert persons.classify_name(text, GIVEN) is NameClass.AMBIGUOUS_PAIR
 def test_classify_resolvable_single_person():
    """First name plus a surname that is not a given name classifies as RESOLVABLE."""
    # first + surname (surname not a given name) -> one real person, NOT ambiguous
    for text in ("Mieze Schefold", "Adolf Butenandt"):
        assert persons.classify_name(text, GIVEN) is NameClass.RESOLVABLE
 def test_build_given_names():
    """build_given_names collects lowercase given names from the register and the extra set."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Eugenie"},
        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
    ]
    people = persons.parse_register(register_rows)
    given = persons.build_given_names(people, {"Anita"})
    assert "eugenie" in given
    # primary + extra given names
    assert "charlotte" in given
    assert "meta" in given
    # from the extra set, normalized
    assert "anita" in given
    assert "schefold" not in given
--- a/tools/import-normalizer/tests/test_persons_tree.py
+++ b/tools/import-normalizer/tests/test_persons_tree.py
@@ -0,0 +1,457 @@
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import persons_tree
 def test_parse_year_iso_string():
    """An ISO ``YYYY-MM-DD`` string yields its year."""
    year = persons_tree._parse_year("1920-09-20")
    assert year == 1920
 def test_parse_year_excel_serial_birth():
    """A numeric Excel date serial (as text) is converted to its calendar year."""
    # 7568 days after the 1899-12-30 epoch falls in September 1920
    year = persons_tree._parse_year("7568")
    assert year == 1920
 def test_parse_year_excel_serial_death():
    """A five-digit Excel date serial resolves to a late-20th-century year."""
    # 36222 days after the 1899-12-30 epoch lands in 1999
    year = persons_tree._parse_year("36222")
    assert year == 1999
 def test_parse_year_excel_serial_small():
    """A small (three-digit) Excel date serial is still treated as a serial."""
    # 177 days after the 1899-12-30 epoch = 1900-06-25
    year = persons_tree._parse_year("177")
    assert year == 1900
 def test_parse_year_german_date_string():
    """A German ``D.M.YYYY`` date string yields its year."""
    year = persons_tree._parse_year("30.8.1862")
    assert year == 1862
 def test_parse_year_year_only():
    """A bare four-digit year in range is returned as that year."""
    year = persons_tree._parse_year("1930")
    assert year == 1930
 def test_parse_year_free_text():
    """A year embedded in free text ("August 1941") is extracted."""
    year = persons_tree._parse_year("August 1941")
    assert year == 1941
 def test_parse_year_none():
    """``None`` input yields ``None``."""
    year = persons_tree._parse_year(None)
    assert year is None
 def test_parse_year_empty():
    """An empty string yields ``None``."""
    year = persons_tree._parse_year("")
    assert year is None
 def test_parse_year_unresolvable_truncated():
    """A truncated date without a valid 4-digit year yields ``None``."""
    # "2.9.196" carries only three year digits
    year = persons_tree._parse_year("2.9.196")
    assert year is None
 def test_parse_year_typo_year():
    """A date whose year falls outside the 1700-2100 guard yields ``None``."""
    # "4.3.1023": year 1023 is out of range
    year = persons_tree._parse_year("4.3.1023")
    assert year is None
 def test_parse_year_bare_out_of_range_year_is_none():
    """A bare out-of-range year must not be re-read as an Excel serial."""
    # "1023" is a plausible typo for "1923" but is NOT an Excel serial.
    # parse_date("1023") reads it as year 1023, which the 1700-2100 guard rejects;
    # the serial branch must not pick it up afterwards.
    year = persons_tree._parse_year("1023")
    assert year is None
 def test_parse_generation_space():
    """"G 3" (with a space) parses to generation 3."""
    generation = persons_tree._parse_generation("G 3")
    assert generation == 3
 def test_parse_generation_no_space():
    """"G3" (no space) parses to generation 3."""
    generation = persons_tree._parse_generation("G3")
    assert generation == 3
 def test_parse_generation_extra_spaces():
    """Multiple spaces between "G" and the digit are tolerated; 0 is a valid generation."""
    generation = persons_tree._parse_generation("G  0")
    assert generation == 0
 def test_parse_generation_trailing_garbage():
    """Text following the generation marker is ignored."""
    generation = persons_tree._parse_generation("G 2         de Gruyter")
    assert generation == 2
 def test_parse_generation_empty():
    """An empty string yields ``None``."""
    generation = persons_tree._parse_generation("")
    assert generation is None
 def test_parse_generation_none():
    """``None`` input yields ``None``."""
    generation = persons_tree._parse_generation(None)
    assert generation is None
 def test_norm_tree_basic():
    """A plain name is lower-cased."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
 def test_norm_tree_diacritics():
    """An umlaut is transliterated ("ö" -> "oe")."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
 def test_norm_tree_strips_parens():
    """A parenthesised part is removed together with its content."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
 def test_norm_tree_strips_quotes():
    """Surrounding double quotes are removed; the quoted text is kept."""
    normalized = persons_tree._norm_tree('"Tante Lolly"')
    assert normalized == "tante lolly"
 def test_norm_tree_strips_geographic_suffix():
    """A trailing place name ("Aachen") is dropped from the normalized name."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
 def test_norm_tree_strips_mexiko():
    """A trailing "Mexiko" is dropped from the normalized name."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
 def test_norm_tree_collapses_whitespace():
    """Leading/trailing whitespace is trimmed and inner runs collapse to one space."""
    normalized = persons_tree._norm_tree("  Clara   de Gruyter  ")
    assert normalized == "clara de gruyter"
 def test_build_index_forward_lookup():
    """The index maps "first last" to the person's row id."""
    werner = {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    assert "werner allemeyer" in index
    assert index["werner allemeyer"] == ["row_002"]
 def test_build_index_reversed_lookup():
    """The index also maps the reversed "last first" order to the row id."""
    werner = {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    assert index.get("allemeyer werner") == ["row_002"]
 def test_build_index_maiden_name_lookup():
    """A person can be found via first name + (normalized) maiden name."""
    elsgard = {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}
    index = persons_tree._build_index([elsgard])
    assert index.get("elsgard woehler") == ["row_002"]
 def test_build_index_single_token_fallback():
    """The bare surname alone is indexed as a fallback key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert])
    assert index.get("cram") == ["row_028"]
 def test_build_index_ambiguous_single_token():
    """A surname shared by two persons lists both row ids under the single-token key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    assert set(index["cram"]) == {"row_028", "row_019"}
 def test_resolve_one_found():
    """A uniquely indexed name resolves to its row id with no failure reason."""
    werner = {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    resolved = persons_tree._resolve_one("Allemeyer Werner", index)
    row_id, reason = resolved
    assert row_id == "row_003"
    assert reason is None
 def test_resolve_one_not_found():
    """A name missing from the index yields ``(None, "not_found")``."""
    empty_index = {}
    row_id, reason = persons_tree._resolve_one("Nobody Unknown", empty_index)
    assert row_id is None
    assert reason == "not_found"
 def test_resolve_one_ambiguous():
    """A key that matches several persons yields ``(None, "ambiguous")``."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    row_id, reason = persons_tree._resolve_one("Cram", index)
    assert row_id is None
    assert reason == "ambiguous"
 def test_parse_row_serial_dates():
    """A row with Excel-serial dates is parsed into the full person record."""
    fields = {
        "generation": "G 3", "last_name": "Allemeyer", "first_name": "Elsgard",
        "maiden_name": "Wöhler", "birth_date": "7568", "birth_place": "Garz",
        "death_date": "36222", "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner", "notes": "Nichte von Herbert",
    }
    p = persons_tree._parse_row(2, fields)
    expected = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
        "birthYear": 1920,
        "deathYear": 1999,
        "birthPlace": "Garz",
        "deathPlace": "Espelkamp",
        "generation": 3,
    }
    for key, value in expected.items():
        assert p[key] == value
    assert p["familyMember"] is True
    # raw spouse / remark text is carried along for the later resolution passes
    assert p["_spouse_raw"] == "Allemeyer Werner"
    assert p["_bemerkung_raw"] == "Nichte von Herbert"
    # the birth date resolved, so no raw-date marker may be added to the notes
    assert "[Geburtsdatum" not in (p["notes"] or "")
 def test_parse_row_string_birth_date():
    """A German date string gives the birth year; blank death date and notes stay empty."""
    fields = {
        "generation": "G 2", "last_name": "Cram", "first_name": "Herbert",
        "maiden_name": "", "birth_date": "25.6.1890", "birth_place": "Texas",
        "death_date": "", "death_place": "", "spouse": "", "notes": "",
    }
    person = persons_tree._parse_row(28, fields)
    assert person["birthYear"] == 1890
    assert person["deathYear"] is None
    notes = person["notes"]
    assert notes is None or notes == ""
 def test_parse_row_unresolvable_date_goes_to_notes():
    """An unparseable birth date is preserved in the notes next to the original remark."""
    fields = {
        "generation": "G 3", "last_name": "Heydrich", "first_name": "Dieter",
        "maiden_name": "", "birth_date": "28.9.", "birth_place": "",
        "death_date": "", "death_place": "", "spouse": "", "notes": "Bruder v Ingrid",
    }
    person = persons_tree._parse_row(96, fields)
    assert person["birthYear"] is None
    notes = person["notes"]
    assert "[Geburtsdatum: 28.9.]" in notes
    assert "Bruder v Ingrid" in notes
 def test_parse_row_empty_spouse_and_notes():
    """Blank spouse and notes cells become ``None`` in the raw carry-over fields."""
    fields = {
        "generation": "G 4", "last_name": "Allemeyer", "first_name": "Jürgen",
        "maiden_name": "", "birth_date": "", "birth_place": "",
        "death_date": "", "death_place": "", "spouse": "", "notes": "",
    }
    person = persons_tree._parse_row(4, fields)
    for raw_key in ("_spouse_raw", "_bemerkung_raw"):
        assert person[raw_key] is None
 def test_deduplicate_no_duplicates():
    """Distinct persons all survive and nothing is reported as skipped."""
    elsgard = {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920}
    werner = {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923}
    kept, skipped = persons_tree._deduplicate([elsgard, werner])
    assert len(kept) == 2
    assert skipped == []
 def test_deduplicate_exact_match():
    """Same firstName, lastName and birthYear: the first row wins, the later one is skipped."""
    # mirrors source rows 127/138
    first = {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}
    repeat = {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}
    kept, skipped = persons_tree._deduplicate([first, repeat])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_127"]
    assert len(skipped) == 1
    # the skip entry must mention the dropped row
    assert "row_138" in skipped[0]
 def test_deduplicate_none_birth_year_after_known():
    """A same-name row without birth year following one with a known year is a duplicate."""
    # mirrors source rows 129/139: row 129 has birthYear=1964, row 139 has none
    dated = {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964}
    undated = {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None}
    kept, skipped = persons_tree._deduplicate([dated, undated])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_129"]
    assert len(skipped) == 1
 def test_deduplicate_both_none_birth_year_kept():
    """Two same-name rows without birth year: only the FIRST is kept.

    Despite the "_kept" suffix in the test name, the second row is expected
    to be dropped and reported as skipped.
    """
    first = {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None}
    second = {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None}
    kept, skipped = persons_tree._deduplicate([first, second])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_A"]
    assert len(skipped) == 1
 def _make_persons(*args):
    """Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples."""
    return [
        {"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3],
         "_spouse_raw": a[4], "_bemerkung_raw": None,
         "birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None,
         "generation": None, "familyMember": True, "alias": None, "notes": None}
        for a in args
    ]
 def test_resolve_spouses_success():
    """Two rows naming each other as spouse produce exactly one SPOUSE_OF relation."""
    persons = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert len(relations) == 1
    relation = relations[0]
    assert relation["type"] == "SPOUSE_OF"
    # direction is not asserted, only the pair of endpoints
    assert {relation["personId"], relation["relatedPersonId"]} == {"row_002", "row_003"}
    assert unresolved == []
 def test_resolve_spouses_not_found():
    """A spouse reference that matches nobody is reported as unresolved ("not_found")."""
    persons = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert relations == []
    assert len(unresolved) == 1
    entry = unresolved[0]
    assert entry["rowId"] == "row_007"
    assert entry["reason"] == "not_found"
 def test_resolve_spouses_empty_spouse_field():
    """A person without spouse text yields neither relations nor unresolved entries."""
    persons = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert relations == []
    assert unresolved == []
 def _register(*args):
    """Build index from (rowId, first, last, maiden) tuples.

    Returns ``(persons, index)``: the minimal person dicts and the lookup
    index that ``persons_tree._build_index`` derives from them.
    """
    persons = []
    for entry in args:
        persons.append({
            "rowId": entry[0],
            "firstName": entry[1],
            "lastName": entry[2],
            "maidenName": entry[3],
        })
    index = persons_tree._build_index(persons)
    return persons, index
 def test_parse_bemerkung_sohn_two_parents():
    """"Sohn v A u B" yields one PARENT_OF relation per parent, pointing at the child."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram"
    rels, unres, notes = persons_tree._parse_bemerkung("row_021", remark, index)
    assert len(rels) == 2
    for rel in rels:
        assert rel["type"] == "PARENT_OF"
    # the row being parsed is the child of both relations
    children = {rel["relatedPersonId"] for rel in rels}
    parents = {rel["personId"] for rel in rels}
    assert children == {"row_021"}
    assert "row_019" in parents
    assert "row_028" in parents
    assert unres == []
    # the whole remark was consumed, nothing is left over for the notes
    assert notes == ""
 def test_parse_bemerkung_tochter_von():
    """"Tochter von X" produces a fully populated PARENT_OF relation from X to the row."""
    _, index = _register(("row_019", "Clara", "Cram", None))
    remark = "Tochter von Clara Cram"
    rels, unres, notes = persons_tree._parse_bemerkung("row_036", remark, index)
    expected = {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert len(rels) == 1
    assert rels[0] == expected
    assert notes == ""
 def test_parse_bemerkung_vater():
    """"Vater v X" makes the row being parsed the parent and X the child."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", index
    )
    assert len(rels) == 1
    relation = rels[0]
    assert relation["personId"] == "row_031"
    assert relation["relatedPersonId"] == "row_028"
    assert relation["type"] == "PARENT_OF"
 def test_parse_bemerkung_unmatched_parent_name():
    """A parent name that cannot be resolved is reported as unresolved, not kept as a note."""
    _, index = _register()   # empty index
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", index
    )
    assert rels == []
    assert len(unres) == 1
    assert unres[0]["reason"] == "not_found"
    assert notes == ""
 def test_parse_bemerkung_skip_nichte():
    """A "Nichte von" remark creates no relation and is passed through as a note."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Nichte von Herbert"
    rels, unres, notes = persons_tree._parse_bemerkung("row_002", remark, index)
    assert rels == []
    assert unres == []
    assert notes == "Nichte von Herbert"
 def test_parse_bemerkung_skip_bruder():
    """A "Bruder v" remark creates no relation and is passed through as a note."""
    _, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Bruder v Herbert"
    rels, unres, notes = persons_tree._parse_bemerkung("row_033", remark, index)
    assert rels == []
    assert unres == []
    assert notes == "Bruder v Herbert"
 def test_parse_bemerkung_empty():
    """An empty remark yields no relations, no unresolved entries and empty notes."""
    _, index = _register()
    rels, unres, notes = persons_tree._parse_bemerkung("row_004", "", index)
    assert rels == []
    assert unres == []
    assert notes == ""
 def test_parse_bemerkung_plain_remark():
    """A remark without any kinship pattern is returned unchanged as notes."""
    _, index = _register()
    remark = "Verfasserin der Cram-Chronik !!"
    rels, unres, notes = persons_tree._parse_bemerkung("row_029", remark, index)
    assert rels == []
    assert unres == []
    assert notes == "Verfasserin der Cram-Chronik !!"
 def test_parse_bemerkung_sohn_with_trailing_remark():
    """Text after the comma following the parents survives as notes."""
    _, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert"
    rels, unres, notes = persons_tree._parse_bemerkung("row_021", remark, index)
    assert len(rels) == 2
    assert unres == []
    assert notes == "nach Mexiko emigriert"
 import subprocess
 def test_dry_run_exits_zero(tmp_path):
    """dry-run should complete without writing any file and exit 0.

    Runs ``persons_tree.py`` as a subprocess against the real source workbook;
    the test is skipped when that workbook is not present.
    """
    tool_dir = Path(__file__).parent.parent
    input_path = tool_dir.parent.parent / "import" / "Personendatei 2.xlsx"
    if not input_path.exists():
        import pytest
        pytest.skip("source Excel file not present")
    output_path = tmp_path / "out.json"
    command = [
        sys.executable, str(tool_dir / "persons_tree.py"),
        "--input", str(input_path),
        "--output", str(output_path),
        "--dry-run",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    # surface the script's stderr if it failed
    assert result.returncode == 0, result.stderr
    assert not output_path.exists()
    assert "persons parsed" in result.stdout
--- a/tools/import-normalizer/tests/test_tags.py
+++ b/tools/import-normalizer/tests/test_tags.py
@@ -0,0 +1,191 @@
 import tags
 # --- classify_schlagwort ---
 def test_semantic_tag_kept_as_themen():
    """A topical keyword is filed under ``Themen/``."""
    result = tags.classify_schlagwort("Brautbriefe")
    assert result == ["Themen/Brautbriefe"]
 def test_everyday_tag_kept_as_themen():
    """A multi-word topical keyword is filed under ``Themen/`` verbatim."""
    result = tags.classify_schlagwort("Alltag in Ruhrort")
    assert result == ["Themen/Alltag in Ruhrort"]
 def test_event_tag_kept_as_themen():
    """An event keyword ("zur Hochzeit") is filed under ``Themen/``."""
    result = tags.classify_schlagwort("zur Hochzeit")
    assert result == ["Themen/zur Hochzeit"]
 def test_individual_correspondence_dropped():
    """"<person> an <person>" between individuals produces no tag."""
    result = tags.classify_schlagwort("Clara an Herbert")
    assert result == []
 def test_individual_correspondence_with_year_dropped():
    """A trailing year does not rescue an individual-to-individual keyword."""
    result = tags.classify_schlagwort("Herbert an Clara 1918")
    assert result == []
 def test_individual_with_role_dropped():
    """A role word before the sender ("Vater Juan") still counts as an individual."""
    result = tags.classify_schlagwort("Vater Juan an Herbert")
    assert result == []
 def test_relational_receiver_dropped():
    """A single relational receiver ("ihre Mutter") is treated as an individual."""
    result = tags.classify_schlagwort("Clara an ihre Mutter")
    assert result == []
 def test_group_receiver_kinder_kept_as_briefwechsel():
    """A collective receiver ("Kinder") keeps the keyword under ``Briefwechsel/``."""
    result = tags.classify_schlagwort("Clara an Kinder")
    assert result == ["Briefwechsel/Clara an Kinder"]
 def test_group_receiver_eltern_kept():
    """"seine Eltern" is a collective receiver and is kept."""
    result = tags.classify_schlagwort("Herbert an seine Eltern")
    assert result == ["Briefwechsel/Herbert an seine Eltern"]
 def test_group_receiver_geschwister_kept():
    """"Geschwister" is a collective receiver and is kept."""
    result = tags.classify_schlagwort("Walter an Geschwister")
    assert result == ["Briefwechsel/Walter an Geschwister"]
 def test_group_receiver_schwiegereltern_kept():
    """"Schwiegereltern" is a collective receiver and is kept."""
    result = tags.classify_schlagwort("Clara an Schwiegereltern")
    assert result == ["Briefwechsel/Clara an Schwiegereltern"]
 def test_group_receiver_soehne_kept():
    """"ihre Söhne" (with umlaut) is a collective receiver and is kept."""
    result = tags.classify_schlagwort("Mutter Cram an ihre Söhne")
    assert result == ["Briefwechsel/Mutter Cram an ihre Söhne"]
 def test_group_receiver_brueder_kept():
    """"Brüder" is a collective receiver and is kept."""
    result = tags.classify_schlagwort("Hans an Brüder")
    assert result == ["Briefwechsel/Hans an Brüder"]
 def test_group_receiver_cousinen_kept():
    """A collective receiver followed by a place ("Cousinen in Göttingen") is kept."""
    result = tags.classify_schlagwort("Clara an Cousinen in Göttingen")
    assert result == ["Briefwechsel/Clara an Cousinen in Göttingen"]
 def test_group_receiver_freunde_kept():
    """A keyword containing the collective "Freunde" is kept under ``Briefwechsel/``."""
    result = tags.classify_schlagwort("Freunde an Herbert")
    assert result == ["Briefwechsel/Freunde an Herbert"]
 def test_group_sender_geschwister_kept():
    """A collective on the LEFT side of "an" also keeps the keyword."""
    result = tags.classify_schlagwort("Geschwister Cram an Herbert")
    assert result == ["Briefwechsel/Geschwister Cram an Herbert"]
 def test_receiver_only_individual_dropped():
    """A keyword starting with "an " that names one individual receiver is dropped."""
    result = tags.classify_schlagwort("an Walter de Gruyter")
    assert result == []
 def test_receiver_only_group_kept():
    """A keyword starting with "an " that names a collective receiver is kept."""
    result = tags.classify_schlagwort("an ihre Geschwister")
    assert result == ["Briefwechsel/an ihre Geschwister"]
 def test_abbreviated_sender_individual_dropped():
    """An abbreviated sender glued to "an" ("W.an") is still individual correspondence."""
    result = tags.classify_schlagwort("Maria W.an Clara")
    assert result == []
 def test_abbreviated_sender_group_kept():
    """An abbreviation glued to "an" ("sen.an") with a collective receiver is kept."""
    result = tags.classify_schlagwort("Eugenie sen.an Kinder")
    assert result == ["Briefwechsel/Eugenie sen.an Kinder"]
 def test_empty_schlagwort_returns_empty():
    """An empty keyword produces no tags."""
    result = tags.classify_schlagwort("")
    assert result == []
 def test_einzelkinder_kept():
    """"Enkelkinder" as a collective sender keeps the keyword.

    Note: the test name says "einzelkinder" but the input exercised is "Enkelkinder".
    """
    result = tags.classify_schlagwort("Enkelkinder an Clara")
    assert result == ["Briefwechsel/Enkelkinder an Clara"]
 def test_geschw_abbreviation_kept():
    """The abbreviation "Geschw." (Geschwister) after "u" on the receiver side is collective."""
    result = tags.classify_schlagwort("Bruder Hans an Herbert u Geschw.")
    assert result == ["Briefwechsel/Bruder Hans an Herbert u Geschw."]
 # --- mine_summary_candidates ---
 def test_mine_candidates_counts_words():
    """Words are lower-cased and counted across all summaries."""
    pairs = tags.mine_summary_candidates(["Reise, Hochzeit", "Reise", "Krieg"])
    counts = dict(pairs)
    for word, expected in (("reise", 2), ("hochzeit", 1), ("krieg", 1)):
        assert counts[word] == expected
 def test_mine_candidates_filters_stop_words():
    """Common German function words never become candidates."""
    pairs = tags.mine_summary_candidates(["und die Reise", "das ist eine Reise"])
    candidates = dict(pairs)
    assert "reise" in candidates
    for function_word in ("und", "die", "das", "ist", "eine"):
        assert function_word not in candidates
 def test_mine_candidates_filters_contracted_prepositions():
    """Contracted prepositions and similar short function words are filtered as stop words."""
    # im=in+dem, zum=zu+dem, zur=zu+der, vom=von+dem, plus sich, am, beim
    text = "im Sommer zum Besuch, zur Hochzeit vom Vater, sich gefreut am Morgen beim Fest"
    candidates = dict(tags.mine_summary_candidates([text]))
    stop_words = ("im", "zum", "zur", "vom", "sich", "am", "beim", "ans")
    for stop in stop_words:
        assert stop not in candidates, f"stop word '{stop}' leaked through"
    # content words from the same sentence must survive
    assert "besuch" in candidates
    assert "hochzeit" in candidates
 def test_mine_candidates_filters_single_chars():
    """Single-character tokens are not candidates."""
    candidates = dict(tags.mine_summary_candidates(["x Reise y"]))
    for letter in ("x", "y"):
        assert letter not in candidates
 def test_mine_candidates_sorted_descending():
    """Candidates come back ordered by descending count."""
    summaries = ["Reise", "Reise", "Hochzeit", "Reise", "Hochzeit", "Krieg"]
    pairs = tags.mine_summary_candidates(summaries)
    counts = [pair_count for _, pair_count in pairs]
    expected_order = sorted(counts, reverse=True)
    assert counts == expected_order
 def test_mine_candidates_empty_summaries():
    """No summaries, or only an empty summary, give an empty candidate list."""
    for summaries in ([], [""]):
        assert tags.mine_summary_candidates(summaries) == []
 # --- load_approved_themes and apply_approved_themes ---
 def test_apply_themes_match_found(tmp_path):
    """An approved theme occurring in the text is emitted as ``Themen/<theme>``."""
    approved = {"reise", "hochzeit"}
    matched = tags.apply_approved_themes("Reise nach Berlin", approved)
    assert "Themen/reise" in matched
 def test_apply_themes_case_insensitive(tmp_path):
    """Theme matching ignores the case of the text."""
    approved = {"reise"}
    matched = tags.apply_approved_themes("REISE", approved)
    assert "Themen/reise" in matched
 def test_apply_themes_no_match(tmp_path):
    """Text containing none of the approved themes yields an empty list."""
    approved = {"krieg"}
    matched = tags.apply_approved_themes("Alltag in Ruhrort", approved)
    assert matched == []
 def test_apply_themes_multiple_matches():
    """Every approved theme found in the text is emitted exactly once."""
    approved = {"reise", "hochzeit"}
    matched = tags.apply_approved_themes("Reise zur Hochzeit", approved)
    assert len(matched) == 2
    for expected in ("Themen/reise", "Themen/hochzeit"):
        assert expected in matched
 # --- encode_tags ---
 def test_encode_tags_single():
    """A single tag path is encoded as itself."""
    encoded = tags.encode_tags(["Themen/Brautbriefe"])
    assert encoded == "Themen/Brautbriefe"
 def test_encode_tags_multiple():
    """Several tag paths are joined with ``|`` in input order."""
    tag_paths = ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
    encoded = tags.encode_tags(tag_paths)
    assert encoded == "Themen/Brautbriefe|Briefwechsel/Clara an Kinder"
 def test_encode_tags_empty():
    """No tags encode to the empty string."""
    encoded = tags.encode_tags([])
    assert encoded == ""
 # --- build_tag_tree ---
 def test_build_tag_tree_includes_roots():
    """The tree contains a row for each root category of the given paths."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"])
    tag_paths = [row["tag_path"] for row in tree]
    for root in ("Themen", "Briefwechsel"):
        assert root in tag_paths
 def test_build_tag_tree_includes_children():
    """A child row records its parent's name and its own leaf name."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe"])
    matches = (row for row in tree if row["tag_path"] == "Themen/Brautbriefe")
    child = next(matches)
    assert child["parent_name"] == "Themen"
    assert child["tag_name"] == "Brautbriefe"
 def test_build_tag_tree_root_has_empty_parent():
    """A root row has an empty ``parent_name`` and its own name as ``tag_name``."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe"])
    matches = (row for row in tree if row["tag_path"] == "Themen")
    root = next(matches)
    assert root["parent_name"] == ""
    assert root["tag_name"] == "Themen"
 def test_build_tag_tree_no_duplicates():
    """Repeated input paths (and shared roots) appear only once in the tree."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe", "Themen/Alltag", "Themen/Brautbriefe"])
    tag_paths = [row["tag_path"] for row in tree]
    unique_paths = set(tag_paths)
    assert len(tag_paths) == len(unique_paths)
--- a/tools/import-normalizer/tests/test_writers.py
+++ b/tools/import-normalizer/tests/test_writers.py
@@ -0,0 +1,60 @@
 import csv
 import openpyxl
 import overrides
 import writers
 import documents
 def test_load_overrides_missing_files(tmp_path):
    """Missing override files load as two empty mappings."""
    dates_csv = tmp_path / "dates.csv"
    names_csv = tmp_path / "names.csv"
    date_overrides, name_overrides = overrides.load_overrides(dates_csv, names_csv)
    assert date_overrides == {}
    assert name_overrides == {}
 def test_load_overrides_parsed(tmp_path):
    """Override CSVs are parsed into raw -> (iso, precision) and raw -> person_id maps."""
    dates_csv = tmp_path / "dates.csv"
    names_csv = tmp_path / "names.csv"
    dates_csv.write_text("raw,iso,precision\n13.5.65,1965-05-13,DAY\n", encoding="utf-8")
    names_csv.write_text("raw,person_id\nEugenie Müller,de-gruyter-eugenie\n", encoding="utf-8")
    date_overrides, name_overrides = overrides.load_overrides(dates_csv, names_csv)
    assert date_overrides["13.5.65"] == ("1965-05-13", "DAY")
    assert name_overrides["Eugenie Müller"] == "de-gruyter-eugenie"
 def test_write_documents_xlsx_joins_lists(tmp_path):
    """List-valued document fields are written as pipe-joined strings."""
    target = tmp_path / "docs.xlsx"
    record = documents.CanonicalDocument(
        index="W-0001", receiver_person_ids=["a", "b"], receiver_names=["A", "B"],
        tags=["Brautbriefe"], date_precision="DAY", needs_review=["unparsed_date"])
    writers.write_documents_xlsx([record], target)
    sheet = openpyxl.load_workbook(target).active
    # row 1 is the header, row 2 the single document
    header = [cell.value for cell in sheet[1]]
    assert "receiver_person_ids" in header
    assert "needs_review" in header
    values = [cell.value for cell in sheet[2]]
    row = dict(zip(header, values))
    assert row["receiver_person_ids"] == "a|b"
    assert row["needs_review"] == "unparsed_date"
 def test_write_documents_xlsx_pins_timestamp(tmp_path):
    """The saved workbook's ``created`` property is the pinned 2020-01-01 date.

    Determinism (NFR-IDEM-01): the stamp must not be the current time.
    Only ``created`` is asserted here.
    """
    target = tmp_path / "d.xlsx"
    writers.write_documents_xlsx([documents.CanonicalDocument(index="W-0001")], target)
    workbook = openpyxl.load_workbook(target)
    created_ymd = (
        workbook.properties.created.year,
        workbook.properties.created.month,
        workbook.properties.created.day,
    )
    assert created_ymd == (2020, 1, 1)
 def test_write_review_csv(tmp_path):
    """write_review_csv emits the header row followed by the data rows as text."""
    out = tmp_path / "r.csv"
    writers.write_review_csv(out, ["raw", "count"], [["?", 3], ["x", 1]])
    # Read through a context manager so the handle is closed deterministically
    # (the bare ``out.open()`` leaked it), and open with newline="" as the csv
    # module requires for correct line-ending handling.
    with out.open(encoding="utf-8", newline="") as handle:
        rows = list(csv.reader(handle))
    assert rows[0] == ["raw", "count"]
    # numbers come back as strings after the CSV round-trip
    assert rows[1] == ["?", "3"]
 def test_write_review_csv_defangs_formula_injection(tmp_path):
    """Cells starting with a formula trigger character are prefixed with an apostrophe."""
    out = tmp_path / "r.csv"
    writers.write_review_csv(out, ["raw", "count"], [["=cmd|'/C calc'!A0", 1], ["-2+3", 2]])
    # Read through a context manager so the handle is closed deterministically
    # (the bare ``out.open()`` leaked it), and open with newline="" as the csv
    # module requires for correct line-ending handling.
    with out.open(encoding="utf-8", newline="") as handle:
        rows = list(csv.reader(handle))
    assert rows[1][0].startswith("'=")   # leading '=' neutralised
    assert rows[2][0].startswith("'-")   # leading '-' neutralised
 def test_write_summary_sections(tmp_path):
    """"# NAME" keys become "NAME:" section headings; other keys become indented entries."""
    target = tmp_path / "s.txt"
    summary = {"# INPUTS": "", "rows": 10, "# DATES": "", "unknown_date_rate": "3.2%"}
    writers.write_summary(target, summary)
    text = target.read_text(encoding="utf-8")
    for fragment in ("INPUTS:", "DATES:", "  rows: 10"):
        assert fragment in text
--- a/tools/import-normalizer/writers.py
+++ b/tools/import-normalizer/writers.py
@@ -0,0 +1,86 @@
 """Write canonical .xlsx outputs and review .csv files."""
 import csv
 import datetime
 from pathlib import Path
 import openpyxl
 # Separator _join() places between the items of a list value when flattening it into one cell.
 _PIPE = "|"
 # Pinned workbook metadata so reruns are content-deterministic (NFR-IDEM-01); openpyxl
 # otherwise stamps docProps with the current time on every save.
 _FIXED_TS = datetime.datetime(2020, 1, 1, 0, 0, 0)
 def _join(value):
    if isinstance(value, list):
        return _PIPE.join(str(v) for v in value)
    return "" if value is None else str(value)
 def _csv_safe(value):
    """Neutralise spreadsheet formula injection (CWE-1236) in human-opened review CSVs."""
    s = "" if value is None else str(value)
    return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s
 # Header row, in order, of the documents workbook. _write_xlsx() also reads each name
 # as an attribute of every record via getattr(), so the names must match the record's fields.
 DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name",
               "receiver_person_ids", "receiver_names", "date_iso", "date_raw",
               "date_precision", "location", "tags", "summary", "source_row", "needs_review"]
 # Header row, in order, of the persons workbook; used the same way as DOC_COLUMNS.
 PERSON_COLUMNS = ["person_id", "last_name", "first_name", "maiden_name", "title", "nickname",
                  "birth_date", "birth_date_raw", "birth_place", "death_date", "death_date_raw",
                  "death_place", "spouse", "generation", "notes", "aliases", "provisional"]
 def _write_xlsx(records, columns, path: Path):
    """Write `records` to a new workbook at `path`, one row per record.

    The first row is `columns`. For each record, every column name is read as an
    attribute with getattr() and flattened with _join(). The workbook's
    created/modified properties are set to _FIXED_TS before saving, and the
    parent directory of `path` is created if missing.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(columns)
    for record in records:
        cells = []
        for name in columns:
            cells.append(_join(getattr(record, name)))
        sheet.append(cells)
    props = workbook.properties
    props.created = _FIXED_TS
    props.modified = _FIXED_TS
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    workbook.save(path)
 def write_documents_xlsx(docs, path: Path):
    """Write `docs` to `path` as a workbook whose columns are DOC_COLUMNS."""
    _write_xlsx(records=docs, columns=DOC_COLUMNS, path=path)
 def write_tag_tree_xlsx(tree: list[dict], path: Path):
    """Write the tag hierarchy to a new workbook at `path`.

    The header row is tag_path / parent_name / tag_name. Each dict in `tree`
    becomes one row; a key missing from a dict is written as the empty string.
    created/modified are set to _FIXED_TS before saving, and the parent
    directory of `path` is created if missing.
    """
    columns = ["tag_path", "parent_name", "tag_name"]
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(columns)
    for entry in tree:
        cells = []
        for name in columns:
            cells.append(entry.get(name, ""))
        sheet.append(cells)
    props = workbook.properties
    props.created = _FIXED_TS
    props.modified = _FIXED_TS
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    workbook.save(path)
 def write_persons_xlsx(people, path: Path):
    """Write `people` to `path` as a workbook whose columns are PERSON_COLUMNS."""
    _write_xlsx(records=people, columns=PERSON_COLUMNS, path=path)
 def write_review_csv(path: Path, header: list[str], rows: list[list]):
    """Write a review CSV (UTF-8) to `path`, creating its parent directory if missing.

    `header` is written as-is; every cell of every row in `rows` is passed
    through _csv_safe() before being written.
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows([_csv_safe(cell) for cell in row] for row in rows)
 def write_summary(path: Path, stats: dict):
    """Render a grouped, scannable summary. Keys beginning with '#' are section headers.

    Args:
        path: Destination text file (UTF-8); its parent directory is created if missing.
        stats: Ordered mapping. A key starting with '#' emits a "NAME:" header line
            (its value is ignored), preceded by a blank line unless it is the first
            line of the file. Any other key emits an indented "  key: value" line.

    The file always ends with exactly one trailing newline.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    lines = []
    for k, v in stats.items():
        if k.startswith("#"):
            # Separate sections with a blank line, but never start the file with one.
            if lines:
                lines.append("")
            lines.append(k[1:].strip() + ":")
        else:
            lines.append(f"  {k}: {v}")
    # rstrip only: a full strip() would also eat the two-space indent of the first
    # entry whenever the stats do not begin with a section header.
    target.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
Author	SHA1	Message	Date
Marcel	2e59c0ef5b	chore(normalizer): unignore canonical-persons-tree.json from out/ exclusion All checks were successful CI / Unit & Component Tests (pull_request) Successful in 3m33s Details CI / OCR Service Tests (pull_request) Successful in 22s Details CI / Backend Unit Tests (pull_request) Successful in 3m42s Details CI / fail2ban Regex (pull_request) Successful in 47s Details CI / Semgrep Security Scan (pull_request) Successful in 21s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m3s Details	2026-05-25 21:19:02 +02:00
Marcel	309436b9a4	feat(normalizer): generate canonical-persons-tree.json from Personendatei 2.xlsx 157 persons, 43 relationships (29 SPOUSE_OF + 14 PARENT_OF), 89 unresolved references. 6 duplicate rows skipped (Seils family block + Christa Schütz). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 21:18:24 +02:00
Marcel	e326630318	feat(normalizer): add main() CLI to persons_tree Wires the two-pass pipeline (parse → deduplicate → index → resolve) into a runnable CLI with --input, --output, and --dry-run flags. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 21:16:21 +02:00
Marcel	34c40cb0ee	fix(normalizer): preserve trailing Bemerkung text after parent pattern Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 21:12:45 +02:00
Marcel	ace41ad209	fix(normalizer): remove unauthorized first-name index key from _build_index Remove the 5th unauthorized index key (_norm_tree(first)) from _build_index. The spec requires exactly 4 keys per person: 1. forward (first last) 2. reversed (last first) 3. maiden name (first maiden) if maiden set 4. lastName only (last) Update test data to use full names in Bemerkung fields (e.g., 'Clara Cram' instead of 'Clara') since single first names alone are no longer resolvable. All 52 tests pass.	2026-05-25 21:08:49 +02:00
Marcel	6f55489ec2	feat(normalizer): add PARENT_OF Bemerkung extraction to persons_tree	2026-05-25 21:06:24 +02:00
Marcel	fa4b6b5fc2	feat(normalizer): add SPOUSE_OF resolution to persons_tree	2026-05-25 21:03:46 +02:00
Marcel	1f2351e3c0	feat(normalizer): add _deduplicate() to persons_tree	2026-05-25 21:02:02 +02:00
Marcel	7012234e6a	feat(normalizer): add row parser to persons_tree	2026-05-25 20:59:49 +02:00
Marcel	306f3b6fe6	feat(normalizer): add name normalization + lookup index to persons_tree	2026-05-25 20:56:47 +02:00
Marcel	47a0770758	feat(normalizer): add generation parser to persons_tree	2026-05-25 20:54:38 +02:00
Marcel	889d301f16	fix(normalizer): correct _MIN_YEAR comment in test (1700 not 1500)	2026-05-25 20:53:16 +02:00
Marcel	443c7a48db	fix(normalizer): don't convert plausible typo years as Excel serials	2026-05-25 20:46:42 +02:00
Marcel	9ae1196d1c	feat(normalizer): add persons_tree skeleton + year extraction	2026-05-25 20:41:25 +02:00
Marcel	b37fd1728b	docs(importer): add Personendatei importer implementation plan 9-task TDD plan for persons_tree.py — year extraction, name index, deduplication, SPOUSE_OF/PARENT_OF extraction, CLI + JSON output. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 20:38:14 +02:00
Marcel	6103d5d229	docs(importer): resolve open questions in Personendatei importer spec OQ-01: tool deduplicates rows with identical (firstName, lastName, birthYear) OQ-02: birthPlace/deathPlace kept as separate JSON fields OQ-03: multi-name firstName stored verbatim Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 20:28:45 +02:00
Marcel	7b483d357a	docs(importer): add Personendatei importer design spec Two-pass Python tool (persons_tree.py) that normalizes import/Personendatei 2.xlsx into canonical-persons-tree.json with persons, SPOUSE_OF/PARENT_OF relationships, and an unresolved[] list for manual review. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 20:26:30 +02:00
Marcel	94a40237f4	feat(normalizer): generate structured tags from Schlagwort + Inhalt fields Adds tags.py module implementing a three-outcome heuristic: - Individual-to-individual correspondence tags ("Clara an Herbert") → dropped - Group/collective correspondence ("Clara an Kinder", "Walter an Geschwister") → Briefwechsel/<value> - Semantic/event tags ("Brautbriefe", "Alltag", "zur Hochzeit") → Themen/<value> Three correspondence patterns detected: space-an-space, starts-with-"an ", and abbreviated-sender form ("Maria W.an Clara"). COLLECTIVE_TERMS in config.py extended with 17 plural/group relational terms (söhne, brüder, schwiegereltern, cousinen, etc.) confirmed against the full Excel. Also adds two-phase summary mining: every run emits review/tag-candidates.csv; subsequent runs apply keywords from overrides/approved-themes.csv as Themen tags. Outputs: canonical-documents.xlsx gets pipe-separated "Parent/Child" tag paths; canonical-tag-tree.xlsx provides the full tag hierarchy for backend pre-import. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-25 19:47:36 +02:00
Marcel	5efe3b8a7c	feat(normalizer): parse Spanish month names + Month DD-YYYY hyphen form All checks were successful CI / Unit & Component Tests (pull_request) Successful in 3m31s Details CI / OCR Service Tests (pull_request) Successful in 22s Details CI / Backend Unit Tests (pull_request) Successful in 3m42s Details CI / fail2ban Regex (pull_request) Successful in 45s Details CI / Semgrep Security Scan (pull_request) Successful in 20s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s Details Add Spanish month names (Mexican-branch letters) to config.MONTHS and let the month-first matcher accept a hyphen (not just a dot) before the year, so "Mayo 18-1929"/"Junio 7-904" parse without manual overrides. Also bound 4-digit years to 1700-2100 so gross typos ("23-9003") stay in review instead of producing a bogus year. Cuts unknown-date rate 9.2% -> 7.9%. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 17:00:33 +02:00
Marcel	0f1f9055c3	docs(normalizer): add overrides/ README with structure + examples All checks were successful CI / Unit & Component Tests (pull_request) Successful in 3m27s Details CI / OCR Service Tests (pull_request) Successful in 21s Details CI / Backend Unit Tests (pull_request) Successful in 3m40s Details CI / fail2ban Regex (pull_request) Successful in 45s Details CI / Semgrep Security Scan (pull_request) Successful in 21s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m3s Details Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 16:53:03 +02:00
Marcel	8cac63e938	feat(normalizer): drop unmatched-names.csv; unresolved-names is the names report All checks were successful CI / Unit & Component Tests (pull_request) Successful in 3m32s Details CI / OCR Service Tests (pull_request) Successful in 19s Details CI / Backend Unit Tests (pull_request) Successful in 3m26s Details CI / fail2ban Regex (pull_request) Successful in 47s Details CI / Semgrep Security Scan (pull_request) Successful in 21s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m0s Details The unmatched list was just non-family correspondents (expected noise); their count stays in summary.txt and they remain in canonical-persons.xlsx. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 16:46:08 +02:00
Marcel	97db718f81	docs(import): add unresolved-names plan + worklog entry All checks were successful CI / OCR Service Tests (pull_request) Successful in 22s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m1s Details CI / Backend Unit Tests (pull_request) Successful in 3m52s Details CI / fail2ban Regex (pull_request) Successful in 42s Details CI / Unit & Component Tests (pull_request) Successful in 4m13s Details CI / Semgrep Security Scan (pull_request) Successful in 20s Details Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 16:01:18 +02:00
Marcel	06127724de	docs(normalizer): document unresolved-names.csv review report Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:59:45 +02:00
Marcel	7c017eca2a	test(normalizer): assert unresolved stat key + drop duplicate assertion Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:58:34 +02:00
Marcel	97ab9e38df	feat(normalizer): unresolved-names report + fix ambiguous-pair over-flagging Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:54:37 +02:00
Marcel	f10b80a03f	feat(normalizer): build_given_names from register + supplement Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:51:23 +02:00
Marcel	6478cc58ae	feat(normalizer): classify_name + NameClass Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:47:40 +02:00
Marcel	a7c45b3a0e	feat(normalizer): config tables for name classification Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:43:31 +02:00
Marcel	5ff0c25e10	chore: drop stray reader-dashboard test from this branch All checks were successful CI / Semgrep Security Scan (pull_request) Successful in 23s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m2s Details CI / Unit & Component Tests (pull_request) Successful in 3m31s Details CI / OCR Service Tests (pull_request) Successful in 20s Details CI / Backend Unit Tests (pull_request) Successful in 3m53s Details CI / fail2ban Regex (pull_request) Successful in 41s Details page.server.spec.ts picked up an unrelated reader-dashboard test case via a cross-session staging race; restore it to match main so this PR only touches the import-normalizer tool + docs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 15:07:14 +02:00
Marcel	7ba3a29592	docs(import): record normalizer completion + dry-run results in worklog Some checks failed CI / Unit & Component Tests (pull_request) Failing after 1m17s Details CI / OCR Service Tests (pull_request) Successful in 19s Details CI / Backend Unit Tests (pull_request) Successful in 3m46s Details CI / fail2ban Regex (pull_request) Successful in 41s Details CI / Semgrep Security Scan (pull_request) Successful in 20s Details CI / Compose Bucket Idempotency (pull_request) Successful in 1m1s Details Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:56:20 +02:00
Marcel	d314fd9338	docs(normalizer): README + seed overrides Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:51:20 +02:00
Marcel	18d5a1e2da	feat(normalizer): orchestrator + end-to-end integration test Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:46:13 +02:00
Marcel	df00ea4238	fix(normalizer): defang leading LF in CSV + assert pinned workbook timestamp Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:43:45 +02:00
Marcel	ff1a7c07f1	feat(normalizer): overrides loader + xlsx/csv writers Recovered from an entangled commit: these files were correct but had been bundled into an unrelated reader-dashboard commit by a concurrent session. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:39:28 +02:00
Marcel	366b484815	test(normalizer): real provisional-vs-register collision + override-hits coverage Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:25:49 +02:00
Marcel	88c8063227	feat(normalizer): person resolution context + to_canonical Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:18:09 +02:00
Marcel	3066d3d3ff	refactor(normalizer): harden triage index guard + index_file_mismatch tests Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:15:50 +02:00
Marcel	3e7ddea90a	feat(normalizer): row extraction, triage, canonical record Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:12:48 +02:00
Marcel	75b3ca8b9e	fix(normalizer): don't coerce boolean cells to 1/0 Add bool guard before the int branch in _cell_to_str so True/False cells are preserved as "True"/"False" instead of "1"/"0". Add two regression tests covering the fix and missing-sheet error. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:11:19 +02:00
Marcel	74c4c390fc	feat(normalizer): xlsx ingest + header mapping Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:08:30 +02:00
Marcel	29087319e6	test(normalizer): cover AliasIndex unambiguous first-name resolution Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:07:20 +02:00
Marcel	53457d9319	feat(normalizer): alias index with maiden/married/nickname resolution Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:04:11 +02:00
Marcel	2d97595e9c	fix(normalizer): split_receivers returns [] for a geb.-only cell Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 14:02:35 +02:00
Marcel	a177077b40	feat(normalizer): receiver splitting Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:59:51 +02:00
Marcel	b7a2332861	fix(normalizer): suffix all members of a colliding person-id group Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:58:35 +02:00
Marcel	1da1a8d223	feat(normalizer): person register parsing Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:54:37 +02:00
Marcel	59715bdccd	fix(normalizer): require day-dot in English month-first matcher (structural anti-shadow) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:53:05 +02:00
Marcel	53a661adb6	feat(normalizer): month/year, feast/season, range matchers + overrides Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:47:26 +02:00
Marcel	4942c0ea07	feat(normalizer): day-first month-name matcher Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:42:36 +02:00
Marcel	7edc002ebb	feat(normalizer): roman-numeral month matcher Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:38:32 +02:00
Marcel	b43dd6cdd4	fix(normalizer): keep Task 5 scoped — drop year-only matcher (belongs to Task 8) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:36:48 +02:00
Marcel	cff486dda7	fix(normalizer): treat leading date qualifiers (nach/vor/…) as APPROX _preprocess now sets approx=True when a leading marker is stripped; add _match_year_only so bare years (e.g. "nach 1900" -> "1900") resolve to 1900-01-01/YEAR before being upgraded to APPROX. Strengthen test_parse_approx_marker_upgrades_precision and add test_parse_leading_qualifier_is_approx (11 tests, all pass). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:35:19 +02:00
Marcel	df14e6b1ee	feat(normalizer): parse_date dispatch + iso/numeric matchers Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:30:07 +02:00
Marcel	1908dde859	feat(normalizer): year expansion century rule Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:27:26 +02:00
Marcel	4845e7a3c1	feat(normalizer): feast + season resolution Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:24:26 +02:00
Marcel	c6cceec6e9	feat(normalizer): Easter computus Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:21:39 +02:00
Marcel	8f6f4f2d62	feat(normalizer): scaffold tool + config tables Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 13:18:52 +02:00
Marcel	6f7aa643c9	docs(import): add normalizer implementation plan + apply persona review 17-task TDD plan for tools/import-normalizer/. Incorporates inline 6-persona review: content-deterministic idempotency, duplicate-index fix, provisional-id collision guard, date-parser edge cases, multi-sender split, CSV-injection defang, pinned deps. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 12:55:50 +02:00
Marcel	adfff420a5	docs(import): add import-migration analysis + normalizer spec Document the raw archive spreadsheet findings (IMP-01..12) and a requirements spec for an offline normalizer that produces a clean canonical dataset before import. Local docs only; no Gitea issue yet. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-25 12:32:37 +02:00