Compare commits
59 Commits
worktree-f
...
docs/impor
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2e59c0ef5b | ||
|
|
309436b9a4 | ||
|
|
e326630318 | ||
|
|
34c40cb0ee | ||
|
|
ace41ad209 | ||
|
|
6f55489ec2 | ||
|
|
fa4b6b5fc2 | ||
|
|
1f2351e3c0 | ||
|
|
7012234e6a | ||
|
|
306f3b6fe6 | ||
|
|
47a0770758 | ||
|
|
889d301f16 | ||
|
|
443c7a48db | ||
|
|
9ae1196d1c | ||
|
|
b37fd1728b | ||
|
|
6103d5d229 | ||
|
|
7b483d357a | ||
|
|
94a40237f4 | ||
|
|
5efe3b8a7c | ||
|
|
0f1f9055c3 | ||
|
|
8cac63e938 | ||
|
|
97db718f81 | ||
|
|
06127724de | ||
|
|
7c017eca2a | ||
|
|
97ab9e38df | ||
|
|
f10b80a03f | ||
|
|
6478cc58ae | ||
|
|
a7c45b3a0e | ||
|
|
5ff0c25e10 | ||
|
|
7ba3a29592 | ||
|
|
d314fd9338 | ||
|
|
18d5a1e2da | ||
|
|
df00ea4238 | ||
|
|
ff1a7c07f1 | ||
|
|
366b484815 | ||
|
|
88c8063227 | ||
|
|
3066d3d3ff | ||
|
|
3e7ddea90a | ||
|
|
75b3ca8b9e | ||
|
|
74c4c390fc | ||
|
|
29087319e6 | ||
|
|
53457d9319 | ||
|
|
2d97595e9c | ||
|
|
a177077b40 | ||
|
|
b7a2332861 | ||
|
|
1da1a8d223 | ||
|
|
59715bdccd | ||
|
|
53a661adb6 | ||
|
|
4942c0ea07 | ||
|
|
7edc002ebb | ||
|
|
b43dd6cdd4 | ||
|
|
cff486dda7 | ||
|
|
df14e6b1ee | ||
|
|
1908dde859 | ||
|
|
4845e7a3c1 | ||
|
|
c6cceec6e9 | ||
|
|
8f6f4f2d62 | ||
|
|
6f7aa643c9 | ||
|
|
adfff420a5 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -26,3 +26,7 @@ node_modules/
|
|||||||
|
|
||||||
# Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift.
|
# Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift.
|
||||||
frontend/yarn.lock
|
frontend/yarn.lock
|
||||||
|
|
||||||
|
**/.venv/
|
||||||
|
**/__pycache__/
|
||||||
|
*.pyc
|
||||||
|
|||||||
@@ -197,7 +197,6 @@ frontend/src/routes/
|
|||||||
├── aktivitaeten/ Unified activity feed (Chronik)
|
├── aktivitaeten/ Unified activity feed (Chronik)
|
||||||
├── geschichten/ Stories — list, [id], [id]/edit, new
|
├── geschichten/ Stories — list, [id], [id]/edit, new
|
||||||
├── stammbaum/ Family tree (Stammbaum)
|
├── stammbaum/ Family tree (Stammbaum)
|
||||||
├── themen/ Topics directory — browsable tag index
|
|
||||||
├── enrich/ Enrichment workflow — [id], done
|
├── enrich/ Enrichment workflow — [id], done
|
||||||
├── admin/ User, group, tag, OCR, system management
|
├── admin/ User, group, tag, OCR, system management
|
||||||
├── hilfe/transkription/ Transcription help page
|
├── hilfe/transkription/ Transcription help page
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ import org.raddatz.familienarchiv.person.Person;
|
|||||||
import org.raddatz.familienarchiv.tag.Tag;
|
import org.raddatz.familienarchiv.tag.Tag;
|
||||||
|
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
import java.time.LocalDateTime;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
@@ -33,9 +32,5 @@ public record DocumentListItem(
|
|||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
List<ActivityActorDTO> contributors,
|
List<ActivityActorDTO> contributors,
|
||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
SearchMatchData matchData,
|
SearchMatchData matchData
|
||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
|
||||||
LocalDateTime createdAt,
|
|
||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
|
||||||
LocalDateTime updatedAt
|
|
||||||
) {}
|
) {}
|
||||||
|
|||||||
@@ -767,9 +767,7 @@ public class DocumentService {
|
|||||||
doc.getSummary(),
|
doc.getSummary(),
|
||||||
completionPct,
|
completionPct,
|
||||||
contributors,
|
contributors,
|
||||||
match,
|
match
|
||||||
doc.getCreatedAt(),
|
|
||||||
doc.getUpdatedAt()
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -135,8 +135,7 @@ class DocumentControllerTest {
|
|||||||
.thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem(
|
.thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem(
|
||||||
docId, "Brief an Anna", "brief.pdf", null, null, null,
|
docId, "Brief an Anna", "brief.pdf", null, null, null,
|
||||||
List.of(), List.of(), null, null, null, null,
|
List.of(), List.of(), null, null, null, null,
|
||||||
0, List.of(), matchData,
|
0, List.of(), matchData))));
|
||||||
LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0)))));
|
|
||||||
|
|
||||||
mockMvc.perform(get("/api/documents/search").param("q", "Brief"))
|
mockMvc.perform(get("/api/documents/search").param("q", "Brief"))
|
||||||
.andExpect(status().isOk())
|
.andExpect(status().isOk())
|
||||||
@@ -154,8 +153,7 @@ class DocumentControllerTest {
|
|||||||
.thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem(
|
.thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem(
|
||||||
docId, "Brief an Anna", "brief.pdf", null, null, null,
|
docId, "Brief an Anna", "brief.pdf", null, null, null,
|
||||||
List.of(), List.of(), null, null, null, null,
|
List.of(), List.of(), null, null, null, null,
|
||||||
0, List.of(), matchData,
|
0, List.of(), matchData))));
|
||||||
LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0)))));
|
|
||||||
|
|
||||||
mockMvc.perform(get("/api/documents/search"))
|
mockMvc.perform(get("/api/documents/search"))
|
||||||
.andExpect(status().isOk())
|
.andExpect(status().isOk())
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ import org.junit.jupiter.api.Test;
|
|||||||
import org.raddatz.familienarchiv.audit.ActivityActorDTO;
|
import org.raddatz.familienarchiv.audit.ActivityActorDTO;
|
||||||
import org.springframework.data.domain.PageRequest;
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
|
||||||
import java.time.LocalDateTime;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
@@ -17,8 +16,7 @@ class DocumentSearchResultTest {
|
|||||||
return new DocumentListItem(
|
return new DocumentListItem(
|
||||||
docId, "Test", "test.pdf", null, null, null,
|
docId, "Test", "test.pdf", null, null, null,
|
||||||
List.of(), List.of(), null, null, null, null,
|
List.of(), List.of(), null, null, null, null,
|
||||||
0, List.of(), SearchMatchData.empty(),
|
0, List.of(), SearchMatchData.empty());
|
||||||
LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -68,8 +66,7 @@ class DocumentSearchResultTest {
|
|||||||
DocumentListItem item = new DocumentListItem(
|
DocumentListItem item = new DocumentListItem(
|
||||||
id, "T", "t.pdf", null, null, null,
|
id, "T", "t.pdf", null, null, null,
|
||||||
List.of(), List.of(), null, null, null, null,
|
List.of(), List.of(), null, null, null, null,
|
||||||
75, List.of(actor), SearchMatchData.empty(),
|
75, List.of(actor), SearchMatchData.empty());
|
||||||
LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0));
|
|
||||||
|
|
||||||
DocumentSearchResult result = DocumentSearchResult.of(List.of(item));
|
DocumentSearchResult result = DocumentSearchResult.of(List.of(item));
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ System_Boundary(frontend, "Web Frontend (SvelteKit / SSR)") {
|
|||||||
Component(geschichten, "/geschichten and /geschichten/[id]", "SvelteKit Routes", "Story list and detail pages. Loader: GET /api/geschichten?status=PUBLISHED.")
|
Component(geschichten, "/geschichten and /geschichten/[id]", "SvelteKit Routes", "Story list and detail pages. Loader: GET /api/geschichten?status=PUBLISHED.")
|
||||||
Component(geschichtenEdit, "/geschichten/[id]/edit and /geschichten/new", "SvelteKit Routes", "Story editor with rich text, person and document linking. Actions: PUT/POST /api/geschichten. Requires BLOG_WRITE permission.")
|
Component(geschichtenEdit, "/geschichten/[id]/edit and /geschichten/new", "SvelteKit Routes", "Story editor with rich text, person and document linking. Actions: PUT/POST /api/geschichten. Requires BLOG_WRITE permission.")
|
||||||
Component(stammbaum, "/stammbaum", "SvelteKit Route", "Family tree visualisation. Loader: GET /api/network (nodes + edges). Renders interactive family tree from network graph data.")
|
Component(stammbaum, "/stammbaum", "SvelteKit Route", "Family tree visualisation. Loader: GET /api/network (nodes + edges). Renders interactive family tree from network graph data.")
|
||||||
Component(themen, "/themen", "SvelteKit Route", "Browsable topic index. Shows all root tags as cards with color bars and child rows. ThemenWidget also embedded in the home dashboard (reader + editor sidebar). Loader: GET /api/tags/tree.")
|
|
||||||
Component(profilePage, "/profile", "SvelteKit Route", "Current user profile settings. Loader: GET /api/users/me/notification-preferences. Actions: update name/password and notification preferences.")
|
Component(profilePage, "/profile", "SvelteKit Route", "Current user profile settings. Loader: GET /api/users/me/notification-preferences. Actions: update name/password and notification preferences.")
|
||||||
Component(userProfile, "/users/[id]", "SvelteKit Route", "Public user profile view. Loader: GET /api/users/{id}.")
|
Component(userProfile, "/users/[id]", "SvelteKit Route", "Public user profile view. Loader: GET /api/users/{id}.")
|
||||||
}
|
}
|
||||||
@@ -27,7 +26,6 @@ Rel(aktivitaeten, backend, "GET /api/dashboard/activity, GET /api/notifications"
|
|||||||
Rel(geschichten, backend, "GET /api/geschichten", "HTTP / JSON")
|
Rel(geschichten, backend, "GET /api/geschichten", "HTTP / JSON")
|
||||||
Rel(geschichtenEdit, backend, "GET/PUT/POST /api/geschichten", "HTTP / JSON")
|
Rel(geschichtenEdit, backend, "GET/PUT/POST /api/geschichten", "HTTP / JSON")
|
||||||
Rel(stammbaum, backend, "GET /api/network", "HTTP / JSON")
|
Rel(stammbaum, backend, "GET /api/network", "HTTP / JSON")
|
||||||
Rel(themen, backend, "GET /api/tags/tree", "HTTP / JSON")
|
|
||||||
Rel(profilePage, backend, "GET/PUT /api/users/me, notification-preferences", "HTTP / JSON")
|
Rel(profilePage, backend, "GET/PUT /api/users/me, notification-preferences", "HTTP / JSON")
|
||||||
Rel(userProfile, backend, "GET /api/users/{id}", "HTTP / JSON")
|
Rel(userProfile, backend, "GET /api/users/{id}", "HTTP / JSON")
|
||||||
|
|
||||||
|
|||||||
313
docs/import-migration/01-findings-spreadsheet-analysis.md
Normal file
313
docs/import-migration/01-findings-spreadsheet-analysis.md
Normal file
@@ -0,0 +1,313 @@
|
|||||||
|
# Spreadsheet Analysis — Findings (2026-05-25)
|
||||||
|
|
||||||
|
Analysis of the **real raw archive** spreadsheets against the current `MassImportService`
|
||||||
|
(`backend/.../importing/MassImportService.java`). Goal: import ~7,600 letter rows + a
|
||||||
|
163-person register, with PDFs to follow.
|
||||||
|
|
||||||
|
Every issue has an ID (`IMP-NN`), severity, evidence, and a proposed approach.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Context: how the importer reads a row today
|
||||||
|
|
||||||
|
`MassImportService` reads **sheet index 0** and maps columns by configurable indices
|
||||||
|
(`app.import.col.*`, defaults in the source):
|
||||||
|
|
||||||
|
| Property | Default col | Meaning |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `colIndex` | 0 | Index (→ filename `<index>.pdf`) |
|
||||||
|
| `colBox` | 1 | Box |
|
||||||
|
| `colFolder` | 2 | Mappe |
|
||||||
|
| `colSender` | 3 | Sender (raw) |
|
||||||
|
| `colReceivers` | 5 | Receivers (raw) |
|
||||||
|
| `colDate` | 7 | Date |
|
||||||
|
| `colLocation` | 9 | Location |
|
||||||
|
| `colTags` | 10 | Tag (single) |
|
||||||
|
| `colSummary` | 11 | Summary |
|
||||||
|
| `colTranscription` | 13 | Transcription |
|
||||||
|
|
||||||
|
These defaults match the **ODS** file exactly (`Index, Box, Mappe, Von, BriefeschreiberIn,
|
||||||
|
An, EmpfängerIn, Datum, Datum Originalformat, Ort, Schlagwort, Inhalt, Zeitlicher Kontext,
|
||||||
|
Transkript` = 14 cols). The ODS was the development target. The new xlsx is a different beast.
|
||||||
|
|
||||||
|
Per-row pipeline: skip if Index blank → derive filename from Index → validate filename →
|
||||||
|
look for file on disk (recursive; metadata-only if absent) → check PDF magic bytes →
|
||||||
|
`importSingleDocument` (upsert by `originalFilename`, dedupe non-placeholders as
|
||||||
|
`ALREADY_EXISTS`). Date parsing is **ISO-only** (`LocalDate.parse`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-01 — New xlsx column layout ≠ importer defaults 🔴 BLOCKER
|
||||||
|
|
||||||
|
The new `…aktuell…xlsx` (sheet `Familienarchiv`, 7,943 rows × 12 cols) has a **denser,
|
||||||
|
different** layout. There is an extra `Datei` column at index 1, and the normalized
|
||||||
|
`Von`/`An`/ISO-`Datum` columns from the ODS **do not exist**.
|
||||||
|
|
||||||
|
| col | New xlsx header | Importer default expects | Result with defaults |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| 0 | Index | Index | ✅ ok |
|
||||||
|
| 1 | **Datei** (path) | Box | ❌ Box ← `..\__scan\W-0001.pdf` |
|
||||||
|
| 2 | Box | Mappe | ❌ Mappe ← `V` |
|
||||||
|
| 3 | Mappe | Sender | ❌ Sender ← `1` |
|
||||||
|
| 4 | BriefeschreiberIn (sender) | — (unused) | ❌ sender ignored |
|
||||||
|
| 5 | EmpfängerIn (receiver) | Receivers | ✅ coincidentally ok |
|
||||||
|
| 6 | Datum des Briefes | — (unused) | ❌ date ignored |
|
||||||
|
| 7 | Ort (location) | Date | ❌ Date ← `Rotterdam` → null |
|
||||||
|
| 8 | Schlagwort (tag) | — (unused) | ❌ tag ignored |
|
||||||
|
| 9 | Inhalt (summary) | Location | ❌ Location ← summary text |
|
||||||
|
| 10 | — | Tag | ❌ empty |
|
||||||
|
| 11 | — | Summary | ❌ empty |
|
||||||
|
| 13 | — | Transcription | ❌ column doesn't exist |
|
||||||
|
|
||||||
|
**Impact:** importing as-is produces almost entirely garbage metadata.
|
||||||
|
|
||||||
|
**Proposed approach (decide with Marcel):**
|
||||||
|
- (a) Re-map via the existing `app.import.col.*` properties — fast, no code. New mapping:
|
||||||
|
`index=0, box=2, folder=3, sender=4, receivers=5, date=6, location=7, tags=8, summary=9`,
|
||||||
|
and there is **no** transcription column (point it past the end or add a "missing column"
|
||||||
|
convention). Caveat: tags land in `colTags` but the real per-letter keywords are in
|
||||||
|
`Inhalt` (col 9) — see IMP-08 note on tags vs summary.
|
||||||
|
- (b) Make the importer **header-driven** (map by header name, not index) so it survives
|
||||||
|
layout drift across files. More robust, needs a code change (→ Gitea issue).
|
||||||
|
|
||||||
|
Recommendation: (b) is the durable fix given we have ≥3 different layouts already.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-02 — 90% of dates are free-text the parser can't read 🔴 BLOCKER
|
||||||
|
|
||||||
|
The dates are written **as in the letter**. `parseDate()` only does `LocalDate.parse()`
|
||||||
|
(ISO `yyyy-MM-dd`), so anything non-ISO becomes `null`.
|
||||||
|
|
||||||
|
Of **7,319** rows with a date value (col 6):
|
||||||
|
|
||||||
|
| kind | count | parses today? |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Real Excel date cells (→ ISO via POI) | 748 | ✅ |
|
||||||
|
| Free-text date strings | 6,571 | ❌ → null |
|
||||||
|
|
||||||
|
→ **90% of dated rows lose their date.** (623 rows have no date at all.)
|
||||||
|
|
||||||
|
Observed free-text formats (counts approximate, from col 6):
|
||||||
|
|
||||||
|
| Format | Count | Examples |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `D.M.YY` | 1,338 | `11.10.08`, `13.5.09` |
|
||||||
|
| `D.RomanMonth.YY/YYYY` | ~1,527 | `22.III.18`, `19.XII.1954`, `1.III.27` |
|
||||||
|
| `D.Month YYYY` | 950 | `6.März 1888`, `9.März 1888` (note: **no space** after the dot) |
|
||||||
|
| `D.M.YYYY` | 358 | `15.2.1888`, `7.3.1888` |
|
||||||
|
| Approximate / unknown | 146 | `?`, `13.7.18?`, `17.Nov (?) 1887`, `13.Januar ? 1907` |
|
||||||
|
| `Month YYYY` / season / holiday | 41+27 | `Mai 1895`, `Herbst 1913`, `Pfingsten 1922`, `Ostern 1890` |
|
||||||
|
| `YYYY` only | 17 | `1905`, `1949` |
|
||||||
|
| `D.M.` no year | 10 | `8.9.`, `14.3.` |
|
||||||
|
| Ranges | 5+ | `8.1.1916 - 15.3.1916`, `1881/82`, `1945/46?` |
|
||||||
|
| Abbrev/English months, no space | many | `29.Sept.1891`, `10.Oct.95`, `9.December1889`, `18.Dez.1916` |
|
||||||
|
| Slash separator | ~315 | `2/2. 18`, `17/6. 1916`, `10/4. 1917` |
|
||||||
|
| English `Month D. YYYY` | several | `April 12. 1922`, `Oct.5. 1916`, `Mai 23. 1917` |
|
||||||
|
| Trailing notes | 5+ | `26.4.1888, 2. Brief`, `31.8.1888,2.Brief` |
|
||||||
|
| 3-digit year (typo) | 107 | `30.1.889` (→ 1889), `4.3.1023` (in person file → 1923) |
|
||||||
|
| Day-range within month | several | `7./8. Sept.1923` |
|
||||||
|
|
||||||
|
**Proposed approach:** build a tolerant German/historical date parser (→ Gitea issue, it's
|
||||||
|
a code change). Requirements:
|
||||||
|
- Numeric `D.M.YY[YY]` and `D/M. YY[YY]` (slash = dot).
|
||||||
|
- Roman-numeral months (`I`–`XII`).
|
||||||
|
- German + English month names, full + abbreviated, with/without separating space
|
||||||
|
(`März`, `Sept.`, `Dez`, `December`, `Oct.`).
|
||||||
|
- 2-digit and 3-digit year normalization (`08`→1908? needs a century rule; `889`→1889).
|
||||||
|
- Partial dates → store what's known. The schema only has a single `documentDate
|
||||||
|
LocalDate`; **decide** whether to (i) store first-of-month/year, (ii) add a
|
||||||
|
`datePrecision` enum + `dateOriginal` text column, or (iii) keep raw text in a new
|
||||||
|
`documentDateRaw` field and leave `documentDate` null when imprecise. Recommendation:
|
||||||
|
preserve the **original string** always (new column) + best-effort parsed date +
|
||||||
|
precision flag, so nothing is lost and the UI can show "ca. 1916".
|
||||||
|
- Unparseable/approximate (`?`, `Herbst 1913`) → keep raw, leave parsed date null, **do
|
||||||
|
not drop the row**.
|
||||||
|
|
||||||
|
**Cross-check:** even after IMP-01 is fixed so the date column is read, IMP-02 still bites.
|
||||||
|
Both must be solved before a real import.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-03 — New xlsx has no normalized/ISO date or name columns 🔴 BLOCKER
|
||||||
|
|
||||||
|
The ODS had helper columns the importer relied on: `Von`/`An` (normalized names) and
|
||||||
|
`Datum` (ISO) alongside `Datum Originalformat`. The new xlsx has **only the raw**
|
||||||
|
`BriefeschreiberIn` / `EmpfängerIn` / `Datum des Briefes`. So:
|
||||||
|
- Names must be parsed from raw strings (PersonNameParser already does receivers; **sender
|
||||||
|
is taken raw, never split** — fine for senders, which are single, but no normalization).
|
||||||
|
- Dates must be parsed from raw (IMP-02).
|
||||||
|
|
||||||
|
This is the root reason IMP-01/02 exist: the new file is the *uncurated* source, not the
|
||||||
|
hand-normalized ODS. Tie any importer redesign to this reality — we will not get clean
|
||||||
|
helper columns in the 7k-row file.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-04 — Person register not imported at all 🟠 MAJOR
|
||||||
|
|
||||||
|
`Personendatei 2.xlsx` → sheet `Tabelle1`, **163 people**, columns:
|
||||||
|
`Generation, Familienname, Vorname, geb als (maiden), Geburtsdatum, Geburtsort,
|
||||||
|
Todesdatum, Sterbeort, verheiratet mit, Bemerkung`.
|
||||||
|
|
||||||
|
Today `MassImportService` has **no person-register import**. Persons are only
|
||||||
|
auto-created as bare aliases from the document sender/receiver strings
|
||||||
|
(`personService.findOrCreateByAlias`). All this rich genealogical data is unused:
|
||||||
|
- birth/death dates + places,
|
||||||
|
- maiden names (the key to dedup — see IMP-05),
|
||||||
|
- `verheiratet mit` (marriage links → `PersonRelationship` domain),
|
||||||
|
- `Bemerkung` relationship hints (`"Schwester v Marie Cram"`, `"Nichte von Herbert"`),
|
||||||
|
- `Generation` (G 1–G 4),
|
||||||
|
- nicknames in quotes (`"Tante Lolly"`).
|
||||||
|
|
||||||
|
Data-quality notes in this file too: multi-value `Vorname` (`Charlotte,Meta,Jacobi`);
|
||||||
|
mixed Excel-date vs text dates; typos (`4.3.1023`); missing-day dates (`.12.1955`);
|
||||||
|
trailing spaces (`30.8.1862 `).
|
||||||
|
|
||||||
|
**Proposed approach:** a separate **Person import** (→ Gitea issue). Order matters: import
|
||||||
|
persons *first* so documents can link to real people instead of creating alias stubs.
|
||||||
|
Use `geb als` + `verheiratet mit` to pre-build the alias/relationship graph.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-05 — Name variations create duplicate Persons 🟠 MAJOR
|
||||||
|
|
||||||
|
The same person appears under several surface forms across the document sheet:
|
||||||
|
- `Eugenie Müller` (151) vs `Eugenie de Gruyter` (452) — maiden vs married.
|
||||||
|
- `Clara Cram` (sender 1,284) vs `Clara de Gruyter` (455) vs `Clara de Gruyter sen.` (66).
|
||||||
|
- `Walter de Gruyter` (589) vs bare `Walter` (78).
|
||||||
|
|
||||||
|
`findOrCreateByAlias` keys on the raw string, so each variant becomes (or matches) a
|
||||||
|
distinct alias and likely a **distinct Person**. Result: fragmented person records,
|
||||||
|
broken Briefwechsel pairing, wrong stats.
|
||||||
|
|
||||||
|
**Proposed approach:** drive dedup from the register's `geb als` column (IMP-04) —
|
||||||
|
`Eugenie de Gruyter geb Müller` tells us the two strings are one person. Build an alias
|
||||||
|
map (married ↔ maiden ↔ nickname) before/while importing documents. This is partly data
|
||||||
|
(an alias mapping table/sheet) and partly code (consume it). Likely a Gitea issue once the
|
||||||
|
mapping format is decided.
|
||||||
|
|
||||||
|
945 distinct sender strings / 274 distinct receiver strings — expect a long-tail of
|
||||||
|
variants to reconcile. Don't try to be perfect on the first pass; get the high-frequency
|
||||||
|
names right.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-06 — 93 data rows with blank Index are silently dropped 🟠 MAJOR
|
||||||
|
|
||||||
|
`processRows` does `if (index.isBlank()) continue;`. **93 rows** have a blank Index but
|
||||||
|
carry other data (sender/receiver/date/etc.). These are silently skipped — they don't even
|
||||||
|
appear in the `skippedFiles` report (that list only covers rows that *had* an index but
|
||||||
|
failed file checks).
|
||||||
|
|
||||||
|
**Proposed approach:** before import, triage these 93 rows — are they continuation rows,
|
||||||
|
section markers, or genuine letters missing an ID? At minimum, surface a count/warning so
|
||||||
|
nothing vanishes unnoticed. Possibly a small importer change to report blank-index skips.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-07 — 43 duplicate Index values 🟡 MINOR
|
||||||
|
|
||||||
|
43 Index values repeat (e.g. `W-0388`, `Eu-0332`, `C-0234`, `C-0235`, `C-0236`, `J-0175`).
|
||||||
|
Since the filename is derived from Index, the importer's upsert keys both rows on the same
|
||||||
|
`originalFilename`: the second occurrence is treated as `ALREADY_EXISTS` (if the first
|
||||||
|
isn't a placeholder) and **its metadata is lost**, or it overwrites a placeholder.
|
||||||
|
|
||||||
|
**Proposed approach:** list the 43 duplicates, check whether they're true duplicates or
|
||||||
|
two distinct letters that share an ID by mistake. Fix in the source data, or extend the ID
|
||||||
|
scheme. Data task first; software only if the ID scheme must change.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-08 — Section/title rows interleaved with data 🟡 MINOR
|
||||||
|
|
||||||
|
Row 2 of the sheet is a section header sitting only in the sender column
|
||||||
|
(`Brautbriefe von Walter der Gruyter an Eugenie Müller`) with a blank Index — caught by the
|
||||||
|
blank-Index skip (overlaps IMP-06). There may be more such banners scattered through the 7,943
|
||||||
|
rows. Also relevant: the per-letter **keywords live in `Inhalt` (col 9)** as comma-joined
|
||||||
|
values (`Tilburg,Verwandschaft`, `poetisch,Reise nach Breda`), while `Schlagwort` (col 8)
|
||||||
|
holds a single broad tag (`Brautbriefe`). The importer only takes **one** tag column —
|
||||||
|
decide which column feeds tags vs summary, and whether to split comma-lists into multiple
|
||||||
|
tags.
|
||||||
|
|
||||||
|
**Proposed approach:** scan for rows where Index is blank but other cells are set (already
|
||||||
|
have the count: relates to the 93 in IMP-06). Confirm tag vs summary column choice with
|
||||||
|
Marcel.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-09 — Index ↔ Datei filename mismatches 🟡 MINOR
|
||||||
|
|
||||||
|
The `Datei` column (col 1) holds explicit relative paths (`..\__scan\W-0001.pdf`) but they
|
||||||
|
don't always agree with the Index. Example: row 20 has Index `W-0010x` but Datei
|
||||||
|
`..\__scan\W-0011x.pdf`. The importer derives the filename from **Index**, so it will look
|
||||||
|
for `W-0010x.pdf` and may miss the actual scan. (Note: the `Datei` paths themselves are
|
||||||
|
Windows-style with `\` and `..` and would be **rejected** by `isValidImportFilename` if anyone
|
||||||
|
tried to use that column directly — 7,623 rows use backslashes, 7,455 contain `..`.)
|
||||||
|
|
||||||
|
**Proposed approach:** when the PDFs arrive, reconcile Index-derived names against actual
|
||||||
|
filenames; produce a mismatch report. Keep deriving from Index (stable IDs) but flag
|
||||||
|
disagreements. Mostly a data/QA task.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-10 — `x`-suffix rows (letter backsides / enclosures) 🟡 MINOR
|
||||||
|
|
||||||
|
**42 rows** have an `x`-suffixed Index (`W-0001x`, `W-0002x`, …). They're sparse — typically
|
||||||
|
only Index + Datei + sender + receiver, no box/folder/date. They appear to be the reverse
|
||||||
|
side or an enclosure of the preceding letter. The importer treats each as an independent
|
||||||
|
Document, and the `metadataComplete` heuristic flags them complete as soon as a sender is
|
||||||
|
present, even though date/box/folder are all missing.
|
||||||
|
|
||||||
|
**Proposed approach:** decide whether `x` rows should be (a) separate documents, (b) extra
|
||||||
|
pages/files attached to their parent, or (c) skipped. Affects both the data model and the
|
||||||
|
`metadataComplete` heuristic. Discuss with Marcel.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-11 — Multi-receiver separators include bare `u` / `u.` 🟡 MINOR
|
||||||
|
|
||||||
|
`PersonNameParser.parseReceivers` already handles ` und `, ` u `, `//`, `geb.`,
|
||||||
|
parenthesised shared surnames, and `Familie` filtering — good. But the real data also uses
|
||||||
|
the abbreviation in forms that the top-receivers list shows to be common:
|
||||||
|
`Eugenie u Walter de Gruyter` (230), `Herbert u Clara` (94), `Juan u Marie Cram` (75),
|
||||||
|
and space-joined pairs like `Ella Anita` (79) that may be two people.
|
||||||
|
Raw separator tally on receivers: ` und ` ×70, `,` ×11, `;` ×2, `/` ×1 — plus the many ` u `
|
||||||
|
cases above. Senders are **not** parsed at all (taken raw), which is fine unless a sender
|
||||||
|
cell ever holds two names.
|
||||||
|
|
||||||
|
**Proposed approach:** add `MassImportServiceTest` cases for the real-world strings above;
|
||||||
|
extend the parser only where it actually fails. `Ella Anita`-style space-joined pairs are
|
||||||
|
ambiguous — likely leave as one person unless the register says otherwise (ties to IMP-05).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## IMP-12 — Importer reads only the first sheet, no validation 🟡 MINOR
|
||||||
|
|
||||||
|
`readXlsx` does `workbook.getSheetAt(0)`. For the new xlsx that's `Familienarchiv` (✅), but
|
||||||
|
the file also contains `Inhaltsverzeichnis grob`, `Inhaltsverzeichnis WdG`, `Tabelle4`.
|
||||||
|
There is no header validation: if the wrong file/sheet is dropped in `/import`, the importer
|
||||||
|
will happily map columns positionally and import nonsense. Also `findSpreadsheetFile()` picks
|
||||||
|
the **first** spreadsheet found in `/import` — with three spreadsheets present there today,
|
||||||
|
which one wins is filesystem-order-dependent.
|
||||||
|
|
||||||
|
**Proposed approach:** (a) validate the header row against expected names before importing;
|
||||||
|
(b) make the target sheet/file explicit (config or header match) rather than "first found".
|
||||||
|
Ties into the header-driven mapping in IMP-01(b).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary of recommended sequencing
|
||||||
|
|
||||||
|
1. **Decide the importer mapping strategy** (IMP-01): positional re-config vs header-driven.
|
||||||
|
Header-driven is the durable choice and unblocks IMP-03/12.
|
||||||
|
2. **Build the tolerant date parser** (IMP-02) with original-string preservation + precision.
|
||||||
|
3. **Import the Person register first** (IMP-04) and build the alias/marriage graph,
|
||||||
|
which feeds person dedup (IMP-05).
|
||||||
|
4. **Then import documents**, with reporting for blank-index (IMP-06), duplicates (IMP-07),
|
||||||
|
and section rows (IMP-08).
|
||||||
|
5. **Reconcile files** when the ~7,000 PDFs arrive (IMP-09), and decide `x`-row semantics
|
||||||
|
(IMP-10).
|
||||||
|
|
||||||
|
Code-change items (→ Gitea issues when we get there): IMP-01(b), IMP-02, IMP-04, IMP-05
|
||||||
|
(consume side), IMP-06 reporting, IMP-12. Pure-data items stay in this folder.
|
||||||
386
docs/import-migration/02-normalization-spec.md
Normal file
386
docs/import-migration/02-normalization-spec.md
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
# Spec — Import Normalizer
|
||||||
|
|
||||||
|
> Authored in the voice of **"Elicit"**, requirements engineer (see
|
||||||
|
> `.claude/personas/req_engineer.md`). This is a requirements artifact: it states
|
||||||
|
> *what* the normalizer must do and *how we'll know it's done*, in problem/behaviour
|
||||||
|
> language. Technology choices already made during brainstorming (Python, openpyxl,
|
||||||
|
> overrides-and-rerun) are recorded as **constraints**, not re-litigated here.
|
||||||
|
|
||||||
|
- **Status:** Draft for review
|
||||||
|
- **Date:** 2026-05-25
|
||||||
|
- **Related:** [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) (issues `IMP-01..12`), [`README.md`](./README.md)
|
||||||
|
- **Scope boundary:** This spec covers the **offline normalizer** that turns the raw
|
||||||
|
spreadsheets into a clean, canonical dataset + review artifacts. Wiring the canonical
|
||||||
|
contract into the Java `MassImportService` and the `Document`/`Person` model is **Phase 2**
|
||||||
|
and gets its own spec. This spec only *defines the contract* Phase 2 must satisfy.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Project Brief
|
||||||
|
|
||||||
|
**Vision.** Turn the family's human-curated, free-form archive spreadsheets into a clean,
|
||||||
|
canonical dataset that imports deterministically — without hand-editing thousands of rows
|
||||||
|
and without losing the historical nuance of how things were originally written.
|
||||||
|
|
||||||
|
**Problem.** The real archive (`…aktuell…xlsx`, 7,943 rows) and the person register
|
||||||
|
(`Personendatei 2.xlsx`, 163 people) were authored for humans to read, not machines to
|
||||||
|
import. Dates are written as they appeared in each letter (≈90% unparseable by the current
|
||||||
|
importer), the column layout differs from what the importer expects, and the same person
|
||||||
|
appears under many names. Importing as-is produces garbage (see `IMP-01..12`).
|
||||||
|
|
||||||
|
**Goal (measurable).**
|
||||||
|
- G1 — After the automated pass, **≤ 5%** of dated rows remain `UNKNOWN`; after the
|
||||||
|
overrides-iteration loop, **≤ 0.5%**.
|
||||||
|
- G2 — **100%** of source rows are represented in the canonical output or in a review file —
|
||||||
|
*zero silent drops*.
|
||||||
|
- G3 — **100%** of original values (raw date string, raw name string, source row number)
|
||||||
|
are preserved.
|
||||||
|
- G4 — A full run over the current inputs completes in **< 60 s** on the dev laptop and is
|
||||||
|
**content-deterministic** when re-run with unchanged inputs+overrides: identical canonical
|
||||||
|
cell matrices and identical review-file contents. (Workbook metadata is pinned; literal xlsx
|
||||||
|
byte-identity is not guaranteed because the zip container stores entry metadata.)
|
||||||
|
|
||||||
|
**Primary actor.** Marcel — solo owner & data steward (tech comfort 4/5). Also: a future
|
||||||
|
agent re-running the pipeline; and the `MassImportService` as the downstream consumer.
|
||||||
|
|
||||||
|
**Non-Goals (explicitly out of scope).**
|
||||||
|
- NG1 — Changing `MassImportService` or the DB schema (that is Phase 2).
|
||||||
|
- NG2 — Uploading/attaching the ~7,000 PDFs (they arrive later; import matches by `index`).
|
||||||
|
- NG3 — A GUI. The interface is spreadsheets in, CSVs out, an overrides file hand-edited.
|
||||||
|
- NG4 — Perfect genealogical reconstruction. We resolve confidently-matchable people; the
|
||||||
|
long tail stays as provisional persons.
|
||||||
|
- NG5 — OCR/transcription content (the new xlsx has no transcription column).
|
||||||
|
|
||||||
|
**Key assumptions.** (A1) Sheet `Familienarchiv` is the document source of truth.
|
||||||
|
(A2) Archive date range is **1873–1957** (drives the 2-digit-year century rule).
|
||||||
|
(A3) `index` is the stable document key and the basis for future PDF matching.
|
||||||
|
(A4) `Schlagwort` is a broad tag; `Inhalt` is a short summary/topic.
|
||||||
|
|
||||||
|
**Risks.** (R1) 2-digit/partial dates are genuinely ambiguous → mitigated by precision flag
|
||||||
|
and overrides. (R2) Name matching false-positives merge distinct people → mitigated by
|
||||||
|
conservative matching + review before merge. (R3) Source spreadsheet may be re-exported with
|
||||||
|
layout drift → mitigated by header-name-based mapping, not fixed indices.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Personas
|
||||||
|
|
||||||
|
**Marcel — Data Steward.** Role: solo owner of Familienarchiv. Context: holds the complete
|
||||||
|
raw archive; PDFs follow. Tech comfort: 4/5 (semi-technical, reads CSV/spreadsheets fluently,
|
||||||
|
not keen to hand-edit 7,600 rows). Primary goal: a clean, importable dataset he trusts.
|
||||||
|
Frustrations: dates in ~20 formats; one ancestor under 4 name variants. **JTBD:** *"When I
|
||||||
|
have raw, human-curated archive spreadsheets, I want to transform them into a clean importable
|
||||||
|
dataset without losing how things were originally written, so I can load the archive and keep
|
||||||
|
correcting edge cases as they surface."*
|
||||||
|
|
||||||
|
**The Returning Agent.** Role: a future assistant session resuming the work. Goal: re-run the
|
||||||
|
pipeline deterministically and understand exactly what still needs human input. **JTBD:**
|
||||||
|
*"When I pick this up cold, I want one command and a clear residue report, so I can continue
|
||||||
|
without re-deriving context."*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Constraints & Decisions Already Made
|
||||||
|
|
||||||
|
These were settled during brainstorming and are fixed inputs to the requirements below.
|
||||||
|
|
||||||
|
| # | Decision | Rationale |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| C1 | **New canonical layout** with explicit headers (not the old positional ODS shape). | Fits the new data; importer becomes header-driven in Phase 2. |
|
||||||
|
| C2 | Dates stored as **parsed (nullable) + raw + precision**. | Historical archive; never lose the original; enable "ca. 1916". |
|
||||||
|
| C3 | **Include person resolution** (register + alias/marriage map → canonical persons) in this effort. | Maiden-name dedup needs the register. |
|
||||||
|
| C4 | **Overrides-file + re-run** loop for residue. | Deterministic, diffable, repeatable. |
|
||||||
|
| C5 | Implementation: **Python 3.12 + openpyxl**, standalone tool at `tools/import-normalizer/`. | Fast iteration; no Spring rebuild / coverage gate on transform code. |
|
||||||
|
| C6 | Century rule for archive **1873–1957**: 2-digit `00–57`→`19YY`, `73–99`→`18YY`, `58–72`→**flag**; 3-digit `DDD`→`1DDD`; never 20xx. | Stated by Marcel. Boundaries live in config. |
|
||||||
|
| C7 | `Schlagwort`→tag, `Inhalt`→summary. | Matches importer's existing semantics. |
|
||||||
|
| C8 | Non-register correspondents become **provisional persons**. | ~945 distinct sender strings vs 163 register people. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Functional Requirements
|
||||||
|
|
||||||
|
Each requirement has a stable ID. User stories use Connextra + Given-When-Then; system rules
|
||||||
|
use EARS. Traceability to findings in §8.
|
||||||
|
|
||||||
|
### 4.1 Ingest & layout (`FR-INGEST`, `FR-MAP`)
|
||||||
|
|
||||||
|
**US-MAP-01** — *As the data steward, I want each source column mapped to a named canonical
|
||||||
|
field regardless of its position, so a re-exported spreadsheet with shifted columns still
|
||||||
|
imports correctly.*
|
||||||
|
- AC1 — Given the `Familienarchiv` sheet, when the normalizer reads the header row, then it
|
||||||
|
maps columns by **header name** (not fixed index) to the canonical fields.
|
||||||
|
- AC2 — Given a header the normalizer does not recognise, when it runs, then it records the
|
||||||
|
unknown header in `review/summary.txt` and continues (does not crash).
|
||||||
|
- AC3 — Given a required source header is **absent**, when it runs, then it aborts with a
|
||||||
|
clear message naming the missing header (fail loud, before producing partial output).
|
||||||
|
|
||||||
|
- **REQ-INGEST-01** — The normalizer shall read only the `Familienarchiv` sheet of the
|
||||||
|
document workbook and the `Tabelle1` sheet of the person workbook.
|
||||||
|
- **REQ-MAP-01** — Header matching shall be case-insensitive and tolerant of internal
|
||||||
|
multiple spaces (e.g. a header typed as `"Datum  des  Briefes"` still matches `Datum des Briefes`).
|
||||||
|
|
||||||
|
### 4.2 Row triage (`FR-TRIAGE`) — resolves IMP-06, IMP-07, IMP-08
|
||||||
|
|
||||||
|
**US-TRIAGE-01** — *As the data steward, I want rows that have data but no index surfaced
|
||||||
|
rather than dropped, so I never lose a letter silently.*
|
||||||
|
- AC1 — Given a row whose `index` is blank but which has any other non-empty cell, when the
|
||||||
|
normalizer runs, then that row is written to `review/blank-index-rows.csv` with its source
|
||||||
|
row number and is **not** emitted as a canonical document.
|
||||||
|
- AC2 — Given a fully empty row, when it runs, then the row is skipped and counted (not
|
||||||
|
reported as an anomaly).
|
||||||
|
|
||||||
|
- **REQ-TRIAGE-01** — If two or more rows resolve to the same `index`, then the normalizer
|
||||||
|
shall emit all of them to `review/duplicate-index.csv` and mark each canonical row
|
||||||
|
`needs_review = duplicate_index` (it shall **not** silently drop any of them).
|
||||||
|
- **REQ-TRIAGE-02** — Where a row is identified as a section/banner row (blank index, text
|
||||||
|
only in a name column), the normalizer shall classify it as such in the blank-index report.
|
||||||
|
- **REQ-TRIAGE-03** — Rows whose `index` ends in `x` (a transcription/back-side of the base
|
||||||
|
letter, not yet independently mappable) shall be **skipped** — not emitted as a canonical
|
||||||
|
document — and written to `review/skipped-x-suffix.csv` with their source row and base index
|
||||||
|
(`index` minus the trailing `x`), so they can be linked in a later pass. (Resolves IMP-10.)
|
||||||
|
|
||||||
|
### 4.3 Date normalization (`FR-DATE`) — resolves IMP-02, IMP-03
|
||||||
|
|
||||||
|
**US-DATE-01** — *As the data steward, I want every date interpreted as precisely as the
|
||||||
|
source allows, with the original always kept, so I can sort the archive and still see what the
|
||||||
|
letter actually said.*
|
||||||
|
- AC1 — Given a parseable date, when normalized, then `date_iso` holds the best-effort ISO
|
||||||
|
date, `date_raw` holds the verbatim source string, and `date_precision` ∈
|
||||||
|
`{DAY, MONTH, SEASON, YEAR, RANGE, APPROX, UNKNOWN}`.
|
||||||
|
- AC2 — Given an unparseable date, when normalized, then `date_iso` is empty,
|
||||||
|
`date_precision = UNKNOWN`, `date_raw` is preserved, and the value appears in
|
||||||
|
`review/unparsed-dates.csv`.
|
||||||
|
- AC3 — Given the same `date_raw` appears in `overrides/dates.csv`, when normalized, then the
|
||||||
|
override's `(iso, precision)` wins over the automatic parse.
|
||||||
|
|
||||||
|
- **REQ-DATE-01** — The parser shall accept, at minimum, these forms (see §10 examples):
|
||||||
|
Excel/ISO; `D.M.YYYY`/`D.M.YY`; `D/M. YY[YY]` (slash treated as dot); Roman-numeral months
|
||||||
|
`I–XII`; German + English month names, full and abbreviated, with or without a separating
|
||||||
|
space; `Month YYYY`; season/holiday + year; bare `YYYY`; and start-anchored ranges.
|
||||||
|
- **REQ-DATE-02** — Precision shall be assigned by what is known: full day → `DAY`; month+year
|
||||||
|
→ `MONTH` (day = 1); a **named feast/holiday + year** → resolved to its **actual calendar
|
||||||
|
date for that year** → `DAY`; a **season + year** → representative mid-season month (day = 1)
|
||||||
|
→ `SEASON`; year only → `YEAR` (month = Jan, day = 1); a range → start date + `RANGE`; a
|
||||||
|
value carrying an uncertainty marker (`?`, `um`, `ca`, `circa`) → `APPROX` with best-effort date.
|
||||||
|
- **REQ-DATE-03** — Two-digit and three-digit years shall be expanded per **C6**; a 2-digit
|
||||||
|
year in `58–72` shall yield `UNKNOWN` + a review entry rather than a guess.
|
||||||
|
- **REQ-DATE-04** — Trailing editorial notes (e.g. `", 2. Brief"`) shall be stripped before
|
||||||
|
parsing but preserved verbatim in `date_raw`; they shall never be interpreted as part of the date.
|
||||||
|
- **REQ-DATE-05** — The parser shall be pure and side-effect-free so it can be unit-tested in
|
||||||
|
isolation (see NFR-TEST-01).
|
||||||
|
- **REQ-DATE-06** — **Movable feasts are never mapped to a fixed month**; they shall be
|
||||||
|
computed per year from Easter (Gauss/Butcher computus): Karfreitag = Easter−2, Ostern =
|
||||||
|
Easter Sunday, Himmelfahrt = Easter+39, Pfingst(sonntag) = Easter+49, Pfingstmontag =
|
||||||
|
Easter+50, Fronleichnam = Easter+60, 1.–4. Advent = the 4th…1st Sunday before 25 Dec. Fixed
|
||||||
|
feasts use a lookup table (Neujahr=01-01, Heiligabend=12-24, Weihnachten=12-25,
|
||||||
|
Silvester=12-31, …). Seasons map to representative months: Frühling/Frühjahr=Apr, Sommer=Jul,
|
||||||
|
Herbst=Oct, Winter=Jan. The feast/season tables and Easter algorithm live in `config.py`
|
||||||
|
(NFR-MAINT-01).
|
||||||
|
|
||||||
|
### 4.4 Person resolution & dedup (`FR-PERS`, `FR-DEDUP`) — resolves IMP-04, IMP-05, IMP-11
|
||||||
|
|
||||||
|
**US-PERS-01** — *As the data steward, I want the genealogical register turned into canonical
|
||||||
|
people with all their known facts, so documents can link to real persons.*
|
||||||
|
- AC1 — Given a register row, when parsed, then a canonical person is produced with
|
||||||
|
`person_id`, name parts, `maiden_name`, birth/death (parsed + raw + place), spouse,
|
||||||
|
generation, nickname, notes — applying the same date rules as §4.3 to birth/death dates.
|
||||||
|
- AC2 — Given multi-value given names (`"Charlotte,Meta,Jacobi"`), when parsed, then the
|
||||||
|
primary given name is the first; the remainder are retained as additional names/aliases.
|
||||||
|
|
||||||
|
**US-PERS-02** — *As the data steward, I want each sender/receiver string matched to a
|
||||||
|
canonical person where possible and never dropped otherwise, so the correspondence graph is
|
||||||
|
complete.*
|
||||||
|
- AC1 — Given a sender/receiver string, when resolved, then it maps to a register
|
||||||
|
`person_id` via the alias index (exact → normalized/casefold → conservative fuzzy).
|
||||||
|
- AC2 — Given no confident match, when resolved, then a **provisional person** is created from
|
||||||
|
the cleaned string, linked, and listed in `review/unmatched-names.csv` (occurrence count +
|
||||||
|
example source rows).
|
||||||
|
- AC3 — Given the string appears in `overrides/names.csv`, when resolved, then it maps to the
|
||||||
|
specified `person_id` (override wins).
|
||||||
|
- AC4 — Given a multi-person receiver cell (`"Eugenie u Walter de Gruyter"`, `"Herbert u
|
||||||
|
Clara"`, `"…//…"`, `"Hedi und Tutu (Gruber)"`), when resolved, then it is split into
|
||||||
|
individual people, each resolved independently; ambiguous space-joined pairs
|
||||||
|
(`"Ella Anita"`) are emitted to `review/ambiguous-receivers.csv` rather than guessed.
|
||||||
|
|
||||||
|
- **REQ-DEDUP-01** — The alias index shall be derived from the register: canonical
|
||||||
|
"First Last", maiden form (`geb als`), spouse-surname married form, nickname, and
|
||||||
|
first-name-only **only when unambiguous** across the register.
|
||||||
|
- **REQ-DEDUP-02** — The normalizer shall not merge two distinct strings into one person on
|
||||||
|
fuzzy similarity (at or above the configured threshold) unless that match is reported; merges
|
||||||
|
must be auditable.
|
||||||
|
- **REQ-PERS-01** — Sender cells shall be parsed for multi-person content using the same rules
|
||||||
|
as receiver cells (today the importer parses only receivers — IMP-11).
|
||||||
|
|
||||||
|
### 4.5 Overrides & idempotency (`FR-OVR`) — supports the iteration loop
|
||||||
|
|
||||||
|
- **REQ-OVR-01** — When the normalizer runs, it shall load `overrides/dates.csv` and
|
||||||
|
`overrides/names.csv` if present and apply them; absence of either file shall not be an error.
|
||||||
|
- **REQ-OVR-02** — While overrides are unchanged and inputs are unchanged, re-running shall
|
||||||
|
produce **content-identical** canonical outputs and review files — identical cell matrices and review-file contents; xlsx byte-identity is not required (NFR-IDEM-01).
|
||||||
|
- **REQ-OVR-03** — Each override application shall be counted in `review/summary.txt` (how many
|
||||||
|
dates/names were resolved by override vs automatically).
|
||||||
|
|
||||||
|
### 4.6 Canonical output & provenance (`FR-OUT`, `FR-PROV`) — resolves IMP-01, IMP-09, IMP-12
|
||||||
|
|
||||||
|
- **REQ-OUT-01** — The normalizer shall write `out/canonical-documents.xlsx` and
|
||||||
|
`out/canonical-persons.xlsx` with the headered schemas in §6.
|
||||||
|
- **REQ-PROV-01** — Every canonical document row shall carry `source_row` (1-based row number
|
||||||
|
in the source sheet) so any value can be traced back to the original.
|
||||||
|
- **REQ-PROV-02** — Every canonical row shall carry a `needs_review` field listing zero or more
|
||||||
|
flags (`duplicate_index`, `unparsed_date`, `unmatched_sender`, `unmatched_receiver`,
|
||||||
|
`index_file_mismatch`, …) so the import and the UI can foreground uncertain data.
|
||||||
|
- **REQ-OUT-02** — Where the source `Datei` path disagrees with the index-derived filename
|
||||||
|
(IMP-09), the normalizer shall record the discrepancy in `review/index-file-mismatch.csv`
|
||||||
|
and flag the row; it shall **not** alter the `index` (the stable key).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Non-Functional Requirements
|
||||||
|
|
||||||
|
| ID | Category | Requirement (measurable) |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| NFR-DATA-01 | Data integrity | 100% of source rows are accounted for in output **or** a review file; 100% of original date/name strings preserved verbatim. |
|
||||||
|
| NFR-IDEM-01 | Determinism | Identical inputs + overrides ⇒ identical *logical* output across runs/machines: identical canonical cell matrices and review-file contents. Workbook `created`/`modified` metadata is pinned to a constant; ordering of all generated rows/aliases is stable (no set-iteration leakage). xlsx byte-identity is explicitly not required — determinism is asserted on content. |
|
||||||
|
| NFR-PERF-01 | Performance | Full run over 7,943 doc rows + 163 person rows completes in < 60 s on the dev laptop. |
|
||||||
|
| NFR-ACCUR-01 | Date accuracy | After automated pass, `UNKNOWN` dates ≤ 5% of dated rows; after overrides iteration, ≤ 0.5%. |
|
||||||
|
| NFR-ACCUR-02 | Name coverage | Every sender/receiver occurrence yields a linked person (register or provisional); 0 dropped. |
|
||||||
|
| NFR-I18N-01 | Encoding | UTF-8 end-to-end; German diacritics and ß round-trip with no mojibake in any output. |
|
||||||
|
| NFR-TEST-01 | Testability | `dates.py` and `persons.py` have pytest tests covering every format/alias category in §10 with real examples from the archive. |
|
||||||
|
| NFR-MAINT-01 | Maintainability | Column-name map, century boundaries, season→month map, and fuzzy threshold live in `config.py`, not inline in logic. |
|
||||||
|
| NFR-OBSERV-01 | Observability | `review/summary.txt` reports per-run stats: rows in, documents out, dates by precision, names matched vs provisional, overrides applied, anomalies by type. |
|
||||||
|
| NFR-SAFETY-01 | Source safety | Source workbooks are opened read-only and never written. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Data Dictionary (canonical contract)
|
||||||
|
|
||||||
|
This is the contract Phase 2 (the importer) must consume. Field-level, format-level — not a
|
||||||
|
DB schema.
|
||||||
|
|
||||||
|
### 6.1 `canonical-documents.xlsx`
|
||||||
|
|
||||||
|
| Field | Required | Format / values | Notes |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| `index` | yes | string | Stable key; basis for PDF matching. |
|
||||||
|
| `box` | no | string | from `Box`. |
|
||||||
|
| `folder` | no | string | from `Mappe`. |
|
||||||
|
| `sender_person_id` | no | person_id | resolved; empty if no sender. |
|
||||||
|
| `sender_name` | no | string | canonical display name (or cleaned raw if provisional). |
|
||||||
|
| `receiver_person_ids` | no | `id\|id\|…` | pipe-separated. |
|
||||||
|
| `receiver_names` | no | `name\|name\|…` | pipe-separated, aligned with ids. |
|
||||||
|
| `date_iso` | no | `YYYY-MM-DD` | best-effort; empty if `UNKNOWN`. |
|
||||||
|
| `date_raw` | no | string | verbatim source date. |
|
||||||
|
| `date_precision` | yes | enum | `DAY\|MONTH\|SEASON\|YEAR\|RANGE\|APPROX\|UNKNOWN`. |
|
||||||
|
| `location` | no | string | from `Ort`. |
|
||||||
|
| `tags` | no | `tag\|tag` | from `Schlagwort`. |
|
||||||
|
| `summary` | no | string | from `Inhalt`. |
|
||||||
|
| `source_row` | yes | int | 1-based row number in the source sheet; provenance (REQ-PROV-01, NFR-DATA-01). |
|
||||||
|
| `needs_review` | yes | `flag\|flag` or empty | review flags (REQ-PROV-02). |
|
||||||
|
|
||||||
|
### 6.2 `canonical-persons.xlsx`
|
||||||
|
|
||||||
|
| Field | Required | Format | Notes |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| `person_id` | yes | slug | stable id (e.g. `de-gruyter-eugenie`); collisions suffixed. |
|
||||||
|
| `last_name` | yes | string | from `Familienname`. |
|
||||||
|
| `first_name` | no | string | primary given name. |
|
||||||
|
| `maiden_name` | no | string | from `geb als` — drives dedup. |
|
||||||
|
| `title` | no | string | e.g. honorifics if present. |
|
||||||
|
| `nickname` | no | string | from quoted `Bemerkung`/spouse field. |
|
||||||
|
| `birth_date` / `birth_date_raw` / `birth_place` | no | ISO / string / string | §4.3 rules. |
|
||||||
|
| `death_date` / `death_date_raw` / `death_place` | no | ISO / string / string | §4.3 rules. |
|
||||||
|
| `spouse` | no | person_id or name | from `verheiratet mit`. |
|
||||||
|
| `generation` | no | string | `G 1`..`G 4`. |
|
||||||
|
| `notes` | no | string | from `Bemerkung`. |
|
||||||
|
| `aliases` | no | `a\|b\|c` | every surface form that maps here. |
|
||||||
|
| `provisional` | yes | bool | true if created from a document string, not the register. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Prioritized Backlog (MoSCoW)
|
||||||
|
|
||||||
|
| ID | Item | MoSCoW | Effort | Depends on |
|
||||||
|
| --- | --- | --- | --- | --- |
|
||||||
|
| B1 | Project scaffolding + read both workbooks (`FR-INGEST`, header map `FR-MAP`) | Must | S | — |
|
||||||
|
| B2 | Row triage + blank/duplicate/empty reports (`FR-TRIAGE`) | Must | S | B1 |
|
||||||
|
| B3 | Date parser + precision + century rule + Easter/feast computus + season map + tests (`FR-DATE`) | Must | L | B1 |
|
||||||
|
| B4 | Person register parser → canonical persons (`FR-PERS` US-PERS-01) | Must | M | B1 |
|
||||||
|
| B5 | Alias index + name resolution + multi-person split (`FR-DEDUP`, US-PERS-02) | Must | L | B4 |
|
||||||
|
| B6 | Overrides load + apply + idempotency (`FR-OVR`) | Must | S | B3,B5 |
|
||||||
|
| B7 | Canonical writers + provenance + review summary (`FR-OUT`, `FR-PROV`) | Must | M | B2,B3,B5 |
|
||||||
|
| B8 | Index↔Datei mismatch report (`REQ-OUT-02`) | Should | XS | B1 |
|
||||||
|
| B9 | Ambiguous-receiver review path (US-PERS-02 AC4) | Should | S | B5 |
|
||||||
|
| B10 | Comma-split `Inhalt` into extra tags | Could | XS | B7 |
|
||||||
|
| B11 | Phase-2 importer wiring (separate spec) | Won't (this spec) | — | B7 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Traceability — Findings → Requirements
|
||||||
|
|
||||||
|
| Finding | Severity | Addressed by |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| IMP-01 layout mismatch | blocker | C1, FR-MAP, REQ-OUT-01 |
|
||||||
|
| IMP-02 free-text dates | blocker | FR-DATE (all), C2, C6 |
|
||||||
|
| IMP-03 no ISO/normalized cols | blocker | FR-DATE, FR-PERS |
|
||||||
|
| IMP-04 register unimported | major | C3, US-PERS-01, §6.2 |
|
||||||
|
| IMP-05 name variants → dupes | major | C3, FR-DEDUP |
|
||||||
|
| IMP-06 blank-index dropped | major | US-TRIAGE-01 |
|
||||||
|
| IMP-07 duplicate indices | minor | REQ-TRIAGE-01 |
|
||||||
|
| IMP-08 section rows / tags vs summary | minor | REQ-TRIAGE-02, C7 |
|
||||||
|
| IMP-09 index↔file mismatch | minor | REQ-OUT-02, B8 |
|
||||||
|
| IMP-10 `x`-suffix rows | minor | REQ-TRIAGE-03 (skip + log this pass) |
|
||||||
|
| IMP-11 sender not split / ` u ` sep | minor | REQ-PERS-01, US-PERS-02 AC4 |
|
||||||
|
| IMP-12 first-sheet, no validation | minor | REQ-INGEST-01, US-MAP-01 AC2/AC3 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Open Questions / TBD Register
|
||||||
|
|
||||||
|
| ID | Question | Why it matters | Ref | Resolution |
|
||||||
|
| --- | --- | --- | --- | --- |
|
||||||
|
| OQ-01 ✅ | Season/holiday → date. | Accuracy of ~70 SEASON/feast rows. | REQ-DATE-06 | **Resolved (2026-05-25):** movable feasts (Ostern, Pfingsten, Himmelfahrt, Advent, …) **computed per year from Easter — never a fixed month**; fixed feasts looked up (Weihnachten=12-25, Neujahr=01-01, …); seasons = mid-season month (Frühling=Apr, Sommer=Jul, Herbst=Oct, Winter=Jan). |
|
||||||
|
| OQ-02 ✅ | Date ranges: start only, or start+end? | Sorting/display of ~315 range values. | REQ-DATE-02 | **Confirmed:** store **start** in `date_iso`, precision `RANGE`, full text in `date_raw`. |
|
||||||
|
| OQ-03 ✅ | `person_id` format. | Stability across re-runs; diffability. | §6 | **Confirmed:** readable slug `lastname-firstname`, numeric suffix on collision. |
|
||||||
|
| OQ-04 ✅ | `x`-suffix row handling. | 42 rows. | REQ-TRIAGE-03 | **Resolved (2026-05-25):** `x` rows are transcriptions of the base letter but not yet mappable → **skip this pass**, log to `review/skipped-x-suffix.csv` for later linking. |
|
||||||
|
| OQ-05 ✅ | Importer output format. | Phase-2 reader. | B11 | **Confirmed:** `.xlsx` (openpyxl-native, headered). |
|
||||||
|
| OQ-06 ✅ | Fuzzy-match policy. | False-positive person merges (R2). | REQ-DEDUP-02 | **Confirmed:** conservative — report all fuzzy matches; no silent merge. |
|
||||||
|
|
||||||
|
*All open questions resolved as of 2026-05-25. New ambiguities discovered during build go here.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Glossary & Worked Examples
|
||||||
|
|
||||||
|
**Precision** — how exactly a date is known (`DAY` … `UNKNOWN`). **Provisional person** — a
|
||||||
|
person created from a document name string with no register match. **Alias index** — map from
|
||||||
|
every known surface form of a name to a canonical `person_id`. **Override** — a
|
||||||
|
human-supplied correction applied deterministically on each run.
|
||||||
|
|
||||||
|
**Date examples → expected outcome:**
|
||||||
|
|
||||||
|
| `date_raw` | `date_iso` | `date_precision` |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `15.2.1888` | 1888-02-15 | DAY |
|
||||||
|
| `6.März 1888` | 1888-03-06 | DAY |
|
||||||
|
| `22.III.18` | 1918-03-22 | DAY |
|
||||||
|
| `13.5.09` | 1909-05-13 | DAY |
|
||||||
|
| `10.Oct.95` | 1895-10-10 | DAY |
|
||||||
|
| `17/6. 1916` | 1916-06-17 | DAY |
|
||||||
|
| `Mai 1895` | 1895-05-01 | MONTH |
|
||||||
|
| `Pfingsten 1922` | 1922-06-04 | DAY (computed: Easter 1922 = Apr 16, +49 days) |
|
||||||
|
| `Herbst 1913` | 1913-10-01 | SEASON |
|
||||||
|
| `1905` | 1905-01-01 | YEAR |
|
||||||
|
| `8.1.1916 - 15.3.1916` | 1916-01-08 | RANGE |
|
||||||
|
| `17.Nov (?) 1887` | 1887-11-17 | APPROX |
|
||||||
|
| `?` | *(empty)* | UNKNOWN |
|
||||||
|
|
||||||
|
**Name examples → expected outcome:**
|
||||||
|
|
||||||
|
| raw cell | resolves to |
|
||||||
|
| --- | --- |
|
||||||
|
| `Eugenie Müller` (+ register `geb Müller`) | `de-gruyter-eugenie` (matched via maiden alias) |
|
||||||
|
| `Eugenie de Gruyter` | `de-gruyter-eugenie` |
|
||||||
|
| `Herbert u Clara` | `cram-herbert` + `cram-clara` (split, surname distributed) |
|
||||||
|
| `Hedi und Tutu (Gruber)` | `gruber-hedi` + `gruber-tutu` |
|
||||||
|
| `Ella Anita` | → `review/ambiguous-receivers.csv` (not auto-split) |
|
||||||
|
| `Hans Wittkopf` (not in register) | provisional `wittkopf-hans` |
|
||||||
2281
docs/import-migration/03-normalizer-implementation-plan.md
Normal file
2281
docs/import-migration/03-normalizer-implementation-plan.md
Normal file
File diff suppressed because it is too large
Load Diff
502
docs/import-migration/04-unresolved-names-plan.md
Normal file
502
docs/import-migration/04-unresolved-names-plan.md
Normal file
@@ -0,0 +1,502 @@
|
|||||||
|
# Unresolved-Name Classification Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** Add a focused `review/unresolved-names.csv` that isolates sender/receiver strings whose *name itself* is problematic (unknown/illegible, single-token, relational-only, collective/group, prose-in-name-column, or a genuine two-given-name pair), and fix the ambiguous-pair heuristic so a plain `First Surname` external person (e.g. `Mieze Schefold`) is no longer falsely flagged.
|
||||||
|
|
||||||
|
**Architecture:** A pure `classify_name(raw, given_names)` function in `persons.py` returns a `NameClass`. `ResolutionContext` classifies every *unmatched* name and records the non-`RESOLVABLE` ones in `self.unresolved`. A runtime-built given-name set (register first names + a small config supplement) lets the classifier distinguish a two-given-name pair (`Ella Anita` → two people) from a first+surname single person (`Mieze Schefold`). The orchestrator writes the aggregated report and per-category stats, replacing the noisy `ambiguous-receivers.csv`.
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.12, openpyxl, pytest — extends the existing `tools/import-normalizer/`.
|
||||||
|
|
||||||
|
**Context:** This builds on the completed normalizer (PR #663). Run all tests with CWD = the tool dir, e.g. `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_X.py -v`. Reuse the existing venv at `tools/import-normalizer/.venv` (do NOT recreate it). Commit on the current branch `docs/import-migration` (never main, never push). Each commit message ends with a trailing `Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>` line.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
tools/import-normalizer/
|
||||||
|
├── config.py # + RELATIONAL_TERMS, COLLECTIVE_TERMS, UNKNOWN_NAME_MARKERS, PROSE_MAX_LEN, EXTRA_GIVEN_NAMES
|
||||||
|
├── persons.py # + NameClass, classify_name(), build_given_names(); ResolutionContext gains given_names + self.unresolved
|
||||||
|
├── normalize.py # writes unresolved-names.csv (replaces ambiguous-receivers.csv) + per-category stats
|
||||||
|
├── README.md # + unresolved-names.csv row in the review-file table
|
||||||
|
└── tests/
|
||||||
|
├── test_config.py # + name-table presence test
|
||||||
|
├── test_persons.py # + classify_name + build_given_names tests
|
||||||
|
├── test_documents.py # ambiguous test → unresolved test (+ resolvable-pair test)
|
||||||
|
└── test_normalize.py # integration asserts unresolved-names.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Config — name-classification tables
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `tools/import-normalizer/config.py`
|
||||||
|
- Modify: `tools/import-normalizer/tests/test_config.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add the failing test** to `tests/test_config.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_name_classification_tables():
|
||||||
|
assert "tante" in config.RELATIONAL_TERMS
|
||||||
|
assert "familie" in config.COLLECTIVE_TERMS
|
||||||
|
assert "unbekannt" in config.UNKNOWN_NAME_MARKERS
|
||||||
|
assert config.PROSE_MAX_LEN >= 30
|
||||||
|
assert "anita" in config.EXTRA_GIVEN_NAMES
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify it fails**
|
||||||
|
|
||||||
|
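Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py::test_name_classification_tables -v)` — run in a subshell so the working directory is restored even though pytest exits non-zero here (a trailing `&& cd -` would be skipped on the expected failure).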
|
||||||
|
Expected: FAIL — `AttributeError: module 'config' has no attribute 'RELATIONAL_TERMS'`.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Implement** — append to `config.py` at module level (the position relative to the existing tables, e.g. `KNOWN_LAST_NAMES`, does not matter)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# --- Name classification (unresolved-name review) ---
|
||||||
|
# Relational reference terms — a sender/receiver named by relation, not a proper name.
|
||||||
|
RELATIONAL_TERMS = {
|
||||||
|
"tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",
|
||||||
|
"großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
|
||||||
|
"neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
|
||||||
|
"schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",
|
||||||
|
}
|
||||||
|
# Collective/group terms — not a single person. Matched against alpha-only word tokens
|
||||||
|
# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
|
||||||
|
COLLECTIVE_TERMS = {
|
||||||
|
"familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
|
||||||
|
"grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
|
||||||
|
}
|
||||||
|
# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
|
||||||
|
# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
|
||||||
|
# (it occurs inside real names: Hanni, Johanna, Anna).
|
||||||
|
UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}
|
||||||
|
# A name-column value longer than this (chars) is treated as prose/description, not a name.
|
||||||
|
PROSE_MAX_LEN = 40
|
||||||
|
# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
|
||||||
|
# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
|
||||||
|
EXTRA_GIVEN_NAMES = {
|
||||||
|
"ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
|
||||||
|
"margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run to verify it passes**
|
||||||
|
|
||||||
|
Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py -v && cd -`
|
||||||
|
Expected: PASS (all config tests).
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tools/import-normalizer/config.py tools/import-normalizer/tests/test_config.py
|
||||||
|
git commit -m "feat(normalizer): config tables for name classification"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: `classify_name` + `NameClass`
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `tools/import-normalizer/persons.py`
|
||||||
|
- Modify: `tools/import-normalizer/tests/test_persons.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add failing tests** to `tests/test_persons.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from persons import NameClass
|
||||||
|
|
||||||
|
GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
|
||||||
|
|
||||||
|
def test_classify_unknown():
|
||||||
|
assert persons.classify_name("?", GIVEN) is NameClass.UNKNOWN
|
||||||
|
assert persons.classify_name("A. Kredell?", GIVEN) is NameClass.UNKNOWN
|
||||||
|
assert persons.classify_name("unbekannt", GIVEN) is NameClass.UNKNOWN
|
||||||
|
|
||||||
|
def test_classify_prose():
|
||||||
|
assert persons.classify_name("Adressenliste v Clara Cram zur Kondolenz", GIVEN) is NameClass.PROSE
|
||||||
|
assert persons.classify_name("Clara de Gruyter(*1871)", GIVEN) is NameClass.PROSE # digit
|
||||||
|
assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE # quote
|
||||||
|
|
||||||
|
def test_classify_collective():
|
||||||
|
assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE
|
||||||
|
assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE
|
||||||
|
assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE
|
||||||
|
assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE
|
||||||
|
|
||||||
|
def test_classify_relational():
|
||||||
|
assert persons.classify_name("Cousine Emmy Haniel", GIVEN) is NameClass.RELATIONAL
|
||||||
|
assert persons.classify_name("Schwester Hanni", GIVEN) is NameClass.RELATIONAL
|
||||||
|
|
||||||
|
def test_classify_single_token():
|
||||||
|
assert persons.classify_name("Agnes", GIVEN) is NameClass.SINGLE_TOKEN
|
||||||
|
assert persons.classify_name("A.B.", GIVEN) is NameClass.SINGLE_TOKEN
|
||||||
|
|
||||||
|
def test_classify_ambiguous_pair():
|
||||||
|
assert persons.classify_name("Ella Anita", GIVEN) is NameClass.AMBIGUOUS_PAIR
|
||||||
|
assert persons.classify_name("Kurt Georg", GIVEN) is NameClass.AMBIGUOUS_PAIR
|
||||||
|
|
||||||
|
def test_classify_resolvable_single_person():
|
||||||
|
# first + surname (surname not a given name) -> one real person, NOT ambiguous
|
||||||
|
assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
|
||||||
|
assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify it fails**
|
||||||
|
|
||||||
|
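Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -k classify -v)` — run in a subshell so the working directory is restored even though pytest exits non-zero here (a trailing `&& cd -` would be skipped on the expected failure).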
|
||||||
|
Expected: FAIL — `NameClass` / `classify_name` not defined.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Implement** — add to `persons.py`. Add `from enum import StrEnum` to the imports if not present, then add:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class NameClass(StrEnum):
|
||||||
|
RESOLVABLE = "resolvable"
|
||||||
|
UNKNOWN = "unknown"
|
||||||
|
SINGLE_TOKEN = "single_token"
|
||||||
|
RELATIONAL = "relational"
|
||||||
|
COLLECTIVE = "collective"
|
||||||
|
PROSE = "prose"
|
||||||
|
AMBIGUOUS_PAIR = "ambiguous_pair"
|
||||||
|
|
||||||
|
|
||||||
|
_QUOTE_CHARS = "\"'“”„‚‘’"
|
||||||
|
|
||||||
|
|
||||||
|
def classify_name(raw: str, given_names: set[str]) -> NameClass:
|
||||||
|
"""Classify a (post-split) sender/receiver string by why it may be unresolvable.
|
||||||
|
|
||||||
|
Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL ->
|
||||||
|
SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE.
|
||||||
|
"""
|
||||||
|
s = raw.strip()
|
||||||
|
if not s:
|
||||||
|
return NameClass.RESOLVABLE
|
||||||
|
low = s.lower()
|
||||||
|
tokens = s.split()
|
||||||
|
# alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms
|
||||||
|
# are matched as whole words (no substring/prefix false positives like "Allerton").
|
||||||
|
alpha_words = re.findall(r"[a-zäöüß]+", low)
|
||||||
|
if "?" in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS):
|
||||||
|
return NameClass.UNKNOWN
|
||||||
|
if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s)
|
||||||
|
or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3):
|
||||||
|
return NameClass.PROSE
|
||||||
|
if any(w in config.COLLECTIVE_TERMS for w in alpha_words):
|
||||||
|
return NameClass.COLLECTIVE
|
||||||
|
if any(w in config.RELATIONAL_TERMS for w in alpha_words):
|
||||||
|
return NameClass.RELATIONAL
|
||||||
|
if len(tokens) == 1:
|
||||||
|
return NameClass.SINGLE_TOKEN
|
||||||
|
if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens):
|
||||||
|
return NameClass.AMBIGUOUS_PAIR
|
||||||
|
return NameClass.RESOLVABLE
|
||||||
|
|
||||||
|
|
||||||
|
# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is
|
||||||
|
# classified PROSE. Such multi-particle names are rare here and usually resolve via the
|
||||||
|
# register; if they surface in review, lower-priority than the real prose entries.
|
||||||
|
```
|
||||||
|
|
||||||
|
> Note: `_norm` already exists in `persons.py` (added in the alias-index task) and strips accents + lowercases. `classify_name` uses it so given-name matching is accent-insensitive.
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run to verify it passes**
|
||||||
|
|
||||||
|
Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -`
|
||||||
|
Expected: PASS (all persons tests, including the 7 new classify tests).
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py
|
||||||
|
git commit -m "feat(normalizer): classify_name + NameClass"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: `build_given_names`
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `tools/import-normalizer/persons.py`
|
||||||
|
- Modify: `tools/import-normalizer/tests/test_persons.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add failing test** to `tests/test_persons.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_build_given_names():
|
||||||
|
people = persons.parse_register([
|
||||||
|
{"last_name": "de Gruyter", "first_name": "Eugenie"},
|
||||||
|
{"last_name": "Cram", "first_name": "Charlotte,Meta"}, # comma -> primary + extra given
|
||||||
|
])
|
||||||
|
g = persons.build_given_names(people, {"Anita"})
|
||||||
|
assert "eugenie" in g
|
||||||
|
assert "charlotte" in g and "meta" in g # primary + extra given names
|
||||||
|
assert "anita" in g # from the extra set, normalized
|
||||||
|
assert "schefold" not in g
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify it fails**
|
||||||
|
|
||||||
|
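Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py::test_build_given_names -v)` — run in a subshell so the working directory is restored even though pytest exits non-zero here (a trailing `&& cd -` would be skipped on the expected failure).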
|
||||||
|
Expected: FAIL — `build_given_names` not defined.
|
||||||
|
|
||||||
|
- [ ] **Step 3: Implement** — add to `persons.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
def build_given_names(register: list[Person], extra: set[str]) -> set[str]:
|
||||||
|
"""Set of normalized given names from the register (first + extra given) plus a supplement.
|
||||||
|
|
||||||
|
Used by classify_name to tell a two-given-name pair (two people) from a first+surname.
|
||||||
|
"""
|
||||||
|
names: set[str] = set()
|
||||||
|
for p in register:
|
||||||
|
if p.first_name:
|
||||||
|
names.add(_norm(p.first_name))
|
||||||
|
for g in p.extra_given_names:
|
||||||
|
names.add(_norm(g))
|
||||||
|
for e in extra:
|
||||||
|
names.add(_norm(e))
|
||||||
|
return names
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run to verify it passes**
|
||||||
|
|
||||||
|
Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py
|
||||||
|
git commit -m "feat(normalizer): build_given_names from register + supplement"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: Integrate — ResolutionContext records unresolved; orchestrator writes the report
|
||||||
|
|
||||||
|
This task touches `persons.py`, `normalize.py`, and two test files together so the whole suite stays green in one commit (removing `ctx.ambiguous` requires updating its only consumer, `normalize.py`, in the same change).
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `tools/import-normalizer/persons.py` (ResolutionContext)
|
||||||
|
- Modify: `tools/import-normalizer/normalize.py`
|
||||||
|
- Modify: `tools/import-normalizer/tests/test_documents.py`
|
||||||
|
- Modify: `tools/import-normalizer/tests/test_normalize.py`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Update the failing tests first**
|
||||||
|
|
||||||
|
In `tests/test_documents.py`, **replace** the existing `test_ambiguous_space_pair_flagged_not_split` function entirely with these two functions:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_ambiguous_pair_recorded_in_unresolved():
|
||||||
|
people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}])
|
||||||
|
ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={},
|
||||||
|
given_names={"ella", "anita"})
|
||||||
|
raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")
|
||||||
|
doc = documents.to_canonical(raw, ctx, date_overrides={})
|
||||||
|
assert len(doc.receiver_person_ids) == 1 # not split — one provisional
|
||||||
|
assert any(name == "Ella Anita" and cat == "ambiguous_pair" for name, cat, _ in ctx.unresolved)
|
||||||
|
|
||||||
|
def test_resolvable_first_surname_pair_not_unresolved():
|
||||||
|
ctx = persons.ResolutionContext(persons.AliasIndex([]), name_overrides={},
|
||||||
|
given_names={"ella", "anita"})
|
||||||
|
ctx.resolve_one("Mieze Schefold", source_row=1) # surname is not a given name
|
||||||
|
assert ctx.unresolved == [] # RESOLVABLE -> not recorded
|
||||||
|
```
|
||||||
|
|
||||||
|
In `tests/test_normalize.py`, in the `_doc_wb` fixture, change the `C-0001` row's receiver from empty to `"?"` so the run produces an unresolved entry. Find the line that appends the `C-0001` row and set its `EmpfängerIn` cell to `"?"`. For example the row currently reads:
|
||||||
|
|
||||||
|
```python
|
||||||
|
ws.append(["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""])
|
||||||
|
```
|
||||||
|
|
||||||
|
change the 6th cell (EmpfängerIn) from `""` to `"?"`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
ws.append(["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""])
|
||||||
|
```
|
||||||
|
|
||||||
|
Then add these assertions inside `test_run_end_to_end`, right after the existing `assert (review_dir / "unparsed-dates.csv").exists()` line:
|
||||||
|
|
||||||
|
```python
|
||||||
|
assert (out_dir / "canonical-documents.xlsx").exists() # (keep existing asserts above)
|
||||||
|
assert (review_dir / "unresolved-names.csv").exists()
|
||||||
|
unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8")
|
||||||
|
assert "unknown" in unresolved_text and "?" in unresolved_text # the "?" receiver
|
||||||
|
assert not (review_dir / "ambiguous-receivers.csv").exists() # replaced
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify they fail**
|
||||||
|
|
||||||
|
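Run: `(cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py tests/test_normalize.py -v)` — run in a subshell so the working directory is restored even though pytest exits non-zero here (a trailing `&& cd -` would be skipped on the expected failure).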
|
||||||
|
Expected: FAIL — `ResolutionContext` has no `given_names`/`unresolved`; `unresolved-names.csv` not written.
|
||||||
|
|
||||||
|
- [ ] **Step 3a: Implement — `ResolutionContext` in `persons.py`**
|
||||||
|
|
||||||
|
In `ResolutionContext.__init__`, add the `given_names` parameter and replace the `self.ambiguous` attribute with `self.unresolved`; then update the methods shown below. The new `__init__`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str],
|
||||||
|
given_names: set[str] | None = None):
|
||||||
|
self.index = alias_index
|
||||||
|
self.name_overrides = name_overrides
|
||||||
|
self.given_names = given_names or set()
|
||||||
|
self.provisional: dict[str, Person] = {}
|
||||||
|
self.unmatched: dict[str, list] = {}
|
||||||
|
self.unresolved: list[tuple] = [] # (raw_name, category, source_row) for non-RESOLVABLE names
|
||||||
|
self._raw_to_pid: dict[str, str] = {}
|
||||||
|
self.override_hits = 0
|
||||||
|
```
|
||||||
|
|
||||||
|
In `resolve_one`, the provisional branch must classify the name. Replace this existing block:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# provisional person (unmatched) — never reuse a register id
|
||||||
|
self.unmatched.setdefault(name, []).append(source_row)
|
||||||
|
if name in self._raw_to_pid:
|
||||||
|
return self._raw_to_pid[name], name, False
|
||||||
|
```
|
||||||
|
|
||||||
|
with:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# provisional person (unmatched) — never reuse a register id
|
||||||
|
self.unmatched.setdefault(name, []).append(source_row)
|
||||||
|
category = classify_name(name, self.given_names)
|
||||||
|
if category is not NameClass.RESOLVABLE:
|
||||||
|
self.unresolved.append((name, str(category), source_row))
|
||||||
|
if name in self._raw_to_pid:
|
||||||
|
return self._raw_to_pid[name], name, False
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace the entire `resolve_receivers` method (the ambiguous detection now lives in `resolve_one` via `classify_name`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
def resolve_receivers(self, raw: str, source_row: int):
|
||||||
|
return [self.resolve_one(part, source_row) for part in split_receivers(raw)]
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 3b: Implement — `normalize.py`**
|
||||||
|
|
||||||
|
Find the line that builds the context:
|
||||||
|
|
||||||
|
```python
|
||||||
|
ctx = persons.ResolutionContext(alias_index, name_overrides)
|
||||||
|
```
|
||||||
|
|
||||||
|
replace it with (build the given-name set from the register + config supplement):
|
||||||
|
|
||||||
|
```python
|
||||||
|
given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES)
|
||||||
|
ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names)
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace the `ambiguous-receivers.csv` write line:
|
||||||
|
|
||||||
|
```python
|
||||||
|
writers.write_review_csv(review_dir / "ambiguous-receivers.csv", ["raw", "part", "source_row"], ctx.ambiguous)
|
||||||
|
```
|
||||||
|
|
||||||
|
with an aggregated unresolved-names report:
|
||||||
|
|
||||||
|
```python
|
||||||
|
unresolved_agg: dict[tuple, list] = {}
|
||||||
|
for name, category, row in ctx.unresolved:
|
||||||
|
unresolved_agg.setdefault((category, name), []).append(row)
|
||||||
|
unresolved_rows = sorted(
|
||||||
|
([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))]
|
||||||
|
for (cat, name), rows in unresolved_agg.items()),
|
||||||
|
key=lambda r: (r[0], -r[2], r[1]))
|
||||||
|
writers.write_review_csv(review_dir / "unresolved-names.csv",
|
||||||
|
["category", "raw", "count", "example_rows"], unresolved_rows)
|
||||||
|
```
|
||||||
|
|
||||||
|
In the `stats` dict, replace the `"ambiguous_receivers"` line:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"ambiguous_receivers": len(ctx.ambiguous),
|
||||||
|
```
|
||||||
|
|
||||||
|
with a per-category breakdown:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"unresolved_name_occurrences": len(ctx.unresolved),
|
||||||
|
"unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"),
|
||||||
|
"unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"),
|
||||||
|
"unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"),
|
||||||
|
"unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"),
|
||||||
|
"unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"),
|
||||||
|
"unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"),
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run the whole suite to verify green**
|
||||||
|
|
||||||
|
Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/ -q && cd -`
|
||||||
|
Expected: PASS (all tests, no `ambiguous` references remain).
|
||||||
|
|
||||||
|
Also grep to confirm no dangling references:
|
||||||
|
Run: `grep -rn "ctx.ambiguous\|ambiguous-receivers\|ambiguous_receivers\|self.ambiguous" tools/import-normalizer/*.py`
|
||||||
|
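Expected: no matches. (The `*.py` glob only covers the top-level modules; `tests/test_normalize.py` still mentions `ambiguous-receivers.csv` on purpose, in its negative assertion that the file is no longer written.)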
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tools/import-normalizer/persons.py tools/import-normalizer/normalize.py tools/import-normalizer/tests/test_documents.py tools/import-normalizer/tests/test_normalize.py
|
||||||
|
git commit -m "feat(normalizer): unresolved-names report + fix ambiguous-pair over-flagging"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: README — document the new report
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `tools/import-normalizer/README.md`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Update the review-file table** in `README.md`. Replace the `ambiguous-receivers.csv` row with an `unresolved-names.csv` row. Find the table row referencing `ambiguous-receivers.csv` and replace it with:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv`. |
|
||||||
|
```
|
||||||
|
|
||||||
|
If the README has no such row (older version), add the row above to the review-file table.
|
||||||
|
|
||||||
|
- [ ] **Step 2: Add a note** to the iteration-loop section of `README.md` (after the table):
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
> `unresolved-names.csv` is the focused "names that need a human" list — distinct from
|
||||||
|
> `unmatched-names.csv` (which is just non-family correspondents that got provisional persons).
|
||||||
|
> The given-name set that drives `ambiguous_pair` detection is the register's first names plus
|
||||||
|
> `config.EXTRA_GIVEN_NAMES` — add names there if a real two-person cell isn't being flagged.
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 3: Verify the suite is still green** (README-only change, but confirm nothing references the old file)
|
||||||
|
|
||||||
|
Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/ -q && cd -`
|
||||||
|
Expected: PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 4: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tools/import-normalizer/README.md
|
||||||
|
git commit -m "docs(normalizer): document unresolved-names.csv review report"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Self-Review
|
||||||
|
|
||||||
|
**Spec coverage** (against the agreed proposal):
|
||||||
|
- Focused report isolating problem name classes → Task 4 writes `review/unresolved-names.csv` with a `category` column; categories defined in Task 2 `classify_name`. ✓
|
||||||
|
- Fix ambiguous over-flagging of `First Surname` → Task 2 `AMBIGUOUS_PAIR` requires *both* tokens in the given-name set; `Mieze Schefold` → `RESOLVABLE` (tested). ✓
|
||||||
|
- Distinguish "not fully known" (unknown/single-token/relational/collective/prose) from "can't split cleanly" (ambiguous_pair) → all are `NameClass` values, and each appears as its own value in the report's `category` column. ✓
|
||||||
|
- Per-category counts in summary → Task 4 stats. ✓
|
||||||
|
- Senders covered too (not just receivers) → classification happens in `resolve_one`, which both `resolve_sender` and `resolve_receivers` call. ✓
|
||||||
|
|
||||||
|
**Placeholder scan:** No TBD/TODO; every code step has complete code. The README replacement gives the exact row text.
|
||||||
|
|
||||||
|
**Type consistency:** `NameClass` (StrEnum) defined Task 2; `classify_name(raw, given_names)` and `build_given_names(register, extra)` signatures used consistently in Task 4; `ResolutionContext(alias_index, name_overrides, given_names=…)` matches the new `__init__`; `self.unresolved` is `list[tuple]` of `(raw, category, source_row)` and read with that shape in both the report and the stats. `str(category)` yields the StrEnum value (e.g. `"ambiguous_pair"`), matching the stat comparisons and the test assertions.
|
||||||
|
|
||||||
|
**Cross-task green:** Task 4 deliberately bundles the `persons.py` + `normalize.py` + test changes into one commit because removing `ctx.ambiguous` breaks its consumer otherwise — no red commit is left behind (lesson from the prior build).
|
||||||
|
|
||||||
|
**Out of scope (future):** Spanish month names + `Mon DD-YYYY` date form (separate date-parser enhancement); promoting `unresolved` rows into a document-level `needs_review` flag; auto-splitting confirmed `ambiguous_pair` entries via overrides.
|
||||||
62
docs/import-migration/README.md
Normal file
62
docs/import-migration/README.md
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# Import Migration — Working Folder
|
||||||
|
|
||||||
|
This folder tracks the iterative work of mass-importing the **real, raw family archive**
|
||||||
|
spreadsheets (≈7,600 letter rows + ~7,000 PDFs that arrive later) into Familienarchiv.
|
||||||
|
|
||||||
|
It is intentionally **local docs, not Gitea issues**. We only open a Gitea issue when a
|
||||||
|
finding requires a *software* change (e.g. a new date parser). Pure data observations and
|
||||||
|
the running plan live here so any agent can pick the work up cold.
|
||||||
|
|
||||||
|
## Source files (in `/import`)
|
||||||
|
|
||||||
|
| File | What it is | Importer support today |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx` | The **real raw archive** — 7,943 rows, sheet `Familienarchiv`. Human-readable, dates as written in the letters. | ❌ layout does **not** match importer defaults |
|
||||||
|
| `Personendatei 2.xlsx` | Genealogical **person register** — 163 people, sheet `Tabelle1` (maiden names, birth/death, marriages, relationships). | ❌ no importer at all |
|
||||||
|
| `zzfamilienarchiv Walter und Eugenie 2025-04-10.ods` | A small, **already-normalized** subset (Walter & Eugenie brautbriefe). 14 clean columns incl. ISO dates. | ✅ this is what `MassImportService` was built for |
|
||||||
|
|
||||||
|
The PDFs (~7,000) will follow later. The importer matches files by the **Index** column
|
||||||
|
(e.g. `W-0001` → `W-0001.pdf`), and already imports metadata-only when a file is missing —
|
||||||
|
so we can import all metadata now and the PDFs will attach on a re-run.
|
||||||
|
|
||||||
|
## How to inspect the spreadsheets
|
||||||
|
|
||||||
|
`openpyxl` is installed in the OCR service venv:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
/home/marcel/Desktop/familienarchiv/ocr-service/.venv/bin/python3 -c "import openpyxl; print(openpyxl.__version__)"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Documents in this folder
|
||||||
|
|
||||||
|
- [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) — full analysis of every data-quality / importer issue found (2026-05-25). Each issue has an ID `IMP-NN`.
|
||||||
|
- [`02-normalization-spec.md`](./02-normalization-spec.md) — requirements spec for the offline **import normalizer** (the agreed strategy: normalize the raw sheets into a clean canonical dataset before import). Requirements `FR-*`/`NFR-*`, traceable to the `IMP-NN` findings.
|
||||||
|
- `WORKLOG.md` — running log of what each session did and what's next. **Start here when resuming.**
|
||||||
|
|
||||||
|
## Strategy (decided 2026-05-25)
|
||||||
|
|
||||||
|
Normalize **before** import. A standalone Python tool (`tools/import-normalizer/`, built in
|
||||||
|
session 4 — see `WORKLOG.md`) transforms the raw xlsx + person register into a clean canonical dataset
|
||||||
|
(`canonical-documents.xlsx`, `canonical-persons.xlsx`) plus review CSVs. Residual cases
|
||||||
|
(unparseable dates, unmatched names) are fixed via a version-controlled overrides file and
|
||||||
|
re-run. The Java importer is adjusted to consume the canonical contract in a later **Phase 2**.
|
||||||
|
See the spec for the full contract.
|
||||||
|
|
||||||
|
## Status board
|
||||||
|
|
||||||
|
| ID | Issue | Severity | Status |
|
||||||
|
| --- | --- | --- | --- |
|
||||||
|
| IMP-01 | New xlsx column layout ≠ importer defaults | 🔴 blocker | open |
|
||||||
|
| IMP-02 | 90% of dates are free-text the parser can't read | 🔴 blocker | open |
|
||||||
|
| IMP-03 | No ISO/normalized date column in the new xlsx | 🔴 blocker | open |
|
||||||
|
| IMP-04 | Person register (`Personendatei 2.xlsx`) not imported | 🟠 major | open |
|
||||||
|
| IMP-05 | Name variations = duplicate Persons (maiden vs married) | 🟠 major | open |
|
||||||
|
| IMP-06 | 93 data rows with blank Index are silently dropped | 🟠 major | open |
|
||||||
|
| IMP-07 | 43 duplicate Index values | 🟡 minor | open |
|
||||||
|
| IMP-08 | Section/title rows interleaved in data | 🟡 minor | open |
|
||||||
|
| IMP-09 | Index↔Datei filename mismatches | 🟡 minor | open |
|
||||||
|
| IMP-10 | `x`-suffix rows (letter backsides/enclosures) | 🟡 minor | open |
|
||||||
|
| IMP-11 | Multi-receiver separators incl. bare `u`/`u.` | 🟡 minor | open |
|
||||||
|
| IMP-12 | Importer reads only the first sheet, no validation | 🟡 minor | open |
|
||||||
|
|
||||||
|
See the findings doc for detail and proposed approach per issue.
|
||||||
147
docs/import-migration/WORKLOG.md
Normal file
147
docs/import-migration/WORKLOG.md
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
# Import Migration — Worklog
|
||||||
|
|
||||||
|
Running log of each working session. **Resume here.** Newest entry on top.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-05-25 (session 5) — Unresolved-name classification
|
||||||
|
|
||||||
|
**Did:** Implemented [`04-unresolved-names-plan.md`](./04-unresolved-names-plan.md) subagent-driven
|
||||||
|
(5 tasks, TDD, per-task spec + code-quality review; 67 tests pass). Added `classify_name` +
|
||||||
|
`NameClass` + `build_given_names` in `persons.py`; `ResolutionContext` now records non-RESOLVABLE
|
||||||
|
names in `self.unresolved`; orchestrator writes `review/unresolved-names.csv` (replaces the noisy
|
||||||
|
`ambiguous-receivers.csv`) with per-category stats.
|
||||||
|
|
||||||
|
**Why:** `unmatched-names.csv` mixes boring non-family correspondents (expected) with genuinely
|
||||||
|
unresolvable entries. The new report isolates the latter so review focuses on ~440 real cases.
|
||||||
|
|
||||||
|
**Real-run result:** unresolved-names.csv = single_token 191 / prose 103 / unknown 74 /
|
||||||
|
collective 46 / relational 21 / ambiguous_pair **5** (distinct). The ambiguous over-flagging fix
|
||||||
|
cut `ambiguous_pair` from 303 → 5 (genuine two-given-name pairs only; `Mieze Schefold` etc. now
|
||||||
|
correctly RESOLVABLE). given-name set = register first names ∪ `config.EXTRA_GIVEN_NAMES`.
|
||||||
|
|
||||||
|
**Next:** populate `overrides/names.csv` from unresolved-names.csv (highest-count first); extend
|
||||||
|
`EXTRA_GIVEN_NAMES` if a real pair isn't flagged; still-open date work (Spanish months, 58–72 band).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-05-25 (session 4) — Built the normalizer (subagent-driven, all 17 tasks)
|
||||||
|
|
||||||
|
**Did:** Executed the plan subagent-driven (implementer + spec review + code-quality review per
|
||||||
|
task). The tool `tools/import-normalizer/` is **complete and passing (57 tests)**. Final
|
||||||
|
opus review: **READY** — determinism verified on the real corpus (two runs → identical cell
|
||||||
|
matrices + byte-identical review files), zero silent drops.
|
||||||
|
|
||||||
|
**Per-task code review caught & fixed real issues** (all in the committed code): leading
|
||||||
|
qualifiers `nach/vor/…` now → APPROX; English month-first matcher hardened to structurally
|
||||||
|
not shadow `Mai 1895`; person-id collision de-dup suffixes *all* members; `split_receivers`
|
||||||
|
returns `[]` for a `geb.`-only cell; boolean cells no longer coerced to `1/0`; duplicate-index
|
||||||
|
flags every occurrence; provisional ids never steal a register id; CSV-injection defanged.
|
||||||
|
|
||||||
|
**REAL DRY-RUN** (`python normalize.py` over the actual archive — outputs are gitignored):
|
||||||
|
- documents_emitted **7,582** (+225 empty +93 blank-index +42 x-suffix = 7,942 rows read, 0 dropped)
|
||||||
|
- register_persons **163**, provisional_persons **942**
|
||||||
|
- dates: DAY 6,509 / MONTH 36 / RANGE 36 / APPROX 28 / YEAR 17 / SEASON 1 / UNKNOWN 955
|
||||||
|
- **unknown_date_rate 9.2%** (of dated rows; target ≤5% pre-override, ≤0.5% after overrides)
|
||||||
|
- duplicate_index 85, index_file_mismatches 550, ambiguous_receivers 303
|
||||||
|
|
||||||
|
**⚠️ Concurrency incident:** a parallel Claude session committed reader-dashboard work to this
|
||||||
|
branch and hard-reset it mid-execution, deleting the Task 15 files and orphaning a commit.
|
||||||
|
Recovered via reflog (`reset --hard 366b4848` + `checkout 401160e3 -- <task15 files>`); no code
|
||||||
|
lost. Casualty: my *during-execution* edits to the plan/spec docs (02/03) for Tasks 5–14 were
|
||||||
|
discarded — **the committed code + tests are the source of truth**, not the plan doc, which now
|
||||||
|
reflects the pre-execution + persona-review version.
|
||||||
|
|
||||||
|
**Next steps (iterative refinement — the overrides loop, as designed):**
|
||||||
|
1. Shave the 9.2% UNKNOWN cheaply: add **Spanish month names** (Enero…Diciembre) and the
|
||||||
|
`Mon DD-YYYY` dash form to `config.MONTHS`/the parser (Mexican-branch correspondence);
|
||||||
|
revisit the 58–72 two-digit-year band (real `…58/59/60` dates = 1958–1960, just past the
|
||||||
|
1873–1957 window — decide whether to extend the upper bound in `config`).
|
||||||
|
2. `?` (99×) is genuinely "date unknown" — leave UNKNOWN or add a convention.
|
||||||
|
3. Populate `overrides/dates.csv` + `overrides/names.csv` from the review CSVs and re-run.
|
||||||
|
4. README note: a leading `'`/`!` in a `review/*.csv` `raw` cell may be a CSV-defang artifact —
|
||||||
|
match against the true source value when writing overrides.
|
||||||
|
5. Phase 2 (separate spec): wire the canonical contract into the Java `MassImportService`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-05-25 (session 3) — Implementation plan + persona review
|
||||||
|
|
||||||
|
**Did:**
|
||||||
|
- Wrote [`03-normalizer-implementation-plan.md`](./03-normalizer-implementation-plan.md): 17
|
||||||
|
bite-sized TDD tasks for `tools/import-normalizer/` (Python, openpyxl), bottom-up — date
|
||||||
|
parser w/ Easter computus first, then persons/alias, ingest, mapping, orchestrator, writers.
|
||||||
|
- Ran an inline multi-persona review (architect, developer, tester, req-engineer, security, devops;
|
||||||
|
ui-expert too) via parallel agents. Acted on all material findings.
|
||||||
|
|
||||||
|
**Key fixes from review (see plan §"Review feedback incorporated"):**
|
||||||
|
- Idempotency redefined byte-identical → **content-deterministic** (spec G4/NFR-IDEM-01);
|
||||||
|
pinned workbook timestamps + deterministic alias ordering + a real two-run equality test.
|
||||||
|
- Real bug: duplicate-index only reported repeats → now flags/reports every occurrence.
|
||||||
|
- Provisional `person_id` could overwrite a register id → now suffixed.
|
||||||
|
- Date parser gaps: invalid-calendar-date → UNKNOWN, intra-month day-range (`7./8. Sept.1923`).
|
||||||
|
- Multi-person sender now split + flagged (REQ-PERS-01); CSV-injection defanged in review files;
|
||||||
|
pinned deps + hardened root `.gitignore`.
|
||||||
|
|
||||||
|
**Next:**
|
||||||
|
- Marcel reviews the plan. Then execute it (subagent-driven or inline) — the date parser
|
||||||
|
(Task 3/8 + Easter computus) is the meatiest piece.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-05-25 (session 2) — Strategy + normalizer spec
|
||||||
|
|
||||||
|
**Did:**
|
||||||
|
- Decided strategy with Marcel: **normalize the raw sheets first**, then import (higher
|
||||||
|
leverage than making the Java importer tolerate every mess).
|
||||||
|
- Locked design decisions (see spec §3): new canonical layout; dates = parsed + raw +
|
||||||
|
precision; include person register + dedup in this effort; overrides-file + re-run loop;
|
||||||
|
Python tool at `tools/import-normalizer/`.
|
||||||
|
- Century rule fixed by Marcel: archive spans **1873–1957**; 2-digit `00–57`→19YY,
|
||||||
|
`73–99`→18YY, `58–72`→flag; 3-digit→1DDD; never 20xx.
|
||||||
|
- Wrote [`02-normalization-spec.md`](./02-normalization-spec.md) in the requirements-engineer
|
||||||
|
persona (FR/NFR, Given-When-Then ACs, traceability to IMP-NN, TBD register).
|
||||||
|
|
||||||
|
**All 6 open questions resolved (spec §9):** OQ-01 — movable feasts (Ostern, Pfingsten, …)
|
||||||
|
**computed per year from Easter**, never a fixed month; seasons → mid-season month
|
||||||
|
(Sommer=Jul, Herbst=Oct). OQ-02 ranges → start+RANGE. OQ-03 slug ids. OQ-04 — `x`-suffix rows
|
||||||
|
**skipped + logged** this pass (they're transcriptions of the base letter, not yet mappable).
|
||||||
|
OQ-05 → `.xlsx`. OQ-06 → conservative, no silent merge.
|
||||||
|
|
||||||
|
**Git:** moved off the unrelated `feat/issue-356-…` branch; pulled `main`; created clean
|
||||||
|
branch **`docs/import-migration`** and committed these docs there. (The dirty `.venv`
|
||||||
|
pycache + `skills/implement/SKILL.md` in the tree are pre-existing/environmental noise — left
|
||||||
|
uncommitted, not ours.)
|
||||||
|
|
||||||
|
**Next:**
|
||||||
|
- Marcel reviews the spec.
|
||||||
|
- Then writing-plans → build the normalizer at `tools/import-normalizer/` (backlog B1–B7 are
|
||||||
|
the Musts; B3 date parser incl. Easter computus is the big one).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2026-05-25 (session 1) — Initial analysis
|
||||||
|
|
||||||
|
**Did:**
|
||||||
|
- Got the real raw archive xlsx (7,943 rows) + person register (163 people). PDFs to follow.
|
||||||
|
- Compared the new xlsx layout against `MassImportService` defaults and the old ODS.
|
||||||
|
- Full statistical scan of all rows: dates, indices, senders/receivers, file column.
|
||||||
|
- Wrote [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md)
|
||||||
|
with 12 issues (IMP-01..IMP-12) + recommended sequencing.
|
||||||
|
- Installed `openpyxl` into the OCR service venv for inspection.
|
||||||
|
|
||||||
|
**Key facts established:**
|
||||||
|
- Importer defaults match the **ODS**, not the new xlsx → wrong column mapping (IMP-01).
|
||||||
|
- **90%** of dated rows (6,571 / 7,319) are free-text dates the ISO-only parser drops (IMP-02).
|
||||||
|
- Person register is rich but **unimported**; holds the maiden-name dedup key (IMP-04/05).
|
||||||
|
|
||||||
|
**Decisions pending from Marcel (blockers for any code work):**
|
||||||
|
1. IMP-01: positional re-config of `app.import.col.*` vs header-driven mapping rewrite?
|
||||||
|
2. IMP-02: how to store imprecise dates — new `dateOriginal` + `precision` columns, or lossy?
|
||||||
|
3. IMP-04/05: format for the person/alias mapping; import persons before documents?
|
||||||
|
4. IMP-10: are `x`-suffix rows separate documents, attachments, or skipped?
|
||||||
|
|
||||||
|
**Next:**
|
||||||
|
- Get Marcel's calls on the 4 decisions above.
|
||||||
|
- Then split the code-change items into Gitea issues (IMP-01b, IMP-02, IMP-04, IMP-06, IMP-12).
|
||||||
|
- Pure-data tasks (IMP-07 dup list, IMP-09 file reconcile) stay here.
|
||||||
1329
docs/superpowers/plans/2026-05-25-personendatei-importer.md
Normal file
1329
docs/superpowers/plans/2026-05-25-personendatei-importer.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,292 @@
|
|||||||
|
# Personendatei Importer — Design Spec
|
||||||
|
|
||||||
|
**Date:** 2026-05-25
|
||||||
|
**Source file:** `import/Personendatei 2.xlsx`
|
||||||
|
**Output:** `tools/import-normalizer/out/canonical-persons-tree.json`
|
||||||
|
**Tool location:** `tools/import-normalizer/persons_tree.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Purpose
|
||||||
|
|
||||||
|
Normalize the 163-person family register in `Personendatei 2.xlsx` into a machine-readable JSON file that a future backend importer can consume to seed the `persons` and `person_relationships` tables. The tool is offline (no backend required) and produces a reviewable artifact with an explicit `unresolved[]` list for manual follow-up.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Source Data — Column Map
|
||||||
|
|
||||||
|
Sheet: `Tabelle1` (rows 2–164; row 1 is the header).
|
||||||
|
|
||||||
|
| Col | Header | Content | Notes |
|
||||||
|
|-----|--------|---------|-------|
|
||||||
|
| A | Generation | `G 0`–`G 5` | Generation relative to Herbert & Clara Cram (G 2). Inconsistent formatting: `"G3"`, `"G 0"`, `"G 2 de Gruyter"` — extract the first digit sequence and parse it as an integer (see §5). |
|
||||||
|
| B | Familienname | Last name | Sometimes compound: `"de Gruyter"`, `"Cram Heydrich"`, `"Burkhard- Meier"` |
|
||||||
|
| C | Vorname | First name | Sometimes multiple: `"Charlotte,Meta,Jacobi"`, nicknames in parens: `"Otto (Herbert)"` |
|
||||||
|
| D | geb als | Maiden name | Used as a name alias for matching |
|
||||||
|
| E | Geburtsdatum | Birth date | **Mixed types** — see §4 |
|
||||||
|
| F | Geburtsort | Birth place | Free-text string, stored verbatim |
|
||||||
|
| G | Todesdatum | Death date | Same mixed types as col E |
|
||||||
|
| H | Sterbeort | Death place | Free-text string, stored verbatim |
|
||||||
|
| I | verheiratet mit | Spouse name | Partial name in either `"Firstname Lastname"` or `"Lastname Firstname"` order |
|
||||||
|
| J | Bemerkung | German relationship notes | `"Sohn v Clara u Herbert"`, `"Nichte v Herbert"`, free text |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Two-Pass Architecture
|
||||||
|
|
||||||
|
### Pass 1 — Parse & Normalize (rows → person records)
|
||||||
|
|
||||||
|
For each row:
|
||||||
|
1. Read all 10 columns.
|
||||||
|
2. Assign a stable `rowId`: `"row_{i:03d}"` where `i` is the 1-based row number (e.g. `row_002`).
|
||||||
|
3. Normalize fields per §4 and §5.
|
||||||
|
4. Build the **name-lookup index** (see §6).
|
||||||
|
5. Emit a person record.
|
||||||
|
|
||||||
|
### Pass 2 — Resolve Relationships
|
||||||
|
|
||||||
|
Walk every person record:
|
||||||
|
1. Resolve col I (spouse) → emit `SPOUSE_OF` edge or `unresolved` entry.
|
||||||
|
2. Parse col J (Bemerkung) for parent/child patterns → emit `PARENT_OF` edges or `unresolved` entries.
|
||||||
|
3. Append unmatched Bemerkung text to `person.notes`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Date Parsing
|
||||||
|
|
||||||
|
Both col E (birth) and col G (death) arrive as either an Excel numeric serial or a string.
|
||||||
|
|
||||||
|
### Excel serial conversion
|
||||||
|
When the cell value is numeric (an `int` or `float` rather than a string), treat it as an Excel date serial:
|
||||||
|
```
|
||||||
|
date = datetime(1899, 12, 30) + timedelta(days=int(value))
|
||||||
|
year = date.year
|
||||||
|
```
|
||||||
|
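Excel's effective epoch is 1899-12-30: the offset absorbs the phantom 1900-02-29 introduced by the Lotus 1-2-3 leap-year bug, so the conversion is exact for serials ≥ 61 (1900-03-01 onward).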
|
||||||
|
|
||||||
|
### String fallback — reuse existing `dates.parse_date()`
|
||||||
|
Pass the raw string to the existing `tools/import-normalizer/dates.parse_date()`. It already handles:
|
||||||
|
- `DD.MM.YYYY` and `D.M.YY`
|
||||||
|
- Year-only (`1930`)
|
||||||
|
- Month + year (`August 1941`, `Sept. 1913`)
|
||||||
|
- Partial/approximate markers
|
||||||
|
|
||||||
|
Extract `.year` from the returned `ParsedDate.iso` if `iso` is not `None`.
|
||||||
|
|
||||||
|
### Unresolvable dates
|
||||||
|
If the applicable path yields no year (e.g. `"2.9.196"`, `"4.3.1023"`, `".12.1955"`):
|
||||||
|
- Set `birthYear`/`deathYear` to `null`.
|
||||||
|
- Append the raw value to `person.notes` as `"[Geburtsdatum: <raw>]"` or `"[Todesdatum: <raw>]"` for human review.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Person Record Normalization
|
||||||
|
|
||||||
|
### Name fields
|
||||||
|
- **lastName** = col B, stripped.
|
||||||
|
- **firstName** = col C. Keep as-is (including multi-name strings and parenthetical nicknames) — the backend can split later.
|
||||||
|
- **maidenName** = col D, stripped. Stored in the JSON; the backend maps this to a `PersonNameAlias` of type `BIRTH_NAME`.
|
||||||
|
- **alias** = `null` (the tool does not invent aliases; maiden name is the alias).
|
||||||
|
|
||||||
|
### Generation
|
||||||
|
Extract the first digit sequence from col A:
|
||||||
|
```python
|
||||||
|
import re
|
||||||
|
m = re.search(r"\d+", raw_generation)
|
||||||
|
generation = int(m.group()) if m else None
|
||||||
|
```
|
||||||
|
Handles all observed variants: `"G 3"`, `"G3"`, `"G 0"`, `"G 2 de Gruyter"`.
|
||||||
|
Stored as `generation: int | null` in the JSON (informational; not mapped to a backend field directly).
|
||||||
|
|
||||||
|
### familyMember
|
||||||
|
Set `true` for all records. Every person in this register is part of the family network. The backend can refine this.
|
||||||
|
|
||||||
|
### notes
|
||||||
|
Constructed by concatenation:
|
||||||
|
1. Unmatched Bemerkung text (after relationship pattern is stripped).
|
||||||
|
2. Unresolvable date raw values (prefixed with field name).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Name Lookup Index
|
||||||
|
|
||||||
|
After pass 1, build a `dict[str, list[str]]` mapping normalized name keys → list of `rowId`s.
|
||||||
|
|
||||||
|
### Normalization function `_norm(s) -> str`
|
||||||
|
1. Lowercase.
|
||||||
|
2. Strip surrounding `"` and `'`.
|
||||||
|
3. Remove parenthetical substrings: `r"\([^)]*\)"`.
|
||||||
|
4. Collapse internal whitespace.
|
||||||
|
5. Strip geographic and generational suffixes: `aachen`, `mex.`, `mexiko`, `sen`, `jun`, `jr`.
|
||||||
|
6. Strip trailing commas, dots.
|
||||||
|
|
||||||
|
### Keys indexed per person
|
||||||
|
For a person with firstName `F`, lastName `L`, maidenName `M`:
|
||||||
|
- `_norm(f"{F} {L}")` — canonical order
|
||||||
|
- `_norm(f"{L} {F}")` — reversed order (col I uses this heavily)
|
||||||
|
- `_norm(f"{F} {M}")` if maidenName is set — maiden-name reference
|
||||||
|
- `_norm(L)` alone — single-token fallback
|
||||||
|
|
||||||
|
### Match resolution
|
||||||
|
Given a raw name string from col I or col J:
|
||||||
|
1. `_norm(raw)` → look up in index.
|
||||||
|
2. **Exactly one hit** → match confirmed, use that `rowId`.
|
||||||
|
3. **Zero hits** → `reason: "not_found"` → `unresolved[]`.
|
||||||
|
4. **Multiple hits** → `reason: "ambiguous"` → `unresolved[]`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Relationship Extraction
|
||||||
|
|
||||||
|
### 7.1 SPOUSE_OF (col I — `verheiratet mit`)
|
||||||
|
|
||||||
|
1. Normalize col I value.
|
||||||
|
2. Resolve via name index (§6).
|
||||||
|
3. If matched: emit one edge `{ personId, relatedPersonId, type: "SPOUSE_OF", source: "verheiratet_mit" }`.
|
||||||
|
- Skip if an identical edge (regardless of direction) already exists in the relationship list.
|
||||||
|
4. If unresolved: add to `unresolved[]`.
|
||||||
|
|
||||||
|
### 7.2 PARENT_OF (col J — `Bemerkung`)
|
||||||
|
|
||||||
|
Apply these regex patterns in order, case-insensitive, with optional whitespace:
|
||||||
|
|
||||||
|
| Pattern | Direction | Note |
|
||||||
|
|---------|-----------|------|
|
||||||
|
| `(Sohn\|Tochter)\s+v(?:on)?\s+(.+)` | Named person(s) → this person | "Sohn v Clara u Herbert" |
|
||||||
|
| `(Vater\|Mutter)\s+v(?:on)?\s+(.+)` | This person → named person(s) | "Vater v Herbert" |
|
||||||
|
|
||||||
|
**Multi-name extraction:** The captured name string may contain two persons joined by `\s+u(?:nd)?\s+` (two parents after `Sohn`/`Tochter`, two children after `Vater`/`Mutter`). Split on this pattern, resolve each part independently.
|
||||||
|
|
||||||
|
**Emit** one `PARENT_OF` edge per resolved parent–child pair:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"personId": "<parent_rowId>",
|
||||||
|
"relatedPersonId": "<child_rowId>",
|
||||||
|
"type": "PARENT_OF",
|
||||||
|
"source": "bemerkung",
|
||||||
|
"rawBemerkung": "<original col J value>"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Skip** (do not emit, do not add to `unresolved[]`, leave in notes):
|
||||||
|
- Patterns starting with `Neffe`, `Nichte`, `Enkel`, `Enkelin`, `Urenkel`, `Urenkelin` — too indirect.
|
||||||
|
- Patterns starting with `Bruder`, `Schwester` — SIBLING_OF is out of scope for this tool.
|
||||||
|
- Any other Bemerkung text that does not match the parent patterns.
|
||||||
|
|
||||||
|
**After extraction:** the matched portion of the Bemerkung is removed; the remainder goes into `person.notes`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Output JSON Schema
|
||||||
|
|
||||||
|
File: `tools/import-normalizer/out/canonical-persons-tree.json`
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"generated_at": "<ISO-8601 timestamp>",
|
||||||
|
"source": "Personendatei 2.xlsx",
|
||||||
|
"stats": {
|
||||||
|
"persons": 163,
|
||||||
|
"relationships": 87,
|
||||||
|
"unresolved": 12
|
||||||
|
},
|
||||||
|
"persons": [
|
||||||
|
{
|
||||||
|
"rowId": "row_002",
|
||||||
|
"firstName": "Elsgard",
|
||||||
|
"lastName": "Allemeyer",
|
||||||
|
"maidenName": "Wöhler",
|
||||||
|
"alias": null,
|
||||||
|
"notes": "Nichte von Herbert",
|
||||||
|
"birthYear": 1920,
|
||||||
|
"deathYear": 1999,
|
||||||
|
"birthPlace": "Garz",
|
||||||
|
"deathPlace": "Espelkamp",
|
||||||
|
"generation": 3,
|
||||||
|
"familyMember": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"relationships": [
|
||||||
|
{
|
||||||
|
"personId": "row_002",
|
||||||
|
"relatedPersonId": "row_003",
|
||||||
|
"type": "SPOUSE_OF",
|
||||||
|
"source": "verheiratet_mit"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"personId": "row_019",
|
||||||
|
"relatedPersonId": "row_021",
|
||||||
|
"type": "PARENT_OF",
|
||||||
|
"source": "bemerkung",
|
||||||
|
"rawBemerkung": "Tochter v Clara u Herbert"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"unresolved": [
|
||||||
|
{
|
||||||
|
"rowId": "row_007",
|
||||||
|
"field": "verheiratet_mit",
|
||||||
|
"raw": "\"Tante Lolly\"",
|
||||||
|
"reason": "not_found"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"rowId": "row_042",
|
||||||
|
"field": "bemerkung",
|
||||||
|
"raw": "Zwillingsbruder v Herbert",
|
||||||
|
"reason": "not_found"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. CLI Interface
|
||||||
|
|
||||||
|
```
|
||||||
|
python3 persons_tree.py [--input PATH] [--output PATH] [--dry-run]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Flag | Default | Description |
|
||||||
|
|------|---------|-------------|
|
||||||
|
| `--input` | `../../import/Personendatei 2.xlsx` | Source Excel file |
|
||||||
|
| `--output` | `out/canonical-persons-tree.json` | Output JSON file |
|
||||||
|
| `--dry-run` | off | Print stats + first 5 unresolved entries; do not write file |
|
||||||
|
|
||||||
|
On success, print:
|
||||||
|
```
|
||||||
|
✓ 163 persons parsed
|
||||||
|
✓ 87 relationships emitted (52 SPOUSE_OF, 35 PARENT_OF)
|
||||||
|
⚠ 12 unresolved (see unresolved[] in output)
|
||||||
|
→ out/canonical-persons-tree.json
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Module Reuse
|
||||||
|
|
||||||
|
| Existing module | What we reuse |
|
||||||
|
|-----------------|---------------|
|
||||||
|
| `dates.parse_date()` | String date parsing — handles DD.MM.YYYY, year-only, month+year, approximate markers |
|
||||||
|
| `config.MONTHS` | Month name → integer mapping (German + Spanish month names already present) |
|
||||||
|
|
||||||
|
The Excel serial conversion is new logic added directly in `persons_tree.py` (3 lines).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. What This Tool Does NOT Do
|
||||||
|
|
||||||
|
- Does not call the backend API or touch the database.
|
||||||
|
- Does not create `PersonNameAlias` records — it emits `maidenName` as a field; the future backend importer maps it.
|
||||||
|
- Does not infer SIBLING_OF edges (requires symmetric lookup across multiple rows — deferred).
|
||||||
|
- Does not deduplicate persons that appear in both this file and `canonical-persons.xlsx` — deduplication is the backend importer's responsibility.
|
||||||
|
- Does not map `birthPlace` / `deathPlace` to backend columns — it only emits them as top-level, free-text, informational fields in the JSON (see §8). The `Person` entity has no corresponding columns; the future backend importer decides whether to add columns or fold the values into `notes`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Resolved Decisions
|
||||||
|
|
||||||
|
| OQ | Question | Decision |
|
||||||
|
|----|----------|----------|
|
||||||
|
| OQ-01 | Duplicate rows (127/138 — Christa Schütz; 129/139 — Christoph Seils). | **Tool deduplicates.** On pass 1, after building the person list, detect rows with identical `(firstName, lastName, birthYear)` and keep only the first occurrence. Log skipped row ids to stdout. |
|
||||||
|
| OQ-02 | `birthPlace` / `deathPlace` absent from `Person` entity. | **Keep as separate top-level fields** in the JSON (`birthPlace`, `deathPlace`). The future backend importer may add columns to the `persons` table; the field is preserved here to avoid data loss. |
|
||||||
|
| OQ-03 | `firstName` = `"Charlotte,Meta,Jacobi"` (multi-name comma string). | **Store verbatim as `firstName`.** No splitting. |
|
||||||
@@ -1084,10 +1084,5 @@
|
|||||||
"timeline_dragging_aria_live": "Zeitraum {from} bis {to} ausgewählt",
|
"timeline_dragging_aria_live": "Zeitraum {from} bis {to} ausgewählt",
|
||||||
"error_page_id_label": "Fehler-ID",
|
"error_page_id_label": "Fehler-ID",
|
||||||
"error_copy_id_label": "ID kopieren",
|
"error_copy_id_label": "ID kopieren",
|
||||||
"error_copied": "Kopiert!",
|
"error_copied": "Kopiert!"
|
||||||
"themen_widget_title": "Themen",
|
|
||||||
"themen_alle": "Alle Themen",
|
|
||||||
"themen_leer": "Noch keine Themen vergeben.",
|
|
||||||
"themen_weitere": "+ {count} weitere",
|
|
||||||
"themen_dokumente": "{count} Dokumente"
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1084,10 +1084,5 @@
|
|||||||
"timeline_dragging_aria_live": "Range {from} to {to} selected",
|
"timeline_dragging_aria_live": "Range {from} to {to} selected",
|
||||||
"error_page_id_label": "Error ID",
|
"error_page_id_label": "Error ID",
|
||||||
"error_copy_id_label": "Copy ID",
|
"error_copy_id_label": "Copy ID",
|
||||||
"error_copied": "Copied!",
|
"error_copied": "Copied!"
|
||||||
"themen_widget_title": "Topics",
|
|
||||||
"themen_alle": "All Topics",
|
|
||||||
"themen_leer": "No topics assigned yet.",
|
|
||||||
"themen_weitere": "+ {count} more",
|
|
||||||
"themen_dokumente": "{count} documents"
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1084,10 +1084,5 @@
|
|||||||
"timeline_dragging_aria_live": "Rango {from} a {to} seleccionado",
|
"timeline_dragging_aria_live": "Rango {from} a {to} seleccionado",
|
||||||
"error_page_id_label": "ID de error",
|
"error_page_id_label": "ID de error",
|
||||||
"error_copy_id_label": "Copiar ID",
|
"error_copy_id_label": "Copiar ID",
|
||||||
"error_copied": "¡Copiado!",
|
"error_copied": "¡Copiado!"
|
||||||
"themen_widget_title": "Temas",
|
|
||||||
"themen_alle": "Todos los temas",
|
|
||||||
"themen_leer": "Aún no hay temas.",
|
|
||||||
"themen_weitere": "+ {count} más",
|
|
||||||
"themen_dokumente": "{count} documentos"
|
|
||||||
}
|
}
|
||||||
|
|||||||
10
frontend/package-lock.json
generated
10
frontend/package-lock.json
generated
@@ -23,9 +23,9 @@
|
|||||||
"@eslint/compat": "^1.4.0",
|
"@eslint/compat": "^1.4.0",
|
||||||
"@eslint/js": "^9.39.1",
|
"@eslint/js": "^9.39.1",
|
||||||
"@inlang/paraglide-js": "^2.5.0",
|
"@inlang/paraglide-js": "^2.5.0",
|
||||||
"@playwright/test": "^1.60.0",
|
"@playwright/test": "^1.58.2",
|
||||||
"@sveltejs/adapter-node": "^5.5.4",
|
"@sveltejs/adapter-node": "^5.4.0",
|
||||||
"@sveltejs/kit": "^2.60.1",
|
"@sveltejs/kit": "^2.48.5",
|
||||||
"@sveltejs/vite-plugin-svelte": "^6.2.1",
|
"@sveltejs/vite-plugin-svelte": "^6.2.1",
|
||||||
"@tailwindcss/forms": "^0.5.10",
|
"@tailwindcss/forms": "^0.5.10",
|
||||||
"@tailwindcss/typography": "^0.5.19",
|
"@tailwindcss/typography": "^0.5.19",
|
||||||
@@ -43,7 +43,7 @@
|
|||||||
"globals": "^16.5.0",
|
"globals": "^16.5.0",
|
||||||
"openapi-typescript": "^7.8.0",
|
"openapi-typescript": "^7.8.0",
|
||||||
"patch-package": "^8.0.0",
|
"patch-package": "^8.0.0",
|
||||||
"playwright": "^1.60.0",
|
"playwright": "^1.56.1",
|
||||||
"prettier": "^3.6.2",
|
"prettier": "^3.6.2",
|
||||||
"prettier-plugin-svelte": "^3.4.0",
|
"prettier-plugin-svelte": "^3.4.0",
|
||||||
"prettier-plugin-tailwindcss": "^0.7.1",
|
"prettier-plugin-tailwindcss": "^0.7.1",
|
||||||
@@ -52,7 +52,7 @@
|
|||||||
"tailwindcss": "^4.1.17",
|
"tailwindcss": "^4.1.17",
|
||||||
"typescript": "^5.9.3",
|
"typescript": "^5.9.3",
|
||||||
"typescript-eslint": "^8.47.0",
|
"typescript-eslint": "^8.47.0",
|
||||||
"vite": "^7.3.3",
|
"vite": "^7.2.2",
|
||||||
"vite-plugin-devtools-json": "^1.0.0",
|
"vite-plugin-devtools-json": "^1.0.0",
|
||||||
"vitest": "^4.0.10",
|
"vitest": "^4.0.10",
|
||||||
"vitest-browser-svelte": "^2.0.1"
|
"vitest-browser-svelte": "^2.0.1"
|
||||||
|
|||||||
@@ -2205,10 +2205,10 @@ export interface components {
|
|||||||
totalStories: number;
|
totalStories: number;
|
||||||
};
|
};
|
||||||
PersonSummaryDTO: {
|
PersonSummaryDTO: {
|
||||||
title?: string;
|
|
||||||
/** Format: uuid */
|
/** Format: uuid */
|
||||||
id?: string;
|
id?: string;
|
||||||
displayName?: string;
|
displayName?: string;
|
||||||
|
title?: string;
|
||||||
firstName?: string;
|
firstName?: string;
|
||||||
lastName?: string;
|
lastName?: string;
|
||||||
/** Format: int64 */
|
/** Format: int64 */
|
||||||
@@ -2315,6 +2315,8 @@ export interface components {
|
|||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
totalPages?: number;
|
totalPages?: number;
|
||||||
pageable?: components["schemas"]["PageableObject"];
|
pageable?: components["schemas"]["PageableObject"];
|
||||||
|
first?: boolean;
|
||||||
|
last?: boolean;
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
size?: number;
|
size?: number;
|
||||||
content?: components["schemas"]["NotificationDTO"][];
|
content?: components["schemas"]["NotificationDTO"][];
|
||||||
@@ -2323,8 +2325,6 @@ export interface components {
|
|||||||
sort?: components["schemas"]["SortObject"];
|
sort?: components["schemas"]["SortObject"];
|
||||||
/** Format: int32 */
|
/** Format: int32 */
|
||||||
numberOfElements?: number;
|
numberOfElements?: number;
|
||||||
first?: boolean;
|
|
||||||
last?: boolean;
|
|
||||||
empty?: boolean;
|
empty?: boolean;
|
||||||
};
|
};
|
||||||
PageableObject: {
|
PageableObject: {
|
||||||
@@ -2407,10 +2407,6 @@ export interface components {
|
|||||||
completionPercentage: number;
|
completionPercentage: number;
|
||||||
contributors: components["schemas"]["ActivityActorDTO"][];
|
contributors: components["schemas"]["ActivityActorDTO"][];
|
||||||
matchData: components["schemas"]["SearchMatchData"];
|
matchData: components["schemas"]["SearchMatchData"];
|
||||||
/** Format: date-time */
|
|
||||||
createdAt: string;
|
|
||||||
/** Format: date-time */
|
|
||||||
updatedAt: string;
|
|
||||||
};
|
};
|
||||||
DocumentSearchResult: {
|
DocumentSearchResult: {
|
||||||
items: components["schemas"]["DocumentListItem"][];
|
items: components["schemas"]["DocumentListItem"][];
|
||||||
|
|||||||
@@ -3,16 +3,16 @@ import * as m from '$lib/paraglide/messages.js';
|
|||||||
import { relativeTimeDe } from '$lib/shared/relativeTime';
|
import { relativeTimeDe } from '$lib/shared/relativeTime';
|
||||||
import type { components } from '$lib/generated/api';
|
import type { components } from '$lib/generated/api';
|
||||||
|
|
||||||
type DocumentListItem = components['schemas']['DocumentListItem'];
|
type Document = components['schemas']['Document'];
|
||||||
|
|
||||||
interface Props {
|
interface Props {
|
||||||
documents: DocumentListItem[];
|
documents: Document[];
|
||||||
}
|
}
|
||||||
|
|
||||||
const { documents }: Props = $props();
|
const { documents }: Props = $props();
|
||||||
|
|
||||||
function isNew(doc: DocumentListItem): boolean {
|
function isNew(doc: Document): boolean {
|
||||||
return new Date(doc.createdAt).getTime() > Date.now() - 7 * 24 * 60 * 60 * 1000;
|
return new Date(doc.createdAt).getTime() === new Date(doc.updatedAt).getTime();
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|||||||
@@ -5,33 +5,24 @@ import { page } from 'vitest/browser';
|
|||||||
import ReaderRecentDocs from './ReaderRecentDocs.svelte';
|
import ReaderRecentDocs from './ReaderRecentDocs.svelte';
|
||||||
import type { components } from '$lib/generated/api';
|
import type { components } from '$lib/generated/api';
|
||||||
|
|
||||||
type DocumentListItem = components['schemas']['DocumentListItem'];
|
type Document = components['schemas']['Document'];
|
||||||
|
|
||||||
afterEach(() => {
|
afterEach(() => {
|
||||||
cleanup();
|
cleanup();
|
||||||
});
|
});
|
||||||
|
|
||||||
const baseDoc: DocumentListItem = {
|
const baseDoc: Document = {
|
||||||
id: 'doc1',
|
id: 'doc1',
|
||||||
title: 'Brief an Hans',
|
title: 'Brief an Hans',
|
||||||
originalFilename: 'brief.pdf',
|
originalFilename: 'brief.pdf',
|
||||||
completionPercentage: 0,
|
status: 'UPLOADED',
|
||||||
receivers: [],
|
metadataComplete: true,
|
||||||
tags: [],
|
scriptType: 'HANDWRITING_KURRENT',
|
||||||
contributors: [],
|
|
||||||
matchData: {
|
|
||||||
titleOffsets: [],
|
|
||||||
senderMatched: false,
|
|
||||||
matchedReceiverIds: [],
|
|
||||||
matchedTagIds: [],
|
|
||||||
snippetOffsets: [],
|
|
||||||
summaryOffsets: []
|
|
||||||
},
|
|
||||||
createdAt: '2025-01-01T12:00:00Z',
|
createdAt: '2025-01-01T12:00:00Z',
|
||||||
updatedAt: '2025-01-01T12:00:00Z'
|
updatedAt: '2025-01-01T12:00:00Z'
|
||||||
};
|
};
|
||||||
|
|
||||||
const updatedDoc: DocumentListItem = {
|
const updatedDoc: Document = {
|
||||||
...baseDoc,
|
...baseDoc,
|
||||||
id: 'doc2',
|
id: 'doc2',
|
||||||
title: 'Urkunde 1920',
|
title: 'Urkunde 1920',
|
||||||
@@ -97,14 +88,8 @@ describe('ReaderRecentDocs', () => {
|
|||||||
expect(thumb!.className).toMatch(/rounded-/);
|
expect(thumb!.className).toMatch(/rounded-/);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('shows "Neu" accent-pill badge when document was created within the last 7 days', async () => {
|
it('shows "Neu" accent-pill badge when createdAt equals updatedAt', async () => {
|
||||||
const recentDoc: DocumentListItem = {
|
render(ReaderRecentDocs, { documents: [baseDoc] });
|
||||||
...baseDoc,
|
|
||||||
id: 'doc-recent',
|
|
||||||
createdAt: new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString(),
|
|
||||||
updatedAt: new Date(Date.now() - 1 * 24 * 60 * 60 * 1000).toISOString()
|
|
||||||
};
|
|
||||||
render(ReaderRecentDocs, { documents: [recentDoc] });
|
|
||||||
const badge = page.getByText(/^Neu$/i);
|
const badge = page.getByText(/^Neu$/i);
|
||||||
await expect.element(badge).toBeInTheDocument();
|
await expect.element(badge).toBeInTheDocument();
|
||||||
const cls = ((await badge.element()) as HTMLElement).className;
|
const cls = ((await badge.element()) as HTMLElement).className;
|
||||||
@@ -113,7 +98,7 @@ describe('ReaderRecentDocs', () => {
|
|||||||
expect(cls).toMatch(/\btext-ink\b/);
|
expect(cls).toMatch(/\btext-ink\b/);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('shows no badge when document was created more than 7 days ago', async () => {
|
it('shows no badge when updatedAt differs from createdAt', async () => {
|
||||||
render(ReaderRecentDocs, { documents: [updatedDoc] });
|
render(ReaderRecentDocs, { documents: [updatedDoc] });
|
||||||
const badge = page.getByText(/^Neu$/i);
|
const badge = page.getByText(/^Neu$/i);
|
||||||
await expect.element(badge).not.toBeInTheDocument();
|
await expect.element(badge).not.toBeInTheDocument();
|
||||||
@@ -121,20 +106,20 @@ describe('ReaderRecentDocs', () => {
|
|||||||
await expect.element(updatedBadge).not.toBeInTheDocument();
|
await expect.element(updatedBadge).not.toBeInTheDocument();
|
||||||
});
|
});
|
||||||
|
|
||||||
it('shows "Neu" badge when document was created 6 days ago', async () => {
|
it('shows "Neu" badge when createdAt and updatedAt represent the same instant in different ISO formats', async () => {
|
||||||
const almostOldDoc: DocumentListItem = {
|
const sameInstantDoc: Document = {
|
||||||
...baseDoc,
|
...baseDoc,
|
||||||
id: 'doc-almost-old',
|
id: 'doc-same-instant',
|
||||||
createdAt: new Date(Date.now() - 6 * 24 * 60 * 60 * 1000).toISOString(),
|
createdAt: '2025-01-01T12:00:00Z',
|
||||||
updatedAt: new Date(Date.now() - 5 * 24 * 60 * 60 * 1000).toISOString()
|
updatedAt: '2025-01-01T12:00:00.000Z'
|
||||||
};
|
};
|
||||||
render(ReaderRecentDocs, { documents: [almostOldDoc] });
|
render(ReaderRecentDocs, { documents: [sameInstantDoc] });
|
||||||
const badge = page.getByText(/^Neu$/i);
|
const badge = page.getByText(/^Neu$/i);
|
||||||
await expect.element(badge).toBeInTheDocument();
|
await expect.element(badge).toBeInTheDocument();
|
||||||
});
|
});
|
||||||
|
|
||||||
it('renders sender name text when sender is present', async () => {
|
it('renders sender name text when sender is present', async () => {
|
||||||
const docWithSender: DocumentListItem = {
|
const docWithSender: Document = {
|
||||||
...baseDoc,
|
...baseDoc,
|
||||||
sender: {
|
sender: {
|
||||||
id: 'p1',
|
id: 'p1',
|
||||||
|
|||||||
@@ -31,25 +31,25 @@ describe('ReaderRecentDocs', () => {
|
|||||||
.toHaveAttribute('href', '/documents');
|
.toHaveAttribute('href', '/documents');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('renders the New badge when document was created within the last 7 days', async () => {
|
it('renders the New badge when createdAt equals updatedAt', async () => {
|
||||||
const recentDate = new Date(Date.now() - 2 * 24 * 60 * 60 * 1000).toISOString();
|
|
||||||
const laterUpdate = new Date(Date.now() - 1 * 24 * 60 * 60 * 1000).toISOString();
|
|
||||||
render(ReaderRecentDocs, {
|
render(ReaderRecentDocs, {
|
||||||
props: {
|
props: {
|
||||||
documents: [makeDoc({ createdAt: recentDate, updatedAt: laterUpdate })]
|
documents: [
|
||||||
|
makeDoc({ createdAt: '2026-04-15T10:00:00Z', updatedAt: '2026-04-15T10:00:00Z' })
|
||||||
|
]
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
await expect.element(page.getByText('Neu')).toBeVisible();
|
await expect.element(page.getByText('Neu')).toBeVisible();
|
||||||
});
|
});
|
||||||
|
|
||||||
it('hides the New badge when document was created more than 7 days ago', async () => {
|
it('hides the New badge when document was updated after creation', async () => {
|
||||||
render(ReaderRecentDocs, {
|
render(ReaderRecentDocs, {
|
||||||
props: {
|
props: {
|
||||||
documents: [
|
documents: [
|
||||||
makeDoc({
|
makeDoc({
|
||||||
createdAt: '2026-04-15T10:00:00Z',
|
createdAt: '2026-04-15T10:00:00Z',
|
||||||
updatedAt: '2026-04-15T10:00:00Z'
|
updatedAt: '2026-04-15T11:00:00Z'
|
||||||
})
|
})
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,67 +0,0 @@
|
|||||||
<script lang="ts">
|
|
||||||
import * as m from '$lib/paraglide/messages.js';
|
|
||||||
import type { components } from '$lib/generated/api';
|
|
||||||
import { hasAnyDocuments } from '$lib/shared/utils/tagUtils';
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
interface Props {
|
|
||||||
tags: TagTreeNodeDTO[];
|
|
||||||
compact?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
const MAX_VISIBLE_TAGS = 6;
|
|
||||||
|
|
||||||
const { tags, compact = false }: Props = $props();
|
|
||||||
|
|
||||||
const visibleTags = $derived.by(() => tags.filter(hasAnyDocuments));
|
|
||||||
const shownTags = $derived(visibleTags.slice(0, MAX_VISIBLE_TAGS));
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<section class="rounded-sm border border-line bg-surface p-5 shadow-sm">
|
|
||||||
<div class="mb-4 flex items-center justify-between">
|
|
||||||
<h2 class="font-sans text-xs font-bold tracking-widest text-ink-3 uppercase">
|
|
||||||
{m.themen_widget_title()}
|
|
||||||
</h2>
|
|
||||||
<a
|
|
||||||
href="/themen"
|
|
||||||
class="flex min-h-[44px] items-center text-[11px] font-semibold text-ink-2 no-underline focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none"
|
|
||||||
>
|
|
||||||
{m.themen_alle()} →
|
|
||||||
</a>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{#if visibleTags.length === 0}
|
|
||||||
<p class="font-sans text-sm text-ink-3">{m.themen_leer()}</p>
|
|
||||||
{:else}
|
|
||||||
<div
|
|
||||||
class="grid gap-2 {compact ? 'grid-cols-1' : 'grid-cols-1 sm:grid-cols-2'}"
|
|
||||||
data-compact={compact}
|
|
||||||
>
|
|
||||||
{#each shownTags as tag (tag.id)}
|
|
||||||
<a
|
|
||||||
href="/documents?tag={encodeURIComponent(tag.name)}"
|
|
||||||
aria-label="{tag.name}{tag.documentCount > 0
|
|
||||||
? ', ' + m.themen_dokumente({ count: tag.documentCount })
|
|
||||||
: ''}"
|
|
||||||
class="flex cursor-pointer items-stretch overflow-hidden rounded-sm border border-line bg-canvas hover:bg-surface focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none"
|
|
||||||
style="min-height: 56px"
|
|
||||||
>
|
|
||||||
<span
|
|
||||||
class="w-1 flex-shrink-0 self-stretch"
|
|
||||||
aria-hidden="true"
|
|
||||||
style="background: var(--c-tag-{tag.color ?? 'slate'})"
|
|
||||||
></span>
|
|
||||||
<span class="flex min-w-0 flex-1 flex-col justify-center gap-0.5 px-3 py-3">
|
|
||||||
<span class="truncate font-serif text-sm font-semibold text-ink">{tag.name}</span>
|
|
||||||
{#if tag.documentCount > 0}
|
|
||||||
<span class="font-sans text-xs text-ink-3 tabular-nums">
|
|
||||||
{m.themen_dokumente({ count: tag.documentCount })}
|
|
||||||
</span>
|
|
||||||
{/if}
|
|
||||||
</span>
|
|
||||||
</a>
|
|
||||||
{/each}
|
|
||||||
</div>
|
|
||||||
{/if}
|
|
||||||
</section>
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
import { describe, it, expect, afterEach } from 'vitest';
|
|
||||||
import { cleanup, render } from 'vitest-browser-svelte';
|
|
||||||
import ThemenWidget from './ThemenWidget.svelte';
|
|
||||||
import type { components } from '$lib/generated/api';
|
|
||||||
|
|
||||||
afterEach(() => {
|
|
||||||
cleanup();
|
|
||||||
});
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
function makeTag(
|
|
||||||
name: string,
|
|
||||||
documentCount: number,
|
|
||||||
children: TagTreeNodeDTO[] = []
|
|
||||||
): TagTreeNodeDTO {
|
|
||||||
return { id: 'id-' + name, name, documentCount, children };
|
|
||||||
}
|
|
||||||
|
|
||||||
describe('ThemenWidget', () => {
|
|
||||||
it('renders a card link per visible tag', async () => {
|
|
||||||
const tags = [makeTag('Briefe', 5), makeTag('Fotos', 3)];
|
|
||||||
const { getByRole } = render(ThemenWidget, { tags });
|
|
||||||
await expect.element(getByRole('link', { name: /Briefe/ })).toBeInTheDocument();
|
|
||||||
await expect.element(getByRole('link', { name: /Fotos/ })).toBeInTheDocument();
|
|
||||||
});
|
|
||||||
|
|
||||||
it('hides tags where no document exists in the subtree', async () => {
|
|
||||||
const tags = [makeTag('Briefe', 5), makeTag('Leer', 0)];
|
|
||||||
render(ThemenWidget, { tags });
|
|
||||||
expect(document.body.textContent).toContain('Briefe');
|
|
||||||
expect(document.body.textContent).not.toContain('Leer');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('shows the empty state text when all tags are filtered out', async () => {
|
|
||||||
render(ThemenWidget, { tags: [makeTag('Leer', 0)] });
|
|
||||||
expect(document.body.textContent).toMatch(/Noch keine Themen/);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('shows empty state when tags array is empty', async () => {
|
|
||||||
render(ThemenWidget, { tags: [] });
|
|
||||||
expect(document.body.textContent).toMatch(/Noch keine Themen/);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('renders in compact single-column mode when compact prop is true', async () => {
|
|
||||||
const tags = [makeTag('Briefe', 5)];
|
|
||||||
const { container } = render(ThemenWidget, { tags, compact: true });
|
|
||||||
const grid = container.querySelector('[data-compact="true"]');
|
|
||||||
expect(grid).not.toBeNull();
|
|
||||||
});
|
|
||||||
|
|
||||||
it('links to "Alle Themen" page', async () => {
|
|
||||||
const tags = [makeTag('Briefe', 5)];
|
|
||||||
const { getByRole } = render(ThemenWidget, { tags });
|
|
||||||
const link = getByRole('link', { name: /Alle Themen/ });
|
|
||||||
await expect.element(link).toHaveAttribute('href', '/themen');
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -409,24 +409,19 @@ describe('PersonMentionEditor — onExit cancels pending debounce', () => {
|
|||||||
await new Promise((r) => setTimeout(r, SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS));
|
await new Promise((r) => setTimeout(r, SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS));
|
||||||
const fetchesBeforeEscape = fetchMock.mock.calls.length;
|
const fetchesBeforeEscape = fetchMock.mock.calls.length;
|
||||||
|
|
||||||
// Freeze setTimeout so the 150 ms debounce cannot fire before Escape
|
// Trigger a new debounced search (queues runSearch after 150 ms), then
|
||||||
// triggers onExit. We install fake timers only now — after the setup
|
// immediately Escape *while focus is back in the editor* so Tiptap's
|
||||||
// above — so that vi.waitFor()'s real-timer polling still worked.
|
// suggestion-plugin Escape handler fires onExit before the debounce.
|
||||||
vi.useFakeTimers();
|
// Without onExit cancelling the pending debounce, runSearch executes
|
||||||
try {
|
// against the now-unmounted dropdown's state.
|
||||||
// fill() dispatches the input event synchronously via CDP; by the
|
|
||||||
// time the await resolves, onSearch('Walter') has run and the fake
|
|
||||||
// debounce timer is set.
|
|
||||||
await page.getByRole('searchbox').fill('Walter');
|
await page.getByRole('searchbox').fill('Walter');
|
||||||
// Focus the editor so the Escape lands on Tiptap's suggestion handler.
|
// Focus the editor so the Escape lands on Tiptap's suggestion handler.
|
||||||
(page.getByRole('textbox').element() as HTMLElement).focus();
|
(page.getByRole('textbox').element() as HTMLElement).focus();
|
||||||
await userEvent.keyboard('{Escape}');
|
await userEvent.keyboard('{Escape}');
|
||||||
// onExit has now called debouncedSearch.cancel(). Advance past the
|
|
||||||
// debounce window — the cancelled timer must not fire.
|
// Wait past the debounce window. If onExit did not cancel the pending
|
||||||
await vi.advanceTimersByTimeAsync(SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS);
|
// debounce, a fetch with q=Walter would still fire here.
|
||||||
} finally {
|
await new Promise((r) => setTimeout(r, SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS));
|
||||||
vi.useRealTimers();
|
|
||||||
}
|
|
||||||
|
|
||||||
const newFetches = fetchMock.mock.calls.slice(fetchesBeforeEscape);
|
const newFetches = fetchMock.mock.calls.slice(fetchesBeforeEscape);
|
||||||
const walterFetches = newFetches.filter(
|
const walterFetches = newFetches.filter(
|
||||||
|
|||||||
@@ -1,29 +0,0 @@
|
|||||||
import { describe, it, expect } from 'vitest';
|
|
||||||
import { hasAnyDocuments } from './tagUtils';
|
|
||||||
import type { components } from '$lib/generated/api';
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
function makeNode(documentCount: number, children: TagTreeNodeDTO[] = []): TagTreeNodeDTO {
|
|
||||||
return { id: 'id', name: 'name', documentCount, children };
|
|
||||||
}
|
|
||||||
|
|
||||||
describe('hasAnyDocuments', () => {
|
|
||||||
it('returns false for a leaf node with documentCount=0', () => {
|
|
||||||
expect(hasAnyDocuments(makeNode(0))).toBe(false);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('returns true for a leaf node with documentCount=3', () => {
|
|
||||||
expect(hasAnyDocuments(makeNode(3))).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('returns true for a root with documentCount=0 but a child with documentCount=5', () => {
|
|
||||||
const node = makeNode(0, [makeNode(5)]);
|
|
||||||
expect(hasAnyDocuments(node)).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('returns false for a root with documentCount=0 and all children also 0', () => {
|
|
||||||
const node = makeNode(0, [makeNode(0), makeNode(0)]);
|
|
||||||
expect(hasAnyDocuments(node)).toBe(false);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
import type { components } from '$lib/generated/api';
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
export function hasAnyDocuments(node: TagTreeNodeDTO): boolean {
|
|
||||||
return (node.documentCount ?? 0) > 0 || (node.children ?? []).some(hasAnyDocuments);
|
|
||||||
}
|
|
||||||
@@ -10,9 +10,8 @@ type DashboardPulseDTO = components['schemas']['DashboardPulseDTO'];
|
|||||||
type ActivityFeedItemDTO = components['schemas']['ActivityFeedItemDTO'];
|
type ActivityFeedItemDTO = components['schemas']['ActivityFeedItemDTO'];
|
||||||
type IncompleteDocumentDTO = components['schemas']['IncompleteDocumentDTO'];
|
type IncompleteDocumentDTO = components['schemas']['IncompleteDocumentDTO'];
|
||||||
type PersonSummaryDTO = components['schemas']['PersonSummaryDTO'];
|
type PersonSummaryDTO = components['schemas']['PersonSummaryDTO'];
|
||||||
type DocumentListItem = components['schemas']['DocumentListItem'];
|
type Document = components['schemas']['Document'];
|
||||||
type Geschichte = components['schemas']['Geschichte'];
|
type Geschichte = components['schemas']['Geschichte'];
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
function settled<T>(res: PromiseSettledResult<unknown> | undefined): T | null {
|
function settled<T>(res: PromiseSettledResult<unknown> | undefined): T | null {
|
||||||
if (res?.status !== 'fulfilled') return null;
|
if (res?.status !== 'fulfilled') return null;
|
||||||
@@ -41,8 +40,7 @@ export async function load({ fetch, parent }) {
|
|||||||
api.GET('/api/documents/search', {
|
api.GET('/api/documents/search', {
|
||||||
params: { query: { sort: 'UPDATED_AT', dir: 'DESC', size: 5 } }
|
params: { query: { sort: 'UPDATED_AT', dir: 'DESC', size: 5 } }
|
||||||
}),
|
}),
|
||||||
api.GET('/api/geschichten', { params: { query: { status: 'PUBLISHED', limit: 3 } } }),
|
api.GET('/api/geschichten', { params: { query: { status: 'PUBLISHED', limit: 3 } } })
|
||||||
api.GET('/api/tags/tree')
|
|
||||||
];
|
];
|
||||||
if (canBlogWrite) {
|
if (canBlogWrite) {
|
||||||
readerFetches.push(
|
readerFetches.push(
|
||||||
@@ -50,15 +48,14 @@ export async function load({ fetch, parent }) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const [statsRes, topPersonsRes, recentDocsRes, recentStoriesRes, tagTreeRes, draftsRes] =
|
const [statsRes, topPersonsRes, recentDocsRes, recentStoriesRes, draftsRes] =
|
||||||
await Promise.allSettled(readerFetches);
|
await Promise.allSettled(readerFetches);
|
||||||
|
|
||||||
const readerStats = settled<StatsDTO>(statsRes);
|
const readerStats = settled<StatsDTO>(statsRes);
|
||||||
const topPersons = settled<PersonSummaryDTO[]>(topPersonsRes) ?? [];
|
const topPersons = settled<PersonSummaryDTO[]>(topPersonsRes) ?? [];
|
||||||
const searchData = settled<{ items: DocumentListItem[] }>(recentDocsRes);
|
const searchData = settled<{ items: { document: Document }[] }>(recentDocsRes);
|
||||||
const recentDocs = searchData?.items ?? [];
|
const recentDocs = searchData?.items.map((i) => i.document) ?? [];
|
||||||
const recentStories = settled<Geschichte[]>(recentStoriesRes) ?? [];
|
const recentStories = settled<Geschichte[]>(recentStoriesRes) ?? [];
|
||||||
const tagTree = settled<TagTreeNodeDTO[]>(tagTreeRes) ?? [];
|
|
||||||
const drafts = settled<Geschichte[]>(draftsRes) ?? [];
|
const drafts = settled<Geschichte[]>(draftsRes) ?? [];
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -68,7 +65,6 @@ export async function load({ fetch, parent }) {
|
|||||||
topPersons,
|
topPersons,
|
||||||
recentDocs,
|
recentDocs,
|
||||||
recentStories,
|
recentStories,
|
||||||
tagTree,
|
|
||||||
drafts,
|
drafts,
|
||||||
error: null as string | null
|
error: null as string | null
|
||||||
};
|
};
|
||||||
@@ -84,8 +80,7 @@ export async function load({ fetch, parent }) {
|
|||||||
readyResult,
|
readyResult,
|
||||||
weeklyStatsResult,
|
weeklyStatsResult,
|
||||||
incompleteResult,
|
incompleteResult,
|
||||||
incompleteCountResult,
|
incompleteCountResult
|
||||||
tagTreeResult
|
|
||||||
] = await Promise.allSettled([
|
] = await Promise.allSettled([
|
||||||
api.GET('/api/stats'),
|
api.GET('/api/stats'),
|
||||||
api.GET('/api/dashboard/resume'),
|
api.GET('/api/dashboard/resume'),
|
||||||
@@ -96,8 +91,7 @@ export async function load({ fetch, parent }) {
|
|||||||
api.GET('/api/transcription/ready-to-read'),
|
api.GET('/api/transcription/ready-to-read'),
|
||||||
api.GET('/api/transcription/weekly-stats'),
|
api.GET('/api/transcription/weekly-stats'),
|
||||||
api.GET('/api/documents/incomplete', { params: { query: { size: 5 } } }),
|
api.GET('/api/documents/incomplete', { params: { query: { size: 5 } } }),
|
||||||
api.GET('/api/documents/incomplete-count'),
|
api.GET('/api/documents/incomplete-count')
|
||||||
api.GET('/api/tags/tree')
|
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let stats: StatsDTO | null = null;
|
let stats: StatsDTO | null = null;
|
||||||
@@ -110,7 +104,6 @@ export async function load({ fetch, parent }) {
|
|||||||
let weeklyStats: TranscriptionWeeklyStatsDTO | null = null;
|
let weeklyStats: TranscriptionWeeklyStatsDTO | null = null;
|
||||||
let incompleteDocs: IncompleteDocumentDTO[] = [];
|
let incompleteDocs: IncompleteDocumentDTO[] = [];
|
||||||
let incompleteTotal = 0;
|
let incompleteTotal = 0;
|
||||||
let tagTree: TagTreeNodeDTO[] = [];
|
|
||||||
|
|
||||||
if (statsResult.status === 'fulfilled' && statsResult.value.response.ok) {
|
if (statsResult.status === 'fulfilled' && statsResult.value.response.ok) {
|
||||||
stats = statsResult.value.data ?? null;
|
stats = statsResult.value.data ?? null;
|
||||||
@@ -142,9 +135,6 @@ export async function load({ fetch, parent }) {
|
|||||||
if (incompleteCountResult.status === 'fulfilled' && incompleteCountResult.value.response.ok) {
|
if (incompleteCountResult.status === 'fulfilled' && incompleteCountResult.value.response.ok) {
|
||||||
incompleteTotal = (incompleteCountResult.value.data?.count as number | undefined) ?? 0;
|
incompleteTotal = (incompleteCountResult.value.data?.count as number | undefined) ?? 0;
|
||||||
}
|
}
|
||||||
if (tagTreeResult.status === 'fulfilled' && tagTreeResult.value.response.ok) {
|
|
||||||
tagTree = (tagTreeResult.value.data as TagTreeNodeDTO[]) ?? [];
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
isReader: false as const,
|
isReader: false as const,
|
||||||
@@ -158,7 +148,6 @@ export async function load({ fetch, parent }) {
|
|||||||
weeklyStats,
|
weeklyStats,
|
||||||
incompleteDocs,
|
incompleteDocs,
|
||||||
incompleteTotal,
|
incompleteTotal,
|
||||||
tagTree,
|
|
||||||
error: null as string | null
|
error: null as string | null
|
||||||
};
|
};
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -178,9 +167,8 @@ export async function load({ fetch, parent }) {
|
|||||||
incompleteTotal: 0,
|
incompleteTotal: 0,
|
||||||
readerStats: null,
|
readerStats: null,
|
||||||
topPersons: [] as PersonSummaryDTO[],
|
topPersons: [] as PersonSummaryDTO[],
|
||||||
recentDocs: [] as DocumentListItem[],
|
recentDocs: [] as Document[],
|
||||||
recentStories: [] as Geschichte[],
|
recentStories: [] as Geschichte[],
|
||||||
tagTree: [] as TagTreeNodeDTO[],
|
|
||||||
drafts: [] as Geschichte[],
|
drafts: [] as Geschichte[],
|
||||||
error: 'Daten konnten nicht geladen werden.' as string | null
|
error: 'Daten konnten nicht geladen werden.' as string | null
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ import ReaderPersonChips from '$lib/shared/dashboard/ReaderPersonChips.svelte';
|
|||||||
import ReaderDraftsModule from '$lib/shared/dashboard/ReaderDraftsModule.svelte';
|
import ReaderDraftsModule from '$lib/shared/dashboard/ReaderDraftsModule.svelte';
|
||||||
import ReaderRecentDocs from '$lib/shared/dashboard/ReaderRecentDocs.svelte';
|
import ReaderRecentDocs from '$lib/shared/dashboard/ReaderRecentDocs.svelte';
|
||||||
import ReaderRecentStories from '$lib/shared/dashboard/ReaderRecentStories.svelte';
|
import ReaderRecentStories from '$lib/shared/dashboard/ReaderRecentStories.svelte';
|
||||||
import ThemenWidget from '$lib/shared/dashboard/ThemenWidget.svelte';
|
|
||||||
import { m } from '$lib/paraglide/messages.js';
|
import { m } from '$lib/paraglide/messages.js';
|
||||||
|
|
||||||
let { data } = $props();
|
let { data } = $props();
|
||||||
@@ -46,8 +45,6 @@ const greetingText = $derived.by(() => {
|
|||||||
|
|
||||||
<ReaderPersonChips persons={data.topPersons ?? []} />
|
<ReaderPersonChips persons={data.topPersons ?? []} />
|
||||||
|
|
||||||
<ThemenWidget tags={data.tagTree ?? []} />
|
|
||||||
|
|
||||||
<div class="grid grid-cols-1 gap-1.5 sm:grid-cols-2">
|
<div class="grid grid-cols-1 gap-1.5 sm:grid-cols-2">
|
||||||
<ReaderRecentDocs documents={data.recentDocs ?? []} />
|
<ReaderRecentDocs documents={data.recentDocs ?? []} />
|
||||||
<ReaderRecentStories stories={data.recentStories ?? []} />
|
<ReaderRecentStories stories={data.recentStories ?? []} />
|
||||||
@@ -59,13 +56,10 @@ const greetingText = $derived.by(() => {
|
|||||||
<h1 class="font-serif text-[2rem] text-ink">{greetingText}</h1>
|
<h1 class="font-serif text-[2rem] text-ink">{greetingText}</h1>
|
||||||
</div>
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
|
<div class="grid grid-cols-1 gap-5 lg:grid-cols-[1fr_320px] lg:items-start">
|
||||||
<div class="flex flex-col gap-5">
|
<div class="flex flex-col gap-5">
|
||||||
<DashboardResumeStrip resumeDoc={data.resumeDoc ?? null} />
|
<DashboardResumeStrip resumeDoc={data.resumeDoc ?? null} />
|
||||||
|
|
||||||
<ThemenWidget tags={data.tagTree ?? []} />
|
|
||||||
|
|
||||||
<div class="grid grid-cols-1 gap-5 lg:grid-cols-[1fr_320px] lg:items-start">
|
|
||||||
<div class="flex flex-col gap-5">
|
|
||||||
<EnrichmentBlock
|
<EnrichmentBlock
|
||||||
topDocs={data.incompleteDocs ?? []}
|
topDocs={data.incompleteDocs ?? []}
|
||||||
totalCount={data.incompleteTotal ?? 0}
|
totalCount={data.incompleteTotal ?? 0}
|
||||||
@@ -94,6 +88,5 @@ const greetingText = $derived.by(() => {
|
|||||||
{/if}
|
{/if}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
{/if}
|
{/if}
|
||||||
</main>
|
</main>
|
||||||
|
|||||||
@@ -108,8 +108,7 @@ describe('home page load — dashboard', () => {
|
|||||||
data: { segmentationCount: 0, transcriptionCount: 0, readyCount: 0 }
|
data: { segmentationCount: 0, transcriptionCount: 0, readyCount: 0 }
|
||||||
}) // weekly-stats
|
}) // weekly-stats
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // incomplete
|
.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // incomplete
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }) // incomplete-count
|
.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }); // incomplete-count
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }); // tags/tree
|
|
||||||
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
||||||
typeof createApiClient
|
typeof createApiClient
|
||||||
>);
|
>);
|
||||||
@@ -147,8 +146,7 @@ describe('home page load — dashboard', () => {
|
|||||||
data: { segmentationCount: 0, transcriptionCount: 0, readyCount: 0 }
|
data: { segmentationCount: 0, transcriptionCount: 0, readyCount: 0 }
|
||||||
}) // weekly-stats
|
}) // weekly-stats
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // incomplete
|
.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // incomplete
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }) // incomplete-count
|
.mockResolvedValueOnce({ response: { ok: true }, data: { count: 0 } }); // incomplete-count
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }); // tags/tree
|
|
||||||
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
||||||
typeof createApiClient
|
typeof createApiClient
|
||||||
>);
|
>);
|
||||||
@@ -396,56 +394,6 @@ describe('home page load — reader branch (isReader = !canWrite && !canAnnotate
|
|||||||
expect(result.isReader).toBe(false);
|
expect(result.isReader).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('maps search result items directly to recentDocs without wrapping in a .document property', async () => {
|
|
||||||
const searchItem = {
|
|
||||||
id: 'd1',
|
|
||||||
title: 'Liebesbrief',
|
|
||||||
originalFilename: 'letter.pdf',
|
|
||||||
completionPercentage: 80,
|
|
||||||
receivers: [],
|
|
||||||
tags: [],
|
|
||||||
contributors: [],
|
|
||||||
matchData: { titleOffsets: [], senderMatched: false },
|
|
||||||
createdAt: '2026-05-01T10:00:00Z',
|
|
||||||
updatedAt: '2026-05-10T08:00:00Z'
|
|
||||||
};
|
|
||||||
const mockGet = vi
|
|
||||||
.fn()
|
|
||||||
.mockResolvedValueOnce({ response: { ok: true, status: 200 }, data: [] }) // initial persons
|
|
||||||
.mockResolvedValueOnce({
|
|
||||||
response: { ok: true },
|
|
||||||
data: { totalDocuments: 1, totalPersons: 1 }
|
|
||||||
}) // stats
|
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // topPersons
|
|
||||||
.mockResolvedValueOnce({
|
|
||||||
response: { ok: true },
|
|
||||||
data: { items: [searchItem], totalElements: 1, pageNumber: 0, pageSize: 5, totalPages: 1 }
|
|
||||||
}) // search
|
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }) // stories
|
|
||||||
.mockResolvedValueOnce({ response: { ok: true }, data: [] }); // tags/tree
|
|
||||||
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
|
||||||
typeof createApiClient
|
|
||||||
>);
|
|
||||||
|
|
||||||
const result = await load({
|
|
||||||
url: makeUrl(),
|
|
||||||
request: new Request('http://localhost/'),
|
|
||||||
fetch: vi.fn() as unknown as typeof fetch,
|
|
||||||
parent: vi
|
|
||||||
.fn()
|
|
||||||
.mockResolvedValue({ canWrite: false, canAnnotate: false, canBlogWrite: false })
|
|
||||||
} as Parameters<typeof load>[0]);
|
|
||||||
|
|
||||||
expect(result.isReader).toBe(true);
|
|
||||||
if (result.isReader) {
|
|
||||||
expect(result.recentDocs).toHaveLength(1);
|
|
||||||
expect(result.recentDocs[0]).toBeDefined();
|
|
||||||
expect(result.recentDocs[0].id).toBe('d1');
|
|
||||||
expect(result.recentDocs[0].createdAt).toBe('2026-05-01T10:00:00Z');
|
|
||||||
expect(result.recentDocs[0].updatedAt).toBe('2026-05-10T08:00:00Z');
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
it('returns topPersons=[] when topPersons fetch fails, rest of data still loads', async () => {
|
it('returns topPersons=[] when topPersons fetch fails, rest of data still loads', async () => {
|
||||||
const okStats = {
|
const okStats = {
|
||||||
response: { ok: true, status: 200 },
|
response: { ok: true, status: 200 },
|
||||||
@@ -461,8 +409,7 @@ describe('home page load — reader branch (isReader = !canWrite && !canAnnotate
|
|||||||
.mockResolvedValueOnce(okStats)
|
.mockResolvedValueOnce(okStats)
|
||||||
.mockReturnValueOnce(failPersons)
|
.mockReturnValueOnce(failPersons)
|
||||||
.mockResolvedValueOnce(okSearch)
|
.mockResolvedValueOnce(okSearch)
|
||||||
.mockResolvedValueOnce(okStories)
|
.mockResolvedValueOnce(okStories);
|
||||||
.mockResolvedValueOnce({ response: { ok: true, status: 200 }, data: [] }); // tags/tree
|
|
||||||
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType<
|
||||||
typeof createApiClient
|
typeof createApiClient
|
||||||
>);
|
>);
|
||||||
|
|||||||
@@ -1,12 +0,0 @@
|
|||||||
import { error } from '@sveltejs/kit';
|
|
||||||
import { createApiClient } from '$lib/shared/api.server';
|
|
||||||
import type { components } from '$lib/generated/api';
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
export async function load({ fetch }: Parameters<import('./$types').PageServerLoad>[0]) {
|
|
||||||
const api = createApiClient(fetch);
|
|
||||||
const result = await api.GET('/api/tags/tree');
|
|
||||||
if (!result.response.ok) throw error(500, 'Themen konnten nicht geladen werden.');
|
|
||||||
return { tree: (result.data ?? []) as TagTreeNodeDTO[] };
|
|
||||||
}
|
|
||||||
@@ -1,85 +0,0 @@
|
|||||||
<script lang="ts">
|
|
||||||
import * as m from '$lib/paraglide/messages.js';
|
|
||||||
import BackButton from '$lib/shared/primitives/BackButton.svelte';
|
|
||||||
import { hasAnyDocuments } from '$lib/shared/utils/tagUtils';
|
|
||||||
import type { components } from '$lib/generated/api';
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
const MAX_VISIBLE_CHILDREN = 5;
|
|
||||||
|
|
||||||
let { data }: { data: { tree: TagTreeNodeDTO[] } } = $props();
|
|
||||||
|
|
||||||
const visibleTree = $derived.by(() => data.tree.filter(hasAnyDocuments));
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<svelte:head>
|
|
||||||
<title>{m.themen_widget_title()}</title>
|
|
||||||
</svelte:head>
|
|
||||||
|
|
||||||
<main class="mx-auto max-w-7xl px-4 py-8 sm:px-6 lg:px-8">
|
|
||||||
<div class="mb-6 flex items-center gap-3">
|
|
||||||
<BackButton />
|
|
||||||
<h1 class="font-serif text-2xl font-semibold text-ink">{m.themen_widget_title()}</h1>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{#if visibleTree.length === 0}
|
|
||||||
<p class="font-sans text-sm text-ink-3">{m.themen_leer()}</p>
|
|
||||||
{:else}
|
|
||||||
<div class="grid grid-cols-1 gap-4 sm:grid-cols-2 lg:grid-cols-3">
|
|
||||||
{#each visibleTree as tag (tag.id)}
|
|
||||||
{@const visibleChildren = (tag.children ?? []).filter(hasAnyDocuments)}
|
|
||||||
{@const shownChildren = visibleChildren.slice(0, MAX_VISIBLE_CHILDREN)}
|
|
||||||
{@const hiddenCount = visibleChildren.length - shownChildren.length}
|
|
||||||
|
|
||||||
<div class="overflow-hidden rounded-sm border border-line bg-surface shadow-sm">
|
|
||||||
<div
|
|
||||||
class="h-1.5 w-full flex-shrink-0"
|
|
||||||
aria-hidden="true"
|
|
||||||
style="background: var(--c-tag-{tag.color ?? 'slate'})"
|
|
||||||
></div>
|
|
||||||
|
|
||||||
<a
|
|
||||||
href="/documents?tag={encodeURIComponent(tag.name)}"
|
|
||||||
aria-label="{tag.name}{tag.documentCount > 0
|
|
||||||
? ', ' + m.themen_dokumente({ count: tag.documentCount })
|
|
||||||
: ''}"
|
|
||||||
class="flex min-h-[56px] items-center justify-between px-4 pt-4 pb-3 hover:bg-canvas focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none focus-visible:ring-inset"
|
|
||||||
>
|
|
||||||
<span class="font-serif text-base font-semibold text-ink">{tag.name}</span>
|
|
||||||
<span class="mr-1 ml-auto font-sans text-sm text-ink-3 tabular-nums">
|
|
||||||
{#if tag.documentCount > 0}{tag.documentCount}{/if}
|
|
||||||
</span>
|
|
||||||
<span aria-hidden="true" class="h-3.5 w-3.5 flex-shrink-0 text-brand-mint">›</span>
|
|
||||||
</a>
|
|
||||||
|
|
||||||
{#if shownChildren.length > 0}
|
|
||||||
<div class="mx-4 border-t border-line"></div>
|
|
||||||
|
|
||||||
{#each shownChildren as child (child.id)}
|
|
||||||
<a
|
|
||||||
href="/documents?tag={encodeURIComponent(child.name)}"
|
|
||||||
class="flex min-h-[44px] items-center justify-between px-4 py-2.5 hover:bg-canvas focus-visible:bg-canvas focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none focus-visible:ring-inset"
|
|
||||||
>
|
|
||||||
<span class="font-sans text-sm text-ink">{child.name}</span>
|
|
||||||
<span class="mr-1 ml-auto font-sans text-xs text-ink-3 tabular-nums">
|
|
||||||
{#if child.documentCount > 0}{child.documentCount}{/if}
|
|
||||||
</span>
|
|
||||||
<span aria-hidden="true" class="h-3 w-3 flex-shrink-0 text-brand-mint">›</span>
|
|
||||||
</a>
|
|
||||||
{/each}
|
|
||||||
|
|
||||||
{#if hiddenCount > 0}
|
|
||||||
<a
|
|
||||||
href="/documents?tag={encodeURIComponent(tag.name)}"
|
|
||||||
class="block min-h-[44px] px-4 py-2.5 font-sans text-sm text-ink-3 hover:bg-canvas hover:text-ink focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:outline-none focus-visible:ring-inset"
|
|
||||||
>
|
|
||||||
{m.themen_weitere({ count: hiddenCount })} →
|
|
||||||
</a>
|
|
||||||
{/if}
|
|
||||||
{/if}
|
|
||||||
</div>
|
|
||||||
{/each}
|
|
||||||
</div>
|
|
||||||
{/if}
|
|
||||||
</main>
|
|
||||||
@@ -1,60 +0,0 @@
|
|||||||
import { describe, expect, it, vi, beforeEach } from 'vitest';
|
|
||||||
|
|
||||||
vi.mock('$lib/shared/api.server', () => ({
|
|
||||||
createApiClient: vi.fn(),
|
|
||||||
extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
|
|
||||||
}));
|
|
||||||
|
|
||||||
import { createApiClient } from '$lib/shared/api.server';
|
|
||||||
|
|
||||||
beforeEach(() => vi.clearAllMocks());
|
|
||||||
|
|
||||||
function mockApiGet(ok: boolean, data: unknown) {
|
|
||||||
vi.mocked(createApiClient).mockReturnValue({
|
|
||||||
GET: vi.fn().mockResolvedValue({ response: { ok }, data })
|
|
||||||
} as ReturnType<typeof createApiClient>);
|
|
||||||
}
|
|
||||||
|
|
||||||
const makeTag = (name: string, documentCount = 0) => ({
|
|
||||||
id: 'id-' + name,
|
|
||||||
name,
|
|
||||||
documentCount,
|
|
||||||
children: []
|
|
||||||
});
|
|
||||||
|
|
||||||
describe('/themen +page.server load', () => {
|
|
||||||
function makeLoadEvent() {
|
|
||||||
return {
|
|
||||||
fetch: vi.fn() as unknown as typeof fetch,
|
|
||||||
request: new Request('http://localhost/themen'),
|
|
||||||
url: new URL('http://localhost/themen')
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
it('returns tag tree when API succeeds', async () => {
|
|
||||||
const tree = [makeTag('Briefe', 5), makeTag('Fotos', 3)];
|
|
||||||
mockApiGet(true, tree);
|
|
||||||
|
|
||||||
const { load } = await import('./+page.server');
|
|
||||||
const result = await load(makeLoadEvent());
|
|
||||||
|
|
||||||
expect(result.tree).toEqual(tree);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('returns empty array when API returns empty list', async () => {
|
|
||||||
mockApiGet(true, []);
|
|
||||||
|
|
||||||
const { load } = await import('./+page.server');
|
|
||||||
const result = await load(makeLoadEvent());
|
|
||||||
|
|
||||||
expect(result.tree).toEqual([]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('throws 500 when API call fails', async () => {
|
|
||||||
mockApiGet(false, null);
|
|
||||||
|
|
||||||
const { load } = await import('./+page.server');
|
|
||||||
|
|
||||||
await expect(load(makeLoadEvent())).rejects.toMatchObject({ status: 500 });
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
import { describe, it, expect, afterEach } from 'vitest';
|
|
||||||
import { cleanup, render } from 'vitest-browser-svelte';
|
|
||||||
import ThemenPage from './+page.svelte';
|
|
||||||
import type { components } from '$lib/generated/api';
|
|
||||||
|
|
||||||
afterEach(() => {
|
|
||||||
cleanup();
|
|
||||||
});
|
|
||||||
|
|
||||||
type TagTreeNodeDTO = components['schemas']['TagTreeNodeDTO'];
|
|
||||||
|
|
||||||
function makeTag(
|
|
||||||
name: string,
|
|
||||||
documentCount: number,
|
|
||||||
children: TagTreeNodeDTO[] = []
|
|
||||||
): TagTreeNodeDTO {
|
|
||||||
return { id: 'id-' + name, name, documentCount, children };
|
|
||||||
}
|
|
||||||
|
|
||||||
describe('/themen +page', () => {
|
|
||||||
it('renders one card per visible root tag', async () => {
|
|
||||||
const tree = [makeTag('Briefe', 5), makeTag('Fotos', 3)];
|
|
||||||
render(ThemenPage, { data: { tree } });
|
|
||||||
expect(document.body.textContent).toContain('Briefe');
|
|
||||||
expect(document.body.textContent).toContain('Fotos');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('does not render a tag with no documents in its subtree', async () => {
|
|
||||||
const tree = [makeTag('Briefe', 5), makeTag('Leer', 0)];
|
|
||||||
render(ThemenPage, { data: { tree } });
|
|
||||||
expect(document.body.textContent).not.toContain('Leer');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('shows empty state when all tags filtered out', async () => {
|
|
||||||
render(ThemenPage, { data: { tree: [makeTag('Leer', 0)] } });
|
|
||||||
expect(document.body.textContent).toMatch(/Noch keine Themen/);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('shows empty state when tree is empty', async () => {
|
|
||||||
render(ThemenPage, { data: { tree: [] } });
|
|
||||||
expect(document.body.textContent).toMatch(/Noch keine Themen/);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('renders child tags for a root tag', async () => {
|
|
||||||
const tree = [makeTag('Briefe', 5, [makeTag('Brautbriefe', 3), makeTag('Kriegsbriefe', 2)])];
|
|
||||||
render(ThemenPage, { data: { tree } });
|
|
||||||
expect(document.body.textContent).toContain('Brautbriefe');
|
|
||||||
expect(document.body.textContent).toContain('Kriegsbriefe');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('shows "+ N weitere" when a root tag has more than 5 children', async () => {
|
|
||||||
const children = Array.from({ length: 7 }, (_, i) => makeTag(`Kind${i}`, i + 1));
|
|
||||||
const tree = [makeTag('Briefe', 10, children)];
|
|
||||||
render(ThemenPage, { data: { tree } });
|
|
||||||
expect(document.body.textContent).toMatch(/\+\s*2\s*weitere/);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
6
tools/import-normalizer/.gitignore
vendored
Normal file
6
tools/import-normalizer/.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
.venv/
|
||||||
|
out/
|
||||||
|
!out/canonical-persons-tree.json
|
||||||
|
review/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
44
tools/import-normalizer/README.md
Normal file
44
tools/import-normalizer/README.md
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# Import Normalizer
|
||||||
|
|
||||||
|
Transforms the raw family-archive spreadsheets in `../../import/` into a clean canonical
|
||||||
|
dataset (`out/`) plus review reports (`review/`). See the spec:
|
||||||
|
`../../docs/import-migration/02-normalization-spec.md`.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
Requires **Python 3.12** (uses `StrEnum`, which is available from Python 3.11 onwards).
|
||||||
|
```bash
|
||||||
|
python3 -m venv .venv && .venv/bin/pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run
|
||||||
|
```bash
|
||||||
|
.venv/bin/python normalize.py
|
||||||
|
```
|
||||||
|
Outputs:
|
||||||
|
- `out/canonical-documents.xlsx`, `out/canonical-persons.xlsx`
|
||||||
|
- `review/*.csv` (residue to fix), `review/summary.txt` (grouped run stats incl. unknown-date rate)
|
||||||
|
|
||||||
|
## Iteration loop
|
||||||
|
1. **Run.** Read `review/summary.txt` for the health snapshot.
|
||||||
|
2. **Fix the residue** by editing the version-controlled overrides files, then re-run. Repeat.
|
||||||
|
|
||||||
|
| Review file | What to do |
|
||||||
|
| --- | --- |
|
||||||
|
| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). |
|
||||||
|
| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). |
|
||||||
|
| `index-file-mismatch.csv` | The `Datei` path disagrees with the index-derived filename — reconcile when the PDFs arrive. |
|
||||||
|
| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. |
|
||||||
|
|
||||||
|
> `unresolved-names.csv` is the focused "names that need a human" list. Non-family
|
||||||
|
> correspondents that simply aren't in the register are NOT reported — they just become
|
||||||
|
> provisional persons in `out/canonical-persons.xlsx` (the `unmatched_name_strings` count in
|
||||||
|
> `summary.txt` tracks how many). The given-name set that drives `ambiguous_pair` detection is
|
||||||
|
> the register's first names plus `config.EXTRA_GIVEN_NAMES` — add names there if a real
|
||||||
|
> two-person cell isn't being flagged.
|
||||||
|
|
||||||
|
**Valid `person_id` values** all come from the `person_id` column of `out/canonical-persons.xlsx`.
|
||||||
|
|
||||||
|
## Tests
|
||||||
|
```bash
|
||||||
|
.venv/bin/python -m pytest tests/test_dates.py -v # run files individually (never the whole suite at once)
|
||||||
|
```
|
||||||
135
tools/import-normalizer/config.py
Normal file
135
tools/import-normalizer/config.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
"""Tunables for the import normalizer. No logic here — only data tables."""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# --- Paths ---
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent
|
||||||
|
REPO_ROOT = BASE_DIR.parent.parent
|
||||||
|
IMPORT_DIR = REPO_ROOT / "import"
|
||||||
|
|
||||||
|
DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx"
|
||||||
|
DOCUMENT_SHEET = "Familienarchiv"
|
||||||
|
PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx"
|
||||||
|
PERSON_SHEET = "Tabelle1"
|
||||||
|
|
||||||
|
OUT_DIR = BASE_DIR / "out"
|
||||||
|
REVIEW_DIR = BASE_DIR / "review"
|
||||||
|
OVERRIDES_DIR = BASE_DIR / "overrides"
|
||||||
|
|
||||||
|
# --- Header text (lowercased, whitespace-collapsed) -> canonical field ---
|
||||||
|
DOCUMENT_HEADER_MAP = {
|
||||||
|
"index": "index",
|
||||||
|
"datei": "file",
|
||||||
|
"box": "box",
|
||||||
|
"mappe": "folder",
|
||||||
|
"briefeschreiberin": "sender",
|
||||||
|
"empfängerin": "receivers",
|
||||||
|
"datum des briefes": "date",
|
||||||
|
"ort": "location",
|
||||||
|
"schlagwort": "tags",
|
||||||
|
"inhalt": "summary",
|
||||||
|
}
|
||||||
|
DOCUMENT_REQUIRED_FIELDS = {"index"}
|
||||||
|
|
||||||
|
PERSON_HEADER_MAP = {
|
||||||
|
"generation": "generation",
|
||||||
|
"familienname": "last_name",
|
||||||
|
"vorname": "first_name",
|
||||||
|
"geb als": "maiden_name",
|
||||||
|
"geburtsdatum": "birth_date",
|
||||||
|
"geburtsort": "birth_place",
|
||||||
|
"todesdatum": "death_date",
|
||||||
|
"sterbeort": "death_place",
|
||||||
|
"verheiratet mit": "spouse",
|
||||||
|
"bemerkung": "notes",
|
||||||
|
}
|
||||||
|
PERSON_REQUIRED_FIELDS = {"last_name"}
|
||||||
|
|
||||||
|
# --- Century rule (archive 1873–1957) ---
|
||||||
|
TWO_DIGIT_19XX_MAX = 57 # 00..57 -> 1900+yy
|
||||||
|
TWO_DIGIT_18XX_MIN = 73 # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN
|
||||||
|
|
||||||
|
# --- Seasons -> representative month (day = 1) ---
|
||||||
|
SEASON_MONTHS = {
|
||||||
|
"frühling": 4, "fruehling": 4, "frühjahr": 4, "fruehjahr": 4,
|
||||||
|
"sommer": 7, "herbst": 10, "winter": 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Fixed feasts -> (month, day) ---
|
||||||
|
FIXED_FEASTS = {
|
||||||
|
"neujahr": (1, 1),
|
||||||
|
"heiligabend": (12, 24), "heiliger abend": (12, 24),
|
||||||
|
"weihnachten": (12, 25), "weihnacht": (12, 25), "1. weihnachtstag": (12, 25),
|
||||||
|
"silvester": (12, 31), "sylvester": (12, 31),
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Movable feasts -> day offset from Easter Sunday ---
|
||||||
|
MOVABLE_FEASTS = {
|
||||||
|
"karfreitag": -2,
|
||||||
|
"ostern": 0, "ostersonntag": 0, "ostermontag": 1,
|
||||||
|
"himmelfahrt": 39, "christi himmelfahrt": 39,
|
||||||
|
"pfingsten": 49, "pfingstsonntag": 49, "pfingstmontag": 50,
|
||||||
|
"fronleichnam": 60,
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Month names -> number (German + English, full + abbreviations) ---
|
||||||
|
MONTHS = {
|
||||||
|
"januar": 1, "jan": 1, "january": 1,
|
||||||
|
"februar": 2, "feb": 2, "febr": 2, "february": 2,
|
||||||
|
"märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3,
|
||||||
|
"april": 4, "apr": 4,
|
||||||
|
"mai": 5, "may": 5,
|
||||||
|
"juni": 6, "jun": 6, "june": 6,
|
||||||
|
"juli": 7, "jul": 7, "july": 7,
|
||||||
|
"august": 8, "aug": 8,
|
||||||
|
"september": 9, "sep": 9, "sept": 9,
|
||||||
|
"oktober": 10, "okt": 10, "oct": 10, "october": 10,
|
||||||
|
"november": 11, "nov": 11,
|
||||||
|
"dezember": 12, "dez": 12, "dec": 12, "december": 12,
|
||||||
|
# Spanish (Mexican-branch correspondence)
|
||||||
|
"enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6,
|
||||||
|
"julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10,
|
||||||
|
"noviembre": 11, "diciembre": 12,
|
||||||
|
}
|
||||||
|
|
||||||
|
ROMAN_MONTHS = {
|
||||||
|
"i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6,
|
||||||
|
"vii": 7, "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12,
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Person matching ---
|
||||||
|
KNOWN_LAST_NAMES = [
|
||||||
|
"von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa",
|
||||||
|
"de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram",
|
||||||
|
]
|
||||||
|
FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied
|
||||||
|
|
||||||
|
# --- Name classification (unresolved-name review) ---
|
||||||
|
# Relational reference terms — a sender/receiver named by relation, not a proper name.
|
||||||
|
RELATIONAL_TERMS = {
|
||||||
|
"tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter",
|
||||||
|
"großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine",
|
||||||
|
"neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter",
|
||||||
|
"schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer",
|
||||||
|
}
|
||||||
|
# Collective/group terms — not a single person. Matched against alpha-only word tokens
|
||||||
|
# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes.
|
||||||
|
COLLECTIVE_TERMS = {
|
||||||
|
"familie", "fam", "kinder", "eltern", "geschwister", "großeltern",
|
||||||
|
"grosseltern", "alle", "diverse", "div", "gebrüder", "gebr",
|
||||||
|
# Plural/group relational terms — added for tag generation heuristic
|
||||||
|
"söhne", "töchter", "brüder", "schwestern", "schwiegereltern",
|
||||||
|
"vettern", "kusinen", "cousinen", "nichten", "neffen", "tanten",
|
||||||
|
"freunde", "bekannte", "geschw", "enkelkinder", "jungens", "verwandten",
|
||||||
|
}
|
||||||
|
# Markers of an unknown/illegible name (the literal "?" is handled separately in code).
|
||||||
|
# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn"
|
||||||
|
# (it occurs inside real names: Hanni, Johanna, Anna).
|
||||||
|
UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"}
|
||||||
|
# A name-column value longer than this (chars) is treated as prose/description, not a name.
|
||||||
|
PROSE_MAX_LEN = 40
|
||||||
|
# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not
|
||||||
|
# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more.
|
||||||
|
EXTRA_GIVEN_NAMES = {
|
||||||
|
"ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara",
|
||||||
|
"margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta",
|
||||||
|
}
|
||||||
279
tools/import-normalizer/dates.py
Normal file
279
tools/import-normalizer/dates.py
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
"""Tolerant historical date parsing for the family archive."""
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import StrEnum
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
|
class Precision(StrEnum):
    """How precisely a parsed archive date is known."""

    DAY = "DAY"  # a full calendar date (also used for resolved feast days)
    MONTH = "MONTH"  # month + year only; the ISO date uses day 1
    SEASON = "SEASON"  # a season token; the ISO date uses the season's representative month, day 1
    YEAR = "YEAR"  # year only; the ISO date is 1 January of that year
    RANGE = "RANGE"  # a date range; the ISO date is the start of the range
    APPROX = "APPROX"  # a date was parsed but the raw text carried an uncertainty marker
    UNKNOWN = "UNKNOWN"  # nothing could be parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _advent_sunday(year: int, n: int) -> datetime.date:
    """Return the date of the n-th Advent Sunday (n in 1..4) of ``year``.

    The 4th Advent is the last Sunday on or before 24 December; the earlier
    Advent Sundays lie whole weeks before it.
    """
    christmas_eve = datetime.date(year, 12, 24)
    # weekday() is Mon=0 .. Sun=6, so (weekday + 1) % 7 is the number of days since Sunday.
    days_since_sunday = (christmas_eve.weekday() + 1) % 7
    weeks_before_fourth = 4 - n
    return christmas_eve - datetime.timedelta(days=days_since_sunday, weeks=weeks_before_fourth)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_feast_or_season(token: str, year: int):
    """Resolve a feast or season name to a date within ``year``.

    Returns an ``(iso_date, Precision)`` tuple for a recognised token, or
    ``None`` when the token is not a known feast or season. Lookup order is:
    movable feasts, fixed feasts, Advent Sundays, seasons.
    """
    # Normalise: lower-case, collapse inner whitespace, trim surrounding spaces and dots.
    name = " ".join(token.lower().split()).strip(" .")

    if name in config.MOVABLE_FEASTS:
        # Movable feasts are stored as a day offset from Easter Sunday.
        offset = datetime.timedelta(days=config.MOVABLE_FEASTS[name])
        return (easter(year) + offset).isoformat(), Precision.DAY

    if name in config.FIXED_FEASTS:
        feast_month, feast_day = config.FIXED_FEASTS[name]
        return datetime.date(year, feast_month, feast_day).isoformat(), Precision.DAY

    # A bare "advent" is treated as the 1st Advent.
    advent_numbers = {"advent": 1, "1. advent": 1, "2. advent": 2, "3. advent": 3, "4. advent": 4}
    if name in advent_numbers:
        return _advent_sunday(year, advent_numbers[name]).isoformat(), Precision.DAY

    if name in config.SEASON_MONTHS:
        season_start = datetime.date(year, config.SEASON_MONTHS[name], 1)
        return season_start.isoformat(), Precision.SEASON

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def expand_year(token: str):
    """Expand a 2-, 3- or 4-digit year string to a full year.

    Rules:
      * 4 digits: taken literally.
      * 3 digits: a leading "1" is assumed to have been dropped ("923" -> 1923).
      * 2 digits: resolved by the archive's century rule from ``config``
        (00..TWO_DIGIT_19XX_MAX -> 19yy, TWO_DIGIT_18XX_MIN..99 -> 18yy).

    Returns the year as an ``int``, or ``None`` when the token is not a
    decimal number, has an unsupported length, falls in the ambiguous
    two-digit gap, or lies outside the plausible 1700..2100 window.
    """
    token = token.strip()
    # isdecimal(), not isdigit(): isdigit() also accepts characters such as "²"
    # that int() cannot convert, which would raise instead of returning None.
    if not token.isdecimal():
        return None

    width, value = len(token), int(token)
    if width == 2:
        if value <= config.TWO_DIGIT_19XX_MAX:
            return 1900 + value
        if value >= config.TWO_DIGIT_18XX_MIN:
            return 1800 + value
        return None  # ambiguous century -> leave for review

    if width == 4:
        year = value
    elif width == 3:
        year = 1000 + value
    else:
        return None

    # Reject gross typos (e.g. "9003", or "123" -> 1123) so they go to review
    # instead of producing a bogus year.
    return year if 1700 <= year <= 2100 else None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ParsedDate:
    """Immutable result of parsing one raw date cell."""

    # ISO date string (YYYY-MM-DD), or None when nothing could be parsed.
    iso: str | None
    # How exact ``iso`` is; UNKNOWN when parsing failed.
    precision: Precision
    # The original cell text, passed through unmodified.
    raw: str
|
||||||
|
|
||||||
|
|
||||||
|
_LEADING_MARKERS = re.compile(
|
||||||
|
r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I)
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess(raw: str):
    """Clean a raw date cell and detect uncertainty markers.

    Returns ``(cleaned, approx)``: ``cleaned`` is the text with question
    marks, a trailing comma-note and a leading qualifier removed and
    whitespace collapsed; ``approx`` is True when any uncertainty or
    qualifier marker was present. Empty/None input yields ``("", False)``.
    """
    text = (raw or "").strip()
    if not text:
        return "", False

    lowered = text.lower()
    hedge_words = ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich")
    uncertain = "?" in text or any(word in lowered for word in hedge_words)

    # Blank out "(?)" groups first, then any remaining stray question marks.
    text = re.sub(r"\(\s*\?\s*\)", " ", text).replace("?", " ")
    # Drop a trailing editorial note such as ", 2. Brief".
    text = re.sub(r",.*$", "", text)

    # A leading qualifier (um/ca/nach/vor/anfang/...) also signals approximation.
    without_qualifier = _LEADING_MARKERS.sub("", text)
    uncertain = uncertain or without_qualifier != text

    cleaned = re.sub(r"\s+", " ", without_qualifier).strip(" .,")
    return cleaned, uncertain
|
||||||
|
|
||||||
|
|
||||||
|
_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})")
|
||||||
|
|
||||||
|
|
||||||
|
def _match_iso(s):
    """Match an already-ISO ``YYYY-MM-DD`` string.

    Returns ``(s, Precision.DAY)`` when ``s`` has the ISO shape and names a
    real calendar date, otherwise ``None``.
    """
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) is None:
        return None
    try:
        # Validation only: the parsed value is discarded, the input string is returned.
        datetime.date.fromisoformat(s)
    except ValueError:
        return None
    return s, Precision.DAY
|
||||||
|
|
||||||
|
|
||||||
|
def _match_numeric(s):
    """Match a purely numeric day/month/year date such as ``17.6.1916`` or ``17/6. 1916``.

    Returns ``(iso_date, Precision.DAY)`` or ``None`` when the text does not
    match, the year cannot be expanded, or the date does not exist.
    """
    hit = _NUM_RE.fullmatch(s)
    if hit is None:
        return None

    day_text, month_text, year_text = hit.groups()
    day_number, month_number = int(day_text), int(month_text)
    full_year = expand_year(year_text)
    if full_year is None or month_number < 1 or month_number > 12:
        return None

    try:
        parsed = datetime.date(full_year, month_number, day_number)
    except ValueError:  # e.g. 31 February
        return None
    return parsed.isoformat(), Precision.DAY
|
||||||
|
|
||||||
|
|
||||||
|
_ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I)
|
||||||
|
|
||||||
|
|
||||||
|
def _match_roman(s):
    """Match a date whose month is a Roman numeral, e.g. ``12. IV. 1923``.

    Returns ``(iso_date, Precision.DAY)`` or ``None`` when the text does not
    match, the numeral is not a month in ``config.ROMAN_MONTHS``, the year
    cannot be expanded, or the date does not exist.
    """
    hit = _ROMAN_RE.fullmatch(s)
    if hit is None:
        return None

    day_text, numeral, year_text = hit.groups()
    day_number = int(day_text)
    month_number = config.ROMAN_MONTHS.get(numeral.lower())
    full_year = expand_year(year_text)
    if full_year is None or not month_number:
        return None

    try:
        parsed = datetime.date(full_year, month_number, day_number)
    except ValueError:
        return None
    return parsed.isoformat(), Precision.DAY
|
||||||
|
|
||||||
|
|
||||||
|
_MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})")
|
||||||
|
|
||||||
|
|
||||||
|
def _lookup_month(token: str):
    """Return the month number for a month name/abbreviation, or ``None`` if unknown.

    The token is lower-cased and surrounding spaces and dots are trimmed
    before it is looked up in ``config.MONTHS``.
    """
    normalized = token.lower().strip(" .")
    return config.MONTHS.get(normalized)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_day_month_year(day, month, year):
    """Build ``(iso_date, Precision.DAY)`` from already-resolved parts.

    Returns ``None`` when the month or year is missing, the month is outside
    1..12, or the combination is not a real calendar date.
    """
    if year is None or not month or month < 1 or month > 12:
        return None
    try:
        built = datetime.date(year, month, day)
    except ValueError:
        return None
    return built.isoformat(), Precision.DAY
|
||||||
|
|
||||||
|
|
||||||
|
def _match_monthname_a(s):
    """Match day-first dates with a named month, e.g. ``7. Sept. 1923``.

    Returns ``(iso_date, Precision.DAY)`` or ``None``.
    """
    hit = _MONTH_A_RE.fullmatch(s)
    if hit is None:
        return None
    day_text, month_text, year_text = hit.groups()
    return _build_day_month_year(
        int(day_text), _lookup_month(month_text), expand_year(year_text))
|
||||||
|
|
||||||
|
|
||||||
|
# A separator (dot OR hyphen/en-dash) after the day is REQUIRED so this can't match
|
||||||
|
# "Mai 1895" (MONTH YYYY) as day=18; the hyphen form also covers Spanish "Mayo 18-1929".
|
||||||
|
_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-–]\s*(\d{2,4})")
|
||||||
|
|
||||||
|
|
||||||
|
def _match_monthname_b(s):
    """Match month-first dates with a named month, e.g. ``Mayo 18-1929``.

    Returns ``(iso_date, Precision.DAY)`` or ``None``.
    """
    hit = _MONTH_B_RE.fullmatch(s)
    if hit is None:
        return None
    month_text, day_text, year_text = hit.groups()
    return _build_day_month_year(
        int(day_text), _lookup_month(month_text), expand_year(year_text))
|
||||||
|
|
||||||
|
|
||||||
|
_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})")
|
||||||
|
_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})")
|
||||||
|
_YEAR_ONLY_RE = re.compile(r"\d{4}")
|
||||||
|
_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}")
|
||||||
|
_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*")
|
||||||
|
# Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it
|
||||||
|
# does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/").
|
||||||
|
_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)")
|
||||||
|
|
||||||
|
|
||||||
|
def _match_month_year(s):
    """Match a month name followed by a year, e.g. ``Mai 1895``.

    Returns ``(iso_date, Precision.MONTH)`` with the day fixed to 1, or
    ``None`` when the text does not match or month/year cannot be resolved.
    """
    hit = _MONTH_YEAR_RE.fullmatch(s)
    if hit is None:
        return None

    month_text, year_text = hit.groups()
    month_number = _lookup_month(month_text)
    full_year = expand_year(year_text)
    if full_year is None or not month_number:
        return None

    first_of_month = datetime.date(full_year, month_number, 1)
    return first_of_month.isoformat(), Precision.MONTH
|
||||||
|
|
||||||
|
|
||||||
|
def _match_feast_season(s):
    """Match ``<feast or season> <year>``, e.g. ``Ostern 1923`` or ``Sommer 1910``.

    Returns whatever ``resolve_feast_or_season`` yields for the token and
    expanded year, or ``None`` when the text does not match or the year
    cannot be expanded.
    """
    hit = _TOKEN_YEAR_RE.fullmatch(s)
    if hit is None:
        return None

    token, year_text = hit.groups()
    full_year = expand_year(year_text)
    return None if full_year is None else resolve_feast_or_season(token, full_year)
|
||||||
|
|
||||||
|
|
||||||
|
def _match_year_only(s):
    """Match a bare four-digit year, e.g. ``1923``.

    Returns ``(iso_date, Precision.YEAR)`` with the date fixed to 1 January,
    or ``None`` when the text is not four digits or the year is implausible.

    The year goes through ``expand_year`` like every other matcher, so gross
    typos (e.g. "9003") are sent to review rather than accepted, and "0000"
    no longer raises ``ValueError`` from ``datetime.date``.
    """
    if not _YEAR_ONLY_RE.fullmatch(s):
        return None
    year = expand_year(s)
    if year is None:
        return None
    return datetime.date(year, 1, 1).isoformat(), Precision.YEAR
|
||||||
|
|
||||||
|
|
||||||
|
def _match_range(s):
    """Match a date range and return its START as ``(iso_date, Precision.RANGE)``.

    Three shapes are tried in order:
      1. ``YYYY/YY`` year span (e.g. ``1923/24``) -> 1 January of the first year.
      2. Intra-month day range (e.g. ``7./8. Sept.1923``) -> the first day.
      3. Hyphen/en-dash range (e.g. ``3.5.1920 - 7.5.1920``) -> the part before the dash.

    Returns ``None`` when no shape matches or the start cannot be parsed.
    """
    year_span = _RANGE_YY_RE.fullmatch(s)
    if year_span:
        # Validate through expand_year so an implausible year (e.g. "0000/01",
        # which would make datetime.date raise) goes to review instead.
        year = expand_year(year_span.group(1))
        if year is None:
            return None
        return datetime.date(year, 1, 1).isoformat(), Precision.RANGE

    day_span = _RANGE_DAY_RE.fullmatch(s)
    if day_span:
        # Rebuild the first date: "7." + "Sept.1923" -> "7.Sept.1923".
        first = f"{day_span.group(1)}.{day_span.group(3)}"
        for matcher in (_match_numeric, _match_monthname_a):
            result = matcher(first)
            if result:
                return result[0], Precision.RANGE

    dash_span = _RANGE_HYPHEN_RE.fullmatch(s)
    if dash_span:
        start = dash_span.group(1).strip()
        for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only):
            result = matcher(start)
            if result:
                return result[0], Precision.RANGE

    return None
|
||||||
|
|
||||||
|
|
||||||
|
_MATCHERS = [
|
||||||
|
_match_iso,
|
||||||
|
_match_range,
|
||||||
|
_match_numeric,
|
||||||
|
_match_roman,
|
||||||
|
_match_monthname_a,
|
||||||
|
_match_month_year,
|
||||||
|
_match_monthname_b,
|
||||||
|
_match_feast_season,
|
||||||
|
_match_year_only,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate:
    """Parse one raw date cell into a ``ParsedDate``.

    Args:
        raw: The cell text as found in the spreadsheet.
        date_overrides: Optional mapping of stripped raw text to an
            ``(iso, precision)`` pair; a hit bypasses parsing entirely.

    Returns:
        A ``ParsedDate`` carrying the original ``raw`` text. Precision is
        ``APPROX`` when a date was found but the text had an uncertainty
        marker, and ``UNKNOWN`` (with ``iso=None``) when nothing matched.
    """
    if date_overrides:
        lookup = (raw or "").strip()
        if lookup in date_overrides:
            override_iso, override_precision = date_overrides[lookup]
            return ParsedDate(override_iso or None, Precision(override_precision), raw)

    cleaned, approx = _preprocess(raw)
    if cleaned:
        # First matcher that recognises the text wins; order is defined by _MATCHERS.
        for matcher in _MATCHERS:
            hit = matcher(cleaned)
            if hit:
                iso, precision = hit
                return ParsedDate(iso, Precision.APPROX if approx else precision, raw)

    return ParsedDate(None, Precision.UNKNOWN, raw)
|
||||||
|
|
||||||
|
|
||||||
|
def easter(year: int) -> datetime.date:
    """Return Gregorian Easter Sunday for ``year``.

    Implements the Anonymous Gregorian ("Butcher") algorithm; the
    intermediate names follow the usual presentation of that algorithm.
    """
    golden = year % 19
    century, year_in_century = divmod(year, 100)
    century_quarter, century_rest = divmod(century, 4)
    f = (century + 8) // 25
    g = (century - f + 1) // 3
    h = (19 * golden + century - century_quarter - g + 15) % 30
    leap_quarter, leap_rest = divmod(year_in_century, 4)
    weekday_shift = (32 + 2 * century_rest + 2 * leap_quarter - h - leap_rest) % 7
    correction = (golden + 11 * h + 22 * weekday_shift) // 451
    # The combined value encodes the month (quotient) and zero-based day (remainder).
    month, day_index = divmod(h + weekday_shift - 7 * correction + 114, 31)
    return datetime.date(year, month, day_index + 1)
|
||||||
119
tools/import-normalizer/documents.py
Normal file
119
tools/import-normalizer/documents.py
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
"""Document row extraction, triage, and the canonical document record."""
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from enum import Enum, auto
|
||||||
|
|
||||||
|
import dates as _dates
|
||||||
|
import tags as _tags
|
||||||
|
|
||||||
|
|
||||||
|
class Triage(Enum):
    """Outcome of the first-pass classification of a spreadsheet row."""

    OK = auto()  # row has a usable index
    EMPTY = auto()  # no cell contains any non-whitespace text
    BLANK_INDEX = auto()  # row has content but the index cell is empty/missing
    X_SUFFIX = auto()  # index ends with a lowercase "x"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RawRow:
    """One document row keyed by canonical field name, before normalization.

    All text fields default to the empty string.
    """

    # Row number in the source sheet, as supplied by the caller.
    source_row: int
    index: str = ""
    file: str = ""
    box: str = ""
    folder: str = ""
    sender: str = ""
    # Raw receivers cell; still a single string at this stage.
    receivers: str = ""
    # Raw date text; parsed later.
    date: str = ""
    location: str = ""
    # Raw tags cell; still a single string at this stage.
    tags: str = ""
    summary: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CanonicalDocument:
    """Normalized document record produced from a ``RawRow``."""

    index: str
    box: str = ""
    folder: str = ""
    # Resolved sender: person id plus display name.
    sender_person_id: str = ""
    sender_name: str = ""
    # Resolved receivers; presumably parallel lists (same order) — TODO confirm.
    receiver_person_ids: list = field(default_factory=list)
    receiver_names: list = field(default_factory=list)
    # Parsed date: ISO string, the original text, and the precision name.
    date_iso: str = ""
    date_raw: str = ""
    date_precision: str = ""
    location: str = ""
    tags: list = field(default_factory=list)
    summary: str = ""
    # Row number in the source sheet.
    source_row: int = 0
    # Review flags; presumably strings such as "unparsed_date" — TODO confirm against to_canonical.
    needs_review: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow:
    """Build a ``RawRow`` from one row of cells.

    Args:
        cells: The row's cell values.
        header: Mapping of canonical field name to column position.
        source_row: Row number to record on the result.

    Returns:
        A ``RawRow`` whose fields hold the stripped cell text; a field whose
        column is unmapped, lies beyond the end of the row, or holds a falsy
        value becomes the empty string.
    """
    values = {}
    for name in _FIELDS:
        column = header.get(name)
        available = column is not None and column < len(cells)
        values[name] = (cells[column] or "").strip() if available else ""
    return RawRow(source_row=source_row, **values)
|
||||||
|
|
||||||
|
|
||||||
|
def triage(cells: list[str], index_col: int = 0) -> Triage:
    """Classify a row before extraction.

    Args:
        cells: The row's cell values.
        index_col: Column position of the index cell.

    Returns:
        ``Triage.EMPTY`` when no cell has non-whitespace content,
        ``Triage.BLANK_INDEX`` when the index cell is empty or out of range,
        ``Triage.X_SUFFIX`` when the index ends with a lowercase "x",
        otherwise ``Triage.OK``.
    """
    if not any(c and str(c).strip() for c in cells):
        return Triage.EMPTY

    # str(...) mirrors the emptiness check above: a non-string index cell
    # (e.g. a number) must not crash on .strip().
    index = str(cells[index_col] or "").strip() if 0 <= index_col < len(cells) else ""
    if not index:
        return Triage.BLANK_INDEX
    if index.endswith("x"):
        return Triage.X_SUFFIX
    return Triage.OK
|
||||||
|
|
||||||
|
|
||||||
|
def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: label a row that has no index value.

    Returns 'section_banner' when at least one cell is populated and every
    populated cell sits in the sender or receivers column; otherwise
    'data_no_index'.
    """
    name_cols = {col for col in (header.get("sender"), header.get("receivers")) if col is not None}
    populated = [pos for pos, cell in enumerate(cells) if cell and str(cell).strip()]
    only_names = bool(populated) and all(pos in name_cols for pos in populated)
    return "section_banner" if only_names else "data_no_index"
|
||||||
|
|
||||||
|
|
||||||
|
def index_file_mismatch(index: str, file_path: str) -> bool:
    """Return True when the file name (without extension) differs from ``index``.

    A blank ``file_path`` never counts as a mismatch. Both "/" and "\\" are
    treated as directory separators. Assumes the Datei value is a filename with
    an extension (all corpus paths are *.pdf): only the text after the last "."
    is dropped.
    """
    if not file_path.strip():
        return False
    basename = file_path.replace("\\", "/").rpartition("/")[2]
    head, dot, _ext = basename.rpartition(".")
    stem = head if dot else basename
    return stem != index
|
||||||
|
|
||||||
|
|
||||||
|
def to_canonical(raw, ctx, date_overrides: dict, approved_themes: frozenset = frozenset()) -> CanonicalDocument:
    """Turn one raw row into a ``CanonicalDocument`` and collect its review flags.

    Args:
        raw: the extracted row (reads index, file, box, folder, sender, receivers,
            date, location, tags, summary, source_row).
        ctx: name-resolution context providing ``resolve_sender`` and
            ``resolve_receivers``.
        date_overrides: passed through to the date parser.
        approved_themes: passed through to tag generation.

    Flags are appended in this order when they apply: "unmatched_sender",
    "multi_sender", "unmatched_receiver", "unparsed_date", "index_file_mismatch".
    """
    parsed = _dates.parse_date(raw.date, date_overrides)

    sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(raw.sender, raw.source_row)
    # Each receiver is an (id, name, matched) triple.
    receivers = ctx.resolve_receivers(raw.receivers, raw.source_row)

    checks = (
        ("unmatched_sender", raw.sender.strip() and not sender_matched),
        ("multi_sender", sender_multi),
        ("unmatched_receiver", any(not matched for _, _, matched in receivers)),
        # Only a non-blank date cell can be "unparsed".
        ("unparsed_date", raw.date.strip() and parsed.precision == _dates.Precision.UNKNOWN),
        ("index_file_mismatch", index_file_mismatch(raw.index, raw.file)),
    )
    flags = [label for label, applies in checks if applies]

    tags = _tags.generate_tags(raw.tags, raw.summary, approved_themes)

    return CanonicalDocument(
        index=raw.index,
        box=raw.box,
        folder=raw.folder,
        sender_person_id=sender_id,
        sender_name=sender_name,
        receiver_person_ids=[entry[0] for entry in receivers],
        receiver_names=[entry[1] for entry in receivers],
        date_iso=parsed.iso or "",
        date_raw=raw.date,
        date_precision=str(parsed.precision),
        location=raw.location,
        tags=tags,
        summary=raw.summary,
        source_row=raw.source_row,
        needs_review=flags,
    )
|
||||||
50
tools/import-normalizer/ingest.py
Normal file
50
tools/import-normalizer/ingest.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
"""Read .xlsx sheets into neutral list[list[str]] and map headers to fields."""
|
||||||
|
import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import openpyxl
|
||||||
|
|
||||||
|
|
||||||
|
def _cell_to_str(value) -> str:
    """Render one spreadsheet cell value as text.

    None -> "", datetimes -> ISO date (time of day dropped), dates -> ISO date,
    integral floats -> integer text (12.0 -> "12"), bools/ints -> ``str(value)``,
    anything else -> ``str(value)`` with surrounding whitespace stripped.
    """
    if value is None:
        return ""
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime.datetime):
        return value.date().isoformat()
    if isinstance(value, datetime.date):
        return value.isoformat()
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    # bool is a subclass of int; both render through plain str().
    if isinstance(value, (bool, int)):
        return str(value)
    return str(value).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def read_sheet(path: Path, sheet_name: str) -> list[list[str]]:
    """Read one worksheet into a list of rows of strings.

    The workbook is opened read-only with ``data_only=True`` and every cell is
    converted with ``_cell_to_str``.

    Args:
        path: location of the .xlsx workbook.
        sheet_name: name of the worksheet to read.

    Returns:
        One list of cell strings per worksheet row, in sheet order.

    Raises:
        ValueError: if ``sheet_name`` is not a sheet of the workbook.
    """
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook keeps the file open until close() is called, so close it
    # on every exit path — including the missing-sheet error below.
    try:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"Sheet '{sheet_name}' not found in {Path(path).name}; sheets: {wb.sheetnames}")
        ws = wb[sheet_name]
        return [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)]
    finally:
        wb.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_header(text: str) -> str:
    """Lower-case a header label and collapse every whitespace run to one space."""
    words = text.lower().split()
    return " ".join(words)
|
||||||
|
|
||||||
|
|
||||||
|
def build_header_map(header_row: list[str], field_map: dict[str, str], required: set[str]):
    """Return (field->col_index, unknown_headers). Raise ValueError if a required field is missing.

    Header labels are compared after ``_norm_header``. When two columns map to the
    same field, the later column wins. Non-blank labels that are not in
    ``field_map`` are returned unchanged in ``unknown_headers``; blank labels are
    ignored.
    """
    fields: dict[str, int] = {}
    unknown: list[str] = []
    for col, label in enumerate(header_row):
        normalized = _norm_header(label)
        if normalized not in field_map:
            if label.strip():
                unknown.append(label)
            continue
        fields[field_map[normalized]] = col

    absent = required.difference(fields)
    if absent:
        raise ValueError(f"Required header(s) missing: {sorted(absent)} (found headers: {header_row})")
    return fields, unknown
|
||||||
171
tools/import-normalizer/normalize.py
Normal file
171
tools/import-normalizer/normalize.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
"""Orchestrator: read raw workbooks -> canonical outputs + review reports."""
|
||||||
|
import argparse
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import config
|
||||||
|
import ingest
|
||||||
|
import persons
|
||||||
|
import documents
|
||||||
|
import overrides as overrides_mod
|
||||||
|
import tags as _tags
|
||||||
|
import writers
|
||||||
|
|
||||||
|
|
||||||
|
def run(*, document_workbook, document_sheet, person_workbook, person_sheet,
        out_dir, review_dir, date_overrides, name_overrides,
        approved_themes_path=None) -> dict:
    """Normalize both workbooks, write canonical outputs and review files, return the stats.

    Args:
        document_workbook, document_sheet: workbook path and sheet name of the document list.
        person_workbook, person_sheet: workbook path and sheet name of the person register.
        out_dir: directory for the canonical .xlsx outputs.
        review_dir: directory for the review CSVs and summary.txt.
        date_overrides: raw date string -> override, consulted by the date parser.
        name_overrides: raw name string -> person id, consulted by name resolution.
        approved_themes_path: optional file of approved themes for tag generation.

    Returns:
        The statistics dict that is also written to ``review/summary.txt``.

    Raises:
        ValueError: if a sheet contains no rows at all (so there is no header row),
            or — from ``ingest.build_header_map`` — if a required header is missing.
    """
    out_dir, review_dir = Path(out_dir), Path(review_dir)

    # Built once: the set is identical for every document row.
    loaded_themes = _tags.load_approved_themes(Path(approved_themes_path)) if approved_themes_path else set()
    approved_themes = frozenset(loaded_themes)

    # --- persons ---
    person_rows = ingest.read_sheet(person_workbook, person_sheet)
    if not person_rows:
        raise ValueError(f"Sheet '{person_sheet}' in {person_workbook} is empty (no header row)")
    p_fields, _ = ingest.build_header_map(person_rows[0], config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS)
    person_dicts = [{f: (row[i] if i < len(row) else "") for f, i in p_fields.items()} for row in person_rows[1:]]
    register = persons.parse_register(person_dicts)
    alias_index = persons.AliasIndex(register)
    given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES)
    ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names)

    # --- documents ---
    doc_rows = ingest.read_sheet(document_workbook, document_sheet)
    if not doc_rows:
        raise ValueError(f"Sheet '{document_sheet}' in {document_workbook} is empty (no header row)")
    d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS)
    index_col = d_fields["index"]

    canon_docs, blank_index, skipped_x, mismatches = [], [], [], []
    unparsed_by_raw: dict[str, list] = {}
    dates_by_override = 0
    empty_count = 0
    seen_index = Counter()

    # Row 1 is the header, so data rows are numbered from 2 (spreadsheet numbering).
    for source_row, cells in enumerate(doc_rows[1:], start=2):
        t = documents.triage(cells, index_col)
        if t is documents.Triage.EMPTY:
            empty_count += 1
            continue
        if t is documents.Triage.BLANK_INDEX:
            blank_index.append([source_row, documents.classify_blank_index(cells, d_fields),
                                " | ".join(c for c in cells if c)])
            continue
        if t is documents.Triage.X_SUFFIX:
            idx = (cells[index_col] or "").strip()
            skipped_x.append([source_row, idx, idx[:-1]])
            continue
        raw = documents.extract_row(cells, d_fields, source_row)
        seen_index[raw.index] += 1
        if raw.date.strip() and raw.date.strip() in date_overrides:
            dates_by_override += 1
        doc = documents.to_canonical(raw, ctx, date_overrides, approved_themes)
        if "unparsed_date" in doc.needs_review:
            unparsed_by_raw.setdefault(raw.date, []).append(source_row)
        if "index_file_mismatch" in doc.needs_review:
            mismatches.append([source_row, raw.index, raw.file])
        canon_docs.append(doc)

    # REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them.
    dup_indexes = {idx for idx, n in seen_index.items() if n > 1}
    duplicates = []
    for doc in canon_docs:
        if doc.index in dup_indexes:
            if "duplicate_index" not in doc.needs_review:
                doc.needs_review.append("duplicate_index")
            duplicates.append([doc.source_row, doc.index])

    all_people = register + list(ctx.provisional.values())

    # --- write canonical outputs ---
    writers.write_documents_xlsx(canon_docs, out_dir / "canonical-documents.xlsx")
    writers.write_persons_xlsx(all_people, out_dir / "canonical-persons.xlsx")

    all_tag_paths = [path for doc in canon_docs for path in doc.tags]
    writers.write_tag_tree_xlsx(_tags.build_tag_tree(all_tag_paths), out_dir / "canonical-tag-tree.xlsx")

    # --- review files ---
    # unparsed dates: most-frequent first, with example source rows + blank override cells so a
    # corrected row can be pasted straight into overrides/dates.csv (same raw,iso,precision shape).
    unparsed_rows = sorted(
        ([raw, len(rows), " ".join(map(str, rows[:5])), "", ""] for raw, rows in unparsed_by_raw.items()),
        key=lambda r: (-r[1], r[0]))
    writers.write_review_csv(review_dir / "unparsed-dates.csv",
                             ["raw", "count", "example_rows", "suggested_iso", "suggested_precision"], unparsed_rows)

    writers.write_review_csv(review_dir / "duplicate-index.csv", ["source_row", "index"], duplicates)
    writers.write_review_csv(review_dir / "blank-index-rows.csv", ["source_row", "kind", "content"], blank_index)
    writers.write_review_csv(review_dir / "skipped-x-suffix.csv", ["source_row", "index", "base_index"], skipped_x)

    # Unresolved names: grouped per (category, name); sorted by category, then most frequent first.
    unresolved_agg: dict[tuple, list] = {}
    for name, category, row in ctx.unresolved:
        unresolved_agg.setdefault((category, name), []).append(row)
    unresolved_rows = sorted(
        ([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))]
         for (cat, name), rows in unresolved_agg.items()),
        key=lambda r: (r[0], -r[2], r[1]))
    writers.write_review_csv(review_dir / "unresolved-names.csv",
                             ["category", "raw", "count", "example_rows"], unresolved_rows)
    writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches)

    all_summaries = [doc.summary for doc in canon_docs if doc.summary]
    candidates = _tags.mine_summary_candidates(all_summaries)
    writers.write_review_csv(review_dir / "tag-candidates.csv", ["candidate", "count"],
                             [[c, n] for c, n in candidates])

    dated = sum(1 for d in canon_docs if d.date_raw.strip())
    unknown = sum(1 for d in canon_docs if d.date_raw.strip() and d.date_precision == "UNKNOWN")
    unknown_rate = f"{(100 * unknown / dated):.1f}%" if dated else "0.0%"

    # One pass over the unresolved occurrences; a category that never occurred counts as 0.
    unresolved_by_category = Counter(category for _, category, _ in ctx.unresolved)

    stats = {
        "# INPUTS": "",
        "document_rows_read": len(doc_rows) - 1,
        "register_persons": len(register),
        "unknown_headers": ", ".join(unknown_headers) or "(none)",
        "# OUTPUTS": "",
        "documents_emitted": len(canon_docs),
        "provisional_persons": len(ctx.provisional),
        "# DATES": "",
        "dated_rows": dated,
        "unparsed_dates": unknown,
        "unknown_date_rate": f"{unknown_rate} (target <=5%)",
        "distinct_unparsed_formats": len(unparsed_by_raw),
        "# NAMES": "",
        "unmatched_name_strings": len(ctx.unmatched),
        "unresolved_name_occurrences": len(ctx.unresolved),
        "unresolved_unknown": unresolved_by_category["unknown"],
        "unresolved_single_token": unresolved_by_category["single_token"],
        "unresolved_relational": unresolved_by_category["relational"],
        "unresolved_collective": unresolved_by_category["collective"],
        "unresolved_prose": unresolved_by_category["prose"],
        "unresolved_ambiguous_pair": unresolved_by_category["ambiguous_pair"],
        "# ANOMALIES": "",
        "empty_rows": empty_count,
        "blank_index_rows": len(blank_index),
        "skipped_x_suffix": len(skipped_x),
        "duplicate_index_rows": len(duplicates),
        "index_file_mismatches": len(mismatches),
        "# OVERRIDES": "",
        "date_overrides_loaded": len(date_overrides),
        "name_overrides_loaded": len(name_overrides),
        "dates_resolved_by_override": dates_by_override,
        "names_resolved_by_override": ctx.override_hits,
    }
    writers.write_summary(review_dir / "summary.txt", stats)
    return stats
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: load overrides, run the normalizer with the configured paths, print the stats."""
    # No options are defined; parsing still provides --help and rejects stray arguments.
    argparse.ArgumentParser(description="Normalize the family archive spreadsheets.").parse_args()

    overrides_dir = config.OVERRIDES_DIR
    date_overrides, name_overrides = overrides_mod.load_overrides(
        overrides_dir / "dates.csv", overrides_dir / "names.csv")

    stats = run(
        document_workbook=config.DOCUMENT_WORKBOOK,
        document_sheet=config.DOCUMENT_SHEET,
        person_workbook=config.PERSON_WORKBOOK,
        person_sheet=config.PERSON_SHEET,
        out_dir=config.OUT_DIR,
        review_dir=config.REVIEW_DIR,
        date_overrides=date_overrides,
        name_overrides=name_overrides,
        approved_themes_path=overrides_dir / "approved-themes.csv",
    )

    print("Normalization complete:")
    for key, value in stats.items():
        print(f" {key}: {value}")


if __name__ == "__main__":
    main()
|
||||||
3019
tools/import-normalizer/out/canonical-persons-tree.json
Normal file
3019
tools/import-normalizer/out/canonical-persons-tree.json
Normal file
File diff suppressed because it is too large
Load Diff
21
tools/import-normalizer/overrides.py
Normal file
21
tools/import-normalizer/overrides.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
"""Load human-supplied corrections. Missing files are not an error."""
|
||||||
|
import csv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _read_override_rows(path):
    """Return ``(raw, row)`` pairs for every CSV row with a non-blank ``raw`` value; [] if the file is missing."""
    pairs = []
    try:
        # utf-8-sig reads plain UTF-8 too, but also drops a leading BOM. Without that a
        # BOM (as written by Excel) turns the first header into "\ufeffraw" and every
        # row would be skipped without any warning.
        with open(path, encoding="utf-8-sig", newline="") as f:
            for row in csv.DictReader(f):
                raw = (row.get("raw") or "").strip()
                if raw:
                    pairs.append((raw, row))
    except FileNotFoundError:
        # Missing files are not an error: they contribute zero overrides.
        pass
    return pairs


def load_overrides(dates_path: Path, names_path: Path):
    """Load the human-supplied date and name corrections.

    Args:
        dates_path: CSV with header ``raw,iso,precision``.
        names_path: CSV with header ``raw,person_id``.

    Returns:
        ``(date_overrides, name_overrides)`` where ``date_overrides`` maps the
        trimmed raw date string to ``(iso, precision)`` (precision defaults to
        "UNKNOWN" when the cell is empty) and ``name_overrides`` maps the trimmed
        raw name string to a person id. Rows with a blank ``raw`` are ignored; a
        later row with the same ``raw`` replaces an earlier one. A missing file
        yields an empty dict.
    """
    date_overrides: dict[str, tuple[str, str]] = {
        raw: ((row.get("iso") or "").strip(), (row.get("precision") or "UNKNOWN").strip())
        for raw, row in _read_override_rows(dates_path)
    }
    name_overrides: dict[str, str] = {
        raw: (row.get("person_id") or "").strip()
        for raw, row in _read_override_rows(names_path)
    }
    return date_overrides, name_overrides
|
||||||
81
tools/import-normalizer/overrides/README.md
Normal file
81
tools/import-normalizer/overrides/README.md
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
# Overrides
|
||||||
|
|
||||||
|
Human corrections applied **deterministically on every run**. An override **wins** over the
|
||||||
|
automatic date parser / name matcher, so this is how you fix the residue the tool can't resolve
|
||||||
|
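on its own. The two CSV files documented below live here; both are read by `overrides.load_overrides()`. (A third file in this folder, `approved-themes.csv`, is loaded separately by `tags.load_approved_themes()`.)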
|
||||||
|
|
||||||
|
- Missing or header-only files are fine — they just contribute zero overrides.
|
||||||
|
- Keep these files committed to git (they're your curated corrections); the generated `out/`
|
||||||
|
and `review/` folders are *not* committed.
|
||||||
|
- Matching is **exact** on the `raw` value after trimming surrounding whitespace. Copy the
|
||||||
|
`raw` value verbatim from the matching `review/*.csv`.
|
||||||
|
|
||||||
|
## The iteration loop
|
||||||
|
|
||||||
|
1. Run `python normalize.py`.
|
||||||
|
2. Open `review/unparsed-dates.csv` and `review/unresolved-names.csv` (sorted by frequency).
|
||||||
|
3. Add correction rows here, then re-run. Repeat until the residue is acceptable.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `dates.csv` — fix unparseable dates
|
||||||
|
|
||||||
|
Header: `raw,iso,precision`
|
||||||
|
|
||||||
|
| column | meaning |
|
||||||
|
| --- | --- |
|
||||||
|
| `raw` | the date string exactly as written in the spreadsheet (= the `raw` column in `review/unparsed-dates.csv`). |
|
||||||
|
| `iso` | the corrected date as `YYYY-MM-DD`. For partial dates use the 1st: month-only → `YYYY-MM-01`, year-only → `YYYY-01-01`. Leave **empty** if truly unknown. |
|
||||||
|
| `precision` | one of `DAY`, `MONTH`, `SEASON`, `YEAR`, `RANGE`, `APPROX`, `UNKNOWN`. |
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```csv
|
||||||
|
raw,iso,precision
|
||||||
|
23.Juni 58,1958-06-23,DAY
|
||||||
|
8.März 60,1960-03-08,DAY
|
||||||
|
Mayo 18-1929,1929-05-18,DAY
|
||||||
|
Abril 10-929,1929-04-10,DAY
|
||||||
|
30.April,1909-04-30,DAY
|
||||||
|
Mai 1895,1895-05-01,MONTH
|
||||||
|
Herbst 1913,1913-10-01,SEASON
|
||||||
|
1945/46,1945-01-01,RANGE
|
||||||
|
um 1920,1920-01-01,APPROX
|
||||||
|
?,,UNKNOWN
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- `23.Juni 58` / `8.März 60` — two-digit years `58`/`60` fall in the parser's ambiguous
|
||||||
|
`58–72` band (just past the 1873–1957 window), so they aren't auto-parsed; here you assert 1958/1960.
|
||||||
|
- `Mayo`/`Abril` — Spanish month names (Mexican-branch letters) the parser doesn't know yet.
|
||||||
|
- `30.April` — month+day with no year; pick the year from the letter's context.
|
||||||
|
- Empty `iso` + `UNKNOWN` records a deliberate "unknown date" (stops it showing up as residue).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `names.csv` — map a name string to a canonical person
|
||||||
|
|
||||||
|
Header: `raw,person_id`
|
||||||
|
|
||||||
|
| column | meaning |
|
||||||
|
| --- | --- |
|
||||||
|
| `raw` | the sender/receiver name string exactly as written (= the `raw` column in `review/unresolved-names.csv`). For a multi-name cell that was split (e.g. `"Walter und Eugenie"`), use the **individual** name part. |
|
||||||
|
| `person_id` | the canonical id to map it to. **Must be a real id** from the `person_id` column of `out/canonical-persons.xlsx` (a register person or an already-created provisional). |
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```csv
|
||||||
|
raw,person_id
|
||||||
|
A.Klucke,klucke-anna
|
||||||
|
? Hans de Gruyter,de-gruyter-hans
|
||||||
|
Eltern Cram,cram-john-james
|
||||||
|
Tante Lolly,blomquist-charlotte
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Use this for partial / misspelled / illegible / aliased names that should point at a known person.
|
||||||
|
- It maps one string → **one** person. It does **not** split a two-person cell: for genuine
|
||||||
|
pairs like `Ella Anita` (flagged `ambiguous_pair`), there is no split-via-override yet — leave
|
||||||
|
them, or add both given names to `config.EXTRA_GIVEN_NAMES` so they keep getting flagged.
|
||||||
|
- Look up valid `person_id` values in `out/canonical-persons.xlsx`. An id that doesn't exist
|
||||||
|
there will create a dangling reference (no validation yet).
|
||||||
1
tools/import-normalizer/overrides/approved-themes.csv
Normal file
1
tools/import-normalizer/overrides/approved-themes.csv
Normal file
@@ -0,0 +1 @@
|
|||||||
|
candidate
|
||||||
|
1
tools/import-normalizer/overrides/dates.csv
Normal file
1
tools/import-normalizer/overrides/dates.csv
Normal file
@@ -0,0 +1 @@
|
|||||||
|
raw,iso,precision
|
||||||
|
1
tools/import-normalizer/overrides/names.csv
Normal file
1
tools/import-normalizer/overrides/names.csv
Normal file
@@ -0,0 +1 @@
|
|||||||
|
raw,person_id
|
||||||
|
336
tools/import-normalizer/persons.py
Normal file
336
tools/import-normalizer/persons.py
Normal file
@@ -0,0 +1,336 @@
|
|||||||
|
"""Person register parsing, name splitting, alias resolution."""
|
||||||
|
import difflib
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from collections import Counter
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from enum import StrEnum
|
||||||
|
|
||||||
|
import config
|
||||||
|
import dates
|
||||||
|
|
||||||
|
# German transliteration applied before generic accent stripping. The upper-case
# umlauts deliberately map to lower-case digraphs; both callers lower-case the result.
_DIACRITIC_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
                                "Ä": "ae", "Ö": "oe", "Ü": "ue"})


def _strip_accents(s: str) -> str:
    """Transliterate German umlauts/ß and drop all remaining diacritics.

    The text is first composed to NFC so that an umlaut stored as base letter +
    combining diaeresis ("u" + U+0308) hits the transliteration table exactly like
    the precomposed "ü". Without that step the decomposed form would fall through
    to generic stripping and yield "u" instead of "ue", giving the same name two
    different normal forms.

    Args:
        s: arbitrary text.

    Returns:
        ``s`` with ä/ö/ü/ß transliterated and every combining mark removed
        (after NFKD decomposition).
    """
    s = unicodedata.normalize("NFC", s)
    s = s.translate(_DIACRITIC_MAP)
    s = unicodedata.normalize("NFKD", s)
    return "".join(c for c in s if not unicodedata.combining(c))
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(last: str, first: str) -> str:
    """Build a person id of the form ``last-first``.

    The combined name is accent-stripped and lower-cased, every run of characters
    outside a-z/0-9 becomes a single "-", and leading/trailing dashes are removed.
    Returns "unknown" when nothing is left.
    """
    combined = _strip_accents(f"{last} {first}".strip()).lower()
    slug = re.sub(r"[^a-z0-9]+", "-", combined).strip("-")
    return slug if slug else "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Person:
|
||||||
|
person_id: str
|
||||||
|
last_name: str = ""
|
||||||
|
first_name: str = ""
|
||||||
|
maiden_name: str = ""
|
||||||
|
title: str = ""
|
||||||
|
nickname: str = ""
|
||||||
|
extra_given_names: list[str] = field(default_factory=list)
|
||||||
|
birth_date: str | None = None
|
||||||
|
birth_date_raw: str = ""
|
||||||
|
birth_place: str = ""
|
||||||
|
death_date: str | None = None
|
||||||
|
death_date_raw: str = ""
|
||||||
|
death_place: str = ""
|
||||||
|
spouse: str = ""
|
||||||
|
generation: str = ""
|
||||||
|
notes: str = ""
|
||||||
|
aliases: list[str] = field(default_factory=list)
|
||||||
|
provisional: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
# Matches a value wrapped entirely in quote characters and captures the trimmed inner
# text; parse_register applies it to the spouse cell to detect a quoted nickname.
_QUOTED_RE = re.compile(r'^[""\']\s*(.+?)\s*[""\']$')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_register(rows: list[dict]) -> list[Person]:
    """Convert register rows (field name -> cell text) into ``Person`` objects.

    - Rows without a last name are skipped.
    - ``first_name`` may hold several comma-separated given names; the first one
      becomes ``first_name``, the rest ``extra_given_names``.
    - A spouse cell that is entirely quoted is taken as the person's nickname and
      the spouse is left empty.
    - Birth/death dates are parsed to ISO form; the original text is kept in the
      ``*_raw`` fields.
    - Ids come from ``slugify(last, first)``. When several persons share an id,
      every one of them receives a numeric suffix (-1, -2, …) in row order, so no
      id is left as an ambiguous "base". Unique ids are untouched.
    """
    def text(row: dict, key: str) -> str:
        return (row.get(key) or "").strip()

    people = []
    for row in rows:
        last = text(row, "last_name")
        if not last:
            continue

        givens = [g.strip() for g in text(row, "first_name").split(",") if g.strip()]
        first, extra = (givens[0], givens[1:]) if givens else ("", [])

        spouse = text(row, "spouse")
        quoted = _QUOTED_RE.match(spouse)
        nickname = ""
        if quoted:
            nickname, spouse = quoted.group(1), ""

        birth = dates.parse_date(row.get("birth_date") or "")
        death = dates.parse_date(row.get("death_date") or "")

        people.append(Person(
            person_id=slugify(last, first),
            last_name=last,
            first_name=first,
            maiden_name=text(row, "maiden_name"),
            nickname=nickname,
            extra_given_names=extra,
            birth_date=birth.iso,
            birth_date_raw=text(row, "birth_date"),
            birth_place=text(row, "birth_place"),
            death_date=death.iso,
            death_date_raw=text(row, "death_date"),
            death_place=text(row, "death_place"),
            spouse=spouse,
            generation=text(row, "generation"),
            notes=text(row, "notes"),
            provisional=False,
        ))

    # Collision handling: count ids first, then number the members of each colliding group.
    id_counts = Counter(person.person_id for person in people)
    next_suffix = Counter()
    for person in people:
        base = person.person_id
        if id_counts[base] <= 1:
            continue
        next_suffix[base] += 1
        person.person_id = f"{base}-{next_suffix[base]}"
    return people
|
||||||
|
|
||||||
|
|
||||||
|
# Trailing "geb. …" clause (optional leading comma, case-insensitive) through end of string;
# split_receivers removes it before splitting. Presumably the German maiden-name suffix — confirm.
_GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", re.I)
|
||||||
|
# Parenthesized group at the very end of the string; split_receivers uses its content as a shared last name.
_PAREN_RE = re.compile(r"\(([^)]+)\)\s*$")
|
||||||
|
# Separator between several names in one cell: "und" or a bare "u" surrounded by whitespace (case-insensitive).
# NOTE(review): the abbreviated form "u." (with a period) does not match — confirm whether that is intended.
_MULTI_RE = re.compile(r"\s+(?:und|u)\s+", re.I)
|
||||||
|
|
||||||
|
|
||||||
|
def find_known_last_name(segment: str) -> str | None:
    """Return the first configured last name that ``segment`` equals or ends with as a separate word.

    ``config.KNOWN_LAST_NAMES`` is scanned in its listed order (config lists
    longest-first). Returns None when nothing matches.
    """
    seg = segment.strip()
    matches = (name for name in config.KNOWN_LAST_NAMES
               if seg == name or seg.endswith(" " + name))
    return next(matches, None)
|
||||||
|
|
||||||
|
|
||||||
|
def split_receivers(raw: str) -> list[str]:
    """Split a sender/receiver cell into individual name strings.

    Steps:
      1. Blank input -> [].
      2. "//" separates independent segments; each is split recursively.
      3. A trailing "geb. …" clause is removed (a cell consisting only of such a
         clause yields []).
      4. Without an "und"/"u" separator the cleaned string is a single name.
      5. Otherwise a trailing "(Lastname)" is taken as shared last name, the rest
         is split on the separator, and the word "Familie" on its own is dropped.
      6. With a shared last name, every single-word part gets it appended.
         Otherwise, if the last part ends with a known last name, that name is
         appended to each earlier single-word part that is not itself a known
         last name.
    """
    if not raw or not raw.strip():
        return []

    if "//" in raw:
        return [name for segment in raw.split("//") for name in split_receivers(segment)]

    cleaned = _GEB_RE.sub("", raw).strip()
    if not cleaned:  # e.g. a "geb. Müller"-only cell strips to empty
        return []
    if not _MULTI_RE.search(cleaned):
        return [cleaned]

    shared_last = None
    paren = _PAREN_RE.search(cleaned)
    if paren:
        shared_last = paren.group(1).strip()
        cleaned = cleaned[:paren.start()].strip()

    pieces = (piece.strip() for piece in _MULTI_RE.split(cleaned))
    parts = [piece for piece in pieces if piece and piece.lower() != "familie"]
    if len(parts) <= 1:
        return parts

    if shared_last:
        return [part if " " in part else f"{part} {shared_last}" for part in parts]

    last_seg = parts[-1]
    detected = find_known_last_name(last_seg)
    if not detected:
        return parts

    completed = [
        f"{part} {detected}" if " " not in part and find_known_last_name(part) is None else part
        for part in parts[:-1]
    ]
    completed.append(last_seg)
    return completed
|
||||||
|
|
||||||
|
|
||||||
|
def _norm(name: str) -> str:
    """Normalize a name for lookups: strip accents, lower-case, turn "." into a space, collapse whitespace."""
    lowered = _strip_accents(name).lower()
    without_dots = lowered.replace(".", " ")
    return re.sub(r"\s+", " ", without_dots).strip()
|
||||||
|
|
||||||
|
|
||||||
|
class NameClass(StrEnum):
|
||||||
|
RESOLVABLE = "resolvable"
|
||||||
|
UNKNOWN = "unknown"
|
||||||
|
SINGLE_TOKEN = "single_token"
|
||||||
|
RELATIONAL = "relational"
|
||||||
|
COLLECTIVE = "collective"
|
||||||
|
PROSE = "prose"
|
||||||
|
AMBIGUOUS_PAIR = "ambiguous_pair"
|
||||||
|
|
||||||
|
|
||||||
|
# Straight and typographic quote characters; classify_name treats a string containing any of them as PROSE.
_QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019"
|
||||||
|
|
||||||
|
|
||||||
|
def classify_name(raw: str, given_names: set[str]) -> NameClass:
    """Classify a (post-split) sender/receiver string by why it may be unresolvable.

    Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL ->
    SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE. A blank string is RESOLVABLE.

    Args:
        raw: the name string to classify.
        given_names: normalized given names, used to spot a pair of two given names.
    """
    name = raw.strip()
    if not name:
        return NameClass.RESOLVABLE

    lowered = name.lower()
    tokens = name.split()

    if "?" in name or any(marker in lowered for marker in config.UNKNOWN_NAME_MARKERS):
        return NameClass.UNKNOWN

    has_digit = any(ch.isdigit() for ch in name)
    has_quote = any(quote in name for quote in _QUOTE_CHARS)
    if len(name) > config.PROSE_MAX_LEN or has_digit or has_quote or len(tokens) > 3:
        return NameClass.PROSE

    # alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms
    # are matched as whole words (no substring/prefix false positives like "Allerton").
    alpha_words = re.findall(r"[a-zäöüß]+", lowered)
    for terms, verdict in ((config.COLLECTIVE_TERMS, NameClass.COLLECTIVE),
                           (config.RELATIONAL_TERMS, NameClass.RELATIONAL)):
        if any(word in terms for word in alpha_words):
            return verdict

    if len(tokens) == 1:
        return NameClass.SINGLE_TOKEN
    if len(tokens) == 2 and all(_norm(token) in given_names for token in tokens):
        return NameClass.AMBIGUOUS_PAIR
    return NameClass.RESOLVABLE
|
||||||
|
|
||||||
|
|
||||||
|
# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is
|
||||||
|
# classified PROSE. Such multi-particle names are rare here and usually resolve via the
|
||||||
|
# register; if they surface in review, lower-priority than the real prose entries.
|
||||||
|
|
||||||
|
|
||||||
|
def build_given_names(register: list[Person], extra: set[str]) -> set[str]:
    """Set of normalized given names from the register (first + extra given) plus a supplement.

    Used by classify_name to tell a two-given-name pair (two people) from a first+surname.
    Persons with an empty first name contribute only their extra given names.
    """
    collected: list[str] = []
    for person in register:
        if person.first_name:
            collected.append(person.first_name)
        collected.extend(person.extra_given_names)
    collected.extend(extra)
    return {_norm(given) for given in collected}
|
||||||
|
|
||||||
|
|
||||||
|
class AliasIndex:
    """Lookup from normalized name forms to person ids, built from a list of persons."""

    def __init__(self, people: list[Person]):
        """Register every person's name forms.

        Forms per person, in order: "first last", "first maiden" (if a maiden name
        exists), "extra-given last" for each extra given name, and the nickname.
        The first person to claim a normalized form keeps it; each form a person
        actually obtains is also appended to that person's ``aliases``. Finally a
        bare first name is registered when exactly one person carries it and the
        key is still free.
        """
        self._by_alias: dict[str, str] = {}
        self._display: dict[str, str] = {}
        self.known_ids: set[str] = {person.person_id for person in people}

        ids_by_first_name: dict[str, list] = {}
        for person in people:
            full_name = f"{person.first_name} {person.last_name}".strip()
            self._display[person.person_id] = full_name

            forms = [full_name]
            if person.maiden_name:
                forms.append(f"{person.first_name} {person.maiden_name}".strip())
            forms.extend(f"{given} {person.last_name}".strip() for given in person.extra_given_names)
            if person.nickname:
                forms.append(person.nickname)

            # dict.fromkeys de-duplicates while keeping first-seen order (NOT a set),
            # so alias order is deterministic — NFR-IDEM-01.
            for form in dict.fromkeys(forms):
                key = _norm(form)
                if key and key not in self._by_alias:
                    self._by_alias[key] = person.person_id
                    person.aliases.append(form)

            if person.first_name:
                holders = ids_by_first_name.setdefault(_norm(person.first_name), [])
                if person.person_id not in holders:
                    holders.append(person.person_id)

        # first-name-only alias, only when unambiguous
        for first_key, holders in ids_by_first_name.items():
            if len(holders) == 1 and first_key not in self._by_alias:
                self._by_alias[first_key] = holders[0]

    def resolve(self, name: str):
        """Return the person id registered for ``name`` (after normalization), or None."""
        key = _norm(name)
        return self._by_alias.get(key)

    def display(self, person_id: str) -> str:
        """Return the "first last" display name for ``person_id``, or "" if the id is unknown."""
        return self._display.get(person_id, "")

    def suggest(self, name: str):
        """Return ``(person_id, similarity)`` for the closest registered alias.

        Only aliases scoring at least ``config.FUZZY_SUGGEST_THRESHOLD`` qualify;
        returns ``(None, 0.0)`` when there is none.
        """
        target = _norm(name)
        candidates = difflib.get_close_matches(
            target, list(self._by_alias.keys()), n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD)
        if not candidates:
            return None, 0.0
        best = candidates[0]
        score = difflib.SequenceMatcher(None, target, best).ratio()
        return self._by_alias[best], score
|
||||||
|
|
||||||
|
|
||||||
|
class ResolutionContext:
    """Turns raw name strings into person ids.

    Names found neither in the overrides nor in the alias index receive a
    provisional ``Person``; review data is collected along the way.
    """

    def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str],
                 given_names: set[str] | None = None):
        self.index = alias_index
        self.name_overrides = name_overrides
        self.given_names = given_names if given_names else set()
        # provisional persons created for unmatched names, keyed by their id
        self.provisional: dict[str, Person] = {}
        # unmatched raw name -> every source row it was seen in
        self.unmatched: dict[str, list] = {}
        # (raw_name, category, source_row) for non-RESOLVABLE names
        self.unresolved: list[tuple] = []
        # raw name -> provisional id already issued for it
        self._raw_to_pid: dict[str, str] = {}
        # number of lookups answered by name_overrides
        self.override_hits = 0

    def _unique_id(self, base: str) -> str:
        """Return ``base`` or the first free ``base-2``, ``base-3``, … variant.

        An id counts as taken when it is a register id or an existing provisional id.
        """
        taken = self.index.known_ids | set(self.provisional)
        if base not in taken:
            return base
        suffix = 2
        while f"{base}-{suffix}" in taken:
            suffix += 1
        return f"{base}-{suffix}"

    def _as_match(self, pid: str, fallback: str):
        """Build the ``(pid, display, True)`` triple, falling back to the raw name for display."""
        return pid, self.index.display(pid) or fallback, True

    def resolve_one(self, raw_name: str, source_row: int):
        """Return ``(person_id, display_name, matched)``; a blank name gives ``('', '', True)``."""
        name = (raw_name or "").strip()
        if not name:
            return "", "", True

        # explicit overrides win over the alias index
        if name in self.name_overrides:
            self.override_hits += 1
            return self._as_match(self.name_overrides[name], name)

        indexed = self.index.resolve(name)
        if indexed:
            return self._as_match(indexed, name)

        # unmatched: record for review, then hand out a provisional person
        # (never reusing a register id)
        self.unmatched.setdefault(name, []).append(source_row)
        category = classify_name(name, self.given_names)
        if category is not NameClass.RESOLVABLE:
            self.unresolved.append((name, str(category), source_row))

        if name not in self._raw_to_pid:
            last, first = _last_first(name)
            new_pid = self._unique_id(slugify(last, first))
            self.provisional[new_pid] = Person(
                person_id=new_pid, last_name=last, first_name=first, provisional=True
            )
            self._raw_to_pid[name] = new_pid
        return self._raw_to_pid[name], name, False

    def resolve_sender(self, raw: str, source_row: int):
        """Resolve a sender cell, split like receivers (REQ-PERS-01).

        Returns ``(person_id, display_name, matched, multi)`` for the first part;
        ``multi`` is True when the cell held more than one name. The remaining
        parts are still resolved so they are registered as persons.
        """
        parts = split_receivers(raw)
        if not parts:
            return "", "", True, False
        primary, *others = parts
        pid, name, matched = self.resolve_one(primary, source_row)
        for other in others:
            self.resolve_one(other, source_row)
        return pid, name, matched, bool(others)

    def resolve_receivers(self, raw: str, source_row: int):
        """Resolve every receiver in ``raw``; one ``resolve_one`` triple per part."""
        resolved = []
        for part in split_receivers(raw):
            resolved.append(self.resolve_one(part, source_row))
        return resolved
|
||||||
|
|
||||||
|
|
||||||
|
def _last_first(name: str):
    """Split a free-form name into ``(last, first)`` on a best-effort basis.

    Used for slug and provisional-person building. A last name reported by
    ``find_known_last_name`` wins; otherwise the final whitespace-separated
    token is the last name. A single token comes back as ``(name, "")``.
    """
    cleaned = name.strip()
    known_last = find_known_last_name(cleaned)
    if known_last:
        # NOTE(review): cuts len(known_last) characters off the end, i.e. assumes
        # the known last name is the trailing part of the string — confirm.
        return known_last, cleaned[: -len(known_last)].strip()

    words = cleaned.split()
    if len(words) < 2:
        return cleaned, ""
    return words[-1], " ".join(words[:-1])
|
||||||
409
tools/import-normalizer/persons_tree.py
Normal file
409
tools/import-normalizer/persons_tree.py
Normal file
@@ -0,0 +1,409 @@
|
|||||||
|
"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json."""
|
||||||
|
import argparse
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import config
|
||||||
|
import dates
|
||||||
|
from persons import _strip_accents
|
||||||
|
|
||||||
|
|
||||||
|
# Inclusive range of years accepted as a birth/death year.
_MIN_YEAR = 1700
_MAX_YEAR = 2100
# Typo window: when parse_date reads a pure-digit string as a year outside
# [_MIN_YEAR, _MAX_YEAR] but inside [_PLAUSIBLE_TYPO_MIN, _PLAUSIBLE_TYPO_MAX],
# the value is treated as a mistyped year (e.g. "1023" for "1923") and is NOT
# converted as an Excel serial. Years outside the window (e.g. 7568) are
# implausible as years, so serial conversion is attempted instead.
_PLAUSIBLE_TYPO_MIN = 1000
_PLAUSIBLE_TYPO_MAX = 3000
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_year(raw: str | None) -> int | None:
|
||||||
|
"""Extract a birth/death year from an Excel cell string.
|
||||||
|
|
||||||
|
Handles three cases:
|
||||||
|
1. ISO / German / text string parseable by parse_date() → extract year if in range
|
||||||
|
2. Pure-integer string (out-of-range or unparseable) → try Excel serial conversion
|
||||||
|
(unless it's a plausible typo year, e.g., "1023" for "1923")
|
||||||
|
3. Mixed-format or unresolvable → None
|
||||||
|
|
||||||
|
Serial conversion only fires for pure-digit strings and implausible years,
|
||||||
|
preventing typo years like "1023" from being mis-converted as serials.
|
||||||
|
"""
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
s = str(raw).strip()
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Check if it's a pure-digit string (candidate for serial conversion)
|
||||||
|
is_pure_digit = re.fullmatch(r"\d+", s) is not None
|
||||||
|
|
||||||
|
# Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.)
|
||||||
|
result = dates.parse_date(s)
|
||||||
|
if result.iso:
|
||||||
|
year = int(result.iso[:4])
|
||||||
|
if _MIN_YEAR <= year <= _MAX_YEAR:
|
||||||
|
return year
|
||||||
|
# Year is out of range. Only try serial conversion if it's an implausible year.
|
||||||
|
# Plausible typos (e.g., 1023 for 1923) should not be converted as serials.
|
||||||
|
if is_pure_digit and not (_PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX):
|
||||||
|
n = int(s)
|
||||||
|
if 1 <= n <= 80_000:
|
||||||
|
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
||||||
|
if _MIN_YEAR <= d.year <= _MAX_YEAR:
|
||||||
|
return d.year
|
||||||
|
return None
|
||||||
|
|
||||||
|
# parse_date() found nothing. Try serial conversion only for pure-digit strings.
|
||||||
|
if is_pure_digit:
|
||||||
|
n = int(s)
|
||||||
|
if 1 <= n <= 80_000:
|
||||||
|
d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n)
|
||||||
|
if _MIN_YEAR <= d.year <= _MAX_YEAR:
|
||||||
|
return d.year
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_generation(raw: str | None) -> int | None:
|
||||||
|
"""Extract the generation integer from column A values like 'G 3', 'G3', 'G 0'."""
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
m = re.search(r"\d+", str(raw))
|
||||||
|
return int(m.group()) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
# Lower-case tokens that _norm_tree drops from a name before matching
# (its docstring calls them geographic/honorific suffix tokens).
_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"}
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_tree(s: str) -> str:
    """Normalize a name string into the key form used for tree matching.

    Steps, in order: trim and strip surrounding quotes; remove parenthesised
    substrings; run ``_strip_accents``, lowercase and turn dots into spaces;
    drop tokens listed in ``_GEO_SUFFIXES``; re-join with single spaces and
    strip leftover '.', ',' and spaces from both ends.
    """
    text = (s or "").strip().strip("\"'")
    text = re.sub(r"\([^)]*\)", "", text)
    text = _strip_accents(text).lower().replace(".", " ")
    kept = []
    for token in text.split():
        if token not in _GEO_SUFFIXES:
            kept.append(token)
    return " ".join(kept).strip("., ")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_index(persons: list[dict]) -> dict[str, list[str]]:
|
||||||
|
"""Build a name → [rowId, …] lookup index with four keys per person."""
|
||||||
|
index: dict[str, list[str]] = {}
|
||||||
|
|
||||||
|
def _add(key: str, row_id: str) -> None:
|
||||||
|
if key:
|
||||||
|
index.setdefault(key, []).append(row_id)
|
||||||
|
|
||||||
|
for p in persons:
|
||||||
|
row_id = p["rowId"]
|
||||||
|
first = p.get("firstName") or ""
|
||||||
|
last = p.get("lastName") or ""
|
||||||
|
maiden = p.get("maidenName") or ""
|
||||||
|
|
||||||
|
_add(_norm_tree(f"{first} {last}"), row_id)
|
||||||
|
_add(_norm_tree(f"{last} {first}"), row_id)
|
||||||
|
if maiden:
|
||||||
|
_add(_norm_tree(f"{first} {maiden}"), row_id)
|
||||||
|
_add(_norm_tree(last), row_id)
|
||||||
|
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]:
    """Look ``raw`` up in ``index`` after normalization.

    Returns ``(row_id, None)`` for exactly one hit, otherwise ``(None, reason)``
    with reason "empty", "not_found" or "ambiguous".
    """
    key = _norm_tree(raw)
    if not key:
        return None, "empty"

    hits = index.get(key, [])
    if not hits:
        return None, "not_found"
    if len(hits) > 1:
        return None, "ambiguous"
    return hits[0], None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_row(row_num: int, fields: dict) -> dict:
    """Produce one person record from a header-mapped row dict.

    Dates that cannot be reduced to a year are preserved in ``notes`` as
    "[Geburtsdatum: …]" / "[Todesdatum: …]". Internal keys prefixed with '_'
    are stripped before JSON output in main().
    """
    def s(key: str) -> str:
        # str() keeps numeric or date cell values from crashing on .strip();
        # missing and falsy cells still become "".
        return str(fields.get(key) or "").strip()

    birth_raw = s("birth_date")
    death_raw = s("death_date")

    birth_year = _parse_year(birth_raw)
    death_year = _parse_year(death_raw)

    notes_parts = []
    if birth_raw and birth_year is None:
        notes_parts.append(f"[Geburtsdatum: {birth_raw}]")
    if death_raw and death_year is None:
        notes_parts.append(f"[Todesdatum: {death_raw}]")
    bemerkung = s("notes")
    if bemerkung:
        notes_parts.append(bemerkung)

    return {
        "rowId": f"row_{row_num:03d}",
        "firstName": s("first_name"),
        "lastName": s("last_name"),
        "maidenName": s("maiden_name") or None,
        "alias": None,
        "notes": " ".join(notes_parts) or None,
        "birthYear": birth_year,
        "deathYear": death_year,
        "birthPlace": s("birth_place") or None,
        "deathPlace": s("death_place") or None,
        "generation": _parse_generation(s("generation")),
        "familyMember": True,
        "_spouse_raw": s("spouse") or None,
        "_bemerkung_raw": bemerkung or None,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]:
|
||||||
|
"""Remove duplicate rows. Two-stage:
|
||||||
|
|
||||||
|
1. Exact (firstName, lastName, birthYear) match.
|
||||||
|
2. (firstName, lastName) where the later entry has birthYear=None and an earlier
|
||||||
|
entry already has a known birthYear.
|
||||||
|
"""
|
||||||
|
seen_full: dict[tuple, str] = {} # (first, last, year) -> rowId
|
||||||
|
seen_name: dict[tuple, str] = {} # (first, last) -> rowId of first entry with a year
|
||||||
|
result: list[dict] = []
|
||||||
|
skipped: list[str] = []
|
||||||
|
|
||||||
|
for p in persons:
|
||||||
|
first, last, year = p["firstName"], p["lastName"], p["birthYear"]
|
||||||
|
key_full = (first, last, year)
|
||||||
|
key_name = (first, last)
|
||||||
|
|
||||||
|
if key_full in seen_full:
|
||||||
|
skipped.append(f"{p['rowId']} duplicates {seen_full[key_full]} ({first} {last}, year={year})")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if year is None and key_name in seen_name:
|
||||||
|
skipped.append(f"{p['rowId']} duplicates {seen_name[key_name]} ({first} {last}, no birth year)")
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen_full[key_full] = p["rowId"]
|
||||||
|
if year is not None:
|
||||||
|
seen_name[key_name] = p["rowId"]
|
||||||
|
|
||||||
|
result.append(p)
|
||||||
|
|
||||||
|
return result, skipped
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_spouses(
|
||||||
|
persons: list[dict], index: dict[str, list[str]]
|
||||||
|
) -> tuple[list[dict], list[dict]]:
|
||||||
|
"""Emit SPOUSE_OF edges from each person's _spouse_raw field."""
|
||||||
|
relationships: list[dict] = []
|
||||||
|
unresolved: list[dict] = []
|
||||||
|
emitted: set[frozenset] = set()
|
||||||
|
|
||||||
|
for p in persons:
|
||||||
|
raw = (p.get("_spouse_raw") or "").strip()
|
||||||
|
if not raw:
|
||||||
|
continue
|
||||||
|
row_id = p["rowId"]
|
||||||
|
matched_id, reason = _resolve_one(raw, index)
|
||||||
|
if matched_id:
|
||||||
|
edge = frozenset([row_id, matched_id])
|
||||||
|
if edge not in emitted:
|
||||||
|
emitted.add(edge)
|
||||||
|
relationships.append({
|
||||||
|
"personId": row_id,
|
||||||
|
"relatedPersonId": matched_id,
|
||||||
|
"type": "SPOUSE_OF",
|
||||||
|
"source": "verheiratet_mit",
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
unresolved.append({
|
||||||
|
"rowId": row_id,
|
||||||
|
"field": "verheiratet_mit",
|
||||||
|
"raw": raw,
|
||||||
|
"reason": reason,
|
||||||
|
})
|
||||||
|
|
||||||
|
return relationships, unresolved
|
||||||
|
|
||||||
|
|
||||||
|
_CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on)?\s+(.+)", re.I)
|
||||||
|
_PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on)?\s+(.+)", re.I)
|
||||||
|
_AND_RE = re.compile(r"\s+u(?:nd)?\s+", re.I)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_bemerkung(
|
||||||
|
row_id: str, bemerkung: str, index: dict[str, list[str]]
|
||||||
|
) -> tuple[list[dict], list[dict], str]:
|
||||||
|
"""Extract PARENT_OF edges from a Bemerkung cell.
|
||||||
|
|
||||||
|
Returns (relationships, unresolved, remaining_notes).
|
||||||
|
Text that doesn't match a parent pattern goes to remaining_notes unchanged.
|
||||||
|
"""
|
||||||
|
if not bemerkung or not bemerkung.strip():
|
||||||
|
return [], [], ""
|
||||||
|
|
||||||
|
s = bemerkung.strip()
|
||||||
|
|
||||||
|
for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")):
|
||||||
|
m = pattern.match(s)
|
||||||
|
if not m:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Split the captured group on the first comma or semicolon to separate
|
||||||
|
# the name part from any trailing description (e.g. ", nach Mexiko emigriert")
|
||||||
|
raw_names, _, trailing = m.group(1).strip().partition(",")
|
||||||
|
if not trailing:
|
||||||
|
raw_names, _, trailing = raw_names.partition(";")
|
||||||
|
name_part = raw_names.strip().rstrip("!., ")
|
||||||
|
remainder = trailing.strip().lstrip(".,! ")
|
||||||
|
parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()]
|
||||||
|
rels: list[dict] = []
|
||||||
|
unres: list[dict] = []
|
||||||
|
|
||||||
|
for part in parts:
|
||||||
|
part = part.rstrip("!., ")
|
||||||
|
matched_id, reason = _resolve_one(part, index)
|
||||||
|
if matched_id:
|
||||||
|
if direction == "child":
|
||||||
|
rels.append({
|
||||||
|
"personId": matched_id,
|
||||||
|
"relatedPersonId": row_id,
|
||||||
|
"type": "PARENT_OF",
|
||||||
|
"source": "bemerkung",
|
||||||
|
"rawBemerkung": bemerkung,
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
rels.append({
|
||||||
|
"personId": row_id,
|
||||||
|
"relatedPersonId": matched_id,
|
||||||
|
"type": "PARENT_OF",
|
||||||
|
"source": "bemerkung",
|
||||||
|
"rawBemerkung": bemerkung,
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
unres.append({
|
||||||
|
"rowId": row_id,
|
||||||
|
"field": "bemerkung",
|
||||||
|
"raw": bemerkung,
|
||||||
|
"reason": reason,
|
||||||
|
})
|
||||||
|
|
||||||
|
return rels, unres, remainder
|
||||||
|
|
||||||
|
# No pattern matched — full text goes to notes, nothing to unresolved
|
||||||
|
return [], [], s
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: normalize the person workbook into the tree JSON.

    Pass 1 parses the rows (skipping those without a last name) and removes
    duplicates; pass 2 resolves spouse and parent relationships. Statistics are
    printed to stdout, skipped duplicates to stderr. With ``--dry-run`` the
    first unresolved entries are printed and nothing is written. Exits with
    status 1 when the sheet is empty.
    """
    parser = argparse.ArgumentParser(
        description="Normalize Personendatei 2.xlsx → canonical-persons-tree.json"
    )
    parser.add_argument(
        "--input", default=str(config.PERSON_WORKBOOK),
        help="Path to Personendatei 2.xlsx"
    )
    parser.add_argument(
        "--output", default=str(config.OUT_DIR / "canonical-persons-tree.json"),
        help="Path for output JSON"
    )
    parser.add_argument("--dry-run", action="store_true", help="Print stats, skip write")
    args = parser.parse_args()

    from ingest import read_sheet, build_header_map

    rows = read_sheet(Path(args.input), config.PERSON_SHEET)
    if not rows:
        print("ERROR: sheet is empty", file=sys.stderr)
        sys.exit(1)

    header_row = [str(v) for v in rows[0]]
    fields_map, _ = build_header_map(header_row, config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS)

    # --- Pass 1: parse rows ---
    persons_raw: list[dict] = []
    # start=2: the header is spreadsheet row 1
    for row_num, row in enumerate(rows[1:], start=2):
        field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()}
        # A None or non-string cell must not crash the blank-row check.
        if not str(field_dict.get("last_name") or "").strip():
            continue
        persons_raw.append(_parse_row(row_num, field_dict))

    persons, skipped_msgs = _deduplicate(persons_raw)
    for msg in skipped_msgs:
        print(f"  SKIP {msg}", file=sys.stderr)

    index = _build_index(persons)

    # --- Pass 2: resolve relationships ---
    all_rels: list[dict] = []
    all_unresolved: list[dict] = []

    spouse_rels, spouse_unres = _resolve_spouses(persons, index)
    all_rels.extend(spouse_rels)
    all_unresolved.extend(spouse_unres)

    for p in persons:
        # Internal '_' keys are removed here so they never reach the JSON.
        bemerkung = p.pop("_bemerkung_raw", None) or ""
        p.pop("_spouse_raw", None)

        rels, unres, remaining = _parse_bemerkung(p["rowId"], bemerkung, index)
        all_rels.extend(rels)
        all_unresolved.extend(unres)

        if remaining:
            existing = p.get("notes") or ""
            # Append only text the notes do not already contain.
            if remaining not in existing:
                p["notes"] = (existing + " " + remaining).strip() if existing else remaining

    # --- Stats output ---
    spouse_count = sum(1 for r in all_rels if r["type"] == "SPOUSE_OF")
    parent_count = sum(1 for r in all_rels if r["type"] == "PARENT_OF")
    print(f"✓ {len(persons)} persons parsed")
    print(f"✓ {len(all_rels)} relationships emitted ({spouse_count} SPOUSE_OF, {parent_count} PARENT_OF)")
    if all_unresolved:
        print(f"⚠ {len(all_unresolved)} unresolved (see unresolved[] in output)")

    if args.dry_run:
        print("\n--- dry-run: first 5 unresolved ---")
        for u in all_unresolved[:5]:
            print(f"  {u}")
        return

    output = {
        "generated_at": datetime.datetime.now().isoformat(),
        "source": Path(args.input).name,
        "stats": {
            "persons": len(persons),
            "relationships": len(all_rels),
            "unresolved": len(all_unresolved),
        },
        "persons": persons,
        "relationships": all_rels,
        "unresolved": all_unresolved,
    }

    out_path = Path(args.output)
    # parents=True: a nested --output directory that does not exist yet is created.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"→ {args.output}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
2
tools/import-normalizer/requirements.txt
Normal file
2
tools/import-normalizer/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
openpyxl==3.1.5
|
||||||
|
pytest==8.3.4
|
||||||
119
tools/import-normalizer/tags.py
Normal file
119
tools/import-normalizer/tags.py
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
import csv
|
||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
# Collective terms from config; used by _has_collective.
_COLLECTIVE = config.COLLECTIVE_TERMS

# Lower-case German words that mine_summary_candidates ignores when counting
# tokens. A few entries ("hat", "wird", "ihm") are listed twice; harmless in a set.
_GERMAN_STOP_WORDS = {
    "der", "die", "das", "ein", "eine", "einer", "einen", "einem", "eines",
    "und", "oder", "aber", "an", "in", "auf", "für", "mit", "von", "zu",
    "bei", "nach", "vor", "aus", "ist", "sind", "war", "waren", "hat",
    "haben", "wird", "werden", "ich", "du", "er", "sie", "es", "wir",
    "ihr", "ihn", "ihm", "ihnen", "mich", "mir", "dich", "dir",
    "ihre", "ihren", "seinem", "seinen", "seiner", "seine",
    "auch", "nicht", "noch", "dann", "durch", "dem", "den",
    "des", "als", "wie", "dass", "um", "über", "unter", "zwischen",
    "all", "alle", "was", "wer", "wo", "wann", "welche", "welcher",
    "mehr", "sehr", "nur", "schon", "dabei", "dazu",
    "bis", "seit", "gegen", "ohne", "doch", "wenn", "weil",
    "ob", "so", "da", "dort", "hier", "nun", "ja", "nein",
    "ihrer", "ihrem",
    # Contracted prepositions, auxiliary/modal verb forms and demonstratives
    # common in German Inhalt summaries
    "im", "am", "ans", "ins", "zum", "zur", "vom", "beim", "sich",
    "hat", "hatte", "wird", "wurde", "wurden", "worden",
    "kann", "konnte", "soll", "sollte", "will", "wollte",
    "ihm", "dieses", "dieser", "diesem", "diesen",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_correspondence(raw: str) -> bool:
|
||||||
|
lower = raw.lower()
|
||||||
|
return " an " in lower or lower.startswith("an ") or ".an " in lower
|
||||||
|
|
||||||
|
|
||||||
|
def _tokenize(text: str) -> list[str]:
|
||||||
|
return [t.lower() for t in re.findall(r"[a-zA-ZäöüÄÖÜß]+", text)]
|
||||||
|
|
||||||
|
|
||||||
|
def _has_collective(tokens: list[str]) -> bool:
|
||||||
|
return any(t in _COLLECTIVE for t in tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def classify_schlagwort(raw: str) -> list[str]:
    """Turn a Schlagwort cell into zero or one tag path.

    - blank input                                  -> []
    - not a correspondence phrase                  -> ["Themen/<label>"]
    - correspondence phrase with a collective term -> ["Briefwechsel/<label>"]
    - any other correspondence phrase              -> []

    ``<label>`` is the cell text without surrounding whitespace, so a stray
    space in the spreadsheet does not create a second, distinct tag. The
    classification itself still looks at the text exactly as given.
    """
    if not raw:
        return []
    label = raw.strip()
    if not label:
        return []
    if not _is_correspondence(raw):
        return [f"Themen/{label}"]
    if _has_collective(_tokenize(raw)):
        return [f"Briefwechsel/{label}"]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def mine_summary_candidates(summaries: list[str]) -> list[tuple[str, int]]:
    """Count candidate theme words across all summaries.

    Each summary is lower-cased and split on commas, semicolons and whitespace;
    non-letter characters are removed from every piece. Pieces of at least two
    letters that are not in ``_GERMAN_STOP_WORDS`` are counted. Missing
    summaries (None or "") are skipped. Returns ``(word, count)`` pairs, most
    frequent first.
    """
    counter: Counter = Counter()
    for summary in summaries:
        if not summary:
            # Empty spreadsheet cells arrive as None or ""; nothing to mine.
            continue
        for token in re.split(r"[,;\s]+", summary.lower()):
            token = re.sub(r"[^a-zA-ZäöüÄÖÜß]", "", token)
            if len(token) >= 2 and token not in _GERMAN_STOP_WORDS:
                counter[token] += 1
    return counter.most_common()
|
||||||
|
|
||||||
|
|
||||||
|
def load_approved_themes(path: Path) -> set[str]:
    """Load the theme words from the ``candidate`` column of a CSV file.

    Values are stripped and lower-cased; blank or whitespace-only cells are
    ignored so that no empty theme is produced. A missing file yields an empty
    set. The file is read as UTF-8 and a leading byte-order mark is tolerated,
    so the first header is still recognised as "candidate".
    """
    if not path.exists():
        return set()
    themes: set[str] = set()
    # utf-8-sig reads plain UTF-8 too, and drops a BOM if one is present.
    with open(path, newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            candidate = (row.get("candidate") or "").strip().lower()
            if candidate:
                themes.add(candidate)
    return themes
|
||||||
|
|
||||||
|
|
||||||
|
def apply_approved_themes(summary: str, themes: set[str]) -> list[str]:
    """Return "Themen/<theme>" for every theme that occurs as a whole word in ``summary``.

    Matching runs against the lower-cased summary. Themes are visited in sorted
    order so the result is identical from run to run (iterating the set
    directly is not stable for strings). An empty theme is skipped, since it
    would match any text.
    """
    lower = summary.lower()
    return [
        f"Themen/{theme}"
        for theme in sorted(themes)
        if theme and re.search(r"\b" + re.escape(theme) + r"\b", lower)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_tags(schlagwort: str, summary: str, themes: set[str]) -> list[str]:
    """Combine the Schlagwort tag with the theme tags found in the summary.

    Theme tags are added only when both ``summary`` and ``themes`` are non-empty.
    """
    tags = classify_schlagwort(schlagwort or "")
    if not (summary and themes):
        return tags
    return tags + apply_approved_themes(summary, themes)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_tags(tag_list: list[str]) -> str:
    """Serialize tag paths into one string, separated by '|'."""
    separator = "|"
    return separator.join(tag_list)
|
||||||
|
|
||||||
|
|
||||||
|
def build_tag_tree(all_tag_paths: list[str]) -> list[dict]:
    """Flatten tag paths into rows of ``tag_path`` / ``parent_name`` / ``tag_name``.

    Paths are de-duplicated in first-seen order and split at the first "/"
    only. All root rows (empty ``parent_name``) come first, followed by one row
    per "parent/child" path.
    """
    root_names: dict[str, None] = {}
    nested: list[tuple[str, str, str]] = []
    for path in dict.fromkeys(all_tag_paths):
        parent, slash, child = path.partition("/")
        root_names[parent] = None
        if slash:
            nested.append((path, parent, child))

    rows: list[dict] = [
        {"tag_path": root, "parent_name": "", "tag_name": root} for root in root_names
    ]
    rows.extend(
        {"tag_path": path, "parent_name": parent, "tag_name": child}
        for path, parent, child in nested
    )
    return rows
|
||||||
0
tools/import-normalizer/tests/__init__.py
Normal file
0
tools/import-normalizer/tests/__init__.py
Normal file
20
tools/import-normalizer/tests/test_config.py
Normal file
20
tools/import-normalizer/tests/test_config.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
import config
|
||||||
|
|
||||||
|
def test_century_boundaries():
    """The two-digit-year century cut-offs are pinned in config."""
    pinned = {"TWO_DIGIT_19XX_MAX": 57, "TWO_DIGIT_18XX_MIN": 73}
    for attr, value in pinned.items():
        assert getattr(config, attr) == value
|
||||||
|
|
||||||
|
def test_header_maps_cover_required_fields():
    """Both header maps map some column onto their key field."""
    document_fields = config.DOCUMENT_HEADER_MAP.values()
    person_fields = config.PERSON_HEADER_MAP.values()
    assert "index" in document_fields
    assert "last_name" in person_fields
|
||||||
|
|
||||||
|
def test_feast_tables_present():
    """Spot-check one entry each of the feast and season tables."""
    pfingsten_value = config.MOVABLE_FEASTS["pfingsten"]
    herbst_value = config.SEASON_MONTHS["herbst"]
    assert pfingsten_value == 49
    assert herbst_value == 10
|
||||||
|
|
||||||
|
def test_name_classification_tables():
    """The name-classification tables contain their representative entries."""
    for word, table_name in (
        ("tante", "RELATIONAL_TERMS"),
        ("familie", "COLLECTIVE_TERMS"),
        ("unbekannt", "UNKNOWN_NAME_MARKERS"),
    ):
        assert word in getattr(config, table_name)
    assert config.PROSE_MAX_LEN >= 30
    assert "anita" in config.EXTRA_GIVEN_NAMES
|
||||||
148
tools/import-normalizer/tests/test_dates.py
Normal file
148
tools/import-normalizer/tests/test_dates.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
import datetime
|
||||||
|
import dates
|
||||||
|
from dates import Precision
|
||||||
|
|
||||||
|
def test_easter_known_years():
    """Easter Sunday for a handful of years."""
    # Anonymous Gregorian algorithm — verified against published tables
    known = {2024: (3, 31), 2000: (4, 23), 1922: (4, 16), 1888: (4, 1)}
    for year, (month, day) in known.items():
        assert dates.easter(year) == datetime.date(year, month, day)
|
||||||
|
|
||||||
|
def test_resolve_feast_movable():
    """Movable feasts resolve to a concrete day for the given year."""
    cases = [
        ("Pfingsten", 1922, "1922-06-04"),
        ("Ostern", 2024, "2024-03-31"),
        ("Pfingstmontag", 1922, "1922-06-05"),
    ]
    for feast, year, iso in cases:
        assert dates.resolve_feast_or_season(feast, year) == (iso, Precision.DAY)
|
||||||
|
|
||||||
|
def test_resolve_feast_fixed():
    """Fixed-date feasts resolve to their calendar day."""
    cases = [("Weihnachten", 1900, "1900-12-25"), ("Neujahr", 1910, "1910-01-01")]
    for feast, year, iso in cases:
        assert dates.resolve_feast_or_season(feast, year) == (iso, Precision.DAY)
|
||||||
|
|
||||||
|
def test_resolve_season():
    """Seasons resolve to the first of a month with SEASON precision."""
    cases = [("Herbst", 1913, "1913-10-01"), ("Sommer", 1910, "1910-07-01")]
    for season, year, iso in cases:
        assert dates.resolve_feast_or_season(season, year) == (iso, Precision.SEASON)
|
||||||
|
|
||||||
|
def test_resolve_unknown_token_returns_none():
    """A token that is neither feast nor season yields None."""
    resolved = dates.resolve_feast_or_season("Freitag", 1919)
    assert resolved is None
|
||||||
|
|
||||||
|
def test_expand_year():
    """Year strings of 2, 3 and 4 digits are expanded or rejected."""
    expanded = {
        "1888": 1888,
        "889": 1889,   # 3-digit -> 1DDD
        "923": 1923,
        "08": 1908,    # 00..57 -> 19xx
        "17": 1917,
        "57": 1957,
        "73": 1873,    # 73..99 -> 18xx
        "99": 1899,
    }
    for text, year in expanded.items():
        assert dates.expand_year(text) == year

    rejected = (
        "65",    # 58..72 ambiguous
        "9003",  # implausible 4-digit year -> reject (typo)
        "x",
    )
    for text in rejected:
        assert dates.expand_year(text) is None
|
||||||
|
|
||||||
|
def test_parse_iso_and_empty():
    """ISO input parses as DAY; '' and '?' come back UNKNOWN with raw preserved."""
    cases = [
        ("1910-04-23", "1910-04-23", Precision.DAY),
        ("", None, Precision.UNKNOWN),
        ("?", None, Precision.UNKNOWN),
    ]
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
|
||||||
|
|
||||||
|
def test_parse_numeric_forms():
    """Numeric day.month.year variants, including short years and a slash form."""
    iso_by_input = {
        "15.2.1888": "1888-02-15",
        "13.5.09": "1909-05-13",
        "17/6. 1916": "1916-06-17",
        "11.10.08": "1908-10-11",
        "30.1.889": "1889-01-30",
    }
    for raw, iso in iso_by_input.items():
        assert dates.parse_date(raw).iso == iso
    assert dates.parse_date("15.2.1888").precision == Precision.DAY
|
||||||
|
|
||||||
|
def test_parse_numeric_unparseable():
    """Numeric strings lacking a usable year stay UNKNOWN."""
    unparseable = (
        "8.9.",     # no year
        "13.5.65",  # ambiguous 2-digit year
    )
    for raw in unparseable:
        assert dates.parse_date(raw).precision == Precision.UNKNOWN
|
||||||
|
|
||||||
|
def test_parse_approx_marker_upgrades_precision():
    """A '(?)' marker keeps the raw text and lifts the precision to APPROX."""
    raw = "17.Nov (?) 1887"
    # month-name matcher parses the date; (?) marks it approximate
    parsed = dates.parse_date(raw)
    assert parsed.raw == raw
    assert parsed.precision == Precision.APPROX
|
||||||
|
|
||||||
|
def test_parse_leading_qualifier_is_approx():
    """A leading qualifier is stripped, the numeric date salvaged, precision APPROX."""
    parsed = dates.parse_date("nach 1.5.1900")
    assert (parsed.iso, parsed.precision) == ("1900-05-01", Precision.APPROX)
|
||||||
|
|
||||||
|
def test_parse_roman_months():
    """Roman-numeral months are understood, with DAY precision."""
    iso_by_input = {
        "22.III.18": "1918-03-22",
        "19.XII.1954": "1954-12-19",
        "1.III.27": "1927-03-01",
    }
    for raw, iso in iso_by_input.items():
        assert dates.parse_date(raw).iso == iso
    assert dates.parse_date("22.III.18").precision == Precision.DAY
|
||||||
|
|
||||||
|
def test_parse_monthname_day_first():
    """Day-first dates with spelled-out or abbreviated month names."""
    iso_by_input = {
        "6.März 1888": "1888-03-06",
        "29.Sept.1891": "1891-09-29",
        "10.Oct.95": "1895-10-10",
        "9.December1889": "1889-12-09",
        "18.Dez.1916": "1916-12-18",
        "4Dezember 1936": "1936-12-04",
        "25 August 1968": "1968-08-25",
    }
    for raw, iso in iso_by_input.items():
        assert dates.parse_date(raw).iso == iso
|
||||||
|
|
||||||
|
def test_parse_month_year_year_only():
    """Month+year and bare-year inputs map to the first day at MONTH / YEAR precision."""
    cases = [
        ("Mai 1895", "1895-05-01", Precision.MONTH),
        ("October 1903", "1903-10-01", Precision.MONTH),
        ("1905", "1905-01-01", Precision.YEAR),
    ]
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
|
||||||
|
|
||||||
|
def test_parse_feast_and_season_via_parse_date():
    """Feast and season names followed by a year go through parse_date."""
    cases = [
        ("Pfingsten 1922", "1922-06-04", Precision.DAY),
        ("Herbst 1913", "1913-10-01", Precision.SEASON),
    ]
    for raw, iso, precision in cases:
        assert dates.parse_date(raw) == dates.ParsedDate(iso, precision, raw)
    assert dates.parse_date("Pfingstsonntag 1915").precision == Precision.DAY
|
||||||
|
|
||||||
|
def test_parse_ranges():
    """Date ranges resolve to their start with RANGE precision."""
    for raw, iso in (("8.1.1916 - 15.3.1916", "1916-01-08"), ("1881/82", "1881-01-01")):
        assert dates.parse_date(raw) == dates.ParsedDate(iso, Precision.RANGE, raw)

    # '?' stripped -> RANGE, then APPROX
    uncertain = "1945/46?"
    assert dates.parse_date(uncertain).iso == "1945-01-01"
    assert dates.parse_date(uncertain).precision == Precision.APPROX
|
||||||
|
|
||||||
|
def test_parse_approx_full():
    """A '(?)' inside a month-name date still yields the date, flagged APPROX."""
    parsed = dates.parse_date("17.Nov (?) 1887")
    assert (parsed.iso, parsed.precision) == ("1887-11-17", Precision.APPROX)
|
||||||
|
|
||||||
|
def test_parse_english_month_first_now_works():
    """Month-first dates parse without shadowing the plain month+year form."""
    iso_by_input = {
        "April 12. 1922": "1922-04-12",
        "Mai 1895": "1895-05-01",  # not shadowed by month-first matcher
    }
    for raw, iso in iso_by_input.items():
        assert dates.parse_date(raw).iso == iso
|
||||||
|
|
||||||
|
def test_parse_unparseable_examples():
    """A weekday plus year is not a date."""
    parsed = dates.parse_date("Freitag 1919")
    assert parsed.precision == Precision.UNKNOWN
|
||||||
|
|
||||||
|
def test_parse_invalid_calendar_date_is_unknown():
    """Impossible calendar dates are reported as UNKNOWN rather than clamped."""
    # The matchers' try/except ValueError must route impossible dates to UNKNOWN
    # (-> review) and never silently clamp them. This is the most likely real-data
    # bug class at 7,600 rows.
    for impossible in ("30.2.1888", "31.4.1916"):
        outcome = dates.parse_date(impossible)
        assert outcome.precision == Precision.UNKNOWN
|
||||||
|
|
||||||
|
def test_parse_intra_month_day_range():
    """A 'day./day.' prefix is a RANGE starting on the first day; 'day/month.' stays a DAY."""
    # "7./8. Sept.1923" -> start day, RANGE.
    # It must NOT be confused with the slash-date "17/6. 1916".
    day_range = "7./8. Sept.1923"
    slash_date = "17/6. 1916"

    range_result = dates.parse_date(day_range)
    assert range_result == dates.ParsedDate("1923-09-07", Precision.RANGE, day_range)

    slash_result = dates.parse_date(slash_date)
    assert slash_result == dates.ParsedDate("1916-06-17", Precision.DAY, slash_date)
|
||||||
|
|
||||||
|
def test_parse_trailing_note_stripped_but_raw_preserved():
    """REQ-DATE-04: a trailing note does not block parsing and stays visible in ``raw``."""
    source_text = "17.Nov 1887, 2. Brief"
    parsed = dates.parse_date(source_text)
    assert parsed.iso == "1887-11-17"
    # original string preserved verbatim
    assert "2. Brief" in parsed.raw
|
||||||
|
|
||||||
|
def test_parse_date_override_wins():
    """An entry in the override mapping decides the result for its raw string."""
    ambiguous = "13.5.65"  # ambiguous without override
    overrides = {ambiguous: ("1965-05-13", "DAY")}
    parsed = dates.parse_date(ambiguous, overrides)
    assert parsed == dates.ParsedDate("1965-05-13", Precision.DAY, ambiguous)
|
||||||
|
|
||||||
|
def test_parse_spanish_months():
    """Spanish month names parse in day-first and month-first layouts."""
    # Mexican-branch letters: Spanish month names, day-first and month-first
    # (hyphen/dot before year)
    iso_by_input = (
        ("21.Enero 1911", "1911-01-21"),  # day-first
        ("Junio 17.929", "1929-06-17"),   # month-first, dot, 3-digit year
        ("Mayo 18-1929", "1929-05-18"),   # month-first, hyphen
        ("Abril 10-929", "1929-04-10"),   # hyphen, 3-digit year
        ("Agosto 27-929", "1929-08-27"),
        ("febrero 14-29", "1929-02-14"),  # hyphen, 2-digit year
    )
    for text, iso in iso_by_input:
        assert dates.parse_date(text).iso == iso

    hyphenated = dates.parse_date("Mayo 18-1929")
    assert hyphenated.precision == Precision.DAY
|
||||||
|
|
||||||
|
def test_implausible_year_goes_to_review():
    """An implausible year keeps the date UNKNOWN instead of producing a bogus value."""
    # a source typo like "October 23-9003" must NOT parse to a bogus year 9003
    typo = "October 23-9003"
    outcome = dates.parse_date(typo)
    assert outcome.precision == Precision.UNKNOWN
|
||||||
|
|
||||||
|
def test_hyphen_month_first_does_not_shadow_month_year():
    """Plain month+year input still yields MONTH precision on the first of the month."""
    # the hyphen-separator generalization must NOT make "Mai 1895" parse as day=18
    text = "Mai 1895"
    expected = dates.ParsedDate("1895-05-01", Precision.MONTH, text)
    assert dates.parse_date(text) == expected
|
||||||
109
tools/import-normalizer/tests/test_documents.py
Normal file
109
tools/import-normalizer/tests/test_documents.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
import persons
|
||||||
|
import documents
|
||||||
|
from documents import Triage
|
||||||
|
|
||||||
|
def test_extract_row():
    """extract_row picks cells by the header's column positions and records the source row."""
    columns = ("index", "file", "box", "folder", "sender",
               "receivers", "date", "location", "tags", "summary")
    header = {name: position for position, name in enumerate(columns)}
    cells = [
        "W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
        "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise",
    ]

    row = documents.extract_row(cells, header, source_row=3)

    assert row.index == "W-0001"
    assert row.sender == "Walter de Gruyter"
    assert row.date == "15.2.1888"
    assert row.source_row == 3
|
||||||
|
|
||||||
|
def test_triage():
    """triage classifies a row of cells as EMPTY, BLANK_INDEX, X_SUFFIX or OK."""
    cases = (
        (["", "", ""], Triage.EMPTY),
        (["", "", "Walter"], Triage.BLANK_INDEX),  # data but no index
        (["W-0001x", "x"], Triage.X_SUFFIX),
        (["W-0001", "x"], Triage.OK),
    )
    for cells, verdict in cases:
        assert documents.triage(cells) == verdict
|
||||||
|
|
||||||
|
def test_classify_blank_index():
    """Rows without an index are told apart as section banners or index-less data."""
    header = {"sender": 4, "receivers": 5}
    banner_row = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""]
    data_row = ["", "", "V", "1", "", "Eugenie"]

    banner_kind = documents.classify_blank_index(banner_row, header)
    assert banner_kind == "section_banner"

    data_kind = documents.classify_blank_index(data_row, header)
    assert data_kind == "data_no_index"
|
||||||
|
|
||||||
|
def test_index_file_mismatch():
    """A mismatch is reported only when the file name disagrees with the index."""
    cases = (
        ("W-0010x", r"..\__scan\W-0011x.pdf", True),
        ("W-0001", r"..\__scan\W-0001.pdf", False),
        ("W-0001", "", False),
        ("W-0001", "scans/W-0001.pdf", False),  # unix path
        ("W-0001", "W-0001.pdf", False),        # no dir
    )
    for index, file_path, expected in cases:
        assert documents.index_file_mismatch(index, file_path) is expected
|
||||||
|
|
||||||
|
|
||||||
|
def _ctx():
    """Build a ResolutionContext over a two-person register with no name overrides."""
    register_rows = [
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
    ]
    people = persons.parse_register(register_rows)
    alias_index = persons.AliasIndex(people)
    return persons.ResolutionContext(alias_index, name_overrides={})
|
||||||
|
|
||||||
|
def test_to_canonical_resolves_and_flags():
    """A fully resolvable row yields person ids, an ISO date, a Themen/ tag and no review flags."""
    context = _ctx()
    row = documents.RawRow(
        source_row=3, index="W-0001", box="V", folder="1",
        sender="Walter de Gruyter", receivers="Eugenie Müller",
        date="15.2.1888", location="Rotterdam", tags="Brautbriefe",
        summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.sender_person_id == "de-gruyter-walter"
    # matched via maiden alias
    assert canonical.receiver_person_ids == ["de-gruyter-eugenie"]
    assert canonical.date_iso == "1888-02-15"
    assert canonical.date_precision == "DAY"
    assert canonical.tags == ["Themen/Brautbriefe"]
    assert canonical.needs_review == []
|
||||||
|
|
||||||
|
def test_to_canonical_unmatched_and_unparsed():
    """An unknown sender gets a provisional id and both problems are flagged for review."""
    context = _ctx()
    row = documents.RawRow(
        source_row=9, index="C-0001",
        sender="Hans Wittkopf", receivers="", date="Freitag 1919",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    assert canonical.sender_person_id == "wittkopf-hans"  # provisional
    for flag in ("unmatched_sender", "unparsed_date"):
        assert flag in canonical.needs_review
    assert context.unmatched["Hans Wittkopf"] == [9]
    assert any(person.provisional for person in context.provisional.values())
|
||||||
|
|
||||||
|
def test_to_canonical_splits_multi_sender():
    """REQ-PERS-01 / IMP-11: a multi-person sender is parsed, primary kept, flagged."""
    context = _ctx()
    row = documents.RawRow(
        source_row=5,
        index="C-0100",
        sender="Walter und Eugenie de Gruyter",
        receivers="",
    )

    canonical = documents.to_canonical(row, context, date_overrides={})

    # first part is primary
    assert canonical.sender_person_id == "de-gruyter-walter"
    assert "multi_sender" in canonical.needs_review
|
||||||
|
|
||||||
|
def test_provisional_id_never_collides_with_register():
    """A provisional built from an unmatched string must not steal a register person_id."""
    register = persons.parse_register([{"last_name": "Xyz", "first_name": "Abc"}])  # id "xyz-abc"
    alias_index = persons.AliasIndex(register)
    context = persons.ResolutionContext(alias_index, name_overrides={})

    # "Abc, Xyz" misses the alias index (the comma changes the normalized key), but
    # its provisional slug is "xyz-abc" -- already the register person's id -- so it
    # MUST be suffixed.
    person_id, _, was_matched = context.resolve_one("Abc, Xyz", source_row=1)

    assert was_matched is False
    assert "xyz-abc" in context.index.known_ids
    # suffixed away from the register id, not reused
    assert person_id == "xyz-abc-2"
|
||||||
|
|
||||||
|
def test_resolve_one_override_increments_hits():
    """A name override resolves to the mapped person and is counted in override_hits."""
    register = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Eugenie"}])
    overrides = {"Genie": "de-gruyter-eugenie"}
    context = persons.ResolutionContext(persons.AliasIndex(register), name_overrides=overrides)

    person_id, display_name, was_matched = context.resolve_one("Genie", source_row=1)

    assert person_id == "de-gruyter-eugenie"
    assert was_matched is True
    # display comes from the alias index
    assert display_name == "Eugenie de Gruyter"
    assert context.override_hits == 1
|
||||||
|
|
||||||
|
def test_ambiguous_pair_recorded_in_unresolved():
    """Two given names in one receiver cell stay one provisional and are logged as ambiguous."""
    register = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}])
    context = persons.ResolutionContext(
        persons.AliasIndex(register),
        name_overrides={},
        given_names={"ella", "anita"},
    )
    row = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita")

    canonical = documents.to_canonical(row, context, date_overrides={})

    # not split -- one provisional
    assert len(canonical.receiver_person_ids) == 1
    recorded = [(name, category) for name, category, _ in context.unresolved]
    assert ("Ella Anita", "ambiguous_pair") in recorded
|
||||||
|
|
||||||
|
def test_resolvable_first_surname_pair_not_unresolved():
    """A first-name + surname pair is RESOLVABLE and leaves the unresolved list empty."""
    empty_index = persons.AliasIndex([])
    context = persons.ResolutionContext(
        empty_index, name_overrides={}, given_names={"ella", "anita"}
    )

    # surname is not a given name
    context.resolve_one("Mieze Schefold", source_row=1)

    # RESOLVABLE -> not recorded
    assert context.unresolved == []
|
||||||
46
tools/import-normalizer/tests/test_ingest.py
Normal file
46
tools/import-normalizer/tests/test_ingest.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import datetime
|
||||||
|
import openpyxl
|
||||||
|
import pytest
|
||||||
|
import ingest
|
||||||
|
|
||||||
|
def _make_workbook(tmp_path, sheet_name, rows):
    """Save ``rows`` into sheet ``sheet_name`` of a new ``wb.xlsx`` under tmp_path; return its path."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    for row in rows:
        sheet.append(row)

    target = tmp_path / "wb.xlsx"
    workbook.save(target)
    return target
|
||||||
|
|
||||||
|
def test_read_sheet_converts_cells(tmp_path):
    """read_sheet returns every cell as a string: dates as ISO, integers as plain digits."""
    source_rows = [
        ["Index", "Datum"],
        ["W-0001", datetime.datetime(1888, 2, 15)],
        ["W-0002", 1],
    ]
    workbook_path = _make_workbook(tmp_path, "S", source_rows)

    rows = ingest.read_sheet(workbook_path, "S")

    expected_rows = (
        ["Index", "Datum"],
        ["W-0001", "1888-02-15"],  # Excel date -> ISO string
        ["W-0002", "1"],           # integer -> plain string
    )
    for position, expected in enumerate(expected_rows):
        assert rows[position] == expected
|
||||||
|
|
||||||
|
def test_build_header_map_collapses_whitespace_and_case():
    """Header cells are matched to fields case-insensitively; unmapped cells are reported."""
    header_cells = ["Index", "Datum des Briefes", "EmpfängerIn", "Mystery"]
    mapping = {
        "index": "index",
        "datum des briefes": "date",
        "empfängerin": "receivers",
    }

    result = ingest.build_header_map(header_cells, mapping, required={"index"})
    positions, unmapped = result

    assert positions == {"index": 0, "date": 1, "receivers": 2}
    assert unmapped == ["Mystery"]
|
||||||
|
|
||||||
|
def test_build_header_map_missing_required_raises():
    """A header lacking a required field raises ValueError that names the field."""
    header_cells = ["Box", "Ort"]
    mapping = {"box": "box", "ort": "location"}
    with pytest.raises(ValueError, match="index"):
        ingest.build_header_map(header_cells, mapping, required={"index"})
|
||||||
|
|
||||||
|
def test_read_sheet_bool_not_coerced_to_int(tmp_path):
    """Boolean cells come back as "True"/"False", not as "1"/"0"."""
    workbook_path = _make_workbook(tmp_path, "S", [["Flag"], [True], [False]])

    rows = ingest.read_sheet(workbook_path, "S")

    assert rows[1] == ["True"]
    assert rows[2] == ["False"]
|
||||||
|
|
||||||
|
def test_read_sheet_missing_sheet_raises(tmp_path):
    """Asking for a sheet the workbook lacks raises ValueError mentioning "not found"."""
    workbook_path = _make_workbook(tmp_path, "S", [["A"]])
    missing_sheet = "Nope"
    with pytest.raises(ValueError, match="not found"):
        ingest.read_sheet(workbook_path, missing_sheet)
|
||||||
121
tools/import-normalizer/tests/test_normalize.py
Normal file
121
tools/import-normalizer/tests/test_normalize.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
import openpyxl
|
||||||
|
import normalize
|
||||||
|
|
||||||
|
|
||||||
|
def _doc_wb(tmp_path):
    """Write the document fixture workbook (sheet "Familienarchiv") and return its path.

    Besides the header the sheet holds five rows: a regular letter, an x-suffixed
    index, a row without an index, a row with an unparseable date and a "?"
    receiver, and a repeat of the first index.
    """
    rows = [
        ["Index", "Datei", "Box", "Mappe", "BriefeschreiberIn", "EmpfängerIn",
         "Datum des Briefes", "Ort", "Schlagwort", "Inhalt"],
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"],
        ["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter",
         "Eugenie Müller", "", "", "", ""],
        ["", "", "", "", "Section banner row", "", "", "", "", ""],
        ["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""],
        ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
         "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"],
    ]
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Familienarchiv"
    for row in rows:
        sheet.append(row)

    target = tmp_path / "docs.xlsx"
    workbook.save(target)
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def _person_wb(tmp_path):
    """Write the person-register fixture workbook (sheet "Tabelle1") and return its path."""
    rows = [
        ["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
         "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"],
        ["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""],
        ["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""],
    ]
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Tabelle1"
    for row in rows:
        sheet.append(row)

    target = tmp_path / "persons.xlsx"
    workbook.save(target)
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_end_to_end(tmp_path):
    """Run the whole pipeline on the fixture workbooks; check files, stats and determinism."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    documents_xlsx = out_dir / "canonical-documents.xlsx"
    persons_xlsx = out_dir / "canonical-persons.xlsx"
    unparsed_csv = review_dir / "unparsed-dates.csv"
    unresolved_csv = review_dir / "unresolved-names.csv"

    def _invoke():
        # Both fixture workbooks are rewritten before every run.
        document_workbook = _doc_wb(tmp_path)
        person_workbook = _person_wb(tmp_path)
        return normalize.run(
            document_workbook=document_workbook, document_sheet="Familienarchiv",
            person_workbook=person_workbook, person_sheet="Tabelle1",
            out_dir=out_dir, review_dir=review_dir,
            date_overrides={}, name_overrides={})

    def _matrix(p):
        # All cell values of the active sheet, row by row.
        sheet = openpyxl.load_workbook(p).active
        return [[cell.value for cell in row] for row in sheet.iter_rows()]

    stats = _invoke()

    assert documents_xlsx.exists()
    assert persons_xlsx.exists()
    # W-0001, C-0001, W-0001 (dup) -- x and blank excluded
    assert stats["documents_emitted"] == 3
    assert stats["skipped_x_suffix"] == 1
    assert stats["blank_index_rows"] == 1
    assert stats["duplicate_index_rows"] == 2
    # the "?" receiver is an UNKNOWN-class name
    assert stats["unresolved_unknown"] >= 1
    assert (review_dir / "skipped-x-suffix.csv").exists()
    assert unparsed_csv.exists()
    # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01)
    assert "Freitag 1919" in unparsed_csv.read_text(encoding="utf-8")
    assert unresolved_csv.exists()
    unresolved_text = unresolved_csv.read_text(encoding="utf-8")
    # the "?" receiver
    assert "unknown" in unresolved_text
    assert "?" in unresolved_text
    # replaced
    assert not (review_dir / "ambiguous-receivers.csv").exists()

    # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files
    docs1 = _matrix(documents_xlsx)
    persons1 = _matrix(persons_xlsx)
    unparsed1 = unparsed_csv.read_text(encoding="utf-8")

    _invoke()

    assert _matrix(documents_xlsx) == docs1
    assert _matrix(persons_xlsx) == persons1
    assert unparsed_csv.read_text(encoding="utf-8") == unparsed1
    # header + 3 docs
    assert len(docs1) == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_tag_tree_output_emitted(tmp_path):
    """A run writes canonical-tag-tree.xlsx into the output directory."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    document_workbook = _doc_wb(tmp_path)
    person_workbook = _person_wb(tmp_path)

    normalize.run(
        document_workbook=document_workbook, document_sheet="Familienarchiv",
        person_workbook=person_workbook, person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={})

    tag_tree = out_dir / "canonical-tag-tree.xlsx"
    assert tag_tree.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_tag_candidates_review_emitted(tmp_path):
    """A run writes tag-candidates.csv containing the words "candidate" and "count"."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    document_workbook = _doc_wb(tmp_path)
    person_workbook = _person_wb(tmp_path)

    normalize.run(
        document_workbook=document_workbook, document_sheet="Familienarchiv",
        person_workbook=person_workbook, person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={})

    candidates_csv = review_dir / "tag-candidates.csv"
    assert candidates_csv.exists()
    content = candidates_csv.read_text(encoding="utf-8")
    assert "candidate" in content
    assert "count" in content
|
||||||
|
|
||||||
|
|
||||||
|
def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
    """Schlagwort values appear in the tags column with a "Themen/" prefix, never bare."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    document_workbook = _doc_wb(tmp_path)
    person_workbook = _person_wb(tmp_path)

    normalize.run(
        document_workbook=document_workbook, document_sheet="Familienarchiv",
        person_workbook=person_workbook, person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={})

    sheet = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx").active
    header = [cell.value for cell in sheet[1]]
    # openpyxl columns are 1-based, list positions are 0-based.
    tag_column = header.index("tags") + 1
    tag_values = []
    for row_number in range(2, sheet.max_row + 1):
        tag_values.append(sheet.cell(row=row_number, column=tag_column).value)

    assert any(value and "Themen/Brautbriefe" in value for value in tag_values)
    assert all(not (value and value.strip() == "Brautbriefe") for value in tag_values)
|
||||||
|
|
||||||
|
|
||||||
|
def test_approved_themes_applied(tmp_path):
    """An approved theme found in a document adds a matching Themen/ tag."""
    themes_file = tmp_path / "approved-themes.csv"
    themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    document_workbook = _doc_wb(tmp_path)
    person_workbook = _person_wb(tmp_path)

    normalize.run(
        document_workbook=document_workbook, document_sheet="Familienarchiv",
        person_workbook=person_workbook, person_sheet="Tabelle1",
        out_dir=out_dir, review_dir=review_dir,
        date_overrides={}, name_overrides={},
        approved_themes_path=themes_file)

    sheet = openpyxl.load_workbook(out_dir / "canonical-documents.xlsx").active
    header = [cell.value for cell in sheet[1]]
    # openpyxl columns are 1-based, list positions are 0-based.
    tag_column = header.index("tags") + 1
    tag_values = []
    for row_number in range(2, sheet.max_row + 1):
        tag_values.append(sheet.cell(row=row_number, column=tag_column).value)

    # W-0001 has Inhalt "Geschäftsreise" -- should get an extra Themen/geschäftsreise tag
    assert any(value and "Themen/geschäftsreise" in value for value in tag_values)
|
||||||
132
tools/import-normalizer/tests/test_persons.py
Normal file
132
tools/import-normalizer/tests/test_persons.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
import config
|
||||||
|
import persons
|
||||||
|
from persons import NameClass
|
||||||
|
|
||||||
|
def test_slugify():
    """Slugs are lower-case, hyphen-joined and transliterate umlauts (ü -> ue)."""
    cases = (
        ("de Gruyter", "Eugenie", "de-gruyter-eugenie"),
        ("Müller", "Karl Erhard", "mueller-karl-erhard"),
    )
    for last_name, first_name, slug in cases:
        assert persons.slugify(last_name, first_name) == slug
|
||||||
|
|
||||||
|
def test_parse_register_basic():
    """parse_register normalizes ids, given names, dates and the nickname/spouse field."""
    charlotte_row = {
        "generation": "G 1", "last_name": "Blomquist", "first_name": "Charlotte,Meta,Jacobi",
        "maiden_name": "Ruge", "birth_date": "30.8.1862", "birth_place": "Schülperneusiel",
        "death_date": "1934-07-23", "death_place": "Göteborg", "spouse": '"Tante Lolly"',
        "notes": "Schwester v Marie Cram",
    }
    else_row = {
        "generation": "G 2", "last_name": "Bohrmann", "first_name": "Else",
        "maiden_name": "Cram", "birth_date": "28.11.1888", "spouse": "Ludwig Bohrmann",
        "notes": "Schwester v Herbert",
    }

    people = persons.parse_register([charlotte_row, else_row])

    charlotte = people[0]
    assert charlotte.person_id == "blomquist-charlotte"
    assert charlotte.first_name == "Charlotte"
    assert charlotte.maiden_name == "Ruge"
    assert charlotte.birth_date == "1862-08-30"
    # quoted spouse field is a nickname, not a spouse
    assert charlotte.nickname == "Tante Lolly"
    assert charlotte.spouse == ""
    for given in ("Meta", "Jacobi"):
        assert given in charlotte.extra_given_names

    else_person = people[1]
    assert else_person.maiden_name == "Cram"
    assert else_person.spouse == "Ludwig Bohrmann"
    assert else_person.provisional is False
|
||||||
|
|
||||||
|
def test_parse_register_dedups_colliding_ids():
    """Two people with the same first+last name BOTH get a numeric suffix (no ambiguous base id)."""
    namesake = {"last_name": "Cram", "first_name": "Hans"}
    people = persons.parse_register([dict(namesake), dict(namesake)])

    person_ids = [person.person_id for person in people]

    assert person_ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(person_ids)) == 2
|
||||||
|
|
||||||
|
def test_split_receivers():
    """split_receivers breaks a receiver cell into individual person names."""
    cases = (
        ("Eugenie Müller", ["Eugenie Müller"]),
        ("Walter und Eugenie de Gruyter", ["Walter de Gruyter", "Eugenie de Gruyter"]),
        ("Hedi und Tutu (Gruber)", ["Hedi Gruber", "Tutu Gruber"]),
        ("Clara u Familie", ["Clara"]),
        ("Eugenie de Gruyter geb. Müller", ["Eugenie de Gruyter"]),
        ("Herbert u Clara", ["Herbert", "Clara"]),
        ("", []),
        ("geb. Müller", []),                        # maiden-only cell -> no person
        ("Herbert//Clara", ["Herbert", "Clara"]),   # // separator
    )
    for cell, names in cases:
        assert persons.split_receivers(cell) == names
|
||||||
|
|
||||||
|
def test_find_known_last_name():
    """A known multi-word surname is found in a full name; a lone first name yields None."""
    surname = persons.find_known_last_name("Eugenie de Gruyter")
    assert surname == "de Gruyter"

    no_surname = persons.find_known_last_name("Clara")
    assert no_surname is None
|
||||||
|
|
||||||
|
def test_alias_index_resolves_maiden_and_married():
    """A person resolves under married name, maiden name and a lower-cased spelling."""
    register = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ])
    alias_index = persons.AliasIndex(register)
    eugenie_id = register[0].person_id

    spellings = (
        "Eugenie de Gruyter",  # canonical
        "Eugenie Müller",      # maiden alias
        "eugenie müller",      # normalized
    )
    for spelling in spellings:
        assert alias_index.resolve(spelling) == eugenie_id

    assert alias_index.resolve("Nobody Unknown") is None
|
||||||
|
|
||||||
|
def test_alias_index_suggestion():
    """A near-miss spelling is suggested with a score at or above the configured threshold."""
    register = persons.parse_register([{"last_name": "Wittkopf", "first_name": "Hans"}])
    alias_index = persons.AliasIndex(register)

    suggested_id, score = alias_index.suggest("Hans Wittkop")  # typo

    assert suggested_id == register[0].person_id
    assert score >= config.FUZZY_SUGGEST_THRESHOLD
|
||||||
|
|
||||||
|
def test_alias_index_first_name_only_when_unambiguous():
    """A bare first name resolves only when exactly one register person carries it."""
    register = persons.parse_register([
        {"last_name": "Cram", "first_name": "Clara"},
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "Cram", "first_name": "Walter"},  # 2nd "Walter" -> first name ambiguous
    ])
    alias_index = persons.AliasIndex(register)
    clara_id = register[0].person_id

    # unique first name resolves
    assert alias_index.resolve("Clara") == clara_id
    # ambiguous first name does NOT resolve
    assert alias_index.resolve("Walter") is None
    assert alias_index.display(clara_id) == "Clara Cram"
|
||||||
|
|
||||||
|
|
||||||
|
# Lower-case given names handed to persons.classify_name by the classification tests below.
GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}
|
||||||
|
|
||||||
|
def test_classify_unknown():
    """Question marks and "unbekannt" classify a name as UNKNOWN."""
    for text in ("?", "A. Kredell?", "unbekannt"):
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.UNKNOWN
|
||||||
|
|
||||||
|
def test_classify_prose():
    """Descriptive text, digits and quotes classify a cell as PROSE."""
    samples = (
        "Adressenliste v Clara Cram zur Kondolenz",
        "Clara de Gruyter(*1871)",   # digit
        '"Cramiade" Gedicht',        # quote
    )
    for text in samples:
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.PROSE
|
||||||
|
|
||||||
|
def test_classify_collective():
    """Family and group designations classify as COLLECTIVE."""
    for text in ("Familie", "Fam.Cram", "Eltern Cram", "seine Kinder"):
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.COLLECTIVE
|
||||||
|
|
||||||
|
def test_classify_relational():
    """A kinship word in front of a name classifies it as RELATIONAL."""
    for text in ("Cousine Emmy Haniel", "Schwester Hanni"):
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.RELATIONAL
|
||||||
|
|
||||||
|
def test_classify_single_token():
    """A lone word or a run of initials classifies as SINGLE_TOKEN."""
    for text in ("Agnes", "A.B."):
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.SINGLE_TOKEN
|
||||||
|
|
||||||
|
def test_classify_ambiguous_pair():
    """Two words that are both in the given-name set classify as AMBIGUOUS_PAIR."""
    for text in ("Ella Anita", "Kurt Georg"):
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.AMBIGUOUS_PAIR
|
||||||
|
|
||||||
|
def test_classify_resolvable_single_person():
    """First name + surname classifies as RESOLVABLE when the surname is not a given name."""
    # first + surname (surname not a given name) -> one real person, NOT ambiguous
    for text in ("Mieze Schefold", "Adolf Butenandt"):
        verdict = persons.classify_name(text, GIVEN)
        assert verdict is NameClass.RESOLVABLE
|
||||||
|
|
||||||
|
def test_build_given_names():
    """The given-name set holds normalized register first names plus the extra names."""
    register = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Eugenie"},
        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
    ])

    given = persons.build_given_names(register, {"Anita"})

    assert "eugenie" in given
    # primary + extra given names
    for name in ("charlotte", "meta"):
        assert name in given
    # from the extra set, normalized
    assert "anita" in given
    assert "schefold" not in given
|
||||||
457
tools/import-normalizer/tests/test_persons_tree.py
Normal file
457
tools/import-normalizer/tests/test_persons_tree.py
Normal file
@@ -0,0 +1,457 @@
|
|||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
# Put the parent of this file's directory first on sys.path before persons_tree is imported.
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import persons_tree
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_iso_string():
    """An ISO date string yields its year."""
    year = persons_tree._parse_year("1920-09-20")
    assert year == 1920
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_excel_serial_birth():
    """A numeric string holding an Excel serial is converted to its calendar year."""
    # 7568 days from 1899-12-30 = 1920-09-19 or -20 depending on leap counting
    serial = "7568"
    year = persons_tree._parse_year(serial)
    assert year == 1920
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_excel_serial_death():
    """A five-digit Excel serial is converted to its calendar year."""
    # 36222 days from 1899-12-30 is approximately 1999
    serial = "36222"
    year = persons_tree._parse_year(serial)
    assert year == 1999
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_excel_serial_small():
    """A three-digit Excel serial is converted to its calendar year."""
    # 177 days from 1899-12-30 = 1900-06-25
    serial = "177"
    year = persons_tree._parse_year(serial)
    assert year == 1900
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_german_date_string():
    """A day.month.year string yields its year."""
    year = persons_tree._parse_year("30.8.1862")
    assert year == 1862
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_year_only():
    """A bare four-digit year is returned as that year."""
    year = persons_tree._parse_year("1930")
    assert year == 1930
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_free_text():
    """A month name followed by a year yields the year."""
    year = persons_tree._parse_year("August 1941")
    assert year == 1941
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_none():
    """None in, None out."""
    year = persons_tree._parse_year(None)
    assert year is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_empty():
    """The empty string yields None."""
    year = persons_tree._parse_year("")
    assert year is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_unresolvable_truncated():
    """A date whose year was cut off yields None."""
    # "2.9.196" has no valid 4-digit year -- returns None
    truncated = "2.9.196"
    year = persons_tree._parse_year(truncated)
    assert year is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_typo_year():
    """A full date whose year fails the plausibility guard yields None."""
    # "4.3.1023" -- year 1023 outside 1700-2100 guard -- returns None
    typo = "4.3.1023"
    year = persons_tree._parse_year(typo)
    assert year is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_year_bare_out_of_range_year_is_none():
    """A bare out-of-range year is rejected and not re-read as an Excel serial."""
    # "1023" is a plausible typo for "1923" but is NOT an Excel serial.
    # parse_date("1023") parses it as year 1023 (out of 1700-2100 guard).
    # The serial branch must NOT re-interpret it as a serial.
    bare_year = "1023"
    year = persons_tree._parse_year(bare_year)
    assert year is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_generation_space():
    """"G 3" (with a space) yields generation 3."""
    generation = persons_tree._parse_generation("G 3")
    assert generation == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_generation_no_space():
    """"G3" (no space) yields generation 3."""
    generation = persons_tree._parse_generation("G3")
    assert generation == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_generation_extra_spaces():
    """A generation label for generation zero yields 0."""
    # NOTE(review): the literal shows a single space although the test name
    # says "extra spaces" -- confirm the intended input.
    generation = persons_tree._parse_generation("G 0")
    assert generation == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_generation_trailing_garbage():
    """Text after the generation number is ignored."""
    generation = persons_tree._parse_generation("G 2 de Gruyter")
    assert generation == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_generation_empty():
    """The empty string yields None."""
    generation = persons_tree._parse_generation("")
    assert generation is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_generation_none():
    """None in, None out."""
    generation = persons_tree._parse_generation(None)
    assert generation is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_basic():
    """A plain name is lower-cased."""
    normalized = persons_tree._norm_tree("Werner Allemeyer")
    assert normalized == "werner allemeyer"
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_diacritics():
    """An umlaut is transliterated (ö -> oe)."""
    normalized = persons_tree._norm_tree("Wöhler")
    assert normalized == "woehler"
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_strips_parens():
    """A parenthesized part is removed from the name."""
    normalized = persons_tree._norm_tree("Otto (Herbert)")
    assert normalized == "otto"
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_strips_quotes():
    """Surrounding double quotes are removed."""
    normalized = persons_tree._norm_tree('"Tante Lolly"')
    assert normalized == "tante lolly"
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_strips_geographic_suffix():
    """A trailing place name ("Aachen") is dropped."""
    normalized = persons_tree._norm_tree("Walter Cram Aachen")
    assert normalized == "walter cram"
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_strips_mexiko():
    """A trailing "Mexiko" is dropped."""
    normalized = persons_tree._norm_tree("Hans Cram Mexiko")
    assert normalized == "hans cram"
|
||||||
|
|
||||||
|
|
||||||
|
def test_norm_tree_collapses_whitespace():
    """Leading and trailing whitespace does not survive normalization."""
    padded = " Clara de Gruyter "
    normalized = persons_tree._norm_tree(padded)
    assert normalized == "clara de gruyter"
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_index_forward_lookup():
    """The index holds a "first last" key that maps to the person's row id."""
    werner = {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])

    key = "werner allemeyer"
    assert key in index
    assert index[key] == ["row_002"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_index_reversed_lookup():
    """The index also holds the "last first" key for the same person."""
    werner = {"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])

    hits = index.get("allemeyer werner")
    assert hits == ["row_002"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_index_maiden_name_lookup():
    """First name + maiden name is indexed, with the umlaut transliterated."""
    elsgard = {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}
    index = persons_tree._build_index([elsgard])

    hits = index.get("elsgard woehler")
    assert hits == ["row_002"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_index_single_token_fallback():
    """The bare last name is indexed as a single-token key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert])
    assert index.get("cram") == ["row_028"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_index_ambiguous_single_token():
    """Two persons sharing a last name are both listed under that key."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    assert set(index["cram"]) == {"row_028", "row_019"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_one_found():
    """A uniquely indexed name resolves to its row id with no failure reason."""
    werner = {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}
    index = persons_tree._build_index([werner])
    resolved = persons_tree._resolve_one("Allemeyer Werner", index)
    row_id, reason = resolved
    assert row_id == "row_003"
    assert reason is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_one_not_found():
    """Against an empty index, lookup yields no row id and reason 'not_found'."""
    empty_index = {}
    row_id, reason = persons_tree._resolve_one("Nobody Unknown", empty_index)
    assert row_id is None
    assert reason == "not_found"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_one_ambiguous():
    """A key shared by two persons yields no row id and reason 'ambiguous'."""
    herbert = {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}
    clara = {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}
    index = persons_tree._build_index([herbert, clara])
    row_id, reason = persons_tree._resolve_one("Cram", index)
    assert row_id is None
    assert reason == "ambiguous"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_row_serial_dates():
    """Numeric date strings are turned into years and every field is mapped."""
    fields = {
        "generation": "G 3",
        "last_name": "Allemeyer",
        "first_name": "Elsgard",
        "maiden_name": "Wöhler",
        "birth_date": "7568",
        "birth_place": "Garz",
        "death_date": "36222",
        "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner",
        "notes": "Nichte von Herbert",
    }
    person = persons_tree._parse_row(2, fields)

    expected = {
        "rowId": "row_002",
        "firstName": "Elsgard",
        "lastName": "Allemeyer",
        "maidenName": "Wöhler",
        "birthYear": 1920,
        "deathYear": 1999,
        "birthPlace": "Garz",
        "deathPlace": "Espelkamp",
        "generation": 3,
    }
    for key, value in expected.items():
        assert person[key] == value
    assert person["familyMember"] is True
    assert person["_spouse_raw"] == "Allemeyer Werner"
    assert person["_bemerkung_raw"] == "Nichte von Herbert"
    # A resolvable birth date must not leave a raw-date marker in the notes.
    assert "[Geburtsdatum" not in (person["notes"] or "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_row_string_birth_date():
    """A 'd.m.yyyy' birth date yields the year; empty death date and notes stay empty."""
    fields = {
        "generation": "G 2",
        "last_name": "Cram",
        "first_name": "Herbert",
        "maiden_name": "",
        "birth_date": "25.6.1890",
        "birth_place": "Texas",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    person = persons_tree._parse_row(28, fields)
    assert person["birthYear"] == 1890
    assert person["deathYear"] is None
    assert person["notes"] in (None, "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_row_unresolvable_date_goes_to_notes():
    """A year-less birth date is kept verbatim in the notes next to the remark."""
    fields = {
        "generation": "G 3",
        "last_name": "Heydrich",
        "first_name": "Dieter",
        "maiden_name": "",
        "birth_date": "28.9.",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "Bruder v Ingrid",
    }
    person = persons_tree._parse_row(96, fields)
    assert person["birthYear"] is None
    notes = person["notes"]
    assert "[Geburtsdatum: 28.9.]" in notes
    assert "Bruder v Ingrid" in notes
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_row_empty_spouse_and_notes():
    """Empty spouse and notes cells are reported as None in the raw fields."""
    fields = {
        "generation": "G 4",
        "last_name": "Allemeyer",
        "first_name": "Jürgen",
        "maiden_name": "",
        "birth_date": "",
        "birth_place": "",
        "death_date": "",
        "death_place": "",
        "spouse": "",
        "notes": "",
    }
    person = persons_tree._parse_row(4, fields)
    assert person["_spouse_raw"] is None
    assert person["_bemerkung_raw"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_no_duplicates():
    """Distinct persons all survive and nothing is reported as skipped."""
    elsgard = {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920}
    werner = {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923}
    kept, skipped = persons_tree._deduplicate([elsgard, werner])
    assert len(kept) == 2
    assert skipped == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_exact_match():
    """Rows 127/138 share first name, last name and birth year: only the first is kept."""
    first = {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}
    repeat = {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}
    kept, skipped = persons_tree._deduplicate([first, repeat])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_127"]
    assert len(skipped) == 1
    assert "row_138" in skipped[0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_none_birth_year_after_known():
    """Rows 129/139: a later row without birth year is dropped in favour of the dated one."""
    dated = {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964}
    undated = {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None}
    kept, skipped = persons_tree._deduplicate([dated, undated])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_129"]
    assert len(skipped) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_both_none_birth_year_kept():
    """Two same-named persons without any birth year: only the first one is kept."""
    first = {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None}
    second = {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None}
    kept, skipped = persons_tree._deduplicate([first, second])
    kept_ids = [person["rowId"] for person in kept]
    assert kept_ids == ["row_A"]
    assert len(skipped) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def _make_persons(*args):
|
||||||
|
"""Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples."""
|
||||||
|
return [
|
||||||
|
{"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3],
|
||||||
|
"_spouse_raw": a[4], "_bemerkung_raw": None,
|
||||||
|
"birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None,
|
||||||
|
"generation": None, "familyMember": True, "alias": None, "notes": None}
|
||||||
|
for a in args
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_spouses_success():
    """Two persons naming each other as spouse produce one SPOUSE_OF relation."""
    persons = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert len(relations) == 1
    relation = relations[0]
    assert relation["type"] == "SPOUSE_OF"
    assert {relation["personId"], relation["relatedPersonId"]} == {"row_002", "row_003"}
    assert unresolved == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_spouses_not_found():
    """A spouse text matching nobody is reported as unresolved with 'not_found'."""
    persons = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert relations == []
    assert len(unresolved) == 1
    failure = unresolved[0]
    assert failure["rowId"] == "row_007"
    assert failure["reason"] == "not_found"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_spouses_empty_spouse_field():
    """A person without spouse text yields neither relations nor unresolved entries."""
    persons = _make_persons(("row_004", "Jürgen", "Allemeyer", None, None))
    index = persons_tree._build_index(persons)
    relations, unresolved = persons_tree._resolve_spouses(persons, index)
    assert relations == []
    assert unresolved == []
|
||||||
|
|
||||||
|
|
||||||
|
def _register(*args):
    """Return ``(persons, index)`` built from (rowId, first, last, maiden) tuples."""
    persons = []
    for entry in args:
        persons.append(
            {"rowId": entry[0], "firstName": entry[1], "lastName": entry[2], "maidenName": entry[3]}
        )
    index = persons_tree._build_index(persons)
    return persons, index
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_sohn_two_parents():
    """'Sohn v A u B' yields a PARENT_OF relation from each parent to the child."""
    _persons, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_021", remark, index)
    assert len(relations) == 2
    assert all(relation["type"] == "PARENT_OF" for relation in relations)
    parents = {relation["personId"] for relation in relations}
    children = {relation["relatedPersonId"] for relation in relations}
    assert children == {"row_021"}
    assert {"row_019", "row_028"} <= parents
    assert unresolved == []
    assert notes == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_tochter_von():
    """'Tochter von X' yields one fully populated PARENT_OF relation and empty notes."""
    _persons, index = _register(("row_019", "Clara", "Cram", None))
    remark = "Tochter von Clara Cram"
    relations, unres, notes = persons_tree._parse_bemerkung("row_036", remark, index)
    expected = {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert len(relations) == 1
    assert relations[0] == expected
    assert notes == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_vater():
    """'Vater v X' makes the annotated row the parent of the named person."""
    _persons, index = _register(("row_028", "Herbert", "Cram", None))
    relations, unres, notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", index
    )
    assert len(relations) == 1
    relation = relations[0]
    assert relation["personId"] == "row_031"
    assert relation["relatedPersonId"] == "row_028"
    assert relation["type"] == "PARENT_OF"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_unmatched_parent_name():
    """With an empty index, the named parent is reported unresolved as 'not_found'."""
    _persons, empty_index = _register()
    relations, unresolved, notes = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", empty_index
    )
    assert relations == []
    assert len(unresolved) == 1
    assert unresolved[0]["reason"] == "not_found"
    assert notes == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_skip_nichte():
    """A 'Nichte von' remark creates no relation and is kept verbatim as notes."""
    _persons, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Nichte von Herbert"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_002", remark, index)
    assert relations == []
    assert unresolved == []
    assert notes == "Nichte von Herbert"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_skip_bruder():
    """A 'Bruder v' remark creates no relation and is kept verbatim as notes."""
    _persons, index = _register(("row_028", "Herbert", "Cram", None))
    remark = "Bruder v Herbert"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_033", remark, index)
    assert relations == []
    assert unresolved == []
    assert notes == "Bruder v Herbert"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_empty():
    """An empty remark produces empty relations, unresolved list and notes."""
    _persons, index = _register()
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_004", "", index)
    assert relations == []
    assert unresolved == []
    assert notes == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_plain_remark():
    """A remark without relation wording passes through unchanged as notes."""
    _persons, index = _register()
    remark = "Verfasserin der Cram-Chronik !!"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_029", remark, index)
    assert relations == []
    assert unresolved == []
    assert notes == "Verfasserin der Cram-Chronik !!"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_bemerkung_sohn_with_trailing_remark():
    """Text after the comma following the parent names is what remains as notes."""
    _persons, index = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    remark = "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert"
    relations, unresolved, notes = persons_tree._parse_bemerkung("row_021", remark, index)
    assert len(relations) == 2
    assert unresolved == []
    assert notes == "nach Mexiko emigriert"
|
||||||
|
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def test_dry_run_exits_zero(tmp_path):
    """Run the script with --dry-run: exit code 0, summary printed, no output file.

    Skipped when the source workbook is not present on disk.
    """
    tests_dir = Path(__file__).parent
    workbook = tests_dir.parent.parent.parent / "import" / "Personendatei 2.xlsx"
    if not workbook.exists():
        import pytest

        pytest.skip("source Excel file not present")

    output_file = tmp_path / "out.json"
    command = [
        sys.executable, str(tests_dir.parent / "persons_tree.py"),
        "--input", str(workbook),
        "--output", str(output_file),
        "--dry-run",
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    assert completed.returncode == 0, completed.stderr
    assert not output_file.exists()
    assert "persons parsed" in completed.stdout
|
||||||
191
tools/import-normalizer/tests/test_tags.py
Normal file
191
tools/import-normalizer/tests/test_tags.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
import tags
|
||||||
|
|
||||||
|
|
||||||
|
# --- classify_schlagwort ---
|
||||||
|
|
||||||
|
def test_semantic_tag_kept_as_themen():
    """A plain subject keyword is filed under the 'Themen' root."""
    classified = tags.classify_schlagwort("Brautbriefe")
    assert classified == ["Themen/Brautbriefe"]
|
||||||
|
|
||||||
|
def test_everyday_tag_kept_as_themen():
    """A multi-word subject keyword is filed under 'Themen' unchanged."""
    classified = tags.classify_schlagwort("Alltag in Ruhrort")
    assert classified == ["Themen/Alltag in Ruhrort"]
|
||||||
|
|
||||||
|
def test_event_tag_kept_as_themen():
    """An occasion keyword such as 'zur Hochzeit' is filed under 'Themen'."""
    classified = tags.classify_schlagwort("zur Hochzeit")
    assert classified == ["Themen/zur Hochzeit"]
|
||||||
|
|
||||||
|
def test_individual_correspondence_dropped():
    """'<person> an <person>' produces no tag at all."""
    classified = tags.classify_schlagwort("Clara an Herbert")
    assert classified == []
|
||||||
|
|
||||||
|
def test_individual_correspondence_with_year_dropped():
    """A trailing year does not rescue person-to-person correspondence."""
    classified = tags.classify_schlagwort("Herbert an Clara 1918")
    assert classified == []
|
||||||
|
|
||||||
|
def test_individual_with_role_dropped():
    """A sender given with a role word ('Vater Juan') is still dropped."""
    classified = tags.classify_schlagwort("Vater Juan an Herbert")
    assert classified == []
|
||||||
|
|
||||||
|
def test_relational_receiver_dropped():
    """A receiver described by relation ('ihre Mutter') is dropped."""
    classified = tags.classify_schlagwort("Clara an ihre Mutter")
    assert classified == []
|
||||||
|
|
||||||
|
def test_group_receiver_kinder_kept_as_briefwechsel():
    """The receiver 'Kinder' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Clara an Kinder")
    assert classified == ["Briefwechsel/Clara an Kinder"]
|
||||||
|
|
||||||
|
def test_group_receiver_eltern_kept():
    """The receiver 'seine Eltern' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Herbert an seine Eltern")
    assert classified == ["Briefwechsel/Herbert an seine Eltern"]
|
||||||
|
|
||||||
|
def test_group_receiver_geschwister_kept():
    """The receiver 'Geschwister' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Walter an Geschwister")
    assert classified == ["Briefwechsel/Walter an Geschwister"]
|
||||||
|
|
||||||
|
def test_group_receiver_schwiegereltern_kept():
    """The receiver 'Schwiegereltern' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Clara an Schwiegereltern")
    assert classified == ["Briefwechsel/Clara an Schwiegereltern"]
|
||||||
|
|
||||||
|
def test_group_receiver_soehne_kept():
    """The receiver 'ihre Söhne' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Mutter Cram an ihre Söhne")
    assert classified == ["Briefwechsel/Mutter Cram an ihre Söhne"]
|
||||||
|
|
||||||
|
def test_group_receiver_brueder_kept():
    """The receiver 'Brüder' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Hans an Brüder")
    assert classified == ["Briefwechsel/Hans an Brüder"]
|
||||||
|
|
||||||
|
def test_group_receiver_cousinen_kept():
    """The receiver 'Cousinen in Göttingen' files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Clara an Cousinen in Göttingen")
    assert classified == ["Briefwechsel/Clara an Cousinen in Göttingen"]
|
||||||
|
|
||||||
|
def test_group_receiver_freunde_kept():
    """'Freunde an Herbert' is filed under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Freunde an Herbert")
    assert classified == ["Briefwechsel/Freunde an Herbert"]
|
||||||
|
|
||||||
|
def test_group_sender_geschwister_kept():
    """A collective on the LEFT side of "an" also files the tag under 'Briefwechsel'."""
    classified = tags.classify_schlagwort("Geschwister Cram an Herbert")
    assert classified == ["Briefwechsel/Geschwister Cram an Herbert"]
|
||||||
|
|
||||||
|
def test_receiver_only_individual_dropped():
    """A keyword starting with "an " that names a single individual is dropped."""
    classified = tags.classify_schlagwort("an Walter de Gruyter")
    assert classified == []
|
||||||
|
|
||||||
|
def test_receiver_only_group_kept():
    """A keyword starting with "an " that names a collective is kept."""
    classified = tags.classify_schlagwort("an ihre Geschwister")
    assert classified == ["Briefwechsel/an ihre Geschwister"]
|
||||||
|
|
||||||
|
def test_abbreviated_sender_individual_dropped():
    """An abbreviated name fused with ".an" ("Maria W.an Clara") is still dropped."""
    classified = tags.classify_schlagwort("Maria W.an Clara")
    assert classified == []
|
||||||
|
|
||||||
|
def test_abbreviated_sender_group_kept():
    """A ".an" fused sender with the receiver 'Kinder' is kept."""
    classified = tags.classify_schlagwort("Eugenie sen.an Kinder")
    assert classified == ["Briefwechsel/Eugenie sen.an Kinder"]
|
||||||
|
|
||||||
|
def test_empty_schlagwort_returns_empty():
    """An empty keyword yields no tags."""
    classified = tags.classify_schlagwort("")
    assert classified == []
|
||||||
|
|
||||||
|
def test_einzelkinder_kept():
    """'Enkelkinder an Clara' is kept (the input is 'Enkelkinder', despite the test name)."""
    classified = tags.classify_schlagwort("Enkelkinder an Clara")
    assert classified == ["Briefwechsel/Enkelkinder an Clara"]
|
||||||
|
|
||||||
|
def test_geschw_abbreviation_kept():
    """The abbreviation "Geschw." after "u" on the receiver side keeps the tag."""
    keyword = "Bruder Hans an Herbert u Geschw."
    classified = tags.classify_schlagwort(keyword)
    assert classified == ["Briefwechsel/Bruder Hans an Herbert u Geschw."]
|
||||||
|
|
||||||
|
|
||||||
|
# --- mine_summary_candidates ---
|
||||||
|
|
||||||
|
def test_mine_candidates_counts_words():
    """Words are counted across all summaries under their lower-cased form."""
    mined = tags.mine_summary_candidates(["Reise, Hochzeit", "Reise", "Krieg"])
    counts = dict(mined)
    assert counts["reise"] == 2
    assert counts["hochzeit"] == 1
    assert counts["krieg"] == 1
|
||||||
|
|
||||||
|
def test_mine_candidates_filters_stop_words():
    """Articles and other function words never show up as candidates."""
    mined = tags.mine_summary_candidates(["und die Reise", "das ist eine Reise"])
    candidates = dict(mined)
    assert "reise" in candidates
    assert "und" not in candidates
    assert "die" not in candidates
    assert "das" not in candidates
    assert "ist" not in candidates
    assert "eine" not in candidates
|
||||||
|
|
||||||
|
def test_mine_candidates_filters_contracted_prepositions():
    """Contracted prepositions (im, zum, zur, vom, am, beim, ans) and 'sich' are filtered."""
    # im=in+dem, zum=zu+dem, zur=zu+der, vom=von+dem
    sentence = "im Sommer zum Besuch, zur Hochzeit vom Vater, sich gefreut am Morgen beim Fest"
    candidates = dict(tags.mine_summary_candidates([sentence]))
    stop_words = ("im", "zum", "zur", "vom", "sich", "am", "beim", "ans")
    for stop in stop_words:
        assert stop not in candidates, f"stop word '{stop}' leaked through"
    assert "besuch" in candidates
    assert "hochzeit" in candidates
|
||||||
|
|
||||||
|
def test_mine_candidates_filters_single_chars():
    """One-character tokens are never candidates."""
    mined = tags.mine_summary_candidates(["x Reise y"])
    candidates = dict(mined)
    assert "x" not in candidates
    assert "y" not in candidates
|
||||||
|
|
||||||
|
def test_mine_candidates_sorted_descending():
    """Candidates come back ordered by count, highest first."""
    summaries = ["Reise", "Reise", "Hochzeit", "Reise", "Hochzeit", "Krieg"]
    mined = tags.mine_summary_candidates(summaries)
    counts = [pair[1] for pair in ((word, count) for word, count in mined)]
    assert counts == sorted(counts, reverse=True)
|
||||||
|
|
||||||
|
def test_mine_candidates_empty_summaries():
    """No summaries, or only an empty summary, yield an empty candidate list."""
    without_input = tags.mine_summary_candidates([])
    assert without_input == []
    blank_only = tags.mine_summary_candidates([""])
    assert blank_only == []
|
||||||
|
|
||||||
|
|
||||||
|
# --- load_approved_themes and apply_approved_themes ---
|
||||||
|
|
||||||
|
def test_apply_themes_match_found():
    """An approved theme occurring in the text is emitted as 'Themen/<theme>'.

    The test touches no files, so it no longer requests the ``tmp_path``
    fixture (which made pytest create a temporary directory for nothing).
    """
    themes = {"reise", "hochzeit"}
    result = tags.apply_approved_themes("Reise nach Berlin", themes)
    assert "Themen/reise" in result
|
||||||
|
|
||||||
|
def test_apply_themes_case_insensitive():
    """An upper-case text still matches a lower-case approved theme.

    The test touches no files, so it no longer requests the ``tmp_path``
    fixture (which made pytest create a temporary directory for nothing).
    """
    themes = {"reise"}
    result = tags.apply_approved_themes("REISE", themes)
    assert "Themen/reise" in result
|
||||||
|
|
||||||
|
def test_apply_themes_no_match():
    """A text containing none of the approved themes yields an empty list.

    The test touches no files, so it no longer requests the ``tmp_path``
    fixture (which made pytest create a temporary directory for nothing).
    """
    themes = {"krieg"}
    result = tags.apply_approved_themes("Alltag in Ruhrort", themes)
    assert result == []
|
||||||
|
|
||||||
|
def test_apply_themes_multiple_matches():
    """Each approved theme found in the text contributes exactly one tag."""
    approved = {"reise", "hochzeit"}
    matched = tags.apply_approved_themes("Reise zur Hochzeit", approved)
    assert len(matched) == 2
    assert "Themen/reise" in matched
    assert "Themen/hochzeit" in matched
|
||||||
|
|
||||||
|
|
||||||
|
# --- encode_tags ---
|
||||||
|
|
||||||
|
def test_encode_tags_single():
    """A single tag path is encoded as itself."""
    encoded = tags.encode_tags(["Themen/Brautbriefe"])
    assert encoded == "Themen/Brautbriefe"
|
||||||
|
|
||||||
|
def test_encode_tags_multiple():
    """Several tag paths are joined with a pipe character."""
    paths = ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]
    encoded = tags.encode_tags(paths)
    assert encoded == "Themen/Brautbriefe|Briefwechsel/Clara an Kinder"
|
||||||
|
|
||||||
|
def test_encode_tags_empty():
    """An empty tag list is encoded as the empty string."""
    encoded = tags.encode_tags([])
    assert encoded == ""
|
||||||
|
|
||||||
|
|
||||||
|
# --- build_tag_tree ---
|
||||||
|
|
||||||
|
def test_build_tag_tree_includes_roots():
    """The root of every given path gets its own row in the tree."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"])
    listed_paths = [row["tag_path"] for row in tree]
    assert "Themen" in listed_paths
    assert "Briefwechsel" in listed_paths
|
||||||
|
|
||||||
|
def test_build_tag_tree_includes_children():
    """A child row carries its parent's name and its own leaf name."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe"])
    child = next(row for row in tree if row["tag_path"] == "Themen/Brautbriefe")
    assert child["parent_name"] == "Themen"
    assert child["tag_name"] == "Brautbriefe"
|
||||||
|
|
||||||
|
def test_build_tag_tree_root_has_empty_parent():
    """A root row has an empty parent name and is named after itself."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe"])
    root = next(row for row in tree if row["tag_path"] == "Themen")
    assert root["parent_name"] == ""
    assert root["tag_name"] == "Themen"
|
||||||
|
|
||||||
|
def test_build_tag_tree_no_duplicates():
    """Repeated input paths do not lead to repeated tree rows."""
    tree = tags.build_tag_tree(["Themen/Brautbriefe", "Themen/Alltag", "Themen/Brautbriefe"])
    listed_paths = [row["tag_path"] for row in tree]
    assert len(listed_paths) == len(set(listed_paths))
|
||||||
60
tools/import-normalizer/tests/test_writers.py
Normal file
60
tools/import-normalizer/tests/test_writers.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import csv
|
||||||
|
import openpyxl
|
||||||
|
import overrides
|
||||||
|
import writers
|
||||||
|
import documents
|
||||||
|
|
||||||
|
def test_load_overrides_missing_files(tmp_path):
    """Non-existent override files result in two empty mappings."""
    missing_dates = tmp_path / "dates.csv"
    missing_names = tmp_path / "names.csv"
    date_overrides, name_overrides = overrides.load_overrides(missing_dates, missing_names)
    assert date_overrides == {}
    assert name_overrides == {}
|
||||||
|
|
||||||
|
def test_load_overrides_parsed(tmp_path):
    """Date rows map raw -> (iso, precision); name rows map raw -> person id."""
    dates_csv = tmp_path / "dates.csv"
    names_csv = tmp_path / "names.csv"
    dates_csv.write_text("raw,iso,precision\n13.5.65,1965-05-13,DAY\n", encoding="utf-8")
    names_csv.write_text("raw,person_id\nEugenie Müller,de-gruyter-eugenie\n", encoding="utf-8")
    date_overrides, name_overrides = overrides.load_overrides(dates_csv, names_csv)
    assert date_overrides["13.5.65"] == ("1965-05-13", "DAY")
    assert name_overrides["Eugenie Müller"] == "de-gruyter-eugenie"
|
||||||
|
|
||||||
|
def test_write_documents_xlsx_joins_lists(tmp_path):
    """List-valued document fields end up pipe-joined in a single cell."""
    doc = documents.CanonicalDocument(
        index="W-0001",
        receiver_person_ids=["a", "b"],
        receiver_names=["A", "B"],
        tags=["Brautbriefe"],
        date_precision="DAY",
        needs_review=["unparsed_date"],
    )
    target = tmp_path / "docs.xlsx"
    writers.write_documents_xlsx([doc], target)

    sheet = openpyxl.load_workbook(target).active
    header = [cell.value for cell in sheet[1]]
    assert "receiver_person_ids" in header
    assert "needs_review" in header

    first_row = dict(zip(header, (cell.value for cell in sheet[2])))
    assert first_row["receiver_person_ids"] == "a|b"
    assert first_row["needs_review"] == "unparsed_date"
|
||||||
|
|
||||||
|
def test_write_documents_xlsx_pins_timestamp(tmp_path):
    """The workbook's ``created`` date is the pinned 2020-01-01, not today.

    Determinism requirement NFR-IDEM-01. Only ``created`` is checked here.
    """
    target = tmp_path / "d.xlsx"
    writers.write_documents_xlsx([documents.CanonicalDocument(index="W-0001")], target)
    created = openpyxl.load_workbook(target).properties.created
    assert (created.year, created.month, created.day) == (2020, 1, 1)
|
||||||
|
|
||||||
|
def test_write_review_csv(tmp_path):
    """The header is written first and cell values come back as strings."""
    out = tmp_path / "r.csv"
    writers.write_review_csv(out, ["raw", "count"], [["?", 3], ["x", 1]])
    # Read inside a context manager so the handle is closed deterministically;
    # newline="" is what the csv module expects for files it parses.
    with out.open(encoding="utf-8", newline="") as handle:
        rows = list(csv.reader(handle))
    assert rows[0] == ["raw", "count"]
    assert rows[1] == ["?", "3"]
|
||||||
|
|
||||||
|
def test_write_review_csv_defangs_formula_injection(tmp_path):
    """Cells starting with '=' or '-' are written with a leading apostrophe."""
    out = tmp_path / "r.csv"
    writers.write_review_csv(out, ["raw", "count"], [["=cmd|'/C calc'!A0", 1], ["-2+3", 2]])
    # Read inside a context manager so the handle is closed deterministically;
    # newline="" is what the csv module expects for files it parses.
    with out.open(encoding="utf-8", newline="") as handle:
        rows = list(csv.reader(handle))
    assert rows[1][0].startswith("'=")  # leading '=' neutralised
    assert rows[2][0].startswith("'-")
|
||||||
|
|
||||||
|
def test_write_summary_sections(tmp_path):
    """'#' keys become 'NAME:' section lines, other keys become 'key: value' lines."""
    target = tmp_path / "s.txt"
    stats = {"# INPUTS": "", "rows": 10, "# DATES": "", "unknown_date_rate": "3.2%"}
    writers.write_summary(target, stats)
    text = target.read_text(encoding="utf-8")
    assert "INPUTS:" in text
    assert "DATES:" in text
    assert " rows: 10" in text
|
||||||
86
tools/import-normalizer/writers.py
Normal file
86
tools/import-normalizer/writers.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""Write canonical .xlsx outputs and review .csv files."""
|
||||||
|
import csv
|
||||||
|
import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import openpyxl
|
||||||
|
|
||||||
|
_PIPE = "|"
|
||||||
|
# Pinned workbook metadata so reruns are content-deterministic (NFR-IDEM-01); openpyxl
|
||||||
|
# otherwise stamps docProps with the current time on every save.
|
||||||
|
_FIXED_TS = datetime.datetime(2020, 1, 1, 0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def _join(value):
    """Flatten *value* into cell text.

    ``None`` becomes the empty string, a list becomes its items' ``str()``
    forms joined with ``_PIPE``, and anything else is passed through ``str()``.
    """
    if value is None:
        return ""
    if isinstance(value, list):
        return _PIPE.join(map(str, value))
    return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _csv_safe(value):
|
||||||
|
"""Neutralise spreadsheet formula injection (CWE-1236) in human-opened review CSVs."""
|
||||||
|
s = "" if value is None else str(value)
|
||||||
|
return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s
|
||||||
|
|
||||||
|
|
||||||
|
# Column order of the documents workbook; _write_xlsx reads each name as an
# attribute of the record.
DOC_COLUMNS = [
    "index",
    "box",
    "folder",
    "sender_person_id",
    "sender_name",
    "receiver_person_ids",
    "receiver_names",
    "date_iso",
    "date_raw",
    "date_precision",
    "location",
    "tags",
    "summary",
    "source_row",
    "needs_review",
]

# Column order of the persons workbook.
PERSON_COLUMNS = [
    "person_id",
    "last_name",
    "first_name",
    "maiden_name",
    "title",
    "nickname",
    "birth_date",
    "birth_date_raw",
    "birth_place",
    "death_date",
    "death_date_raw",
    "death_place",
    "spouse",
    "generation",
    "notes",
    "aliases",
    "provisional",
]
|
||||||
|
|
||||||
|
|
||||||
|
def _write_xlsx(records, columns, path: Path):
    """Write *records* to a single-sheet workbook at *path*.

    The first row holds *columns*; each following row holds, per column, the
    record attribute of that name flattened with ``_join``. Missing parent
    directories of *path* are created.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(columns)
    for record in records:
        cells = []
        for column in columns:
            cells.append(_join(getattr(record, column)))
        sheet.append(cells)

    # Pin the document properties instead of leaving openpyxl's defaults.
    # NOTE(review): openpyxl's save path may re-stamp `modified` with the
    # current time -- confirm the pinned value survives `save`.
    workbook.properties.created = workbook.properties.modified = _FIXED_TS

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    workbook.save(path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_documents_xlsx(docs, path: Path):
    """Write *docs* to *path* as a workbook laid out by ``DOC_COLUMNS``."""
    _write_xlsx(records=docs, columns=DOC_COLUMNS, path=path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_tag_tree_xlsx(tree: list[dict], path: Path):
    """Write the tag tree rows to a single-sheet workbook at *path*.

    Columns are ``tag_path``, ``parent_name`` and ``tag_name``; a key missing
    from a row is written as an empty string. Missing parent directories of
    *path* are created.
    """
    columns = ["tag_path", "parent_name", "tag_name"]
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(columns)
    for node in tree:
        cells = [node.get(name, "") for name in columns]
        sheet.append(cells)

    # Pin the document properties instead of leaving openpyxl's defaults.
    # NOTE(review): openpyxl's save path may re-stamp `modified` with the
    # current time -- confirm the pinned value survives `save`.
    workbook.properties.created = workbook.properties.modified = _FIXED_TS

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    workbook.save(path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_persons_xlsx(people, path: Path):
    """Write *people* to *path* as a workbook laid out by ``PERSON_COLUMNS``."""
    _write_xlsx(records=people, columns=PERSON_COLUMNS, path=path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_review_csv(path: Path, header: list[str], rows: list[list]):
    """Write a UTF-8 review CSV: *header* first, then *rows*.

    Every data cell goes through ``_csv_safe``; the header is written as
    given. Missing parent directories of *path* are created.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows([_csv_safe(cell) for cell in row] for row in rows)
|
||||||
|
|
||||||
|
|
||||||
|
def write_summary(path: Path, stats: dict):
    """Render a grouped, scannable plain-text summary to *path*.

    Keys beginning with '#' are section headers: they emit a blank line
    followed by the key text (without the '#', stripped) and a colon; their
    values are ignored. Every other key is written as an indented
    ``key: value`` line. Missing parent directories of *path* are created and
    the file is written as UTF-8 with exactly one trailing newline.

    Args:
        path: Destination file.
        stats: Mapping with string keys, rendered in iteration order.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    lines = []
    for key, value in stats.items():
        if key.startswith("#"):
            lines.append("")
            lines.append(key[1:].strip() + ":")
        else:
            lines.append(f" {key}: {value}")
    # Drop only the blank line in front of the first section. A bare strip()
    # would also eat the indentation of a leading non-header entry, making it
    # look different from every other entry.
    body = "\n".join(lines).lstrip("\n").rstrip()
    target.write_text(body + "\n", encoding="utf-8")
|
||||||
Reference in New Issue
Block a user