Import normalizer: offline tool to normalize the raw archive spreadsheets #663

Merged
marcel merged 172 commits from docs/import-migration into main 2026-05-28 15:05:51 +02:00
2 changed files with 4 additions and 1 deletions
Showing only changes of commit b1b8fa4bed - Show all commits

View File

@@ -272,6 +272,7 @@ For multipart/form-data (file uploads): bypass the typed client and use `event.f
| Form display | German `dd.mm.yyyy` with auto-dot insertion via `handleDateInput()` | | Form display | German `dd.mm.yyyy` with auto-dot insertion via `handleDateInput()` |
| Wire format | ISO 8601 via a hidden `<input type="hidden" name="documentDate" value={dateIso}>` | | Wire format | ISO 8601 via a hidden `<input type="hidden" name="documentDate" value={dateIso}>` |
| Display | `new Intl.DateTimeFormat('de-DE', …).format(new Date(val + 'T12:00:00'))` | | Display | `new Intl.DateTimeFormat('de-DE', …).format(new Date(val + 'T12:00:00'))` |
| Honest precision display | `formatDocumentDate(iso, precision, end?, raw?, locale?)` (`$lib/shared/utils/documentDate.ts`) or the `<DocumentDate>` component — renders a document date at exactly its `meta_date_precision` (MONTH → "Juni 1916", never a fabricated day). It mirrors the Java `DocumentTitleFormatter`; both are pinned to `docs/date-label-fixtures.json` so the title and UI labels can't drift. `meta_date_raw` is untrusted — render it via default escaping, never `{@html}` (a CI guard enforces this). |
### Security checklist (new endpoint) ### Security checklist (new endpoint)

View File

@@ -16,7 +16,8 @@ System_Boundary(backend, "API Backend (Spring Boot)") {
Component(tagTreeLoader, "TagTreeImporter", "Spring Component", "Upserts the tag hierarchy from canonical-tag-tree.xlsx via TagService (by canonical tag_path).") Component(tagTreeLoader, "TagTreeImporter", "Spring Component", "Upserts the tag hierarchy from canonical-tag-tree.xlsx via TagService (by canonical tag_path).")
Component(personRegLoader, "PersonRegisterImporter", "Spring Component", "Upserts register persons from canonical-persons.xlsx via PersonService (by normalizer person_id).") Component(personRegLoader, "PersonRegisterImporter", "Spring Component", "Upserts register persons from canonical-persons.xlsx via PersonService (by normalizer person_id).")
Component(personTreeLoader, "PersonTreeImporter", "Spring Component", "Upserts tree persons + relationships from canonical-persons-tree.json via PersonService and RelationshipService.") Component(personTreeLoader, "PersonTreeImporter", "Spring Component", "Upserts tree persons + relationships from canonical-persons-tree.json via PersonService and RelationshipService.")
Component(docLoader, "DocumentImporter", "Spring Component", "Loads canonical-documents.xlsx: routes attribution register-first (raw cell always retained in sender_text/receiver_text), parses clean dates, keeps the S3 upload + thumbnail plumbing, and ports the path-traversal / homoglyph / absolute-path / %PDF magic-byte security guards.") Component(docLoader, "DocumentImporter", "Spring Component", "Loads canonical-documents.xlsx: routes attribution register-first (raw cell always retained in sender_text/receiver_text), parses clean dates, builds an honest precision-aware title via DocumentTitleFormatter, keeps the S3 upload + thumbnail plumbing, and ports the path-traversal / homoglyph / absolute-path / %PDF magic-byte security guards.")
Component(titleFmt, "DocumentTitleFormatter", "Pure helper", "Formats the date label baked into an import title at exactly the data's precision (MONTH -> 'Juni 1916', never a fabricated day). Mirrors the frontend formatDocumentDate; both are pinned to docs/date-label-fixtures.json (#666).")
Component(sheetReader, "CanonicalSheetReader", "POI helper", "Maps a canonical .xlsx by header name (no positional indices), splits pipe-delimited list columns, fails closed (IMPORT_ARTIFACT_INVALID) on a missing required header.") Component(sheetReader, "CanonicalSheetReader", "POI helper", "Maps a canonical .xlsx by header name (no positional indices), splits pipe-delimited list columns, fails closed (IMPORT_ARTIFACT_INVALID) on a missing required header.")
Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.") Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.")
Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.") Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.")
@@ -43,6 +44,7 @@ Rel(importOrch, docLoader, "4. Loads documents")
Rel(tagTreeLoader, sheetReader, "Reads canonical .xlsx") Rel(tagTreeLoader, sheetReader, "Reads canonical .xlsx")
Rel(personRegLoader, sheetReader, "Reads canonical .xlsx") Rel(personRegLoader, sheetReader, "Reads canonical .xlsx")
Rel(docLoader, sheetReader, "Reads canonical .xlsx") Rel(docLoader, sheetReader, "Reads canonical .xlsx")
Rel(docLoader, titleFmt, "Builds honest title date")
Rel(tagTreeLoader, tagSvc, "Upserts tags by source_ref") Rel(tagTreeLoader, tagSvc, "Upserts tags by source_ref")
Rel(personRegLoader, personSvc, "Upserts persons by source_ref") Rel(personRegLoader, personSvc, "Upserts persons by source_ref")
Rel(personTreeLoader, personSvc, "Upserts persons by source_ref") Rel(personTreeLoader, personSvc, "Upserts persons by source_ref")