diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 416b8597..f9553ab2 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -65,6 +65,29 @@ jobs: exit 1 fi + - name: Assert no raw document date rendered via {@html} (CWE-79 — #666) + shell: bash + run: | + # meta_date_raw is untrusted verbatim spreadsheet text — it must render via + # Svelte default escaping, never {@html}. This guard flags any {@html ...} + # whose expression references a raw-date variable. A comment mentioning + # "{@html}" without a raw token inside the braces does NOT match. + # The token list MUST cover every variable that carries the raw value: + # DocumentDate.svelte exposes it via the `raw` prop, so `\braw\b` is included. + # Grow this list whenever a new raw-bearing variable name is introduced. + pattern='\{@html[^}]*(metaDateRaw|documentDateRaw|rawDate|\braw\b)' + # Self-test: the regex must catch the dangerous forms and ignore the comment form. + printf '{@html doc.metaDateRaw}\n' | grep -qP "$pattern" \ + || { echo "FAIL: guard self-test — regex missed the unsafe {@html metaDateRaw} form"; exit 1; } + printf '{@html raw}\n' | grep -qP "$pattern" \ + || { echo "FAIL: guard self-test — regex missed the unsafe {@html raw} form (DocumentDate prop)"; exit 1; } + printf 'never use {@html} for this\n' | grep -qvP "$pattern" \ + || { echo "FAIL: guard self-test — regex wrongly flagged a {@html} comment"; exit 1; } + if grep -rPln "$pattern" --include='*.svelte' frontend/src/; then + echo "FAIL: meta_date_raw rendered via {@html} — use default {…} escaping (CWE-79, #666)." + exit 1 + fi + - name: Assert no (upload|download)-artifact past v3 shell: bash run: | diff --git a/.gitignore b/.gitignore index 60d3f1e8..753fb560 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,10 @@ node_modules/ # Repo uses npm; yarn.lock is ignored to avoid double-lockfile drift. 
frontend/yarn.lock + +**/.venv/ +**/__pycache__/ +*.pyc + +# Canonical import artifacts live only on the ops host (PII). +# See tools/import-normalizer/.gitignore — load-bearing for that policy. diff --git a/CLAUDE.md b/CLAUDE.md index c36ba70c..362baeac 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -87,7 +87,7 @@ backend/src/main/java/org/raddatz/familienarchiv/ ├── exception/ DomainException, ErrorCode, GlobalExceptionHandler ├── filestorage/ FileService (S3/MinIO) ├── geschichte/ Geschichte (story) domain -├── importing/ MassImportService +├── importing/ CanonicalImportOrchestrator + four loaders (TagTree/PersonRegister/PersonTree/Document) + CanonicalSheetReader ├── notification/ Notification domain + SseEmitterRegistry ├── ocr/ OCR domain — OcrService, OcrBatchService, training ├── person/ Person domain @@ -192,7 +192,8 @@ frontend/src/routes/ ├── persons/ │ ├── [id]/ Person detail │ ├── [id]/edit/ Person edit form -│ └── new/ Create person form +│ ├── new/ Create person form +│ └── review/ Triage view — confirm/rename/merge/delete provisional persons ├── briefwechsel/ Bilateral conversation timeline (Briefwechsel) ├── aktivitaeten/ Unified activity feed (Chronik) ├── geschichten/ Stories — list, [id], [id]/edit, new diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bace28d0..7b7ba8a0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -272,6 +272,7 @@ For multipart/form-data (file uploads): bypass the typed client and use `event.f | Form display | German `dd.mm.yyyy` with auto-dot insertion via `handleDateInput()` | | Wire format | ISO 8601 via a hidden `` | | Display | `new Intl.DateTimeFormat('de-DE', …).format(new Date(val + 'T12:00:00'))` | +| Honest precision display | `formatDocumentDate(iso, precision, end?, raw?, locale?)` (`$lib/shared/utils/documentDate.ts`) or the `<DocumentDate>` component — renders a document date at exactly its `meta_date_precision` (MONTH → "Juni 1916", never a fabricated day). 
It mirrors the Java `DocumentTitleFormatter`; both are pinned to `docs/date-label-fixtures.json` so the title and UI labels can't drift. `meta_date_raw` is untrusted — render it via default escaping, never `{@html}` (a CI guard enforces this). | ### Security checklist (new endpoint) diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 249221cc..b96d242a 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -34,7 +34,7 @@ src/main/java/org/raddatz/familienarchiv/ ├── exception/ # DomainException, ErrorCode, GlobalExceptionHandler ├── filestorage/ # FileService (S3/MinIO) ├── geschichte/ # Geschichte (story) domain -├── importing/ # MassImportService +├── importing/ # CanonicalImportOrchestrator + 4 loaders + CanonicalSheetReader ├── notification/ # Notification domain + SseEmitterRegistry ├── ocr/ # OCR domain — OcrService, OcrBatchService, training ├── person/ # Person domain — Person, PersonService, PersonController diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DatePrecision.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DatePrecision.java new file mode 100644 index 00000000..e67f17e1 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DatePrecision.java @@ -0,0 +1,17 @@ +package org.raddatz.familienarchiv.document; + +/** + * Precision of a document's date. Verbatim mirror of the import normalizer's + * {@code Precision} enum (tools/import-normalizer/dates.py) — the canonical output is the + * contract, so there is no translation layer. Do not add, remove, or rename values without + * also changing the normalizer; a mismatch silently breaks import idempotency (see ADR-025). 
+ */ +public enum DatePrecision { + DAY, + MONTH, + SEASON, + YEAR, + RANGE, + APPROX, + UNKNOWN +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/Document.java b/backend/src/main/java/org/raddatz/familienarchiv/document/Document.java index 71c6dead..7f702763 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/Document.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/Document.java @@ -91,6 +91,29 @@ public class Document { @Column(name = "meta_date") private LocalDate documentDate; // Wann wurde der Brief geschrieben? + // Precision of documentDate — drives honest rendering ("ca. 1943", "Frühjahr 1943"). + // Verbatim mirror of the normalizer's Precision enum (see ADR-025). + @Enumerated(EnumType.STRING) + @Column(name = "meta_date_precision", nullable = false, length = 16) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private DatePrecision metaDatePrecision = DatePrecision.UNKNOWN; + + // Range end — only set when metaDatePrecision is RANGE (open-ended ranges allowed → may be null). + @Column(name = "meta_date_end") + private LocalDate metaDateEnd; + + // Original date cell, verbatim, preserved for provenance and "as written" display. + @Column(name = "meta_date_raw", columnDefinition = "TEXT") + private String metaDateRaw; + + // Raw attribution preserved even when a person is linked via sender/receivers. 
+ @Column(name = "sender_text", columnDefinition = "TEXT") + private String senderText; + + @Column(name = "receiver_text", columnDefinition = "TEXT") + private String receiverText; + @Column(name = "meta_location") private String location; diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentBatchMetadataDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentBatchMetadataDTO.java index e9e47270..56553692 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentBatchMetadataDTO.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentBatchMetadataDTO.java @@ -12,6 +12,8 @@ public class DocumentBatchMetadataDTO { private UUID senderId; private List receiverIds; private LocalDate documentDate; + private DatePrecision metaDatePrecision; + private LocalDate metaDateEnd; private String location; private List tagNames; private Boolean metadataComplete; diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentController.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentController.java index f4bf72d3..daaa96c5 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentController.java @@ -313,9 +313,10 @@ public class DocumentController { @RequestParam(required = false) String tagQ, @RequestParam(required = false) DocumentStatus status, @RequestParam(required = false) String tagOp, + @RequestParam(required = false) Boolean undated, Authentication authentication) { TagOperator operator = "OR".equalsIgnoreCase(tagOp) ? 
TagOperator.OR : TagOperator.AND; - List ids = documentService.findIdsForFilter(q, from, to, senderId, receiverId, tags, tagQ, status, operator); + List ids = documentService.findIdsForFilter(q, from, to, senderId, receiverId, tags, tagQ, status, operator, Boolean.TRUE.equals(undated)); if (ids.size() > BULK_EDIT_FILTER_MAX_IDS) { throw DomainException.badRequest(ErrorCode.BULK_EDIT_TOO_MANY_IDS, "Filter matches " + ids.size() + " documents — refine filter (max " + BULK_EDIT_FILTER_MAX_IDS + ")"); @@ -375,6 +376,7 @@ public class DocumentController { @Parameter(description = "Sort field") @RequestParam(required = false) DocumentSort sort, @Parameter(description = "Sort direction: ASC or DESC") @RequestParam(required = false, defaultValue = "DESC") String dir, @Parameter(description = "Tag operator: AND (default) or OR") @RequestParam(required = false) String tagOp, + @Parameter(description = "Restrict to undated documents (meta_date IS NULL)") @RequestParam(required = false) Boolean undated, // @Max on page guards against overflow when pageable.getOffset() is computed // as page * size — Integer.MAX_VALUE * 50 would wrap to a negative long, which // Hibernate cheerfully turns into an invalid SQL OFFSET. @@ -387,7 +389,7 @@ public class DocumentController { // defaults to AND, which matches the frontend default and keeps old clients working. TagOperator operator = "OR".equalsIgnoreCase(tagOp) ? 
TagOperator.OR : TagOperator.AND; Pageable pageable = PageRequest.of(page, size); - return ResponseEntity.ok(documentService.searchDocuments(q, from, to, senderId, receiverId, tags, tagQ, status, sort, dir, operator, pageable)); + return ResponseEntity.ok(documentService.searchDocuments(q, from, to, senderId, receiverId, tags, tagQ, status, sort, dir, operator, Boolean.TRUE.equals(undated), pageable)); } @GetMapping(value = "/density", produces = MediaType.APPLICATION_JSON_VALUE) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentListItem.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentListItem.java index bf8c19a3..44edf84c 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentListItem.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentListItem.java @@ -19,6 +19,9 @@ public record DocumentListItem( String originalFilename, String thumbnailUrl, LocalDate documentDate, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + DatePrecision metaDatePrecision, + LocalDate metaDateEnd, Person sender, @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List receivers, diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSearchResult.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSearchResult.java index b04f7fa2..0ce1758a 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSearchResult.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSearchResult.java @@ -15,24 +15,45 @@ public record DocumentSearchResult( @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int pageSize, @Schema(requiredMode = Schema.RequiredMode.REQUIRED) - int totalPages + int totalPages, + /** + * Total number of undated documents (meta_date IS NULL) matching the current + * filter context (q/tags/sender/receiver/status) across ALL pages — not the + * undated rows on the current page. 
Computed independently of the "Nur + * undatierte" toggle so it never collapses to the page slice (issue #668). + */ + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + long undatedCount ) { /** * Single-page convenience factory used by empty-result shortcuts and by tests that - * don't care about paging. Treats the whole list as page 0 of itself. + * don't care about paging. Treats the whole list as page 0 of itself. The undated + * count defaults to 0 — the service overlays the real global count via + * {@link #withUndatedCount(long)} before returning. */ public static DocumentSearchResult of(List items) { int size = items.size(); - return new DocumentSearchResult(items, size, 0, size, size == 0 ? 0 : 1); + return new DocumentSearchResult(items, size, 0, size, size == 0 ? 0 : 1, 0L); } /** * Paged factory used by the service when it has a real Pageable + full match count - * (e.g. from Spring's Page<T> or from an in-memory sort-then-slice). + * (e.g. from Spring's Page<T> or from an in-memory sort-then-slice). The undated + * count defaults to 0 — the service overlays the real global count via + * {@link #withUndatedCount(long)} before returning. */ public static DocumentSearchResult paged(List slice, Pageable pageable, long totalElements) { int pageSize = pageable.getPageSize(); int totalPages = pageSize == 0 ? 0 : (int) ((totalElements + pageSize - 1) / pageSize); - return new DocumentSearchResult(slice, totalElements, pageable.getPageNumber(), pageSize, totalPages); + return new DocumentSearchResult(slice, totalElements, pageable.getPageNumber(), pageSize, totalPages, 0L); + } + + /** + * Returns a copy with the global undated count overlaid, leaving every other + * field untouched. Lets the service compute the count once and attach it to + * whichever result shape the search path produced. 
+ */ + public DocumentSearchResult withUndatedCount(long undatedCount) { + return new DocumentSearchResult(items, totalElements, pageNumber, pageSize, totalPages, undatedCount); } } diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java index 65bd7cd7..8108c997 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentService.java @@ -171,7 +171,7 @@ public class DocumentService { hasFts, ftsIds, null, null, filters.sender(), filters.receiver(), filters.tags(), filters.tagQ(), - filters.status(), filters.tagOperator()); + filters.status(), filters.tagOperator(), false); return documentRepository.findAll(spec).stream() .map(Document::getDocumentDate) .filter(Objects::nonNull) @@ -378,6 +378,7 @@ public class DocumentService { // 1. Einfache Felder Update doc.setTitle(dto.getTitle()); doc.setDocumentDate(dto.getDocumentDate()); + applyDatePrecision(doc, dto); doc.setLocation(dto.getLocation()); doc.setTranscription(dto.getTranscription()); doc.setSummary(dto.getSummary()); @@ -446,6 +447,25 @@ public class DocumentService { return saved; } + /** + * Applies the three date-precision fields only when the DTO carries them. + * A null field means "not submitted" — overwriting the stored value with null + * would fabricate a precision the user never chose, the exact dishonesty #666 + * exists to prevent. A row with a genuinely-unknown precision must keep it when + * an unrelated edit (e.g. a location typo) is saved. 
+ */ + private void applyDatePrecision(Document doc, DocumentUpdateDTO dto) { + if (dto.getMetaDatePrecision() != null) { + doc.setMetaDatePrecision(dto.getMetaDatePrecision()); + } + if (dto.getMetaDateEnd() != null) { + doc.setMetaDateEnd(dto.getMetaDateEnd()); + } + if (dto.getMetaDateRaw() != null) { + doc.setMetaDateRaw(dto.getMetaDateRaw()); + } + } + @Transactional public Document updateDocumentTags(UUID docId, List tagNames) { Document doc = documentRepository.findById(docId) @@ -481,7 +501,8 @@ public class DocumentService { */ @Transactional(readOnly = true) public List findIdsForFilter(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, - List tags, String tagQ, DocumentStatus status, TagOperator tagOperator) { + List tags, String tagQ, DocumentStatus status, TagOperator tagOperator, + boolean undated) { boolean hasText = StringUtils.hasText(text); List rankedIds = null; if (hasText) { @@ -490,7 +511,7 @@ public class DocumentService { } Specification spec = buildSearchSpec( - hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator); + hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator, undated); return documentRepository.findAll(spec).stream().map(Document::getId).toList(); } @@ -504,7 +525,8 @@ public class DocumentService { LocalDate from, LocalDate to, UUID sender, UUID receiver, List tags, String tagQ, - DocumentStatus status, TagOperator tagOperator) { + DocumentStatus status, TagOperator tagOperator, + boolean undated) { boolean useOrLogic = tagOperator == TagOperator.OR; List> expandedTagSets = tagService.expandTagNamesToDescendantIdSets(tags); Specification textSpec = hasText ? 
hasIds(ftsIds) : (root, query, cb) -> null; @@ -514,7 +536,8 @@ public class DocumentService { .and(hasReceiver(receiver)) .and(hasTags(expandedTagSets, useOrLogic)) .and(hasTagPartial(tagQ)) - .and(hasStatus(status)); + .and(hasStatus(status)) + .and(undatedOnly(undated)); } /** @@ -643,22 +666,62 @@ public class DocumentService { } // 1. Allgemeine Suche (für das Suchfeld im Frontend) - public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, Pageable pageable) { + public DocumentSearchResult searchDocuments(String text, LocalDate from, LocalDate to, UUID sender, UUID receiver, List tags, String tagQ, DocumentStatus status, DocumentSort sort, String dir, TagOperator tagOperator, boolean undated, Pageable pageable) { boolean hasText = StringUtils.hasText(text); - // Pure-text RELEVANCE: push pagination into SQL — skip findAllMatchingIdsByFts entirely (ADR-008). - if (isPureTextRelevance(hasText, sort, from, to, sender, receiver, tags, tagQ, status)) { + // Pure-text RELEVANCE: push pagination + ts_rank ordering into SQL — skip + // findAllMatchingIdsByFts entirely (ADR-008). This must run BEFORE any + // findAllMatchingIdsByFts call so the fast path is preserved. An active undated + // filter must NOT take this path: it bypasses buildSearchSpec, so the + // undatedOnly predicate would be silently dropped. By definition this path has + // no date/sender/receiver/tag/status filters, and undated documents are valid + // FTS hits already folded into the ranked page, so there is no separate undated + // count to report here. 
+ if (!undated && isPureTextRelevance(hasText, sort, from, to, sender, receiver, tags, tagQ, status)) { return relevanceSortedPageFromSql(text, pageable); } List rankedIds = null; if (hasText) { rankedIds = documentRepository.findAllMatchingIdsByFts(text); + // FTS matched nothing → no results and, by definition, no undated matches either. if (rankedIds.isEmpty()) return DocumentSearchResult.of(List.of()); } + // Global undated count for the current filter (q/tags/sender/receiver/status), + // forcing undatedOnly(true) and IGNORING the user's "Nur undatierte" toggle so + // it never collapses to the page slice and never double-counts (issue #668). + long undatedCount = countUndatedForFilter(hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator); + + return runSearch(text, hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, sort, dir, tagOperator, undated, pageable) + .withUndatedCount(undatedCount); + } + + /** + * Counts every undated document (meta_date IS NULL) matching the active filter, + * across all pages, independent of the undated toggle. Reuses {@link #buildSearchSpec} + * with {@code undated=true} forced so the count tracks q/tags/sender/receiver/status. + * A {@code from}/{@code to} range excludes undated rows by the collision rule (#668), + * so the count is legitimately 0 inside a date range. + */ + private long countUndatedForFilter(boolean hasText, List ftsIds, + LocalDate from, LocalDate to, UUID sender, UUID receiver, + List tags, String tagQ, DocumentStatus status, TagOperator tagOperator) { + Specification undatedSpec = buildSearchSpec( + hasText, ftsIds, from, to, sender, receiver, tags, tagQ, status, tagOperator, true); + return documentRepository.count(undatedSpec); + } + + /** The original search dispatch — produces the page slice + totals, sans undated count. 
*/ + private DocumentSearchResult runSearch(String text, boolean hasText, List rankedIds, + LocalDate from, LocalDate to, UUID sender, UUID receiver, + List tags, String tagQ, DocumentStatus status, + DocumentSort sort, String dir, TagOperator tagOperator, + boolean undated, Pageable pageable) { + // The pure-text RELEVANCE fast path is handled by the caller (searchDocuments) + // before findAllMatchingIdsByFts runs, so it never reaches here (ADR-008). Specification spec = buildSearchSpec( - hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator); + hasText, rankedIds, from, to, sender, receiver, tags, tagQ, status, tagOperator, undated); // SENDER and RECEIVER sorts load the full match set and slice in-memory. // JPA's Sort.by("sender.lastName") generates an INNER JOIN that silently drops @@ -758,6 +821,8 @@ public class DocumentService { doc.getOriginalFilename(), doc.getThumbnailUrl(), doc.getDocumentDate(), + doc.getMetaDatePrecision(), + doc.getMetaDateEnd(), doc.getSender(), List.copyOf(doc.getReceivers()), List.copyOf(doc.getTags()), @@ -780,7 +845,15 @@ public class DocumentService { private Sort resolveSort(DocumentSort sort, String dir) { Sort.Direction direction = "ASC".equalsIgnoreCase(dir) ? Sort.Direction.ASC : Sort.Direction.DESC; if (sort == null || sort == DocumentSort.DATE || sort == DocumentSort.RELEVANCE) { - return Sort.by(direction, "documentDate"); + // Undated documents (null documentDate) must order last regardless of + // direction — Postgres puts NULLs FIRST on ASC by default, which would + // surface the undated pile at the top with no explanation (issue #668). + // The title tiebreaker gives a stable total order when every row is + // null-dated (the "Nur undatierte" filter), so pagination is deterministic. + // title is @Column(nullable=false), so it is always present. 
+ return Sort.by( + new Sort.Order(direction, "documentDate").nullsLast(), + Sort.Order.asc("title")); } // SENDER and RECEIVER are sorted in-memory before this method is called return switch (sort) { diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSpecifications.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSpecifications.java index 22339a95..ff238a43 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSpecifications.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentSpecifications.java @@ -55,6 +55,12 @@ public class DocumentSpecifications { return (root, query, cb) -> status == null ? null : cb.equal(root.get("status"), status); } + // Filtert auf undatierte Dokumente (meta_date IS NULL) — für die "Nur undatierte"-Triage. + // false → kein Prädikat (no-op), true → documentDate IS NULL (issue #668). + public static Specification undatedOnly(boolean undated) { + return (root, query, cb) -> undated ? cb.isNull(root.get("documentDate")) : null; + } + /** * Filtert nach vorausgeweiteten Tag-ID-Sets mit AND- oder OR-Logik. 
* diff --git a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentUpdateDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentUpdateDTO.java index 3bfda02c..118113e3 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentUpdateDTO.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/document/DocumentUpdateDTO.java @@ -11,6 +11,11 @@ import org.raddatz.familienarchiv.ocr.ScriptType; public class DocumentUpdateDTO { private String title; private LocalDate documentDate; + private DatePrecision metaDatePrecision; + private LocalDate metaDateEnd; + private String metaDateRaw; + private String senderText; + private String receiverText; private String location; private String documentLocation; private String archiveBox; diff --git a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java index 54802f86..d9d0d8b2 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java @@ -40,6 +40,8 @@ public enum ErrorCode { // --- Import --- /** A mass import is already in progress; only one can run at a time. 409 */ IMPORT_ALREADY_RUNNING, + /** A canonical import artifact is missing, unreadable, or missing a required header. 400 */ + IMPORT_ARTIFACT_INVALID, // --- Thumbnails --- /** A thumbnail backfill is already in progress; only one can run at a time. 
409 */ diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java new file mode 100644 index 00000000..2107bfda --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java @@ -0,0 +1,94 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.io.File; +import java.time.LocalDateTime; +import java.util.List; + +/** + * Runs the four canonical loaders in their real dependency order — encoded explicitly + * here, not implied by call order — and owns the async runner plus the {@link ImportStatus} + * state machine the admin UI consumes. The orchestrator smoke-checks that all four + * artifacts are present before starting, failing fast rather than half-loading tags but no + * documents. A malformed artifact (a loader throwing) sets {@code FAILED}; an individual + * bad file is surfaced through the {@link ImportStatus.SkippedFile} mechanism instead. 
+ */ +@Service +@RequiredArgsConstructor +@Slf4j +public class CanonicalImportOrchestrator { + + private static final String TAG_TREE_ARTIFACT = "canonical-tag-tree.xlsx"; + private static final String PERSONS_ARTIFACT = "canonical-persons.xlsx"; + private static final String PERSONS_TREE_ARTIFACT = "canonical-persons-tree.json"; + private static final String DOCUMENTS_ARTIFACT = "canonical-documents.xlsx"; + + private final TagTreeImporter tagTreeImporter; + private final PersonRegisterImporter personRegisterImporter; + private final PersonTreeImporter personTreeImporter; + private final DocumentImporter documentImporter; + + @Value("${app.import.dir:/import}") + private String canonicalDir; + + private volatile ImportStatus currentStatus = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + + public ImportStatus getStatus() { + return currentStatus; + } + + @Async + public void runImportAsync() { + if (currentStatus.state() == ImportStatus.State.RUNNING) { + throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress"); + } + runImport(); + } + + /** Synchronous entry point — wrapped by {@link #runImportAsync()} and called directly in tests. */ + void runImport() { + currentStatus = new ImportStatus(ImportStatus.State.RUNNING, "IMPORT_RUNNING", + "Import läuft...", 0, List.of(), LocalDateTime.now()); + try { + File tagTree = requireArtifact(TAG_TREE_ARTIFACT); + File persons = requireArtifact(PERSONS_ARTIFACT); + File personsTree = requireArtifact(PERSONS_TREE_ARTIFACT); + File documents = requireArtifact(DOCUMENTS_ARTIFACT); + + // Dependency DAG: documents need persons + tags; the tree needs persons. 
+ tagTreeImporter.load(tagTree); + personRegisterImporter.load(persons); + personTreeImporter.load(personsTree); + DocumentImporter.LoadResult result = documentImporter.load(documents); + + currentStatus = new ImportStatus(ImportStatus.State.DONE, "IMPORT_DONE", + "Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.", + result.processed(), result.skippedFiles(), currentStatus.startedAt()); + } catch (DomainException e) { + log.error("Canonical import failed: {}", e.getMessage()); + currentStatus = new ImportStatus(ImportStatus.State.FAILED, "IMPORT_FAILED_ARTIFACT", + "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); + } catch (Exception e) { + log.error("Canonical import failed", e); + currentStatus = new ImportStatus(ImportStatus.State.FAILED, "IMPORT_FAILED_INTERNAL", + "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); + } + } + + private File requireArtifact(String name) { + File artifact = new File(canonicalDir, name); + if (!artifact.isFile()) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Missing canonical artifact: " + name); + } + return artifact; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalSheetReader.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalSheetReader.java new file mode 100644 index 00000000..ece6a7ca --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalSheetReader.java @@ -0,0 +1,133 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.DateUtil; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.ss.usermodel.WorkbookFactory; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; + +import java.io.File; +import java.io.FileInputStream; +import 
java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Value-level POI helper for the canonical import artifacts. No Spring, no domain + * knowledge: it opens a workbook, maps the header row to column indices by name, and + * yields typed rows whose cells are looked up by header name — the seam that replaces + * the old positional {@code @Value app.import.col.*} indices. List columns are split on + * the pipe delimiter the normalizer emits. + */ +public final class CanonicalSheetReader { + + private CanonicalSheetReader() { + } + + /** A single data row, addressable by canonical header name (never by index). */ + public static final class Row { + + private final Map headerIndex; + private final List cells; + + private Row(Map headerIndex, List cells) { + this.headerIndex = headerIndex; + this.cells = cells; + } + + /** Trimmed cell value for the named header, or "" when absent/blank. */ + public String get(String header) { + Integer index = headerIndex.get(header); + if (index == null || index >= cells.size()) return ""; + String value = cells.get(index); + return value == null ? "" : value.trim(); + } + } + + /** + * Reads all data rows from the first sheet, validating that every required header is + * present. Throws a fail-closed {@link DomainException} on a missing header so a + * loader never silently maps the wrong column. 
+     */
+    public static List<Row> readRows(File file, List<String> requiredHeaders) {
+        try (FileInputStream fis = new FileInputStream(file);
+             Workbook workbook = WorkbookFactory.create(fis)) {
+
+            Sheet sheet = workbook.getSheetAt(0);
+            org.apache.poi.ss.usermodel.Row headerRow = sheet.getRow(sheet.getFirstRowNum());
+            Map<String, Integer> headerIndex = mapHeaders(headerRow);
+            requireHeaders(file, headerIndex, requiredHeaders);
+
+            // Pad every row up to the right-most mapped header column. headerIndex.size() only
+            // counts distinct non-empty headers, so a blank or duplicate header cell would
+            // otherwise leave a short row without a slot for the last mapped column.
+            int columnCount = headerIndex.values().stream()
+                    .mapToInt(Integer::intValue)
+                    .max()
+                    .orElse(-1) + 1;
+
+            List<Row> rows = new ArrayList<>();
+            for (int i = sheet.getFirstRowNum() + 1; i <= sheet.getLastRowNum(); i++) {
+                org.apache.poi.ss.usermodel.Row poiRow = sheet.getRow(i);
+                // Physically absent rows are dropped, so a position in the returned list is
+                // not guaranteed to equal the spreadsheet row number.
+                if (poiRow == null) continue;
+                rows.add(new Row(headerIndex, readCells(poiRow, columnCount)));
+            }
+            return rows;
+        } catch (DomainException e) {
+            // Already a domain error (e.g. a missing required header): pass it through as-is.
+            throw e;
+        } catch (Exception e) {
+            // Anything else (I/O failure, not a workbook, ...) is reported as an invalid artifact.
+            throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID,
+                    "Unreadable canonical artifact: " + file.getName());
+        }
+    }
+
+    /** Splits a pipe-delimited list column into trimmed, non-empty segments. */
+    public static List<String> splitList(String raw) {
+        if (raw == null || raw.isBlank()) return List.of();
+        return Arrays.stream(raw.split("\\|"))
+                .map(String::trim)
+                .filter(s -> !s.isEmpty())
+                .toList();
+    }
+
+    // Maps each trimmed, non-empty header name to its column index. The first occurrence of
+    // a duplicated name wins (putIfAbsent); a missing header row yields an empty map.
+    private static Map<String, Integer> mapHeaders(org.apache.poi.ss.usermodel.Row headerRow) {
+        if (headerRow == null) {
+            return Map.of();
+        }
+        Map<String, Integer> headerIndex = new HashMap<>();
+        for (int c = 0; c < headerRow.getLastCellNum(); c++) {
+            String name = cellToString(headerRow.getCell(c)).trim();
+            if (!name.isEmpty()) headerIndex.putIfAbsent(name, c);
+        }
+        return headerIndex;
+    }
+
+    // Fails with IMPORT_ARTIFACT_INVALID on the first required header that is not mapped.
+    private static void requireHeaders(File file, Map<String, Integer> headerIndex, List<String> requiredHeaders) {
+        for (String header : requiredHeaders) {
+            if (!headerIndex.containsKey(header)) {
+                throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID,
+                        "Missing required header '" + header + "' in artifact " + file.getName());
+            }
+        }
+    }
+
+    // Reads one sheet row as strings, padded with "" up to at least columnCount cells.
+    private static List<String> readCells(org.apache.poi.ss.usermodel.Row poiRow, int columnCount) {
+        int width = Math.max(columnCount, poiRow.getLastCellNum());
+        List<String> cells = new ArrayList<>(width);
+        for (int c = 0; c < width; c++) {
+            cells.add(cellToString(poiRow.getCell(c)));
+        }
+        return cells;
+    }
+
+    // Renders a cell as text. A missing cell and every cell type not listed below (the
+    // default branch) read as the empty string.
+    private static String cellToString(Cell cell) {
+        if (cell == null) return "";
+        return switch (cell.getCellType()) {
+            case STRING -> cell.getStringCellValue();
+            case NUMERIC -> {
+                if (DateUtil.isCellDateFormatted(cell)) {
+                    yield cell.getLocalDateTimeCellValue().toLocalDate().toString();
+                }
+                double value = cell.getNumericCellValue();
+                // Whole numbers render without a decimal point ("12", not "12.0"); a fractional
+                // value keeps its digits instead of being silently truncated by a (long) cast.
+                yield value == Math.rint(value)
+                        ? String.valueOf((long) value)
+                        : String.valueOf(value);
+            }
+            case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
+            default -> "";
+        };
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
new file mode 100644
index 00000000..b85a8cc6
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
@@ -0,0 +1,391 @@
+package org.raddatz.familienarchiv.importing;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.document.DatePrecision;
+import org.raddatz.familienarchiv.document.Document;
+import org.raddatz.familienarchiv.document.DocumentService;
+import org.raddatz.familienarchiv.document.DocumentStatus;
+import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
+import org.raddatz.familienarchiv.exception.DomainException;
+import org.raddatz.familienarchiv.exception.ErrorCode;
+import org.raddatz.familienarchiv.person.Person;
+import org.raddatz.familienarchiv.person.PersonService;
+import org.raddatz.familienarchiv.person.PersonType;
+import org.raddatz.familienarchiv.person.PersonUpsertCommand;
+import org.raddatz.familienarchiv.tag.Tag;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+import org.springframework.transaction.annotation.Transactional;
+import
software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.model.PutObjectRequest;
+
+import org.raddatz.familienarchiv.tag.TagService;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.regex.Pattern;
+
+/**
+ * Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
+ * semantic transformation: the normalizer already resolved people to slugs and dates to
+ * ISO values. This loader maps columns by header name, routes each attribution
+ * register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
+ * parses clean dates, and keeps the S3/thumbnail plumbing.
+ *
+ * <p>The import corpus is uniform — every PDF is named {@code <index>.pdf} flat in the import
+ * dir — so a document's PDF is resolved directly by its index:
+ * {@code importDir.resolve(index + ".pdf")}. The {@code index} is still hostile input
+ * regardless of upstream trust (CWE-22 does not care it came from our Python tool): it is
+ * validated against a strict catalog pattern with {@link #isValidImportIndex} (no path
+ * separators, no {@code .}/{@code ..}, no absolute path, no slash homoglyphs) and the
+ * resolved path is asserted to stay inside the import dir in {@link #resolvePdfByIndex} as
+ * defense-in-depth. The {@code %PDF} magic-byte check still gates upload.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class DocumentImporter {
+
+    static final List<String> REQUIRED_HEADERS = List.of(
+            "index", "sender_person_id", "sender_name",
+            "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
+
+    // Catalog index shape: 1–4 letters (ASCII + Latin-1 letters, e.g. the German "ü" in
+    // "Mü-0001"), one or more hyphens (the corpus has a few "C--0029" data-entry artefacts),
+    // digits, and an optional trailing "x" the normalizer recognises. Anchored, with no
+    // separator / dot / slash characters in the class, so "<index>.pdf" can never traverse.
+    // NOTE: `\d` here is intentionally ASCII-only ([0-9]). Java's java.util.regex matches `\d`
+    // against [0-9] unless Pattern.UNICODE_CHARACTER_CLASS is set — do NOT add that flag, or
+    // Arabic-Indic / fullwidth digits would silently widen the accepted set.
+    private static final Pattern INDEX_PATTERN =
+            Pattern.compile("[A-Za-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF]{1,4}-+\\d+x?");
+
+    private final DocumentService documentService;
+    private final PersonService personService;
+    private final TagService tagService;
+    private final S3Client s3Client;
+    private final ThumbnailAsyncRunner thumbnailAsyncRunner;
+    private final FileStreamOpener fileStreamOpener;
+
+    @Value("${app.s3.bucket:familienarchiv}")
+    private String bucketName;
+
+    @Value("${app.import.dir:/import}")
+    private String importDir;
+
+    /** Outcome of loading the document sheet: processed count + per-file skips. */
+    public record LoadResult(int processed, List<ImportStatus.SkippedFile> skippedFiles) {}
+
+    // One transaction for the whole sheet keeps the Hibernate session open so an existing
+    // document's lazy receivers collection initialises during an idempotent re-import.
+    // Invoked cross-bean from the orchestrator, so the @Transactional proxy applies.
+    @Transactional
+    public LoadResult load(File artifact) {
+        List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
+        int processed = 0;
+        List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
+        // 1-based source row number for ops triage breadcrumbs (the spreadsheet header is row 1,
+        // so the first data row is row 2 — matches what an operator sees in the .xlsx).
+        // NOTE(review): this counts entries of the list returned by readRows, which drops
+        // physically absent sheet rows — after such a gap the number runs behind the real
+        // spreadsheet row. Confirm whether the reader should carry the true row number.
+        int rowNumber = 1;
+        for (CanonicalSheetReader.Row row : rows) {
+            rowNumber++;
+            String index = row.get("index");
+            if (index.isBlank()) continue;
+            Optional<ImportStatus.SkipReason> skipReason = importRow(row, index, rowNumber);
+            if (skipReason.isPresent()) {
+                skipped.add(new ImportStatus.SkippedFile(index, skipReason.get()));
+            } else {
+                processed++;
+            }
+        }
+        log.info("Imported {} documents from {} ({} skipped)", processed, artifact.getName(), skipped.size());
+        return new LoadResult(processed, skipped);
+    }
+
+    // Validates the index, locates and signature-checks the PDF, then persists the row.
+    // Returns the skip reason, or empty when the row was imported (with or without a file).
+    private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index, int rowNumber) {
+        if (!isValidImportIndex(index)) {
+            // Breadcrumb is the source row number, NOT the raw (possibly-hostile) index — an
+            // operator triaging the import can find the offending row in the .xlsx without us
+            // echoing attacker-controlled input into the log.
+            log.warn("Skipping import row {}: index rejected (fails catalog-shape validation)", rowNumber);
+            return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
+        }
+        Optional<File> resolved = resolvePdfByIndex(index, rowNumber);
+        if (resolved.isEmpty()) {
+            // Distinct from the "index rejected" skip above: the index is VALID but no
+            // <index>.pdf is on disk, so the row becomes a normal PLACEHOLDER (not skipped). The
+            // index is a validated catalog id (no hostile content), so it is safe to log here —
+            // this surfaces a corpus that drifts from the "<index>.pdf" assumption (e.g. a file
+            // that arrived under a different name) rather than dropping it silently.
+            log.info("Import row {}: index {} is valid but {}.pdf is absent — creating PLACEHOLDER",
+                    rowNumber, index, index);
+        } else {
+            try {
+                if (!isPdfMagicBytes(resolved.get())) {
+                    return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
+                }
+            } catch (IOException e) {
+                // Log the row number like every other breadcrumb in this method; the index has
+                // passed validation by this point, so it is safe to include as well.
+                log.error("Magic-byte check failed for row {} (index {})", rowNumber, index, e);
+                return Optional.of(ImportStatus.SkipReason.FILE_READ_ERROR);
+            }
+        }
+        return persist(row, index, resolved);
+    }
+
+    // Creates or refreshes the document for one row. A non-PLACEHOLDER document with the same
+    // originalFilename is left alone (ALREADY_EXISTS); an existing PLACEHOLDER is updated.
+    private Optional<ImportStatus.SkipReason> persist(CanonicalSheetReader.Row row, String index, Optional<File> file) {
+        Document existing = documentService.findByOriginalFilename(index).orElse(null);
+        if (existing != null && existing.getStatus() != DocumentStatus.PLACEHOLDER) {
+            return Optional.of(ImportStatus.SkipReason.ALREADY_EXISTS);
+        }
+
+        String s3Key = null;
+        String contentType = null;
+        DocumentStatus status = DocumentStatus.PLACEHOLDER;
+        if (file.isPresent()) {
+            contentType = probeContentType(file.get());
+            s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName();
+            try {
+                // NOTE(review): the upload is not part of the surrounding DB transaction. If a
+                // later step throws and the transaction rolls back, the object stays in the
+                // bucket — confirm a cleanup path exists.
+                uploadToS3(file.get(), s3Key, contentType);
+                status = DocumentStatus.UPLOADED;
+            } catch (Exception e) {
+                log.error("S3 upload failed for {}", file.get().getName(), e);
+                return Optional.of(ImportStatus.SkipReason.S3_UPLOAD_FAILED);
+            }
+        }
+
+        Document doc = buildDocument(row, index, existing, s3Key, contentType, status);
+        Document saved = documentService.save(doc);
+        if (file.isPresent()) {
+            thumbnailAsyncRunner.dispatchAfterCommit(saved.getId());
+        }
+        return Optional.empty();
+    }
+
+    // Reuses the existing document when there is one, otherwise starts a new one keyed by the
+    // index, then applies every column group. applyComputedFlags must stay last.
+    private Document buildDocument(CanonicalSheetReader.Row row, String index, Document existing,
+                                   String s3Key, String contentType, DocumentStatus status) {
+        Document doc = existing != null ? existing
+                : Document.builder().originalFilename(index).build();
+        applyAttribution(doc, row);
+        applyDates(doc, row);
+        applyAuthoritativeAssociations(doc, row);
+        applyFileMetadata(doc, s3Key, contentType, status, index);
+        applyComputedFlags(doc);
+        return doc;
+    }
+
+    // Sender + raw sender/receiver text. The raw cells are always retained verbatim, even
+    // when a person is linked — the load-bearing invariant behind the merge story (ADR-025).
+    private void applyAttribution(Document doc, CanonicalSheetReader.Row row) {
+        String senderName = row.get("sender_name");
+        String receiverNames = row.get("receiver_names");
+        Person sender = resolveSender(row.get("sender_person_id"), senderName);
+        doc.setSender(sender);
+        doc.setSenderText(blankToNull(senderName));
+        doc.setReceiverText(blankToNull(receiverNames));
+    }
+
+    // Date triplet + raw date text, plus the location and summary free-text columns.
+    // Pure value parsing, no semantic logic.
+    private void applyDates(Document doc, CanonicalSheetReader.Row row) {
+        doc.setDocumentDate(parseIsoDate(row.get("date_iso")));
+        doc.setMetaDatePrecision(parsePrecision(row.get("date_precision")));
+        doc.setMetaDateEnd(parseIsoDate(row.get("date_end")));
+        doc.setMetaDateRaw(blankToNull(row.get("date_raw")));
+        doc.setLocation(blankToNull(row.get("location")));
+        doc.setSummary(blankToNull(row.get("summary")));
+    }
+
+    // Receivers and tags are owned by the canonical row (ADR-025): clear then re-populate so a
+    // shrunk set on re-import prunes stale links rather than accumulating them. The
+    // "preserve human edits" rule does NOT extend to these collections.
+    private void applyAuthoritativeAssociations(Document doc, CanonicalSheetReader.Row row) {
+        Set<Person> receivers = resolveReceivers(row.get("receiver_person_ids"), row.get("receiver_names"));
+        doc.getReceivers().clear();
+        doc.getReceivers().addAll(receivers);
+        attachTag(doc, row.get("tags"));
+    }
+
+    // S3 key, content type, status, and the index-derived title.
+    private void applyFileMetadata(Document doc, String s3Key, String contentType,
+                                   DocumentStatus status, String index) {
+        doc.setStatus(status);
+        doc.setFilePath(s3Key);
+        doc.setContentType(contentType);
+        doc.setTitle(buildTitle(index, doc.getDocumentDate(), doc.getMetaDatePrecision(),
+                doc.getMetaDateEnd(), doc.getMetaDateRaw(), doc.getLocation()));
+    }
+
+    // metadataComplete: a document counts as fully described if any of the three "who/when"
+    // pieces is filled. Called last so the upstream setters have already populated the doc.
+    private void applyComputedFlags(Document doc) {
+        doc.setMetadataComplete(doc.getDocumentDate() != null
+                || doc.getSender() != null
+                || !doc.getReceivers().isEmpty());
+    }
+
+    // The title carries the date at the HONEST precision (never a fabricated day) via the
+    // shared DocumentTitleFormatter, plus the location — kept under 20 lines by delegating.
+    private static String buildTitle(String index, LocalDate date, DatePrecision precision,
+                                     LocalDate end, String raw, String location) {
+        StringBuilder title = new StringBuilder(index);
+        if (date != null && precision != DatePrecision.UNKNOWN) {
+            title.append(" – ").append(DocumentTitleFormatter.formatTitleDate(date, precision, end, raw));
+        }
+        if (location != null && !location.isBlank()) {
+            title.append(" – ").append(location);
+        }
+        return title.toString();
+    }
+
+    // ─── attribution routing — register-first, always retain raw ─────────────────────
+
+    // A blank slug means "no sender": nothing is linked and no provisional person is created.
+    private Person resolveSender(String slug, String rawName) {
+        if (slug.isBlank()) return null;
+        return resolvePerson(slug, rawName);
+    }
+
+    // Zips the parallel `receiver_person_ids` and `receiver_names` columns by position so an
+    // unresolved receiver becomes a provisional Person whose lastName is the human name from
+    // `receiver_names`, not the slug. Both columns are split WITHOUT dropping empty segments
+    // first: filtering them before zipping would shift every later name one slot to the left
+    // and attach it to the wrong slug. A blank slug segment is skipped; a blank name, or a
+    // names list shorter than the slugs list, falls back to slug-as-name.
+    private Set<Person> resolveReceivers(String slugs, String names) {
+        String[] slugParts = splitKeepingPositions(slugs);
+        String[] nameParts = splitKeepingPositions(names);
+        Set<Person> receivers = new LinkedHashSet<>();
+        for (int i = 0; i < slugParts.length; i++) {
+            String slug = slugParts[i].trim();
+            if (slug.isEmpty()) continue;
+            String name = i < nameParts.length ? nameParts[i].trim() : slug;
+            receivers.add(resolvePerson(slug, name));
+        }
+        return receivers;
+    }
+
+    // Pipe-split that keeps empty segments (limit -1) so segment i stays column position i.
+    private static String[] splitKeepingPositions(String raw) {
+        if (raw == null || raw.isBlank()) return new String[0];
+        return raw.split("\\|", -1);
+    }
+
+    // Register-first: an existing person with this sourceRef wins; otherwise a provisional
+    // person is upserted, named after the raw cell (or the slug when the raw name is blank).
+    private Person resolvePerson(String slug, String rawName) {
+        return personService.findBySourceRef(slug)
+                .orElseGet(() -> personService.upsertBySourceRef(PersonUpsertCommand.builder()
+                        .sourceRef(slug)
+                        .lastName(blankToNull(rawName) == null ? slug : rawName)
+                        .personType(PersonType.PERSON)
+                        .provisional(true)
+                        .build()));
+    }
+
+    // Authoritative: the canonical row defines the document's tags exactly. Clearing first
+    // means a tag removed from the row is pruned on re-import (ADR-025).
+    private void attachTag(Document doc, String tagPath) {
+        doc.getTags().clear();
+        // `tags` is not in REQUIRED_HEADERS, so tolerate a null value as well as a blank one.
+        if (tagPath == null || tagPath.isBlank()) return;
+        tagService.findBySourceRef(tagPath).ifPresent(tag -> doc.getTags().add(tag));
+    }
+
+    // ─── clean-value parsing (no semantic logic) ─────────────────────────────────────
+
+    // Blank or unparseable input yields null rather than an error.
+    private static LocalDate parseIsoDate(String value) {
+        if (value == null || value.isBlank()) return null;
+        try {
+            return LocalDate.parse(value.trim());
+        } catch (DateTimeParseException e) {
+            return null;
+        }
+    }
+
+    // Blank or unrecognised input yields UNKNOWN rather than an error.
+    private static DatePrecision parsePrecision(String value) {
+        if (value == null || value.isBlank()) return DatePrecision.UNKNOWN;
+        try {
+            return DatePrecision.valueOf(value.trim());
+        } catch (IllegalArgumentException e) {
+            return DatePrecision.UNKNOWN;
+        }
+    }
+
+    // ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
+
+    // Falls back to application/octet-stream when probing fails or returns nothing.
+    private String probeContentType(File file) {
+        try {
+            String probed = Files.probeContentType(file.toPath());
+            return probed != null ? probed : "application/octet-stream";
+        } catch (IOException e) {
+            return "application/octet-stream";
+        }
+    }
+
+    private void uploadToS3(File file, String s3Key, String contentType) {
+        s3Client.putObject(PutObjectRequest.builder()
+                        .bucket(bucketName)
+                        .key(s3Key)
+                        .contentType(contentType)
+                        .build(),
+                RequestBody.fromFile(file));
+    }
+
+    // ─── index validation + containment — defense-in-depth, do not weaken ────────────
+
+    // The index is the only thing that drives the on-disk lookup, so it must never contain a
+    // path separator, traversal token, slash homoglyph, null byte, or absolute-path marker —
+    // each guard mirrors the filename guards ported from MassImportService — and it must match
+    // the strict catalog shape so anything unexpected is skipped loudly rather than read.
+    // The homoglyphs are written as \\u escapes so the guards survive any tool that
+    // normalises look-alike characters to their ASCII form.
+    private boolean isValidImportIndex(String index) {
+        if (index == null || index.isBlank()) return false;
+        if (index.contains("/")) return false;
+        if (index.contains("\\")) return false;
+        if (index.contains("\u2215")) return false; // U+2215 DIVISION SLASH
+        if (index.contains("\uFF0F")) return false; // U+FF0F FULLWIDTH SOLIDUS
+        if (index.contains("\u29F5")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
+        if (index.contains(".")) return false; // no dots — ".pdf" is the only extension
+        if (index.contains("\0")) return false;
+        if (Paths.get(index).isAbsolute()) return false;
+        return INDEX_PATTERN.matcher(index).matches();
+    }
+
+    private boolean isPdfMagicBytes(File file) throws IOException {
+        // FileStreamOpener is injected so tests can stub a throwing implementation for the
+        // IO-error branch without spying on the importer itself.
+        try (InputStream is = fileStreamOpener.open(file)) {
+            byte[] header = is.readNBytes(4);
+            return header.length == 4
+                    && header[0] == 0x25 // %
+                    && header[1] == 0x50 // P
+                    && header[2] == 0x44 // D
+                    && header[3] == 0x46; // F
+        }
+    }
+
+    // O(1) direct lookup: the PDF is exactly importDir/<index>.pdf. The caller has already
+    // validated the index shape; the canonical-path containment assertion below is
+    // defense-in-depth so even a symlinked <index>.pdf cannot read outside importDir.
+    private Optional<File> resolvePdfByIndex(String index, int rowNumber) {
+        File baseDir = new File(importDir);
+        File candidate = baseDir.toPath().resolve(index + ".pdf").toFile();
+        try {
+            if (!candidate.isFile()) return Optional.empty();
+            String baseDirCanonical = baseDir.getCanonicalPath();
+            if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
+                throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
+            }
+            return Optional.of(candidate);
+        } catch (IOException e) {
+            // Distinct from the deliberate symlink-escape abort above (which throws): canonical
+            // resolution itself failed (e.g. the OS rejected the path mid-resolution). We fail
+            // safe to a PLACEHOLDER, but never silently — log it so the asymmetry surfaces in ops.
+            log.warn("Canonical path resolution failed for import row {}: treating {}.pdf as absent",
+                    rowNumber, index, e);
+            return Optional.empty();
+        }
+    }
+
+    private static String blankToNull(String s) {
+        return (s == null || s.isBlank()) ? null : s;
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentTitleFormatter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentTitleFormatter.java
new file mode 100644
index 00000000..65120004
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentTitleFormatter.java
@@ -0,0 +1,112 @@
+package org.raddatz.familienarchiv.importing;
+
+import org.raddatz.familienarchiv.document.DatePrecision;
+
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Locale;
+
+/**
+ * Produces the honest German date label baked into an import title — at exactly
+ * the precision the data claims, never finer.
This is the Java half of the
+ * single source of truth shared with the frontend {@code formatDocumentDate}
+ * (TypeScript): both are asserted against {@code docs/date-label-fixtures.json}
+ * so the two implementations cannot drift (see #666).
+ *
+ * <p>Import titles are always German, so the labels here are the German
+ * canonical form (mirroring the {@code de} Paraglide messages used by the UI).
+ */
+final class DocumentTitleFormatter {
+
+    private static final DateTimeFormatter LONG = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN);
+    private static final DateTimeFormatter MONTH_YEAR = DateTimeFormatter.ofPattern("MMMM yyyy", Locale.GERMAN);
+    private static final DateTimeFormatter MEDIUM = DateTimeFormatter.ofPattern("d. MMM yyyy", Locale.GERMAN);
+    private static final DateTimeFormatter DAY_MONTH = DateTimeFormatter.ofPattern("d. MMM", Locale.GERMAN);
+
+    private static final String UNKNOWN = "Datum unbekannt";
+    private static final String APPROX_PREFIX = "ca.";
+    private static final String OPEN_RANGE_PREFIX = "ab";
+
+    private DocumentTitleFormatter() {
+    }
+
+    /**
+     * Renders a document date no finer than its declared precision.
+     *
+     * @param date      anchor day used for sorting/filtering; may be null
+     * @param precision how much of {@code date} is actually known
+     * @param end       last day of a RANGE; null renders an open-ended range
+     * @param raw       verbatim spreadsheet text, consulted only for the season word
+     * @return the German label; the "unknown" label when {@code date} is null or the
+     *         precision is UNKNOWN
+     */
+    static String formatTitleDate(LocalDate date, DatePrecision precision, LocalDate end, String raw) {
+        boolean undated = precision == DatePrecision.UNKNOWN || date == null;
+        if (undated) {
+            return UNKNOWN;
+        }
+        return switch (precision) {
+            case DAY -> LONG.format(date);
+            case MONTH -> MONTH_YEAR.format(date);
+            case SEASON -> seasonLabel(date, raw);
+            case YEAR -> String.valueOf(date.getYear());
+            case APPROX -> APPROX_PREFIX + " " + date.getYear();
+            case RANGE -> rangeLabel(date, end);
+            // Already handled by the guard above; listed to keep the switch exhaustive.
+            case UNKNOWN -> UNKNOWN;
+        };
+    }
+
+    // Season word plus year. The word written in the raw cell wins; the month of the
+    // anchor date is only the fallback.
+    private static String seasonLabel(LocalDate date, String raw) {
+        Season fromText = seasonFromRaw(raw);
+        Season chosen = fromText != null ? fromText : seasonOfMonth(date.getMonthValue());
+        return chosen.german + " " + date.getYear();
+    }
+
+    // Range label, shortened as far as start and end share year and month.
+    private static String rangeLabel(LocalDate start, LocalDate end) {
+        String startLabel = MEDIUM.format(start);
+        if (end == null) {
+            return OPEN_RANGE_PREFIX + " " + startLabel;
+        }
+        if (end.equals(start)) {
+            return startLabel;
+        }
+        String endLabel = MEDIUM.format(end);
+        if (start.getYear() != end.getYear()) {
+            return startLabel + " – " + endLabel;
+        }
+        boolean sameMonth = start.getMonthValue() == end.getMonthValue();
+        return sameMonth
+                ? start.getDayOfMonth() + ".–" + endLabel
+                : DAY_MONTH.format(start) + " – " + endLabel;
+    }
+
+    // ─── season mapping — mirrors the normalizer's representative months ─────────────
+
+    private enum Season {
+        SPRING("Frühling"),
+        SUMMER("Sommer"),
+        AUTUMN("Herbst"),
+        WINTER("Winter");
+
+        private final String german;
+
+        Season(String german) {
+            this.german = german;
+        }
+    }
+
+    // March–May spring, June–August summer, September–November autumn, anything else winter.
+    private static Season seasonOfMonth(int month) {
+        return switch (month) {
+            case 3, 4, 5 -> Season.SPRING;
+            case 6, 7, 8 -> Season.SUMMER;
+            case 9, 10, 11 -> Season.AUTUMN;
+            default -> Season.WINTER;
+        };
+    }
+
+    // Reads the season from the first whitespace-delimited word of the raw cell, compared
+    // in lower case; returns null when the cell is blank or the word is not a season.
+    private static Season seasonFromRaw(String raw) {
+        if (raw == null || raw.isBlank()) {
+            return null;
+        }
+        String firstWord = raw.trim().split("\\s+")[0];
+        String token = firstWord.toLowerCase(Locale.GERMAN);
+        if (token.equals("frühling") || token.equals("frühjahr")) {
+            return Season.SPRING;
+        }
+        if (token.equals("sommer")) {
+            return Season.SUMMER;
+        }
+        if (token.equals("herbst")) {
+            return Season.AUTUMN;
+        }
+        if (token.equals("winter")) {
+            return Season.WINTER;
+        }
+        return null;
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/FileStreamOpener.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/FileStreamOpener.java
new file mode 100644
index 00000000..aa4c2e50
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/FileStreamOpener.java
@@ -0,0 +1,33 @@
+package org.raddatz.familienarchiv.importing;
+
+import org.springframework.stereotype.Component;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Test seam for opening a {@link File} as an {@link InputStream}.
Extracted so the magic-byte
+ * check in {@link DocumentImporter} can be unit-tested for the IO-error branch by injecting a
+ * mock that throws, without needing a Mockito spy on the importer itself.
+ *
+ * <p>Production uses {@link DefaultFileStreamOpener}, a one-line delegate to
+ * {@code new FileInputStream(file)}.
+ */
+@FunctionalInterface
+public interface FileStreamOpener {
+
+    /** Opens {@code file} for sequential reads; the caller must close the returned stream. */
+    InputStream open(File file) throws IOException;
+
+    /** Default production implementation, registered as a Spring component: plain {@code FileInputStream}. */
+    @Component
+    final class DefaultFileStreamOpener implements FileStreamOpener {
+
+        @Override
+        public InputStream open(File file) throws IOException {
+            return new FileInputStream(file);
+        }
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java
new file mode 100644
index 00000000..ae21adc2
--- /dev/null
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java
@@ -0,0 +1,50 @@
+package org.raddatz.familienarchiv.importing;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import io.swagger.v3.oas.annotations.media.Schema;
+
+import java.time.LocalDateTime;
+import java.util.List;
+
+/**
+ * Async import state surfaced to {@code admin/system/ImportStatusCard.svelte} via the
+ * generated types. The shape ({@code state, statusCode, processed, skippedFiles, skipped})
+ * is kept verbatim from the retired MassImportService so the admin UI keeps working.
+ */
+public record ImportStatus(
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode,
+        @JsonIgnore String message,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List<SkippedFile> skippedFiles,
+        LocalDateTime startedAt
+) {
+
+    public enum State { IDLE, RUNNING, DONE, FAILED }
+
+    public enum SkipReason {
+        INVALID_FILENAME_PATH_TRAVERSAL,
+        INVALID_PDF_SIGNATURE,
+        FILE_READ_ERROR,
+        ALREADY_EXISTS,
+        S3_UPLOAD_FAILED
+    }
+
+    public record SkippedFile(
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename,
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason
+    ) {}
+
+    // Note: @Schema on a record accessor method is not picked up by SpringDoc; the
+    // "skipped" count is a computed convenience field, always equal to skippedFiles.size().
+    @JsonProperty("skipped")
+    public int skipped() {
+        return skippedFiles.size();
+    }
+
+    /** Defensive-copy constructor: stores an unmodifiable copy, so a null list (or null element) is rejected. */
+    public ImportStatus {
+        skippedFiles = List.copyOf(skippedFiles);
+    }
+}
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java
deleted file mode 100644
index 975517e7..00000000
--- a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java
+++ /dev/null
@@ -1,509 +0,0 @@
-package org.raddatz.familienarchiv.importing;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.annotation.JsonProperty;
-import io.swagger.v3.oas.annotations.media.Schema;
-import lombok.RequiredArgsConstructor;
-import lombok.extern.slf4j.Slf4j;
-import org.apache.poi.ss.usermodel.*;
-import java.util.Objects;
-import org.raddatz.familienarchiv.exception.DomainException;
-import org.raddatz.familienarchiv.exception.ErrorCode;
-import org.raddatz.familienarchiv.document.Document;
-import org.raddatz.familienarchiv.document.DocumentService;
-import org.raddatz.familienarchiv.document.DocumentStatus;
-import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
-import org.raddatz.familienarchiv.person.Person;
-import org.raddatz.familienarchiv.tag.Tag;
-import org.raddatz.familienarchiv.person.Person;
-import org.raddatz.familienarchiv.person.PersonNameParser;
-import org.raddatz.familienarchiv.person.PersonService;
-import org.raddatz.familienarchiv.tag.TagService;
-import org.springframework.beans.factory.annotation.Value;
-import org.springframework.scheduling.annotation.Async;
-import org.springframework.stereotype.Service;
-import org.springframework.transaction.annotation.Transactional;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import software.amazon.awssdk.core.sync.RequestBody;
-import software.amazon.awssdk.services.s3.S3Client;
-import software.amazon.awssdk.services.s3.model.PutObjectRequest;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import java.io.File;
-import
java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; -import java.util.Optional; -import java.util.UUID; -import java.util.stream.Stream; -import java.util.zip.ZipFile; - -@Service -@RequiredArgsConstructor -@Slf4j -public class MassImportService { - - public enum State { IDLE, RUNNING, DONE, FAILED } - - public enum SkipReason { - INVALID_FILENAME_PATH_TRAVERSAL, - INVALID_PDF_SIGNATURE, - FILE_READ_ERROR, - ALREADY_EXISTS, - S3_UPLOAD_FAILED - } - - public record SkippedFile( - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason - ) {} - - public record ImportStatus( - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode, - @JsonIgnore String message, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List skippedFiles, - LocalDateTime startedAt - ) { - // Note: @Schema on a record accessor method is not picked up by SpringDoc; the - // "skipped" count is a computed convenience field derived from skippedFiles.size(). - @JsonProperty("skipped") - public int skipped() { return skippedFiles.size(); } - - /** Defensive-copy constructor — callers cannot mutate the stored list after construction. 
*/ - public ImportStatus { - skippedFiles = List.copyOf(skippedFiles); - } - } - - record ProcessResult(int processed, List skippedFiles) {} - - private volatile ImportStatus currentStatus = new ImportStatus(State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - - public ImportStatus getStatus() { - return currentStatus; - } - - private final DocumentService documentService; - private final PersonService personService; - private final TagService tagService; - private final S3Client s3Client; - private final ThumbnailAsyncRunner thumbnailAsyncRunner; - - @Value("${app.s3.bucket}") - private String bucketName; - - @Value("${app.import.col.index:0}") - private int colIndex; - - @Value("${app.import.col.box:1}") - private int colBox; - - @Value("${app.import.col.folder:2}") - private int colFolder; - - @Value("${app.import.col.sender:3}") - private int colSender; - - @Value("${app.import.col.receivers:5}") - private int colReceivers; - - @Value("${app.import.col.date:7}") - private int colDate; - - @Value("${app.import.col.location:9}") - private int colLocation; - - @Value("${app.import.col.tags:10}") - private int colTags; - - @Value("${app.import.col.summary:11}") - private int colSummary; - - @Value("${app.import.col.transcription:13}") - private int colTranscription; - - @Value("${app.import.dir:/import}") - private String importDir; - - private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. 
MMMM yyyy", Locale.GERMAN); - - // ODS XML namespaces - private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; - private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; - - // We only need up to this many columns; caps repeated-empty-cell expansion - private static final int MAX_COLS = 20; - - @Async - public void runImportAsync() { - if (currentStatus.state() == State.RUNNING) { - throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress"); - } - currentStatus = new ImportStatus(State.RUNNING, "IMPORT_RUNNING", "Import läuft...", 0, List.of(), LocalDateTime.now()); - try { - File spreadsheet = findSpreadsheetFile(); - log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath()); - ProcessResult result = processRows(readSpreadsheet(spreadsheet)); - currentStatus = new ImportStatus(State.DONE, "IMPORT_DONE", - "Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.", - result.processed(), result.skippedFiles(), currentStatus.startedAt()); - } catch (NoSpreadsheetException e) { - log.error("Massenimport fehlgeschlagen: keine Tabellendatei", e); - currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_NO_SPREADSHEET", - "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); - } catch (Exception e) { - log.error("Massenimport fehlgeschlagen", e); - currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_INTERNAL", - "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); - } - } - - private static class NoSpreadsheetException extends RuntimeException { - NoSpreadsheetException(String message) { super(message); } - } - - private File findSpreadsheetFile() throws IOException { - try (Stream files = Files.list(Paths.get(importDir))) { - return files - .filter(p -> { - String name = p.toString().toLowerCase(); - return name.endsWith(".ods") || name.endsWith(".xlsx") || name.endsWith(".xls"); - 
}) - .findFirst() - .orElseThrow(() -> new NoSpreadsheetException( - "Keine Tabellendatei (.ods/.xlsx/.xls) in " + importDir + " gefunden!")) - .toFile(); - } - } - - // --- Spreadsheet reading (format-specific, produces neutral List>) --- - - private List> readSpreadsheet(File file) throws Exception { - String name = file.getName().toLowerCase(); - if (name.endsWith(".ods")) { - return readOds(file); - } - return readXlsx(file); - } - - /** - * Reads an ODS file by parsing its content.xml directly (no extra library needed). - * ODS is a ZIP archive; content.xml holds the spreadsheet data as XML. - */ - List> readOds(File file) throws Exception { - List> result = new ArrayList<>(); - - try (ZipFile zip = new ZipFile(file)) { - var entry = zip.getEntry("content.xml"); - if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt"); - - var factory = XxeSafeXmlParser.hardenedFactory(); - factory.setNamespaceAware(true); - var builder = factory.newDocumentBuilder(); - var doc = builder.parse(zip.getInputStream(entry)); - - NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table"); - if (tables.getLength() == 0) return result; - - var table = (Element) tables.item(0); - NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row"); - - for (int i = 0; i < rows.getLength(); i++) { - var row = (Element) rows.item(i); - List rowData = new ArrayList<>(); - NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell"); - - for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) { - var cell = (Element) cells.item(j); - - // Read the display text (first ) - String value = ""; - NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p"); - if (textNodes.getLength() > 0) { - value = textNodes.item(0).getTextContent().trim(); - } - - // Expand number-columns-repeated (capped at MAX_COLS) - String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated"); - int repeat = repeatAttr.isEmpty() ? 
1 : Integer.parseInt(repeatAttr); - repeat = Math.min(repeat, MAX_COLS - rowData.size()); - - for (int r = 0; r < repeat; r++) { - rowData.add(value); - } - } - result.add(rowData); - } - } - return result; - } - - /** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */ - private List> readXlsx(File file) throws Exception { - List> result = new ArrayList<>(); - try (FileInputStream fis = new FileInputStream(file); - Workbook workbook = WorkbookFactory.create(fis)) { - - Sheet sheet = workbook.getSheetAt(0); - for (int i = 0; i <= sheet.getLastRowNum(); i++) { - Row row = sheet.getRow(i); - List rowData = new ArrayList<>(); - if (row != null) { - for (int j = 0; j < MAX_COLS; j++) { - rowData.add(xlsxCellToString(row.getCell(j))); - } - } - result.add(rowData); - } - } - return result; - } - - private String xlsxCellToString(Cell cell) { - if (cell == null) return ""; - return switch (cell.getCellType()) { - case STRING -> cell.getStringCellValue(); - case NUMERIC -> { - if (DateUtil.isCellDateFormatted(cell)) { - yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO - } - yield String.valueOf((int) cell.getNumericCellValue()); - } - case BOOLEAN -> String.valueOf(cell.getBooleanCellValue()); - default -> ""; - }; - } - - // --- Import logic (works on neutral List rows) --- - - private ProcessResult processRows(List> rows) { - int processed = 0; - List skippedFiles = new ArrayList<>(); - - for (int i = 1; i < rows.size(); i++) { // skip header row - List cells = rows.get(i); - String index = getCell(cells, colIndex); - if (index.isBlank()) continue; - - String filename = index.contains(".") ? 
index : index + ".pdf"; - if (!isValidImportFilename(filename)) { - log.warn("Skipping import row {}: filename rejected — {}", i, filename); - skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_FILENAME_PATH_TRAVERSAL)); - continue; - } - Optional fileOnDisk = findFileRecursive(filename); - if (fileOnDisk.isEmpty()) { - log.warn("Datei nicht gefunden, importiere nur Metadaten: {}", filename); - } - - if (fileOnDisk.isPresent()) { - try { - if (!isPdfMagicBytes(fileOnDisk.get())) { - log.warn("Überspringe {}: Datei beginnt nicht mit %PDF-Signatur", filename); - skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_PDF_SIGNATURE)); - continue; - } - } catch (IOException e) { - log.error("Fehler beim Prüfen der Magic-Bytes für {}", filename, e); - skippedFiles.add(new SkippedFile(filename, SkipReason.FILE_READ_ERROR)); - continue; - } - } - - Optional skipReason = importSingleDocument(cells, fileOnDisk, filename, index); - if (skipReason.isPresent()) { - skippedFiles.add(new SkippedFile(filename, skipReason.get())); - } else { - processed++; - } - } - return new ProcessResult(processed, skippedFiles); - } - - private boolean isValidImportFilename(String filename) { - if (filename == null || filename.isBlank()) return false; - if (filename.contains("/")) return false; - if (filename.contains("\\")) return false; - if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH - if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS - if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR - if (filename.contains("..")) return false; - if (filename.equals(".")) return false; - if (filename.contains("\0")) return false; - // Paths.get() is safe here on Linux for all inputs that passed the checks above; - // it may throw InvalidPathException for OS-specific illegal chars on Windows, - // but those are not reachable in production. 
- if (Paths.get(filename).isAbsolute()) return false; - return true; - } - - // package-private: Mockito spy in tests can override to inject IOException - InputStream openFileStream(File file) throws IOException { - return new FileInputStream(file); - } - - private boolean isPdfMagicBytes(File file) throws IOException { - try (InputStream is = openFileStream(file)) { - byte[] header = is.readNBytes(4); - return header.length == 4 - && header[0] == 0x25 // % - && header[1] == 0x50 // P - && header[2] == 0x44 // D - && header[3] == 0x46; // F - } - } - - /** - * Imports a single document row. - * - * @return empty Optional on success; an Optional containing the skip reason on failure/skip. - */ - @Transactional - protected Optional importSingleDocument(List cells, Optional file, String originalFilename, String index) { - Optional existing = documentService.findByOriginalFilename(originalFilename); - if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) { - log.info("Dokument {} existiert bereits, überspringe.", originalFilename); - return Optional.of(SkipReason.ALREADY_EXISTS); - } - - String archiveBox = getCell(cells, colBox); - String archiveFolder = getCell(cells, colFolder); - String senderRaw = getCell(cells, colSender); - String receiversRaw = getCell(cells, colReceivers); - LocalDate date = parseDate(getCell(cells, colDate)); - String location = getCell(cells, colLocation); - String tagRaw = getCell(cells, colTags); - String summary = getCell(cells, colSummary); - String transcription = getCell(cells, colTranscription); - - String s3Key = null; - String contentType = null; - DocumentStatus status = DocumentStatus.PLACEHOLDER; - - if (file.isPresent()) { - try { - contentType = Files.probeContentType(file.get().toPath()); - } catch (IOException e) { - contentType = null; - } - if (contentType == null) contentType = "application/octet-stream"; - - s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName(); - try { - 
s3Client.putObject(PutObjectRequest.builder() - .bucket(bucketName) - .key(s3Key) - .contentType(contentType) - .build(), - RequestBody.fromFile(file.get())); - status = DocumentStatus.UPLOADED; - } catch (Exception e) { - log.error("S3 Upload Fehler für {}", file.get().getName(), e); - return Optional.of(SkipReason.S3_UPLOAD_FAILED); - } - } - - Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw); - List receivers = PersonNameParser.parseReceivers(receiversRaw).stream() - .map(this::findOrCreatePerson) - .filter(Objects::nonNull) - .toList(); - - Tag tag = null; - if (!tagRaw.isBlank()) { - tag = tagService.findOrCreate(tagRaw); - } - - Document doc = existing.orElse(Document.builder() - .originalFilename(originalFilename) - .build()); - - // Heuristic: mark as complete if at least one key field is present in the spreadsheet row - boolean metadataComplete = date != null || !senderRaw.isBlank() || !receiversRaw.isBlank(); - - doc.setTitle(buildTitle(index, date, location)); - doc.setFilePath(s3Key); - doc.setContentType(contentType); - doc.setStatus(status); - doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox); - doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder); - doc.setDocumentDate(date); - doc.setLocation(location.isBlank() ? null : location); - doc.setSummary(summary.isBlank() ? null : summary); - doc.setTranscription(transcription.isBlank() ? null : transcription); - doc.setSender(sender); - doc.getReceivers().addAll(receivers); - if (tag != null) doc.getTags().add(tag); - doc.setMetadataComplete(metadataComplete); - - Document saved = documentService.save(doc); - if (file.isPresent()) { - thumbnailAsyncRunner.dispatchAfterCommit(saved.getId()); - } - log.info("Importiert{}: {}", file.isEmpty() ? 
" (nur Metadaten)" : "", originalFilename); - return Optional.empty(); - } - - // --- Helpers --- - - private String getCell(List cells, int col) { - if (col >= cells.size()) return ""; - String val = cells.get(col); - return val == null ? "" : val.trim(); - } - - private LocalDate parseDate(String value) { - if (value == null || value.isBlank()) return null; - try { - return LocalDate.parse(value.trim()); - } catch (DateTimeParseException e) { - return null; - } - } - - private String buildTitle(String index, LocalDate date, String location) { - StringBuilder sb = new StringBuilder(index); - if (date != null) { - sb.append(" \u2013 ").append(date.format(GERMAN_DATE)); - } - if (location != null && !location.isBlank()) { - sb.append(" \u2013 ").append(location); - } - return sb.toString(); - } - - private Person findOrCreatePerson(String rawName) { - return personService.findOrCreateByAlias(rawName); - } - - private Optional findFileRecursive(String filename) { - File baseDir = new File(importDir); - try (Stream walk = Files.walk(baseDir.toPath())) { - Optional match = walk.filter(p -> !Files.isDirectory(p)) - .filter(p -> p.getFileName().toString().equals(filename)) - .findFirst(); - if (match.isEmpty()) return Optional.empty(); - File candidate = match.get().toFile(); - String baseDirCanonical = baseDir.getCanonicalPath(); - if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) { - throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate); - } - return Optional.of(candidate); - } catch (IOException e) { - return Optional.empty(); - } - } -} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonRegisterImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonRegisterImporter.java new file mode 100644 index 00000000..edad55d2 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonRegisterImporter.java @@ -0,0 +1,69 @@ 
+package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonType; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.time.LocalDate; +import java.time.format.DateTimeParseException; +import java.util.List; + +/** + * Loads {@code canonical-persons.xlsx} (the register) into the person domain via + * {@link PersonService}, upserting each person by the normalizer {@code person_id} + * (source_ref). Register persons are confident identities, so {@code provisional} is + * driven by the sheet's already-clean value (normally {@code False}). + */ +@Component +@RequiredArgsConstructor +@Slf4j +public class PersonRegisterImporter { + + static final List REQUIRED_HEADERS = List.of("person_id", "last_name", "first_name", "provisional"); + + private final PersonService personService; + + public int load(File artifact) { + List rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS); + int processed = 0; + for (CanonicalSheetReader.Row row : rows) { + String personId = row.get("person_id"); + if (personId.isBlank()) continue; + personService.upsertBySourceRef(toCommand(row, personId)); + processed++; + } + log.info("Imported {} register persons from {}", processed, artifact.getName()); + return processed; + } + + private PersonUpsertCommand toCommand(CanonicalSheetReader.Row row, String personId) { + return PersonUpsertCommand.builder() + .sourceRef(personId) + .lastName(blankToNull(row.get("last_name"))) + .firstName(blankToNull(row.get("first_name"))) + .maidenName(blankToNull(row.get("maiden_name"))) + .notes(blankToNull(row.get("notes"))) + .birthYear(yearOf(row.get("birth_date"))) + .deathYear(yearOf(row.get("death_date"))) + .personType(PersonType.PERSON) + 
.provisional(Boolean.parseBoolean(row.get("provisional"))) + .build(); + } + + private static Integer yearOf(String isoDate) { + if (isoDate == null || isoDate.isBlank()) return null; + try { + return LocalDate.parse(isoDate.trim()).getYear(); + } catch (DateTimeParseException e) { + return null; + } + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonTreeImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonTreeImporter.java new file mode 100644 index 00000000..26ae0dcd --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonTreeImporter.java @@ -0,0 +1,135 @@ +package org.raddatz.familienarchiv.importing; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonType; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.person.relationship.RelationType; +import org.raddatz.familienarchiv.person.relationship.RelationshipService; +import org.raddatz.familienarchiv.person.relationship.dto.CreateRelationshipRequest; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +/** + * Loads {@code canonical-persons-tree.json} into the person + relationship domains. 
+ * Tree persons are upserted via {@link PersonService} keyed on the shared + * {@code personId} slug (which Phase 1 #670 now emits into the tree), so they reconcile + * with the register rather than duplicating it. Relationships reference persons by the + * tree's local {@code rowId}; each side is mapped to the upserted person's UUID and + * created through {@link RelationshipService} (never the relationship repository — + * layering rule). A relationship the service rejects as a duplicate or as circular is + * skipped (not counted) rather than failing the load, so a re-import stays idempotent. + */ +@Component +@RequiredArgsConstructor +@Slf4j +public class PersonTreeImporter { + + // The tree JSON is a local implementation detail, not a shared API payload, so the + // importer owns its own mapper rather than depending on the web ObjectMapper bean. + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private final PersonService personService; + private final RelationshipService relationshipService; + + public int load(File artifact) { + JsonNode root = readTree(artifact); + Map idByRowId = upsertPersons(root.path("persons")); + int relationships = createRelationships(root.path("relationships"), idByRowId); + log.info("Imported {} tree persons and {} relationships from {}", + idByRowId.size(), relationships, artifact.getName()); + return idByRowId.size(); + } + + private JsonNode readTree(File artifact) { + try { + return OBJECT_MAPPER.readTree(artifact); + } catch (Exception e) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Unreadable canonical artifact: " + artifact.getName()); + } + } + + private Map upsertPersons(JsonNode persons) { + Map idByRowId = new HashMap<>(); + for (JsonNode node : persons) { + String personId = text(node, "personId"); + if (personId.isBlank()) continue; + Person person = personService.upsertBySourceRef(toCommand(node, personId)); + idByRowId.put(text(node, "rowId"), person.getId()); + } + return idByRowId; + } + + private PersonUpsertCommand toCommand(JsonNode node, 
String personId) { + return PersonUpsertCommand.builder() + .sourceRef(personId) + .lastName(blankToNull(text(node, "lastName"))) + .firstName(blankToNull(text(node, "firstName"))) + .maidenName(blankToNull(text(node, "maidenName"))) + .notes(blankToNull(text(node, "notes"))) + .birthYear(intOrNull(node, "birthYear")) + .deathYear(intOrNull(node, "deathYear")) + .familyMember(node.path("familyMember").asBoolean(false)) + .personType(PersonType.PERSON) + .provisional(false) + .build(); + } + + private int createRelationships(JsonNode relationships, Map idByRowId) { + int created = 0; + for (JsonNode node : relationships) { + // Trap: a relationship node's personId / relatedPersonId fields carry the tree's + // local rowId (e.g. "row_a"), NOT a person slug. They are resolved through + // idByRowId to the upserted person's UUID. + UUID person = idByRowId.get(text(node, "personId")); + UUID related = idByRowId.get(text(node, "relatedPersonId")); + if (person == null || related == null) { + log.warn("Skipping tree relationship with unresolved rowId: {} -> {}", + text(node, "personId"), text(node, "relatedPersonId")); + continue; + } + if (addRelationshipIdempotently(person, related, text(node, "type"))) { + created++; + } + } + return created; + } + + private boolean addRelationshipIdempotently(UUID person, UUID related, String type) { + try { + relationshipService.addRelationship(person, + new CreateRelationshipRequest(related, RelationType.valueOf(type), null, null, null)); + return true; + } catch (DomainException e) { + if (e.getCode() == ErrorCode.DUPLICATE_RELATIONSHIP + || e.getCode() == ErrorCode.CIRCULAR_RELATIONSHIP) { + return false; + } + throw e; + } + } + + private static String text(JsonNode node, String field) { + JsonNode value = node.get(field); + return value == null || value.isNull() ? 
"" : value.asText(); + } + + private static Integer intOrNull(JsonNode node, String field) { + JsonNode value = node.get(field); + return value == null || value.isNull() ? null : value.asInt(); + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/TagTreeImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/TagTreeImporter.java new file mode 100644 index 00000000..a871ab32 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/TagTreeImporter.java @@ -0,0 +1,54 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +/** + * Loads {@code canonical-tag-tree.xlsx} into the tag domain via {@link TagService}, + * upserting each tag by its canonical {@code tag_path} (the source_ref). Parent links are + * resolved by the parent's path, which is the child path with its last {@code /segment} + * stripped. Rows are emitted parents-first by the normalizer, so a parent is always + * resolved before any child references it. 
+ */ +@Component +@RequiredArgsConstructor +@Slf4j +public class TagTreeImporter { + + static final List REQUIRED_HEADERS = List.of("tag_path", "parent_name", "tag_name"); + private static final String PATH_SEPARATOR = "/"; + + private final TagService tagService; + + public int load(File artifact) { + List rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS); + Map idByPath = new HashMap<>(); + int processed = 0; + for (CanonicalSheetReader.Row row : rows) { + String path = row.get("tag_path"); + if (path.isBlank()) continue; + UUID parentId = resolveParentId(path, idByPath); + Tag tag = tagService.upsertBySourceRef(path, row.get("tag_name"), parentId); + idByPath.put(path, tag.getId()); + processed++; + } + log.info("Imported {} tags from {}", processed, artifact.getName()); + return processed; + } + + private UUID resolveParentId(String path, Map idByPath) { + int lastSeparator = path.lastIndexOf(PATH_SEPARATOR); + if (lastSeparator < 0) return null; + String parentPath = path.substring(0, lastSeparator); + return idByPath.get(parentPath); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java deleted file mode 100644 index 949ea054..00000000 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java +++ /dev/null @@ -1,20 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -class XxeSafeXmlParser { - - private XxeSafeXmlParser() {} - - static DocumentBuilderFactory hardenedFactory() throws ParserConfigurationException { - var factory = DocumentBuilderFactory.newInstance(); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - 
factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setXIncludeAware(false); - factory.setExpandEntityReferences(false); - return factory; - } -} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/Person.java b/backend/src/main/java/org/raddatz/familienarchiv/person/Person.java index d2332519..993480c4 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/Person.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/Person.java @@ -57,6 +57,18 @@ public class Person { @Schema(requiredMode = Schema.RequiredMode.REQUIRED) private boolean familyMember = false; + // The normalizer person_id — join key and re-import idempotency key. Null for manually + // created persons; unique among non-null values (see ADR-025). + @Column(name = "source_ref") + private String sourceRef; + + // A provisional person is one the importer inferred but could not confidently identify. + // Distinct from familyMember (a genealogical fact); set true only by the importer (Phase 3). + @Column(name = "provisional", nullable = false) + @Builder.Default + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private boolean provisional = false; + // Entity-graph navigation for JPA JOIN queries (e.g. DocumentSpecifications.hasText). // Uses entity relationship rather than cross-domain repository access, avoiding a // separate DB roundtrip while respecting domain boundaries. 
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonController.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonController.java index 5c47cbde..dad52b5a 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonController.java @@ -22,12 +22,15 @@ import org.springframework.web.bind.annotation.*; import org.springframework.web.server.ResponseStatusException; import jakarta.validation.Valid; +import jakarta.validation.constraints.Max; +import jakarta.validation.constraints.Min; import lombok.RequiredArgsConstructor; @RestController @RequestMapping("/api/persons") @RequiredArgsConstructor +@Validated public class PersonController { private final PersonService personService; @@ -35,15 +38,37 @@ public class PersonController { @GetMapping @RequirePermission(Permission.READ_ALL) - public ResponseEntity> getPersons( + public ResponseEntity getPersons( @RequestParam(required = false) String q, - @RequestParam(required = false, defaultValue = "0") int size, - @RequestParam(required = false) String sort) { - if ("documentCount".equals(sort) && size > 0 && q == null) { + @RequestParam(required = false) PersonType type, + @RequestParam(required = false) Boolean familyOnly, + @RequestParam(required = false) Boolean hasDocuments, + @RequestParam(required = false) Boolean provisional, + // review=true reveals the import noise (transcriber view); absent/false keeps the + // clean reader default (familyMember OR documentCount > 0). The explicit filters AND + // within whichever base the review flag selects. 
+ @RequestParam(required = false, defaultValue = "false") boolean review, + @RequestParam(required = false) String sort, + @RequestParam(defaultValue = "0") @Min(0) int page, + @RequestParam(defaultValue = "50") @Min(1) @Max(100) int size) { + // Legacy top-N-by-document-count path (reader dashboard): preserved, wrapped in the + // same envelope so /api/persons always returns one shape. It is explicitly NON-paged — + // the top-N query returns the complete result, so PersonSearchResult.topN reports an + // honest totalElements (= returned count) instead of pretending to be a page slice. + if ("documentCount".equals(sort) && q == null) { int safeSize = Math.min(size, 50); - return ResponseEntity.ok(personService.findTopByDocumentCount(safeSize)); + List top = personService.findTopByDocumentCount(safeSize); + return ResponseEntity.ok(PersonSearchResult.topN(top)); } - return ResponseEntity.ok(personService.findAll(q)); + + PersonFilter filter = PersonFilter.builder() + .type(type) + .familyOnly(familyOnly) + .hasDocuments(hasDocuments) + .provisional(provisional) + .readerDefault(!review) + .build(); + return ResponseEntity.ok(personService.search(filter, page, size, q)); } @GetMapping("/{id}") @@ -110,6 +135,21 @@ public class PersonController { personService.mergePersons(id, UUID.fromString(targetIdStr)); } + // Dedicated state transition that clears the provisional flag. A separate verb (not a + // mass-assignable DTO field) so provisional can never be smuggled in via create/update. 
+ @PatchMapping("/{id}/confirm") + @RequirePermission(Permission.WRITE_ALL) + public ResponseEntity confirmPerson(@PathVariable UUID id) { + return ResponseEntity.ok(personService.confirmPerson(id)); + } + + @DeleteMapping("/{id}") + @ResponseStatus(HttpStatus.NO_CONTENT) + @RequirePermission(Permission.WRITE_ALL) + public void deletePerson(@PathVariable UUID id) { + personService.deletePerson(id); + } + // ─── Alias endpoints ──────────────────────────────────────────────────── @GetMapping("/{id}/aliases") diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonFilter.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonFilter.java new file mode 100644 index 00000000..bc41214a --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonFilter.java @@ -0,0 +1,36 @@ +package org.raddatz.familienarchiv.person; + +import lombok.Builder; + +/** + * The reader/triage filter set for the persons directory, threaded as one value through + * {@code PersonController -> PersonService -> PersonRepository}. Each field is nullable: + * null means "do not constrain on this dimension". + * + *

+ * <ul>
+ *   <li>{@code type} — restrict to a single {@link PersonType}.</li>
+ *   <li>{@code familyOnly} — when true, only {@code familyMember} persons.</li>
+ *   <li>{@code hasDocuments} — when true, only persons with documentCount > 0.</li>
+ *   <li>{@code provisional} — match the {@code Person.provisional} flag exactly.</li>
+ *   <li>{@code readerDefault} — when true, restrict to {@code familyMember OR documentCount > 0}
+ *       (the clean reader view). The explicit filters above AND with this restriction.</li>
+ * </ul>
+ */ +@Builder +public record PersonFilter( + PersonType type, + Boolean familyOnly, + Boolean hasDocuments, + Boolean provisional, + boolean readerDefault +) { + /** The unconstrained "show all" filter (transcriber view, no reader restriction). */ + public static PersonFilter showAll() { + return PersonFilter.builder().readerDefault(false).build(); + } + + /** The clean reader default: familyMember OR documentCount > 0, no other constraints. */ + public static PersonFilter cleanDefault() { + return PersonFilter.builder().readerDefault(true).build(); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java index 6f431b74..50ff4ee9 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java @@ -32,6 +32,9 @@ public interface PersonRepository extends JpaRepository { // Lookup by full alias string, used during ODS mass import Optional findByAliasIgnoreCase(String alias); + // Lookup by the normalizer person_id, used for idempotent canonical re-import (Phase 3). 
+ Optional findBySourceRef(String sourceRef); + // Exact first+last name match, used for filename-based sender lookup Optional findByFirstNameIgnoreCaseAndLastNameIgnoreCase(String firstName, String lastName); @@ -41,7 +44,7 @@ public interface PersonRepository extends JpaRepository { SELECT p.id, p.title, p.first_name AS firstName, p.last_name AS lastName, p.person_type AS personType, p.alias, p.birth_year AS birthYear, p.death_year AS deathYear, p.notes, - p.family_member AS familyMember, + p.family_member AS familyMember, p.provisional AS provisional, (SELECT COUNT(*) FROM documents d WHERE d.sender_id = p.id) + (SELECT COUNT(*) FROM document_receivers dr WHERE dr.person_id = p.id) AS documentCount FROM persons p @@ -54,7 +57,7 @@ public interface PersonRepository extends JpaRepository { SELECT p.id, p.title, p.first_name AS firstName, p.last_name AS lastName, p.person_type AS personType, p.alias, p.birth_year AS birthYear, p.death_year AS deathYear, p.notes, - p.family_member AS familyMember, + p.family_member AS familyMember, p.provisional AS provisional, (SELECT COUNT(*) FROM documents d WHERE d.sender_id = p.id) + (SELECT COUNT(*) FROM document_receivers dr WHERE dr.person_id = p.id) AS documentCount FROM persons p @@ -63,7 +66,7 @@ public interface PersonRepository extends JpaRepository { OR LOWER(CONCAT(p.last_name,' ',COALESCE(p.first_name,''))) LIKE LOWER(CONCAT('%',:query,'%')) OR LOWER(p.alias) LIKE LOWER(CONCAT('%',:query,'%')) OR LOWER(a.last_name) LIKE LOWER(CONCAT('%',:query,'%')) - GROUP BY p.id, p.title, p.first_name, p.last_name, p.person_type, p.alias, p.birth_year, p.death_year, p.notes, p.family_member + GROUP BY p.id, p.title, p.first_name, p.last_name, p.person_type, p.alias, p.birth_year, p.death_year, p.notes, p.family_member, p.provisional ORDER BY p.last_name ASC, p.first_name ASC """, nativeQuery = true) @@ -75,7 +78,7 @@ public interface PersonRepository extends JpaRepository { SELECT p.id, p.title, p.first_name AS firstName, 
p.last_name AS lastName, p.person_type AS personType, p.alias, p.birth_year AS birthYear, p.death_year AS deathYear, p.notes, - p.family_member AS familyMember, + p.family_member AS familyMember, p.provisional AS provisional, (SELECT COUNT(*) FROM documents d WHERE d.sender_id = p.id) + (SELECT COUNT(*) FROM document_receivers dr WHERE dr.person_id = p.id) AS documentCount FROM persons p @@ -85,6 +88,61 @@ public interface PersonRepository extends JpaRepository { nativeQuery = true) List findTopByDocumentCount(@Param("limit") int limit); + // --- #667: filter-aware paged directory --- + // + // The slice query and the count query below MUST keep an IDENTICAL WHERE clause so the + // rendered page and totalElements can never drift. Every filter is nullable: a null param + // disables that predicate via the `:param IS NULL OR …` idiom. `readerDefault` (a plain + // boolean) restricts to "familyMember OR has documents"; the explicit filters AND on top. + // documentCount is recomputed inline (not via the SELECT alias) because WHERE cannot + // reference a computed alias. All params are named — no string concatenation, no injection. 
+ String FILTER_WHERE = """ + WHERE (CAST(:type AS text) IS NULL OR p.person_type = CAST(:type AS text)) + AND (:familyOnly = FALSE OR :familyOnly IS NULL OR p.family_member = TRUE) + AND (:hasDocuments = FALSE OR :hasDocuments IS NULL OR ( + (SELECT COUNT(*) FROM documents d WHERE d.sender_id = p.id) + + (SELECT COUNT(*) FROM document_receivers dr WHERE dr.person_id = p.id)) > 0) + AND (:provisional IS NULL OR p.provisional = :provisional) + AND (:readerDefault = FALSE OR ( + p.family_member = TRUE OR ( + (SELECT COUNT(*) FROM documents d WHERE d.sender_id = p.id) + + (SELECT COUNT(*) FROM document_receivers dr WHERE dr.person_id = p.id)) > 0)) + AND (CAST(:query AS text) IS NULL OR + LOWER(CONCAT(COALESCE(p.first_name,''),' ',p.last_name)) LIKE LOWER(CONCAT('%',CAST(:query AS text),'%')) + OR LOWER(CONCAT(p.last_name,' ',COALESCE(p.first_name,''))) LIKE LOWER(CONCAT('%',CAST(:query AS text),'%')) + OR LOWER(p.alias) LIKE LOWER(CONCAT('%',CAST(:query AS text),'%'))) + """; + + @Query(value = """ + SELECT p.id, p.title, p.first_name AS firstName, p.last_name AS lastName, + p.person_type AS personType, + p.alias, p.birth_year AS birthYear, p.death_year AS deathYear, p.notes, + p.family_member AS familyMember, p.provisional AS provisional, + (SELECT COUNT(*) FROM documents d WHERE d.sender_id = p.id) + + (SELECT COUNT(*) FROM document_receivers dr WHERE dr.person_id = p.id) AS documentCount + FROM persons p + """ + FILTER_WHERE + """ + ORDER BY p.last_name ASC, p.first_name ASC + LIMIT :limit OFFSET :offset + """, + nativeQuery = true) + List findByFilter(@Param("type") String type, + @Param("familyOnly") Boolean familyOnly, + @Param("hasDocuments") Boolean hasDocuments, + @Param("provisional") Boolean provisional, + @Param("readerDefault") boolean readerDefault, + @Param("query") String query, + @Param("limit") int limit, + @Param("offset") int offset); + + @Query(value = "SELECT COUNT(*) FROM persons p " + FILTER_WHERE, nativeQuery = true) + long 
countByFilter(@Param("type") String type, + @Param("familyOnly") Boolean familyOnly, + @Param("hasDocuments") Boolean hasDocuments, + @Param("provisional") Boolean provisional, + @Param("readerDefault") boolean readerDefault, + @Param("query") String query); + // --- Correspondent queries --- @Query(value = """ @@ -136,6 +194,12 @@ public interface PersonRepository extends JpaRepository { @Query(value = "UPDATE documents SET sender_id = :target WHERE sender_id = :source", nativeQuery = true) void reassignSender(@Param("source") UUID source, @Param("target") UUID target); + // Used by deletePerson: detach a deleted person from documents they sent, so the hard + // delete cannot orphan a documents.sender_id FK (the column is nullable). + @Modifying + @Query(value = "UPDATE documents SET sender_id = NULL WHERE sender_id = :source", nativeQuery = true) + void reassignSenderToNull(@Param("source") UUID source); + @Modifying @Query(value = """ INSERT INTO document_receivers (document_id, person_id) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSearchResult.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSearchResult.java new file mode 100644 index 00000000..ff605770 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSearchResult.java @@ -0,0 +1,50 @@ +package org.raddatz.familienarchiv.person; + +import io.swagger.v3.oas.annotations.media.Schema; + +import java.util.List; + +/** + * Paged result for the /api/persons list endpoint. + * + *

Hand-written to mirror {@code document/DocumentSearchResult} field-for-field so the + * frontend sees one paged shape across the app. Deliberately NOT Spring {@code Page} + * (unstable serialized shape across Spring versions, noisy in OpenAPI) and deliberately + * NOT a reuse of the document DTO (would couple two feature modules — duplication beats + * coupling here). + */ +public record PersonSearchResult( + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + List items, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + long totalElements, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + int pageNumber, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + int pageSize, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + int totalPages +) { + /** + * Paged factory: derives {@code totalPages} from the full match count and the page size. + * A zero count yields zero pages so the frontend hides the pagination control. + */ + public static PersonSearchResult paged(List slice, int pageNumber, int pageSize, long totalElements) { + int totalPages = pageSize == 0 ? 0 : (int) ((totalElements + pageSize - 1) / pageSize); + return new PersonSearchResult(slice, totalElements, pageNumber, pageSize, totalPages); + } + + /** + * Non-paged factory for the legacy {@code sort=documentCount} top-N dashboard path. + * That query returns the complete result in one shot — there is no further page + * to fetch — so the envelope reports reality rather than pretending to be a slice of a + * larger set: {@code totalElements} equals the number of rows actually returned, + * {@code pageSize} equals that same count, and {@code totalPages} is 1 (or 0 when empty). + * This avoids the earlier ambiguity where {@code totalElements} looked like a paged total. + */ + public static PersonSearchResult topN(List all) { + int count = all.size(); + int totalPages = count == 0 ? 
0 : 1; + return new PersonSearchResult(all, count, 0, count, totalPages); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java index 89b11ef3..175ab529 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java @@ -31,20 +31,55 @@ public class PersonService { private final PersonRepository personRepository; private final PersonNameAliasRepository aliasRepository; - public List findAll(String q) { - if (q == null) { - return personRepository.findAllWithDocumentCount(); - } - if (q.isBlank()) { - return List.of(); - } - return personRepository.searchWithDocumentCount(q.trim()); - } - public List findTopByDocumentCount(int limit) { return personRepository.findTopByDocumentCount(limit); } + /** + * Filtered, paginated directory query. The slice and the total are derived from one + * shared WHERE clause (see {@link PersonRepository#FILTER_WHERE}) so totalElements can + * never drift from the rendered page. {@code type} is passed as the enum name because the + * native query compares against the string column. + */ + public PersonSearchResult search(PersonFilter filter, int page, int size, String q) { + String type = filter.type() == null ? null : filter.type().name(); + String query = (q == null || q.isBlank()) ? 
null : q.trim(); + int offset = page * size; + + List items = personRepository.findByFilter( + type, filter.familyOnly(), filter.hasDocuments(), filter.provisional(), + filter.readerDefault(), query, size, offset); + long total = personRepository.countByFilter( + type, filter.familyOnly(), filter.hasDocuments(), filter.provisional(), + filter.readerDefault(), query); + + return PersonSearchResult.paged(items, page, size, total); + } + + /** + * Clears the {@code provisional} flag — a deliberate state transition exposed as + * {@code PATCH /api/persons/{id}/confirm}, never as a mass-assignable DTO field (CWE-915). + */ + @Transactional + public Person confirmPerson(UUID id) { + Person person = getById(id); + person.setProvisional(false); + return personRepository.save(person); + } + + /** + * Hard-deletes a person used by triage. Detaches the person from any documents they + * sent (nulls sender_id) and from any received-document references first, so the delete + * cannot orphan an FK and fail with a 500. + */ + @Transactional + public void deletePerson(UUID id) { + getById(id); + personRepository.reassignSenderToNull(id); + personRepository.deleteReceiverReferences(id); + personRepository.deleteById(id); + } + public Person getById(UUID id) { return personRepository.findById(id) .orElseThrow(() -> DomainException.notFound(ErrorCode.PERSON_NOT_FOUND, "Person not found: " + id)); @@ -80,6 +115,11 @@ public class PersonService { return personRepository.findByFirstNameIgnoreCaseAndLastNameIgnoreCase(firstName, lastName); } + /** Lookup by the normalizer person_id — used by the canonical importer for register-first matching. 
*/ + public Optional findBySourceRef(String sourceRef) { + return personRepository.findBySourceRef(sourceRef); + } + @Nullable @Transactional public Person findOrCreateByAlias(String rawName) { @@ -115,6 +155,80 @@ public class PersonService { }); } + /** + * Idempotent upsert keyed on {@code sourceRef} (the normalizer person_id) for the + * canonical importer (Phase 3, ADR-025). On first import the canonical fields are + * written verbatim. On re-import the human-edit-preserve precedence applies: + * a non-blank existing field is never overwritten, and {@code provisional} never + * flips back to true once a human has confirmed the person. + */ + @Transactional + public Person upsertBySourceRef(PersonUpsertCommand cmd) { + return personRepository.findBySourceRef(cmd.sourceRef()) + .map(existing -> personRepository.save(mergeCanonical(existing, cmd))) + .orElseGet(() -> fromCanonical(cmd)); + } + + private Person fromCanonical(PersonUpsertCommand cmd) { + Person person = personRepository.save(Person.builder() + .sourceRef(cmd.sourceRef()) + .firstName(blankToNull(cmd.firstName())) + .lastName(cmd.lastName()) + .notes(blankToNull(cmd.notes())) + .birthYear(cmd.birthYear()) + .deathYear(cmd.deathYear()) + .familyMember(cmd.familyMember()) + .personType(cmd.personType() == null ? 
PersonType.PERSON : cmd.personType()) + .provisional(cmd.provisional()) + .build()); + String maiden = blankToNull(cmd.maidenName()); + if (maiden != null) { + int nextSortOrder = aliasRepository.findMaxSortOrder(person.getId()) + 1; + aliasRepository.save(PersonNameAlias.builder() + .person(person) + .lastName(maiden) + .type(PersonNameAliasType.MAIDEN_NAME) + .sortOrder(nextSortOrder) + .build()); + } + return person; + } + + private Person mergeCanonical(Person existing, PersonUpsertCommand cmd) { + existing.setFirstName(preferHuman(existing.getFirstName(), cmd.firstName())); + existing.setLastName(preferHuman(existing.getLastName(), cmd.lastName())); + existing.setNotes(preferHuman(existing.getNotes(), cmd.notes())); + existing.setBirthYear(preferHuman(existing.getBirthYear(), cmd.birthYear())); + existing.setDeathYear(preferHuman(existing.getDeathYear(), cmd.deathYear())); + if (cmd.personType() != null && existing.getPersonType() == PersonType.PERSON) { + existing.setPersonType(cmd.personType()); + } + // provisional is monotonic-downward: once it is false it never reverts to true. + // This also pins the cross-loader precedence (ADR-025): a register/tree person is + // loaded before documents and already false, so a later document row that references + // the same source_ref (provisional=true) can never flip it provisional — the guard + // below only fires while existing is still provisional. Order of document rows is + // therefore irrelevant. + if (existing.isProvisional()) { + existing.setProvisional(cmd.provisional()); + } + return existing; + } + + // preferHuman keeps an existing human-entered value and only falls back to the canonical + // value when the existing one is absent — the single idiom for every fill-blank field. + private static String preferHuman(String existing, String canonical) { + return (existing == null || existing.isBlank()) ? 
blankToNull(canonical) : existing; + } + + private static Integer preferHuman(Integer existing, Integer canonical) { + return existing != null ? existing : canonical; + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s.trim(); + } + @Transactional public Person createPerson(String firstName, String lastName, String alias) { Person person = Person.builder() diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSummaryDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSummaryDTO.java index 68cbbe1b..9a92d257 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSummaryDTO.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonSummaryDTO.java @@ -18,6 +18,7 @@ public interface PersonSummaryDTO { Integer getDeathYear(); String getNotes(); boolean isFamilyMember(); + boolean isProvisional(); long getDocumentCount(); default String getDisplayName() { diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonUpsertCommand.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonUpsertCommand.java new file mode 100644 index 00000000..63864ab6 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonUpsertCommand.java @@ -0,0 +1,24 @@ +package org.raddatz.familienarchiv.person; + +import lombok.Builder; + +/** + * Importer → {@link PersonService} command for an idempotent upsert keyed on + * {@code sourceRef} (the normalizer's stable person_id). Carries only the canonical + * fields the importer owns; the service applies the human-edit-preserve precedence + * (see ADR-025): non-blank existing fields are never overwritten, and {@code provisional} + * never flips back to true once a human has confirmed a person. 
+ */ +@Builder +public record PersonUpsertCommand( + String sourceRef, + String firstName, + String lastName, + String maidenName, + String notes, + Integer birthYear, + Integer deathYear, + boolean familyMember, + PersonType personType, + boolean provisional +) {} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/relationship/RelationshipService.java b/backend/src/main/java/org/raddatz/familienarchiv/person/relationship/RelationshipService.java index 032c1263..d813b8e8 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/relationship/RelationshipService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/relationship/RelationshipService.java @@ -31,6 +31,12 @@ import java.util.UUID; @RequiredArgsConstructor public class RelationshipService { + // Single source of truth for which relationship types are part of the family graph. + // Consulted by addRelationship (to set family_member on both endpoints) and by + // getFamilyNetwork (to filter the edges returned). FRIEND/COLLEAGUE/etc. are excluded. + private static final List FAMILY_RELATION_TYPES = + List.of(RelationType.PARENT_OF, RelationType.SPOUSE_OF, RelationType.SIBLING_OF); + private final PersonRelationshipRepository relationshipRepository; private final PersonService personService; private final RelationshipInferenceService inferenceService; @@ -64,7 +70,7 @@ public class RelationshipService { } List familyEdges = relationshipRepository.findAllByRelationTypeIn( - List.of(RelationType.PARENT_OF, RelationType.SPOUSE_OF, RelationType.SIBLING_OF)); + FAMILY_RELATION_TYPES); List edges = new ArrayList<>(); for (PersonRelationship r : familyEdges) { @@ -105,15 +111,23 @@ public class RelationshipService { .notes(blankToNull(dto.notes())) .build(); + PersonRelationship saved; try { // saveAndFlush so the unique_rel constraint violates synchronously and is // caught here, not at commit time outside the @Transactional boundary. 
- return toDTO(relationshipRepository.saveAndFlush(rel)); + saved = relationshipRepository.saveAndFlush(rel); } catch (DataIntegrityViolationException e) { throw DomainException.conflict( ErrorCode.DUPLICATE_RELATIONSHIP, "Relationship already exists for (" + personId + ", " + relatedPerson.getId() + ", " + dto.relationType() + ")"); } + // Family-graph edges imply both endpoints are family members. Idempotent: the + // setter is a no-op when the person is already flagged, so re-imports stay clean. + if (FAMILY_RELATION_TYPES.contains(dto.relationType())) { + personService.setFamilyMember(person.getId(), true); + personService.setFamilyMember(relatedPerson.getId(), true); + } + return toDTO(saved); } @Transactional diff --git a/backend/src/main/java/org/raddatz/familienarchiv/tag/Tag.java b/backend/src/main/java/org/raddatz/familienarchiv/tag/Tag.java index fc5974a6..32585eed 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/tag/Tag.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/Tag.java @@ -30,4 +30,11 @@ public class Tag { /** Color token name (e.g. "sage"), only set on root-level tags. Null means no color. */ private String color; + + /** + * Import identity key, keyed on the canonical tag_path. Null for manually created tags; + * unique among non-null values. The importer (Phase 3) uses it for idempotent re-import. + */ + @Column(name = "source_ref") + private String sourceRef; } diff --git a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java index 4a7fab90..f1b3b7ab 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java @@ -22,6 +22,9 @@ public interface TagRepository extends JpaRepository { Optional findByNameIgnoreCase(String name); + // Lookup by the canonical tag_path, used for idempotent canonical re-import (Phase 3). 
+ Optional findBySourceRef(String sourceRef); + List findByNameContainingIgnoreCase(String name); /** diff --git a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java index a572f84f..14e1e9fa 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java @@ -7,6 +7,7 @@ import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.UUID; import java.util.stream.Collectors; @@ -49,12 +50,37 @@ public class TagService { .orElseThrow(() -> DomainException.notFound(ErrorCode.TAG_NOT_FOUND, "Tag not found: " + id)); } + /** Lookup by the canonical tag_path — used by the canonical importer to attach a document's tag. */ + public Optional findBySourceRef(String sourceRef) { + return tagRepository.findBySourceRef(sourceRef); + } + public Tag findOrCreate(String name) { String cleanName = name.trim(); return tagRepository.findByNameIgnoreCase(cleanName) .orElseGet(() -> tagRepository.save(Tag.builder().name(cleanName).build())); } + /** + * Idempotent upsert keyed on {@code sourceRef} (the canonical tag_path) for the + * Phase-3 importer (ADR-025). On first import the canonical name and parent are + * written; on re-import a human-renamed tag name is preserved (the source_ref is the + * stable identity, the name is a human-editable label). 
+ */ + @Transactional + public Tag upsertBySourceRef(String sourceRef, String name, UUID parentId) { + return tagRepository.findBySourceRef(sourceRef) + .map(existing -> { + existing.setParentId(parentId); + return tagRepository.save(existing); + }) + .orElseGet(() -> tagRepository.save(Tag.builder() + .sourceRef(sourceRef) + .name(name) + .parentId(parentId) + .build())); + } + @Transactional public Tag update(UUID id, TagUpdateDTO dto) { Tag tag = getById(id); diff --git a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java index 18b6c2c0..74b5d643 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java @@ -5,7 +5,8 @@ import org.raddatz.familienarchiv.security.Permission; import org.raddatz.familienarchiv.security.RequirePermission; import org.raddatz.familienarchiv.document.DocumentService; import org.raddatz.familienarchiv.document.DocumentVersionService; -import org.raddatz.familienarchiv.importing.MassImportService; +import org.raddatz.familienarchiv.importing.CanonicalImportOrchestrator; +import org.raddatz.familienarchiv.importing.ImportStatus; import org.raddatz.familienarchiv.document.ThumbnailBackfillService; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.GetMapping; @@ -21,20 +22,20 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class AdminController { - private final MassImportService massImportService; + private final CanonicalImportOrchestrator importOrchestrator; private final DocumentService documentService; private final DocumentVersionService documentVersionService; private final ThumbnailBackfillService thumbnailBackfillService; @PostMapping("/trigger-import") - public ResponseEntity triggerMassImport() { - massImportService.runImportAsync(); - return 
ResponseEntity.accepted().body(massImportService.getStatus()); + public ResponseEntity triggerMassImport() { + importOrchestrator.runImportAsync(); + return ResponseEntity.accepted().body(importOrchestrator.getStatus()); } @GetMapping("/import-status") - public ResponseEntity importStatus() { - return ResponseEntity.ok(massImportService.getStatus()); + public ResponseEntity importStatus() { + return ResponseEntity.ok(importOrchestrator.getStatus()); } @PostMapping("/backfill-versions") diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index e74f4d41..1e4558e0 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -125,17 +125,10 @@ app: password: ${APP_ADMIN_PASSWORD:admin123} import: - col: - index: 0 - box: 1 - folder: 2 - sender: 3 - receivers: 5 - date: 7 - location: 9 - tags: 10 - summary: 11 - transcription: 13 + # Directory holding the normalizer's committed canonical artifacts + # (canonical-{documents,persons,tag-tree}.xlsx + canonical-persons-tree.json). + # The loader maps columns by header name — no positional indices (see ADR-025). + dir: ${IMPORT_DIR:/import} ocr: sender-model: diff --git a/backend/src/main/resources/db/migration/V69__import_precision_attribution_identity_schema.sql b/backend/src/main/resources/db/migration/V69__import_precision_attribution_identity_schema.sql new file mode 100644 index 00000000..bec01873 --- /dev/null +++ b/backend/src/main/resources/db/migration/V69__import_precision_attribution_identity_schema.sql @@ -0,0 +1,67 @@ +-- Phase 2 of "Handling the Unknowns": the schema foundation. +-- Consolidates every new import/precision/attribution/identity column into ONE +-- migration with a single owner so downstream phases (importer, rendering, persons +-- directory) compile against a finished, collision-free schema. See ADR-025. 
+-- +-- This file is forward-only and immutable once shipped (Flyway checksum model): +-- any fix goes in a later version, never an edit here. + +-- ─── documents: date precision, range end, raw date, raw attribution ────────── + +-- Range end is only set for RANGE precision (open-ended ranges allowed → end may be null). +ALTER TABLE documents ADD COLUMN meta_date_end date; + +-- Original date cell, verbatim, for provenance and "as written" display (Phase 4). +ALTER TABLE documents ADD COLUMN meta_date_raw text; + +-- Raw attribution preserved even when a person is linked. +ALTER TABLE documents ADD COLUMN sender_text text; +ALTER TABLE documents ADD COLUMN receiver_text text; + +-- Bound user-influenced spreadsheet text at the DB layer (mirrors transcription_blocks +-- length cap in V18). Defense in depth against malformed/huge import cells. +ALTER TABLE documents ADD CONSTRAINT chk_meta_date_raw_length CHECK (length(meta_date_raw) <= 10000); +ALTER TABLE documents ADD CONSTRAINT chk_sender_text_length CHECK (length(sender_text) <= 10000); +ALTER TABLE documents ADD CONSTRAINT chk_receiver_text_length CHECK (length(receiver_text) <= 10000); + +-- Precision enum — added with a DB default of 'UNKNOWN', backfilled, then made NOT NULL. +-- The DEFAULT serves two purposes: (1) existing rows get 'UNKNOWN' immediately, and +-- (2) raw-SQL inserts that omit the column (test fixtures, ad-hoc data loads) get a sane, +-- CHECK-valid value instead of violating the NOT NULL constraint. JPA saves still set it +-- explicitly via the entity's @Builder.Default = DatePrecision.UNKNOWN. +ALTER TABLE documents ADD COLUMN meta_date_precision varchar(16) DEFAULT 'UNKNOWN'; + +UPDATE documents +SET meta_date_precision = CASE WHEN meta_date IS NOT NULL THEN 'DAY' ELSE 'UNKNOWN' END; + +ALTER TABLE documents ALTER COLUMN meta_date_precision SET NOT NULL; + +-- Fail-closed allowlist of the seven precision values (verbatim mirror of the +-- normalizer's Precision enum). 
The DB enforces validity independent of the Java enum. +ALTER TABLE documents ADD CONSTRAINT chk_meta_date_precision + CHECK (meta_date_precision IN ('DAY', 'MONTH', 'SEASON', 'YEAR', 'RANGE', 'APPROX', 'UNKNOWN')); + +-- A non-null range end is permitted only when precision = RANGE. A RANGE row MAY have a +-- null end (open-ended range), so the rule is one-directional, not biconditional. +ALTER TABLE documents ADD CONSTRAINT chk_meta_date_end_only_for_range + CHECK (meta_date_end IS NULL OR meta_date_precision = 'RANGE'); + +-- For ranges with both endpoints, the end must not precede the start. +ALTER TABLE documents ADD CONSTRAINT chk_meta_date_end_after_start + CHECK (meta_date_end IS NULL OR meta_date IS NULL OR meta_date_end >= meta_date); + +-- ─── persons: source_ref (import identity) + provisional flag ───────────────── + +-- The normalizer person_id: join key for documents → persons and idempotency key for +-- re-import. Nullable (manually created persons never have one); unique among non-nulls. +ALTER TABLE persons ADD COLUMN source_ref varchar(255); +CREATE UNIQUE INDEX idx_persons_source_ref ON persons (source_ref); + +-- A provisional person is one the importer inferred but could not confidently identify. +-- Stays false until Phase 3 (importer) sets it; no code path writes true in this phase. 
+ALTER TABLE persons ADD COLUMN provisional boolean NOT NULL DEFAULT false; + +-- ─── tag: source_ref (import identity, keyed on canonical tag_path) ─────────── + +ALTER TABLE tag ADD COLUMN source_ref varchar(255); +CREATE UNIQUE INDEX idx_tag_source_ref ON tag (source_ref); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/MigrationIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/MigrationIntegrationTest.java index 425d0f59..ce217e6f 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/MigrationIntegrationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/MigrationIntegrationTest.java @@ -479,6 +479,191 @@ class MigrationIntegrationTest { assertThat(count).isEqualTo(1); } + // ─── V69: import/precision/attribution/identity schema foundation ──────── + + @Test + void v69_metaDatePrecisionColumn_isNotNull() { + Integer count = jdbc.queryForObject( + """ + SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'documents' + AND column_name = 'meta_date_precision' + AND is_nullable = 'NO' + """, + Integer.class); + assertThat(count).isEqualTo(1); + } + + @Test + void v69_backfillSql_setsDatedRowsToDayPrecision() { + // Re-run the migration's backfill UPDATE on a freshly dated row to prove the rule. 
+ UUID docId = createDocumentWithDate("1943-05-12"); + + jdbc.update(V69_BACKFILL_PRECISION_SQL); + + String precision = jdbc.queryForObject( + "SELECT meta_date_precision FROM documents WHERE id = ?", String.class, docId); + assertThat(precision).isEqualTo("DAY"); + } + + @Test + void v69_backfillSql_setsUndatedRowsToUnknownPrecision() { + UUID docId = createDocument(); // no meta_date + + jdbc.update(V69_BACKFILL_PRECISION_SQL); + + String precision = jdbc.queryForObject( + "SELECT meta_date_precision FROM documents WHERE id = ?", String.class, docId); + assertThat(precision).isEqualTo("UNKNOWN"); + } + + // Mirrors the backfill UPDATE shipped in V69; idempotent for verification. + private static final String V69_BACKFILL_PRECISION_SQL = """ + UPDATE documents + SET meta_date_precision = CASE WHEN meta_date IS NOT NULL THEN 'DAY' ELSE 'UNKNOWN' END + """; + + @Test + void v69_precisionCheck_rejectsValueOutsideEnum() { + UUID docId = createDocument(); + + assertThatThrownBy(() -> + jdbc.update("UPDATE documents SET meta_date_precision = 'BOGUS' WHERE id = ?", docId) + ).isInstanceOf(DataIntegrityViolationException.class); + } + + @Test + void v69_metaDateEndCheck_rejectsNonNullEndWhenPrecisionNotRange() { + UUID docId = createDocumentWithDate("1943-05-12"); // precision DAY + + assertThatThrownBy(() -> + jdbc.update("UPDATE documents SET meta_date_end = '1943-06-01' WHERE id = ?", docId) + ).isInstanceOf(DataIntegrityViolationException.class); + } + + @Test + void v69_metaDateEndCheck_allowsNonNullEndWhenPrecisionRange() { + UUID docId = createDocumentWithDate("1943-05-12"); + + int rows = jdbc.update( + "UPDATE documents SET meta_date_precision = 'RANGE', meta_date_end = '1943-06-01' WHERE id = ?", + docId); + assertThat(rows).isEqualTo(1); + } + + @Test + void v69_metaDateEndCheck_allowsRangeWithNullEnd() { + // Loose semantics: the normalizer may emit an open-ended RANGE (start only). 
+ UUID docId = createDocumentWithDate("1943-05-12"); + + int rows = jdbc.update( + "UPDATE documents SET meta_date_precision = 'RANGE' WHERE id = ?", docId); + assertThat(rows).isEqualTo(1); + } + + @Test + void v69_metaDateEndCheck_allowsRangeWithBothEndpointsNull() { + // Fully-open RANGE: neither start (meta_date) nor end (meta_date_end) is set. + // Both CHECKs hold (end IS NULL passes chk_meta_date_end_only_for_range; both-null + // passes chk_meta_date_end_after_start), so the row survives. This locks the actual + // DB behavior so a future tightening to a biconditional rule is a deliberate change. + UUID docId = createDocument(); // null meta_date + + int rows = jdbc.update( + "UPDATE documents SET meta_date_precision = 'RANGE' WHERE id = ?", docId); + assertThat(rows).isEqualTo(1); + + Object metaDate = jdbc.queryForObject("SELECT meta_date FROM documents WHERE id = ?", Object.class, docId); + Object metaDateEnd = jdbc.queryForObject( + "SELECT meta_date_end FROM documents WHERE id = ?", Object.class, docId); + assertThat(metaDate).isNull(); + assertThat(metaDateEnd).isNull(); + } + + @Test + void v69_rangeOrderCheck_rejectsEndBeforeStart() { + UUID docId = createDocumentWithDate("1943-05-12"); + + assertThatThrownBy(() -> + jdbc.update( + "UPDATE documents SET meta_date_precision = 'RANGE', meta_date_end = '1943-01-01' WHERE id = ?", + docId) + ).isInstanceOf(DataIntegrityViolationException.class); + } + + @Test + void v69_metaDateRawCheck_rejectsOverlongText() { + UUID docId = createDocument(); + String tooLong = "x".repeat(10001); + + assertThatThrownBy(() -> + jdbc.update("UPDATE documents SET meta_date_raw = ? 
WHERE id = ?", tooLong, docId) + ).isInstanceOf(DataIntegrityViolationException.class); + } + + @Test + void v69_senderTextAndReceiverText_storeRawAttribution() { + UUID docId = createDocument(); + + int rows = jdbc.update( + "UPDATE documents SET sender_text = 'Oma Anna', receiver_text = 'Tante Grete' WHERE id = ?", + docId); + assertThat(rows).isEqualTo(1); + } + + @Test + @Transactional(propagation = Propagation.NOT_SUPPORTED) + void v69_personsSourceRef_uniqueIndexRejectsDuplicate() { + jdbc.update( + "INSERT INTO persons (id, last_name, source_ref) VALUES (gen_random_uuid(), 'A', 'person:dup')"); + try { + assertThatThrownBy(() -> + jdbc.update( + "INSERT INTO persons (id, last_name, source_ref) VALUES (gen_random_uuid(), 'B', 'person:dup')") + ).isInstanceOf(DataIntegrityViolationException.class); + } finally { + jdbc.update("DELETE FROM persons WHERE source_ref = 'person:dup'"); + } + } + + @Test + @Transactional(propagation = Propagation.NOT_SUPPORTED) + void v69_personsSourceRef_allowsMultipleNulls() { + UUID a = createPerson("Null", "RefA"); + UUID b = createPerson("Null", "RefB"); + try { + String refA = jdbc.queryForObject("SELECT source_ref FROM persons WHERE id = ?", String.class, a); + String refB = jdbc.queryForObject("SELECT source_ref FROM persons WHERE id = ?", String.class, b); + assertThat(refA).isNull(); + assertThat(refB).isNull(); + } finally { + jdbc.update("DELETE FROM persons WHERE id IN (?, ?)", a, b); + } + } + + @Test + void v69_personsProvisional_defaultsToFalse() { + UUID id = createPerson("Provisional", "Default"); + + Boolean provisional = jdbc.queryForObject( + "SELECT provisional FROM persons WHERE id = ?", Boolean.class, id); + assertThat(provisional).isFalse(); + } + + @Test + @Transactional(propagation = Propagation.NOT_SUPPORTED) + void v69_tagSourceRef_uniqueIndexRejectsDuplicate() { + jdbc.update("INSERT INTO tag (id, name, source_ref) VALUES (gen_random_uuid(), 'TagDupA', 'tag:dup')"); + try { + assertThatThrownBy(() -> + 
jdbc.update("INSERT INTO tag (id, name, source_ref) VALUES (gen_random_uuid(), 'TagDupB', 'tag:dup')") + ).isInstanceOf(DataIntegrityViolationException.class); + } finally { + jdbc.update("DELETE FROM tag WHERE source_ref = 'tag:dup'"); + } + } + // ─── helpers ───────────────────────────────────────────────────────────── private UUID createPerson(String firstName, String lastName) { @@ -504,6 +689,12 @@ class MigrationIntegrationTest { return doc.getId(); } + private UUID createDocumentWithDate(String isoDate) { + UUID id = createDocument(); + jdbc.update("UPDATE documents SET meta_date = ?::date WHERE id = ?", isoDate, id); + return id; + } + private UUID insertAnnotation(UUID docId) { UUID id = UUID.randomUUID(); jdbc.update(""" diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentControllerTest.java index fe15ba3b..7c9b28a1 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentControllerTest.java @@ -1,6 +1,7 @@ package org.raddatz.familienarchiv.document; import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; import org.raddatz.familienarchiv.document.DocumentBatchMetadataDTO; import org.raddatz.familienarchiv.document.DocumentSearchResult; import org.raddatz.familienarchiv.document.DocumentVersionSummary; @@ -35,7 +36,9 @@ import java.util.List; import java.util.Optional; import java.util.UUID; +import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.verify; @@ -73,23 +76,69 @@ class DocumentControllerTest { @Test @WithMockUser void search_returns200_whenAuthenticated() throws 
Exception { - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of())); mockMvc.perform(get("/api/documents/search")) .andExpect(status().isOk()); } + @Test + @WithMockUser + void search_undatedTrue_isReachableByAuthenticatedUser() throws Exception { + // The read GET must stay reachable for READ_ALL users — guards against a + // future refactor accidentally write-guarding the undated triage path (#668). + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) + .thenReturn(DocumentSearchResult.of(List.of())); + + mockMvc.perform(get("/api/documents/search").param("undated", "true")) + .andExpect(status().isOk()); + } + + @Test + void search_undatedTrue_returns401_whenUnauthenticated() throws Exception { + mockMvc.perform(get("/api/documents/search").param("undated", "true")) + .andExpect(status().isUnauthorized()); + } + + @Test + @WithMockUser + void search_undatedTrue_isForwardedToServiceAsTrue() throws Exception { + ArgumentCaptor undatedCaptor = ArgumentCaptor.forClass(Boolean.class); + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) + .thenReturn(DocumentSearchResult.of(List.of())); + + mockMvc.perform(get("/api/documents/search").param("undated", "true")) + .andExpect(status().isOk()); + + verify(documentService).searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), undatedCaptor.capture(), any()); + assertThat(undatedCaptor.getValue()).isTrue(); + } + + @Test + @WithMockUser + void search_withoutUndatedParam_forwardsFalseToService() throws Exception { + ArgumentCaptor undatedCaptor = 
ArgumentCaptor.forClass(Boolean.class); + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) + .thenReturn(DocumentSearchResult.of(List.of())); + + mockMvc.perform(get("/api/documents/search")) + .andExpect(status().isOk()); + + verify(documentService).searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), undatedCaptor.capture(), any()); + assertThat(undatedCaptor.getValue()).isFalse(); + } + @Test @WithMockUser void search_withStatusParam_passesItToService() throws Exception { - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), eq(DocumentStatus.REVIEWED), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), eq(DocumentStatus.REVIEWED), any(), any(), any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of())); mockMvc.perform(get("/api/documents/search").param("status", "REVIEWED")) .andExpect(status().isOk()); - verify(documentService).searchDocuments(any(), any(), any(), any(), any(), any(), any(), eq(DocumentStatus.REVIEWED), any(), any(), any(), any()); + verify(documentService).searchDocuments(any(), any(), any(), any(), any(), any(), any(), eq(DocumentStatus.REVIEWED), any(), any(), any(), anyBoolean(), any()); } @Test @@ -116,7 +165,7 @@ class DocumentControllerTest { @Test @WithMockUser void search_responseContainsTotalCount() throws Exception { - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of())); mockMvc.perform(get("/api/documents/search")) @@ -131,9 +180,10 @@ class DocumentControllerTest { UUID docId = UUID.randomUUID(); var matchData = new SearchMatchData( "Er 
schrieb einen langen Brief", List.of(), false, List.of(), List.of(), List.of(), null, List.of()); - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem( - docId, "Brief an Anna", "brief.pdf", null, null, null, + docId, "Brief an Anna", "brief.pdf", null, null, + DatePrecision.UNKNOWN, null, null, List.of(), List.of(), null, null, null, null, 0, List.of(), matchData, LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0))))); @@ -150,9 +200,10 @@ class DocumentControllerTest { void search_returns_flat_item_with_id_and_without_sensitive_fields() throws Exception { UUID docId = UUID.randomUUID(); var matchData = new SearchMatchData(null, List.of(), false, List.of(), List.of(), List.of(), null, List.of()); - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of(new DocumentListItem( - docId, "Brief an Anna", "brief.pdf", null, null, null, + docId, "Brief an Anna", "brief.pdf", null, null, + DatePrecision.UNKNOWN, null, null, List.of(), List.of(), null, null, null, null, 0, List.of(), matchData, LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0))))); @@ -172,7 +223,7 @@ class DocumentControllerTest { @Test @WithMockUser void search_responseExposesPagingFields() throws Exception { - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), 
any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of())); mockMvc.perform(get("/api/documents/search")) @@ -217,7 +268,7 @@ class DocumentControllerTest { @Test @WithMockUser void search_passesPageRequestToService() throws Exception { - when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), any())) .thenReturn(DocumentSearchResult.of(List.of())); mockMvc.perform(get("/api/documents/search").param("page", "2").param("size", "25")) @@ -225,7 +276,7 @@ class DocumentControllerTest { org.mockito.ArgumentCaptor captor = org.mockito.ArgumentCaptor.forClass(org.springframework.data.domain.Pageable.class); - verify(documentService).searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), captor.capture()); + verify(documentService).searchDocuments(any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean(), captor.capture()); org.springframework.data.domain.Pageable pageable = captor.getValue(); org.assertj.core.api.Assertions.assertThat(pageable.getPageNumber()).isEqualTo(2); org.assertj.core.api.Assertions.assertThat(pageable.getPageSize()).isEqualTo(25); @@ -294,6 +345,34 @@ class DocumentControllerTest { .andExpect(status().isOk()); } + @Test + @WithMockUser(authorities = "WRITE_ALL") + void updateDocument_bindsPrecisionFormFields_toDTO() throws Exception { + // Pins the wire contract: the edit form's metaDatePrecision / metaDateEnd / + // metaDateRaw multipart field names must bind to DocumentUpdateDTO. A rename + // on either side silently drops the precision edit; this captures the DTO. 
+ UUID id = UUID.randomUUID(); + Document doc = Document.builder().id(id).title("Brief").originalFilename("brief.pdf").build(); + when(userService.findByEmail(any())).thenReturn(AppUser.builder().id(UUID.randomUUID()).build()); + + org.mockito.ArgumentCaptor captor = + org.mockito.ArgumentCaptor.forClass(DocumentUpdateDTO.class); + when(documentService.updateDocument(eq(id), captor.capture(), any(), any())).thenReturn(doc); + + mockMvc.perform(multipart("/api/documents/" + id) + .param("metaDatePrecision", "RANGE") + .param("metaDateEnd", "1917-01-11") + .param("metaDateRaw", "10.–11. Januar 1917") + .with(req -> { req.setMethod("PUT"); return req; }).with(csrf())) + .andExpect(status().isOk()); + + DocumentUpdateDTO bound = captor.getValue(); + org.assertj.core.api.Assertions.assertThat(bound.getMetaDatePrecision()).isEqualTo(DatePrecision.RANGE); + org.assertj.core.api.Assertions.assertThat(bound.getMetaDateEnd()) + .isEqualTo(java.time.LocalDate.of(1917, 1, 11)); + org.assertj.core.api.Assertions.assertThat(bound.getMetaDateRaw()).isEqualTo("10.–11. 
Januar 1917"); + } + // ─── DELETE /api/documents/{id} ────────────────────────────────────────── @Test @@ -1115,7 +1194,7 @@ class DocumentControllerTest { void getDocumentIds_returns200_andDelegatesToService() throws Exception { when(userService.findByEmail(any())).thenReturn(AppUser.builder().id(UUID.randomUUID()).build()); UUID id = UUID.randomUUID(); - when(documentService.findIdsForFilter(any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.findIdsForFilter(any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean())) .thenReturn(List.of(id)); mockMvc.perform(get("/api/documents/ids")) @@ -1128,13 +1207,13 @@ class DocumentControllerTest { void getDocumentIds_passesSenderIdParamToService() throws Exception { when(userService.findByEmail(any())).thenReturn(AppUser.builder().id(UUID.randomUUID()).build()); UUID senderId = UUID.randomUUID(); - when(documentService.findIdsForFilter(any(), any(), any(), eq(senderId), any(), any(), any(), any(), any())) + when(documentService.findIdsForFilter(any(), any(), any(), eq(senderId), any(), any(), any(), any(), any(), anyBoolean())) .thenReturn(List.of()); mockMvc.perform(get("/api/documents/ids").param("senderId", senderId.toString())) .andExpect(status().isOk()); - verify(documentService).findIdsForFilter(any(), any(), any(), eq(senderId), any(), any(), any(), any(), any()); + verify(documentService).findIdsForFilter(any(), any(), any(), eq(senderId), any(), any(), any(), any(), any(), anyBoolean()); } @Test @@ -1144,7 +1223,7 @@ class DocumentControllerTest { // Service returns 5001 IDs — one over BULK_EDIT_FILTER_MAX_IDS (5000). 
java.util.List tooMany = new java.util.ArrayList<>(5001); for (int i = 0; i < 5001; i++) tooMany.add(UUID.randomUUID()); - when(documentService.findIdsForFilter(any(), any(), any(), any(), any(), any(), any(), any(), any())) + when(documentService.findIdsForFilter(any(), any(), any(), any(), any(), any(), any(), any(), any(), anyBoolean())) .thenReturn(tooMany); mockMvc.perform(get("/api/documents/ids")) diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentLazyLoadingTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentLazyLoadingTest.java index 62a2d843..1b5a4b1e 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentLazyLoadingTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentLazyLoadingTest.java @@ -123,8 +123,7 @@ class DocumentLazyLoadingTest { DocumentSearchResult result = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.RECEIVER, "asc", null, - PageRequest.of(0, 20)); + DocumentSort.RECEIVER, "asc", null, false, PageRequest.of(0, 20)); assertThat(result.totalElements()).isGreaterThan(0); assertThatCode(() -> result.items().forEach(i -> { if (i.sender() != null) i.sender().getLastName(); })) @@ -139,8 +138,7 @@ class DocumentLazyLoadingTest { assertThatCode(() -> documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.SENDER, "asc", null, - PageRequest.of(0, 20))) + DocumentSort.SENDER, "asc", null, false, PageRequest.of(0, 20))) .doesNotThrowAnyException(); } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentListItemIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentListItemIntegrationTest.java index 4c532882..3d0e4b90 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentListItemIntegrationTest.java +++ 
b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentListItemIntegrationTest.java @@ -56,8 +56,7 @@ class DocumentListItemIntegrationTest { assertThatCode(() -> documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(0, 50))) + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50))) .doesNotThrowAnyException(); } @@ -72,8 +71,7 @@ class DocumentListItemIntegrationTest { DocumentSearchResult result = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(0, 50)); + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); assertThat(result.totalElements()).isGreaterThan(0); DocumentListItem item = result.items().get(0); @@ -81,6 +79,27 @@ class DocumentListItemIntegrationTest { assertThat(item.title()).isEqualTo("Kurrent Brief"); } + @Test + void search_listItem_carriesMetaDatePrecisionAndEnd() { + documentRepository.save(Document.builder() + .title("Range Brief") + .originalFilename("range.pdf") + .status(DocumentStatus.UPLOADED) + .documentDate(java.time.LocalDate.of(1943, 1, 1)) + .metaDatePrecision(DatePrecision.RANGE) + .metaDateEnd(java.time.LocalDate.of(1943, 12, 31)) + .build()); + + DocumentSearchResult result = documentService.searchDocuments( + null, null, null, null, null, null, null, null, + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); + + DocumentListItem item = result.items().stream() + .filter(i -> i.title().equals("Range Brief")).findFirst().orElseThrow(); + assertThat(item.metaDatePrecision()).isEqualTo(DatePrecision.RANGE); + assertThat(item.metaDateEnd()).isEqualTo(java.time.LocalDate.of(1943, 12, 31)); + } + @Test void detail_stillReturnsTrainingLabels() { Document saved = documentRepository.save(Document.builder() diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchPagedIntegrationTest.java 
b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchPagedIntegrationTest.java index c61c38af..3d65cbac 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchPagedIntegrationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchPagedIntegrationTest.java @@ -62,8 +62,7 @@ class DocumentSearchPagedIntegrationTest { void search_firstPage_returnsExactlyPageSizeItems_andCorrectTotalElements() { DocumentSearchResult result = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(0, 50)); + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); assertThat(result.items()).hasSize(50); assertThat(result.totalElements()).isEqualTo(FIXTURE_SIZE); @@ -76,8 +75,7 @@ class DocumentSearchPagedIntegrationTest { void search_lastPartialPage_returnsRemainingItems() { DocumentSearchResult result = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(2, 50)); + DocumentSort.DATE, "DESC", null, false, PageRequest.of(2, 50)); // Page 2 (offset 100) of 120 docs → exactly 20 items on the tail. assertThat(result.items()).hasSize(20); @@ -89,8 +87,7 @@ class DocumentSearchPagedIntegrationTest { void search_pageBeyondLast_returnsEmptyContent_totalElementsStillCorrect() { DocumentSearchResult result = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(99, 50)); + DocumentSort.DATE, "DESC", null, false, PageRequest.of(99, 50)); assertThat(result.items()).isEmpty(); assertThat(result.totalElements()).isEqualTo(FIXTURE_SIZE); @@ -103,8 +100,7 @@ class DocumentSearchPagedIntegrationTest { // returns the correct total from a real repository fetch. 
DocumentSearchResult result = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.SENDER, "asc", null, - PageRequest.of(1, 50)); + DocumentSort.SENDER, "asc", null, false, PageRequest.of(1, 50)); assertThat(result.items()).hasSize(50); assertThat(result.totalElements()).isEqualTo(FIXTURE_SIZE); @@ -112,16 +108,91 @@ class DocumentSearchPagedIntegrationTest { assertThat(result.totalPages()).isEqualTo(3); } + @Test + void search_undatedCount_isGlobalFilteredTotal_notPageSlice() { + // Seed 70 undated docs on top of the 120 dated ones. With a 50-per-page + // window the undated rows span multiple pages, so a page-local count could + // never exceed 50 — the global count must be the full 70 (issue #668). + int undatedTotal = 70; + for (int i = 0; i < undatedTotal; i++) { + documentRepository.save(Document.builder() + .title("Undatiert-" + String.format("%03d", i)) + .originalFilename("undatiert-" + i + ".pdf") + .status(DocumentStatus.UPLOADED) + .metaDatePrecision(DatePrecision.UNKNOWN) + .documentDate(null) + .build()); + } + + DocumentSearchResult result = documentService.searchDocuments( + null, null, null, null, null, null, null, null, + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); + + // Global undated count is the full undated total, independent of page size. + assertThat(result.undatedCount()).isEqualTo(undatedTotal); + // Total matches both dated + undated (no undated-only filter applied). + assertThat(result.totalElements()).isEqualTo(FIXTURE_SIZE + undatedTotal); + // The first DATE-DESC page is all dated rows (nulls last), so a page-local + // tally would report 0 undated — proving the count is not page-derived. 
+ assertThat(result.items()).allMatch(item -> item.documentDate() != null); + } + + @Test + void search_undatedCount_ignoresUndatedOnlyToggle() { + // The "Nur undatierte" toggle must not skew the count: whether undated=true or + // false, the global undated count for the same filter is identical (issue #668). + int undatedTotal = 12; + for (int i = 0; i < undatedTotal; i++) { + documentRepository.save(Document.builder() + .title("U-" + i) + .originalFilename("u-" + i + ".pdf") + .status(DocumentStatus.UPLOADED) + .metaDatePrecision(DatePrecision.UNKNOWN) + .documentDate(null) + .build()); + } + + DocumentSearchResult unfiltered = documentService.searchDocuments( + null, null, null, null, null, null, null, null, + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); + DocumentSearchResult undatedOnly = documentService.searchDocuments( + null, null, null, null, null, null, null, null, + DocumentSort.DATE, "DESC", null, true, PageRequest.of(0, 50)); + + assertThat(unfiltered.undatedCount()).isEqualTo(undatedTotal); + assertThat(undatedOnly.undatedCount()).isEqualTo(undatedTotal); + } + + @Test + void search_undatedCount_isZero_insideDateRange() { + // A from/to range excludes undated rows by the collision rule (#668), so the + // global undated count inside a range is legitimately 0 even when undated docs exist. 
+ for (int i = 0; i < 5; i++) { + documentRepository.save(Document.builder() + .title("U-range-" + i) + .originalFilename("u-range-" + i + ".pdf") + .status(DocumentStatus.UPLOADED) + .metaDatePrecision(DatePrecision.UNKNOWN) + .documentDate(null) + .build()); + } + + DocumentSearchResult result = documentService.searchDocuments( + null, LocalDate.of(1900, 1, 1), LocalDate.of(2000, 12, 31), + null, null, null, null, null, + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); + + assertThat(result.undatedCount()).isZero(); + } + @Test void search_differentPagesReturnDisjointSlices() { DocumentSearchResult page0 = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(0, 50)); + DocumentSort.DATE, "DESC", null, false, PageRequest.of(0, 50)); DocumentSearchResult page1 = documentService.searchDocuments( null, null, null, null, null, null, null, null, - DocumentSort.DATE, "DESC", null, - PageRequest.of(1, 50)); + DocumentSort.DATE, "DESC", null, false, PageRequest.of(1, 50)); // No document id should appear on both pages — slicing must be exclusive. 
var idsOnPage0 = page0.items().stream() diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchResultTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchResultTest.java index a487e272..09a8613b 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchResultTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSearchResultTest.java @@ -15,7 +15,8 @@ class DocumentSearchResultTest { private DocumentListItem item(UUID docId) { return new DocumentListItem( - docId, "Test", "test.pdf", null, null, null, + docId, "Test", "test.pdf", null, null, + DatePrecision.UNKNOWN, null, null, List.of(), List.of(), null, null, null, null, 0, List.of(), SearchMatchData.empty(), LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0)); @@ -66,7 +67,8 @@ class DocumentSearchResultTest { UUID id = UUID.randomUUID(); ActivityActorDTO actor = new ActivityActorDTO("AB", "#f00", "Anna Braun"); DocumentListItem item = new DocumentListItem( - id, "T", "t.pdf", null, null, null, + id, "T", "t.pdf", null, null, + DatePrecision.UNKNOWN, null, null, List.of(), List.of(), null, null, null, null, 75, List.of(actor), SearchMatchData.empty(), LocalDateTime.of(2026, 1, 15, 10, 0), LocalDateTime.of(2026, 1, 15, 10, 0)); @@ -100,4 +102,32 @@ class DocumentSearchResultTest { assertThat(schema.requiredMode()).isEqualTo(Schema.RequiredMode.REQUIRED); } } + + @Test + void undatedCount_component_is_annotated_as_required_in_openapi_schema() throws NoSuchFieldException { + Schema schema = DocumentSearchResult.class.getDeclaredField("undatedCount").getAnnotation(Schema.class); + assertThat(schema).isNotNull(); + assertThat(schema.requiredMode()).isEqualTo(Schema.RequiredMode.REQUIRED); + } + + @Test + void factories_default_undatedCount_to_zero() { + assertThat(DocumentSearchResult.of(List.of()).undatedCount()).isZero(); + assertThat(DocumentSearchResult.paged(List.of(), 
PageRequest.of(0, 50), 0L).undatedCount()).isZero(); + } + + @Test + void withUndatedCount_overlays_count_and_preserves_other_fields() { + DocumentSearchResult base = DocumentSearchResult.paged( + List.of(item(UUID.randomUUID())), PageRequest.of(1, 50), 120L); + + DocumentSearchResult withCount = base.withUndatedCount(7L); + + assertThat(withCount.undatedCount()).isEqualTo(7L); + assertThat(withCount.items()).isEqualTo(base.items()); + assertThat(withCount.totalElements()).isEqualTo(120L); + assertThat(withCount.pageNumber()).isEqualTo(1); + assertThat(withCount.pageSize()).isEqualTo(50); + assertThat(withCount.totalPages()).isEqualTo(3); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java index abf6e389..c3d00619 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceSortTest.java @@ -67,7 +67,7 @@ class DocumentServiceSortTest { .thenReturn(new PageImpl<>(List.of(newer, older))); DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.DATE, "DESC", null, PAGE); + "Brief", null, null, null, null, null, null, null, DocumentSort.DATE, "DESC", null, false, PAGE); assertThat(result.items()).hasSize(2); assertThat(result.items().get(0).id()).isEqualTo(id2); // newer first @@ -84,7 +84,7 @@ class DocumentServiceSortTest { .thenReturn(List.of(doc(id1))); documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE); + "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, false, PAGE); verify(documentRepository).findFtsPageRaw(anyString(), anyInt(), anyInt()); verify(documentRepository, never()).findAllMatchingIdsByFts(anyString()); @@ 
-102,7 +102,7 @@ class DocumentServiceSortTest { when(documentRepository.findAllById(any())).thenReturn(List.of(doc(id2), doc(id1))); // unordered from JPA DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE); + "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, false, PAGE); assertThat(result.items().get(0).id()).isEqualTo(id1); } @@ -119,7 +119,7 @@ class DocumentServiceSortTest { when(documentRepository.findAllById(any())).thenReturn(List.of(doc(id2), doc(id1))); DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, null, null, null, PAGE); + "Brief", null, null, null, null, null, null, null, null, null, null, false, PAGE); assertThat(result.items().get(0).id()).isEqualTo(id1); } @@ -133,7 +133,7 @@ class DocumentServiceSortTest { DocumentSearchResult result = documentService.searchDocuments( "Brief", null, null, null, null, null, null, null, - DocumentSort.RELEVANCE, null, null, hugePage); + DocumentSort.RELEVANCE, null, null, false, hugePage); assertThat(result.items()).isEmpty(); verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt()); @@ -153,7 +153,7 @@ class DocumentServiceSortTest { DocumentSearchResult result = documentService.searchDocuments( "Brief", null, null, null, null, null, null, null, - DocumentSort.RELEVANCE, null, null, PAGE); + DocumentSort.RELEVANCE, null, null, false, PAGE); assertThat(result.items()).hasSize(1); assertThat(result.items().get(0).id()).isEqualTo(uuidId); @@ -173,7 +173,7 @@ class DocumentServiceSortTest { // sender filter is active → triggers in-memory path, not findFtsPageRaw LocalDate from = LocalDate.of(1900, 1, 1); documentService.searchDocuments( - "Brief", from, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, PAGE); + "Brief", from, null, null, null, null, null, null, 
DocumentSort.RELEVANCE, null, null, false, PAGE); verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt()); verify(documentRepository).findAllMatchingIdsByFts("Brief"); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java index 8ef7a6f2..04b84fba 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentServiceTest.java @@ -47,6 +47,8 @@ import java.util.UUID; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.ArgumentMatchers.isNull; import static org.mockito.Mockito.*; @@ -144,6 +146,53 @@ class DocumentServiceTest { assertThat(doc.getArchiveFolder()).isEqualTo("Mappe B"); } + @Test + void updateDocument_persistsDatePrecisionEndAndRaw() throws Exception { + UUID id = UUID.randomUUID(); + Document doc = Document.builder().id(id).receivers(new HashSet<>()).tags(new HashSet<>()).build(); + when(documentRepository.findById(id)).thenReturn(Optional.of(doc)); + when(documentRepository.save(any())).thenReturn(doc); + + DocumentUpdateDTO dto = new DocumentUpdateDTO(); + dto.setDocumentDate(LocalDate.of(1917, 1, 10)); + dto.setMetaDatePrecision(DatePrecision.RANGE); + dto.setMetaDateEnd(LocalDate.of(1917, 1, 11)); + dto.setMetaDateRaw("10.–11. Januar 1917"); + + documentService.updateDocument(id, dto, null, null); + + assertThat(doc.getMetaDatePrecision()).isEqualTo(DatePrecision.RANGE); + assertThat(doc.getMetaDateEnd()).isEqualTo(LocalDate.of(1917, 1, 11)); + assertThat(doc.getMetaDateRaw()).isEqualTo("10.–11. 
Januar 1917"); + } + + @Test + void updateDocument_preservesStoredPrecision_whenDtoOmitsIt() throws Exception { + // Editing a doc (e.g. fixing a location typo) without touching the precision + // controls must NOT fabricate a precision. The form omits the three precision + // fields → they arrive null on the DTO → the stored values must be preserved. + UUID id = UUID.randomUUID(); + Document doc = Document.builder() + .id(id) + .metaDatePrecision(DatePrecision.MONTH) + .metaDateEnd(LocalDate.of(1916, 6, 30)) + .metaDateRaw("Juni 1916") + .receivers(new HashSet<>()) + .tags(new HashSet<>()) + .build(); + when(documentRepository.findById(id)).thenReturn(Optional.of(doc)); + when(documentRepository.save(any())).thenReturn(doc); + + DocumentUpdateDTO dto = new DocumentUpdateDTO(); + dto.setLocation("Berlin"); // unrelated edit; precision fields left null + + documentService.updateDocument(id, dto, null, null); + + assertThat(doc.getMetaDatePrecision()).isEqualTo(DatePrecision.MONTH); + assertThat(doc.getMetaDateEnd()).isEqualTo(LocalDate.of(1916, 6, 30)); + assertThat(doc.getMetaDateRaw()).isEqualTo("Juni 1916"); + } + // ─── deleteTagCascading ─────────────────────────────────────────────────── @Test @@ -1362,8 +1411,7 @@ class DocumentServiceTest { .thenReturn(new PageImpl<>(List.of())); documentService.searchDocuments(null, null, null, null, null, null, null, null, - org.raddatz.familienarchiv.document.DocumentSort.DATE, "DESC", null, - org.springframework.data.domain.PageRequest.of(1, 50)); + org.raddatz.familienarchiv.document.DocumentSort.DATE, "DESC", null, false, org.springframework.data.domain.PageRequest.of(1, 50)); verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class)); verify(documentRepository, never()).findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Sort.class)); @@ -1376,8 +1424,7 @@ class DocumentServiceTest { .thenReturn(new PageImpl<>(List.of())); 
documentService.searchDocuments(null, null, null, null, null, null, null, null, - org.raddatz.familienarchiv.document.DocumentSort.DATE, "DESC", null, - org.springframework.data.domain.PageRequest.of(3, 25)); + org.raddatz.familienarchiv.document.DocumentSort.DATE, "DESC", null, false, org.springframework.data.domain.PageRequest.of(3, 25)); verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), captor.capture()); assertThat(captor.getValue().getPageNumber()).isEqualTo(3); @@ -1393,8 +1440,7 @@ class DocumentServiceTest { .thenReturn(new PageImpl<>(List.of(d), org.springframework.data.domain.PageRequest.of(0, 50), 120L)); DocumentSearchResult result = documentService.searchDocuments(null, null, null, null, null, null, null, null, - org.raddatz.familienarchiv.document.DocumentSort.DATE, "DESC", null, - org.springframework.data.domain.PageRequest.of(0, 50)); + org.raddatz.familienarchiv.document.DocumentSort.DATE, "DESC", null, false, org.springframework.data.domain.PageRequest.of(0, 50)); assertThat(result.totalElements()).isEqualTo(120L); assertThat(result.pageNumber()).isZero(); @@ -1403,6 +1449,50 @@ class DocumentServiceTest { assertThat(result.items()).hasSize(1); // only the slice is enriched } + @Test + void searchDocuments_dateSort_DESC_ordersUndatedLast() { + ArgumentCaptor captor = ArgumentCaptor.forClass(Pageable.class); + when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class))) + .thenReturn(new PageImpl<>(List.of())); + + documentService.searchDocuments(null, null, null, null, null, null, null, null, + DocumentSort.DATE, "DESC", null, false, org.springframework.data.domain.PageRequest.of(0, 5)); + + verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), captor.capture()); + Sort.Order dateOrder = captor.getValue().getSort().getOrderFor("documentDate"); + assertThat(dateOrder).isNotNull(); + 
assertThat(dateOrder.getDirection()).isEqualTo(Sort.Direction.DESC); + assertThat(dateOrder.getNullHandling()).isEqualTo(Sort.NullHandling.NULLS_LAST); + // Owner-decided tiebreaker (#668): title ASC, not createdAt. + Sort.Order tiebreak = captor.getValue().getSort().getOrderFor("title"); + assertThat(tiebreak).isNotNull(); + assertThat(tiebreak.getDirection()).isEqualTo(Sort.Direction.ASC); + assertThat(captor.getValue().getSort().getOrderFor("createdAt")).isNull(); + } + + @Test + void searchDocuments_dateSort_ASC_ordersUndatedLast() { + // Null placement must not be left to the database default: PostgreSQL sorts NULLs LAST on ASC but FIRST on DESC, and other engines differ. + // Pinning NULLS LAST explicitly keeps undated documents off the top in every direction. This was the red (failing-first) test. + ArgumentCaptor captor = ArgumentCaptor.forClass(Pageable.class); + when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class))) + .thenReturn(new PageImpl<>(List.of())); + + documentService.searchDocuments(null, null, null, null, null, null, null, null, + DocumentSort.DATE, "ASC", null, false, org.springframework.data.domain.PageRequest.of(0, 5)); + + verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), captor.capture()); + Sort.Order dateOrder = captor.getValue().getSort().getOrderFor("documentDate"); + assertThat(dateOrder).isNotNull(); + assertThat(dateOrder.getDirection()).isEqualTo(Sort.Direction.ASC); + assertThat(dateOrder.getNullHandling()).isEqualTo(Sort.NullHandling.NULLS_LAST); + // Owner-decided tiebreaker (#668): title ASC, not createdAt. 
+ Sort.Order tiebreak = captor.getValue().getSort().getOrderFor("title"); + assertThat(tiebreak).isNotNull(); + assertThat(tiebreak.getDirection()).isEqualTo(Sort.Direction.ASC); + assertThat(captor.getValue().getSort().getOrderFor("createdAt")).isNull(); + } + @Test void searchDocuments_UPDATED_AT_sort_resolves_to_updatedAt_field() { ArgumentCaptor captor = ArgumentCaptor.forClass(Pageable.class); @@ -1410,8 +1500,7 @@ class DocumentServiceTest { .thenReturn(new PageImpl<>(List.of())); documentService.searchDocuments(null, null, null, null, null, null, null, null, - DocumentSort.UPDATED_AT, "DESC", null, - org.springframework.data.domain.PageRequest.of(0, 5)); + DocumentSort.UPDATED_AT, "DESC", null, false, org.springframework.data.domain.PageRequest.of(0, 5)); verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), captor.capture()); assertThat(captor.getValue().getSort()) @@ -1435,8 +1524,7 @@ class DocumentServiceTest { .thenReturn(all); DocumentSearchResult result = documentService.searchDocuments(null, null, null, null, null, null, null, null, - org.raddatz.familienarchiv.document.DocumentSort.SENDER, "asc", null, - org.springframework.data.domain.PageRequest.of(1, 50)); + org.raddatz.familienarchiv.document.DocumentSort.SENDER, "asc", null, false, org.springframework.data.domain.PageRequest.of(1, 50)); assertThat(result.totalElements()).isEqualTo(120L); assertThat(result.pageNumber()).isEqualTo(1); @@ -1460,8 +1548,7 @@ class DocumentServiceTest { .thenReturn(all); DocumentSearchResult result = documentService.searchDocuments(null, null, null, null, null, null, null, null, - org.raddatz.familienarchiv.document.DocumentSort.SENDER, "asc", null, - org.springframework.data.domain.PageRequest.of(10, 50)); + org.raddatz.familienarchiv.document.DocumentSort.SENDER, "asc", null, false, org.springframework.data.domain.PageRequest.of(10, 50)); assertThat(result.items()).isEmpty(); 
assertThat(result.totalElements()).isEqualTo(30L); @@ -1474,7 +1561,7 @@ class DocumentServiceTest { when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class))) .thenReturn(new PageImpl<>(List.of())); - documentService.searchDocuments(null, null, null, null, null, null, null, DocumentStatus.REVIEWED, null, null, null, UNPAGED); + documentService.searchDocuments(null, null, null, null, null, null, null, DocumentStatus.REVIEWED, null, null, null, false, UNPAGED); verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class)); } @@ -1484,7 +1571,7 @@ class DocumentServiceTest { when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class))) .thenReturn(new PageImpl<>(List.of())); - documentService.searchDocuments(null, null, null, null, null, null, null, null, null, null, null, UNPAGED); + documentService.searchDocuments(null, null, null, null, null, null, null, null, null, null, null, false, UNPAGED); verify(documentRepository).findAll(any(org.springframework.data.jpa.domain.Specification.class), any(Pageable.class)); } @@ -1562,7 +1649,7 @@ class DocumentServiceTest { .thenReturn(List.of(withSender, noSender)); DocumentSearchResult result = documentService.searchDocuments( - null, null, null, null, null, null, null, null, DocumentSort.SENDER, "asc", null, UNPAGED); + null, null, null, null, null, null, null, null, DocumentSort.SENDER, "asc", null, false, UNPAGED); assertThat(result.items()).hasSize(2); assertThat(result.items()).extracting(DocumentListItem::title).containsExactly("Has Sender", "No Sender"); @@ -1582,12 +1669,117 @@ class DocumentServiceTest { .thenReturn(List.of(noReceivers, withReceiver)); DocumentSearchResult result = documentService.searchDocuments( - null, null, null, null, null, null, null, null, DocumentSort.RECEIVER, "asc", null, UNPAGED); + null, null, null, null, null, null, null, 
null, DocumentSort.RECEIVER, "asc", null, false, UNPAGED); assertThat(result.items()).extracting(DocumentListItem::title) .containsExactly("Has Receiver", "No Receivers"); } + // ─── searchDocuments — undated docs stay in their person group (#668) ─────── + + @Test + void searchDocuments_senderSort_asc_keepsUndatedInsideSenderGroupNotAtHead() { + // Locking test (#668): the in-memory SENDER comparator orders by sender name, + // not by date, so an undated (null documentDate) letter must stay WITHIN its + // sender's group — it must NOT float to the head of a multi-sender page. + // Two senders, each with a dated + an undated doc. ASC by "lastName firstName": + // "Adler Bob" < "Ziegler Anna", so both of Bob's docs come before both of Anna's. + // The undated doc supplied FIRST in the input proves grouping (not date) wins: + // were it ordered by date, the two undated docs would clump together at one end. + Person bobAdler = Person.builder().id(UUID.randomUUID()).firstName("Bob").lastName("Adler").build(); + Person annaZiegler = Person.builder().id(UUID.randomUUID()).firstName("Anna").lastName("Ziegler").build(); + Document undatedBob = Document.builder().id(UUID.randomUUID()).title("Bob undated") + .sender(bobAdler).documentDate(null).build(); + Document datedBob = Document.builder().id(UUID.randomUUID()).title("Bob dated") + .sender(bobAdler).documentDate(LocalDate.of(1916, 6, 15)).build(); + Document undatedAnna = Document.builder().id(UUID.randomUUID()).title("Anna undated") + .sender(annaZiegler).documentDate(null).build(); + Document datedAnna = Document.builder().id(UUID.randomUUID()).title("Anna dated") + .sender(annaZiegler).documentDate(LocalDate.of(1943, 12, 24)).build(); + + // Input order interleaves dated/undated so a date-based regression would reorder. 
+ when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class))) + .thenReturn(List.of(undatedBob, datedAnna, datedBob, undatedAnna)); + + DocumentSearchResult result = documentService.searchDocuments( + null, null, null, null, null, null, null, null, DocumentSort.SENDER, "asc", null, false, UNPAGED); + + // Bob's group precedes Anna's group (ASC by sender). The sort is stable, so + // within each group the input order is preserved (undatedBob, datedBob for Bob; + // datedAnna, undatedAnna for Anna). The undated docs never jump to the head and + // each stays inside its sender group — a date-based comparator would instead + // clump the two undated docs together at one end. + assertThat(result.items()).extracting(DocumentListItem::title) + .containsExactly("Bob undated", "Bob dated", "Anna dated", "Anna undated"); + } + + @Test + void searchDocuments_senderSort_desc_keepsUndatedInsideSenderGroupNotAtHead() { + // DESC symmetry for the in-memory path: sender order reverses ("Ziegler Anna" + // before "Adler Bob"), but the undated doc still sorts by sender, never by date, + // so it stays within its group and does not surface at the page head. 
+ Person bobAdler = Person.builder().id(UUID.randomUUID()).firstName("Bob").lastName("Adler").build(); + Person annaZiegler = Person.builder().id(UUID.randomUUID()).firstName("Anna").lastName("Ziegler").build(); + Document undatedBob = Document.builder().id(UUID.randomUUID()).title("Bob undated") + .sender(bobAdler).documentDate(null).build(); + Document datedBob = Document.builder().id(UUID.randomUUID()).title("Bob dated") + .sender(bobAdler).documentDate(LocalDate.of(1916, 6, 15)).build(); + Document undatedAnna = Document.builder().id(UUID.randomUUID()).title("Anna undated") + .sender(annaZiegler).documentDate(null).build(); + Document datedAnna = Document.builder().id(UUID.randomUUID()).title("Anna dated") + .sender(annaZiegler).documentDate(LocalDate.of(1943, 12, 24)).build(); + + when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class))) + .thenReturn(List.of(undatedBob, datedAnna, datedBob, undatedAnna)); + + DocumentSearchResult result = documentService.searchDocuments( + null, null, null, null, null, null, null, null, DocumentSort.SENDER, "desc", null, false, UNPAGED); + + // Anna's group precedes Bob's (DESC by sender); undated stays inside its group. + assertThat(result.items()).extracting(DocumentListItem::title) + .containsExactly("Anna dated", "Anna undated", "Bob undated", "Bob dated"); + } + + @Test + void searchDocuments_undatedTrue_withSenderSort_appliesUndatedSpecification() { + // Reachable UI state: "Nur undatierte" toggled on while grouped by sender. + // The SENDER sort takes the in-memory path, but the undatedOnly predicate must + // still be composed into the Specification handed to the repository. This test only + // captures the spec passed to findAll and asserts it is non-null; it does not evaluate the predicate. 
+ Person alice = Person.builder().id(UUID.randomUUID()).firstName("Alice").lastName("Ziegler").build(); + Document undatedFromAlice = Document.builder().id(UUID.randomUUID()).title("Undated") + .sender(alice).documentDate(null).build(); + + org.mockito.ArgumentCaptor> specCaptor = + org.mockito.ArgumentCaptor.forClass(org.springframework.data.jpa.domain.Specification.class); + when(documentRepository.findAll(specCaptor.capture())) + .thenReturn(List.of(undatedFromAlice)); + + DocumentSearchResult result = documentService.searchDocuments( + null, null, null, null, null, null, null, null, DocumentSort.SENDER, "asc", null, true, UNPAGED); + + // The in-memory path queried via a Specification (built by buildSearchSpec with + // undatedOnly(true)) rather than skipping straight to a sorted findAll. + assertThat(specCaptor.getValue()).isNotNull(); + assertThat(result.items()).extracting(DocumentListItem::title).containsExactly("Undated"); + } + + @Test + void searchDocuments_undatedTrue_usesSpecificationPath_notPureTextRelevanceShortcut() { + // undated=true must bypass the pure-text RELEVANCE SQL shortcut, which + // skips buildSearchSpec and would silently drop the undatedOnly predicate. + when(documentRepository.findAllMatchingIdsByFts("brief")).thenReturn(List.of(UUID.randomUUID())); + when(documentRepository.findAll(any(org.springframework.data.jpa.domain.Specification.class))) + .thenReturn(List.of()); + + documentService.searchDocuments("brief", null, null, null, null, null, null, null, + DocumentSort.RELEVANCE, null, null, true, UNPAGED); + + // The FTS-id path (buildSearchSpec) ran; the raw-page SQL shortcut did not. 
+ verify(documentRepository).findAllMatchingIdsByFts("brief"); + verify(documentRepository, never()).findFtsPageRaw(anyString(), anyInt(), anyInt()); + } + @Test void searchDocuments_senderSort_nullLastNameSortsToEnd() { // Without fix: null lastName produces sort key "null Smith" which compares @@ -1604,7 +1796,7 @@ class DocumentServiceTest { .thenReturn(List.of(docNullName, docSmith)); DocumentSearchResult result = documentService.searchDocuments( - null, null, null, null, null, null, null, null, DocumentSort.SENDER, "asc", null, UNPAGED); + null, null, null, null, null, null, null, null, DocumentSort.SENDER, "asc", null, false, UNPAGED); // null lastName should sort to end (treated as empty), not before "smith" (as "null") assertThat(result.items()).extracting(DocumentListItem::title) @@ -1627,7 +1819,7 @@ class DocumentServiceTest { when(documentRepository.findEnrichmentData(any(), eq("Brief"))).thenReturn(rows); DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, UNPAGED); + "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, false, UNPAGED); assertThat(result.items()).hasSize(1); SearchMatchData md = result.items().get(0).matchData(); @@ -1641,8 +1833,7 @@ class DocumentServiceTest { .thenReturn(new PageImpl<>(List.of())); DocumentSearchResult result = documentService.searchDocuments( - null, null, null, null, null, null, null, null, null, null, null, - UNPAGED); + null, null, null, null, null, null, null, null, null, null, null, false, UNPAGED); assertThat(result.items()).isEmpty(); } @@ -1662,7 +1853,7 @@ class DocumentServiceTest { when(documentRepository.findEnrichmentData(any(), eq("Brief"))).thenReturn(rows); DocumentSearchResult result = documentService.searchDocuments( - "Brief", null, null, null, null, null, null, null, DocumentSort.RELEVANCE, null, null, UNPAGED); + "Brief", null, null, null, null, null, null, null, 
DocumentSort.RELEVANCE, null, null, false, UNPAGED); SearchMatchData md = result.items().get(0).matchData(); assertThat(md.transcriptionSnippet()).isEqualTo("Hier ist der Brief aus Berlin"); @@ -2179,7 +2370,7 @@ class DocumentServiceTest { .thenReturn(List.of(d1, d2)); List result = documentService.findIdsForFilter( - null, null, null, null, null, null, null, null, null); + null, null, null, null, null, null, null, null, null, false); assertThat(result).containsExactly(d1.getId(), d2.getId()); } @@ -2194,7 +2385,7 @@ class DocumentServiceTest { when(tagService.expandTagNamesToDescendantIdSets(any())).thenReturn(List.of()); documentService.findIdsForFilter( - null, null, null, null, null, List.of("Brief"), null, null, TagOperator.OR); + null, null, null, null, null, List.of("Brief"), null, null, TagOperator.OR, false); // Spec built without throwing → OR branch was exercised. Coverage gain // is in not-throwing on the OR-specific code path; the actual SQL is @@ -2207,7 +2398,7 @@ class DocumentServiceTest { when(documentRepository.findAllMatchingIdsByFts("xyz")).thenReturn(List.of()); List result = documentService.findIdsForFilter( - "xyz", null, null, null, null, null, null, null, null); + "xyz", null, null, null, null, null, null, null, null, false); assertThat(result).isEmpty(); verify(documentRepository, never()).findAll(any(org.springframework.data.jpa.domain.Specification.class)); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSpecificationsTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSpecificationsTest.java index 7af1ec22..b9f8a46d 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSpecificationsTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/DocumentSpecificationsTest.java @@ -261,4 +261,21 @@ class DocumentSpecificationsTest { assertThat(result).isEmpty(); } + // ─── undatedOnly ────────────────────────────────────────────────────────── + 
+ @Test + void undatedOnly_false_returnsAllDocuments() { + // false → no predicate (null), so the filter is a no-op (issue #668). + List result = documentRepository.findAll(Specification.where(undatedOnly(false))); + assertThat(result).hasSize(3); + } + + @Test + void undatedOnly_true_returnsOnlyDocumentsWithoutADate() { + // Only the placeholder photo has a null documentDate in the fixture. + List result = documentRepository.findAll(Specification.where(undatedOnly(true))); + assertThat(result).extracting(Document::getTitle).containsExactly("Familienfoto"); + assertThat(result).allMatch(d -> d.getDocumentDate() == null); + } + } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/document/UndatedDocumentOrderingIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/document/UndatedDocumentOrderingIntegrationTest.java new file mode 100644 index 00000000..e1eeddc7 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/document/UndatedDocumentOrderingIntegrationTest.java @@ -0,0 +1,149 @@ +package org.raddatz.familienarchiv.document; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.raddatz.familienarchiv.config.FlywayConfig; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest; +import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase; +import org.springframework.context.annotation.Import; +import org.springframework.data.domain.Sort; +import org.springframework.data.jpa.domain.Specification; + +import java.time.LocalDate; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.raddatz.familienarchiv.document.DocumentSpecifications.isBetween; +import static org.raddatz.familienarchiv.document.DocumentSpecifications.undatedOnly; + +/** + * Real-Postgres assertions for issue 
#668. H2 disagrees with Postgres on + * {@code NULLS FIRST/LAST} defaults and on whether {@code BETWEEN} excludes + * NULL, so these guarantees MUST run against {@code postgres:16-alpine}, never + * an in-memory database. + */ +@DataJpaTest +@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE) +@Import({PostgresContainerConfig.class, FlywayConfig.class}) +class UndatedDocumentOrderingIntegrationTest { + + @Autowired DocumentRepository documentRepository; + + @BeforeEach + void setUp() { + documentRepository.deleteAll(); + save("1916", LocalDate.of(1916, 6, 15)); + save("1943", LocalDate.of(1943, 12, 24)); + save("undated-a", null); + save("undated-b", null); + } + + private void save(String title, LocalDate date) { + documentRepository.save(Document.builder() + .title(title) + .originalFilename(title + ".pdf") + .status(DocumentStatus.UPLOADED) + .metaDatePrecision(date == null ? DatePrecision.UNKNOWN : DatePrecision.DAY) + .documentDate(date) + .build()); + } + + @Test + void dateAscWithNullsLast_returnsDatedFirstUndatedLast() { + Sort sort = Sort.by(new Sort.Order(Sort.Direction.ASC, "documentDate").nullsLast()); + + List result = documentRepository.findAll(sort); + + assertThat(result).hasSize(4); + assertThat(result.get(0).getDocumentDate()).isEqualTo(LocalDate.of(1916, 6, 15)); + assertThat(result.get(1).getDocumentDate()).isEqualTo(LocalDate.of(1943, 12, 24)); + assertThat(result.get(2).getDocumentDate()).isNull(); + assertThat(result.get(3).getDocumentDate()).isNull(); + } + + @Test + void sameDate_tiebreaksByTitleAsc_notCreatedAt_forBothDirections() throws Exception { + // Owner decision (#668): equal-date rows tie-break by title ASC, NOT + // createdAt. Insert two same-date docs so that createdAt order (insertion + // order) is the OPPOSITE of title order: the first-saved doc gets the later + // title ("zzz-first"), the second-saved doc gets the earlier title + // ("aaa-second"). 
If the tiebreaker were still createdAt-asc the first-saved + // row would lead; because it is title-asc the "aaa-second" row must lead — + // and it must lead in BOTH ASC and DESC date directions, since the date is + // equal so only the title tiebreaker decides. + // + // The Sort under test is built by the PRODUCTION resolveSort(DATE, dir) (via + // reflection — it is private), not hand-rolled here, so this test proves the + // real Postgres ordering that production emits, on real same-date rows. + documentRepository.deleteAll(); + LocalDate sameDate = LocalDate.of(1920, 3, 3); + save("zzz-first", sameDate); // saved first → earlier createdAt + save("aaa-second", sameDate); // saved second → later createdAt + + List asc = documentRepository.findAll(resolveProductionSort("ASC")); + assertThat(asc).extracting(Document::getTitle) + .containsExactly("aaa-second", "zzz-first"); + + List desc = documentRepository.findAll(resolveProductionSort("DESC")); + assertThat(desc).extracting(Document::getTitle) + .containsExactly("aaa-second", "zzz-first"); + } + + /** + * Invokes the production {@link DocumentService#resolveSort(DocumentSort, String)} + * for the DATE sort so the integration assertions exercise the real tiebreaker + * choice rather than a sort hand-built in the test. + */ + private Sort resolveProductionSort(String dir) throws Exception { + // resolveSort is a pure function of its arguments (uses no instance state), so a + // bean instance with null collaborators is sufficient to exercise it. 
+ var ctor = DocumentService.class.getDeclaredConstructors()[0]; + ctor.setAccessible(true); + Object[] args = new Object[ctor.getParameterCount()]; + DocumentService service = (DocumentService) ctor.newInstance(args); + var m = DocumentService.class.getDeclaredMethod("resolveSort", DocumentSort.class, String.class); + m.setAccessible(true); + return (Sort) m.invoke(service, DocumentSort.DATE, dir); + } + + @Test + void undatedOnly_returnsExactlyTheNullDatedRows() { + List result = documentRepository.findAll(undatedOnly(true)); + + assertThat(result).hasSize(2); + assertThat(result).allMatch(d -> d.getDocumentDate() == null); + } + + @Test + void undatedOnly_false_returnsAllRows() { + Specification spec = Specification.where(undatedOnly(false)); + + List result = documentRepository.findAll(spec); + + assertThat(result).hasSize(4); + } + + @Test + void dateRange_excludesUndatedRows() { + List result = documentRepository.findAll(isBetween( + LocalDate.of(1900, 1, 1), LocalDate.of(2000, 12, 31))); + + assertThat(result).hasSize(2); + assertThat(result).allMatch(d -> d.getDocumentDate() != null); + } + + @Test + void undatedOnly_combinedWithDateRange_returnsEmpty() { + // The collision rule (#668): a from/to range and undated=true are mutually + // exclusive — a row cannot both have a null date and fall inside a range. 
+ Specification spec = Specification + .where(undatedOnly(true)) + .and(isBetween(LocalDate.of(1900, 1, 1), LocalDate.of(2000, 12, 31))); + + List result = documentRepository.findAll(spec); + + assertThat(result).isEmpty(); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java new file mode 100644 index 00000000..090ffe31 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java @@ -0,0 +1,229 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentRepository; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonRepository; +import org.raddatz.familienarchiv.tag.TagRepository; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.test.context.bean.override.mockito.MockitoBean; +import org.springframework.test.util.ReflectionTestUtils; +import software.amazon.awssdk.services.s3.S3Client; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Real Postgres (Testcontainers) 
integration test for the canonical importer. The + * {@code UNIQUE(source_ref)} constraint and the upsert-on-conflict behaviour only exist + * in real Postgres (never H2), so idempotency is verified here. S3 is mocked — the + * synthetic document rows carry no on-disk files, so every document is a PLACEHOLDER and + * no upload is attempted. + */ +@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) +@ActiveProfiles("test") +@Import(PostgresContainerConfig.class) +class CanonicalImportIntegrationTest { + + @MockitoBean S3Client s3Client; + + @Autowired CanonicalImportOrchestrator orchestrator; + @Autowired PersonRepository personRepository; + @Autowired TagRepository tagRepository; + @Autowired DocumentRepository documentRepository; + + Path artifactDir; + + @BeforeEach + void setUp() throws Exception { + documentRepository.deleteAll(); + personRepository.deleteAll(); + tagRepository.deleteAll(); + artifactDir = Files.createTempDirectory("canonical-import-it"); + writeArtifacts(artifactDir); + ReflectionTestUtils.setField(orchestrator, "canonicalDir", artifactDir.toString()); + } + + /** + * The import commits through its own transactions (the orchestrator is not transactional), + * so this test cannot rely on {@code @Transactional} rollback for isolation. Delete the + * committed rows after each test — otherwise the last test's documents (dated 1888-02) and + * persons/tags leak into the shared Testcontainers Postgres and pollute other integration + * tests that assume a known seed (e.g. DocumentDensityIntegrationTest, + * DocumentSearchPagedIntegrationTest). Mirrors the @AfterEach deleteAll convention used by + * DocumentListItemIntegrationTest. 
+ */ + @AfterEach + void cleanup() { + documentRepository.deleteAll(); + personRepository.deleteAll(); + tagRepository.deleteAll(); + } + + @Test + void reimport_isIdempotent_noDuplicatePersonsTagsOrDocuments() { + orchestrator.runImport(); + long personsAfterFirst = personRepository.count(); + long tagsAfterFirst = tagRepository.count(); + long documentsAfterFirst = documentRepository.count(); + assertThat(orchestrator.getStatus().state()).isEqualTo(ImportStatus.State.DONE); + assertThat(personsAfterFirst).isPositive(); + assertThat(tagsAfterFirst).isPositive(); + assertThat(documentsAfterFirst).isPositive(); + + orchestrator.runImport(); + + assertThat(personRepository.count()).isEqualTo(personsAfterFirst); + assertThat(tagRepository.count()).isEqualTo(tagsAfterFirst); + assertThat(documentRepository.count()).isEqualTo(documentsAfterFirst); + } + + @Test + void reimport_preservesHumanEditedPersonField() { + orchestrator.runImport(); + Person walter = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + walter.setNotes("Verified by archivist"); + walter.setFirstName("Walther"); + personRepository.save(walter); + + orchestrator.runImport(); + + Person reimported = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + assertThat(reimported.getNotes()).isEqualTo("Verified by archivist"); + assertThat(reimported.getFirstName()).isEqualTo("Walther"); + } + + @Test + void import_linksDocumentSenderToRegisterPerson_andRetainsRawText() { + orchestrator.runImport(); + + Person walter = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + Document doc = documentRepository.findByOriginalFilename("W-0001").orElseThrow(); + assertThat(doc.getSender()).isNotNull(); + assertThat(doc.getSender().getId()).isEqualTo(walter.getId()); + assertThat(doc.getSenderText()).isEqualTo("Walter de Gruyter"); + assertThat(doc.getStatus()).isEqualTo(DocumentStatus.PLACEHOLDER); + } + + @Test + void 
import_provisionalFlag_trueForImporterCreated_falseForRegister() { + orchestrator.runImport(); + + Optional register = personRepository.findBySourceRef("de-gruyter-walter"); + assertThat(register).get().extracting(Person::isProvisional).isEqualTo(false); + } + + @Test + void reimport_prunesRemovedReceiverAndTag_whenCanonicalRowShrinks() throws Exception { + orchestrator.runImport(); + // findById uses the Document.full entity graph so receivers/tags initialise eagerly. + Document before = documentRepository.findById( + documentRepository.findByOriginalFilename("W-0001").orElseThrow().getId()).orElseThrow(); + assertThat(before.getReceivers()).isNotEmpty(); + assertThat(before.getTags()).isNotEmpty(); + + // Re-stage the document sheet with W-0001's receiver and tag removed. + writeSheet(artifactDir.resolve("canonical-documents.xlsx"), + List.of("index", "sender_person_id", "sender_name", "receiver_person_ids", + "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"), + List.of( + List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter", + "", "", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "", "Geschäftsreise"), + List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter", + "de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "", + "Middelburg", "Themen/Brautbriefe", "Reisepläne"))); + + orchestrator.runImport(); + + Document after = documentRepository.findById(before.getId()).orElseThrow(); + assertThat(after.getReceivers()).isEmpty(); + assertThat(after.getTags()).isEmpty(); + } + + @Test + void import_neverFlipsRegisterPersonToProvisional_whenReferencedByDocumentRow() { + // de-gruyter-walter is a register person (provisional=false) AND the sender of W-0001. + // The orchestrator loads the register before documents, so the document loader's + // register-first match links the existing person and never mints a provisional one. 
+ // A second run (documents reference the same person again) must not flip it true. + orchestrator.runImport(); + orchestrator.runImport(); + + Person walter = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + assertThat(walter.isProvisional()).isFalse(); + Person eugenie = personRepository.findBySourceRef("de-gruyter-eugenie").orElseThrow(); + assertThat(eugenie.isProvisional()).isFalse(); + } + + // ─── synthetic-but-real artifact set ───────────────────────────────────────────── + + private void writeArtifacts(Path dir) throws Exception { + writeSheet(dir.resolve("canonical-tag-tree.xlsx"), + List.of("tag_path", "parent_name", "tag_name"), + List.of( + List.of("Themen", "", "Themen"), + List.of("Themen/Brautbriefe", "Themen", "Brautbriefe"))); + + writeSheet(dir.resolve("canonical-persons.xlsx"), + List.of("person_id", "last_name", "first_name", "maiden_name", "notes", "birth_date", "death_date", "provisional"), + List.of( + List.of("de-gruyter-walter", "de Gruyter", "Walter", "", "", "1865-01-01", "", "False"), + List.of("de-gruyter-eugenie", "de Gruyter", "Eugenie", "Wöhler", "", "", "", "False"))); + + Files.writeString(dir.resolve("canonical-persons-tree.json"), """ + {"persons":[ + {"rowId":"row_1","firstName":"Walter","lastName":"de Gruyter","familyMember":true,"personId":"de-gruyter-walter"}, + {"rowId":"row_2","firstName":"Eugenie","lastName":"de Gruyter","maidenName":"Wöhler","familyMember":true,"personId":"de-gruyter-eugenie"} + ],"relationships":[ + {"personId":"row_1","relatedPersonId":"row_2","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + writeSheet(dir.resolve("canonical-documents.xlsx"), + List.of("index", "sender_person_id", "sender_name", "receiver_person_ids", + "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"), + List.of( + List.of("W-0001", "de-gruyter-walter", "Walter de Gruyter", + "de-gruyter-eugenie", "Eugenie de Gruyter", "1888-02-15", 
"15.2.1888", "DAY", "", + "Rotterdam", "Themen/Brautbriefe", "Geschäftsreise"), + List.of("W-0002", "de-gruyter-eugenie", "Eugenie de Gruyter", + "de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "", + "Middelburg", "Themen/Brautbriefe", "Reisepläne"))); + } + + private void writeSheet(Path file, List<String> headers, List<List<String>> rows) throws Exception { + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.size(); r++) { + Row row = sheet.createRow(r + 1); + List<String> values = rows.get(r); + for (int c = 0; c < values.size(); c++) { + row.createCell(c).setCellValue(values.get(c)); + } + } + try (OutputStream out = Files.newOutputStream(file)) { + wb.write(out); + } + } + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java new file mode 100644 index 00000000..dc12d070 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java @@ -0,0 +1,130 @@ +package org.raddatz.familienarchiv.importing; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.InOrder; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.exception.DomainException; +import org.springframework.test.util.ReflectionTestUtils; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static 
org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class CanonicalImportOrchestratorTest { + + @Mock TagTreeImporter tagTreeImporter; + @Mock PersonRegisterImporter personRegisterImporter; + @Mock PersonTreeImporter personTreeImporter; + @Mock DocumentImporter documentImporter; + + private CanonicalImportOrchestrator orchestrator(Path dir) { + CanonicalImportOrchestrator o = new CanonicalImportOrchestrator( + tagTreeImporter, personRegisterImporter, personTreeImporter, documentImporter); + ReflectionTestUtils.setField(o, "canonicalDir", dir.toString()); + return o; + } + + private void writeAllArtifacts(Path dir) throws Exception { + Files.writeString(dir.resolve("canonical-tag-tree.xlsx"), "x"); + Files.writeString(dir.resolve("canonical-persons.xlsx"), "x"); + Files.writeString(dir.resolve("canonical-persons-tree.json"), "x"); + Files.writeString(dir.resolve("canonical-documents.xlsx"), "x"); + } + + @Test + void getStatus_isIdleByDefault(@TempDir Path dir) { + assertThat(orchestrator(dir).getStatus().state()).isEqualTo(ImportStatus.State.IDLE); + } + + @Test + void runImport_loadsTagsAndPersonsBeforeDocuments(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(0, List.of())); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + InOrder order = inOrder(tagTreeImporter, personRegisterImporter, personTreeImporter, documentImporter); + order.verify(tagTreeImporter).load(any()); + order.verify(personRegisterImporter).load(any()); + order.verify(personTreeImporter).load(any()); + order.verify(documentImporter).load(any()); + } + + @Test + void runImport_setsStatusDone_onSuccess(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new 
DocumentImporter.LoadResult(3, List.of())); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.DONE); + assertThat(o.getStatus().processed()).isEqualTo(3); + } + + @Test + void runImport_failsClosed_whenAnArtifactIsMissing(@TempDir Path dir) throws Exception { + Files.writeString(dir.resolve("canonical-tag-tree.xlsx"), "x"); + // the other three artifacts are absent + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.FAILED); + verify(tagTreeImporter, never()).load(any()); + verify(documentImporter, never()).load(any()); + } + + @Test + void runImport_setsStatusFailed_whenLoaderThrows(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(tagTreeImporter.load(any())).thenThrow(DomainException.badRequest( + org.raddatz.familienarchiv.exception.ErrorCode.IMPORT_ARTIFACT_INVALID, "bad")); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.FAILED); + verify(documentImporter, never()).load(any()); + } + + @Test + void runImportAsync_throwsConflict_whenAlreadyRunning(@TempDir Path dir) { + CanonicalImportOrchestrator o = orchestrator(dir); + ReflectionTestUtils.setField(o, "currentStatus", new ImportStatus( + ImportStatus.State.RUNNING, "IMPORT_RUNNING", "running", 0, List.of(), null)); + + assertThatThrownBy(o::runImportAsync) + .isInstanceOf(DomainException.class) + .hasMessageContaining("already in progress"); + } + + @Test + void runImport_aggregatesDocumentSkips(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(1, + List.of(new ImportStatus.SkippedFile("fake.pdf", ImportStatus.SkipReason.INVALID_PDF_SIGNATURE)))); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + 
assertThat(o.getStatus().skipped()).isEqualTo(1); + assertThat(o.getStatus().skippedFiles()) + .extracting(ImportStatus.SkippedFile::filename) + .containsExactly("fake.pdf"); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalSheetReaderTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalSheetReaderTest.java new file mode 100644 index 00000000..ee1d3650 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalSheetReaderTest.java @@ -0,0 +1,115 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.raddatz.familienarchiv.exception.DomainException; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class CanonicalSheetReaderTest { + + @Test + void readRows_mapsCellsByHeaderName(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index", "file"), List.of(List.of("W-0001", "scan.pdf"))); + + var rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file")); + + assertThat(rows).hasSize(1); + assertThat(rows.get(0).get("index")).isEqualTo("W-0001"); + assertThat(rows.get(0).get("file")).isEqualTo("scan.pdf"); + } + + @Test + void readRows_throwsBadRequest_whenRequiredHeaderMissing(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index"), List.of(List.of("W-0001"))); + + assertThatThrownBy(() -> CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file"))) + .isInstanceOf(DomainException.class) + .hasMessageContaining("file"); + } + + @Test + void 
get_returnsEmptyString_forBlankCell(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index", "file"), List.of(List.of("W-0001", ""))); + + var rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file")); + + assertThat(rows.get(0).get("file")).isEmpty(); + } + + @Test + void get_returnsEmptyString_forUnknownColumn(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index"), List.of(List.of("W-0001"))); + + var rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index")); + + assertThat(rows.get(0).get("does_not_exist")).isEmpty(); + } + + @Test + void get_returnsEmptyString_forTrailingColumns_whenRowShorterThanHeader(@TempDir Path tempDir) throws Exception { + // POI omits trailing empty cells, so a real-world artifact row can be narrower than + // the header. The missing columns must read as "" rather than throwing. + Path xlsx = write(tempDir, + List.of("index", "file", "summary"), + List.of(List.of("W-0001"))); + + var rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file", "summary")); + + assertThat(rows.get(0).get("index")).isEqualTo("W-0001"); + assertThat(rows.get(0).get("file")).isEmpty(); + assertThat(rows.get(0).get("summary")).isEmpty(); + } + + @Test + void splitList_splitsOnPipe() { + assertThat(CanonicalSheetReader.splitList("a|b|c")).containsExactly("a", "b", "c"); + } + + @Test + void splitList_returnsEmptyList_forBlank() { + assertThat(CanonicalSheetReader.splitList("")).isEmpty(); + assertThat(CanonicalSheetReader.splitList(" ")).isEmpty(); + } + + @Test + void splitList_returnsSingleElement_whenNoPipe() { + assertThat(CanonicalSheetReader.splitList("solo")).containsExactly("solo"); + } + + @Test + void splitList_trimsAndDropsEmptySegments() { + assertThat(CanonicalSheetReader.splitList("a| |b")).containsExactly("a", "b"); + } + + private Path write(Path dir, List<String> headers, List<List<String>> dataRows) throws Exception { + Path xlsx = 
dir.resolve("sheet.xlsx"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < dataRows.size(); r++) { + Row row = sheet.createRow(r + 1); + List<String> values = dataRows.get(r); + for (int c = 0; c < values.size(); c++) { + row.createCell(c).setCellValue(values.get(c)); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java new file mode 100644 index 00000000..c97de87b --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java @@ -0,0 +1,656 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentService; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; +import org.springframework.test.util.ReflectionTestUtils; +import software.amazon.awssdk.core.sync.RequestBody; +import 
software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +import java.io.File; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.LocalDate; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class DocumentImporterTest { + + @Mock DocumentService documentService; + @Mock PersonService personService; + @Mock TagService tagService; + @Mock S3Client s3Client; + @Mock ThumbnailAsyncRunner thumbnailAsyncRunner; + @Mock FileStreamOpener fileStreamOpener; + + DocumentImporter importer; + + @BeforeEach + void setUp() throws java.io.IOException { + // Default opener delegates to FileInputStream — tests that need to force an IOException + // override this stub locally (load_skipsFile_whenMagicByteCheckThrowsIoException). 
+ lenient().when(fileStreamOpener.open(any(File.class))) + .thenAnswer(inv -> new java.io.FileInputStream(inv.getArgument(0, File.class))); + importer = new DocumentImporter(documentService, personService, tagService, s3Client, + thumbnailAsyncRunner, fileStreamOpener); + ReflectionTestUtils.setField(importer, "bucketName", "test-bucket"); + } + + // ─── index validation — a malicious/garbage index can never reach disk I/O ───────── + + @Test + void isValidImportIndex_returnsFalse_whenNull() { + assertThat(validIndex(null)).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenBlank() { + assertThat(validIndex(" ")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenForwardSlash() { + assertThat(validIndex("etc/passwd")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenBackslash() { + assertThat(validIndex("..\\etc\\passwd")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenDotDot() { + assertThat(validIndex("W-..0001")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenIsDotDot() { + assertThat(validIndex("..")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenSingleDot() { + assertThat(validIndex(".")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenAbsolutePath() { + assertThat(validIndex("/etc/passwd")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenNullByte() { + assertThat(validIndex("W-0001\0")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenUnicodeDivisionSlash() { + assertThat(validIndex("W∕0001")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenFullwidthSlash() { + assertThat(validIndex("W\uFF0F0001")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenReverseSolidusOperator() { + assertThat(validIndex("W⧵0001")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenContainsDotPdfExtension() { + // The index is the bare catalog id; appending 
".pdf" is the importer's job. A dot in + // the index would let "W-0001.pdf" become "W-0001.pdf.pdf" or smuggle an extension. + assertThat(validIndex("W-0001.pdf")).isFalse(); + } + + // ─── catalog-shape rejects — pass the char pre-checks but must fail INDEX_PATTERN ──── + // These pin the regex branch itself: each string contains no separator, dot, slash + // homoglyph, null byte, or absolute marker, so it sails past every char guard and is + // rejected *only* because INDEX_PATTERN.matches() returns false. A weaker pattern would + // let them through — these tests would then go red. + + @Test + void isValidImportIndex_returnsFalse_whenSpaceInIndex() { + // The real-world reject: "J 0070" is a space-typo with no PDF on disk. + assertThat(validIndex("J 0070")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenFiveLetterPrefix() { + // The catalog prefix is at most 4 letters; 5 must not match. + assertThat(validIndex("WXYZA-0001")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenNoLetterPrefix() { + // A digit-led id (no letter prefix) is not a catalog shape. + assertThat(validIndex("12-0001")).isFalse(); + } + + @Test + void isValidImportIndex_returnsFalse_whenUppercaseXSuffix() { + // Only a lowercase trailing "x" is allowed; an uppercase "X" suffix must fail. + assertThat(validIndex("W-0001X")).isFalse(); + } + + @Test + void isValidImportIndex_returnsTrue_whenPlainCatalogIndex() { + assertThat(validIndex("W-0124")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenTwoLetterPrefix() { + assertThat(validIndex("Al-0001")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenThreeLetterPrefix() { + assertThat(validIndex("CuH-0010")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenUmlautPrefix() { + // Real corpus indices carry a German umlaut, e.g. "Mü-0001.pdf" exists on disk. 
+ assertThat(validIndex("Mü-0001")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenDoubleHyphen() { + // Real corpus: "C--0029" appears in the spreadsheet (a data-entry artefact, but a + // legitimate catalog shape that must still resolve, not crash). + assertThat(validIndex("C--0029")).isTrue(); + } + + @Test + void isValidImportIndex_returnsTrue_whenXSuffix() { + // The normalizer recognises an x-suffix catalog id; allow it defensively. + assertThat(validIndex("W-0001x")).isTrue(); + } + + // ─── a valid index resolves to exactly importDir/<index>.pdf within containment ───── + + @Test + void load_resolvesPdfByIndex_uploadsToS3_andSetsStatusUploaded(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D}; + Files.write(tempDir.resolve("W-0124.pdf"), pdf); + when(documentService.findByOriginalFilename("W-0124")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0124", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + // exactly importDir/<index>.pdf was uploaded — the S3 key carries that basename + org.mockito.ArgumentCaptor<RequestBody> bodyCaptor = org.mockito.ArgumentCaptor.forClass(RequestBody.class); + verify(s3Client).putObject(any(PutObjectRequest.class), bodyCaptor.capture()); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getStatus() == DocumentStatus.UPLOADED + && d.getFilePath() != null + && d.getFilePath().endsWith("_W-0124.pdf"))); + } + + @Test + void load_yieldsPlaceholder_whenIndexedPdfMissing(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("X-9999")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, 
docRow("X-9999", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + @Test + void load_rejectsMaliciousIndex_neverReadsOutsideImportDir(@TempDir Path tempDir) throws Exception { + // An index with a path separator must be skipped outright, never used for disk I/O. + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Path xlsx = writeDocs(tempDir, docRow("../../etc/cron.d/x", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); + verify(documentService, never()).save(any()); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + @Test + void resolvePdfByIndex_throwsWhenResolvedPathEscapesImportDir_viaSymlink( + @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { + // Containment defense-in-depth: even a syntactically valid index whose .pdf is a + // symlink pointing outside importDir must be refused — the resolved canonical path is + // asserted to stay inside importDir. 
+ Path outsideFile = outsideDir.resolve("secret.pdf"); + Files.writeString(outsideFile, "sensitive"); + Files.createSymbolicLink(importDirPath.resolve("W-0001.pdf"), outsideFile); + ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString()); + + org.assertj.core.api.Assertions.assertThatThrownBy( + () -> ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "W-0001", 2)) + .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class); + } + + @Test + void resolvePdfByIndex_returnsExactlyImportDirIndexPdf_whenPresent(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Path expected = tempDir.resolve("Eu-0628.pdf"); + Files.writeString(expected, "%PDF-1.4"); + + Optional<File> resolved = ReflectionTestUtils.invokeMethod(importer, "resolvePdfByIndex", "Eu-0628", 2); + + assertThat(resolved).isPresent(); + assertThat(resolved.get().getCanonicalFile()).isEqualTo(expected.toFile().getCanonicalFile()); + } + + // NOTE (Sara, PR #687): the IOException branch of resolvePdfByIndex — where + // File.getCanonicalPath() itself throws (an OS-level failure mid-resolution, not the + // symlink-escape DomainException) — is intentionally NOT covered by a test. Unlike + // isPdfMagicBytes, which has the package-private openFileStream(File) seam a Mockito spy can + // make throw, getCanonicalPath() is called on a File built internally with no injection seam, + // and there is no portable, deterministic way to make it throw on a temp file (it does not + // throw for missing/symlinked paths — those are handled by isFile()/the containment check). + // Adding a seam purely to test this would be production code in service of a non-defect; the + // substantive fix is the log.warn() now emitted in that branch so the quiet skip surfaces in + // ops. Left uncovered by deliberate decision, documented here so the branch is not assumed + // tested. 
+ + // ─── PDF magic-byte guard — ported — do not remove ────────────────────────────── + + @Test + void load_skipsFile_whenNotPdfMagicBytes(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf"); + lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + @Test + void load_skipsFile_whenMagicByteCheckThrowsIoException(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Files.writeString(tempDir.resolve("W-0001.pdf"), "content"); + lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "")); + + // FileStreamOpener is injected — stub it to throw, no spy on the importer needed. 
+ org.mockito.Mockito.when(fileStreamOpener.open(any(File.class))) + .thenThrow(new java.io.IOException("read error")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.FILE_READ_ERROR); + } + + @Test + void load_skipsAlreadyExists_whenDocumentUploadedNotPlaceholder(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0001").status(DocumentStatus.UPLOADED).build(); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.ALREADY_EXISTS); + verify(documentService, never()).save(any()); + } + + // ─── presence of importDir/<index>.pdf drives status: present → UPLOADED, absent → PLACEHOLDER ─ + + @Test + void load_setsStatusPlaceholder_whenNoIndexedPdf(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + // ─── attribution routing — register-first + always retain raw ──────────────────── + + @Test + void 
load_linksRegisterSender_andRetainsRawSenderText(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person walter = Person.builder().id(UUID.randomUUID()).sourceRef("de-gruyter-walter") + .firstName("Walter").lastName("de Gruyter").build(); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "de-gruyter-walter", "Walter de Gruyter", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getSender() == walter && "Walter de Gruyter".equals(d.getSenderText()))); + } + + @Test + void load_createsProvisionalSender_whenSlugUnmatchedInRegister(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person provisional = Person.builder().id(UUID.randomUUID()).sourceRef("schwester-hanni") + .lastName("Schwester Hanni").provisional(true).build(); + when(documentService.findByOriginalFilename("W-0002")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty()); + when(personService.upsertBySourceRef(any())).thenReturn(provisional); + Path xlsx = writeDocs(tempDir, docRow("W-0002", "schwester-hanni", "Schwester Hanni", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + org.mockito.ArgumentCaptor<PersonUpsertCommand> captor = + org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + assertThat(captor.getValue().provisional()).isTrue(); + assertThat(captor.getValue().lastName()).isEqualTo("Schwester Hanni"); + } + + @Test + 
void load_createsNoSenderPerson_whenSlugEmptyButRawPresent(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "?", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(personService, never()).findBySourceRef(any()); + verify(personService, never()).upsertBySourceRef(any()); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getSender() == null && "?".equals(d.getSenderText()))); + } + + @Test + void load_splitsMultipleReceivers_andRetainsRawReceiverText(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person herbert = Person.builder().id(UUID.randomUUID()).sourceRef("cram-herbert").lastName("Cram").build(); + Person clara = Person.builder().id(UUID.randomUUID()).sourceRef("clara").lastName("Clara").build(); + when(documentService.findByOriginalFilename("W-0004")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert)); + when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara)); + Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", + "cram-herbert|clara", "Herbert Cram|Clara", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getReceivers().size() == 2 + && d.getReceivers().contains(herbert) + && d.getReceivers().contains(clara) + && "Herbert Cram|Clara".equals(d.getReceiverText()))); + } + + @Test + void load_provisionalReceiverUsesHumanNameFromReceiverNames_notSlug(@TempDir Path tempDir) throws Exception { + // Regression: 
resolveReceivers used to pass the slug as both `sourceRef` AND `lastName`, + // so an unresolved receiver "smith-john" became a provisional Person with + // lastName="smith-john". The fix consumes the parallel `receiver_names` column. + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person provisional = Person.builder().id(UUID.randomUUID()).sourceRef("smith-john") + .lastName("John Smith").provisional(true).build(); + when(documentService.findByOriginalFilename("W-0050")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("smith-john")).thenReturn(Optional.empty()); + when(personService.upsertBySourceRef(any())).thenReturn(provisional); + Path xlsx = writeDocs(tempDir, docRow("W-0050", "", "", + "smith-john", "John Smith", "", "", "", "")); + + importer.load(xlsx.toFile()); + + org.mockito.ArgumentCaptor captor = + org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + assertThat(captor.getValue().sourceRef()).isEqualTo("smith-john"); + assertThat(captor.getValue().lastName()).isEqualTo("John Smith"); + assertThat(captor.getValue().provisional()).isTrue(); + } + + @Test + void load_provisionalReceiverFallsBackToSlug_whenNamesListShorterThanSlugs(@TempDir Path tempDir) throws Exception { + // Parallel-list zip: if the names list is shorter than the slugs list, slugs without a + // matching name fall back to slug as the display name. This is the "missing name" case + // (rare in canonical data but the contract must define it). 
+ ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person alice = Person.builder().id(UUID.randomUUID()).sourceRef("alice-jones") + .lastName("Alice Jones").provisional(true).build(); + Person bob = Person.builder().id(UUID.randomUUID()).sourceRef("bob-roe") + .lastName("bob-roe").provisional(true).build(); + when(documentService.findByOriginalFilename("W-0051")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("alice-jones")).thenReturn(Optional.empty()); + when(personService.findBySourceRef("bob-roe")).thenReturn(Optional.empty()); + when(personService.upsertBySourceRef(any())).thenReturn(alice).thenReturn(bob); + Path xlsx = writeDocs(tempDir, docRow("W-0051", "", "", + "alice-jones|bob-roe", "Alice Jones", "", "", "", "")); + + importer.load(xlsx.toFile()); + + org.mockito.ArgumentCaptor captor = + org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService, org.mockito.Mockito.times(2)).upsertBySourceRef(captor.capture()); + assertThat(captor.getAllValues()).extracting(PersonUpsertCommand::sourceRef) + .containsExactly("alice-jones", "bob-roe"); + assertThat(captor.getAllValues()).extracting(PersonUpsertCommand::lastName) + .containsExactly("Alice Jones", "bob-roe"); + } + + // ─── clean date values parse without semantic logic ────────────────────────────── + + @Test + void load_parsesCleanDateAndPrecision(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", + "", "", "1916-06-01", "1.6.1916", "MONTH", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + LocalDate.of(1916, 
6, 1).equals(d.getDocumentDate()) + && d.getMetaDatePrecision() == org.raddatz.familienarchiv.document.DatePrecision.MONTH + && "1.6.1916".equals(d.getMetaDateRaw()))); + } + + @Test + void load_attachesTagBySourceRef(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Tag tag = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe").sourceRef("Themen/Brautbriefe").build(); + when(documentService.findByOriginalFilename("W-0006")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(tagService.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(tag)); + Path xlsx = writeDocs(tempDir, docRowWithTag("W-0006", "Themen/Brautbriefe")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getTags().contains(tag))); + } + + // ─── idempotency — update existing document in place by index ───────────────────── + + @Test + void load_updatesExistingDocumentInPlace_whenIndexExists(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build(); + when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing)); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getId().equals(existing.getId()))); + } + + // ─── canonical collections are authoritative — re-import prunes removed links ────── + + @Test + void load_prunesReceiversAndTags_whenCanonicalRowShrinks(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", 
tempDir.toString()); + Person staleReceiver = Person.builder().id(UUID.randomUUID()).sourceRef("stale-receiver").lastName("Stale").build(); + Tag staleTag = Tag.builder().id(UUID.randomUUID()).name("Stale").sourceRef("Themen/Stale").build(); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0008").status(DocumentStatus.PLACEHOLDER).build(); + existing.getReceivers().add(staleReceiver); + existing.getTags().add(staleTag); + when(documentService.findByOriginalFilename("W-0008")).thenReturn(Optional.of(existing)); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + // The canonical row now carries no receiver and no tag: both stale links must go. + Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getReceivers().isEmpty() && d.getTags().isEmpty())); + } + + // ─── title carries the honest date label — never a precision the data lacks ─────── + + @Test + void load_buildsTitleWithMonthLabel_whenPrecisionIsMonth(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0100")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0100", "", "", "", "", + "1916-06-01", "Juni 1916", "MONTH", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getTitle().contains("Juni 1916") && !d.getTitle().contains("1. 
Juni"))); + } + + @Test + void load_buildsTitleWithFullDate_whenPrecisionIsDay(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0101")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0101", "", "", "", "", + "1943-12-24", "24.12.1943", "DAY", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getTitle().contains("24. Dezember 1943"))); + } + + @Test + void load_buildsTitleFromIndexOnly_whenDateUnknown(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0102")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0102", "", "", "", "", + "", "?", "UNKNOWN", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getTitle().equals("W-0102"))); + } + + // ─── helpers ───────────────────────────────────────────────────────────────────── + + private Boolean validIndex(String index) { + return ReflectionTestUtils.invokeMethod(importer, "isValidImportIndex", index); + } + + private Map docRow(String index, String senderId, String senderName, + String receiverIds, String receiverNames, String dateIso, + String dateRaw, String datePrecision, String dateEnd) { + Map r = new LinkedHashMap<>(); + r.put("index", index); + r.put("sender_person_id", senderId); + r.put("sender_name", senderName); + r.put("receiver_person_ids", receiverIds); + r.put("receiver_names", receiverNames); + r.put("date_iso", dateIso); + r.put("date_raw", dateRaw); + r.put("date_precision", datePrecision); + r.put("date_end", dateEnd); + r.put("location", 
""); + r.put("tags", ""); + r.put("summary", ""); + return r; + } + + private Map docRowWithTag(String index, String tagPath) { + Map r = docRow(index, "", "", "", "", "", "", "", ""); + r.put("tags", tagPath); + return r; + } + + @SafeVarargs + private Path writeDocs(Path dir, Map... rows) throws Exception { + Path xlsx = dir.resolve("canonical-documents.xlsx"); + List headers = List.of("index", "sender_person_id", "sender_name", + "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", + "date_end", "location", "tags", "summary"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.length; r++) { + Row row = sheet.createRow(r + 1); + for (int c = 0; c < headers.size(); c++) { + row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), "")); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentTitleFormatterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentTitleFormatterTest.java new file mode 100644 index 00000000..d8f66b6e --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentTitleFormatterTest.java @@ -0,0 +1,49 @@ +package org.raddatz.familienarchiv.importing; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.DynamicTest; +import org.junit.jupiter.api.TestFactory; +import org.raddatz.familienarchiv.document.DatePrecision; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Asserts the Java 
title label against the SAME shared fixture table the TS + * formatter spec uses ({@code docs/date-label-fixtures.json}). This is the + * drift guard requested in #666 review: the two label implementations cannot + * silently diverge (en-dash vs hyphen, "ca." vs "circa", season words, range + * collapse) because both are pinned to one committed rule set. + */ +class DocumentTitleFormatterTest { + + @TestFactory + List matchesSharedFixtureTable() throws Exception { + // Maven runs tests from the backend/ module dir; the fixture lives at repo-root docs/. + Path fixture = Path.of("..", "docs", "date-label-fixtures.json"); + JsonNode root = new ObjectMapper().readTree(Files.readString(fixture)); + List tests = new ArrayList<>(); + for (JsonNode c : root.get("cases")) { + String name = c.get("name").asText(); + LocalDate anchor = parseDate(c.get("anchor")); + DatePrecision precision = DatePrecision.valueOf(c.get("precision").asText()); + LocalDate end = parseDate(c.get("end")); + String raw = c.get("raw").isNull() ? null : c.get("raw").asText(); + String expected = c.get("expected").asText(); + tests.add(DynamicTest.dynamicTest(name, () -> + assertThat(DocumentTitleFormatter.formatTitleDate(anchor, precision, end, raw)) + .isEqualTo(expected))); + } + return tests; + } + + private static LocalDate parseDate(JsonNode node) { + return node == null || node.isNull() ? 
null : LocalDate.parse(node.asText()); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java deleted file mode 100644 index d87d28c1..00000000 --- a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java +++ /dev/null @@ -1,896 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; -import org.raddatz.familienarchiv.exception.DomainException; -import org.raddatz.familienarchiv.document.Document; -import org.raddatz.familienarchiv.document.DocumentService; -import org.raddatz.familienarchiv.document.DocumentStatus; -import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.tag.Tag; -import org.raddatz.familienarchiv.tag.TagService; -import org.raddatz.familienarchiv.person.PersonService; -import org.springframework.test.util.ReflectionTestUtils; -import software.amazon.awssdk.core.sync.RequestBody; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.PutObjectRequest; - -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.xml.sax.SAXParseException; - -import java.io.File; -import java.io.OutputStream; -import java.io.ByteArrayOutputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.UUID; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; - -import static 
org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.*; - -@ExtendWith(MockitoExtension.class) -class MassImportServiceTest { - - @Mock DocumentService documentService; - @Mock PersonService personService; - @Mock TagService tagService; - @Mock S3Client s3Client; - @Mock ThumbnailAsyncRunner thumbnailAsyncRunner; - - MassImportService service; - - @BeforeEach - void setUp() { - service = new MassImportService(documentService, personService, tagService, s3Client, thumbnailAsyncRunner); - ReflectionTestUtils.setField(service, "bucketName", "test-bucket"); - ReflectionTestUtils.setField(service, "importDir", "/import"); - ReflectionTestUtils.setField(service, "colIndex", 0); - ReflectionTestUtils.setField(service, "colBox", 1); - ReflectionTestUtils.setField(service, "colFolder", 2); - ReflectionTestUtils.setField(service, "colSender", 3); - ReflectionTestUtils.setField(service, "colReceivers", 5); - ReflectionTestUtils.setField(service, "colDate", 7); - ReflectionTestUtils.setField(service, "colLocation", 9); - ReflectionTestUtils.setField(service, "colTags", 10); - ReflectionTestUtils.setField(service, "colSummary", 11); - ReflectionTestUtils.setField(service, "colTranscription", 13); - } - - // ─── getStatus ──────────────────────────────────────────────────────────── - - @Test - void getStatus_returnsIdleByDefault() { - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.IDLE); - } - - @Test - void getStatus_hasStatusCode_IMPORT_IDLE_byDefault() { - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_IDLE"); - } - - // ─── runImportAsync ─────────────────────────────────────────────────────── - - @Test - void runImportAsync_setsFailedStatus_whenImportDirectoryDoesNotExist() { - // /import directory doesn't exist in test environment → IOException → IMPORT_FAILED_INTERNAL - 
service.runImportAsync(); - - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.FAILED); - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_FAILED_INTERNAL"); - } - - @Test - void runImportAsync_readsFromConfiguredImportDir(@TempDir Path tempDir) { - // Empty temp dir → findSpreadsheetFile throws "no spreadsheet" with the - // configured path in the message. Proves the field, not a constant, - // drives the lookup. - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.FAILED); - assertThat(service.getStatus().message()).contains(tempDir.toString()); - } - - @Test - void runImportAsync_setsStatusCode_IMPORT_FAILED_NO_SPREADSHEET_whenDirIsEmpty(@TempDir Path tempDir) { - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_FAILED_NO_SPREADSHEET"); - } - - @Test - void runImportAsync_setsStatusCode_IMPORT_DONE_whenSpreadsheetHasNoDataRows(@TempDir Path tempDir) throws Exception { - Path xlsx = tempDir.resolve("import.xlsx"); - try (XSSFWorkbook wb = new XSSFWorkbook()) { - wb.createSheet("Sheet1"); - try (OutputStream out = Files.newOutputStream(xlsx)) { - wb.write(out); - } - } - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_DONE"); - } - - @Test - void runImportAsync_throwsConflict_whenAlreadyRunning() { - MassImportService.ImportStatus running = new MassImportService.ImportStatus( - MassImportService.State.RUNNING, "IMPORT_RUNNING", "Running...", 0, List.of(), LocalDateTime.now()); - ReflectionTestUtils.setField(service, "currentStatus", running); - - assertThatThrownBy(() -> service.runImportAsync()) - .isInstanceOf(DomainException.class) - 
.hasMessageContaining("already in progress"); - } - - // ─── importSingleDocument — skip already uploaded ───────────────────────── - - @Test - void importSingleDocument_skips_whenDocumentAlreadyUploadedNotPlaceholder() { - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("doc001.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.of(existing)); - - Optional result = service.importSingleDocument(minimalCells("doc001.pdf"), Optional.empty(), "doc001.pdf", "doc001"); - - verify(documentService, never()).save(any()); - assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS); - } - - // ─── importSingleDocument — already-exists guard fires before file I/O ───── - - @Test - void importSingleDocument_skipsWithAlreadyExists_whenDocumentUploadedAndFileIsPresent(@TempDir Path tempDir) throws Exception { - // Document already exists with status UPLOADED (not PLACEHOLDER). - // A physical PDF file is also present on disk (valid magic bytes). - // Expected: ALREADY_EXISTS is returned and no S3 upload is attempted — - // the guard fires before any file I/O, so no partial processing occurs. 
- Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("present.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("present.pdf")).thenReturn(Optional.of(existing)); - - Path physicalFile = tempDir.resolve("present.pdf"); - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(physicalFile, pdfHeader); - - Optional result = service.importSingleDocument( - minimalCells("present.pdf"), Optional.of(physicalFile.toFile()), "present.pdf", "present"); - - assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS); - verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService, never()).save(any()); - } - - // ─── importSingleDocument — S3 failure surfaced in skippedFiles ────────── - - @Test - void runImportAsync_addsS3UploadFailed_toSkippedFiles_whenS3Throws(@TempDir Path tempDir) throws Exception { - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(tempDir.resolve("upload_fail.pdf"), pdfHeader); - buildMinimalImportXlsx(tempDir, "upload_fail.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename("upload_fail.pdf")).thenReturn(Optional.empty()); - doThrow(new RuntimeException("S3 unavailable")) - .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::filename, MassImportService.SkippedFile::reason) - .containsExactly(org.assertj.core.groups.Tuple.tuple("upload_fail.pdf", MassImportService.SkipReason.S3_UPLOAD_FAILED)); - } - - @Test - void runImportAsync_addsAlreadyExists_toSkippedFiles_whenDocumentAlreadyUploaded(@TempDir Path tempDir) throws Exception { - buildMinimalImportXlsx(tempDir, 
"existing.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("existing.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("existing.pdf")).thenReturn(Optional.of(existing)); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.ALREADY_EXISTS); - } - - // ─── importSingleDocument — create new document (metadata only) ─────────── - - @Test - void importSingleDocument_createsNewDocument_whenNotExists() { - when(documentService.findByOriginalFilename("doc002.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument(minimalCells("doc002.pdf"), Optional.empty(), "doc002.pdf", "doc002"); - - verify(documentService).save(argThat(d -> - d.getOriginalFilename().equals("doc002.pdf") - && d.getStatus() == DocumentStatus.PLACEHOLDER)); - } - - // ─── importSingleDocument — update existing placeholder ────────────────── - - @Test - void importSingleDocument_updatesExistingPlaceholder() { - Document placeholder = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("existing.pdf") - .status(DocumentStatus.PLACEHOLDER) - .build(); - when(documentService.findByOriginalFilename("existing.pdf")).thenReturn(Optional.of(placeholder)); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument(minimalCells("existing.pdf"), Optional.empty(), "existing.pdf", "existing"); - - verify(documentService).save(same(placeholder)); - } - - // ─── importSingleDocument — with file (S3 upload) ───────────────────────── - - @Test - void importSingleDocument_uploadsFileToS3_andSetsStatusUploaded(@TempDir Path 
tempDir) throws Exception { - Path tempFile = tempDir.resolve("doc003.pdf"); - Files.write(tempFile, "PDF content".getBytes()); - - when(documentService.findByOriginalFilename("doc003.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument( - minimalCells("doc003.pdf"), Optional.of(tempFile.toFile()), "doc003.pdf", "doc003"); - - verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService).save(argThat(d -> d.getStatus() == DocumentStatus.UPLOADED)); - } - - @Test - void importSingleDocument_returnsS3UploadFailed_whenS3UploadFails(@TempDir Path tempDir) throws Exception { - Path tempFile = tempDir.resolve("fail.pdf"); - Files.write(tempFile, "data".getBytes()); - - when(documentService.findByOriginalFilename("fail.pdf")).thenReturn(Optional.empty()); - doThrow(new RuntimeException("S3 error")) - .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - - Optional result = service.importSingleDocument( - minimalCells("fail.pdf"), Optional.of(tempFile.toFile()), "fail.pdf", "fail"); - - verify(documentService, never()).save(any()); - assertThat(result).isPresent().contains(MassImportService.SkipReason.S3_UPLOAD_FAILED); - } - - // ─── importSingleDocument — sender handling ─────────────────────────────── - - @Test - void importSingleDocument_setsNullSender_whenSenderCellIsBlank() { - when(documentService.findByOriginalFilename("nosender.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("nosender.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "nosender.pdf", "nosender"); - - verify(documentService).save(argThat(d -> d.getSender() == null)); - verify(personService, never()).findOrCreateByAlias(any()); - } - - @Test - void importSingleDocument_createsSender_whenSenderCellIsNonBlank() { - Person 
sender = Person.builder().id(UUID.randomUUID()).firstName("Walter").lastName("Müller").build(); - when(documentService.findByOriginalFilename("withsender.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("Walter Müller")).thenReturn(sender); - - List cells = buildCells("withsender.pdf", "Walter Müller", "", ""); - service.importSingleDocument(cells, Optional.empty(), "withsender.pdf", "withsender"); - - verify(personService).findOrCreateByAlias("Walter Müller"); - verify(documentService).save(argThat(d -> d.getSender() == sender)); - } - - // ─── importSingleDocument — tag handling ───────────────────────────────── - - @Test - void importSingleDocument_createsTag_whenTagCellIsNonBlank() { - Tag tag = Tag.builder().id(UUID.randomUUID()).name("Familie").build(); - when(documentService.findByOriginalFilename("tagged.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(tagService.findOrCreate("Familie")).thenReturn(tag); - - List cells = buildCells("tagged.pdf", "", "", "Familie"); - service.importSingleDocument(cells, Optional.empty(), "tagged.pdf", "tagged"); - - verify(tagService).findOrCreate("Familie"); - } - - @Test - void importSingleDocument_doesNotCreateTag_whenTagCellIsBlank() { - when(documentService.findByOriginalFilename("notag.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("notag.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "notag.pdf", "notag"); - - verify(tagService, never()).findOrCreate(any()); - } - - // ─── importSingleDocument — metadataComplete heuristic ─────────────────── - - @Test - void importSingleDocument_metadataComplete_whenSenderPresent() { - Person sender = Person.builder().id(UUID.randomUUID()).firstName("A").lastName("B").build(); - 
when(documentService.findByOriginalFilename("meta.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("A B")).thenReturn(sender); - - List cells = buildCells("meta.pdf", "A B", "", ""); - service.importSingleDocument(cells, Optional.empty(), "meta.pdf", "meta"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - @Test - void importSingleDocument_metadataIncomplete_whenNoKeyFieldsPresent() { - when(documentService.findByOriginalFilename("nometa.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("nometa.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "nometa.pdf", "nometa"); - - verify(documentService).save(argThat(d -> !d.isMetadataComplete())); - } - - // ─── importSingleDocument — blank fields set to null ───────────────────── - - @Test - void importSingleDocument_setsBlankFieldsToNull() { - when(documentService.findByOriginalFilename("blank.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("blank.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "blank.pdf", "blank"); - - verify(documentService).save(argThat(d -> - d.getLocation() == null && - d.getSummary() == null && - d.getTranscription() == null && - d.getArchiveBox() == null && - d.getArchiveFolder() == null)); - } - - // ─── processRows — via ReflectionTestUtils ──────────────────────────────── - - @Test - void processRows_returnsZero_whenOnlyHeaderRow() { - List> rows = List.of(List.of("header", "col1")); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - assertThat(result.processed()).isEqualTo(0); - } - - @Test - void processRows_skipsRowWithBlankIndex() { - List> rows = List.of( - 
List.of("header"), - minimalCells("") // blank index - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - assertThat(result.processed()).isEqualTo(0); - verify(documentService, never()).findByOriginalFilename(any()); - } - - @Test - void processRows_addsExtension_whenIndexHasNoDot() { - when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("doc001") // no dot → appends ".pdf" - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - verify(documentService).findByOriginalFilename("doc001.pdf"); - } - - @Test - void processRows_usesFilenameAsIs_whenIndexHasDot() { - when(documentService.findByOriginalFilename("doc002.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("doc002.pdf") // has dot → used as-is - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - verify(documentService).findByOriginalFilename("doc002.pdf"); - } - - // ─── isValidImportFilename — security regression — do not remove ───────── - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsNull() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", (String) null); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsBlank() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", " "); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsForwardSlash() { - boolean 
result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "etc/passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsBackslash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "..\\etc\\passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsDotDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "doc..evil.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsDotDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".."); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsAbsolutePath() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "/etc/passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsNullByte() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "file\0.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameIsPlainBasename() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "document.pdf"); - assertThat(result).isTrue(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeDivisionSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo∕bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsFullwidthSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo/bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void 
isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeReverseSolidus() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo⧵bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameHasLeadingDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".hidden.pdf"); - assertThat(result).isTrue(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameHasSpaces() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "Brief an Oma.pdf"); - assertThat(result).isTrue(); - } - - @Test - void processRows_skipsRowAndContinues_whenFilenameIsPathTraversal() { - when(documentService.findByOriginalFilename("legitimate.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("../evil"), // row 1: path traversal — should be skipped - minimalCells("legitimate.pdf") // row 2: valid — should be processed - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - assertThat(result.skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); - } - - // ─── importSingleDocument — non-blank optional fields ──────────────────── - - @Test - void importSingleDocument_setsNonNullOptionalFields_whenPresent() { - when(documentService.findByOriginalFilename("rich.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - // box=1, folder=2, location=9, summary=11, transcription=13 - List cells = List.of( - "rich.pdf", // 0: index - "Box A", // 1: box - "Folder B", // 2: folder - "", // 3: sender - "", // 4: unused - "", // 5: receivers - "", // 6: 
unused - "", // 7: date - "", // 8: unused - "Hamburg", // 9: location - "", // 10: tags - "A summary", // 11: summary - "", // 12: unused - "A transcript" // 13: transcription - ); - - service.importSingleDocument(cells, Optional.empty(), "rich.pdf", "rich"); - - verify(documentService).save(argThat(d -> - "Box A".equals(d.getArchiveBox()) && - "Folder B".equals(d.getArchiveFolder()) && - "Hamburg".equals(d.getLocation()) && - "A summary".equals(d.getSummary()) && - "A transcript".equals(d.getTranscription()))); - } - - @Test - void importSingleDocument_setsMetadataComplete_whenReceiversArePresent() { - Person receiver = Person.builder().id(UUID.randomUUID()).firstName("Walter").lastName("Müller").build(); - when(documentService.findByOriginalFilename("rcv.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("Walter Müller")).thenReturn(receiver); - - List cells = List.of( - "rcv.pdf", "", "", "", "", "Walter Müller", "", "", "", "", "", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "rcv.pdf", "rcv"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - @Test - void importSingleDocument_setsMetadataComplete_whenDateIsPresent() { - when(documentService.findByOriginalFilename("dated.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = List.of( - "dated.pdf", "", "", "", "", "", "", "2024-03-15", "", "", "", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "dated.pdf", "dated"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - // ─── buildTitle — null location ─────────────────────────────────────────── - - @Test - void buildTitle_withNullLocation_skipsLocationPart() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc005", LocalDate.of(1940, 5, 1), (String) 
null); - assertThat(result).contains("doc005").contains("1940"); - assertThat(result).doesNotContain("Berlin"); - } - - // ─── parseDate — via ReflectionTestUtils ───────────────────────────────── - - @Test - void parseDate_returnsNull_whenValueIsNull() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", (String) null); - assertThat(result).isNull(); - } - - @Test - void parseDate_returnsNull_whenValueIsBlank() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", " "); - assertThat(result).isNull(); - } - - @Test - void parseDate_returnsDate_whenValidIsoFormat() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", "2024-03-15"); - assertThat(result).isEqualTo(LocalDate.of(2024, 3, 15)); - } - - @Test - void parseDate_returnsNull_whenInvalidDateString() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", "15.03.2024"); - assertThat(result).isNull(); - } - - // ─── buildTitle — via ReflectionTestUtils ──────────────────────────────── - - @Test - void buildTitle_withDateAndLocation() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc001", LocalDate.of(1940, 5, 1), "Berlin"); - assertThat(result).contains("doc001").contains("Berlin").contains("1940"); - } - - @Test - void buildTitle_withDateOnly() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc002", LocalDate.of(1960, 8, 15), ""); - assertThat(result).contains("doc002").contains("1960"); - assertThat(result).doesNotContain("Berlin"); - } - - @Test - void buildTitle_withIndexOnly_whenDateAndLocationAreNull() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc003", null, ""); - assertThat(result).isEqualTo("doc003"); - } - - @Test - void buildTitle_withLocationOnly_whenDateIsNull() { - // date=null, location present → date part skipped, location appended - String result = ReflectionTestUtils.invokeMethod(service, 
"buildTitle", - "doc004", null, "Berlin"); - assertThat(result).contains("doc004").contains("Berlin"); - assertThat(result).doesNotContain("("); // no date part - } - - // ─── getCell — via ReflectionTestUtils ─────────────────────────────────── - - @Test - void getCell_returnsEmptyString_whenColBeyondListSize() { - List cells = List.of("a", "b"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 5); - assertThat(result).isEmpty(); - } - - @Test - void getCell_returnsEmptyString_whenValueIsNull() { - List cells = new ArrayList<>(); - cells.add(null); - cells.add("b"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 0); - assertThat(result).isEmpty(); - } - - @Test - void getCell_returnsTrimmedValue() { - List cells = List.of(" hello ", "world"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 0); - assertThat(result).isEqualTo("hello"); - } - - // ─── PDF magic byte validation regression ───────────────────────────────── - - @Test - void runImportAsync_uploadsValidPdf_andSkipsFakeOne(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - verify(s3Client, times(1)).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - } - - @Test - void runImportAsync_setsSkippedCount_toOne_whenOneFakeFile(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - } - - @Test - void runImportAsync_includesRejectedFilename_inSkippedFiles(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::filename) - .contains("fake.pdf"); - } - - @Test - void runImportAsync_skipsFile_whenShorterThanFourBytes(@TempDir Path tempDir) throws Exception { - 
Files.write(tempDir.resolve("tiny.pdf"), new byte[]{0x25, 0x50, 0x44}); // only 3 bytes - buildMinimalImportXlsx(tempDir, "tiny.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - } - - @Test - void runImportAsync_skipsFile_whenMagicBytesCheckThrowsIOException(@TempDir Path tempDir) throws Exception { - Files.writeString(tempDir.resolve("unreadable.pdf"), "some content"); - buildMinimalImportXlsx(tempDir, "unreadable.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - - MassImportService spyService = spy(service); - doThrow(new java.io.IOException("simulated read error")).when(spyService).openFileStream(any(File.class)); - - spyService.runImportAsync(); - - assertThat(spyService.getStatus().skipped()).isEqualTo(1); - assertThat(spyService.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.FILE_READ_ERROR); - } - - // ─── findFileRecursive — symlink escape security regression — do not remove ─ - - @Test - void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir( - @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { - Path outsideFile = outsideDir.resolve("secret.pdf"); - Files.writeString(outsideFile, "sensitive content"); - Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile); - - ReflectionTestUtils.setField(service, "importDir", importDirPath.toString()); - - assertThatThrownBy(() -> ReflectionTestUtils.invokeMethod(service, "findFileRecursive", "secret.pdf")) - .isInstanceOf(DomainException.class); - } - - // ─── readOds — XXE security regression ─────────────────────────────────── - - // 
Security regression — do not remove. - @Test - void readOds_rejects_xxe_doctype_payload(@TempDir Path tempDir) throws Exception { - File malicious = buildXxeOds(tempDir, "file:///etc/hostname"); - assertThatThrownBy(() -> service.readOds(malicious)) - .isInstanceOf(SAXParseException.class) - .hasMessageContaining("DOCTYPE is disallowed"); - } - - @Test - void readOds_parses_valid_ods_correctly(@TempDir Path tempDir) throws Exception { - File valid = buildValidOds(tempDir, "Mustermann"); - List> rows = service.readOds(valid); - assertThat(rows).isNotEmpty(); - assertThat(rows.get(0)).contains("Mustermann"); - } - - // ─── helpers ────────────────────────────────────────────────────────────── - - /** - * Builds a minimal 14-element cell row with the given filename at index 0 - * and blanks for all optional fields. - */ - private List minimalCells(String filename) { - return buildCells(filename, "", "", ""); - } - - /** - * Builds a cell row with sender, receiver, and tag controls. - * Layout matches the default column indices set in setUp(). - */ - private List buildCells(String filename, String sender, String receivers, String tag) { - // 14 elements: index=0,box=1,folder=2,sender=3,[4],receivers=5,[6],date=7,[8],location=9,tag=10,summary=11,[12],transcription=13 - return List.of( - filename, // 0: index - "", // 1: box - "", // 2: folder - sender, // 3: sender - "", // 4: (unused) - receivers, // 5: receivers - "", // 6: (unused) - "", // 7: date - "", // 8: (unused) - "", // 9: location - tag, // 10: tags - "", // 11: summary - "", // 12: (unused) - "" // 13: transcription - ); - } - - /** Creates a minimal ODS ZIP containing a content.xml with an XXE payload. 
*/ - private File buildXxeOds(Path dir, String entityTarget) throws Exception { - String xml = "" - + "]>" - + "" - + "" - + "" - + "&xxe;" - + "" - + "" - + ""; - return writeOdsZip(dir.resolve("malicious.ods"), xml); - } - - /** Creates a minimal valid ODS ZIP containing a content.xml with the given cell value. - * cellValue must not contain XML metacharacters ({@code < > &}). */ - private File buildValidOds(Path dir, String cellValue) throws Exception { - String xml = "" - + "" - + "" - + "" - + "" + cellValue + "" - + "" - + "" - + ""; - return writeOdsZip(dir.resolve("valid.ods"), xml); - } - - private File writeOdsZip(Path destination, String contentXml) throws Exception { - try (OutputStream fos = Files.newOutputStream(destination); - ZipOutputStream zip = new ZipOutputStream(fos)) { - zip.putNextEntry(new ZipEntry("content.xml")); - zip.write(contentXml.getBytes(StandardCharsets.UTF_8)); - zip.closeEntry(); - } - return destination.toFile(); - } - - private void setupOneValidOneFakeImport(Path tempDir) throws Exception { - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(tempDir.resolve("real.pdf"), pdfHeader); - Files.writeString(tempDir.resolve("fake.pdf"), "not a pdf"); - buildMinimalImportXlsx(tempDir, "real.pdf", "fake.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - } - - private void buildMinimalImportXlsx(Path dir, String... 
filenames) throws Exception { - Path xlsx = dir.resolve("import.xlsx"); - try (XSSFWorkbook wb = new XSSFWorkbook()) { - org.apache.poi.ss.usermodel.Sheet sheet = wb.createSheet("Sheet1"); - sheet.createRow(0).createCell(0).setCellValue("Index"); - for (int i = 0; i < filenames.length; i++) { - sheet.createRow(i + 1).createCell(0).setCellValue(filenames[i]); - } - try (OutputStream out = Files.newOutputStream(xlsx)) { - wb.write(out); - } - } - } -} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonRegisterImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonRegisterImporterTest.java new file mode 100644 index 00000000..af5740c0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonRegisterImporterTest.java @@ -0,0 +1,130 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.ArgumentCaptor; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class PersonRegisterImporterTest { + + @Test + void 
load_upsertsPersonBySourceRef_withProvisionalFalse(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path xlsx = writePersons(tempDir, row( + "allemeyer-elsgard", "Allemeyer", "Elsgard", "Wöhler", "Nichte von Herbert", "False")); + + new PersonRegisterImporter(personService).load(xlsx.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + PersonUpsertCommand cmd = captor.getValue(); + assertThat(cmd.sourceRef()).isEqualTo("allemeyer-elsgard"); + assertThat(cmd.lastName()).isEqualTo("Allemeyer"); + assertThat(cmd.firstName()).isEqualTo("Elsgard"); + assertThat(cmd.maidenName()).isEqualTo("Wöhler"); + assertThat(cmd.notes()).isEqualTo("Nichte von Herbert"); + assertThat(cmd.provisional()).isFalse(); + } + + @Test + void load_parsesCapitalisedPythonBool_True(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path xlsx = writePersons(tempDir, row( + "noise-geschirr", "Geschirr", "", "", "", "True")); + + new PersonRegisterImporter(personService).load(xlsx.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + assertThat(captor.getValue().provisional()).isTrue(); + } + + @Test + void load_skipsRowWithBlankPersonId(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + Path xlsx = writePersons(tempDir, row("", "NoId", "", "", "", "False")); + + new PersonRegisterImporter(personService).load(xlsx.toFile()); + + verify(personService, times(0)).upsertBySourceRef(any()); + } + + @Test + void load_returnsCountOfProcessedRows(@TempDir Path 
tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path xlsx = writePersons(tempDir, + row("a-one", "One", "A", "", "", "False"), + row("a-two", "Two", "B", "", "", "False")); + + int processed = new PersonRegisterImporter(personService).load(xlsx.toFile()); + + assertThat(processed).isEqualTo(2); + } + + private static Person personOf(PersonUpsertCommand cmd) { + return Person.builder().id(UUID.randomUUID()).sourceRef(cmd.sourceRef()) + .firstName(cmd.firstName()).lastName(cmd.lastName()) + .provisional(cmd.provisional()).build(); + } + + private Map row(String personId, String lastName, String firstName, + String maidenName, String notes, String provisional) { + Map r = new LinkedHashMap<>(); + r.put("person_id", personId); + r.put("last_name", lastName); + r.put("first_name", firstName); + r.put("maiden_name", maidenName); + r.put("notes", notes); + r.put("provisional", provisional); + return r; + } + + @SafeVarargs + private Path writePersons(Path dir, Map... 
rows) throws Exception { + Path xlsx = dir.resolve("canonical-persons.xlsx"); + List headers = List.of("person_id", "last_name", "first_name", "maiden_name", "notes", "provisional"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.length; r++) { + Row row = sheet.createRow(r + 1); + for (int c = 0; c < headers.size(); c++) { + row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), "")); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonTreeImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonTreeImporterTest.java new file mode 100644 index 00000000..ce90d260 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonTreeImporterTest.java @@ -0,0 +1,163 @@ +package org.raddatz.familienarchiv.importing; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.ArgumentCaptor; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.person.relationship.RelationType; +import org.raddatz.familienarchiv.person.relationship.RelationshipService; +import org.raddatz.familienarchiv.person.relationship.dto.CreateRelationshipRequest; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.UUID; + +import static 
org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class PersonTreeImporterTest { + + @Test + void load_upsertsTreePersonBySourceRef_withFamilyMemberFlag(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_002","firstName":"Elsgard","lastName":"Allemeyer","maidenName":"Wöhler", + "notes":"Nichte","birthYear":1920,"deathYear":1999,"familyMember":true,"personId":"allemeyer-elsgard"} + ],"relationships":[]} + """); + + new PersonTreeImporter(personService, relationshipService) + .load(json.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + PersonUpsertCommand cmd = captor.getValue(); + assertThat(cmd.sourceRef()).isEqualTo("allemeyer-elsgard"); + assertThat(cmd.familyMember()).isTrue(); + assertThat(cmd.provisional()).isFalse(); + } + + @Test + void load_createsRelationship_resolvingRowIdsToUpsertedPersons(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + UUID idA = UUID.randomUUID(); + UUID idB = UUID.randomUUID(); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> { + PersonUpsertCommand c = inv.getArgument(0); + return Person.builder().id(c.sourceRef().equals("a") ? 
idA : idB) + .sourceRef(c.sourceRef()).lastName(c.lastName()).build(); + }); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"}, + {"rowId":"row_b","lastName":"B","familyMember":true,"personId":"b"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_b","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + new PersonTreeImporter(personService, relationshipService) + .load(json.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(CreateRelationshipRequest.class); + verify(relationshipService).addRelationship(eq(idA), captor.capture()); + assertThat(captor.getValue().relatedPersonId()).isEqualTo(idB); + assertThat(captor.getValue().relationType()).isEqualTo(RelationType.SPOUSE_OF); + } + + @Test + void load_swallowsDuplicateRelationship_forIdempotentReimport(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())) + .thenAnswer(inv -> personOf(inv.getArgument(0))); + doThrow(DomainException.conflict(ErrorCode.DUPLICATE_RELATIONSHIP, "exists")) + .when(relationshipService).addRelationship(any(), any()); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"}, + {"rowId":"row_b","lastName":"B","familyMember":true,"personId":"b"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_b","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + PersonTreeImporter importer = new PersonTreeImporter(personService, relationshipService); + + // Must not propagate the conflict — re-import is idempotent. 
+ importer.load(json.toFile()); + + verify(relationshipService).addRelationship(any(), any()); + } + + @Test + void load_propagatesUnexpectedDomainException_fromAddRelationship(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())) + .thenAnswer(inv -> personOf(inv.getArgument(0))); + // An unexpected ErrorCode (not DUPLICATE/CIRCULAR) must NOT be swallowed. + doThrow(DomainException.internal(ErrorCode.INTERNAL_ERROR, "boom")) + .when(relationshipService).addRelationship(any(), any()); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"}, + {"rowId":"row_b","lastName":"B","familyMember":true,"personId":"b"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_b","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + PersonTreeImporter importer = new PersonTreeImporter(personService, relationshipService); + + assertThatThrownBy(() -> importer.load(json.toFile())) + .isInstanceOf(DomainException.class) + .extracting("code").isEqualTo(ErrorCode.INTERNAL_ERROR); + } + + @Test + void load_skipsRelationship_whenRowIdUnresolved(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_ghost","type":"SPOUSE_OF","source":"x"} + ]} + """); + + new PersonTreeImporter(personService, relationshipService) + .load(json.toFile()); + + verify(relationshipService, org.mockito.Mockito.never()).addRelationship(any(), any()); + } + + 
private static Person personOf(PersonUpsertCommand cmd) { + return Person.builder().id(UUID.randomUUID()).sourceRef(cmd.sourceRef()).lastName(cmd.lastName()).build(); + } + + private Path write(Path dir, String json) throws Exception { + Path file = dir.resolve("canonical-persons-tree.json"); + Files.writeString(file, json); + return file; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/TagTreeImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/TagTreeImporterTest.java new file mode 100644 index 00000000..e6becae5 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/TagTreeImporterTest.java @@ -0,0 +1,103 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isNull; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TagTreeImporterTest { + + @Test + void load_upsertsRootTagWithNullParent(@TempDir Path tempDir) throws Exception { + TagService tagService = mock(TagService.class); + when(tagService.upsertBySourceRef(any(), any(), any())) + .thenAnswer(inv -> tagOf(inv.getArgument(0), inv.getArgument(1), inv.getArgument(2))); + Path 
xlsx = writeTagTree(tempDir, List.of( + new String[]{"Themen", "", "Themen"})); + + new TagTreeImporter(tagService).load(xlsx.toFile()); + + verify(tagService).upsertBySourceRef("Themen", "Themen", null); + } + + @Test + void load_resolvesParentByPath_forChildTag(@TempDir Path tempDir) throws Exception { + TagService tagService = mock(TagService.class); + UUID rootId = UUID.randomUUID(); + when(tagService.upsertBySourceRef(eq("Themen"), eq("Themen"), isNull())) + .thenReturn(tagOf("Themen", "Themen", null, rootId)); + when(tagService.upsertBySourceRef(eq("Themen/Brautbriefe"), eq("Brautbriefe"), eq(rootId))) + .thenReturn(tagOf("Themen/Brautbriefe", "Brautbriefe", rootId)); + Path xlsx = writeTagTree(tempDir, List.of( + new String[]{"Themen", "", "Themen"}, + new String[]{"Themen/Brautbriefe", "Themen", "Brautbriefe"})); + + new TagTreeImporter(tagService).load(xlsx.toFile()); + + verify(tagService).upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", rootId); + } + + @Test + void load_returnsCountOfProcessedRows(@TempDir Path tempDir) throws Exception { + TagService tagService = mock(TagService.class); + when(tagService.upsertBySourceRef(any(), any(), any())) + .thenAnswer(inv -> tagOf(inv.getArgument(0), inv.getArgument(1), inv.getArgument(2))); + Path xlsx = writeTagTree(tempDir, List.of( + new String[]{"Themen", "", "Themen"}, + new String[]{"Themen/Brautbriefe", "Themen", "Brautbriefe"})); + + int processed = new TagTreeImporter(tagService).load(xlsx.toFile()); + + assertThat(processed).isEqualTo(2); + } + + private static Tag tagOf(String sourceRef, String name, UUID parentId) { + return tagOf(sourceRef, name, parentId, UUID.randomUUID()); + } + + private static Tag tagOf(String sourceRef, String name, UUID parentId, UUID id) { + return Tag.builder().id(id).sourceRef(sourceRef).name(name).parentId(parentId).build(); + } + + private Path writeTagTree(Path dir, List rows) throws Exception { + Path xlsx = dir.resolve("canonical-tag-tree.xlsx"); + try 
(XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + header.createCell(0).setCellValue("tag_path"); + header.createCell(1).setCellValue("parent_name"); + header.createCell(2).setCellValue("tag_name"); + for (int r = 0; r < rows.size(); r++) { + Row row = sheet.createRow(r + 1); + String[] values = rows.get(r); + for (int c = 0; c < values.length; c++) { + row.createCell(c).setCellValue(values[c]); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonControllerTest.java index e7767411..d43e9a9a 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonControllerTest.java @@ -65,44 +65,144 @@ class PersonControllerTest { @Test @WithMockUser(authorities = "READ_ALL") - void getPersons_returns200_withEmptyList() throws Exception { - when(personService.findAll(null)).thenReturn(Collections.emptyList()); + void getPersons_returns200_withEmptyPagedResult() throws Exception { + when(personService.search(any(), eq(0), eq(50), eq(null))) + .thenReturn(PersonSearchResult.paged(Collections.emptyList(), 0, 50, 0)); mockMvc.perform(get("/api/persons")) - .andExpect(status().isOk()); + .andExpect(status().isOk()) + .andExpect(jsonPath("$.items").isArray()) + .andExpect(jsonPath("$.totalElements").value(0)); } @Test @WithMockUser(authorities = "READ_ALL") void getPersons_delegatesQueryParam_toService() throws Exception { PersonSummaryDTO dto = mockPersonSummary("Hans", "Müller"); - when(personService.findAll("Hans")).thenReturn(List.of(dto)); + when(personService.search(any(), eq(0), eq(50), eq("Hans"))) + .thenReturn(PersonSearchResult.paged(List.of(dto), 0, 50, 1)); 
mockMvc.perform(get("/api/persons").param("q", "Hans")) .andExpect(status().isOk()) - .andExpect(jsonPath("$[0].firstName").value("Hans")); + .andExpect(jsonPath("$.items[0].firstName").value("Hans")); } @Test @WithMockUser(authorities = "READ_ALL") - void getPersons_delegatesTopByDocumentCount_whenSortAndSizeGiven() throws Exception { + void getPersons_passesFilterParams_toService() throws Exception { + ArgumentCaptor filterCaptor = ArgumentCaptor.forClass(PersonFilter.class); + when(personService.search(filterCaptor.capture(), eq(0), eq(50), eq(null))) + .thenReturn(PersonSearchResult.paged(Collections.emptyList(), 0, 50, 0)); + + mockMvc.perform(get("/api/persons") + .param("type", "INSTITUTION") + .param("familyOnly", "true") + .param("hasDocuments", "true") + .param("provisional", "false")) + .andExpect(status().isOk()); + + PersonFilter captured = filterCaptor.getValue(); + assertThat(captured.type()).isEqualTo(PersonType.INSTITUTION); + assertThat(captured.familyOnly()).isTrue(); + assertThat(captured.hasDocuments()).isTrue(); + assertThat(captured.provisional()).isFalse(); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_defaultsToReaderDefault_whenNoReviewFlag() throws Exception { + ArgumentCaptor filterCaptor = ArgumentCaptor.forClass(PersonFilter.class); + when(personService.search(filterCaptor.capture(), eq(0), eq(50), eq(null))) + .thenReturn(PersonSearchResult.paged(Collections.emptyList(), 0, 50, 0)); + + mockMvc.perform(get("/api/persons")).andExpect(status().isOk()); + + assertThat(filterCaptor.getValue().readerDefault()).isTrue(); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_dropsReaderDefault_whenReviewFlagSet() throws Exception { + ArgumentCaptor filterCaptor = ArgumentCaptor.forClass(PersonFilter.class); + when(personService.search(filterCaptor.capture(), eq(0), eq(50), eq(null))) + .thenReturn(PersonSearchResult.paged(Collections.emptyList(), 0, 50, 0)); + + 
mockMvc.perform(get("/api/persons").param("review", "true")).andExpect(status().isOk()); + + assertThat(filterCaptor.getValue().readerDefault()).isFalse(); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_passesPageAndSize_toService() throws Exception { + when(personService.search(any(), eq(2), eq(25), eq(null))) + .thenReturn(PersonSearchResult.paged(Collections.emptyList(), 2, 25, 0)); + + mockMvc.perform(get("/api/persons").param("page", "2").param("size", "25")) + .andExpect(status().isOk()); + + verify(personService).search(any(), eq(2), eq(25), eq(null)); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_returns400_whenSizeIsZero() throws Exception { + mockMvc.perform(get("/api/persons").param("size", "0")) + .andExpect(status().isBadRequest()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_returns400_whenSizeExceeds100() throws Exception { + mockMvc.perform(get("/api/persons").param("size", "101")) + .andExpect(status().isBadRequest()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_returns400_whenPageIsNegative() throws Exception { + mockMvc.perform(get("/api/persons").param("page", "-1")) + .andExpect(status().isBadRequest()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_delegatesTopByDocumentCount_whenSortGiven() throws Exception { PersonSummaryDTO top = mockPersonSummary("Käthe", "Raddatz"); when(personService.findTopByDocumentCount(4)).thenReturn(List.of(top)); mockMvc.perform(get("/api/persons").param("sort", "documentCount").param("size", "4")) .andExpect(status().isOk()) - .andExpect(jsonPath("$[0].firstName").value("Käthe")); + .andExpect(jsonPath("$.items[0].firstName").value("Käthe")); } @Test @WithMockUser(authorities = "READ_ALL") - void getPersons_capsTopByDocumentCount_atFifty() throws Exception { - ArgumentCaptor sizeCaptor = ArgumentCaptor.forClass(Integer.class); - 
when(personService.findTopByDocumentCount(sizeCaptor.capture())).thenReturn(Collections.emptyList()); + void getPersons_topByDocumentCount_isNonPaged_totalElementsEqualsReturnedCount() throws Exception { + // The top-N dashboard path is deliberately NON-paged: it returns the complete result + // (no further page exists), so totalElements equals the number of rows returned and + // totalPages is 1. Pinned so nobody "fixes" it into a misleading paged total. + when(personService.findTopByDocumentCount(50)) + .thenReturn(List.of(mockPersonSummary("Käthe", "Raddatz"), + mockPersonSummary("Hans", "Müller"))); - mockMvc.perform(get("/api/persons").param("sort", "documentCount").param("size", "999")) - .andExpect(status().isOk()); + mockMvc.perform(get("/api/persons").param("sort", "documentCount")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.items.length()").value(2)) + .andExpect(jsonPath("$.totalElements").value(2)) + .andExpect(jsonPath("$.pageNumber").value(0)) + .andExpect(jsonPath("$.pageSize").value(2)) + .andExpect(jsonPath("$.totalPages").value(1)); + } - assertThat(sizeCaptor.getValue()).isEqualTo(50); + @Test + @WithMockUser(authorities = "READ_ALL") + void getPersons_topByDocumentCount_emptyResult_reportsZeroPages() throws Exception { + when(personService.findTopByDocumentCount(50)).thenReturn(Collections.emptyList()); + + mockMvc.perform(get("/api/persons").param("sort", "documentCount")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.totalElements").value(0)) + .andExpect(jsonPath("$.totalPages").value(0)); } private PersonSummaryDTO mockPersonSummary(String firstName, String lastName) { @@ -117,6 +217,7 @@ class PersonControllerTest { public Integer getDeathYear() { return null; } public String getNotes() { return null; } public boolean isFamilyMember() { return false; } + public boolean isProvisional() { return false; } public long getDocumentCount() { return 0; } }; } @@ -397,6 +498,61 @@ class PersonControllerTest { 
.andExpect(status().isNoContent()); } + // ─── PATCH /api/persons/{id}/confirm ────────────────────────────────────── + + @Test + void confirmPerson_returns401_whenUnauthenticated() throws Exception { + mockMvc.perform(patch("/api/persons/{id}/confirm", UUID.randomUUID()).with(csrf())) + .andExpect(status().isUnauthorized()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void confirmPerson_returns403_whenUserHasOnlyReadPermission() throws Exception { + mockMvc.perform(patch("/api/persons/{id}/confirm", UUID.randomUUID()).with(csrf())) + .andExpect(status().isForbidden()); + } + + @Test + @WithMockUser(authorities = "WRITE_ALL") + void confirmPerson_returns200_andClearsProvisional() throws Exception { + UUID id = UUID.randomUUID(); + Person confirmed = Person.builder().id(id).firstName("Bald").lastName("Bestaetigt").provisional(false).build(); + when(personService.confirmPerson(id)).thenReturn(confirmed); + + mockMvc.perform(patch("/api/persons/{id}/confirm", id).with(csrf())) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.provisional").value(false)); + + verify(personService).confirmPerson(id); + } + + // ─── DELETE /api/persons/{id} ────────────────────────────────────────────── + + @Test + void deletePerson_returns401_whenUnauthenticated() throws Exception { + mockMvc.perform(delete("/api/persons/{id}", UUID.randomUUID()).with(csrf())) + .andExpect(status().isUnauthorized()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void deletePerson_returns403_whenUserHasOnlyReadPermission() throws Exception { + mockMvc.perform(delete("/api/persons/{id}", UUID.randomUUID()).with(csrf())) + .andExpect(status().isForbidden()); + } + + @Test + @WithMockUser(authorities = "WRITE_ALL") + void deletePerson_returns204_whenValid() throws Exception { + UUID id = UUID.randomUUID(); + + mockMvc.perform(delete("/api/persons/{id}", id).with(csrf())) + .andExpect(status().isNoContent()); + + verify(personService).deletePerson(id); + } + // ─── PUT 
/api/persons/{id} — lastName blank branch ──────────────────────── @Test diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonImportUpsertTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonImportUpsertTest.java new file mode 100644 index 00000000..c8b81b2b --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonImportUpsertTest.java @@ -0,0 +1,151 @@ +package org.raddatz.familienarchiv.person; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class PersonImportUpsertTest { + + @Mock PersonRepository personRepository; + @Mock PersonNameAliasRepository aliasRepository; + @InjectMocks PersonService personService; + + @Test + void upsertBySourceRef_insertsNewPerson_whenSourceRefUnknown() { + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.empty()); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.getSourceRef()).isEqualTo("clara-cram"); + assertThat(result.getFirstName()).isEqualTo("Clara"); + assertThat(result.getLastName()).isEqualTo("Cram"); + assertThat(result.isProvisional()).isFalse(); + } + + @Test + void upsertBySourceRef_updatesInPlace_whenSourceRefExists() 
{ + Person existing = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .firstName("Clara").lastName("Cram").build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(existing)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .notes("Updated note").personType(PersonType.PERSON).provisional(false).build(); + + personService.upsertBySourceRef(cmd); + + verify(personRepository).save(argThat(p -> p.getId().equals(existing.getId()))); + verify(personRepository, never()).save(argThat(p -> p.getId() == null)); + } + + @Test + void upsertBySourceRef_preservesHumanEditedNonBlankFields() { + // A human renamed the maiden-name register person and added notes in-app. + Person humanEdited = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .firstName("Klara").lastName("Cram-Müller").notes("Verified by Marcel").build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(humanEdited)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .notes("Auto note").personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + // Human edits survive the re-import. 
+ assertThat(result.getFirstName()).isEqualTo("Klara"); + assertThat(result.getLastName()).isEqualTo("Cram-Müller"); + assertThat(result.getNotes()).isEqualTo("Verified by Marcel"); + } + + @Test + void upsertBySourceRef_fillsOnlyBlankFields_onReimport() { + Person existing = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .firstName("Clara").lastName("Cram").notes(null).build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(existing)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .notes("Nichte von Herbert").personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + // Blank field gets filled by canonical value. + assertThat(result.getNotes()).isEqualTo("Nichte von Herbert"); + } + + @Test + void upsertBySourceRef_fillsBlankYears_butPreservesHumanEditedYears_onReimport() { + // Existing has a human-set birthYear and a blank deathYear. 
+ Person existing = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .lastName("Cram").birthYear(1890).deathYear(null).build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(existing)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").lastName("Cram") + .birthYear(1888).deathYear(1965) + .personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.getBirthYear()).isEqualTo(1890); // human value kept + assertThat(result.getDeathYear()).isEqualTo(1965); // blank filled from canonical + } + + @Test + void upsertBySourceRef_neverFlipsProvisionalBackToTrue_onceHumanConfirmed() { + // A human confirmed this provisional importer-created person (provisional -> false). + Person confirmed = Person.builder() + .id(UUID.randomUUID()).sourceRef("schwester-hanni") + .firstName(null).lastName("Schwester Hanni").provisional(false).build(); + when(personRepository.findBySourceRef("schwester-hanni")).thenReturn(Optional.of(confirmed)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("schwester-hanni").lastName("Schwester Hanni") + .personType(PersonType.PERSON).provisional(true).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.isProvisional()).isFalse(); + } + + @Test + void upsertBySourceRef_setsProvisionalTrue_forNewProvisionalPerson() { + when(personRepository.findBySourceRef("noise-geschirr")).thenReturn(Optional.empty()); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("noise-geschirr").lastName("Tante Tüten") + .personType(PersonType.PERSON).provisional(true).build(); + + Person result = 
personService.upsertBySourceRef(cmd); + + assertThat(result.isProvisional()).isTrue(); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonRepositoryTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonRepositoryTest.java index 8ccf27ba..910e701e 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonRepositoryTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonRepositoryTest.java @@ -463,4 +463,213 @@ class PersonRepositoryTest { assertThat(result).hasSize(1); assertThat(result.get(0).getLastName()).isEqualTo("Gesellschafter des Verlages"); } + + // ─── #671: provisional must be SELECTed in all three native projections ─── + // Adding isProvisional() to the interface compiles even if a native query forgets + // to SELECT p.provisional — it then silently returns false. These tests are the only + // guard against that trap, so they must run against real Postgres. + + @Test + void findAllWithDocumentCount_projectsProvisionalTrue() { + personRepository.save(Person.builder() + .firstName("Inferred").lastName("Person").provisional(true).build()); + + List result = personRepository.findAllWithDocumentCount(); + + assertThat(result).anyMatch(PersonSummaryDTO::isProvisional); + } + + @Test + void searchWithDocumentCount_projectsProvisionalTrue() { + personRepository.save(Person.builder() + .firstName("Provisorisch").lastName("Müller").provisional(true).build()); + + List result = personRepository.searchWithDocumentCount("Provisorisch"); + + assertThat(result).hasSize(1); + assertThat(result.get(0).isProvisional()).isTrue(); + } + + @Test + void findTopByDocumentCount_projectsProvisionalTrue() { + Person provisional = personRepository.save(Person.builder() + .firstName("Top").lastName("Provisional").provisional(true).build()); + documentRepository.save(Document.builder() + .title("Brief").originalFilename("b.pdf") + .status(DocumentStatus.UPLOADED) + 
.sender(provisional).build()); + + List result = personRepository.findTopByDocumentCount(10); + + PersonSummaryDTO summary = result.stream() + .filter(p -> p.getId().equals(provisional.getId())).findFirst().orElseThrow(); + assertThat(summary.isProvisional()).isTrue(); + } + + // ─── #667: filter-aware paged slice + paired COUNT (Postgres-only) ──────── + // The slice query (findByFilter) and the count query (countByFilter) MUST share one + // WHERE clause so totalElements can never drift from the rendered page. These tests run + // against real Postgres because the slice ORDER BY uses a computed alias that fails on H2. + + private void seedDirectoryFixture() { + // Register family member, no documents — visible by reader default (familyMember) + personRepository.save(Person.builder().firstName("Karl").lastName("Register").familyMember(true).build()); + // Person with one document — visible by reader default (documentCount > 0) + Person hasDoc = personRepository.save(Person.builder().firstName("Doku").lastName("Person").build()); + documentRepository.save(Document.builder().title("B").originalFilename("b.pdf") + .status(DocumentStatus.UPLOADED).sender(hasDoc).build()); + // Provisional, zero-document, non-family — hidden by reader default + personRepository.save(Person.builder().firstName("Unbe").lastName("Staetigt").provisional(true).build()); + // An institution with no documents, non-family, non-provisional + personRepository.save(Person.builder().lastName("Verlag GmbH").personType(PersonType.INSTITUTION).build()); + } + + @Test + void findByFilter_readerDefault_returnsOnlyFamilyOrWithDocuments() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, null, null, true, null, 50, 0); + + assertThat(slice).extracting(PersonSummaryDTO::getLastName) + .containsExactlyInAnyOrder("Register", "Person"); + } + + @Test + void countByFilter_readerDefault_matchesSliceSize() { + seedDirectoryFixture(); + + long count = 
personRepository.countByFilter(null, null, null, null, true, null); + + assertThat(count).isEqualTo(2); + } + + @Test + void findByFilter_showAll_returnsEveryone() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, null, null, false, null, 50, 0); + + assertThat(slice).hasSize(4); + } + + @Test + void findByFilter_typeInstitution_returnsOnlyInstitutions() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + "INSTITUTION", null, null, null, false, null, 50, 0); + + assertThat(slice).extracting(PersonSummaryDTO::getLastName).containsExactly("Verlag GmbH"); + } + + @Test + void findByFilter_familyOnly_returnsOnlyFamilyMembers() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, true, null, null, false, null, 50, 0); + + assertThat(slice).extracting(PersonSummaryDTO::getLastName).containsExactly("Register"); + } + + @Test + void findByFilter_hasDocuments_returnsOnlyPersonsWithDocuments() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, true, null, false, null, 50, 0); + + assertThat(slice).extracting(PersonSummaryDTO::getLastName).containsExactly("Person"); + } + + @Test + void findByFilter_provisionalTrue_returnsOnlyProvisional() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, null, true, false, null, 50, 0); + + assertThat(slice).extracting(PersonSummaryDTO::getLastName).containsExactly("Staetigt"); + } + + @Test + void findByFilter_combinedFilters_andTogether() { + seedDirectoryFixture(); + // family + has-documents → intersection is empty (Register has no docs, Doku is not family) + List slice = personRepository.findByFilter( + null, true, true, null, false, null, 50, 0); + + assertThat(slice).isEmpty(); + } + + @Test + void findByFilter_query_combinesWithFilters() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, null, null, false, "Verlag", 50, 
0); + + assertThat(slice).extracting(PersonSummaryDTO::getLastName).containsExactly("Verlag GmbH"); + } + + @Test + void findByFilter_pageBeyondRange_returnsEmptySlice() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, null, null, false, null, 50, 999 * 50); + + assertThat(slice).isEmpty(); + } + + @Test + void findByFilter_respectsPageSize() { + seedDirectoryFixture(); + + List firstPage = personRepository.findByFilter( + null, null, null, null, false, null, 2, 0); + List secondPage = personRepository.findByFilter( + null, null, null, null, false, null, 2, 2); + + assertThat(firstPage).hasSize(2); + assertThat(secondPage).hasSize(2); + assertThat(firstPage).extracting(PersonSummaryDTO::getId) + .doesNotContainAnyElementsOf(secondPage.stream().map(PersonSummaryDTO::getId).toList()); + } + + @Test + void countByFilter_typeInstitution_matchesSlice() { + seedDirectoryFixture(); + + long count = personRepository.countByFilter("INSTITUTION", null, null, null, false, null); + + assertThat(count).isEqualTo(1); + } + + @Test + void countByFilter_query_matchesSliceSize() { + // The whole point of the shared FILTER_WHERE is that the slice and the count can never + // drift. Pin the query (LIKE) path explicitly: countByFilter must equal the slice size + // so a future edit to one query's LIKE clause is caught. 
+ seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, null, null, false, "Verlag", 50, 0); + long count = personRepository.countByFilter(null, null, null, null, false, "Verlag"); + + assertThat(count).isEqualTo(slice.size()); + assertThat(count).isEqualTo(1); + } + + @Test + void findByFilter_projectsDocumentCount() { + seedDirectoryFixture(); + + List slice = personRepository.findByFilter( + null, null, true, null, false, null, 50, 0); + + assertThat(slice.get(0).getDocumentCount()).isEqualTo(1); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceIntegrationTest.java index e8d5ed97..0578f5fb 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceIntegrationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceIntegrationTest.java @@ -2,6 +2,9 @@ package org.raddatz.familienarchiv.person; import org.junit.jupiter.api.Test; import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentRepository; +import org.raddatz.familienarchiv.document.DocumentStatus; import org.raddatz.familienarchiv.person.Person; import org.raddatz.familienarchiv.person.PersonType; import org.raddatz.familienarchiv.person.PersonRepository; @@ -13,6 +16,11 @@ import org.springframework.test.context.bean.override.mockito.MockitoBean; import org.springframework.transaction.annotation.Transactional; import software.amazon.awssdk.services.s3.S3Client; +import jakarta.persistence.EntityManager; +import jakarta.persistence.PersistenceContext; + +import java.util.Set; + import static org.assertj.core.api.Assertions.assertThat; @SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) @@ -24,6 +32,9 @@ class PersonServiceIntegrationTest { @MockitoBean S3Client s3Client; 
@Autowired PersonService personService; @Autowired PersonRepository personRepository; + @Autowired DocumentRepository documentRepository; + + @PersistenceContext EntityManager entityManager; @Test void findOrCreateByAlias_skipReturnsNull_noRecordCreated() { @@ -63,4 +74,97 @@ class PersonServiceIntegrationTest { assertThat(result.getFirstName()).isEqualTo("Clara"); assertThat(result.getLastName()).isEqualTo("Cram"); } + + // ─── #667: confirm round-trip + reader-default semantics ────────────────── + + @Test + void search_readerDefault_hidesProvisionalZeroDocumentPerson() { + personRepository.save(Person.builder() + .firstName("Unbe").lastName("Staetigt").provisional(true).build()); + + PersonSearchResult result = personService.search(PersonFilter.cleanDefault(), 0, 50, null); + + assertThat(result.items()).noneMatch(p -> p.getLastName().equals("Staetigt")); + assertThat(result.totalElements()).isEqualTo(result.items().size()); + } + + @Test + void search_showAll_includesProvisionalZeroDocumentPerson() { + personRepository.save(Person.builder() + .firstName("Unbe").lastName("Staetigt").provisional(true).build()); + + PersonSearchResult result = personService.search(PersonFilter.showAll(), 0, 50, null); + + assertThat(result.items()).anyMatch(p -> p.getLastName().equals("Staetigt")); + } + + @Test + void confirmPerson_clearsProvisional_andShowAllTreatsItAsConfirmed() { + Person provisional = personRepository.save(Person.builder() + .firstName("Bald").lastName("Bestaetigt").provisional(true).build()); + + personService.confirmPerson(provisional.getId()); + + Person reloaded = personRepository.findById(provisional.getId()).orElseThrow(); + assertThat(reloaded.isProvisional()).isFalse(); + + PersonSearchResult showAll = personService.search(PersonFilter.showAll(), 0, 50, null); + assertThat(showAll.items()) + .filteredOn(p -> p.getId().equals(provisional.getId())) + .allMatch(p -> !p.isProvisional()); + } + + @Test + void deletePerson_removesPerson() { + Person target 
= personRepository.save(Person.builder() + .firstName("Weg").lastName("Person").provisional(true).build()); + + personService.deletePerson(target.getId()); + + assertThat(personRepository.findById(target.getId())).isEmpty(); + } + + @Test + void deletePerson_detachesSentAndReceivedReferences_beforeDelete_noOrphan() { + // A person referenced as BOTH a document sender and a document receiver must delete + // cleanly: deletePerson nulls the sender_id FK and removes the receiver join row first + // (reassignSenderToNull → deleteReceiverReferences → deleteById), so no FK orphan and + // the documents themselves survive. + Person target = personRepository.save(Person.builder() + .firstName("Weg").lastName("Person").provisional(true).build()); + Person bystander = personRepository.save(Person.builder() + .firstName("Bleibt").lastName("Hier").build()); + + Document sent = documentRepository.save(Document.builder() + .title("Sent letter").originalFilename("sent.pdf") + .status(DocumentStatus.UPLOADED).sender(target).build()); + Document received = documentRepository.save(Document.builder() + .title("Received letter").originalFilename("received.pdf") + .status(DocumentStatus.UPLOADED).sender(bystander) + .receivers(new java.util.HashSet<>(Set.of(target))).build()); + + // Persist the fixture and detach everything so the native @Modifying deletes operate on + // the database directly without the persistence context holding stale references that + // would re-flush a now-deleted person as a transient association. + entityManager.flush(); + entityManager.clear(); + + personService.deletePerson(target.getId()); + + // Native @Modifying queries bypass the persistence context — clear it so the asserting + // reads observe the post-delete database state, not stale managed entities. 
+ entityManager.flush(); + entityManager.clear(); + + assertThat(personRepository.findById(target.getId())).isEmpty(); + + Document reloadedSent = documentRepository.findById(sent.getId()).orElseThrow(); + assertThat(reloadedSent.getSender()).isNull(); + + Document reloadedReceived = documentRepository.findById(received.getId()).orElseThrow(); + assertThat(reloadedReceived.getReceivers()) + .noneMatch(p -> p.getId().equals(target.getId())); + // The other person and the documents themselves survive the delete. + assertThat(personRepository.findById(bystander.getId())).isPresent(); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceTest.java index 1ad9ce27..4c8de65c 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonServiceTest.java @@ -58,33 +58,109 @@ class PersonServiceTest { assertThat(personService.getById(id)).isEqualTo(person); } - // ─── findAll ───────────────────────────────────────────────────────────── + // ─── #667: search (filter + pagination) ────────────────────────────────── @Test - void findAll_returnsAll_whenQueryIsNull() { - List expected = List.of(); - when(personRepository.findAllWithDocumentCount()).thenReturn(expected); + void search_returnsPagedResult_withTotalsFromCountQuery() { + PersonFilter filter = PersonFilter.cleanDefault(); + when(personRepository.countByFilter(null, null, null, null, true, null)).thenReturn(120L); + when(personRepository.findByFilter(null, null, null, null, true, null, 50, 0)) + .thenReturn(List.of()); - assertThat(personService.findAll(null)).isEqualTo(expected); - verify(personRepository).findAllWithDocumentCount(); - verify(personRepository, never()).searchWithDocumentCount(any()); + PersonSearchResult result = personService.search(filter, 0, 50, null); + + 
assertThat(result.totalElements()).isEqualTo(120L); + assertThat(result.pageNumber()).isEqualTo(0); + assertThat(result.pageSize()).isEqualTo(50); + assertThat(result.totalPages()).isEqualTo(3); // ceil(120 / 50) } @Test - void findAll_returnsEmpty_whenQueryIsWhitespaceOnly() { - assertThat(personService.findAll(" ")).isEmpty(); - verify(personRepository, never()).findAllWithDocumentCount(); - verify(personRepository, never()).searchWithDocumentCount(any()); + void search_passesTypeAsEnumName_toRepository() { + PersonFilter filter = PersonFilter.builder().type(PersonType.INSTITUTION).build(); + when(personRepository.countByFilter("INSTITUTION", null, null, null, false, null)).thenReturn(0L); + when(personRepository.findByFilter("INSTITUTION", null, null, null, false, null, 50, 0)) + .thenReturn(List.of()); + + personService.search(filter, 0, 50, null); + + verify(personRepository).findByFilter("INSTITUTION", null, null, null, false, null, 50, 0); } @Test - void findAll_searchesByName_whenQueryIsNonBlank() { - List expected = List.of(); - when(personRepository.searchWithDocumentCount("Anna")).thenReturn(expected); + void search_computesOffset_fromPageAndSize() { + PersonFilter filter = PersonFilter.showAll(); + when(personRepository.countByFilter(null, null, null, null, false, null)).thenReturn(0L); + when(personRepository.findByFilter(null, null, null, null, false, null, 20, 40)) + .thenReturn(List.of()); - assertThat(personService.findAll("Anna")).isEqualTo(expected); - verify(personRepository).searchWithDocumentCount("Anna"); - verify(personRepository, never()).findAllWithDocumentCount(); + personService.search(filter, 2, 20, null); // offset = page * size = 40 + + verify(personRepository).findByFilter(null, null, null, null, false, null, 20, 40); + } + + @Test + void search_trimsBlankQueryToNull() { + PersonFilter filter = PersonFilter.showAll(); + when(personRepository.countByFilter(null, null, null, null, false, null)).thenReturn(0L); + 
when(personRepository.findByFilter(null, null, null, null, false, null, 50, 0)) + .thenReturn(List.of()); + + personService.search(filter, 0, 50, " "); + + verify(personRepository).findByFilter(null, null, null, null, false, null, 50, 0); + } + + // ─── #667: confirmPerson ────────────────────────────────────────────────── + + @Test + void confirmPerson_clearsProvisionalFlag() { + UUID id = UUID.randomUUID(); + Person provisional = Person.builder().id(id).firstName("Inferred").lastName("Person").provisional(true).build(); + when(personRepository.findById(id)).thenReturn(Optional.of(provisional)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + Person result = personService.confirmPerson(id); + + assertThat(result.isProvisional()).isFalse(); + verify(personRepository).save(argThat(p -> !p.isProvisional())); + } + + @Test + void confirmPerson_throwsNotFound_whenMissing() { + UUID id = UUID.randomUUID(); + when(personRepository.findById(id)).thenReturn(Optional.empty()); + + assertThatThrownBy(() -> personService.confirmPerson(id)) + .isInstanceOf(DomainException.class) + .extracting(e -> ((DomainException) e).getStatus().value()) + .isEqualTo(404); + } + + // ─── #667: deletePerson ─────────────────────────────────────────────────── + + @Test + void deletePerson_deletes_whenPersonExists() { + UUID id = UUID.randomUUID(); + Person person = Person.builder().id(id).firstName("Weg").lastName("Person").build(); + when(personRepository.findById(id)).thenReturn(Optional.of(person)); + + personService.deletePerson(id); + + verify(personRepository).reassignSenderToNull(id); + verify(personRepository).deleteReceiverReferences(id); + verify(personRepository).deleteById(id); + } + + @Test + void deletePerson_throwsNotFound_whenMissing() { + UUID id = UUID.randomUUID(); + when(personRepository.findById(id)).thenReturn(Optional.empty()); + + assertThatThrownBy(() -> personService.deletePerson(id)) + .isInstanceOf(DomainException.class) + 
.extracting(e -> ((DomainException) e).getStatus().value()) + .isEqualTo(404); } // ─── createPerson ───────────────────────────────────────────────────────── diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceIntegrationTest.java index a2d4a5f2..acbb3825 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceIntegrationTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceIntegrationTest.java @@ -144,10 +144,12 @@ class RelationshipServiceIntegrationTest { @Test void setFamilyMember_true_makes_person_appear_in_network() { - // charlie starts with familyMember = false. Add a PARENT_OF edge alice→charlie - // so the edge exists, then flip charlie's flag and verify he appears in nodes. + // addRelationship side-effects family_member=true on both endpoints for family-graph + // edges (PARENT_OF/SPOUSE_OF/SIBLING_OF). Reset charlie so the explicit + // setFamilyMember(true) call below is the thing under test, not the auto-flip. 
relationshipService.addRelationship(alice.getId(), new CreateRelationshipRequest(charlie.getId(), RelationType.PARENT_OF, null, null, null)); + relationshipService.setFamilyMember(charlie.getId(), false); NetworkDTO before = relationshipService.getFamilyNetwork(); assertThat(before.nodes()).extracting("id").doesNotContain(charlie.getId()); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceTest.java index 0a0b963c..c8d1faf6 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/relationship/RelationshipServiceTest.java @@ -23,6 +23,8 @@ import java.util.UUID; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -148,6 +150,50 @@ class RelationshipServiceTest { assertThat(result.notes()).isEqualTo("first born"); } + @Test + void addRelationship_marks_both_endpoints_as_family_member_when_type_is_family() { + // Creating a family-graph edge (PARENT_OF / SPOUSE_OF / SIBLING_OF) must mark both + // endpoints as family members so they appear in findAllFamilyMembers and the network. + // This is what makes the canonical importer's relationships actually show up in the UI. 
+ when(personService.getById(alice.getId())).thenReturn(alice); + when(personService.getById(bob.getId())).thenReturn(bob); + when(relationshipRepository.existsByPersonIdAndRelatedPersonIdAndRelationType( + bob.getId(), alice.getId(), RelationType.PARENT_OF)).thenReturn(false); + when(relationshipRepository.saveAndFlush(any())).thenAnswer(inv -> { + PersonRelationship r = inv.getArgument(0); + r.setId(UUID.randomUUID()); + r.setCreatedAt(Instant.now()); + return r; + }); + + var dto = new CreateRelationshipRequest(bob.getId(), RelationType.PARENT_OF, null, null, null); + service.addRelationship(alice.getId(), dto); + + verify(personService).setFamilyMember(alice.getId(), true); + verify(personService).setFamilyMember(bob.getId(), true); + } + + @Test + void addRelationship_does_not_flip_family_member_for_non_family_type() { + // FRIEND / COLLEAGUE / EMPLOYER / DOCTOR / NEIGHBOR / OTHER are NOT family-graph + // edges (see getFamilyNetwork's filter), so addRelationship must leave family_member + // alone — a doctor of the family is not a family member. 
+ when(personService.getById(alice.getId())).thenReturn(alice); + when(personService.getById(bob.getId())).thenReturn(bob); + when(relationshipRepository.saveAndFlush(any())).thenAnswer(inv -> { + PersonRelationship r = inv.getArgument(0); + r.setId(UUID.randomUUID()); + r.setCreatedAt(Instant.now()); + return r; + }); + + var dto = new CreateRelationshipRequest(bob.getId(), RelationType.FRIEND, null, null, null); + service.addRelationship(alice.getId(), dto); + + verify(personService, never()).setFamilyMember(eq(alice.getId()), anyBoolean()); + verify(personService, never()).setFamilyMember(eq(bob.getId()), anyBoolean()); + } + @Test void deleteRelationship_succeeds_when_viewpoint_is_object() { UUID relId = UUID.randomUUID(); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/tag/TagImportUpsertTest.java b/backend/src/test/java/org/raddatz/familienarchiv/tag/TagImportUpsertTest.java new file mode 100644 index 00000000..c2e29dc0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/tag/TagImportUpsertTest.java @@ -0,0 +1,62 @@ +package org.raddatz.familienarchiv.tag; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TagImportUpsertTest { + + @Mock TagRepository tagRepository; + @InjectMocks TagService tagService; + + @Test + void upsertBySourceRef_insertsNewTag_whenSourceRefUnknown() { + when(tagRepository.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.empty()); + 
when(tagRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + UUID parentId = UUID.randomUUID(); + Tag result = tagService.upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", parentId); + + assertThat(result.getSourceRef()).isEqualTo("Themen/Brautbriefe"); + assertThat(result.getName()).isEqualTo("Brautbriefe"); + assertThat(result.getParentId()).isEqualTo(parentId); + } + + @Test + void upsertBySourceRef_updatesInPlace_whenSourceRefExists() { + Tag existing = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe") + .sourceRef("Themen/Brautbriefe").build(); + when(tagRepository.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(existing)); + when(tagRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + tagService.upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", null); + + verify(tagRepository).save(argThat(t -> t.getId().equals(existing.getId()))); + verify(tagRepository, never()).save(argThat(t -> t.getId() == null)); + } + + @Test + void upsertBySourceRef_preservesHumanRenamedTag_onReimport() { + Tag humanRenamed = Tag.builder().id(UUID.randomUUID()).name("Verlobungsbriefe") + .sourceRef("Themen/Brautbriefe").build(); + when(tagRepository.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(humanRenamed)); + when(tagRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + Tag result = tagService.upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", null); + + assertThat(result.getName()).isEqualTo("Verlobungsbriefe"); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java index b87b928b..8e51fad7 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java @@ -7,7 +7,8 @@ import org.raddatz.familienarchiv.security.PermissionAspect; import 
org.raddatz.familienarchiv.user.CustomUserDetailsService; import org.raddatz.familienarchiv.document.DocumentService; import org.raddatz.familienarchiv.document.DocumentVersionService; -import org.raddatz.familienarchiv.importing.MassImportService; +import org.raddatz.familienarchiv.importing.CanonicalImportOrchestrator; +import org.raddatz.familienarchiv.importing.ImportStatus; import org.raddatz.familienarchiv.document.ThumbnailBackfillService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.aop.AopAutoConfiguration; @@ -35,7 +36,7 @@ class AdminControllerTest { @Autowired MockMvc mockMvc; - @MockitoBean MassImportService massImportService; + @MockitoBean CanonicalImportOrchestrator importOrchestrator; @MockitoBean DocumentService documentService; @MockitoBean DocumentVersionService documentVersionService; @MockitoBean ThumbnailBackfillService thumbnailBackfillService; @@ -46,9 +47,9 @@ class AdminControllerTest { @Test @WithMockUser(authorities = "ADMIN") void importStatus_returns200_withStatusCode_whenAdmin() throws Exception { - MassImportService.ImportStatus status = new MassImportService.ImportStatus( - MassImportService.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - when(massImportService.getStatus()).thenReturn(status); + ImportStatus status = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + when(importOrchestrator.getStatus()).thenReturn(status); mockMvc.perform(get("/api/admin/import-status")) .andExpect(status().isOk()) @@ -60,9 +61,9 @@ class AdminControllerTest { @Test @WithMockUser(authorities = "ADMIN") void importStatus_messageField_notPresentInApiResponse() throws Exception { - MassImportService.ImportStatus status = new MassImportService.ImportStatus( - MassImportService.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - 
when(massImportService.getStatus()).thenReturn(status); + ImportStatus status = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + when(importOrchestrator.getStatus()).thenReturn(status); mockMvc.perform(get("/api/admin/import-status")) .andExpect(status().isOk()) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index cdae6581..26e07442 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -26,15 +26,19 @@ # MAIL_HOST, MAIL_PORT, SMTP relay (production only; staging uses mailpit) # MAIL_USERNAME, MAIL_PASSWORD # APP_MAIL_FROM sender address (e.g. noreply@raddatz.cloud) -# IMPORT_HOST_DIR absolute host path holding ONLY the ODS -# spreadsheet and PDFs for /admin/system mass +# IMPORT_HOST_DIR absolute host path holding the canonical +# import artifacts (canonical-*.xlsx + +# canonical-persons-tree.json) and the +# .pdf files for /admin/system # import — mounted read-only at /import inside # the backend. Compose refuses to start when # this var is unset, so staging and prod cannot # accidentally share an import source. Must be # readable by the backend container's UID # (currently root via the OpenJDK image — any -# world-readable directory works). +# world-readable directory works). Canonical +# artifacts are NOT in git (PII — ADR-025); ops +# syncs them in beside the PDFs out-of-band. networks: archiv-net: @@ -217,12 +221,17 @@ services: # Bound to localhost only — Caddy fronts external traffic. ports: - "127.0.0.1:${PORT_BACKEND}:8080" - # Host path holding the ODS spreadsheet + PDFs for the mass-import endpoint. - # Read-only; MassImportService only reads (Files.list / Files.walk on /import). + # Host path holding the canonical import artifacts (canonical-*.xlsx + + # canonical-persons-tree.json) + .pdf files for the import endpoint. + # Read-only; the canonical importer only reads them from /import. 
# Required — no default — so staging and prod cannot accidentally share an # import source. CI workflows pin this per-env (see .gitea/workflows/). + # NOTE: the canonical artifacts are NOT version-controlled (they contain real + # family PII — see ADR-025). Ops must produce them locally from the Python + # normalizer (tools/import-normalizer/) and sync them into this host path + # alongside the .pdf corpus before triggering an import. volumes: - - ${IMPORT_HOST_DIR:?Set IMPORT_HOST_DIR to a host path holding the mass-import payload (ODS + PDFs). See docs/DEPLOYMENT.md.}:/import:ro + - ${IMPORT_HOST_DIR:?Set IMPORT_HOST_DIR to a host path holding the import payload (canonical artifacts + .pdf files). See docs/DEPLOYMENT.md.}:/import:ro environment: SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv SPRING_DATASOURCE_USERNAME: archiv diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index de071a43..5bc46261 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -65,7 +65,7 @@ Members of the cross-cutting layer have no entity of their own, no user-facing C | `dashboard` | Stats aggregation for the admin dashboard and Family Pulse widget | Aggregates from 3+ domains; no owned entities | | `exception` | `DomainException`, `ErrorCode` enum, `GlobalExceptionHandler` | Framework infra; consumed by every controller and service. Adding a new `ErrorCode` requires matching updates in `frontend/src/lib/shared/errors.ts` and all three `messages/*.json` locale files. Current security-related codes: `CSRF_TOKEN_MISSING` (403 on mutating request without valid `X-XSRF-TOKEN` header), `TOO_MANY_LOGIN_ATTEMPTS` (429 when login rate limit exceeded). 
| | `filestorage` | `FileService` — MinIO/S3 upload, download, presigned-URL generation | Generic service; consumed by `document` and `ocr` | -| `importing` | `MassImportService` — async ODS/Excel batch import | Orchestrates across `person`, `tag`, `document` | +| `importing` | `CanonicalImportOrchestrator` — async canonical import running four idempotent loaders (`TagTreeImporter` → `PersonRegisterImporter` → `PersonTreeImporter` → `DocumentImporter`) over the normalizer's committed canonical artifacts (`canonical-*.xlsx` + `canonical-persons-tree.json`) | Orchestrates across `person`, `tag`, `document` | | `security` | `SecurityConfig`, `Permission` enum, `@RequirePermission` annotation, `PermissionAspect` (AOP) | Framework infra; enforced globally across all controllers | **Frontend `shared/`** follows the same admission criteria. Key members: `api.server.ts` (typed openapi-fetch client factory), `errors.ts` (backend `ErrorCode` → i18n mapping), `shared/primitives/` (generic UI components used across ≥2 domains), `shared/discussion/` (comment/mention editor used by `document` and `geschichte`), `shared/utils/` (pure date/sort/debounce utilities). diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index c6560a0a..2e79481e 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -99,7 +99,7 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | `APP_BASE_URL` | Public-facing URL for email links | `http://localhost:3000` | YES (prod) | — | | `APP_OCR_BASE_URL` | Internal URL of the OCR service | — | YES | — | | `APP_OCR_TRAINING_TOKEN` | Secret token for OCR training endpoints | — | YES (prod) | YES | -| `IMPORT_HOST_DIR` | Absolute host path holding the ODS spreadsheet + PDFs for the `/admin/system` mass-import card. Mounted read-only at `/import` inside the backend (compose-only — backend reads via `app.import.dir`). Compose refuses to start when unset, so staging and prod cannot accidentally share the source. 
Convention: `/srv/familienarchiv-staging/import` and `/srv/familienarchiv-production/import` | — | YES (prod compose) | — | +| `IMPORT_HOST_DIR` | Absolute host path holding the normalizer's canonical artifacts (`canonical-{documents,persons,tag-tree}.xlsx` + `canonical-persons-tree.json`) **plus the `.pdf` files** for the `/admin/system` import. Mounted read-only at `/import` inside the backend (the canonical importer reads via `app.import.dir`). Compose refuses to start when unset, so staging and prod cannot accidentally share the source. Convention: `/srv/familienarchiv-staging/import` and `/srv/familienarchiv-production/import` | — | YES (prod compose) | — | | `MAIL_HOST` | SMTP host | `mailpit` (dev) | YES (prod) | — | | `MAIL_PORT` | SMTP port | `1025` (dev) | YES (prod) | — | | `MAIL_USERNAME` | SMTP username | — | YES (prod) | YES | @@ -559,20 +559,45 @@ bash scripts/download-kraken-models.sh > Downloads the Kurrent/Sütterlin HTR models. Run once after a fresh clone or when models are updated. -### Trigger a mass import (Excel/ODS) +### Trigger a canonical import -**Dev:** drop the ODS spreadsheet + PDFs into `./import/` at the repo root — the dev compose bind-mounts it to `/import` automatically. +The importer no longer parses the raw spreadsheet. It consumes the **canonical artifacts** +produced by the normalizer (`tools/import-normalizer/`) — `canonical-tag-tree.xlsx`, +`canonical-persons.xlsx`, `canonical-persons-tree.json`, `canonical-documents.xlsx` — which +are committed under `tools/import-normalizer/out/`. The semantic transformation +(German-date parsing, name classification) lives entirely in the normalizer; the backend +maps the clean columns by header name. See [ADR-025](adr/025-canonical-import-and-single-migration-schema-foundation.md). 
+ +**Prerequisite — regenerate the artifacts when the source data changes:** + +```bash +cd tools/import-normalizer +python3 -m venv .venv && .venv/bin/pip install -r requirements.txt # once, on a fresh clone +.venv/bin/python normalize.py +# writes the four canonical artifacts into ./out/ +``` + +**Dev:** place all four canonical artifacts **plus** the PDFs into `./import/` +at the repo root (the dev compose bind-mounts it to `/import`, which is `app.import.dir`). +Each PDF must be named `<index>.pdf` (e.g. `W-0124.pdf`, `Mü-0001.pdf`) and live flat in the +import dir: since #686 the importer resolves a document's PDF directly by its index +(`importDir/<index>.pdf`), not via a `datei`/`file` column — the recursive directory walk and +its basename/homoglyph guards are gone, replaced by strict index validation plus a +canonical-path containment assertion (a document whose `<index>.pdf` is absent simply becomes a +`PLACEHOLDER`). The orchestrator smoke-checks that all four artifacts are present before +starting and fails closed (`IMPORT_ARTIFACT_INVALID`) if any is missing. **Staging/production:** -1. Pre-stage the payload on the host. Convention: `/srv/familienarchiv-staging/import/` or `/srv/familienarchiv-production/import/`. +1. Pre-stage the four canonical artifacts + PDFs on the host. Convention: + `/srv/familienarchiv-staging/import/` or `/srv/familienarchiv-production/import/`. ```bash rsync -avh --progress ./import/ user@host:/srv/familienarchiv-staging/import/ ``` 2. Make sure `IMPORT_HOST_DIR=` is set in `.env.staging` / `.env.production` (the nightly/release workflows already write this — see §3). Compose refuses to start without it. 3. Redeploy the stack so the bind mount picks up — or, if the mount is already in place, skip to step 4. 4. Call `POST /api/admin/trigger-import` (requires `ADMIN` permission), or click the "Import starten" button on `/admin/system`. -5. 
The import runs asynchronously — poll `GET /api/admin/import-status`, watch `/admin/system`, or tail the backend logs. +5. The import runs asynchronously — poll `GET /api/admin/import-status`, watch `/admin/system`, or tail the backend logs. Re-running is safe and idempotent (upsert by `source_ref` / document `index`). Person and tag scalar fields you edited in the app are preserved on re-import; a document's sender/receivers/tags are **canonical-authoritative** — a re-import re-applies them to exactly match the export, so a link removed from the export is removed from the document (the raw sender/receiver cell text is always kept). --- diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md index 99da1775..074f2fe1 100644 --- a/docs/GLOSSARY.md +++ b/docs/GLOSSARY.md @@ -25,6 +25,11 @@ _Not to be confused with [AppUser](#appuser-appuser)_ — `Person` is a historic **UserGroup** (`UserGroup`) — a named permission bundle assigned to one or more `AppUser`s. A user's effective permissions are the union of all permissions across all groups they belong to. +**source_ref** (`Person.sourceRef`, `Tag.sourceRef`) — the import normalizer's stable identity for a `Person` (its `person_id`) or `Tag` (its canonical `tag_path`). It is the join key linking normalized records to documents and the idempotency key for re-import; null for manually created records and unique among non-null values. + +**provisional person** (`Person.provisional`) — a `Person` the importer inferred from raw attribution text but could not confidently match to a known individual. The flag lets the persons directory surface uncertainty honestly rather than fabricate a confident identity; it defaults to `false` and is set `true` only by the importer. +_Not to be confused with `family_member`_ — `provisional` expresses import confidence, while `family_member` is a genealogical fact about whether the person belongs to the family tree. 
+ --- ## Document-Related Terms @@ -36,6 +41,10 @@ _See also [TranscriptionBlock](#transcriptionblock-transcriptionblock)._ **Document** (`Document`) — a single archival item (letter, postcard, photograph) with a file stored in MinIO/S3 and associated metadata (sender, receivers, date, tags, transcription blocks). +**date precision** (`Document.metaDatePrecision`, enum `DatePrecision`) — how exactly a document's date is known, one of `DAY, MONTH, SEASON, YEAR, RANGE, APPROX, UNKNOWN`. A verbatim mirror of the import normalizer's `Precision` enum so honest dates can be rendered (`APPROX` → "ca.", `RANGE` uses `meta_date_end`) instead of fabricating a false `DAY`-level date. `UNKNOWN` is the explicit value for undated documents. + +**raw attribution** (`Document.senderText`, `Document.receiverText`, `Document.metaDateRaw`) — the original spreadsheet cell text for a document's sender, receiver, and date, preserved verbatim even after a `Person` or normalized date is linked. It keeps provenance intact and enables an "as written in the original" view. + **DocumentVersion** (`DocumentVersion`) — an append-only snapshot of a `Document`'s metadata at a point in time. Append-only by convention; no consumer-facing create or update endpoint exists. The entity uses Lombok `@Data` (which generates setters), so immutability is enforced by application convention, not at the Java level. **Tag** (`Tag`) — a hierarchical category that can be applied to `Document`s. Tags are self-referencing via a `parent_id` foreign key, forming a tree structure. @@ -55,9 +64,13 @@ _See also [Annotation](#annotation-documentannotation)._ - `REVIEWED`: a reviewer has approved the transcription. - `ARCHIVED`: the document is finalized and read-only. -**Mass import** — an asynchronous batch process (`MassImportService`) that reads an Excel or ODS file and creates `Person`s, `Tag`s, and `PLACEHOLDER` `Document`s in one shot. 
Only one import can run at a time (`IMPORT_ALREADY_RUNNING` error if attempted concurrently). +**Canonical import** — an asynchronous batch process (`CanonicalImportOrchestrator`) that consumes the normalizer's canonical artifacts and creates `Tag`s, `Person`s (register + tree), family relationships, and `Document`s. Four idempotent loaders run in a fixed dependency order — `TagTreeImporter` → `PersonRegisterImporter` → `PersonTreeImporter` → `DocumentImporter` — each calling the owning domain's service. Re-running it never duplicates rows (upsert by `source_ref` / document `index`) and never overwrites a human-edited person or tag scalar field; a document's sender/receivers/tags are canonical-authoritative and are re-applied to match the export. Only one import can run at a time (`IMPORT_ALREADY_RUNNING` error if attempted concurrently); a missing or malformed artifact fails closed (`IMPORT_ARTIFACT_INVALID`). Replaced the legacy raw-spreadsheet `MassImportService` (see ADR-025). -**SkippedFile** (`MassImportService.SkippedFile`) — a file that was presented for import but not processed, recorded with a `filename` and a `reason` code. Possible reasons: `INVALID_PDF_SIGNATURE` (magic-byte validation failed), `S3_UPLOAD_FAILED` (file upload to MinIO/S3 threw an exception), `FILE_READ_ERROR` (the file could not be opened for reading), or `ALREADY_EXISTS` (a document with the same filename already exists in the archive with a status other than `PLACEHOLDER`). +**canonical artifact** — one of the four files the normalizer (`tools/import-normalizer/`) emits to `tools/import-normalizer/out/`: `canonical-tag-tree.xlsx`, `canonical-persons.xlsx`, `canonical-persons-tree.json`, `canonical-documents.xlsx`. They are generated locally and synced to the ops host rather than committed to git, because they contain PII (see ADR-025). They are the contract the backend importer reads (mapped by header name); the semantic transformation (German-date parsing, name classification) lives only in the normalizer, never in Java.
+ +**CanonicalSheetReader** — the value-level POI helper that opens a canonical `.xlsx`, maps the header row to column indices by name (replacing the brittle positional column config), splits pipe-delimited list columns, and throws `IMPORT_ARTIFACT_INVALID` on a missing required header rather than NPE-ing on a null index. + +**SkippedFile** (`ImportStatus.SkippedFile`) — a file that was presented for import but not processed, recorded with a `filename` and a `reason` code. Possible reasons: `INVALID_FILENAME_PATH_TRAVERSAL` (the file-column basename failed the path-traversal guard), `INVALID_PDF_SIGNATURE` (magic-byte validation failed), `S3_UPLOAD_FAILED` (file upload to MinIO/S3 threw an exception), `FILE_READ_ERROR` (the file could not be opened for reading), or `ALREADY_EXISTS` (a document with the same `index` already exists in the archive with a status other than `PLACEHOLDER`). **skipped count** — the total number of `SkippedFile` entries accumulated during a single import run (`ImportStatus.skipped()`). Shown in the amber warning section of the Import Status Card in the admin UI; a value of zero suppresses the section entirely. diff --git a/docs/TODO-backend.md b/docs/TODO-backend.md index 7b03f802..e09f47e9 100644 --- a/docs/TODO-backend.md +++ b/docs/TODO-backend.md @@ -94,17 +94,6 @@ The schema includes `spring_session` and `spring_session_attributes` tables, but --- -### `MassImportService` provides no status or error feedback -**File:** `service/MassImportService.java`, `controller/AdminController.java` - -`/api/admin/trigger-import` returns immediately (async), but there is no way for the admin to know whether the import succeeded, failed, or is still running. Errors during async execution are silently swallowed. 
- -**Fix options:** -- Store import job status in a DB table (`import_jobs`) with state (`RUNNING`, `DONE`, `FAILED`) and expose a `GET /api/admin/import-status` endpoint -- Alternatively, make the endpoint synchronous since it already blocks on file I/O — only use async if you need true non-blocking behaviour - ---- - ## Missing Capabilities ### No test coverage @@ -114,7 +103,7 @@ The only test is a Spring context load test. No unit or integration tests exist **Suggested starting points (highest value for effort):** 1. `DocumentSpecifications` — pure logic, easy to unit test with an in-memory H2 or Testcontainers PostgreSQL -2. `ExcelService` — parsing logic, test with fixture `.xlsx` files (one exists in `api_tests/`) +2. Canonical import loaders (`CanonicalSheetReader`, `DocumentImporter`, etc.) — parsing/upsert logic, test with fixture canonical `.xlsx` files 3. `PermissionAspect` — security logic should be tested; use `@WithMockUser` from Spring Security Test --- diff --git a/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md b/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md new file mode 100644 index 00000000..0c3bee4b --- /dev/null +++ b/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md @@ -0,0 +1,201 @@ +# ADR-025 — Canonical Import Output as Contract & Single-Migration Schema Foundation + +**Date:** 2026-05-27 +**Status:** Accepted +**Issue:** #671 (schema, decisions 1–2); #669 (importer architecture, decision 3) +**Milestone:** Handling the Unknowns — honest uncertainty in dates & people + +--- + +## Context + +The "Handling the Unknowns" milestone introduces honest uncertainty into the archive: +documents whose dates are known only approximately or as a range, and people the importer +infers from raw attribution text but cannot confidently identify. 
Three sibling issues — +date precision (#666), name triage (#665), and the importer (#669) — each independently +planned a Flyway `V69` migration that altered `persons`. Three `V69`s is a boot failure +(Flyway versions must be unique), and `persons.provisional` was at risk of being defined +twice. + +Two durable decisions had to be made before any application code in Phases 3–6 could +compile against the new schema. + +--- + +## Decision + +### 1. All import/precision/attribution/identity schema lives in ONE migration with a single owner + +`V69__import_precision_attribution_identity_schema.sql` adds every new column for this +milestone in a single, atomic, forward-only migration: + +- `documents`: `meta_date_precision` (backfilled `DAY` where dated / `UNKNOWN` where not, + then `NOT NULL`), `meta_date_end`, `meta_date_raw`, `sender_text`, `receiver_text`. +- `persons`: `source_ref` (unique index, nullable), `provisional` (`NOT NULL DEFAULT false`). +- `tag`: `source_ref` (unique index, nullable). + +Integrity is pushed to the database as fail-closed `CHECK` constraints (the precedent is +`V22`'s `person_type` allowlist): + +- `meta_date_precision` must be one of the seven enum values. +- `meta_date_end` may be non-null **only** when precision = `RANGE` (one-directional, not + biconditional — see Consequences). +- `meta_date_end >= meta_date` for ranges with both endpoints (a `CHECK`, not a trigger). +- `meta_date_raw`, `sender_text`, `receiver_text` are length-capped at 10 000 (mirrors the + `transcription_blocks` cap in `V18`). + +No sibling issue adds another migration that alters `persons` or `documents` in this +milestone. + +### 2. The backend `DatePrecision` enum is a verbatim mirror of the normalizer's `Precision`; the canonical output is the contract + +The importer reads the Python normalizer's canonical output +(`tools/import-normalizer/`). 
The backend `DatePrecision` enum +(`DAY, MONTH, SEASON, YEAR, RANGE, APPROX, UNKNOWN`) is a verbatim copy of the normalizer's +`Precision(StrEnum)` (`dates.py`). There is **no translation layer**: the normalizer's +output strings are persisted as-is. The same applies to `source_ref`, which carries the +normalizer's `person_id` / canonical `tag_path` unchanged as the re-import idempotency key. + +### 3. The importer is four idempotent loaders over the canonical artifacts; Java no longer parses the raw spreadsheet (Phase 3, #669) + +The legacy `MassImportService` read the *raw* original spreadsheet by positional column +index (`@Value app.import.col.*`) and re-derived everything in Java (ISO-only date parsing, +name classification via `findOrCreateByAlias`, an ODS/XXE XML path). It is **deleted**. + +The rebuild is a `CanonicalImportOrchestrator` driving four single-responsibility loaders in +an explicit dependency DAG — `TagTreeImporter` → `PersonRegisterImporter` → +`PersonTreeImporter` → `DocumentImporter` — that **consume the canonical artifacts produced +by the offline Python normalizer** (`tools/import-normalizer/out/`, synced onto the ops host +alongside the PDFs — see "Canonical artifacts are produced locally, NOT version-controlled" +below). A shared `CanonicalSheetReader` maps columns **by header +name** (not by index) and fails closed (`IMPORT_ARTIFACT_INVALID`) on a missing header. Each +loader calls the **owning domain's service**, never a repository (layering rule); the tree +loader uses `RelationshipService`, never the relationship repository. + +Settled sub-decisions: + +- **Idempotency precedence is domain-specific.** Persons/tags upsert by `source_ref`, + documents by `index`. 
Two distinct rules apply: + - **Person/Tag scalar fields = preserve human edits.** On re-import a non-blank field a human + changed in-app is never overwritten (blank fields are filled from canonical via the single + `preferHuman` idiom), and `provisional` is monotonic-downward — once a human confirms a + person (`false`) it never reverts to `true`. Because the orchestrator loads the register and + tree *before* documents, a person already `false` can never be flipped provisional by a + later document row that references the same `source_ref`, regardless of document-row order. + - **Document sender/receivers/tags = canonical-authoritative.** A document's sender, receiver + set, and tag set are owned by the canonical row, not the archivist. On re-import of a + PLACEHOLDER document `DocumentImporter` clears and re-populates `receivers`/`tags` so a row + whose set *shrinks* prunes the removed links rather than accumulating stale ones. The + "preserve human edits" rule above does **not** extend to these collections. The raw + `sender_text`/`receiver_text` cells are always retained verbatim (a separate invariant). + Note non-PLACEHOLDER documents are skipped entirely (`ALREADY_EXISTS`), so once a document + has a file the importer never touches it again — this bounds the authoritative-overwrite + blast radius to placeholder rows. + Verified against real Postgres in `CanonicalImportIntegrationTest` + (`reimport_preservesHumanEditedPersonField`, `reimport_prunesRemovedReceiverAndTag…`, + `import_neverFlipsRegisterPersonToProvisional…`). +- **Name policy = Option A.** The normalizer resolved attribution upstream: the document sheet + carries the resolved slug in `sender_person_id` / `receiver_person_ids` and the raw cell in + `sender_name` / `receiver_names`. 
The importer routes register-first by `source_ref` + (provisional `Person` when a slug is unmatched), and **always retains the raw cell** in + `sender_text` / `receiver_text` even when a person is linked — the load-bearing invariant + behind the merge story. A row with no slug but raw text (prose / `?` / object-noise) links + no person and keeps only the raw text. +- **`provisional` is now populated.** Importer-minted persons are `provisional = true`; + register and tree persons stay `false`. This is the Phase-3 contract the schema (decision 1) + left at default-`false`. +- **PDFs resolve directly by index (`<index>.pdf`), not by a `file` column.** The corpus is + uniform — all PDFs are named `<index>.pdf` flat in the import dir (e.g. `W-0124.pdf`, + `Mü-0001.pdf`) — so `DocumentImporter` resolves a document's PDF with an O(1) + `importDir.resolve(index + ".pdf")` lookup. The redundant `file` column (carrying the + spreadsheet's messy `datei` value) and the recursive directory walk that resolved it were + removed (#686, which also closed #676 — the O(rows×tree) walk is gone). The normalizer no + longer emits `file` or the `index_file_mismatch` review flag. +- **Security guards are defense-in-depth, not upstream-trust.** The `index` is the only thing + that drives the on-disk lookup, so it is treated as hostile (CWE-22 does not care it came from + our tool): `isValidImportIndex` rejects slash/backslash, three Unicode slash homoglyphs, any + `.` (so `.pdf` is the only extension and `..` can never appear), null byte, and + absolute paths, and requires a strict catalog shape (1–4 Latin letters incl. umlauts, one or + more hyphens, digits, optional trailing `x`). A bad index skips the row with a clear + `SkipReason` (`INVALID_FILENAME_PATH_TRAVERSAL`). The resolved canonical path is still asserted + to stay inside the import dir as a second line of defense (a symlinked `<index>.pdf` cannot + escape), and the `%PDF` magic-byte check still gates upload.
These guards and their tests were + ported from the file-column resolution (originally from `MassImportService`). + +--- + +## Consequences + +- **RANGE is one-directional, not biconditional.** A `RANGE` row may have a null + `meta_date_end` (an open-ended range with only a start), because the normalizer can emit + start-only ranges. A biconditional `RANGE ⟺ end IS NOT NULL` rule would reject valid + normalizer output, so it was rejected. Phase 4 rendering must handle a `RANGE` with no end + gracefully. +- **`provisional` stays `false` throughout this phase.** The column and flag exist, but no + code path sets it `true`; the importer (Phase 3) is the only writer. This is intentional, + not a half-built feature. +- **A future dev must not "improve" the enum.** Renaming or dropping a `DatePrecision` value + without changing the normalizer silently breaks import idempotency and date rendering. The + enum's Javadoc states this; the DB `CHECK` enforces validity independent of the Java enum. +- **`source_ref` is unique + nullable.** Manually created persons/tags have `source_ref = + NULL`; Postgres allows multiple NULLs under a plain unique index, so no backfill is needed. +- **Forward-only.** The migration is immutable once shipped (Flyway checksum model); any fix + goes in a later version. There is no down-migration — rollback means restoring from the + nightly `pg_dump`, the standard procedure. +- **`runImport()` is non-transactional — per-loader transactions only.** The orchestrator + does not wrap the four loaders in a single transaction; each loader (or the per-call + `upsertBySourceRef` / `DocumentImporter.load`) carries its own `@Transactional` boundary. A + partial failure mid-run (e.g. the document loader throws after tags + persons committed) + leaves the earlier loaders' data committed and the `ImportStatus` set to `FAILED`. 
This is + acceptable precisely because the import is idempotent: re-running is safe and converges to + the same state, so the operational recovery for a partial failure is simply to fix the + offending artifact and re-trigger the import — no manual cleanup of half-written data is + required. A future maintainer must not assume all-or-nothing semantics. +- **The index pattern is corpus-specific and must be revisited if the catalog scheme grows.** + `INDEX_PATTERN` accepts only the *current* corpus shape — at most four Latin-1 letters (incl. + umlauts) followed by one or more hyphens, ASCII digits, and an optional trailing `x`. This is a + conscious constraint, not a general filename validator: a future sub-collection catalogued with + a 5-letter prefix, a digit-led id, or a non-Latin-1 letter (e.g. `Č` or a Cyrillic id) would + fail `isValidImportIndex` and its rows would be **skipped** (`INVALID_FILENAME_PATH_TRAVERSAL`), + not imported. Likewise a real PDF whose filename does not follow the `<index>.pdf` convention is never matched, so its document stays a `PLACEHOLDER` + (the importer logs both cases distinctly — see #686). If the catalog scheme ever changes, the + pattern and its tests must be widened deliberately; do not loosen it casually, as it is the + allowlist that keeps the on-disk lookup safe. Note `\d` is intentionally ASCII-only — adding + `Pattern.UNICODE_CHARACTER_CLASS` would silently widen the accepted digit set. +- **A malicious/garbage index skips its row with a loud `SkipReason`, by design.** Since #686 + the index is the only on-disk lookup key. An index that fails `isValidImportIndex` + (path separator, traversal token, slash homoglyph, null byte, absolute path, or a non-catalog + shape) is recorded as a `SkippedFile` with reason `INVALID_FILENAME_PATH_TRAVERSAL` and the + import continues with the remaining rows — nothing outside the import dir is ever read.
A + symlinked `<index>.pdf` whose canonical path escapes the import dir is the one case that still + aborts the import (a `DomainException` from the containment assertion), because a syntactically + valid index resolving outside the dir is an environment-level attack signal, not a row typo. +- **`PersonSummaryDTO` coupling.** `provisional` was added to the `PersonSummaryDTO` native + interface projection; because the projection is backed by native SQL, the column had to be + added to all three native `SELECT`s (`findAllWithDocumentCount`, `searchWithDocumentCount`, + `findTopByDocumentCount`) or it would silently return `false`. Guarded by integration tests + against real Postgres. + +--- + +## Canonical artifacts are produced locally, NOT version-controlled + +The four files in `tools/import-normalizer/out/` — +`canonical-documents.xlsx`, `canonical-persons.xlsx`, `canonical-tag-tree.xlsx`, +`canonical-persons-tree.json` — contain real family PII (names, addresses, attribution +prose) and are **deliberately excluded from the git index** via +`tools/import-normalizer/.gitignore`. They are regenerated locally from the source +spreadsheet by running the Python normalizer, and synced into the ops host's +`IMPORT_HOST_DIR` out-of-band (alongside the `<index>.pdf` corpus) — the same mechanism +that delivers the PDFs. + +The contract between normalizer and importer is the **header schema** (column names, +their types, the `Precision` enum strings, the slug shape) — not the file contents. +`CanonicalSheetReader` maps columns by header name and fails closed +(`IMPORT_ARTIFACT_INVALID`) on a missing header, which is what locks the contract; the +file-level golden fixtures stay outside the repo. + +A future maintainer must not "fix" CI by checking these artifacts back in — they are +PII, the regression that prompted this rule. Tests use small synthetic fixtures +constructed in-process (`DocumentImporterTest`, `CanonicalImportIntegrationTest`) rather +than real-corpus snapshots.
diff --git a/docs/architecture/c4-diagrams.md b/docs/architecture/c4-diagrams.md index 858082aa..d01cdfaa 100644 --- a/docs/architecture/c4-diagrams.md +++ b/docs/architecture/c4-diagrams.md @@ -93,7 +93,7 @@ C4Component ### 3b — Document Management & Import -Document management, file storage, and bulk Excel/ODS import. +Document management, file storage, and the canonical import. ```mermaid C4Component @@ -105,12 +105,11 @@ C4Component System_Boundary(backend, "API Backend (Spring Boot)") { Component(docCtrl, "DocumentController", "Spring MVC — /api/documents", "CRUD for documents: search, get by ID, update metadata, upload/download file, conversation thread, and batch metadata updates.") - Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers asynchronous Excel/ODS mass import (requires ADMIN permission). Reports import state (IDLE/RUNNING/DONE/FAILED).") + Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers the asynchronous canonical import (requires ADMIN permission). Reports import state via GET /api/admin/import-status (IDLE/RUNNING/DONE/FAILED).") Component(docSvc, "DocumentService", "Spring Service", "Core document business logic: store, update, search. Resolves persons and tags, delegates file I/O to FileService, builds dynamic JPA Specifications, and integrates with audit logging.") Component(fileSvc, "FileService", "Spring Service", "Wraps AWS SDK v2 S3Client. Uploads files with UUID-keyed paths, computes SHA-256 hash, downloads with content-type detection, and generates presigned URLs for OCR access.") - Component(massImport, "MassImportService", "Spring Service — @Async", "Reads Excel/ODS files from /import mount. Tracks import state (IDLE/RUNNING/DONE/FAILED) and delegates to ExcelService. Returns immediately; processing runs asynchronously.") - Component(excelSvc, "ExcelService", "Spring Service", "Parses Excel/ODS workbooks (Apache POI). Column indices configurable via application.properties. 
Creates/updates document records per row.") + Component(importOrch, "CanonicalImportOrchestrator", "Spring Service — @Async", "Runs four idempotent loaders (TagTree → PersonRegister → PersonTree → Document) in a fixed DAG over the normalizer's canonical artifacts (canonical-*.xlsx + canonical-persons-tree.json) from /import — see diagram 3b. Owns the IDLE/RUNNING/DONE/FAILED state machine.") Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.") Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.") @@ -123,14 +122,15 @@ C4Component Rel(frontend, docCtrl, "Document requests", "HTTP / JSON") Rel(frontend, adminCtrl, "Trigger import", "HTTP / JSON") Rel(docCtrl, docSvc, "Delegates to", "") - Rel(adminCtrl, massImport, "Triggers", "") + Rel(adminCtrl, importOrch, "Triggers", "") Rel(docSvc, fileSvc, "Upload / download files", "") Rel(docSvc, docRepo, "Reads / writes documents", "") Rel(docSvc, docSpec, "Builds search predicates", "") Rel(docSvc, personSvc, "Resolves sender / receivers", "") Rel(docSvc, tagSvc, "Finds or creates tags", "") - Rel(massImport, excelSvc, "Parses Excel/ODS file", "") - Rel(excelSvc, docSvc, "Creates / updates documents", "") + Rel(importOrch, docSvc, "Upserts documents (PDF by index) — see 3b", "") + Rel(importOrch, personSvc, "Upserts persons + relationships", "") + Rel(importOrch, tagSvc, "Upserts tag hierarchy", "") Rel(minioConf, fileSvc, "Provides S3Client and S3Presigner beans", "") Rel(fileSvc, minio, "PUT / GET / presigned URL objects", "S3 API / HTTP") Rel(docRepo, db, "SQL queries", "JDBC") @@ -492,7 +492,7 @@ C4Component Component(adminGroups, "/admin/groups, /admin/groups/[id],
/admin/groups/new", "SvelteKit Routes", "Permission group management: create/edit groups and their permission sets.") Component(adminTags, "/admin/tags and /admin/tags/[id]", "SvelteKit Routes", "Tag administration: edit tag hierarchy, merge tags, delete subtrees.") Component(adminOcr, "/admin/ocr and /admin/ocr/[personId]", "SvelteKit Routes", "Global and per-person OCR configuration. Manages script types and triggers sender model training.") - Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers Excel/ODS mass import (POST /api/admin/trigger-import). Displays import state.") + Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers the canonical import (POST /api/admin/trigger-import). Displays import state.") Component(hilfe, "/hilfe/transkription", "SvelteKit Route", "Static transcription style guide for Kurrent and Sütterlin character recognition. No backend calls.") } diff --git a/docs/architecture/c4/l3-backend-3b-document-management.puml b/docs/architecture/c4/l3-backend-3b-document-management.puml index a15eb00b..ac2f0208 100644 --- a/docs/architecture/c4/l3-backend-3b-document-management.puml +++ b/docs/architecture/c4/l3-backend-3b-document-management.puml @@ -1,7 +1,7 @@ @startuml !include -title Component Diagram: API Backend — Document Management & Import +title Component Diagram: API Backend — Document Management & Canonical Import Container(frontend, "Web Frontend", "SvelteKit") ContainerDb(db, "PostgreSQL", "PostgreSQL 16") @@ -9,30 +9,50 @@ ContainerDb(minio, "Object Storage", "MinIO (S3-compatible)") System_Boundary(backend, "API Backend (Spring Boot)") { Component(docCtrl, "DocumentController", "Spring MVC — /api/documents", "CRUD for documents: search, get by ID, update metadata, upload/download file, conversation thread, batch metadata updates, and per-month density aggregation for the timeline filter widget.") - Component(adminCtrl, "AdminController", "Spring MVC — 
/api/admin", "Triggers asynchronous Excel/ODS mass import (requires ADMIN permission). Reports import state (IDLE/RUNNING/DONE/FAILED).") + Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers the asynchronous canonical import (requires ADMIN permission). Reports import state (IDLE/RUNNING/DONE/FAILED).") Component(docSvc, "DocumentService", "Spring Service", "Core document business logic: store, update, search. Resolves persons and tags, delegates file I/O to FileService, builds dynamic JPA Specifications, and integrates with audit logging.") Component(fileSvc, "FileService", "Spring Service", "Wraps AWS SDK v2 S3Client. Uploads files with UUID-keyed paths, computes SHA-256 hash, downloads with content-type detection, and generates presigned URLs for OCR access.") - Component(massImport, "MassImportService", "Spring Service — @Async", "Reads Excel/ODS files from /import mount. Tracks import state (IDLE/RUNNING/DONE/FAILED) and delegates to ExcelService. Returns immediately; processing runs asynchronously.") - Component(excelSvc, "ExcelService", "Spring Service", "Parses Excel/ODS workbooks (Apache POI). Column indices configurable via application.properties. Creates/updates document records per row.") + Component(importOrch, "CanonicalImportOrchestrator", "Spring Service — @Async", "Runs the four canonical loaders in an explicit dependency DAG (TagTree → PersonRegister → PersonTree → Document). 
Smoke-checks all four artifacts before starting, owns the IDLE/RUNNING/DONE/FAILED state machine, fails closed on a malformed artifact.") + Component(tagTreeLoader, "TagTreeImporter", "Spring Component", "Upserts the tag hierarchy from canonical-tag-tree.xlsx via TagService (by canonical tag_path).") + Component(personRegLoader, "PersonRegisterImporter", "Spring Component", "Upserts register persons from canonical-persons.xlsx via PersonService (by normalizer person_id).") + Component(personTreeLoader, "PersonTreeImporter", "Spring Component", "Upserts tree persons + relationships from canonical-persons-tree.json via PersonService and RelationshipService.") + Component(docLoader, "DocumentImporter", "Spring Component", "Loads canonical-documents.xlsx: routes attribution register-first (raw cell always retained in sender_text/receiver_text), parses clean dates, builds an honest precision-aware title via DocumentTitleFormatter, keeps the S3 upload + thumbnail plumbing, and resolves each PDF by index (importDir/<index>.pdf) guarded by strict index validation + canonical-path containment + %PDF magic-byte check (no recursive walk).") + Component(titleFmt, "DocumentTitleFormatter", "Pure helper", "Formats the date label baked into an import title at exactly the data's precision (MONTH -> 'Juni 1916', never a fabricated day). Mirrors the frontend formatDocumentDate; both are pinned to docs/date-label-fixtures.json (#666).") + Component(sheetReader, "CanonicalSheetReader", "POI helper", "Maps a canonical .xlsx by header name (no positional indices), splits pipe-delimited list columns, fails closed (IMPORT_ARTIFACT_INVALID) on a missing required header.") Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO.
Validates MinIO connectivity on startup.") Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.") Component(docSpec, "DocumentSpecifications", "JPA Criteria API", "Factory for composable predicates: hasText (full-text), hasSender, hasReceiver, isBetween (date range), hasTags (subquery AND/OR logic).") } -Component(personSvc, "PersonService", "Spring Service", "See diagram 3e. Called by DocumentService to resolve sender / receiver persons by ID.") -Component(tagSvc, "TagService", "Spring Service", "See diagram 3d. Called by DocumentService to find or create tags by name.") +Component(personSvc, "PersonService", "Spring Service", "See diagram 3e. Resolves sender / receiver persons by ID; upserts persons by source_ref for the importer.") +Component(tagSvc, "TagService", "Spring Service", "See diagram 3d. Finds or creates tags by name; upserts tags by source_ref for the importer.") +Component(relSvc, "RelationshipService", "Spring Service", "See diagram 3e. Creates family relationships from the person tree during import.") Rel(frontend, docCtrl, "Document requests", "HTTP / JSON") Rel(frontend, adminCtrl, "Trigger import", "HTTP / JSON") Rel(docCtrl, docSvc, "Delegates to") -Rel(adminCtrl, massImport, "Triggers") +Rel(adminCtrl, importOrch, "Triggers") Rel(docSvc, fileSvc, "Upload / download files") Rel(docSvc, docRepo, "Reads / writes documents") Rel(docSvc, docSpec, "Builds search predicates") Rel(docSvc, personSvc, "Resolves sender / receivers") Rel(docSvc, tagSvc, "Finds or creates tags") -Rel(massImport, excelSvc, "Parses Excel/ODS file") -Rel(excelSvc, docSvc, "Creates / updates documents") +Rel(importOrch, tagTreeLoader, "1. Loads tags") +Rel(importOrch, personRegLoader, "2. Loads register persons") +Rel(importOrch, personTreeLoader, "3. 
Loads tree persons + relationships") +Rel(importOrch, docLoader, "4. Loads documents") +Rel(tagTreeLoader, sheetReader, "Reads canonical .xlsx") +Rel(personRegLoader, sheetReader, "Reads canonical .xlsx") +Rel(docLoader, sheetReader, "Reads canonical .xlsx") +Rel(docLoader, titleFmt, "Builds honest title date") +Rel(tagTreeLoader, tagSvc, "Upserts tags by source_ref") +Rel(personRegLoader, personSvc, "Upserts persons by source_ref") +Rel(personTreeLoader, personSvc, "Upserts persons by source_ref") +Rel(personTreeLoader, relSvc, "Creates relationships") +Rel(docLoader, docSvc, "Upserts documents by index") +Rel(docLoader, personSvc, "Register-first match / provisional person") +Rel(docLoader, tagSvc, "Attaches tag by source_ref") +Rel(docLoader, fileSvc, "Uploads resolved file") Rel(minioConf, fileSvc, "Provides S3Client and S3Presigner beans") Rel(fileSvc, minio, "PUT / GET / presigned URL objects", "S3 API / HTTP") Rel(docRepo, db, "SQL queries", "JDBC") diff --git a/docs/architecture/c4/l3-backend-3e-persons.puml b/docs/architecture/c4/l3-backend-3e-persons.puml index 47c884aa..f424168d 100644 --- a/docs/architecture/c4/l3-backend-3e-persons.puml +++ b/docs/architecture/c4/l3-backend-3e-persons.puml @@ -7,12 +7,12 @@ Container(frontend, "Web Frontend", "SvelteKit") ContainerDb(db, "PostgreSQL", "PostgreSQL 16") System_Boundary(backend, "API Backend (Spring Boot)") { - Component(personCtrl, "PersonController", "Spring MVC — /api/persons", "Lists and searches family members. Returns documents sent by or received by a person, correspondent suggestions, and person summary with document counts.") + Component(personCtrl, "PersonController", "Spring MVC — /api/persons", "Filtered, paginated directory (type/familyOnly/hasDocuments/provisional + page/size -> PersonSearchResult). Returns documents sent/received, correspondent suggestions, person summaries with counts. 
PATCH /{id}/confirm clears provisional; DELETE /{id} removes a person (both WRITE_ALL).") Component(relCtrl, "RelationshipController", "Spring MVC — /api/network, /api/persons/{id}/relationships", "CRUD for explicit person relationships and the full family network graph (nodes + edges) used by the Stammbaum view.") - Component(personSvc, "PersonService", "Spring Service", "Person CRUD, alias management, and merge operations (reassigns all document sender/receiver references before deleting duplicate persons).") + Component(personSvc, "PersonService", "Spring Service", "Person CRUD, alias management, filtered paged search (PersonFilter -> paired slice/count), confirm (clears provisional), delete (detaches document refs first), and merge operations (reassigns all document sender/receiver references before deleting duplicate persons).") Component(relSvc, "RelationshipService", "Spring Service", "Manages explicit directional family relationships (PARENT_OF, SPOUSE_OF, SIBLING_OF, etc.) with optional date ranges and notes.") Component(relInference, "RelationshipInferenceService", "Spring Service", "Computes transitive family relationships from explicit edges to infer grandparent/grandchild, aunt/uncle, and other extended-family links for the network graph.") - Component(personRepo, "PersonRepository", "Spring Data JPA", "Queries persons with name search (including aliases), correspondent discovery, person summaries with document counts, and merge/reassignment helpers.") + Component(personRepo, "PersonRepository", "Spring Data JPA", "Queries persons with name search (including aliases), correspondent discovery, person summaries with document counts, paired filter-aware slice + COUNT queries (one shared WHERE clause), and merge/reassignment helpers.") Component(relRepo, "PersonRelationshipRepository", "Spring Data JPA", "Reads and writes PersonRelationship records. 
Supports lookup by person ID, by relation type, and existence checks for deduplication.") } diff --git a/docs/architecture/c4/l3-frontend-3c-people-stories.puml b/docs/architecture/c4/l3-frontend-3c-people-stories.puml index abfbea5e..b64539ab 100644 --- a/docs/architecture/c4/l3-frontend-3c-people-stories.puml +++ b/docs/architecture/c4/l3-frontend-3c-people-stories.puml @@ -7,8 +7,9 @@ Person(user, "User") Container(backend, "API Backend", "Spring Boot") System_Boundary(frontend, "Web Frontend (SvelteKit / SSR)") { - Component(personsPage, "/persons and /persons/[id]", "SvelteKit Routes", "Person directory and detail. Detail: metadata, document list sent/received, correspondents, explicit and inferred family relationships.") + Component(personsPage, "/persons and /persons/[id]", "SvelteKit Routes", "Person directory (server-side filtered + paginated) and detail. Directory: type/family/has-documents chips, reader default (familyMember OR documentCount > 0), writer-only show-all toggle. Detail: metadata, document list sent/received, correspondents, family relationships.") Component(personEdit, "/persons/[id]/edit and /persons/new", "SvelteKit Routes", "Create and edit person forms. Edit: metadata, aliases, explicit relationships. Actions: PUT/POST /api/persons.") + Component(personReview, "/persons/review", "SvelteKit Route", "Transcriber triage view (WRITE-gated link). Lists provisional persons; per-row Merge / Umbenennen / Bestätigen / Löschen. Actions: POST /merge, PUT /{id}, PATCH /{id}/confirm, DELETE /{id}.") Component(briefwechsel, "/briefwechsel", "SvelteKit Route", "Bilateral conversation timeline. Selects two persons via PersonTypeahead, fetches GET /api/documents/conversation, displays chronological exchange.") Component(aktivitaeten, "/aktivitaeten", "SvelteKit Route", "Unified activity feed (Chronik). 
Loader: GET /api/dashboard/activity and GET /api/notifications?read=false.") Component(geschichten, "/geschichten and /geschichten/[id]", "SvelteKit Routes", "Story list and detail pages. Loader: GET /api/geschichten?status=PUBLISHED.") @@ -20,8 +21,9 @@ System_Boundary(frontend, "Web Frontend (SvelteKit / SSR)") { } Rel(user, personsPage, "Browses family members", "HTTPS / Browser") -Rel(personsPage, backend, "GET /api/persons, GET /api/persons/{id}", "HTTP / JSON") +Rel(personsPage, backend, "GET /api/persons (filter + page params -> PersonSearchResult), GET /api/persons/{id}", "HTTP / JSON") Rel(personEdit, backend, "GET /api/persons/{id}, PUT /api/persons/{id}, POST /api/persons", "HTTP / JSON") +Rel(personReview, backend, "GET /api/persons?provisional=true, PATCH /api/persons/{id}/confirm, DELETE /api/persons/{id}, POST /api/persons/{id}/merge", "HTTP / JSON") Rel(briefwechsel, backend, "GET /api/documents/conversation", "HTTP / JSON") Rel(aktivitaeten, backend, "GET /api/dashboard/activity, GET /api/notifications", "HTTP / JSON") Rel(geschichten, backend, "GET /api/geschichten", "HTTP / JSON") diff --git a/docs/architecture/c4/l3-frontend-3d-administration.puml b/docs/architecture/c4/l3-frontend-3d-administration.puml index 3f7c89ef..5b711b3a 100644 --- a/docs/architecture/c4/l3-frontend-3d-administration.puml +++ b/docs/architecture/c4/l3-frontend-3d-administration.puml @@ -12,7 +12,7 @@ System_Boundary(frontend, "Web Frontend (SvelteKit / SSR)") { Component(adminGroups, "/admin/groups, /admin/groups/[id], /admin/groups/new", "SvelteKit Routes", "Permission group management: create/edit groups and their permission sets.") Component(adminTags, "/admin/tags and /admin/tags/[id]", "SvelteKit Routes", "Tag administration: edit tag hierarchy, merge tags, delete subtrees.") Component(adminOcr, "/admin/ocr and /admin/ocr/[personId]", "SvelteKit Routes", "Global and per-person OCR configuration. 
Manages script types and triggers sender model training.") - Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers Excel/ODS mass import (POST /api/admin/trigger-import). Displays import state.") + Component(adminSystem, "/admin/system", "SvelteKit Route", "System status panel. Triggers the canonical import (POST /api/admin/trigger-import). Displays import state.") Component(hilfe, "/hilfe/transkription", "SvelteKit Route", "Static transcription style guide for Kurrent and Sütterlin character recognition. No backend calls.") } diff --git a/docs/architecture/db/db-orm.puml b/docs/architecture/db/db-orm.puml index a6e64aa3..7b03c156 100644 --- a/docs/architecture/db/db-orm.puml +++ b/docs/architecture/db/db-orm.puml @@ -1,6 +1,6 @@ @startuml db-orm -' Schema source: Flyway V1–V60 (excl. V37, V43 — intentionally removed) -' Schema as of: V60 (2026-05-06) +' Schema source: Flyway V1–V69 (excl. V37, V43 — intentionally removed) +' Schema as of: V69 (2026-05-27) ' ⚠ This is a versioned snapshot. Update when the schema changes significantly. 
hide circle @@ -88,6 +88,11 @@ package "Documents" { summary : TEXT transcription : TEXT meta_date : DATE + meta_date_precision : VARCHAR(16) NOT NULL + meta_date_end : DATE + meta_date_raw : TEXT + sender_text : TEXT + receiver_text : TEXT meta_location : VARCHAR(255) meta_document_location : VARCHAR(255) archive_box : VARCHAR(255) @@ -182,6 +187,8 @@ package "Persons" { birth_year : INTEGER death_year : INTEGER family_member : BOOLEAN NOT NULL + source_ref : VARCHAR(255) UNIQUE + provisional : BOOLEAN NOT NULL } entity person_name_aliases { @@ -217,6 +224,7 @@ package "Tags" { name : VARCHAR(255) NOT NULL UNIQUE parent_id : UUID <> color : VARCHAR(20) + source_ref : VARCHAR(255) UNIQUE } } diff --git a/docs/architecture/db/db-relationships.puml b/docs/architecture/db/db-relationships.puml index c3100cfa..d6f4b542 100644 --- a/docs/architecture/db/db-relationships.puml +++ b/docs/architecture/db/db-relationships.puml @@ -1,7 +1,9 @@ @startuml db-relationships -' Schema source: Flyway V1–V60 (excl. V37, V43 — intentionally removed) -' Schema as of: V60 (2026-05-06) +' Schema source: Flyway V1–V69 (excl. V37, V43 — intentionally removed) +' Schema as of: V69 (2026-05-27) ' ⚠ This is a versioned snapshot. Update when the schema changes significantly. +' Note: V69 adds columns only (persons.source_ref, tag.source_ref, document +' precision/attribution fields); no new FK relationships, so this diagram is unchanged. hide circle skinparam linetype ortho diff --git a/docs/date-label-fixtures.json b/docs/date-label-fixtures.json new file mode 100644 index 00000000..c6aed293 --- /dev/null +++ b/docs/date-label-fixtures.json @@ -0,0 +1,140 @@ +{ + "_comment": "Single source of truth for the honest date-label rule set shared by the TS formatDocumentDate (frontend/src/lib/shared/utils/documentDate.ts) and the Java formatTitleDate (backend importing/DocumentTitleFormatter.java). 
The 'cases' array holds the GERMAN (de) canonical form and is asserted by BOTH suites — that is the Java<->TS drift guard (en-dash vs hyphen, 'ca.' vs 'circa', season words, range collapse). The Java title formatter intentionally renders German server-side (import titles are always German); only the TS UI formatter is locale-aware, so 'localeCases' (en/es month-name output) is asserted by the TS spec ONLY and must NOT be fed to the Java test. Do not edit one side's expectation without editing this file and the relevant test(s). Season->month mapping note: the Python import normalizer (tools/import-normalizer) is the UPSTREAM authority for which representative month a season maps to (4/7/10/1); both formatters mirror it but it sits OUTSIDE this Java<->TS guard, so a normalizer change is not caught here. See issue #666 and the Markus/Sara drift-guard decision.", + "cases": [ + { + "name": "DAY renders a full long date", + "precision": "DAY", + "anchor": "1943-12-24", + "end": null, + "raw": null, + "expected": "24. Dezember 1943" + }, + { + "name": "MONTH renders month and year only — never a fabricated day", + "precision": "MONTH", + "anchor": "1916-06-01", + "end": null, + "raw": "Juni 1916", + "expected": "Juni 1916" + }, + { + "name": "SEASON renders the season word from raw", + "precision": "SEASON", + "anchor": "1916-06-01", + "end": null, + "raw": "Sommer 1916", + "expected": "Sommer 1916" + }, + { + "name": "SEASON with null raw derives the season from the anchor month", + "precision": "SEASON", + "anchor": "1916-04-01", + "end": null, + "raw": null, + "expected": "Frühling 1916" + }, + { + "name": "YEAR renders the year only — suppresses month and day", + "precision": "YEAR", + "anchor": "1916-06-15", + "end": null, + "raw": null, + "expected": "1916" + }, + { + "name": "APPROX renders a ca. prefix before the year", + "precision": "APPROX", + "anchor": "1920-01-01", + "end": null, + "raw": null, + "expected": "ca. 
1920" + }, + { + "name": "RANGE in the same month collapses the shared month and year", + "precision": "RANGE", + "anchor": "1917-01-10", + "end": "1917-01-11", + "raw": null, + "expected": "10.–11. Jan. 1917" + }, + { + "name": "RANGE across months expands both months, sharing the year", + "precision": "RANGE", + "anchor": "1917-01-30", + "end": "1917-02-02", + "raw": null, + "expected": "30. Jan. – 2. Feb. 1917" + }, + { + "name": "RANGE across a year boundary expands both full dates", + "precision": "RANGE", + "anchor": "1916-12-30", + "end": "1917-01-02", + "raw": null, + "expected": "30. Dez. 1916 – 2. Jan. 1917" + }, + { + "name": "RANGE where end equals start collapses to a single day", + "precision": "RANGE", + "anchor": "1917-01-10", + "end": "1917-01-10", + "raw": null, + "expected": "10. Jan. 1917" + }, + { + "name": "RANGE with a null end renders an open-range indicator, never a fabricated end", + "precision": "RANGE", + "anchor": "1917-01-10", + "end": null, + "raw": null, + "expected": "ab 10. Jan. 1917" + }, + { + "name": "UNKNOWN renders the unknown label regardless of anchor", + "precision": "UNKNOWN", + "anchor": null, + "end": null, + "raw": "?", + "expected": "Datum unbekannt" + } + ], + "localeComment": "TS-only locale parity for the read path (the younger phone audience may use en/es). Asserted ONLY by documentDate.spec.ts — the Java title formatter is German-only by design, so these MUST NOT be fed to DocumentTitleFormatterTest. Each case pins the localized month-name output for DAY and MONTH so a locale regression (e.g. 
a future de-DE hard-coding) is caught by the drift table, not just by ad-hoc tests.", + "localeCases": [ + { + "name": "DAY in English renders the English month name", + "precision": "DAY", + "anchor": "1943-12-24", + "end": null, + "raw": null, + "locale": "en", + "expected": "December 24, 1943" + }, + { + "name": "DAY in Spanish renders the Spanish month name", + "precision": "DAY", + "anchor": "1943-12-24", + "end": null, + "raw": null, + "locale": "es", + "expected": "24 de diciembre de 1943" + }, + { + "name": "MONTH in English renders the English month name, never a day", + "precision": "MONTH", + "anchor": "1916-06-01", + "end": null, + "raw": "Juni 1916", + "locale": "en", + "expected": "June 1916" + }, + { + "name": "MONTH in Spanish renders the Spanish month name, never a day", + "precision": "MONTH", + "anchor": "1916-06-01", + "end": null, + "raw": "Juni 1916", + "locale": "es", + "expected": "junio de 1916" + } + ] +} diff --git a/docs/import-migration/01-findings-spreadsheet-analysis.md b/docs/import-migration/01-findings-spreadsheet-analysis.md new file mode 100644 index 00000000..eee9723c --- /dev/null +++ b/docs/import-migration/01-findings-spreadsheet-analysis.md @@ -0,0 +1,313 @@ +# Spreadsheet Analysis — Findings (2026-05-25) + +Analysis of the **real raw archive** spreadsheets against the current `MassImportService` +(`backend/.../importing/MassImportService.java`). Goal: import ~7,600 letter rows + a +163-person register, with PDFs to follow. + +Every issue has an ID (`IMP-NN`), severity, evidence, and a proposed approach. + +--- + +## 0. 
Context: how the importer reads a row today + +`MassImportService` reads **sheet index 0** and maps columns by configurable indices +(`app.import.col.*`, defaults in the source): + +| Property | Default col | Meaning | +| --- | --- | --- | +| `colIndex` | 0 | Index (→ filename `.pdf`) | +| `colBox` | 1 | Box | +| `colFolder` | 2 | Mappe | +| `colSender` | 3 | Sender (raw) | +| `colReceivers` | 5 | Receivers (raw) | +| `colDate` | 7 | Date | +| `colLocation` | 9 | Location | +| `colTags` | 10 | Tag (single) | +| `colSummary` | 11 | Summary | +| `colTranscription` | 13 | Transcription | + +These defaults match the **ODS** file exactly (`Index, Box, Mappe, Von, BriefeschreiberIn, +An, EmpfängerIn, Datum, Datum Originalformat, Ort, Schlagwort, Inhalt, Zeitlicher Kontext, +Transkript` = 14 cols). The ODS was the development target. The new xlsx is a different beast. + +Per-row pipeline: skip if Index blank → derive filename from Index → validate filename → +look for file on disk (recursive; metadata-only if absent) → check PDF magic bytes → +`importSingleDocument` (upsert by `originalFilename`, dedupe non-placeholders as +`ALREADY_EXISTS`). Date parsing is **ISO-only** (`LocalDate.parse`). + +--- + +## IMP-01 — New xlsx column layout ≠ importer defaults 🔴 BLOCKER + +The new `…aktuell…xlsx` (sheet `Familienarchiv`, 7,943 rows × 12 cols) has a **denser, +different** layout. There is an extra `Datei` column at index 1, and the normalized +`Von`/`An`/ISO-`Datum` columns from the ODS **do not exist**. 
+ +| col | New xlsx header | Importer default expects | Result with defaults | +| --- | --- | --- | --- | +| 0 | Index | Index | ✅ ok | +| 1 | **Datei** (path) | Box | ❌ Box ← `..\__scan\W-0001.pdf` | +| 2 | Box | Mappe | ❌ Mappe ← `V` | +| 3 | Mappe | Sender | ❌ Sender ← `1` | +| 4 | BriefeschreiberIn (sender) | — (unused) | ❌ sender ignored | +| 5 | EmpfängerIn (receiver) | Receivers | ✅ coincidentally ok | +| 6 | Datum des Briefes | — (unused) | ❌ date ignored | +| 7 | Ort (location) | Date | ❌ Date ← `Rotterdam` → null | +| 8 | Schlagwort (tag) | — (unused) | ❌ tag ignored | +| 9 | Inhalt (summary) | Location | ❌ Location ← summary text | +| 10 | — | Tag | ❌ empty | +| 11 | — | Summary | ❌ empty | +| 13 | — | Transcription | ❌ column doesn't exist | + +**Impact:** importing as-is produces almost entirely garbage metadata. + +**Proposed approach (decide with Marcel):** +- (a) Re-map via the existing `app.import.col.*` properties — fast, no code. New mapping: + `index=0, box=2, folder=3, sender=4, receivers=5, date=6, location=7, tags=8, summary=9`, + and there is **no** transcription column (point it past the end or add a "missing column" + convention). Caveat: tags land in `colTags` but the real per-letter keywords are in + `Inhalt` (col 9) — see IMP-08 note on tags vs summary. +- (b) Make the importer **header-driven** (map by header name, not index) so it survives + layout drift across files. More robust, needs a code change (→ Gitea issue). + +Recommendation: (b) is the durable fix given we have ≥3 different layouts already. + +--- + +## IMP-02 — 90% of dates are free-text the parser can't read 🔴 BLOCKER + +The dates are written **as in the letter**. `parseDate()` only does `LocalDate.parse()` +(ISO `yyyy-MM-dd`), so anything non-ISO becomes `null`. + +Of **7,319** rows with a date value (col 6): + +| kind | count | parses today? 
| +| --- | --- | --- | +| Real Excel date cells (→ ISO via POI) | 748 | ✅ | +| Free-text date strings | 6,571 | ❌ → null | + +→ **90% of dated rows lose their date.** (623 rows have no date at all.) + +Observed free-text formats (counts approximate, from col 6): + +| Format | Count | Examples | +| --- | --- | --- | +| `D.M.YY` | 1,338 | `11.10.08`, `13.5.09` | +| `D.RomanMonth.YY/YYYY` | ~1,527 | `22.III.18`, `19.XII.1954`, `1.III.27` | +| `D.Month YYYY` | 950 | `6.März 1888`, `9.März 1888` (note: **no space** after the dot) | +| `D.M.YYYY` | 358 | `15.2.1888`, `7.3.1888` | +| Approximate / unknown | 146 | `?`, `13.7.18?`, `17.Nov (?) 1887`, `13.Januar ? 1907` | +| `Month YYYY` / season / holiday | 41+27 | `Mai 1895`, `Herbst 1913`, `Pfingsten 1922`, `Ostern 1890` | +| `YYYY` only | 17 | `1905`, `1949` | +| `D.M.` no year | 10 | `8.9.`, `14.3.` | +| Ranges | 5+ | `8.1.1916 - 15.3.1916`, `1881/82`, `1945/46?` | +| Abbrev/English months, no space | many | `29.Sept.1891`, `10.Oct.95`, `9.December1889`, `18.Dez.1916` | +| Slash separator | ~315 | `2/2. 18`, `17/6. 1916`, `10/4. 1917` | +| English `Month D. YYYY` | several | `April 12. 1922`, `Oct.5. 1916`, `Mai 23. 1917` | +| Trailing notes | 5+ | `26.4.1888, 2. Brief`, `31.8.1888,2.Brief` | +| 3-digit year (typo) | 107 | `30.1.889` (→ 1889), `4.3.1023` (in person file → 1923) | +| Day-range within month | several | `7./8. Sept.1923` | + +**Proposed approach:** build a tolerant German/historical date parser (→ Gitea issue, it's +a code change). Requirements: +- Numeric `D.M.YY[YY]` and `D/M. YY[YY]` (slash = dot). +- Roman-numeral months (`I`–`XII`). +- German + English month names, full + abbreviated, with/without separating space + (`März`, `Sept.`, `Dez`, `December`, `Oct.`). +- 2-digit and 3-digit year normalization (`08`→1908? needs a century rule; `889`→1889). +- Partial dates → store what's known. 
The schema only has a single `documentDate + LocalDate`; **decide** whether to (i) store first-of-month/year, (ii) add a + `datePrecision` enum + `dateOriginal` text column, or (iii) keep raw text in a new + `documentDateRaw` field and leave `documentDate` null when imprecise. Recommendation: + preserve the **original string** always (new column) + best-effort parsed date + + precision flag, so nothing is lost and the UI can show "ca. 1916". +- Unparseable/approximate (`?`, `Herbst 1913`) → keep raw, leave parsed date null, **do + not drop the row**. + +**Cross-check:** even after IMP-01 is fixed so the date column is read, IMP-02 still bites. +Both must be solved before a real import. + +--- + +## IMP-03 — New xlsx has no normalized/ISO date or name columns 🔴 BLOCKER + +The ODS had helper columns the importer relied on: `Von`/`An` (normalized names) and +`Datum` (ISO) alongside `Datum Originalformat`. The new xlsx has **only the raw** +`BriefeschreiberIn` / `EmpfängerIn` / `Datum des Briefes`. So: +- Names must be parsed from raw strings (PersonNameParser already does receivers; **sender + is taken raw, never split** — fine for senders, which are single, but no normalization). +- Dates must be parsed from raw (IMP-02). + +This is the root reason IMP-01/02 exist: the new file is the *uncurated* source, not the +hand-normalized ODS. Tie any importer redesign to this reality — we will not get clean +helper columns in the 7k-row file. + +--- + +## IMP-04 — Person register not imported at all 🟠 MAJOR + +`Personendatei 2.xlsx` → sheet `Tabelle1`, **163 people**, columns: +`Generation, Familienname, Vorname, geb als (maiden), Geburtsdatum, Geburtsort, +Todesdatum, Sterbeort, verheiratet mit, Bemerkung`. + +Today `MassImportService` has **no person-register import**. Persons are only +auto-created as bare aliases from the document sender/receiver strings +(`personService.findOrCreateByAlias`). 
All this rich genealogical data is unused: +- birth/death dates + places, +- maiden names (the key to dedup — see IMP-05), +- `verheiratet mit` (marriage links → `PersonRelationship` domain), +- `Bemerkung` relationship hints (`"Schwester v Marie Cram"`, `"Nichte von Herbert"`), +- `Generation` (G 1–G 4), +- nicknames in quotes (`"Tante Lolly"`). + +Data-quality notes in this file too: multi-value `Vorname` (`Charlotte,Meta,Jacobi`); +mixed Excel-date vs text dates; typos (`4.3.1023`); missing-day dates (`.12.1955`); +trailing spaces (`30.8.1862 `). + +**Proposed approach:** a separate **Person import** (→ Gitea issue). Order matters: import +persons *first* so documents can link to real people instead of creating alias stubs. +Use `geb als` + `verheiratet mit` to pre-build the alias/relationship graph. + +--- + +## IMP-05 — Name variations create duplicate Persons 🟠 MAJOR + +The same person appears under several surface forms across the document sheet: +- `Eugenie Müller` (151) vs `Eugenie de Gruyter` (452) — maiden vs married. +- `Clara Cram` (sender 1,284) vs `Clara de Gruyter` (455) vs `Clara de Gruyter sen.` (66). +- `Walter de Gruyter` (589) vs bare `Walter` (78). + +`findOrCreateByAlias` keys on the raw string, so each variant becomes (or matches) a +distinct alias and likely a **distinct Person**. Result: fragmented person records, +broken Briefwechsel pairing, wrong stats. + +**Proposed approach:** drive dedup from the register's `geb als` column (IMP-04) — +`Eugenie de Gruyter geb Müller` tells us the two strings are one person. Build an alias +map (married ↔ maiden ↔ nickname) before/while importing documents. This is partly data +(an alias mapping table/sheet) and partly code (consume it). Likely a Gitea issue once the +mapping format is decided. + +945 distinct sender strings / 274 distinct receiver strings — expect a long-tail of +variants to reconcile. Don't try to be perfect on the first pass; get the high-frequency +names right. 
+ +--- + +## IMP-06 — 93 data rows with blank Index are silently dropped 🟠 MAJOR + +`processRows` does `if (index.isBlank()) continue;`. **93 rows** have a blank Index but +carry other data (sender/receiver/date/etc.). These are silently skipped — they don't even +appear in the `skippedFiles` report (that list only covers rows that *had* an index but +failed file checks). + +**Proposed approach:** before import, triage these 93 rows — are they continuation rows, +section markers, or genuine letters missing an ID? At minimum, surface a count/warning so +nothing vanishes unnoticed. Possibly a small importer change to report blank-index skips. + +--- + +## IMP-07 — 43 duplicate Index values 🟡 MINOR + +43 Index values repeat (e.g. `W-0388`, `Eu-0332`, `C-0234`, `C-0235`, `C-0236`, `J-0175`). +Since the filename is derived from Index, the importer's upsert keys both rows on the same +`originalFilename`: the second occurrence is treated as `ALREADY_EXISTS` (if the first +isn't a placeholder) and **its metadata is lost**, or it overwrites a placeholder. + +**Proposed approach:** list the 43 duplicates, check whether they're true duplicates or +two distinct letters that share an ID by mistake. Fix in the source data, or extend the ID +scheme. Data task first; software only if the ID scheme must change. + +--- + +## IMP-08 — Section/title rows interleaved with data 🟡 MINOR + +Row 2 of the sheet is a section header sitting only in the sender column +(`Brautbriefe von Walter der Gruyter an Eugenie Müller`) with a blank Index — caught by the +blank-Index skip (overlaps IMP-06). There may be more such banners scattered through 7,943 +rows. Also relevant: the per-letter **keywords live in `Inhalt` (col 9)** as comma-joined +values (`Tilburg,Verwandschaft`, `poetisch,Reise nach Breda`), while `Schlagwort` (col 8) +holds a single broad tag (`Brautbriefe`). 
The importer only takes **one** tag column — +decide which column feeds tags vs summary, and whether to split comma-lists into multiple +tags. + +**Proposed approach:** scan for rows where Index is blank but other cells are set (already +have the count: relates to the 93 in IMP-06). Confirm tag vs summary column choice with +Marcel. + +--- + +## IMP-09 — Index ↔ Datei filename mismatches 🟡 MINOR + +The `Datei` column (col 1) holds explicit relative paths (`..\__scan\W-0001.pdf`) but they +don't always agree with the Index. Example: row 20 has Index `W-0010x` but Datei +`..\__scan\W-0011x.pdf`. The importer derives the filename from **Index**, so it will look +for `W-0010x.pdf` and may miss the actual scan. (Note: the `Datei` paths themselves are +Windows-style with `\` and `..` and would be **rejected** by `isValidImportFilename` if anyone +tried to use that column directly — 7,623 rows use backslashes, 7,455 contain `..`.) + +**Proposed approach:** when the PDFs arrive, reconcile Index-derived names against actual +filenames; produce a mismatch report. Keep deriving from Index (stable IDs) but flag +disagreements. Mostly a data/QA task. + +--- + +## IMP-10 — `x`-suffix rows (letter backsides / enclosures) 🟡 MINOR + +**42 rows** have an `x`-suffixed Index (`W-0001x`, `W-0002x`, …). They're sparse — typically +only Index + Datei + sender + receiver, no box/folder/date. They appear to be the reverse +side or an enclosure of the preceding letter. The importer treats each as an independent +Document, and the `metadataComplete` heuristic flags them complete as soon as a sender is +present (date/box/folder all missing). + +**Proposed approach:** decide whether `x` rows should be (a) separate documents, (b) extra +pages/files attached to their parent, or (c) skipped. Affects both the data model and the +`metadataComplete` heuristic. Discuss with Marcel. 
+ +--- + +## IMP-11 — Multi-receiver separators include bare `u` / `u.` 🟡 MINOR + +`PersonNameParser.parseReceivers` already handles ` und `, ` u `, `//`, `geb.`, +parenthesised shared surnames, and `Familie` filtering — good. But the real data also uses +the abbreviation in forms the top-receivers list shows are common: +`Eugenie u Walter de Gruyter` (230), `Herbert u Clara` (94), `Juan u Marie Cram` (75), +and space-joined pairs like `Ella Anita` (79) that may be two people. +Raw separator tally on receivers: ` und ` ×70, `,` ×11, `;` ×2, `/` ×1 — plus the many ` u ` +cases above. Senders are **not** parsed at all (taken raw), which is fine unless a sender +cell ever holds two names. + +**Proposed approach:** add `MassImportServiceTest` cases for the real-world strings above; +extend the parser only where it actually fails. `Ella Anita`-style space-joined pairs are +ambiguous — likely leave as one person unless the register says otherwise (ties to IMP-05). + +--- + +## IMP-12 — Importer reads only the first sheet, no validation 🟡 MINOR + +`readXlsx` does `workbook.getSheetAt(0)`. For the new xlsx that's `Familienarchiv` (✅), but +the file also contains `Inhaltsverzeichnis grob`, `Inhaltsverzeichnis WdG`, `Tabelle4`. +There is no header validation: if the wrong file/sheet is dropped in `/import`, the importer +will happily map columns positionally and import nonsense. Also `findSpreadsheetFile()` picks +the **first** spreadsheet found in `/import` — with three spreadsheets present there today, +which one wins is filesystem-order-dependent. + +**Proposed approach:** (a) validate the header row against expected names before importing; +(b) make the target sheet/file explicit (config or header match) rather than "first found". +Ties into the header-driven mapping in IMP-01(b). + +--- + +## Summary of recommended sequencing + +1. **Decide the importer mapping strategy** (IMP-01): positional re-config vs header-driven. 
+ Header-driven is the durable choice and unblocks IMP-03/12. +2. **Build the tolerant date parser** (IMP-02) with original-string preservation + precision. +3. **Import the Person register first** (IMP-04) and build the alias/marriage graph, + which feeds person dedup (IMP-05). +4. **Then import documents**, with reporting for blank-index (IMP-06), duplicates (IMP-07), + and section rows (IMP-08). +5. **Reconcile files** when the ~7,000 PDFs arrive (IMP-09), and decide `x`-row semantics + (IMP-10). + +Code-change items (→ Gitea issues when we get there): IMP-01(b), IMP-02, IMP-04, IMP-05 +(consume side), IMP-06 reporting, IMP-12. Pure-data items stay in this folder. diff --git a/docs/import-migration/02-normalization-spec.md b/docs/import-migration/02-normalization-spec.md new file mode 100644 index 00000000..5b65a0b8 --- /dev/null +++ b/docs/import-migration/02-normalization-spec.md @@ -0,0 +1,419 @@ +# Spec — Import Normalizer + +> Authored in the voice of **"Elicit"**, requirements engineer (see +> `.claude/personas/req_engineer.md`). This is a requirements artifact: it states +> *what* the normalizer must do and *how we'll know it's done*, in problem/behaviour +> language. Technology choices already made during brainstorming (Python, openpyxl, +> overrides-and-rerun) are recorded as **constraints**, not re-litigated here. + +- **Status:** Draft for review +- **Date:** 2026-05-25 +- **Related:** [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) (issues `IMP-01..12`), [`README.md`](./README.md) +- **Scope boundary:** This spec covers the **offline normalizer** that turns the raw + spreadsheets into a clean, canonical dataset + review artifacts. Wiring the canonical + contract into the Java `MassImportService` and the `Document`/`Person` model is **Phase 2** + and gets its own spec. This spec only *defines the contract* Phase 2 must satisfy. + +--- + +## 1. 
Project Brief + +**Vision.** Turn the family's human-curated, free-form archive spreadsheets into a clean, +canonical dataset that imports deterministically — without hand-editing thousands of rows +and without losing the historical nuance of how things were originally written. + +**Problem.** The real archive (`…aktuell…xlsx`, 7,943 rows) and the person register +(`Personendatei 2.xlsx`, 163 people) were authored for humans to read, not machines to +import. Dates are written as they appeared in each letter (≈90% unparseable by the current +importer), the column layout differs from what the importer expects, and the same person +appears under many names. Importing as-is produces garbage (see `IMP-01..12`). + +**Goal (measurable).** +- G1 — After the automated pass, **≤ 5%** of dated rows remain `UNKNOWN`; after the + overrides-iteration loop, **≤ 0.5%**. +- G2 — **100%** of source rows are represented in the canonical output or in a review file — + *zero silent drops*. +- G3 — **100%** of original values (raw date string, raw name string, source row number) + are preserved. +- G4 — A full run over the current inputs completes in **< 60 s** on the dev laptop and is + **content-deterministic** when re-run with unchanged inputs+overrides: identical canonical + cell matrices and identical review-file contents. (Workbook metadata is pinned; literal xlsx + byte-identity is not guaranteed because the zip container stores entry metadata.) + +**Primary actor.** Marcel — solo owner & data steward (tech comfort 4/5). Also: a future +agent re-running the pipeline; and the `MassImportService` as the downstream consumer. + +**Non-Goals (explicitly out of scope).** +- NG1 — Changing `MassImportService` or the DB schema (that is Phase 2). +- NG2 — Uploading/attaching the ~7,000 PDFs (they arrive later; import matches by `index`). +- NG3 — A GUI. The interface is spreadsheets in, CSVs out, an overrides file hand-edited. +- NG4 — Perfect genealogical reconstruction. 
We resolve confidently-matchable people; the + long tail stays as provisional persons. +- NG5 — OCR/transcription content (the new xlsx has no transcription column). + +**Key assumptions.** (A1) Sheet `Familienarchiv` is the document source of truth. +(A2) Archive date range is **1873–1957** (drives the 2-digit-year century rule). +(A3) `index` is the stable document key and the basis for future PDF matching. +(A4) `Schlagwort` is a broad tag; `Inhalt` is a short summary/topic. + +**Risks.** (R1) 2-digit/partial dates are genuinely ambiguous → mitigated by precision flag ++ overrides. (R2) Name matching false-positives merge distinct people → mitigated by +conservative matching + review before merge. (R3) Source spreadsheet may be re-exported with +layout drift → mitigated by header-name-based mapping, not fixed indices. + +--- + +## 2. Personas + +**Marcel — Data Steward.** Role: solo owner of Familienarchiv. Context: holds the complete +raw archive; PDFs follow. Tech comfort: 4/5 (semi-technical, reads CSV/spreadsheets fluently, +not keen to hand-edit 7,600 rows). Primary goal: a clean, importable dataset he trusts. +Frustrations: dates in ~20 formats; one ancestor under 4 name variants. **JTBD:** *"When I +have raw, human-curated archive spreadsheets, I want to transform them into a clean importable +dataset without losing how things were originally written, so I can load the archive and keep +correcting edge cases as they surface."* + +**The Returning Agent.** Role: a future assistant session resuming the work. Goal: re-run the +pipeline deterministically and understand exactly what still needs human input. **JTBD:** +*"When I pick this up cold, I want one command and a clear residue report, so I can continue +without re-deriving context."* + +--- + +## 3. Constraints & Decisions Already Made + +These were settled during brainstorming and are fixed inputs to the requirements below. 
+ +| # | Decision | Rationale | +| --- | --- | --- | +| C1 | **New canonical layout** with explicit headers (not the old positional ODS shape). | Fits the new data; importer becomes header-driven in Phase 2. | +| C2 | Dates stored as **parsed (nullable) + raw + precision**. | Historical archive; never lose the original; enable "ca. 1916". | +| C3 | **Include person resolution** (register + alias/marriage map → canonical persons) in this effort. | Maiden-name dedup needs the register. | +| C4 | **Overrides-file + re-run** loop for residue. | Deterministic, diffable, repeatable. | +| C5 | Implementation: **Python 3.12 + openpyxl**, standalone tool at `tools/import-normalizer/`. | Fast iteration; no Spring rebuild / coverage gate on transform code. | +| C6 | Century rule for archive **1873–1957**: 2-digit `00–57`→`19YY`, `73–99`→`18YY`, `58–72`→**flag**; 3-digit `DDD`→`1DDD`; never 20xx. | Stated by Marcel. Boundaries live in config. | +| C7 | `Schlagwort`→tag, `Inhalt`→summary. | Matches importer's existing semantics. | +| C8 | Non-register correspondents become **provisional persons**. | ~945 distinct sender strings vs 163 register people. | + +--- + +## 4. Functional Requirements + +Each requirement has a stable ID. User stories use Connextra + Given-When-Then; system rules +use EARS. Traceability to findings in §8. + +### 4.1 Ingest & layout (`FR-INGEST`, `FR-MAP`) + +**US-MAP-01** — *As the data steward, I want each source column mapped to a named canonical +field regardless of its position, so a re-exported spreadsheet with shifted columns still +imports correctly.* +- AC1 — Given the `Familienarchiv` sheet, when the normalizer reads the header row, then it + maps columns by **header name** (not fixed index) to the canonical fields. +- AC2 — Given a header the normalizer does not recognise, when it runs, then it records the + unknown header in `review/summary.txt` and continues (does not crash). 
+- AC3 — Given a required source header is **absent**, when it runs, then it aborts with a + clear message naming the missing header (fail loud, before producing partial output). + +- **REQ-INGEST-01** — The normalizer shall read only the `Familienarchiv` sheet of the + document workbook and the `Tabelle1` sheet of the person workbook. +- **REQ-MAP-01** — Header matching shall be case-insensitive and tolerant of internal + multiple spaces (e.g. `"Datum des Briefes"`). + +### 4.2 Row triage (`FR-TRIAGE`) — resolves IMP-06, IMP-07, IMP-08 + +**US-TRIAGE-01** — *As the data steward, I want rows that have data but no index surfaced +rather than dropped, so I never lose a letter silently.* +- AC1 — Given a row whose `index` is blank but which has any other non-empty cell, when the + normalizer runs, then that row is written to `review/blank-index-rows.csv` with its source + row number and is **not** emitted as a canonical document. +- AC2 — Given a fully empty row, when it runs, then the row is skipped and counted (not + reported as an anomaly). + +- **REQ-TRIAGE-01** — If two or more rows resolve to the same `index`, then the normalizer + shall emit all of them to `review/duplicate-index.csv` and mark each canonical row + `needs_review = duplicate_index` (it shall **not** silently drop either). +- **REQ-TRIAGE-02** — Where a row is identified as a section/banner row (blank index, text + only in a name column), the normalizer shall classify it as such in the blank-index report. +- **REQ-TRIAGE-03** — Rows whose `index` ends in `x` (a transcription/back-side of the base + letter, not yet independently mappable) shall be **skipped** — not emitted as a canonical + document — and written to `review/skipped-x-suffix.csv` with their source row and base index + (`index` minus the trailing `x`), so they can be linked in a later pass. (Resolves IMP-10.) 
+ +### 4.3 Date normalization (`FR-DATE`) — resolves IMP-02, IMP-03 + +**US-DATE-01** — *As the data steward, I want every date interpreted as precisely as the +source allows, with the original always kept, so I can sort the archive and still see what the +letter actually said.* +- AC1 — Given a parseable date, when normalized, then `date_iso` holds the best-effort ISO + date, `date_raw` holds the verbatim source string, and `date_precision` ∈ + `{DAY, MONTH, SEASON, YEAR, RANGE, APPROX, UNKNOWN}`. +- AC2 — Given an unparseable date, when normalized, then `date_iso` is empty, + `date_precision = UNKNOWN`, `date_raw` is preserved, and the value appears in + `review/unparsed-dates.csv`. +- AC3 — Given the same `date_raw` appears in `overrides/dates.csv`, when normalized, then the + override's `(iso, precision)` wins over the automatic parse. + +- **REQ-DATE-01** — The parser shall accept, at minimum, these forms (see §10 examples): + Excel/ISO; `D.M.YYYY`/`D.M.YY`; `D/M. YY[YY]` (slash treated as dot); Roman-numeral months + `I–XII`; German + English month names, full and abbreviated, with or without a separating + space; `Month YYYY`; season/holiday + year; bare `YYYY`; and start-anchored ranges. +- **REQ-DATE-02** — Precision shall be assigned by what is known: full day → `DAY`; month+year + → `MONTH` (day = 1); a **named feast/holiday + year** → resolved to its **actual calendar + date for that year** → `DAY`; a **season + year** → representative mid-season month (day = 1) + → `SEASON`; year only → `YEAR` (month = Jan, day = 1); a range → start date + `RANGE`; a + value carrying an uncertainty marker (`?`, `um`, `ca`, `circa`) → `APPROX` with best-effort date. +- **REQ-DATE-03** — Two-digit and three-digit years shall be expanded per **C6**; a 2-digit + year in `58–72` shall yield `UNKNOWN` + a review entry rather than a guess. +- **REQ-DATE-04** — Trailing editorial notes (e.g. `", 2. 
Brief"`) shall be stripped before + parsing and preserved (kept within `date_raw`; not invented into the date). +- **REQ-DATE-05** — The parser shall be pure and side-effect-free so it can be unit-tested in + isolation (see NFR-TEST-01). +- **REQ-DATE-06** — **Movable feasts are never mapped to a fixed month**; they shall be + computed per year from Easter (Anonymous Gregorian/Butcher computus): Karfreitag = Easter−2, Ostern = + Easter Sunday, Himmelfahrt = Easter+39, Pfingst(sonntag) = Easter+49, Pfingstmontag = + Easter+50, Fronleichnam = Easter+60, 1.–4. Advent = the 4th…1st Sunday before 25 Dec. Fixed + feasts use a lookup table (Neujahr=01-01, Heiligabend=12-24, Weihnachten=12-25, + Silvester=12-31, …). Seasons map to representative months: Frühling/Frühjahr=Apr, Sommer=Jul, + Herbst=Oct, Winter=Jan. The feast/season tables live in `config.py` (NFR-MAINT-01); the + Easter algorithm is logic, not data, and lives as a pure function in `dates.py`. +- **REQ-DATE-07** — **Intra-month day ranges carry an end day; half-resolved ranges are + flagged.** For a day range like `7./8. Sept.1923`, `date_iso` holds the start day, the end + day is resolved against the shared month/year into `date_end`, and `date_precision` = + `RANGE`. If the **start** parses but the **end day is impossible** (e.g. `10./40.1.1917`), + the row keeps the start and `RANGE` precision, leaves `date_end` **empty**, and is flagged + `needs_review = range_end_unparsed` — the unparseable end is dropped honestly (surfaced for + review), never silently invented or clamped. A `RANGE` row **may** therefore legitimately + have an empty `date_end`; the importer must treat `date_end` as optional even on a `RANGE`. 
+ +### 4.4 Person resolution & dedup (`FR-PERS`, `FR-DEDUP`) — resolves IMP-04, IMP-05, IMP-11 + +**US-PERS-01** — *As the data steward, I want the genealogical register turned into canonical +people with all their known facts, so documents can link to real persons.* +- AC1 — Given a register row, when parsed, then a canonical person is produced with + `person_id`, name parts, `maiden_name`, birth/death (parsed + raw + place), spouse, + generation, nickname, notes — applying the same date rules as §4.3 to birth/death dates. +- AC2 — Given multi-value given names (`"Charlotte,Meta,Jacobi"`), when parsed, then the + primary given name is the first; the remainder are retained as additional names/aliases. + +**US-PERS-02** — *As the data steward, I want each sender/receiver string matched to a +canonical person where possible and never dropped otherwise, so the correspondence graph is +complete.* +- AC1 — Given a sender/receiver string, when resolved, then it maps to a register + `person_id` via the alias index (exact → normalized/casefold → conservative fuzzy). +- AC2 — Given no confident match, when resolved, then a **provisional person** is created from + the cleaned string, linked, and listed in `review/unmatched-names.csv` (occurrence count + + example source rows). +- AC3 — Given the string appears in `overrides/names.csv`, when resolved, then it maps to the + specified `person_id` (override wins). +- AC4 — Given a multi-person receiver cell (`"Eugenie u Walter de Gruyter"`, `"Herbert u + Clara"`, `"…//…"`, `"Hedi und Tutu (Gruber)"`), when resolved, then it is split into + individual people, each resolved independently; ambiguous space-joined pairs + (`"Ella Anita"`) are emitted to `review/ambiguous-receivers.csv` rather than guessed. 
+ +- **REQ-DEDUP-01** — The alias index shall be derived from the register: canonical + "First Last", maiden form (`geb als`), spouse-surname married form, nickname, and + first-name-only **only when unambiguous** across the register. +- **REQ-DEDUP-02** — The normalizer shall not merge two distinct strings into one person on + fuzzy similarity alone above a configured threshold without the match being reported; merges + must be auditable. +- **REQ-PERS-01** — Sender cells shall be parsed for multi-person content using the same rules + as receiver cells (today the importer parses only receivers — IMP-11). + +### 4.5 Overrides & idempotency (`FR-OVR`) — supports the iteration loop + +- **REQ-OVR-01** — When the normalizer runs, then it shall load `overrides/dates.csv` and + `overrides/names.csv` if present and apply them; absence of either file shall not be an error. +- **REQ-OVR-02** — While overrides are unchanged and inputs are unchanged, re-running shall + produce **content-identical** canonical outputs (identical cell matrices) and identical review-file + contents; literal xlsx byte-identity is not required (NFR-IDEM-01). +- **REQ-OVR-03** — Each override application shall be counted in `review/summary.txt` (how many + dates/names were resolved by override vs automatically). + +### 4.6 Canonical output & provenance (`FR-OUT`, `FR-PROV`) — resolves IMP-01, IMP-09, IMP-12 + +- **REQ-OUT-01** — The normalizer shall write `out/canonical-documents.xlsx` and + `out/canonical-persons.xlsx` with the headered schemas in §6. The `out/` directory is + **gitignored** (real family PII — see ADR-025); ops syncs the regenerated files onto the + import host alongside the PDFs out-of-band. +- **REQ-PROV-01** — Every canonical document row shall carry `source_row` (1-based row number + in the source sheet) so any value can be traced back to the original. 
+- **REQ-PROV-02** — Every canonical row shall carry a `needs_review` field listing zero or more + flags (`duplicate_index`, `unparsed_date`, `unmatched_sender`, `unmatched_receiver`, + `index_file_mismatch`, …) so the import and the UI can foreground uncertain data. +- **REQ-OUT-02** — Where the source `Datei` path disagrees with the index-derived filename + (IMP-09), the normalizer shall record the discrepancy in `review/index-file-mismatch.csv` + and flag the row; it shall **not** alter the `index` (the stable key). + +--- + +## 5. Non-Functional Requirements + +| ID | Category | Requirement (measurable) | +| --- | --- | --- | +| NFR-DATA-01 | Data integrity | 100% of source rows are accounted for in output **or** a review file; 100% of original date/name strings preserved verbatim. | +| NFR-IDEM-01 | Determinism | Identical inputs + overrides ⇒ identical *logical* output across runs/machines: identical canonical cell matrices and review-file contents. Workbook `created`/`modified` metadata is pinned to a constant; ordering of all generated rows/aliases is stable (no set-iteration leakage). xlsx byte-identity is explicitly not required — determinism is asserted on content. | +| NFR-PERF-01 | Performance | Full run over 7,943 doc rows + 163 person rows completes in < 60 s on the dev laptop. | +| NFR-ACCUR-01 | Date accuracy | After automated pass, `UNKNOWN` dates ≤ 5% of dated rows; after overrides iteration, ≤ 0.5%. | +| NFR-ACCUR-02 | Name coverage | Every sender/receiver occurrence yields a linked person (register or provisional); 0 dropped. | +| NFR-I18N-01 | Encoding | UTF-8 end-to-end; German diacritics and ß round-trip with no mojibake in any output. | +| NFR-TEST-01 | Testability | `dates.py` and `persons.py` have pytest tests covering every format/alias category in §10 with real examples from the archive. 
| +| NFR-MAINT-01 | Maintainability | Column-name map, century boundaries, season→month map, and fuzzy threshold live in `config.py`, not inline in logic. | +| NFR-OBSERV-01 | Observability | `review/summary.txt` reports per-run stats: rows in, documents out, dates by precision, names matched vs provisional, overrides applied, anomalies by type. | +| NFR-SAFETY-01 | Source safety | Source workbooks are opened read-only and never written. | + +--- + +## 6. Data Dictionary (canonical contract) + +This is the contract Phase 2 (the importer) must consume. Field-level, format-level — not a +DB schema. + +### 6.1 `canonical-documents.xlsx` + +| Field | Required | Format / values | Notes | +| --- | --- | --- | --- | +| `index` | yes | string | Stable key; basis for PDF matching. | +| `file` | no | string | verbatim `Datei` value (e.g. `H-0730.pdf`); carried through for the importer to link the scanned PDF. | +| `box` | no | string | from `Box`. | +| `folder` | no | string | from `Mappe`. | +| `sender_person_id` | no | person_id | resolved; empty if no sender. | +| `sender_name` | no | string | canonical display name (or cleaned raw if provisional). | +| `receiver_person_ids` | no | `id\|id\|…` | pipe-separated. | +| `receiver_names` | no | `name\|name\|…` | pipe-separated, aligned with ids. | +| `date_iso` | no | `YYYY-MM-DD` | best-effort; empty if `UNKNOWN`. | +| `date_raw` | no | string | verbatim source date. | +| `date_precision` | yes | enum | `DAY\|MONTH\|SEASON\|YEAR\|RANGE\|APPROX\|UNKNOWN`. | +| `date_end` | no | `YYYY-MM-DD` or empty | RANGE end day (e.g. `7./8. Sept.1923` → `date_iso` = start, `date_end` = end). Empty for every non-RANGE precision **and** for a half-resolved RANGE whose end did not parse (see REQ-DATE-07). | +| `location` | no | string | from `Ort`. | +| `tags` | no | `tag\|tag` | from `Schlagwort`. | +| `summary` | no | string | from `Inhalt`. | +| `source_row` | yes | int | provenance (NFR-DATA-01). 
| +| `needs_review` | yes | `flag\|flag` or empty | review flags (REQ-PROV-02). Flags include `unparsed_date`, `range_end_unparsed` (half-resolved RANGE, REQ-DATE-07), `unmatched_sender`, `unmatched_receiver`, `multi_sender`, `index_file_mismatch`, `duplicate_index`. | + +### 6.2 `canonical-persons.xlsx` + +| Field | Required | Format | Notes | +| --- | --- | --- | --- | +| `person_id` | yes | slug | stable id (e.g. `de-gruyter-eugenie`); collisions suffixed. | +| `last_name` | yes | string | from `Familienname`. | +| `first_name` | no | string | primary given name. | +| `maiden_name` | no | string | from `geb als` — drives dedup. | +| `title` | no | string | e.g. honorifics if present. | +| `nickname` | no | string | from quoted `Bemerkung`/spouse field. | +| `birth_date` / `birth_date_raw` / `birth_place` | no | ISO / string / string | §4.3 rules. | +| `death_date` / `death_date_raw` / `death_place` | no | ISO / string / string | §4.3 rules. | +| `spouse` | no | person_id or name | from `verheiratet mit`. | +| `generation` | no | string | `G 1`..`G 4`. | +| `notes` | no | string | from `Bemerkung`. | +| `aliases` | no | `a\|b\|c` | every surface form that maps here. | +| `provisional` | yes | bool | true if created from a document string, not the register. | + +### 6.3 `canonical-persons-tree.json` + +The de-duplicated genealogical tree (family members + their relationships) the importer +uses to seed the family graph. Each `persons[]` entry carries a `personId` that **joins +1:1 onto** `person_id` in `canonical-persons.xlsx`. + +| Field | Required | Format | Notes | +| --- | --- | --- | --- | +| `personId` | yes | slug | The register's **verbatim** `person_id` (e.g. `cram-hans-1`), propagated — never re-slugified — so collision suffixes match `canonical-persons.xlsx` exactly. Every tree `personId` exists in the register; the register is the sole slug authority. | +| `firstName` / `lastName` / `maidenName` | first/last yes | string | name parts. 
| +| `birthYear` / `deathYear` | no | int or null | year only (tree granularity). | +| `birthPlace` / `deathPlace` | no | string or null | from the register. | +| `generation` | no | int or null | parsed from `G n`. | +| `notes` | no | string or null | leftover Bemerkung text after relationship extraction. | +| `familyMember` | yes | bool | always true for tree persons. | + +A top-level `generated_at` is pinned to a fixed timestamp (`2020-01-01T00:00:00`) for +reproducibility (NFR-IDEM-01), not a wall-clock value. `relationships[]` carry `SPOUSE_OF` +and `PARENT_OF` edges keyed by `rowId`; `unresolved[]` lists relationship strings that did +not match a tree person. + +--- + +## 7. Prioritized Backlog (MoSCoW) + +| ID | Item | MoSCoW | Effort | Depends on | +| --- | --- | --- | --- | --- | +| B1 | Project scaffolding + read both workbooks (`FR-INGEST`, header map `FR-MAP`) | Must | S | — | +| B2 | Row triage + blank/duplicate/empty reports (`FR-TRIAGE`) | Must | S | B1 | +| B3 | Date parser + precision + century rule + Easter/feast computus + season map + tests (`FR-DATE`) | Must | L | B1 | +| B4 | Person register parser → canonical persons (`FR-PERS` US-PERS-01) | Must | M | B1 | +| B5 | Alias index + name resolution + multi-person split (`FR-DEDUP`, US-PERS-02) | Must | L | B4 | +| B6 | Overrides load + apply + idempotency (`FR-OVR`) | Must | S | B3,B5 | +| B7 | Canonical writers + provenance + review summary (`FR-OUT`, `FR-PROV`) | Must | M | B2,B3,B5 | +| B8 | Index↔Datei mismatch report (`REQ-OUT-02`) | Should | XS | B1 | +| B9 | Ambiguous-receiver review path (US-PERS-02 AC4) | Should | S | B5 | +| B10 | Comma-split `Inhalt` into extra tags | Could | XS | B7 | +| B11 | Phase-2 importer wiring (separate spec) | Won't (this spec) | — | B7 | + +--- + +## 8. 
Traceability — Findings → Requirements + +| Finding | Severity | Addressed by | +| --- | --- | --- | +| IMP-01 layout mismatch | blocker | C1, FR-MAP, REQ-OUT-01 | +| IMP-02 free-text dates | blocker | FR-DATE (all), C2, C6 | +| IMP-03 no ISO/normalized cols | blocker | FR-DATE, FR-PERS | +| IMP-04 register unimported | major | C3, US-PERS-01, §6.2 | +| IMP-05 name variants → dupes | major | C3, FR-DEDUP | +| IMP-06 blank-index dropped | major | US-TRIAGE-01 | +| IMP-07 duplicate indices | minor | REQ-TRIAGE-01 | +| IMP-08 section rows / tags vs summary | minor | REQ-TRIAGE-02, C7 | +| IMP-09 index↔file mismatch | minor | REQ-OUT-02, B8 | +| IMP-10 `x`-suffix rows | minor | REQ-TRIAGE-03 (skip + log this pass) | +| IMP-11 sender not split / ` u ` sep | minor | REQ-PERS-01, US-PERS-02 AC4 | +| IMP-12 first-sheet, no validation | minor | REQ-INGEST-01, FR-MAP AC2/AC3 | + +--- + +## 9. Open Questions / TBD Register + +| ID | Question | Why it matters | Ref | Resolution | +| --- | --- | --- | --- | --- | +| OQ-01 ✅ | Season/holiday → date. | Accuracy of ~70 SEASON/feast rows. | REQ-DATE-06 | **Resolved (2026-05-25):** movable feasts (Ostern, Pfingsten, Himmelfahrt, Advent, …) **computed per year from Easter — never a fixed month**; fixed feasts looked up (Weihnachten=12-25, Neujahr=01-01, …); seasons = mid-season month (Frühling=Apr, Sommer=Jul, Herbst=Oct, Winter=Jan). | +| OQ-02 ✅ | Date ranges: start only, or start+end? | Sorting/display of ~315 range values. | REQ-DATE-02, REQ-DATE-07 | **Confirmed (updated #670):** store **start** in `date_iso`, precision `RANGE`, full text in `date_raw`, **and the resolved end day in `date_end`** for intra-month day ranges. A half-resolved range (start parsed, end impossible) keeps `date_end` empty and is flagged `range_end_unparsed`. | +| OQ-03 ✅ | `person_id` format. | Stability across re-runs; diffability. | §6 | **Confirmed:** readable slug `lastname-firstname`, numeric suffix on collision. 
| +| OQ-04 ✅ | `x`-suffix row handling. | 42 rows. | REQ-TRIAGE-03 | **Resolved (2026-05-25):** `x` rows are transcriptions of the base letter but not yet mappable → **skip this pass**, log to `review/skipped-x-suffix.csv` for later linking. | +| OQ-05 ✅ | Importer output format. | Phase-2 reader. | B11 | **Confirmed:** `.xlsx` (openpyxl-native, headered). | +| OQ-06 ✅ | Fuzzy-match policy. | False-positive person merges (R2). | REQ-DEDUP-02 | **Confirmed:** conservative — report all fuzzy matches; no silent merge. | + +*All open questions resolved as of 2026-05-25. New ambiguities discovered during build go here.* + +--- + +## 10. Glossary & Worked Examples + +**Precision** — how exactly a date is known (`DAY` … `UNKNOWN`). **Provisional person** — a +person created from a document name string with no register match. **Alias index** — map from +every known surface form of a name to a canonical `person_id`. **Override** — a +human-supplied correction applied deterministically on each run. + +**Date examples → expected outcome:** + +| `date_raw` | `date_iso` | `date_precision` | +| --- | --- | --- | +| `15.2.1888` | 1888-02-15 | DAY | +| `6.März 1888` | 1888-03-06 | DAY | +| `22.III.18` | 1918-03-22 | DAY | +| `13.5.09` | 1909-05-13 | DAY | +| `10.Oct.95` | 1895-10-10 | DAY | +| `17/6. 1916` | 1916-06-17 | DAY | +| `Mai 1895` | 1895-05-01 | MONTH | +| `Pfingsten 1922` | 1922-06-04 | DAY (computed: Easter 1922 = Apr 16, +49 days) | +| `Herbst 1913` | 1913-10-01 | SEASON | +| `1905` | 1905-01-01 | YEAR | +| `8.1.1916 - 15.3.1916` | 1916-01-08 | RANGE | +| `17.Nov (?) 
1887` | 1887-11-17 | APPROX | +| `?` | *(empty)* | UNKNOWN | + +**Name examples → expected outcome:** + +| raw cell | resolves to | +| --- | --- | +| `Eugenie Müller` (+ register `geb Müller`) | `de-gruyter-eugenie` (matched via maiden alias) | +| `Eugenie de Gruyter` | `de-gruyter-eugenie` | +| `Herbert u Clara` | `cram-herbert` + `cram-clara` (split, surname distributed) | +| `Hedi und Tutu (Gruber)` | `gruber-hedi` + `gruber-tutu` | +| `Ella Anita` | → `review/ambiguous-receivers.csv` (not auto-split) | +| `Hans Wittkopf` (not in register) | provisional `wittkopf-hans` | diff --git a/docs/import-migration/03-normalizer-implementation-plan.md b/docs/import-migration/03-normalizer-implementation-plan.md new file mode 100644 index 00000000..8d2f8428 --- /dev/null +++ b/docs/import-migration/03-normalizer-implementation-plan.md @@ -0,0 +1,2281 @@ +# Import Normalizer Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build an offline Python tool that turns the raw family-archive spreadsheets into a clean, canonical dataset (`canonical-documents.xlsx`, `canonical-persons.xlsx`) plus review CSVs, with a deterministic overrides-and-rerun loop. + +**Architecture:** A standalone Python package at `tools/import-normalizer/`. Pure, independently-testable units — date parsing (`dates.py`), person/register logic (`persons.py`), spreadsheet ingest (`ingest.py`), row mapping (`documents.py`) — are orchestrated by `normalize.py`. Source workbooks are read-only; all tunables live in `config.py`. Residue (unparseable dates, unmatched names) is reported to `review/*.csv` and corrected via version-controlled `overrides/*.csv` applied on each run. + +**Tech Stack:** Python 3.12, `openpyxl` (xlsx read/write), `pytest`. 
No third-party fuzzy library — `difflib` (stdlib) provides *suggestions only* (never auto-applied), per the conservative-matching requirement. + +**Spec:** [`02-normalization-spec.md`](./02-normalization-spec.md). Requirement IDs (`FR-*`, `REQ-*`, `NFR-*`) referenced per task. + +--- + +## File Structure + +``` +tools/import-normalizer/ +├── config.py # paths, header maps, century rule, season/feast tables, month tables, matching config +├── dates.py # Easter computus, feast/season resolution, year expansion, parse_date() +├── persons.py # slug, Person, parse_register(), split_receivers(), AliasIndex, ResolutionContext +├── ingest.py # read_sheet(), build_header_map() +├── documents.py # RawRow, extract_row(), triage helpers, CanonicalDocument, to_canonical() +├── writers.py # write_documents_xlsx(), write_persons_xlsx(), write_review_csv(), write_summary() +├── overrides.py # load_overrides() +├── normalize.py # main() orchestrator + CLI +├── requirements.txt +├── .gitignore # .venv/ out/ review/ __pycache__/ +├── README.md +├── overrides/ +│ ├── dates.csv # seed header: raw,iso,precision +│ └── names.csv # seed header: raw,person_id +└── tests/ + ├── __init__.py + ├── test_config.py + ├── test_dates.py + ├── test_persons.py + ├── test_ingest.py + ├── test_documents.py + ├── test_writers.py + └── test_normalize.py +``` + +**Test command convention** (per the "never run the full suite" rule — run targeted files): +`tools/import-normalizer/.venv/bin/python -m pytest tools/import-normalizer/tests/test_X.py -v` + +All `git` commands assume CWD = repo root and the current branch `docs/import-migration`. 
+ +--- + +### Task 1: Project scaffold, venv, config constants + +**Files:** +- Create: `tools/import-normalizer/requirements.txt` +- Create: `tools/import-normalizer/.gitignore` +- Create: `tools/import-normalizer/config.py` +- Create: `tools/import-normalizer/tests/__init__.py` +- Create: `tools/import-normalizer/tests/test_config.py` + +- [ ] **Step 1: Create `requirements.txt`** (pinned — an openpyxl minor bump can change xlsx serialization and break determinism, NFR-IDEM-01) + +``` +openpyxl==3.1.5 +pytest==8.3.4 +``` + +- [ ] **Step 2: Create the tool-local `.gitignore`** + +``` +.venv/ +out/ +review/ +__pycache__/ +*.pyc +``` + +- [ ] **Step 2b: Harden the repo-root `.gitignore`** (the root file currently has no venv pattern — that is how `ocr-service/.venv` got committed; prevent the whole class). Append these lines to `/home/marcel/Desktop/familienarchiv/.gitignore` if not already present: + +``` +**/.venv/ +**/__pycache__/ +*.pyc +``` +(Cleaning up the *already-committed* `ocr-service/.venv` via `git rm -r --cached ocr-service/.venv` is a separate task — do NOT bundle it into this branch.) + +- [ ] **Step 3: Create `config.py`** + +```python +"""Tunables for the import normalizer. 
No logic here — only data tables.""" +from pathlib import Path + +# --- Paths --- +BASE_DIR = Path(__file__).resolve().parent +REPO_ROOT = BASE_DIR.parent.parent +IMPORT_DIR = REPO_ROOT / "import" + +DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx" +DOCUMENT_SHEET = "Familienarchiv" +PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx" +PERSON_SHEET = "Tabelle1" + +OUT_DIR = BASE_DIR / "out" +REVIEW_DIR = BASE_DIR / "review" +OVERRIDES_DIR = BASE_DIR / "overrides" + +# --- Header text (lowercased, whitespace-collapsed) -> canonical field --- +DOCUMENT_HEADER_MAP = { + "index": "index", + "datei": "file", + "box": "box", + "mappe": "folder", + "briefeschreiberin": "sender", + "empfängerin": "receivers", + "datum des briefes": "date", + "ort": "location", + "schlagwort": "tags", + "inhalt": "summary", +} +DOCUMENT_REQUIRED_FIELDS = {"index"} + +PERSON_HEADER_MAP = { + "generation": "generation", + "familienname": "last_name", + "vorname": "first_name", + "geb als": "maiden_name", + "geburtsdatum": "birth_date", + "geburtsort": "birth_place", + "todesdatum": "death_date", + "sterbeort": "death_place", + "verheiratet mit": "spouse", + "bemerkung": "notes", +} +PERSON_REQUIRED_FIELDS = {"last_name"} + +# --- Century rule (archive 1873–1957) --- +TWO_DIGIT_19XX_MAX = 57 # 00..57 -> 1900+yy +TWO_DIGIT_18XX_MIN = 73 # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN + +# --- Seasons -> representative month (day = 1) --- +SEASON_MONTHS = { + "frühling": 4, "fruehling": 4, "frühjahr": 4, "fruehjahr": 4, + "sommer": 7, "herbst": 10, "winter": 1, +} + +# --- Fixed feasts -> (month, day) --- +FIXED_FEASTS = { + "neujahr": (1, 1), + "heiligabend": (12, 24), "heiliger abend": (12, 24), + "weihnachten": (12, 25), "weihnacht": (12, 25), "1. 
weihnachtstag": (12, 25), + "silvester": (12, 31), "sylvester": (12, 31), +} + +# --- Movable feasts -> day offset from Easter Sunday --- +MOVABLE_FEASTS = { + "karfreitag": -2, + "ostern": 0, "ostersonntag": 0, "ostermontag": 1, + "himmelfahrt": 39, "christi himmelfahrt": 39, + "pfingsten": 49, "pfingstsonntag": 49, "pfingstmontag": 50, + "fronleichnam": 60, +} + +# --- Month names -> number (German + English, full + abbreviations) --- +MONTHS = { + "januar": 1, "jan": 1, "january": 1, + "februar": 2, "feb": 2, "febr": 2, "february": 2, + "märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3, + "april": 4, "apr": 4, + "mai": 5, "may": 5, + "juni": 6, "jun": 6, "june": 6, + "juli": 7, "jul": 7, "july": 7, + "august": 8, "aug": 8, + "september": 9, "sep": 9, "sept": 9, + "oktober": 10, "okt": 10, "oct": 10, "october": 10, + "november": 11, "nov": 11, + "dezember": 12, "dez": 12, "dec": 12, "december": 12, +} + +ROMAN_MONTHS = { + "i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6, + "vii": 7, "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12, +} + +# --- Person matching --- +KNOWN_LAST_NAMES = [ + "von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa", + "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram", +] +FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied +``` + +- [ ] **Step 4: Create empty `tests/__init__.py`** (empty file). 
+ +- [ ] **Step 5: Write `tests/test_config.py`** + +```python +import config + +def test_century_boundaries(): + assert config.TWO_DIGIT_19XX_MAX == 57 + assert config.TWO_DIGIT_18XX_MIN == 73 + +def test_header_maps_cover_required_fields(): + assert "index" in config.DOCUMENT_HEADER_MAP.values() + assert "last_name" in config.PERSON_HEADER_MAP.values() + +def test_feast_tables_present(): + assert config.MOVABLE_FEASTS["pfingsten"] == 49 + assert config.SEASON_MONTHS["herbst"] == 10 +``` + +- [ ] **Step 6: Create the venv and install deps** + +Run: +```bash +cd tools/import-normalizer && python3 -m venv .venv && .venv/bin/pip install -r requirements.txt && cd - +``` +Expected: openpyxl + pytest install successfully. + +- [ ] **Step 7: Run the config test** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py -v && cd -` +Expected: 3 passed. (Tests import `config` directly, so pytest must run with CWD = the tool dir; `conftest.py` is unnecessary because the modules are flat in that dir.) 
+ +- [ ] **Step 8: Commit** + +```bash +git add .gitignore tools/import-normalizer/requirements.txt tools/import-normalizer/.gitignore tools/import-normalizer/config.py tools/import-normalizer/tests/__init__.py tools/import-normalizer/tests/test_config.py +git commit -m "feat(normalizer): scaffold tool + config tables" +``` + +--- + +### Task 2: Easter computus (`REQ-DATE-06`) + +**Files:** +- Create: `tools/import-normalizer/dates.py` +- Create: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Write the failing test** in `tests/test_dates.py` + +```python +import datetime +import dates + +def test_easter_known_years(): + # Anonymous Gregorian algorithm — verified against published tables + assert dates.easter(2024) == datetime.date(2024, 3, 31) + assert dates.easter(2000) == datetime.date(2000, 4, 23) + assert dates.easter(1922) == datetime.date(1922, 4, 16) + assert dates.easter(1888) == datetime.date(1888, 4, 1) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_easter_known_years -v && cd -` +Expected: FAIL with `ModuleNotFoundError: No module named 'dates'` or `AttributeError: module 'dates' has no attribute 'easter'`. 
+ +- [ ] **Step 3: Create `dates.py` with the computus** + +```python +"""Tolerant historical date parsing for the family archive.""" +import datetime + + +def easter(year: int) -> datetime.date: + """Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm.""" + a = year % 19 + b = year // 100 + c = year % 100 + d = b // 4 + e = b % 4 + f = (b + 8) // 25 + g = (b - f + 1) // 3 + h = (19 * a + b - d - g + 15) % 30 + i = c // 4 + k = c % 4 + l = (32 + 2 * e + 2 * i - h - k) % 7 + m = (a + 11 * h + 22 * l) // 451 + month = (h + l - 7 * m + 114) // 31 + day = ((h + l - 7 * m + 114) % 31) + 1 + return datetime.date(year, month, day) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_easter_known_years -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): Easter computus" +``` + +--- + +### Task 3: Feast & season resolution (`REQ-DATE-02`, `REQ-DATE-06`) + +**Files:** +- Modify: `tools/import-normalizer/dates.py` +- Modify: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Add the failing test** to `tests/test_dates.py` + +```python +from dates import Precision + +def test_resolve_feast_movable(): + assert dates.resolve_feast_or_season("Pfingsten", 1922) == ("1922-06-04", Precision.DAY) + assert dates.resolve_feast_or_season("Ostern", 2024) == ("2024-03-31", Precision.DAY) + assert dates.resolve_feast_or_season("Pfingstmontag", 1922) == ("1922-06-05", Precision.DAY) + +def test_resolve_feast_fixed(): + assert dates.resolve_feast_or_season("Weihnachten", 1900) == ("1900-12-25", Precision.DAY) + assert dates.resolve_feast_or_season("Neujahr", 1910) == ("1910-01-01", Precision.DAY) + +def test_resolve_season(): + assert dates.resolve_feast_or_season("Herbst", 1913) == ("1913-10-01", Precision.SEASON) + assert 
dates.resolve_feast_or_season("Sommer", 1910) == ("1910-07-01", Precision.SEASON) + +def test_resolve_unknown_token_returns_none(): + assert dates.resolve_feast_or_season("Freitag", 1919) is None +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -k "feast or season" -v && cd -` +Expected: FAIL — `Precision` and `resolve_feast_or_season` not defined. + +- [ ] **Step 3: Implement** — add to `dates.py` (top imports + new code) + +```python +from enum import StrEnum +import config + + +class Precision(StrEnum): + DAY = "DAY" + MONTH = "MONTH" + SEASON = "SEASON" + YEAR = "YEAR" + RANGE = "RANGE" + APPROX = "APPROX" + UNKNOWN = "UNKNOWN" + + +def _advent_sunday(year: int, n: int) -> datetime.date: + """n-th Advent (1..4). 4th Advent = last Sunday on/before Dec 24.""" + dec24 = datetime.date(year, 12, 24) + back_to_sunday = (dec24.weekday() - 6) % 7 # Mon=0..Sun=6 + fourth = dec24 - datetime.timedelta(days=back_to_sunday) + return fourth - datetime.timedelta(days=(4 - n) * 7) + + +def resolve_feast_or_season(token: str, year: int): + """Return (iso, Precision) for a known feast/season token, else None.""" + key = " ".join(token.lower().split()).strip(" .") + if key in config.MOVABLE_FEASTS: + d = easter(year) + datetime.timedelta(days=config.MOVABLE_FEASTS[key]) + return d.isoformat(), Precision.DAY + if key in config.FIXED_FEASTS: + month, day = config.FIXED_FEASTS[key] + return datetime.date(year, month, day).isoformat(), Precision.DAY + advent = {"1. advent": 1, "2. advent": 2, "3. advent": 3, "4. 
advent": 4, "advent": 1} + if key in advent: + return _advent_sunday(year, advent[key]).isoformat(), Precision.DAY + if key in config.SEASON_MONTHS: + return datetime.date(year, config.SEASON_MONTHS[key], 1).isoformat(), Precision.SEASON + return None +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -k "feast or season or unknown_token" -v && cd -` +Expected: PASS (all 4). (Pfingstmontag 1922 = Easter Apr 16 + 50 = June 5.) + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): feast + season resolution" +``` + +--- + +### Task 4: Year expansion / century rule (`REQ-DATE-03`) + +**Files:** +- Modify: `tools/import-normalizer/dates.py` +- Modify: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Add the failing test** + +```python +def test_expand_year(): + assert dates.expand_year("1888") == 1888 + assert dates.expand_year("889") == 1889 # 3-digit -> 1DDD + assert dates.expand_year("923") == 1923 + assert dates.expand_year("08") == 1908 # 00..57 -> 19xx + assert dates.expand_year("17") == 1917 + assert dates.expand_year("57") == 1957 + assert dates.expand_year("73") == 1873 # 73..99 -> 18xx + assert dates.expand_year("99") == 1899 + assert dates.expand_year("65") is None # 58..72 ambiguous + assert dates.expand_year("x") is None +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_expand_year -v && cd -` +Expected: FAIL — `expand_year` not defined. + +- [ ] **Step 3: Implement** — add to `dates.py` + +```python +def expand_year(token: str): + """Expand a 2/3/4-digit year string per the 1873–1957 century rule. 
None if ambiguous.""" + token = token.strip() + if not token.isdigit(): + return None + n, v = len(token), int(token) + if n == 4: + return v + if n == 3: + return 1000 + v + if n == 2: + if v <= config.TWO_DIGIT_19XX_MAX: + return 1900 + v + if v >= config.TWO_DIGIT_18XX_MIN: + return 1800 + v + return None + return None +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_expand_year -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): year expansion century rule" +``` + +--- + +### Task 5: `parse_date` dispatch + ISO + numeric forms (`FR-DATE`, `REQ-DATE-01/04/05`) + +**Files:** +- Modify: `tools/import-normalizer/dates.py` +- Modify: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Add failing tests** + +```python +def test_parse_iso_and_empty(): + assert dates.parse_date("1910-04-23") == dates.ParsedDate("1910-04-23", Precision.DAY, "1910-04-23") + assert dates.parse_date("") == dates.ParsedDate(None, Precision.UNKNOWN, "") + assert dates.parse_date("?") == dates.ParsedDate(None, Precision.UNKNOWN, "?") + +def test_parse_numeric_forms(): + assert dates.parse_date("15.2.1888").iso == "1888-02-15" + assert dates.parse_date("13.5.09").iso == "1909-05-13" + assert dates.parse_date("17/6. 1916").iso == "1916-06-17" + assert dates.parse_date("11.10.08").iso == "1908-10-11" + assert dates.parse_date("30.1.889").iso == "1889-01-30" + assert dates.parse_date("15.2.1888").precision == Precision.DAY + +def test_parse_numeric_unparseable(): + assert dates.parse_date("8.9.").precision == Precision.UNKNOWN # no year + assert dates.parse_date("13.5.65").precision == Precision.UNKNOWN # ambiguous 2-digit year + +def test_parse_approx_marker_upgrades_precision(): + r = dates.parse_date("17.Nov (?) 
1887") # month-name handled in a later task; here just the marker path + # after the marker is detected, a parsed date becomes APPROX (verified fully in Task 8) + assert r.raw == "17.Nov (?) 1887" +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -k "parse_" -v && cd -` +Expected: FAIL — `ParsedDate` / `parse_date` not defined. + +- [ ] **Step 3: Implement** — add to `dates.py` + +```python +import re +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ParsedDate: + iso: str | None + precision: Precision + raw: str + + +_LEADING_MARKERS = re.compile( + r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I) + + +def _preprocess(raw: str): + """Return (cleaned_string, approx_flag).""" + s = (raw or "").strip() + if not s: + return "", False + low = s.lower() + approx = ("?" in s) or any( + m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich")) + s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)" + s = s.replace("?", " ") + s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. Brief") + s = _LEADING_MARKERS.sub("", s) + s = re.sub(r"\s+", " ", s).strip(" .,") + return s, approx + + +_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})") + + +def _match_iso(s): + if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s): + try: + datetime.date.fromisoformat(s) + return s, Precision.DAY + except ValueError: + return None + return None + + +def _match_numeric(s): + m = _NUM_RE.fullmatch(s) + if not m: + return None + day, month = int(m.group(1)), int(m.group(2)) + year = expand_year(m.group(3)) + if year is None or not (1 <= month <= 12): + return None + try: + return datetime.date(year, month, day).isoformat(), Precision.DAY + except ValueError: + return None + + +# Matchers are tried in order. Later tasks append to this list. 
+_MATCHERS = [_match_iso, _match_numeric] + + +def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate: + if date_overrides: + key = (raw or "").strip() + if key in date_overrides: + iso, prec = date_overrides[key] + return ParsedDate(iso or None, Precision(prec), raw) + cleaned, approx = _preprocess(raw) + if not cleaned: + return ParsedDate(None, Precision.UNKNOWN, raw) + for matcher in _MATCHERS: + result = matcher(cleaned) + if result: + iso, precision = result + if approx: + precision = Precision.APPROX + return ParsedDate(iso, precision, raw) + return ParsedDate(None, Precision.UNKNOWN, raw) +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -k "parse_" -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): parse_date dispatch + iso/numeric matchers" +``` + +--- + +### Task 6: Roman-numeral month matcher + +**Files:** +- Modify: `tools/import-normalizer/dates.py` +- Modify: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Add failing test** + +```python +def test_parse_roman_months(): + assert dates.parse_date("22.III.18").iso == "1918-03-22" + assert dates.parse_date("19.XII.1954").iso == "1954-12-19" + assert dates.parse_date("1.III.27").iso == "1927-03-01" + assert dates.parse_date("22.III.18").precision == Precision.DAY +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_parse_roman_months -v && cd -` +Expected: FAIL — Roman dates currently fall through to UNKNOWN. 
+ +- [ ] **Step 3: Implement** — add to `dates.py` and register the matcher + +```python +_ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I) + + +def _match_roman(s): + m = _ROMAN_RE.fullmatch(s) + if not m: + return None + day = int(m.group(1)) + month = config.ROMAN_MONTHS.get(m.group(2).lower()) + year = expand_year(m.group(3)) + if not month or year is None: + return None + try: + return datetime.date(year, month, day).isoformat(), Precision.DAY + except ValueError: + return None +``` + +Then change the matcher list line to: +```python +_MATCHERS = [_match_iso, _match_numeric, _match_roman] +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_parse_roman_months -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): roman-numeral month matcher" +``` + +--- + +### Task 7: Month-name matchers (day-first + English month-first) + +**Files:** +- Modify: `tools/import-normalizer/dates.py` +- Modify: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Add failing tests** + +```python +def test_parse_monthname_day_first(): + assert dates.parse_date("6.März 1888").iso == "1888-03-06" + assert dates.parse_date("29.Sept.1891").iso == "1891-09-29" + assert dates.parse_date("10.Oct.95").iso == "1895-10-10" + assert dates.parse_date("9.December1889").iso == "1889-12-09" + assert dates.parse_date("18.Dez.1916").iso == "1916-12-18" + assert dates.parse_date("4Dezember 1936").iso == "1936-12-04" + assert dates.parse_date("25 August 1968").iso == "1968-08-25" + +def test_parse_monthname_english_month_first(): + assert dates.parse_date("April 12. 1922").iso == "1922-04-12" + assert dates.parse_date("Oct.5. 
1916").iso == "1916-10-05" +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -k monthname -v && cd -` +Expected: FAIL. + +- [ ] **Step 3: Implement** — add to `dates.py`. `_match_monthname_a` is day-first; `_match_monthname_b` is English month-first. + +```python +_MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})") +_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\.?\s*(\d{2,4})") + + +def _lookup_month(token: str): + return config.MONTHS.get(token.lower().strip(" .")) + + +def _build_day_month_year(day, month, year): + if not month or year is None or not (1 <= month <= 12): + return None + try: + return datetime.date(year, month, day).isoformat(), Precision.DAY + except ValueError: + return None + + +def _match_monthname_a(s): + m = _MONTH_A_RE.fullmatch(s) + if not m: + return None + return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3))) + + +def _match_monthname_b(s): + m = _MONTH_B_RE.fullmatch(s) + if not m: + return None + return _build_day_month_year(int(m.group(2)), _lookup_month(m.group(1)), expand_year(m.group(3))) +``` + +Then update the matcher list (order matters — `_match_monthname_a` is day-first and safe to place before the month/year matcher; `_match_monthname_b` goes *after* the month/year matcher added in Task 8, so for now append only `_a`): +```python +_MATCHERS = [_match_iso, _match_numeric, _match_roman, _match_monthname_a] +``` + +- [ ] **Step 4: Run — expect `_a` cases to pass, `_b` (English) still failing** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_parse_monthname_day_first -v && cd -` +Expected: PASS. 
+ +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py::test_parse_monthname_english_month_first -v && cd -` +Expected: FAIL (`_match_monthname_b` not yet registered — it is wired in Task 8 to sit after the month/year matcher so it doesn't shadow `Mai 1895`). + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): day-first month-name matcher" +``` + +--- + +### Task 8: Month/year, feast/season, year-only, range matchers + final ordering + overrides + +**Files:** +- Modify: `tools/import-normalizer/dates.py` +- Modify: `tools/import-normalizer/tests/test_dates.py` + +- [ ] **Step 1: Add failing tests** + +```python +def test_parse_month_year_year_only(): + assert dates.parse_date("Mai 1895") == dates.ParsedDate("1895-05-01", Precision.MONTH, "Mai 1895") + assert dates.parse_date("October 1903").iso == "1903-10-01" + assert dates.parse_date("1905") == dates.ParsedDate("1905-01-01", Precision.YEAR, "1905") + +def test_parse_feast_and_season_via_parse_date(): + assert dates.parse_date("Pfingsten 1922") == dates.ParsedDate("1922-06-04", Precision.DAY, "Pfingsten 1922") + assert dates.parse_date("Herbst 1913") == dates.ParsedDate("1913-10-01", Precision.SEASON, "Herbst 1913") + assert dates.parse_date("Pfingstsonntag 1915").precision == Precision.DAY + +def test_parse_ranges(): + assert dates.parse_date("8.1.1916 - 15.3.1916") == dates.ParsedDate("1916-01-08", Precision.RANGE, "8.1.1916 - 15.3.1916") + assert dates.parse_date("1881/82") == dates.ParsedDate("1881-01-01", Precision.RANGE, "1881/82") + assert dates.parse_date("1945/46?").iso == "1945-01-01" # '?' stripped -> RANGE, then APPROX + assert dates.parse_date("1945/46?").precision == Precision.APPROX + +def test_parse_approx_full(): + r = dates.parse_date("17.Nov (?) 
1887") + assert r.iso == "1887-11-17" + assert r.precision == Precision.APPROX + +def test_parse_english_month_first_now_works(): + assert dates.parse_date("April 12. 1922").iso == "1922-04-12" + assert dates.parse_date("Mai 1895").iso == "1895-05-01" # not shadowed by month-first matcher + +def test_parse_unparseable_examples(): + assert dates.parse_date("Freitag 1919").precision == Precision.UNKNOWN + +def test_parse_invalid_calendar_date_is_unknown(): + # try/except ValueError in the matchers must route impossible dates to UNKNOWN (-> review), + # never silently clamp. This is the most likely real-data bug class at 7,600 rows. + assert dates.parse_date("30.2.1888").precision == Precision.UNKNOWN + assert dates.parse_date("31.4.1916").precision == Precision.UNKNOWN + +def test_parse_intra_month_day_range(): + # "7./8. Sept.1923" -> start day, RANGE. Must NOT be confused with slash-date "17/6. 1916". + assert dates.parse_date("7./8. Sept.1923") == dates.ParsedDate("1923-09-07", Precision.RANGE, "7./8. Sept.1923") + assert dates.parse_date("17/6. 1916") == dates.ParsedDate("1916-06-17", Precision.DAY, "17/6. 1916") + +def test_parse_trailing_note_stripped_but_raw_preserved(): + r = dates.parse_date("17.Nov 1887, 2. Brief") # REQ-DATE-04 + assert r.iso == "1887-11-17" + assert "2. Brief" in r.raw # original string preserved verbatim +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -k "month_year or feast_and_season or ranges or approx_full or english_month_first_now or unparseable_examples" -v && cd -` +Expected: FAIL. + +- [ ] **Step 3: Implement** — add matchers to `dates.py` + +```python +_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})") +_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})") +_YEAR_ONLY_RE = re.compile(r"\d{4}") +_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}") +_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*") +# Intra-month day range, e.g. 
"7./8. Sept.1923" — require a dot before the slash so it +# does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/"). +_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)") + + +def _match_month_year(s): + m = _MONTH_YEAR_RE.fullmatch(s) + if not m: + return None + month = _lookup_month(m.group(1)) + year = expand_year(m.group(2)) + if not month or year is None: + return None + return datetime.date(year, month, 1).isoformat(), Precision.MONTH + + +def _match_feast_season(s): + m = _TOKEN_YEAR_RE.fullmatch(s) + if not m: + return None + year = expand_year(m.group(2)) + if year is None: + return None + return resolve_feast_or_season(m.group(1), year) + + +def _match_year_only(s): + if _YEAR_ONLY_RE.fullmatch(s): + return datetime.date(int(s), 1, 1).isoformat(), Precision.YEAR + return None + + +def _match_range(s): + m = _RANGE_YY_RE.fullmatch(s) + if m: + return datetime.date(int(m.group(1)), 1, 1).isoformat(), Precision.RANGE + m = _RANGE_DAY_RE.fullmatch(s) + if m: + first = f"{m.group(1)}.{m.group(3)}" # "7." + "Sept.1923" -> "7.Sept.1923" + for matcher in (_match_numeric, _match_monthname_a): + r = matcher(first) + if r: + return r[0], Precision.RANGE + m = _RANGE_HYPHEN_RE.fullmatch(s) + if m: + start = m.group(1).strip() + for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only): + r = matcher(start) + if r: + return r[0], Precision.RANGE + return None +``` + +Then replace the matcher list with the final ordering: +```python +_MATCHERS = [ + _match_iso, + _match_range, + _match_numeric, + _match_roman, + _match_monthname_a, + _match_month_year, + _match_monthname_b, + _match_feast_season, + _match_year_only, +] +``` + +- [ ] **Step 4: Run the full date test file** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -v && cd -` +Expected: PASS (all tests, including the English month-first test from Task 7). 
+ +- [ ] **Step 5: Add an overrides test, then commit** + +Append to `tests/test_dates.py`: +```python +def test_parse_date_override_wins(): + ovr = {"13.5.65": ("1965-05-13", "DAY")} + r = dates.parse_date("13.5.65", ovr) # ambiguous without override + assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65") +``` +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_dates.py -v && cd -` +Expected: PASS. + +```bash +git add tools/import-normalizer/dates.py tools/import-normalizer/tests/test_dates.py +git commit -m "feat(normalizer): month/year, feast/season, range matchers + overrides" +``` + +--- + +### Task 9: Person register parsing (`FR-PERS`, US-PERS-01) + +**Files:** +- Create: `tools/import-normalizer/persons.py` +- Create: `tools/import-normalizer/tests/test_persons.py` + +- [ ] **Step 1: Write the failing test** in `tests/test_persons.py` + +```python +import persons + +def test_slugify(): + assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie" + assert persons.slugify("Müller", "Karl Erhard") == "mueller-karl-erhard" + +def test_parse_register_basic(): + rows = [ + {"generation": "G 1", "last_name": "Blomquist", "first_name": "Charlotte,Meta,Jacobi", + "maiden_name": "Ruge", "birth_date": "30.8.1862", "birth_place": "Schülperneusiel", + "death_date": "1934-07-23", "death_place": "Göteborg", "spouse": '"Tante Lolly"', + "notes": "Schwester v Marie Cram"}, + {"generation": "G 2", "last_name": "Bohrmann", "first_name": "Else", + "maiden_name": "Cram", "birth_date": "28.11.1888", "spouse": "Ludwig Bohrmann", + "notes": "Schwester v Herbert"}, + ] + people = persons.parse_register(rows) + p = people[0] + assert p.person_id == "blomquist-charlotte" + assert p.first_name == "Charlotte" + assert p.maiden_name == "Ruge" + assert p.birth_date == "1862-08-30" + assert p.nickname == "Tante Lolly" # quoted spouse field is a nickname, not a spouse + assert p.spouse == "" + assert "Meta" in p.extra_given_names and 
"Jacobi" in p.extra_given_names + p2 = people[1] + assert p2.maiden_name == "Cram" + assert p2.spouse == "Ludwig Bohrmann" + assert p2.provisional is False +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -` +Expected: FAIL — `persons` module / symbols not defined. + +- [ ] **Step 3: Implement `persons.py`** + +```python +"""Person register parsing, name splitting, alias resolution.""" +import re +import unicodedata +from dataclasses import dataclass, field + +import config +import dates + +_DIACRITIC_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss", + "Ä": "ae", "Ö": "oe", "Ü": "ue"}) + + +def _strip_accents(s: str) -> str: + s = s.translate(_DIACRITIC_MAP) + s = unicodedata.normalize("NFKD", s) + return "".join(c for c in s if not unicodedata.combining(c)) + + +def slugify(last: str, first: str) -> str: + raw = f"{last} {first}".strip() + raw = _strip_accents(raw).lower() + raw = re.sub(r"[^a-z0-9]+", "-", raw).strip("-") + return raw or "unknown" + + +@dataclass +class Person: + person_id: str + last_name: str = "" + first_name: str = "" + maiden_name: str = "" + title: str = "" + nickname: str = "" + extra_given_names: list = field(default_factory=list) + birth_date: str | None = None + birth_date_raw: str = "" + birth_place: str = "" + death_date: str | None = None + death_date_raw: str = "" + death_place: str = "" + spouse: str = "" + generation: str = "" + notes: str = "" + aliases: list = field(default_factory=list) + provisional: bool = False + + +_QUOTED_RE = re.compile(r'^[“"\']\s*(.+?)\s*[”"\']$') + + +def parse_register(rows: list[dict]) -> list[Person]: + people = [] + for r in rows: + last = (r.get("last_name") or "").strip() + if not last: + continue + given_raw = (r.get("first_name") or "").strip() + givens = [g.strip() for g in given_raw.split(",") if g.strip()] + first = givens[0] if givens else "" + extra = givens[1:] + + spouse_raw = 
(r.get("spouse") or "").strip() + nickname = "" + m = _QUOTED_RE.match(spouse_raw) + if m: + nickname = m.group(1) + spouse_raw = "" + + birth = dates.parse_date(r.get("birth_date") or "") + death = dates.parse_date(r.get("death_date") or "") + people.append(Person( + person_id=slugify(last, first), + last_name=last, first_name=first, maiden_name=(r.get("maiden_name") or "").strip(), + nickname=nickname, extra_given_names=extra, + birth_date=birth.iso, birth_date_raw=(r.get("birth_date") or "").strip(), birth_place=(r.get("birth_place") or "").strip(), + death_date=death.iso, death_date_raw=(r.get("death_date") or "").strip(), death_place=(r.get("death_place") or "").strip(), + spouse=spouse_raw, generation=(r.get("generation") or "").strip(), + notes=(r.get("notes") or "").strip(), provisional=False, + )) + # De-duplicate colliding ids with numeric suffix + seen = {} + for p in people: + if p.person_id in seen: + seen[p.person_id] += 1 + p.person_id = f"{p.person_id}-{seen[p.person_id]}" + else: + seen[p.person_id] = 1 + return people +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -` +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py +git commit -m "feat(normalizer): person register parsing" +``` + +--- + +### Task 10: Receiver splitting (`REQ-PERS-01`, US-PERS-02 AC4) + +**Files:** +- Modify: `tools/import-normalizer/persons.py` +- Modify: `tools/import-normalizer/tests/test_persons.py` + +- [ ] **Step 1: Add failing tests** (ported from the Java `PersonNameParser` contract) + +```python +def test_split_receivers(): + assert persons.split_receivers("Eugenie Müller") == ["Eugenie Müller"] + assert persons.split_receivers("Walter und Eugenie de Gruyter") == ["Walter de Gruyter", "Eugenie de Gruyter"] + assert persons.split_receivers("Hedi und Tutu (Gruber)") == ["Hedi Gruber", "Tutu Gruber"] + assert persons.split_receivers("Clara u Familie") == ["Clara"] + assert persons.split_receivers("Eugenie de Gruyter geb. Müller") == ["Eugenie de Gruyter"] + assert persons.split_receivers("Herbert u Clara") == ["Herbert", "Clara"] + assert persons.split_receivers("") == [] + +def test_find_known_last_name(): + assert persons.find_known_last_name("Eugenie de Gruyter") == "de Gruyter" + assert persons.find_known_last_name("Clara") is None +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -k "split_receivers or known_last" -v && cd -` +Expected: FAIL. + +- [ ] **Step 3: Implement** — add to `persons.py` + +```python +_GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", re.I) +_PAREN_RE = re.compile(r"\(([^)]+)\)\s*$") +_MULTI_RE = re.compile(r"\s+(?:und|u)\s+", re.I) + + +def find_known_last_name(segment: str): + seg = segment.strip() + for ln in config.KNOWN_LAST_NAMES: # config lists longest-first + if seg == ln or seg.endswith(" " + ln): + return ln + return None + + +def split_receivers(raw: str) -> list[str]: + if not raw or not raw.strip(): + return [] + # 0. 
split on "//" + if "//" in raw: + out = [] + for seg in raw.split("//"): + out.extend(split_receivers(seg)) + return out + cleaned = _GEB_RE.sub("", raw).strip() + if not _MULTI_RE.search(cleaned): + return [cleaned] + shared_last = None + pm = _PAREN_RE.search(cleaned) + if pm: + shared_last = pm.group(1).strip() + cleaned = cleaned[:pm.start()].strip() + parts = [p.strip() for p in _MULTI_RE.split(cleaned)] + parts = [p for p in parts if p and p.lower() != "familie"] + if not parts: + return [] + if len(parts) == 1: + return [parts[0]] + if shared_last: + return [p if " " in p else f"{p} {shared_last}" for p in parts] + last_seg = parts[-1] + detected = find_known_last_name(last_seg) + if detected: + result = [] + for p in parts[:-1]: + if " " not in p and find_known_last_name(p) is None: + result.append(f"{p} {detected}") + else: + result.append(p) + result.append(last_seg) + return result + return parts +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -k "split_receivers or known_last" -v && cd -` +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py +git commit -m "feat(normalizer): receiver splitting" +``` + +--- + +### Task 11: Alias index (`FR-DEDUP`, REQ-DEDUP-01/02) + +**Files:** +- Modify: `tools/import-normalizer/persons.py` +- Modify: `tools/import-normalizer/tests/test_persons.py` + +- [ ] **Step 1: Add failing tests** (also add `import config` below `import persons` at the top of `tests/test_persons.py` — `test_alias_index_suggestion` reads `config.FUZZY_SUGGEST_THRESHOLD` and would otherwise fail with `NameError` even after Step 3) + +```python +def test_alias_index_resolves_maiden_and_married(): + people = persons.parse_register([ + {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"}, + {"last_name": "Cram", "first_name": "Clara"}, + ]) + idx = persons.AliasIndex(people) + eugenie = people[0].person_id + assert idx.resolve("Eugenie de Gruyter") == eugenie # canonical + assert idx.resolve("Eugenie Müller") == eugenie # maiden alias + assert idx.resolve("eugenie müller") == eugenie # normalized + assert idx.resolve("Nobody Unknown") is None + +def test_alias_index_suggestion(): + people = persons.parse_register([{"last_name": "Wittkopf", "first_name": "Hans"}]) + idx = persons.AliasIndex(people) + sid, score = idx.suggest("Hans Wittkop") # typo + assert sid == people[0].person_id and score >= config.FUZZY_SUGGEST_THRESHOLD +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -k alias -v && cd -` +Expected: FAIL — `AliasIndex` not defined. 
+ +- [ ] **Step 3: Implement** — add to `persons.py` + +```python +import difflib + + +def _norm(name: str) -> str: + return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip() + + +class AliasIndex: + def __init__(self, people: list[Person]): + self._by_alias: dict[str, str] = {} + self._display: dict[str, str] = {} + self.known_ids: set[str] = {p.person_id for p in people} + first_name_ids: dict[str, list] = {} + for p in people: + self._display[p.person_id] = f"{p.first_name} {p.last_name}".strip() + # Ordered, de-duplicated forms (NOT a set) so alias order is deterministic — NFR-IDEM-01. + forms = [f"{p.first_name} {p.last_name}".strip()] + if p.maiden_name: + forms.append(f"{p.first_name} {p.maiden_name}".strip()) + for extra in p.extra_given_names: + forms.append(f"{extra} {p.last_name}".strip()) + if p.nickname: + forms.append(p.nickname) + seen = set() + for form in forms: + if form in seen: + continue + seen.add(form) + key = _norm(form) + if key and key not in self._by_alias: + self._by_alias[key] = p.person_id + p.aliases.append(form) + if p.first_name: + ids = first_name_ids.setdefault(_norm(p.first_name), []) + if p.person_id not in ids: + ids.append(p.person_id) + # first-name-only alias, only when unambiguous + for fname, ids in first_name_ids.items(): + if len(ids) == 1 and fname not in self._by_alias: + self._by_alias[fname] = ids[0] + + def resolve(self, name: str): + return self._by_alias.get(_norm(name)) + + def display(self, person_id: str) -> str: + return self._display.get(person_id, "") + + def suggest(self, name: str): + keys = list(self._by_alias.keys()) + match = difflib.get_close_matches(_norm(name), keys, n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD) + if not match: + return None, 0.0 + score = difflib.SequenceMatcher(None, _norm(name), match[0]).ratio() + return self._by_alias[match[0]], score +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest 
tests/test_persons.py -k alias -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py +git commit -m "feat(normalizer): alias index with maiden/married/nickname resolution" +``` + +--- + +### Task 12: Spreadsheet ingest (`FR-INGEST`, `FR-MAP`, REQ-INGEST-01, REQ-MAP-01) + +**Files:** +- Create: `tools/import-normalizer/ingest.py` +- Create: `tools/import-normalizer/tests/test_ingest.py` + +- [ ] **Step 1: Write failing tests** (build a tiny workbook on disk with openpyxl) + +```python +import datetime +import openpyxl +import pytest +import ingest + +def _make_workbook(tmp_path, sheet_name, rows): + wb = openpyxl.Workbook() + ws = wb.active + ws.title = sheet_name + for r in rows: + ws.append(r) + path = tmp_path / "wb.xlsx" + wb.save(path) + return path + +def test_read_sheet_converts_cells(tmp_path): + path = _make_workbook(tmp_path, "S", [ + ["Index", "Datum"], + ["W-0001", datetime.datetime(1888, 2, 15)], + ["W-0002", 1], + ]) + rows = ingest.read_sheet(path, "S") + assert rows[0] == ["Index", "Datum"] + assert rows[1] == ["W-0001", "1888-02-15"] # Excel date -> ISO string + assert rows[2] == ["W-0002", "1"] # integer -> plain string + +def test_build_header_map_collapses_whitespace_and_case(): + header = ["Index", "Datum des Briefes", "EmpfängerIn", "Mystery"] + field_map = {"index": "index", "datum des briefes": "date", "empfängerin": "receivers"} + fields, unknown = ingest.build_header_map(header, field_map, required={"index"}) + assert fields == {"index": 0, "date": 1, "receivers": 2} + assert unknown == ["Mystery"] + +def test_build_header_map_missing_required_raises(): + with pytest.raises(ValueError, match="index"): + ingest.build_header_map(["Box", "Ort"], {"box": "box", "ort": "location"}, required={"index"}) +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_ingest.py -v && cd 
-` +Expected: FAIL — `ingest` not defined. + +- [ ] **Step 3: Implement `ingest.py`** + +```python +"""Read .xlsx sheets into neutral list[list[str]] and map headers to fields.""" +import datetime +from pathlib import Path +import openpyxl + + +def _cell_to_str(value) -> str: + if value is None: + return "" + if isinstance(value, datetime.datetime): + return value.date().isoformat() + if isinstance(value, datetime.date): + return value.isoformat() + if isinstance(value, float) and value.is_integer(): + return str(int(value)) + if isinstance(value, int): + return str(value) + return str(value).strip() + + +def read_sheet(path: Path, sheet_name: str) -> list[list[str]]: + wb = openpyxl.load_workbook(path, read_only=True, data_only=True) + if sheet_name not in wb.sheetnames: + raise ValueError(f"Sheet '{sheet_name}' not found in {path.name}; sheets: {wb.sheetnames}") + ws = wb[sheet_name] + rows = [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)] + wb.close() + return rows + + +def _norm_header(text: str) -> str: + return " ".join(text.lower().split()) + + +def build_header_map(header_row: list[str], field_map: dict[str, str], required: set[str]): + """Return (field->col_index, unknown_headers). Raise ValueError if a required field is missing.""" + fields: dict[str, int] = {} + unknown: list[str] = [] + for idx, raw in enumerate(header_row): + key = _norm_header(raw) + if key in field_map: + fields[field_map[key]] = idx + elif raw.strip(): + unknown.append(raw) + missing = required - set(fields) + if missing: + raise ValueError(f"Required header(s) missing: {sorted(missing)} (found headers: {header_row})") + return fields, unknown +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_ingest.py -v && cd -` +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/ingest.py tools/import-normalizer/tests/test_ingest.py +git commit -m "feat(normalizer): xlsx ingest + header mapping" +``` + +--- + +### Task 13: Row extraction, triage & CanonicalDocument (`FR-TRIAGE`, REQ-TRIAGE-01/02/03, `FR-PROV`) + +**Files:** +- Create: `tools/import-normalizer/documents.py` +- Create: `tools/import-normalizer/tests/test_documents.py` + +- [ ] **Step 1: Write failing tests** + +```python +import documents +from documents import Triage + +def test_extract_row(): + header = {"index": 0, "file": 1, "box": 2, "folder": 3, "sender": 4, + "receivers": 5, "date": 6, "location": 7, "tags": 8, "summary": 9} + cells = ["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter", + "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"] + raw = documents.extract_row(cells, header, source_row=3) + assert raw.index == "W-0001" + assert raw.sender == "Walter de Gruyter" + assert raw.date == "15.2.1888" + assert raw.source_row == 3 + +def test_triage(): + assert documents.triage(["", "", ""]) == Triage.EMPTY + assert documents.triage(["", "", "Walter"]) == Triage.BLANK_INDEX # data but no index + assert documents.triage(["W-0001x", "x"]) == Triage.X_SUFFIX + assert documents.triage(["W-0001", "x"]) == Triage.OK + +def test_classify_blank_index(): + header = {"sender": 4, "receivers": 5} + banner = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""] + data = ["", "", "V", "1", "", "Eugenie"] + assert documents.classify_blank_index(banner, header) == "section_banner" + assert documents.classify_blank_index(data, header) == "data_no_index" + +def test_index_file_mismatch(): + assert documents.index_file_mismatch("W-0010x", r"..\__scan\W-0011x.pdf") is True + assert documents.index_file_mismatch("W-0001", r"..\__scan\W-0001.pdf") is False + assert documents.index_file_mismatch("W-0001", "") is False +``` + +Note `triage` takes the raw `cells` list and uses 
column 0 as the index (matching `extract_row`'s header where `index` is col 0 in these tests). + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py -v && cd -` +Expected: FAIL — `documents` not defined. + +- [ ] **Step 3: Implement `documents.py`** (extraction + triage + dataclasses; resolution added in Task 14) + +```python +"""Document row extraction, triage, and the canonical document record.""" +from dataclasses import dataclass, field +from enum import Enum, auto + + +class Triage(Enum): + OK = auto() + EMPTY = auto() + BLANK_INDEX = auto() + X_SUFFIX = auto() + + +@dataclass +class RawRow: + source_row: int + index: str = "" + file: str = "" + box: str = "" + folder: str = "" + sender: str = "" + receivers: str = "" + date: str = "" + location: str = "" + tags: str = "" + summary: str = "" + + +@dataclass +class CanonicalDocument: + index: str + box: str = "" + folder: str = "" + sender_person_id: str = "" + sender_name: str = "" + receiver_person_ids: list = field(default_factory=list) + receiver_names: list = field(default_factory=list) + date_iso: str = "" + date_raw: str = "" + date_precision: str = "" + location: str = "" + tags: list = field(default_factory=list) + summary: str = "" + source_row: int = 0 + needs_review: list = field(default_factory=list) + + +_FIELDS = ["index", "file", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"] + + +def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow: + def get(field_name): + idx = header.get(field_name) + if idx is None or idx >= len(cells): + return "" + return (cells[idx] or "").strip() + return RawRow(source_row=source_row, **{f: get(f) for f in _FIELDS}) + + +def triage(cells: list[str], index_col: int = 0) -> Triage: + nonempty = [c for c in cells if c and str(c).strip()] + if not nonempty: + return Triage.EMPTY + index = (cells[index_col] or "").strip() if 
index_col < len(cells) else "" + if not index: + return Triage.BLANK_INDEX + if index.endswith("x"): + return Triage.X_SUFFIX + return Triage.OK + + +def classify_blank_index(cells: list[str], header: dict[str, int]) -> str: + """REQ-TRIAGE-02: 'section_banner' if only name columns are populated, else 'data_no_index'.""" + name_cols = {header.get("sender"), header.get("receivers")} - {None} + populated = {i for i, c in enumerate(cells) if c and str(c).strip()} + if populated and populated <= name_cols: + return "section_banner" + return "data_no_index" + + +def index_file_mismatch(index: str, file_path: str) -> bool: + if not file_path.strip(): + return False + basename = file_path.replace("\\", "/").rsplit("/", 1)[-1] + stem = basename.rsplit(".", 1)[0] + return stem != index +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/documents.py tools/import-normalizer/tests/test_documents.py +git commit -m "feat(normalizer): row extraction, triage, canonical record" +``` + +--- + +### Task 14: Resolution context + to_canonical (`FR-PERS`, `FR-DATE` integration, REQ-PROV-02) + +**Files:** +- Modify: `tools/import-normalizer/persons.py` +- Modify: `tools/import-normalizer/documents.py` +- Modify: `tools/import-normalizer/tests/test_documents.py` + +- [ ] **Step 1: Add failing tests** to `tests/test_documents.py` + +```python +import persons +import documents + +def _ctx(): + people = persons.parse_register([ + {"last_name": "de Gruyter", "first_name": "Walter"}, + {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"}, + ]) + return persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}) + +def test_to_canonical_resolves_and_flags(): + ctx = _ctx() + raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1", + sender="Walter de 
Gruyter", receivers="Eugenie Müller", + date="15.2.1888", location="Rotterdam", tags="Brautbriefe", + summary="Geschäftsreise", file=r"..\__scan\W-0001.pdf") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.sender_person_id == "de-gruyter-walter" + assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias + assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY" + assert doc.tags == ["Brautbriefe"] + assert doc.needs_review == [] + +def test_to_canonical_unmatched_and_unparsed(): + ctx = _ctx() + raw = documents.RawRow(source_row=9, index="C-0001", + sender="Hans Wittkopf", receivers="", date="Freitag 1919") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.sender_person_id == "wittkopf-hans" # provisional + assert "unmatched_sender" in doc.needs_review + assert "unparsed_date" in doc.needs_review + assert ctx.unmatched["Hans Wittkopf"] == [9] + assert any(p.provisional for p in ctx.provisional.values()) + +def test_to_canonical_splits_multi_sender(): + # REQ-PERS-01 / IMP-11: a multi-person sender is parsed, primary kept, flagged. + ctx = _ctx() + raw = documents.RawRow(source_row=5, index="C-0100", sender="Walter und Eugenie de Gruyter", receivers="") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.sender_person_id == "de-gruyter-walter" # first part is primary + assert "multi_sender" in doc.needs_review + +def test_provisional_id_never_collides_with_register(): + # A provisional built from an unmatched string must not steal a register person_id. 
+ people = persons.parse_register([{"last_name": "Cram", "first_name": "Clara"}]) + ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}) + # Force a provisional whose natural slug equals the register id by using a string the + # alias index will not resolve but that slugs to "cram-clara": + pid, _, matched = ctx.resolve_one("Clara Cram (unsicher)", source_row=1) + assert matched is False + assert pid not in {"cram-clara"} or pid.endswith("-2") # suffixed away from the register id + +def test_ambiguous_space_pair_flagged_not_split(): + # US-PERS-02 AC4: "Ella Anita" is kept as one provisional + flagged, never guessed into two. + ctx = _ctx() + raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert len(doc.receiver_person_ids) == 1 # not split + assert any(part == "Ella Anita" for _, part, _ in ctx.ambiguous) +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py -k "to_canonical or provisional_id or ambiguous_space" -v; cd -` +Expected: FAIL — `ResolutionContext` / `to_canonical` not defined. 
+ +- [ ] **Step 3a: Implement `ResolutionContext`** — add to `persons.py` + +```python +class ResolutionContext: + """Resolves raw name strings to person ids; accumulates provisional persons and review data.""" + def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str]): + self.index = alias_index + self.name_overrides = name_overrides + self.provisional: dict[str, Person] = {} + self.unmatched: dict[str, list] = {} + self.ambiguous: list[tuple] = [] + self._raw_to_pid: dict[str, str] = {} + self.override_hits = 0 + + def _unique_id(self, base: str) -> str: + """A provisional id must never collide with a register id or another provisional.""" + used = self.index.known_ids | set(self.provisional) + pid, n = base, 1 + while pid in used: + n += 1 + pid = f"{base}-{n}" + return pid + + def resolve_one(self, raw_name: str, source_row: int): + """Return (person_id, display_name, matched: bool). '' name -> ('', '', True).""" + name = (raw_name or "").strip() + if not name: + return "", "", True + if name in self.name_overrides: + self.override_hits += 1 + pid = self.name_overrides[name] + return pid, self.index.display(pid) or name, True + pid = self.index.resolve(name) + if pid: + return pid, self.index.display(pid) or name, True + # provisional person (unmatched) — never reuse a register id + self.unmatched.setdefault(name, []).append(source_row) + if name in self._raw_to_pid: + return self._raw_to_pid[name], name, False + last, first = _last_first(name) + pid = self._unique_id(slugify(last, first)) + self.provisional[pid] = Person(person_id=pid, last_name=last, first_name=first, provisional=True) + self._raw_to_pid[name] = pid + return pid, name, False + + def resolve_sender(self, raw: str, source_row: int): + """Senders are split like receivers (REQ-PERS-01). 
Primary = first part; multi flagged.""" + parts = split_receivers(raw) + if not parts: + return "", "", True, False + pid, name, matched = self.resolve_one(parts[0], source_row) + for extra in parts[1:]: + self.resolve_one(extra, source_row) # register the others as persons too + return pid, name, matched, len(parts) > 1 + + def resolve_receivers(self, raw: str, source_row: int): + results = [] + for part in split_receivers(raw): + pid, name, matched = self.resolve_one(part, source_row) + if not matched and " " in part and find_known_last_name(part) is None and len(part.split()) == 2: + self.ambiguous.append((raw, part, source_row)) + results.append((pid, name, matched)) + return results + + +def _last_first(name: str): + """Best-effort split of a free name string into (last, first) for slug/provisional building.""" + name = name.strip() + ln = find_known_last_name(name) + if ln: + first = name[: -len(ln)].strip() + return ln, first + tokens = name.split() + if len(tokens) >= 2: + return tokens[-1], " ".join(tokens[:-1]) + return name, "" +``` + +- [ ] **Step 3b: Implement `to_canonical`** — add to `documents.py` + +```python +import dates as _dates + + +def to_canonical(raw, ctx, date_overrides: dict) -> CanonicalDocument: + pd = _dates.parse_date(raw.date, date_overrides) + flags = [] + + sender_id, sender_name, sender_matched, sender_multi = ctx.resolve_sender(raw.sender, raw.source_row) + if raw.sender.strip() and not sender_matched: + flags.append("unmatched_sender") + if sender_multi: + flags.append("multi_sender") + + receivers = ctx.resolve_receivers(raw.receivers, raw.source_row) + if any(not matched for _, _, matched in receivers): + flags.append("unmatched_receiver") + + if raw.date.strip() and pd.precision == _dates.Precision.UNKNOWN: + flags.append("unparsed_date") + if index_file_mismatch(raw.index, raw.file): + flags.append("index_file_mismatch") + + return CanonicalDocument( + index=raw.index, box=raw.box, folder=raw.folder, + 
sender_person_id=sender_id, sender_name=sender_name, + receiver_person_ids=[r[0] for r in receivers], + receiver_names=[r[1] for r in receivers], + date_iso=pd.iso or "", date_raw=raw.date, date_precision=str(pd.precision), + location=raw.location, tags=[raw.tags] if raw.tags else [], summary=raw.summary, + source_row=raw.source_row, needs_review=flags, + ) +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/documents.py tools/import-normalizer/tests/test_documents.py +git commit -m "feat(normalizer): person resolution context + to_canonical" +``` + +--- + +### Task 15: Overrides loader + writers (`FR-OVR`, `FR-OUT`, NFR-OBSERV-01) + +**Files:** +- Create: `tools/import-normalizer/overrides.py` +- Create: `tools/import-normalizer/writers.py` +- Create: `tools/import-normalizer/tests/test_writers.py` + +- [ ] **Step 1: Write failing tests** + +```python +import csv +import openpyxl +import overrides +import writers +import documents + +def test_load_overrides_missing_files(tmp_path): + d, n = overrides.load_overrides(tmp_path / "dates.csv", tmp_path / "names.csv") + assert d == {} and n == {} + +def test_load_overrides_parsed(tmp_path): + dp = tmp_path / "dates.csv" + dp.write_text("raw,iso,precision\n13.5.65,1965-05-13,DAY\n", encoding="utf-8") + np = tmp_path / "names.csv" + np.write_text("raw,person_id\nEugenie Müller,de-gruyter-eugenie\n", encoding="utf-8") + d, n = overrides.load_overrides(dp, np) + assert d["13.5.65"] == ("1965-05-13", "DAY") + assert n["Eugenie Müller"] == "de-gruyter-eugenie" + +def test_write_documents_xlsx_joins_lists(tmp_path): + doc = documents.CanonicalDocument( + index="W-0001", receiver_person_ids=["a", "b"], receiver_names=["A", "B"], + tags=["Brautbriefe"], date_precision="DAY", needs_review=["unparsed_date"]) + 
out = tmp_path / "docs.xlsx" + writers.write_documents_xlsx([doc], out) + wb = openpyxl.load_workbook(out) + ws = wb.active + header = [c.value for c in ws[1]] + assert "receiver_person_ids" in header and "needs_review" in header + row = {h: c.value for h, c in zip(header, ws[2])} + assert row["receiver_person_ids"] == "a|b" + assert row["needs_review"] == "unparsed_date" + +def test_write_documents_xlsx_pins_timestamp(tmp_path): + # determinism (NFR-IDEM-01): workbook created/modified are pinned, not the current time + doc = documents.CanonicalDocument(index="W-0001") + out = tmp_path / "d.xlsx" + writers.write_documents_xlsx([doc], out) + wb = openpyxl.load_workbook(out) + assert (wb.properties.created.year, wb.properties.created.month, wb.properties.created.day) == (2020, 1, 1) + +def test_write_review_csv(tmp_path): + out = tmp_path / "r.csv" + writers.write_review_csv(out, ["raw", "count"], [["?", 3], ["x", 1]]) + rows = list(csv.reader(out.open(encoding="utf-8"))) + assert rows[0] == ["raw", "count"] + assert rows[1] == ["?", "3"] + +def test_write_review_csv_defangs_formula_injection(tmp_path): + out = tmp_path / "r.csv" + writers.write_review_csv(out, ["raw", "count"], [["=cmd|'/C calc'!A0", 1], ["-2+3", 2]]) + rows = list(csv.reader(out.open(encoding="utf-8"))) + assert rows[1][0].startswith("'=") # leading '=' neutralised + assert rows[2][0].startswith("'-") + +def test_write_summary_sections(tmp_path): + out = tmp_path / "s.txt" + writers.write_summary(out, {"# INPUTS": "", "rows": 10, "# DATES": "", "unknown_date_rate": "3.2%"}) + text = out.read_text(encoding="utf-8") + assert "INPUTS:" in text and "DATES:" in text and " rows: 10" in text +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_writers.py -v && cd -` +Expected: FAIL — modules not defined. + +- [ ] **Step 3a: Implement `overrides.py`** + +```python +"""Load human-supplied corrections. 
Missing files are not an error.""" +import csv +from pathlib import Path + + +def load_overrides(dates_path: Path, names_path: Path): + date_overrides: dict[str, tuple[str, str]] = {} + name_overrides: dict[str, str] = {} + if Path(dates_path).exists(): + with open(dates_path, encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + raw = (row.get("raw") or "").strip() + if raw: + date_overrides[raw] = ((row.get("iso") or "").strip(), (row.get("precision") or "UNKNOWN").strip()) + if Path(names_path).exists(): + with open(names_path, encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + raw = (row.get("raw") or "").strip() + if raw: + name_overrides[raw] = (row.get("person_id") or "").strip() + return date_overrides, name_overrides +``` + +- [ ] **Step 3b: Implement `writers.py`** + +```python +"""Write canonical .xlsx outputs and review .csv files.""" +import csv +import datetime +from pathlib import Path +import openpyxl + +_PIPE = "|" +# Pinned workbook metadata so reruns are content-deterministic (NFR-IDEM-01); openpyxl +# otherwise stamps docProps with the current time on every save. 
+_FIXED_TS = datetime.datetime(2020, 1, 1, 0, 0, 0) + + +def _join(value): + if isinstance(value, list): + return _PIPE.join(str(v) for v in value) + return "" if value is None else str(value) + + +def _csv_safe(value): + """Neutralise spreadsheet formula injection (CWE-1236) in human-opened review CSVs.""" + s = "" if value is None else str(value) + return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s + + +DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name", + "receiver_person_ids", "receiver_names", "date_iso", "date_raw", + "date_precision", "location", "tags", "summary", "source_row", "needs_review"] + +PERSON_COLUMNS = ["person_id", "last_name", "first_name", "maiden_name", "title", "nickname", + "birth_date", "birth_date_raw", "birth_place", "death_date", "death_date_raw", + "death_place", "spouse", "generation", "notes", "aliases", "provisional"] + + +def _write_xlsx(records, columns, path: Path): + wb = openpyxl.Workbook() + ws = wb.active + ws.append(columns) + for rec in records: + ws.append([_join(getattr(rec, col)) for col in columns]) + wb.properties.created = _FIXED_TS + wb.properties.modified = _FIXED_TS + Path(path).parent.mkdir(parents=True, exist_ok=True) + wb.save(path) + + +def write_documents_xlsx(docs, path: Path): + _write_xlsx(docs, DOC_COLUMNS, path) + + +def write_persons_xlsx(people, path: Path): + _write_xlsx(people, PERSON_COLUMNS, path) + + +def write_review_csv(path: Path, header: list[str], rows: list[list]): + Path(path).parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8", newline="") as f: + w = csv.writer(f) + w.writerow(header) + for row in rows: + w.writerow([_csv_safe(c) for c in row]) + + +def write_summary(path: Path, stats: dict): + """Render a grouped, scannable summary. 
Keys beginning with '#' are section headers.""" + Path(path).parent.mkdir(parents=True, exist_ok=True) + lines = [] + for k, v in stats.items(): + if k.startswith("#"): + lines.append("") + lines.append(k[1:].strip() + ":") + else: + lines.append(f" {k}: {v}") + Path(path).write_text("\n".join(lines).strip() + "\n", encoding="utf-8") +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_writers.py -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/overrides.py tools/import-normalizer/writers.py tools/import-normalizer/tests/test_writers.py +git commit -m "feat(normalizer): overrides loader + xlsx/csv writers" +``` + +--- + +### Task 16: Orchestrator `normalize.py` + integration test (`FR-OUT`, `FR-TRIAGE`, REQ-TRIAGE-01/03, NFR-IDEM-01) + +**Files:** +- Create: `tools/import-normalizer/normalize.py` +- Create: `tools/import-normalizer/tests/test_normalize.py` + +- [ ] **Step 1: Write the failing integration test** (tiny synthetic workbooks written to `tmp_path`, not the real 7,900-row file) + +```python +import openpyxl +import normalize + +def _doc_wb(tmp_path): + wb = openpyxl.Workbook(); ws = wb.active; ws.title = "Familienarchiv" + ws.append(["Index", "Datei", "Box", "Mappe", "BriefeschreiberIn", "EmpfängerIn", + "Datum des Briefes", "Ort", "Schlagwort", "Inhalt"]) + ws.append(["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter", + "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"]) + ws.append(["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter", "Eugenie Müller", "", "", "", ""]) + ws.append(["", "", "", "", "Section banner row", "", "", "", "", ""]) + ws.append(["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""]) + ws.append(["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter", + "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"]) + p = tmp_path / "docs.xlsx"; 
wb.save(p); return p + +def _person_wb(tmp_path): + wb = openpyxl.Workbook(); ws = wb.active; ws.title = "Tabelle1" + ws.append(["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum", + "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"]) + ws.append(["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""]) + ws.append(["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""]) + p = tmp_path / "persons.xlsx"; wb.save(p); return p + +def test_run_end_to_end(tmp_path): + out_dir = tmp_path / "out"; review_dir = tmp_path / "review" + stats = normalize.run( + document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv", + person_workbook=_person_wb(tmp_path), person_sheet="Tabelle1", + out_dir=out_dir, review_dir=review_dir, + date_overrides={}, name_overrides={}) + assert (out_dir / "canonical-documents.xlsx").exists() + assert (out_dir / "canonical-persons.xlsx").exists() + assert stats["documents_emitted"] == 3 # W-0001, C-0001, W-0001 (dup) — x and blank excluded + assert stats["skipped_x_suffix"] == 1 + assert stats["blank_index_rows"] == 1 + assert stats["duplicate_index_rows"] == 2 + assert (review_dir / "skipped-x-suffix.csv").exists() + assert (review_dir / "unparsed-dates.csv").exists() + # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01) + assert "Freitag 1919" in (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") + + # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files + def _matrix(p): + wb = openpyxl.load_workbook(p) + return [[c.value for c in row] for row in wb.active.iter_rows()] + docs1 = _matrix(out_dir / "canonical-documents.xlsx") + persons1 = _matrix(out_dir / "canonical-persons.xlsx") + unparsed1 = (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") + normalize.run(document_workbook=_doc_wb(tmp_path), document_sheet="Familienarchiv", + person_workbook=_person_wb(tmp_path), 
person_sheet="Tabelle1", + out_dir=out_dir, review_dir=review_dir, date_overrides={}, name_overrides={}) + assert _matrix(out_dir / "canonical-documents.xlsx") == docs1 + assert _matrix(out_dir / "canonical-persons.xlsx") == persons1 + assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1 + assert len(docs1) == 4 # header + 3 docs +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_normalize.py -v; cd -` +Expected: FAIL — `normalize` not defined. + +- [ ] **Step 3: Implement `normalize.py`** + +```python +"""Orchestrator: read raw workbooks -> canonical outputs + review reports.""" +import argparse +from collections import Counter +from pathlib import Path + +import config +import ingest +import persons +import documents +import overrides as overrides_mod +import writers + + +def run(*, document_workbook, document_sheet, person_workbook, person_sheet, + out_dir, review_dir, date_overrides, name_overrides) -> dict: + out_dir, review_dir = Path(out_dir), Path(review_dir) + + # --- persons --- + person_rows = ingest.read_sheet(person_workbook, person_sheet) + p_fields, _ = ingest.build_header_map(person_rows[0], config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS) + person_dicts = [{f: (row[i] if i < len(row) else "") for f, i in p_fields.items()} for row in person_rows[1:]] + register = persons.parse_register(person_dicts) + alias_index = persons.AliasIndex(register) + ctx = persons.ResolutionContext(alias_index, name_overrides) + + # --- documents --- + doc_rows = ingest.read_sheet(document_workbook, document_sheet) + d_fields, unknown_headers = ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS) + index_col = d_fields["index"] + + canon_docs, blank_index, skipped_x, mismatches = [], [], [], [] + unparsed_by_raw: dict[str, list] = {} + dates_by_override = 0 + empty_count = 0 + seen_index = Counter() + + for 
source_row, cells in enumerate(doc_rows[1:], start=2): + t = documents.triage(cells, index_col) + if t is documents.Triage.EMPTY: + empty_count += 1 + continue + if t is documents.Triage.BLANK_INDEX: + blank_index.append([source_row, documents.classify_blank_index(cells, d_fields), + " | ".join(c for c in cells if c)]) + continue + if t is documents.Triage.X_SUFFIX: + idx = (cells[index_col] or "").strip() + skipped_x.append([source_row, idx, idx[:-1]]) + continue + raw = documents.extract_row(cells, d_fields, source_row) + seen_index[raw.index] += 1 + if raw.date.strip() and raw.date.strip() in date_overrides: + dates_by_override += 1 + doc = documents.to_canonical(raw, ctx, date_overrides) + if "unparsed_date" in doc.needs_review: + unparsed_by_raw.setdefault(raw.date, []).append(source_row) + if "index_file_mismatch" in doc.needs_review: + mismatches.append([source_row, raw.index, raw.file]) + canon_docs.append(doc) + + # REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them. + dup_indexes = {idx for idx, n in seen_index.items() if n > 1} + duplicates = [] + for doc in canon_docs: + if doc.index in dup_indexes: + if "duplicate_index" not in doc.needs_review: + doc.needs_review.append("duplicate_index") + duplicates.append([doc.source_row, doc.index]) + + all_people = register + list(ctx.provisional.values()) + + # --- write canonical outputs --- + writers.write_documents_xlsx(canon_docs, out_dir / "canonical-documents.xlsx") + writers.write_persons_xlsx(all_people, out_dir / "canonical-persons.xlsx") + + # --- review files --- + # unparsed dates: most-frequent first, with example source rows + blank override cells so a + # corrected row can be pasted straight into overrides/dates.csv (same raw,iso,precision shape). 
+ unparsed_rows = sorted( + ([raw, len(rows), " ".join(map(str, rows[:5])), "", ""] for raw, rows in unparsed_by_raw.items()), + key=lambda r: (-r[1], r[0])) + writers.write_review_csv(review_dir / "unparsed-dates.csv", + ["raw", "count", "example_rows", "suggested_iso", "suggested_precision"], unparsed_rows) + + unmatched_rows = [] + for name, rows in sorted(ctx.unmatched.items()): + sid, score = alias_index.suggest(name) + unmatched_rows.append([name, len(rows), " ".join(map(str, rows[:5])), + sid or "", f"{score:.2f}" if sid else ""]) + writers.write_review_csv(review_dir / "unmatched-names.csv", + ["raw", "count", "example_rows", "suggested_id", "suggested_score"], unmatched_rows) + + writers.write_review_csv(review_dir / "duplicate-index.csv", ["source_row", "index"], duplicates) + writers.write_review_csv(review_dir / "blank-index-rows.csv", ["source_row", "kind", "content"], blank_index) + writers.write_review_csv(review_dir / "skipped-x-suffix.csv", ["source_row", "index", "base_index"], skipped_x) + writers.write_review_csv(review_dir / "ambiguous-receivers.csv", ["raw", "part", "source_row"], ctx.ambiguous) + writers.write_review_csv(review_dir / "index-file-mismatch.csv", ["source_row", "index", "file"], mismatches) + + dated = sum(1 for d in canon_docs if d.date_raw.strip()) + unknown = sum(1 for d in canon_docs if d.date_raw.strip() and d.date_precision == "UNKNOWN") + unknown_rate = f"{(100 * unknown / dated):.1f}%" if dated else "0.0%" + + stats = { + "# INPUTS": "", + "document_rows_read": len(doc_rows) - 1, + "register_persons": len(register), + "unknown_headers": ", ".join(unknown_headers) or "(none)", + "# OUTPUTS": "", + "documents_emitted": len(canon_docs), + "provisional_persons": len(ctx.provisional), + "# DATES": "", + "dated_rows": dated, + "unparsed_dates": unknown, + "unknown_date_rate": f"{unknown_rate} (target <=5%)", + "distinct_unparsed_formats": len(unparsed_by_raw), + "# NAMES": "", + "unmatched_name_strings": len(ctx.unmatched), + 
"ambiguous_receivers": len(ctx.ambiguous), + "# ANOMALIES": "", + "empty_rows": empty_count, + "blank_index_rows": len(blank_index), + "skipped_x_suffix": len(skipped_x), + "duplicate_index_rows": len(duplicates), + "index_file_mismatches": len(mismatches), + "# OVERRIDES": "", + "date_overrides_loaded": len(date_overrides), + "name_overrides_loaded": len(name_overrides), + "dates_resolved_by_override": dates_by_override, + "names_resolved_by_override": ctx.override_hits, + } + writers.write_summary(review_dir / "summary.txt", stats) + return stats + + +def main(): + parser = argparse.ArgumentParser(description="Normalize the family archive spreadsheets.") + parser.parse_args() + date_overrides, name_overrides = overrides_mod.load_overrides( + config.OVERRIDES_DIR / "dates.csv", config.OVERRIDES_DIR / "names.csv") + stats = run( + document_workbook=config.DOCUMENT_WORKBOOK, document_sheet=config.DOCUMENT_SHEET, + person_workbook=config.PERSON_WORKBOOK, person_sheet=config.PERSON_SHEET, + out_dir=config.OUT_DIR, review_dir=config.REVIEW_DIR, + date_overrides=date_overrides, name_overrides=name_overrides) + print("Normalization complete:") + for k, v in stats.items(): + print(f" {k}: {v}") + + +if __name__ == "__main__": + main() +``` + +> **Note for the implementer:** duplicate-index handling is a single second pass over `canon_docs` (`for doc in canon_docs: if doc.index in dup_indexes`) — this flags AND reports *every* colliding occurrence including the first (REQ-TRIAGE-01), not just repeats. Do not reintroduce a per-row append in the main loop. + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_normalize.py -v && cd -` +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/normalize.py tools/import-normalizer/tests/test_normalize.py +git commit -m "feat(normalizer): orchestrator + end-to-end integration test" +``` + +--- + +### Task 17: README, seed overrides, and a real dry-run + +**Files:** +- Create: `tools/import-normalizer/README.md` +- Create: `tools/import-normalizer/overrides/dates.csv` +- Create: `tools/import-normalizer/overrides/names.csv` + +- [ ] **Step 1: Seed the overrides files** (header-only) + +`overrides/dates.csv`: +``` +raw,iso,precision +``` +`overrides/names.csv`: +``` +raw,person_id +``` + +- [ ] **Step 2: Write `README.md`** + +````markdown +# Import Normalizer + +Transforms the raw family-archive spreadsheets in `../../import/` into a clean canonical +dataset (`out/`) plus review reports (`review/`). See the spec: +`../../docs/import-migration/02-normalization-spec.md`. + +## Setup +Requires **Python 3.12** (uses `StrEnum`). +```bash +python3 -m venv .venv && .venv/bin/pip install -r requirements.txt +``` + +## Run +```bash +.venv/bin/python normalize.py +``` +Outputs: +- `out/canonical-documents.xlsx`, `out/canonical-persons.xlsx` +- `review/*.csv` (residue to fix), `review/summary.txt` (grouped run stats incl. unknown-date rate) + +## Iteration loop +1. **Run.** Read `review/summary.txt` for the health snapshot. +2. **Fix the residue** by editing the version-controlled overrides files, then re-run. Repeat. + +| Review file | What to do | +| --- | --- | +| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). | +| `unmatched-names.csv` | If `suggested_id` is right, copy `raw,suggested_id` into `overrides/names.csv`; else look up the correct id in `out/canonical-persons.xlsx` (the `person_id` column). 
| +| `ambiguous-receivers.csv` | A space-joined pair we refused to auto-split (e.g. `Ella Anita`). Decide and add a names override if it is really two people. | +| `index-file-mismatch.csv` | The `Datei` path disagrees with the index-derived filename — reconcile when the PDFs arrive. | +| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. | + +**Valid `person_id` values** all come from the `person_id` column of `out/canonical-persons.xlsx`. + +## Tests +```bash +.venv/bin/python -m pytest tests/test_dates.py -v # run files individually (never the whole suite at once) +``` +```` + +- [ ] **Step 3: Run the whole test suite file-by-file to confirm green** + +Run each individually (per the "no full-suite" rule): +```bash +cd tools/import-normalizer +for t in config dates persons ingest documents writers normalize; do .venv/bin/python -m pytest tests/test_$t.py -q || break; done +cd - +``` +Expected: every file reports all passed. + +- [ ] **Step 4: Real dry-run against the actual import data (manual verification, not a test)** + +Run: `cd tools/import-normalizer && .venv/bin/python normalize.py && cd -` +Expected: prints stats. Then inspect: +- `review/summary.txt` — sanity-check counts (≈7,600 documents emitted, register_persons ≈163). +- `review/unparsed-dates.csv` — confirm `UNKNOWN` rate is in the low single-digit % of dated rows (NFR-ACCUR-01 target ≤5% before overrides). If higher, note the dominant unhandled formats for a follow-up parser tweak. +- Spot-check `out/canonical-documents.xlsx`: open the first ~20 rows; verify `date_iso`/`date_precision`, `sender_person_id`, and `receiver_person_ids` look right (e.g. `Eugenie Müller` → `de-gruyter-eugenie`). + +Record the run's `summary.txt` figures in `../../docs/import-migration/WORKLOG.md`. 
+ +- [ ] **Step 5: Commit** (commit only source + seeds; `out/` and `review/` are gitignored) + +```bash +git add tools/import-normalizer/README.md tools/import-normalizer/overrides/dates.csv tools/import-normalizer/overrides/names.csv +git commit -m "docs(normalizer): README + seed overrides" +``` + +--- + +## Self-Review + +**Spec coverage check:** +- `FR-INGEST`/`FR-MAP` → Task 12 (header-name mapping, missing-required raises, unknown headers reported). ✓ +- `FR-TRIAGE` (REQ-TRIAGE-01/02/03) → Task 13 (triage by index-col, `classify_blank_index` banner detection) + Task 16 (single-pass duplicate flagging of *all* occurrences, blank-index report with `kind`, x-suffix skip+log). ✓ +- `FR-DATE` (REQ-DATE-01..06) → Tasks 2–8 (computus, feast/season, century rule, all matchers, overrides). ✓ +- `FR-PERS`/US-PERS-01 → Task 9; `REQ-PERS-01`/receiver split/AC4 ambiguous → Tasks 10, 14. ✓ +- `FR-DEDUP` (REQ-DEDUP-01/02) → Task 11 (maiden/married/nickname aliases, conservative; fuzzy = suggestion only). ✓ +- `FR-OVR` (REQ-OVR-01/02/03) → Task 15 (loader, missing-file tolerant) + Task 16 (applied + counted: `dates_resolved_by_override` / `names_resolved_by_override`) + Task 16 content-determinism assertion (two-run cell-matrix + review-file equality). ✓ +- `FR-OUT`/`FR-PROV` (REQ-OUT-01/02, REQ-PROV-01/02) → Tasks 13 (source_row, needs_review), 15 (writers), 16 (mismatch report). ✓ +- NFRs: DATA-01 (every row → output or review) covered by triage routing; OBSERV-01 → summary.txt; I18N-01 → utf-8 everywhere + diacritic map; TEST-01 → per-module tests; MAINT-01 → config tables. ✓ +- Data dictionary §6 → `DOC_COLUMNS`/`PERSON_COLUMNS` in Task 15 match the spec field list. ✓ + +**Placeholder scan:** No TBD/TODO; every code step shows complete code. The dead `pass` line formerly in Task 16 has been removed (see "Review feedback incorporated" below). 
✓ + +**Type consistency:** `ParsedDate(iso, precision, raw)`, `Precision` (StrEnum → `str()` yields the value), `Person`, `RawRow`, `CanonicalDocument`, `AliasIndex.resolve/display/suggest`, `ResolutionContext.resolve_one/resolve_receivers`, `to_canonical(raw, ctx, date_overrides)`, `run(**kwargs)` — names line up across tasks. ✓ + +**Known follow-ups (out of scope for this plan):** Phase-2 importer wiring (`B11`); comma-splitting `Inhalt` into extra tags (`B10`, Could). These are intentionally deferred. + +--- + +## Review feedback incorporated (2026-05-25) + +Six personas reviewed this plan inline; the following changes were applied (see the session summary for detail): + +- **Idempotency redefined (architect/tester/req-eng):** spec G4/NFR-IDEM-01 changed from "byte-identical" to **content-deterministic**; Task 15 pins workbook `created`/`modified`; Task 11 builds aliases via ordered lists (no set-iteration leakage); Task 16 test now compares two runs' cell matrices + review files. +- **Duplicate-index bug fixed (developer/architect):** Task 16 now flags and reports *every* occurrence of a duplicated index in one pass; the dead `pass` line was removed; the test stat (`==2`) is correct. +- **Provisional id collision guarded (architect):** Task 14 `ResolutionContext._unique_id` suffixes provisional ids so they never overwrite a register `person_id`. +- **Date gaps closed (tester):** added invalid-calendar-date → UNKNOWN test, intra-month day-range matcher (`7./8. Sept.1923` → RANGE) + test, and a trailing-note-preservation test. +- **Multi-person sender (tester/req-eng, REQ-PERS-01):** Task 14 `resolve_sender` splits the sender, keeps the primary, flags `multi_sender`. +- **CSV injection defanged (security):** Task 15 `write_review_csv` neutralises leading `= + - @` etc. in human-opened CSVs (+ test). 
+- **REQ-TRIAGE-02 / REQ-OVR-03 realized (req-eng):** banner-vs-data classification in `blank-index-rows.csv`; override-application counts + an `unknown_date_rate` headline in `summary.txt`. +- **Ergonomics (UX):** `unparsed-dates.csv` now carries `example_rows` + blank `suggested_iso/precision` (paste-ready); `unmatched-names.csv` suggestion blanks-out on no-match and rounds the score; grouped `summary.txt`; README documents every review file + where to source `person_id`. +- **Repo hygiene (devops):** pinned `openpyxl==3.1.5` / `pytest==8.3.4`; hardened the **root** `.gitignore` against the committed-`.venv` class of mistake; documented the Python 3.12 requirement. diff --git a/docs/import-migration/04-unresolved-names-plan.md b/docs/import-migration/04-unresolved-names-plan.md new file mode 100644 index 00000000..f2b7543e --- /dev/null +++ b/docs/import-migration/04-unresolved-names-plan.md @@ -0,0 +1,502 @@ +# Unresolved-Name Classification Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a focused `review/unresolved-names.csv` that isolates sender/receiver strings whose *name itself* is problematic (unknown/illegible, single-token, relational-only, collective/group, prose-in-name-column, or a genuine two-given-name pair), and fix the ambiguous-pair heuristic so a plain `First Surname` external person (e.g. `Mieze Schefold`) is no longer falsely flagged. + +**Architecture:** A pure `classify_name(raw, given_names)` function in `persons.py` returns a `NameClass`. `ResolutionContext` classifies every *unmatched* name and records the non-`RESOLVABLE` ones in `self.unresolved`. 
A runtime-built given-name set (register first names + a small config supplement) lets the classifier distinguish a two-given-name pair (`Ella Anita` → two people) from a first+surname single person (`Mieze Schefold`). The orchestrator writes the aggregated report and per-category stats, replacing the noisy `ambiguous-receivers.csv`. + +**Tech Stack:** Python 3.12, openpyxl, pytest — extends the existing `tools/import-normalizer/`. + +**Context:** This builds on the completed normalizer (PR #663). Run all tests with CWD = the tool dir, e.g. `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_X.py -v`. Reuse the existing venv at `tools/import-normalizer/.venv` (do NOT recreate it). Commit on the current branch `docs/import-migration` (never main, never push). Each commit message ends with a trailing `Co-Authored-By: Claude Opus 4.7 ` line. + +--- + +## File Structure + +``` +tools/import-normalizer/ +├── config.py # + RELATIONAL_TERMS, COLLECTIVE_TERMS, UNKNOWN_NAME_MARKERS, PROSE_MAX_LEN, EXTRA_GIVEN_NAMES +├── persons.py # + NameClass, classify_name(), build_given_names(); ResolutionContext gains given_names + self.unresolved +├── normalize.py # writes unresolved-names.csv (replaces ambiguous-receivers.csv) + per-category stats +├── README.md # + unresolved-names.csv row in the review-file table +└── tests/ + ├── test_config.py # + name-table presence test + ├── test_persons.py # + classify_name + build_given_names tests + ├── test_documents.py # ambiguous test → unresolved test (+ resolvable-pair test) + └── test_normalize.py # integration asserts unresolved-names.csv +``` + +--- + +### Task 1: Config — name-classification tables + +**Files:** +- Modify: `tools/import-normalizer/config.py` +- Modify: `tools/import-normalizer/tests/test_config.py` + +- [ ] **Step 1: Add the failing test** to `tests/test_config.py` + +```python +def test_name_classification_tables(): + assert "tante" in config.RELATIONAL_TERMS + assert "familie" in 
config.COLLECTIVE_TERMS + assert "unbekannt" in config.UNKNOWN_NAME_MARKERS + assert config.PROSE_MAX_LEN >= 30 + assert "anita" in config.EXTRA_GIVEN_NAMES +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py::test_name_classification_tables -v && cd -` +Expected: FAIL — `AttributeError: module 'config' has no attribute 'RELATIONAL_TERMS'`. + +- [ ] **Step 3: Implement** — append to `config.py` (after the existing tables, before/after `KNOWN_LAST_NAMES` — anywhere at module level) + +```python +# --- Name classification (unresolved-name review) --- +# Relational reference terms — a sender/receiver named by relation, not a proper name. +RELATIONAL_TERMS = { + "tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter", + "großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine", + "neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter", + "schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer", +} +# Collective/group terms — not a single person. Matched against alpha-only word tokens +# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes. +COLLECTIVE_TERMS = { + "familie", "fam", "kinder", "eltern", "geschwister", "großeltern", + "grosseltern", "alle", "diverse", "div", "gebrüder", "gebr", +} +# Markers of an unknown/illegible name (the literal "?" is handled separately in code). +# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn" +# (it occurs inside real names: Hanni, Johanna, Anna). +UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"} +# A name-column value longer than this (chars) is treated as prose/description, not a name. +PROSE_MAX_LEN = 40 +# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not +# in the family register. 
Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more. +EXTRA_GIVEN_NAMES = { + "ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara", + "margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta", +} +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_config.py -v && cd -` +Expected: PASS (all config tests). + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/config.py tools/import-normalizer/tests/test_config.py +git commit -m "feat(normalizer): config tables for name classification" +``` + +--- + +### Task 2: `classify_name` + `NameClass` + +**Files:** +- Modify: `tools/import-normalizer/persons.py` +- Modify: `tools/import-normalizer/tests/test_persons.py` + +- [ ] **Step 1: Add failing tests** to `tests/test_persons.py` + +```python +from persons import NameClass + +GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"} + +def test_classify_unknown(): + assert persons.classify_name("?", GIVEN) is NameClass.UNKNOWN + assert persons.classify_name("A. 
Kredell?", GIVEN) is NameClass.UNKNOWN + assert persons.classify_name("unbekannt", GIVEN) is NameClass.UNKNOWN + +def test_classify_prose(): + assert persons.classify_name("Adressenliste v Clara Cram zur Kondolenz", GIVEN) is NameClass.PROSE + assert persons.classify_name("Clara de Gruyter(*1871)", GIVEN) is NameClass.PROSE # digit + assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE # quote + +def test_classify_collective(): + assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE + assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE + assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE + assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE + +def test_classify_relational(): + assert persons.classify_name("Cousine Emmy Haniel", GIVEN) is NameClass.RELATIONAL + assert persons.classify_name("Schwester Hanni", GIVEN) is NameClass.RELATIONAL + +def test_classify_single_token(): + assert persons.classify_name("Agnes", GIVEN) is NameClass.SINGLE_TOKEN + assert persons.classify_name("A.B.", GIVEN) is NameClass.SINGLE_TOKEN + +def test_classify_ambiguous_pair(): + assert persons.classify_name("Ella Anita", GIVEN) is NameClass.AMBIGUOUS_PAIR + assert persons.classify_name("Kurt Georg", GIVEN) is NameClass.AMBIGUOUS_PAIR + +def test_classify_resolvable_single_person(): + # first + surname (surname not a given name) -> one real person, NOT ambiguous + assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE + assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -k classify -v && cd -` +Expected: FAIL — `NameClass` / `classify_name` not defined. + +- [ ] **Step 3: Implement** — add to `persons.py`. 
Add `from enum import StrEnum` to the imports if not present, then add: + +```python +class NameClass(StrEnum): + RESOLVABLE = "resolvable" + UNKNOWN = "unknown" + SINGLE_TOKEN = "single_token" + RELATIONAL = "relational" + COLLECTIVE = "collective" + PROSE = "prose" + AMBIGUOUS_PAIR = "ambiguous_pair" + + +_QUOTE_CHARS = "\"'“”„‚‘’" + + +def classify_name(raw: str, given_names: set[str]) -> NameClass: + """Classify a (post-split) sender/receiver string by why it may be unresolvable. + + Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL -> + SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE. + """ + s = raw.strip() + if not s: + return NameClass.RESOLVABLE + low = s.lower() + tokens = s.split() + # alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms + # are matched as whole words (no substring/prefix false positives like "Allerton"). + alpha_words = re.findall(r"[a-zäöüß]+", low) + if "?" in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS): + return NameClass.UNKNOWN + if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s) + or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3): + return NameClass.PROSE + if any(w in config.COLLECTIVE_TERMS for w in alpha_words): + return NameClass.COLLECTIVE + if any(w in config.RELATIONAL_TERMS for w in alpha_words): + return NameClass.RELATIONAL + if len(tokens) == 1: + return NameClass.SINGLE_TOKEN + if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens): + return NameClass.AMBIGUOUS_PAIR + return NameClass.RESOLVABLE + + +# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is +# classified PROSE. Such multi-particle names are rare here and usually resolve via the +# register; if they surface in review, lower-priority than the real prose entries. +``` + +> Note: `_norm` already exists in `persons.py` (added in the alias-index task) and strips accents + lowercases. 
`classify_name` uses it so given-name matching is accent-insensitive. + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -` +Expected: PASS (all persons tests, including the 7 new classify tests). + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py +git commit -m "feat(normalizer): classify_name + NameClass" +``` + +--- + +### Task 3: `build_given_names` + +**Files:** +- Modify: `tools/import-normalizer/persons.py` +- Modify: `tools/import-normalizer/tests/test_persons.py` + +- [ ] **Step 1: Add failing test** to `tests/test_persons.py` + +```python +def test_build_given_names(): + people = persons.parse_register([ + {"last_name": "de Gruyter", "first_name": "Eugenie"}, + {"last_name": "Cram", "first_name": "Charlotte,Meta"}, # comma -> primary + extra given + ]) + g = persons.build_given_names(people, {"Anita"}) + assert "eugenie" in g + assert "charlotte" in g and "meta" in g # primary + extra given names + assert "anita" in g # from the extra set, normalized + assert "schefold" not in g +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py::test_build_given_names -v && cd -` +Expected: FAIL — `build_given_names` not defined. + +- [ ] **Step 3: Implement** — add to `persons.py` + +```python +def build_given_names(register: list[Person], extra: set[str]) -> set[str]: + """Set of normalized given names from the register (first + extra given) plus a supplement. + + Used by classify_name to tell a two-given-name pair (two people) from a first+surname. 
+ """ + names: set[str] = set() + for p in register: + if p.first_name: + names.add(_norm(p.first_name)) + for g in p.extra_given_names: + names.add(_norm(g)) + for e in extra: + names.add(_norm(e)) + return names +``` + +- [ ] **Step 4: Run to verify it passes** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_persons.py -v && cd -` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/tests/test_persons.py +git commit -m "feat(normalizer): build_given_names from register + supplement" +``` + +--- + +### Task 4: Integrate — ResolutionContext records unresolved; orchestrator writes the report + +This task touches `persons.py`, `normalize.py`, and two test files together so the whole suite stays green in one commit (removing `ctx.ambiguous` requires updating its only consumer, `normalize.py`, in the same change). + +**Files:** +- Modify: `tools/import-normalizer/persons.py` (ResolutionContext) +- Modify: `tools/import-normalizer/normalize.py` +- Modify: `tools/import-normalizer/tests/test_documents.py` +- Modify: `tools/import-normalizer/tests/test_normalize.py` + +- [ ] **Step 1: Update the failing tests first** + +In `tests/test_documents.py`, **replace** the existing `test_ambiguous_space_pair_flagged_not_split` function entirely with these two functions: + +```python +def test_ambiguous_pair_recorded_in_unresolved(): + people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}]) + ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}, + given_names={"ella", "anita"}) + raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert len(doc.receiver_person_ids) == 1 # not split — one provisional + assert any(name == "Ella Anita" and cat == "ambiguous_pair" for name, cat, _ in ctx.unresolved) + +def 
test_resolvable_first_surname_pair_not_unresolved(): + ctx = persons.ResolutionContext(persons.AliasIndex([]), name_overrides={}, + given_names={"ella", "anita"}) + ctx.resolve_one("Mieze Schefold", source_row=1) # surname is not a given name + assert ctx.unresolved == [] # RESOLVABLE -> not recorded +``` + +In `tests/test_normalize.py`, in the `_doc_wb` fixture, change the `C-0001` row's receiver from empty to `"?"` so the run produces an unresolved entry. Find the line that appends the `C-0001` row and set its `EmpfängerIn` cell to `"?"`. For example the row currently reads: + +```python + ws.append(["C-0001", "", "", "", "Hans Wittkopf", "", "Freitag 1919", "", "", ""]) +``` + +change the 6th cell (EmpfängerIn) from `""` to `"?"`: + +```python + ws.append(["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""]) +``` + +Then add these assertions inside `test_run_end_to_end`, right after the existing `assert (review_dir / "unparsed-dates.csv").exists()` line: + +```python + assert (out_dir / "canonical-documents.xlsx").exists() # (keep existing asserts above) + assert (review_dir / "unresolved-names.csv").exists() + unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8") + assert "unknown" in unresolved_text and "?" in unresolved_text # the "?" receiver + assert not (review_dir / "ambiguous-receivers.csv").exists() # replaced +``` + +- [ ] **Step 2: Run to verify they fail** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/test_documents.py tests/test_normalize.py -v && cd -` +Expected: FAIL — `ResolutionContext` has no `given_names`/`unresolved`; `unresolved-names.csv` not written. + +- [ ] **Step 3a: Implement — `ResolutionContext` in `persons.py`** + +Replace the `ResolutionContext.__init__` body's two lines (`self.ambiguous` and add `given_names`) and the relevant methods. 
The new `__init__`: + +```python + def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str], + given_names: set[str] | None = None): + self.index = alias_index + self.name_overrides = name_overrides + self.given_names = given_names or set() + self.provisional: dict[str, Person] = {} + self.unmatched: dict[str, list] = {} + self.unresolved: list[tuple] = [] # (raw_name, category, source_row) for non-RESOLVABLE names + self._raw_to_pid: dict[str, str] = {} + self.override_hits = 0 +``` + +In `resolve_one`, the provisional branch must classify the name. Replace this existing block: + +```python + # provisional person (unmatched) — never reuse a register id + self.unmatched.setdefault(name, []).append(source_row) + if name in self._raw_to_pid: + return self._raw_to_pid[name], name, False +``` + +with: + +```python + # provisional person (unmatched) — never reuse a register id + self.unmatched.setdefault(name, []).append(source_row) + category = classify_name(name, self.given_names) + if category is not NameClass.RESOLVABLE: + self.unresolved.append((name, str(category), source_row)) + if name in self._raw_to_pid: + return self._raw_to_pid[name], name, False +``` + +Replace the entire `resolve_receivers` method (the ambiguous detection now lives in `resolve_one` via `classify_name`): + +```python + def resolve_receivers(self, raw: str, source_row: int): + return [self.resolve_one(part, source_row) for part in split_receivers(raw)] +``` + +- [ ] **Step 3b: Implement — `normalize.py`** + +Find the line that builds the context: + +```python + ctx = persons.ResolutionContext(alias_index, name_overrides) +``` + +replace it with (build the given-name set from the register + config supplement): + +```python + given_names = persons.build_given_names(register, config.EXTRA_GIVEN_NAMES) + ctx = persons.ResolutionContext(alias_index, name_overrides, given_names=given_names) +``` + +Replace the `ambiguous-receivers.csv` write line: + +```python + 
writers.write_review_csv(review_dir / "ambiguous-receivers.csv", ["raw", "part", "source_row"], ctx.ambiguous) +``` + +with an aggregated unresolved-names report: + +```python + unresolved_agg: dict[tuple, list] = {} + for name, category, row in ctx.unresolved: + unresolved_agg.setdefault((category, name), []).append(row) + unresolved_rows = sorted( + ([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))] + for (cat, name), rows in unresolved_agg.items()), + key=lambda r: (r[0], -r[2], r[1])) + writers.write_review_csv(review_dir / "unresolved-names.csv", + ["category", "raw", "count", "example_rows"], unresolved_rows) +``` + +In the `stats` dict, replace the `"ambiguous_receivers"` line: + +```python + "ambiguous_receivers": len(ctx.ambiguous), +``` + +with a per-category breakdown: + +```python + "unresolved_name_occurrences": len(ctx.unresolved), + "unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"), + "unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"), + "unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"), + "unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"), + "unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"), + "unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"), +``` + +- [ ] **Step 4: Run the whole suite to verify green** + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/ -q && cd -` +Expected: PASS (all tests, no `ambiguous` references remain). + +Also grep to confirm no dangling references: +Run: `grep -rn "ctx.ambiguous\|ambiguous-receivers\|ambiguous_receivers\|self.ambiguous" tools/import-normalizer/*.py` +Expected: no matches. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons.py tools/import-normalizer/normalize.py tools/import-normalizer/tests/test_documents.py tools/import-normalizer/tests/test_normalize.py +git commit -m "feat(normalizer): unresolved-names report + fix ambiguous-pair over-flagging" +``` + +--- + +### Task 5: README — document the new report + +**Files:** +- Modify: `tools/import-normalizer/README.md` + +- [ ] **Step 1: Update the review-file table** in `README.md`. Replace the `ambiguous-receivers.csv` row with an `unresolved-names.csv` row. Find the table row referencing `ambiguous-receivers.csv` and replace it with: + +```markdown +| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv`. | +``` + +If the README has no such row (older version), add the row above to the review-file table. + +- [ ] **Step 2: Add a note** to the iteration-loop section of `README.md` (after the table): + +```markdown +> `unresolved-names.csv` is the focused "names that need a human" list — distinct from +> `unmatched-names.csv` (which is just non-family correspondents that got provisional persons). +> The given-name set that drives `ambiguous_pair` detection is the register's first names plus +> `config.EXTRA_GIVEN_NAMES` — add names there if a real two-person cell isn't being flagged. +``` + +- [ ] **Step 3: Verify the suite is still green** (README-only change, but confirm nothing references the old file) + +Run: `cd tools/import-normalizer && .venv/bin/python -m pytest tests/ -q && cd -` +Expected: PASS. 
+ +- [ ] **Step 4: Commit** + +```bash +git add tools/import-normalizer/README.md +git commit -m "docs(normalizer): document unresolved-names.csv review report" +``` + +--- + +## Self-Review + +**Spec coverage** (against the agreed proposal): +- Focused report isolating problem name classes → Task 4 writes `review/unresolved-names.csv` with a `category` column; categories defined in Task 2 `classify_name`. ✓ +- Fix ambiguous over-flagging of `First Surname` → Task 2 `AMBIGUOUS_PAIR` requires *both* tokens in the given-name set; `Mieze Schefold` → `RESOLVABLE` (tested). ✓ +- Distinguish "not fully known" (unknown/single-token/relational/collective/prose) from "can't split cleanly" (ambiguous_pair) → all are `NameClass` values, each its own category column value. ✓ +- Per-category counts in summary → Task 4 stats. ✓ +- Senders covered too (not just receivers) → classification happens in `resolve_one`, which both `resolve_sender` and `resolve_receivers` call. ✓ + +**Placeholder scan:** No TBD/TODO; every code step has complete code. The README replacement gives the exact row text. + +**Type consistency:** `NameClass` (StrEnum) defined Task 2; `classify_name(raw, given_names)` and `build_given_names(register, extra)` signatures used consistently in Task 4; `ResolutionContext(alias_index, name_overrides, given_names=…)` matches the new `__init__`; `self.unresolved` is `list[tuple]` of `(raw, category, source_row)` and read with that shape in both the report and the stats. `str(category)` yields the StrEnum value (e.g. `"ambiguous_pair"`), matching the stat comparisons and the test assertions. + +**Cross-task green:** Task 4 deliberately bundles the `persons.py` + `normalize.py` + test changes into one commit because removing `ctx.ambiguous` breaks its consumer otherwise — no red commit is left behind (lesson from the prior build). 
+ +**Out of scope (future):** Spanish month names + `Mon DD-YYYY` date form (separate date-parser enhancement); promoting `unresolved` rows into a document-level `needs_review` flag; auto-splitting confirmed `ambiguous_pair` entries via overrides. diff --git a/docs/import-migration/README.md b/docs/import-migration/README.md new file mode 100644 index 00000000..0eab8b06 --- /dev/null +++ b/docs/import-migration/README.md @@ -0,0 +1,68 @@ +# Import Migration — Working Folder + +This folder tracks the iterative work of mass-importing the **real, raw family archive** +spreadsheets (≈7,600 letter rows + ~7,000 PDFs that arrive later) into Familienarchiv. + +It is intentionally **local docs, not Gitea issues**. We only open a Gitea issue when a +finding requires a *software* change (e.g. a new date parser). Pure data observations and +the running plan live here so any agent can pick the work up cold. + +## Source files (in `/import`) + +| File | What it is | Importer support today | +| --- | --- | --- | +| `zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx` | The **real raw archive** — 7,943 rows, sheet `Familienarchiv`. Human-readable, dates as written in the letters. | ❌ layout does **not** match importer defaults | +| `Personendatei 2.xlsx` | Genealogical **person register** — 163 people, sheet `Tabelle1` (maiden names, birth/death, marriages, relationships). | ❌ no importer at all | +| `zzfamilienarchiv Walter und Eugenie 2025-04-10.ods` | A small, **already-normalized** subset (Walter & Eugenie brautbriefe). 14 clean columns incl. ISO dates. | ✅ this is what `MassImportService` was built for | + +The PDFs (~7,000) will follow later. The importer matches files by the **Index** column +(e.g. `W-0001` → `W-0001.pdf`), and already imports metadata-only when a file is missing — +so we can import all metadata now and the PDFs will attach on a re-run. 
+ +## How to inspect the spreadsheets + +`openpyxl` is installed in the OCR service venv: + +```bash +/home/marcel/Desktop/familienarchiv/ocr-service/.venv/bin/python3 -c "import openpyxl; print(openpyxl.__version__)" +``` + +## Documents in this folder + +- [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) — full analysis of every data-quality / importer issue found (2026-05-25). Each issue has an ID `IMP-NN`. +- [`02-normalization-spec.md`](./02-normalization-spec.md) — requirements spec for the offline **import normalizer** (the agreed strategy: normalize the raw sheets into a clean canonical dataset before import). Requirements `FR-*`/`NFR-*`, traceable to the `IMP-NN` findings. +- `WORKLOG.md` — running log of what each session did and what's next. **Start here when resuming.** + +## Strategy (decided 2026-05-25) + +Normalize **before** import. A standalone Python tool (`tools/import-normalizer/`, not yet +built) transforms the raw xlsx + person register into a clean canonical dataset +(`canonical-documents.xlsx`, `canonical-persons.xlsx`) plus review CSVs. Residual cases +(unparseable dates, unmatched names) are fixed via a version-controlled overrides file and +re-run. The Java importer is adjusted to consume the canonical contract in a later **Phase 2**. +See the spec for the full contract. + +The canonical artifacts themselves (the `out/` files) are **produced locally and not +version-controlled** — they contain real family PII. They are synced onto the ops host's +`IMPORT_HOST_DIR` alongside the PDFs, out-of-band. The contract is the header schema in +`02-normalization-spec.md` §6, not any particular file in `out/`. See ADR-025 for the full +rationale. 
+ +## Status board + +| ID | Issue | Severity | Status | +| --- | --- | --- | --- | +| IMP-01 | New xlsx column layout ≠ importer defaults | 🔴 blocker | open | +| IMP-02 | 90% of dates are free-text the parser can't read | 🔴 blocker | open | +| IMP-03 | No ISO/normalized date column in the new xlsx | 🔴 blocker | open | +| IMP-04 | Person register (`Personendatei 2.xlsx`) not imported | 🟠 major | open | +| IMP-05 | Name variations = duplicate Persons (maiden vs married) | 🟠 major | open | +| IMP-06 | 93 data rows with blank Index are silently dropped | 🟠 major | open | +| IMP-07 | 43 duplicate Index values | 🟡 minor | open | +| IMP-08 | Section/title rows interleaved in data | 🟡 minor | open | +| IMP-09 | Index↔Datei filename mismatches | 🟡 minor | open | +| IMP-10 | `x`-suffix rows (letter backsides/enclosures) | 🟡 minor | open | +| IMP-11 | Multi-receiver separators incl. bare `u`/`u.` | 🟡 minor | open | +| IMP-12 | Importer reads only the first sheet, no validation | 🟡 minor | open | + +See the findings doc for detail and proposed approach per issue. diff --git a/docs/import-migration/WORKLOG.md b/docs/import-migration/WORKLOG.md new file mode 100644 index 00000000..6c41792f --- /dev/null +++ b/docs/import-migration/WORKLOG.md @@ -0,0 +1,147 @@ +# Import Migration — Worklog + +Running log of each working session. **Resume here.** Newest entry on top. + +--- + +## 2026-05-25 (session 5) — Unresolved-name classification + +**Did:** Implemented [`04-unresolved-names-plan.md`](./04-unresolved-names-plan.md) subagent-driven +(5 tasks, TDD, per-task spec + code-quality review; 67 tests pass). Added `classify_name` + +`NameClass` + `build_given_names` in `persons.py`; `ResolutionContext` now records non-RESOLVABLE +names in `self.unresolved`; orchestrator writes `review/unresolved-names.csv` (replaces the noisy +`ambiguous-receivers.csv`) with per-category stats. 
+ +**Why:** `unmatched-names.csv` mixes boring non-family correspondents (expected) with genuinely +unresolvable entries. The new report isolates the latter so review focuses on ~440 real cases. + +**Real-run result:** unresolved-names.csv = single_token 191 / prose 103 / unknown 74 / +collective 46 / relational 21 / ambiguous_pair **5** (distinct). The ambiguous over-flagging fix +cut `ambiguous_pair` from 303 → 5 (genuine two-given-name pairs only; `Mieze Schefold` etc. now +correctly RESOLVABLE). given-name set = register first names ∪ `config.EXTRA_GIVEN_NAMES`. + +**Next:** populate `overrides/names.csv` from unresolved-names.csv (highest-count first); extend +`EXTRA_GIVEN_NAMES` if a real pair isn't flagged; still-open date work (Spanish months, 58–72 band). + +--- + +## 2026-05-25 (session 4) — Built the normalizer (subagent-driven, all 17 tasks) + +**Did:** Executed the plan subagent-driven (implementer + spec review + code-quality review per +task). The tool `tools/import-normalizer/` is **complete and passing (57 tests)**. Final +opus review: **READY** — determinism verified on the real corpus (two runs → identical cell +matrices + byte-identical review files), zero silent drops. + +**Per-task code review caught & fixed real issues** (all in the committed code): leading +qualifiers `nach/vor/…` now → APPROX; English month-first matcher hardened to structurally +not shadow `Mai 1895`; person-id collision de-dup suffixes *all* members; `split_receivers` +returns `[]` for a `geb.`-only cell; boolean cells no longer coerced to `1/0`; duplicate-index +flags every occurrence; provisional ids never steal a register id; CSV-injection defanged. 
+ +**REAL DRY-RUN** (`python normalize.py` over the actual archive — outputs are gitignored): +- documents_emitted **7,582** (+225 empty +93 blank-index +42 x-suffix = 7,942 rows read, 0 dropped) +- register_persons **163**, provisional_persons **942** +- dates: DAY 6,509 / MONTH 36 / RANGE 36 / APPROX 28 / YEAR 17 / SEASON 1 / UNKNOWN 955 +- **unknown_date_rate 9.2%** (of dated rows; target ≤5% pre-override, ≤0.5% after overrides) +- duplicate_index 85, index_file_mismatches 550, ambiguous_receivers 303 + +**⚠️ Concurrency incident:** a parallel Claude session committed reader-dashboard work to this +branch and hard-reset it mid-execution, deleting the Task 15 files and orphaning a commit. +Recovered via reflog (`reset --hard 366b4848` + `checkout 401160e3 -- `); no code +lost. Casualty: my *during-execution* edits to the plan/spec docs (02/03) for Tasks 5–14 were +discarded — **the committed code + tests are the source of truth**, not the plan doc, which now +reflects the pre-execution + persona-review version. + +**Next steps (iterative refinement — the overrides loop, as designed):** +1. Shave the 9.2% UNKNOWN cheaply: add **Spanish month names** (Enero…Diciembre) and the + `Mon DD-YYYY` dash form to `config.MONTHS`/the parser (Mexican-branch correspondence); + revisit the 58–72 two-digit-year band (real `…58/59/60` dates = 1958–1960, just past the + 1873–1957 window — decide whether to extend the upper bound in `config`). +2. `?` (99×) is genuinely "date unknown" — leave UNKNOWN or add a convention. +3. Populate `overrides/dates.csv` + `overrides/names.csv` from the review CSVs and re-run. +4. README note: a leading `'`/`!` in a `review/*.csv` `raw` cell may be a CSV-defang artifact — + match against the true source value when writing overrides. +5. Phase 2 (separate spec): wire the canonical contract into the Java `MassImportService`. 
+ +--- + +## 2026-05-25 (session 3) — Implementation plan + persona review + +**Did:** +- Wrote [`03-normalizer-implementation-plan.md`](./03-normalizer-implementation-plan.md): 17 + bite-sized TDD tasks for `tools/import-normalizer/` (Python, openpyxl), bottom-up — date + parser w/ Easter computus first, then persons/alias, ingest, mapping, orchestrator, writers. +- Ran a 6-persona inline review (architect, developer, tester, req-engineer, security, devops; + ui-expert too) via parallel agents. Acted on all material findings. + +**Key fixes from review (see plan §"Review feedback incorporated"):** +- Idempotency redefined byte-identical → **content-deterministic** (spec G4/NFR-IDEM-01); + pinned workbook timestamps + deterministic alias ordering + a real two-run equality test. +- Real bug: duplicate-index only reported repeats → now flags/reports every occurrence. +- Provisional `person_id` could overwrite a register id → now suffixed. +- Date parser gaps: invalid-calendar-date → UNKNOWN, intra-month day-range (`7./8. Sept.1923`). +- Multi-person sender now split + flagged (REQ-PERS-01); CSV-injection defanged in review files; + pinned deps + hardened root `.gitignore`. + +**Next:** +- Marcel reviews the plan. Then execute it (subagent-driven or inline) — the date parser + (Task 3/8 + Easter computus) is the meatiest piece. + +--- + +## 2026-05-25 (session 2) — Strategy + normalizer spec + +**Did:** +- Decided strategy with Marcel: **normalize the raw sheets first**, then import (higher + leverage than making the Java importer tolerate every mess). +- Locked design decisions (see spec §3): new canonical layout; dates = parsed + raw + + precision; include person register + dedup in this effort; overrides-file + re-run loop; + Python tool at `tools/import-normalizer/`. +- Century rule fixed by Marcel: archive spans **1873–1957**; 2-digit `00–57`→19YY, + `73–99`→18YY, `58–72`→flag; 3-digit→1DDD; never 20xx. 
+- Wrote [`02-normalization-spec.md`](./02-normalization-spec.md) in the requirements-engineer + persona (FR/NFR, Given-When-Then ACs, traceability to IMP-NN, TBD register). + +**All 6 open questions resolved (spec §9):** OQ-01 — movable feasts (Ostern, Pfingsten, …) +**computed per year from Easter**, never a fixed month; seasons → mid-season month +(Sommer=Jul, Herbst=Oct). OQ-02 ranges → start+RANGE. OQ-03 slug ids. OQ-04 — `x`-suffix rows +**skipped + logged** this pass (they're transcriptions of the base letter, not yet mappable). +OQ-05 → `.xlsx`. OQ-06 → conservative, no silent merge. + +**Git:** moved off the unrelated `feat/issue-356-…` branch; pulled `main`; created clean +branch **`docs/import-migration`** and committed these docs there. (The dirty `.venv` +pycache + `skills/implement/SKILL.md` in the tree are pre-existing/environmental noise — left +uncommitted, not ours.) + +**Next:** +- Marcel reviews the spec. +- Then writing-plans → build the normalizer at `tools/import-normalizer/` (backlog B1–B7 are + the Musts; B3 date parser incl. Easter computus is the big one). + +--- + +## 2026-05-25 (session 1) — Initial analysis + +**Did:** +- Got the real raw archive xlsx (7,943 rows) + person register (163 people). PDFs to follow. +- Compared the new xlsx layout against `MassImportService` defaults and the old ODS. +- Full statistical scan of all rows: dates, indices, senders/receivers, file column. +- Wrote [`01-findings-spreadsheet-analysis.md`](./01-findings-spreadsheet-analysis.md) + with 12 issues (IMP-01..IMP-12) + recommended sequencing. +- Installed `openpyxl` into the OCR service venv for inspection. + +**Key facts established:** +- Importer defaults match the **ODS**, not the new xlsx → wrong column mapping (IMP-01). +- **90%** of dated rows (6,571 / 7,319) are free-text dates the ISO-only parser drops (IMP-02). +- Person register is rich but **unimported**; holds the maiden-name dedup key (IMP-04/05). 
+ +**Decisions pending from Marcel (blockers for any code work):** +1. IMP-01: positional re-config of `app.import.col.*` vs header-driven mapping rewrite? +2. IMP-02: how to store imprecise dates — new `dateOriginal` + `precision` columns, or lossy? +3. IMP-04/05: format for the person/alias mapping; import persons before documents? +4. IMP-10: are `x`-suffix rows separate documents, attachments, or skipped? + +**Next:** +- Get Marcel's calls on the 4 decisions above. +- Then split the code-change items into Gitea issues (IMP-01b, IMP-02, IMP-04, IMP-06, IMP-12). +- Pure-data tasks (IMP-07 dup list, IMP-09 file reconcile) stay here. diff --git a/docs/superpowers/plans/2026-05-25-personendatei-importer.md b/docs/superpowers/plans/2026-05-25-personendatei-importer.md new file mode 100644 index 00000000..f1e8a6e0 --- /dev/null +++ b/docs/superpowers/plans/2026-05-25-personendatei-importer.md @@ -0,0 +1,1329 @@ +# Personendatei Importer Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add `tools/import-normalizer/persons_tree.py` — a CLI tool that reads `import/Personendatei 2.xlsx` and writes `out/canonical-persons-tree.json` with 163 normalized person records, SPOUSE_OF/PARENT_OF relationship edges, and an `unresolved[]` list for manual review. + +**Architecture:** Two-pass approach: pass 1 parses all rows into person dicts and builds a name-lookup index; pass 2 resolves `verheiratet mit` (SPOUSE_OF) and parses `Bemerkung` for parent/child patterns (PARENT_OF). Reuses `ingest.read_sheet()`, `ingest.build_header_map()`, `dates.parse_date()`, and `persons._strip_accents` from the existing normalizer. No backend required. 
+ +**Tech Stack:** Python 3.12, openpyxl (already in `.venv`), pytest (already in `.venv`), `dates.py`/`ingest.py`/`config.py`/`persons.py` from `tools/import-normalizer/`. + +--- + +## Context you need before starting + +**Run environment:** +```bash +cd tools/import-normalizer +source .venv/bin/activate # or: .venv/bin/python / .venv/bin/pytest directly +``` + +**Key existing modules (read these before coding):** +- `config.py` — `PERSON_WORKBOOK`, `PERSON_SHEET`, `PERSON_HEADER_MAP`, `OUT_DIR` +- `ingest.py` — `read_sheet(path, sheet_name) -> list[list[str]]` and `build_header_map(header_row, field_map, required)` +- `dates.py` — `parse_date(raw: str) -> ParsedDate` with `.iso` (ISO string or None) and `.precision` +- `persons.py` — `_strip_accents(s)` (diacritic normalization) + +**How ingest works:** `read_sheet()` opens the workbook with openpyxl and converts every cell to a string via `_cell_to_str()`. Date-formatted cells become ISO strings (`"1920-09-20"`). Cells stored as plain numbers (like the date serials in this file) become numeric strings (`"7568"`). All values arrive in `persons_tree.py` as strings. + +**PERSON_HEADER_MAP** (already in `config.py`): +```python +{ + "generation": "generation", + "familienname": "last_name", + "vorname": "first_name", + "geb als": "maiden_name", + "geburtsdatum": "birth_date", + "geburtsort": "birth_place", + "todesdatum": "death_date", + "sterbeort": "death_place", + "verheiratet mit": "spouse", + "bemerkung": "notes", +} +``` + +**File structure:** +- Create: `tools/import-normalizer/persons_tree.py` +- Create: `tools/import-normalizer/tests/test_persons_tree.py` + +--- + +## Task 1: Year extraction from cell string + +**Files:** +- Create: `tools/import-normalizer/persons_tree.py` +- Create: `tools/import-normalizer/tests/test_persons_tree.py` + +The trickiest part of this tool. 
Birth/death cells arrive as strings from `ingest.read_sheet()`: +- Date-formatted cells: ISO string `"1920-09-20"` → `parse_date()` handles it +- Plain number cells (the majority): numeric string `"7568"` → `parse_date("7568")` returns UNKNOWN (7568 > 2100 so `expand_year()` rejects it) → we must detect this and apply Excel serial conversion: `date(1899,12,30) + timedelta(days=7568)` → 1920 +- German string dates: `"30.8.1862"` → `parse_date()` handles it +- Year-only: `"1930"` → `parse_date()` handles it +- Free text: `"August 1941"` → `parse_date()` handles it +- Unresolvable: `"2.9.196"`, `"4.3.1023"` → return None + +- [ ] **Step 1: Write the failing tests** + +Create `tools/import-normalizer/tests/test_persons_tree.py`: + +```python +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import persons_tree + + +def test_parse_year_iso_string(): + assert persons_tree._parse_year("1920-09-20") == 1920 + + +def test_parse_year_excel_serial_birth(): + # 7568 days from 1899-12-30 = 1920-09-19 or -20 depending on leap counting + assert persons_tree._parse_year("7568") == 1920 + + +def test_parse_year_excel_serial_death(): + # 36222 days from 1899-12-30 ≈ 1999 + assert persons_tree._parse_year("36222") == 1999 + + +def test_parse_year_excel_serial_small(): + # 177 days from 1899-12-30 = 1900-06-25 + assert persons_tree._parse_year("177") == 1900 + + +def test_parse_year_german_date_string(): + assert persons_tree._parse_year("30.8.1862") == 1862 + + +def test_parse_year_year_only(): + assert persons_tree._parse_year("1930") == 1930 + + +def test_parse_year_free_text(): + assert persons_tree._parse_year("August 1941") == 1941 + + +def test_parse_year_none(): + assert persons_tree._parse_year(None) is None + + +def test_parse_year_empty(): + assert persons_tree._parse_year("") is None + + +def test_parse_year_unresolvable_truncated(): + # "2.9.196" has no valid 4-digit year — returns None + assert 
persons_tree._parse_year("2.9.196") is None + + +def test_parse_year_typo_year(): + # "4.3.1023" — year 1023 outside 1500-2100 guard — returns None + assert persons_tree._parse_year("4.3.1023") is None +``` + +- [ ] **Step 2: Run tests — verify they all fail with ImportError or NameError** + +```bash +cd tools/import-normalizer +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: `ImportError: No module named 'persons_tree'` + +- [ ] **Step 3: Create `persons_tree.py` with `_parse_year`** + +```python +"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json.""" +import argparse +import datetime +import json +import re +import sys +from pathlib import Path + +import config +import dates +from persons import _strip_accents + + +def _parse_year(raw: str | None) -> int | None: + """Extract a birth/death year from an Excel cell string. + + Handles four cases: + 1. ISO string (openpyxl date-formatted cell) → parse_date() + 2. Numeric string that is an Excel serial (1-80000) → timedelta conversion + 3. Any other string → parse_date() + 4. Unresolvable → None + """ + if raw is None: + return None + s = str(raw).strip() + if not s: + return None + + # Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.) + result = dates.parse_date(s) + if result.iso: + return int(result.iso[:4]) + + # If it's a pure integer string, try Excel serial conversion. + # parse_date() returns UNKNOWN for serials like "7568" because 7568 > 2100. + if re.fullmatch(r"\d+", s): + n = int(s) + if 1 <= n <= 80_000: + d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) + if 1500 <= d.year <= 2100: + return d.year + + return None +``` + +- [ ] **Step 4: Run tests — verify they pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 11 tests PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add persons_tree skeleton + year extraction" +``` + +--- + +## Task 2: Generation number parsing + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +Column A has values like `"G 3"`, `"G3"`, `"G 0"` (sometimes with extra spaces), `"G 2 de Gruyter"`. Extract the first digit sequence. + +- [ ] **Step 1: Write failing tests** + +Append to `tests/test_persons_tree.py`: + +```python +def test_parse_generation_space(): + assert persons_tree._parse_generation("G 3") == 3 + + +def test_parse_generation_no_space(): + assert persons_tree._parse_generation("G3") == 3 + + +def test_parse_generation_extra_spaces(): + assert persons_tree._parse_generation("G 0") == 0 + + +def test_parse_generation_trailing_garbage(): + assert persons_tree._parse_generation("G 2 de Gruyter") == 2 + + +def test_parse_generation_empty(): + assert persons_tree._parse_generation("") is None + + +def test_parse_generation_none(): + assert persons_tree._parse_generation(None) is None +``` + +- [ ] **Step 2: Run — expect AttributeError** + +```bash +.venv/bin/pytest tests/test_persons_tree.py::test_parse_generation_space -v +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute '_parse_generation'` + +- [ ] **Step 3: Implement `_parse_generation`** + +Add to `persons_tree.py` after `_parse_year`: + +```python +def _parse_generation(raw: str | None) -> int | None: + """Extract the generation integer from column A values like 'G 3', 'G3', 'G 0'.""" + if not raw: + return None + m = re.search(r"\d+", str(raw)) + return int(m.group()) if m else None +``` + +- [ ] **Step 4: Run — expect all generation tests pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 17 tests PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add generation parser to persons_tree" +``` + +--- + +## Task 3: Name normalization and lookup index + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +The lookup index maps normalized name strings to lists of `rowId`s. `_norm_tree` extends `persons._norm` with parenthetical stripping and geographic suffix removal. The index is built with four keys per person: `"first last"`, `"last first"`, `"first maiden"`, and `last` alone (for single-token fallback). + +- [ ] **Step 1: Write failing tests** + +Append to `tests/test_persons_tree.py`: + +```python +def test_norm_tree_basic(): + assert persons_tree._norm_tree("Werner Allemeyer") == "werner allemeyer" + + +def test_norm_tree_diacritics(): + assert persons_tree._norm_tree("Wöhler") == "woehler" + + +def test_norm_tree_strips_parens(): + assert persons_tree._norm_tree("Otto (Herbert)") == "otto" + + +def test_norm_tree_strips_quotes(): + assert persons_tree._norm_tree('"Tante Lolly"') == "tante lolly" + + +def test_norm_tree_strips_geographic_suffix(): + assert persons_tree._norm_tree("Walter Cram Aachen") == "walter cram" + + +def test_norm_tree_strips_mexiko(): + assert persons_tree._norm_tree("Hans Cram Mexiko") == "hans cram" + + +def test_norm_tree_collapses_whitespace(): + assert persons_tree._norm_tree(" Clara de Gruyter ") == "clara de gruyter" + + +def test_build_index_forward_lookup(): + persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}] + idx = persons_tree._build_index(persons) + assert "werner allemeyer" in idx + assert idx["werner allemeyer"] == ["row_002"] + + +def test_build_index_reversed_lookup(): + persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}] 
+ idx = persons_tree._build_index(persons) + # col I uses reversed order: "Allemeyer Werner" + assert idx.get("allemeyer werner") == ["row_002"] + + +def test_build_index_maiden_name_lookup(): + persons = [{"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}] + idx = persons_tree._build_index(persons) + # maiden-name form: "Elsgard Wöhler" -> "elsgard woehler" + assert idx.get("elsgard woehler") == ["row_002"] + + +def test_build_index_single_token_fallback(): + persons = [{"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}] + idx = persons_tree._build_index(persons) + assert idx.get("cram") == ["row_028"] + + +def test_build_index_ambiguous_single_token(): + persons = [ + {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}, + {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}, + ] + idx = persons_tree._build_index(persons) + # "cram" alone is ambiguous — both rows map to it + assert set(idx["cram"]) == {"row_028", "row_019"} + + +def test_resolve_one_found(): + persons = [{"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}] + idx = persons_tree._build_index(persons) + row_id, reason = persons_tree._resolve_one("Allemeyer Werner", idx) + assert row_id == "row_003" + assert reason is None + + +def test_resolve_one_not_found(): + idx = {} + row_id, reason = persons_tree._resolve_one("Nobody Unknown", idx) + assert row_id is None + assert reason == "not_found" + + +def test_resolve_one_ambiguous(): + persons = [ + {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}, + {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None}, + ] + idx = persons_tree._build_index(persons) + row_id, reason = persons_tree._resolve_one("Cram", idx) + assert row_id is None + assert reason == "ambiguous" +``` + +- [ ] **Step 2: Run — expect failures** + 
+```bash +.venv/bin/pytest tests/test_persons_tree.py -v -k "norm_tree or build_index or resolve_one" +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute '_norm_tree'` + +- [ ] **Step 3: Implement `_norm_tree`, `_build_index`, `_resolve_one`** + +Add to `persons_tree.py`: + +```python +_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"} + + +def _norm_tree(s: str) -> str: + """Normalize a name string for tree matching. + + - Lowercase + diacritic → ASCII (uses persons._strip_accents logic) + - Strip surrounding quote characters + - Remove parenthetical substrings: "(Herbert)" → "" + - Replace dots with spaces (e.g. "Jr." → "Jr ") + - Remove known geographic/honorific suffix tokens + - Collapse whitespace + """ + s = (s or "").strip().strip("\"'") + s = re.sub(r"\([^)]*\)", "", s) + s = _strip_accents(s).lower().replace(".", " ") + tokens = [t for t in s.split() if t and t not in _GEO_SUFFIXES] + return " ".join(tokens).strip("., ") + + +def _build_index(persons: list[dict]) -> dict[str, list[str]]: + """Build a name → [rowId, …] lookup index with four keys per person.""" + index: dict[str, list[str]] = {} + + def _add(key: str, row_id: str) -> None: + if key: + index.setdefault(key, []).append(row_id) + + for p in persons: + row_id = p["rowId"] + first = p.get("firstName") or "" + last = p.get("lastName") or "" + maiden = p.get("maidenName") or "" + + _add(_norm_tree(f"{first} {last}"), row_id) # "Werner Allemeyer" + _add(_norm_tree(f"{last} {first}"), row_id) # "Allemeyer Werner" (col I order) + if maiden: + _add(_norm_tree(f"{first} {maiden}"), row_id) # maiden-name reference + _add(_norm_tree(last), row_id) # single-token fallback + + return index + + +def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]: + """Return (row_id, None) on unique match, (None, reason) otherwise.""" + key = _norm_tree(raw) + if not key: + return None, "empty" + hits = index.get(key, []) + if len(hits) == 1: + return 
hits[0], None + if len(hits) == 0: + return None, "not_found" + return None, "ambiguous" +``` + +- [ ] **Step 4: Run — all tests pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 32 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add name normalization + lookup index to persons_tree" +``` + +--- + +## Task 4: Row-level person parsing (pass 1) + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +`_parse_row(row_num, fields)` takes a 1-based row number and a field dict (from `build_header_map`) and produces the person record. Unresolvable date raw values are preserved in notes as `[Geburtsdatum: …]` / `[Todesdatum: …]` annotations placed before the Bemerkung text. Internal keys `_spouse_raw` and `_bemerkung_raw` carry forward to pass 2 and are stripped before JSON output. + +- [ ] **Step 1: Write failing tests** + +Append to `tests/test_persons_tree.py`: + +```python +def test_parse_row_serial_dates(): + fields = { + "generation": "G 3", "last_name": "Allemeyer", "first_name": "Elsgard", + "maiden_name": "Wöhler", "birth_date": "7568", "birth_place": "Garz", + "death_date": "36222", "death_place": "Espelkamp", + "spouse": "Allemeyer Werner", "notes": "Nichte von Herbert", + } + p = persons_tree._parse_row(2, fields) + assert p["rowId"] == "row_002" + assert p["firstName"] == "Elsgard" + assert p["lastName"] == "Allemeyer" + assert p["maidenName"] == "Wöhler" + assert p["birthYear"] == 1920 + assert p["deathYear"] == 1999 + assert p["birthPlace"] == "Garz" + assert p["deathPlace"] == "Espelkamp" + assert p["generation"] == 3 + assert p["familyMember"] is True + assert p["_spouse_raw"] == "Allemeyer Werner" + assert p["_bemerkung_raw"] == "Nichte von Herbert" + # no date annotation in notes because both dates resolved + assert "[Geburtsdatum" not in (p["notes"] or "") + + +def 
test_parse_row_string_birth_date(): + fields = { + "generation": "G 2", "last_name": "Cram", "first_name": "Herbert", + "maiden_name": "", "birth_date": "25.6.1890", "birth_place": "Texas", + "death_date": "", "death_place": "", "spouse": "", "notes": "", + } + p = persons_tree._parse_row(28, fields) + assert p["birthYear"] == 1890 + assert p["deathYear"] is None + assert p["notes"] is None or p["notes"] == "" + + +def test_parse_row_unresolvable_date_goes_to_notes(): + fields = { + "generation": "G 3", "last_name": "Heydrich", "first_name": "Dieter", + "maiden_name": "", "birth_date": "28.9.", "birth_place": "", + "death_date": "", "death_place": "", "spouse": "", "notes": "Bruder v Ingrid", + } + p = persons_tree._parse_row(96, fields) + assert p["birthYear"] is None + assert "[Geburtsdatum: 28.9.]" in p["notes"] + assert "Bruder v Ingrid" in p["notes"] + + +def test_parse_row_empty_spouse_and_notes(): + fields = { + "generation": "G 4", "last_name": "Allemeyer", "first_name": "Jürgen", + "maiden_name": "", "birth_date": "", "birth_place": "", + "death_date": "", "death_place": "", "spouse": "", "notes": "", + } + p = persons_tree._parse_row(4, fields) + assert p["_spouse_raw"] is None + assert p["_bemerkung_raw"] is None +``` + +- [ ] **Step 2: Run — expect NameError** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -k "parse_row" -v +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute '_parse_row'` + +- [ ] **Step 3: Implement `_parse_row`** + +Add to `persons_tree.py`: + +```python +def _parse_row(row_num: int, fields: dict) -> dict: + """Produce one person record from a header-mapped row dict. + + Internal keys prefixed with '_' are stripped before JSON output in main(). 
+ """ + def s(key: str) -> str: + return (fields.get(key) or "").strip() + + birth_raw = s("birth_date") + death_raw = s("death_date") + + birth_year = _parse_year(birth_raw) + death_year = _parse_year(death_raw) + + notes_parts = [] + if birth_raw and birth_year is None: + notes_parts.append(f"[Geburtsdatum: {birth_raw}]") + if death_raw and death_year is None: + notes_parts.append(f"[Todesdatum: {death_raw}]") + bemerkung = s("notes") + if bemerkung: + notes_parts.append(bemerkung) + + maiden = s("maiden_name") or None + spouse = s("spouse") or None + bemerkung_out = bemerkung or None + + return { + "rowId": f"row_{row_num:03d}", + "firstName": s("first_name"), + "lastName": s("last_name"), + "maidenName": maiden, + "alias": None, + "notes": " ".join(notes_parts) or None, + "birthYear": birth_year, + "deathYear": death_year, + "birthPlace": s("birth_place") or None, + "deathPlace": s("death_place") or None, + "generation": _parse_generation(s("generation")), + "familyMember": True, + "_spouse_raw": spouse, + "_bemerkung_raw": bemerkung_out, + } +``` + +- [ ] **Step 4: Run — all tests pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 40 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add row parser to persons_tree" +``` + +--- + +## Task 5: Deduplication + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +Two-stage deduplication: +1. Exact `(firstName, lastName, birthYear)` match — catches rows 127/138 (same name + serial). +2. `(firstName, lastName)` match where the later entry has `birthYear=None` and an earlier entry has a birthYear — catches rows 129/139 (one has a date, the other doesn't). 
+ +- [ ] **Step 1: Write failing tests** + +Append to `tests/test_persons_tree.py`: + +```python +def test_deduplicate_no_duplicates(): + persons = [ + {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920}, + {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert len(result) == 2 + assert skipped == [] + + +def test_deduplicate_exact_match(): + # rows 127/138: same firstName, lastName, birthYear + persons = [ + {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}, + {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert [p["rowId"] for p in result] == ["row_127"] + assert len(skipped) == 1 + assert "row_138" in skipped[0] + + +def test_deduplicate_none_birth_year_after_known(): + # rows 129/139: row 129 has birthYear=1964, row 139 has birthYear=None + persons = [ + {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964}, + {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert [p["rowId"] for p in result] == ["row_129"] + assert len(skipped) == 1 + + +def test_deduplicate_both_none_birth_year_kept(): + # Two people with no birth year but same name: keep first only + persons = [ + {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None}, + {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None}, + ] + result, skipped = persons_tree._deduplicate(persons) + assert [p["rowId"] for p in result] == ["row_A"] + assert len(skipped) == 1 +``` + +- [ ] **Step 2: Run — expect AttributeError** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -k "deduplicate" -v +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute
'_deduplicate'` + +- [ ] **Step 3: Implement `_deduplicate`** + +Add to `persons_tree.py`: + +```python +def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]: + """Remove duplicate rows. Two-stage: + + 1. Exact (firstName, lastName, birthYear) match. + 2. (firstName, lastName) where the later entry has birthYear=None and an earlier + entry already has a known birthYear. + """ + seen_full: dict[tuple, str] = {} # (first, last, year) -> rowId + seen_name: dict[tuple, str] = {} # (first, last) -> rowId of first entry with a year + result: list[dict] = [] + skipped: list[str] = [] + + for p in persons: + first, last, year = p["firstName"], p["lastName"], p["birthYear"] + key_full = (first, last, year) + key_name = (first, last) + + if key_full in seen_full: + skipped.append(f"{p['rowId']} duplicates {seen_full[key_full]} ({first} {last}, year={year})") + continue + + if year is None and key_name in seen_name: + skipped.append(f"{p['rowId']} duplicates {seen_name[key_name]} ({first} {last}, no birth year)") + continue + + seen_full[key_full] = p["rowId"] + if year is not None: + seen_name[key_name] = p["rowId"] + + result.append(p) + + return result, skipped +``` + +- [ ] **Step 4: Run — all tests pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 44 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add deduplication to persons_tree" +``` + +--- + +## Task 6: SPOUSE_OF relationship extraction + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +Walk every person's `_spouse_raw`, resolve via the name index, and emit one `SPOUSE_OF` edge per matched pair. Skip if an identical edge (either direction) already exists. Unresolved entries go to `unresolved[]`. 
+ +- [ ] **Step 1: Write failing tests** + +Append to `tests/test_persons_tree.py`: + +```python +def _make_persons(*args): + """Helper: args are (rowId, firstName, lastName, maidenName, spouse_raw) tuples.""" + return [ + {"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3], + "_spouse_raw": a[4], "_bemerkung_raw": None, + "birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None, + "generation": None, "familyMember": True, "alias": None, "notes": None} + for a in args + ] + + +def test_resolve_spouses_success(): + persons = _make_persons( + ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"), + ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"), + ) + idx = persons_tree._build_index(persons) + rels, unres = persons_tree._resolve_spouses(persons, idx) + # Both rows reference each other, but only ONE edge should be emitted + assert len(rels) == 1 + assert rels[0]["type"] == "SPOUSE_OF" + assert set([rels[0]["personId"], rels[0]["relatedPersonId"]]) == {"row_002", "row_003"} + assert unres == [] + + +def test_resolve_spouses_not_found(): + persons = _make_persons( + ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'), + ) + idx = persons_tree._build_index(persons) + rels, unres = persons_tree._resolve_spouses(persons, idx) + assert rels == [] + assert len(unres) == 1 + assert unres[0]["rowId"] == "row_007" + assert unres[0]["reason"] == "not_found" + + +def test_resolve_spouses_empty_spouse_field(): + persons = _make_persons( + ("row_004", "Jürgen", "Allemeyer", None, None), + ) + idx = persons_tree._build_index(persons) + rels, unres = persons_tree._resolve_spouses(persons, idx) + assert rels == [] and unres == [] +``` + +- [ ] **Step 2: Run — expect AttributeError** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -k "resolve_spouses" -v +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute '_resolve_spouses'` + +- [ ] **Step 3: Implement `_resolve_spouses`** + +Add to
`persons_tree.py`: + +```python +def _resolve_spouses( + persons: list[dict], index: dict[str, list[str]] +) -> tuple[list[dict], list[dict]]: + """Emit SPOUSE_OF edges from each person's _spouse_raw field.""" + relationships: list[dict] = [] + unresolved: list[dict] = [] + emitted: set[frozenset] = set() + + for p in persons: + raw = (p.get("_spouse_raw") or "").strip() + if not raw: + continue + row_id = p["rowId"] + matched_id, reason = _resolve_one(raw, index) + if matched_id: + edge = frozenset([row_id, matched_id]) + if edge not in emitted: + emitted.add(edge) + relationships.append({ + "personId": row_id, + "relatedPersonId": matched_id, + "type": "SPOUSE_OF", + "source": "verheiratet_mit", + }) + else: + unresolved.append({ + "rowId": row_id, + "field": "verheiratet_mit", + "raw": raw, + "reason": reason, + }) + + return relationships, unresolved +``` + +- [ ] **Step 4: Run — all tests pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 47 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add SPOUSE_OF resolution to persons_tree" +``` + +--- + +## Task 7: PARENT_OF extraction from Bemerkung + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +Two patterns anchored at start-of-string: +- `Sohn|Tochter + v(on)? + names` → named persons are parents of this row's person +- `Vater|Mutter + v(on)? + names` → this row's person is parent of named persons + +Names after the keyword may be two people joined by ` u ` or ` und `. Each part is resolved independently. Unmatched parts go to `unresolved[]`. The matched portion is stripped from `notes`; the remainder of the Bemerkung stays in `notes`. + +Everything that doesn't match any parent pattern goes to `notes` unchanged (no unresolved entry). 
+ +- [ ] **Step 1: Write failing tests** + +Append to `tests/test_persons_tree.py`: + +```python +def _register(*args): + """Build index from (rowId, first, last, maiden) tuples.""" + persons = [ + {"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3]} + for a in args + ] + return persons, persons_tree._build_index(persons) + + +def test_parse_bemerkung_sohn_two_parents(): + _, idx = _register( + ("row_019", "Clara", "Cram", "de Gruyter"), + ("row_028", "Herbert", "Cram", None), + ) + rels, unres, notes = persons_tree._parse_bemerkung( + "row_021", "Sohn v Clara u Herbert", idx + ) + assert len(rels) == 2 + assert all(r["type"] == "PARENT_OF" for r in rels) + # Both parents point to the child + child_ids = {r["relatedPersonId"] for r in rels} + parent_ids = {r["personId"] for r in rels} + assert child_ids == {"row_021"} + assert "row_019" in parent_ids and "row_028" in parent_ids + assert unres == [] + assert notes == "" + + +def test_parse_bemerkung_tochter_von(): + _, idx = _register(("row_019", "Clara", "Cram", None)) + rels, unres, notes = persons_tree._parse_bemerkung( + "row_036", "Tochter von Clara Cram", idx + ) + assert len(rels) == 1 + assert rels[0] == { + "personId": "row_019", + "relatedPersonId": "row_036", + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": "Tochter von Clara Cram", + } + assert notes == "" + + +def test_parse_bemerkung_vater(): + _, idx = _register(("row_028", "Herbert", "Cram", None)) + rels, unres, notes = persons_tree._parse_bemerkung( + "row_031", "Vater v Herbert", idx + ) + assert len(rels) == 1 + assert rels[0]["personId"] == "row_031" # this person is the parent + assert rels[0]["relatedPersonId"] == "row_028" + assert rels[0]["type"] == "PARENT_OF" + + +def test_parse_bemerkung_unmatched_parent_name(): + _, idx = _register() # empty index + rels, unres, notes = persons_tree._parse_bemerkung( + "row_004", "Sohn v Elsgard A.", idx + ) + assert rels == [] + assert len(unres) == 1 + assert 
unres[0]["reason"] == "not_found" + # notes should be empty after stripping the matched pattern + assert notes == "" + + +def test_parse_bemerkung_skip_nichte(): + _, idx = _register(("row_028", "Herbert", "Cram", None)) + rels, unres, notes = persons_tree._parse_bemerkung( + "row_002", "Nichte von Herbert", idx + ) + assert rels == [] + assert unres == [] + assert notes == "Nichte von Herbert" + + +def test_parse_bemerkung_skip_bruder(): + _, idx = _register(("row_028", "Herbert", "Cram", None)) + rels, unres, notes = persons_tree._parse_bemerkung( + "row_033", "Bruder v Herbert", idx + ) + assert rels == [] + assert unres == [] + assert notes == "Bruder v Herbert" + + +def test_parse_bemerkung_empty(): + _, idx = _register() + rels, unres, notes = persons_tree._parse_bemerkung("row_004", "", idx) + assert rels == [] and unres == [] and notes == "" + + +def test_parse_bemerkung_plain_remark(): + _, idx = _register() + rels, unres, notes = persons_tree._parse_bemerkung( + "row_029", "Verfasserin der Cram-Chronik !!", idx + ) + assert rels == [] and unres == [] + assert notes == "Verfasserin der Cram-Chronik !!" +``` + +- [ ] **Step 2: Run — expect AttributeError** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -k "parse_bemerkung" -v +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute '_parse_bemerkung'` + +- [ ] **Step 3: Implement `_parse_bemerkung`** + +Add to `persons_tree.py`: + +```python +_CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on)?\s+(.+)", re.I) +_PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on)?\s+(.+)", re.I) +_AND_RE = re.compile(r"\s+u(?:nd)?\s+", re.I) + + +def _parse_bemerkung( + row_id: str, bemerkung: str, index: dict[str, list[str]] +) -> tuple[list[dict], list[dict], str]: + """Extract PARENT_OF edges from a Bemerkung cell. + + Returns (relationships, unresolved, remaining_notes). + Text that doesn't match a parent pattern goes to remaining_notes unchanged.
+ """ + if not bemerkung or not bemerkung.strip(): + return [], [], "" + + s = bemerkung.strip() + + for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")): + m = pattern.match(s) + if not m: + continue + + name_part = m.group(1).strip().rstrip("!., ") + parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()] + rels: list[dict] = [] + unres: list[dict] = [] + + for part in parts: + part = part.rstrip("!., ") + matched_id, reason = _resolve_one(part, index) + if matched_id: + if direction == "child": + # named person is parent of this row + rels.append({ + "personId": matched_id, + "relatedPersonId": row_id, + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": bemerkung, + }) + else: + # this row is parent of named person + rels.append({ + "personId": row_id, + "relatedPersonId": matched_id, + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": bemerkung, + }) + else: + unres.append({ + "rowId": row_id, + "field": "bemerkung", + "raw": bemerkung, + "reason": reason, + }) + + remainder = s[m.end():].strip().lstrip(".,! ") + return rels, unres, remainder + + # No pattern matched — full text goes to notes, nothing to unresolved + return [], [], s +``` + +- [ ] **Step 4: Run — all tests pass** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 55 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add PARENT_OF Bemerkung extraction to persons_tree" +``` + +--- + +## Task 8: main() — CLI, two-pass loop, JSON output + +**Files:** +- Modify: `tools/import-normalizer/persons_tree.py` +- Modify: `tools/import-normalizer/tests/test_persons_tree.py` + +Wire the two passes into `main()`. Pass 1: read sheet → parse rows → deduplicate → build index. Pass 2: resolve spouses + parse Bemerkung → collect relationships + unresolved → strip internal `_` keys → write JSON. 
+ +- [ ] **Step 1: Write failing test for dry-run** + +Append to `tests/test_persons_tree.py`: + +```python +import subprocess + + +def test_dry_run_exits_zero(tmp_path): + """dry-run should complete without writing any file and exit 0.""" + input_path = Path(__file__).parent.parent.parent.parent / "import" / "Personendatei 2.xlsx" + if not input_path.exists(): + import pytest + pytest.skip("source Excel file not present") + + result = subprocess.run( + [ + sys.executable, str(Path(__file__).parent.parent / "persons_tree.py"), + "--input", str(input_path), + "--output", str(tmp_path / "out.json"), + "--dry-run", + ], + capture_output=True, text=True, + ) + assert result.returncode == 0, result.stderr + assert not (tmp_path / "out.json").exists() + assert "persons parsed" in result.stdout +``` + +- [ ] **Step 2: Run — expect NameError/AttributeError** + +```bash +.venv/bin/pytest tests/test_persons_tree.py::test_dry_run_exits_zero -v +``` + +Expected: `AttributeError: module 'persons_tree' has no attribute 'main'` or exit code != 0. 
+ +- [ ] **Step 3: Implement `main()`** + +Add to `persons_tree.py`: + +```python +def main() -> None: + parser = argparse.ArgumentParser( + description="Normalize Personendatei 2.xlsx → canonical-persons-tree.json" + ) + parser.add_argument( + "--input", default=str(config.PERSON_WORKBOOK), + help="Path to Personendatei 2.xlsx" + ) + parser.add_argument( + "--output", default=str(config.OUT_DIR / "canonical-persons-tree.json"), + help="Path for output JSON" + ) + parser.add_argument("--dry-run", action="store_true", help="Print stats, skip write") + args = parser.parse_args() + + from ingest import read_sheet, build_header_map + + rows = read_sheet(Path(args.input), config.PERSON_SHEET) + if not rows: + print("ERROR: sheet is empty", file=sys.stderr) + sys.exit(1) + + header_row = [str(v) for v in rows[0]] + fields_map, _ = build_header_map(header_row, config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS) + + # --- Pass 1: parse rows --- + persons_raw: list[dict] = [] + for row_num, row in enumerate(rows[1:], start=2): + field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()} + if not field_dict.get("last_name", "").strip(): + continue + persons_raw.append(_parse_row(row_num, field_dict)) + + persons, skipped_msgs = _deduplicate(persons_raw) + for msg in skipped_msgs: + print(f" SKIP {msg}", file=sys.stderr) + + index = _build_index(persons) + + # --- Pass 2: resolve relationships --- + all_rels: list[dict] = [] + all_unresolved: list[dict] = [] + + spouse_rels, spouse_unres = _resolve_spouses(persons, index) + all_rels.extend(spouse_rels) + all_unresolved.extend(spouse_unres) + + for p in persons: + bemerkung = p.pop("_bemerkung_raw", None) or "" + p.pop("_spouse_raw", None) + + rels, unres, remaining = _parse_bemerkung(p["rowId"], bemerkung, index) + all_rels.extend(rels) + all_unresolved.extend(unres) + + if remaining: + existing = p.get("notes") or "" + # avoid duplicating the bemerkung that was already put in notes 
during _parse_row + if remaining not in existing: + p["notes"] = (existing + " " + remaining).strip() if existing else remaining + + # --- Stats output --- + spouse_count = sum(1 for r in all_rels if r["type"] == "SPOUSE_OF") + parent_count = sum(1 for r in all_rels if r["type"] == "PARENT_OF") + print(f"✓ {len(persons)} persons parsed") + print(f"✓ {len(all_rels)} relationships emitted ({spouse_count} SPOUSE_OF, {parent_count} PARENT_OF)") + if all_unresolved: + print(f"⚠ {len(all_unresolved)} unresolved (see unresolved[] in output)") + + if args.dry_run: + print("\n--- dry-run: first 5 unresolved ---") + for u in all_unresolved[:5]: + print(f" {u}") + return + + output = { + "generated_at": datetime.datetime.now().isoformat(), + "source": Path(args.input).name, + "stats": { + "persons": len(persons), + "relationships": len(all_rels), + "unresolved": len(all_unresolved), + }, + "persons": persons, + "relationships": all_rels, + "unresolved": all_unresolved, + } + + out_path = Path(args.output) + out_path.parent.mkdir(exist_ok=True) + out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"→ {args.output}") + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 4: Run dry-run test** + +```bash +.venv/bin/pytest tests/test_persons_tree.py::test_dry_run_exits_zero -v +``` + +Expected: PASS. (If the Excel file is absent the test is skipped, not failed.) + +- [ ] **Step 5: Run all tests** + +```bash +.venv/bin/pytest tests/test_persons_tree.py -v +``` + +Expected: all 56 tests PASS (or 55 + 1 skipped if Excel file absent). 
+ +- [ ] **Step 6: Commit** + +```bash +git add tools/import-normalizer/persons_tree.py tools/import-normalizer/tests/test_persons_tree.py +git commit -m "feat(normalizer): add main() CLI to persons_tree" +``` + +--- + +## Task 9: Integration run against the real file + +**Files:** none (read-only validation) + +- [ ] **Step 1: Run with `--dry-run` and inspect output** + +```bash +cd tools/import-normalizer +.venv/bin/python persons_tree.py --dry-run +``` + +Expected output (approximate — exact numbers will differ once resolved): +``` +✓ ~161 persons parsed (163 rows minus 2 duplicates) +✓ ~N relationships emitted (X SPOUSE_OF, Y PARENT_OF) +⚠ ~Z unresolved (see unresolved[] in output) + +--- dry-run: first 5 unresolved --- + {'rowId': '...', 'field': '...', 'raw': '...', 'reason': '...'} + ... +``` + +If you see `ERROR` or a Python traceback, investigate before continuing. + +- [ ] **Step 2: Write the output file** + +```bash +.venv/bin/python persons_tree.py +``` + +Expected: `→ out/canonical-persons-tree.json` + +- [ ] **Step 3: Spot-check the output** + +```bash +python3 -c " +import json +data = json.load(open('out/canonical-persons-tree.json')) +print('persons:', data['stats']['persons']) +print('relationships:', data['stats']['relationships']) +print('unresolved:', data['stats']['unresolved']) + +# Check Herbert Cram +herbert = next(p for p in data['persons'] if p['firstName'] == 'Herbert' and p['lastName'] == 'Cram') +print('Herbert:', herbert) + +# Check a SPOUSE_OF edge involving Clara and Herbert +clara = next(p for p in data['persons'] if p['firstName'] == 'Clara' and p['lastName'] == 'Cram') +spouse_edge = next((r for r in data['relationships'] + if r['type'] == 'SPOUSE_OF' + and {r['personId'], r['relatedPersonId']} == {herbert['rowId'], clara['rowId']}), None) +print('Herbert-Clara SPOUSE_OF edge:', spouse_edge) +" +``` + +Verify: +- `persons` ≈ 161 (163 − 2 duplicates) +- Herbert Cram has `birthYear: 1890`, `generation: 2` +- A `SPOUSE_OF` edge 
exists between Herbert and Clara + +- [ ] **Step 4: Commit the output file** + +```bash +git add out/canonical-persons-tree.json +git commit -m "feat(normalizer): add canonical-persons-tree.json output" +``` + +--- + +## Self-Review Checklist + +- **§4 date parsing** → Task 1 (`_parse_year`) covers Excel serial, ISO, German string, year-only, free text, unresolvable ✓ +- **§5 generation** → Task 2 (`_parse_generation`) covers all format variants ✓ +- **§5 notes construction** → Task 4 (`_parse_row`) appends unresolvable date raws and bemerkung ✓ +- **§6 name index** → Task 3 (`_norm_tree`, `_build_index`, `_resolve_one`) covers forward, reversed, maiden, single-token, ambiguous ✓ +- **§12 OQ-01 deduplication** → Task 5 (`_deduplicate`) handles same-year + no-year cases ✓ +- **§7.1 SPOUSE_OF** → Task 6 (`_resolve_spouses`) with dedup of bidirectional edges ✓ +- **§7.2 PARENT_OF** → Task 7 (`_parse_bemerkung`) with Sohn/Tochter/Vater/Mutter + multi-parent split ✓ +- **§9 CLI** → Task 8 (`main()`) with `--input`, `--output`, `--dry-run` ✓ +- **§10 module reuse** → `ingest.read_sheet`, `ingest.build_header_map`, `dates.parse_date`, `persons._strip_accents` all used ✓ +- **§11 non-goals** → no API calls, no alias records, no SIBLING_OF, no dedup vs canonical-persons.xlsx ✓ +- **§8 JSON schema** → all fields present: `rowId`, `firstName`, `lastName`, `maidenName`, `alias`, `notes`, `birthYear`, `deathYear`, `birthPlace`, `deathPlace`, `generation`, `familyMember` ✓ diff --git a/docs/superpowers/specs/2026-05-25-personendatei-importer-design.md b/docs/superpowers/specs/2026-05-25-personendatei-importer-design.md new file mode 100644 index 00000000..acd46286 --- /dev/null +++ b/docs/superpowers/specs/2026-05-25-personendatei-importer-design.md @@ -0,0 +1,292 @@ +# Personendatei Importer — Design Spec + +**Date:** 2026-05-25 +**Source file:** `import/Personendatei 2.xlsx` +**Output:** `tools/import-normalizer/out/canonical-persons-tree.json` +**Tool location:** 
`tools/import-normalizer/persons_tree.py` + +--- + +## 1. Purpose + +Normalize the 163-person family register in `Personendatei 2.xlsx` into a machine-readable JSON file that a future backend importer can consume to seed the `persons` and `person_relationships` tables. The tool is offline (no backend required) and produces a reviewable artifact with an explicit `unresolved[]` list for manual follow-up. + +--- + +## 2. Source Data — Column Map + +Sheet: `Tabelle1` (rows 2–164; row 1 is the header). + +| Col | Header | Content | Notes | +|-----|--------|---------|-------| +| A | Generation | `G 0`–`G 5` | Generation relative to Herbert & Clara Cram (G 2). Inconsistent formatting: `"G3"`, `"G 0"`, `"G 2 de Gruyter"` — strip non-digit chars and parse the integer. | +| B | Familienname | Last name | Sometimes compound: `"de Gruyter"`, `"Cram Heydrich"`, `"Burkhard- Meier"` | +| C | Vorname | First name | Sometimes multiple: `"Charlotte,Meta,Jacobi"`, nicknames in parens: `"Otto (Herbert)"` | +| D | geb als | Maiden name | Used as a name alias for matching | +| E | Geburtsdatum | Birth date | **Mixed types** — see §4 | +| F | Geburtsort | Birth place | Free-text string, stored verbatim | +| G | Todesdatum | Death date | Same mixed types as col E | +| H | Sterbeort | Death place | Free-text string, stored verbatim | +| I | verheiratet mit | Spouse name | Partial name in either `"Firstname Lastname"` or `"Lastname Firstname"` order | +| J | Bemerkung | German relationship notes | `"Sohn v Clara u Herbert"`, `"Nichte v Herbert"`, free text | + +--- + +## 3. Two-Pass Architecture + +### Pass 1 — Parse & Normalize (rows → person records) + +For each row: +1. Read all 10 columns. +2. Assign a stable `rowId`: `"row_{i:03d}"` where `i` is the 1-based row number (e.g. `row_002`). +3. Normalize fields per §4 and §5. +4. Build the **name-lookup index** (see §6). +5. Emit a person record. + +### Pass 2 — Resolve Relationships + +Walk every person record: +1. 
Resolve col I (spouse) → emit `SPOUSE_OF` edge or `unresolved` entry. +2. Parse col J (Bemerkung) for parent/child patterns → emit `PARENT_OF` edges or `unresolved` entries. +3. Append unmatched Bemerkung text to `person.notes`. + +--- + +## 4. Date Parsing + +Both col E (birth) and col G (death) arrive as either an Excel numeric serial or a string. + +### Excel serial conversion +When the cell value is an integer (or a float with no string representation): +``` +date = datetime(1899, 12, 30) + timedelta(days=int(value)) +year = date.year +``` +Excel's epoch is 1899-12-30 (accounts for the Lotus 1-2-3 leap-year bug). + +### String fallback — reuse existing `dates.parse_date()` +Pass the raw string to the existing `tools/import-normalizer/dates.parse_date()`. It already handles: +- `DD.MM.YYYY` and `D.M.YY` +- Year-only (`1930`) +- Month + year (`August 1941`, `Sept. 1913`) +- Partial/approximate markers + +Extract `.year` from the returned `ParsedDate.iso` if `iso` is not `None`. + +### Unresolvable dates +If both paths yield `None` (e.g. `"2.9.196"`, `"4.3.1023"`, `".12.1955"`): +- Set `birthYear`/`deathYear` to `null`. +- Append the raw value to `person.notes` as `"[Geburtsdatum: ]"` or `"[Todesdatum: ]"` for human review. + +--- + +## 5. Person Record Normalization + +### Name fields +- **lastName** = col B, stripped. +- **firstName** = col C. Keep as-is (including multi-name strings and parenthetical nicknames) — the backend can split later. +- **maidenName** = col D, stripped. Stored in the JSON; the backend maps this to a `PersonNameAlias` of type `BIRTH_NAME`. +- **alias** = `null` (the tool does not invent aliases; maiden name is the alias). + +### Generation +Extract the first digit sequence from col A: +```python +import re +m = re.search(r"\d+", raw_generation) +generation = int(m.group()) if m else None +``` +Handles all observed variants: `"G 3"`, `"G3"`, `"G 0"`, `"G 2 de Gruyter"`, `"G 0"`. 
+Stored as `generation: int | null` in the JSON (informational; not mapped to a backend field directly). + +### familyMember +Set `true` for all records. Every person in this register is part of the family network. The backend can refine this. + +### notes +Constructed by concatenation: +1. Unmatched Bemerkung text (after relationship pattern is stripped). +2. Unresolvable date raw values (prefixed with field name). + +--- + +## 6. Name Lookup Index + +After pass 1, build a `dict[str, list[str]]` mapping normalized name keys → list of `rowId`s. + +### Normalization function `_norm(s) -> str` +1. Lowercase. +2. Strip surrounding `"` and `'`. +3. Remove parenthetical substrings: `r"\([^)]*\)"`. +4. Collapse internal whitespace. +5. Strip geographic/honorific suffixes: `aachen`, `mex.`, `mexiko`, `sen`, `jun`, `jr`. +6. Strip trailing commas, dots. + +### Keys indexed per person +For a person with firstName `F`, lastName `L`, maidenName `M`: +- `_norm(f"{F} {L}")` — canonical order +- `_norm(f"{L} {F}")` — reversed order (col I uses this heavily) +- `_norm(f"{F} {M}")` if maidenName is set — maiden-name reference +- `_norm(L)` alone — single-token fallback + +### Match resolution +Given a raw name string from col I or col J: +1. `_norm(raw)` → look up in index. +2. **Exactly one hit** → match confirmed, use that `rowId`. +3. **Zero hits** → `reason: "not_found"` → `unresolved[]`. +4. **Multiple hits** → `reason: "ambiguous"` → `unresolved[]`. + +--- + +## 7. Relationship Extraction + +### 7.1 SPOUSE_OF (col I — `verheiratet mit`) + +1. Normalize col I value. +2. Resolve via name index (§6). +3. If matched: emit one edge `{ personId, relatedPersonId, type: "SPOUSE_OF", source: "verheiratet_mit" }`. + - Skip if an identical edge (regardless of direction) already exists in the relationship list. +4. If unresolved: add to `unresolved[]`. 
+ +### 7.2 PARENT_OF (col J — `Bemerkung`) + +Apply these regex patterns in order, case-insensitive, with optional whitespace: + +| Pattern | Direction | Note | +|---------|-----------|------| +| `(Sohn\|Tochter)\s+v(?:on)?\s+(.+)` | Named person(s) → this person | "Sohn v Clara u Herbert" | +| `(Vater\|Mutter)\s+v(?:on)?\s+(.+)` | This person → named person(s) | "Vater v Herbert" | + +**Multi-parent extraction:** The parent string may contain two parents joined by `\s+u(?:nd)?\s+`. Split on this pattern, resolve each part independently. + +**Emit** one `PARENT_OF` edge per resolved parent: +```json +{ + "personId": "", + "relatedPersonId": "", + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": "" +} +``` + +**Skip** (do not emit, do not add to `unresolved[]`, leave in notes): +- Patterns starting with `Neffe`, `Nichte`, `Enkel`, `Enkelin`, `Urenkel`, `Urenkelin` — too indirect. +- Patterns starting with `Bruder`, `Schwester` — SIBLING_OF is out of scope for this tool. +- Any other Bemerkung text that does not match the parent patterns. + +**After extraction:** the matched portion of the Bemerkung is removed; the remainder goes into `person.notes`. + +--- + +## 8. 
Output JSON Schema + +File: `tools/import-normalizer/out/canonical-persons-tree.json` + +```json +{ + "generated_at": "", + "source": "Personendatei 2.xlsx", + "stats": { + "persons": 163, + "relationships": 87, + "unresolved": 12 + }, + "persons": [ + { + "rowId": "row_002", + "firstName": "Elsgard", + "lastName": "Allemeyer", + "maidenName": "Wöhler", + "alias": null, + "notes": "Nichte von Herbert", + "birthYear": 1920, + "deathYear": 1999, + "birthPlace": "Garz", + "deathPlace": "Espelkamp", + "generation": 3, + "familyMember": true + } + ], + "relationships": [ + { + "personId": "row_002", + "relatedPersonId": "row_003", + "type": "SPOUSE_OF", + "source": "verheiratet_mit" + }, + { + "personId": "row_019", + "relatedPersonId": "row_021", + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": "Tochter v Clara u Herbert" + } + ], + "unresolved": [ + { + "rowId": "row_007", + "field": "verheiratet_mit", + "raw": "\"Tante Lolly\"", + "reason": "not_found" + }, + { + "rowId": "row_042", + "field": "bemerkung", + "raw": "Zwillingsbruder v Herbert", + "reason": "not_found" + } + ] +} +``` + +--- + +## 9. CLI Interface + +``` +python3 persons_tree.py [--input PATH] [--output PATH] [--dry-run] +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--input` | `../../import/Personendatei 2.xlsx` | Source Excel file | +| `--output` | `out/canonical-persons-tree.json` | Output JSON file | +| `--dry-run` | off | Print stats + first 5 unresolved entries; do not write file | + +On success, print: +``` +✓ 163 persons parsed +✓ 87 relationships emitted (52 SPOUSE_OF, 35 PARENT_OF) +⚠ 12 unresolved (see unresolved[] in output) +→ out/canonical-persons-tree.json +``` + +--- + +## 10. 
Module Reuse + +| Existing module | What we reuse | +|-----------------|---------------| +| `dates.parse_date()` | String date parsing — handles DD.MM.YYYY, year-only, month+year, approximate markers | +| `config.MONTHS` | Month name → integer mapping (German + Spanish month names already present) | + +The Excel serial conversion is new logic added directly in `persons_tree.py` (3 lines). + +--- + +## 11. What This Tool Does NOT Do + +- Does not call the backend API or touch the database. +- Does not create `PersonNameAlias` records — it emits `maidenName` as a field; the future backend importer maps it. +- Does not infer SIBLING_OF edges (requires symmetric lookup across multiple rows — deferred). +- Does not deduplicate persons that appear in both this file and `canonical-persons.xlsx` — deduplication is the backend importer's responsibility. +- Does produce `birthPlace` / `deathPlace` as top-level fields in the JSON (see §8) — they are free-text strings and informational only. The `Person` entity has no corresponding columns; the future backend importer decides whether to add columns or fold the values into `notes`. + +--- + +## 12. Resolved Decisions + +| OQ | Question | Decision | +|----|----------|----------| +| OQ-01 | Duplicate rows (127/138 — Christa Schütz; 129/139 — Christoph Seils). | **Tool deduplicates.** On pass 1, after building the person list, detect rows with identical `(firstName, lastName, birthYear)` and keep only the first occurrence. Log skipped row ids to stdout. | +| OQ-02 | `birthPlace` / `deathPlace` absent from `Person` entity. | **Keep as separate top-level fields** in the JSON (`birthPlace`, `deathPlace`). The future backend importer may add columns to the `persons` table; the field is preserved here to avoid data loss. | +| OQ-03 | `firstName` = `"Charlotte,Meta,Jacobi"` (multi-name comma string). | **Store verbatim as `firstName`.** No splitting. 
| diff --git a/frontend/CLAUDE.md b/frontend/CLAUDE.md index 3699350f..92ae7396 100644 --- a/frontend/CLAUDE.md +++ b/frontend/CLAUDE.md @@ -28,7 +28,7 @@ src/ │ ├── +layout.server.ts # Loads current user, injects auth cookie │ ├── +page.svelte # Home / document search dashboard │ ├── documents/ # Document CRUD, detail, edit, upload -│ ├── persons/ # Person directory, detail, edit, merge +│ ├── persons/ # Person directory (filtered, paginated), detail, edit, merge, review (triage) │ ├── briefwechsel/ # Bilateral conversation timeline │ ├── aktivitaeten/ # Unified activity feed (Chronik) │ ├── admin/ # User, group, tag, OCR, system management diff --git a/frontend/eslint.config.js b/frontend/eslint.config.js index 037353b8..79384407 100644 --- a/frontend/eslint.config.js +++ b/frontend/eslint.config.js @@ -71,7 +71,13 @@ export default defineConfig( message: 'text-accent is decorative-only (#a1dcd8 in light mode = 1.52:1 contrast — WCAG fail). Use text-primary or text-ink-2 for text labels.' } - ] + ], + // Primary XSS guard: any {@html ...} block in a Svelte template is a potential + // injection sink. This rule replaces the regex CI guard's role as the primary + // defense (the CI regex stays as a backstop). For any legitimate use (e.g. + // trusted server-rendered Markdown), suppress with an inline + // `<!-- eslint-disable-next-line svelte/no-at-html-tags -->` and a justification. + 'svelte/no-at-html-tags': 'error' } }, { diff --git a/frontend/messages/de.json b/frontend/messages/de.json index 25a17b1f..d791b094 100644 --- a/frontend/messages/de.json +++ b/frontend/messages/de.json @@ -14,6 +14,7 @@ "error_file_too_large": "Die Datei ist zu groß (max. 50 MB).", "error_user_not_found": "Der Benutzer wurde nicht gefunden.", "error_import_already_running": "Ein Import läuft bereits. Bitte warten Sie, bis dieser abgeschlossen ist.", + "error_import_artifact_invalid": "Eine Importdatei fehlt oder ist ungültig. 
Bitte führen Sie den Normalizer erneut aus.", "error_invalid_credentials": "E-Mail-Adresse oder Passwort ist falsch.", "error_session_expired": "Ihre Sitzung ist abgelaufen. Bitte melden Sie sich erneut an.", "error_session_expired_explainer": "Aus Sicherheitsgründen werden Sitzungen nach 8 Stunden Inaktivität automatisch beendet.", @@ -99,6 +100,9 @@ "docs_list_summary": "Zusammenfassung", "docs_list_unknown": "Unbekannt", "docs_group_undated": "Undatiert", + "docs_filter_undated_only": "Nur undatierte", + "docs_filter_undated_count_label": "{count} undatierte Dokumente", + "docs_range_excludes_undated": "Ein Datumsfilter schließt undatierte Dokumente aus, da sie keinem Zeitraum zugeordnet werden können.", "docs_group_unknown": "Unbekannt", "doc_section_who_when": "Wer & Wann", "doc_section_description": "Beschreibung", @@ -129,6 +133,34 @@ "persons_search_placeholder": "Namen suchen...", "persons_empty_heading": "Keine Personen gefunden.", "persons_empty_text": "Versuchen Sie einen anderen Suchbegriff.", + "persons_empty_filtered": "Keine Personen für diese Filter.", + "persons_filter_group_label": "Filter", + "persons_filter_type_person": "Person", + "persons_filter_type_group": "Gruppe", + "persons_filter_type_institution": "Institution", + "persons_filter_family_only": "Nur Familie", + "persons_filter_has_documents": "Mit Dokumenten", + "persons_toggle_show_all": "Alle anzeigen", + "persons_toggle_needs_review": "Zu prüfen ({count})", + "person_badge_unconfirmed": "unbestätigt", + "persons_review_heading": "Personen prüfen", + "persons_review_intro": "Vom Import erzeugte, noch nicht bestätigte Personen. 
Zusammenführen, umbenennen, bestätigen oder löschen.", + "persons_review_action_merge": "Zusammenführen", + "persons_review_action_rename": "Umbenennen", + "persons_review_action_confirm": "Bestätigen", + "persons_review_action_delete": "Löschen", + "persons_review_action_cancel": "Abbrechen", + "persons_review_action_save": "Speichern", + "persons_review_empty": "Keine Personen zu prüfen.", + "persons_review_delete_confirm_title": "Person löschen", + "persons_review_delete_confirm_text": "Diese Person wird endgültig gelöscht. Dokumentverweise bleiben erhalten, verlieren aber diese Person.", + "persons_review_delete_confirm_button": "Person löschen", + "persons_review_confirm_confirm_title": "Person bestätigen", + "persons_review_confirm_confirm_text": "Diese Person wird als bestätigt markiert und erscheint nicht mehr in der Prüfliste.", + "persons_review_confirm_confirm_button": "Bestätigen", + "persons_review_merge_label": "Mit welcher Person zusammenführen?", + "persons_field_first_name": "Vorname", + "persons_field_last_name": "Nachname", "persons_new_heading": "Neue Person", "persons_section_details": "Angaben zur Person", "person_edit_heading": "Person bearbeiten", @@ -260,6 +292,24 @@ "doc_preview_iframe_title": "Dokumentvorschau", "doc_image_alt": "Original-Scan", "doc_no_date": "Kein Datum", + "date_precision_unknown": "Datum unbekannt", + "date_precision_approx_prefix": "ca.", + "date_range_open_prefix": "ab", + "date_season_spring": "Frühling", + "date_season_summer": "Sommer", + "date_season_autumn": "Herbst", + "date_season_winter": "Winter", + "date_original_label": "Originaltext:", + "date_unknown_icon_label": "Datum unbekannt", + "form_label_date_precision": "Datumsgenauigkeit", + "form_label_date_end": "Enddatum", + "date_precision_option_day": "Genauer Tag", + "date_precision_option_month": "Monat", + "date_precision_option_season": "Jahreszeit", + "date_precision_option_year": "Jahr", + "date_precision_option_range": "Zeitraum", + 
"date_precision_option_approx": "Ungefähr", + "date_precision_option_unknown": "Unbekannt", "person_merge_will_be_deleted": "wird gelöscht.", "comp_typeahead_placeholder": "Namen tippen...", "comp_typeahead_loading": "Suche...", @@ -356,11 +406,13 @@ "admin_system_import_status_done_label": "Dokumente verarbeitet", "admin_system_import_skipped_label": "übersprungen", "import_reason_invalid_pdf_signature": "Keine gültige PDF-Signatur", + "import_reason_path_traversal": "Ungültiger Dateiname (Pfad)", "import_reason_file_read_error": "Fehler beim Lesen der Datei", "import_reason_s3_upload_failed": "Upload-Fehler (S3)", "import_reason_already_exists": "Bereits importiert", "admin_system_import_status_failed": "Import fehlgeschlagen", "admin_system_import_failed_no_spreadsheet": "Keine Tabellendatei gefunden.", + "admin_system_import_failed_artifact": "Eine Importdatei fehlt oder ist ungültig.", "admin_system_import_failed_internal": "Interner Fehler beim Import.", "admin_system_thumbnails_heading": "Thumbnails erzeugen", "admin_system_thumbnails_description": "Erzeugt Vorschaubilder für Dokumente ohne Thumbnail (z. B. nach dem Massenimport).", diff --git a/frontend/messages/en.json b/frontend/messages/en.json index 0289f7a6..84b7cd3b 100644 --- a/frontend/messages/en.json +++ b/frontend/messages/en.json @@ -14,6 +14,7 @@ "error_file_too_large": "The file is too large (max. 50 MB).", "error_user_not_found": "User not found.", "error_import_already_running": "An import is already running. Please wait for it to finish.", + "error_import_artifact_invalid": "A canonical import file is missing or invalid. Please re-run the normalizer.", "error_invalid_credentials": "Email address or password is incorrect.", "error_session_expired": "Your session has expired. 
Please sign in again.", "error_session_expired_explainer": "For security reasons, sessions are automatically ended after 8 hours of inactivity.", @@ -99,6 +100,9 @@ "docs_list_summary": "Summary", "docs_list_unknown": "Unknown", "docs_group_undated": "Undated", + "docs_filter_undated_only": "Undated only", + "docs_filter_undated_count_label": "{count} undated documents", + "docs_range_excludes_undated": "A date range filter excludes undated documents, because they cannot belong to any time span.", "docs_group_unknown": "Unknown", "doc_section_who_when": "Who & When", "doc_section_description": "Description", @@ -129,6 +133,34 @@ "persons_search_placeholder": "Search names...", "persons_empty_heading": "No persons found.", "persons_empty_text": "Try a different search term.", + "persons_empty_filtered": "No persons match these filters.", + "persons_filter_group_label": "Filter", + "persons_filter_type_person": "Person", + "persons_filter_type_group": "Group", + "persons_filter_type_institution": "Institution", + "persons_filter_family_only": "Family only", + "persons_filter_has_documents": "With documents", + "persons_toggle_show_all": "Show all", + "persons_toggle_needs_review": "Needs review ({count})", + "person_badge_unconfirmed": "unconfirmed", + "persons_review_heading": "Review persons", + "persons_review_intro": "Import-generated persons not yet confirmed. Merge, rename, confirm or delete.", + "persons_review_action_merge": "Merge", + "persons_review_action_rename": "Rename", + "persons_review_action_confirm": "Confirm", + "persons_review_action_delete": "Delete", + "persons_review_action_cancel": "Cancel", + "persons_review_action_save": "Save", + "persons_review_empty": "No persons to review.", + "persons_review_delete_confirm_title": "Delete person", + "persons_review_delete_confirm_text": "This person will be permanently deleted. 
Document references are kept but lose this person.", + "persons_review_delete_confirm_button": "Delete person", + "persons_review_confirm_confirm_title": "Confirm person", + "persons_review_confirm_confirm_text": "This person will be marked as confirmed and will no longer appear in the review list.", + "persons_review_confirm_confirm_button": "Confirm", + "persons_review_merge_label": "Merge into which person?", + "persons_field_first_name": "First name", + "persons_field_last_name": "Last name", "persons_new_heading": "New person", "persons_section_details": "Person details", "person_edit_heading": "Edit person", @@ -260,6 +292,24 @@ "doc_preview_iframe_title": "Document Preview", "doc_image_alt": "Original scan", "doc_no_date": "No date", + "date_precision_unknown": "Date unknown", + "date_precision_approx_prefix": "c.", + "date_range_open_prefix": "from", + "date_season_spring": "Spring", + "date_season_summer": "Summer", + "date_season_autumn": "Autumn", + "date_season_winter": "Winter", + "date_original_label": "Original:", + "date_unknown_icon_label": "Date unknown", + "form_label_date_precision": "Date precision", + "form_label_date_end": "End date", + "date_precision_option_day": "Exact day", + "date_precision_option_month": "Month", + "date_precision_option_season": "Season", + "date_precision_option_year": "Year", + "date_precision_option_range": "Range", + "date_precision_option_approx": "Approximate", + "date_precision_option_unknown": "Unknown", "person_merge_will_be_deleted": "will be deleted.", "comp_typeahead_placeholder": "Type a name...", "comp_typeahead_loading": "Searching...", @@ -356,11 +406,13 @@ "admin_system_import_status_done_label": "Documents processed", "admin_system_import_skipped_label": "skipped", "import_reason_invalid_pdf_signature": "Invalid PDF signature", + "import_reason_path_traversal": "Invalid filename (path)", "import_reason_file_read_error": "File read error", "import_reason_s3_upload_failed": "Upload error (S3)", 
"import_reason_already_exists": "Already imported", "admin_system_import_status_failed": "Import failed", "admin_system_import_failed_no_spreadsheet": "No spreadsheet file found.", + "admin_system_import_failed_artifact": "A canonical import file is missing or invalid.", "admin_system_import_failed_internal": "Import failed due to an internal error.", "admin_system_thumbnails_heading": "Generate thumbnails", "admin_system_thumbnails_description": "Generates preview images for documents without a thumbnail (e.g. after the mass import).", diff --git a/frontend/messages/es.json b/frontend/messages/es.json index cc3a5627..00a74688 100644 --- a/frontend/messages/es.json +++ b/frontend/messages/es.json @@ -14,6 +14,7 @@ "error_file_too_large": "El archivo es demasiado grande (máx. 50 MB).", "error_user_not_found": "Usuario no encontrado.", "error_import_already_running": "Ya hay una importación en curso. Por favor, espere a que finalice.", + "error_import_artifact_invalid": "Falta un archivo de importación canónico o no es válido. Vuelva a ejecutar el normalizador.", "error_invalid_credentials": "El correo electrónico o la contraseña son incorrectos.", "error_session_expired": "Su sesión ha expirado. 
Por favor, inicie sesión de nuevo.", "error_session_expired_explainer": "Por razones de seguridad, las sesiones se terminan automáticamente tras 8 horas de inactividad.", @@ -99,6 +100,9 @@ "docs_list_summary": "Resumen", "docs_list_unknown": "Desconocido", "docs_group_undated": "Sin fecha", + "docs_filter_undated_only": "Solo sin fecha", + "docs_filter_undated_count_label": "{count} documentos sin fecha", + "docs_range_excludes_undated": "Un filtro de intervalo de fechas excluye los documentos sin fecha, ya que no pueden pertenecer a ningún periodo.", "docs_group_unknown": "Desconocido", "doc_section_who_when": "Quién & Cuándo", "doc_section_description": "Descripción", @@ -129,6 +133,34 @@ "persons_search_placeholder": "Buscar nombres...", "persons_empty_heading": "No se encontraron personas.", "persons_empty_text": "Pruebe con otro término de búsqueda.", + "persons_empty_filtered": "Ninguna persona coincide con estos filtros.", + "persons_filter_group_label": "Filtro", + "persons_filter_type_person": "Persona", + "persons_filter_type_group": "Grupo", + "persons_filter_type_institution": "Institución", + "persons_filter_family_only": "Solo familia", + "persons_filter_has_documents": "Con documentos", + "persons_toggle_show_all": "Mostrar todo", + "persons_toggle_needs_review": "Por revisar ({count})", + "person_badge_unconfirmed": "sin confirmar", + "persons_review_heading": "Revisar personas", + "persons_review_intro": "Personas generadas por la importación aún sin confirmar. 
Fusionar, renombrar, confirmar o eliminar.", + "persons_review_action_merge": "Fusionar", + "persons_review_action_rename": "Renombrar", + "persons_review_action_confirm": "Confirmar", + "persons_review_action_delete": "Eliminar", + "persons_review_action_cancel": "Cancelar", + "persons_review_action_save": "Guardar", + "persons_review_empty": "No hay personas por revisar.", + "persons_review_delete_confirm_title": "Eliminar persona", + "persons_review_delete_confirm_text": "Esta persona se eliminará de forma permanente. Las referencias de documentos se conservan pero pierden a esta persona.", + "persons_review_delete_confirm_button": "Eliminar persona", + "persons_review_confirm_confirm_title": "Confirmar persona", + "persons_review_confirm_confirm_text": "Esta persona se marcará como confirmada y dejará de aparecer en la lista de revisión.", + "persons_review_confirm_confirm_button": "Confirmar", + "persons_review_merge_label": "¿Fusionar con qué persona?", + "persons_field_first_name": "Nombre", + "persons_field_last_name": "Apellido", "persons_new_heading": "Nueva persona", "persons_section_details": "Datos de la persona", "person_edit_heading": "Editar persona", @@ -260,6 +292,24 @@ "doc_preview_iframe_title": "Vista previa del documento", "doc_image_alt": "Escaneado original", "doc_no_date": "Sin fecha", + "date_precision_unknown": "Fecha desconocida", + "date_precision_approx_prefix": "ca.", + "date_range_open_prefix": "desde", + "date_season_spring": "Primavera", + "date_season_summer": "Verano", + "date_season_autumn": "Otoño", + "date_season_winter": "Invierno", + "date_original_label": "Texto original:", + "date_unknown_icon_label": "Fecha desconocida", + "form_label_date_precision": "Precisión de la fecha", + "form_label_date_end": "Fecha final", + "date_precision_option_day": "Día exacto", + "date_precision_option_month": "Mes", + "date_precision_option_season": "Estación", + "date_precision_option_year": "Año", + "date_precision_option_range": 
"Periodo", + "date_precision_option_approx": "Aproximada", + "date_precision_option_unknown": "Desconocida", "person_merge_will_be_deleted": "será eliminado.", "comp_typeahead_placeholder": "Escriba un nombre...", "comp_typeahead_loading": "Buscando...", @@ -356,11 +406,13 @@ "admin_system_import_status_done_label": "Documentos procesados", "admin_system_import_skipped_label": "omitidos", "import_reason_invalid_pdf_signature": "Firma PDF no válida", + "import_reason_path_traversal": "Nombre de archivo no válido (ruta)", "import_reason_file_read_error": "Error al leer el archivo", "import_reason_s3_upload_failed": "Error de carga (S3)", "import_reason_already_exists": "Ya importado", "admin_system_import_status_failed": "Importación fallida", "admin_system_import_failed_no_spreadsheet": "No se encontró ninguna hoja de cálculo.", + "admin_system_import_failed_artifact": "Falta un archivo de importación canónico o no es válido.", "admin_system_import_failed_internal": "Error interno durante la importación.", "admin_system_thumbnails_heading": "Generar miniaturas", "admin_system_thumbnails_description": "Genera imágenes de vista previa para documentos sin miniatura (p. ej. tras la importación masiva).", diff --git a/frontend/src/lib/activity/ChronikRow.svelte.spec.ts b/frontend/src/lib/activity/ChronikRow.svelte.spec.ts index 0afccf9c..c97d8957 100644 --- a/frontend/src/lib/activity/ChronikRow.svelte.spec.ts +++ b/frontend/src/lib/activity/ChronikRow.svelte.spec.ts @@ -44,6 +44,17 @@ describe('ChronikRow', () => { expect(link).not.toBeNull(); }); + // --- #668 negative guarantee: Chronik never fabricates a letter date --- + it('renders the activity timestamp, not a letter date, and no undated badge', async () => { + // The row shows the relative activity time (happenedAt), never the letter's + // documentDate — ActivityFeedItemDTO carries no date surface to badge. + render(ChronikRow, { item: baseItem }); + // No undated badge is introduced into a Chronik row. 
+ expect(document.querySelector('[data-testid="undated-badge"]')).toBeNull(); + // No fabricated "Datum unbekannt" letter-date label appears. + await expect.element(page.getByText('Datum unbekannt')).not.toBeInTheDocument(); + }); + // --- simple variant --- it('renders simple variant when count === 1 and not a mention', async () => { render(ChronikRow, { item: baseItem }); diff --git a/frontend/src/lib/document/DocumentDate.svelte b/frontend/src/lib/document/DocumentDate.svelte new file mode 100644 index 00000000..5b959881 --- /dev/null +++ b/frontend/src/lib/document/DocumentDate.svelte @@ -0,0 +1,70 @@ + + + + {#if isUnknown} + + + + {label} + + {:else} + {label} + {/if} + {#if showRawLine} + + {m.date_original_label()} {raw} + {/if} + diff --git a/frontend/src/lib/document/DocumentDate.svelte.test.ts b/frontend/src/lib/document/DocumentDate.svelte.test.ts new file mode 100644 index 00000000..fa842b7b --- /dev/null +++ b/frontend/src/lib/document/DocumentDate.svelte.test.ts @@ -0,0 +1,35 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { cleanup, render } from 'vitest-browser-svelte'; +import { page } from 'vitest/browser'; +import DocumentDate from './DocumentDate.svelte'; + +// Browser-project (Playwright) tests — CI only. + +afterEach(cleanup); + +describe('DocumentDate', () => { + it('renders a DAY date as a full long date', async () => { + render(DocumentDate, { props: { iso: '1943-12-24', precision: 'DAY' } }); + await expect.element(page.getByText('24. Dezember 1943')).toBeInTheDocument(); + }); + + it('renders MONTH precision as month + year, never a day', async () => { + render(DocumentDate, { props: { iso: '1916-06-01', precision: 'MONTH', raw: 'Juni 1916' } }); + await expect.element(page.getByText('Juni 1916')).toBeInTheDocument(); + }); + + it('shows the verbatim raw cell as a visible secondary line for UNKNOWN (not tooltip-only)', async () => { + render(DocumentDate, { props: { iso: null, precision: 'UNKNOWN', raw: 'Sommer?' 
} }); + // Real, visible text — not hidden behind a title attribute. + await expect.element(page.getByText('Datum unbekannt')).toBeInTheDocument(); + await expect.element(page.getByText(/Sommer\?/)).toBeVisible(); + }); + + it('renders a malicious raw value as inert escaped text (no element injected)', async () => { + const malicious = ''; + render(DocumentDate, { props: { iso: null, precision: 'UNKNOWN', raw: malicious } }); + // The payload appears as literal text, and no is created in the DOM. + await expect.element(page.getByText(/([]), dateIso = $bindable(''), + datePrecision = $bindable('DAY'), + dateEndIso = $bindable(''), currentTitle = $bindable(''), topbar, actionbar @@ -38,6 +41,8 @@ let { senderId?: string; selectedReceivers?: Person[]; dateIso?: string; + datePrecision?: DatePrecision; + dateEndIso?: string; currentTitle?: string; topbar: Snippet; actionbar: Snippet; @@ -47,6 +52,8 @@ tags = untrack(() => (doc.tags as Tag[]) ?? []); senderId = untrack(() => doc.sender?.id ?? ''); selectedReceivers = untrack(() => (doc.receivers as Person[]) ?? []); dateIso = untrack(() => doc.documentDate ?? ''); +datePrecision = untrack(() => doc.metaDatePrecision ?? (doc.documentDate ? 'DAY' : 'UNKNOWN')); +dateEndIso = untrack(() => doc.metaDateEnd ?? ''); currentTitle = untrack(() => doc.title ?? ''); const fileLoader = createFileLoader(); @@ -199,6 +206,9 @@ async function handleReplaceFile(e: Event) { bind:senderId={senderId} bind:selectedReceivers={selectedReceivers} bind:dateIso={dateIso} + bind:precision={datePrecision} + bind:endDateIso={dateEndIso} + rawDate={doc.metaDateRaw ?? ''} initialDateIso={doc.documentDate ?? ''} initialLocation={doc.location ?? ''} initialSenderName={doc.sender?.displayName ?? 
''} diff --git a/frontend/src/lib/document/DocumentMetadataDrawer.svelte b/frontend/src/lib/document/DocumentMetadataDrawer.svelte index 01ecc62a..4b8081e9 100644 --- a/frontend/src/lib/document/DocumentMetadataDrawer.svelte +++ b/frontend/src/lib/document/DocumentMetadataDrawer.svelte @@ -4,6 +4,8 @@ import { formatDate } from '$lib/shared/utils/date'; import { formatDocumentStatus } from '$lib/document/documentStatusLabel'; import { getInitials, personAvatarColor } from '$lib/person/personFormat'; import RelationshipPill from '$lib/person/relationship/RelationshipPill.svelte'; +import DocumentDate from './DocumentDate.svelte'; +import type { DatePrecision } from '$lib/shared/utils/documentDate'; type Person = { id: string; firstName?: string | null; lastName: string; displayName: string }; type Tag = { id: string; name: string }; @@ -16,6 +18,9 @@ type GeschichteSummary = { type Props = { documentDate: string | null; + metaDatePrecision?: DatePrecision | null; + metaDateEnd?: string | null; + metaDateRaw?: string | null; location: string | null; status: string; sender: Person | null; @@ -29,6 +34,9 @@ type Props = { let { documentDate, + metaDatePrecision = null, + metaDateEnd = null, + metaDateRaw = null, location, status, sender, @@ -59,7 +67,6 @@ function formatGeschichteDate(g: GeschichteSummary): string { return formatDate(g.publishedAt.slice(0, 10), 'short'); } -const formattedDate = $derived(documentDate ? formatDate(documentDate) : '—'); const displayLocation = $derived(location ?? '—'); const statusLabel = $derived(formatDocumentStatus(status)); const visibleReceivers = $derived(receivers.slice(0, VISIBLE_RECEIVER_LIMIT)); @@ -105,7 +112,18 @@ function getFullName(person: Person): string {

{m.doc_details_field_date()}
-
{formattedDate}
+
+ {#if documentDate || metaDateRaw} + + {:else} + — + {/if} +
{m.form_label_location()}
diff --git a/frontend/src/lib/document/DocumentMultiSelect.svelte b/frontend/src/lib/document/DocumentMultiSelect.svelte index 0196544b..fbfd59a9 100644 --- a/frontend/src/lib/document/DocumentMultiSelect.svelte +++ b/frontend/src/lib/document/DocumentMultiSelect.svelte @@ -2,13 +2,23 @@ import type { components } from '$lib/generated/api'; import { m } from '$lib/paraglide/messages.js'; import { clickOutside } from '$lib/shared/actions/clickOutside'; -import { formatDate } from '$lib/shared/utils/date'; +import { formatDocumentDate, type DatePrecision } from '$lib/shared/utils/documentDate'; +import { getLocale } from '$lib/paraglide/runtime.js'; -type Document = components['schemas']['Document']; type DocumentListItem = components['schemas']['DocumentListItem']; +/** + * Exactly the fields this picker reads — id for selection/dedup, the rest for + * the honest date label. A full `Document` and a `DocumentListItem` are both + * structurally assignable, so the search results need no cast. 
+ */ +type DocumentOption = Pick< + DocumentListItem, + 'id' | 'title' | 'documentDate' | 'metaDatePrecision' | 'metaDateEnd' +>; + interface Props { - selectedDocuments?: Document[]; + selectedDocuments?: DocumentOption[]; placeholder?: string; hiddenInputName?: string; } @@ -20,7 +30,7 @@ let { }: Props = $props(); let searchTerm = $state(''); -let results: Document[] = $state([]); +let results: DocumentOption[] = $state([]); let showDropdown = $state(false); let loading = $state(false); let debounceTimer: ReturnType; @@ -46,11 +56,13 @@ function handleInput() { const res = await fetch(`/api/documents/search?q=${encodeURIComponent(searchTerm)}&size=10`); if (res.ok) { const body: { items: DocumentListItem[] } = await res.json(); - const docs = body.items.map((it) => ({ + const docs: DocumentOption[] = body.items.map((it) => ({ id: it.id, title: it.title, - documentDate: it.documentDate - })) as unknown as Document[]; + documentDate: it.documentDate, + metaDatePrecision: it.metaDatePrecision, + metaDateEnd: it.metaDateEnd + })); results = docs.filter((d) => !selectedDocuments.some((s) => s.id === d.id)); } } catch { @@ -61,7 +73,7 @@ function handleInput() { }, 300); } -function selectDocument(doc: Document) { +function selectDocument(doc: DocumentOption) { selectedDocuments = [...selectedDocuments, doc]; searchTerm = ''; showDropdown = false; @@ -72,9 +84,16 @@ function removeDocument(id: string | undefined) { selectedDocuments = selectedDocuments.filter((d) => d.id !== id); } -function formatDocLabel(doc: Document): string { - if (doc.documentDate) return `${doc.title} · ${formatDate(doc.documentDate, 'short')}`; - return doc.title; +function formatDocLabel(doc: DocumentOption): string { + if (!doc.documentDate) return doc.title; + const label = formatDocumentDate( + doc.documentDate, + doc.metaDatePrecision as DatePrecision, + doc.metaDateEnd, + null, + getLocale() + ); + return `${doc.title} · ${label}`; } diff --git 
a/frontend/src/lib/document/DocumentMultiSelect.svelte.spec.ts b/frontend/src/lib/document/DocumentMultiSelect.svelte.spec.ts index 6514ab55..d348026c 100644 --- a/frontend/src/lib/document/DocumentMultiSelect.svelte.spec.ts +++ b/frontend/src/lib/document/DocumentMultiSelect.svelte.spec.ts @@ -9,6 +9,7 @@ const docFactory = (id: string, title: string, date = '1880-01-01') => ({ id, title, documentDate: date, + metaDatePrecision: 'DAY' as const, originalFilename: `${title}.pdf`, receivers: [], tags: [], @@ -55,7 +56,8 @@ describe('DocumentMultiSelect — rendering', () => { selectedDocuments: [docFactory('d1', 'Brief vom 1. Mai', '1882-05-01')] }); await expect.element(page.getByText(/Brief vom 1\. Mai/)).toBeInTheDocument(); - await expect.element(page.getByText(/01\.05\.1882/)).toBeInTheDocument(); + // DAY precision renders the honest long date (formatDocumentDate), not 01.05.1882. + await expect.element(page.getByText(/1\. Mai 1882/)).toBeInTheDocument(); }); it('emits a hidden documentIds input for each pre-selected document', async () => { diff --git a/frontend/src/lib/document/DocumentRow.svelte b/frontend/src/lib/document/DocumentRow.svelte index 903ed727..44fcbd16 100644 --- a/frontend/src/lib/document/DocumentRow.svelte +++ b/frontend/src/lib/document/DocumentRow.svelte @@ -2,7 +2,7 @@ import { goto } from '$app/navigation'; import type { components } from '$lib/generated/api'; import { applyOffsets } from '$lib/document/search'; -import { formatDate } from '$lib/shared/utils/date'; +import DocumentDate from './DocumentDate.svelte'; import * as m from '$lib/paraglide/messages.js'; import { bulkSelectionStore } from '$lib/document/bulkSelection.svelte'; import ProgressRing from '$lib/shared/primitives/ProgressRing.svelte'; @@ -164,7 +164,16 @@ function safeTagColor(color: string | null | undefined): string {
- {doc.documentDate ? formatDate(doc.documentDate) : '—'} + +
@@ -178,7 +187,15 @@ function safeTagColor(color: string | null | undefined): string {
+ + +
+ +
{/if}
diff --git a/frontend/src/routes/SearchFilterBar.svelte.spec.ts b/frontend/src/routes/SearchFilterBar.svelte.spec.ts index 26d1d333..446cd046 100644 --- a/frontend/src/routes/SearchFilterBar.svelte.spec.ts +++ b/frontend/src/routes/SearchFilterBar.svelte.spec.ts @@ -128,6 +128,56 @@ describe('SearchFilterBar – AND/OR tag operator toggle', () => { }); }); +describe('SearchFilterBar – undated-only toggle (#668)', () => { + async function openAdvanced() { + const filterBtn = page.getByRole('button', { name: 'Filter', exact: true }); + await filterBtn.click(); + } + + it('renders the "Nur undatierte" toggle in the advanced row', async () => { + render(SearchFilterBar, { ...defaultProps, sort: 'DATE', dir: 'desc' }); + await openAdvanced(); + await expect.element(page.getByTestId('undated-only-toggle')).toBeInTheDocument(); + }); + + it('reflects the active undated state via aria-pressed', async () => { + render(SearchFilterBar, { ...defaultProps, sort: 'DATE', dir: 'desc', undated: true }); + await openAdvanced(); + await expect + .element(page.getByTestId('undated-only-toggle')) + .toHaveAttribute('aria-pressed', 'true'); + }); + + it('calls onSearchImmediate when the undated toggle is clicked', async () => { + const onSearch = vi.fn(); + const onSearchImmediate = vi.fn(); + render(SearchFilterBar, { + ...defaultProps, + onSearch, + onSearchImmediate, + sort: 'DATE', + dir: 'desc' + }); + await openAdvanced(); + await page.getByTestId('undated-only-toggle').click(); + await expect.poll(() => onSearchImmediate.mock.calls.length).toBeGreaterThan(0); + }); + + it('shows the global undated count chip when undatedCount > 0', async () => { + // The count is the backend's global filtered total (#668), passed straight + // through — the chip must render it verbatim, not a page-derived number. 
+ render(SearchFilterBar, { ...defaultProps, sort: 'DATE', dir: 'desc', undatedCount: 42 }); + await openAdvanced(); + await expect.element(page.getByTestId('undated-count')).toHaveTextContent('42'); + }); + + it('hides the undated count chip when undatedCount is 0', async () => { + render(SearchFilterBar, { ...defaultProps, sort: 'DATE', dir: 'desc', undatedCount: 0 }); + await openAdvanced(); + await expect.element(page.getByTestId('undated-count')).not.toBeInTheDocument(); + }); +}); + describe('SearchFilterBar – tagQ live filter', () => { it('calls onSearch when tag text changes in TagInput', async () => { vi.stubGlobal( diff --git a/frontend/src/routes/admin/system/+page.svelte b/frontend/src/routes/admin/system/+page.svelte index 0f09b24f..8b5b6ccf 100644 --- a/frontend/src/routes/admin/system/+page.svelte +++ b/frontend/src/routes/admin/system/+page.svelte @@ -3,6 +3,7 @@ import { onDestroy } from 'svelte'; import { m } from '$lib/paraglide/messages.js'; import ImportStatusCard from './ImportStatusCard.svelte'; import type { ImportStatus } from './types.js'; +import { withCsrf } from '$lib/shared/cookies'; let backfillResult: number | null = $state(null); let backfillLoading = $state(false); @@ -61,7 +62,7 @@ async function fetchImportStatus() { } async function triggerImport() { - const res = await fetch('/api/admin/trigger-import', { method: 'POST' }); + const res = await fetch('/api/admin/trigger-import', withCsrf({ method: 'POST' })); if (res.ok) { importStatus = await res.json(); if (importStatus!.state === 'RUNNING') { @@ -83,7 +84,7 @@ async function fetchThumbnailStatus() { } async function triggerThumbnails() { - const res = await fetch('/api/admin/generate-thumbnails', { method: 'POST' }); + const res = await fetch('/api/admin/generate-thumbnails', withCsrf({ method: 'POST' })); if (res.ok) { thumbnailStatus = await res.json(); if (thumbnailStatus!.state === 'RUNNING') { @@ -106,7 +107,7 @@ async function backfillVersions() { backfillLoading = 
true; backfillResult = null; try { - const res = await fetch('/api/admin/backfill-versions', { method: 'POST' }); + const res = await fetch('/api/admin/backfill-versions', withCsrf({ method: 'POST' })); if (res.ok) { const data = await res.json(); backfillResult = data.count; @@ -120,7 +121,7 @@ async function backfillFileHashes() { backfillHashesLoading = true; backfillHashesResult = null; try { - const res = await fetch('/api/admin/backfill-file-hashes', { method: 'POST' }); + const res = await fetch('/api/admin/backfill-file-hashes', withCsrf({ method: 'POST' })); if (res.ok) { const data = await res.json(); backfillHashesResult = data.count; diff --git a/frontend/src/routes/admin/system/ImportStatusCard.svelte b/frontend/src/routes/admin/system/ImportStatusCard.svelte index bb9bce72..d6ebb2a2 100644 --- a/frontend/src/routes/admin/system/ImportStatusCard.svelte +++ b/frontend/src/routes/admin/system/ImportStatusCard.svelte @@ -13,10 +13,13 @@ let { const failureMessage = $derived( importStatus?.statusCode === 'IMPORT_FAILED_NO_SPREADSHEET' ? m.admin_system_import_failed_no_spreadsheet() - : m.admin_system_import_failed_internal() + : importStatus?.statusCode === 'IMPORT_FAILED_ARTIFACT' + ? m.admin_system_import_failed_artifact() + : m.admin_system_import_failed_internal() ); function reasonLabel(code: string): string { + if (code === 'INVALID_FILENAME_PATH_TRAVERSAL') return m.import_reason_path_traversal(); if (code === 'INVALID_PDF_SIGNATURE') return m.import_reason_invalid_pdf_signature(); if (code === 'FILE_READ_ERROR') return m.import_reason_file_read_error(); if (code === 'S3_UPLOAD_FAILED') return m.import_reason_s3_upload_failed(); @@ -39,14 +42,9 @@ function reasonLabel(code: string): string { aria-label={m.admin_system_import_status_running()} class="inline-block h-5 w-5 animate-spin rounded-full border-2 border-ink-3 border-t-brand-mint motion-reduce:animate-none" > -
-

- {importStatus.processed} -

-

- {m.admin_system_import_status_running()} -

-
+

+ {m.admin_system_import_status_running()} +

{:else if importStatus?.state === 'DONE'}
diff --git a/frontend/src/routes/admin/system/ImportStatusCard.svelte.test.ts b/frontend/src/routes/admin/system/ImportStatusCard.svelte.test.ts index d7470cda..f2248c58 100644 --- a/frontend/src/routes/admin/system/ImportStatusCard.svelte.test.ts +++ b/frontend/src/routes/admin/system/ImportStatusCard.svelte.test.ts @@ -26,15 +26,17 @@ describe('ImportStatusCard', () => { await expect.element(getByTestId('spinner')).toBeInTheDocument(); }); - it('shows processed count at text-base while RUNNING', async () => { + it('shows no processed count while RUNNING (spinner only, no misleading 0)', async () => { + // The whole document load commits in one transaction, so a live count would sit at 0 + // until the end. Show just the spinner + "running" label instead of a stuck "0". const { getByTestId } = render(ImportStatusCard, { props: { - importStatus: makeStatus({ state: 'RUNNING', statusCode: 'IMPORT_RUNNING', processed: 7 }), + importStatus: makeStatus({ state: 'RUNNING', statusCode: 'IMPORT_RUNNING', processed: 0 }), ontrigger: () => {} } }); - await expect.element(getByTestId('processed-count')).toHaveTextContent('7'); + await expect.element(getByTestId('processed-count')).not.toBeInTheDocument(); }); it('shows processed count while DONE', async () => { diff --git a/frontend/src/routes/documents/+page.server.ts b/frontend/src/routes/documents/+page.server.ts index 16186611..a4e2242b 100644 --- a/frontend/src/routes/documents/+page.server.ts +++ b/frontend/src/routes/documents/+page.server.ts @@ -46,6 +46,8 @@ export async function load({ url, fetch }) { : 'desc'; const tagQ = url.searchParams.get('tagQ') || ''; const tagOp = url.searchParams.get('tagOp') === 'OR' ? 'OR' : 'AND'; + // Narrow the accepted truthy surface to exactly "true" (mirrors the tagOp clamp). + const undated = url.searchParams.get('undated') === 'true'; const page = Math.max(0, Number(url.searchParams.get('page') ?? 
'0') || 0); const api = createApiClient(fetch); @@ -66,6 +68,7 @@ export async function load({ url, fetch }) { tag: tags.length ? tags : undefined, tagQ: tagQ && !tags.length ? tagQ : undefined, tagOp: tagOp === 'OR' ? 'OR' : undefined, + undated: undated || undefined, sort, dir: dir || undefined, page, @@ -82,6 +85,7 @@ export async function load({ url, fetch }) { pageNumber: 0, pageSize: PAGE_SIZE, totalPages: 0, + undatedCount: 0, q, from, to, @@ -94,6 +98,7 @@ export async function load({ url, fetch }) { dir, tagQ, tagOp, + undated, error: 'Daten konnten nicht geladen werden.' as string | null }; } @@ -112,6 +117,8 @@ export async function load({ url, fetch }) { pageNumber: result.data?.pageNumber ?? page, pageSize: result.data?.pageSize ?? PAGE_SIZE, totalPages: result.data?.totalPages ?? 0, + // Global undated count for the active filter, across all pages (issue #668). + undatedCount: result.data?.undatedCount ?? 0, q, from, to, @@ -124,6 +131,7 @@ export async function load({ url, fetch }) { dir, tagQ, tagOp, + undated, error: errorMessage }; } diff --git a/frontend/src/routes/documents/+page.svelte b/frontend/src/routes/documents/+page.svelte index d2f05de5..5006d9eb 100644 --- a/frontend/src/routes/documents/+page.svelte +++ b/frontend/src/routes/documents/+page.svelte @@ -32,10 +32,16 @@ let sort = $state(untrack(() => data.sort || 'DATE')); let dir = $state(untrack(() => data.dir || 'desc')); let tagQ = $state(untrack(() => data.tagQ || '')); let tagOperator = $state<'AND' | 'OR'>(untrack(() => (data.tagOp as 'AND' | 'OR') || 'AND')); +let undated = $state(untrack(() => data.undated ?? false)); function hasAdvancedFilters() { return ( - (data.tags?.length ?? 0) > 0 || !!data.senderId || !!data.receiverId || !!data.from || !!data.to + (data.tags?.length ?? 
0) > 0 || + !!data.senderId || + !!data.receiverId || + !!data.from || + !!data.to || + !!data.undated ); } @@ -54,6 +60,7 @@ type FilterSnapshot = { dir: string; tagQ: string; tagOp: 'AND' | 'OR'; + undated: boolean; zoomFrom?: string | null; zoomTo?: string | null; }; @@ -77,6 +84,7 @@ function buildSearchParams(filters: FilterSnapshot, targetPage?: number): Svelte if (filters.dir) params.set('dir', filters.dir); if (filters.tagQ) params.set('tagQ', filters.tagQ); if (filters.tagOp === 'OR') params.set('tagOp', 'OR'); + if (filters.undated) params.set('undated', 'true'); if (filters.zoomFrom) params.set('zoomFrom', filters.zoomFrom); if (filters.zoomTo) params.set('zoomTo', filters.zoomTo); if (targetPage !== undefined && targetPage > 0) params.set('page', String(targetPage)); @@ -112,6 +120,7 @@ function navigateWithZoom(zoomFrom: string | null, zoomTo: string | null) { dir, tagQ, tagOp: tagOperator, + undated, zoomFrom, zoomTo }); @@ -136,7 +145,8 @@ function buildPageHref(targetPage: number): string { sort: data.sort || '', dir: data.dir || '', tagQ: data.tagQ || '', - tagOp: (data.tagOp as 'AND' | 'OR') || 'AND' + tagOp: (data.tagOp as 'AND' | 'OR') || 'AND', + undated: data.undated ?? false }, targetPage ); @@ -188,7 +198,8 @@ async function editAllMatching() { sort: '', dir: '', tagQ: data.tagQ || '', - tagOp: (data.tagOp as 'AND' | 'OR') || 'AND' + tagOp: (data.tagOp as 'AND' | 'OR') || 'AND', + undated: data.undated ?? false }); params.delete('sort'); params.delete('dir'); @@ -226,6 +237,7 @@ $effect(() => { dir = data.dir || 'desc'; tagQ = data.tagQ || ''; tagOperator = (data.tagOp as 'AND' | 'OR') || 'AND'; + undated = data.undated ?? false; if (hasAdvancedFilters()) showAdvanced = true; }); @@ -255,6 +267,8 @@ $effect(() => { bind:dir={dir} bind:tagQ={tagQ} bind:tagOperator={tagOperator} + bind:undated={undated} + undatedCount={data.undatedCount ?? 
0} initialSenderName={initialSenderName} initialReceiverName={initialReceiverName} navKey={navKey} @@ -343,6 +357,8 @@ $effect(() => { canWrite={data.canWrite} error={data.error} sort={sort} + from={data.from} + to={data.to} /> diff --git a/frontend/src/routes/documents/[id]/edit/+page.server.ts b/frontend/src/routes/documents/[id]/edit/+page.server.ts index 4cd24a81..c012e3b9 100644 --- a/frontend/src/routes/documents/[id]/edit/+page.server.ts +++ b/frontend/src/routes/documents/[id]/edit/+page.server.ts @@ -2,6 +2,7 @@ import { error, fail, redirect } from '@sveltejs/kit'; import { env } from '$env/dynamic/private'; import { createApiClient, extractErrorCode } from '$lib/shared/api.server'; import { parseBackendError, getErrorMessage } from '$lib/shared/errors'; +import { hasWriteAll } from '$lib/shared/server/permissions'; export async function load({ params, @@ -15,30 +16,21 @@ export async function load({ depends: (dep: string) => void; }) { depends('app:document'); - const canWrite = - locals.user?.groups?.some((g: { permissions: string[] }) => - g.permissions.includes('WRITE_ALL') - ) ?? false; - if (!canWrite) throw error(403, 'Forbidden'); + if (!hasWriteAll(locals)) throw error(403, 'Forbidden'); const { id } = params; const api = createApiClient(fetch); - const [docResult, personsResult] = await Promise.all([ - api.GET('/api/documents/{id}', { params: { path: { id } } }), - api.GET('/api/persons') - ]); + const docResult = await api.GET('/api/documents/{id}', { params: { path: { id } } }); if (!docResult.response.ok) { throw error(docResult.response.status, getErrorMessage(extractErrorCode(docResult.error))); } - if (!personsResult.response.ok) { - throw error(personsResult.response.status, getErrorMessage('INTERNAL_ERROR')); - } return { document: docResult.data!, - persons: personsResult.data + // Sender/receiver editing uses PersonTypeahead (self-fetching); no full list is consumed. 
+ persons: [] as never[] }; } diff --git a/frontend/src/routes/documents/new/+page.server.ts b/frontend/src/routes/documents/new/+page.server.ts index b54496c3..2d6c2fd8 100644 --- a/frontend/src/routes/documents/new/+page.server.ts +++ b/frontend/src/routes/documents/new/+page.server.ts @@ -2,6 +2,7 @@ import { fail, redirect } from '@sveltejs/kit'; import { env } from '$env/dynamic/private'; import { createApiClient } from '$lib/shared/api.server'; import { parseBackendError, getErrorMessage } from '$lib/shared/errors'; +import { hasWriteAll } from '$lib/shared/server/permissions'; export async function load({ fetch, @@ -12,11 +13,7 @@ export async function load({ locals: App.Locals; url: URL; }) { - const canWrite = - locals.user?.groups?.some((g: { permissions: string[] }) => - g.permissions.includes('WRITE_ALL') - ) ?? false; - if (!canWrite) throw redirect(303, '/'); + if (!hasWriteAll(locals)) throw redirect(303, '/'); const senderId = url.searchParams.get('senderId') || ''; const receiverId = url.searchParams.get('receiverId') || ''; @@ -57,10 +54,12 @@ export async function load({ ); } - const [personsResult] = await Promise.all([api.GET('/api/persons'), ...requests]); + await Promise.all(requests); return { - persons: personsResult.response.ok ? personsResult.data : [], + // Sender/receiver selection uses PersonTypeahead, which fetches its own results on + // demand — the page never consumes a pre-loaded full person list, so none is fetched. 
+ persons: [] as never[], initialSenderId: senderId, initialSenderName, initialReceivers diff --git a/frontend/src/routes/documents/page.server.spec.ts b/frontend/src/routes/documents/page.server.spec.ts index 2ed33a86..c51eb3f0 100644 --- a/frontend/src/routes/documents/page.server.spec.ts +++ b/frontend/src/routes/documents/page.server.spec.ts @@ -100,6 +100,106 @@ describe('documents page load — search params', () => { ); }); + it('forwards undated=true to the search API as a boolean true', async () => { + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + await load({ + url: makeUrl({ undated: 'true' }), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + expect(mockGet).toHaveBeenCalledWith( + '/api/documents/search', + expect.objectContaining({ + params: expect.objectContaining({ + query: expect.objectContaining({ undated: true }) + }) + }) + ); + }); + + it('omits undated from the query when the param is absent', async () => { + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + await load({ + url: makeUrl({}), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + const query = mockGet.mock.calls[0][1].params.query; + expect(query.undated).toBeUndefined(); + }); + + it('treats any undated value other than the literal "true" as not-undated', async () => { + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, 
pageSize: 50, totalPages: 0 } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + const result = await load({ + url: makeUrl({ undated: '1' }), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + expect(result.undated).toBe(false); + expect(mockGet.mock.calls[0][1].params.query.undated).toBeUndefined(); + }); + + it('returns the undated flag in page data when enabled', async () => { + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + const result = await load({ + url: makeUrl({ undated: 'true' }), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + expect(result.undated).toBe(true); + }); + + it('does not carry page when toggling undated (page reset)', async () => { + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + // A bare undated toggle URL carries no page param → loader requests page 0. 
+ await load({ + url: makeUrl({ undated: 'true' }), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + expect(mockGet.mock.calls[0][1].params.query.page).toBe(0); + }); + it('returns items and total from the search result', async () => { const item = { document: { id: 'd1' }, @@ -125,6 +225,51 @@ describe('documents page load — search params', () => { expect(result.totalElements).toBe(42); }); + it('forwards the global undatedCount from the search result (#668)', async () => { + // The backend returns the global undated total for the active filter across + // ALL pages; the loader must pass it straight through, not recompute it locally. + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { + items: [], + totalElements: 200, + pageNumber: 0, + pageSize: 50, + totalPages: 4, + undatedCount: 73 + } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + const result = await load({ + url: makeUrl({ q: 'test' }), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + expect(result.undatedCount).toBe(73); + }); + + it('defaults undatedCount to 0 when the search result omits it', async () => { + const mockGet = vi.fn().mockResolvedValue({ + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }); + vi.mocked(createApiClient).mockReturnValue({ GET: mockGet } as ReturnType< + typeof createApiClient + >); + + const result = await load({ + url: makeUrl(), + request: new Request('http://localhost/documents'), + fetch: vi.fn() as unknown as typeof fetch + }); + + expect(result.undatedCount).toBe(0); + }); + it('returns filter values in the result for pre-filling the UI', async () => { const mockGet = vi.fn().mockResolvedValue({ response: { ok: true, status: 200 }, diff --git 
a/frontend/src/routes/persons/+page.server.ts b/frontend/src/routes/persons/+page.server.ts index 62a1f8cf..3f04ff7c 100644 --- a/frontend/src/routes/persons/+page.server.ts +++ b/frontend/src/routes/persons/+page.server.ts @@ -1,25 +1,55 @@ import { error } from '@sveltejs/kit'; import { createApiClient } from '$lib/shared/api.server'; import { getErrorMessage } from '$lib/shared/errors'; +import { hasWriteAll } from '$lib/shared/server/permissions'; + +const PAGE_SIZE = 50; + +type PersonType = 'PERSON' | 'INSTITUTION' | 'GROUP'; + +function parseType(raw: string | null): PersonType | undefined { + return raw === 'PERSON' || raw === 'INSTITUTION' || raw === 'GROUP' ? raw : undefined; +} export async function load({ url, fetch, locals }) { const q = url.searchParams.get('q') || ''; + const page = Math.max(0, Number.parseInt(url.searchParams.get('page') ?? '0', 10) || 0); + const review = + url.searchParams.get('review') === '1' || url.searchParams.get('review') === 'true'; + const type = parseType(url.searchParams.get('type')); + const familyOnly = url.searchParams.get('familyOnly') === 'true'; + const hasDocuments = url.searchParams.get('hasDocuments') === 'true'; + const api = createApiClient(fetch); - const canWrite = - (locals.user as { groups?: { permissions: string[] }[] } | undefined)?.groups?.some((g) => - g.permissions.includes('WRITE_ALL') - ) ?? false; + const canWrite = hasWriteAll(locals); - const [personsResult, statsResult] = await Promise.all([ - api.GET('/api/persons', { params: { query: { q: q || undefined } } }), - api.GET('/api/stats', {}) + const filters = { + q: q || undefined, + type, + familyOnly: familyOnly || undefined, + hasDocuments: hasDocuments || undefined, + review: review || undefined, + page, + size: PAGE_SIZE + }; + + // The "Zu prüfen (N)" link count is the totalElements of a provisional-only query. A size=1 + // page keeps the extra request cheap — we only need the count, not the rows. 
+ const [personsResult, statsResult, reviewCountResult] = await Promise.all([ + api.GET('/api/persons', { params: { query: filters } }), + api.GET('/api/stats', {}), + canWrite + ? api.GET('/api/persons', { params: { query: { provisional: true, review: true, size: 1 } } }) + : Promise.resolve(null) ]); if (!personsResult.response.ok) { throw error(personsResult.response.status, getErrorMessage(undefined)); } + const result = personsResult.data!; + const stats = statsResult.response.ok ? { totalPersons: statsResult.data!.totalPersons ?? 0, @@ -27,5 +57,21 @@ export async function load({ url, fetch, locals }) { } : { totalPersons: 0, totalDocuments: 0 }; - return { persons: personsResult.data!, stats, q, canWrite }; + const needsReviewCount = + reviewCountResult && reviewCountResult.response.ok + ? (reviewCountResult.data!.totalElements ?? 0) + : 0; + + return { + persons: result.items, + totalElements: result.totalElements, + totalPages: result.totalPages, + pageNumber: result.pageNumber, + pageSize: result.pageSize, + filters: { type, familyOnly, hasDocuments, review }, + needsReviewCount, + stats, + q, + canWrite + }; } diff --git a/frontend/src/routes/persons/+page.svelte b/frontend/src/routes/persons/+page.svelte index 5522d3ad..06080c83 100644 --- a/frontend/src/routes/persons/+page.svelte +++ b/frontend/src/routes/persons/+page.svelte @@ -1,9 +1,12 @@ @@ -32,7 +54,7 @@ function handleSearch() {
-
+

{m.page_title_persons()}

diff --git a/frontend/src/routes/persons/page.server.spec.ts b/frontend/src/routes/persons/page.server.spec.ts new file mode 100644 index 00000000..814fe2c8 --- /dev/null +++ b/frontend/src/routes/persons/page.server.spec.ts @@ -0,0 +1,137 @@ +import { describe, expect, it, vi, beforeEach } from 'vitest'; + +vi.mock('$lib/shared/api.server', () => ({ + createApiClient: vi.fn(), + extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code +})); + +import { load } from './+page.server'; +import { createApiClient } from '$lib/shared/api.server'; + +beforeEach(() => vi.clearAllMocks()); + +function makeUrl(params: Record = {}) { + const url = new URL('http://localhost/persons'); + for (const [key, value] of Object.entries(params)) url.searchParams.set(key, value); + return url; +} + +/** Invokes the loader with a minimal event; the partial event is cast to satisfy the type. */ +function runLoad(url: URL, user: unknown) { + return load({ + url, + fetch: vi.fn() as unknown as typeof fetch, + request: new Request('http://localhost/persons'), + locals: { user } as App.Locals + } as unknown as Parameters[0]); +} + +/** Mock the typed client. /api/persons returns a paged envelope; /api/stats returns counts. */ +function mockApi() { + const personsResult = { + response: { ok: true, status: 200 }, + data: { items: [], totalElements: 0, pageNumber: 0, pageSize: 50, totalPages: 0 } + }; + // Loose `...args` signature (matching the documents loader spec) so call tuples aren't + // narrowed to length 1 — the test inspects calls[i][1].params.query. 
+ const get = vi.fn((...args: unknown[]) => { + if (args[0] === '/api/stats') { + return Promise.resolve({ + response: { ok: true, status: 200 }, + data: { totalPersons: 7, totalDocuments: 3 } + }); + } + return Promise.resolve(personsResult); + }); + vi.mocked(createApiClient).mockReturnValue({ GET: get } as unknown as ReturnType< + typeof createApiClient + >); + return get; +} + +const writer = { groups: [{ permissions: ['READ_ALL', 'WRITE_ALL'] }] }; +const reader = { groups: [{ permissions: ['READ_ALL'] }] }; + +type GetCall = [string, { params: { query: Record } }]; + +/** Find the GET call to a path, optionally narrowing by a query predicate. */ +function findCall( + get: ReturnType, + path: string, + matchQuery?: (q: Record) => boolean +): GetCall | undefined { + return (get.mock.calls as unknown as GetCall[]).find( + (c) => c[0] === path && (!matchQuery || matchQuery(c[1].params.query)) + ); +} + +describe('persons page load — reader default', () => { + it('does NOT pass review when no review param is present (clean reader default)', async () => { + const get = mockApi(); + + await runLoad(makeUrl(), reader); + + const personsCall = findCall(get, '/api/persons'); + expect(personsCall?.[1].params.query.review).toBeUndefined(); + }); + + it('passes review=true when review=1 is in the URL', async () => { + const get = mockApi(); + + await runLoad(makeUrl({ review: '1' }), reader); + + const personsCall = findCall(get, '/api/persons'); + expect(personsCall?.[1].params.query.review).toBe(true); + }); +}); + +describe('persons page load — filter forwarding', () => { + it('forwards type, familyOnly, hasDocuments and page to the API', async () => { + const get = mockApi(); + + await runLoad( + makeUrl({ type: 'INSTITUTION', familyOnly: 'true', hasDocuments: 'true', page: '2' }), + reader + ); + + const personsCall = findCall(get, '/api/persons'); + expect(personsCall?.[1].params.query).toMatchObject({ + type: 'INSTITUTION', + familyOnly: true, + hasDocuments: true, 
+ page: 2, + size: 50 + }); + }); + + it('clamps a negative page to 0', async () => { + const get = mockApi(); + + await runLoad(makeUrl({ page: '-5' }), reader); + + const personsCall = findCall(get, '/api/persons'); + expect(personsCall?.[1].params.query.page).toBe(0); + }); +}); + +describe('persons page load — needsReviewCount', () => { + it('fires a provisional count request for writers', async () => { + const get = mockApi(); + + await runLoad(makeUrl(), writer); + + const provisionalCall = findCall(get, '/api/persons', (query) => query.provisional === true); + expect(provisionalCall).toBeDefined(); + }); + + it('does not fire a provisional count request for read-only users', async () => { + const get = mockApi(); + + const result = await runLoad(makeUrl(), reader); + + const provisionalCall = findCall(get, '/api/persons', (query) => query.provisional === true); + expect(provisionalCall).toBeUndefined(); + expect(result.needsReviewCount).toBe(0); + expect(result.canWrite).toBe(false); + }); +}); diff --git a/frontend/src/routes/persons/page.svelte.spec.ts b/frontend/src/routes/persons/page.svelte.spec.ts index 19697f80..f20c22fe 100644 --- a/frontend/src/routes/persons/page.svelte.spec.ts +++ b/frontend/src/routes/persons/page.svelte.spec.ts @@ -6,6 +6,7 @@ import Page from './+page.svelte'; const tick = () => new Promise((r) => setTimeout(r, 0)); vi.mock('$app/navigation', () => ({ goto: vi.fn() })); +vi.mock('$app/state', () => ({ page: { url: new URL('http://localhost/persons') } })); const makePerson = (overrides = {}) => ({ id: '1', @@ -13,6 +14,8 @@ const makePerson = (overrides = {}) => ({ lastName: 'Mustermann', displayName: 'Max Mustermann', documentCount: 0, + provisional: false, + personType: 'PERSON', ...overrides }); @@ -24,7 +27,13 @@ const emptyData = { canBlogWrite: false, q: '', persons: [], - stats: defaultStats + stats: defaultStats, + totalElements: 0, + totalPages: 0, + pageNumber: 0, + pageSize: 50, + filters: { type: undefined, 
familyOnly: false, hasDocuments: false, review: false }, + needsReviewCount: 0 }; const dataWithPersons = { ...emptyData, diff --git a/frontend/src/routes/persons/page.svelte.test.ts b/frontend/src/routes/persons/page.svelte.test.ts index 71d90e6f..8a1cea20 100644 --- a/frontend/src/routes/persons/page.svelte.test.ts +++ b/frontend/src/routes/persons/page.svelte.test.ts @@ -16,6 +16,8 @@ vi.mock('$app/navigation', () => ({ onNavigate: () => () => {} })); +vi.mock('$app/state', () => ({ page: { url: new URL('http://localhost/persons') } })); + const { default: PersonsListPage } = await import('./+page.svelte'); afterEach(cleanup); @@ -31,8 +33,15 @@ const baseData = (overrides: Record = {}) => ({ birthYear?: number; deathYear?: number; documentCount?: number; + provisional?: boolean; }>, stats: { totalPersons: 0, totalDocuments: 0 }, + totalElements: 0, + totalPages: 0, + pageNumber: 0, + pageSize: 50, + filters: { type: undefined, familyOnly: false, hasDocuments: false, review: false }, + needsReviewCount: 0, canWrite: false, q: '', ...overrides diff --git a/frontend/src/routes/persons/review/+page.server.ts b/frontend/src/routes/persons/review/+page.server.ts new file mode 100644 index 00000000..f2a5a695 --- /dev/null +++ b/frontend/src/routes/persons/review/+page.server.ts @@ -0,0 +1,119 @@ +import { error, fail } from '@sveltejs/kit'; +import { createApiClient, extractErrorCode } from '$lib/shared/api.server'; +import { getErrorMessage } from '$lib/shared/errors'; +import { hasWriteAll } from '$lib/shared/server/permissions'; + +const PAGE_SIZE = 50; + +export async function load({ url, fetch, locals }) { + const canWrite = hasWriteAll(locals); + + const page = Math.max(0, Number.parseInt(url.searchParams.get('page') ?? 
'0', 10) || 0); + const api = createApiClient(fetch); + + const result = await api.GET('/api/persons', { + params: { query: { provisional: true, review: true, page, size: PAGE_SIZE } } + }); + + if (!result.response.ok) { + throw error(result.response.status, getErrorMessage(undefined)); + } + + const data = result.data!; + + return { + persons: data.items, + totalElements: data.totalElements, + totalPages: data.totalPages, + pageNumber: data.pageNumber, + canWrite + }; +} + +export const actions = { + confirm: async ({ request, fetch, locals }) => { + if (!hasWriteAll(locals)) { + return fail(403, { error: getErrorMessage('FORBIDDEN') }); + } + const id = (await request.formData()).get('id') as string; + const api = createApiClient(fetch); + const result = await api.PATCH('/api/persons/{id}/confirm', { + params: { path: { id } } + }); + if (!result.response.ok) { + return fail(result.response.status, { + error: getErrorMessage(extractErrorCode(result.error)) + }); + } + return { success: true }; + }, + + delete: async ({ request, fetch, locals }) => { + if (!hasWriteAll(locals)) { + return fail(403, { error: getErrorMessage('FORBIDDEN') }); + } + const id = (await request.formData()).get('id') as string; + const api = createApiClient(fetch); + const result = await api.DELETE('/api/persons/{id}', { + params: { path: { id } } + }); + if (!result.response.ok) { + return fail(result.response.status, { + error: getErrorMessage(extractErrorCode(result.error)) + }); + } + return { success: true }; + }, + + merge: async ({ request, fetch, locals }) => { + if (!hasWriteAll(locals)) { + return fail(403, { error: getErrorMessage('FORBIDDEN') }); + } + const formData = await request.formData(); + const id = formData.get('id') as string; + const targetPersonId = formData.get('targetPersonId') as string; + if (!targetPersonId) { + return fail(400, { error: getErrorMessage('INVALID_INPUT') }); + } + const api = createApiClient(fetch); + const result = await 
api.POST('/api/persons/{id}/merge', { + params: { path: { id } }, + body: { targetPersonId } + }); + if (!result.response.ok) { + return fail(result.response.status, { + error: getErrorMessage(extractErrorCode(result.error)) + }); + } + return { success: true }; + }, + + rename: async ({ request, fetch, locals }) => { + if (!hasWriteAll(locals)) { + return fail(403, { error: getErrorMessage('FORBIDDEN') }); + } + const formData = await request.formData(); + const id = formData.get('id') as string; + const firstName = (formData.get('firstName') as string)?.trim() || undefined; + const lastName = (formData.get('lastName') as string)?.trim(); + const personType = (formData.get('personType') as string) || 'PERSON'; + if (!lastName) { + return fail(400, { error: getErrorMessage('INVALID_INPUT') }); + } + const api = createApiClient(fetch); + const result = await api.PUT('/api/persons/{id}', { + params: { path: { id } }, + body: { + firstName, + lastName, + personType: personType as 'PERSON' | 'INSTITUTION' | 'GROUP' | 'UNKNOWN' + } + }); + if (!result.response.ok) { + return fail(result.response.status, { + error: getErrorMessage(extractErrorCode(result.error)) + }); + } + return { success: true }; + } +}; diff --git a/frontend/src/routes/persons/review/+page.svelte b/frontend/src/routes/persons/review/+page.svelte new file mode 100644 index 00000000..13300575 --- /dev/null +++ b/frontend/src/routes/persons/review/+page.svelte @@ -0,0 +1,56 @@ + + + + {m.persons_review_heading()} + + +
+ + +
+

{m.persons_review_heading()}

+

{m.persons_review_intro()}

+
+ + {#if form?.error} + + {/if} + + {#if !hasResults} +
+

{m.persons_review_empty()}

+
+ {:else} +
    + {#each data.persons as person (person.id)} + + {/each} +
+ + + {/if} +
/**
 * Mock the typed client with a single response stubbed for every verb.
 *
 * Returns the shared spy so a test can assert whether (and how often) the
 * backend was called, regardless of which HTTP verb the action used.
 */
function mockApi(response: { ok: boolean; status: number; error?: unknown }) {
	const result = {
		response: { ok: response.ok, status: response.status },
		error: response.error,
		data: response.ok ? {} : undefined
	};
	const apiCall = vi.fn(() => Promise.resolve(result));
	vi.mocked(createApiClient).mockReturnValue({
		GET: apiCall,
		PATCH: apiCall,
		POST: apiCall,
		PUT: apiCall,
		DELETE: apiCall
		// ReturnType needs its type argument; the bare `ReturnType` does not type-check.
	} as unknown as ReturnType<typeof createApiClient>);
	return apiCall;
}

/**
 * Build a SvelteKit RequestEvent with a FormData body and a user shape,
 * then invoke the given action with it.
 */
function runAction(
	action: (typeof actions)[keyof typeof actions],
	formData: FormData,
	user: unknown
) {
	return action({
		request: new Request('http://localhost', { method: 'POST', body: formData }),
		fetch: vi.fn() as unknown as typeof fetch,
		locals: { user } as App.Locals
		// eslint-disable-next-line @typescript-eslint/no-explicit-any
	} as any);
}
runAction(actions.delete, fd, writer); + + expect(apiCall).toHaveBeenCalledOnce(); + expect(result).toEqual({ success: true }); + }); + + it('returns fail(403) on backend 403', async () => { + mockApi({ ok: false, status: 403, error: { code: 'FORBIDDEN' } }); + const fd = new FormData(); + fd.append('id', 'p-1'); + + const result = await runAction(actions.delete, fd, writer); + + expect(result).toMatchObject({ status: 403 }); + }); + + it('returns fail(404) on backend 404', async () => { + mockApi({ ok: false, status: 404, error: { code: 'NOT_FOUND' } }); + const fd = new FormData(); + fd.append('id', 'p-missing'); + + const result = await runAction(actions.delete, fd, writer); + + expect(result).toMatchObject({ status: 404 }); + }); + + it('returns fail(403) when the user lacks WRITE_ALL (server-side guard)', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + + const result = await runAction(actions.delete, fd, reader); + + expect(apiCall).not.toHaveBeenCalled(); + expect(result).toMatchObject({ status: 403 }); + }); +}); + +describe('persons/review merge action', () => { + it('returns { success: true } on backend 200', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('targetPersonId', 'p-2'); + + const result = await runAction(actions.merge, fd, writer); + + expect(apiCall).toHaveBeenCalledOnce(); + expect(result).toEqual({ success: true }); + }); + + it('returns fail(400) when targetPersonId is missing', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + + const result = await runAction(actions.merge, fd, writer); + + expect(apiCall).not.toHaveBeenCalled(); + expect(result).toMatchObject({ status: 400 }); + }); + + it('returns fail(403) on backend 403', async () => { + mockApi({ ok: false, status: 403, error: { code: 'FORBIDDEN' } 
}); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('targetPersonId', 'p-2'); + + const result = await runAction(actions.merge, fd, writer); + + expect(result).toMatchObject({ status: 403 }); + }); + + it('returns fail(404) on backend 404', async () => { + mockApi({ ok: false, status: 404, error: { code: 'NOT_FOUND' } }); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('targetPersonId', 'p-missing'); + + const result = await runAction(actions.merge, fd, writer); + + expect(result).toMatchObject({ status: 404 }); + }); + + it('returns fail(403) when the user lacks WRITE_ALL (server-side guard)', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('targetPersonId', 'p-2'); + + const result = await runAction(actions.merge, fd, reader); + + expect(apiCall).not.toHaveBeenCalled(); + expect(result).toMatchObject({ status: 403 }); + }); +}); + +describe('persons/review rename action', () => { + it('returns { success: true } on backend 200', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('lastName', 'Smith'); + + const result = await runAction(actions.rename, fd, writer); + + expect(apiCall).toHaveBeenCalledOnce(); + expect(result).toEqual({ success: true }); + }); + + it('returns fail(400) when lastName is missing', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + + const result = await runAction(actions.rename, fd, writer); + + expect(apiCall).not.toHaveBeenCalled(); + expect(result).toMatchObject({ status: 400 }); + }); + + it('returns fail(403) on backend 403', async () => { + mockApi({ ok: false, status: 403, error: { code: 'FORBIDDEN' } }); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('lastName', 'Smith'); + + const result = await runAction(actions.rename, fd, 
writer); + + expect(result).toMatchObject({ status: 403 }); + }); + + it('returns fail(404) on backend 404', async () => { + mockApi({ ok: false, status: 404, error: { code: 'NOT_FOUND' } }); + const fd = new FormData(); + fd.append('id', 'p-missing'); + fd.append('lastName', 'Smith'); + + const result = await runAction(actions.rename, fd, writer); + + expect(result).toMatchObject({ status: 404 }); + }); + + it('returns fail(403) when the user lacks WRITE_ALL (server-side guard)', async () => { + const apiCall = mockApi({ ok: true, status: 200 }); + const fd = new FormData(); + fd.append('id', 'p-1'); + fd.append('lastName', 'Smith'); + + const result = await runAction(actions.rename, fd, reader); + + expect(apiCall).not.toHaveBeenCalled(); + expect(result).toMatchObject({ status: 403 }); + }); +}); diff --git a/tools/import-normalizer/.gitignore b/tools/import-normalizer/.gitignore new file mode 100644 index 00000000..d48fb3f8 --- /dev/null +++ b/tools/import-normalizer/.gitignore @@ -0,0 +1,5 @@ +.venv/ +out/ +review/ +__pycache__/ +*.pyc diff --git a/tools/import-normalizer/README.md b/tools/import-normalizer/README.md new file mode 100644 index 00000000..4500db5c --- /dev/null +++ b/tools/import-normalizer/README.md @@ -0,0 +1,43 @@ +# Import Normalizer + +Transforms the raw family-archive spreadsheets in `../../import/` into a clean canonical +dataset (`out/`) plus review reports (`review/`). See the spec: +`../../docs/import-migration/02-normalization-spec.md`. + +## Setup +Requires **Python 3.12** (uses `StrEnum`). +```bash +python3 -m venv .venv && .venv/bin/pip install -r requirements.txt +``` + +## Run +```bash +.venv/bin/python normalize.py +``` +Outputs: +- `out/canonical-documents.xlsx`, `out/canonical-persons.xlsx` +- `review/*.csv` (residue to fix), `review/summary.txt` (grouped run stats incl. unknown-date rate) + +## Iteration loop +1. **Run.** Read `review/summary.txt` for the health snapshot. +2. 
**Fix the residue** by editing the version-controlled overrides files, then re-run. Repeat. + +| Review file | What to do | +| --- | --- | +| `unparsed-dates.csv` | For each `raw` (sorted by frequency), fill `suggested_iso` + `suggested_precision`, then paste `raw,suggested_iso,suggested_precision` into `overrides/dates.csv` (header `raw,iso,precision`). | +| `unresolved-names.csv` | Names whose value is itself problematic, grouped by `category`: `unknown` (`?`/illegible), `single_token` (first OR last name only), `relational` (`Tante …`), `collective` (`Familie …`), `prose` (a description landed in a name column), `ambiguous_pair` (two given names → likely two people, not auto-split). Review highest-impact categories first; add decisions to `overrides/names.csv` (look up valid ids in `out/canonical-persons.xlsx`). | +| `duplicate-index.csv`, `blank-index-rows.csv`, `skipped-x-suffix.csv` | Inspect; fix in the source spreadsheet if needed. | + +> `unresolved-names.csv` is the focused "names that need a human" list. Non-family +> correspondents that simply aren't in the register are NOT reported — they just become +> provisional persons in `out/canonical-persons.xlsx` (the `unmatched_name_strings` count in +> `summary.txt` tracks how many). The given-name set that drives `ambiguous_pair` detection is +> the register's first names plus `config.EXTRA_GIVEN_NAMES` — add names there if a real +> two-person cell isn't being flagged. + +**Valid `person_id` values** all come from the `person_id` column of `out/canonical-persons.xlsx`. + +## Tests +```bash +.venv/bin/python -m pytest tests/test_dates.py -v # run files individually (never the whole suite at once) +``` diff --git a/tools/import-normalizer/config.py b/tools/import-normalizer/config.py new file mode 100644 index 00000000..afea47c9 --- /dev/null +++ b/tools/import-normalizer/config.py @@ -0,0 +1,134 @@ +"""Tunables for the import normalizer. 
No logic here — only data tables.""" +from pathlib import Path + +# --- Paths --- +BASE_DIR = Path(__file__).resolve().parent +REPO_ROOT = BASE_DIR.parent.parent +IMPORT_DIR = REPO_ROOT / "import" + +DOCUMENT_WORKBOOK = IMPORT_DIR / "zzfamilienarchiv aktuell 2 - Kopie 2025-07-05.xlsx" +DOCUMENT_SHEET = "Familienarchiv" +PERSON_WORKBOOK = IMPORT_DIR / "Personendatei 2.xlsx" +PERSON_SHEET = "Tabelle1" + +OUT_DIR = BASE_DIR / "out" +REVIEW_DIR = BASE_DIR / "review" +OVERRIDES_DIR = BASE_DIR / "overrides" + +# --- Header text (lowercased, whitespace-collapsed) -> canonical field --- +DOCUMENT_HEADER_MAP = { + "index": "index", + "box": "box", + "mappe": "folder", + "briefeschreiberin": "sender", + "empfängerin": "receivers", + "datum des briefes": "date", + "ort": "location", + "schlagwort": "tags", + "inhalt": "summary", +} +DOCUMENT_REQUIRED_FIELDS = {"index"} + +PERSON_HEADER_MAP = { + "generation": "generation", + "familienname": "last_name", + "vorname": "first_name", + "geb als": "maiden_name", + "geburtsdatum": "birth_date", + "geburtsort": "birth_place", + "todesdatum": "death_date", + "sterbeort": "death_place", + "verheiratet mit": "spouse", + "bemerkung": "notes", +} +PERSON_REQUIRED_FIELDS = {"last_name"} + +# --- Century rule (archive 1873–1957) --- +TWO_DIGIT_19XX_MAX = 57 # 00..57 -> 1900+yy +TWO_DIGIT_18XX_MIN = 73 # 73..99 -> 1800+yy ; 58..72 -> ambiguous -> UNKNOWN + +# --- Seasons -> representative month (day = 1) --- +SEASON_MONTHS = { + "frühling": 4, "fruehling": 4, "frühjahr": 4, "fruehjahr": 4, + "sommer": 7, "herbst": 10, "winter": 1, +} + +# --- Fixed feasts -> (month, day) --- +FIXED_FEASTS = { + "neujahr": (1, 1), + "heiligabend": (12, 24), "heiliger abend": (12, 24), + "weihnachten": (12, 25), "weihnacht": (12, 25), "1. 
weihnachtstag": (12, 25), + "silvester": (12, 31), "sylvester": (12, 31), +} + +# --- Movable feasts -> day offset from Easter Sunday --- +MOVABLE_FEASTS = { + "karfreitag": -2, + "ostern": 0, "ostersonntag": 0, "ostermontag": 1, + "himmelfahrt": 39, "christi himmelfahrt": 39, + "pfingsten": 49, "pfingstsonntag": 49, "pfingstmontag": 50, + "fronleichnam": 60, +} + +# --- Month names -> number (German + English, full + abbreviations) --- +MONTHS = { + "januar": 1, "jan": 1, "january": 1, + "februar": 2, "feb": 2, "febr": 2, "february": 2, + "märz": 3, "maerz": 3, "mär": 3, "mar": 3, "march": 3, + "april": 4, "apr": 4, + "mai": 5, "may": 5, + "juni": 6, "jun": 6, "june": 6, + "juli": 7, "jul": 7, "july": 7, + "august": 8, "aug": 8, + "september": 9, "sep": 9, "sept": 9, + "oktober": 10, "okt": 10, "oct": 10, "october": 10, + "november": 11, "nov": 11, + "dezember": 12, "dez": 12, "dec": 12, "december": 12, + # Spanish (Mexican-branch correspondence) + "enero": 1, "febrero": 2, "marzo": 3, "abril": 4, "mayo": 5, "junio": 6, + "julio": 7, "agosto": 8, "septiembre": 9, "setiembre": 9, "octubre": 10, + "noviembre": 11, "diciembre": 12, +} + +ROMAN_MONTHS = { + "i": 1, "ii": 2, "iii": 3, "iv": 4, "v": 5, "vi": 6, + "vii": 7, "viii": 8, "ix": 9, "x": 10, "xi": 11, "xii": 12, +} + +# --- Person matching --- +KNOWN_LAST_NAMES = [ + "von der Heide", "von Massenbach", "von Geldern", "von Gelden", "von Staa", + "de Gruyter", "Dieckmann", "Gruber", "Müller", "Wolff", "Cram", +] +FUZZY_SUGGEST_THRESHOLD = 0.82 # difflib ratio; suggestions only, never auto-applied + +# --- Name classification (unresolved-name review) --- +# Relational reference terms — a sender/receiver named by relation, not a proper name. 
+RELATIONAL_TERMS = { + "tante", "onkel", "mutter", "vater", "oma", "opa", "großmutter", "grossmutter", + "großvater", "grossvater", "schwester", "bruder", "cousin", "cousine", "kusine", + "neffe", "nichte", "tochter", "sohn", "schwager", "schwägerin", "schwiegermutter", + "schwiegervater", "enkel", "enkelin", "vetter", "base", "witwe", "witwer", +} +# Collective/group terms — not a single person. Matched against alpha-only word tokens +# (so "Fam.Cram" -> ["fam","cram"] matches "fam"), NOT as substrings/prefixes. +COLLECTIVE_TERMS = { + "familie", "fam", "kinder", "eltern", "geschwister", "großeltern", + "grosseltern", "alle", "diverse", "div", "gebrüder", "gebr", + # Plural/group relational terms — added for tag generation heuristic + "söhne", "töchter", "brüder", "schwestern", "schwiegereltern", + "vettern", "kusinen", "cousinen", "nichten", "neffen", "tanten", + "freunde", "bekannte", "geschw", "enkelkinder", "jungens", "verwandten", +} +# Markers of an unknown/illegible name (the literal "?" is handled separately in code). +# All long enough to be safe as SUBSTRING matches — do NOT add short tokens like "nn" +# (it occurs inside real names: Hanni, Johanna, Anna). +UNKNOWN_NAME_MARKERS = {"unbekannt", "unbek", "unleserlich", "unklar", "unsicher"} +# A name-column value longer than this (chars) is treated as prose/description, not a name. +PROSE_MAX_LEN = 40 +# Common given names that may appear in two-given-name pairs (e.g. "Ella Anita") but are not +# in the family register. Only used to detect AMBIGUOUS_PAIR — extend as review surfaces more. 
+EXTRA_GIVEN_NAMES = { + "ella", "anita", "kurt", "georg", "hanni", "mieze", "ellen", "leni", "klara", + "margret", "gustava", "emmy", "minna", "sophie", "helga", "raymonde", "augusta", +} diff --git a/tools/import-normalizer/dates.py b/tools/import-normalizer/dates.py new file mode 100644 index 00000000..907178b2 --- /dev/null +++ b/tools/import-normalizer/dates.py @@ -0,0 +1,306 @@ +"""Tolerant historical date parsing for the family archive.""" +import datetime +import re +from dataclasses import dataclass +from enum import StrEnum +import config + + +class Precision(StrEnum): + DAY = "DAY" + MONTH = "MONTH" + SEASON = "SEASON" + YEAR = "YEAR" + RANGE = "RANGE" + APPROX = "APPROX" + UNKNOWN = "UNKNOWN" + + +def _advent_sunday(year: int, n: int) -> datetime.date: + """n-th Advent (1..4). 4th Advent = last Sunday on/before Dec 24.""" + dec24 = datetime.date(year, 12, 24) + back_to_sunday = (dec24.weekday() - 6) % 7 # Mon=0..Sun=6 + fourth = dec24 - datetime.timedelta(days=back_to_sunday) + return fourth - datetime.timedelta(days=(4 - n) * 7) + + +def resolve_feast_or_season(token: str, year: int): + """Return (iso, Precision) for a known feast/season token, else None.""" + key = " ".join(token.lower().split()).strip(" .") + if key in config.MOVABLE_FEASTS: + d = easter(year) + datetime.timedelta(days=config.MOVABLE_FEASTS[key]) + return d.isoformat(), Precision.DAY + if key in config.FIXED_FEASTS: + month, day = config.FIXED_FEASTS[key] + return datetime.date(year, month, day).isoformat(), Precision.DAY + advent = {"1. advent": 1, "2. advent": 2, "3. advent": 3, "4. advent": 4, "advent": 1} + if key in advent: + return _advent_sunday(year, advent[key]).isoformat(), Precision.DAY + if key in config.SEASON_MONTHS: + return datetime.date(year, config.SEASON_MONTHS[key], 1).isoformat(), Precision.SEASON + return None + + +def expand_year(token: str): + """Expand a 2/3/4-digit year string per the 1873–1957 century rule. 
None if ambiguous.""" + token = token.strip() + if not token.isdigit(): + return None + n, v = len(token), int(token) + if n == 4: + # reject gross typos (e.g. "9003") so they go to review instead of a bogus year + return v if 1700 <= v <= 2100 else None + if n == 3: + return 1000 + v + if n == 2: + if v <= config.TWO_DIGIT_19XX_MAX: + return 1900 + v + if v >= config.TWO_DIGIT_18XX_MIN: + return 1800 + v + return None + return None + + +@dataclass(frozen=True) +class ParsedDate: + iso: str | None + precision: Precision + raw: str + end: str | None = None # RANGE end day; None for every non-RANGE precision + # True only for a half-resolved RANGE: the start parsed but the end did not, so + # the end was dropped and the row should surface in review (#670, Gap 2). + needs_review: bool = False + + +@dataclass(frozen=True) +class MatchResult: + """Uniform return shape for every _match_* matcher. + + A matcher returns None when it does not match, or a MatchResult when it does. + `end` is the RANGE end day (None for every non-RANGE precision); `needs_review` + is True only for a half-resolved RANGE whose start parsed but end did not. + """ + iso: str + precision: Precision + end: str | None = None + needs_review: bool = False + + +_LEADING_MARKERS = re.compile( + r"^(um|ca\.?|circa|etwa|wohl|vermutlich|nach|vor|anfang|mitte|ende)\s+", re.I) + + +def _preprocess(raw: str): + """Return (cleaned_string, approx_flag). Any uncertainty/qualifier marker -> approx.""" + s = (raw or "").strip() + if not s: + return "", False + low = s.lower() + approx = ("?" in s) or any( + m in low for m in ("um ", "ca.", "ca ", "circa", "etwa", "wohl", "vermutlich")) + s = re.sub(r"\(\s*\?\s*\)", " ", s) # remove "(?)" + s = s.replace("?", " ") + s = re.sub(r",.*$", "", s) # drop trailing editorial note (", 2. 
Brief") + stripped = _LEADING_MARKERS.sub("", s) + if stripped != s: # a leading qualifier (um/ca/nach/vor/anfang/…) signals approximation + approx = True + s = re.sub(r"\s+", " ", stripped).strip(" .,") + return s, approx + + +_NUM_RE = re.compile(r"(\d{1,2})[./](\d{1,2})\.?\s*(\d{2,4})") + + +def _match_iso(s): + if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s): + try: + datetime.date.fromisoformat(s) + return MatchResult(s, Precision.DAY) + except ValueError: + return None + return None + + +def _match_numeric(s): + m = _NUM_RE.fullmatch(s) + if not m: + return None + day, month = int(m.group(1)), int(m.group(2)) + year = expand_year(m.group(3)) + if year is None or not (1 <= month <= 12): + return None + try: + return MatchResult(datetime.date(year, month, day).isoformat(), Precision.DAY) + except ValueError: + return None + + +_ROMAN_RE = re.compile(r"(\d{1,2})\.\s*([IVXLC]+)\.?\s*(\d{2,4})", re.I) + + +def _match_roman(s): + m = _ROMAN_RE.fullmatch(s) + if not m: + return None + day = int(m.group(1)) + month = config.ROMAN_MONTHS.get(m.group(2).lower()) + year = expand_year(m.group(3)) + if not month or year is None: + return None + try: + return MatchResult(datetime.date(year, month, day).isoformat(), Precision.DAY) + except ValueError: + return None + + +_MONTH_A_RE = re.compile(r"(\d{1,2})[.\s]*([A-Za-zÄÖÜäöü]+)\.?\s*(\d{2,4})") + + +def _lookup_month(token: str): + return config.MONTHS.get(token.lower().strip(" .")) + + +def _build_day_month_year(day, month, year): + if not month or year is None or not (1 <= month <= 12): + return None + try: + return MatchResult(datetime.date(year, month, day).isoformat(), Precision.DAY) + except ValueError: + return None + + +def _match_monthname_a(s): + m = _MONTH_A_RE.fullmatch(s) + if not m: + return None + return _build_day_month_year(int(m.group(1)), _lookup_month(m.group(2)), expand_year(m.group(3))) + + +# A separator (dot OR hyphen/en-dash) after the day is REQUIRED so this can't match +# "Mai 1895" (MONTH YYYY) as 
day=18; the hyphen form also covers Spanish "Mayo 18-1929". +_MONTH_B_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s*(\d{1,2})\s*[.\-–]\s*(\d{2,4})") + + +def _match_monthname_b(s): + m = _MONTH_B_RE.fullmatch(s) + if not m: + return None + return _build_day_month_year(int(m.group(2)), _lookup_month(m.group(1)), expand_year(m.group(3))) + + +_MONTH_YEAR_RE = re.compile(r"([A-Za-zÄÖÜäöü]+)\.?\s+(\d{2,4})") +_TOKEN_YEAR_RE = re.compile(r"(.+?)\.?\s+(\d{2,4})") +_YEAR_ONLY_RE = re.compile(r"\d{4}") +_RANGE_YY_RE = re.compile(r"(\d{4})\s*/\s*\d{2}") +_RANGE_HYPHEN_RE = re.compile(r"(.*\d)\s*[-–]\s*\d.*") +# Intra-month day range, e.g. "7./8. Sept.1923" — require a dot before the slash so it +# does NOT swallow slash-as-dot single dates like "17/6. 1916" (which has no dot before "/"). +_RANGE_DAY_RE = re.compile(r"(\d{1,2})\./(\d{1,2})\.\s*(.+)") + + +def _match_month_year(s): + m = _MONTH_YEAR_RE.fullmatch(s) + if not m: + return None + month = _lookup_month(m.group(1)) + year = expand_year(m.group(2)) + if not month or year is None: + return None + return MatchResult(datetime.date(year, month, 1).isoformat(), Precision.MONTH) + + +def _match_feast_season(s): + m = _TOKEN_YEAR_RE.fullmatch(s) + if not m: + return None + year = expand_year(m.group(2)) + if year is None: + return None + resolved = resolve_feast_or_season(m.group(1), year) + if resolved is None: + return None + iso, precision = resolved + return MatchResult(iso, precision) + + +def _match_year_only(s): + if _YEAR_ONLY_RE.fullmatch(s): + return MatchResult(datetime.date(int(s), 1, 1).isoformat(), Precision.YEAR) + return None + + +def _match_range(s): + m = _RANGE_YY_RE.fullmatch(s) + if m: + return MatchResult(datetime.date(int(m.group(1)), 1, 1).isoformat(), Precision.RANGE) + m = _RANGE_DAY_RE.fullmatch(s) + if m: + day_start, day_end, rest = m.group(1), m.group(2), m.group(3) + # "10." 
+ "1.1917" -> "10.1.1917"; resolve start and end day against the shared month/year + for matcher in (_match_numeric, _match_roman, _match_monthname_a): + start = matcher(f"{day_start}.{rest}") + if start: + end = matcher(f"{day_end}.{rest}") + # Half-resolved range (start parsed, end did not — e.g. the impossible + # end day in "10./40.1.1917"): keep the start and RANGE precision, drop + # the end, and flag needs_review so the dropped end surfaces (#670, Gap 2). + return MatchResult(start.iso, Precision.RANGE, + end.iso if end else None, + needs_review=end is None) + m = _RANGE_HYPHEN_RE.fullmatch(s) + if m: + start = m.group(1).strip() + for matcher in (_match_numeric, _match_roman, _match_monthname_a, _match_year_only): + r = matcher(start) + if r: + return MatchResult(r.iso, Precision.RANGE) + return None + + +_MATCHERS = [ + _match_iso, + _match_range, + _match_numeric, + _match_roman, + _match_monthname_a, + _match_month_year, + _match_monthname_b, + _match_feast_season, + _match_year_only, +] + + +def parse_date(raw: str, date_overrides: dict | None = None) -> ParsedDate: + if date_overrides: + key = (raw or "").strip() + if key in date_overrides: + iso, prec = date_overrides[key] + return ParsedDate(iso or None, Precision(prec), raw) + cleaned, approx = _preprocess(raw) + if not cleaned: + return ParsedDate(None, Precision.UNKNOWN, raw) + for matcher in _MATCHERS: + result = matcher(cleaned) + if result: + precision = Precision.APPROX if approx else result.precision + return ParsedDate(result.iso, precision, raw, result.end, result.needs_review) + return ParsedDate(None, Precision.UNKNOWN, raw) + + +def easter(year: int) -> datetime.date: + """Easter Sunday (Gregorian) via the Anonymous Gregorian / Butcher algorithm.""" + a = year % 19 + b = year // 100 + c = year % 100 + d = b // 4 + e = b % 4 + f = (b + 8) // 25 + g = (b - f + 1) // 3 + h = (19 * a + b - d - g + 15) % 30 + i = c // 4 + k = c % 4 + l = (32 + 2 * e + 2 * i - h - k) % 7 + m = (a + 11 * h + 22 
* l) // 451 + month = (h + l - 7 * m + 114) // 31 + day = ((h + l - 7 * m + 114) % 31) + 1 + return datetime.date(year, month, day) diff --git a/tools/import-normalizer/documents.py b/tools/import-normalizer/documents.py new file mode 100644 index 00000000..08bb1c31 --- /dev/null +++ b/tools/import-normalizer/documents.py @@ -0,0 +1,111 @@ +"""Document row extraction, triage, and the canonical document record.""" +from dataclasses import dataclass, field +from enum import Enum, auto + +import dates as _dates +import tags as _tags + + +class Triage(Enum): + OK = auto() + EMPTY = auto() + BLANK_INDEX = auto() + X_SUFFIX = auto() + + +@dataclass +class RawRow: + source_row: int + index: str = "" + box: str = "" + folder: str = "" + sender: str = "" + receivers: str = "" + date: str = "" + location: str = "" + tags: str = "" + summary: str = "" + + +@dataclass +class CanonicalDocument: + index: str + box: str = "" + folder: str = "" + sender_person_id: str = "" + sender_name: str = "" + receiver_person_ids: list = field(default_factory=list) + receiver_names: list = field(default_factory=list) + date_iso: str = "" + date_raw: str = "" + date_precision: str = "" + date_end: str = "" + location: str = "" + tags: list = field(default_factory=list) + summary: str = "" + source_row: int = 0 + needs_review: list = field(default_factory=list) + + +_FIELDS = ["index", "box", "folder", "sender", "receivers", "date", "location", "tags", "summary"] + + +def extract_row(cells: list[str], header: dict[str, int], source_row: int) -> RawRow: + def get(field_name): + idx = header.get(field_name) + if idx is None or idx >= len(cells): + return "" + return (cells[idx] or "").strip() + return RawRow(source_row=source_row, **{f: get(f) for f in _FIELDS}) + + +def triage(cells: list[str], index_col: int = 0) -> Triage: + nonempty = [c for c in cells if c and str(c).strip()] + if not nonempty: + return Triage.EMPTY + index = (cells[index_col] or "").strip() if 0 <= index_col < len(cells) 
def classify_blank_index(cells: list[str], header: dict[str, int]) -> str:
    """REQ-TRIAGE-02: classify a row that has no index value.

    Returns 'section_banner' when at least one cell is filled and every filled
    cell sits in the sender/receivers columns; returns 'data_no_index' in all
    other cases (including a row with no filled cell at all).
    """
    # Column positions of the two name fields; a field absent from the header is ignored.
    name_columns = {
        col for col in (header.get("sender"), header.get("receivers")) if col is not None
    }
    filled = {pos for pos, cell in enumerate(cells) if cell and str(cell).strip()}
    if not filled or not filled.issubset(name_columns):
        return "data_no_index"
    return "section_banner"
def _cell_to_str(value) -> str:
    """Convert one spreadsheet cell value to its neutral string form.

    None becomes "", datetimes are reduced to their ISO date, dates use their
    ISO form, integral floats drop the trailing ".0", and anything else is
    str()-converted and stripped.
    """
    if value is None:
        return ""
    if isinstance(value, bool):  # bool is a subclass of int — handle before the int branch
        return str(value)
    if isinstance(value, datetime.datetime):
        return value.date().isoformat()
    if isinstance(value, datetime.date):
        return value.isoformat()
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    if isinstance(value, int):
        return str(value)
    return str(value).strip()


def read_sheet(path: Path, sheet_name: str) -> list[list[str]]:
    """Read one worksheet into a list of rows of strings.

    Args:
        path: Path of the .xlsx workbook.
        sheet_name: Name of the worksheet to read.

    Returns:
        Every row of the sheet, each cell converted with _cell_to_str.

    Raises:
        ValueError: if the workbook has no sheet called `sheet_name`.
    """
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    # The workbook is opened read-only and must be closed explicitly; do it in
    # `finally` so a missing sheet or a failure while iterating rows does not leak it.
    try:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"Sheet '{sheet_name}' not found in {path.name}; sheets: {wb.sheetnames}")
        ws = wb[sheet_name]
        return [[_cell_to_str(v) for v in row] for row in ws.iter_rows(values_only=True)]
    finally:
        wb.close()


def _norm_header(text: str) -> str:
    """Lowercase a header cell and collapse all whitespace runs to single spaces."""
    return " ".join(text.lower().split())


def build_header_map(header_row: list[str], field_map: dict[str, str], required: set[str]):
    """Return (field->col_index, unknown_headers). Raise ValueError if a required field is missing.

    Args:
        header_row: The raw header cells of the sheet.
        field_map: Normalised header text -> canonical field name.
        required: Canonical field names that must be present.

    Returns:
        A tuple of (dict mapping each recognised field to its column index,
        list of non-blank header cells that are not in `field_map`). If the
        same field appears in several columns, the last column wins.

    Raises:
        ValueError: if any field in `required` was not found in the header row.
    """
    fields: dict[str, int] = {}
    unknown: list[str] = []
    for idx, raw in enumerate(header_row):
        key = _norm_header(raw)
        if key in field_map:
            fields[field_map[key]] = idx
        elif raw.strip():
            unknown.append(raw)
    missing = required - set(fields)
    if missing:
        raise ValueError(f"Required header(s) missing: {sorted(missing)} (found headers: {header_row})")
    return fields, unknown
ingest.build_header_map(doc_rows[0], config.DOCUMENT_HEADER_MAP, config.DOCUMENT_REQUIRED_FIELDS) + index_col = d_fields["index"] + + canon_docs, blank_index, skipped_x = [], [], [] + unparsed_by_raw: dict[str, list] = {} + dates_by_override = 0 + empty_count = 0 + seen_index = Counter() + + for source_row, cells in enumerate(doc_rows[1:], start=2): + t = documents.triage(cells, index_col) + if t is documents.Triage.EMPTY: + empty_count += 1 + continue + if t is documents.Triage.BLANK_INDEX: + blank_index.append([source_row, documents.classify_blank_index(cells, d_fields), + " | ".join(c for c in cells if c)]) + continue + if t is documents.Triage.X_SUFFIX: + idx = (cells[index_col] or "").strip() + skipped_x.append([source_row, idx, idx[:-1]]) + continue + raw = documents.extract_row(cells, d_fields, source_row) + seen_index[raw.index] += 1 + if raw.date.strip() and raw.date.strip() in date_overrides: + dates_by_override += 1 + doc = documents.to_canonical(raw, ctx, date_overrides, frozenset(approved_themes)) + if "unparsed_date" in doc.needs_review: + unparsed_by_raw.setdefault(raw.date, []).append(source_row) + canon_docs.append(doc) + + # REQ-TRIAGE-01: flag EVERY occurrence of a duplicated index and report all of them. 
+ dup_indexes = {idx for idx, n in seen_index.items() if n > 1} + duplicates = [] + for doc in canon_docs: + if doc.index in dup_indexes: + if "duplicate_index" not in doc.needs_review: + doc.needs_review.append("duplicate_index") + duplicates.append([doc.source_row, doc.index]) + + all_people = register + list(ctx.provisional.values()) + + # --- write canonical outputs --- + writers.write_documents_xlsx(canon_docs, out_dir / "canonical-documents.xlsx") + writers.write_persons_xlsx(all_people, out_dir / "canonical-persons.xlsx") + + all_tag_paths = [path for doc in canon_docs for path in doc.tags] + writers.write_tag_tree_xlsx(_tags.build_tag_tree(all_tag_paths), out_dir / "canonical-tag-tree.xlsx") + + # --- review files --- + # unparsed dates: most-frequent first, with example source rows + blank override cells so a + # corrected row can be pasted straight into overrides/dates.csv (same raw,iso,precision shape). + unparsed_rows = sorted( + ([raw, len(rows), " ".join(map(str, rows[:5])), "", ""] for raw, rows in unparsed_by_raw.items()), + key=lambda r: (-r[1], r[0])) + writers.write_review_csv(review_dir / "unparsed-dates.csv", + ["raw", "count", "example_rows", "suggested_iso", "suggested_precision"], unparsed_rows) + + writers.write_review_csv(review_dir / "duplicate-index.csv", ["source_row", "index"], duplicates) + writers.write_review_csv(review_dir / "blank-index-rows.csv", ["source_row", "kind", "content"], blank_index) + writers.write_review_csv(review_dir / "skipped-x-suffix.csv", ["source_row", "index", "base_index"], skipped_x) + unresolved_agg: dict[tuple, list] = {} + for name, category, row in ctx.unresolved: + unresolved_agg.setdefault((category, name), []).append(row) + unresolved_rows = sorted( + ([cat, name, len(rows), " ".join(map(str, sorted(rows)[:5]))] + for (cat, name), rows in unresolved_agg.items()), + key=lambda r: (r[0], -r[2], r[1])) + writers.write_review_csv(review_dir / "unresolved-names.csv", + ["category", "raw", "count", 
"example_rows"], unresolved_rows) + + all_summaries = [doc.summary for doc in canon_docs if doc.summary] + candidates = _tags.mine_summary_candidates(all_summaries) + writers.write_review_csv(review_dir / "tag-candidates.csv", ["candidate", "count"], + [[c, n] for c, n in candidates]) + + dated = sum(1 for d in canon_docs if d.date_raw.strip()) + unknown = sum(1 for d in canon_docs if d.date_raw.strip() and d.date_precision == "UNKNOWN") + unknown_rate = f"{(100 * unknown / dated):.1f}%" if dated else "0.0%" + + stats = { + "# INPUTS": "", + "document_rows_read": len(doc_rows) - 1, + "register_persons": len(register), + "unknown_headers": ", ".join(unknown_headers) or "(none)", + "# OUTPUTS": "", + "documents_emitted": len(canon_docs), + "provisional_persons": len(ctx.provisional), + "# DATES": "", + "dated_rows": dated, + "unparsed_dates": unknown, + "unknown_date_rate": f"{unknown_rate} (target <=5%)", + "distinct_unparsed_formats": len(unparsed_by_raw), + "# NAMES": "", + "unmatched_name_strings": len(ctx.unmatched), + "unresolved_name_occurrences": len(ctx.unresolved), + "unresolved_unknown": sum(1 for _, c, _ in ctx.unresolved if c == "unknown"), + "unresolved_single_token": sum(1 for _, c, _ in ctx.unresolved if c == "single_token"), + "unresolved_relational": sum(1 for _, c, _ in ctx.unresolved if c == "relational"), + "unresolved_collective": sum(1 for _, c, _ in ctx.unresolved if c == "collective"), + "unresolved_prose": sum(1 for _, c, _ in ctx.unresolved if c == "prose"), + "unresolved_ambiguous_pair": sum(1 for _, c, _ in ctx.unresolved if c == "ambiguous_pair"), + "# ANOMALIES": "", + "empty_rows": empty_count, + "blank_index_rows": len(blank_index), + "skipped_x_suffix": len(skipped_x), + "duplicate_index_rows": len(duplicates), + "# OVERRIDES": "", + "date_overrides_loaded": len(date_overrides), + "name_overrides_loaded": len(name_overrides), + "dates_resolved_by_override": dates_by_override, + "names_resolved_by_override": ctx.override_hits, + } + 
writers.write_summary(review_dir / "summary.txt", stats) + return stats + + +def main(): + parser = argparse.ArgumentParser(description="Normalize the family archive spreadsheets.") + parser.parse_args() + date_overrides, name_overrides = overrides_mod.load_overrides( + config.OVERRIDES_DIR / "dates.csv", config.OVERRIDES_DIR / "names.csv") + stats = run( + document_workbook=config.DOCUMENT_WORKBOOK, document_sheet=config.DOCUMENT_SHEET, + person_workbook=config.PERSON_WORKBOOK, person_sheet=config.PERSON_SHEET, + out_dir=config.OUT_DIR, review_dir=config.REVIEW_DIR, + date_overrides=date_overrides, name_overrides=name_overrides, + approved_themes_path=config.OVERRIDES_DIR / "approved-themes.csv") + print("Normalization complete:") + for k, v in stats.items(): + print(f" {k}: {v}") + + +if __name__ == "__main__": + main() diff --git a/tools/import-normalizer/overrides.py b/tools/import-normalizer/overrides.py new file mode 100644 index 00000000..65638dff --- /dev/null +++ b/tools/import-normalizer/overrides.py @@ -0,0 +1,21 @@ +"""Load human-supplied corrections. 
Missing files are not an error.""" +import csv +from pathlib import Path + + +def load_overrides(dates_path: Path, names_path: Path): + date_overrides: dict[str, tuple[str, str]] = {} + name_overrides: dict[str, str] = {} + if Path(dates_path).exists(): + with open(dates_path, encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + raw = (row.get("raw") or "").strip() + if raw: + date_overrides[raw] = ((row.get("iso") or "").strip(), (row.get("precision") or "UNKNOWN").strip()) + if Path(names_path).exists(): + with open(names_path, encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + raw = (row.get("raw") or "").strip() + if raw: + name_overrides[raw] = (row.get("person_id") or "").strip() + return date_overrides, name_overrides diff --git a/tools/import-normalizer/overrides/README.md b/tools/import-normalizer/overrides/README.md new file mode 100644 index 00000000..f5ee0a9b --- /dev/null +++ b/tools/import-normalizer/overrides/README.md @@ -0,0 +1,81 @@ +# Overrides + +Human corrections applied **deterministically on every run**. An override **wins** over the +automatic date parser / name matcher, so this is how you fix the residue the tool can't resolve +on its own. Two CSV files live here; both are read by `overrides.load_overrides()`. + +- Missing or header-only files are fine — they just contribute zero overrides. +- Keep these files committed to git (they're your curated corrections); the generated `out/` + and `review/` folders are *not* committed. +- Matching is **exact** on the `raw` value after trimming surrounding whitespace. Copy the + `raw` value verbatim from the matching `review/*.csv`. + +## The iteration loop + +1. Run `python normalize.py`. +2. Open `review/unparsed-dates.csv` and `review/unresolved-names.csv` (sorted by frequency). +3. Add correction rows here, then re-run. Repeat until the residue is acceptable. 
+ +--- + +## `dates.csv` — fix unparseable dates + +Header: `raw,iso,precision` + +| column | meaning | +| --- | --- | +| `raw` | the date string exactly as written in the spreadsheet (= the `raw` column in `review/unparsed-dates.csv`). | +| `iso` | the corrected date as `YYYY-MM-DD`. For partial dates use the 1st: month-only → `YYYY-MM-01`, year-only → `YYYY-01-01`. Leave **empty** if truly unknown. | +| `precision` | one of `DAY`, `MONTH`, `SEASON`, `YEAR`, `RANGE`, `APPROX`, `UNKNOWN`. | + +### Example + +```csv +raw,iso,precision +23.Juni 58,1958-06-23,DAY +8.März 60,1960-03-08,DAY +Mayo 18-1929,1929-05-18,DAY +Abril 10-929,1929-04-10,DAY +30.April,1909-04-30,DAY +Mai 1895,1895-05-01,MONTH +Herbst 1913,1913-10-01,SEASON +1945/46,1945-01-01,RANGE +um 1920,1920-01-01,APPROX +?,,UNKNOWN +``` + +Notes: +- `23.Juni 58` / `8.März 60` — two-digit years `58`/`60` fall in the parser's ambiguous + `58–72` band (just past the 1873–1957 window), so they aren't auto-parsed; here you assert 1958/1960. +- `Mayo`/`Abril` — Spanish month names (Mexican-branch letters) the parser doesn't know yet. +- `30.April` — month+day with no year; pick the year from the letter's context. +- Empty `iso` + `UNKNOWN` records a deliberate "unknown date" (stops it showing up as residue). + +--- + +## `names.csv` — map a name string to a canonical person + +Header: `raw,person_id` + +| column | meaning | +| --- | --- | +| `raw` | the sender/receiver name string exactly as written (= the `raw` column in `review/unresolved-names.csv`). For a multi-name cell that was split (e.g. `"Walter und Eugenie"`), use the **individual** name part. | +| `person_id` | the canonical id to map it to. **Must be a real id** from the `person_id` column of `out/canonical-persons.xlsx` (a register person or an already-created provisional). | + +### Example + +```csv +raw,person_id +A.Klucke,klucke-anna +? 
Hans de Gruyter,de-gruyter-hans +Eltern Cram,cram-john-james +Tante Lolly,blomquist-charlotte +``` + +Notes: +- Use this for partial / misspelled / illegible / aliased names that should point at a known person. +- It maps one string → **one** person. It does **not** split a two-person cell: for genuine + pairs like `Ella Anita` (flagged `ambiguous_pair`), there is no split-via-override yet — leave + them, or add both given names to `config.EXTRA_GIVEN_NAMES` so they keep getting flagged. +- Look up valid `person_id` values in `out/canonical-persons.xlsx`. An id that doesn't exist + there will create a dangling reference (no validation yet). diff --git a/tools/import-normalizer/overrides/approved-themes.csv b/tools/import-normalizer/overrides/approved-themes.csv new file mode 100644 index 00000000..02e8acdc --- /dev/null +++ b/tools/import-normalizer/overrides/approved-themes.csv @@ -0,0 +1 @@ +candidate diff --git a/tools/import-normalizer/overrides/dates.csv b/tools/import-normalizer/overrides/dates.csv new file mode 100644 index 00000000..f4ace38f --- /dev/null +++ b/tools/import-normalizer/overrides/dates.csv @@ -0,0 +1 @@ +raw,iso,precision diff --git a/tools/import-normalizer/overrides/names.csv b/tools/import-normalizer/overrides/names.csv new file mode 100644 index 00000000..445b0cb1 --- /dev/null +++ b/tools/import-normalizer/overrides/names.csv @@ -0,0 +1 @@ +raw,person_id diff --git a/tools/import-normalizer/persons.py b/tools/import-normalizer/persons.py new file mode 100644 index 00000000..fa257510 --- /dev/null +++ b/tools/import-normalizer/persons.py @@ -0,0 +1,336 @@ +"""Person register parsing, name splitting, alias resolution.""" +import difflib +import re +import unicodedata +from collections import Counter +from dataclasses import dataclass, field +from enum import StrEnum + +import config +import dates + +_DIACRITIC_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss", + "Ä": "ae", "Ö": "oe", "Ü": "ue"}) + + +def _strip_accents(s: 
str) -> str: + s = s.translate(_DIACRITIC_MAP) + s = unicodedata.normalize("NFKD", s) + return "".join(c for c in s if not unicodedata.combining(c)) + + +def slugify(last: str, first: str) -> str: + raw = f"{last} {first}".strip() + raw = _strip_accents(raw).lower() + raw = re.sub(r"[^a-z0-9]+", "-", raw).strip("-") + return raw or "unknown" + + +@dataclass +class Person: + person_id: str + last_name: str = "" + first_name: str = "" + maiden_name: str = "" + title: str = "" + nickname: str = "" + extra_given_names: list[str] = field(default_factory=list) + birth_date: str | None = None + birth_date_raw: str = "" + birth_place: str = "" + death_date: str | None = None + death_date_raw: str = "" + death_place: str = "" + spouse: str = "" + generation: str = "" + notes: str = "" + aliases: list[str] = field(default_factory=list) + provisional: bool = False + + +_QUOTED_RE = re.compile(r'^[""\']\s*(.+?)\s*[""\']$') + + +def parse_register(rows: list[dict]) -> list[Person]: + people = [] + for r in rows: + last = (r.get("last_name") or "").strip() + if not last: + continue + given_raw = (r.get("first_name") or "").strip() + givens = [g.strip() for g in given_raw.split(",") if g.strip()] + first = givens[0] if givens else "" + extra = givens[1:] + + spouse_raw = (r.get("spouse") or "").strip() + nickname = "" + m = _QUOTED_RE.match(spouse_raw) + if m: + nickname = m.group(1) + spouse_raw = "" + + birth = dates.parse_date(r.get("birth_date") or "") + death = dates.parse_date(r.get("death_date") or "") + people.append(Person( + person_id=slugify(last, first), + last_name=last, first_name=first, maiden_name=(r.get("maiden_name") or "").strip(), + nickname=nickname, extra_given_names=extra, + birth_date=birth.iso, birth_date_raw=(r.get("birth_date") or "").strip(), birth_place=(r.get("birth_place") or "").strip(), + death_date=death.iso, death_date_raw=(r.get("death_date") or "").strip(), death_place=(r.get("death_place") or "").strip(), + spouse=spouse_raw, 
generation=(r.get("generation") or "").strip(), + notes=(r.get("notes") or "").strip(), provisional=False, + )) + # De-duplicate colliding ids: every member of a colliding group gets a numeric suffix + # (-1, -2, …) so no id is left as an ambiguous "base". Unique ids are untouched. + counts = Counter(p.person_id for p in people) + seen: dict[str, int] = {} + for p in people: + if counts[p.person_id] > 1: + seen[p.person_id] = seen.get(p.person_id, 0) + 1 + p.person_id = f"{p.person_id}-{seen[p.person_id]}" + return people + + +_GEB_RE = re.compile(r",?\s*geb\.?\s+.+$", re.I) +_PAREN_RE = re.compile(r"\(([^)]+)\)\s*$") +_MULTI_RE = re.compile(r"\s+(?:und|u)\s+", re.I) + + +def find_known_last_name(segment: str) -> str | None: + seg = segment.strip() + for ln in config.KNOWN_LAST_NAMES: # config lists longest-first + if seg == ln or seg.endswith(" " + ln): + return ln + return None + + +def split_receivers(raw: str) -> list[str]: + if not raw or not raw.strip(): + return [] + # 0. split on "//" + if "//" in raw: + out = [] + for seg in raw.split("//"): + out.extend(split_receivers(seg)) + return out + cleaned = _GEB_RE.sub("", raw).strip() + if not cleaned: # e.g. a "geb. 
Müller"-only cell strips to empty + return [] + if not _MULTI_RE.search(cleaned): + return [cleaned] + shared_last = None + pm = _PAREN_RE.search(cleaned) + if pm: + shared_last = pm.group(1).strip() + cleaned = cleaned[:pm.start()].strip() + parts = [p.strip() for p in _MULTI_RE.split(cleaned)] + parts = [p for p in parts if p and p.lower() != "familie"] + if not parts: + return [] + if len(parts) == 1: + return [parts[0]] + if shared_last: + return [p if " " in p else f"{p} {shared_last}" for p in parts] + last_seg = parts[-1] + detected = find_known_last_name(last_seg) + if detected: + result = [] + for p in parts[:-1]: + if " " not in p and find_known_last_name(p) is None: + result.append(f"{p} {detected}") + else: + result.append(p) + result.append(last_seg) + return result + return parts + + +def _norm(name: str) -> str: + return re.sub(r"\s+", " ", _strip_accents(name).lower().replace(".", " ")).strip() + + +class NameClass(StrEnum): + RESOLVABLE = "resolvable" + UNKNOWN = "unknown" + SINGLE_TOKEN = "single_token" + RELATIONAL = "relational" + COLLECTIVE = "collective" + PROSE = "prose" + AMBIGUOUS_PAIR = "ambiguous_pair" + + +_QUOTE_CHARS = "\"'\u201c\u201d\u201e\u201a\u2018\u2019" + + +def classify_name(raw: str, given_names: set[str]) -> NameClass: + """Classify a (post-split) sender/receiver string by why it may be unresolvable. + + Precedence (first match wins): UNKNOWN -> PROSE -> COLLECTIVE -> RELATIONAL -> + SINGLE_TOKEN -> AMBIGUOUS_PAIR -> RESOLVABLE. + """ + s = raw.strip() + if not s: + return NameClass.RESOLVABLE + low = s.lower() + tokens = s.split() + # alpha-only word tokens: "Fam.Cram" -> ["fam","cram"], so collective/relational terms + # are matched as whole words (no substring/prefix false positives like "Allerton"). + alpha_words = re.findall(r"[a-zäöüß]+", low) + if "?" 
in s or any(m in low for m in config.UNKNOWN_NAME_MARKERS): + return NameClass.UNKNOWN + if (len(s) > config.PROSE_MAX_LEN or any(c.isdigit() for c in s) + or any(q in s for q in _QUOTE_CHARS) or len(tokens) > 3): + return NameClass.PROSE + if any(w in config.COLLECTIVE_TERMS for w in alpha_words): + return NameClass.COLLECTIVE + if any(w in config.RELATIONAL_TERMS for w in alpha_words): + return NameClass.RELATIONAL + if len(tokens) == 1: + return NameClass.SINGLE_TOKEN + if len(tokens) == 2 and all(_norm(t) in given_names for t in tokens): + return NameClass.AMBIGUOUS_PAIR + return NameClass.RESOLVABLE + + +# Known limitation: a 4+-token name with no digits/quotes (e.g. "Anna von der Heide") is +# classified PROSE. Such multi-particle names are rare here and usually resolve via the +# register; if they surface in review, lower-priority than the real prose entries. + + +def build_given_names(register: list[Person], extra: set[str]) -> set[str]: + """Set of normalized given names from the register (first + extra given) plus a supplement. + + Used by classify_name to tell a two-given-name pair (two people) from a first+surname. + """ + names: set[str] = set() + for p in register: + if p.first_name: + names.add(_norm(p.first_name)) + for g in p.extra_given_names: + names.add(_norm(g)) + for e in extra: + names.add(_norm(e)) + return names + + +class AliasIndex: + def __init__(self, people: list[Person]): + self._by_alias: dict[str, str] = {} + self._display: dict[str, str] = {} + self.known_ids: set[str] = {p.person_id for p in people} + first_name_ids: dict[str, list] = {} + for p in people: + self._display[p.person_id] = f"{p.first_name} {p.last_name}".strip() + # Ordered, de-duplicated forms (NOT a set) so alias order is deterministic — NFR-IDEM-01. 
+ forms = [f"{p.first_name} {p.last_name}".strip()] + if p.maiden_name: + forms.append(f"{p.first_name} {p.maiden_name}".strip()) + for extra in p.extra_given_names: + forms.append(f"{extra} {p.last_name}".strip()) + if p.nickname: + forms.append(p.nickname) + seen = set() + for form in forms: + if form in seen: + continue + seen.add(form) + key = _norm(form) + if key and key not in self._by_alias: + self._by_alias[key] = p.person_id + p.aliases.append(form) + if p.first_name: + ids = first_name_ids.setdefault(_norm(p.first_name), []) + if p.person_id not in ids: + ids.append(p.person_id) + # first-name-only alias, only when unambiguous + for fname, ids in first_name_ids.items(): + if len(ids) == 1 and fname not in self._by_alias: + self._by_alias[fname] = ids[0] + + def resolve(self, name: str): + return self._by_alias.get(_norm(name)) + + def display(self, person_id: str) -> str: + return self._display.get(person_id, "") + + def suggest(self, name: str): + keys = list(self._by_alias.keys()) + match = difflib.get_close_matches(_norm(name), keys, n=1, cutoff=config.FUZZY_SUGGEST_THRESHOLD) + if not match: + return None, 0.0 + score = difflib.SequenceMatcher(None, _norm(name), match[0]).ratio() + return self._by_alias[match[0]], score + + +class ResolutionContext: + """Resolves raw name strings to person ids; accumulates provisional persons and review data.""" + def __init__(self, alias_index: AliasIndex, name_overrides: dict[str, str], + given_names: set[str] | None = None): + self.index = alias_index + self.name_overrides = name_overrides + self.given_names = given_names or set() + self.provisional: dict[str, Person] = {} + self.unmatched: dict[str, list] = {} + self.unresolved: list[tuple] = [] # (raw_name, category, source_row) for non-RESOLVABLE names + self._raw_to_pid: dict[str, str] = {} + self.override_hits = 0 + + def _unique_id(self, base: str) -> str: + """A provisional id must never collide with a register id or another provisional.""" + used = 
self.index.known_ids | set(self.provisional) + pid, n = base, 1 + while pid in used: + n += 1 + pid = f"{base}-{n}" + return pid + + def resolve_one(self, raw_name: str, source_row: int): + """Return (person_id, display_name, matched: bool). '' name -> ('', '', True).""" + name = (raw_name or "").strip() + if not name: + return "", "", True + if name in self.name_overrides: + self.override_hits += 1 + pid = self.name_overrides[name] + return pid, self.index.display(pid) or name, True + pid = self.index.resolve(name) + if pid: + return pid, self.index.display(pid) or name, True + # provisional person (unmatched) — never reuse a register id + self.unmatched.setdefault(name, []).append(source_row) + category = classify_name(name, self.given_names) + if category is not NameClass.RESOLVABLE: + self.unresolved.append((name, str(category), source_row)) + if name in self._raw_to_pid: + return self._raw_to_pid[name], name, False + last, first = _last_first(name) + pid = self._unique_id(slugify(last, first)) + self.provisional[pid] = Person(person_id=pid, last_name=last, first_name=first, provisional=True) + self._raw_to_pid[name] = pid + return pid, name, False + + def resolve_sender(self, raw: str, source_row: int): + """Senders are split like receivers (REQ-PERS-01). 
Primary = first part; multi flagged.""" + parts = split_receivers(raw) + if not parts: + return "", "", True, False + pid, name, matched = self.resolve_one(parts[0], source_row) + for extra in parts[1:]: + self.resolve_one(extra, source_row) # register the others as persons too + return pid, name, matched, len(parts) > 1 + + def resolve_receivers(self, raw: str, source_row: int): + return [self.resolve_one(part, source_row) for part in split_receivers(raw)] + + +def _last_first(name: str): + """Best-effort split of a free name string into (last, first) for slug/provisional building.""" + name = name.strip() + ln = find_known_last_name(name) + if ln: + first = name[: -len(ln)].strip() + return ln, first + tokens = name.split() + if len(tokens) >= 2: + return tokens[-1], " ".join(tokens[:-1]) + return name, "" diff --git a/tools/import-normalizer/persons_tree.py b/tools/import-normalizer/persons_tree.py new file mode 100644 index 00000000..5c18897c --- /dev/null +++ b/tools/import-normalizer/persons_tree.py @@ -0,0 +1,443 @@ +"""Normalize Personendatei 2.xlsx into canonical-persons-tree.json.""" +import argparse +import datetime +import json +import re +import sys +from pathlib import Path + +import config +import dates +import persons as _persons +from persons import _strip_accents + + +# Pinned so the committed tree JSON is reproducible and does not churn on every run +# (NFR-IDEM-01) — mirrors writers._FIXED_TS for the xlsx exports. +_GENERATED_AT = "2020-01-01T00:00:00" + +_MIN_YEAR = 1700 +_MAX_YEAR = 2100 +# Threshold: if parse_date parses a pure-digit string as a year outside [_MIN_YEAR, _MAX_YEAR], +# but the year is a plausible typo (1000-3000), don't try serial conversion. +# Years outside this range (e.g., 7568) are implausible and should try serial conversion. +_PLAUSIBLE_TYPO_MIN = 1000 +_PLAUSIBLE_TYPO_MAX = 3000 + + +def _parse_year(raw: str | None) -> int | None: + """Extract a birth/death year from an Excel cell string. + + Handles three cases: + 1. 
ISO / German / text string parseable by parse_date() → extract year if in range + 2. Pure-integer string (out-of-range or unparseable) → try Excel serial conversion + (unless it's a plausible typo year, e.g., "1023" for "1923") + 3. Mixed-format or unresolvable → None + + Serial conversion only fires for pure-digit strings and implausible years, + preventing typo years like "1023" from being mis-converted as serials. + """ + if raw is None: + return None + s = str(raw).strip() + if not s: + return None + + # Check if it's a pure-digit string (candidate for serial conversion) + is_pure_digit = re.fullmatch(r"\d+", s) is not None + + # Try parse_date first (handles ISO, DD.MM.YYYY, year-only, month+year, etc.) + result = dates.parse_date(s) + if result.iso: + year = int(result.iso[:4]) + if _MIN_YEAR <= year <= _MAX_YEAR: + return year + # Year is out of range. Only try serial conversion if it's an implausible year. + # Plausible typos (e.g., 1023 for 1923) should not be converted as serials. + if is_pure_digit and not (_PLAUSIBLE_TYPO_MIN <= year <= _PLAUSIBLE_TYPO_MAX): + n = int(s) + if 1 <= n <= 80_000: + d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) + if _MIN_YEAR <= d.year <= _MAX_YEAR: + return d.year + return None + + # parse_date() found nothing. Try serial conversion only for pure-digit strings. + if is_pure_digit: + n = int(s) + if 1 <= n <= 80_000: + d = datetime.date(1899, 12, 30) + datetime.timedelta(days=n) + if _MIN_YEAR <= d.year <= _MAX_YEAR: + return d.year + + return None + + +def _parse_generation(raw: str | None) -> int | None: + """Extract the generation integer from column A values like 'G 3', 'G3', 'G 0'.""" + if not raw: + return None + m = re.search(r"\d+", str(raw)) + return int(m.group()) if m else None + + +_GEO_SUFFIXES = {"aachen", "mex", "mexiko", "sen", "jun", "jr"} + + +def _norm_tree(s: str) -> str: + """Normalize a name string for tree matching. 
+ + - Strip surrounding quotes, remove parenthetical substrings + - Diacritic → ASCII (ä→ae etc.), lowercase, dots → spaces + - Remove known geographic/honorific suffix tokens + - Collapse whitespace + """ + s = (s or "").strip().strip("\"'") + s = re.sub(r"\([^)]*\)", "", s) + s = _strip_accents(s).lower().replace(".", " ") + tokens = [t for t in s.split() if t and t not in _GEO_SUFFIXES] + return " ".join(tokens).strip("., ") + + +def _build_index(persons: list[dict]) -> dict[str, list[str]]: + """Build a name → [rowId, …] lookup index with four keys per person.""" + index: dict[str, list[str]] = {} + + def _add(key: str, row_id: str) -> None: + if key: + index.setdefault(key, []).append(row_id) + + for p in persons: + row_id = p["rowId"] + first = p.get("firstName") or "" + last = p.get("lastName") or "" + maiden = p.get("maidenName") or "" + + _add(_norm_tree(f"{first} {last}"), row_id) + _add(_norm_tree(f"{last} {first}"), row_id) + if maiden: + _add(_norm_tree(f"{first} {maiden}"), row_id) + _add(_norm_tree(last), row_id) + + return index + + +def _resolve_one(raw: str, index: dict[str, list[str]]) -> tuple[str | None, str | None]: + """Return (row_id, None) on unique match, (None, reason) otherwise.""" + key = _norm_tree(raw) + if not key: + return None, "empty" + hits = index.get(key, []) + if len(hits) == 1: + return hits[0], None + if len(hits) == 0: + return None, "not_found" + return None, "ambiguous" + + +def _parse_row(row_num: int, fields: dict) -> dict: + """Produce one person record from a header-mapped row dict. + + Internal keys prefixed with '_' are stripped before JSON output in main(). 
+ """ + def s(key: str) -> str: + return (fields.get(key) or "").strip() + + birth_raw = s("birth_date") + death_raw = s("death_date") + + birth_year = _parse_year(birth_raw) + death_year = _parse_year(death_raw) + + notes_parts = [] + if birth_raw and birth_year is None: + notes_parts.append(f"[Geburtsdatum: {birth_raw}]") + if death_raw and death_year is None: + notes_parts.append(f"[Todesdatum: {death_raw}]") + bemerkung = s("notes") + if bemerkung: + notes_parts.append(bemerkung) + + maiden = s("maiden_name") or None + spouse = s("spouse") or None + bemerkung_out = bemerkung or None + + return { + "rowId": f"row_{row_num:03d}", + "firstName": s("first_name"), + "lastName": s("last_name"), + "maidenName": maiden, + "alias": None, + "notes": " ".join(notes_parts) or None, + "birthYear": birth_year, + "deathYear": death_year, + "birthPlace": s("birth_place") or None, + "deathPlace": s("death_place") or None, + "generation": _parse_generation(s("generation")), + "familyMember": True, + "_spouse_raw": spouse, + "_bemerkung_raw": bemerkung_out, + } + + +def _attach_person_ids(tree_persons: list[dict], raw_dicts: list[dict]) -> None: + """Attach the register's verbatim person_id to each tree person, in place. + + The register (persons.parse_register) is the sole authority for person_id; it + slugifies and suffixes colliding ids exactly once. We propagate that id rather + than re-slugify in the tree, because re-slugifying would not reproduce the + register's collision suffixes and so would not reconcile 1:1 with the register + (#670, Gap 3). + + tree_persons and raw_dicts must be the same length and in the same row order — + parse_register and _parse_row both keep exactly the rows that have a last name. 
+ """ + register = _persons.parse_register(raw_dicts) + if len(tree_persons) != len(register): + raise ValueError( + "person_id propagation requires equal length: " + f"{len(tree_persons)} tree persons vs {len(register)} register persons " + "(the positional zip would otherwise silently truncate and mis-join ids)" + ) + for tree_person, register_person in zip(tree_persons, register): + tree_person["personId"] = register_person.person_id + + +def _deduplicate(persons: list[dict]) -> tuple[list[dict], list[str]]: + """Remove duplicate rows. Two-stage: + + 1. Exact (firstName, lastName, birthYear) match. + 2. (firstName, lastName) where the later entry has birthYear=None and an earlier + entry already has a known birthYear. + """ + seen_full: dict[tuple, str] = {} # (first, last, year) -> rowId + seen_name: dict[tuple, str] = {} # (first, last) -> rowId of first entry with a year + result: list[dict] = [] + skipped: list[str] = [] + + for p in persons: + first, last, year = p["firstName"], p["lastName"], p["birthYear"] + key_full = (first, last, year) + key_name = (first, last) + + if key_full in seen_full: + skipped.append(f"{p['rowId']} duplicates {seen_full[key_full]} ({first} {last}, year={year})") + continue + + if year is None and key_name in seen_name: + skipped.append(f"{p['rowId']} duplicates {seen_name[key_name]} ({first} {last}, no birth year)") + continue + + seen_full[key_full] = p["rowId"] + if year is not None: + seen_name[key_name] = p["rowId"] + + result.append(p) + + return result, skipped + + +def _resolve_spouses( + persons: list[dict], index: dict[str, list[str]] +) -> tuple[list[dict], list[dict]]: + """Emit SPOUSE_OF edges from each person's _spouse_raw field.""" + relationships: list[dict] = [] + unresolved: list[dict] = [] + emitted: set[frozenset] = set() + + for p in persons: + raw = (p.get("_spouse_raw") or "").strip() + if not raw: + continue + row_id = p["rowId"] + matched_id, reason = _resolve_one(raw, index) + if matched_id: + edge = 
frozenset([row_id, matched_id]) + if edge not in emitted: + emitted.add(edge) + relationships.append({ + "personId": row_id, + "relatedPersonId": matched_id, + "type": "SPOUSE_OF", + "source": "verheiratet_mit", + }) + else: + unresolved.append({ + "rowId": row_id, + "field": "verheiratet_mit", + "raw": raw, + "reason": reason, + }) + + return relationships, unresolved + + +_CHILD_RE = re.compile(r"^(?:Sohn|Tochter)\s+v(?:on)?\s+(.+)", re.I) +_PARENT_RE = re.compile(r"^(?:Vater|Mutter)\s+v(?:on)?\s+(.+)", re.I) +_AND_RE = re.compile(r"\s+u(?:nd)?\s+", re.I) + + +def _parse_bemerkung( + row_id: str, bemerkung: str, index: dict[str, list[str]] +) -> tuple[list[dict], list[dict], str]: + """Extract PARENT_OF edges from a Bemerkung cell. + + Returns (relationships, unresolved, remaining_notes). + Text that doesn't match a parent pattern goes to remaining_notes unchanged. + """ + if not bemerkung or not bemerkung.strip(): + return [], [], "" + + s = bemerkung.strip() + + for pattern, direction in ((_CHILD_RE, "child"), (_PARENT_RE, "parent")): + m = pattern.match(s) + if not m: + continue + + # Split the captured group on the first comma or semicolon to separate + # the name part from any trailing description (e.g. ", nach Mexiko emigriert") + raw_names, _, trailing = m.group(1).strip().partition(",") + if not trailing: + raw_names, _, trailing = raw_names.partition(";") + name_part = raw_names.strip().rstrip("!., ") + remainder = trailing.strip().lstrip(".,! 
") + parts = [p.strip() for p in _AND_RE.split(name_part) if p.strip()] + rels: list[dict] = [] + unres: list[dict] = [] + + for part in parts: + part = part.rstrip("!., ") + matched_id, reason = _resolve_one(part, index) + if matched_id: + if direction == "child": + rels.append({ + "personId": matched_id, + "relatedPersonId": row_id, + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": bemerkung, + }) + else: + rels.append({ + "personId": row_id, + "relatedPersonId": matched_id, + "type": "PARENT_OF", + "source": "bemerkung", + "rawBemerkung": bemerkung, + }) + else: + unres.append({ + "rowId": row_id, + "field": "bemerkung", + "raw": bemerkung, + "reason": reason, + }) + + return rels, unres, remainder + + # No pattern matched — full text goes to notes, nothing to unresolved + return [], [], s + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Normalize Personendatei 2.xlsx → canonical-persons-tree.json" + ) + parser.add_argument( + "--input", default=str(config.PERSON_WORKBOOK), + help="Path to Personendatei 2.xlsx" + ) + parser.add_argument( + "--output", default=str(config.OUT_DIR / "canonical-persons-tree.json"), + help="Path for output JSON" + ) + parser.add_argument("--dry-run", action="store_true", help="Print stats, skip write") + args = parser.parse_args() + + from ingest import read_sheet, build_header_map + + rows = read_sheet(Path(args.input), config.PERSON_SHEET) + if not rows: + print("ERROR: sheet is empty", file=sys.stderr) + sys.exit(1) + + header_row = [str(v) for v in rows[0]] + fields_map, _ = build_header_map(header_row, config.PERSON_HEADER_MAP, config.PERSON_REQUIRED_FIELDS) + + # --- Pass 1: parse rows --- + persons_raw: list[dict] = [] + raw_dicts: list[dict] = [] + for row_num, row in enumerate(rows[1:], start=2): + field_dict = {field: (row[col] if col < len(row) else "") for field, col in fields_map.items()} + if not field_dict.get("last_name", "").strip(): + continue + 
persons_raw.append(_parse_row(row_num, field_dict)) + raw_dicts.append(field_dict) + + # Propagate the register's verbatim person_id before dedup so the tree reconciles 1:1 + # with canonical-persons.xlsx (#670, Gap 3). + _attach_person_ids(persons_raw, raw_dicts) + + persons, skipped_msgs = _deduplicate(persons_raw) + for msg in skipped_msgs: + print(f" SKIP {msg}", file=sys.stderr) + + index = _build_index(persons) + + # --- Pass 2: resolve relationships --- + all_rels: list[dict] = [] + all_unresolved: list[dict] = [] + + spouse_rels, spouse_unres = _resolve_spouses(persons, index) + all_rels.extend(spouse_rels) + all_unresolved.extend(spouse_unres) + + for p in persons: + bemerkung = p.pop("_bemerkung_raw", None) or "" + p.pop("_spouse_raw", None) + + rels, unres, remaining = _parse_bemerkung(p["rowId"], bemerkung, index) + all_rels.extend(rels) + all_unresolved.extend(unres) + + if remaining: + existing = p.get("notes") or "" + if remaining not in existing: + p["notes"] = (existing + " " + remaining).strip() if existing else remaining + + # --- Stats output --- + spouse_count = sum(1 for r in all_rels if r["type"] == "SPOUSE_OF") + parent_count = sum(1 for r in all_rels if r["type"] == "PARENT_OF") + print(f"✓ {len(persons)} persons parsed") + print(f"✓ {len(all_rels)} relationships emitted ({spouse_count} SPOUSE_OF, {parent_count} PARENT_OF)") + if all_unresolved: + print(f"⚠ {len(all_unresolved)} unresolved (see unresolved[] in output)") + + if args.dry_run: + print("\n--- dry-run: first 5 unresolved ---") + for u in all_unresolved[:5]: + print(f" {u}") + return + + output = { + "generated_at": _GENERATED_AT, + "source": Path(args.input).name, + "stats": { + "persons": len(persons), + "relationships": len(all_rels), + "unresolved": len(all_unresolved), + }, + "persons": persons, + "relationships": all_rels, + "unresolved": all_unresolved, + } + + out_path = Path(args.output) + out_path.parent.mkdir(exist_ok=True) + out_path.write_text(json.dumps(output, 
ensure_ascii=False, indent=2), encoding="utf-8") + print(f"→ {args.output}") + + +if __name__ == "__main__": + main() diff --git a/tools/import-normalizer/requirements.txt b/tools/import-normalizer/requirements.txt new file mode 100644 index 00000000..886c2074 --- /dev/null +++ b/tools/import-normalizer/requirements.txt @@ -0,0 +1,2 @@ +openpyxl==3.1.5 +pytest==8.3.4 diff --git a/tools/import-normalizer/tags.py b/tools/import-normalizer/tags.py new file mode 100644 index 00000000..b5ac5b92 --- /dev/null +++ b/tools/import-normalizer/tags.py @@ -0,0 +1,119 @@ +import csv +import re +from collections import Counter +from pathlib import Path + +import config + +_COLLECTIVE = config.COLLECTIVE_TERMS + +_GERMAN_STOP_WORDS = { + "der", "die", "das", "ein", "eine", "einer", "einen", "einem", "eines", + "und", "oder", "aber", "an", "in", "auf", "für", "mit", "von", "zu", + "bei", "nach", "vor", "aus", "ist", "sind", "war", "waren", "hat", + "haben", "wird", "werden", "ich", "du", "er", "sie", "es", "wir", + "ihr", "ihn", "ihm", "ihnen", "mich", "mir", "dich", "dir", + "ihre", "ihren", "seinem", "seinen", "seiner", "seine", + "auch", "nicht", "noch", "dann", "durch", "dem", "den", + "des", "als", "wie", "dass", "um", "über", "unter", "zwischen", + "all", "alle", "was", "wer", "wo", "wann", "welche", "welcher", + "mehr", "sehr", "nur", "schon", "dabei", "dazu", + "bis", "seit", "gegen", "ohne", "doch", "wenn", "weil", + "ob", "so", "da", "dort", "hier", "nun", "ja", "nein", + "ihrer", "ihrem", + # Contracted prepositions common in German Inhalt summaries + "im", "am", "ans", "ins", "zum", "zur", "vom", "beim", "sich", + "hat", "hatte", "wird", "wurde", "wurden", "worden", + "kann", "konnte", "soll", "sollte", "will", "wollte", + "ihm", "dieses", "dieser", "diesem", "diesen", +} + + +def _is_correspondence(raw: str) -> bool: + lower = raw.lower() + return " an " in lower or lower.startswith("an ") or ".an " in lower + + +def _tokenize(text: str) -> list[str]: + return 
[t.lower() for t in re.findall(r"[a-zA-ZäöüÄÖÜß]+", text)] + + +def _has_collective(tokens: list[str]) -> bool: + return any(t in _COLLECTIVE for t in tokens) + + +def classify_schlagwort(raw: str) -> list[str]: + if not raw or not raw.strip(): + return [] + if not _is_correspondence(raw): + return [f"Themen/{raw}"] + if _has_collective(_tokenize(raw)): + return [f"Briefwechsel/{raw}"] + return [] + + +def mine_summary_candidates(summaries: list[str]) -> list[tuple[str, int]]: + counter: Counter = Counter() + for summary in summaries: + for token in re.split(r"[,;\s]+", summary.lower()): + token = re.sub(r"[^a-zA-ZäöüÄÖÜß]", "", token) + if len(token) >= 2 and token not in _GERMAN_STOP_WORDS: + counter[token] += 1 + return counter.most_common() + + +def load_approved_themes(path: Path) -> set[str]: + if not path.exists(): + return set() + themes: set[str] = set() + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + if row.get("candidate"): + themes.add(row["candidate"].strip().lower()) + return themes + + +def apply_approved_themes(summary: str, themes: set[str]) -> list[str]: + lower = summary.lower() + return [ + f"Themen/{theme}" + for theme in themes + if re.search(r"\b" + re.escape(theme) + r"\b", lower) + ] + + +def generate_tags(schlagwort: str, summary: str, themes: set[str]) -> list[str]: + result = classify_schlagwort(schlagwort or "") + if summary and themes: + result = result + apply_approved_themes(summary, themes) + return result + + +def encode_tags(tag_list: list[str]) -> str: + return "|".join(tag_list) + + +def build_tag_tree(all_tag_paths: list[str]) -> list[dict]: + unique_paths = list(dict.fromkeys(all_tag_paths)) + roots: dict[str, None] = {} + children: dict[str, tuple[str, str]] = {} + for path in unique_paths: + if "/" in path: + parent, child = path.split("/", 1) + roots[parent] = None + children[path] = (parent, child) + else: + roots[path] = None + + rows: list[dict] = [] + seen: 
set[str] = set() + for root in roots: + if root not in seen: + rows.append({"tag_path": root, "parent_name": "", "tag_name": root}) + seen.add(root) + for path, (parent, child) in children.items(): + if path not in seen: + rows.append({"tag_path": path, "parent_name": parent, "tag_name": child}) + seen.add(path) + return rows diff --git a/tools/import-normalizer/tests/__init__.py b/tools/import-normalizer/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/import-normalizer/tests/test_config.py b/tools/import-normalizer/tests/test_config.py new file mode 100644 index 00000000..a88917d9 --- /dev/null +++ b/tools/import-normalizer/tests/test_config.py @@ -0,0 +1,20 @@ +import config + +def test_century_boundaries(): + assert config.TWO_DIGIT_19XX_MAX == 57 + assert config.TWO_DIGIT_18XX_MIN == 73 + +def test_header_maps_cover_required_fields(): + assert "index" in config.DOCUMENT_HEADER_MAP.values() + assert "last_name" in config.PERSON_HEADER_MAP.values() + +def test_feast_tables_present(): + assert config.MOVABLE_FEASTS["pfingsten"] == 49 + assert config.SEASON_MONTHS["herbst"] == 10 + +def test_name_classification_tables(): + assert "tante" in config.RELATIONAL_TERMS + assert "familie" in config.COLLECTIVE_TERMS + assert "unbekannt" in config.UNKNOWN_NAME_MARKERS + assert config.PROSE_MAX_LEN >= 30 + assert "anita" in config.EXTRA_GIVEN_NAMES diff --git a/tools/import-normalizer/tests/test_dates.py b/tools/import-normalizer/tests/test_dates.py new file mode 100644 index 00000000..2b59796a --- /dev/null +++ b/tools/import-normalizer/tests/test_dates.py @@ -0,0 +1,205 @@ +import datetime +import dates +from dates import Precision + +def test_matchers_return_uniform_matchresult(): + # Every matcher returns a MatchResult(iso, precision, end) — no 2- vs 3-tuple + # length-sniffing. A non-range matcher leaves end=None; a range matcher sets it. 
+ day = dates._match_numeric("15.2.1888") + assert isinstance(day, dates.MatchResult) + assert (day.iso, day.precision, day.end) == ("1888-02-15", Precision.DAY, None) + + rng = dates._match_range("10./11.1.1917") + assert isinstance(rng, dates.MatchResult) + assert (rng.iso, rng.precision, rng.end) == ("1917-01-10", Precision.RANGE, "1917-01-11") + + +def test_easter_known_years(): + # Anonymous Gregorian algorithm — verified against published tables + assert dates.easter(2024) == datetime.date(2024, 3, 31) + assert dates.easter(2000) == datetime.date(2000, 4, 23) + assert dates.easter(1922) == datetime.date(1922, 4, 16) + assert dates.easter(1888) == datetime.date(1888, 4, 1) + +def test_resolve_feast_movable(): + assert dates.resolve_feast_or_season("Pfingsten", 1922) == ("1922-06-04", Precision.DAY) + assert dates.resolve_feast_or_season("Ostern", 2024) == ("2024-03-31", Precision.DAY) + assert dates.resolve_feast_or_season("Pfingstmontag", 1922) == ("1922-06-05", Precision.DAY) + +def test_resolve_feast_fixed(): + assert dates.resolve_feast_or_season("Weihnachten", 1900) == ("1900-12-25", Precision.DAY) + assert dates.resolve_feast_or_season("Neujahr", 1910) == ("1910-01-01", Precision.DAY) + +def test_resolve_season(): + assert dates.resolve_feast_or_season("Herbst", 1913) == ("1913-10-01", Precision.SEASON) + assert dates.resolve_feast_or_season("Sommer", 1910) == ("1910-07-01", Precision.SEASON) + +def test_resolve_unknown_token_returns_none(): + assert dates.resolve_feast_or_season("Freitag", 1919) is None + +def test_expand_year(): + assert dates.expand_year("1888") == 1888 + assert dates.expand_year("889") == 1889 # 3-digit -> 1DDD + assert dates.expand_year("923") == 1923 + assert dates.expand_year("08") == 1908 # 00..57 -> 19xx + assert dates.expand_year("17") == 1917 + assert dates.expand_year("57") == 1957 + assert dates.expand_year("73") == 1873 # 73..99 -> 18xx + assert dates.expand_year("99") == 1899 + assert dates.expand_year("65") is None # 
58..72 ambiguous + assert dates.expand_year("9003") is None # implausible 4-digit year -> reject (typo) + assert dates.expand_year("x") is None + +def test_parse_iso_and_empty(): + assert dates.parse_date("1910-04-23") == dates.ParsedDate("1910-04-23", Precision.DAY, "1910-04-23") + assert dates.parse_date("") == dates.ParsedDate(None, Precision.UNKNOWN, "") + assert dates.parse_date("?") == dates.ParsedDate(None, Precision.UNKNOWN, "?") + +def test_parse_numeric_forms(): + assert dates.parse_date("15.2.1888").iso == "1888-02-15" + assert dates.parse_date("13.5.09").iso == "1909-05-13" + assert dates.parse_date("17/6. 1916").iso == "1916-06-17" + assert dates.parse_date("11.10.08").iso == "1908-10-11" + assert dates.parse_date("30.1.889").iso == "1889-01-30" + assert dates.parse_date("15.2.1888").precision == Precision.DAY + +def test_parse_numeric_unparseable(): + assert dates.parse_date("8.9.").precision == Precision.UNKNOWN # no year + assert dates.parse_date("13.5.65").precision == Precision.UNKNOWN # ambiguous 2-digit year + +def test_parse_approx_marker_upgrades_precision(): + r = dates.parse_date("17.Nov (?) 1887") # month-name matcher now active; (?) marks approx + assert r.raw == "17.Nov (?) 1887" + assert r.precision == Precision.APPROX # month-name matcher parses date; (?) 
upgrades to APPROX + +def test_parse_leading_qualifier_is_approx(): + r = dates.parse_date("nach 1.5.1900") # qualifier stripped, numeric date salvaged, precision APPROX + assert r.iso == "1900-05-01" + assert r.precision == Precision.APPROX + +def test_parse_roman_months(): + assert dates.parse_date("22.III.18").iso == "1918-03-22" + assert dates.parse_date("19.XII.1954").iso == "1954-12-19" + assert dates.parse_date("1.III.27").iso == "1927-03-01" + assert dates.parse_date("22.III.18").precision == Precision.DAY + +def test_parse_monthname_day_first(): + assert dates.parse_date("6.März 1888").iso == "1888-03-06" + assert dates.parse_date("29.Sept.1891").iso == "1891-09-29" + assert dates.parse_date("10.Oct.95").iso == "1895-10-10" + assert dates.parse_date("9.December1889").iso == "1889-12-09" + assert dates.parse_date("18.Dez.1916").iso == "1916-12-18" + assert dates.parse_date("4Dezember 1936").iso == "1936-12-04" + assert dates.parse_date("25 August 1968").iso == "1968-08-25" + +def test_parse_month_year_year_only(): + assert dates.parse_date("Mai 1895") == dates.ParsedDate("1895-05-01", Precision.MONTH, "Mai 1895") + assert dates.parse_date("October 1903") == dates.ParsedDate("1903-10-01", Precision.MONTH, "October 1903") + assert dates.parse_date("1905") == dates.ParsedDate("1905-01-01", Precision.YEAR, "1905") + +def test_parse_feast_and_season_via_parse_date(): + assert dates.parse_date("Pfingsten 1922") == dates.ParsedDate("1922-06-04", Precision.DAY, "Pfingsten 1922") + assert dates.parse_date("Herbst 1913") == dates.ParsedDate("1913-10-01", Precision.SEASON, "Herbst 1913") + assert dates.parse_date("Pfingstsonntag 1915").precision == Precision.DAY + +def test_parse_ranges(): + assert dates.parse_date("8.1.1916 - 15.3.1916") == dates.ParsedDate("1916-01-08", Precision.RANGE, "8.1.1916 - 15.3.1916") + assert dates.parse_date("1881/82") == dates.ParsedDate("1881-01-01", Precision.RANGE, "1881/82") + assert dates.parse_date("1945/46?").iso == "1945-01-01" # 
'?' stripped -> RANGE, then APPROX + assert dates.parse_date("1945/46?").precision == Precision.APPROX + +def test_parse_approx_full(): + r = dates.parse_date("17.Nov (?) 1887") + assert r.iso == "1887-11-17" + assert r.precision == Precision.APPROX + +def test_parse_english_month_first_now_works(): + assert dates.parse_date("April 12. 1922").iso == "1922-04-12" + assert dates.parse_date("Mai 1895").iso == "1895-05-01" # not shadowed by month-first matcher + +def test_parse_unparseable_examples(): + assert dates.parse_date("Freitag 1919").precision == Precision.UNKNOWN + +def test_parse_invalid_calendar_date_is_unknown(): + # try/except ValueError in the matchers must route impossible dates to UNKNOWN (-> review), + # never silently clamp. This is the most likely real-data bug class at 7,600 rows. + assert dates.parse_date("30.2.1888").precision == Precision.UNKNOWN + assert dates.parse_date("31.4.1916").precision == Precision.UNKNOWN + +def test_parse_intra_month_day_range(): + # "7./8. Sept.1923" -> start day, RANGE, end day 8th. Must NOT be confused with slash-date "17/6. 1916". + assert dates.parse_date("7./8. Sept.1923") == dates.ParsedDate("1923-09-07", Precision.RANGE, "7./8. Sept.1923", "1923-09-08") + assert dates.parse_date("17/6. 1916") == dates.ParsedDate("1916-06-17", Precision.DAY, "17/6. 
1916") + +def test_parse_intra_month_day_range_carries_end_day(): + # the intra-month day range surfaces the END day so Phase 4 can render meta_date_end + r = dates.parse_date("10./11.1.1917") + assert r.iso == "1917-01-10" + assert r.precision == Precision.RANGE + assert r.end == "1917-01-11" + +def test_parse_roman_month_day_range(): + # "10./11.I.1917" — Roman-numeral-month range; previously fell through to UNKNOWN + r = dates.parse_date("10./11.I.1917") + assert r.iso == "1917-01-10" + assert r.precision == Precision.RANGE + assert r.end == "1917-01-11" + +def test_parse_range_invalid_end_keeps_start_flags_review(): + # "10./40.1.1917" — the 40th is an impossible end day. The start parses fine, + # so the row stays RANGE with the start preserved, the unparseable end is dropped + # (end is None), and the half-resolved range is flagged needs_review so the + # dropped end surfaces honestly instead of vanishing silently (#670, Gap 2). + r = dates.parse_date("10./40.1.1917") + assert r.iso == "1917-01-10" + assert r.precision == Precision.RANGE + assert r.end is None + assert r.needs_review is True + + +def test_parse_range_valid_end_not_flagged(): + # a fully-resolved range carries its end and is NOT flagged for review + r = dates.parse_date("10./11.1.1917") + assert r.end == "1917-01-11" + assert r.needs_review is False + + +def test_parse_non_range_has_no_review_flag(): + # every fully-parsed non-range date is never flagged for review by the date layer + assert dates.parse_date("15.2.1888").needs_review is False + assert dates.parse_date("Mai 1895").needs_review is False + assert dates.parse_date("").needs_review is False + + +def test_parse_non_range_has_no_end(): + assert dates.parse_date("15.2.1888").end is None + assert dates.parse_date("Mai 1895").end is None + assert dates.parse_date("").end is None + +def test_parse_trailing_note_stripped_but_raw_preserved(): + r = dates.parse_date("17.Nov 1887, 2. 
Brief") # REQ-DATE-04 + assert r.iso == "1887-11-17" + assert "2. Brief" in r.raw # original string preserved verbatim + +def test_parse_date_override_wins(): + ovr = {"13.5.65": ("1965-05-13", "DAY")} + r = dates.parse_date("13.5.65", ovr) # ambiguous without override + assert r == dates.ParsedDate("1965-05-13", Precision.DAY, "13.5.65") + +def test_parse_spanish_months(): + # Mexican-branch letters: Spanish month names, day-first and month-first (hyphen/dot before year) + assert dates.parse_date("21.Enero 1911").iso == "1911-01-21" # day-first + assert dates.parse_date("Junio 17.929").iso == "1929-06-17" # month-first, dot, 3-digit year + assert dates.parse_date("Mayo 18-1929").iso == "1929-05-18" # month-first, hyphen + assert dates.parse_date("Abril 10-929").iso == "1929-04-10" # hyphen, 3-digit year + assert dates.parse_date("Agosto 27-929").iso == "1929-08-27" + assert dates.parse_date("febrero 14-29").iso == "1929-02-14" # hyphen, 2-digit year + assert dates.parse_date("Mayo 18-1929").precision == Precision.DAY + +def test_implausible_year_goes_to_review(): + # a source typo like "October 23-9003" must NOT parse to a bogus year 9003 — stays UNKNOWN + assert dates.parse_date("October 23-9003").precision == Precision.UNKNOWN + +def test_hyphen_month_first_does_not_shadow_month_year(): + # the hyphen-separator generalization must NOT make "Mai 1895" parse as day=18 + assert dates.parse_date("Mai 1895") == dates.ParsedDate("1895-05-01", Precision.MONTH, "Mai 1895") diff --git a/tools/import-normalizer/tests/test_documents.py b/tools/import-normalizer/tests/test_documents.py new file mode 100644 index 00000000..bbf8c4c1 --- /dev/null +++ b/tools/import-normalizer/tests/test_documents.py @@ -0,0 +1,149 @@ +import persons +import documents +from documents import Triage + +def test_extract_row(): + header = {"index": 0, "box": 1, "folder": 2, "sender": 3, + "receivers": 4, "date": 5, "location": 6, "tags": 7, "summary": 8} + cells = ["W-0001", "V", "1", "Walter de 
Gruyter", + "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"] + raw = documents.extract_row(cells, header, source_row=3) + assert raw.index == "W-0001" + assert raw.sender == "Walter de Gruyter" + assert raw.date == "15.2.1888" + assert raw.source_row == 3 + +def test_triage(): + assert documents.triage(["", "", ""]) == Triage.EMPTY + assert documents.triage(["", "", "Walter"]) == Triage.BLANK_INDEX # data but no index + assert documents.triage(["W-0001x", "x"]) == Triage.X_SUFFIX + assert documents.triage(["W-0001", "x"]) == Triage.OK + +def test_classify_blank_index(): + header = {"sender": 4, "receivers": 5} + banner = ["", "", "", "", "Brautbriefe von Walter an Eugenie", ""] + data = ["", "", "V", "1", "", "Eugenie"] + assert documents.classify_blank_index(banner, header) == "section_banner" + assert documents.classify_blank_index(data, header) == "data_no_index" + +def _ctx(): + people = persons.parse_register([ + {"last_name": "de Gruyter", "first_name": "Walter"}, + {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"}, + ]) + return persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}) + +def test_to_canonical_resolves_and_flags(): + ctx = _ctx() + raw = documents.RawRow(source_row=3, index="W-0001", box="V", folder="1", + sender="Walter de Gruyter", receivers="Eugenie Müller", + date="15.2.1888", location="Rotterdam", tags="Brautbriefe", + summary="Geschäftsreise") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.sender_person_id == "de-gruyter-walter" + assert doc.receiver_person_ids == ["de-gruyter-eugenie"] # matched via maiden alias + assert doc.date_iso == "1888-02-15" and doc.date_precision == "DAY" + assert doc.tags == ["Themen/Brautbriefe"] + assert doc.needs_review == [] + + +def test_canonical_document_has_no_file_field(): + # #686: PDFs resolve by index (.pdf) in the importer; the file field is gone. 
+ doc = documents.CanonicalDocument(index="W-0001") + assert not hasattr(doc, "file") + + +def test_to_canonical_range_carries_date_end(): + ctx = _ctx() + raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="", + date="10./11.1.1917") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.date_iso == "1917-01-10" + assert doc.date_precision == "RANGE" + assert doc.date_end == "1917-01-11" + + +def test_to_canonical_non_range_has_empty_date_end(): + ctx = _ctx() + raw = documents.RawRow(source_row=4, index="H-0730", sender="", receivers="", + date="15.2.1888") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.date_precision == "DAY" + assert doc.date_end == "" + +def test_to_canonical_half_resolved_range_flags_review(): + # an impossible end day ("10./40.1.1917") keeps the start + RANGE precision but + # drops the unparseable end; the document must surface this as a review flag + # so the importer (#669) knows date_end is empty on a RANGE row by design. 
+ ctx = _ctx() + raw = documents.RawRow(source_row=5, index="H-0731", sender="", receivers="", + date="10./40.1.1917") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.date_iso == "1917-01-10" + assert doc.date_precision == "RANGE" + assert doc.date_end == "" + assert "range_end_unparsed" in doc.needs_review + + +def test_to_canonical_full_range_not_flagged(): + ctx = _ctx() + raw = documents.RawRow(source_row=5, index="H-0730", sender="", receivers="", + date="10./11.1.1917") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.date_end == "1917-01-11" + assert "range_end_unparsed" not in doc.needs_review + + +def test_to_canonical_unmatched_and_unparsed(): + ctx = _ctx() + raw = documents.RawRow(source_row=9, index="C-0001", + sender="Hans Wittkopf", receivers="", date="Freitag 1919") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.sender_person_id == "wittkopf-hans" # provisional + assert "unmatched_sender" in doc.needs_review + assert "unparsed_date" in doc.needs_review + assert ctx.unmatched["Hans Wittkopf"] == [9] + assert any(p.provisional for p in ctx.provisional.values()) + +def test_to_canonical_splits_multi_sender(): + # REQ-PERS-01 / IMP-11: a multi-person sender is parsed, primary kept, flagged. + ctx = _ctx() + raw = documents.RawRow(source_row=5, index="C-0100", sender="Walter und Eugenie de Gruyter", receivers="") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert doc.sender_person_id == "de-gruyter-walter" # first part is primary + assert "multi_sender" in doc.needs_review + +def test_provisional_id_never_collides_with_register(): + # A provisional built from an unmatched string must not steal a register person_id. 
+ people = persons.parse_register([{"last_name": "Xyz", "first_name": "Abc"}]) # id "xyz-abc" + ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}) + # "Abc, Xyz" misses the alias index (the comma changes the normalized key) but its + # provisional slug is "xyz-abc" — already the register person's id, so it MUST be suffixed. + pid, _, matched = ctx.resolve_one("Abc, Xyz", source_row=1) + assert matched is False + assert "xyz-abc" in ctx.index.known_ids + assert pid == "xyz-abc-2" # suffixed away from the register id, not reused + +def test_resolve_one_override_increments_hits(): + people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Eugenie"}]) + ctx = persons.ResolutionContext(persons.AliasIndex(people), + name_overrides={"Genie": "de-gruyter-eugenie"}) + pid, name, matched = ctx.resolve_one("Genie", source_row=1) + assert pid == "de-gruyter-eugenie" and matched is True + assert name == "Eugenie de Gruyter" # display comes from the alias index + assert ctx.override_hits == 1 + +def test_ambiguous_pair_recorded_in_unresolved(): + people = persons.parse_register([{"last_name": "de Gruyter", "first_name": "Walter"}]) + ctx = persons.ResolutionContext(persons.AliasIndex(people), name_overrides={}, + given_names={"ella", "anita"}) + raw = documents.RawRow(source_row=7, index="C-0200", sender="", receivers="Ella Anita") + doc = documents.to_canonical(raw, ctx, date_overrides={}) + assert len(doc.receiver_person_ids) == 1 # not split — one provisional + assert any(name == "Ella Anita" and cat == "ambiguous_pair" for name, cat, _ in ctx.unresolved) + +def test_resolvable_first_surname_pair_not_unresolved(): + ctx = persons.ResolutionContext(persons.AliasIndex([]), name_overrides={}, + given_names={"ella", "anita"}) + ctx.resolve_one("Mieze Schefold", source_row=1) # surname is not a given name + assert ctx.unresolved == [] # RESOLVABLE -> not recorded diff --git a/tools/import-normalizer/tests/test_ingest.py 
b/tools/import-normalizer/tests/test_ingest.py new file mode 100644 index 00000000..7cce1452 --- /dev/null +++ b/tools/import-normalizer/tests/test_ingest.py @@ -0,0 +1,46 @@ +import datetime +import openpyxl +import pytest +import ingest + +def _make_workbook(tmp_path, sheet_name, rows): + wb = openpyxl.Workbook() + ws = wb.active + ws.title = sheet_name + for r in rows: + ws.append(r) + path = tmp_path / "wb.xlsx" + wb.save(path) + return path + +def test_read_sheet_converts_cells(tmp_path): + path = _make_workbook(tmp_path, "S", [ + ["Index", "Datum"], + ["W-0001", datetime.datetime(1888, 2, 15)], + ["W-0002", 1], + ]) + rows = ingest.read_sheet(path, "S") + assert rows[0] == ["Index", "Datum"] + assert rows[1] == ["W-0001", "1888-02-15"] # Excel date -> ISO string + assert rows[2] == ["W-0002", "1"] # integer -> plain string + +def test_build_header_map_collapses_whitespace_and_case(): + header = ["Index", "Datum des Briefes", "EmpfängerIn", "Mystery"] + field_map = {"index": "index", "datum des briefes": "date", "empfängerin": "receivers"} + fields, unknown = ingest.build_header_map(header, field_map, required={"index"}) + assert fields == {"index": 0, "date": 1, "receivers": 2} + assert unknown == ["Mystery"] + +def test_build_header_map_missing_required_raises(): + with pytest.raises(ValueError, match="index"): + ingest.build_header_map(["Box", "Ort"], {"box": "box", "ort": "location"}, required={"index"}) + +def test_read_sheet_bool_not_coerced_to_int(tmp_path): + path = _make_workbook(tmp_path, "S", [["Flag"], [True], [False]]) + rows = ingest.read_sheet(path, "S") + assert rows[1] == ["True"] and rows[2] == ["False"] # not "1"/"0" + +def test_read_sheet_missing_sheet_raises(tmp_path): + path = _make_workbook(tmp_path, "S", [["A"]]) + with pytest.raises(ValueError, match="not found"): + ingest.read_sheet(path, "Nope") diff --git a/tools/import-normalizer/tests/test_normalize.py b/tools/import-normalizer/tests/test_normalize.py new file mode 100644 index 
# 00000000..2adf2a4a --- /dev/null +++ b/tools/import-normalizer/tests/test_normalize.py @@ -0,0 +1,179 @@
# ^ Tail of the patch header for tools/import-normalizer/tests/test_normalize.py, kept verbatim.
#   The added content of that file follows with its line structure restored.
import json
import subprocess
import sys
from collections import Counter
from pathlib import Path

import openpyxl
import normalize


def _doc_wb(tmp_path):
    """Write the document fixture workbook to ``tmp_path/docs.xlsx`` and return its path.

    The sheet "Familienarchiv" holds a header plus five rows: a regular letter, an
    x-suffixed index, a row with a blank index, a row with a "?" receiver and a
    free-text date, and a second row re-using the index "W-0001".
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Familienarchiv"
    ws.append(["Index", "Datei", "Box", "Mappe", "BriefeschreiberIn", "EmpfängerIn",
               "Datum des Briefes", "Ort", "Schlagwort", "Inhalt"])
    ws.append(["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
               "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "Geschäftsreise"])
    ws.append(["W-0001x", r"..\__scan\W-0001x.pdf", "", "", "Walter de Gruyter",
               "Eugenie Müller", "", "", "", ""])
    ws.append(["", "", "", "", "Section banner row", "", "", "", "", ""])
    ws.append(["C-0001", "", "", "", "Hans Wittkopf", "?", "Freitag 1919", "", "", ""])
    ws.append(["W-0001", r"..\__scan\W-0001.pdf", "V", "1", "Walter de Gruyter",
               "Eugenie Müller", "15.2.1888", "Rotterdam", "Brautbriefe", "dup"])
    path = tmp_path / "docs.xlsx"
    wb.save(path)
    return path


def _person_wb(tmp_path):
    """Write the two-person register workbook to ``tmp_path/persons.xlsx`` and return its path."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Tabelle1"
    ws.append(["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
               "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"])
    ws.append(["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""])
    ws.append(["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""])
    path = tmp_path / "persons.xlsx"
    wb.save(path)
    return path


def _run_normalizer(tmp_path, out_dir, review_dir, *, person_workbook=None, **extra):
    """Call ``normalize.run`` on the fixture workbooks and return its result.

    The document workbook is always rebuilt via ``_doc_wb``; the person workbook is
    rebuilt via ``_person_wb`` unless ``person_workbook`` is given. ``extra`` is
    forwarded as additional keyword arguments (e.g. ``approved_themes_path``).
    """
    return normalize.run(
        document_workbook=_doc_wb(tmp_path),
        document_sheet="Familienarchiv",
        person_workbook=_person_wb(tmp_path) if person_workbook is None else person_workbook,
        person_sheet="Tabelle1",
        out_dir=out_dir,
        review_dir=review_dir,
        date_overrides={},
        name_overrides={},
        **extra,
    )


def _column_values(xlsx_path, column_name):
    """Return the values under the header ``column_name`` in the active sheet (header excluded)."""
    ws = openpyxl.load_workbook(xlsx_path).active
    header = [c.value for c in ws[1]]
    col = header.index(column_name) + 1  # openpyxl columns are 1-based
    return [ws.cell(row=r, column=col).value for r in range(2, ws.max_row + 1)]


def test_run_end_to_end(tmp_path):
    """A full run writes the canonical files, the review files, and is repeatable."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    stats = _run_normalizer(tmp_path, out_dir, review_dir)
    assert (out_dir / "canonical-documents.xlsx").exists()
    assert (out_dir / "canonical-persons.xlsx").exists()
    assert stats["documents_emitted"] == 3  # W-0001, C-0001, W-0001 (dup) — x and blank excluded
    assert stats["skipped_x_suffix"] == 1
    assert stats["blank_index_rows"] == 1
    assert stats["duplicate_index_rows"] == 2
    assert stats["unresolved_unknown"] >= 1  # the "?" receiver is an UNKNOWN-class name
    assert (review_dir / "skipped-x-suffix.csv").exists()
    assert (review_dir / "unparsed-dates.csv").exists()
    # C-0001's "Freitag 1919" is unparseable -> must appear in the review file (NFR-DATA-01)
    assert "Freitag 1919" in (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8")
    assert (review_dir / "unresolved-names.csv").exists()
    unresolved_text = (review_dir / "unresolved-names.csv").read_text(encoding="utf-8")
    assert "unknown" in unresolved_text and "?" in unresolved_text  # the "?" receiver
    assert not (review_dir / "ambiguous-receivers.csv").exists()  # replaced

    # determinism (NFR-IDEM-01): a second run yields identical canonical content + review files
    def _matrix(p):
        wb = openpyxl.load_workbook(p)
        return [[c.value for c in row] for row in wb.active.iter_rows()]

    docs1 = _matrix(out_dir / "canonical-documents.xlsx")
    persons1 = _matrix(out_dir / "canonical-persons.xlsx")
    unparsed1 = (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8")
    _run_normalizer(tmp_path, out_dir, review_dir)
    assert _matrix(out_dir / "canonical-documents.xlsx") == docs1
    assert _matrix(out_dir / "canonical-persons.xlsx") == persons1
    assert (review_dir / "unparsed-dates.csv").read_text(encoding="utf-8") == unparsed1
    assert len(docs1) == 4  # header + 3 docs


def test_tag_tree_output_emitted(tmp_path):
    """A run writes ``canonical-tag-tree.xlsx`` into the output directory."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    _run_normalizer(tmp_path, out_dir, review_dir)
    assert (out_dir / "canonical-tag-tree.xlsx").exists()


def test_tag_candidates_review_emitted(tmp_path):
    """A run writes ``tag-candidates.csv`` containing the words "candidate" and "count"."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    _run_normalizer(tmp_path, out_dir, review_dir)
    assert (review_dir / "tag-candidates.csv").exists()
    text = (review_dir / "tag-candidates.csv").read_text(encoding="utf-8")
    assert "candidate" in text and "count" in text


def test_schlagwort_encoded_as_themen_in_documents(tmp_path):
    """The Schlagwort "Brautbriefe" reaches the ``tags`` column only as "Themen/Brautbriefe"."""
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    _run_normalizer(tmp_path, out_dir, review_dir)
    tag_values = _column_values(out_dir / "canonical-documents.xlsx", "tags")
    assert any(v and "Themen/Brautbriefe" in v for v in tag_values)
    assert not any(v and v.strip() == "Brautbriefe" for v in tag_values)


def test_approved_themes_applied(tmp_path):
    """An approved-themes file adds a matching ``Themen/...`` tag to the documents."""
    themes_file = tmp_path / "approved-themes.csv"
    themes_file.write_text("candidate\ngeschäftsreise\n", encoding="utf-8")
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"
    _run_normalizer(tmp_path, out_dir, review_dir, approved_themes_path=themes_file)
    tag_values = _column_values(out_dir / "canonical-documents.xlsx", "tags")
    # W-0001 has Inhalt "Geschäftsreise" — should get an extra Themen/geschäftsreise tag
    assert any(v and "Themen/geschäftsreise" in v for v in tag_values)


def _person_wb_with_collision(tmp_path):
    """Write a register workbook containing two "Hans Cram" rows and return its path."""
    # Two "Hans Cram" rows force the register to suffix the colliding slug (-1/-2);
    # the tree must carry those exact suffixed ids so the join still reconciles.
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Tabelle1"
    ws.append(["Generation", "Familienname", "Vorname", "geb als", "Geburtsdatum",
               "Geburtsort", "Todesdatum", "Sterbeort", "verheiratet mit", "Bemerkung"])
    ws.append(["G 1", "de Gruyter", "Walter", "", "", "", "", "", "", ""])
    ws.append(["G 1", "de Gruyter", "Eugenie", "Müller", "", "", "", "", "", ""])
    ws.append(["G 2", "Cram", "Hans", "", "1890", "", "", "", "", ""])
    ws.append(["G 3", "Cram", "Hans", "", "1925", "", "", "", "", ""])
    path = tmp_path / "persons.xlsx"
    wb.save(path)
    return path


def _generate_tree(person_wb, out_path):
    """Run ``persons_tree.py`` as a subprocess and return the JSON it wrote to ``out_path``.

    Fails the calling test (with the script's stderr) if the exit code is non-zero.
    """
    script = Path(__file__).parent.parent / "persons_tree.py"
    result = subprocess.run(
        [sys.executable, str(script), "--input", str(person_wb), "--output", str(out_path)],
        capture_output=True, text=True,
    )
    assert result.returncode == 0, result.stderr
    return json.loads(out_path.read_text(encoding="utf-8"))


def test_tree_person_ids_reconcile_with_persons_xlsx(tmp_path):
    """Every tree ``personId`` matches exactly one ``person_id`` row of the register."""
    # The real #669 contract: every personId in canonical-persons-tree.json must join
    # 1:1 onto a person_id in canonical-persons.xlsx — no orphan tree id, no duplicate.
    # Both artifacts are produced from the SAME person workbook (collision included).
    person_wb = _person_wb_with_collision(tmp_path)
    out_dir = tmp_path / "out"
    review_dir = tmp_path / "review"

    _run_normalizer(tmp_path, out_dir, review_dir, person_workbook=person_wb)

    tree = _generate_tree(person_wb, tmp_path / "tree.json")
    tree_ids = [p["personId"] for p in tree["persons"]]
    register_ids = _column_values(out_dir / "canonical-persons.xlsx", "person_id")

    # tree ids are unique (no duplicate join key)
    assert len(tree_ids) == len(set(tree_ids))
    # the suffixed collision ids actually reached the tree
    assert "cram-hans-1" in tree_ids and "cram-hans-2" in tree_ids
    # every tree id resolves to exactly one register row — the join is total and 1:1
    occurrences = Counter(register_ids)  # one pass instead of list.count() per tree id
    register_counts = {pid: occurrences[pid] for pid in tree_ids}
    assert all(count == 1 for count in register_counts.values()), register_counts


# ---------------------------------------------------------------------------
# diff --git a/tools/import-normalizer/tests/test_persons.py b/tools/import-normalizer/tests/test_persons.py
# new file mode 100644
# index 00000000..5d26ecfa
# --- /dev/null
# +++ b/tools/import-normalizer/tests/test_persons.py
# @@ -0,0 +1,132 @@
# ---------------------------------------------------------------------------
import config
import persons
from persons import NameClass


def test_slugify():
    """``slugify(last, first)`` yields a lowercase, hyphenated, umlaut-transliterated id."""
    assert persons.slugify("de Gruyter", "Eugenie") == "de-gruyter-eugenie"
    assert persons.slugify("Müller", "Karl Erhard") == "mueller-karl-erhard"


def test_parse_register_basic():
    """Register rows are parsed into person records with normalized fields."""
    rows = [
        {"generation": "G 1", "last_name": "Blomquist", "first_name": "Charlotte,Meta,Jacobi",
         "maiden_name": "Ruge", "birth_date": "30.8.1862", "birth_place": "Schülperneusiel",
         "death_date": "1934-07-23", "death_place": "Göteborg", "spouse": '"Tante Lolly"',
         "notes": "Schwester v Marie Cram"},
        {"generation": "G 2", "last_name": "Bohrmann", "first_name": "Else",
         "maiden_name": "Cram", "birth_date": "28.11.1888", "spouse": "Ludwig Bohrmann",
         "notes": "Schwester v Herbert"},
    ]
    people = persons.parse_register(rows)
    p = people[0]
    assert p.person_id == "blomquist-charlotte"
    assert p.first_name == "Charlotte"
    assert p.maiden_name == "Ruge"
    assert p.birth_date == "1862-08-30"
    assert p.nickname == "Tante Lolly"  # quoted spouse field is a nickname, not a spouse
    assert p.spouse == ""
    assert "Meta" in p.extra_given_names and "Jacobi" in p.extra_given_names
    p2 = people[1]
    assert p2.maiden_name == "Cram"
    assert p2.spouse == "Ludwig Bohrmann"
    assert p2.provisional is False


def test_parse_register_dedups_colliding_ids():
    """Two rows with the same name get the ids ``cram-hans-1`` and ``cram-hans-2``."""
    # Two people with the same first+last name: BOTH get a numeric suffix (no ambiguous base id).
    people = persons.parse_register([
        {"last_name": "Cram", "first_name": "Hans"},
        {"last_name": "Cram", "first_name": "Hans"},
    ])
    ids = [p.person_id for p in people]
    assert ids == ["cram-hans-1", "cram-hans-2"]
    assert len(set(ids)) == 2


def test_split_receivers():
    """``split_receivers`` turns a receiver cell into a list of individual names."""
    assert persons.split_receivers("Eugenie Müller") == ["Eugenie Müller"]
    assert persons.split_receivers("Walter und Eugenie de Gruyter") == [
        "Walter de Gruyter", "Eugenie de Gruyter"]
    assert persons.split_receivers("Hedi und Tutu (Gruber)") == ["Hedi Gruber", "Tutu Gruber"]
    assert persons.split_receivers("Clara u Familie") == ["Clara"]
    assert persons.split_receivers("Eugenie de Gruyter geb. Müller") == ["Eugenie de Gruyter"]
    assert persons.split_receivers("Herbert u Clara") == ["Herbert", "Clara"]
    assert persons.split_receivers("") == []
    assert persons.split_receivers("geb. Müller") == []  # maiden-only cell -> no person
    assert persons.split_receivers("Herbert//Clara") == ["Herbert", "Clara"]  # // separator


def test_find_known_last_name():
    """A known surname is found inside a full name; a bare first name yields ``None``."""
    assert persons.find_known_last_name("Eugenie de Gruyter") == "de Gruyter"
    assert persons.find_known_last_name("Clara") is None


def test_alias_index_resolves_maiden_and_married():
    """``AliasIndex.resolve`` maps married, maiden and lower-cased spellings to one id."""
    people = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Eugenie", "maiden_name": "Müller"},
        {"last_name": "Cram", "first_name": "Clara"},
    ])
    idx = persons.AliasIndex(people)
    eugenie = people[0].person_id
    assert idx.resolve("Eugenie de Gruyter") == eugenie  # canonical
    assert idx.resolve("Eugenie Müller") == eugenie  # maiden alias
    assert idx.resolve("eugenie müller") == eugenie  # normalized
    assert idx.resolve("Nobody Unknown") is None


def test_alias_index_suggestion():
    """``AliasIndex.suggest`` proposes the right person for a misspelled name."""
    people = persons.parse_register([{"last_name": "Wittkopf", "first_name": "Hans"}])
    idx = persons.AliasIndex(people)
    sid, score = idx.suggest("Hans Wittkop")  # typo
    assert sid == people[0].person_id and score >= config.FUZZY_SUGGEST_THRESHOLD


def test_alias_index_first_name_only_when_unambiguous():
    """A bare first name resolves only if exactly one register person carries it."""
    people = persons.parse_register([
        {"last_name": "Cram", "first_name": "Clara"},
        {"last_name": "de Gruyter", "first_name": "Walter"},
        {"last_name": "Cram", "first_name": "Walter"},  # 2nd "Walter" -> first name ambiguous
    ])
    idx = persons.AliasIndex(people)
    assert idx.resolve("Clara") == people[0].person_id  # unique first name resolves
    assert idx.resolve("Walter") is None  # ambiguous first name does NOT resolve
    assert idx.display(people[0].person_id) == "Clara Cram"


# Given-name set passed to classify_name() in the classification tests below.
GIVEN = {"ella", "anita", "kurt", "georg", "clara", "eugenie"}


def test_classify_unknown():
    """"?", a name ending in "?" and "unbekannt" are classified as UNKNOWN."""
    assert persons.classify_name("?", GIVEN) is NameClass.UNKNOWN
    assert persons.classify_name("A. Kredell?", GIVEN) is NameClass.UNKNOWN
    assert persons.classify_name("unbekannt", GIVEN) is NameClass.UNKNOWN


def test_classify_prose():
    """Descriptive text, and cells containing a digit or a quote, are classified as PROSE."""
    assert persons.classify_name("Adressenliste v Clara Cram zur Kondolenz", GIVEN) is NameClass.PROSE
    assert persons.classify_name("Clara de Gruyter(*1871)", GIVEN) is NameClass.PROSE  # digit
    assert persons.classify_name('"Cramiade" Gedicht', GIVEN) is NameClass.PROSE  # quote


def test_classify_collective():
    """Family/group designations are classified as COLLECTIVE."""
    assert persons.classify_name("Familie", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("Fam.Cram", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("Eltern Cram", GIVEN) is NameClass.COLLECTIVE
    assert persons.classify_name("seine Kinder", GIVEN) is NameClass.COLLECTIVE


def test_classify_relational():
    """Names prefixed with a kinship term are classified as RELATIONAL."""
    assert persons.classify_name("Cousine Emmy Haniel", GIVEN) is NameClass.RELATIONAL
    assert persons.classify_name("Schwester Hanni", GIVEN) is NameClass.RELATIONAL


def test_classify_single_token():
    """A single word or a bare initials token is classified as SINGLE_TOKEN."""
    assert persons.classify_name("Agnes", GIVEN) is NameClass.SINGLE_TOKEN
    assert persons.classify_name("A.B.", GIVEN) is NameClass.SINGLE_TOKEN


def test_classify_ambiguous_pair():
    """Two tokens that are both in ``GIVEN`` are classified as AMBIGUOUS_PAIR."""
    assert persons.classify_name("Ella Anita", GIVEN) is NameClass.AMBIGUOUS_PAIR
    assert persons.classify_name("Kurt Georg", GIVEN) is NameClass.AMBIGUOUS_PAIR


def test_classify_resolvable_single_person():
    """First name + surname (surname not in ``GIVEN``) is classified as RESOLVABLE."""
    # first + surname (surname not a given name) -> one real person, NOT ambiguous
    assert persons.classify_name("Mieze Schefold", GIVEN) is NameClass.RESOLVABLE
    assert persons.classify_name("Adolf Butenandt", GIVEN) is NameClass.RESOLVABLE


def test_build_given_names():
    """``build_given_names`` collects lower-cased given names from the register and the extra set."""
    people = persons.parse_register([
        {"last_name": "de Gruyter", "first_name": "Eugenie"},
        {"last_name": "Cram", "first_name": "Charlotte,Meta"},  # comma -> primary + extra given
    ])
    g = persons.build_given_names(people, {"Anita"})
    assert "eugenie" in g
    assert "charlotte" in g and "meta" in g  # primary + extra given names
    assert "anita" in g  # from the extra set, normalized
    assert "schefold" not in g


# ---------------------------------------------------------------------------
# diff --git a/tools/import-normalizer/tests/test_persons_tree.py b/tools/import-normalizer/tests/test_persons_tree.py
# new file mode 100644
# index 00000000..cdf7a450
# --- /dev/null
# +++ b/tools/import-normalizer/tests/test_persons_tree.py
# @@ -0,0 +1,515 @@
# ---------------------------------------------------------------------------
import sys
from pathlib import Path
# Put the directory above tests/ on sys.path before importing persons_tree.
sys.path.insert(0, str(Path(__file__).parent.parent))

import persons_tree


# --- _parse_year ---

def test_parse_year_iso_string():
    """An ISO date string yields its year."""
    assert persons_tree._parse_year("1920-09-20") == 1920


def test_parse_year_excel_serial_birth():
    """The numeric string "7568" is read as an Excel date serial falling in 1920."""
    # 7568 days from 1899-12-30 = 1920-09-19 or -20 depending on leap counting
    assert persons_tree._parse_year("7568") == 1920


def test_parse_year_excel_serial_death():
    """The numeric string "36222" is read as an Excel date serial falling in 1999."""
    # 36222 days from 1899-12-30 ≈ 1999
    assert persons_tree._parse_year("36222") == 1999


def test_parse_year_excel_serial_small():
    """A small serial ("177") still resolves to a year (1900)."""
    # 177 days from 1899-12-30 = 1900-06-25
    assert persons_tree._parse_year("177") == 1900


def test_parse_year_german_date_string():
    """A German ``d.m.yyyy`` string yields its year."""
    assert persons_tree._parse_year("30.8.1862") == 1862


def test_parse_year_year_only():
    """A bare four-digit year is returned as that year."""
    assert persons_tree._parse_year("1930") == 1930


def test_parse_year_free_text():
    """A month-name + year string yields the year."""
    assert persons_tree._parse_year("August 1941") == 1941


def test_parse_year_none():
    """``None`` input yields ``None``."""
    assert persons_tree._parse_year(None) is None


def test_parse_year_empty():
    """An empty string yields ``None``."""
    assert persons_tree._parse_year("") is None


def test_parse_year_unresolvable_truncated():
    """A date whose year is truncated to three digits yields ``None``."""
    # "2.9.196" has no valid 4-digit year — returns None
    assert persons_tree._parse_year("2.9.196") is None


def test_parse_year_typo_year():
    """A date with a year outside the accepted range yields ``None``."""
    # "4.3.1023" — year 1023 outside 1700-2100 guard — returns None
    assert persons_tree._parse_year("4.3.1023") is None


def test_parse_year_bare_out_of_range_year_is_none():
    """A bare out-of-range year is rejected, not re-read as an Excel serial."""
    # "1023" is a plausible typo for "1923" but is NOT an Excel serial.
    # parse_date("1023") parses it as year 1023 (out of 1700-2100 guard).
    # The serial branch must NOT re-interpret it as a serial.
    assert persons_tree._parse_year("1023") is None


# --- _parse_generation ---

def test_parse_generation_space():
    """"G 3" parses to generation 3."""
    assert persons_tree._parse_generation("G 3") == 3


def test_parse_generation_no_space():
    """"G3" (no space) parses to generation 3."""
    assert persons_tree._parse_generation("G3") == 3


def test_parse_generation_extra_spaces():
    """A generation label for generation 0 parses to 0."""
    # NOTE(review): the test name implies several spaces between "G" and "0"; this copy of
    # the patch is whitespace-collapsed, so confirm the literal against the repository.
    assert persons_tree._parse_generation("G 0") == 0


def test_parse_generation_trailing_garbage():
    """Text after the generation number is ignored."""
    assert persons_tree._parse_generation("G 2 de Gruyter") == 2


def test_parse_generation_empty():
    """An empty string yields ``None``."""
    assert persons_tree._parse_generation("") is None


def test_parse_generation_none():
    """``None`` input yields ``None``."""
    assert persons_tree._parse_generation(None) is None


# --- _norm_tree ---

def test_norm_tree_basic():
    """Names are lower-cased."""
    assert persons_tree._norm_tree("Werner Allemeyer") == "werner allemeyer"


def test_norm_tree_diacritics():
    """Umlauts are transliterated ("ö" -> "oe")."""
    assert persons_tree._norm_tree("Wöhler") == "woehler"


def test_norm_tree_strips_parens():
    """A parenthesized part is removed."""
    assert persons_tree._norm_tree("Otto (Herbert)") == "otto"


def test_norm_tree_strips_quotes():
    """Surrounding double quotes are removed."""
    assert persons_tree._norm_tree('"Tante Lolly"') == "tante lolly"


def test_norm_tree_strips_geographic_suffix():
    """A trailing place name ("Aachen") is removed."""
    assert persons_tree._norm_tree("Walter Cram Aachen") == "walter cram"


def test_norm_tree_strips_mexiko():
    """A trailing "Mexiko" is removed."""
    assert persons_tree._norm_tree("Hans Cram Mexiko") == "hans cram"


def test_norm_tree_collapses_whitespace():
    """Leading/trailing whitespace is removed from the normalized name."""
    # NOTE(review): the test name implies runs of inner whitespace; this copy of the patch is
    # whitespace-collapsed, so confirm the input literal against the repository.
    assert persons_tree._norm_tree(" Clara de Gruyter ") == "clara de gruyter"


# --- _build_index ---

def test_build_index_forward_lookup():
    """The index maps the normalized "first last" key to a list of row ids."""
    persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
    idx = persons_tree._build_index(persons)
    assert "werner allemeyer" in idx
    assert idx["werner allemeyer"] == ["row_002"]


def test_build_index_reversed_lookup():
    """The index also contains the "last first" key."""
    persons = [{"rowId": "row_002", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
    idx = persons_tree._build_index(persons)
    assert idx.get("allemeyer werner") == ["row_002"]


def test_build_index_maiden_name_lookup():
    """The index contains a "first maiden-name" key."""
    persons = [{"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "maidenName": "Wöhler"}]
    idx = persons_tree._build_index(persons)
    assert idx.get("elsgard woehler") == ["row_002"]


def test_build_index_single_token_fallback():
    """The bare last name is indexed as a single-token key."""
    persons = [{"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None}]
    idx = persons_tree._build_index(persons)
    assert idx.get("cram") == ["row_028"]


def test_build_index_ambiguous_single_token():
    """A last name shared by two rows maps to both row ids."""
    persons = [
        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
        {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
    ]
    idx = persons_tree._build_index(persons)
    assert set(idx["cram"]) == {"row_028", "row_019"}


# --- _resolve_one ---

def test_resolve_one_found():
    """A uniquely indexed name resolves to ``(row_id, None)``."""
    persons = [{"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "maidenName": None}]
    idx = persons_tree._build_index(persons)
    row_id, reason = persons_tree._resolve_one("Allemeyer Werner", idx)
    assert row_id == "row_003"
    assert reason is None


def test_resolve_one_not_found():
    """A name missing from the index resolves to ``(None, "not_found")``."""
    idx = {}
    row_id, reason = persons_tree._resolve_one("Nobody Unknown", idx)
    assert row_id is None
    assert reason == "not_found"


def test_resolve_one_ambiguous():
    """A key with several candidate rows resolves to ``(None, "ambiguous")``."""
    persons = [
        {"rowId": "row_028", "firstName": "Herbert", "lastName": "Cram", "maidenName": None},
        {"rowId": "row_019", "firstName": "Clara", "lastName": "Cram", "maidenName": None},
    ]
    idx = persons_tree._build_index(persons)
    row_id, reason = persons_tree._resolve_one("Cram", idx)
    assert row_id is None
    assert reason == "ambiguous"


# --- _parse_row ---

def test_parse_row_serial_dates():
    """A full register row with serial dates is parsed into the tree person dict."""
    fields = {
        "generation": "G 3", "last_name": "Allemeyer", "first_name": "Elsgard",
        "maiden_name": "Wöhler", "birth_date": "7568", "birth_place": "Garz",
        "death_date": "36222", "death_place": "Espelkamp",
        "spouse": "Allemeyer Werner", "notes": "Nichte von Herbert",
    }
    p = persons_tree._parse_row(2, fields)
    assert p["rowId"] == "row_002"
    assert p["firstName"] == "Elsgard"
    assert p["lastName"] == "Allemeyer"
    assert p["maidenName"] == "Wöhler"
    assert p["birthYear"] == 1920
    assert p["deathYear"] == 1999
    assert p["birthPlace"] == "Garz"
    assert p["deathPlace"] == "Espelkamp"
    assert p["generation"] == 3
    assert p["familyMember"] is True
    assert p["_spouse_raw"] == "Allemeyer Werner"
    assert p["_bemerkung_raw"] == "Nichte von Herbert"
    assert "[Geburtsdatum" not in (p["notes"] or "")


def test_parse_row_string_birth_date():
    """A German date string gives ``birthYear``; empty death date gives ``None``."""
    fields = {
        "generation": "G 2", "last_name": "Cram", "first_name": "Herbert",
        "maiden_name": "", "birth_date": "25.6.1890", "birth_place": "Texas",
        "death_date": "", "death_place": "", "spouse": "", "notes": "",
    }
    p = persons_tree._parse_row(28, fields)
    assert p["birthYear"] == 1890
    assert p["deathYear"] is None
    assert p["notes"] is None or p["notes"] == ""


def test_parse_row_unresolvable_date_goes_to_notes():
    """An unparseable birth date is preserved inside ``notes`` next to the original remark."""
    fields = {
        "generation": "G 3", "last_name": "Heydrich", "first_name": "Dieter",
        "maiden_name": "", "birth_date": "28.9.", "birth_place": "",
        "death_date": "", "death_place": "", "spouse": "", "notes": "Bruder v Ingrid",
    }
    p = persons_tree._parse_row(96, fields)
    assert p["birthYear"] is None
    assert "[Geburtsdatum: 28.9.]" in p["notes"]
    assert "Bruder v Ingrid" in p["notes"]


def test_parse_row_empty_spouse_and_notes():
    """Empty spouse and notes cells become ``None`` in the raw fields."""
    fields = {
        "generation": "G 4", "last_name": "Allemeyer", "first_name": "Jürgen",
        "maiden_name": "", "birth_date": "", "birth_place": "",
        "death_date": "", "death_place": "", "spouse": "", "notes": "",
    }
    p = persons_tree._parse_row(4, fields)
    assert p["_spouse_raw"] is None
    assert p["_bemerkung_raw"] is None


# --- _deduplicate ---

def test_deduplicate_no_duplicates():
    """Distinct persons are all kept and nothing is reported as skipped."""
    persons = [
        {"rowId": "row_002", "firstName": "Elsgard", "lastName": "Allemeyer", "birthYear": 1920},
        {"rowId": "row_003", "firstName": "Werner", "lastName": "Allemeyer", "birthYear": 1923},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert len(result) == 2
    assert skipped == []


def test_deduplicate_exact_match():
    """Of two identical rows the first is kept and the second is reported as skipped."""
    # rows 127/138: same firstName, lastName, birthYear
    persons = [
        {"rowId": "row_127", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951},
        {"rowId": "row_138", "firstName": "Christa", "lastName": "Schütz", "birthYear": 1951},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_127"]
    assert len(skipped) == 1
    assert "row_138" in skipped[0]


def test_deduplicate_none_birth_year_after_known():
    """A same-name row without birth year is dropped when an earlier row has one."""
    # rows 129/139: row 129 has birthYear=1964, row 139 has birthYear=None
    persons = [
        {"rowId": "row_129", "firstName": "Christoph", "lastName": "Seils", "birthYear": 1964},
        {"rowId": "row_139", "firstName": "Christoph", "lastName": "Seils", "birthYear": None},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_129"]
    assert len(skipped) == 1


def test_deduplicate_both_none_birth_year_kept():
    """With two same-name rows lacking a birth year, only the first row is kept."""
    # Two people with no birth year but same name: keep first only
    persons = [
        {"rowId": "row_A", "firstName": "Hans", "lastName": "Cram", "birthYear": None},
        {"rowId": "row_B", "firstName": "Hans", "lastName": "Cram", "birthYear": None},
    ]
    result, skipped = persons_tree._deduplicate(persons)
    assert [p["rowId"] for p in result] == ["row_A"]
    assert len(skipped) == 1


# --- _resolve_spouses ---

def _make_persons(*args):
    """Build tree person dicts from ``(rowId, firstName, lastName, maidenName, spouse_raw)`` tuples.

    All remaining fields are filled with fixed placeholder values.
    """
    return [
        {"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3],
         "_spouse_raw": a[4], "_bemerkung_raw": None,
         "birthYear": None, "deathYear": None, "birthPlace": None, "deathPlace": None,
         "generation": None, "familyMember": True, "alias": None, "notes": None}
        for a in args
    ]


def test_resolve_spouses_success():
    """Two rows naming each other as spouse yield a single SPOUSE_OF relation."""
    persons = _make_persons(
        ("row_002", "Elsgard", "Allemeyer", "Wöhler", "Allemeyer Werner"),
        ("row_003", "Werner", "Allemeyer", None, "Elsgard Wöhler"),
    )
    idx = persons_tree._build_index(persons)
    rels, unres = persons_tree._resolve_spouses(persons, idx)
    assert len(rels) == 1
    assert rels[0]["type"] == "SPOUSE_OF"
    assert {rels[0]["personId"], rels[0]["relatedPersonId"]} == {"row_002", "row_003"}
    assert unres == []


def test_resolve_spouses_not_found():
    """A spouse cell that matches nobody is reported as unresolved with reason "not_found"."""
    persons = _make_persons(
        ("row_007", "Charlotte", "Blomquist", "Ruge", '"Tante Lolly"'),
    )
    idx = persons_tree._build_index(persons)
    rels, unres = persons_tree._resolve_spouses(persons, idx)
    assert rels == []
    assert len(unres) == 1
    assert unres[0]["rowId"] == "row_007"
    assert unres[0]["reason"] == "not_found"


def test_resolve_spouses_empty_spouse_field():
    """A row without spouse text produces neither relations nor unresolved entries."""
    persons = _make_persons(
        ("row_004", "Jürgen", "Allemeyer", None, None),
    )
    idx = persons_tree._build_index(persons)
    rels, unres = persons_tree._resolve_spouses(persons, idx)
    assert rels == [] and unres == []


# --- _parse_bemerkung ---

def _register(*args):
    """Return ``(persons, index)`` built from ``(rowId, first, last, maiden)`` tuples."""
    persons = [
        {"rowId": a[0], "firstName": a[1], "lastName": a[2], "maidenName": a[3]}
        for a in args
    ]
    return persons, persons_tree._build_index(persons)


def test_parse_bemerkung_sohn_two_parents():
    """"Sohn v A u B" yields one PARENT_OF relation per parent, pointing at the child row."""
    _, idx = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_021", "Sohn v Clara Cram u Herbert Cram", idx
    )
    assert len(rels) == 2
    assert all(r["type"] == "PARENT_OF" for r in rels)
    child_ids = {r["relatedPersonId"] for r in rels}
    parent_ids = {r["personId"] for r in rels}
    assert child_ids == {"row_021"}
    assert "row_019" in parent_ids and "row_028" in parent_ids
    assert unres == []
    assert notes == ""


def test_parse_bemerkung_tochter_von():
    """"Tochter von X" yields the exact PARENT_OF relation dict and leaves no notes."""
    _, idx = _register(("row_019", "Clara", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_036", "Tochter von Clara Cram", idx
    )
    assert len(rels) == 1
    assert rels[0] == {
        "personId": "row_019",
        "relatedPersonId": "row_036",
        "type": "PARENT_OF",
        "source": "bemerkung",
        "rawBemerkung": "Tochter von Clara Cram",
    }
    assert notes == ""


def test_parse_bemerkung_vater():
    """"Vater v X" makes the current row the parent of X."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_031", "Vater v Herbert Cram", idx
    )
    assert len(rels) == 1
    assert rels[0]["personId"] == "row_031"
    assert rels[0]["relatedPersonId"] == "row_028"
    assert rels[0]["type"] == "PARENT_OF"


def test_parse_bemerkung_unmatched_parent_name():
    """A parent name missing from the index is reported as unresolved ("not_found")."""
    _, idx = _register()  # empty index
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_004", "Sohn v Elsgard A.", idx
    )
    assert rels == []
    assert len(unres) == 1
    assert unres[0]["reason"] == "not_found"
    assert notes == ""


def test_parse_bemerkung_skip_nichte():
    """A "Nichte von" remark creates no relation and is passed through as notes."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_002", "Nichte von Herbert", idx
    )
    assert rels == []
    assert unres == []
    assert notes == "Nichte von Herbert"


def test_parse_bemerkung_skip_bruder():
    """A "Bruder v" remark creates no relation and is passed through as notes."""
    _, idx = _register(("row_028", "Herbert", "Cram", None))
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_033", "Bruder v Herbert", idx
    )
    assert rels == []
    assert unres == []
    assert notes == "Bruder v Herbert"


def test_parse_bemerkung_empty():
    """An empty remark yields no relations, no unresolved entries and empty notes."""
    _, idx = _register()
    rels, unres, notes = persons_tree._parse_bemerkung("row_004", "", idx)
    assert rels == [] and unres == [] and notes == ""


def test_parse_bemerkung_plain_remark():
    """A remark without a relationship pattern is returned unchanged as notes."""
    _, idx = _register()
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_029", "Verfasserin der Cram-Chronik !!", idx
    )
    assert rels == [] and unres == []
    assert notes == "Verfasserin der Cram-Chronik !!"


def test_parse_bemerkung_sohn_with_trailing_remark():
    """Text after the parent list (following the comma) is returned as notes."""
    _, idx = _register(
        ("row_019", "Clara", "Cram", "de Gruyter"),
        ("row_028", "Herbert", "Cram", None),
    )
    rels, unres, notes = persons_tree._parse_bemerkung(
        "row_021", "Sohn v Clara Cram u Herbert Cram, nach Mexiko emigriert", idx
    )
    assert len(rels) == 2
    assert unres == []
    assert notes == "nach Mexiko emigriert"


def test_generated_at_is_fixed_for_reproducibility():
    """``_GENERATED_AT`` is the pinned constant timestamp string."""
    # NFR-IDEM-01: a pinned timestamp so the committed tree JSON doesn't churn on every run
    assert persons_tree._GENERATED_AT == "2020-01-01T00:00:00"


# --- _attach_person_ids ---

def _register_row(generation, last_name, first_name, maiden_name="", birth_date=""):
    """Return a raw register row dict; every field not given as an argument is ``""``."""
    return {"generation": generation, "last_name": last_name, "first_name": first_name,
            "maiden_name": maiden_name, "birth_date": birth_date, "birth_place": "",
            "death_date": "", "death_place": "", "spouse": "", "notes": ""}


def test_attach_person_ids_propagates_register_slug():
    """Each tree person receives the ``personId`` slug of its register row."""
    # the tree person must carry the register's verbatim person_id (slug), not a recomputed one
    raw_dicts = [
        _register_row("G 1", "de Gruyter", "Walter"),
        _register_row("G 1", "de Gruyter", "Eugenie", maiden_name="Müller"),
    ]
    tree_persons = [persons_tree._parse_row(n, d) for n, d in enumerate(raw_dicts, start=2)]
    persons_tree._attach_person_ids(tree_persons, raw_dicts)
    assert tree_persons[0]["personId"] == "de-gruyter-walter"
    assert tree_persons[1]["personId"] == "de-gruyter-eugenie"


def test_attach_person_ids_raises_on_length_divergence():
    """A length mismatch between tree persons and register rows raises ``ValueError``."""
    # The propagation is a positional zip; if tree_persons and the register drift in
    # length (e.g. a future filter change), zip would silently truncate and mis-join ids.
    # The guard must fail loudly instead.
    raw_dicts = [
        _register_row("G 1", "de Gruyter", "Walter"),
        # second register row has a last name -> parse_register keeps it ...
        _register_row("G 1", "de Gruyter", "Eugenie", maiden_name="Müller"),
    ]
    # ... but the tree side only has one person -> lengths diverge.
    tree_persons = [persons_tree._parse_row(2, raw_dicts[0])]
    import pytest
    with pytest.raises(ValueError, match="length"):
        persons_tree._attach_person_ids(tree_persons, raw_dicts)


def test_attach_person_ids_carries_register_collision_suffix():
    """Colliding register slugs reach the tree with their ``-1``/``-2`` suffixes."""
    # when two register rows slug-collide, the register suffixes the ids (-1, -2);
    # those exact suffixed ids must reach the tree persons, never a recomputed bare slug
    raw_dicts = [
        _register_row("G 2", "Cram", "Hans", birth_date="1890"),
        _register_row("G 3", "Cram", "Hans", birth_date="1925"),
    ]
    tree_persons = [persons_tree._parse_row(n, d) for n, d in enumerate(raw_dicts, start=2)]
    persons_tree._attach_person_ids(tree_persons, raw_dicts)
    assert tree_persons[0]["personId"] == "cram-hans-1"
    assert tree_persons[1]["personId"] == "cram-hans-2"


# NOTE(review): this import sits mid-module in the patch (E402); it belongs with the imports
# at the top of test_persons_tree.py.
import subprocess


def test_dry_run_exits_zero(tmp_path):
    """dry-run should complete without writing any file and exit 0.

    Skipped when the source workbook ``import/Personendatei 2.xlsx`` is not present.
    """
    input_path = Path(__file__).parent.parent.parent.parent / "import" / "Personendatei 2.xlsx"
    if not input_path.exists():
        import pytest
        pytest.skip("source Excel file not present")

    result = subprocess.run(
        [
            sys.executable, str(Path(__file__).parent.parent / "persons_tree.py"),
            "--input", str(input_path),
            "--output", str(tmp_path / "out.json"),
            "--dry-run",
        ],
        capture_output=True, text=True,
    )
    assert result.returncode == 0, result.stderr
    assert not (tmp_path / "out.json").exists()
    assert "persons parsed" in result.stdout


# ---------------------------------------------------------------------------
# diff --git a/tools/import-normalizer/tests/test_tags.py b/tools/import-normalizer/tests/test_tags.py
# new file mode 100644
# index 00000000..2f77f461
# --- /dev/null
# +++ b/tools/import-normalizer/tests/test_tags.py
# @@ -0,0 +1,191 @@
# ---------------------------------------------------------------------------
import tags


# --- classify_schlagwort ---

def test_semantic_tag_kept_as_themen():
    """A topical keyword is returned under the ``Themen/`` root."""
    assert tags.classify_schlagwort("Brautbriefe") == ["Themen/Brautbriefe"]


def test_everyday_tag_kept_as_themen():
    """A multi-word topical keyword is returned under ``Themen/`` unchanged."""
    assert tags.classify_schlagwort("Alltag in Ruhrort") == ["Themen/Alltag in Ruhrort"]


def test_event_tag_kept_as_themen():
    """An event keyword is returned under ``Themen/``."""
    assert tags.classify_schlagwort("zur Hochzeit") == ["Themen/zur Hochzeit"]


def test_individual_correspondence_dropped():
    """"<person> an <person>" yields no tag."""
    assert tags.classify_schlagwort("Clara an Herbert") == []


def test_individual_correspondence_with_year_dropped():
    """"<person> an <person> <year>" yields no tag."""
    assert tags.classify_schlagwort("Herbert an Clara 1918") == []


def test_individual_with_role_dropped():
    """A role word before the sender name does not keep the tag."""
    assert tags.classify_schlagwort("Vater Juan an Herbert") == []


def test_relational_receiver_dropped():
    """A single relational receiver ("ihre Mutter") yields no tag."""
    assert tags.classify_schlagwort("Clara an ihre Mutter") == []


def test_group_receiver_kinder_kept_as_briefwechsel():
    """A collective receiver ("Kinder") keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Clara an Kinder") == ["Briefwechsel/Clara an Kinder"]


def test_group_receiver_eltern_kept():
    """Collective receiver "Eltern" keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Herbert an seine Eltern") == [
        "Briefwechsel/Herbert an seine Eltern"]


def test_group_receiver_geschwister_kept():
    """Collective receiver "Geschwister" keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Walter an Geschwister") == [
        "Briefwechsel/Walter an Geschwister"]


def test_group_receiver_schwiegereltern_kept():
    """Collective receiver "Schwiegereltern" keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Clara an Schwiegereltern") == [
        "Briefwechsel/Clara an Schwiegereltern"]


def test_group_receiver_soehne_kept():
    """Collective receiver "Söhne" keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Mutter Cram an ihre Söhne") == [
        "Briefwechsel/Mutter Cram an ihre Söhne"]


def test_group_receiver_brueder_kept():
    """Collective receiver "Brüder" keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Hans an Brüder") == ["Briefwechsel/Hans an Brüder"]


def test_group_receiver_cousinen_kept():
    """Collective receiver "Cousinen" with trailing text keeps the keyword."""
    assert tags.classify_schlagwort("Clara an Cousinen in Göttingen") == [
        "Briefwechsel/Clara an Cousinen in Göttingen"]


def test_group_receiver_freunde_kept():
    """The collective word "Freunde" keeps the keyword under ``Briefwechsel/``."""
    assert tags.classify_schlagwort("Freunde an Herbert") == ["Briefwechsel/Freunde an Herbert"]


def test_group_sender_geschwister_kept():
    """A collective sender keeps the keyword under ``Briefwechsel/``."""
    # collective on the LEFT side of "an"
    assert tags.classify_schlagwort("Geschwister Cram an Herbert") == [
        "Briefwechsel/Geschwister Cram an Herbert"]


def test_receiver_only_individual_dropped():
    """A keyword that names only an individual receiver yields no tag."""
    # starts with "an " — single individual receiver
    assert tags.classify_schlagwort("an Walter de Gruyter") == []


def test_receiver_only_group_kept():
    """A keyword that names only a collective receiver is kept under ``Briefwechsel/``."""
    # starts with "an " — collective receiver
    assert tags.classify_schlagwort("an ihre Geschwister") == ["Briefwechsel/an ihre Geschwister"]


def test_abbreviated_sender_individual_dropped():
    """An abbreviated sender glued to "an" with an individual receiver yields no tag."""
    # "Maria W.an Clara" — abbreviated name + ".an"
    assert tags.classify_schlagwort("Maria W.an Clara") == []


def test_abbreviated_sender_group_kept():
    """An abbreviated sender glued to "an" with a collective receiver is kept."""
    assert tags.classify_schlagwort("Eugenie sen.an Kinder") == [
        "Briefwechsel/Eugenie sen.an Kinder"]


def test_empty_schlagwort_returns_empty():
    """An empty keyword yields no tag."""
    assert tags.classify_schlagwort("") == []


def test_einzelkinder_kept():
    """The collective sender "Enkelkinder" keeps the keyword under ``Briefwechsel/``."""
    # NOTE(review): the test name says "einzelkinder" but the input is "Enkelkinder";
    # the name is left unchanged here so existing test ids stay stable.
    assert tags.classify_schlagwort("Enkelkinder an Clara") == [
        "Briefwechsel/Enkelkinder an Clara"]


def test_geschw_abbreviation_kept():
    """The abbreviation "Geschw." on the receiver side keeps the keyword."""
    # "Geschw." abbreviation for Geschwister — appears after "u" in receiver side
    assert tags.classify_schlagwort("Bruder Hans an Herbert u Geschw.") == [
        "Briefwechsel/Bruder Hans an Herbert u Geschw."]


# --- mine_summary_candidates ---

def test_mine_candidates_counts_words():
    """Words are counted case-insensitively across all summaries."""
    summaries = ["Reise, Hochzeit", "Reise", "Krieg"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    assert candidates["reise"] == 2
    assert candidates["hochzeit"] == 1
    assert candidates["krieg"] == 1


def test_mine_candidates_filters_stop_words():
    """Common German function words are not returned as candidates."""
    summaries = ["und die Reise", "das ist eine Reise"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    assert "reise" in candidates
    assert "und" not in candidates
    assert "die" not in candidates
    assert "das" not in candidates
    assert "ist" not in candidates
    assert "eine" not in candidates


def test_mine_candidates_filters_contracted_prepositions():
    """Contracted prepositions and "sich" are not returned as candidates."""
    # im=in+dem, zum=zu+dem, zur=zu+der, vom=von+dem, sich, am, beim
    summaries = ["im Sommer zum Besuch, zur Hochzeit vom Vater, sich gefreut am Morgen beim Fest"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    # NOTE(review): "ans" does not occur in the summary above, so its check is vacuous.
    for stop in ("im", "zum", "zur", "vom", "sich", "am", "beim", "ans"):
        assert stop not in candidates, f"stop word '{stop}' leaked through"
    assert "besuch" in candidates
    assert "hochzeit" in candidates


def test_mine_candidates_filters_single_chars():
    """Single-character tokens are not returned as candidates."""
    summaries = ["x Reise y"]
    candidates = dict(tags.mine_summary_candidates(summaries))
    assert "x" not in candidates
    assert "y" not in candidates


def test_mine_candidates_sorted_descending():
    """Candidates are returned with counts in non-increasing order."""
    summaries = ["Reise", "Reise", "Hochzeit", "Reise", "Hochzeit", "Krieg"]
    result = tags.mine_summary_candidates(summaries)
    counts = [count for _, count in result]
    assert counts == sorted(counts, reverse=True)


def test_mine_candidates_empty_summaries():
    """No summaries, or only an empty summary, yields an empty candidate list."""
    assert tags.mine_summary_candidates([]) == []
    assert tags.mine_summary_candidates([""]) == []


# --- load_approved_themes and
apply_approved_themes --- + +def test_apply_themes_match_found(tmp_path): + themes = {"reise", "hochzeit"} + result = tags.apply_approved_themes("Reise nach Berlin", themes) + assert "Themen/reise" in result + +def test_apply_themes_case_insensitive(tmp_path): + themes = {"reise"} + result = tags.apply_approved_themes("REISE", themes) + assert "Themen/reise" in result + +def test_apply_themes_no_match(tmp_path): + themes = {"krieg"} + result = tags.apply_approved_themes("Alltag in Ruhrort", themes) + assert result == [] + +def test_apply_themes_multiple_matches(): + themes = {"reise", "hochzeit"} + result = tags.apply_approved_themes("Reise zur Hochzeit", themes) + assert len(result) == 2 + assert "Themen/reise" in result + assert "Themen/hochzeit" in result + + +# --- encode_tags --- + +def test_encode_tags_single(): + assert tags.encode_tags(["Themen/Brautbriefe"]) == "Themen/Brautbriefe" + +def test_encode_tags_multiple(): + result = tags.encode_tags(["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"]) + assert result == "Themen/Brautbriefe|Briefwechsel/Clara an Kinder" + +def test_encode_tags_empty(): + assert tags.encode_tags([]) == "" + + +# --- build_tag_tree --- + +def test_build_tag_tree_includes_roots(): + paths = ["Themen/Brautbriefe", "Briefwechsel/Clara an Kinder"] + tree = tags.build_tag_tree(paths) + tag_paths = [row["tag_path"] for row in tree] + assert "Themen" in tag_paths + assert "Briefwechsel" in tag_paths + +def test_build_tag_tree_includes_children(): + paths = ["Themen/Brautbriefe"] + tree = tags.build_tag_tree(paths) + child = next(r for r in tree if r["tag_path"] == "Themen/Brautbriefe") + assert child["parent_name"] == "Themen" + assert child["tag_name"] == "Brautbriefe" + +def test_build_tag_tree_root_has_empty_parent(): + paths = ["Themen/Brautbriefe"] + tree = tags.build_tag_tree(paths) + root = next(r for r in tree if r["tag_path"] == "Themen") + assert root["parent_name"] == "" + assert root["tag_name"] == "Themen" + +def 
test_build_tag_tree_no_duplicates(): + paths = ["Themen/Brautbriefe", "Themen/Alltag", "Themen/Brautbriefe"] + tree = tags.build_tag_tree(paths) + tag_paths = [row["tag_path"] for row in tree] + assert len(tag_paths) == len(set(tag_paths)) diff --git a/tools/import-normalizer/tests/test_writers.py b/tools/import-normalizer/tests/test_writers.py new file mode 100644 index 00000000..fe6dfbe9 --- /dev/null +++ b/tools/import-normalizer/tests/test_writers.py @@ -0,0 +1,76 @@ +import csv +import openpyxl +import overrides +import writers +import documents + +def test_load_overrides_missing_files(tmp_path): + d, n = overrides.load_overrides(tmp_path / "dates.csv", tmp_path / "names.csv") + assert d == {} and n == {} + +def test_load_overrides_parsed(tmp_path): + dp = tmp_path / "dates.csv" + dp.write_text("raw,iso,precision\n13.5.65,1965-05-13,DAY\n", encoding="utf-8") + np = tmp_path / "names.csv" + np.write_text("raw,person_id\nEugenie Müller,de-gruyter-eugenie\n", encoding="utf-8") + d, n = overrides.load_overrides(dp, np) + assert d["13.5.65"] == ("1965-05-13", "DAY") + assert n["Eugenie Müller"] == "de-gruyter-eugenie" + +def test_write_documents_xlsx_joins_lists(tmp_path): + doc = documents.CanonicalDocument( + index="W-0001", receiver_person_ids=["a", "b"], receiver_names=["A", "B"], + tags=["Brautbriefe"], date_precision="DAY", needs_review=["unparsed_date"]) + out = tmp_path / "docs.xlsx" + writers.write_documents_xlsx([doc], out) + wb = openpyxl.load_workbook(out) + ws = wb.active + header = [c.value for c in ws[1]] + assert "receiver_person_ids" in header and "needs_review" in header + row = {h: c.value for h, c in zip(header, ws[2])} + assert row["receiver_person_ids"] == "a|b" + assert row["needs_review"] == "unparsed_date" + + +def test_write_documents_xlsx_carries_date_end_and_has_no_file_column(tmp_path): + # #686: PDFs resolve by index (.pdf), so the redundant "file" column is dropped. 
+ doc = documents.CanonicalDocument( + index="H-0730", date_iso="1917-01-10", + date_precision="RANGE", date_end="1917-01-11") + out = tmp_path / "docs.xlsx" + writers.write_documents_xlsx([doc], out) + wb = openpyxl.load_workbook(out) + ws = wb.active + header = [c.value for c in ws[1]] + assert "file" not in header + assert "date_end" in header + row = {h: c.value for h, c in zip(header, ws[2])} + assert row["date_end"] == "1917-01-11" + +def test_write_documents_xlsx_pins_timestamp(tmp_path): + # determinism (NFR-IDEM-01): workbook created/modified are pinned, not the current time + doc = documents.CanonicalDocument(index="W-0001") + out = tmp_path / "d.xlsx" + writers.write_documents_xlsx([doc], out) + wb = openpyxl.load_workbook(out) + assert (wb.properties.created.year, wb.properties.created.month, wb.properties.created.day) == (2020, 1, 1) + +def test_write_review_csv(tmp_path): + out = tmp_path / "r.csv" + writers.write_review_csv(out, ["raw", "count"], [["?", 3], ["x", 1]]) + rows = list(csv.reader(out.open(encoding="utf-8"))) + assert rows[0] == ["raw", "count"] + assert rows[1] == ["?", "3"] + +def test_write_review_csv_defangs_formula_injection(tmp_path): + out = tmp_path / "r.csv" + writers.write_review_csv(out, ["raw", "count"], [["=cmd|'/C calc'!A0", 1], ["-2+3", 2]]) + rows = list(csv.reader(out.open(encoding="utf-8"))) + assert rows[1][0].startswith("'=") # leading '=' neutralised + assert rows[2][0].startswith("'-") + +def test_write_summary_sections(tmp_path): + out = tmp_path / "s.txt" + writers.write_summary(out, {"# INPUTS": "", "rows": 10, "# DATES": "", "unknown_date_rate": "3.2%"}) + text = out.read_text(encoding="utf-8") + assert "INPUTS:" in text and "DATES:" in text and " rows: 10" in text diff --git a/tools/import-normalizer/writers.py b/tools/import-normalizer/writers.py new file mode 100644 index 00000000..b7c3e816 --- /dev/null +++ b/tools/import-normalizer/writers.py @@ -0,0 +1,87 @@ +"""Write canonical .xlsx outputs and review .csv 
files.""" +import csv +import datetime +from pathlib import Path +import openpyxl + +_PIPE = "|" +# Pinned workbook metadata so reruns are content-deterministic (NFR-IDEM-01); openpyxl +# otherwise stamps docProps with the current time on every save. +_FIXED_TS = datetime.datetime(2020, 1, 1, 0, 0, 0) + + +def _join(value): + if isinstance(value, list): + return _PIPE.join(str(v) for v in value) + return "" if value is None else str(value) + + +def _csv_safe(value): + """Neutralise spreadsheet formula injection (CWE-1236) in human-opened review CSVs.""" + s = "" if value is None else str(value) + return "'" + s if s[:1] in ("=", "+", "-", "@", "\t", "\r", "\n") else s + + +DOC_COLUMNS = ["index", "box", "folder", "sender_person_id", "sender_name", + "receiver_person_ids", "receiver_names", "date_iso", "date_raw", + "date_precision", "date_end", "location", "tags", "summary", + "source_row", "needs_review"] + +PERSON_COLUMNS = ["person_id", "last_name", "first_name", "maiden_name", "title", "nickname", + "birth_date", "birth_date_raw", "birth_place", "death_date", "death_date_raw", + "death_place", "spouse", "generation", "notes", "aliases", "provisional"] + + +def _write_xlsx(records, columns, path: Path): + wb = openpyxl.Workbook() + ws = wb.active + ws.append(columns) + for rec in records: + ws.append([_join(getattr(rec, col)) for col in columns]) + wb.properties.created = _FIXED_TS + wb.properties.modified = _FIXED_TS + Path(path).parent.mkdir(parents=True, exist_ok=True) + wb.save(path) + + +def write_documents_xlsx(docs, path: Path): + _write_xlsx(docs, DOC_COLUMNS, path) + + +def write_tag_tree_xlsx(tree: list[dict], path: Path): + columns = ["tag_path", "parent_name", "tag_name"] + wb = openpyxl.Workbook() + ws = wb.active + ws.append(columns) + for row in tree: + ws.append([row.get(col, "") for col in columns]) + wb.properties.created = _FIXED_TS + wb.properties.modified = _FIXED_TS + Path(path).parent.mkdir(parents=True, exist_ok=True) + wb.save(path) + + 
def write_persons_xlsx(people, path: Path):
    """Write the person register to ``path`` as an .xlsx workbook.

    Thin wrapper that delegates to ``_write_xlsx`` using the
    ``PERSON_COLUMNS`` layout.

    Args:
        people: Iterable of person records; each record must expose one
            attribute per name in ``PERSON_COLUMNS``.
        path: Destination file for the workbook.
    """
    _write_xlsx(people, PERSON_COLUMNS, path)


def write_review_csv(path: Path, header: list[str], rows: list[list]):
    """Write a review CSV to ``path``, creating parent directories as needed.

    Every data cell is passed through ``_csv_safe`` before it is written.
    The header row is written exactly as given.

    Args:
        path: Destination file; an existing file is overwritten.
        header: Column names for the first row.
        rows: Data rows, each a list of cell values.
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    # newline="" hands line-ending control to the csv module, as its docs require.
    with open(path, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(header)
        w.writerows([_csv_safe(c) for c in row] for row in rows)


def write_summary(path: Path, stats: dict):
    """Render a grouped, scannable summary. Keys beginning with '#' are section headers.

    A header key produces a blank separator line followed by the key text
    (without the '#', surrounding whitespace trimmed) and a colon; the value
    stored under a header key is ignored. Every other key is written as an
    indented ``key: value`` line. Entries appear in the mapping's iteration
    order and the file always ends with exactly one newline.

    Args:
        path: Destination text file (UTF-8); parent directories are created.
        stats: Mapping of string keys to values.
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    lines = []
    for k, v in stats.items():
        if k.startswith("#"):
            lines.append("")
            lines.append(k[1:].strip() + ":")
        else:
            lines.append(f" {k}: {v}")
    # Drop only the blank separator emitted before a leading section header.
    # A plain strip() would also eat the indent of a first entry that has no
    # header above it, leaving that line misaligned with the others.
    text = "\n".join(lines).lstrip("\n").rstrip()
    Path(path).write_text(text + "\n", encoding="utf-8")