diff --git a/CLAUDE.md b/CLAUDE.md index 10a3c368..b3a5b189 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -87,7 +87,7 @@ backend/src/main/java/org/raddatz/familienarchiv/ ├── exception/ DomainException, ErrorCode, GlobalExceptionHandler ├── filestorage/ FileService (S3/MinIO) ├── geschichte/ Geschichte (story) domain -├── importing/ MassImportService +├── importing/ CanonicalImportOrchestrator + four loaders (TagTree/PersonRegister/PersonTree/Document) + CanonicalSheetReader ├── notification/ Notification domain + SseEmitterRegistry ├── ocr/ OCR domain — OcrService, OcrBatchService, training ├── person/ Person domain diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 249221cc..b96d242a 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -34,7 +34,7 @@ src/main/java/org/raddatz/familienarchiv/ ├── exception/ # DomainException, ErrorCode, GlobalExceptionHandler ├── filestorage/ # FileService (S3/MinIO) ├── geschichte/ # Geschichte (story) domain -├── importing/ # MassImportService +├── importing/ # CanonicalImportOrchestrator + 4 loaders + CanonicalSheetReader ├── notification/ # Notification domain + SseEmitterRegistry ├── ocr/ # OCR domain — OcrService, OcrBatchService, training ├── person/ # Person domain — Person, PersonService, PersonController diff --git a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java index 54802f86..d9d0d8b2 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java @@ -40,6 +40,8 @@ public enum ErrorCode { // --- Import --- /** A mass import is already in progress; only one can run at a time. 409 */ IMPORT_ALREADY_RUNNING, + /** A canonical import artifact is missing, unreadable, or missing a required header. 400 */ + IMPORT_ARTIFACT_INVALID, // --- Thumbnails --- /** A thumbnail backfill is already in progress; only one can run at a time. 409 */ diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java new file mode 100644 index 00000000..2107bfda --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java @@ -0,0 +1,94 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.io.File; +import java.time.LocalDateTime; +import java.util.List; + +/** + * Runs the four canonical loaders in their real dependency order — encoded explicitly + * here, not implied by call order — and owns the async runner plus the {@link ImportStatus} + * state machine the admin UI consumes. The orchestrator smoke-checks that all four + * artifacts are present before starting, failing fast rather than half-loading tags but no + * documents. A malformed artifact (a loader throwing) sets {@code FAILED}; an individual + * bad file is surfaced through the {@link ImportStatus.SkippedFile} mechanism instead. + */ +@Service +@RequiredArgsConstructor +@Slf4j +public class CanonicalImportOrchestrator { + + private static final String TAG_TREE_ARTIFACT = "canonical-tag-tree.xlsx"; + private static final String PERSONS_ARTIFACT = "canonical-persons.xlsx"; + private static final String PERSONS_TREE_ARTIFACT = "canonical-persons-tree.json"; + private static final String DOCUMENTS_ARTIFACT = "canonical-documents.xlsx"; + + private final TagTreeImporter tagTreeImporter; + private final PersonRegisterImporter personRegisterImporter; + private final PersonTreeImporter personTreeImporter; + private final DocumentImporter documentImporter; + + @Value("${app.import.dir:/import}") + private String canonicalDir; + + private volatile ImportStatus currentStatus = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + + public ImportStatus getStatus() { + return currentStatus; + } + + @Async + public void runImportAsync() { + if (currentStatus.state() == ImportStatus.State.RUNNING) { + throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress"); + } + runImport(); + } + + /** Synchronous entry point — wrapped by {@link #runImportAsync()} and called directly in tests. */ + void runImport() { + currentStatus = new ImportStatus(ImportStatus.State.RUNNING, "IMPORT_RUNNING", + "Import läuft...", 0, List.of(), LocalDateTime.now()); + try { + File tagTree = requireArtifact(TAG_TREE_ARTIFACT); + File persons = requireArtifact(PERSONS_ARTIFACT); + File personsTree = requireArtifact(PERSONS_TREE_ARTIFACT); + File documents = requireArtifact(DOCUMENTS_ARTIFACT); + + // Dependency DAG: documents need persons + tags; the tree needs persons. + tagTreeImporter.load(tagTree); + personRegisterImporter.load(persons); + personTreeImporter.load(personsTree); + DocumentImporter.LoadResult result = documentImporter.load(documents); + + currentStatus = new ImportStatus(ImportStatus.State.DONE, "IMPORT_DONE", + "Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.", + result.processed(), result.skippedFiles(), currentStatus.startedAt()); + } catch (DomainException e) { + log.error("Canonical import failed: {}", e.getMessage()); + currentStatus = new ImportStatus(ImportStatus.State.FAILED, "IMPORT_FAILED_ARTIFACT", + "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); + } catch (Exception e) { + log.error("Canonical import failed", e); + currentStatus = new ImportStatus(ImportStatus.State.FAILED, "IMPORT_FAILED_INTERNAL", + "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); + } + } + + private File requireArtifact(String name) { + File artifact = new File(canonicalDir, name); + if (!artifact.isFile()) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Missing canonical artifact: " + name); + } + return artifact; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalSheetReader.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalSheetReader.java new file mode 100644 index 00000000..ece6a7ca --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalSheetReader.java @@ -0,0 +1,133 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.DateUtil; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.ss.usermodel.WorkbookFactory; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; + +import java.io.File; +import java.io.FileInputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Value-level POI helper for the canonical import artifacts. No Spring, no domain + * knowledge: it opens a workbook, maps the header row to column indices by name, and + * yields typed rows whose cells are looked up by header name — the seam that replaces + * the old positional {@code @Value app.import.col.*} indices. List columns are split on + * the pipe delimiter the normalizer emits. + */ +public final class CanonicalSheetReader { + + private CanonicalSheetReader() { + } + + /** A single data row, addressable by canonical header name (never by index). */ + public static final class Row { + + private final Map headerIndex; + private final List cells; + + private Row(Map headerIndex, List cells) { + this.headerIndex = headerIndex; + this.cells = cells; + } + + /** Trimmed cell value for the named header, or "" when absent/blank. */ + public String get(String header) { + Integer index = headerIndex.get(header); + if (index == null || index >= cells.size()) return ""; + String value = cells.get(index); + return value == null ? "" : value.trim(); + } + } + + /** + * Reads all data rows from the first sheet, validating that every required header is + * present. Throws a fail-closed {@link DomainException} on a missing header so a + * loader never silently maps the wrong column. + */ + public static List readRows(File file, List requiredHeaders) { + try (FileInputStream fis = new FileInputStream(file); + Workbook workbook = WorkbookFactory.create(fis)) { + + Sheet sheet = workbook.getSheetAt(0); + org.apache.poi.ss.usermodel.Row headerRow = sheet.getRow(sheet.getFirstRowNum()); + Map headerIndex = mapHeaders(headerRow); + requireHeaders(file, headerIndex, requiredHeaders); + + List rows = new ArrayList<>(); + for (int i = sheet.getFirstRowNum() + 1; i <= sheet.getLastRowNum(); i++) { + org.apache.poi.ss.usermodel.Row poiRow = sheet.getRow(i); + if (poiRow == null) continue; + rows.add(new Row(headerIndex, readCells(poiRow, headerIndex.size()))); + } + return rows; + } catch (DomainException e) { + throw e; + } catch (Exception e) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Unreadable canonical artifact: " + file.getName()); + } + } + + /** Splits a pipe-delimited list column into trimmed, non-empty segments. */ + public static List splitList(String raw) { + if (raw == null || raw.isBlank()) return List.of(); + return Arrays.stream(raw.split("\\|")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .toList(); + } + + private static Map mapHeaders(org.apache.poi.ss.usermodel.Row headerRow) { + if (headerRow == null) { + return Map.of(); + } + Map headerIndex = new HashMap<>(); + for (int c = 0; c < headerRow.getLastCellNum(); c++) { + String name = cellToString(headerRow.getCell(c)).trim(); + if (!name.isEmpty()) headerIndex.putIfAbsent(name, c); + } + return headerIndex; + } + + private static void requireHeaders(File file, Map headerIndex, List requiredHeaders) { + for (String header : requiredHeaders) { + if (!headerIndex.containsKey(header)) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Missing required header '" + header + "' in artifact " + file.getName()); + } + } + } + + private static List readCells(org.apache.poi.ss.usermodel.Row poiRow, int columnCount) { + int width = Math.max(columnCount, poiRow.getLastCellNum()); + List cells = new ArrayList<>(width); + for (int c = 0; c < width; c++) { + cells.add(cellToString(poiRow.getCell(c))); + } + return cells; + } + + private static String cellToString(Cell cell) { + if (cell == null) return ""; + return switch (cell.getCellType()) { + case STRING -> cell.getStringCellValue(); + case NUMERIC -> { + if (DateUtil.isCellDateFormatted(cell)) { + yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); + } + yield String.valueOf((long) cell.getNumericCellValue()); + } + case BOOLEAN -> String.valueOf(cell.getBooleanCellValue()); + default -> ""; + }; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java new file mode 100644 index 00000000..6be566ab --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java @@ -0,0 +1,334 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.document.DatePrecision; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentService; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonType; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.tag.Tag; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; +import org.springframework.transaction.annotation.Transactional; +import software.amazon.awssdk.core.sync.RequestBody; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +import org.raddatz.familienarchiv.tag.TagService; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.LocalDate; +import java.time.format.DateTimeParseException; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Stream; + +/** + * Loads {@code canonical-documents.xlsx} into the document domain. Java performs no + * semantic transformation: the normalizer already resolved people to slugs and dates to + * ISO values. This loader maps columns by header name, routes each attribution + * register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}), + * parses clean dates, and keeps the file/S3/thumbnail plumbing. + * + *

The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not + * care that it came from our Python tool): its basename is validated with + * {@link #isValidImportFilename} and then resolved with canonical-path containment in + * {@link #findFileRecursive}. + */ +@Component +@RequiredArgsConstructor +@Slf4j +public class DocumentImporter { + + static final List REQUIRED_HEADERS = List.of( + "index", "file", "sender_person_id", "sender_name", + "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision"); + + private final DocumentService documentService; + private final PersonService personService; + private final TagService tagService; + private final S3Client s3Client; + private final ThumbnailAsyncRunner thumbnailAsyncRunner; + + @Value("${app.s3.bucket:familienarchiv}") + private String bucketName; + + @Value("${app.import.dir:/import}") + private String importDir; + + /** Outcome of loading the document sheet: processed count + per-file skips. */ + public record LoadResult(int processed, List skippedFiles) {} + + // One transaction for the whole sheet keeps the Hibernate session open so an existing + // document's lazy receivers collection initialises during an idempotent re-import. + // Invoked cross-bean from the orchestrator, so the @Transactional proxy applies. + @Transactional + public LoadResult load(File artifact) { + List rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS); + int processed = 0; + List skipped = new ArrayList<>(); + for (CanonicalSheetReader.Row row : rows) { + String index = row.get("index"); + if (index.isBlank()) continue; + Optional skipReason = importRow(row, index, skipped); + if (skipReason.isPresent()) { + skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get())); + } else { + processed++; + } + } + log.info("Imported {} documents from {} ({} skipped)", processed, artifact.getName(), skipped.size()); + return new LoadResult(processed, skipped); + } + + private Optional importRow(CanonicalSheetReader.Row row, String index, + List skipped) { + Optional resolved; + try { + resolved = resolveFile(row.get("file")); + } catch (InvalidImportFilenameException e) { + log.warn("Skipping import row {}: filename rejected", index); + return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); + } + if (resolved.isPresent()) { + try { + if (!isPdfMagicBytes(resolved.get())) { + return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE); + } + } catch (IOException e) { + log.error("Magic-byte check failed for row {}", index, e); + return Optional.of(ImportStatus.SkipReason.FILE_READ_ERROR); + } + } + return persist(row, index, resolved); + } + + private Optional persist(CanonicalSheetReader.Row row, String index, Optional file) { + Document existing = documentService.findByOriginalFilename(index).orElse(null); + if (existing != null && existing.getStatus() != DocumentStatus.PLACEHOLDER) { + return Optional.of(ImportStatus.SkipReason.ALREADY_EXISTS); + } + + String s3Key = null; + String contentType = null; + DocumentStatus status = DocumentStatus.PLACEHOLDER; + if (file.isPresent()) { + contentType = probeContentType(file.get()); + s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName(); + try { + uploadToS3(file.get(), s3Key, contentType); + status = DocumentStatus.UPLOADED; + } catch (Exception e) { + log.error("S3 upload failed for {}", file.get().getName(), e); + return Optional.of(ImportStatus.SkipReason.S3_UPLOAD_FAILED); + } + } + + Document doc = buildDocument(row, index, existing, s3Key, contentType, status); + Document saved = documentService.save(doc); + if (file.isPresent()) { + thumbnailAsyncRunner.dispatchAfterCommit(saved.getId()); + } + return Optional.empty(); + } + + private Document buildDocument(CanonicalSheetReader.Row row, String index, Document existing, + String s3Key, String contentType, DocumentStatus status) { + Document doc = existing != null ? existing + : Document.builder().originalFilename(index).build(); + + String senderName = row.get("sender_name"); + String receiverNames = row.get("receiver_names"); + Person sender = resolveSender(row.get("sender_person_id"), senderName); + Set receivers = resolveReceivers(row.get("receiver_person_ids")); + + doc.setTitle(index); + doc.setStatus(status); + doc.setFilePath(s3Key); + doc.setContentType(contentType); + doc.setSender(sender); + doc.setSenderText(blankToNull(senderName)); + // The canonical row is authoritative for receivers/tags (ADR-025): clear then + // re-populate so a shrunk set on re-import prunes stale links rather than + // accumulating them. The raw sender_text/receiver_text retention is separate. + doc.getReceivers().clear(); + doc.getReceivers().addAll(receivers); + doc.setReceiverText(blankToNull(receiverNames)); + doc.setDocumentDate(parseIsoDate(row.get("date_iso"))); + doc.setMetaDatePrecision(parsePrecision(row.get("date_precision"))); + doc.setMetaDateEnd(parseIsoDate(row.get("date_end"))); + doc.setMetaDateRaw(blankToNull(row.get("date_raw"))); + doc.setLocation(blankToNull(row.get("location"))); + doc.setSummary(blankToNull(row.get("summary"))); + attachTag(doc, row.get("tags")); + doc.setMetadataComplete(doc.getDocumentDate() != null || sender != null || !receivers.isEmpty()); + return doc; + } + + // ─── attribution routing — register-first, always retain raw ───────────────────── + + private Person resolveSender(String slug, String rawName) { + if (slug.isBlank()) return null; + return resolvePerson(slug, rawName); + } + + private Set resolveReceivers(String slugs) { + Set receivers = new LinkedHashSet<>(); + for (String slug : CanonicalSheetReader.splitList(slugs)) { + receivers.add(resolvePerson(slug, slug)); + } + return receivers; + } + + private Person resolvePerson(String slug, String rawName) { + return personService.findBySourceRef(slug) + .orElseGet(() -> personService.upsertBySourceRef(PersonUpsertCommand.builder() + .sourceRef(slug) + .lastName(blankToNull(rawName) == null ? slug : rawName) + .personType(PersonType.PERSON) + .provisional(true) + .build())); + } + + // Authoritative: the canonical row defines the document's tags exactly. Clearing first + // means a tag removed from the row is pruned on re-import (ADR-025). + private void attachTag(Document doc, String tagPath) { + doc.getTags().clear(); + if (tagPath.isBlank()) return; + tagService.findBySourceRef(tagPath).ifPresent(tag -> doc.getTags().add(tag)); + } + + // ─── clean-value parsing (no semantic logic) ───────────────────────────────────── + + private static LocalDate parseIsoDate(String value) { + if (value == null || value.isBlank()) return null; + try { + return LocalDate.parse(value.trim()); + } catch (DateTimeParseException e) { + return null; + } + } + + private static DatePrecision parsePrecision(String value) { + if (value == null || value.isBlank()) return DatePrecision.UNKNOWN; + try { + return DatePrecision.valueOf(value.trim()); + } catch (IllegalArgumentException e) { + return DatePrecision.UNKNOWN; + } + } + + // ─── file handling + S3 (small ≤20-line methods) ───────────────────────────────── + + private Optional resolveFile(String fileColumn) { + if (fileColumn == null || fileColumn.isBlank()) return Optional.empty(); + String basename = basenameOf(fileColumn); + if (!isValidImportFilename(basename)) { + throw new InvalidImportFilenameException(); + } + return findFileRecursive(basename); + } + + private static String basenameOf(String fileColumn) { + String normalized = fileColumn.replace('\\', '/'); + int lastSlash = normalized.lastIndexOf('/'); + return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim(); + } + + private String probeContentType(File file) { + try { + String probed = Files.probeContentType(file.toPath()); + return probed != null ? probed : "application/octet-stream"; + } catch (IOException e) { + return "application/octet-stream"; + } + } + + private void uploadToS3(File file, String s3Key, String contentType) { + s3Client.putObject(PutObjectRequest.builder() + .bucket(bucketName) + .key(s3Key) + .contentType(contentType) + .build(), + RequestBody.fromFile(file)); + } + + // ─── security guards — ported verbatim from MassImportService — do not weaken ──── + + private boolean isValidImportFilename(String filename) { + if (filename == null || filename.isBlank()) return false; + if (filename.contains("/")) return false; + if (filename.contains("\\")) return false; + if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH + if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS + if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR + if (filename.contains("..")) return false; + if (filename.equals(".")) return false; + if (filename.contains("\0")) return false; + if (Paths.get(filename).isAbsolute()) return false; + return true; + } + + // package-private: a Mockito spy in tests can override to inject IOException + InputStream openFileStream(File file) throws IOException { + return new FileInputStream(file); + } + + private boolean isPdfMagicBytes(File file) throws IOException { + try (InputStream is = openFileStream(file)) { + byte[] header = is.readNBytes(4); + return header.length == 4 + && header[0] == 0x25 // % + && header[1] == 0x50 // P + && header[2] == 0x44 // D + && header[3] == 0x46; // F + } + } + + private Optional findFileRecursive(String filename) { + File baseDir = new File(importDir); + try (Stream walk = Files.walk(baseDir.toPath())) { + Optional match = walk.filter(p -> !Files.isDirectory(p)) + .filter(p -> p.getFileName().toString().equals(filename)) + .findFirst(); + if (match.isEmpty()) return Optional.empty(); + File candidate = match.get().toFile(); + String baseDirCanonical = baseDir.getCanonicalPath(); + if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) { + throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate); + } + return Optional.of(candidate); + } catch (IOException e) { + return Optional.empty(); + } + } + + private static String displayName(CanonicalSheetReader.Row row, String index) { + String file = row.get("file"); + return file.isBlank() ? index : basenameOf(file); + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s; + } + + private static final class InvalidImportFilenameException extends RuntimeException { + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java new file mode 100644 index 00000000..ae21adc2 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java @@ -0,0 +1,50 @@ +package org.raddatz.familienarchiv.importing; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.v3.oas.annotations.media.Schema; + +import java.time.LocalDateTime; +import java.util.List; + +/** + * Async import state surfaced to {@code admin/system/ImportStatusCard.svelte} via the + * generated types. The shape ({@code state, statusCode, processed, skippedFiles, skipped}) + * is kept verbatim from the retired MassImportService so the admin UI keeps working. + */ +public record ImportStatus( + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode, + @JsonIgnore String message, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List skippedFiles, + LocalDateTime startedAt +) { + + public enum State { IDLE, RUNNING, DONE, FAILED } + + public enum SkipReason { + INVALID_FILENAME_PATH_TRAVERSAL, + INVALID_PDF_SIGNATURE, + FILE_READ_ERROR, + ALREADY_EXISTS, + S3_UPLOAD_FAILED + } + + public record SkippedFile( + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename, + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason + ) {} + + // Note: @Schema on a record accessor method is not picked up by SpringDoc; the + // "skipped" count is a computed convenience field derived from skippedFiles.size(). + @JsonProperty("skipped") + public int skipped() { + return skippedFiles.size(); + } + + /** Defensive-copy constructor — callers cannot mutate the stored list after construction. */ + public ImportStatus { + skippedFiles = List.copyOf(skippedFiles); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java deleted file mode 100644 index 975517e7..00000000 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java +++ /dev/null @@ -1,509 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.v3.oas.annotations.media.Schema; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.poi.ss.usermodel.*; -import java.util.Objects; -import org.raddatz.familienarchiv.exception.DomainException; -import org.raddatz.familienarchiv.exception.ErrorCode; -import org.raddatz.familienarchiv.document.Document; -import org.raddatz.familienarchiv.document.DocumentService; -import org.raddatz.familienarchiv.document.DocumentStatus; -import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.tag.Tag; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.person.PersonNameParser; -import org.raddatz.familienarchiv.person.PersonService; -import org.raddatz.familienarchiv.tag.TagService; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.scheduling.annotation.Async; -import org.springframework.stereotype.Service; -import org.springframework.transaction.annotation.Transactional; -import org.w3c.dom.Element; -import org.w3c.dom.NodeList; -import software.amazon.awssdk.core.sync.RequestBody; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.PutObjectRequest; - -import javax.xml.parsers.DocumentBuilderFactory; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; -import java.util.Optional; -import java.util.UUID; -import java.util.stream.Stream; -import java.util.zip.ZipFile; - -@Service -@RequiredArgsConstructor -@Slf4j -public class MassImportService { - - public enum State { IDLE, RUNNING, DONE, FAILED } - - public enum SkipReason { - INVALID_FILENAME_PATH_TRAVERSAL, - INVALID_PDF_SIGNATURE, - FILE_READ_ERROR, - ALREADY_EXISTS, - S3_UPLOAD_FAILED - } - - public record SkippedFile( - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason - ) {} - - public record ImportStatus( - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode, - @JsonIgnore String message, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List skippedFiles, - LocalDateTime startedAt - ) { - // Note: @Schema on a record accessor method is not picked up by SpringDoc; the - // "skipped" count is a computed convenience field derived from skippedFiles.size(). - @JsonProperty("skipped") - public int skipped() { return skippedFiles.size(); } - - /** Defensive-copy constructor — callers cannot mutate the stored list after construction. */ - public ImportStatus { - skippedFiles = List.copyOf(skippedFiles); - } - } - - record ProcessResult(int processed, List skippedFiles) {} - - private volatile ImportStatus currentStatus = new ImportStatus(State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - - public ImportStatus getStatus() { - return currentStatus; - } - - private final DocumentService documentService; - private final PersonService personService; - private final TagService tagService; - private final S3Client s3Client; - private final ThumbnailAsyncRunner thumbnailAsyncRunner; - - @Value("${app.s3.bucket}") - private String bucketName; - - @Value("${app.import.col.index:0}") - private int colIndex; - - @Value("${app.import.col.box:1}") - private int colBox; - - @Value("${app.import.col.folder:2}") - private int colFolder; - - @Value("${app.import.col.sender:3}") - private int colSender; - - @Value("${app.import.col.receivers:5}") - private int colReceivers; - - @Value("${app.import.col.date:7}") - private int colDate; - - @Value("${app.import.col.location:9}") - private int colLocation; - - @Value("${app.import.col.tags:10}") - private int colTags; - - @Value("${app.import.col.summary:11}") - private int colSummary; - - @Value("${app.import.col.transcription:13}") - private int colTranscription; - - @Value("${app.import.dir:/import}") - private String importDir; - - private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN); - - // ODS XML namespaces - private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; - private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; - - // We only need up to this many columns; caps repeated-empty-cell expansion - private static final int MAX_COLS = 20; - - @Async - public void runImportAsync() { - if (currentStatus.state() == State.RUNNING) { - throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress"); - } - currentStatus = new ImportStatus(State.RUNNING, "IMPORT_RUNNING", "Import läuft...", 0, List.of(), LocalDateTime.now()); - try { - File spreadsheet = findSpreadsheetFile(); - log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath()); - ProcessResult result = processRows(readSpreadsheet(spreadsheet)); - currentStatus = new ImportStatus(State.DONE, "IMPORT_DONE", - "Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.", - result.processed(), result.skippedFiles(), currentStatus.startedAt()); - } catch (NoSpreadsheetException e) { - log.error("Massenimport fehlgeschlagen: keine Tabellendatei", e); - currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_NO_SPREADSHEET", - "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); - } catch (Exception e) { - log.error("Massenimport fehlgeschlagen", e); - currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_INTERNAL", - "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); - } - } - - private static class NoSpreadsheetException extends RuntimeException { - NoSpreadsheetException(String message) { super(message); } - } - - private File findSpreadsheetFile() throws IOException { - try (Stream files = Files.list(Paths.get(importDir))) { - return files - .filter(p -> { - String name = p.toString().toLowerCase(); - return name.endsWith(".ods") || name.endsWith(".xlsx") || name.endsWith(".xls"); - }) - .findFirst() - .orElseThrow(() -> new NoSpreadsheetException( - "Keine Tabellendatei (.ods/.xlsx/.xls) in " + importDir + " gefunden!")) - .toFile(); - } - } - - // --- Spreadsheet reading (format-specific, produces neutral List>) --- - - private List> readSpreadsheet(File file) throws Exception { - String name = file.getName().toLowerCase(); - if (name.endsWith(".ods")) { - return readOds(file); - } - return readXlsx(file); - } - - /** - * Reads an ODS file by parsing its content.xml directly (no extra library needed). - * ODS is a ZIP archive; content.xml holds the spreadsheet data as XML. - */ - List> readOds(File file) throws Exception { - List> result = new ArrayList<>(); - - try (ZipFile zip = new ZipFile(file)) { - var entry = zip.getEntry("content.xml"); - if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt"); - - var factory = XxeSafeXmlParser.hardenedFactory(); - factory.setNamespaceAware(true); - var builder = factory.newDocumentBuilder(); - var doc = builder.parse(zip.getInputStream(entry)); - - NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table"); - if (tables.getLength() == 0) return result; - - var table = (Element) tables.item(0); - NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row"); - - for (int i = 0; i < rows.getLength(); i++) { - var row = (Element) rows.item(i); - List rowData = new ArrayList<>(); - NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell"); - - for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) { - var cell = (Element) cells.item(j); - - // Read the display text (first ) - String value = ""; - NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p"); - if (textNodes.getLength() > 0) { - value = textNodes.item(0).getTextContent().trim(); - } - - // Expand number-columns-repeated (capped at MAX_COLS) - String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated"); - int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr); - repeat = Math.min(repeat, MAX_COLS - rowData.size()); - - for (int r = 0; r < repeat; r++) { - rowData.add(value); - } - } - result.add(rowData); - } - } - return result; - } - - /** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */ - private List> readXlsx(File file) throws Exception { - List> result = new ArrayList<>(); - try (FileInputStream fis = new FileInputStream(file); - Workbook workbook = WorkbookFactory.create(fis)) { - - Sheet sheet = workbook.getSheetAt(0); - for (int i = 0; i <= sheet.getLastRowNum(); i++) { - Row row = sheet.getRow(i); - List rowData = new ArrayList<>(); - if (row != null) { - for (int j = 0; j < MAX_COLS; j++) { - rowData.add(xlsxCellToString(row.getCell(j))); - } - } - result.add(rowData); - } - } - return result; - } - - private String xlsxCellToString(Cell cell) { - if (cell == null) return ""; - return switch (cell.getCellType()) { - case STRING -> cell.getStringCellValue(); - case NUMERIC -> { - if (DateUtil.isCellDateFormatted(cell)) { - yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO - } - yield String.valueOf((int) cell.getNumericCellValue()); - } - case BOOLEAN -> String.valueOf(cell.getBooleanCellValue()); - default -> ""; - }; - } - - // --- Import logic (works on neutral List rows) --- - - private ProcessResult processRows(List> rows) { - int processed = 0; - List skippedFiles = new ArrayList<>(); - - for (int i = 1; i < rows.size(); i++) { // skip header row - List cells = rows.get(i); - String index = getCell(cells, colIndex); - if (index.isBlank()) continue; - - String filename = index.contains(".") ? index : index + ".pdf"; - if (!isValidImportFilename(filename)) { - log.warn("Skipping import row {}: filename rejected — {}", i, filename); - skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_FILENAME_PATH_TRAVERSAL)); - continue; - } - Optional fileOnDisk = findFileRecursive(filename); - if (fileOnDisk.isEmpty()) { - log.warn("Datei nicht gefunden, importiere nur Metadaten: {}", filename); - } - - if (fileOnDisk.isPresent()) { - try { - if (!isPdfMagicBytes(fileOnDisk.get())) { - log.warn("Überspringe {}: Datei beginnt nicht mit %PDF-Signatur", filename); - skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_PDF_SIGNATURE)); - continue; - } - } catch (IOException e) { - log.error("Fehler beim Prüfen der Magic-Bytes für {}", filename, e); - skippedFiles.add(new SkippedFile(filename, SkipReason.FILE_READ_ERROR)); - continue; - } - } - - Optional skipReason = importSingleDocument(cells, fileOnDisk, filename, index); - if (skipReason.isPresent()) { - skippedFiles.add(new SkippedFile(filename, skipReason.get())); - } else { - processed++; - } - } - return new ProcessResult(processed, skippedFiles); - } - - private boolean isValidImportFilename(String filename) { - if (filename == null || filename.isBlank()) return false; - if (filename.contains("/")) return false; - if (filename.contains("\\")) return false; - if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH - if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS - if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR - if (filename.contains("..")) return false; - if (filename.equals(".")) return false; - if (filename.contains("\0")) return false; - // Paths.get() is safe here on Linux for all inputs that passed the checks above; - // it may throw InvalidPathException for OS-specific illegal chars on Windows, - // but those are not reachable in production. - if (Paths.get(filename).isAbsolute()) return false; - return true; - } - - // package-private: Mockito spy in tests can override to inject IOException - InputStream openFileStream(File file) throws IOException { - return new FileInputStream(file); - } - - private boolean isPdfMagicBytes(File file) throws IOException { - try (InputStream is = openFileStream(file)) { - byte[] header = is.readNBytes(4); - return header.length == 4 - && header[0] == 0x25 // % - && header[1] == 0x50 // P - && header[2] == 0x44 // D - && header[3] == 0x46; // F - } - } - - /** - * Imports a single document row. - * - * @return empty Optional on success; an Optional containing the skip reason on failure/skip. - */ - @Transactional - protected Optional importSingleDocument(List cells, Optional file, String originalFilename, String index) { - Optional existing = documentService.findByOriginalFilename(originalFilename); - if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) { - log.info("Dokument {} existiert bereits, überspringe.", originalFilename); - return Optional.of(SkipReason.ALREADY_EXISTS); - } - - String archiveBox = getCell(cells, colBox); - String archiveFolder = getCell(cells, colFolder); - String senderRaw = getCell(cells, colSender); - String receiversRaw = getCell(cells, colReceivers); - LocalDate date = parseDate(getCell(cells, colDate)); - String location = getCell(cells, colLocation); - String tagRaw = getCell(cells, colTags); - String summary = getCell(cells, colSummary); - String transcription = getCell(cells, colTranscription); - - String s3Key = null; - String contentType = null; - DocumentStatus status = DocumentStatus.PLACEHOLDER; - - if (file.isPresent()) { - try { - contentType = Files.probeContentType(file.get().toPath()); - } catch (IOException e) { - contentType = null; - } - if (contentType == null) contentType = "application/octet-stream"; - - s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName(); - try { - s3Client.putObject(PutObjectRequest.builder() - .bucket(bucketName) - .key(s3Key) - .contentType(contentType) - .build(), - RequestBody.fromFile(file.get())); - status = DocumentStatus.UPLOADED; - } catch (Exception e) { - log.error("S3 Upload Fehler für {}", file.get().getName(), e); - return Optional.of(SkipReason.S3_UPLOAD_FAILED); - } - } - - Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw); - List receivers = PersonNameParser.parseReceivers(receiversRaw).stream() - .map(this::findOrCreatePerson) - .filter(Objects::nonNull) - .toList(); - - Tag tag = null; - if (!tagRaw.isBlank()) { - tag = tagService.findOrCreate(tagRaw); - } - - Document doc = existing.orElse(Document.builder() - .originalFilename(originalFilename) - .build()); - - // Heuristic: mark as complete if at least one key field is present in the spreadsheet row - boolean metadataComplete = date != null || !senderRaw.isBlank() || !receiversRaw.isBlank(); - - doc.setTitle(buildTitle(index, date, location)); - doc.setFilePath(s3Key); - doc.setContentType(contentType); - doc.setStatus(status); - doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox); - doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder); - doc.setDocumentDate(date); - doc.setLocation(location.isBlank() ? null : location); - doc.setSummary(summary.isBlank() ? null : summary); - doc.setTranscription(transcription.isBlank() ? null : transcription); - doc.setSender(sender); - doc.getReceivers().addAll(receivers); - if (tag != null) doc.getTags().add(tag); - doc.setMetadataComplete(metadataComplete); - - Document saved = documentService.save(doc); - if (file.isPresent()) { - thumbnailAsyncRunner.dispatchAfterCommit(saved.getId()); - } - log.info("Importiert{}: {}", file.isEmpty() ? " (nur Metadaten)" : "", originalFilename); - return Optional.empty(); - } - - // --- Helpers --- - - private String getCell(List cells, int col) { - if (col >= cells.size()) return ""; - String val = cells.get(col); - return val == null ? "" : val.trim(); - } - - private LocalDate parseDate(String value) { - if (value == null || value.isBlank()) return null; - try { - return LocalDate.parse(value.trim()); - } catch (DateTimeParseException e) { - return null; - } - } - - private String buildTitle(String index, LocalDate date, String location) { - StringBuilder sb = new StringBuilder(index); - if (date != null) { - sb.append(" \u2013 ").append(date.format(GERMAN_DATE)); - } - if (location != null && !location.isBlank()) { - sb.append(" \u2013 ").append(location); - } - return sb.toString(); - } - - private Person findOrCreatePerson(String rawName) { - return personService.findOrCreateByAlias(rawName); - } - - private Optional findFileRecursive(String filename) { - File baseDir = new File(importDir); - try (Stream walk = Files.walk(baseDir.toPath())) { - Optional match = walk.filter(p -> !Files.isDirectory(p)) - .filter(p -> p.getFileName().toString().equals(filename)) - .findFirst(); - if (match.isEmpty()) return Optional.empty(); - File candidate = match.get().toFile(); - String baseDirCanonical = baseDir.getCanonicalPath(); - if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) { - throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate); - } - return Optional.of(candidate); - } catch (IOException e) { - return Optional.empty(); - } - } -} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonRegisterImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonRegisterImporter.java new file mode 100644 index 00000000..edad55d2 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonRegisterImporter.java @@ -0,0 +1,69 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonType; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.time.LocalDate; +import java.time.format.DateTimeParseException; +import java.util.List; + +/** + * Loads {@code canonical-persons.xlsx} (the register) into the person domain via + * {@link PersonService}, upserting each person by the normalizer {@code person_id} + * (source_ref). Register persons are confident identities, so {@code provisional} is + * driven by the sheet's already-clean value (normally {@code False}). + */ +@Component +@RequiredArgsConstructor +@Slf4j +public class PersonRegisterImporter { + + static final List REQUIRED_HEADERS = List.of("person_id", "last_name", "first_name", "provisional"); + + private final PersonService personService; + + public int load(File artifact) { + List rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS); + int processed = 0; + for (CanonicalSheetReader.Row row : rows) { + String personId = row.get("person_id"); + if (personId.isBlank()) continue; + personService.upsertBySourceRef(toCommand(row, personId)); + processed++; + } + log.info("Imported {} register persons from {}", processed, artifact.getName()); + return processed; + } + + private PersonUpsertCommand toCommand(CanonicalSheetReader.Row row, String personId) { + return PersonUpsertCommand.builder() + .sourceRef(personId) + .lastName(blankToNull(row.get("last_name"))) + .firstName(blankToNull(row.get("first_name"))) + .maidenName(blankToNull(row.get("maiden_name"))) + .notes(blankToNull(row.get("notes"))) + .birthYear(yearOf(row.get("birth_date"))) + .deathYear(yearOf(row.get("death_date"))) + .personType(PersonType.PERSON) + .provisional(Boolean.parseBoolean(row.get("provisional"))) + .build(); + } + + private static Integer yearOf(String isoDate) { + if (isoDate == null || isoDate.isBlank()) return null; + try { + return LocalDate.parse(isoDate.trim()).getYear(); + } catch (DateTimeParseException e) { + return null; + } + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonTreeImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonTreeImporter.java new file mode 100644 index 00000000..26ae0dcd --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/PersonTreeImporter.java @@ -0,0 +1,135 @@ +package org.raddatz.familienarchiv.importing; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonType; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.person.relationship.RelationType; +import org.raddatz.familienarchiv.person.relationship.RelationshipService; +import org.raddatz.familienarchiv.person.relationship.dto.CreateRelationshipRequest; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +/** + * Loads {@code canonical-persons-tree.json} into the person + relationship domains. + * Tree persons are upserted via {@link PersonService} keyed on the shared + * {@code personId} slug (which Phase 1 #670 now emits into the tree), so they reconcile + * with the register rather than duplicating it. Relationships reference persons by the + * tree's local {@code rowId}; each side is mapped to the upserted person's UUID and + * created through {@link RelationshipService} (never the relationship repository — + * layering rule). A duplicate relationship on re-import is swallowed for idempotency. + */ +@Component +@RequiredArgsConstructor +@Slf4j +public class PersonTreeImporter { + + // The tree JSON is a local implementation detail, not a shared API payload, so the + // importer owns its own mapper rather than depending on the web ObjectMapper bean. + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private final PersonService personService; + private final RelationshipService relationshipService; + + public int load(File artifact) { + JsonNode root = readTree(artifact); + Map idByRowId = upsertPersons(root.path("persons")); + int relationships = createRelationships(root.path("relationships"), idByRowId); + log.info("Imported {} tree persons and {} relationships from {}", + idByRowId.size(), relationships, artifact.getName()); + return idByRowId.size(); + } + + private JsonNode readTree(File artifact) { + try { + return OBJECT_MAPPER.readTree(artifact); + } catch (Exception e) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Unreadable canonical artifact: " + artifact.getName()); + } + } + + private Map upsertPersons(JsonNode persons) { + Map idByRowId = new HashMap<>(); + for (JsonNode node : persons) { + String personId = text(node, "personId"); + if (personId.isBlank()) continue; + Person person = personService.upsertBySourceRef(toCommand(node, personId)); + idByRowId.put(text(node, "rowId"), person.getId()); + } + return idByRowId; + } + + private PersonUpsertCommand toCommand(JsonNode node, String personId) { + return PersonUpsertCommand.builder() + .sourceRef(personId) + .lastName(blankToNull(text(node, "lastName"))) + .firstName(blankToNull(text(node, "firstName"))) + .maidenName(blankToNull(text(node, "maidenName"))) + .notes(blankToNull(text(node, "notes"))) + .birthYear(intOrNull(node, "birthYear")) + .deathYear(intOrNull(node, "deathYear")) + .familyMember(node.path("familyMember").asBoolean(false)) + .personType(PersonType.PERSON) + .provisional(false) + .build(); + } + + private int createRelationships(JsonNode relationships, Map idByRowId) { + int created = 0; + for (JsonNode node : relationships) { + // Trap: a relationship node's personId / relatedPersonId fields carry the tree's + // local rowId (e.g. "row_a"), NOT a person slug. They are resolved through + // idByRowId to the upserted person's UUID. + UUID person = idByRowId.get(text(node, "personId")); + UUID related = idByRowId.get(text(node, "relatedPersonId")); + if (person == null || related == null) { + log.warn("Skipping tree relationship with unresolved rowId: {} -> {}", + text(node, "personId"), text(node, "relatedPersonId")); + continue; + } + if (addRelationshipIdempotently(person, related, text(node, "type"))) { + created++; + } + } + return created; + } + + private boolean addRelationshipIdempotently(UUID person, UUID related, String type) { + try { + relationshipService.addRelationship(person, + new CreateRelationshipRequest(related, RelationType.valueOf(type), null, null, null)); + return true; + } catch (DomainException e) { + if (e.getCode() == ErrorCode.DUPLICATE_RELATIONSHIP + || e.getCode() == ErrorCode.CIRCULAR_RELATIONSHIP) { + return false; + } + throw e; + } + } + + private static String text(JsonNode node, String field) { + JsonNode value = node.get(field); + return value == null || value.isNull() ? "" : value.asText(); + } + + private static Integer intOrNull(JsonNode node, String field) { + JsonNode value = node.get(field); + return value == null || value.isNull() ? null : value.asInt(); + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/TagTreeImporter.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/TagTreeImporter.java new file mode 100644 index 00000000..a871ab32 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/TagTreeImporter.java @@ -0,0 +1,54 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +/** + * Loads {@code canonical-tag-tree.xlsx} into the tag domain via {@link TagService}, + * upserting each tag by its canonical {@code tag_path} (the source_ref). Parent links are + * resolved by the parent's path, which is the child path with its last {@code /segment} + * stripped. Rows are emitted parents-first by the normalizer, so a parent is always + * resolved before any child references it. + */ +@Component +@RequiredArgsConstructor +@Slf4j +public class TagTreeImporter { + + static final List REQUIRED_HEADERS = List.of("tag_path", "parent_name", "tag_name"); + private static final String PATH_SEPARATOR = "/"; + + private final TagService tagService; + + public int load(File artifact) { + List rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS); + Map idByPath = new HashMap<>(); + int processed = 0; + for (CanonicalSheetReader.Row row : rows) { + String path = row.get("tag_path"); + if (path.isBlank()) continue; + UUID parentId = resolveParentId(path, idByPath); + Tag tag = tagService.upsertBySourceRef(path, row.get("tag_name"), parentId); + idByPath.put(path, tag.getId()); + processed++; + } + log.info("Imported {} tags from {}", processed, artifact.getName()); + return processed; + } + + private UUID resolveParentId(String path, Map idByPath) { + int lastSeparator = path.lastIndexOf(PATH_SEPARATOR); + if (lastSeparator < 0) return null; + String parentPath = path.substring(0, lastSeparator); + return idByPath.get(parentPath); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java deleted file mode 100644 index 949ea054..00000000 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java +++ /dev/null @@ -1,20 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -class XxeSafeXmlParser { - - private XxeSafeXmlParser() {} - - static DocumentBuilderFactory hardenedFactory() throws ParserConfigurationException { - var factory = DocumentBuilderFactory.newInstance(); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setXIncludeAware(false); - factory.setExpandEntityReferences(false); - return factory; - } -} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java index 1beebcc3..940e34a2 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonRepository.java @@ -32,6 +32,9 @@ public interface PersonRepository extends JpaRepository { // Lookup by full alias string, used during ODS mass import Optional findByAliasIgnoreCase(String alias); + // Lookup by the normalizer person_id, used for idempotent canonical re-import (Phase 3). + Optional findBySourceRef(String sourceRef); + // Exact first+last name match, used for filename-based sender lookup Optional findByFirstNameIgnoreCaseAndLastNameIgnoreCase(String firstName, String lastName); diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java index 89b11ef3..d2e894bf 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java @@ -80,6 +80,11 @@ public class PersonService { return personRepository.findByFirstNameIgnoreCaseAndLastNameIgnoreCase(firstName, lastName); } + /** Lookup by the normalizer person_id — used by the canonical importer for register-first matching. */ + public Optional findBySourceRef(String sourceRef) { + return personRepository.findBySourceRef(sourceRef); + } + @Nullable @Transactional public Person findOrCreateByAlias(String rawName) { @@ -115,6 +120,80 @@ public class PersonService { }); } + /** + * Idempotent upsert keyed on {@code sourceRef} (the normalizer person_id) for the + * canonical importer (Phase 3, ADR-025). On first import the canonical fields are + * written verbatim. On re-import the human-edit-preserve precedence applies: + * a non-blank existing field is never overwritten, and {@code provisional} never + * flips back to true once a human has confirmed the person. + */ + @Transactional + public Person upsertBySourceRef(PersonUpsertCommand cmd) { + return personRepository.findBySourceRef(cmd.sourceRef()) + .map(existing -> personRepository.save(mergeCanonical(existing, cmd))) + .orElseGet(() -> fromCanonical(cmd)); + } + + private Person fromCanonical(PersonUpsertCommand cmd) { + Person person = personRepository.save(Person.builder() + .sourceRef(cmd.sourceRef()) + .firstName(blankToNull(cmd.firstName())) + .lastName(cmd.lastName()) + .notes(blankToNull(cmd.notes())) + .birthYear(cmd.birthYear()) + .deathYear(cmd.deathYear()) + .familyMember(cmd.familyMember()) + .personType(cmd.personType() == null ? PersonType.PERSON : cmd.personType()) + .provisional(cmd.provisional()) + .build()); + String maiden = blankToNull(cmd.maidenName()); + if (maiden != null) { + int nextSortOrder = aliasRepository.findMaxSortOrder(person.getId()) + 1; + aliasRepository.save(PersonNameAlias.builder() + .person(person) + .lastName(maiden) + .type(PersonNameAliasType.MAIDEN_NAME) + .sortOrder(nextSortOrder) + .build()); + } + return person; + } + + private Person mergeCanonical(Person existing, PersonUpsertCommand cmd) { + existing.setFirstName(preferHuman(existing.getFirstName(), cmd.firstName())); + existing.setLastName(preferHuman(existing.getLastName(), cmd.lastName())); + existing.setNotes(preferHuman(existing.getNotes(), cmd.notes())); + existing.setBirthYear(preferHuman(existing.getBirthYear(), cmd.birthYear())); + existing.setDeathYear(preferHuman(existing.getDeathYear(), cmd.deathYear())); + if (cmd.personType() != null && existing.getPersonType() == PersonType.PERSON) { + existing.setPersonType(cmd.personType()); + } + // provisional is monotonic-downward: once it is false it never reverts to true. + // This also pins the cross-loader precedence (ADR-025): a register/tree person is + // loaded before documents and already false, so a later document row that references + // the same source_ref (provisional=true) can never flip it provisional — the guard + // below only fires while existing is still provisional. Order of document rows is + // therefore irrelevant. + if (existing.isProvisional()) { + existing.setProvisional(cmd.provisional()); + } + return existing; + } + + // preferHuman keeps an existing human-entered value and only falls back to the canonical + // value when the existing one is absent — the single idiom for every fill-blank field. + private static String preferHuman(String existing, String canonical) { + return (existing == null || existing.isBlank()) ? blankToNull(canonical) : existing; + } + + private static Integer preferHuman(Integer existing, Integer canonical) { + return existing != null ? existing : canonical; + } + + private static String blankToNull(String s) { + return (s == null || s.isBlank()) ? null : s.trim(); + } + @Transactional public Person createPerson(String firstName, String lastName, String alias) { Person person = Person.builder() diff --git a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonUpsertCommand.java b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonUpsertCommand.java new file mode 100644 index 00000000..63864ab6 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonUpsertCommand.java @@ -0,0 +1,24 @@ +package org.raddatz.familienarchiv.person; + +import lombok.Builder; + +/** + * Importer → {@link PersonService} command for an idempotent upsert keyed on + * {@code sourceRef} (the normalizer's stable person_id). Carries only the canonical + * fields the importer owns; the service applies the human-edit-preserve precedence + * (see ADR-025): non-blank existing fields are never overwritten, and {@code provisional} + * never flips back to true once a human has confirmed a person. + */ +@Builder +public record PersonUpsertCommand( + String sourceRef, + String firstName, + String lastName, + String maidenName, + String notes, + Integer birthYear, + Integer deathYear, + boolean familyMember, + PersonType personType, + boolean provisional +) {} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java index 4a7fab90..f1b3b7ab 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagRepository.java @@ -22,6 +22,9 @@ public interface TagRepository extends JpaRepository { Optional findByNameIgnoreCase(String name); + // Lookup by the canonical tag_path, used for idempotent canonical re-import (Phase 3). + Optional findBySourceRef(String sourceRef); + List findByNameContainingIgnoreCase(String name); /** diff --git a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java index a572f84f..14e1e9fa 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java @@ -7,6 +7,7 @@ import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.UUID; import java.util.stream.Collectors; @@ -49,12 +50,37 @@ public class TagService { .orElseThrow(() -> DomainException.notFound(ErrorCode.TAG_NOT_FOUND, "Tag not found: " + id)); } + /** Lookup by the canonical tag_path — used by the canonical importer to attach a document's tag. */ + public Optional findBySourceRef(String sourceRef) { + return tagRepository.findBySourceRef(sourceRef); + } + public Tag findOrCreate(String name) { String cleanName = name.trim(); return tagRepository.findByNameIgnoreCase(cleanName) .orElseGet(() -> tagRepository.save(Tag.builder().name(cleanName).build())); } + /** + * Idempotent upsert keyed on {@code sourceRef} (the canonical tag_path) for the + * Phase-3 importer (ADR-025). On first import the canonical name and parent are + * written; on re-import a human-renamed tag name is preserved (the source_ref is the + * stable identity, the name is a human-editable label). + */ + @Transactional + public Tag upsertBySourceRef(String sourceRef, String name, UUID parentId) { + return tagRepository.findBySourceRef(sourceRef) + .map(existing -> { + existing.setParentId(parentId); + return tagRepository.save(existing); + }) + .orElseGet(() -> tagRepository.save(Tag.builder() + .sourceRef(sourceRef) + .name(name) + .parentId(parentId) + .build())); + } + @Transactional public Tag update(UUID id, TagUpdateDTO dto) { Tag tag = getById(id); diff --git a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java index 18b6c2c0..74b5d643 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java @@ -5,7 +5,8 @@ import org.raddatz.familienarchiv.security.Permission; import org.raddatz.familienarchiv.security.RequirePermission; import org.raddatz.familienarchiv.document.DocumentService; import org.raddatz.familienarchiv.document.DocumentVersionService; -import org.raddatz.familienarchiv.importing.MassImportService; +import org.raddatz.familienarchiv.importing.CanonicalImportOrchestrator; +import org.raddatz.familienarchiv.importing.ImportStatus; import org.raddatz.familienarchiv.document.ThumbnailBackfillService; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.GetMapping; @@ -21,20 +22,20 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class AdminController { - private final MassImportService massImportService; + private final CanonicalImportOrchestrator importOrchestrator; private final DocumentService documentService; private final DocumentVersionService documentVersionService; private final ThumbnailBackfillService thumbnailBackfillService; @PostMapping("/trigger-import") - public ResponseEntity triggerMassImport() { - massImportService.runImportAsync(); - return ResponseEntity.accepted().body(massImportService.getStatus()); + public ResponseEntity triggerMassImport() { + importOrchestrator.runImportAsync(); + return ResponseEntity.accepted().body(importOrchestrator.getStatus()); } @GetMapping("/import-status") - public ResponseEntity importStatus() { - return ResponseEntity.ok(massImportService.getStatus()); + public ResponseEntity importStatus() { + return ResponseEntity.ok(importOrchestrator.getStatus()); } @PostMapping("/backfill-versions") diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index e74f4d41..1e4558e0 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -125,17 +125,10 @@ app: password: ${APP_ADMIN_PASSWORD:admin123} import: - col: - index: 0 - box: 1 - folder: 2 - sender: 3 - receivers: 5 - date: 7 - location: 9 - tags: 10 - summary: 11 - transcription: 13 + # Directory holding the normalizer's committed canonical artifacts + # (canonical-{documents,persons,tag-tree}.xlsx + canonical-persons-tree.json). + # The loader maps columns by header name — no positional indices (see ADR-025). + dir: ${IMPORT_DIR:/import} ocr: sender-model: diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java new file mode 100644 index 00000000..bd2b8d8f --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportIntegrationTest.java @@ -0,0 +1,229 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.PostgresContainerConfig; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentRepository; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonRepository; +import org.raddatz.familienarchiv.tag.TagRepository; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.test.context.bean.override.mockito.MockitoBean; +import org.springframework.test.util.ReflectionTestUtils; +import software.amazon.awssdk.services.s3.S3Client; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Real Postgres (Testcontainers) integration test for the canonical importer. The + * {@code UNIQUE(source_ref)} constraint and the upsert-on-conflict behaviour only exist + * in real Postgres (never H2), so idempotency is verified here. S3 is mocked — the + * synthetic document rows carry no on-disk files, so every document is a PLACEHOLDER and + * no upload is attempted. + */ +@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) +@ActiveProfiles("test") +@Import(PostgresContainerConfig.class) +class CanonicalImportIntegrationTest { + + @MockitoBean S3Client s3Client; + + @Autowired CanonicalImportOrchestrator orchestrator; + @Autowired PersonRepository personRepository; + @Autowired TagRepository tagRepository; + @Autowired DocumentRepository documentRepository; + + Path artifactDir; + + @BeforeEach + void setUp() throws Exception { + documentRepository.deleteAll(); + personRepository.deleteAll(); + tagRepository.deleteAll(); + artifactDir = Files.createTempDirectory("canonical-import-it"); + writeArtifacts(artifactDir); + ReflectionTestUtils.setField(orchestrator, "canonicalDir", artifactDir.toString()); + } + + /** + * The import commits through its own transactions (the orchestrator is not transactional), + * so this test cannot rely on {@code @Transactional} rollback for isolation. Delete the + * committed rows after each test — otherwise the last test's documents (dated 1888-02) and + * persons/tags leak into the shared Testcontainers Postgres and pollute other integration + * tests that assume a known seed (e.g. DocumentDensityIntegrationTest, + * DocumentSearchPagedIntegrationTest). Mirrors the @AfterEach deleteAll convention used by + * DocumentListItemIntegrationTest. + */ + @AfterEach + void cleanup() { + documentRepository.deleteAll(); + personRepository.deleteAll(); + tagRepository.deleteAll(); + } + + @Test + void reimport_isIdempotent_noDuplicatePersonsTagsOrDocuments() { + orchestrator.runImport(); + long personsAfterFirst = personRepository.count(); + long tagsAfterFirst = tagRepository.count(); + long documentsAfterFirst = documentRepository.count(); + assertThat(orchestrator.getStatus().state()).isEqualTo(ImportStatus.State.DONE); + assertThat(personsAfterFirst).isPositive(); + assertThat(tagsAfterFirst).isPositive(); + assertThat(documentsAfterFirst).isPositive(); + + orchestrator.runImport(); + + assertThat(personRepository.count()).isEqualTo(personsAfterFirst); + assertThat(tagRepository.count()).isEqualTo(tagsAfterFirst); + assertThat(documentRepository.count()).isEqualTo(documentsAfterFirst); + } + + @Test + void reimport_preservesHumanEditedPersonField() { + orchestrator.runImport(); + Person walter = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + walter.setNotes("Verified by archivist"); + walter.setFirstName("Walther"); + personRepository.save(walter); + + orchestrator.runImport(); + + Person reimported = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + assertThat(reimported.getNotes()).isEqualTo("Verified by archivist"); + assertThat(reimported.getFirstName()).isEqualTo("Walther"); + } + + @Test + void import_linksDocumentSenderToRegisterPerson_andRetainsRawText() { + orchestrator.runImport(); + + Person walter = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + Document doc = documentRepository.findByOriginalFilename("W-0001").orElseThrow(); + assertThat(doc.getSender()).isNotNull(); + assertThat(doc.getSender().getId()).isEqualTo(walter.getId()); + assertThat(doc.getSenderText()).isEqualTo("Walter de Gruyter"); + assertThat(doc.getStatus()).isEqualTo(DocumentStatus.PLACEHOLDER); + } + + @Test + void import_provisionalFlag_trueForImporterCreated_falseForRegister() { + orchestrator.runImport(); + + Optional register = personRepository.findBySourceRef("de-gruyter-walter"); + assertThat(register).get().extracting(Person::isProvisional).isEqualTo(false); + } + + @Test + void reimport_prunesRemovedReceiverAndTag_whenCanonicalRowShrinks() throws Exception { + orchestrator.runImport(); + // findById uses the Document.full entity graph so receivers/tags initialise eagerly. + Document before = documentRepository.findById( + documentRepository.findByOriginalFilename("W-0001").orElseThrow().getId()).orElseThrow(); + assertThat(before.getReceivers()).isNotEmpty(); + assertThat(before.getTags()).isNotEmpty(); + + // Re-stage the document sheet with W-0001's receiver and tag removed. + writeSheet(artifactDir.resolve("canonical-documents.xlsx"), + List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids", + "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"), + List.of( + List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + "", "", "1888-02-15", "15.2.1888", "DAY", "", "Rotterdam", "", "Geschäftsreise"), + List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter", + "de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "", + "Middelburg", "Themen/Brautbriefe", "Reisepläne"))); + + orchestrator.runImport(); + + Document after = documentRepository.findById(before.getId()).orElseThrow(); + assertThat(after.getReceivers()).isEmpty(); + assertThat(after.getTags()).isEmpty(); + } + + @Test + void import_neverFlipsRegisterPersonToProvisional_whenReferencedByDocumentRow() { + // de-gruyter-walter is a register person (provisional=false) AND the sender of W-0001. + // The orchestrator loads the register before documents, so the document loader's + // register-first match links the existing person and never mints a provisional one. + // A second run (documents reference the same person again) must not flip it true. + orchestrator.runImport(); + orchestrator.runImport(); + + Person walter = personRepository.findBySourceRef("de-gruyter-walter").orElseThrow(); + assertThat(walter.isProvisional()).isFalse(); + Person eugenie = personRepository.findBySourceRef("de-gruyter-eugenie").orElseThrow(); + assertThat(eugenie.isProvisional()).isFalse(); + } + + // ─── synthetic-but-real artifact set ───────────────────────────────────────────── + + private void writeArtifacts(Path dir) throws Exception { + writeSheet(dir.resolve("canonical-tag-tree.xlsx"), + List.of("tag_path", "parent_name", "tag_name"), + List.of( + List.of("Themen", "", "Themen"), + List.of("Themen/Brautbriefe", "Themen", "Brautbriefe"))); + + writeSheet(dir.resolve("canonical-persons.xlsx"), + List.of("person_id", "last_name", "first_name", "maiden_name", "notes", "birth_date", "death_date", "provisional"), + List.of( + List.of("de-gruyter-walter", "de Gruyter", "Walter", "", "", "1865-01-01", "", "False"), + List.of("de-gruyter-eugenie", "de Gruyter", "Eugenie", "Wöhler", "", "", "", "False"))); + + Files.writeString(dir.resolve("canonical-persons-tree.json"), """ + {"persons":[ + {"rowId":"row_1","firstName":"Walter","lastName":"de Gruyter","familyMember":true,"personId":"de-gruyter-walter"}, + {"rowId":"row_2","firstName":"Eugenie","lastName":"de Gruyter","maidenName":"Wöhler","familyMember":true,"personId":"de-gruyter-eugenie"} + ],"relationships":[ + {"personId":"row_1","relatedPersonId":"row_2","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + writeSheet(dir.resolve("canonical-documents.xlsx"), + List.of("index", "file", "sender_person_id", "sender_name", "receiver_person_ids", + "receiver_names", "date_iso", "date_raw", "date_precision", "date_end", "location", "tags", "summary"), + List.of( + List.of("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + "de-gruyter-eugenie", "Eugenie de Gruyter", "1888-02-15", "15.2.1888", "DAY", "", + "Rotterdam", "Themen/Brautbriefe", "Geschäftsreise"), + List.of("W-0002", "", "de-gruyter-eugenie", "Eugenie de Gruyter", + "de-gruyter-walter", "Walter de Gruyter", "1888-02-16", "16.2.1888", "DAY", "", + "Middelburg", "Themen/Brautbriefe", "Reisepläne"))); + } + + private void writeSheet(Path file, List headers, List> rows) throws Exception { + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.size(); r++) { + Row row = sheet.createRow(r + 1); + List values = rows.get(r); + for (int c = 0; c < values.size(); c++) { + row.createCell(c).setCellValue(values.get(c)); + } + } + try (OutputStream out = Files.newOutputStream(file)) { + wb.write(out); + } + } + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java new file mode 100644 index 00000000..dc12d070 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java @@ -0,0 +1,130 @@ +package org.raddatz.familienarchiv.importing; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.InOrder; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.exception.DomainException; +import org.springframework.test.util.ReflectionTestUtils; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class CanonicalImportOrchestratorTest { + + @Mock TagTreeImporter tagTreeImporter; + @Mock PersonRegisterImporter personRegisterImporter; + @Mock PersonTreeImporter personTreeImporter; + @Mock DocumentImporter documentImporter; + + private CanonicalImportOrchestrator orchestrator(Path dir) { + CanonicalImportOrchestrator o = new CanonicalImportOrchestrator( + tagTreeImporter, personRegisterImporter, personTreeImporter, documentImporter); + ReflectionTestUtils.setField(o, "canonicalDir", dir.toString()); + return o; + } + + private void writeAllArtifacts(Path dir) throws Exception { + Files.writeString(dir.resolve("canonical-tag-tree.xlsx"), "x"); + Files.writeString(dir.resolve("canonical-persons.xlsx"), "x"); + Files.writeString(dir.resolve("canonical-persons-tree.json"), "x"); + Files.writeString(dir.resolve("canonical-documents.xlsx"), "x"); + } + + @Test + void getStatus_isIdleByDefault(@TempDir Path dir) { + assertThat(orchestrator(dir).getStatus().state()).isEqualTo(ImportStatus.State.IDLE); + } + + @Test + void runImport_loadsTagsAndPersonsBeforeDocuments(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(0, List.of())); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + InOrder order = inOrder(tagTreeImporter, personRegisterImporter, personTreeImporter, documentImporter); + order.verify(tagTreeImporter).load(any()); + order.verify(personRegisterImporter).load(any()); + order.verify(personTreeImporter).load(any()); + order.verify(documentImporter).load(any()); + } + + @Test + void runImport_setsStatusDone_onSuccess(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(3, List.of())); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.DONE); + assertThat(o.getStatus().processed()).isEqualTo(3); + } + + @Test + void runImport_failsClosed_whenAnArtifactIsMissing(@TempDir Path dir) throws Exception { + Files.writeString(dir.resolve("canonical-tag-tree.xlsx"), "x"); + // the other three artifacts are absent + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.FAILED); + verify(tagTreeImporter, never()).load(any()); + verify(documentImporter, never()).load(any()); + } + + @Test + void runImport_setsStatusFailed_whenLoaderThrows(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(tagTreeImporter.load(any())).thenThrow(DomainException.badRequest( + org.raddatz.familienarchiv.exception.ErrorCode.IMPORT_ARTIFACT_INVALID, "bad")); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.FAILED); + verify(documentImporter, never()).load(any()); + } + + @Test + void runImportAsync_throwsConflict_whenAlreadyRunning(@TempDir Path dir) { + CanonicalImportOrchestrator o = orchestrator(dir); + ReflectionTestUtils.setField(o, "currentStatus", new ImportStatus( + ImportStatus.State.RUNNING, "IMPORT_RUNNING", "running", 0, List.of(), null)); + + assertThatThrownBy(o::runImportAsync) + .isInstanceOf(DomainException.class) + .hasMessageContaining("already in progress"); + } + + @Test + void runImport_aggregatesDocumentSkips(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(1, + List.of(new ImportStatus.SkippedFile("fake.pdf", ImportStatus.SkipReason.INVALID_PDF_SIGNATURE)))); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().skipped()).isEqualTo(1); + assertThat(o.getStatus().skippedFiles()) + .extracting(ImportStatus.SkippedFile::filename) + .containsExactly("fake.pdf"); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalSheetReaderTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalSheetReaderTest.java new file mode 100644 index 00000000..ee1d3650 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalSheetReaderTest.java @@ -0,0 +1,115 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.raddatz.familienarchiv.exception.DomainException; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class CanonicalSheetReaderTest { + + @Test + void readRows_mapsCellsByHeaderName(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index", "file"), List.of(List.of("W-0001", "scan.pdf"))); + + List rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file")); + + assertThat(rows).hasSize(1); + assertThat(rows.get(0).get("index")).isEqualTo("W-0001"); + assertThat(rows.get(0).get("file")).isEqualTo("scan.pdf"); + } + + @Test + void readRows_throwsBadRequest_whenRequiredHeaderMissing(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index"), List.of(List.of("W-0001"))); + + assertThatThrownBy(() -> CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file"))) + .isInstanceOf(DomainException.class) + .hasMessageContaining("file"); + } + + @Test + void get_returnsEmptyString_forBlankCell(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index", "file"), List.of(List.of("W-0001", ""))); + + List rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file")); + + assertThat(rows.get(0).get("file")).isEmpty(); + } + + @Test + void get_returnsEmptyString_forUnknownColumn(@TempDir Path tempDir) throws Exception { + Path xlsx = write(tempDir, List.of("index"), List.of(List.of("W-0001"))); + + List rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index")); + + assertThat(rows.get(0).get("does_not_exist")).isEmpty(); + } + + @Test + void get_returnsEmptyString_forTrailingColumns_whenRowShorterThanHeader(@TempDir Path tempDir) throws Exception { + // POI omits trailing empty cells, so a real-world artifact row can be narrower than + // the header. The missing columns must read as "" rather than throwing. + Path xlsx = write(tempDir, + List.of("index", "file", "summary"), + List.of(List.of("W-0001"))); + + List rows = CanonicalSheetReader.readRows(xlsx.toFile(), List.of("index", "file", "summary")); + + assertThat(rows.get(0).get("index")).isEqualTo("W-0001"); + assertThat(rows.get(0).get("file")).isEmpty(); + assertThat(rows.get(0).get("summary")).isEmpty(); + } + + @Test + void splitList_splitsOnPipe() { + assertThat(CanonicalSheetReader.splitList("a|b|c")).containsExactly("a", "b", "c"); + } + + @Test + void splitList_returnsEmptyList_forBlank() { + assertThat(CanonicalSheetReader.splitList("")).isEmpty(); + assertThat(CanonicalSheetReader.splitList(" ")).isEmpty(); + } + + @Test + void splitList_returnsSingleElement_whenNoPipe() { + assertThat(CanonicalSheetReader.splitList("solo")).containsExactly("solo"); + } + + @Test + void splitList_trimsAndDropsEmptySegments() { + assertThat(CanonicalSheetReader.splitList("a| |b")).containsExactly("a", "b"); + } + + private Path write(Path dir, List headers, List> dataRows) throws Exception { + Path xlsx = dir.resolve("sheet.xlsx"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < dataRows.size(); r++) { + Row row = sheet.createRow(r + 1); + List values = dataRows.get(r); + for (int c = 0; c < values.size(); c++) { + row.createCell(c).setCellValue(values.get(c)); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java new file mode 100644 index 00000000..f0b2263b --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/DocumentImporterTest.java @@ -0,0 +1,459 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.document.Document; +import org.raddatz.familienarchiv.document.DocumentService; +import org.raddatz.familienarchiv.document.DocumentStatus; +import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; +import org.springframework.test.util.ReflectionTestUtils; +import software.amazon.awssdk.core.sync.RequestBody; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +import java.io.File; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.LocalDate; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class DocumentImporterTest { + + @Mock DocumentService documentService; + @Mock PersonService personService; + @Mock TagService tagService; + @Mock S3Client s3Client; + @Mock ThumbnailAsyncRunner thumbnailAsyncRunner; + + DocumentImporter importer; + + @BeforeEach + void setUp() { + importer = new DocumentImporter(documentService, personService, tagService, s3Client, thumbnailAsyncRunner); + ReflectionTestUtils.setField(importer, "bucketName", "test-bucket"); + } + + // ─── security regression — ported from MassImportServiceTest — do not remove ───── + + @Test + void isValidImportFilename_returnsFalse_whenNull() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenBlank() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenForwardSlash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenBackslash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenDotDot() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenIsDotDot() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenAbsolutePath() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenNullByte() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo∕bar.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenFullwidthSlash() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo/bar.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo⧵bar.pdf")).isFalse(); + } + + @Test + void isValidImportFilename_returnsTrue_whenPlainBasename() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue(); + } + + @Test + void isValidImportFilename_returnsTrue_whenLeadingDot() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue(); + } + + @Test + void isValidImportFilename_returnsTrue_whenHasSpaces() { + assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue(); + } + + @Test + void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir( + @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { + Path outsideFile = outsideDir.resolve("secret.pdf"); + Files.writeString(outsideFile, "sensitive"); + Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile); + ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString()); + + org.assertj.core.api.Assertions.assertThatThrownBy( + () -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf")) + .isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class); + } + + // ─── path traversal in the file column cannot escape importDir ─────────────────── + + @Test + void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception { + // A file column whose basename is itself a traversal token must be rejected + // outright, never used for disk I/O. + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); + verify(documentService, never()).save(any()); + } + + @Test + void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception { + // ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to + // importDir, so no file is found, nothing is uploaded, and the row becomes a + // metadata-only PLACEHOLDER — the file outside importDir is never read. + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + } + + // ─── PDF magic-byte guard — ported — do not remove ────────────────────────────── + + @Test + void load_skipsFile_whenNotPdfMagicBytes(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf"); + lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + @Test + void load_skipsFile_whenMagicByteCheckThrowsIoException(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Files.writeString(tempDir.resolve("W-0001.pdf"), "content"); + lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + + DocumentImporter spyImporter = org.mockito.Mockito.spy(importer); + org.mockito.Mockito.doThrow(new java.io.IOException("read error")) + .when(spyImporter).openFileStream(any(File.class)); + + DocumentImporter.LoadResult result = spyImporter.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.FILE_READ_ERROR); + } + + @Test + void load_skipsAlreadyExists_whenDocumentUploadedNotPlaceholder(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0001").status(DocumentStatus.UPLOADED).build(); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", "")); + + DocumentImporter.LoadResult result = importer.load(xlsx.toFile()); + + assertThat(result.skippedFiles()) + .extracting(ImportStatus.SkippedFile::reason) + .containsExactly(ImportStatus.SkipReason.ALREADY_EXISTS); + verify(documentService, never()).save(any()); + } + + // ─── file column drives status: present → UPLOADED, empty → PLACEHOLDER ─────────── + + @Test + void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D}; + Files.write(tempDir.resolve("W-0001.pdf"), pdf); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED)); + } + + @Test + void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER)); + verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); + } + + // ─── attribution routing — register-first + always retain raw ──────────────────── + + @Test + void load_linksRegisterSender_andRetainsRawSenderText(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person walter = Person.builder().id(UUID.randomUUID()).sourceRef("de-gruyter-walter") + .firstName("Walter").lastName("de Gruyter").build(); + when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter)); + Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getSender() == walter && "Walter de Gruyter".equals(d.getSenderText()))); + } + + @Test + void load_createsProvisionalSender_whenSlugUnmatchedInRegister(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person provisional = Person.builder().id(UUID.randomUUID()).sourceRef("schwester-hanni") + .lastName("Schwester Hanni").provisional(true).build(); + when(documentService.findByOriginalFilename("W-0002")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty()); + when(personService.upsertBySourceRef(any())).thenReturn(provisional); + Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + org.mockito.ArgumentCaptor captor = + org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + assertThat(captor.getValue().provisional()).isTrue(); + assertThat(captor.getValue().lastName()).isEqualTo("Schwester Hanni"); + } + + @Test + void load_createsNoSenderPerson_whenSlugEmptyButRawPresent(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?", + "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(personService, never()).findBySourceRef(any()); + verify(personService, never()).upsertBySourceRef(any()); + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getSender() == null && "?".equals(d.getSenderText()))); + } + + @Test + void load_splitsMultipleReceivers_andRetainsRawReceiverText(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person herbert = Person.builder().id(UUID.randomUUID()).sourceRef("cram-herbert").lastName("Cram").build(); + Person clara = Person.builder().id(UUID.randomUUID()).sourceRef("clara").lastName("Clara").build(); + when(documentService.findByOriginalFilename("W-0004")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert)); + when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara)); + Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "", + "cram-herbert|clara", "Herbert Cram|Clara", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getReceivers().size() == 2 + && d.getReceivers().contains(herbert) + && d.getReceivers().contains(clara) + && "Herbert Cram|Clara".equals(d.getReceiverText()))); + } + + // ─── clean date values parse without semantic logic ────────────────────────────── + + @Test + void load_parsesCleanDateAndPrecision(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "", + "", "", "1916-06-01", "1.6.1916", "MONTH", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + LocalDate.of(1916, 6, 1).equals(d.getDocumentDate()) + && d.getMetaDatePrecision() == org.raddatz.familienarchiv.document.DatePrecision.MONTH + && "1.6.1916".equals(d.getMetaDateRaw()))); + } + + @Test + void load_attachesTagBySourceRef(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Tag tag = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe").sourceRef("Themen/Brautbriefe").build(); + when(documentService.findByOriginalFilename("W-0006")).thenReturn(Optional.empty()); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(tagService.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(tag)); + Path xlsx = writeDocs(tempDir, docRowWithTag("W-0006", "Themen/Brautbriefe")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getTags().contains(tag))); + } + + // ─── idempotency — update existing document in place by index ───────────────────── + + @Test + void load_updatesExistingDocumentInPlace_whenIndexExists(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build(); + when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing)); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getId().equals(existing.getId()))); + } + + // ─── canonical collections are authoritative — re-import prunes removed links ────── + + @Test + void load_prunesReceiversAndTags_whenCanonicalRowShrinks(@TempDir Path tempDir) throws Exception { + ReflectionTestUtils.setField(importer, "importDir", tempDir.toString()); + Person staleReceiver = Person.builder().id(UUID.randomUUID()).sourceRef("stale-receiver").lastName("Stale").build(); + Tag staleTag = Tag.builder().id(UUID.randomUUID()).name("Stale").sourceRef("Themen/Stale").build(); + Document existing = Document.builder().id(UUID.randomUUID()) + .originalFilename("W-0008").status(DocumentStatus.PLACEHOLDER).build(); + existing.getReceivers().add(staleReceiver); + existing.getTags().add(staleTag); + when(documentService.findByOriginalFilename("W-0008")).thenReturn(Optional.of(existing)); + when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); + // The canonical row now carries no receiver and no tag: both stale links must go. + Path xlsx = writeDocs(tempDir, docRow("W-0008", "", "", "", "", "", "", "", "", "")); + + importer.load(xlsx.toFile()); + + verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> + d.getReceivers().isEmpty() && d.getTags().isEmpty())); + } + + // ─── helpers ───────────────────────────────────────────────────────────────────── + + private Map docRow(String index, String file, String senderId, String senderName, + String receiverIds, String receiverNames, String dateIso, + String dateRaw, String datePrecision, String dateEnd) { + Map r = new LinkedHashMap<>(); + r.put("index", index); + r.put("file", file); + r.put("sender_person_id", senderId); + r.put("sender_name", senderName); + r.put("receiver_person_ids", receiverIds); + r.put("receiver_names", receiverNames); + r.put("date_iso", dateIso); + r.put("date_raw", dateRaw); + r.put("date_precision", datePrecision); + r.put("date_end", dateEnd); + r.put("location", ""); + r.put("tags", ""); + r.put("summary", ""); + return r; + } + + private Map docRowWithTag(String index, String tagPath) { + Map r = docRow(index, "", "", "", "", "", "", "", "", ""); + r.put("tags", tagPath); + return r; + } + + @SafeVarargs + private Path writeDocs(Path dir, Map... rows) throws Exception { + Path xlsx = dir.resolve("canonical-documents.xlsx"); + List headers = List.of("index", "file", "sender_person_id", "sender_name", + "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision", + "date_end", "location", "tags", "summary"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.length; r++) { + Row row = sheet.createRow(r + 1); + for (int c = 0; c < headers.size(); c++) { + row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), "")); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java deleted file mode 100644 index d87d28c1..00000000 --- a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java +++ /dev/null @@ -1,896 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; -import org.raddatz.familienarchiv.exception.DomainException; -import org.raddatz.familienarchiv.document.Document; -import org.raddatz.familienarchiv.document.DocumentService; -import org.raddatz.familienarchiv.document.DocumentStatus; -import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.tag.Tag; -import org.raddatz.familienarchiv.tag.TagService; -import org.raddatz.familienarchiv.person.PersonService; -import org.springframework.test.util.ReflectionTestUtils; -import software.amazon.awssdk.core.sync.RequestBody; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.PutObjectRequest; - -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.xml.sax.SAXParseException; - -import java.io.File; -import java.io.OutputStream; -import java.io.ByteArrayOutputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.UUID; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.*; - -@ExtendWith(MockitoExtension.class) -class MassImportServiceTest { - - @Mock DocumentService documentService; - @Mock PersonService personService; - @Mock TagService tagService; - @Mock S3Client s3Client; - @Mock ThumbnailAsyncRunner thumbnailAsyncRunner; - - MassImportService service; - - @BeforeEach - void setUp() { - service = new MassImportService(documentService, personService, tagService, s3Client, thumbnailAsyncRunner); - ReflectionTestUtils.setField(service, "bucketName", "test-bucket"); - ReflectionTestUtils.setField(service, "importDir", "/import"); - ReflectionTestUtils.setField(service, "colIndex", 0); - ReflectionTestUtils.setField(service, "colBox", 1); - ReflectionTestUtils.setField(service, "colFolder", 2); - ReflectionTestUtils.setField(service, "colSender", 3); - ReflectionTestUtils.setField(service, "colReceivers", 5); - ReflectionTestUtils.setField(service, "colDate", 7); - ReflectionTestUtils.setField(service, "colLocation", 9); - ReflectionTestUtils.setField(service, "colTags", 10); - ReflectionTestUtils.setField(service, "colSummary", 11); - ReflectionTestUtils.setField(service, "colTranscription", 13); - } - - // ─── getStatus ──────────────────────────────────────────────────────────── - - @Test - void getStatus_returnsIdleByDefault() { - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.IDLE); - } - - @Test - void getStatus_hasStatusCode_IMPORT_IDLE_byDefault() { - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_IDLE"); - } - - // ─── runImportAsync ─────────────────────────────────────────────────────── - - @Test - void runImportAsync_setsFailedStatus_whenImportDirectoryDoesNotExist() { - // /import directory doesn't exist in test environment → IOException → IMPORT_FAILED_INTERNAL - service.runImportAsync(); - - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.FAILED); - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_FAILED_INTERNAL"); - } - - @Test - void runImportAsync_readsFromConfiguredImportDir(@TempDir Path tempDir) { - // Empty temp dir → findSpreadsheetFile throws "no spreadsheet" with the - // configured path in the message. Proves the field, not a constant, - // drives the lookup. - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.FAILED); - assertThat(service.getStatus().message()).contains(tempDir.toString()); - } - - @Test - void runImportAsync_setsStatusCode_IMPORT_FAILED_NO_SPREADSHEET_whenDirIsEmpty(@TempDir Path tempDir) { - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_FAILED_NO_SPREADSHEET"); - } - - @Test - void runImportAsync_setsStatusCode_IMPORT_DONE_whenSpreadsheetHasNoDataRows(@TempDir Path tempDir) throws Exception { - Path xlsx = tempDir.resolve("import.xlsx"); - try (XSSFWorkbook wb = new XSSFWorkbook()) { - wb.createSheet("Sheet1"); - try (OutputStream out = Files.newOutputStream(xlsx)) { - wb.write(out); - } - } - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_DONE"); - } - - @Test - void runImportAsync_throwsConflict_whenAlreadyRunning() { - MassImportService.ImportStatus running = new MassImportService.ImportStatus( - MassImportService.State.RUNNING, "IMPORT_RUNNING", "Running...", 0, List.of(), LocalDateTime.now()); - ReflectionTestUtils.setField(service, "currentStatus", running); - - assertThatThrownBy(() -> service.runImportAsync()) - .isInstanceOf(DomainException.class) - .hasMessageContaining("already in progress"); - } - - // ─── importSingleDocument — skip already uploaded ───────────────────────── - - @Test - void importSingleDocument_skips_whenDocumentAlreadyUploadedNotPlaceholder() { - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("doc001.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.of(existing)); - - Optional result = service.importSingleDocument(minimalCells("doc001.pdf"), Optional.empty(), "doc001.pdf", "doc001"); - - verify(documentService, never()).save(any()); - assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS); - } - - // ─── importSingleDocument — already-exists guard fires before file I/O ───── - - @Test - void importSingleDocument_skipsWithAlreadyExists_whenDocumentUploadedAndFileIsPresent(@TempDir Path tempDir) throws Exception { - // Document already exists with status UPLOADED (not PLACEHOLDER). - // A physical PDF file is also present on disk (valid magic bytes). - // Expected: ALREADY_EXISTS is returned and no S3 upload is attempted — - // the guard fires before any file I/O, so no partial processing occurs. - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("present.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("present.pdf")).thenReturn(Optional.of(existing)); - - Path physicalFile = tempDir.resolve("present.pdf"); - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(physicalFile, pdfHeader); - - Optional result = service.importSingleDocument( - minimalCells("present.pdf"), Optional.of(physicalFile.toFile()), "present.pdf", "present"); - - assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS); - verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService, never()).save(any()); - } - - // ─── importSingleDocument — S3 failure surfaced in skippedFiles ────────── - - @Test - void runImportAsync_addsS3UploadFailed_toSkippedFiles_whenS3Throws(@TempDir Path tempDir) throws Exception { - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(tempDir.resolve("upload_fail.pdf"), pdfHeader); - buildMinimalImportXlsx(tempDir, "upload_fail.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename("upload_fail.pdf")).thenReturn(Optional.empty()); - doThrow(new RuntimeException("S3 unavailable")) - .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::filename, MassImportService.SkippedFile::reason) - .containsExactly(org.assertj.core.groups.Tuple.tuple("upload_fail.pdf", MassImportService.SkipReason.S3_UPLOAD_FAILED)); - } - - @Test - void runImportAsync_addsAlreadyExists_toSkippedFiles_whenDocumentAlreadyUploaded(@TempDir Path tempDir) throws Exception { - buildMinimalImportXlsx(tempDir, "existing.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("existing.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("existing.pdf")).thenReturn(Optional.of(existing)); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.ALREADY_EXISTS); - } - - // ─── importSingleDocument — create new document (metadata only) ─────────── - - @Test - void importSingleDocument_createsNewDocument_whenNotExists() { - when(documentService.findByOriginalFilename("doc002.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument(minimalCells("doc002.pdf"), Optional.empty(), "doc002.pdf", "doc002"); - - verify(documentService).save(argThat(d -> - d.getOriginalFilename().equals("doc002.pdf") - && d.getStatus() == DocumentStatus.PLACEHOLDER)); - } - - // ─── importSingleDocument — update existing placeholder ────────────────── - - @Test - void importSingleDocument_updatesExistingPlaceholder() { - Document placeholder = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("existing.pdf") - .status(DocumentStatus.PLACEHOLDER) - .build(); - when(documentService.findByOriginalFilename("existing.pdf")).thenReturn(Optional.of(placeholder)); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument(minimalCells("existing.pdf"), Optional.empty(), "existing.pdf", "existing"); - - verify(documentService).save(same(placeholder)); - } - - // ─── importSingleDocument — with file (S3 upload) ───────────────────────── - - @Test - void importSingleDocument_uploadsFileToS3_andSetsStatusUploaded(@TempDir Path tempDir) throws Exception { - Path tempFile = tempDir.resolve("doc003.pdf"); - Files.write(tempFile, "PDF content".getBytes()); - - when(documentService.findByOriginalFilename("doc003.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument( - minimalCells("doc003.pdf"), Optional.of(tempFile.toFile()), "doc003.pdf", "doc003"); - - verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService).save(argThat(d -> d.getStatus() == DocumentStatus.UPLOADED)); - } - - @Test - void importSingleDocument_returnsS3UploadFailed_whenS3UploadFails(@TempDir Path tempDir) throws Exception { - Path tempFile = tempDir.resolve("fail.pdf"); - Files.write(tempFile, "data".getBytes()); - - when(documentService.findByOriginalFilename("fail.pdf")).thenReturn(Optional.empty()); - doThrow(new RuntimeException("S3 error")) - .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - - Optional result = service.importSingleDocument( - minimalCells("fail.pdf"), Optional.of(tempFile.toFile()), "fail.pdf", "fail"); - - verify(documentService, never()).save(any()); - assertThat(result).isPresent().contains(MassImportService.SkipReason.S3_UPLOAD_FAILED); - } - - // ─── importSingleDocument — sender handling ─────────────────────────────── - - @Test - void importSingleDocument_setsNullSender_whenSenderCellIsBlank() { - when(documentService.findByOriginalFilename("nosender.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("nosender.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "nosender.pdf", "nosender"); - - verify(documentService).save(argThat(d -> d.getSender() == null)); - verify(personService, never()).findOrCreateByAlias(any()); - } - - @Test - void importSingleDocument_createsSender_whenSenderCellIsNonBlank() { - Person sender = Person.builder().id(UUID.randomUUID()).firstName("Walter").lastName("Müller").build(); - when(documentService.findByOriginalFilename("withsender.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("Walter Müller")).thenReturn(sender); - - List cells = buildCells("withsender.pdf", "Walter Müller", "", ""); - service.importSingleDocument(cells, Optional.empty(), "withsender.pdf", "withsender"); - - verify(personService).findOrCreateByAlias("Walter Müller"); - verify(documentService).save(argThat(d -> d.getSender() == sender)); - } - - // ─── importSingleDocument — tag handling ───────────────────────────────── - - @Test - void importSingleDocument_createsTag_whenTagCellIsNonBlank() { - Tag tag = Tag.builder().id(UUID.randomUUID()).name("Familie").build(); - when(documentService.findByOriginalFilename("tagged.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(tagService.findOrCreate("Familie")).thenReturn(tag); - - List cells = buildCells("tagged.pdf", "", "", "Familie"); - service.importSingleDocument(cells, Optional.empty(), "tagged.pdf", "tagged"); - - verify(tagService).findOrCreate("Familie"); - } - - @Test - void importSingleDocument_doesNotCreateTag_whenTagCellIsBlank() { - when(documentService.findByOriginalFilename("notag.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("notag.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "notag.pdf", "notag"); - - verify(tagService, never()).findOrCreate(any()); - } - - // ─── importSingleDocument — metadataComplete heuristic ─────────────────── - - @Test - void importSingleDocument_metadataComplete_whenSenderPresent() { - Person sender = Person.builder().id(UUID.randomUUID()).firstName("A").lastName("B").build(); - when(documentService.findByOriginalFilename("meta.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("A B")).thenReturn(sender); - - List cells = buildCells("meta.pdf", "A B", "", ""); - service.importSingleDocument(cells, Optional.empty(), "meta.pdf", "meta"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - @Test - void importSingleDocument_metadataIncomplete_whenNoKeyFieldsPresent() { - when(documentService.findByOriginalFilename("nometa.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("nometa.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "nometa.pdf", "nometa"); - - verify(documentService).save(argThat(d -> !d.isMetadataComplete())); - } - - // ─── importSingleDocument — blank fields set to null ───────────────────── - - @Test - void importSingleDocument_setsBlankFieldsToNull() { - when(documentService.findByOriginalFilename("blank.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("blank.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "blank.pdf", "blank"); - - verify(documentService).save(argThat(d -> - d.getLocation() == null && - d.getSummary() == null && - d.getTranscription() == null && - d.getArchiveBox() == null && - d.getArchiveFolder() == null)); - } - - // ─── processRows — via ReflectionTestUtils ──────────────────────────────── - - @Test - void processRows_returnsZero_whenOnlyHeaderRow() { - List> rows = List.of(List.of("header", "col1")); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - assertThat(result.processed()).isEqualTo(0); - } - - @Test - void processRows_skipsRowWithBlankIndex() { - List> rows = List.of( - List.of("header"), - minimalCells("") // blank index - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - assertThat(result.processed()).isEqualTo(0); - verify(documentService, never()).findByOriginalFilename(any()); - } - - @Test - void processRows_addsExtension_whenIndexHasNoDot() { - when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("doc001") // no dot → appends ".pdf" - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - verify(documentService).findByOriginalFilename("doc001.pdf"); - } - - @Test - void processRows_usesFilenameAsIs_whenIndexHasDot() { - when(documentService.findByOriginalFilename("doc002.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("doc002.pdf") // has dot → used as-is - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - verify(documentService).findByOriginalFilename("doc002.pdf"); - } - - // ─── isValidImportFilename — security regression — do not remove ───────── - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsNull() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", (String) null); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsBlank() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", " "); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsForwardSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "etc/passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsBackslash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "..\\etc\\passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsDotDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "doc..evil.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsDotDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".."); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsAbsolutePath() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "/etc/passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsNullByte() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "file\0.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameIsPlainBasename() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "document.pdf"); - assertThat(result).isTrue(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeDivisionSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo∕bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsFullwidthSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo/bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeReverseSolidus() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo⧵bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameHasLeadingDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".hidden.pdf"); - assertThat(result).isTrue(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameHasSpaces() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "Brief an Oma.pdf"); - assertThat(result).isTrue(); - } - - @Test - void processRows_skipsRowAndContinues_whenFilenameIsPathTraversal() { - when(documentService.findByOriginalFilename("legitimate.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("../evil"), // row 1: path traversal — should be skipped - minimalCells("legitimate.pdf") // row 2: valid — should be processed - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - assertThat(result.skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); - } - - // ─── importSingleDocument — non-blank optional fields ──────────────────── - - @Test - void importSingleDocument_setsNonNullOptionalFields_whenPresent() { - when(documentService.findByOriginalFilename("rich.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - // box=1, folder=2, location=9, summary=11, transcription=13 - List cells = List.of( - "rich.pdf", // 0: index - "Box A", // 1: box - "Folder B", // 2: folder - "", // 3: sender - "", // 4: unused - "", // 5: receivers - "", // 6: unused - "", // 7: date - "", // 8: unused - "Hamburg", // 9: location - "", // 10: tags - "A summary", // 11: summary - "", // 12: unused - "A transcript" // 13: transcription - ); - - service.importSingleDocument(cells, Optional.empty(), "rich.pdf", "rich"); - - verify(documentService).save(argThat(d -> - "Box A".equals(d.getArchiveBox()) && - "Folder B".equals(d.getArchiveFolder()) && - "Hamburg".equals(d.getLocation()) && - "A summary".equals(d.getSummary()) && - "A transcript".equals(d.getTranscription()))); - } - - @Test - void importSingleDocument_setsMetadataComplete_whenReceiversArePresent() { - Person receiver = Person.builder().id(UUID.randomUUID()).firstName("Walter").lastName("Müller").build(); - when(documentService.findByOriginalFilename("rcv.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("Walter Müller")).thenReturn(receiver); - - List cells = List.of( - "rcv.pdf", "", "", "", "", "Walter Müller", "", "", "", "", "", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "rcv.pdf", "rcv"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - @Test - void importSingleDocument_setsMetadataComplete_whenDateIsPresent() { - when(documentService.findByOriginalFilename("dated.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = List.of( - "dated.pdf", "", "", "", "", "", "", "2024-03-15", "", "", "", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "dated.pdf", "dated"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - // ─── buildTitle — null location ─────────────────────────────────────────── - - @Test - void buildTitle_withNullLocation_skipsLocationPart() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc005", LocalDate.of(1940, 5, 1), (String) null); - assertThat(result).contains("doc005").contains("1940"); - assertThat(result).doesNotContain("Berlin"); - } - - // ─── parseDate — via ReflectionTestUtils ───────────────────────────────── - - @Test - void parseDate_returnsNull_whenValueIsNull() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", (String) null); - assertThat(result).isNull(); - } - - @Test - void parseDate_returnsNull_whenValueIsBlank() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", " "); - assertThat(result).isNull(); - } - - @Test - void parseDate_returnsDate_whenValidIsoFormat() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", "2024-03-15"); - assertThat(result).isEqualTo(LocalDate.of(2024, 3, 15)); - } - - @Test - void parseDate_returnsNull_whenInvalidDateString() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", "15.03.2024"); - assertThat(result).isNull(); - } - - // ─── buildTitle — via ReflectionTestUtils ──────────────────────────────── - - @Test - void buildTitle_withDateAndLocation() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc001", LocalDate.of(1940, 5, 1), "Berlin"); - assertThat(result).contains("doc001").contains("Berlin").contains("1940"); - } - - @Test - void buildTitle_withDateOnly() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc002", LocalDate.of(1960, 8, 15), ""); - assertThat(result).contains("doc002").contains("1960"); - assertThat(result).doesNotContain("Berlin"); - } - - @Test - void buildTitle_withIndexOnly_whenDateAndLocationAreNull() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc003", null, ""); - assertThat(result).isEqualTo("doc003"); - } - - @Test - void buildTitle_withLocationOnly_whenDateIsNull() { - // date=null, location present → date part skipped, location appended - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc004", null, "Berlin"); - assertThat(result).contains("doc004").contains("Berlin"); - assertThat(result).doesNotContain("("); // no date part - } - - // ─── getCell — via ReflectionTestUtils ─────────────────────────────────── - - @Test - void getCell_returnsEmptyString_whenColBeyondListSize() { - List cells = List.of("a", "b"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 5); - assertThat(result).isEmpty(); - } - - @Test - void getCell_returnsEmptyString_whenValueIsNull() { - List cells = new ArrayList<>(); - cells.add(null); - cells.add("b"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 0); - assertThat(result).isEmpty(); - } - - @Test - void getCell_returnsTrimmedValue() { - List cells = List.of(" hello ", "world"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 0); - assertThat(result).isEqualTo("hello"); - } - - // ─── PDF magic byte validation regression ───────────────────────────────── - - @Test - void runImportAsync_uploadsValidPdf_andSkipsFakeOne(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - verify(s3Client, times(1)).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - } - - @Test - void runImportAsync_setsSkippedCount_toOne_whenOneFakeFile(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - } - - @Test - void runImportAsync_includesRejectedFilename_inSkippedFiles(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::filename) - .contains("fake.pdf"); - } - - @Test - void runImportAsync_skipsFile_whenShorterThanFourBytes(@TempDir Path tempDir) throws Exception { - Files.write(tempDir.resolve("tiny.pdf"), new byte[]{0x25, 0x50, 0x44}); // only 3 bytes - buildMinimalImportXlsx(tempDir, "tiny.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - } - - @Test - void runImportAsync_skipsFile_whenMagicBytesCheckThrowsIOException(@TempDir Path tempDir) throws Exception { - Files.writeString(tempDir.resolve("unreadable.pdf"), "some content"); - buildMinimalImportXlsx(tempDir, "unreadable.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - - MassImportService spyService = spy(service); - doThrow(new java.io.IOException("simulated read error")).when(spyService).openFileStream(any(File.class)); - - spyService.runImportAsync(); - - assertThat(spyService.getStatus().skipped()).isEqualTo(1); - assertThat(spyService.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.FILE_READ_ERROR); - } - - // ─── findFileRecursive — symlink escape security regression — do not remove ─ - - @Test - void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir( - @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { - Path outsideFile = outsideDir.resolve("secret.pdf"); - Files.writeString(outsideFile, "sensitive content"); - Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile); - - ReflectionTestUtils.setField(service, "importDir", importDirPath.toString()); - - assertThatThrownBy(() -> ReflectionTestUtils.invokeMethod(service, "findFileRecursive", "secret.pdf")) - .isInstanceOf(DomainException.class); - } - - // ─── readOds — XXE security regression ─────────────────────────────────── - - // Security regression — do not remove. - @Test - void readOds_rejects_xxe_doctype_payload(@TempDir Path tempDir) throws Exception { - File malicious = buildXxeOds(tempDir, "file:///etc/hostname"); - assertThatThrownBy(() -> service.readOds(malicious)) - .isInstanceOf(SAXParseException.class) - .hasMessageContaining("DOCTYPE is disallowed"); - } - - @Test - void readOds_parses_valid_ods_correctly(@TempDir Path tempDir) throws Exception { - File valid = buildValidOds(tempDir, "Mustermann"); - List> rows = service.readOds(valid); - assertThat(rows).isNotEmpty(); - assertThat(rows.get(0)).contains("Mustermann"); - } - - // ─── helpers ────────────────────────────────────────────────────────────── - - /** - * Builds a minimal 14-element cell row with the given filename at index 0 - * and blanks for all optional fields. - */ - private List minimalCells(String filename) { - return buildCells(filename, "", "", ""); - } - - /** - * Builds a cell row with sender, receiver, and tag controls. - * Layout matches the default column indices set in setUp(). - */ - private List buildCells(String filename, String sender, String receivers, String tag) { - // 14 elements: index=0,box=1,folder=2,sender=3,[4],receivers=5,[6],date=7,[8],location=9,tag=10,summary=11,[12],transcription=13 - return List.of( - filename, // 0: index - "", // 1: box - "", // 2: folder - sender, // 3: sender - "", // 4: (unused) - receivers, // 5: receivers - "", // 6: (unused) - "", // 7: date - "", // 8: (unused) - "", // 9: location - tag, // 10: tags - "", // 11: summary - "", // 12: (unused) - "" // 13: transcription - ); - } - - /** Creates a minimal ODS ZIP containing a content.xml with an XXE payload. */ - private File buildXxeOds(Path dir, String entityTarget) throws Exception { - String xml = "" - + "]>" - + "" - + "" - + "" - + "&xxe;" - + "" - + "" - + ""; - return writeOdsZip(dir.resolve("malicious.ods"), xml); - } - - /** Creates a minimal valid ODS ZIP containing a content.xml with the given cell value. - * cellValue must not contain XML metacharacters ({@code < > &}). */ - private File buildValidOds(Path dir, String cellValue) throws Exception { - String xml = "" - + "" - + "" - + "" - + "" + cellValue + "" - + "" - + "" - + ""; - return writeOdsZip(dir.resolve("valid.ods"), xml); - } - - private File writeOdsZip(Path destination, String contentXml) throws Exception { - try (OutputStream fos = Files.newOutputStream(destination); - ZipOutputStream zip = new ZipOutputStream(fos)) { - zip.putNextEntry(new ZipEntry("content.xml")); - zip.write(contentXml.getBytes(StandardCharsets.UTF_8)); - zip.closeEntry(); - } - return destination.toFile(); - } - - private void setupOneValidOneFakeImport(Path tempDir) throws Exception { - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(tempDir.resolve("real.pdf"), pdfHeader); - Files.writeString(tempDir.resolve("fake.pdf"), "not a pdf"); - buildMinimalImportXlsx(tempDir, "real.pdf", "fake.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - } - - private void buildMinimalImportXlsx(Path dir, String... filenames) throws Exception { - Path xlsx = dir.resolve("import.xlsx"); - try (XSSFWorkbook wb = new XSSFWorkbook()) { - org.apache.poi.ss.usermodel.Sheet sheet = wb.createSheet("Sheet1"); - sheet.createRow(0).createCell(0).setCellValue("Index"); - for (int i = 0; i < filenames.length; i++) { - sheet.createRow(i + 1).createCell(0).setCellValue(filenames[i]); - } - try (OutputStream out = Files.newOutputStream(xlsx)) { - wb.write(out); - } - } - } -} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonRegisterImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonRegisterImporterTest.java new file mode 100644 index 00000000..af5740c0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonRegisterImporterTest.java @@ -0,0 +1,130 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.ArgumentCaptor; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class PersonRegisterImporterTest { + + @Test + void load_upsertsPersonBySourceRef_withProvisionalFalse(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path xlsx = writePersons(tempDir, row( + "allemeyer-elsgard", "Allemeyer", "Elsgard", "Wöhler", "Nichte von Herbert", "False")); + + new PersonRegisterImporter(personService).load(xlsx.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + PersonUpsertCommand cmd = captor.getValue(); + assertThat(cmd.sourceRef()).isEqualTo("allemeyer-elsgard"); + assertThat(cmd.lastName()).isEqualTo("Allemeyer"); + assertThat(cmd.firstName()).isEqualTo("Elsgard"); + assertThat(cmd.maidenName()).isEqualTo("Wöhler"); + assertThat(cmd.notes()).isEqualTo("Nichte von Herbert"); + assertThat(cmd.provisional()).isFalse(); + } + + @Test + void load_parsesCapitalisedPythonBool_True(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path xlsx = writePersons(tempDir, row( + "noise-geschirr", "Geschirr", "", "", "", "True")); + + new PersonRegisterImporter(personService).load(xlsx.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + assertThat(captor.getValue().provisional()).isTrue(); + } + + @Test + void load_skipsRowWithBlankPersonId(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + Path xlsx = writePersons(tempDir, row("", "NoId", "", "", "", "False")); + + new PersonRegisterImporter(personService).load(xlsx.toFile()); + + verify(personService, times(0)).upsertBySourceRef(any()); + } + + @Test + void load_returnsCountOfProcessedRows(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path xlsx = writePersons(tempDir, + row("a-one", "One", "A", "", "", "False"), + row("a-two", "Two", "B", "", "", "False")); + + int processed = new PersonRegisterImporter(personService).load(xlsx.toFile()); + + assertThat(processed).isEqualTo(2); + } + + private static Person personOf(PersonUpsertCommand cmd) { + return Person.builder().id(UUID.randomUUID()).sourceRef(cmd.sourceRef()) + .firstName(cmd.firstName()).lastName(cmd.lastName()) + .provisional(cmd.provisional()).build(); + } + + private Map row(String personId, String lastName, String firstName, + String maidenName, String notes, String provisional) { + Map r = new LinkedHashMap<>(); + r.put("person_id", personId); + r.put("last_name", lastName); + r.put("first_name", firstName); + r.put("maiden_name", maidenName); + r.put("notes", notes); + r.put("provisional", provisional); + return r; + } + + @SafeVarargs + private Path writePersons(Path dir, Map... rows) throws Exception { + Path xlsx = dir.resolve("canonical-persons.xlsx"); + List headers = List.of("person_id", "last_name", "first_name", "maiden_name", "notes", "provisional"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + for (int i = 0; i < headers.size(); i++) { + header.createCell(i).setCellValue(headers.get(i)); + } + for (int r = 0; r < rows.length; r++) { + Row row = sheet.createRow(r + 1); + for (int c = 0; c < headers.size(); c++) { + row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), "")); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonTreeImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonTreeImporterTest.java new file mode 100644 index 00000000..ce90d260 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/PersonTreeImporterTest.java @@ -0,0 +1,163 @@ +package org.raddatz.familienarchiv.importing; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.ArgumentCaptor; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.person.Person; +import org.raddatz.familienarchiv.person.PersonService; +import org.raddatz.familienarchiv.person.PersonUpsertCommand; +import org.raddatz.familienarchiv.person.relationship.RelationType; +import org.raddatz.familienarchiv.person.relationship.RelationshipService; +import org.raddatz.familienarchiv.person.relationship.dto.CreateRelationshipRequest; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class PersonTreeImporterTest { + + @Test + void load_upsertsTreePersonBySourceRef_withFamilyMemberFlag(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_002","firstName":"Elsgard","lastName":"Allemeyer","maidenName":"Wöhler", + "notes":"Nichte","birthYear":1920,"deathYear":1999,"familyMember":true,"personId":"allemeyer-elsgard"} + ],"relationships":[]} + """); + + new PersonTreeImporter(personService, relationshipService) + .load(json.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(PersonUpsertCommand.class); + verify(personService).upsertBySourceRef(captor.capture()); + PersonUpsertCommand cmd = captor.getValue(); + assertThat(cmd.sourceRef()).isEqualTo("allemeyer-elsgard"); + assertThat(cmd.familyMember()).isTrue(); + assertThat(cmd.provisional()).isFalse(); + } + + @Test + void load_createsRelationship_resolvingRowIdsToUpsertedPersons(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + UUID idA = UUID.randomUUID(); + UUID idB = UUID.randomUUID(); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> { + PersonUpsertCommand c = inv.getArgument(0); + return Person.builder().id(c.sourceRef().equals("a") ? idA : idB) + .sourceRef(c.sourceRef()).lastName(c.lastName()).build(); + }); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"}, + {"rowId":"row_b","lastName":"B","familyMember":true,"personId":"b"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_b","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + new PersonTreeImporter(personService, relationshipService) + .load(json.toFile()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(CreateRelationshipRequest.class); + verify(relationshipService).addRelationship(eq(idA), captor.capture()); + assertThat(captor.getValue().relatedPersonId()).isEqualTo(idB); + assertThat(captor.getValue().relationType()).isEqualTo(RelationType.SPOUSE_OF); + } + + @Test + void load_swallowsDuplicateRelationship_forIdempotentReimport(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())) + .thenAnswer(inv -> personOf(inv.getArgument(0))); + doThrow(DomainException.conflict(ErrorCode.DUPLICATE_RELATIONSHIP, "exists")) + .when(relationshipService).addRelationship(any(), any()); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"}, + {"rowId":"row_b","lastName":"B","familyMember":true,"personId":"b"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_b","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + PersonTreeImporter importer = new PersonTreeImporter(personService, relationshipService); + + // Must not propagate the conflict — re-import is idempotent. + importer.load(json.toFile()); + + verify(relationshipService).addRelationship(any(), any()); + } + + @Test + void load_propagatesUnexpectedDomainException_fromAddRelationship(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())) + .thenAnswer(inv -> personOf(inv.getArgument(0))); + // An unexpected ErrorCode (not DUPLICATE/CIRCULAR) must NOT be swallowed. + doThrow(DomainException.internal(ErrorCode.INTERNAL_ERROR, "boom")) + .when(relationshipService).addRelationship(any(), any()); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"}, + {"rowId":"row_b","lastName":"B","familyMember":true,"personId":"b"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_b","type":"SPOUSE_OF","source":"verheiratet_mit"} + ]} + """); + + PersonTreeImporter importer = new PersonTreeImporter(personService, relationshipService); + + assertThatThrownBy(() -> importer.load(json.toFile())) + .isInstanceOf(DomainException.class) + .extracting("code").isEqualTo(ErrorCode.INTERNAL_ERROR); + } + + @Test + void load_skipsRelationship_whenRowIdUnresolved(@TempDir Path tempDir) throws Exception { + PersonService personService = mock(PersonService.class); + RelationshipService relationshipService = mock(RelationshipService.class); + when(personService.upsertBySourceRef(any())).thenAnswer(inv -> personOf(inv.getArgument(0))); + Path json = write(tempDir, """ + {"persons":[ + {"rowId":"row_a","lastName":"A","familyMember":true,"personId":"a"} + ],"relationships":[ + {"personId":"row_a","relatedPersonId":"row_ghost","type":"SPOUSE_OF","source":"x"} + ]} + """); + + new PersonTreeImporter(personService, relationshipService) + .load(json.toFile()); + + verify(relationshipService, org.mockito.Mockito.never()).addRelationship(any(), any()); + } + + private static Person personOf(PersonUpsertCommand cmd) { + return Person.builder().id(UUID.randomUUID()).sourceRef(cmd.sourceRef()).lastName(cmd.lastName()).build(); + } + + private Path write(Path dir, String json) throws Exception { + Path file = dir.resolve("canonical-persons-tree.json"); + Files.writeString(file, json); + return file; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/TagTreeImporterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/TagTreeImporterTest.java new file mode 100644 index 00000000..e6becae5 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/TagTreeImporterTest.java @@ -0,0 +1,103 @@ +package org.raddatz.familienarchiv.importing; + +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.tag.Tag; +import org.raddatz.familienarchiv.tag.TagService; + +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isNull; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TagTreeImporterTest { + + @Test + void load_upsertsRootTagWithNullParent(@TempDir Path tempDir) throws Exception { + TagService tagService = mock(TagService.class); + when(tagService.upsertBySourceRef(any(), any(), any())) + .thenAnswer(inv -> tagOf(inv.getArgument(0), inv.getArgument(1), inv.getArgument(2))); + Path xlsx = writeTagTree(tempDir, List.of( + new String[]{"Themen", "", "Themen"})); + + new TagTreeImporter(tagService).load(xlsx.toFile()); + + verify(tagService).upsertBySourceRef("Themen", "Themen", null); + } + + @Test + void load_resolvesParentByPath_forChildTag(@TempDir Path tempDir) throws Exception { + TagService tagService = mock(TagService.class); + UUID rootId = UUID.randomUUID(); + when(tagService.upsertBySourceRef(eq("Themen"), eq("Themen"), isNull())) + .thenReturn(tagOf("Themen", "Themen", null, rootId)); + when(tagService.upsertBySourceRef(eq("Themen/Brautbriefe"), eq("Brautbriefe"), eq(rootId))) + .thenReturn(tagOf("Themen/Brautbriefe", "Brautbriefe", rootId)); + Path xlsx = writeTagTree(tempDir, List.of( + new String[]{"Themen", "", "Themen"}, + new String[]{"Themen/Brautbriefe", "Themen", "Brautbriefe"})); + + new TagTreeImporter(tagService).load(xlsx.toFile()); + + verify(tagService).upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", rootId); + } + + @Test + void load_returnsCountOfProcessedRows(@TempDir Path tempDir) throws Exception { + TagService tagService = mock(TagService.class); + when(tagService.upsertBySourceRef(any(), any(), any())) + .thenAnswer(inv -> tagOf(inv.getArgument(0), inv.getArgument(1), inv.getArgument(2))); + Path xlsx = writeTagTree(tempDir, List.of( + new String[]{"Themen", "", "Themen"}, + new String[]{"Themen/Brautbriefe", "Themen", "Brautbriefe"})); + + int processed = new TagTreeImporter(tagService).load(xlsx.toFile()); + + assertThat(processed).isEqualTo(2); + } + + private static Tag tagOf(String sourceRef, String name, UUID parentId) { + return tagOf(sourceRef, name, parentId, UUID.randomUUID()); + } + + private static Tag tagOf(String sourceRef, String name, UUID parentId, UUID id) { + return Tag.builder().id(id).sourceRef(sourceRef).name(name).parentId(parentId).build(); + } + + private Path writeTagTree(Path dir, List rows) throws Exception { + Path xlsx = dir.resolve("canonical-tag-tree.xlsx"); + try (XSSFWorkbook wb = new XSSFWorkbook()) { + Sheet sheet = wb.createSheet("Sheet1"); + Row header = sheet.createRow(0); + header.createCell(0).setCellValue("tag_path"); + header.createCell(1).setCellValue("parent_name"); + header.createCell(2).setCellValue("tag_name"); + for (int r = 0; r < rows.size(); r++) { + Row row = sheet.createRow(r + 1); + String[] values = rows.get(r); + for (int c = 0; c < values.length; c++) { + row.createCell(c).setCellValue(values[c]); + } + } + try (OutputStream out = Files.newOutputStream(xlsx)) { + wb.write(out); + } + } + return xlsx; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/person/PersonImportUpsertTest.java b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonImportUpsertTest.java new file mode 100644 index 00000000..c8b81b2b --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/person/PersonImportUpsertTest.java @@ -0,0 +1,151 @@ +package org.raddatz.familienarchiv.person; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class PersonImportUpsertTest { + + @Mock PersonRepository personRepository; + @Mock PersonNameAliasRepository aliasRepository; + @InjectMocks PersonService personService; + + @Test + void upsertBySourceRef_insertsNewPerson_whenSourceRefUnknown() { + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.empty()); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.getSourceRef()).isEqualTo("clara-cram"); + assertThat(result.getFirstName()).isEqualTo("Clara"); + assertThat(result.getLastName()).isEqualTo("Cram"); + assertThat(result.isProvisional()).isFalse(); + } + + @Test + void upsertBySourceRef_updatesInPlace_whenSourceRefExists() { + Person existing = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .firstName("Clara").lastName("Cram").build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(existing)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .notes("Updated note").personType(PersonType.PERSON).provisional(false).build(); + + personService.upsertBySourceRef(cmd); + + verify(personRepository).save(argThat(p -> p.getId().equals(existing.getId()))); + verify(personRepository, never()).save(argThat(p -> p.getId() == null)); + } + + @Test + void upsertBySourceRef_preservesHumanEditedNonBlankFields() { + // A human renamed the maiden-name register person and added notes in-app. + Person humanEdited = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .firstName("Klara").lastName("Cram-Müller").notes("Verified by Marcel").build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(humanEdited)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .notes("Auto note").personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + // Human edits survive the re-import. + assertThat(result.getFirstName()).isEqualTo("Klara"); + assertThat(result.getLastName()).isEqualTo("Cram-Müller"); + assertThat(result.getNotes()).isEqualTo("Verified by Marcel"); + } + + @Test + void upsertBySourceRef_fillsOnlyBlankFields_onReimport() { + Person existing = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .firstName("Clara").lastName("Cram").notes(null).build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(existing)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").firstName("Clara").lastName("Cram") + .notes("Nichte von Herbert").personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + // Blank field gets filled by canonical value. + assertThat(result.getNotes()).isEqualTo("Nichte von Herbert"); + } + + @Test + void upsertBySourceRef_fillsBlankYears_butPreservesHumanEditedYears_onReimport() { + // Existing has a human-set birthYear and a blank deathYear. + Person existing = Person.builder() + .id(UUID.randomUUID()).sourceRef("clara-cram") + .lastName("Cram").birthYear(1890).deathYear(null).build(); + when(personRepository.findBySourceRef("clara-cram")).thenReturn(Optional.of(existing)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("clara-cram").lastName("Cram") + .birthYear(1888).deathYear(1965) + .personType(PersonType.PERSON).provisional(false).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.getBirthYear()).isEqualTo(1890); // human value kept + assertThat(result.getDeathYear()).isEqualTo(1965); // blank filled from canonical + } + + @Test + void upsertBySourceRef_neverFlipsProvisionalBackToTrue_onceHumanConfirmed() { + // A human confirmed this provisional importer-created person (provisional -> false). + Person confirmed = Person.builder() + .id(UUID.randomUUID()).sourceRef("schwester-hanni") + .firstName(null).lastName("Schwester Hanni").provisional(false).build(); + when(personRepository.findBySourceRef("schwester-hanni")).thenReturn(Optional.of(confirmed)); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("schwester-hanni").lastName("Schwester Hanni") + .personType(PersonType.PERSON).provisional(true).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.isProvisional()).isFalse(); + } + + @Test + void upsertBySourceRef_setsProvisionalTrue_forNewProvisionalPerson() { + when(personRepository.findBySourceRef("noise-geschirr")).thenReturn(Optional.empty()); + when(personRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + PersonUpsertCommand cmd = PersonUpsertCommand.builder() + .sourceRef("noise-geschirr").lastName("Tante Tüten") + .personType(PersonType.PERSON).provisional(true).build(); + + Person result = personService.upsertBySourceRef(cmd); + + assertThat(result.isProvisional()).isTrue(); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/tag/TagImportUpsertTest.java b/backend/src/test/java/org/raddatz/familienarchiv/tag/TagImportUpsertTest.java new file mode 100644 index 00000000..c2e29dc0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/tag/TagImportUpsertTest.java @@ -0,0 +1,62 @@ +package org.raddatz.familienarchiv.tag; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TagImportUpsertTest { + + @Mock TagRepository tagRepository; + @InjectMocks TagService tagService; + + @Test + void upsertBySourceRef_insertsNewTag_whenSourceRefUnknown() { + when(tagRepository.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.empty()); + when(tagRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + UUID parentId = UUID.randomUUID(); + Tag result = tagService.upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", parentId); + + assertThat(result.getSourceRef()).isEqualTo("Themen/Brautbriefe"); + assertThat(result.getName()).isEqualTo("Brautbriefe"); + assertThat(result.getParentId()).isEqualTo(parentId); + } + + @Test + void upsertBySourceRef_updatesInPlace_whenSourceRefExists() { + Tag existing = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe") + .sourceRef("Themen/Brautbriefe").build(); + when(tagRepository.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(existing)); + when(tagRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + tagService.upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", null); + + verify(tagRepository).save(argThat(t -> t.getId().equals(existing.getId()))); + verify(tagRepository, never()).save(argThat(t -> t.getId() == null)); + } + + @Test + void upsertBySourceRef_preservesHumanRenamedTag_onReimport() { + Tag humanRenamed = Tag.builder().id(UUID.randomUUID()).name("Verlobungsbriefe") + .sourceRef("Themen/Brautbriefe").build(); + when(tagRepository.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(humanRenamed)); + when(tagRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + Tag result = tagService.upsertBySourceRef("Themen/Brautbriefe", "Brautbriefe", null); + + assertThat(result.getName()).isEqualTo("Verlobungsbriefe"); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java index b87b928b..8e51fad7 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java @@ -7,7 +7,8 @@ import org.raddatz.familienarchiv.security.PermissionAspect; import org.raddatz.familienarchiv.user.CustomUserDetailsService; import org.raddatz.familienarchiv.document.DocumentService; import org.raddatz.familienarchiv.document.DocumentVersionService; -import org.raddatz.familienarchiv.importing.MassImportService; +import org.raddatz.familienarchiv.importing.CanonicalImportOrchestrator; +import org.raddatz.familienarchiv.importing.ImportStatus; import org.raddatz.familienarchiv.document.ThumbnailBackfillService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.aop.AopAutoConfiguration; @@ -35,7 +36,7 @@ class AdminControllerTest { @Autowired MockMvc mockMvc; - @MockitoBean MassImportService massImportService; + @MockitoBean CanonicalImportOrchestrator importOrchestrator; @MockitoBean DocumentService documentService; @MockitoBean DocumentVersionService documentVersionService; @MockitoBean ThumbnailBackfillService thumbnailBackfillService; @@ -46,9 +47,9 @@ class AdminControllerTest { @Test @WithMockUser(authorities = "ADMIN") void importStatus_returns200_withStatusCode_whenAdmin() throws Exception { - MassImportService.ImportStatus status = new MassImportService.ImportStatus( - MassImportService.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - when(massImportService.getStatus()).thenReturn(status); + ImportStatus status = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + when(importOrchestrator.getStatus()).thenReturn(status); mockMvc.perform(get("/api/admin/import-status")) .andExpect(status().isOk()) @@ -60,9 +61,9 @@ class AdminControllerTest { @Test @WithMockUser(authorities = "ADMIN") void importStatus_messageField_notPresentInApiResponse() throws Exception { - MassImportService.ImportStatus status = new MassImportService.ImportStatus( - MassImportService.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - when(massImportService.getStatus()).thenReturn(status); + ImportStatus status = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + when(importOrchestrator.getStatus()).thenReturn(status); mockMvc.perform(get("/api/admin/import-status")) .andExpect(status().isOk()) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index c6560a0a..0e1fe07e 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -559,20 +559,40 @@ bash scripts/download-kraken-models.sh > Downloads the Kurrent/Sütterlin HTR models. Run once after a fresh clone or when models are updated. -### Trigger a mass import (Excel/ODS) +### Trigger a canonical import -**Dev:** drop the ODS spreadsheet + PDFs into `./import/` at the repo root — the dev compose bind-mounts it to `/import` automatically. +The importer no longer parses the raw spreadsheet. It consumes the **canonical artifacts** +produced by the normalizer (`tools/import-normalizer/`) — `canonical-tag-tree.xlsx`, +`canonical-persons.xlsx`, `canonical-persons-tree.json`, `canonical-documents.xlsx` — which +are committed under `tools/import-normalizer/out/`. The semantic transformation +(German-date parsing, name classification) lives entirely in the normalizer; the backend +maps the clean columns by header name. See [ADR-025](adr/025-canonical-import-and-single-migration-schema-foundation.md). + +**Prerequisite — regenerate the artifacts when the source data changes:** + +```bash +cd tools/import-normalizer +python3 -m venv .venv && .venv/bin/pip install -r requirements.txt # once, on a fresh clone +.venv/bin/python normalize.py +# writes the four canonical artifacts into ./out/ +``` + +**Dev:** place all four canonical artifacts **plus** the referenced PDFs into `./import/` +at the repo root (the dev compose bind-mounts it to `/import`, which is `app.import.dir`). +The orchestrator smoke-checks that all four artifacts are present before starting and fails +closed (`IMPORT_ARTIFACT_INVALID`) if any is missing. **Staging/production:** -1. Pre-stage the payload on the host. Convention: `/srv/familienarchiv-staging/import/` or `/srv/familienarchiv-production/import/`. +1. Pre-stage the four canonical artifacts + PDFs on the host. Convention: + `/srv/familienarchiv-staging/import/` or `/srv/familienarchiv-production/import/`. ```bash rsync -avh --progress ./import/ user@host:/srv/familienarchiv-staging/import/ ``` 2. Make sure `IMPORT_HOST_DIR=` is set in `.env.staging` / `.env.production` (the nightly/release workflows already write this — see §3). Compose refuses to start without it. 3. Redeploy the stack so the bind mount picks up — or, if the mount is already in place, skip to step 4. 4. Call `POST /api/admin/trigger-import` (requires `ADMIN` permission), or click the "Import starten" button on `/admin/system`. -5. The import runs asynchronously — poll `GET /api/admin/import-status`, watch `/admin/system`, or tail the backend logs. +5. The import runs asynchronously — poll `GET /api/admin/import-status`, watch `/admin/system`, or tail the backend logs. Re-running is safe and idempotent (upsert by `source_ref` / document `index`). Person and tag scalar fields you edited in the app are preserved on re-import; a document's sender/receivers/tags are **canonical-authoritative** — a re-import re-applies them to exactly match the export, so a link removed from the export is removed from the document (the raw sender/receiver cell text is always kept). --- diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md index 1fefb7af..074f2fe1 100644 --- a/docs/GLOSSARY.md +++ b/docs/GLOSSARY.md @@ -64,9 +64,13 @@ _See also [Annotation](#annotation-documentannotation)._ - `REVIEWED`: a reviewer has approved the transcription. - `ARCHIVED`: the document is finalized and read-only. -**Mass import** — an asynchronous batch process (`MassImportService`) that reads an Excel or ODS file and creates `Person`s, `Tag`s, and `PLACEHOLDER` `Document`s in one shot. Only one import can run at a time (`IMPORT_ALREADY_RUNNING` error if attempted concurrently). +**Canonical import** — an asynchronous batch process (`CanonicalImportOrchestrator`) that consumes the normalizer's committed canonical artifacts and creates `Tag`s, `Person`s (register + tree), family relationships, and `Document`s. Four idempotent loaders run in a fixed dependency order — `TagTreeImporter` → `PersonRegisterImporter` → `PersonTreeImporter` → `DocumentImporter` — each calling the owning domain's service. Re-running it never duplicates rows (upsert by `source_ref` / document `index`) and never overwrites a human-edited field. Only one import can run at a time (`IMPORT_ALREADY_RUNNING` error if attempted concurrently); a missing or malformed artifact fails closed (`IMPORT_ARTIFACT_INVALID`). Replaced the legacy raw-spreadsheet `MassImportService` (see ADR-025). -**SkippedFile** (`MassImportService.SkippedFile`) — a file that was presented for import but not processed, recorded with a `filename` and a `reason` code. Possible reasons: `INVALID_PDF_SIGNATURE` (magic-byte validation failed), `S3_UPLOAD_FAILED` (file upload to MinIO/S3 threw an exception), `FILE_READ_ERROR` (the file could not be opened for reading), or `ALREADY_EXISTS` (a document with the same filename already exists in the archive with a status other than `PLACEHOLDER`). +**canonical artifact** — one of the four files the normalizer (`tools/import-normalizer/`) emits and commits to `tools/import-normalizer/out/`: `canonical-tag-tree.xlsx`, `canonical-persons.xlsx`, `canonical-persons-tree.json`, `canonical-documents.xlsx`. They are the contract the backend importer reads (mapped by header name); the semantic transformation (German-date parsing, name classification) lives only in the normalizer, never in Java. + +**CanonicalSheetReader** — the value-level POI helper that opens a canonical `.xlsx`, maps the header row to column indices by name (replacing the brittle positional column config), splits pipe-delimited list columns, and throws `IMPORT_ARTIFACT_INVALID` on a missing required header rather than NPE-ing on a null index. + +**SkippedFile** (`ImportStatus.SkippedFile`) — a file that was presented for import but not processed, recorded with a `filename` and a `reason` code. Possible reasons: `INVALID_FILENAME_PATH_TRAVERSAL` (the file-column basename failed the path-traversal guard), `INVALID_PDF_SIGNATURE` (magic-byte validation failed), `S3_UPLOAD_FAILED` (file upload to MinIO/S3 threw an exception), `FILE_READ_ERROR` (the file could not be opened for reading), or `ALREADY_EXISTS` (a document with the same `index` already exists in the archive with a status other than `PLACEHOLDER`). **skipped count** — the total number of `SkippedFile` entries accumulated during a single import run (`ImportStatus.skipped()`). Shown in the amber warning section of the Import Status Card in the admin UI; a value of zero suppresses the section entirely. diff --git a/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md b/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md index 0feb670b..3515413b 100644 --- a/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md +++ b/docs/adr/025-canonical-import-and-single-migration-schema-foundation.md @@ -2,7 +2,7 @@ **Date:** 2026-05-27 **Status:** Accepted -**Issue:** #671 +**Issue:** #671 (schema, decisions 1–2); #669 (importer architecture, decision 3) **Milestone:** Handling the Unknowns — honest uncertainty in dates & people --- @@ -56,6 +56,59 @@ The importer reads the Python normalizer's canonical output output strings are persisted as-is. The same applies to `source_ref`, which carries the normalizer's `person_id` / canonical `tag_path` unchanged as the re-import idempotency key. +### 3. The importer is four idempotent loaders over the canonical artifacts; Java no longer parses the raw spreadsheet (Phase 3, #669) + +The legacy `MassImportService` read the *raw* original spreadsheet by positional column +index (`@Value app.import.col.*`) and re-derived everything in Java (ISO-only date parsing, +name classification via `findOrCreateByAlias`, an ODS/XXE XML path). It is **deleted**. + +The rebuild is a `CanonicalImportOrchestrator` driving four single-responsibility loaders in +an explicit dependency DAG — `TagTreeImporter` → `PersonRegisterImporter` → +`PersonTreeImporter` → `DocumentImporter` — that **consume the committed canonical artifacts** +(`tools/import-normalizer/out/`). A shared `CanonicalSheetReader` maps columns **by header +name** (not by index) and fails closed (`IMPORT_ARTIFACT_INVALID`) on a missing header. Each +loader calls the **owning domain's service**, never a repository (layering rule); the tree +loader uses `RelationshipService`, never the relationship repository. + +Settled sub-decisions: + +- **Idempotency precedence is domain-specific.** Persons/tags upsert by `source_ref`, + documents by `index`. Two distinct rules apply: + - **Person/Tag scalar fields = preserve human edits.** On re-import a non-blank field a human + changed in-app is never overwritten (blank fields are filled from canonical via the single + `preferHuman` idiom), and `provisional` is monotonic-downward — once a human confirms a + person (`false`) it never reverts to `true`. Because the orchestrator loads the register and + tree *before* documents, a person already `false` can never be flipped provisional by a + later document row that references the same `source_ref`, regardless of document-row order. + - **Document sender/receivers/tags = canonical-authoritative.** A document's sender, receiver + set, and tag set are owned by the canonical row, not the archivist. On re-import of a + PLACEHOLDER document `DocumentImporter` clears and re-populates `receivers`/`tags` so a row + whose set *shrinks* prunes the removed links rather than accumulating stale ones. The + "preserve human edits" rule above does **not** extend to these collections. The raw + `sender_text`/`receiver_text` cells are always retained verbatim (a separate invariant). + Note non-PLACEHOLDER documents are skipped entirely (`ALREADY_EXISTS`), so once a document + has a file the importer never touches it again — this bounds the authoritative-overwrite + blast radius to placeholder rows. + Verified against real Postgres in `CanonicalImportIntegrationTest` + (`reimport_preservesHumanEditedPersonField`, `reimport_prunesRemovedReceiverAndTag…`, + `import_neverFlipsRegisterPersonToProvisional…`). +- **Name policy = Option A.** The normalizer resolved attribution upstream: the document sheet + carries the resolved slug in `sender_person_id` / `receiver_person_ids` and the raw cell in + `sender_name` / `receiver_names`. The importer routes register-first by `source_ref` + (provisional `Person` when a slug is unmatched), and **always retains the raw cell** in + `sender_text` / `receiver_text` even when a person is linked — the load-bearing invariant + behind the merge story. A row with no slug but raw text (prose / `?` / object-noise) links + no person and keeps only the raw text. +- **`provisional` is now populated.** Importer-minted persons are `provisional = true`; + register and tree persons stay `false`. This is the Phase-3 contract the schema (decision 1) + left at default-`false`. +- **Security guards are defense-in-depth, not upstream-trust.** The `file` column is treated as + hostile (CWE-22 does not care it came from our tool): its basename is validated + (`isValidImportFilename` — slash/backslash, three Unicode slash homoglyphs, `..`, null byte, + absolute path) and resolved only inside the import dir with canonical-path containment, so a + traversal value can never escape. The `%PDF` magic-byte check gates upload. These guards and + their tests were ported from `MassImportService` **before** it was deleted. + --- ## Consequences @@ -76,6 +129,20 @@ normalizer's `person_id` / canonical `tag_path` unchanged as the re-import idemp - **Forward-only.** The migration is immutable once shipped (Flyway checksum model); any fix goes in a later version. There is no down-migration — rollback means restoring from the nightly `pg_dump`, the standard procedure. +- **`runImport()` is non-transactional — per-loader transactions only.** The orchestrator + does not wrap the four loaders in a single transaction; each loader (or the per-call + `upsertBySourceRef` / `DocumentImporter.load`) carries its own `@Transactional` boundary. A + partial failure mid-run (e.g. the document loader throws after tags + persons committed) + leaves the earlier loaders' data committed and the `ImportStatus` set to `FAILED`. This is + acceptable precisely because the import is idempotent: re-running is safe and converges to + the same state, so the operational recovery for a partial failure is simply to fix the + offending artifact and re-trigger the import — no manual cleanup of half-written data is + required. A future maintainer must not assume all-or-nothing semantics. +- **Path-escape aborts the whole import (fail-closed), by design.** A path-traversal or + symlink-escape in a row's file path is treated as an attack signal: the import aborts rather + than recording the row as a `SkippedFile` and continuing. This is a deliberate owner decision + (2026-05-27) over a per-file skip — a malicious path must surface loudly, not be silently + tolerated. - **`PersonSummaryDTO` coupling.** `provisional` was added to the `PersonSummaryDTO` native interface projection; because the projection is backed by native SQL, the column had to be added to all three native `SELECT`s (`findAllWithDocumentCount`, `searchWithDocumentCount`, diff --git a/docs/architecture/c4/l3-backend-3b-document-management.puml b/docs/architecture/c4/l3-backend-3b-document-management.puml index a15eb00b..89d4a68b 100644 --- a/docs/architecture/c4/l3-backend-3b-document-management.puml +++ b/docs/architecture/c4/l3-backend-3b-document-management.puml @@ -1,7 +1,7 @@ @startuml !include -title Component Diagram: API Backend — Document Management & Import +title Component Diagram: API Backend — Document Management & Canonical Import Container(frontend, "Web Frontend", "SvelteKit") ContainerDb(db, "PostgreSQL", "PostgreSQL 16") @@ -9,30 +9,48 @@ ContainerDb(minio, "Object Storage", "MinIO (S3-compatible)") System_Boundary(backend, "API Backend (Spring Boot)") { Component(docCtrl, "DocumentController", "Spring MVC — /api/documents", "CRUD for documents: search, get by ID, update metadata, upload/download file, conversation thread, batch metadata updates, and per-month density aggregation for the timeline filter widget.") - Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers asynchronous Excel/ODS mass import (requires ADMIN permission). Reports import state (IDLE/RUNNING/DONE/FAILED).") + Component(adminCtrl, "AdminController", "Spring MVC — /api/admin", "Triggers the asynchronous canonical import (requires ADMIN permission). Reports import state (IDLE/RUNNING/DONE/FAILED).") Component(docSvc, "DocumentService", "Spring Service", "Core document business logic: store, update, search. Resolves persons and tags, delegates file I/O to FileService, builds dynamic JPA Specifications, and integrates with audit logging.") Component(fileSvc, "FileService", "Spring Service", "Wraps AWS SDK v2 S3Client. Uploads files with UUID-keyed paths, computes SHA-256 hash, downloads with content-type detection, and generates presigned URLs for OCR access.") - Component(massImport, "MassImportService", "Spring Service — @Async", "Reads Excel/ODS files from /import mount. Tracks import state (IDLE/RUNNING/DONE/FAILED) and delegates to ExcelService. Returns immediately; processing runs asynchronously.") - Component(excelSvc, "ExcelService", "Spring Service", "Parses Excel/ODS workbooks (Apache POI). Column indices configurable via application.properties. Creates/updates document records per row.") + Component(importOrch, "CanonicalImportOrchestrator", "Spring Service — @Async", "Runs the four canonical loaders in an explicit dependency DAG (TagTree → PersonRegister → PersonTree → Document). Smoke-checks all four artifacts before starting, owns the IDLE/RUNNING/DONE/FAILED state machine, fails closed on a malformed artifact.") + Component(tagTreeLoader, "TagTreeImporter", "Spring Component", "Upserts the tag hierarchy from canonical-tag-tree.xlsx via TagService (by canonical tag_path).") + Component(personRegLoader, "PersonRegisterImporter", "Spring Component", "Upserts register persons from canonical-persons.xlsx via PersonService (by normalizer person_id).") + Component(personTreeLoader, "PersonTreeImporter", "Spring Component", "Upserts tree persons + relationships from canonical-persons-tree.json via PersonService and RelationshipService.") + Component(docLoader, "DocumentImporter", "Spring Component", "Loads canonical-documents.xlsx: routes attribution register-first (raw cell always retained in sender_text/receiver_text), parses clean dates, keeps the S3 upload + thumbnail plumbing, and ports the path-traversal / homoglyph / absolute-path / %PDF magic-byte security guards.") + Component(sheetReader, "CanonicalSheetReader", "POI helper", "Maps a canonical .xlsx by header name (no positional indices), splits pipe-delimited list columns, fails closed (IMPORT_ARTIFACT_INVALID) on a missing required header.") Component(minioConf, "MinioConfig", "Spring @Configuration", "Creates the S3Client and S3Presigner beans with path-style access for MinIO. Validates MinIO connectivity on startup.") Component(docRepo, "DocumentRepository", "Spring Data JPA", "Queries documents with Specification-based dynamic search, bidirectional conversation thread queries, full-text search with ranking and match highlighting, and transcription pipeline queue projections.") Component(docSpec, "DocumentSpecifications", "JPA Criteria API", "Factory for composable predicates: hasText (full-text), hasSender, hasReceiver, isBetween (date range), hasTags (subquery AND/OR logic).") } -Component(personSvc, "PersonService", "Spring Service", "See diagram 3e. Called by DocumentService to resolve sender / receiver persons by ID.") -Component(tagSvc, "TagService", "Spring Service", "See diagram 3d. Called by DocumentService to find or create tags by name.") +Component(personSvc, "PersonService", "Spring Service", "See diagram 3e. Resolves sender / receiver persons by ID; upserts persons by source_ref for the importer.") +Component(tagSvc, "TagService", "Spring Service", "See diagram 3d. Finds or creates tags by name; upserts tags by source_ref for the importer.") +Component(relSvc, "RelationshipService", "Spring Service", "See diagram 3e. Creates family relationships from the person tree during import.") Rel(frontend, docCtrl, "Document requests", "HTTP / JSON") Rel(frontend, adminCtrl, "Trigger import", "HTTP / JSON") Rel(docCtrl, docSvc, "Delegates to") -Rel(adminCtrl, massImport, "Triggers") +Rel(adminCtrl, importOrch, "Triggers") Rel(docSvc, fileSvc, "Upload / download files") Rel(docSvc, docRepo, "Reads / writes documents") Rel(docSvc, docSpec, "Builds search predicates") Rel(docSvc, personSvc, "Resolves sender / receivers") Rel(docSvc, tagSvc, "Finds or creates tags") -Rel(massImport, excelSvc, "Parses Excel/ODS file") -Rel(excelSvc, docSvc, "Creates / updates documents") +Rel(importOrch, tagTreeLoader, "1. Loads tags") +Rel(importOrch, personRegLoader, "2. Loads register persons") +Rel(importOrch, personTreeLoader, "3. Loads tree persons + relationships") +Rel(importOrch, docLoader, "4. Loads documents") +Rel(tagTreeLoader, sheetReader, "Reads canonical .xlsx") +Rel(personRegLoader, sheetReader, "Reads canonical .xlsx") +Rel(docLoader, sheetReader, "Reads canonical .xlsx") +Rel(tagTreeLoader, tagSvc, "Upserts tags by source_ref") +Rel(personRegLoader, personSvc, "Upserts persons by source_ref") +Rel(personTreeLoader, personSvc, "Upserts persons by source_ref") +Rel(personTreeLoader, relSvc, "Creates relationships") +Rel(docLoader, docSvc, "Upserts documents by index") +Rel(docLoader, personSvc, "Register-first match / provisional person") +Rel(docLoader, tagSvc, "Attaches tag by source_ref") +Rel(docLoader, fileSvc, "Uploads resolved file") Rel(minioConf, fileSvc, "Provides S3Client and S3Presigner beans") Rel(fileSvc, minio, "PUT / GET / presigned URL objects", "S3 API / HTTP") Rel(docRepo, db, "SQL queries", "JDBC") diff --git a/frontend/messages/de.json b/frontend/messages/de.json index 52087452..a54ab59e 100644 --- a/frontend/messages/de.json +++ b/frontend/messages/de.json @@ -14,6 +14,7 @@ "error_file_too_large": "Die Datei ist zu groß (max. 50 MB).", "error_user_not_found": "Der Benutzer wurde nicht gefunden.", "error_import_already_running": "Ein Import läuft bereits. Bitte warten Sie, bis dieser abgeschlossen ist.", + "error_import_artifact_invalid": "Eine Importdatei fehlt oder ist ungültig. Bitte führen Sie den Normalizer erneut aus.", "error_invalid_credentials": "E-Mail-Adresse oder Passwort ist falsch.", "error_session_expired": "Ihre Sitzung ist abgelaufen. Bitte melden Sie sich erneut an.", "error_session_expired_explainer": "Aus Sicherheitsgründen werden Sitzungen nach 8 Stunden Inaktivität automatisch beendet.", @@ -356,11 +357,13 @@ "admin_system_import_status_done_label": "Dokumente verarbeitet", "admin_system_import_skipped_label": "übersprungen", "import_reason_invalid_pdf_signature": "Keine gültige PDF-Signatur", + "import_reason_path_traversal": "Ungültiger Dateiname (Pfad)", "import_reason_file_read_error": "Fehler beim Lesen der Datei", "import_reason_s3_upload_failed": "Upload-Fehler (S3)", "import_reason_already_exists": "Bereits importiert", "admin_system_import_status_failed": "Import fehlgeschlagen", "admin_system_import_failed_no_spreadsheet": "Keine Tabellendatei gefunden.", + "admin_system_import_failed_artifact": "Eine Importdatei fehlt oder ist ungültig.", "admin_system_import_failed_internal": "Interner Fehler beim Import.", "admin_system_thumbnails_heading": "Thumbnails erzeugen", "admin_system_thumbnails_description": "Erzeugt Vorschaubilder für Dokumente ohne Thumbnail (z. B. nach dem Massenimport).", diff --git a/frontend/messages/en.json b/frontend/messages/en.json index 3e2c3ff8..5c6ca80a 100644 --- a/frontend/messages/en.json +++ b/frontend/messages/en.json @@ -14,6 +14,7 @@ "error_file_too_large": "The file is too large (max. 50 MB).", "error_user_not_found": "User not found.", "error_import_already_running": "An import is already running. Please wait for it to finish.", + "error_import_artifact_invalid": "A canonical import file is missing or invalid. Please re-run the normalizer.", "error_invalid_credentials": "Email address or password is incorrect.", "error_session_expired": "Your session has expired. Please sign in again.", "error_session_expired_explainer": "For security reasons, sessions are automatically ended after 8 hours of inactivity.", @@ -356,11 +357,13 @@ "admin_system_import_status_done_label": "Documents processed", "admin_system_import_skipped_label": "skipped", "import_reason_invalid_pdf_signature": "Invalid PDF signature", + "import_reason_path_traversal": "Invalid filename (path)", "import_reason_file_read_error": "File read error", "import_reason_s3_upload_failed": "Upload error (S3)", "import_reason_already_exists": "Already imported", "admin_system_import_status_failed": "Import failed", "admin_system_import_failed_no_spreadsheet": "No spreadsheet file found.", + "admin_system_import_failed_artifact": "A canonical import file is missing or invalid.", "admin_system_import_failed_internal": "Import failed due to an internal error.", "admin_system_thumbnails_heading": "Generate thumbnails", "admin_system_thumbnails_description": "Generates preview images for documents without a thumbnail (e.g. after the mass import).", diff --git a/frontend/messages/es.json b/frontend/messages/es.json index 972eecb8..cbda7fab 100644 --- a/frontend/messages/es.json +++ b/frontend/messages/es.json @@ -14,6 +14,7 @@ "error_file_too_large": "El archivo es demasiado grande (máx. 50 MB).", "error_user_not_found": "Usuario no encontrado.", "error_import_already_running": "Ya hay una importación en curso. Por favor, espere a que finalice.", + "error_import_artifact_invalid": "Falta un archivo de importación canónico o no es válido. Vuelva a ejecutar el normalizador.", "error_invalid_credentials": "El correo electrónico o la contraseña son incorrectos.", "error_session_expired": "Su sesión ha expirado. Por favor, inicie sesión de nuevo.", "error_session_expired_explainer": "Por razones de seguridad, las sesiones se terminan automáticamente tras 8 horas de inactividad.", @@ -356,11 +357,13 @@ "admin_system_import_status_done_label": "Documentos procesados", "admin_system_import_skipped_label": "omitidos", "import_reason_invalid_pdf_signature": "Firma PDF no válida", + "import_reason_path_traversal": "Nombre de archivo no válido (ruta)", "import_reason_file_read_error": "Error al leer el archivo", "import_reason_s3_upload_failed": "Error de carga (S3)", "import_reason_already_exists": "Ya importado", "admin_system_import_status_failed": "Importación fallida", "admin_system_import_failed_no_spreadsheet": "No se encontró ninguna hoja de cálculo.", + "admin_system_import_failed_artifact": "Falta un archivo de importación canónico o no es válido.", "admin_system_import_failed_internal": "Error interno durante la importación.", "admin_system_thumbnails_heading": "Generar miniaturas", "admin_system_thumbnails_description": "Genera imágenes de vista previa para documentos sin miniatura (p. ej. tras la importación masiva).", diff --git a/frontend/src/lib/shared/errors.ts b/frontend/src/lib/shared/errors.ts index 96700120..dcdb9f25 100644 --- a/frontend/src/lib/shared/errors.ts +++ b/frontend/src/lib/shared/errors.ts @@ -17,6 +17,7 @@ export type ErrorCode = | 'EMAIL_ALREADY_IN_USE' | 'WRONG_CURRENT_PASSWORD' | 'IMPORT_ALREADY_RUNNING' + | 'IMPORT_ARTIFACT_INVALID' | 'INVALID_RESET_TOKEN' | 'INVITE_NOT_FOUND' | 'INVITE_EXHAUSTED' @@ -104,6 +105,8 @@ export function getErrorMessage(code: ErrorCode | string | undefined): string { return m.error_wrong_current_password(); case 'IMPORT_ALREADY_RUNNING': return m.error_import_already_running(); + case 'IMPORT_ARTIFACT_INVALID': + return m.error_import_artifact_invalid(); case 'INVALID_RESET_TOKEN': return m.error_invalid_reset_token(); case 'INVITE_NOT_FOUND': diff --git a/frontend/src/routes/admin/system/ImportStatusCard.svelte b/frontend/src/routes/admin/system/ImportStatusCard.svelte index bb9bce72..e6556857 100644 --- a/frontend/src/routes/admin/system/ImportStatusCard.svelte +++ b/frontend/src/routes/admin/system/ImportStatusCard.svelte @@ -13,10 +13,13 @@ let { const failureMessage = $derived( importStatus?.statusCode === 'IMPORT_FAILED_NO_SPREADSHEET' ? m.admin_system_import_failed_no_spreadsheet() - : m.admin_system_import_failed_internal() + : importStatus?.statusCode === 'IMPORT_FAILED_ARTIFACT' + ? m.admin_system_import_failed_artifact() + : m.admin_system_import_failed_internal() ); function reasonLabel(code: string): string { + if (code === 'INVALID_FILENAME_PATH_TRAVERSAL') return m.import_reason_path_traversal(); if (code === 'INVALID_PDF_SIGNATURE') return m.import_reason_invalid_pdf_signature(); if (code === 'FILE_READ_ERROR') return m.import_reason_file_read_error(); if (code === 'S3_UPLOAD_FAILED') return m.import_reason_s3_upload_failed();