diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java new file mode 100644 index 00000000..2107bfda --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestrator.java @@ -0,0 +1,94 @@ +package org.raddatz.familienarchiv.importing; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.io.File; +import java.time.LocalDateTime; +import java.util.List; + +/** + * Runs the four canonical loaders in their real dependency order — encoded explicitly + * here, not implied by call order — and owns the async runner plus the {@link ImportStatus} + * state machine the admin UI consumes. The orchestrator smoke-checks that all four + * artifacts are present before starting, failing fast rather than half-loading tags but no + * documents. A malformed artifact (a loader throwing) sets {@code FAILED}; an individual + * bad file is surfaced through the {@link ImportStatus.SkippedFile} mechanism instead. + */ +@Service +@RequiredArgsConstructor +@Slf4j +public class CanonicalImportOrchestrator { + + private static final String TAG_TREE_ARTIFACT = "canonical-tag-tree.xlsx"; + private static final String PERSONS_ARTIFACT = "canonical-persons.xlsx"; + private static final String PERSONS_TREE_ARTIFACT = "canonical-persons-tree.json"; + private static final String DOCUMENTS_ARTIFACT = "canonical-documents.xlsx"; + + private final TagTreeImporter tagTreeImporter; + private final PersonRegisterImporter personRegisterImporter; + private final PersonTreeImporter personTreeImporter; + private final DocumentImporter documentImporter; + + @Value("${app.import.dir:/import}") + private String canonicalDir; + + private volatile ImportStatus currentStatus = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + + public ImportStatus getStatus() { + return currentStatus; + } + + @Async + public void runImportAsync() { + if (currentStatus.state() == ImportStatus.State.RUNNING) { + throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress"); + } + runImport(); + } + + /** Synchronous entry point — wrapped by {@link #runImportAsync()} and called directly in tests. */ + void runImport() { + currentStatus = new ImportStatus(ImportStatus.State.RUNNING, "IMPORT_RUNNING", + "Import läuft...", 0, List.of(), LocalDateTime.now()); + try { + File tagTree = requireArtifact(TAG_TREE_ARTIFACT); + File persons = requireArtifact(PERSONS_ARTIFACT); + File personsTree = requireArtifact(PERSONS_TREE_ARTIFACT); + File documents = requireArtifact(DOCUMENTS_ARTIFACT); + + // Dependency DAG: documents need persons + tags; the tree needs persons. + tagTreeImporter.load(tagTree); + personRegisterImporter.load(persons); + personTreeImporter.load(personsTree); + DocumentImporter.LoadResult result = documentImporter.load(documents); + + currentStatus = new ImportStatus(ImportStatus.State.DONE, "IMPORT_DONE", + "Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.", + result.processed(), result.skippedFiles(), currentStatus.startedAt()); + } catch (DomainException e) { + log.error("Canonical import failed: {}", e.getMessage()); + currentStatus = new ImportStatus(ImportStatus.State.FAILED, "IMPORT_FAILED_ARTIFACT", + "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); + } catch (Exception e) { + log.error("Canonical import failed", e); + currentStatus = new ImportStatus(ImportStatus.State.FAILED, "IMPORT_FAILED_INTERNAL", + "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); + } + } + + private File requireArtifact(String name) { + File artifact = new File(canonicalDir, name); + if (!artifact.isFile()) { + throw DomainException.badRequest(ErrorCode.IMPORT_ARTIFACT_INVALID, + "Missing canonical artifact: " + name); + } + return artifact; + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java deleted file mode 100644 index 975517e7..00000000 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java +++ /dev/null @@ -1,509 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.v3.oas.annotations.media.Schema; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.poi.ss.usermodel.*; -import java.util.Objects; -import org.raddatz.familienarchiv.exception.DomainException; -import org.raddatz.familienarchiv.exception.ErrorCode; -import org.raddatz.familienarchiv.document.Document; -import org.raddatz.familienarchiv.document.DocumentService; -import org.raddatz.familienarchiv.document.DocumentStatus; -import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.tag.Tag; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.person.PersonNameParser; -import org.raddatz.familienarchiv.person.PersonService; -import org.raddatz.familienarchiv.tag.TagService; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.scheduling.annotation.Async; -import org.springframework.stereotype.Service; -import org.springframework.transaction.annotation.Transactional; -import org.w3c.dom.Element; -import org.w3c.dom.NodeList; -import software.amazon.awssdk.core.sync.RequestBody; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.PutObjectRequest; - -import javax.xml.parsers.DocumentBuilderFactory; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; -import java.util.Optional; -import java.util.UUID; -import java.util.stream.Stream; -import java.util.zip.ZipFile; - -@Service -@RequiredArgsConstructor -@Slf4j -public class MassImportService { - - public enum State { IDLE, RUNNING, DONE, FAILED } - - public enum SkipReason { - INVALID_FILENAME_PATH_TRAVERSAL, - INVALID_PDF_SIGNATURE, - FILE_READ_ERROR, - ALREADY_EXISTS, - S3_UPLOAD_FAILED - } - - public record SkippedFile( - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason - ) {} - - public record ImportStatus( - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode, - @JsonIgnore String message, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed, - @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List skippedFiles, - LocalDateTime startedAt - ) { - // Note: @Schema on a record accessor method is not picked up by SpringDoc; the - // "skipped" count is a computed convenience field derived from skippedFiles.size(). - @JsonProperty("skipped") - public int skipped() { return skippedFiles.size(); } - - /** Defensive-copy constructor — callers cannot mutate the stored list after construction. */ - public ImportStatus { - skippedFiles = List.copyOf(skippedFiles); - } - } - - record ProcessResult(int processed, List skippedFiles) {} - - private volatile ImportStatus currentStatus = new ImportStatus(State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - - public ImportStatus getStatus() { - return currentStatus; - } - - private final DocumentService documentService; - private final PersonService personService; - private final TagService tagService; - private final S3Client s3Client; - private final ThumbnailAsyncRunner thumbnailAsyncRunner; - - @Value("${app.s3.bucket}") - private String bucketName; - - @Value("${app.import.col.index:0}") - private int colIndex; - - @Value("${app.import.col.box:1}") - private int colBox; - - @Value("${app.import.col.folder:2}") - private int colFolder; - - @Value("${app.import.col.sender:3}") - private int colSender; - - @Value("${app.import.col.receivers:5}") - private int colReceivers; - - @Value("${app.import.col.date:7}") - private int colDate; - - @Value("${app.import.col.location:9}") - private int colLocation; - - @Value("${app.import.col.tags:10}") - private int colTags; - - @Value("${app.import.col.summary:11}") - private int colSummary; - - @Value("${app.import.col.transcription:13}") - private int colTranscription; - - @Value("${app.import.dir:/import}") - private String importDir; - - private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN); - - // ODS XML namespaces - private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; - private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; - - // We only need up to this many columns; caps repeated-empty-cell expansion - private static final int MAX_COLS = 20; - - @Async - public void runImportAsync() { - if (currentStatus.state() == State.RUNNING) { - throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress"); - } - currentStatus = new ImportStatus(State.RUNNING, "IMPORT_RUNNING", "Import läuft...", 0, List.of(), LocalDateTime.now()); - try { - File spreadsheet = findSpreadsheetFile(); - log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath()); - ProcessResult result = processRows(readSpreadsheet(spreadsheet)); - currentStatus = new ImportStatus(State.DONE, "IMPORT_DONE", - "Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.", - result.processed(), result.skippedFiles(), currentStatus.startedAt()); - } catch (NoSpreadsheetException e) { - log.error("Massenimport fehlgeschlagen: keine Tabellendatei", e); - currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_NO_SPREADSHEET", - "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); - } catch (Exception e) { - log.error("Massenimport fehlgeschlagen", e); - currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_INTERNAL", - "Fehler: " + e.getMessage(), 0, List.of(), currentStatus.startedAt()); - } - } - - private static class NoSpreadsheetException extends RuntimeException { - NoSpreadsheetException(String message) { super(message); } - } - - private File findSpreadsheetFile() throws IOException { - try (Stream files = Files.list(Paths.get(importDir))) { - return files - .filter(p -> { - String name = p.toString().toLowerCase(); - return name.endsWith(".ods") || name.endsWith(".xlsx") || name.endsWith(".xls"); - }) - .findFirst() - .orElseThrow(() -> new NoSpreadsheetException( - "Keine Tabellendatei (.ods/.xlsx/.xls) in " + importDir + " gefunden!")) - .toFile(); - } - } - - // --- Spreadsheet reading (format-specific, produces neutral List>) --- - - private List> readSpreadsheet(File file) throws Exception { - String name = file.getName().toLowerCase(); - if (name.endsWith(".ods")) { - return readOds(file); - } - return readXlsx(file); - } - - /** - * Reads an ODS file by parsing its content.xml directly (no extra library needed). - * ODS is a ZIP archive; content.xml holds the spreadsheet data as XML. - */ - List> readOds(File file) throws Exception { - List> result = new ArrayList<>(); - - try (ZipFile zip = new ZipFile(file)) { - var entry = zip.getEntry("content.xml"); - if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt"); - - var factory = XxeSafeXmlParser.hardenedFactory(); - factory.setNamespaceAware(true); - var builder = factory.newDocumentBuilder(); - var doc = builder.parse(zip.getInputStream(entry)); - - NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table"); - if (tables.getLength() == 0) return result; - - var table = (Element) tables.item(0); - NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row"); - - for (int i = 0; i < rows.getLength(); i++) { - var row = (Element) rows.item(i); - List rowData = new ArrayList<>(); - NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell"); - - for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) { - var cell = (Element) cells.item(j); - - // Read the display text (first ) - String value = ""; - NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p"); - if (textNodes.getLength() > 0) { - value = textNodes.item(0).getTextContent().trim(); - } - - // Expand number-columns-repeated (capped at MAX_COLS) - String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated"); - int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr); - repeat = Math.min(repeat, MAX_COLS - rowData.size()); - - for (int r = 0; r < repeat; r++) { - rowData.add(value); - } - } - result.add(rowData); - } - } - return result; - } - - /** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */ - private List> readXlsx(File file) throws Exception { - List> result = new ArrayList<>(); - try (FileInputStream fis = new FileInputStream(file); - Workbook workbook = WorkbookFactory.create(fis)) { - - Sheet sheet = workbook.getSheetAt(0); - for (int i = 0; i <= sheet.getLastRowNum(); i++) { - Row row = sheet.getRow(i); - List rowData = new ArrayList<>(); - if (row != null) { - for (int j = 0; j < MAX_COLS; j++) { - rowData.add(xlsxCellToString(row.getCell(j))); - } - } - result.add(rowData); - } - } - return result; - } - - private String xlsxCellToString(Cell cell) { - if (cell == null) return ""; - return switch (cell.getCellType()) { - case STRING -> cell.getStringCellValue(); - case NUMERIC -> { - if (DateUtil.isCellDateFormatted(cell)) { - yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO - } - yield String.valueOf((int) cell.getNumericCellValue()); - } - case BOOLEAN -> String.valueOf(cell.getBooleanCellValue()); - default -> ""; - }; - } - - // --- Import logic (works on neutral List rows) --- - - private ProcessResult processRows(List> rows) { - int processed = 0; - List skippedFiles = new ArrayList<>(); - - for (int i = 1; i < rows.size(); i++) { // skip header row - List cells = rows.get(i); - String index = getCell(cells, colIndex); - if (index.isBlank()) continue; - - String filename = index.contains(".") ? index : index + ".pdf"; - if (!isValidImportFilename(filename)) { - log.warn("Skipping import row {}: filename rejected — {}", i, filename); - skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_FILENAME_PATH_TRAVERSAL)); - continue; - } - Optional fileOnDisk = findFileRecursive(filename); - if (fileOnDisk.isEmpty()) { - log.warn("Datei nicht gefunden, importiere nur Metadaten: {}", filename); - } - - if (fileOnDisk.isPresent()) { - try { - if (!isPdfMagicBytes(fileOnDisk.get())) { - log.warn("Überspringe {}: Datei beginnt nicht mit %PDF-Signatur", filename); - skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_PDF_SIGNATURE)); - continue; - } - } catch (IOException e) { - log.error("Fehler beim Prüfen der Magic-Bytes für {}", filename, e); - skippedFiles.add(new SkippedFile(filename, SkipReason.FILE_READ_ERROR)); - continue; - } - } - - Optional skipReason = importSingleDocument(cells, fileOnDisk, filename, index); - if (skipReason.isPresent()) { - skippedFiles.add(new SkippedFile(filename, skipReason.get())); - } else { - processed++; - } - } - return new ProcessResult(processed, skippedFiles); - } - - private boolean isValidImportFilename(String filename) { - if (filename == null || filename.isBlank()) return false; - if (filename.contains("/")) return false; - if (filename.contains("\\")) return false; - if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH - if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS - if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR - if (filename.contains("..")) return false; - if (filename.equals(".")) return false; - if (filename.contains("\0")) return false; - // Paths.get() is safe here on Linux for all inputs that passed the checks above; - // it may throw InvalidPathException for OS-specific illegal chars on Windows, - // but those are not reachable in production. - if (Paths.get(filename).isAbsolute()) return false; - return true; - } - - // package-private: Mockito spy in tests can override to inject IOException - InputStream openFileStream(File file) throws IOException { - return new FileInputStream(file); - } - - private boolean isPdfMagicBytes(File file) throws IOException { - try (InputStream is = openFileStream(file)) { - byte[] header = is.readNBytes(4); - return header.length == 4 - && header[0] == 0x25 // % - && header[1] == 0x50 // P - && header[2] == 0x44 // D - && header[3] == 0x46; // F - } - } - - /** - * Imports a single document row. - * - * @return empty Optional on success; an Optional containing the skip reason on failure/skip. - */ - @Transactional - protected Optional importSingleDocument(List cells, Optional file, String originalFilename, String index) { - Optional existing = documentService.findByOriginalFilename(originalFilename); - if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) { - log.info("Dokument {} existiert bereits, überspringe.", originalFilename); - return Optional.of(SkipReason.ALREADY_EXISTS); - } - - String archiveBox = getCell(cells, colBox); - String archiveFolder = getCell(cells, colFolder); - String senderRaw = getCell(cells, colSender); - String receiversRaw = getCell(cells, colReceivers); - LocalDate date = parseDate(getCell(cells, colDate)); - String location = getCell(cells, colLocation); - String tagRaw = getCell(cells, colTags); - String summary = getCell(cells, colSummary); - String transcription = getCell(cells, colTranscription); - - String s3Key = null; - String contentType = null; - DocumentStatus status = DocumentStatus.PLACEHOLDER; - - if (file.isPresent()) { - try { - contentType = Files.probeContentType(file.get().toPath()); - } catch (IOException e) { - contentType = null; - } - if (contentType == null) contentType = "application/octet-stream"; - - s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName(); - try { - s3Client.putObject(PutObjectRequest.builder() - .bucket(bucketName) - .key(s3Key) - .contentType(contentType) - .build(), - RequestBody.fromFile(file.get())); - status = DocumentStatus.UPLOADED; - } catch (Exception e) { - log.error("S3 Upload Fehler für {}", file.get().getName(), e); - return Optional.of(SkipReason.S3_UPLOAD_FAILED); - } - } - - Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw); - List receivers = PersonNameParser.parseReceivers(receiversRaw).stream() - .map(this::findOrCreatePerson) - .filter(Objects::nonNull) - .toList(); - - Tag tag = null; - if (!tagRaw.isBlank()) { - tag = tagService.findOrCreate(tagRaw); - } - - Document doc = existing.orElse(Document.builder() - .originalFilename(originalFilename) - .build()); - - // Heuristic: mark as complete if at least one key field is present in the spreadsheet row - boolean metadataComplete = date != null || !senderRaw.isBlank() || !receiversRaw.isBlank(); - - doc.setTitle(buildTitle(index, date, location)); - doc.setFilePath(s3Key); - doc.setContentType(contentType); - doc.setStatus(status); - doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox); - doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder); - doc.setDocumentDate(date); - doc.setLocation(location.isBlank() ? null : location); - doc.setSummary(summary.isBlank() ? null : summary); - doc.setTranscription(transcription.isBlank() ? null : transcription); - doc.setSender(sender); - doc.getReceivers().addAll(receivers); - if (tag != null) doc.getTags().add(tag); - doc.setMetadataComplete(metadataComplete); - - Document saved = documentService.save(doc); - if (file.isPresent()) { - thumbnailAsyncRunner.dispatchAfterCommit(saved.getId()); - } - log.info("Importiert{}: {}", file.isEmpty() ? " (nur Metadaten)" : "", originalFilename); - return Optional.empty(); - } - - // --- Helpers --- - - private String getCell(List cells, int col) { - if (col >= cells.size()) return ""; - String val = cells.get(col); - return val == null ? "" : val.trim(); - } - - private LocalDate parseDate(String value) { - if (value == null || value.isBlank()) return null; - try { - return LocalDate.parse(value.trim()); - } catch (DateTimeParseException e) { - return null; - } - } - - private String buildTitle(String index, LocalDate date, String location) { - StringBuilder sb = new StringBuilder(index); - if (date != null) { - sb.append(" \u2013 ").append(date.format(GERMAN_DATE)); - } - if (location != null && !location.isBlank()) { - sb.append(" \u2013 ").append(location); - } - return sb.toString(); - } - - private Person findOrCreatePerson(String rawName) { - return personService.findOrCreateByAlias(rawName); - } - - private Optional findFileRecursive(String filename) { - File baseDir = new File(importDir); - try (Stream walk = Files.walk(baseDir.toPath())) { - Optional match = walk.filter(p -> !Files.isDirectory(p)) - .filter(p -> p.getFileName().toString().equals(filename)) - .findFirst(); - if (match.isEmpty()) return Optional.empty(); - File candidate = match.get().toFile(); - String baseDirCanonical = baseDir.getCanonicalPath(); - if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) { - throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate); - } - return Optional.of(candidate); - } catch (IOException e) { - return Optional.empty(); - } - } -} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java b/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java deleted file mode 100644 index 949ea054..00000000 --- a/backend/src/main/java/org/raddatz/familienarchiv/importing/XxeSafeXmlParser.java +++ /dev/null @@ -1,20 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -class XxeSafeXmlParser { - - private XxeSafeXmlParser() {} - - static DocumentBuilderFactory hardenedFactory() throws ParserConfigurationException { - var factory = DocumentBuilderFactory.newInstance(); - factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - factory.setXIncludeAware(false); - factory.setExpandEntityReferences(false); - return factory; - } -} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java index 18b6c2c0..74b5d643 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/user/AdminController.java @@ -5,7 +5,8 @@ import org.raddatz.familienarchiv.security.Permission; import org.raddatz.familienarchiv.security.RequirePermission; import org.raddatz.familienarchiv.document.DocumentService; import org.raddatz.familienarchiv.document.DocumentVersionService; -import org.raddatz.familienarchiv.importing.MassImportService; +import org.raddatz.familienarchiv.importing.CanonicalImportOrchestrator; +import org.raddatz.familienarchiv.importing.ImportStatus; import org.raddatz.familienarchiv.document.ThumbnailBackfillService; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.GetMapping; @@ -21,20 +22,20 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class AdminController { - private final MassImportService massImportService; + private final CanonicalImportOrchestrator importOrchestrator; private final DocumentService documentService; private final DocumentVersionService documentVersionService; private final ThumbnailBackfillService thumbnailBackfillService; @PostMapping("/trigger-import") - public ResponseEntity triggerMassImport() { - massImportService.runImportAsync(); - return ResponseEntity.accepted().body(massImportService.getStatus()); + public ResponseEntity triggerMassImport() { + importOrchestrator.runImportAsync(); + return ResponseEntity.accepted().body(importOrchestrator.getStatus()); } @GetMapping("/import-status") - public ResponseEntity importStatus() { - return ResponseEntity.ok(massImportService.getStatus()); + public ResponseEntity importStatus() { + return ResponseEntity.ok(importOrchestrator.getStatus()); } @PostMapping("/backfill-versions") diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index e74f4d41..1e4558e0 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -125,17 +125,10 @@ app: password: ${APP_ADMIN_PASSWORD:admin123} import: - col: - index: 0 - box: 1 - folder: 2 - sender: 3 - receivers: 5 - date: 7 - location: 9 - tags: 10 - summary: 11 - transcription: 13 + # Directory holding the normalizer's committed canonical artifacts + # (canonical-{documents,persons,tag-tree}.xlsx + canonical-persons-tree.json). + # The loader maps columns by header name — no positional indices (see ADR-025). + dir: ${IMPORT_DIR:/import} ocr: sender-model: diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java new file mode 100644 index 00000000..dc12d070 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/CanonicalImportOrchestratorTest.java @@ -0,0 +1,130 @@ +package org.raddatz.familienarchiv.importing; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.InOrder; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.exception.DomainException; +import org.springframework.test.util.ReflectionTestUtils; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class CanonicalImportOrchestratorTest { + + @Mock TagTreeImporter tagTreeImporter; + @Mock PersonRegisterImporter personRegisterImporter; + @Mock PersonTreeImporter personTreeImporter; + @Mock DocumentImporter documentImporter; + + private CanonicalImportOrchestrator orchestrator(Path dir) { + CanonicalImportOrchestrator o = new CanonicalImportOrchestrator( + tagTreeImporter, personRegisterImporter, personTreeImporter, documentImporter); + ReflectionTestUtils.setField(o, "canonicalDir", dir.toString()); + return o; + } + + private void writeAllArtifacts(Path dir) throws Exception { + Files.writeString(dir.resolve("canonical-tag-tree.xlsx"), "x"); + Files.writeString(dir.resolve("canonical-persons.xlsx"), "x"); + Files.writeString(dir.resolve("canonical-persons-tree.json"), "x"); + Files.writeString(dir.resolve("canonical-documents.xlsx"), "x"); + } + + @Test + void getStatus_isIdleByDefault(@TempDir Path dir) { + assertThat(orchestrator(dir).getStatus().state()).isEqualTo(ImportStatus.State.IDLE); + } + + @Test + void runImport_loadsTagsAndPersonsBeforeDocuments(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(0, List.of())); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + InOrder order = inOrder(tagTreeImporter, personRegisterImporter, personTreeImporter, documentImporter); + order.verify(tagTreeImporter).load(any()); + order.verify(personRegisterImporter).load(any()); + order.verify(personTreeImporter).load(any()); + order.verify(documentImporter).load(any()); + } + + @Test + void runImport_setsStatusDone_onSuccess(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(3, List.of())); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.DONE); + assertThat(o.getStatus().processed()).isEqualTo(3); + } + + @Test + void runImport_failsClosed_whenAnArtifactIsMissing(@TempDir Path dir) throws Exception { + Files.writeString(dir.resolve("canonical-tag-tree.xlsx"), "x"); + // the other three artifacts are absent + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.FAILED); + verify(tagTreeImporter, never()).load(any()); + verify(documentImporter, never()).load(any()); + } + + @Test + void runImport_setsStatusFailed_whenLoaderThrows(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(tagTreeImporter.load(any())).thenThrow(DomainException.badRequest( + org.raddatz.familienarchiv.exception.ErrorCode.IMPORT_ARTIFACT_INVALID, "bad")); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().state()).isEqualTo(ImportStatus.State.FAILED); + verify(documentImporter, never()).load(any()); + } + + @Test + void runImportAsync_throwsConflict_whenAlreadyRunning(@TempDir Path dir) { + CanonicalImportOrchestrator o = orchestrator(dir); + ReflectionTestUtils.setField(o, "currentStatus", new ImportStatus( + ImportStatus.State.RUNNING, "IMPORT_RUNNING", "running", 0, List.of(), null)); + + assertThatThrownBy(o::runImportAsync) + .isInstanceOf(DomainException.class) + .hasMessageContaining("already in progress"); + } + + @Test + void runImport_aggregatesDocumentSkips(@TempDir Path dir) throws Exception { + writeAllArtifacts(dir); + when(documentImporter.load(any())).thenReturn(new DocumentImporter.LoadResult(1, + List.of(new ImportStatus.SkippedFile("fake.pdf", ImportStatus.SkipReason.INVALID_PDF_SIGNATURE)))); + CanonicalImportOrchestrator o = orchestrator(dir); + + o.runImport(); + + assertThat(o.getStatus().skipped()).isEqualTo(1); + assertThat(o.getStatus().skippedFiles()) + .extracting(ImportStatus.SkippedFile::filename) + .containsExactly("fake.pdf"); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java deleted file mode 100644 index d87d28c1..00000000 --- a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java +++ /dev/null @@ -1,896 +0,0 @@ -package org.raddatz.familienarchiv.importing; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; -import org.raddatz.familienarchiv.exception.DomainException; -import org.raddatz.familienarchiv.document.Document; -import org.raddatz.familienarchiv.document.DocumentService; -import org.raddatz.familienarchiv.document.DocumentStatus; -import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner; -import org.raddatz.familienarchiv.person.Person; -import org.raddatz.familienarchiv.tag.Tag; -import org.raddatz.familienarchiv.tag.TagService; -import org.raddatz.familienarchiv.person.PersonService; -import org.springframework.test.util.ReflectionTestUtils; -import software.amazon.awssdk.core.sync.RequestBody; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.PutObjectRequest; - -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.xml.sax.SAXParseException; - -import java.io.File; -import java.io.OutputStream; -import java.io.ByteArrayOutputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.UUID; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.*; - -@ExtendWith(MockitoExtension.class) -class MassImportServiceTest { - - @Mock DocumentService documentService; - @Mock PersonService personService; - @Mock TagService tagService; - @Mock S3Client s3Client; - @Mock ThumbnailAsyncRunner thumbnailAsyncRunner; - - MassImportService service; - - @BeforeEach - void setUp() { - service = new MassImportService(documentService, personService, tagService, s3Client, thumbnailAsyncRunner); - ReflectionTestUtils.setField(service, "bucketName", "test-bucket"); - ReflectionTestUtils.setField(service, "importDir", "/import"); - ReflectionTestUtils.setField(service, "colIndex", 0); - ReflectionTestUtils.setField(service, "colBox", 1); - ReflectionTestUtils.setField(service, "colFolder", 2); - ReflectionTestUtils.setField(service, "colSender", 3); - ReflectionTestUtils.setField(service, "colReceivers", 5); - ReflectionTestUtils.setField(service, "colDate", 7); - ReflectionTestUtils.setField(service, "colLocation", 9); - ReflectionTestUtils.setField(service, "colTags", 10); - ReflectionTestUtils.setField(service, "colSummary", 11); - ReflectionTestUtils.setField(service, "colTranscription", 13); - } - - // ─── getStatus ──────────────────────────────────────────────────────────── - - @Test - void getStatus_returnsIdleByDefault() { - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.IDLE); - } - - @Test - void getStatus_hasStatusCode_IMPORT_IDLE_byDefault() { - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_IDLE"); - } - - // ─── runImportAsync ─────────────────────────────────────────────────────── - - @Test - void runImportAsync_setsFailedStatus_whenImportDirectoryDoesNotExist() { - // /import directory doesn't exist in test environment → IOException → IMPORT_FAILED_INTERNAL - service.runImportAsync(); - - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.FAILED); - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_FAILED_INTERNAL"); - } - - @Test - void runImportAsync_readsFromConfiguredImportDir(@TempDir Path tempDir) { - // Empty temp dir → findSpreadsheetFile throws "no spreadsheet" with the - // configured path in the message. Proves the field, not a constant, - // drives the lookup. - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().state()).isEqualTo(MassImportService.State.FAILED); - assertThat(service.getStatus().message()).contains(tempDir.toString()); - } - - @Test - void runImportAsync_setsStatusCode_IMPORT_FAILED_NO_SPREADSHEET_whenDirIsEmpty(@TempDir Path tempDir) { - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_FAILED_NO_SPREADSHEET"); - } - - @Test - void runImportAsync_setsStatusCode_IMPORT_DONE_whenSpreadsheetHasNoDataRows(@TempDir Path tempDir) throws Exception { - Path xlsx = tempDir.resolve("import.xlsx"); - try (XSSFWorkbook wb = new XSSFWorkbook()) { - wb.createSheet("Sheet1"); - try (OutputStream out = Files.newOutputStream(xlsx)) { - wb.write(out); - } - } - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - - service.runImportAsync(); - - assertThat(service.getStatus().statusCode()).isEqualTo("IMPORT_DONE"); - } - - @Test - void runImportAsync_throwsConflict_whenAlreadyRunning() { - MassImportService.ImportStatus running = new MassImportService.ImportStatus( - MassImportService.State.RUNNING, "IMPORT_RUNNING", "Running...", 0, List.of(), LocalDateTime.now()); - ReflectionTestUtils.setField(service, "currentStatus", running); - - assertThatThrownBy(() -> service.runImportAsync()) - .isInstanceOf(DomainException.class) - .hasMessageContaining("already in progress"); - } - - // ─── importSingleDocument — skip already uploaded ───────────────────────── - - @Test - void importSingleDocument_skips_whenDocumentAlreadyUploadedNotPlaceholder() { - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("doc001.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.of(existing)); - - Optional result = service.importSingleDocument(minimalCells("doc001.pdf"), Optional.empty(), "doc001.pdf", "doc001"); - - verify(documentService, never()).save(any()); - assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS); - } - - // ─── importSingleDocument — already-exists guard fires before file I/O ───── - - @Test - void importSingleDocument_skipsWithAlreadyExists_whenDocumentUploadedAndFileIsPresent(@TempDir Path tempDir) throws Exception { - // Document already exists with status UPLOADED (not PLACEHOLDER). - // A physical PDF file is also present on disk (valid magic bytes). - // Expected: ALREADY_EXISTS is returned and no S3 upload is attempted — - // the guard fires before any file I/O, so no partial processing occurs. - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("present.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("present.pdf")).thenReturn(Optional.of(existing)); - - Path physicalFile = tempDir.resolve("present.pdf"); - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(physicalFile, pdfHeader); - - Optional result = service.importSingleDocument( - minimalCells("present.pdf"), Optional.of(physicalFile.toFile()), "present.pdf", "present"); - - assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS); - verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService, never()).save(any()); - } - - // ─── importSingleDocument — S3 failure surfaced in skippedFiles ────────── - - @Test - void runImportAsync_addsS3UploadFailed_toSkippedFiles_whenS3Throws(@TempDir Path tempDir) throws Exception { - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(tempDir.resolve("upload_fail.pdf"), pdfHeader); - buildMinimalImportXlsx(tempDir, "upload_fail.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename("upload_fail.pdf")).thenReturn(Optional.empty()); - doThrow(new RuntimeException("S3 unavailable")) - .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::filename, MassImportService.SkippedFile::reason) - .containsExactly(org.assertj.core.groups.Tuple.tuple("upload_fail.pdf", MassImportService.SkipReason.S3_UPLOAD_FAILED)); - } - - @Test - void runImportAsync_addsAlreadyExists_toSkippedFiles_whenDocumentAlreadyUploaded(@TempDir Path tempDir) throws Exception { - buildMinimalImportXlsx(tempDir, "existing.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - Document existing = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("existing.pdf") - .status(DocumentStatus.UPLOADED) - .build(); - when(documentService.findByOriginalFilename("existing.pdf")).thenReturn(Optional.of(existing)); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.ALREADY_EXISTS); - } - - // ─── importSingleDocument — create new document (metadata only) ─────────── - - @Test - void importSingleDocument_createsNewDocument_whenNotExists() { - when(documentService.findByOriginalFilename("doc002.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument(minimalCells("doc002.pdf"), Optional.empty(), "doc002.pdf", "doc002"); - - verify(documentService).save(argThat(d -> - d.getOriginalFilename().equals("doc002.pdf") - && d.getStatus() == DocumentStatus.PLACEHOLDER)); - } - - // ─── importSingleDocument — update existing placeholder ────────────────── - - @Test - void importSingleDocument_updatesExistingPlaceholder() { - Document placeholder = Document.builder() - .id(UUID.randomUUID()) - .originalFilename("existing.pdf") - .status(DocumentStatus.PLACEHOLDER) - .build(); - when(documentService.findByOriginalFilename("existing.pdf")).thenReturn(Optional.of(placeholder)); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument(minimalCells("existing.pdf"), Optional.empty(), "existing.pdf", "existing"); - - verify(documentService).save(same(placeholder)); - } - - // ─── importSingleDocument — with file (S3 upload) ───────────────────────── - - @Test - void importSingleDocument_uploadsFileToS3_andSetsStatusUploaded(@TempDir Path tempDir) throws Exception { - Path tempFile = tempDir.resolve("doc003.pdf"); - Files.write(tempFile, "PDF content".getBytes()); - - when(documentService.findByOriginalFilename("doc003.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - service.importSingleDocument( - minimalCells("doc003.pdf"), Optional.of(tempFile.toFile()), "doc003.pdf", "doc003"); - - verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - verify(documentService).save(argThat(d -> d.getStatus() == DocumentStatus.UPLOADED)); - } - - @Test - void importSingleDocument_returnsS3UploadFailed_whenS3UploadFails(@TempDir Path tempDir) throws Exception { - Path tempFile = tempDir.resolve("fail.pdf"); - Files.write(tempFile, "data".getBytes()); - - when(documentService.findByOriginalFilename("fail.pdf")).thenReturn(Optional.empty()); - doThrow(new RuntimeException("S3 error")) - .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - - Optional result = service.importSingleDocument( - minimalCells("fail.pdf"), Optional.of(tempFile.toFile()), "fail.pdf", "fail"); - - verify(documentService, never()).save(any()); - assertThat(result).isPresent().contains(MassImportService.SkipReason.S3_UPLOAD_FAILED); - } - - // ─── importSingleDocument — sender handling ─────────────────────────────── - - @Test - void importSingleDocument_setsNullSender_whenSenderCellIsBlank() { - when(documentService.findByOriginalFilename("nosender.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("nosender.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "nosender.pdf", "nosender"); - - verify(documentService).save(argThat(d -> d.getSender() == null)); - verify(personService, never()).findOrCreateByAlias(any()); - } - - @Test - void importSingleDocument_createsSender_whenSenderCellIsNonBlank() { - Person sender = Person.builder().id(UUID.randomUUID()).firstName("Walter").lastName("Müller").build(); - when(documentService.findByOriginalFilename("withsender.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("Walter Müller")).thenReturn(sender); - - List cells = buildCells("withsender.pdf", "Walter Müller", "", ""); - service.importSingleDocument(cells, Optional.empty(), "withsender.pdf", "withsender"); - - verify(personService).findOrCreateByAlias("Walter Müller"); - verify(documentService).save(argThat(d -> d.getSender() == sender)); - } - - // ─── importSingleDocument — tag handling ───────────────────────────────── - - @Test - void importSingleDocument_createsTag_whenTagCellIsNonBlank() { - Tag tag = Tag.builder().id(UUID.randomUUID()).name("Familie").build(); - when(documentService.findByOriginalFilename("tagged.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(tagService.findOrCreate("Familie")).thenReturn(tag); - - List cells = buildCells("tagged.pdf", "", "", "Familie"); - service.importSingleDocument(cells, Optional.empty(), "tagged.pdf", "tagged"); - - verify(tagService).findOrCreate("Familie"); - } - - @Test - void importSingleDocument_doesNotCreateTag_whenTagCellIsBlank() { - when(documentService.findByOriginalFilename("notag.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("notag.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "notag.pdf", "notag"); - - verify(tagService, never()).findOrCreate(any()); - } - - // ─── importSingleDocument — metadataComplete heuristic ─────────────────── - - @Test - void importSingleDocument_metadataComplete_whenSenderPresent() { - Person sender = Person.builder().id(UUID.randomUUID()).firstName("A").lastName("B").build(); - when(documentService.findByOriginalFilename("meta.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("A B")).thenReturn(sender); - - List cells = buildCells("meta.pdf", "A B", "", ""); - service.importSingleDocument(cells, Optional.empty(), "meta.pdf", "meta"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - @Test - void importSingleDocument_metadataIncomplete_whenNoKeyFieldsPresent() { - when(documentService.findByOriginalFilename("nometa.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("nometa.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "nometa.pdf", "nometa"); - - verify(documentService).save(argThat(d -> !d.isMetadataComplete())); - } - - // ─── importSingleDocument — blank fields set to null ───────────────────── - - @Test - void importSingleDocument_setsBlankFieldsToNull() { - when(documentService.findByOriginalFilename("blank.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = buildCells("blank.pdf", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "blank.pdf", "blank"); - - verify(documentService).save(argThat(d -> - d.getLocation() == null && - d.getSummary() == null && - d.getTranscription() == null && - d.getArchiveBox() == null && - d.getArchiveFolder() == null)); - } - - // ─── processRows — via ReflectionTestUtils ──────────────────────────────── - - @Test - void processRows_returnsZero_whenOnlyHeaderRow() { - List> rows = List.of(List.of("header", "col1")); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - assertThat(result.processed()).isEqualTo(0); - } - - @Test - void processRows_skipsRowWithBlankIndex() { - List> rows = List.of( - List.of("header"), - minimalCells("") // blank index - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - assertThat(result.processed()).isEqualTo(0); - verify(documentService, never()).findByOriginalFilename(any()); - } - - @Test - void processRows_addsExtension_whenIndexHasNoDot() { - when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("doc001") // no dot → appends ".pdf" - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - verify(documentService).findByOriginalFilename("doc001.pdf"); - } - - @Test - void processRows_usesFilenameAsIs_whenIndexHasDot() { - when(documentService.findByOriginalFilename("doc002.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("doc002.pdf") // has dot → used as-is - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - verify(documentService).findByOriginalFilename("doc002.pdf"); - } - - // ─── isValidImportFilename — security regression — do not remove ───────── - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsNull() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", (String) null); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsBlank() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", " "); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsForwardSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "etc/passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsBackslash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "..\\etc\\passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsDotDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "doc..evil.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsDotDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".."); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameIsAbsolutePath() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "/etc/passwd"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsNullByte() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "file\0.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameIsPlainBasename() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "document.pdf"); - assertThat(result).isTrue(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeDivisionSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo∕bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsFullwidthSlash() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo/bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeReverseSolidus() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo⧵bar.pdf"); - assertThat(result).isFalse(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameHasLeadingDot() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".hidden.pdf"); - assertThat(result).isTrue(); - } - - @Test - void isValidImportFilename_returnsTrue_whenFilenameHasSpaces() { - boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "Brief an Oma.pdf"); - assertThat(result).isTrue(); - } - - @Test - void processRows_skipsRowAndContinues_whenFilenameIsPathTraversal() { - when(documentService.findByOriginalFilename("legitimate.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List> rows = List.of( - List.of("header"), - minimalCells("../evil"), // row 1: path traversal — should be skipped - minimalCells("legitimate.pdf") // row 2: valid — should be processed - ); - MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows); - - assertThat(result.processed()).isEqualTo(1); - assertThat(result.skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL); - } - - // ─── importSingleDocument — non-blank optional fields ──────────────────── - - @Test - void importSingleDocument_setsNonNullOptionalFields_whenPresent() { - when(documentService.findByOriginalFilename("rich.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - // box=1, folder=2, location=9, summary=11, transcription=13 - List cells = List.of( - "rich.pdf", // 0: index - "Box A", // 1: box - "Folder B", // 2: folder - "", // 3: sender - "", // 4: unused - "", // 5: receivers - "", // 6: unused - "", // 7: date - "", // 8: unused - "Hamburg", // 9: location - "", // 10: tags - "A summary", // 11: summary - "", // 12: unused - "A transcript" // 13: transcription - ); - - service.importSingleDocument(cells, Optional.empty(), "rich.pdf", "rich"); - - verify(documentService).save(argThat(d -> - "Box A".equals(d.getArchiveBox()) && - "Folder B".equals(d.getArchiveFolder()) && - "Hamburg".equals(d.getLocation()) && - "A summary".equals(d.getSummary()) && - "A transcript".equals(d.getTranscription()))); - } - - @Test - void importSingleDocument_setsMetadataComplete_whenReceiversArePresent() { - Person receiver = Person.builder().id(UUID.randomUUID()).firstName("Walter").lastName("Müller").build(); - when(documentService.findByOriginalFilename("rcv.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - when(personService.findOrCreateByAlias("Walter Müller")).thenReturn(receiver); - - List cells = List.of( - "rcv.pdf", "", "", "", "", "Walter Müller", "", "", "", "", "", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "rcv.pdf", "rcv"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - @Test - void importSingleDocument_setsMetadataComplete_whenDateIsPresent() { - when(documentService.findByOriginalFilename("dated.pdf")).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - - List cells = List.of( - "dated.pdf", "", "", "", "", "", "", "2024-03-15", "", "", "", "", "", ""); - service.importSingleDocument(cells, Optional.empty(), "dated.pdf", "dated"); - - verify(documentService).save(argThat(Document::isMetadataComplete)); - } - - // ─── buildTitle — null location ─────────────────────────────────────────── - - @Test - void buildTitle_withNullLocation_skipsLocationPart() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc005", LocalDate.of(1940, 5, 1), (String) null); - assertThat(result).contains("doc005").contains("1940"); - assertThat(result).doesNotContain("Berlin"); - } - - // ─── parseDate — via ReflectionTestUtils ───────────────────────────────── - - @Test - void parseDate_returnsNull_whenValueIsNull() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", (String) null); - assertThat(result).isNull(); - } - - @Test - void parseDate_returnsNull_whenValueIsBlank() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", " "); - assertThat(result).isNull(); - } - - @Test - void parseDate_returnsDate_whenValidIsoFormat() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", "2024-03-15"); - assertThat(result).isEqualTo(LocalDate.of(2024, 3, 15)); - } - - @Test - void parseDate_returnsNull_whenInvalidDateString() { - LocalDate result = ReflectionTestUtils.invokeMethod(service, "parseDate", "15.03.2024"); - assertThat(result).isNull(); - } - - // ─── buildTitle — via ReflectionTestUtils ──────────────────────────────── - - @Test - void buildTitle_withDateAndLocation() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc001", LocalDate.of(1940, 5, 1), "Berlin"); - assertThat(result).contains("doc001").contains("Berlin").contains("1940"); - } - - @Test - void buildTitle_withDateOnly() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc002", LocalDate.of(1960, 8, 15), ""); - assertThat(result).contains("doc002").contains("1960"); - assertThat(result).doesNotContain("Berlin"); - } - - @Test - void buildTitle_withIndexOnly_whenDateAndLocationAreNull() { - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc003", null, ""); - assertThat(result).isEqualTo("doc003"); - } - - @Test - void buildTitle_withLocationOnly_whenDateIsNull() { - // date=null, location present → date part skipped, location appended - String result = ReflectionTestUtils.invokeMethod(service, "buildTitle", - "doc004", null, "Berlin"); - assertThat(result).contains("doc004").contains("Berlin"); - assertThat(result).doesNotContain("("); // no date part - } - - // ─── getCell — via ReflectionTestUtils ─────────────────────────────────── - - @Test - void getCell_returnsEmptyString_whenColBeyondListSize() { - List cells = List.of("a", "b"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 5); - assertThat(result).isEmpty(); - } - - @Test - void getCell_returnsEmptyString_whenValueIsNull() { - List cells = new ArrayList<>(); - cells.add(null); - cells.add("b"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 0); - assertThat(result).isEmpty(); - } - - @Test - void getCell_returnsTrimmedValue() { - List cells = List.of(" hello ", "world"); - String result = ReflectionTestUtils.invokeMethod(service, "getCell", cells, 0); - assertThat(result).isEqualTo("hello"); - } - - // ─── PDF magic byte validation regression ───────────────────────────────── - - @Test - void runImportAsync_uploadsValidPdf_andSkipsFakeOne(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - verify(s3Client, times(1)).putObject(any(PutObjectRequest.class), any(RequestBody.class)); - } - - @Test - void runImportAsync_setsSkippedCount_toOne_whenOneFakeFile(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - } - - @Test - void runImportAsync_includesRejectedFilename_inSkippedFiles(@TempDir Path tempDir) throws Exception { - setupOneValidOneFakeImport(tempDir); - - service.runImportAsync(); - - assertThat(service.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::filename) - .contains("fake.pdf"); - } - - @Test - void runImportAsync_skipsFile_whenShorterThanFourBytes(@TempDir Path tempDir) throws Exception { - Files.write(tempDir.resolve("tiny.pdf"), new byte[]{0x25, 0x50, 0x44}); // only 3 bytes - buildMinimalImportXlsx(tempDir, "tiny.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - - service.runImportAsync(); - - assertThat(service.getStatus().skipped()).isEqualTo(1); - } - - @Test - void runImportAsync_skipsFile_whenMagicBytesCheckThrowsIOException(@TempDir Path tempDir) throws Exception { - Files.writeString(tempDir.resolve("unreadable.pdf"), "some content"); - buildMinimalImportXlsx(tempDir, "unreadable.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - - MassImportService spyService = spy(service); - doThrow(new java.io.IOException("simulated read error")).when(spyService).openFileStream(any(File.class)); - - spyService.runImportAsync(); - - assertThat(spyService.getStatus().skipped()).isEqualTo(1); - assertThat(spyService.getStatus().skippedFiles()) - .extracting(MassImportService.SkippedFile::reason) - .containsExactly(MassImportService.SkipReason.FILE_READ_ERROR); - } - - // ─── findFileRecursive — symlink escape security regression — do not remove ─ - - @Test - void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir( - @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception { - Path outsideFile = outsideDir.resolve("secret.pdf"); - Files.writeString(outsideFile, "sensitive content"); - Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile); - - ReflectionTestUtils.setField(service, "importDir", importDirPath.toString()); - - assertThatThrownBy(() -> ReflectionTestUtils.invokeMethod(service, "findFileRecursive", "secret.pdf")) - .isInstanceOf(DomainException.class); - } - - // ─── readOds — XXE security regression ─────────────────────────────────── - - // Security regression — do not remove. - @Test - void readOds_rejects_xxe_doctype_payload(@TempDir Path tempDir) throws Exception { - File malicious = buildXxeOds(tempDir, "file:///etc/hostname"); - assertThatThrownBy(() -> service.readOds(malicious)) - .isInstanceOf(SAXParseException.class) - .hasMessageContaining("DOCTYPE is disallowed"); - } - - @Test - void readOds_parses_valid_ods_correctly(@TempDir Path tempDir) throws Exception { - File valid = buildValidOds(tempDir, "Mustermann"); - List> rows = service.readOds(valid); - assertThat(rows).isNotEmpty(); - assertThat(rows.get(0)).contains("Mustermann"); - } - - // ─── helpers ────────────────────────────────────────────────────────────── - - /** - * Builds a minimal 14-element cell row with the given filename at index 0 - * and blanks for all optional fields. - */ - private List minimalCells(String filename) { - return buildCells(filename, "", "", ""); - } - - /** - * Builds a cell row with sender, receiver, and tag controls. - * Layout matches the default column indices set in setUp(). - */ - private List buildCells(String filename, String sender, String receivers, String tag) { - // 14 elements: index=0,box=1,folder=2,sender=3,[4],receivers=5,[6],date=7,[8],location=9,tag=10,summary=11,[12],transcription=13 - return List.of( - filename, // 0: index - "", // 1: box - "", // 2: folder - sender, // 3: sender - "", // 4: (unused) - receivers, // 5: receivers - "", // 6: (unused) - "", // 7: date - "", // 8: (unused) - "", // 9: location - tag, // 10: tags - "", // 11: summary - "", // 12: (unused) - "" // 13: transcription - ); - } - - /** Creates a minimal ODS ZIP containing a content.xml with an XXE payload. */ - private File buildXxeOds(Path dir, String entityTarget) throws Exception { - String xml = "" - + "]>" - + "" - + "" - + "" - + "&xxe;" - + "" - + "" - + ""; - return writeOdsZip(dir.resolve("malicious.ods"), xml); - } - - /** Creates a minimal valid ODS ZIP containing a content.xml with the given cell value. - * cellValue must not contain XML metacharacters ({@code < > &}). */ - private File buildValidOds(Path dir, String cellValue) throws Exception { - String xml = "" - + "" - + "" - + "" - + "" + cellValue + "" - + "" - + "" - + ""; - return writeOdsZip(dir.resolve("valid.ods"), xml); - } - - private File writeOdsZip(Path destination, String contentXml) throws Exception { - try (OutputStream fos = Files.newOutputStream(destination); - ZipOutputStream zip = new ZipOutputStream(fos)) { - zip.putNextEntry(new ZipEntry("content.xml")); - zip.write(contentXml.getBytes(StandardCharsets.UTF_8)); - zip.closeEntry(); - } - return destination.toFile(); - } - - private void setupOneValidOneFakeImport(Path tempDir) throws Exception { - byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF- - Files.write(tempDir.resolve("real.pdf"), pdfHeader); - Files.writeString(tempDir.resolve("fake.pdf"), "not a pdf"); - buildMinimalImportXlsx(tempDir, "real.pdf", "fake.pdf"); - ReflectionTestUtils.setField(service, "importDir", tempDir.toString()); - when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty()); - when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0)); - } - - private void buildMinimalImportXlsx(Path dir, String... filenames) throws Exception { - Path xlsx = dir.resolve("import.xlsx"); - try (XSSFWorkbook wb = new XSSFWorkbook()) { - org.apache.poi.ss.usermodel.Sheet sheet = wb.createSheet("Sheet1"); - sheet.createRow(0).createCell(0).setCellValue("Index"); - for (int i = 0; i < filenames.length; i++) { - sheet.createRow(i + 1).createCell(0).setCellValue(filenames[i]); - } - try (OutputStream out = Files.newOutputStream(xlsx)) { - wb.write(out); - } - } - } -} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java index b87b928b..8e51fad7 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/user/AdminControllerTest.java @@ -7,7 +7,8 @@ import org.raddatz.familienarchiv.security.PermissionAspect; import org.raddatz.familienarchiv.user.CustomUserDetailsService; import org.raddatz.familienarchiv.document.DocumentService; import org.raddatz.familienarchiv.document.DocumentVersionService; -import org.raddatz.familienarchiv.importing.MassImportService; +import org.raddatz.familienarchiv.importing.CanonicalImportOrchestrator; +import org.raddatz.familienarchiv.importing.ImportStatus; import org.raddatz.familienarchiv.document.ThumbnailBackfillService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.aop.AopAutoConfiguration; @@ -35,7 +36,7 @@ class AdminControllerTest { @Autowired MockMvc mockMvc; - @MockitoBean MassImportService massImportService; + @MockitoBean CanonicalImportOrchestrator importOrchestrator; @MockitoBean DocumentService documentService; @MockitoBean DocumentVersionService documentVersionService; @MockitoBean ThumbnailBackfillService thumbnailBackfillService; @@ -46,9 +47,9 @@ class AdminControllerTest { @Test @WithMockUser(authorities = "ADMIN") void importStatus_returns200_withStatusCode_whenAdmin() throws Exception { - MassImportService.ImportStatus status = new MassImportService.ImportStatus( - MassImportService.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - when(massImportService.getStatus()).thenReturn(status); + ImportStatus status = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + when(importOrchestrator.getStatus()).thenReturn(status); mockMvc.perform(get("/api/admin/import-status")) .andExpect(status().isOk()) @@ -60,9 +61,9 @@ class AdminControllerTest { @Test @WithMockUser(authorities = "ADMIN") void importStatus_messageField_notPresentInApiResponse() throws Exception { - MassImportService.ImportStatus status = new MassImportService.ImportStatus( - MassImportService.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); - when(massImportService.getStatus()).thenReturn(status); + ImportStatus status = new ImportStatus( + ImportStatus.State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, List.of(), null); + when(importOrchestrator.getStatus()).thenReturn(status); mockMvc.perform(get("/api/admin/import-status")) .andExpect(status().isOk())