feat(importing): add DocumentImporter loader with ported security guards

Fourth canonical loader. Maps canonical-documents.xlsx by header name, routes each attribution register-first by source_ref (provisional person when a slug is unmatched), ALWAYS retains the raw sender_name/receiver_names in sender_text/receiver_text, splits pipe-delimited receivers, parses clean date_iso/date_precision/date_end/date_raw with no semantic logic, attaches the tag by canonical tag_path, and keeps the S3 upload + thumbnail plumbing in small resolveFile/uploadToS3/buildDocument methods. Documents upsert by index (originalFilename); UPLOADED when a file resolves on disk, PLACEHOLDER otherwise. Security guards ported intact from MassImportService BEFORE retiring it: isValidImportFilename (forward/back slash, three Unicode slash homoglyphs, .., null byte, absolute path), findFileRecursive canonical-path containment (symlink-escape), and the %PDF magic-byte check + FILE_READ_ERROR path. The file column is treated as hostile input (CWE-22): its basename is validated then resolved only inside importDir, so a traversal value cannot escape. Extracts the verbatim ImportStatus/SkipReason/SkippedFile shape into its own class so the admin UI contract is unchanged. Assumption: the committed canonical-documents.xlsx carries no sender_category/receiver_category columns (the issue's described schema) — the normalizer already resolved Option-A routing into slugs + raw names, so the loader routes by slug presence rather than a category enum. Refs #669 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 10:33:17 +02:00
parent cbf1984430
commit c56ba6219c
5 changed files with 822 additions and 0 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/DocumentImporter.java
@@ -0,0 +1,324 @@
+package org.raddatz.familienarchiv.importing;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.document.DatePrecision;
+import org.raddatz.familienarchiv.document.Document;
+import org.raddatz.familienarchiv.document.DocumentService;
+import org.raddatz.familienarchiv.document.DocumentStatus;
+import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
+import org.raddatz.familienarchiv.exception.DomainException;
+import org.raddatz.familienarchiv.exception.ErrorCode;
+import org.raddatz.familienarchiv.person.Person;
+import org.raddatz.familienarchiv.person.PersonService;
+import org.raddatz.familienarchiv.person.PersonType;
+import org.raddatz.familienarchiv.person.PersonUpsertCommand;
+import org.raddatz.familienarchiv.tag.Tag;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+import org.springframework.transaction.annotation.Transactional;
+import software.amazon.awssdk.core.sync.RequestBody;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.model.PutObjectRequest;
+
+import org.raddatz.familienarchiv.tag.TagService;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.stream.Stream;
+
+/**
+ * Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
+ * semantic transformation: the normalizer already resolved people to slugs and dates to
+ * ISO values. This loader maps columns by header name, routes each attribution
+ * register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
+ * parses clean dates, and keeps the file/S3/thumbnail plumbing.
+ *
+ * <p>The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
+ * care that it came from our Python tool): its basename is validated with
+ * {@link #isValidImportFilename} and then resolved with canonical-path containment in
+ * {@link #findFileRecursive}.
+ */
+@Component
+@RequiredArgsConstructor
+@Slf4j
+public class DocumentImporter {
+
+    static final List<String> REQUIRED_HEADERS = List.of(  // handed to CanonicalSheetReader.readRows in load()
+            "index", "file", "sender_person_id", "sender_name",
+            "receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
+    // Collaborators: final fields, so Lombok's @RequiredArgsConstructor makes them constructor arguments.
+    private final DocumentService documentService;
+    private final PersonService personService;
+    private final TagService tagService;
+    private final S3Client s3Client;
+    private final ThumbnailAsyncRunner thumbnailAsyncRunner;
+    // Bucket that uploadToS3 writes to; defaults to "familienarchiv" when app.s3.bucket is not set.
+    @Value("${app.s3.bucket:familienarchiv}")
+    private String bucketName;
+    // Directory that findFileRecursive searches; defaults to "/import" when app.import.dir is not set.
+    @Value("${app.import.dir:/import}")
+    private String importDir;
+
+    /** Result of one {@link #load} run: the number of rows imported plus one entry per skipped row. */
+    public record LoadResult(int processed, List<ImportStatus.SkippedFile> skippedFiles) {}
+
+    /** Imports every row that carries an index and reports how many succeeded and which ones were skipped. */
+    public LoadResult load(File artifact) {
+        List<ImportStatus.SkippedFile> skippedFiles = new ArrayList<>();
+        int importedCount = 0;
+        for (CanonicalSheetReader.Row row : CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS)) {
+            String index = row.get("index");
+            if (index.isBlank()) continue;  // rows without an index are ignored: neither processed nor skipped
+            ImportStatus.SkipReason reason = importRow(row, index, skippedFiles).orElse(null);
+            if (reason == null) {
+                importedCount++;
+            } else {
+                skippedFiles.add(new ImportStatus.SkippedFile(displayName(row, index), reason));
+            }
+        }
+        log.info("Imported {} documents from {} ({} skipped)", importedCount, artifact.getName(), skippedFiles.size());
+        return new LoadResult(importedCount, skippedFiles);
+    }
+
+    /** Applies the filename and %PDF guards to one row, then persists it; a present value is the skip reason. */
+    private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index,
+                                                        List<ImportStatus.SkippedFile> skipped) {
+        Optional<File> located;
+        try {
+            located = resolveFile(row.get("file"));
+        } catch (InvalidImportFilenameException rejected) {
+            log.warn("Skipping import row {}: filename rejected", index);
+            return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
+        }
+        boolean signatureOk = true;  // stays true when no file was found: there is nothing to verify
+        if (located.isPresent()) {
+            try {
+                signatureOk = isPdfMagicBytes(located.get());
+            } catch (IOException readFailure) {
+                log.error("Magic-byte check failed for row {}", index, readFailure);
+                return Optional.of(ImportStatus.SkipReason.FILE_READ_ERROR);
+            }
+        }
+        return signatureOk ? persist(row, index, located) : Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
+    }
+
+    /**
+     * Upserts the document keyed by {@code index}: uploads the resolved file (if any) to S3, saves the row's
+     * metadata and requests a thumbnail. A document that exists and is no longer a PLACEHOLDER is left alone.
+     * NOTE(review): importRow calls this on {@code this}; with proxy-based Spring AOP that bypasses @Transactional — confirm.
+     */
+    @Transactional
+    protected Optional<ImportStatus.SkipReason> persist(CanonicalSheetReader.Row row, String index, Optional<File> file) {
+        Document existing = documentService.findByOriginalFilename(index).orElse(null);
+        if (existing != null && existing.getStatus() != DocumentStatus.PLACEHOLDER) {
+            return Optional.of(ImportStatus.SkipReason.ALREADY_EXISTS);
+        }
+        if (file.isEmpty()) {
+            documentService.save(buildDocument(row, index, existing, null, null, DocumentStatus.PLACEHOLDER));
+            return Optional.empty();
+        }
+        File source = file.get();
+        String contentType = probeContentType(source);
+        String s3Key = "documents/" + UUID.randomUUID() + "_" + source.getName();
+        try {
+            uploadToS3(source, s3Key, contentType);
+        } catch (Exception uploadFailure) {
+            log.error("S3 upload failed for {}", source.getName(), uploadFailure);
+            return Optional.of(ImportStatus.SkipReason.S3_UPLOAD_FAILED);
+        }
+        Document doc = buildDocument(row, index, existing, s3Key, contentType, DocumentStatus.UPLOADED);
+        Document saved = documentService.save(doc);
+        thumbnailAsyncRunner.dispatchAfterCommit(saved.getId());
+        return Optional.empty();
+    }
+
+    /** Copies one row onto {@code existing}, or onto a new document keyed by {@code index} when there is none. */
+    private Document buildDocument(CanonicalSheetReader.Row row, String index, Document existing,
+                                   String s3Key, String contentType, DocumentStatus status) {
+        Document target = existing;
+        if (target == null) target = Document.builder().originalFilename(index).build();
+        String rawSender = row.get("sender_name");
+        String rawReceivers = row.get("receiver_names");
+        Person sender = resolveSender(row.get("sender_person_id"), rawSender);
+        Set<Person> receivers = resolveReceivers(row.get("receiver_person_ids"));
+        target.setTitle(index);
+        target.setStatus(status);
+        target.setFilePath(s3Key);
+        target.setContentType(contentType);
+        target.setSender(sender);
+        target.setSenderText(blankToNull(rawSender));  // the raw cell is kept even when a person was linked
+        target.getReceivers().addAll(receivers);       // adds to, rather than replaces, receivers already present
+        target.setReceiverText(blankToNull(rawReceivers));
+        target.setDocumentDate(parseIsoDate(row.get("date_iso")));
+        target.setMetaDatePrecision(parsePrecision(row.get("date_precision")));
+        target.setMetaDateEnd(parseIsoDate(row.get("date_end")));
+        target.setMetaDateRaw(blankToNull(row.get("date_raw")));
+        target.setLocation(blankToNull(row.get("location")));
+        target.setSummary(blankToNull(row.get("summary")));
+        attachTag(target, row.get("tags"));
+        boolean hasDateOrPerson = target.getDocumentDate() != null || sender != null || !receivers.isEmpty();
+        target.setMetadataComplete(hasDateOrPerson);
+        return target;
+    }
+
+    // ─── attribution routing — register-first, always retain raw ─────────────────────
+
+    /** A blank slug leaves the sender unlinked ({@code null}): no lookup and no provisional person. */
+    private Person resolveSender(String slug, String rawName) {
+        return slug.isBlank() ? null : resolvePerson(slug, rawName);
+    }
+
+    /** Resolves every slug of the list cell, in order, into an insertion-ordered set of persons. */
+    private Set<Person> resolveReceivers(String slugs) {
+        Set<Person> resolved = new LinkedHashSet<>();
+        // only the slug is available per receiver here, so it is also passed as the raw name
+        for (String slug : CanonicalSheetReader.splitList(slugs)) resolved.add(resolvePerson(slug, slug));
+        return resolved;
+    }
+
+    /** Register-first lookup by source_ref; a slug with no match is upserted as a provisional PERSON. */
+    private Person resolvePerson(String slug, String rawName) {
+        Optional<Person> registered = personService.findBySourceRef(slug);
+        if (registered.isPresent()) return registered.get();
+        String lastName = blankToNull(rawName) == null ? slug : rawName;  // no usable name: fall back to the slug
+        return personService.upsertBySourceRef(PersonUpsertCommand.builder()
+                .sourceRef(slug).lastName(lastName).personType(PersonType.PERSON).provisional(true)
+                .build());
+    }
+
+    /** Attaches the tag registered under {@code tagPath}; a null, blank or unknown path attaches nothing. */
+    private void attachTag(Document doc, String tagPath) {
+        if (tagPath != null && !tagPath.isBlank()) tagService.findBySourceRef(tagPath).ifPresent(tag -> doc.getTags().add(tag));
+    }
+
+    // ─── clean-value parsing (no semantic logic) ─────────────────────────────────────
+
+    /** Parses an ISO-8601 {@code yyyy-MM-dd} cell; null, blank or unparseable input yields {@code null}. */
+    private static LocalDate parseIsoDate(String value) {
+        try {
+            return (value == null || value.isBlank()) ? null : LocalDate.parse(value.trim());
+        } catch (DateTimeParseException malformed) {  // lenient: a bad cell just leaves the date empty
+            return null;
+        }
+    }
+
+    /** Maps the trimmed cell to the {@link DatePrecision} constant of exactly that name; anything else is UNKNOWN. */
+    private static DatePrecision parsePrecision(String value) {
+        try {
+            return (value == null || value.isBlank()) ? DatePrecision.UNKNOWN : DatePrecision.valueOf(value.trim());
+        } catch (IllegalArgumentException notAConstant) {
+            return DatePrecision.UNKNOWN;
+        }
+    }
+
+    // ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
+
+    /** A blank cell means no file; otherwise the cell's validated basename is searched for inside the import dir. */
+    private Optional<File> resolveFile(String fileColumn) {
+        if (fileColumn == null || fileColumn.isBlank()) return Optional.empty();
+        // only the last path segment is used, so any directory part of the cell never reaches the filesystem
+        String basename = basenameOf(fileColumn);
+        if (isValidImportFilename(basename)) return findFileRecursive(basename);
+        throw new InvalidImportFilenameException();
+    }
+
+    /** Returns the trimmed text after the last '/' or '\' (the whole trimmed cell when neither occurs). */
+    private static String basenameOf(String fileColumn) {
+        int lastSeparator = Math.max(fileColumn.lastIndexOf('/'), fileColumn.lastIndexOf('\\'));
+        return fileColumn.substring(lastSeparator + 1).trim();
+    }
+
+    /** Content type as probed by the JDK, or {@code application/octet-stream} when unknown or unreadable. */
+    private String probeContentType(File file) {
+        try {
+            return Optional.ofNullable(Files.probeContentType(file.toPath())).orElse("application/octet-stream");
+        } catch (IOException probeFailed) {
+            return "application/octet-stream";  // probing is best-effort
+        }
+    }
+
+    /** PUTs {@code file} into the configured bucket under {@code s3Key} using the synchronous S3 client. */
+    private void uploadToS3(File file, String s3Key, String contentType) {
+        PutObjectRequest request = PutObjectRequest.builder()
+                .bucket(bucketName).key(s3Key)
+                .contentType(contentType).build();
+        // no error handling here: persist() catches whatever this throws and reports S3_UPLOAD_FAILED
+        s3Client.putObject(request, RequestBody.fromFile(file));
+    }
+
+    // ─── security guards — ported verbatim from MassImportService — do not weaken ────
+
+    private boolean isValidImportFilename(String filename) {  // true only for a bare, relative, single-segment name
+        if (filename == null || filename.isBlank()) return false;
+        if (filename.contains("/")) return false;  // ASCII path separators first, then their Unicode look-alikes
+        if (filename.contains("\\")) return false;
+        if (filename.contains("∕")) return false;  // U+2215 DIVISION SLASH
+        if (filename.contains("／")) return false;  // U+FF0F FULLWIDTH SOLIDUS
+        if (filename.contains("⧵")) return false;  // U+29F5 REVERSE SOLIDUS OPERATOR
+        if (filename.contains("..")) return false;  // parent-directory hop; also rejects names that merely contain ".."
+        if (filename.equals(".")) return false;  // current-directory reference
+        if (filename.contains("\0")) return false;  // embedded NUL byte
+        if (Paths.get(filename).isAbsolute()) return false;  // NOTE(review): Paths.get can throw InvalidPathException — confirm callers tolerate it
+        return true;
+    }
+
+    // Package-private seam for isPdfMagicBytes: a Mockito spy in tests can override it to inject an IOException.
+    InputStream openFileStream(File file) throws IOException {
+        return new FileInputStream(file);
+    }
+
+    /**
+     * True when the file starts with the four ASCII bytes {@code %PDF}; a file shorter than four bytes
+     * never matches. The stream comes from {@link #openFileStream} so tests can inject a read failure.
+     */
+    private boolean isPdfMagicBytes(File file) throws IOException {
+        byte[] pdfSignature = {0x25, 0x50, 0x44, 0x46};  // "%PDF"
+        try (InputStream in = openFileStream(file)) {
+            return java.util.Arrays.equals(in.readNBytes(pdfSignature.length), pdfSignature);
+        }
+    }
+
+    /** Returns the first regular file named {@code filename} below the import dir, refusing one that resolves outside it. */
+    private Optional<File> findFileRecursive(String filename) {
+        File baseDir = new File(importDir);
+        try (Stream<Path> walk = Files.walk(baseDir.toPath())) {
+            Optional<Path> match = walk.filter(p -> !Files.isDirectory(p))
+                    .filter(p -> p.getFileName().toString().equals(filename)).findFirst();
+            if (match.isEmpty()) return Optional.empty();
+            File candidate = match.get().toFile();
+            if (!candidate.getCanonicalPath().startsWith(baseDir.getCanonicalPath() + File.separator)) {
+                throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
+            }
+            return Optional.of(candidate);
+        } catch (IOException | java.io.UncheckedIOException e) {  // the walk stream reports traversal errors unchecked
+            log.warn("Could not search import directory {}", importDir, e);
+            return Optional.empty();
+        }
+    }
+
+    /** Label for a skipped row: the file cell's basename, or the index when that cell is blank. */
+    private static String displayName(CanonicalSheetReader.Row row, String index) {
+        return Optional.of(row.get("file")).filter(cell -> !cell.isBlank()).map(DocumentImporter::basenameOf).orElse(index);
+    }
+
+    private static String blankToNull(String s) {  // null and blank collapse to null; anything else passes through untrimmed
+        return (s != null && !s.isBlank()) ? s : null;
+    }
+
+    private static final class InvalidImportFilenameException extends RuntimeException {  // thrown by resolveFile, caught in importRow
+    }
+}
--- a/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/ImportStatus.java
@@ -0,0 +1,50 @@
+package org.raddatz.familienarchiv.importing;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import io.swagger.v3.oas.annotations.media.Schema;
+
+import java.time.LocalDateTime;
+import java.util.List;
+
+/**
+ * Async import state surfaced to {@code admin/system/ImportStatusCard.svelte} via the generated
+ * types. The JSON shape ({@code state, statusCode, processed, skippedFiles, skipped}) is kept verbatim
+ * from the retired MassImportService so the admin UI keeps working; {@code message} is not serialized.
+ */
+public record ImportStatus(
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode,
+        @JsonIgnore String message,  // kept out of the JSON payload
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed,
+        @Schema(requiredMode = Schema.RequiredMode.REQUIRED) List<SkippedFile> skippedFiles,
+        LocalDateTime startedAt  // the only serialized component not marked required in the schema
+) {
+
+    public enum State { IDLE, RUNNING, DONE, FAILED }
+
+    public enum SkipReason {  // reasons as produced by DocumentImporter
+        INVALID_FILENAME_PATH_TRAVERSAL,  // the file cell's basename failed the filename guard
+        INVALID_PDF_SIGNATURE,  // the file does not start with %PDF
+        FILE_READ_ERROR,  // reading the leading bytes threw an IOException
+        ALREADY_EXISTS,  // a document with this index exists and is not a PLACEHOLDER
+        S3_UPLOAD_FAILED  // the S3 upload threw
+    }
+
+    public record SkippedFile(
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename,
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason
+    ) {}
+
+    // Computed convenience property: serialized as "skipped", always equal to skippedFiles.size().
+    // It carries no @Schema because SpringDoc does not pick that up on a record accessor method.
+    @JsonProperty("skipped")
+    public int skipped() {
+        return skippedFiles.size();
+    }
+
+    /** Compact constructor: stores an unmodifiable copy, so later changes to the caller's list are not seen. */
+    public ImportStatus {
+        skippedFiles = List.copyOf(skippedFiles);
+    }
+}
--- a/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/person/PersonService.java
@@ -80,6 +80,11 @@ public class PersonService {
        return personRepository.findByFirstNameIgnoreCaseAndLastNameIgnoreCase(firstName, lastName);
    }

+    /** Looks up a person by source_ref (the normalizer's person_id); the canonical importer uses it for register-first matching. */
+    public Optional<Person> findBySourceRef(String sourceRef) {
+        return personRepository.findBySourceRef(sourceRef);
+    }
+
    @Nullable
    @Transactional
    public Person findOrCreateByAlias(String rawName) {
--- a/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/tag/TagService.java
@@ -7,6 +7,7 @@ import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.UUID;
 import java.util.stream.Collectors;
@@ -49,6 +50,11 @@ public class TagService {
                .orElseThrow(() -> DomainException.notFound(ErrorCode.TAG_NOT_FOUND, "Tag not found: " + id));
    }

+    /** Looks up a tag by source_ref (the canonical tag_path); the canonical importer uses it to attach a document's tag. */
+    public Optional<Tag> findBySourceRef(String sourceRef) {
+        return tagRepository.findBySourceRef(sourceRef);
+    }
+
    public Tag findOrCreate(String name) {
        String cleanName = name.trim();
        return tagRepository.findByNameIgnoreCase(cleanName)