feat(importing): add DocumentImporter loader with ported security guards
Fourth canonical loader. Maps canonical-documents.xlsx by header name, routes each attribution register-first by source_ref (provisional person when a slug is unmatched), ALWAYS retains the raw sender_name/receiver_names in sender_text/receiver_text, splits pipe-delimited receivers, parses clean date_iso/date_precision/date_end/date_raw with no semantic logic, attaches the tag by canonical tag_path, and keeps the S3 upload + thumbnail plumbing in small resolveFile/uploadToS3/buildDocument methods. Documents upsert by index (originalFilename); UPLOADED when a file resolves on disk, PLACEHOLDER otherwise. Security guards ported intact from MassImportService BEFORE retiring it: isValidImportFilename (forward/back slash, three Unicode slash homoglyphs, .., null byte, absolute path), findFileRecursive canonical-path containment (symlink-escape), and the %PDF magic-byte check + FILE_READ_ERROR path. The file column is treated as hostile input (CWE-22): its basename is validated then resolved only inside importDir, so a traversal value cannot escape. Extracts the verbatim ImportStatus/SkipReason/SkippedFile shape into its own class so the admin UI contract is unchanged. Assumption: the committed canonical-documents.xlsx carries no sender_category/receiver_category columns (the issue's described schema) — the normalizer already resolved Option-A routing into slugs + raw names, so the loader routes by slug presence rather than a category enum. Refs #669 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,324 @@
|
|||||||
|
package org.raddatz.familienarchiv.importing;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.raddatz.familienarchiv.document.DatePrecision;
|
||||||
|
import org.raddatz.familienarchiv.document.Document;
|
||||||
|
import org.raddatz.familienarchiv.document.DocumentService;
|
||||||
|
import org.raddatz.familienarchiv.document.DocumentStatus;
|
||||||
|
import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
|
||||||
|
import org.raddatz.familienarchiv.exception.DomainException;
|
||||||
|
import org.raddatz.familienarchiv.exception.ErrorCode;
|
||||||
|
import org.raddatz.familienarchiv.person.Person;
|
||||||
|
import org.raddatz.familienarchiv.person.PersonService;
|
||||||
|
import org.raddatz.familienarchiv.person.PersonType;
|
||||||
|
import org.raddatz.familienarchiv.person.PersonUpsertCommand;
|
||||||
|
import org.raddatz.familienarchiv.tag.Tag;
|
||||||
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
import software.amazon.awssdk.core.sync.RequestBody;
|
||||||
|
import software.amazon.awssdk.services.s3.S3Client;
|
||||||
|
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
|
||||||
|
|
||||||
|
import org.raddatz.familienarchiv.tag.TagService;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.time.format.DateTimeParseException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads {@code canonical-documents.xlsx} into the document domain. Java performs no
|
||||||
|
* semantic transformation: the normalizer already resolved people to slugs and dates to
|
||||||
|
* ISO values. This loader maps columns by header name, routes each attribution
|
||||||
|
* register-first (always retaining the raw cell in {@code sender_text}/{@code receiver_text}),
|
||||||
|
* parses clean dates, and keeps the file/S3/thumbnail plumbing.
|
||||||
|
*
|
||||||
|
* <p>The {@code file} value is hostile input regardless of upstream trust (CWE-22 does not
|
||||||
|
* care that it came from our Python tool): its basename is validated with
|
||||||
|
* {@link #isValidImportFilename} and then resolved with canonical-path containment in
|
||||||
|
* {@link #findFileRecursive}.
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@Slf4j
|
||||||
|
public class DocumentImporter {
|
||||||
|
|
||||||
|
static final List<String> REQUIRED_HEADERS = List.of(
|
||||||
|
"index", "file", "sender_person_id", "sender_name",
|
||||||
|
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision");
|
||||||
|
|
||||||
|
private final DocumentService documentService;
|
||||||
|
private final PersonService personService;
|
||||||
|
private final TagService tagService;
|
||||||
|
private final S3Client s3Client;
|
||||||
|
private final ThumbnailAsyncRunner thumbnailAsyncRunner;
|
||||||
|
|
||||||
|
@Value("${app.s3.bucket:familienarchiv}")
|
||||||
|
private String bucketName;
|
||||||
|
|
||||||
|
@Value("${app.import.dir:/import}")
|
||||||
|
private String importDir;
|
||||||
|
|
||||||
|
/** Outcome of loading the document sheet: processed count + per-file skips. */
|
||||||
|
public record LoadResult(int processed, List<ImportStatus.SkippedFile> skippedFiles) {}
|
||||||
|
|
||||||
|
public LoadResult load(File artifact) {
|
||||||
|
List<CanonicalSheetReader.Row> rows = CanonicalSheetReader.readRows(artifact, REQUIRED_HEADERS);
|
||||||
|
int processed = 0;
|
||||||
|
List<ImportStatus.SkippedFile> skipped = new ArrayList<>();
|
||||||
|
for (CanonicalSheetReader.Row row : rows) {
|
||||||
|
String index = row.get("index");
|
||||||
|
if (index.isBlank()) continue;
|
||||||
|
Optional<ImportStatus.SkipReason> skipReason = importRow(row, index, skipped);
|
||||||
|
if (skipReason.isPresent()) {
|
||||||
|
skipped.add(new ImportStatus.SkippedFile(displayName(row, index), skipReason.get()));
|
||||||
|
} else {
|
||||||
|
processed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.info("Imported {} documents from {} ({} skipped)", processed, artifact.getName(), skipped.size());
|
||||||
|
return new LoadResult(processed, skipped);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Optional<ImportStatus.SkipReason> importRow(CanonicalSheetReader.Row row, String index,
|
||||||
|
List<ImportStatus.SkippedFile> skipped) {
|
||||||
|
Optional<File> resolved;
|
||||||
|
try {
|
||||||
|
resolved = resolveFile(row.get("file"));
|
||||||
|
} catch (InvalidImportFilenameException e) {
|
||||||
|
log.warn("Skipping import row {}: filename rejected", index);
|
||||||
|
return Optional.of(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
||||||
|
}
|
||||||
|
if (resolved.isPresent()) {
|
||||||
|
try {
|
||||||
|
if (!isPdfMagicBytes(resolved.get())) {
|
||||||
|
return Optional.of(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Magic-byte check failed for row {}", index, e);
|
||||||
|
return Optional.of(ImportStatus.SkipReason.FILE_READ_ERROR);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return persist(row, index, resolved);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
protected Optional<ImportStatus.SkipReason> persist(CanonicalSheetReader.Row row, String index, Optional<File> file) {
|
||||||
|
Document existing = documentService.findByOriginalFilename(index).orElse(null);
|
||||||
|
if (existing != null && existing.getStatus() != DocumentStatus.PLACEHOLDER) {
|
||||||
|
return Optional.of(ImportStatus.SkipReason.ALREADY_EXISTS);
|
||||||
|
}
|
||||||
|
|
||||||
|
String s3Key = null;
|
||||||
|
String contentType = null;
|
||||||
|
DocumentStatus status = DocumentStatus.PLACEHOLDER;
|
||||||
|
if (file.isPresent()) {
|
||||||
|
contentType = probeContentType(file.get());
|
||||||
|
s3Key = "documents/" + UUID.randomUUID() + "_" + file.get().getName();
|
||||||
|
try {
|
||||||
|
uploadToS3(file.get(), s3Key, contentType);
|
||||||
|
status = DocumentStatus.UPLOADED;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("S3 upload failed for {}", file.get().getName(), e);
|
||||||
|
return Optional.of(ImportStatus.SkipReason.S3_UPLOAD_FAILED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Document doc = buildDocument(row, index, existing, s3Key, contentType, status);
|
||||||
|
Document saved = documentService.save(doc);
|
||||||
|
if (file.isPresent()) {
|
||||||
|
thumbnailAsyncRunner.dispatchAfterCommit(saved.getId());
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document buildDocument(CanonicalSheetReader.Row row, String index, Document existing,
|
||||||
|
String s3Key, String contentType, DocumentStatus status) {
|
||||||
|
Document doc = existing != null ? existing
|
||||||
|
: Document.builder().originalFilename(index).build();
|
||||||
|
|
||||||
|
String senderName = row.get("sender_name");
|
||||||
|
String receiverNames = row.get("receiver_names");
|
||||||
|
Person sender = resolveSender(row.get("sender_person_id"), senderName);
|
||||||
|
Set<Person> receivers = resolveReceivers(row.get("receiver_person_ids"));
|
||||||
|
|
||||||
|
doc.setTitle(index);
|
||||||
|
doc.setStatus(status);
|
||||||
|
doc.setFilePath(s3Key);
|
||||||
|
doc.setContentType(contentType);
|
||||||
|
doc.setSender(sender);
|
||||||
|
doc.setSenderText(blankToNull(senderName));
|
||||||
|
doc.getReceivers().addAll(receivers);
|
||||||
|
doc.setReceiverText(blankToNull(receiverNames));
|
||||||
|
doc.setDocumentDate(parseIsoDate(row.get("date_iso")));
|
||||||
|
doc.setMetaDatePrecision(parsePrecision(row.get("date_precision")));
|
||||||
|
doc.setMetaDateEnd(parseIsoDate(row.get("date_end")));
|
||||||
|
doc.setMetaDateRaw(blankToNull(row.get("date_raw")));
|
||||||
|
doc.setLocation(blankToNull(row.get("location")));
|
||||||
|
doc.setSummary(blankToNull(row.get("summary")));
|
||||||
|
attachTag(doc, row.get("tags"));
|
||||||
|
doc.setMetadataComplete(doc.getDocumentDate() != null || sender != null || !receivers.isEmpty());
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── attribution routing — register-first, always retain raw ─────────────────────
|
||||||
|
|
||||||
|
private Person resolveSender(String slug, String rawName) {
|
||||||
|
if (slug.isBlank()) return null;
|
||||||
|
return resolvePerson(slug, rawName);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set<Person> resolveReceivers(String slugs) {
|
||||||
|
Set<Person> receivers = new LinkedHashSet<>();
|
||||||
|
for (String slug : CanonicalSheetReader.splitList(slugs)) {
|
||||||
|
receivers.add(resolvePerson(slug, slug));
|
||||||
|
}
|
||||||
|
return receivers;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Person resolvePerson(String slug, String rawName) {
|
||||||
|
return personService.findBySourceRef(slug)
|
||||||
|
.orElseGet(() -> personService.upsertBySourceRef(PersonUpsertCommand.builder()
|
||||||
|
.sourceRef(slug)
|
||||||
|
.lastName(blankToNull(rawName) == null ? slug : rawName)
|
||||||
|
.personType(PersonType.PERSON)
|
||||||
|
.provisional(true)
|
||||||
|
.build()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void attachTag(Document doc, String tagPath) {
|
||||||
|
if (tagPath.isBlank()) return;
|
||||||
|
tagService.findBySourceRef(tagPath).ifPresent(tag -> doc.getTags().add(tag));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── clean-value parsing (no semantic logic) ─────────────────────────────────────
|
||||||
|
|
||||||
|
private static LocalDate parseIsoDate(String value) {
|
||||||
|
if (value == null || value.isBlank()) return null;
|
||||||
|
try {
|
||||||
|
return LocalDate.parse(value.trim());
|
||||||
|
} catch (DateTimeParseException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DatePrecision parsePrecision(String value) {
|
||||||
|
if (value == null || value.isBlank()) return DatePrecision.UNKNOWN;
|
||||||
|
try {
|
||||||
|
return DatePrecision.valueOf(value.trim());
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
return DatePrecision.UNKNOWN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── file handling + S3 (small ≤20-line methods) ─────────────────────────────────
|
||||||
|
|
||||||
|
private Optional<File> resolveFile(String fileColumn) {
|
||||||
|
if (fileColumn == null || fileColumn.isBlank()) return Optional.empty();
|
||||||
|
String basename = basenameOf(fileColumn);
|
||||||
|
if (!isValidImportFilename(basename)) {
|
||||||
|
throw new InvalidImportFilenameException();
|
||||||
|
}
|
||||||
|
return findFileRecursive(basename);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String basenameOf(String fileColumn) {
|
||||||
|
String normalized = fileColumn.replace('\\', '/');
|
||||||
|
int lastSlash = normalized.lastIndexOf('/');
|
||||||
|
return lastSlash < 0 ? normalized.trim() : normalized.substring(lastSlash + 1).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String probeContentType(File file) {
|
||||||
|
try {
|
||||||
|
String probed = Files.probeContentType(file.toPath());
|
||||||
|
return probed != null ? probed : "application/octet-stream";
|
||||||
|
} catch (IOException e) {
|
||||||
|
return "application/octet-stream";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void uploadToS3(File file, String s3Key, String contentType) {
|
||||||
|
s3Client.putObject(PutObjectRequest.builder()
|
||||||
|
.bucket(bucketName)
|
||||||
|
.key(s3Key)
|
||||||
|
.contentType(contentType)
|
||||||
|
.build(),
|
||||||
|
RequestBody.fromFile(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── security guards — ported verbatim from MassImportService — do not weaken ────
|
||||||
|
|
||||||
|
private boolean isValidImportFilename(String filename) {
|
||||||
|
if (filename == null || filename.isBlank()) return false;
|
||||||
|
if (filename.contains("/")) return false;
|
||||||
|
if (filename.contains("\\")) return false;
|
||||||
|
if (filename.contains("∕")) return false; // U+2215 DIVISION SLASH
|
||||||
|
if (filename.contains("/")) return false; // U+FF0F FULLWIDTH SOLIDUS
|
||||||
|
if (filename.contains("⧵")) return false; // U+29F5 REVERSE SOLIDUS OPERATOR
|
||||||
|
if (filename.contains("..")) return false;
|
||||||
|
if (filename.equals(".")) return false;
|
||||||
|
if (filename.contains("\0")) return false;
|
||||||
|
if (Paths.get(filename).isAbsolute()) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// package-private: a Mockito spy in tests can override to inject IOException
|
||||||
|
InputStream openFileStream(File file) throws IOException {
|
||||||
|
return new FileInputStream(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isPdfMagicBytes(File file) throws IOException {
|
||||||
|
try (InputStream is = openFileStream(file)) {
|
||||||
|
byte[] header = is.readNBytes(4);
|
||||||
|
return header.length == 4
|
||||||
|
&& header[0] == 0x25 // %
|
||||||
|
&& header[1] == 0x50 // P
|
||||||
|
&& header[2] == 0x44 // D
|
||||||
|
&& header[3] == 0x46; // F
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Optional<File> findFileRecursive(String filename) {
|
||||||
|
File baseDir = new File(importDir);
|
||||||
|
try (Stream<Path> walk = Files.walk(baseDir.toPath())) {
|
||||||
|
Optional<Path> match = walk.filter(p -> !Files.isDirectory(p))
|
||||||
|
.filter(p -> p.getFileName().toString().equals(filename))
|
||||||
|
.findFirst();
|
||||||
|
if (match.isEmpty()) return Optional.empty();
|
||||||
|
File candidate = match.get().toFile();
|
||||||
|
String baseDirCanonical = baseDir.getCanonicalPath();
|
||||||
|
if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
|
||||||
|
throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
|
||||||
|
}
|
||||||
|
return Optional.of(candidate);
|
||||||
|
} catch (IOException e) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String displayName(CanonicalSheetReader.Row row, String index) {
|
||||||
|
String file = row.get("file");
|
||||||
|
return file.isBlank() ? index : basenameOf(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String blankToNull(String s) {
|
||||||
|
return (s == null || s.isBlank()) ? null : s;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final class InvalidImportFilenameException extends RuntimeException {
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
package org.raddatz.familienarchiv.importing;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Async import state surfaced to {@code admin/system/ImportStatusCard.svelte} via the
|
||||||
|
* generated types. The shape ({@code state, statusCode, processed, skippedFiles, skipped})
|
||||||
|
* is kept verbatim from the retired MassImportService so the admin UI keeps working.
|
||||||
|
*/
|
||||||
|
public record ImportStatus(
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED) State state,
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED) String statusCode,
|
||||||
|
@JsonIgnore String message,
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED) int processed,
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED) List<SkippedFile> skippedFiles,
|
||||||
|
LocalDateTime startedAt
|
||||||
|
) {
|
||||||
|
|
||||||
|
public enum State { IDLE, RUNNING, DONE, FAILED }
|
||||||
|
|
||||||
|
public enum SkipReason {
|
||||||
|
INVALID_FILENAME_PATH_TRAVERSAL,
|
||||||
|
INVALID_PDF_SIGNATURE,
|
||||||
|
FILE_READ_ERROR,
|
||||||
|
ALREADY_EXISTS,
|
||||||
|
S3_UPLOAD_FAILED
|
||||||
|
}
|
||||||
|
|
||||||
|
public record SkippedFile(
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename,
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason
|
||||||
|
) {}
|
||||||
|
|
||||||
|
// Note: @Schema on a record accessor method is not picked up by SpringDoc; the
|
||||||
|
// "skipped" count is a computed convenience field derived from skippedFiles.size().
|
||||||
|
@JsonProperty("skipped")
|
||||||
|
public int skipped() {
|
||||||
|
return skippedFiles.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Defensive-copy constructor — callers cannot mutate the stored list after construction. */
|
||||||
|
public ImportStatus {
|
||||||
|
skippedFiles = List.copyOf(skippedFiles);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -80,6 +80,11 @@ public class PersonService {
|
|||||||
return personRepository.findByFirstNameIgnoreCaseAndLastNameIgnoreCase(firstName, lastName);
|
return personRepository.findByFirstNameIgnoreCaseAndLastNameIgnoreCase(firstName, lastName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Lookup by the normalizer person_id — used by the canonical importer for register-first matching. */
|
||||||
|
public Optional<Person> findBySourceRef(String sourceRef) {
|
||||||
|
return personRepository.findBySourceRef(sourceRef);
|
||||||
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
@Transactional
|
@Transactional
|
||||||
public Person findOrCreateByAlias(String rawName) {
|
public Person findOrCreateByAlias(String rawName) {
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import java.util.HashSet;
|
|||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@@ -49,6 +50,11 @@ public class TagService {
|
|||||||
.orElseThrow(() -> DomainException.notFound(ErrorCode.TAG_NOT_FOUND, "Tag not found: " + id));
|
.orElseThrow(() -> DomainException.notFound(ErrorCode.TAG_NOT_FOUND, "Tag not found: " + id));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Lookup by the canonical tag_path — used by the canonical importer to attach a document's tag. */
|
||||||
|
public Optional<Tag> findBySourceRef(String sourceRef) {
|
||||||
|
return tagRepository.findBySourceRef(sourceRef);
|
||||||
|
}
|
||||||
|
|
||||||
public Tag findOrCreate(String name) {
|
public Tag findOrCreate(String name) {
|
||||||
String cleanName = name.trim();
|
String cleanName = name.trim();
|
||||||
return tagRepository.findByNameIgnoreCase(cleanName)
|
return tagRepository.findByNameIgnoreCase(cleanName)
|
||||||
|
|||||||
@@ -0,0 +1,437 @@
|
|||||||
|
package org.raddatz.familienarchiv.importing;
|
||||||
|
|
||||||
|
import org.apache.poi.ss.usermodel.Row;
|
||||||
|
import org.apache.poi.ss.usermodel.Sheet;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.junit.jupiter.api.io.TempDir;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.raddatz.familienarchiv.document.Document;
|
||||||
|
import org.raddatz.familienarchiv.document.DocumentService;
|
||||||
|
import org.raddatz.familienarchiv.document.DocumentStatus;
|
||||||
|
import org.raddatz.familienarchiv.document.ThumbnailAsyncRunner;
|
||||||
|
import org.raddatz.familienarchiv.person.Person;
|
||||||
|
import org.raddatz.familienarchiv.person.PersonService;
|
||||||
|
import org.raddatz.familienarchiv.person.PersonUpsertCommand;
|
||||||
|
import org.raddatz.familienarchiv.tag.Tag;
|
||||||
|
import org.raddatz.familienarchiv.tag.TagService;
|
||||||
|
import org.springframework.test.util.ReflectionTestUtils;
|
||||||
|
import software.amazon.awssdk.core.sync.RequestBody;
|
||||||
|
import software.amazon.awssdk.services.s3.S3Client;
|
||||||
|
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.lenient;
|
||||||
|
import static org.mockito.Mockito.never;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
class DocumentImporterTest {
|
||||||
|
|
||||||
|
@Mock DocumentService documentService;
|
||||||
|
@Mock PersonService personService;
|
||||||
|
@Mock TagService tagService;
|
||||||
|
@Mock S3Client s3Client;
|
||||||
|
@Mock ThumbnailAsyncRunner thumbnailAsyncRunner;
|
||||||
|
|
||||||
|
DocumentImporter importer;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
importer = new DocumentImporter(documentService, personService, tagService, s3Client, thumbnailAsyncRunner);
|
||||||
|
ReflectionTestUtils.setField(importer, "bucketName", "test-bucket");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── security regression — ported from MassImportServiceTest — do not remove ─────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenNull() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", (String) null)).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenBlank() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", " ")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenForwardSlash() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "etc/passwd")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenBackslash() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..\\etc\\passwd")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenDotDot() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "doc..evil.pdf")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenIsDotDot() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "..")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenAbsolutePath() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "/etc/passwd")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenNullByte() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "file\0.pdf")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenUnicodeDivisionSlash() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo∕bar.pdf")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenFullwidthSlash() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo/bar.pdf")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsFalse_whenReverseSolidusOperator() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "foo⧵bar.pdf")).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsTrue_whenPlainBasename() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "document.pdf")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsTrue_whenLeadingDot() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", ".hidden.pdf")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void isValidImportFilename_returnsTrue_whenHasSpaces() {
|
||||||
|
assertThat((Boolean) ReflectionTestUtils.invokeMethod(importer, "isValidImportFilename", "Brief an Oma.pdf")).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir(
|
||||||
|
@TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
|
||||||
|
Path outsideFile = outsideDir.resolve("secret.pdf");
|
||||||
|
Files.writeString(outsideFile, "sensitive");
|
||||||
|
Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile);
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", importDirPath.toString());
|
||||||
|
|
||||||
|
org.assertj.core.api.Assertions.assertThatThrownBy(
|
||||||
|
() -> ReflectionTestUtils.invokeMethod(importer, "findFileRecursive", "secret.pdf"))
|
||||||
|
.isInstanceOf(org.raddatz.familienarchiv.exception.DomainException.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── path traversal in the file column cannot escape importDir ───────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_rejectsFileColumn_whenBasenameIsTraversalToken(@TempDir Path tempDir) throws Exception {
|
||||||
|
// A file column whose basename is itself a traversal token must be rejected
|
||||||
|
// outright, never used for disk I/O.
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "evil/..", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
assertThat(result.skippedFiles())
|
||||||
|
.extracting(ImportStatus.SkippedFile::reason)
|
||||||
|
.containsExactly(ImportStatus.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
|
||||||
|
verify(documentService, never()).save(any());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_traversalFileColumn_cannotEscapeImportDir_yieldsPlaceholder(@TempDir Path tempDir) throws Exception {
|
||||||
|
// ../../etc/cron.d/x reduces to basename "x"; the disk lookup is confined to
|
||||||
|
// importDir, so no file is found, nothing is uploaded, and the row becomes a
|
||||||
|
// metadata-only PLACEHOLDER — the file outside importDir is never read.
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "../../etc/cron.d/x", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── PDF magic-byte guard — ported — do not remove ──────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_skipsFile_whenNotPdfMagicBytes(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Files.writeString(tempDir.resolve("W-0001.pdf"), "not a pdf");
|
||||||
|
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
assertThat(result.skippedFiles())
|
||||||
|
.extracting(ImportStatus.SkippedFile::reason)
|
||||||
|
.containsExactly(ImportStatus.SkipReason.INVALID_PDF_SIGNATURE);
|
||||||
|
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_skipsFile_whenMagicByteCheckThrowsIoException(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Files.writeString(tempDir.resolve("W-0001.pdf"), "content");
|
||||||
|
lenient().when(documentService.findByOriginalFilename(any())).thenReturn(Optional.empty());
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
DocumentImporter spyImporter = org.mockito.Mockito.spy(importer);
|
||||||
|
org.mockito.Mockito.doThrow(new java.io.IOException("read error"))
|
||||||
|
.when(spyImporter).openFileStream(any(File.class));
|
||||||
|
|
||||||
|
DocumentImporter.LoadResult result = spyImporter.load(xlsx.toFile());
|
||||||
|
|
||||||
|
assertThat(result.skippedFiles())
|
||||||
|
.extracting(ImportStatus.SkippedFile::reason)
|
||||||
|
.containsExactly(ImportStatus.SkipReason.FILE_READ_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_skipsAlreadyExists_whenDocumentUploadedNotPlaceholder(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Document existing = Document.builder().id(UUID.randomUUID())
|
||||||
|
.originalFilename("W-0001").status(DocumentStatus.UPLOADED).build();
|
||||||
|
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.of(existing));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
DocumentImporter.LoadResult result = importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
assertThat(result.skippedFiles())
|
||||||
|
.extracting(ImportStatus.SkippedFile::reason)
|
||||||
|
.containsExactly(ImportStatus.SkipReason.ALREADY_EXISTS);
|
||||||
|
verify(documentService, never()).save(any());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── file column drives status: present → UPLOADED, empty → PLACEHOLDER ───────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_uploadsToS3_andSetsStatusUploaded_whenFilePresent(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
byte[] pdf = {0x25, 0x50, 0x44, 0x46, 0x2D};
|
||||||
|
Files.write(tempDir.resolve("W-0001.pdf"), pdf);
|
||||||
|
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "..\\__scan\\W-0001.pdf", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.UPLOADED));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_setsStatusPlaceholder_whenFileColumnEmpty(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
when(documentService.findByOriginalFilename("W-0099")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0099", "", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getStatus() == DocumentStatus.PLACEHOLDER));
|
||||||
|
verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── attribution routing — register-first + always retain raw ────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_linksRegisterSender_andRetainsRawSenderText(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Person walter = Person.builder().id(UUID.randomUUID()).sourceRef("de-gruyter-walter")
|
||||||
|
.firstName("Walter").lastName("de Gruyter").build();
|
||||||
|
when(documentService.findByOriginalFilename("W-0001")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
when(personService.findBySourceRef("de-gruyter-walter")).thenReturn(Optional.of(walter));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0001", "", "de-gruyter-walter", "Walter de Gruyter",
|
||||||
|
"", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
|
||||||
|
d.getSender() == walter && "Walter de Gruyter".equals(d.getSenderText())));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_createsProvisionalSender_whenSlugUnmatchedInRegister(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Person provisional = Person.builder().id(UUID.randomUUID()).sourceRef("schwester-hanni")
|
||||||
|
.lastName("Schwester Hanni").provisional(true).build();
|
||||||
|
when(documentService.findByOriginalFilename("W-0002")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
when(personService.findBySourceRef("schwester-hanni")).thenReturn(Optional.empty());
|
||||||
|
when(personService.upsertBySourceRef(any())).thenReturn(provisional);
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0002", "", "schwester-hanni", "Schwester Hanni",
|
||||||
|
"", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
org.mockito.ArgumentCaptor<PersonUpsertCommand> captor =
|
||||||
|
org.mockito.ArgumentCaptor.forClass(PersonUpsertCommand.class);
|
||||||
|
verify(personService).upsertBySourceRef(captor.capture());
|
||||||
|
assertThat(captor.getValue().provisional()).isTrue();
|
||||||
|
assertThat(captor.getValue().lastName()).isEqualTo("Schwester Hanni");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_createsNoSenderPerson_whenSlugEmptyButRawPresent(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
when(documentService.findByOriginalFilename("W-0003")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0003", "", "", "?",
|
||||||
|
"", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(personService, never()).findBySourceRef(any());
|
||||||
|
verify(personService, never()).upsertBySourceRef(any());
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
|
||||||
|
d.getSender() == null && "?".equals(d.getSenderText())));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_splitsMultipleReceivers_andRetainsRawReceiverText(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Person herbert = Person.builder().id(UUID.randomUUID()).sourceRef("cram-herbert").lastName("Cram").build();
|
||||||
|
Person clara = Person.builder().id(UUID.randomUUID()).sourceRef("clara").lastName("Clara").build();
|
||||||
|
when(documentService.findByOriginalFilename("W-0004")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
when(personService.findBySourceRef("cram-herbert")).thenReturn(Optional.of(herbert));
|
||||||
|
when(personService.findBySourceRef("clara")).thenReturn(Optional.of(clara));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0004", "", "", "",
|
||||||
|
"cram-herbert|clara", "Herbert Cram|Clara", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
|
||||||
|
d.getReceivers().size() == 2
|
||||||
|
&& d.getReceivers().contains(herbert)
|
||||||
|
&& d.getReceivers().contains(clara)
|
||||||
|
&& "Herbert Cram|Clara".equals(d.getReceiverText())));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── clean date values parse without semantic logic ──────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_parsesCleanDateAndPrecision(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
when(documentService.findByOriginalFilename("W-0005")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0005", "", "", "",
|
||||||
|
"", "", "1916-06-01", "1.6.1916", "MONTH", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d ->
|
||||||
|
LocalDate.of(1916, 6, 1).equals(d.getDocumentDate())
|
||||||
|
&& d.getMetaDatePrecision() == org.raddatz.familienarchiv.document.DatePrecision.MONTH
|
||||||
|
&& "1.6.1916".equals(d.getMetaDateRaw())));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_attachesTagBySourceRef(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Tag tag = Tag.builder().id(UUID.randomUUID()).name("Brautbriefe").sourceRef("Themen/Brautbriefe").build();
|
||||||
|
when(documentService.findByOriginalFilename("W-0006")).thenReturn(Optional.empty());
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
when(tagService.findBySourceRef("Themen/Brautbriefe")).thenReturn(Optional.of(tag));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRowWithTag("W-0006", "Themen/Brautbriefe"));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getTags().contains(tag)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── idempotency — update existing document in place by index ─────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void load_updatesExistingDocumentInPlace_whenIndexExists(@TempDir Path tempDir) throws Exception {
|
||||||
|
ReflectionTestUtils.setField(importer, "importDir", tempDir.toString());
|
||||||
|
Document existing = Document.builder().id(UUID.randomUUID())
|
||||||
|
.originalFilename("W-0007").status(DocumentStatus.PLACEHOLDER).build();
|
||||||
|
when(documentService.findByOriginalFilename("W-0007")).thenReturn(Optional.of(existing));
|
||||||
|
when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
Path xlsx = writeDocs(tempDir, docRow("W-0007", "", "", "", "", "", "", "", "", ""));
|
||||||
|
|
||||||
|
importer.load(xlsx.toFile());
|
||||||
|
|
||||||
|
verify(documentService).save(org.mockito.ArgumentMatchers.argThat(d -> d.getId().equals(existing.getId())));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── helpers ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
private Map<String, String> docRow(String index, String file, String senderId, String senderName,
|
||||||
|
String receiverIds, String receiverNames, String dateIso,
|
||||||
|
String dateRaw, String datePrecision, String dateEnd) {
|
||||||
|
Map<String, String> r = new LinkedHashMap<>();
|
||||||
|
r.put("index", index);
|
||||||
|
r.put("file", file);
|
||||||
|
r.put("sender_person_id", senderId);
|
||||||
|
r.put("sender_name", senderName);
|
||||||
|
r.put("receiver_person_ids", receiverIds);
|
||||||
|
r.put("receiver_names", receiverNames);
|
||||||
|
r.put("date_iso", dateIso);
|
||||||
|
r.put("date_raw", dateRaw);
|
||||||
|
r.put("date_precision", datePrecision);
|
||||||
|
r.put("date_end", dateEnd);
|
||||||
|
r.put("location", "");
|
||||||
|
r.put("tags", "");
|
||||||
|
r.put("summary", "");
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, String> docRowWithTag(String index, String tagPath) {
|
||||||
|
Map<String, String> r = docRow(index, "", "", "", "", "", "", "", "", "");
|
||||||
|
r.put("tags", tagPath);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SafeVarargs
|
||||||
|
private Path writeDocs(Path dir, Map<String, String>... rows) throws Exception {
|
||||||
|
Path xlsx = dir.resolve("canonical-documents.xlsx");
|
||||||
|
List<String> headers = List.of("index", "file", "sender_person_id", "sender_name",
|
||||||
|
"receiver_person_ids", "receiver_names", "date_iso", "date_raw", "date_precision",
|
||||||
|
"date_end", "location", "tags", "summary");
|
||||||
|
try (XSSFWorkbook wb = new XSSFWorkbook()) {
|
||||||
|
Sheet sheet = wb.createSheet("Sheet1");
|
||||||
|
Row header = sheet.createRow(0);
|
||||||
|
for (int i = 0; i < headers.size(); i++) {
|
||||||
|
header.createCell(i).setCellValue(headers.get(i));
|
||||||
|
}
|
||||||
|
for (int r = 0; r < rows.length; r++) {
|
||||||
|
Row row = sheet.createRow(r + 1);
|
||||||
|
for (int c = 0; c < headers.size(); c++) {
|
||||||
|
row.createCell(c).setCellValue(rows[r].getOrDefault(headers.get(c), ""));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try (OutputStream out = Files.newOutputStream(xlsx)) {
|
||||||
|
wb.write(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return xlsx;
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user