feat(import): validate PDF magic bytes before S3 upload

Reads first 4 bytes of each candidate file before upload; rejects any
file whose header does not match %PDF (0x25 0x50 0x44 0x46). Skipped
files are counted and collected in ImportStatus.skippedFiles so
operators can see what was rejected without querying Loki.

Breaking: ImportStatus record gains skipped + skippedFiles fields.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-18 12:51:29 +02:00
committed by marcel
parent 1247b51d9e
commit f77fb79cd2
3 changed files with 141 additions and 27 deletions

View File

@@ -31,6 +31,7 @@ import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -53,9 +54,13 @@ public class MassImportService {
public enum State { IDLE, RUNNING, DONE, FAILED }
public record ImportStatus(State state, String statusCode, @JsonIgnore String message, int processed, LocalDateTime startedAt) {}
public record SkippedFile(String filename, String reason) {}
private volatile ImportStatus currentStatus = new ImportStatus(State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, null);
public record ImportStatus(State state, String statusCode, @JsonIgnore String message, int processed, int skipped, List<SkippedFile> skippedFiles, LocalDateTime startedAt) {}
record ProcessResult(int processed, int skipped, List<SkippedFile> skippedFiles) {}
private volatile ImportStatus currentStatus = new ImportStatus(State.IDLE, "IMPORT_IDLE", "Kein Import gestartet.", 0, 0, List.of(), null);
public ImportStatus getStatus() {
return currentStatus;
@@ -117,22 +122,22 @@ public class MassImportService {
if (currentStatus.state() == State.RUNNING) {
throw DomainException.conflict(ErrorCode.IMPORT_ALREADY_RUNNING, "A mass import is already in progress");
}
currentStatus = new ImportStatus(State.RUNNING, "IMPORT_RUNNING", "Import läuft...", 0, LocalDateTime.now());
currentStatus = new ImportStatus(State.RUNNING, "IMPORT_RUNNING", "Import läuft...", 0, 0, List.of(), LocalDateTime.now());
try {
File spreadsheet = findSpreadsheetFile();
log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath());
int processed = processRows(readSpreadsheet(spreadsheet));
ProcessResult result = processRows(readSpreadsheet(spreadsheet));
currentStatus = new ImportStatus(State.DONE, "IMPORT_DONE",
"Import abgeschlossen. " + processed + " Dokumente verarbeitet.",
processed, currentStatus.startedAt());
"Import abgeschlossen. " + result.processed() + " Dokumente verarbeitet.",
result.processed(), result.skipped(), result.skippedFiles(), currentStatus.startedAt());
} catch (NoSpreadsheetException e) {
log.error("Massenimport fehlgeschlagen: keine Tabellendatei", e);
currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_NO_SPREADSHEET",
"Fehler: " + e.getMessage(), 0, currentStatus.startedAt());
"Fehler: " + e.getMessage(), 0, 0, List.of(), currentStatus.startedAt());
} catch (Exception e) {
log.error("Massenimport fehlgeschlagen", e);
currentStatus = new ImportStatus(State.FAILED, "IMPORT_FAILED_INTERNAL",
"Fehler: " + e.getMessage(), 0, currentStatus.startedAt());
"Fehler: " + e.getMessage(), 0, 0, List.of(), currentStatus.startedAt());
}
}
@@ -254,8 +259,10 @@ public class MassImportService {
// --- Import logic (works on neutral List<String> rows) ---
private int processRows(List<List<String>> rows) {
int count = 0;
private ProcessResult processRows(List<List<String>> rows) {
int processed = 0;
List<SkippedFile> skippedFiles = new ArrayList<>();
for (int i = 1; i < rows.size(); i++) { // skip header row
List<String> cells = rows.get(i);
String index = getCell(cells, colIndex);
@@ -266,18 +273,46 @@ public class MassImportService {
if (fileOnDisk.isEmpty()) {
log.warn("Datei nicht gefunden, importiere nur Metadaten: {}", filename);
}
importSingleDocument(cells, fileOnDisk, filename, index);
count++;
if (fileOnDisk.isPresent()) {
try {
if (!isPdfMagicBytes(fileOnDisk.get())) {
log.warn("Überspringe {}: Datei beginnt nicht mit %PDF-Signatur", filename);
skippedFiles.add(new SkippedFile(filename, "Keine gültige PDF-Signatur"));
continue;
}
} catch (IOException e) {
log.error("Fehler beim Prüfen der Magic-Bytes für {}", filename, e);
skippedFiles.add(new SkippedFile(filename, "Fehler beim Lesen der Datei"));
continue;
}
}
boolean imported = importSingleDocument(cells, fileOnDisk, filename, index);
if (imported) {
processed++;
}
}
return new ProcessResult(processed, skippedFiles.size(), skippedFiles);
}
private boolean isPdfMagicBytes(File file) throws IOException {
try (InputStream is = new FileInputStream(file)) {
byte[] header = is.readNBytes(4);
return header.length == 4
&& header[0] == 0x25 // %
&& header[1] == 0x50 // P
&& header[2] == 0x44 // D
&& header[3] == 0x46; // F
}
return count;
}
@Transactional
protected void importSingleDocument(List<String> cells, Optional<File> file, String originalFilename, String index) {
protected boolean importSingleDocument(List<String> cells, Optional<File> file, String originalFilename, String index) {
Optional<Document> existing = documentService.findByOriginalFilename(originalFilename);
if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
return;
return false;
}
String archiveBox = getCell(cells, colBox);
@@ -313,7 +348,7 @@ public class MassImportService {
status = DocumentStatus.UPLOADED;
} catch (Exception e) {
log.error("S3 Upload Fehler für {}", file.get().getName(), e);
return;
return false;
}
}
@@ -355,6 +390,7 @@ public class MassImportService {
thumbnailAsyncRunner.dispatchAfterCommit(saved.getId());
}
log.info("Importiert{}: {}", file.isEmpty() ? " (nur Metadaten)" : "", originalFilename);
return true;
}
// --- Helpers ---