fix: replace WorkbookFactory with native XML/ZIP parser for ODS files

WorkbookFactory throws ODFNotOfficeXmlFileException on .ods files —
Apache POI does not support ODF format at all.

Replace ODS reading with a direct content.xml parser using Java's
built-in ZipFile + DOM API (no new dependency). ODS is a ZIP archive;
the spreadsheet lives in content.xml as standard ODF XML.

Also refactors the import pipeline to decouple file reading from import
logic: both ODS and XLSX paths now produce List<List<String>> which is
processed by format-agnostic row logic. XLSX date cells are now
converted to ISO strings before processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-03-15 21:09:46 +01:00
parent 5cc4dcf7aa
commit 4dd4d81ca3

View File

@@ -16,10 +16,13 @@ import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async; import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.annotation.Transactional;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
@@ -28,14 +31,15 @@ import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException; import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Optional; import java.util.Optional;
import java.util.UUID; import java.util.UUID;
import java.util.stream.Stream; import java.util.stream.Stream;
import java.util.zip.ZipFile;
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@@ -93,6 +97,13 @@ public class MassImportService {
private static final String IMPORT_DIR = "/import"; private static final String IMPORT_DIR = "/import";
private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN); private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN);
// ODS XML namespaces: NS_TABLE qualifies the table/row/cell elements and the
// number-columns-repeated attribute, NS_TEXT qualifies the <text:p> paragraphs
// that carry a cell's display text.
private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
// We only need up to this many columns; every row read from a spreadsheet is cut off
// at this width, which also caps the expansion of repeated (typically empty) cells.
private static final int MAX_COLS = 20;
@Async @Async
public void runImportAsync() { public void runImportAsync() {
if (currentStatus.state() == State.RUNNING) { if (currentStatus.state() == State.RUNNING) {
@@ -102,7 +113,7 @@ public class MassImportService {
try { try {
File spreadsheet = findSpreadsheetFile(); File spreadsheet = findSpreadsheetFile();
log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath()); log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath());
int processed = processSpreadsheet(spreadsheet); int processed = processRows(readSpreadsheet(spreadsheet));
currentStatus = new ImportStatus(State.DONE, currentStatus = new ImportStatus(State.DONE,
"Import abgeschlossen. " + processed + " Dokumente verarbeitet.", "Import abgeschlossen. " + processed + " Dokumente verarbeitet.",
processed, currentStatus.startedAt()); processed, currentStatus.startedAt());
@@ -126,56 +137,143 @@ public class MassImportService {
} }
} }
private int processSpreadsheet(File file) throws IOException { // --- Spreadsheet reading (format-specific, produces neutral List<List<String>>) ---
int count = 0;
/**
 * Reads the given spreadsheet into a format-neutral list of rows.
 *
 * <p>Files whose lower-cased name ends in {@code .ods} are handed to {@code readOds};
 * every other file is handed to {@code readXlsx}.
 *
 * @param file the spreadsheet file to read
 * @return the rows produced by the format-specific reader
 * @throws Exception whatever the selected reader throws
 */
private List<List<String>> readSpreadsheet(File file) throws Exception {
    boolean isOds = file.getName().toLowerCase().endsWith(".ods");
    return isOds ? readOds(file) : readXlsx(file);
}
/**
 * Reads the first table of an ODS file by parsing its {@code content.xml} directly
 * (no extra library needed). ODS is a ZIP archive; {@code content.xml} holds the
 * spreadsheet data as XML.
 *
 * <p>Each row becomes a list of cell texts: the trimmed content of the cell's first
 * {@code <text:p>}, or an empty string when the cell has none. Cells carrying
 * {@code table:number-columns-repeated} are expanded, and every row is cut off at
 * {@code MAX_COLS} entries. Covered cells (the hidden part of a merged range) are
 * emitted as empty strings so that the columns following a merge keep their position.
 * {@code table:number-rows-repeated} is not expanded; a repeated row appears once.
 *
 * @param file the ODS file to read
 * @return the rows of the first table in document order; empty if the document has no table
 * @throws Exception if the archive cannot be opened, {@code content.xml} is missing,
 *                   or the XML is malformed or declares a DOCTYPE
 */
private List<List<String>> readOds(File file) throws Exception {
    List<List<String>> result = new ArrayList<>();
    try (ZipFile zip = new ZipFile(file)) {
        var entry = zip.getEntry("content.xml");
        if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt");

        var factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        // The file comes from the import directory: refuse DTDs and external entities
        // so a crafted content.xml cannot read local files or trigger entity expansion.
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        factory.setXIncludeAware(false);
        factory.setExpandEntityReferences(false);
        var builder = factory.newDocumentBuilder();
        var doc = builder.parse(zip.getInputStream(entry));

        NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table");
        if (tables.getLength() == 0) return result;

        var table = (Element) tables.item(0);
        NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row");
        for (int i = 0; i < rows.getLength(); i++) {
            var row = (Element) rows.item(i);
            List<String> rowData = new ArrayList<>();

            // Walk the row's direct children only: a descendant search would also pick up
            // cells of tables nested inside a cell and would skip covered cells entirely.
            NodeList children = row.getChildNodes();
            for (int j = 0; j < children.getLength() && rowData.size() < MAX_COLS; j++) {
                if (!(children.item(j) instanceof Element cell)) continue;
                if (!NS_TABLE.equals(cell.getNamespaceURI())) continue;

                String localName = cell.getLocalName();
                boolean covered = "covered-table-cell".equals(localName);
                if (!covered && !"table-cell".equals(localName)) continue;

                // Read the display text (first <text:p>); covered cells only hold a column slot
                String value = "";
                if (!covered) {
                    NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p");
                    if (textNodes.getLength() > 0) {
                        value = textNodes.item(0).getTextContent().trim();
                    }
                }

                // Expand number-columns-repeated (capped at MAX_COLS)
                String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated");
                int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr);
                repeat = Math.min(repeat, MAX_COLS - rowData.size());
                for (int r = 0; r < repeat; r++) {
                    rowData.add(value);
                }
            }
            result.add(rowData);
        }
    }
    return result;
}
/** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */
private List<List<String>> readXlsx(File file) throws Exception {
List<List<String>> result = new ArrayList<>();
try (FileInputStream fis = new FileInputStream(file); try (FileInputStream fis = new FileInputStream(file);
Workbook workbook = WorkbookFactory.create(fis)) { Workbook workbook = WorkbookFactory.create(fis)) {
Sheet sheet = workbook.getSheetAt(0); Sheet sheet = workbook.getSheetAt(0);
for (int i = 0; i <= sheet.getLastRowNum(); i++) {
for (int i = 1; i <= sheet.getLastRowNum(); i++) {
Row row = sheet.getRow(i); Row row = sheet.getRow(i);
if (row == null) continue; List<String> rowData = new ArrayList<>();
if (row != null) {
String index = getCellValue(row.getCell(colIndex)); for (int j = 0; j < MAX_COLS; j++) {
if (index == null || index.isBlank()) continue; rowData.add(xlsxCellToString(row.getCell(j)));
}
// Append .pdf extension if the index has none
String filename = index.contains(".") ? index : index + ".pdf";
Optional<File> fileOnDisk = findFileRecursive(filename);
if (fileOnDisk.isPresent()) {
importSingleDocument(row, fileOnDisk.get(), filename, index);
count++;
} else {
log.warn("Datei nicht gefunden: {}", filename);
} }
result.add(rowData);
}
}
return result;
}
/**
 * Converts a POI cell to its string representation for the format-neutral row model.
 *
 * <ul>
 *   <li>{@code null} cells and cell types not listed below yield an empty string.</li>
 *   <li>String cells yield their text unchanged.</li>
 *   <li>Date-formatted numeric cells yield the ISO-8601 date ({@code yyyy-MM-dd}).</li>
 *   <li>Other numeric cells yield the number without a decimal part when it is a whole
 *       number, otherwise the decimal value as printed by {@code String.valueOf(double)}.</li>
 *   <li>Boolean cells yield {@code "true"} or {@code "false"}.</li>
 * </ul>
 *
 * @param cell the cell to convert; may be {@code null}
 * @return the cell content as a string, never {@code null}
 */
private String xlsxCellToString(Cell cell) {
    if (cell == null) return "";
    return switch (cell.getCellType()) {
        case STRING -> cell.getStringCellValue();
        case NUMERIC -> {
            if (DateUtil.isCellDateFormatted(cell)) {
                yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO
            }
            double number = cell.getNumericCellValue();
            // An (int) cast would silently drop fractions and saturate beyond the int range;
            // print whole numbers via long and keep everything else as a decimal.
            boolean wholeNumber = number == Math.rint(number) && Math.abs(number) < 1e18;
            yield wholeNumber ? String.valueOf((long) number) : String.valueOf(number);
        }
        case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
        default -> "";
    };
}
// --- Import logic (works on neutral List<String> rows) ---
private int processRows(List<List<String>> rows) {
int count = 0;
for (int i = 1; i < rows.size(); i++) { // skip header row
List<String> cells = rows.get(i);
String index = getCell(cells, colIndex);
if (index.isBlank()) continue;
String filename = index.contains(".") ? index : index + ".pdf";
Optional<File> fileOnDisk = findFileRecursive(filename);
if (fileOnDisk.isPresent()) {
importSingleDocument(cells, fileOnDisk.get(), filename, index);
count++;
} else {
log.warn("Datei nicht gefunden: {}", filename);
} }
} }
return count; return count;
} }
@Transactional @Transactional
protected void importSingleDocument(Row row, File file, String originalFilename, String index) { protected void importSingleDocument(List<String> cells, File file, String originalFilename, String index) {
// Skip documents that have already been processed beyond placeholder stage
Optional<Document> existing = documentRepository.findByOriginalFilename(originalFilename); Optional<Document> existing = documentRepository.findByOriginalFilename(originalFilename);
if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) { if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
log.info("Dokument {} existiert bereits, überspringe.", originalFilename); log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
return; return;
} }
// Read metadata from ODS row String archiveBox = getCell(cells, colBox);
String archiveBox = getCellValue(row.getCell(colBox)); String archiveFolder = getCell(cells, colFolder);
String archiveFolder = getCellValue(row.getCell(colFolder)); String senderRaw = getCell(cells, colSender);
String senderRaw = getCellValue(row.getCell(colSender)); String receiversRaw = getCell(cells, colReceivers);
String receiversRaw = getCellValue(row.getCell(colReceivers)); LocalDate date = parseDate(getCell(cells, colDate));
LocalDate date = parseDate(row.getCell(colDate)); String location = getCell(cells, colLocation);
String location = getCellValue(row.getCell(colLocation)); String tagRaw = getCell(cells, colTags);
String tagRaw = getCellValue(row.getCell(colTags)); String summary = getCell(cells, colSummary);
String summary = getCellValue(row.getCell(colSummary)); String transcription = getCell(cells, colTranscription);
String transcription = getCellValue(row.getCell(colTranscription));
// Detect content type from the local file
String contentType; String contentType;
try { try {
contentType = Files.probeContentType(file.toPath()); contentType = Files.probeContentType(file.toPath());
@@ -184,7 +282,6 @@ public class MassImportService {
} }
if (contentType == null) contentType = "application/octet-stream"; if (contentType == null) contentType = "application/octet-stream";
// Upload to S3
String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName(); String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName();
try { try {
s3Client.putObject(PutObjectRequest.builder() s3Client.putObject(PutObjectRequest.builder()
@@ -198,21 +295,17 @@ public class MassImportService {
return; return;
} }
// Resolve sender and receivers to Person entities Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw);
Person sender = senderRaw != null && !senderRaw.isBlank() ? findOrCreatePerson(senderRaw) : null;
List<Person> receivers = PersonNameParser.parseReceivers(receiversRaw).stream() List<Person> receivers = PersonNameParser.parseReceivers(receiversRaw).stream()
.map(this::findOrCreatePerson) .map(this::findOrCreatePerson)
.toList(); .toList();
// Resolve tag
Tag tag = null; Tag tag = null;
if (tagRaw != null && !tagRaw.isBlank()) { if (!tagRaw.isBlank()) {
tag = tagRepository.findByNameIgnoreCase(tagRaw) tag = tagRepository.findByNameIgnoreCase(tagRaw)
.orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build())); .orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build()));
} }
// Build or update the Document record
Document doc = existing.orElse(Document.builder() Document doc = existing.orElse(Document.builder()
.originalFilename(originalFilename) .originalFilename(originalFilename)
.build()); .build());
@@ -221,12 +314,12 @@ public class MassImportService {
doc.setFilePath(s3Key); doc.setFilePath(s3Key);
doc.setContentType(contentType); doc.setContentType(contentType);
doc.setStatus(DocumentStatus.UPLOADED); doc.setStatus(DocumentStatus.UPLOADED);
doc.setArchiveBox(archiveBox); doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox);
doc.setArchiveFolder(archiveFolder); doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder);
doc.setDocumentDate(date); doc.setDocumentDate(date);
doc.setLocation(location); doc.setLocation(location.isBlank() ? null : location);
doc.setSummary(summary); doc.setSummary(summary.isBlank() ? null : summary);
doc.setTranscription(transcription); doc.setTranscription(transcription.isBlank() ? null : transcription);
doc.setSender(sender); doc.setSender(sender);
doc.getReceivers().addAll(receivers); doc.getReceivers().addAll(receivers);
if (tag != null) doc.getTags().add(tag); if (tag != null) doc.getTags().add(tag);
@@ -235,6 +328,34 @@ public class MassImportService {
log.info("Importiert: {}", originalFilename); log.info("Importiert: {}", originalFilename);
} }
// --- Helpers ---
/**
 * Returns the trimmed value of one column of a row.
 *
 * @param cells the row as a list of cell strings
 * @param col   zero-based column index
 * @return the trimmed cell text, or an empty string when the index lies outside the row
 *         (negative or past the last cell) or the stored value is {@code null}
 */
private String getCell(List<String> cells, int col) {
    // Treat a negative index like a column beyond the row instead of letting List.get throw
    if (col < 0 || col >= cells.size()) return "";
    String val = cells.get(col);
    return val == null ? "" : val.trim();
}
/**
 * Parses an ISO-8601 date string ({@code yyyy-MM-dd}) after trimming it.
 *
 * @param value the raw cell text; may be {@code null}
 * @return the parsed date, or {@code null} when the text is {@code null}, blank,
 *         or not a valid ISO date
 */
private LocalDate parseDate(String value) {
    boolean hasText = value != null && !value.isBlank();
    if (!hasText) {
        return null;
    }
    LocalDate parsed;
    try {
        parsed = LocalDate.parse(value.trim());
    } catch (DateTimeParseException e) {
        // Unparseable dates are treated the same as missing dates
        parsed = null;
    }
    return parsed;
}
/**
 * Builds a document title from the index, followed by the date formatted with
 * {@code GERMAN_DATE} and the location, each preceded by an en-dash separator.
 * The date part is omitted when {@code date} is {@code null}; the location part is
 * omitted when {@code location} is {@code null} or blank.
 *
 * @param index    the document index, used as the start of the title
 * @param date     the document date; may be {@code null}
 * @param location the location; may be {@code null} or blank
 * @return the assembled title
 */
private String buildTitle(String index, LocalDate date, String location) {
    final String separator = " \u2013 ";
    StringBuilder title = new StringBuilder(index);
    if (date != null) {
        title.append(separator);
        title.append(date.format(GERMAN_DATE));
    }
    boolean hasLocation = !(location == null || location.isBlank());
    if (hasLocation) {
        title.append(separator);
        title.append(location);
    }
    return title.toString();
}
private Person findOrCreatePerson(String rawName) { private Person findOrCreatePerson(String rawName) {
String alias = rawName.trim(); String alias = rawName.trim();
return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> { return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> {
@@ -247,32 +368,6 @@ public class MassImportService {
}); });
} }
/**
 * Builds a document title from the index, followed by the date formatted with
 * {@code GERMAN_DATE} and the location, each preceded by a single space.
 * The date part is omitted when {@code date} is {@code null}; the location part is
 * omitted when {@code location} is {@code null} or blank.
 *
 * @param index    the document index, used as the start of the title
 * @param date     the document date; may be {@code null}
 * @param location the location; may be {@code null} or blank
 * @return the assembled title
 */
private String buildTitle(String index, LocalDate date, String location) {
    final String separator = " ";
    StringBuilder title = new StringBuilder(index);
    if (date != null) {
        title.append(separator);
        title.append(date.format(GERMAN_DATE));
    }
    boolean hasLocation = !(location == null || location.isBlank());
    if (hasLocation) {
        title.append(separator);
        title.append(location);
    }
    return title.toString();
}
/**
 * Extracts a date from a POI cell.
 *
 * <p>Date-formatted numeric cells are converted via the system default time zone.
 * String cells are trimmed and parsed as ISO-8601 ({@code yyyy-MM-dd}).
 *
 * @param cell the cell to read; may be {@code null}
 * @return the date, or {@code null} when the cell is {@code null}, is of another type,
 *         or holds text that is not a valid ISO date
 */
private LocalDate parseDate(Cell cell) {
    if (cell == null) {
        return null;
    }
    CellType type = cell.getCellType();
    boolean isDateCell = type == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell);
    if (isDateCell) {
        var zoned = cell.getDateCellValue().toInstant().atZone(ZoneId.systemDefault());
        return zoned.toLocalDate();
    }
    if (type != CellType.STRING) {
        return null;
    }
    String text = cell.getStringCellValue().trim();
    try {
        return LocalDate.parse(text);
    } catch (DateTimeParseException e) {
        // Text that is not an ISO date counts as "no date"
        return null;
    }
}
private Optional<File> findFileRecursive(String filename) { private Optional<File> findFileRecursive(String filename) {
try (Stream<Path> walk = Files.walk(Paths.get(IMPORT_DIR))) { try (Stream<Path> walk = Files.walk(Paths.get(IMPORT_DIR))) {
return walk.filter(p -> !Files.isDirectory(p)) return walk.filter(p -> !Files.isDirectory(p))
@@ -283,14 +378,4 @@ public class MassImportService {
return Optional.empty(); return Optional.empty();
} }
} }
/**
 * Converts a POI cell to a string.
 *
 * <p>String cells yield their text, boolean cells {@code "true"}/{@code "false"}.
 * Numeric cells yield the number without a decimal part when it is a whole number,
 * otherwise the decimal value as printed by {@code String.valueOf(double)}.
 *
 * @param cell the cell to convert; may be {@code null}
 * @return the cell content, or {@code null} for a {@code null} cell or any other cell type
 */
private String getCellValue(Cell cell) {
    if (cell == null) return null;
    return switch (cell.getCellType()) {
        case STRING -> cell.getStringCellValue();
        case NUMERIC -> {
            double number = cell.getNumericCellValue();
            // An (int) cast would silently drop fractions and saturate beyond the int range
            boolean wholeNumber = number == Math.rint(number) && Math.abs(number) < 1e18;
            yield wholeNumber ? String.valueOf((long) number) : String.valueOf(number);
        }
        case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
        default -> null;
    };
}
} }