fix: replace WorkbookFactory with native XML/ZIP parser for ODS files

WorkbookFactory throws ODFNotOfficeXmlFileException on .ods files —
Apache POI does not support ODF format at all.

Replace ODS reading with a direct content.xml parser using Java's
built-in ZipFile + DOM API (no new dependency). ODS is a ZIP archive;
the spreadsheet lives in content.xml as standard ODF XML.

Also refactors the import pipeline to decouple file reading from import
logic: both ODS and XLSX paths now produce List<List<String>> which is
processed by format-agnostic row logic. XLSX date cells are now
converted to ISO strings before processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-03-15 21:09:46 +01:00
parent 5cc4dcf7aa
commit 4dd4d81ca3

View File

@@ -16,10 +16,13 @@ import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -28,14 +31,15 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.Stream;
import java.util.zip.ZipFile;
@Service
@RequiredArgsConstructor
@@ -93,6 +97,13 @@ public class MassImportService {
private static final String IMPORT_DIR = "/import";
private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN);
// ODS XML namespaces
private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
// We only need up to this many columns; caps repeated-empty-cell expansion
private static final int MAX_COLS = 20;
@Async
public void runImportAsync() {
if (currentStatus.state() == State.RUNNING) {
@@ -102,7 +113,7 @@ public class MassImportService {
try {
File spreadsheet = findSpreadsheetFile();
log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath());
int processed = processSpreadsheet(spreadsheet);
int processed = processRows(readSpreadsheet(spreadsheet));
currentStatus = new ImportStatus(State.DONE,
"Import abgeschlossen. " + processed + " Dokumente verarbeitet.",
processed, currentStatus.startedAt());
@@ -126,56 +137,143 @@ public class MassImportService {
}
}
private int processSpreadsheet(File file) throws IOException {
int count = 0;
// --- Spreadsheet reading (format-specific, produces neutral List<List<String>>) ---
private List<List<String>> readSpreadsheet(File file) throws Exception {
String name = file.getName().toLowerCase();
if (name.endsWith(".ods")) {
return readOds(file);
}
return readXlsx(file);
}
/**
* Reads an ODS file by parsing its content.xml directly (no extra library needed).
* ODS is a ZIP archive; content.xml holds the spreadsheet data as XML.
*/
private List<List<String>> readOds(File file) throws Exception {
List<List<String>> result = new ArrayList<>();
try (ZipFile zip = new ZipFile(file)) {
var entry = zip.getEntry("content.xml");
if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt");
var factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
var builder = factory.newDocumentBuilder();
var doc = builder.parse(zip.getInputStream(entry));
NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table");
if (tables.getLength() == 0) return result;
var table = (Element) tables.item(0);
NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row");
for (int i = 0; i < rows.getLength(); i++) {
var row = (Element) rows.item(i);
List<String> rowData = new ArrayList<>();
NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell");
for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) {
var cell = (Element) cells.item(j);
// Read the display text (first <text:p>)
String value = "";
NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p");
if (textNodes.getLength() > 0) {
value = textNodes.item(0).getTextContent().trim();
}
// Expand number-columns-repeated (capped at MAX_COLS)
String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated");
int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr);
repeat = Math.min(repeat, MAX_COLS - rowData.size());
for (int r = 0; r < repeat; r++) {
rowData.add(value);
}
}
result.add(rowData);
}
}
return result;
}
/** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */
private List<List<String>> readXlsx(File file) throws Exception {
List<List<String>> result = new ArrayList<>();
try (FileInputStream fis = new FileInputStream(file);
Workbook workbook = WorkbookFactory.create(fis)) {
Sheet sheet = workbook.getSheetAt(0);
for (int i = 1; i <= sheet.getLastRowNum(); i++) {
for (int i = 0; i <= sheet.getLastRowNum(); i++) {
Row row = sheet.getRow(i);
if (row == null) continue;
String index = getCellValue(row.getCell(colIndex));
if (index == null || index.isBlank()) continue;
// Append .pdf extension if the index has none
String filename = index.contains(".") ? index : index + ".pdf";
Optional<File> fileOnDisk = findFileRecursive(filename);
if (fileOnDisk.isPresent()) {
importSingleDocument(row, fileOnDisk.get(), filename, index);
count++;
} else {
log.warn("Datei nicht gefunden: {}", filename);
List<String> rowData = new ArrayList<>();
if (row != null) {
for (int j = 0; j < MAX_COLS; j++) {
rowData.add(xlsxCellToString(row.getCell(j)));
}
}
result.add(rowData);
}
}
return result;
}
private String xlsxCellToString(Cell cell) {
if (cell == null) return "";
return switch (cell.getCellType()) {
case STRING -> cell.getStringCellValue();
case NUMERIC -> {
if (DateUtil.isCellDateFormatted(cell)) {
yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO
}
yield String.valueOf((int) cell.getNumericCellValue());
}
case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
default -> "";
};
}
// --- Import logic (works on neutral List<String> rows) ---
private int processRows(List<List<String>> rows) {
int count = 0;
for (int i = 1; i < rows.size(); i++) { // skip header row
List<String> cells = rows.get(i);
String index = getCell(cells, colIndex);
if (index.isBlank()) continue;
String filename = index.contains(".") ? index : index + ".pdf";
Optional<File> fileOnDisk = findFileRecursive(filename);
if (fileOnDisk.isPresent()) {
importSingleDocument(cells, fileOnDisk.get(), filename, index);
count++;
} else {
log.warn("Datei nicht gefunden: {}", filename);
}
}
return count;
}
@Transactional
protected void importSingleDocument(Row row, File file, String originalFilename, String index) {
// Skip documents that have already been processed beyond placeholder stage
protected void importSingleDocument(List<String> cells, File file, String originalFilename, String index) {
Optional<Document> existing = documentRepository.findByOriginalFilename(originalFilename);
if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
return;
}
// Read metadata from ODS row
String archiveBox = getCellValue(row.getCell(colBox));
String archiveFolder = getCellValue(row.getCell(colFolder));
String senderRaw = getCellValue(row.getCell(colSender));
String receiversRaw = getCellValue(row.getCell(colReceivers));
LocalDate date = parseDate(row.getCell(colDate));
String location = getCellValue(row.getCell(colLocation));
String tagRaw = getCellValue(row.getCell(colTags));
String summary = getCellValue(row.getCell(colSummary));
String transcription = getCellValue(row.getCell(colTranscription));
String archiveBox = getCell(cells, colBox);
String archiveFolder = getCell(cells, colFolder);
String senderRaw = getCell(cells, colSender);
String receiversRaw = getCell(cells, colReceivers);
LocalDate date = parseDate(getCell(cells, colDate));
String location = getCell(cells, colLocation);
String tagRaw = getCell(cells, colTags);
String summary = getCell(cells, colSummary);
String transcription = getCell(cells, colTranscription);
// Detect content type from the local file
String contentType;
try {
contentType = Files.probeContentType(file.toPath());
@@ -184,7 +282,6 @@ public class MassImportService {
}
if (contentType == null) contentType = "application/octet-stream";
// Upload to S3
String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName();
try {
s3Client.putObject(PutObjectRequest.builder()
@@ -198,21 +295,17 @@ public class MassImportService {
return;
}
// Resolve sender and receivers to Person entities
Person sender = senderRaw != null && !senderRaw.isBlank() ? findOrCreatePerson(senderRaw) : null;
Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw);
List<Person> receivers = PersonNameParser.parseReceivers(receiversRaw).stream()
.map(this::findOrCreatePerson)
.toList();
// Resolve tag
Tag tag = null;
if (tagRaw != null && !tagRaw.isBlank()) {
if (!tagRaw.isBlank()) {
tag = tagRepository.findByNameIgnoreCase(tagRaw)
.orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build()));
}
// Build or update the Document record
Document doc = existing.orElse(Document.builder()
.originalFilename(originalFilename)
.build());
@@ -221,12 +314,12 @@ public class MassImportService {
doc.setFilePath(s3Key);
doc.setContentType(contentType);
doc.setStatus(DocumentStatus.UPLOADED);
doc.setArchiveBox(archiveBox);
doc.setArchiveFolder(archiveFolder);
doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox);
doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder);
doc.setDocumentDate(date);
doc.setLocation(location);
doc.setSummary(summary);
doc.setTranscription(transcription);
doc.setLocation(location.isBlank() ? null : location);
doc.setSummary(summary.isBlank() ? null : summary);
doc.setTranscription(transcription.isBlank() ? null : transcription);
doc.setSender(sender);
doc.getReceivers().addAll(receivers);
if (tag != null) doc.getTags().add(tag);
@@ -235,6 +328,34 @@ public class MassImportService {
log.info("Importiert: {}", originalFilename);
}
// --- Helpers ---
private String getCell(List<String> cells, int col) {
if (col >= cells.size()) return "";
String val = cells.get(col);
return val == null ? "" : val.trim();
}
private LocalDate parseDate(String value) {
if (value == null || value.isBlank()) return null;
try {
return LocalDate.parse(value.trim());
} catch (DateTimeParseException e) {
return null;
}
}
private String buildTitle(String index, LocalDate date, String location) {
StringBuilder sb = new StringBuilder(index);
if (date != null) {
sb.append(" \u2013 ").append(date.format(GERMAN_DATE));
}
if (location != null && !location.isBlank()) {
sb.append(" \u2013 ").append(location);
}
return sb.toString();
}
private Person findOrCreatePerson(String rawName) {
String alias = rawName.trim();
return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> {
@@ -247,32 +368,6 @@ public class MassImportService {
});
}
private String buildTitle(String index, LocalDate date, String location) {
StringBuilder sb = new StringBuilder(index);
if (date != null) {
sb.append(" ").append(date.format(GERMAN_DATE));
}
if (location != null && !location.isBlank()) {
sb.append(" ").append(location);
}
return sb.toString();
}
private LocalDate parseDate(Cell cell) {
if (cell == null) return null;
if (cell.getCellType() == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
return cell.getDateCellValue().toInstant().atZone(ZoneId.systemDefault()).toLocalDate();
}
if (cell.getCellType() == CellType.STRING) {
try {
return LocalDate.parse(cell.getStringCellValue().trim());
} catch (DateTimeParseException e) {
return null;
}
}
return null;
}
private Optional<File> findFileRecursive(String filename) {
try (Stream<Path> walk = Files.walk(Paths.get(IMPORT_DIR))) {
return walk.filter(p -> !Files.isDirectory(p))
@@ -283,14 +378,4 @@ public class MassImportService {
return Optional.empty();
}
}
private String getCellValue(Cell cell) {
if (cell == null) return null;
return switch (cell.getCellType()) {
case STRING -> cell.getStringCellValue();
case NUMERIC -> String.valueOf((int) cell.getNumericCellValue());
case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
default -> null;
};
}
}