fix: replace WorkbookFactory with native XML/ZIP parser for ODS files
WorkbookFactory throws ODFNotOfficeXmlFileException on .ods files — Apache POI does not support ODF format at all. Replace ODS reading with a direct content.xml parser using Java's built-in ZipFile + DOM API (no new dependency). ODS is a ZIP archive; the spreadsheet lives in content.xml as standard ODF XML. Also refactors the import pipeline to decouple file reading from import logic: both ODS and XLSX paths now produce List<List<String>> which is processed by format-agnostic row logic. XLSX date cells are now converted to ISO strings before processing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,10 +16,13 @@ import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.NodeList;
|
||||
import software.amazon.awssdk.core.sync.RequestBody;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
@@ -28,14 +31,15 @@ import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.format.DateTimeParseException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@@ -93,6 +97,13 @@ public class MassImportService {
|
||||
private static final String IMPORT_DIR = "/import";
|
||||
private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN);
|
||||
|
||||
// ODS XML namespaces
|
||||
private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
|
||||
private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
|
||||
|
||||
// We only need up to this many columns; caps repeated-empty-cell expansion
|
||||
private static final int MAX_COLS = 20;
|
||||
|
||||
@Async
|
||||
public void runImportAsync() {
|
||||
if (currentStatus.state() == State.RUNNING) {
|
||||
@@ -102,7 +113,7 @@ public class MassImportService {
|
||||
try {
|
||||
File spreadsheet = findSpreadsheetFile();
|
||||
log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath());
|
||||
int processed = processSpreadsheet(spreadsheet);
|
||||
int processed = processRows(readSpreadsheet(spreadsheet));
|
||||
currentStatus = new ImportStatus(State.DONE,
|
||||
"Import abgeschlossen. " + processed + " Dokumente verarbeitet.",
|
||||
processed, currentStatus.startedAt());
|
||||
@@ -126,56 +137,143 @@ public class MassImportService {
|
||||
}
|
||||
}
|
||||
|
||||
private int processSpreadsheet(File file) throws IOException {
|
||||
int count = 0;
|
||||
// --- Spreadsheet reading (format-specific, produces neutral List<List<String>>) ---
|
||||
|
||||
private List<List<String>> readSpreadsheet(File file) throws Exception {
|
||||
String name = file.getName().toLowerCase();
|
||||
if (name.endsWith(".ods")) {
|
||||
return readOds(file);
|
||||
}
|
||||
return readXlsx(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads an ODS file by parsing its content.xml directly (no extra library needed).
|
||||
* ODS is a ZIP archive; content.xml holds the spreadsheet data as XML.
|
||||
*/
|
||||
private List<List<String>> readOds(File file) throws Exception {
|
||||
List<List<String>> result = new ArrayList<>();
|
||||
|
||||
try (ZipFile zip = new ZipFile(file)) {
|
||||
var entry = zip.getEntry("content.xml");
|
||||
if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt");
|
||||
|
||||
var factory = DocumentBuilderFactory.newInstance();
|
||||
factory.setNamespaceAware(true);
|
||||
var builder = factory.newDocumentBuilder();
|
||||
var doc = builder.parse(zip.getInputStream(entry));
|
||||
|
||||
NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table");
|
||||
if (tables.getLength() == 0) return result;
|
||||
|
||||
var table = (Element) tables.item(0);
|
||||
NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row");
|
||||
|
||||
for (int i = 0; i < rows.getLength(); i++) {
|
||||
var row = (Element) rows.item(i);
|
||||
List<String> rowData = new ArrayList<>();
|
||||
NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell");
|
||||
|
||||
for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) {
|
||||
var cell = (Element) cells.item(j);
|
||||
|
||||
// Read the display text (first <text:p>)
|
||||
String value = "";
|
||||
NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p");
|
||||
if (textNodes.getLength() > 0) {
|
||||
value = textNodes.item(0).getTextContent().trim();
|
||||
}
|
||||
|
||||
// Expand number-columns-repeated (capped at MAX_COLS)
|
||||
String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated");
|
||||
int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr);
|
||||
repeat = Math.min(repeat, MAX_COLS - rowData.size());
|
||||
|
||||
for (int r = 0; r < repeat; r++) {
|
||||
rowData.add(value);
|
||||
}
|
||||
}
|
||||
result.add(rowData);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */
|
||||
private List<List<String>> readXlsx(File file) throws Exception {
|
||||
List<List<String>> result = new ArrayList<>();
|
||||
try (FileInputStream fis = new FileInputStream(file);
|
||||
Workbook workbook = WorkbookFactory.create(fis)) {
|
||||
|
||||
Sheet sheet = workbook.getSheetAt(0);
|
||||
|
||||
for (int i = 1; i <= sheet.getLastRowNum(); i++) {
|
||||
for (int i = 0; i <= sheet.getLastRowNum(); i++) {
|
||||
Row row = sheet.getRow(i);
|
||||
if (row == null) continue;
|
||||
|
||||
String index = getCellValue(row.getCell(colIndex));
|
||||
if (index == null || index.isBlank()) continue;
|
||||
|
||||
// Append .pdf extension if the index has none
|
||||
String filename = index.contains(".") ? index : index + ".pdf";
|
||||
|
||||
Optional<File> fileOnDisk = findFileRecursive(filename);
|
||||
if (fileOnDisk.isPresent()) {
|
||||
importSingleDocument(row, fileOnDisk.get(), filename, index);
|
||||
count++;
|
||||
} else {
|
||||
log.warn("Datei nicht gefunden: {}", filename);
|
||||
List<String> rowData = new ArrayList<>();
|
||||
if (row != null) {
|
||||
for (int j = 0; j < MAX_COLS; j++) {
|
||||
rowData.add(xlsxCellToString(row.getCell(j)));
|
||||
}
|
||||
}
|
||||
result.add(rowData);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private String xlsxCellToString(Cell cell) {
|
||||
if (cell == null) return "";
|
||||
return switch (cell.getCellType()) {
|
||||
case STRING -> cell.getStringCellValue();
|
||||
case NUMERIC -> {
|
||||
if (DateUtil.isCellDateFormatted(cell)) {
|
||||
yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO
|
||||
}
|
||||
yield String.valueOf((int) cell.getNumericCellValue());
|
||||
}
|
||||
case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
|
||||
default -> "";
|
||||
};
|
||||
}
|
||||
|
||||
// --- Import logic (works on neutral List<String> rows) ---
|
||||
|
||||
private int processRows(List<List<String>> rows) {
|
||||
int count = 0;
|
||||
for (int i = 1; i < rows.size(); i++) { // skip header row
|
||||
List<String> cells = rows.get(i);
|
||||
String index = getCell(cells, colIndex);
|
||||
if (index.isBlank()) continue;
|
||||
|
||||
String filename = index.contains(".") ? index : index + ".pdf";
|
||||
Optional<File> fileOnDisk = findFileRecursive(filename);
|
||||
if (fileOnDisk.isPresent()) {
|
||||
importSingleDocument(cells, fileOnDisk.get(), filename, index);
|
||||
count++;
|
||||
} else {
|
||||
log.warn("Datei nicht gefunden: {}", filename);
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
@Transactional
|
||||
protected void importSingleDocument(Row row, File file, String originalFilename, String index) {
|
||||
// Skip documents that have already been processed beyond placeholder stage
|
||||
protected void importSingleDocument(List<String> cells, File file, String originalFilename, String index) {
|
||||
Optional<Document> existing = documentRepository.findByOriginalFilename(originalFilename);
|
||||
if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
|
||||
log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
|
||||
return;
|
||||
}
|
||||
|
||||
// Read metadata from ODS row
|
||||
String archiveBox = getCellValue(row.getCell(colBox));
|
||||
String archiveFolder = getCellValue(row.getCell(colFolder));
|
||||
String senderRaw = getCellValue(row.getCell(colSender));
|
||||
String receiversRaw = getCellValue(row.getCell(colReceivers));
|
||||
LocalDate date = parseDate(row.getCell(colDate));
|
||||
String location = getCellValue(row.getCell(colLocation));
|
||||
String tagRaw = getCellValue(row.getCell(colTags));
|
||||
String summary = getCellValue(row.getCell(colSummary));
|
||||
String transcription = getCellValue(row.getCell(colTranscription));
|
||||
String archiveBox = getCell(cells, colBox);
|
||||
String archiveFolder = getCell(cells, colFolder);
|
||||
String senderRaw = getCell(cells, colSender);
|
||||
String receiversRaw = getCell(cells, colReceivers);
|
||||
LocalDate date = parseDate(getCell(cells, colDate));
|
||||
String location = getCell(cells, colLocation);
|
||||
String tagRaw = getCell(cells, colTags);
|
||||
String summary = getCell(cells, colSummary);
|
||||
String transcription = getCell(cells, colTranscription);
|
||||
|
||||
// Detect content type from the local file
|
||||
String contentType;
|
||||
try {
|
||||
contentType = Files.probeContentType(file.toPath());
|
||||
@@ -184,7 +282,6 @@ public class MassImportService {
|
||||
}
|
||||
if (contentType == null) contentType = "application/octet-stream";
|
||||
|
||||
// Upload to S3
|
||||
String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName();
|
||||
try {
|
||||
s3Client.putObject(PutObjectRequest.builder()
|
||||
@@ -198,21 +295,17 @@ public class MassImportService {
|
||||
return;
|
||||
}
|
||||
|
||||
// Resolve sender and receivers to Person entities
|
||||
Person sender = senderRaw != null && !senderRaw.isBlank() ? findOrCreatePerson(senderRaw) : null;
|
||||
|
||||
Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw);
|
||||
List<Person> receivers = PersonNameParser.parseReceivers(receiversRaw).stream()
|
||||
.map(this::findOrCreatePerson)
|
||||
.toList();
|
||||
|
||||
// Resolve tag
|
||||
Tag tag = null;
|
||||
if (tagRaw != null && !tagRaw.isBlank()) {
|
||||
if (!tagRaw.isBlank()) {
|
||||
tag = tagRepository.findByNameIgnoreCase(tagRaw)
|
||||
.orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build()));
|
||||
}
|
||||
|
||||
// Build or update the Document record
|
||||
Document doc = existing.orElse(Document.builder()
|
||||
.originalFilename(originalFilename)
|
||||
.build());
|
||||
@@ -221,12 +314,12 @@ public class MassImportService {
|
||||
doc.setFilePath(s3Key);
|
||||
doc.setContentType(contentType);
|
||||
doc.setStatus(DocumentStatus.UPLOADED);
|
||||
doc.setArchiveBox(archiveBox);
|
||||
doc.setArchiveFolder(archiveFolder);
|
||||
doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox);
|
||||
doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder);
|
||||
doc.setDocumentDate(date);
|
||||
doc.setLocation(location);
|
||||
doc.setSummary(summary);
|
||||
doc.setTranscription(transcription);
|
||||
doc.setLocation(location.isBlank() ? null : location);
|
||||
doc.setSummary(summary.isBlank() ? null : summary);
|
||||
doc.setTranscription(transcription.isBlank() ? null : transcription);
|
||||
doc.setSender(sender);
|
||||
doc.getReceivers().addAll(receivers);
|
||||
if (tag != null) doc.getTags().add(tag);
|
||||
@@ -235,6 +328,34 @@ public class MassImportService {
|
||||
log.info("Importiert: {}", originalFilename);
|
||||
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
private String getCell(List<String> cells, int col) {
|
||||
if (col >= cells.size()) return "";
|
||||
String val = cells.get(col);
|
||||
return val == null ? "" : val.trim();
|
||||
}
|
||||
|
||||
private LocalDate parseDate(String value) {
|
||||
if (value == null || value.isBlank()) return null;
|
||||
try {
|
||||
return LocalDate.parse(value.trim());
|
||||
} catch (DateTimeParseException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String buildTitle(String index, LocalDate date, String location) {
|
||||
StringBuilder sb = new StringBuilder(index);
|
||||
if (date != null) {
|
||||
sb.append(" \u2013 ").append(date.format(GERMAN_DATE));
|
||||
}
|
||||
if (location != null && !location.isBlank()) {
|
||||
sb.append(" \u2013 ").append(location);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private Person findOrCreatePerson(String rawName) {
|
||||
String alias = rawName.trim();
|
||||
return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> {
|
||||
@@ -247,32 +368,6 @@ public class MassImportService {
|
||||
});
|
||||
}
|
||||
|
||||
private String buildTitle(String index, LocalDate date, String location) {
|
||||
StringBuilder sb = new StringBuilder(index);
|
||||
if (date != null) {
|
||||
sb.append(" – ").append(date.format(GERMAN_DATE));
|
||||
}
|
||||
if (location != null && !location.isBlank()) {
|
||||
sb.append(" – ").append(location);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private LocalDate parseDate(Cell cell) {
|
||||
if (cell == null) return null;
|
||||
if (cell.getCellType() == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
|
||||
return cell.getDateCellValue().toInstant().atZone(ZoneId.systemDefault()).toLocalDate();
|
||||
}
|
||||
if (cell.getCellType() == CellType.STRING) {
|
||||
try {
|
||||
return LocalDate.parse(cell.getStringCellValue().trim());
|
||||
} catch (DateTimeParseException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private Optional<File> findFileRecursive(String filename) {
|
||||
try (Stream<Path> walk = Files.walk(Paths.get(IMPORT_DIR))) {
|
||||
return walk.filter(p -> !Files.isDirectory(p))
|
||||
@@ -283,14 +378,4 @@ public class MassImportService {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
private String getCellValue(Cell cell) {
|
||||
if (cell == null) return null;
|
||||
return switch (cell.getCellType()) {
|
||||
case STRING -> cell.getStringCellValue();
|
||||
case NUMERIC -> String.valueOf((int) cell.getNumericCellValue());
|
||||
case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
|
||||
default -> null;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user