fix: replace WorkbookFactory with native XML/ZIP parser for ODS files

WorkbookFactory throws ODFNotOfficeXmlFileException on .ods files because Apache POI does not support the ODF format at all. Replace ODS reading with a direct content.xml parser built on Java's built-in ZipFile + DOM API (no new dependency): an ODS file is a ZIP archive whose spreadsheet data lives in content.xml as standard ODF XML. Also refactor the import pipeline to decouple file reading from import logic: both the ODS and XLSX paths now produce a List<List<String>>, which is processed by format-agnostic row logic. XLSX date cells are now converted to ISO strings before processing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 21:09:46 +01:00
parent 5cc4dcf7aa
commit 4dd4d81ca3
1 changed files with 166 additions and 81 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java
@@ -16,10 +16,13 @@ import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
 import software.amazon.awssdk.core.sync.RequestBody;
 import software.amazon.awssdk.services.s3.S3Client;
 import software.amazon.awssdk.services.s3.model.PutObjectRequest;

+import javax.xml.parsers.DocumentBuilderFactory;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -28,14 +31,15 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
-import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Optional;
 import java.util.UUID;
 import java.util.stream.Stream;
+import java.util.zip.ZipFile;

@Service
@RequiredArgsConstructor
@@ -93,6 +97,13 @@ public class MassImportService {
    private static final String IMPORT_DIR = "/import";
    private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN);

+    // XML namespace URIs used to look up ODS elements and attributes in content.xml.
+    // NS_TABLE covers table / table-row / table-cell and the number-columns-repeated attribute;
+    // NS_TEXT covers the <text:p> paragraphs that hold a cell's display text.
+    private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+    private static final String NS_TEXT  = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+    // Upper bound on the number of columns read per row, for both ODS and XLSX.
+    // It caps the expansion of repeated (usually empty) ODS cells; columns at index
+    // MAX_COLS or beyond are never read and therefore resolve to "" during import.
+    private static final int MAX_COLS = 20;
+
    @Async
    public void runImportAsync() {
        if (currentStatus.state() == State.RUNNING) {
@@ -102,7 +113,7 @@ public class MassImportService {
        try {
            File spreadsheet = findSpreadsheetFile();
            log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath());
-            int processed = processSpreadsheet(spreadsheet);
+            int processed = processRows(readSpreadsheet(spreadsheet));
            currentStatus = new ImportStatus(State.DONE,
                    "Import abgeschlossen. " + processed + " Dokumente verarbeitet.",
                    processed, currentStatus.startedAt());
@@ -126,56 +137,143 @@ public class MassImportService {
        }
    }

-    private int processSpreadsheet(File file) throws IOException {
-        int count = 0;
+    // --- Spreadsheet reading (format-specific, produces neutral List<List<String>>) ---
+
+    /**
+     * Reads a spreadsheet file into a format-neutral list of rows, each row being a list of
+     * cell strings.
+     *
+     * @param file the spreadsheet to read; files whose name ends in {@code .ods}
+     *             (case-insensitive) go to the ODS parser, everything else goes to the POI reader
+     * @return all rows of the first sheet, including the header row
+     * @throws Exception if the file cannot be opened or parsed
+     */
+    private List<List<String>> readSpreadsheet(File file) throws Exception {
+        // Locale.ROOT keeps the extension check independent of the JVM's default locale.
+        String name = file.getName().toLowerCase(Locale.ROOT);
+        if (name.endsWith(".ods")) {
+            return readOds(file);
+        }
+        return readXlsx(file);
+    }
+
+    /**
+     * Reads an ODS file by parsing its content.xml directly (no extra library needed).
+     * ODS is a ZIP archive; content.xml holds the spreadsheet data as XML.
+     */
+    private List<List<String>> readOds(File file) throws Exception {
+        List<List<String>> result = new ArrayList<>();
+
+        try (ZipFile zip = new ZipFile(file)) {
+            var entry = zip.getEntry("content.xml");
+            if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt");
+
+            var factory = DocumentBuilderFactory.newInstance();
+            factory.setNamespaceAware(true);
+            var builder = factory.newDocumentBuilder();
+            var doc = builder.parse(zip.getInputStream(entry));
+
+            NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table");
+            if (tables.getLength() == 0) return result;
+
+            var table = (Element) tables.item(0);
+            NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row");
+
+            for (int i = 0; i < rows.getLength(); i++) {
+                var row = (Element) rows.item(i);
+                List<String> rowData = new ArrayList<>();
+                NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell");
+
+                for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) {
+                    var cell = (Element) cells.item(j);
+
+                    // Read the display text (first <text:p>)
+                    String value = "";
+                    NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p");
+                    if (textNodes.getLength() > 0) {
+                        value = textNodes.item(0).getTextContent().trim();
+                    }
+
+                    // Expand number-columns-repeated (capped at MAX_COLS)
+                    String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated");
+                    int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr);
+                    repeat = Math.min(repeat, MAX_COLS - rowData.size());
+
+                    for (int r = 0; r < repeat; r++) {
+                        rowData.add(value);
+                    }
+                }
+                result.add(rowData);
+            }
+        }
+        return result;
+    }
+
+    /** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */
+    private List<List<String>> readXlsx(File file) throws Exception {
+        List<List<String>> result = new ArrayList<>();
        try (FileInputStream fis = new FileInputStream(file);
             Workbook workbook = WorkbookFactory.create(fis)) {

            Sheet sheet = workbook.getSheetAt(0);
-
-            for (int i = 1; i <= sheet.getLastRowNum(); i++) {
+            for (int i = 0; i <= sheet.getLastRowNum(); i++) {
                Row row = sheet.getRow(i);
-                if (row == null) continue;
-
-                String index = getCellValue(row.getCell(colIndex));
-                if (index == null || index.isBlank()) continue;
-
-                // Append .pdf extension if the index has none
-                String filename = index.contains(".") ? index : index + ".pdf";
-
-                Optional<File> fileOnDisk = findFileRecursive(filename);
-                if (fileOnDisk.isPresent()) {
-                    importSingleDocument(row, fileOnDisk.get(), filename, index);
-                    count++;
-                } else {
-                    log.warn("Datei nicht gefunden: {}", filename);
+                List<String> rowData = new ArrayList<>();
+                if (row != null) {
+                    for (int j = 0; j < MAX_COLS; j++) {
+                        rowData.add(xlsxCellToString(row.getCell(j)));
+                    }
                }
+                result.add(rowData);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Converts a POI cell to the neutral string form used by the import logic.
+     *
+     * <p>String cells are returned as-is, date-formatted numeric cells as an ISO
+     * {@code yyyy-MM-dd} date, other numeric cells as a whole number (any fractional part is
+     * dropped), and boolean cells as {@code "true"}/{@code "false"}. A {@code null} cell and
+     * every other cell type yield {@code ""}.
+     *
+     * @param cell the cell to convert, may be {@code null}
+     * @return the cell's string value, never {@code null}
+     */
+    private String xlsxCellToString(Cell cell) {
+        if (cell == null) return "";
+        return switch (cell.getCellType()) {
+            case STRING -> cell.getStringCellValue();
+            case NUMERIC -> {
+                if (DateUtil.isCellDateFormatted(cell)) {
+                    yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO
+                }
+                // long, not int: an (int) cast saturates at Integer.MAX_VALUE and would
+                // corrupt large numeric identifiers.
+                yield String.valueOf((long) cell.getNumericCellValue());
+            }
+            case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
+            default -> "";
+        };
+    }
+
+    // --- Import logic (works on neutral List<String> rows) ---
+
+    /**
+     * Imports one document per data row of the spreadsheet.
+     *
+     * <p>The first row is treated as a header and skipped. For every other row the value in
+     * column {@code colIndex} is used as the file name (with {@code .pdf} appended when it
+     * contains no dot); rows with a blank index are ignored. Rows whose file cannot be found
+     * are logged and skipped.
+     *
+     * @param rows all rows of the sheet, header included
+     * @return the number of rows for which a file was found and handed to
+     *         {@code importSingleDocument}
+     */
+    private int processRows(List<List<String>> rows) {
+        int count = 0;
+        for (int i = 1; i < rows.size(); i++) { // skip header row
+            List<String> cells = rows.get(i);
+            String index = getCell(cells, colIndex);
+            if (index.isBlank()) continue;
+
+            // An index without any dot is taken to be a bare name and gets the .pdf extension
+            String filename = index.contains(".") ? index : index + ".pdf";
+            Optional<File> fileOnDisk = findFileRecursive(filename);
+            if (fileOnDisk.isPresent()) {
+                importSingleDocument(cells, fileOnDisk.get(), filename, index);
+                count++;
+            } else {
+                log.warn("Datei nicht gefunden: {}", filename);
            }
        }
        return count;
    }

    @Transactional
-    protected void importSingleDocument(Row row, File file, String originalFilename, String index) {
-        // Skip documents that have already been processed beyond placeholder stage
+    protected void importSingleDocument(List<String> cells, File file, String originalFilename, String index) {
        Optional<Document> existing = documentRepository.findByOriginalFilename(originalFilename);
        if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
            log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
            return;
        }

-        // Read metadata from ODS row
-        String archiveBox     = getCellValue(row.getCell(colBox));
-        String archiveFolder  = getCellValue(row.getCell(colFolder));
-        String senderRaw      = getCellValue(row.getCell(colSender));
-        String receiversRaw   = getCellValue(row.getCell(colReceivers));
-        LocalDate date        = parseDate(row.getCell(colDate));
-        String location       = getCellValue(row.getCell(colLocation));
-        String tagRaw         = getCellValue(row.getCell(colTags));
-        String summary        = getCellValue(row.getCell(colSummary));
-        String transcription  = getCellValue(row.getCell(colTranscription));
+        String archiveBox    = getCell(cells, colBox);
+        String archiveFolder = getCell(cells, colFolder);
+        String senderRaw     = getCell(cells, colSender);
+        String receiversRaw  = getCell(cells, colReceivers);
+        LocalDate date       = parseDate(getCell(cells, colDate));
+        String location      = getCell(cells, colLocation);
+        String tagRaw        = getCell(cells, colTags);
+        String summary       = getCell(cells, colSummary);
+        String transcription = getCell(cells, colTranscription);

-        // Detect content type from the local file
        String contentType;
        try {
            contentType = Files.probeContentType(file.toPath());
@@ -184,7 +282,6 @@ public class MassImportService {
        }
        if (contentType == null) contentType = "application/octet-stream";

-        // Upload to S3
        String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName();
        try {
            s3Client.putObject(PutObjectRequest.builder()
@@ -198,21 +295,17 @@ public class MassImportService {
            return;
        }

-        // Resolve sender and receivers to Person entities
-        Person sender = senderRaw != null && !senderRaw.isBlank() ? findOrCreatePerson(senderRaw) : null;
-
+        Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw);
        List<Person> receivers = PersonNameParser.parseReceivers(receiversRaw).stream()
                .map(this::findOrCreatePerson)
                .toList();

-        // Resolve tag
        Tag tag = null;
-        if (tagRaw != null && !tagRaw.isBlank()) {
+        if (!tagRaw.isBlank()) {
            tag = tagRepository.findByNameIgnoreCase(tagRaw)
                    .orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build()));
        }

-        // Build or update the Document record
        Document doc = existing.orElse(Document.builder()
                .originalFilename(originalFilename)
                .build());
@@ -221,12 +314,12 @@ public class MassImportService {
        doc.setFilePath(s3Key);
        doc.setContentType(contentType);
        doc.setStatus(DocumentStatus.UPLOADED);
-        doc.setArchiveBox(archiveBox);
-        doc.setArchiveFolder(archiveFolder);
+        doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox);
+        doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder);
        doc.setDocumentDate(date);
-        doc.setLocation(location);
-        doc.setSummary(summary);
-        doc.setTranscription(transcription);
+        doc.setLocation(location.isBlank() ? null : location);
+        doc.setSummary(summary.isBlank() ? null : summary);
+        doc.setTranscription(transcription.isBlank() ? null : transcription);
        doc.setSender(sender);
        doc.getReceivers().addAll(receivers);
        if (tag != null) doc.getTags().add(tag);
@@ -235,6 +328,34 @@ public class MassImportService {
        log.info("Importiert: {}", originalFilename);
    }

+    // --- Helpers ---
+
+    /**
+     * Returns the trimmed value of column {@code col} in the given row.
+     *
+     * @param cells the row's cell values
+     * @param col   zero-based column index
+     * @return the trimmed cell text, or {@code ""} when the index lies outside the row
+     *         (negative or past the end) or the stored value is {@code null}
+     */
+    private String getCell(List<String> cells, int col) {
+        // A negative index is treated like a missing column instead of throwing.
+        if (col < 0 || col >= cells.size()) return "";
+        String val = cells.get(col);
+        return val == null ? "" : val.trim();
+    }
+
+    /**
+     * Parses an ISO-8601 date string ({@code yyyy-MM-dd}).
+     *
+     * @param value the raw cell text, may be {@code null}
+     * @return the parsed date, or {@code null} when the text is missing, blank, or not a valid ISO date
+     */
+    private LocalDate parseDate(String value) {
+        boolean missing = value == null || value.isBlank();
+        if (!missing) {
+            try {
+                return LocalDate.parse(value.trim());
+            } catch (DateTimeParseException ignored) {
+                // not an ISO date: handled the same way as a missing date
+            }
+        }
+        return null;
+    }
+
+    private String buildTitle(String index, LocalDate date, String location) {
+        StringBuilder sb = new StringBuilder(index);
+        if (date != null) {
+            sb.append(" \u2013 ").append(date.format(GERMAN_DATE));
+        }
+        if (location != null && !location.isBlank()) {
+            sb.append(" \u2013 ").append(location);
+        }
+        return sb.toString();
+    }
+
    private Person findOrCreatePerson(String rawName) {
        String alias = rawName.trim();
        return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> {
@@ -247,32 +368,6 @@ public class MassImportService {
        });
    }

-    private String buildTitle(String index, LocalDate date, String location) {
-        StringBuilder sb = new StringBuilder(index);
-        if (date != null) {
-            sb.append(" – ").append(date.format(GERMAN_DATE));
-        }
-        if (location != null && !location.isBlank()) {
-            sb.append(" – ").append(location);
-        }
-        return sb.toString();
-    }
-
-    private LocalDate parseDate(Cell cell) {
-        if (cell == null) return null;
-        if (cell.getCellType() == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
-            return cell.getDateCellValue().toInstant().atZone(ZoneId.systemDefault()).toLocalDate();
-        }
-        if (cell.getCellType() == CellType.STRING) {
-            try {
-                return LocalDate.parse(cell.getStringCellValue().trim());
-            } catch (DateTimeParseException e) {
-                return null;
-            }
-        }
-        return null;
-    }
-
    private Optional<File> findFileRecursive(String filename) {
        try (Stream<Path> walk = Files.walk(Paths.get(IMPORT_DIR))) {
            return walk.filter(p -> !Files.isDirectory(p))
@@ -283,14 +378,4 @@ public class MassImportService {
            return Optional.empty();
        }
    }
-
-    private String getCellValue(Cell cell) {
-        if (cell == null) return null;
-        return switch (cell.getCellType()) {
-            case STRING -> cell.getStringCellValue();
-            case NUMERIC -> String.valueOf((int) cell.getNumericCellValue());
-            case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
-            default -> null;
-        };
-    }
 }