From 4dd4d81ca36d7daafb7604d97606be90828942aa Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sun, 15 Mar 2026 21:09:46 +0100
Subject: [PATCH] fix: replace WorkbookFactory with native XML/ZIP parser for
 ODS files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WorkbookFactory throws ODFNotOfficeXmlFileException on .ods files —
Apache POI does not support ODF format at all.

Replace ODS reading with a direct content.xml parser using Java's
built-in ZipFile + DOM API (no new dependency). ODS is a ZIP archive;
the spreadsheet lives in content.xml as standard ODF XML.

Also refactor the import pipeline to decouple file reading from import
logic: both the ODS and XLSX paths now produce a List<List<String>>,
which is processed by format-agnostic row logic. XLSX date cells are
now converted to ISO-8601 (yyyy-MM-dd) strings before processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../service/MassImportService.java            | 247 ++++++++++++------
 1 file changed, 166 insertions(+), 81 deletions(-)
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java
index 9f1b93a3..98db4b45 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java
@@ -16,10 +16,13 @@ import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
 import software.amazon.awssdk.core.sync.RequestBody;
 import software.amazon.awssdk.services.s3.S3Client;
 import software.amazon.awssdk.services.s3.model.PutObjectRequest;
 
+import javax.xml.parsers.DocumentBuilderFactory;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -28,14 +31,15 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
-import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Optional;
 import java.util.UUID;
 import java.util.stream.Stream;
+import java.util.zip.ZipFile;
 
 @Service
 @RequiredArgsConstructor
@@ -93,6 +97,13 @@ public class MassImportService {
     private static final String IMPORT_DIR = "/import";
     private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN);
 
+    // ODS XML namespaces
+    private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+    private static final String NS_TEXT  = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+    // Rows are read up to this many columns; also caps number-columns-repeated expansion
+    private static final int MAX_COLS = 20;
+
     @Async
     public void runImportAsync() {
         if (currentStatus.state() == State.RUNNING) {
@@ -102,7 +113,7 @@ public class MassImportService {
         try {
             File spreadsheet = findSpreadsheetFile();
             log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath());
-            int processed = processSpreadsheet(spreadsheet);
+            int processed = processRows(readSpreadsheet(spreadsheet));
             currentStatus = new ImportStatus(State.DONE,
                     "Import abgeschlossen. " + processed + " Dokumente verarbeitet.",
                     processed, currentStatus.startedAt());
@@ -126,56 +137,143 @@ public class MassImportService {
         }
     }
 
-    private int processSpreadsheet(File file) throws IOException {
-        int count = 0;
+    // --- Spreadsheet reading (format-specific, produces neutral List<List<String>>) ---
+
+    private List<List<String>> readSpreadsheet(File file) throws Exception {
+        String name = file.getName().toLowerCase();
+        if (name.endsWith(".ods")) {
+            return readOds(file);
+        }
+        return readXlsx(file);
+    }
+
+    /**
+     * Reads the first sheet of an ODS file by parsing its content.xml directly (no extra library needed).
+     * ODS is a ZIP archive; each cell yields the text of its first text:p, capped at MAX_COLS columns per row.
+     */
+    private List<List<String>> readOds(File file) throws Exception {
+        List<List<String>> result = new ArrayList<>();
+
+        try (ZipFile zip = new ZipFile(file)) {
+            var entry = zip.getEntry("content.xml");
+            if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt");
+
+            var factory = DocumentBuilderFactory.newInstance();
+            factory.setNamespaceAware(true);
+            var builder = factory.newDocumentBuilder();
+            var doc = builder.parse(zip.getInputStream(entry));
+
+            NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table");
+            if (tables.getLength() == 0) return result;
+
+            var table = (Element) tables.item(0);
+            NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row");
+
+            for (int i = 0; i < rows.getLength(); i++) {
+                var row = (Element) rows.item(i);
+                List<String> rowData = new ArrayList<>();
+                NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell");
+
+                for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) {
+                    var cell = (Element) cells.item(j);
+
+                    // Read the display text (first <text:p>)
+                    String value = "";
+                    NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p");
+                    if (textNodes.getLength() > 0) {
+                        value = textNodes.item(0).getTextContent().trim();
+                    }
+
+                    // Expand number-columns-repeated (capped at MAX_COLS)
+                    String repeatAttr = cell.getAttributeNS(NS_TABLE, "number-columns-repeated");
+                    int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr);
+                    repeat = Math.min(repeat, MAX_COLS - rowData.size());
+
+                    for (int r = 0; r < repeat; r++) {
+                        rowData.add(value);
+                    }
+                }
+                result.add(rowData);
+            }
+        }
+        return result;
+    }
+
+    /** Reads the first sheet of an XLSX/XLS file using Apache POI. Converts the first MAX_COLS cells of each row to strings (missing rows become empty lists). */
+    private List<List<String>> readXlsx(File file) throws Exception {
+        List<List<String>> result = new ArrayList<>();
         try (FileInputStream fis = new FileInputStream(file);
              Workbook workbook = WorkbookFactory.create(fis)) {
 
             Sheet sheet = workbook.getSheetAt(0);
-
-            for (int i = 1; i <= sheet.getLastRowNum(); i++) {
+            for (int i = 0; i <= sheet.getLastRowNum(); i++) {
                 Row row = sheet.getRow(i);
-                if (row == null) continue;
-
-                String index = getCellValue(row.getCell(colIndex));
-                if (index == null || index.isBlank()) continue;
-
-                // Append .pdf extension if the index has none
-                String filename = index.contains(".") ? index : index + ".pdf";
-
-                Optional<File> fileOnDisk = findFileRecursive(filename);
-                if (fileOnDisk.isPresent()) {
-                    importSingleDocument(row, fileOnDisk.get(), filename, index);
-                    count++;
-                } else {
-                    log.warn("Datei nicht gefunden: {}", filename);
+                List<String> rowData = new ArrayList<>();
+                if (row != null) {
+                    for (int j = 0; j < MAX_COLS; j++) {
+                        rowData.add(xlsxCellToString(row.getCell(j)));
+                    }
                 }
+                result.add(rowData);
+            }
+        }
+        return result;
+    }
+
+    private String xlsxCellToString(Cell cell) {
+        if (cell == null) return "";
+        return switch (cell.getCellType()) {
+            case STRING -> cell.getStringCellValue();
+            case NUMERIC -> {
+                if (DateUtil.isCellDateFormatted(cell)) {
+                    yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO
+                }
+                yield String.valueOf((int) cell.getNumericCellValue());
+            }
+            case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
+            default -> "";
+        };
+    }
+
+    // --- Import logic (works on neutral List<String> rows) ---
+
+    private int processRows(List<List<String>> rows) {
+        int count = 0;
+        for (int i = 1; i < rows.size(); i++) { // skip header row
+            List<String> cells = rows.get(i);
+            String index = getCell(cells, colIndex);
+            if (index.isBlank()) continue;
+
+            String filename = index.contains(".") ? index : index + ".pdf";
+            Optional<File> fileOnDisk = findFileRecursive(filename);
+            if (fileOnDisk.isPresent()) {
+                importSingleDocument(cells, fileOnDisk.get(), filename, index);
+                count++;
+            } else {
+                log.warn("Datei nicht gefunden: {}", filename);
             }
         }
         return count;
     }
 
     @Transactional
-    protected void importSingleDocument(Row row, File file, String originalFilename, String index) {
-        // Skip documents that have already been processed beyond placeholder stage
+    protected void importSingleDocument(List<String> cells, File file, String originalFilename, String index) {
         Optional<Document> existing = documentRepository.findByOriginalFilename(originalFilename);
         if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
             log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
             return;
         }
 
-        // Read metadata from ODS row
-        String archiveBox     = getCellValue(row.getCell(colBox));
-        String archiveFolder  = getCellValue(row.getCell(colFolder));
-        String senderRaw      = getCellValue(row.getCell(colSender));
-        String receiversRaw   = getCellValue(row.getCell(colReceivers));
-        LocalDate date        = parseDate(row.getCell(colDate));
-        String location       = getCellValue(row.getCell(colLocation));
-        String tagRaw         = getCellValue(row.getCell(colTags));
-        String summary        = getCellValue(row.getCell(colSummary));
-        String transcription  = getCellValue(row.getCell(colTranscription));
+        String archiveBox    = getCell(cells, colBox);
+        String archiveFolder = getCell(cells, colFolder);
+        String senderRaw     = getCell(cells, colSender);
+        String receiversRaw  = getCell(cells, colReceivers);
+        LocalDate date       = parseDate(getCell(cells, colDate));
+        String location      = getCell(cells, colLocation);
+        String tagRaw        = getCell(cells, colTags);
+        String summary       = getCell(cells, colSummary);
+        String transcription = getCell(cells, colTranscription);
 
-        // Detect content type from the local file
         String contentType;
         try {
             contentType = Files.probeContentType(file.toPath());
@@ -184,7 +282,6 @@ public class MassImportService {
         }
         if (contentType == null) contentType = "application/octet-stream";
 
-        // Upload to S3
         String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName();
         try {
             s3Client.putObject(PutObjectRequest.builder()
@@ -198,21 +295,17 @@ public class MassImportService {
             return;
         }
 
-        // Resolve sender and receivers to Person entities
-        Person sender = senderRaw != null && !senderRaw.isBlank() ? findOrCreatePerson(senderRaw) : null;
-
+        Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw);
         List<Person> receivers = PersonNameParser.parseReceivers(receiversRaw).stream()
                 .map(this::findOrCreatePerson)
                 .toList();
 
-        // Resolve tag
         Tag tag = null;
-        if (tagRaw != null && !tagRaw.isBlank()) {
+        if (!tagRaw.isBlank()) {
             tag = tagRepository.findByNameIgnoreCase(tagRaw)
                     .orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build()));
         }
 
-        // Build or update the Document record
         Document doc = existing.orElse(Document.builder()
                 .originalFilename(originalFilename)
                 .build());
@@ -221,12 +314,12 @@ public class MassImportService {
         doc.setFilePath(s3Key);
         doc.setContentType(contentType);
         doc.setStatus(DocumentStatus.UPLOADED);
-        doc.setArchiveBox(archiveBox);
-        doc.setArchiveFolder(archiveFolder);
+        doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox);
+        doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder);
         doc.setDocumentDate(date);
-        doc.setLocation(location);
-        doc.setSummary(summary);
-        doc.setTranscription(transcription);
+        doc.setLocation(location.isBlank() ? null : location);
+        doc.setSummary(summary.isBlank() ? null : summary);
+        doc.setTranscription(transcription.isBlank() ? null : transcription);
         doc.setSender(sender);
         doc.getReceivers().addAll(receivers);
         if (tag != null) doc.getTags().add(tag);
@@ -235,6 +328,34 @@ public class MassImportService {
         log.info("Importiert: {}", originalFilename);
     }
 
+    // --- Helpers ---
+
+    private String getCell(List<String> cells, int col) {
+        if (col >= cells.size()) return "";
+        String val = cells.get(col);
+        return val == null ? "" : val.trim();
+    }
+
+    private LocalDate parseDate(String value) {
+        if (value == null || value.isBlank()) return null;
+        try {
+            return LocalDate.parse(value.trim());
+        } catch (DateTimeParseException e) {
+            return null;
+        }
+    }
+
+    private String buildTitle(String index, LocalDate date, String location) {
+        StringBuilder sb = new StringBuilder(index);
+        if (date != null) {
+            sb.append(" \u2013 ").append(date.format(GERMAN_DATE));
+        }
+        if (location != null && !location.isBlank()) {
+            sb.append(" \u2013 ").append(location);
+        }
+        return sb.toString();
+    }
+
     private Person findOrCreatePerson(String rawName) {
         String alias = rawName.trim();
         return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> {
@@ -247,32 +368,6 @@ public class MassImportService {
         });
     }
 
-    private String buildTitle(String index, LocalDate date, String location) {
-        StringBuilder sb = new StringBuilder(index);
-        if (date != null) {
-            sb.append(" – ").append(date.format(GERMAN_DATE));
-        }
-        if (location != null && !location.isBlank()) {
-            sb.append(" – ").append(location);
-        }
-        return sb.toString();
-    }
-
-    private LocalDate parseDate(Cell cell) {
-        if (cell == null) return null;
-        if (cell.getCellType() == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
-            return cell.getDateCellValue().toInstant().atZone(ZoneId.systemDefault()).toLocalDate();
-        }
-        if (cell.getCellType() == CellType.STRING) {
-            try {
-                return LocalDate.parse(cell.getStringCellValue().trim());
-            } catch (DateTimeParseException e) {
-                return null;
-            }
-        }
-        return null;
-    }
-
     private Optional<File> findFileRecursive(String filename) {
         try (Stream<Path> walk = Files.walk(Paths.get(IMPORT_DIR))) {
             return walk.filter(p -> !Files.isDirectory(p))
@@ -283,14 +378,4 @@ public class MassImportService {
             return Optional.empty();
         }
     }
-
-    private String getCellValue(Cell cell) {
-        if (cell == null) return null;
-        return switch (cell.getCellType()) {
-            case STRING -> cell.getStringCellValue();
-            case NUMERIC -> String.valueOf((int) cell.getNumericCellValue());
-            case BOOLEAN -> String.valueOf(cell.getBooleanCellValue());
-            default -> null;
-        };
-    }
 }