From 4dd4d81ca36d7daafb7604d97606be90828942aa Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 15 Mar 2026 21:09:46 +0100 Subject: [PATCH] fix: replace WorkbookFactory with native XML/ZIP parser for ODS files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WorkbookFactory throws ODFNotOfficeXmlFileException on .ods files — Apache POI does not support ODF format at all. Replace ODS reading with a direct content.xml parser using Java's built-in ZipFile + DOM API (no new dependency). ODS is a ZIP archive; the spreadsheet lives in content.xml as standard ODF XML. Also refactors the import pipeline to decouple file reading from import logic: both ODS and XLSX paths now produce List<List<String>> which is processed by format-agnostic row logic. XLSX date cells are now converted to ISO strings before processing. Co-Authored-By: Claude Sonnet 4.6 --- .../service/MassImportService.java | 247 ++++++++++++------ 1 file changed, 166 insertions(+), 81 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java index 9f1b93a3..98db4b45 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/MassImportService.java @@ -16,10 +16,13 @@ import org.springframework.beans.factory.annotation.Value; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.PutObjectRequest; +import javax.xml.parsers.DocumentBuilderFactory; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -28,14 +31,15 
@@ import java.nio.file.Path; import java.nio.file.Paths; import java.time.LocalDate; import java.time.LocalDateTime; -import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; +import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Optional; import java.util.UUID; import java.util.stream.Stream; +import java.util.zip.ZipFile; @Service @RequiredArgsConstructor @@ -93,6 +97,13 @@ public class MassImportService { private static final String IMPORT_DIR = "/import"; private static final DateTimeFormatter GERMAN_DATE = DateTimeFormatter.ofPattern("d. MMMM yyyy", Locale.GERMAN); + // ODS XML namespaces + private static final String NS_TABLE = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; + private static final String NS_TEXT = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; + + // We only need up to this many columns; caps repeated-empty-cell expansion + private static final int MAX_COLS = 20; + @Async public void runImportAsync() { if (currentStatus.state() == State.RUNNING) { @@ -102,7 +113,7 @@ public class MassImportService { try { File spreadsheet = findSpreadsheetFile(); log.info("Starte Massenimport aus: {}", spreadsheet.getAbsolutePath()); - int processed = processSpreadsheet(spreadsheet); + int processed = processRows(readSpreadsheet(spreadsheet)); currentStatus = new ImportStatus(State.DONE, "Import abgeschlossen. 
" + processed + " Dokumente verarbeitet.", processed, currentStatus.startedAt()); @@ -126,56 +137,143 @@ public class MassImportService { } } - private int processSpreadsheet(File file) throws IOException { - int count = 0; + // --- Spreadsheet reading (format-specific, produces neutral List<List<String>>) --- + + private List> readSpreadsheet(File file) throws Exception { + String name = file.getName().toLowerCase(); + if (name.endsWith(".ods")) { + return readOds(file); + } + return readXlsx(file); + } + + /** + * Reads an ODS file by parsing its content.xml directly (no extra library needed). + * ODS is a ZIP archive; content.xml holds the spreadsheet data as XML. + */ + private List> readOds(File file) throws Exception { + List> result = new ArrayList<>(); + + try (ZipFile zip = new ZipFile(file)) { + var entry = zip.getEntry("content.xml"); + if (entry == null) throw new RuntimeException("Ungültige ODS-Datei: content.xml fehlt"); + + var factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + var builder = factory.newDocumentBuilder(); + var doc = builder.parse(zip.getInputStream(entry)); + + NodeList tables = doc.getElementsByTagNameNS(NS_TABLE, "table"); + if (tables.getLength() == 0) return result; + + var table = (Element) tables.item(0); + NodeList rows = table.getElementsByTagNameNS(NS_TABLE, "table-row"); + + for (int i = 0; i < rows.getLength(); i++) { + var row = (Element) rows.item(i); + List rowData = new ArrayList<>(); + NodeList cells = row.getElementsByTagNameNS(NS_TABLE, "table-cell"); + + for (int j = 0; j < cells.getLength() && rowData.size() < MAX_COLS; j++) { + var cell = (Element) cells.item(j); + + // Read the display text (first <text:p>) + String value = ""; + NodeList textNodes = cell.getElementsByTagNameNS(NS_TEXT, "p"); + if (textNodes.getLength() > 0) { + value = textNodes.item(0).getTextContent().trim(); + } + + // Expand number-columns-repeated (capped at MAX_COLS) + String repeatAttr = cell.getAttributeNS(NS_TABLE, 
"number-columns-repeated"); + int repeat = repeatAttr.isEmpty() ? 1 : Integer.parseInt(repeatAttr); + repeat = Math.min(repeat, MAX_COLS - rowData.size()); + + for (int r = 0; r < repeat; r++) { + rowData.add(value); + } + } + result.add(rowData); + } + } + return result; + } + + /** Reads an XLSX/XLS file using Apache POI. Converts all cells to strings. */ + private List> readXlsx(File file) throws Exception { + List> result = new ArrayList<>(); try (FileInputStream fis = new FileInputStream(file); Workbook workbook = WorkbookFactory.create(fis)) { Sheet sheet = workbook.getSheetAt(0); - - for (int i = 1; i <= sheet.getLastRowNum(); i++) { + for (int i = 0; i <= sheet.getLastRowNum(); i++) { Row row = sheet.getRow(i); - if (row == null) continue; - - String index = getCellValue(row.getCell(colIndex)); - if (index == null || index.isBlank()) continue; - - // Append .pdf extension if the index has none - String filename = index.contains(".") ? index : index + ".pdf"; - - Optional fileOnDisk = findFileRecursive(filename); - if (fileOnDisk.isPresent()) { - importSingleDocument(row, fileOnDisk.get(), filename, index); - count++; - } else { - log.warn("Datei nicht gefunden: {}", filename); + List rowData = new ArrayList<>(); + if (row != null) { + for (int j = 0; j < MAX_COLS; j++) { + rowData.add(xlsxCellToString(row.getCell(j))); + } } + result.add(rowData); + } + } + return result; + } + + private String xlsxCellToString(Cell cell) { + if (cell == null) return ""; + return switch (cell.getCellType()) { + case STRING -> cell.getStringCellValue(); + case NUMERIC -> { + if (DateUtil.isCellDateFormatted(cell)) { + yield cell.getLocalDateTimeCellValue().toLocalDate().toString(); // ISO + } + yield String.valueOf((int) cell.getNumericCellValue()); + } + case BOOLEAN -> String.valueOf(cell.getBooleanCellValue()); + default -> ""; + }; + } + + // --- Import logic (works on neutral List<String> rows) --- + + private int processRows(List> rows) { + int count = 0; + for (int i = 1; i < 
rows.size(); i++) { // skip header row + List cells = rows.get(i); + String index = getCell(cells, colIndex); + if (index.isBlank()) continue; + + String filename = index.contains(".") ? index : index + ".pdf"; + Optional fileOnDisk = findFileRecursive(filename); + if (fileOnDisk.isPresent()) { + importSingleDocument(cells, fileOnDisk.get(), filename, index); + count++; + } else { + log.warn("Datei nicht gefunden: {}", filename); } } return count; } @Transactional - protected void importSingleDocument(Row row, File file, String originalFilename, String index) { - // Skip documents that have already been processed beyond placeholder stage + protected void importSingleDocument(List cells, File file, String originalFilename, String index) { Optional existing = documentRepository.findByOriginalFilename(originalFilename); if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) { log.info("Dokument {} existiert bereits, überspringe.", originalFilename); return; } - // Read metadata from ODS row - String archiveBox = getCellValue(row.getCell(colBox)); - String archiveFolder = getCellValue(row.getCell(colFolder)); - String senderRaw = getCellValue(row.getCell(colSender)); - String receiversRaw = getCellValue(row.getCell(colReceivers)); - LocalDate date = parseDate(row.getCell(colDate)); - String location = getCellValue(row.getCell(colLocation)); - String tagRaw = getCellValue(row.getCell(colTags)); - String summary = getCellValue(row.getCell(colSummary)); - String transcription = getCellValue(row.getCell(colTranscription)); + String archiveBox = getCell(cells, colBox); + String archiveFolder = getCell(cells, colFolder); + String senderRaw = getCell(cells, colSender); + String receiversRaw = getCell(cells, colReceivers); + LocalDate date = parseDate(getCell(cells, colDate)); + String location = getCell(cells, colLocation); + String tagRaw = getCell(cells, colTags); + String summary = getCell(cells, colSummary); + String transcription = 
getCell(cells, colTranscription); - // Detect content type from the local file String contentType; try { contentType = Files.probeContentType(file.toPath()); @@ -184,7 +282,6 @@ public class MassImportService { } if (contentType == null) contentType = "application/octet-stream"; - // Upload to S3 String s3Key = "documents/" + UUID.randomUUID() + "_" + file.getName(); try { s3Client.putObject(PutObjectRequest.builder() @@ -198,21 +295,17 @@ public class MassImportService { return; } - // Resolve sender and receivers to Person entities - Person sender = senderRaw != null && !senderRaw.isBlank() ? findOrCreatePerson(senderRaw) : null; - + Person sender = senderRaw.isBlank() ? null : findOrCreatePerson(senderRaw); List receivers = PersonNameParser.parseReceivers(receiversRaw).stream() .map(this::findOrCreatePerson) .toList(); - // Resolve tag Tag tag = null; - if (tagRaw != null && !tagRaw.isBlank()) { + if (!tagRaw.isBlank()) { tag = tagRepository.findByNameIgnoreCase(tagRaw) .orElseGet(() -> tagRepository.save(Tag.builder().name(tagRaw).build())); } - // Build or update the Document record Document doc = existing.orElse(Document.builder() .originalFilename(originalFilename) .build()); @@ -221,12 +314,12 @@ public class MassImportService { doc.setFilePath(s3Key); doc.setContentType(contentType); doc.setStatus(DocumentStatus.UPLOADED); - doc.setArchiveBox(archiveBox); - doc.setArchiveFolder(archiveFolder); + doc.setArchiveBox(archiveBox.isBlank() ? null : archiveBox); + doc.setArchiveFolder(archiveFolder.isBlank() ? null : archiveFolder); doc.setDocumentDate(date); - doc.setLocation(location); - doc.setSummary(summary); - doc.setTranscription(transcription); + doc.setLocation(location.isBlank() ? null : location); + doc.setSummary(summary.isBlank() ? null : summary); + doc.setTranscription(transcription.isBlank() ? 
null : transcription); doc.setSender(sender); doc.getReceivers().addAll(receivers); if (tag != null) doc.getTags().add(tag); @@ -235,6 +328,34 @@ public class MassImportService { log.info("Importiert: {}", originalFilename); } + // --- Helpers --- + + private String getCell(List cells, int col) { + if (col >= cells.size()) return ""; + String val = cells.get(col); + return val == null ? "" : val.trim(); + } + + private LocalDate parseDate(String value) { + if (value == null || value.isBlank()) return null; + try { + return LocalDate.parse(value.trim()); + } catch (DateTimeParseException e) { + return null; + } + } + + private String buildTitle(String index, LocalDate date, String location) { + StringBuilder sb = new StringBuilder(index); + if (date != null) { + sb.append(" \u2013 ").append(date.format(GERMAN_DATE)); + } + if (location != null && !location.isBlank()) { + sb.append(" \u2013 ").append(location); + } + return sb.toString(); + } + private Person findOrCreatePerson(String rawName) { String alias = rawName.trim(); return personRepository.findByAliasIgnoreCase(alias).orElseGet(() -> { @@ -247,32 +368,6 @@ public class MassImportService { }); } - private String buildTitle(String index, LocalDate date, String location) { - StringBuilder sb = new StringBuilder(index); - if (date != null) { - sb.append(" – ").append(date.format(GERMAN_DATE)); - } - if (location != null && !location.isBlank()) { - sb.append(" – ").append(location); - } - return sb.toString(); - } - - private LocalDate parseDate(Cell cell) { - if (cell == null) return null; - if (cell.getCellType() == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) { - return cell.getDateCellValue().toInstant().atZone(ZoneId.systemDefault()).toLocalDate(); - } - if (cell.getCellType() == CellType.STRING) { - try { - return LocalDate.parse(cell.getStringCellValue().trim()); - } catch (DateTimeParseException e) { - return null; - } - } - return null; - } - private Optional findFileRecursive(String 
filename) { try (Stream walk = Files.walk(Paths.get(IMPORT_DIR))) { return walk.filter(p -> !Files.isDirectory(p)) @@ -283,14 +378,4 @@ public class MassImportService { return Optional.empty(); } } - - private String getCellValue(Cell cell) { - if (cell == null) return null; - return switch (cell.getCellType()) { - case STRING -> cell.getStringCellValue(); - case NUMERIC -> String.valueOf((int) cell.getNumericCellValue()); - case BOOLEAN -> String.valueOf(cell.getBooleanCellValue()); - default -> null; - }; - } }