feat(ocr): add OcrService, OcrBatchService, OcrProgressService, OcrController

- OcrService: single-document OCR (health check, block clearing, presigned URL, annotation + block creation) - OcrBatchService: batch processing with @Async, per-document status tracking, SKIPPED for PLACEHOLDER documents, failure isolation - OcrProgressService: SSE emitter registry per job ID with 5-min timeout - OcrController: POST /api/documents/{id}/ocr (WRITE_ALL), POST /api/ocr/batch (ADMIN), GET /api/ocr/jobs/{id} (READ_ALL), GET /api/ocr/jobs/{id}/progress (SSE), GET /api/documents/{id}/ocr-status 19 tests: 6 OcrService, 4 OcrBatchService, 3 OcrProgressService, 6 OcrController Refs #226 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 15:24:15 +02:00
parent ff3990710e
commit aea46c5fd0
8 changed files with 906 additions and 0 deletions
--- a/backend/src/main/java/org/raddatz/familienarchiv/controller/OcrController.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/controller/OcrController.java
@@ -0,0 +1,114 @@
+package org.raddatz.familienarchiv.controller;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.dto.BatchOcrDTO;
+import org.raddatz.familienarchiv.dto.OcrStatusDTO;
+import org.raddatz.familienarchiv.dto.TriggerOcrDTO;
+import org.raddatz.familienarchiv.exception.DomainException;
+import org.raddatz.familienarchiv.exception.ErrorCode;
+import org.raddatz.familienarchiv.model.*;
+import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository;
+import org.raddatz.familienarchiv.repository.OcrJobRepository;
+import org.raddatz.familienarchiv.security.Permission;
+import org.raddatz.familienarchiv.security.RequirePermission;
+import org.raddatz.familienarchiv.service.OcrBatchService;
+import org.raddatz.familienarchiv.service.OcrProgressService;
+import org.raddatz.familienarchiv.service.OcrService;
+import org.raddatz.familienarchiv.service.UserService;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
+import org.springframework.security.core.Authentication;
+import org.springframework.web.bind.annotation.*;
+import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
+
+import jakarta.validation.Valid;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.UUID;
+
+@RestController
+@RequiredArgsConstructor
+@Slf4j
+public class OcrController {
+
+    private final OcrService ocrService;
+    private final OcrBatchService ocrBatchService;
+    private final OcrProgressService ocrProgressService;
+    private final OcrJobRepository ocrJobRepository;
+    private final OcrJobDocumentRepository ocrJobDocumentRepository;
+    private final UserService userService;
+
+    @PostMapping("/api/documents/{documentId}/ocr")
+    @ResponseStatus(HttpStatus.ACCEPTED)
+    @RequirePermission(Permission.WRITE_ALL)
+    public Map<String, UUID> triggerOcr(
+            @PathVariable UUID documentId,
+            @RequestBody TriggerOcrDTO dto,
+            Authentication authentication) {
+        UUID userId = resolveUserId(authentication);
+        UUID jobId = ocrService.startOcr(documentId, dto.getScriptType(), userId);
+        return Map.of("jobId", jobId);
+    }
+
+    @PostMapping("/api/ocr/batch")
+    @ResponseStatus(HttpStatus.ACCEPTED)
+    @RequirePermission(Permission.ADMIN)
+    public Map<String, UUID> triggerBatch(
+            @RequestBody @Valid BatchOcrDTO dto,
+            Authentication authentication) {
+        UUID userId = resolveUserId(authentication);
+        UUID jobId = ocrBatchService.startBatch(dto.getDocumentIds(), userId);
+        return Map.of("jobId", jobId);
+    }
+
+    @GetMapping("/api/ocr/jobs/{jobId}")
+    @RequirePermission(Permission.READ_ALL)
+    public OcrJob getJobStatus(@PathVariable UUID jobId) {
+        return ocrJobRepository.findById(jobId)
+                .orElseThrow(() -> DomainException.notFound(
+                        ErrorCode.OCR_JOB_NOT_FOUND, "OCR job not found: " + jobId));
+    }
+
+    @GetMapping(value = "/api/ocr/jobs/{jobId}/progress", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
+    @RequirePermission(Permission.READ_ALL)
+    public SseEmitter streamProgress(@PathVariable UUID jobId) {
+        ocrJobRepository.findById(jobId)
+                .orElseThrow(() -> DomainException.notFound(
+                        ErrorCode.OCR_JOB_NOT_FOUND, "OCR job not found: " + jobId));
+        return ocrProgressService.register(jobId);
+    }
+
+    @GetMapping("/api/documents/{documentId}/ocr-status")
+    @RequirePermission(Permission.READ_ALL)
+    public OcrStatusDTO getDocumentOcrStatus(@PathVariable UUID documentId) {
+        List<OcrDocumentStatus> activeStatuses = List.of(
+                OcrDocumentStatus.PENDING, OcrDocumentStatus.RUNNING);
+
+        Optional<OcrJobDocument> activeJobDoc = ocrJobDocumentRepository
+                .findFirstByDocumentIdAndStatusIn(documentId, activeStatuses);
+
+        if (activeJobDoc.isEmpty()) {
+            return OcrStatusDTO.builder().status("NONE").build();
+        }
+
+        OcrJobDocument jobDoc = activeJobDoc.get();
+        return OcrStatusDTO.builder()
+                .status(jobDoc.getStatus().name())
+                .jobId(jobDoc.getJobId())
+                .currentPage(jobDoc.getCurrentPage())
+                .totalPages(jobDoc.getTotalPages())
+                .build();
+    }
+
+    private UUID resolveUserId(Authentication authentication) {
+        if (authentication == null || !authentication.isAuthenticated()) return null;
+        try {
+            AppUser user = userService.findByUsername(authentication.getName());
+            return user != null ? user.getId() : null;
+        } catch (Exception e) {
+            return null;
+        }
+    }
+}
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBatchService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBatchService.java
@@ -0,0 +1,114 @@
+package org.raddatz.familienarchiv.service;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.exception.DomainException;
+import org.raddatz.familienarchiv.exception.ErrorCode;
+import org.raddatz.familienarchiv.model.*;
+import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository;
+import org.raddatz.familienarchiv.repository.OcrJobRepository;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class OcrBatchService {
+
+    private final OcrService ocrService;
+    private final OcrHealthClient ocrHealthClient;
+    private final DocumentService documentService;
+    private final OcrJobRepository ocrJobRepository;
+    private final OcrJobDocumentRepository ocrJobDocumentRepository;
+    private final OcrProgressService ocrProgressService;
+
+    public UUID startBatch(List<UUID> documentIds, UUID userId) {
+        if (!ocrHealthClient.isHealthy()) {
+            throw DomainException.internal(ErrorCode.OCR_SERVICE_UNAVAILABLE,
+                    "OCR service is not available");
+        }
+
+        OcrJob job = OcrJob.builder()
+                .totalDocuments(documentIds.size())
+                .createdBy(userId)
+                .status(OcrJobStatus.PENDING)
+                .build();
+        job = ocrJobRepository.save(job);
+
+        for (UUID docId : documentIds) {
+            OcrJobDocument jobDoc = OcrJobDocument.builder()
+                    .jobId(job.getId())
+                    .documentId(docId)
+                    .status(OcrDocumentStatus.PENDING)
+                    .build();
+            ocrJobDocumentRepository.save(jobDoc);
+        }
+
+        processBatchAsync(job.getId(), userId);
+        return job.getId();
+    }
+
+    @Async
+    void processBatchAsync(UUID jobId, UUID userId) {
+        OcrJob job = ocrJobRepository.findById(jobId).orElse(null);
+        if (job == null) return;
+
+        job.setStatus(OcrJobStatus.RUNNING);
+        ocrJobRepository.save(job);
+
+        List<OcrJobDocument> jobDocs = ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId);
+
+        for (OcrJobDocument jobDoc : jobDocs) {
+            Document doc = documentService.getDocumentById(jobDoc.getDocumentId());
+
+            if (doc.getStatus() == DocumentStatus.PLACEHOLDER) {
+                jobDoc.setStatus(OcrDocumentStatus.SKIPPED);
+                ocrJobDocumentRepository.save(jobDoc);
+                job.setSkippedCount(job.getSkippedCount() + 1);
+                ocrJobRepository.save(job);
+                ocrProgressService.emit(jobId, "document", Map.of(
+                        "documentId", jobDoc.getDocumentId(),
+                        "status", "SKIPPED",
+                        "processed", job.getProcessedDocuments(),
+                        "total", job.getTotalDocuments()));
+                continue;
+            }
+
+            jobDoc.setStatus(OcrDocumentStatus.RUNNING);
+            ocrJobDocumentRepository.save(jobDoc);
+
+            try {
+                ocrService.processDocument(jobDoc.getDocumentId(), doc, userId);
+                jobDoc.setStatus(OcrDocumentStatus.DONE);
+                job.setProcessedDocuments(job.getProcessedDocuments() + 1);
+            } catch (Exception e) {
+                log.error("OCR batch: failed document {}", jobDoc.getDocumentId(), e);
+                jobDoc.setStatus(OcrDocumentStatus.FAILED);
+                jobDoc.setErrorMessage(e.getMessage());
+                job.setErrorCount(job.getErrorCount() + 1);
+            }
+
+            ocrJobDocumentRepository.save(jobDoc);
+            ocrJobRepository.save(job);
+
+            ocrProgressService.emit(jobId, "document", Map.of(
+                    "documentId", jobDoc.getDocumentId(),
+                    "status", jobDoc.getStatus().name(),
+                    "processed", job.getProcessedDocuments(),
+                    "total", job.getTotalDocuments()));
+        }
+
+        job.setStatus(OcrJobStatus.DONE);
+        ocrJobRepository.save(job);
+
+        ocrProgressService.emit(jobId, "done", Map.of(
+                "processed", job.getProcessedDocuments(),
+                "errors", job.getErrorCount(),
+                "skipped", job.getSkippedCount()));
+        ocrProgressService.complete(jobId);
+    }
+}
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrProgressService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrProgressService.java
@@ -0,0 +1,69 @@
+package org.raddatz.familienarchiv.service;
+
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CopyOnWriteArrayList;
+
+@Service
+@Slf4j
+public class OcrProgressService {
+
+    private static final long SSE_TIMEOUT = 5 * 60 * 1000L;
+
+    private final ConcurrentHashMap<UUID, List<SseEmitter>> emitters = new ConcurrentHashMap<>();
+
+    public SseEmitter register(UUID jobId) {
+        SseEmitter emitter = new SseEmitter(SSE_TIMEOUT);
+        emitters.computeIfAbsent(jobId, k -> new CopyOnWriteArrayList<>()).add(emitter);
+
+        emitter.onCompletion(() -> removeEmitter(jobId, emitter));
+        emitter.onTimeout(() -> removeEmitter(jobId, emitter));
+        emitter.onError(e -> removeEmitter(jobId, emitter));
+
+        return emitter;
+    }
+
+    public void emit(UUID jobId, String eventType, Object data) {
+        List<SseEmitter> jobEmitters = emitters.get(jobId);
+        if (jobEmitters == null) return;
+
+        for (SseEmitter emitter : jobEmitters) {
+            try {
+                emitter.send(SseEmitter.event().name(eventType).data(data));
+            } catch (IOException e) {
+                log.debug("SSE send failed for job {} — removing emitter", jobId);
+                removeEmitter(jobId, emitter);
+            }
+        }
+    }
+
+    public void complete(UUID jobId) {
+        List<SseEmitter> jobEmitters = emitters.remove(jobId);
+        if (jobEmitters == null) return;
+
+        for (SseEmitter emitter : jobEmitters) {
+            try {
+                emitter.complete();
+            } catch (Exception e) {
+                log.debug("SSE complete failed for job {}", jobId);
+            }
+        }
+    }
+
+    private void removeEmitter(UUID jobId, SseEmitter emitter) {
+        List<SseEmitter> jobEmitters = emitters.get(jobId);
+        if (jobEmitters != null) {
+            jobEmitters.remove(emitter);
+            if (jobEmitters.isEmpty()) {
+                emitters.remove(jobId);
+            }
+        }
+    }
+}
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java
@@ -0,0 +1,120 @@
+package org.raddatz.familienarchiv.service;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
+import org.raddatz.familienarchiv.exception.DomainException;
+import org.raddatz.familienarchiv.exception.ErrorCode;
+import org.raddatz.familienarchiv.model.*;
+import org.raddatz.familienarchiv.repository.OcrJobRepository;
+import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
+
+import java.util.List;
+import java.util.UUID;
+
+@Service
+@RequiredArgsConstructor
+@Slf4j
+public class OcrService {
+
+    private static final String OCR_ANNOTATION_COLOR = "#00C7B1";
+
+    private final OcrClient ocrClient;
+    private final OcrHealthClient ocrHealthClient;
+    private final DocumentService documentService;
+    private final TranscriptionService transcriptionService;
+    private final AnnotationService annotationService;
+    private final TranscriptionBlockRepository blockRepository;
+    private final OcrJobRepository ocrJobRepository;
+
+    @Value("${app.s3.internal-url:http://minio:9000}")
+    private String s3InternalUrl;
+
+    @Value("${app.s3.bucket}")
+    private String bucketName;
+
+    @Transactional
+    public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) {
+        Document doc = documentService.getDocumentById(documentId);
+
+        if (doc.getStatus() == DocumentStatus.PLACEHOLDER) {
+            throw DomainException.badRequest(ErrorCode.OCR_DOCUMENT_NOT_UPLOADED,
+                    "Document has no file attached: " + documentId);
+        }
+
+        if (!ocrHealthClient.isHealthy()) {
+            throw DomainException.internal(ErrorCode.OCR_SERVICE_UNAVAILABLE,
+                    "OCR service is not available");
+        }
+
+        if (scriptTypeOverride != null) {
+            doc.setScriptType(scriptTypeOverride);
+        }
+
+        OcrJob job = OcrJob.builder()
+                .totalDocuments(1)
+                .createdBy(userId)
+                .status(OcrJobStatus.RUNNING)
+                .build();
+        job = ocrJobRepository.save(job);
+
+        try {
+            processDocument(documentId, doc, userId);
+            job.setStatus(OcrJobStatus.DONE);
+            job.setProcessedDocuments(1);
+        } catch (Exception e) {
+            log.error("OCR processing failed for document {}", documentId, e);
+            job.setStatus(OcrJobStatus.FAILED);
+            job.setErrorCount(1);
+        }
+
+        ocrJobRepository.save(job);
+        return job.getId();
+    }
+
+    void processDocument(UUID documentId, Document doc, UUID userId) {
+        clearExistingBlocks(documentId);
+
+        String pdfUrl = buildInternalUrl(doc.getFilePath());
+        List<OcrBlockResult> blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
+        createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
+    }
+
+    private void clearExistingBlocks(UUID documentId) {
+        List<TranscriptionBlock> existing = transcriptionService.listBlocks(documentId);
+        for (TranscriptionBlock block : existing) {
+            transcriptionService.deleteBlock(documentId, block.getId());
+        }
+    }
+
+    private void createTranscriptionBlocks(UUID documentId, List<OcrBlockResult> blocks,
+                                            UUID userId, String fileHash) {
+        for (int i = 0; i < blocks.size(); i++) {
+            OcrBlockResult block = blocks.get(i);
+
+            CreateAnnotationDTO annotationDTO = new CreateAnnotationDTO(
+                    block.pageNumber(), block.x(), block.y(),
+                    block.width(), block.height(), OCR_ANNOTATION_COLOR);
+
+            DocumentAnnotation annotation = annotationService.createOcrAnnotation(
+                    documentId, annotationDTO, userId, fileHash, block.polygon());
+
+            TranscriptionBlock transcriptionBlock = TranscriptionBlock.builder()
+                    .annotationId(annotation.getId())
+                    .documentId(documentId)
+                    .text(block.text() != null ? block.text() : "")
+                    .sortOrder(i)
+                    .createdBy(userId)
+                    .updatedBy(userId)
+                    .build();
+            blockRepository.save(transcriptionBlock);
+        }
+    }
+
+    String buildInternalUrl(String filePath) {
+        return s3InternalUrl + "/" + bucketName + "/" + filePath;
+    }
+}