feat(ocr): add OcrService, OcrBatchService, OcrProgressService, OcrController

- OcrService: single-document OCR (health check, block clearing,
  presigned URL, annotation + block creation)
- OcrBatchService: batch processing with @Async, per-document status
  tracking, SKIPPED for PLACEHOLDER documents, failure isolation
- OcrProgressService: SSE emitter registry per job ID with 5-min timeout
- OcrController: POST /api/documents/{id}/ocr (WRITE_ALL),
  POST /api/ocr/batch (ADMIN), GET /api/ocr/jobs/{id} (READ_ALL),
  GET /api/ocr/jobs/{id}/progress (SSE), GET /api/documents/{id}/ocr-status

19 tests: 6 OcrService, 4 OcrBatchService, 3 OcrProgressService, 6 OcrController

Refs #226

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 15:24:15 +02:00
parent ff3990710e
commit aea46c5fd0
8 changed files with 906 additions and 0 deletions

View File

@@ -0,0 +1,138 @@
package org.raddatz.familienarchiv.controller;
import tools.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.Test;
import org.raddatz.familienarchiv.config.SecurityConfig;
import org.raddatz.familienarchiv.dto.BatchOcrDTO;
import org.raddatz.familienarchiv.dto.TriggerOcrDTO;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.exception.ErrorCode;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository;
import org.raddatz.familienarchiv.repository.OcrJobRepository;
import org.raddatz.familienarchiv.security.PermissionAspect;
import org.raddatz.familienarchiv.service.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.aop.AopAutoConfiguration;
import org.springframework.boot.webmvc.test.autoconfigure.WebMvcTest;
import org.springframework.context.annotation.Import;
import org.springframework.http.MediaType;
import org.springframework.security.test.context.support.WithMockUser;
import org.springframework.test.context.bean.override.mockito.MockitoBean;
import org.springframework.test.web.servlet.MockMvc;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.when;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
/**
 * MVC slice tests for {@link OcrController}.
 *
 * The slice imports {@link SecurityConfig}, {@link PermissionAspect} and the AOP auto-configuration,
 * and every test runs as a mock user holding exactly one authority. The services and repositories
 * declared below are Mockito beans, so the tests only pin request mapping, HTTP status codes and the
 * JSON fields of the response.
 */
@WebMvcTest(OcrController.class)
@Import({SecurityConfig.class, PermissionAspect.class, AopAutoConfiguration.class})
class OcrControllerTest {

    private static final String DOCUMENT_OCR_URL = "/api/documents/{id}/ocr";
    private static final String DOCUMENT_OCR_STATUS_URL = "/api/documents/{id}/ocr-status";
    private static final String BATCH_URL = "/api/ocr/batch";
    private static final String JOB_URL = "/api/ocr/jobs/{jobId}";

    @Autowired MockMvc mockMvc;

    private final ObjectMapper objectMapper = new ObjectMapper();

    @MockitoBean OcrService ocrService;
    @MockitoBean OcrBatchService ocrBatchService;
    @MockitoBean OcrProgressService ocrProgressService;
    @MockitoBean OcrJobRepository ocrJobRepository;
    @MockitoBean OcrJobDocumentRepository ocrJobDocumentRepository;

    // NOTE(review): not referenced by any test here; presumably needed so the imported
    // security configuration can be wired in the slice - confirm before removing.
    @MockitoBean UserService userService;
    @MockitoBean CustomUserDetailsService customUserDetailsService;

    /** Triggering OCR for one document answers 202 and echoes the job id returned by the service. */
    @Test
    @WithMockUser(authorities = "WRITE_ALL")
    void triggerOcr_returns202_withJobId() throws Exception {
        UUID documentId = UUID.randomUUID();
        UUID expectedJobId = UUID.randomUUID();
        when(ocrService.startOcr(eq(documentId), eq(ScriptType.TYPEWRITER), any()))
                .thenReturn(expectedJobId);
        String requestBody = objectMapper.writeValueAsString(new TriggerOcrDTO(ScriptType.TYPEWRITER));

        mockMvc.perform(post(DOCUMENT_OCR_URL, documentId)
                        .contentType(MediaType.APPLICATION_JSON)
                        .content(requestBody))
                .andExpect(status().isAccepted())
                .andExpect(jsonPath("$.jobId").value(expectedJobId.toString()));
    }

    /** A bad-request {@link DomainException} thrown by the service surfaces as HTTP 400. */
    @Test
    @WithMockUser(authorities = "WRITE_ALL")
    void triggerOcr_returns400_whenDocumentNotUploaded() throws Exception {
        UUID documentId = UUID.randomUUID();
        when(ocrService.startOcr(eq(documentId), any(), any()))
                .thenThrow(DomainException.badRequest(ErrorCode.OCR_DOCUMENT_NOT_UPLOADED, "Not uploaded"));

        // An empty JSON object is sent, i.e. no script type is supplied in the body.
        mockMvc.perform(post(DOCUMENT_OCR_URL, documentId)
                        .contentType(MediaType.APPLICATION_JSON)
                        .content("{}"))
                .andExpect(status().isBadRequest());
    }

    /** Looking up a job id the repository does not know answers 404. */
    @Test
    @WithMockUser(authorities = "READ_ALL")
    void getJobStatus_returns404_whenJobNotFound() throws Exception {
        UUID unknownJobId = UUID.randomUUID();
        when(ocrJobRepository.findById(unknownJobId)).thenReturn(Optional.empty());

        mockMvc.perform(get(JOB_URL, unknownJobId))
                .andExpect(status().isNotFound());
    }

    /** A stored job is rendered with its status and its total/processed document counters. */
    @Test
    @WithMockUser(authorities = "READ_ALL")
    void getJobStatus_returnsJobInfo_whenFound() throws Exception {
        UUID runningJobId = UUID.randomUUID();
        OcrJob runningJob = OcrJob.builder()
                .id(runningJobId)
                .status(OcrJobStatus.RUNNING)
                .totalDocuments(10)
                .processedDocuments(3)
                .errorCount(1)
                .skippedCount(0)
                .build();
        when(ocrJobRepository.findById(runningJobId)).thenReturn(Optional.of(runningJob));

        mockMvc.perform(get(JOB_URL, runningJobId))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.status").value("RUNNING"))
                .andExpect(jsonPath("$.totalDocuments").value(10))
                .andExpect(jsonPath("$.processedDocuments").value(3));
    }

    /** Starting a batch as ADMIN answers 202 and echoes the job id returned by the batch service. */
    @Test
    @WithMockUser(authorities = "ADMIN")
    void triggerBatch_returns202_withJobId() throws Exception {
        UUID expectedJobId = UUID.randomUUID();
        List<UUID> documentIds = List.of(UUID.randomUUID(), UUID.randomUUID());
        when(ocrBatchService.startBatch(eq(documentIds), any())).thenReturn(expectedJobId);
        String requestBody = objectMapper.writeValueAsString(new BatchOcrDTO(documentIds));

        mockMvc.perform(post(BATCH_URL)
                        .contentType(MediaType.APPLICATION_JSON)
                        .content(requestBody))
                .andExpect(status().isAccepted())
                .andExpect(jsonPath("$.jobId").value(expectedJobId.toString()));
    }

    /** When the repository finds no matching job entry for the document, the status is "NONE". */
    @Test
    @WithMockUser(authorities = "READ_ALL")
    void getDocumentOcrStatus_returnsNone_whenNoOcrJobExists() throws Exception {
        UUID documentId = UUID.randomUUID();
        when(ocrJobDocumentRepository.findFirstByDocumentIdAndStatusIn(eq(documentId), any()))
                .thenReturn(Optional.empty());

        mockMvc.perform(get(DOCUMENT_OCR_STATUS_URL, documentId))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$.status").value("NONE"));
    }
}

View File

@@ -0,0 +1,142 @@
package org.raddatz.familienarchiv.service;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.exception.ErrorCode;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository;
import org.raddatz.familienarchiv.repository.OcrJobRepository;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.*;
/**
 * Unit tests for {@link OcrBatchService} with every collaborator mocked by Mockito.
 *
 * The service under test is a plain {@code @InjectMocks} instance and its methods are called
 * directly, so each test asserts on mocks and fixture objects right after the call returns.
 */
@ExtendWith(MockitoExtension.class)
class OcrBatchServiceTest {

    @Mock OcrService ocrService;
    @Mock OcrHealthClient ocrHealthClient;
    @Mock DocumentService documentService;
    @Mock OcrJobRepository ocrJobRepository;
    @Mock OcrJobDocumentRepository ocrJobDocumentRepository;
    @Mock OcrProgressService ocrProgressService;

    @InjectMocks OcrBatchService ocrBatchService;

    /** An unhealthy OCR backend makes startBatch fail with OCR_SERVICE_UNAVAILABLE. */
    @Test
    void startBatch_throwsServiceUnavailable_whenOcrServiceIsDown() {
        List<UUID> documentIds = List.of(UUID.randomUUID());
        UUID userId = UUID.randomUUID();
        when(ocrHealthClient.isHealthy()).thenReturn(false);

        assertThatThrownBy(() -> ocrBatchService.startBatch(documentIds, userId))
                .isInstanceOf(DomainException.class)
                .satisfies(thrown -> assertThat(((DomainException) thrown).getCode())
                        .isEqualTo(ErrorCode.OCR_SERVICE_UNAVAILABLE));
    }

    /**
     * startBatch returns the id assigned to the job on save, and the single uploaded document
     * is handed to {@link OcrService#processDocument}.
     */
    @Test
    void startBatch_createsJobAndReturnsJobId() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        UUID assignedJobId = UUID.randomUUID();
        Document uploaded = typewriterUpload(documentId, "test.pdf", "hash");

        when(ocrHealthClient.isHealthy()).thenReturn(true);
        // Mimic the persistence layer: the saved job comes back carrying its generated id.
        when(ocrJobRepository.save(any())).thenAnswer(invocation -> {
            OcrJob saved = invocation.getArgument(0);
            saved.setId(assignedJobId);
            return saved;
        });
        when(ocrJobDocumentRepository.save(any())).thenAnswer(invocation -> invocation.getArgument(0));
        when(ocrJobRepository.findById(assignedJobId))
                .thenReturn(Optional.of(pendingJob(assignedJobId, 1)));
        when(ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(assignedJobId))
                .thenReturn(List.of(pendingJobDocument(assignedJobId, documentId)));
        when(documentService.getDocumentById(documentId)).thenReturn(uploaded);

        UUID returnedJobId = ocrBatchService.startBatch(List.of(documentId), userId);

        assertThat(returnedJobId).isEqualTo(assignedJobId);
        verify(ocrService).processDocument(eq(documentId), eq(uploaded), eq(userId));
    }

    /** A PLACEHOLDER document is never sent to OCR and its job entry ends up SKIPPED. */
    @Test
    void processBatchAsync_skipsPlaceholderDocuments() {
        UUID jobId = UUID.randomUUID();
        UUID uploadedId = UUID.randomUUID();
        UUID placeholderId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();

        OcrJobDocument uploadedEntry = pendingJobDocument(jobId, uploadedId);
        OcrJobDocument placeholderEntry = pendingJobDocument(jobId, placeholderId);
        Document uploaded = typewriterUpload(uploadedId, "test.pdf", "hash");
        Document placeholder = Document.builder()
                .id(placeholderId).status(DocumentStatus.PLACEHOLDER).build();

        when(ocrJobRepository.findById(jobId)).thenReturn(Optional.of(pendingJob(jobId, 2)));
        stubSavesToEchoTheirArgument();
        when(ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId))
                .thenReturn(List.of(uploadedEntry, placeholderEntry));
        when(documentService.getDocumentById(uploadedId)).thenReturn(uploaded);
        when(documentService.getDocumentById(placeholderId)).thenReturn(placeholder);

        ocrBatchService.processBatchAsync(jobId, userId);

        verify(ocrService).processDocument(eq(uploadedId), eq(uploaded), eq(userId));
        verify(ocrService, never()).processDocument(eq(placeholderId), any(), any());
        assertThat(placeholderEntry.getStatus()).isEqualTo(OcrDocumentStatus.SKIPPED);
    }

    /**
     * A failure while processing the first document does not stop the batch: the second document
     * is still processed, the entries end up FAILED and DONE, and the job itself ends up DONE.
     */
    @Test
    void processBatchAsync_continuesAfterSingleDocumentFailure() {
        UUID jobId = UUID.randomUUID();
        UUID failingId = UUID.randomUUID();
        UUID succeedingId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();

        OcrJob batchJob = pendingJob(jobId, 2);
        OcrJobDocument failingEntry = pendingJobDocument(jobId, failingId);
        OcrJobDocument succeedingEntry = pendingJobDocument(jobId, succeedingId);
        Document failingDocument = typewriterUpload(failingId, "fail.pdf", "hash1");
        Document succeedingDocument = typewriterUpload(succeedingId, "success.pdf", "hash2");

        when(ocrJobRepository.findById(jobId)).thenReturn(Optional.of(batchJob));
        stubSavesToEchoTheirArgument();
        when(ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId))
                .thenReturn(List.of(failingEntry, succeedingEntry));
        when(documentService.getDocumentById(failingId)).thenReturn(failingDocument);
        when(documentService.getDocumentById(succeedingId)).thenReturn(succeedingDocument);
        doThrow(new RuntimeException("OCR failed")).when(ocrService)
                .processDocument(eq(failingId), any(), any());

        ocrBatchService.processBatchAsync(jobId, userId);

        verify(ocrService).processDocument(eq(succeedingId), eq(succeedingDocument), eq(userId));
        assertThat(failingEntry.getStatus()).isEqualTo(OcrDocumentStatus.FAILED);
        assertThat(succeedingEntry.getStatus()).isEqualTo(OcrDocumentStatus.DONE);
        assertThat(batchJob.getStatus()).isEqualTo(OcrJobStatus.DONE);
    }

    /** Makes both repositories return whatever entity is passed to {@code save}. */
    private void stubSavesToEchoTheirArgument() {
        when(ocrJobRepository.save(any())).thenAnswer(invocation -> invocation.getArgument(0));
        when(ocrJobDocumentRepository.save(any())).thenAnswer(invocation -> invocation.getArgument(0));
    }

    /** Builds a PENDING job with the given id and total document count. */
    private static OcrJob pendingJob(UUID jobId, int totalDocuments) {
        return OcrJob.builder()
                .id(jobId)
                .totalDocuments(totalDocuments)
                .status(OcrJobStatus.PENDING)
                .build();
    }

    /** Builds a PENDING job entry linking the given job and document. */
    private static OcrJobDocument pendingJobDocument(UUID jobId, UUID documentId) {
        return OcrJobDocument.builder()
                .jobId(jobId)
                .documentId(documentId)
                .status(OcrDocumentStatus.PENDING)
                .build();
    }

    /** Builds an UPLOADED typewriter document with the given id, file path and file hash. */
    private static Document typewriterUpload(UUID documentId, String filePath, String fileHash) {
        return Document.builder()
                .id(documentId)
                .status(DocumentStatus.UPLOADED)
                .filePath(filePath)
                .fileHash(fileHash)
                .scriptType(ScriptType.TYPEWRITER)
                .build();
    }
}

View File

@@ -0,0 +1,33 @@
package org.raddatz.familienarchiv.service;
import org.junit.jupiter.api.Test;
import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatCode;
/**
 * Unit tests for {@link OcrProgressService}, exercised as a plain object without a Spring context.
 */
class OcrProgressServiceTest {

    private final OcrProgressService progressService = new OcrProgressService();

    /** Registering for a job id hands back an emitter instance. */
    @Test
    void register_returnsNonNullEmitter() {
        SseEmitter registered = progressService.register(UUID.randomUUID());

        assertThat(registered).isNotNull();
    }

    /** Emitting an event for a job id that nobody registered for must not throw. */
    @Test
    void emit_doesNotThrow_whenNoEmittersRegistered() {
        UUID unknownJobId = UUID.randomUUID();

        assertThatCode(() -> progressService.emit(unknownJobId, "test", "data"))
                .doesNotThrowAnyException();
    }

    /** Completing a job id that nobody registered for must not throw. */
    @Test
    void complete_doesNotThrow_whenNoEmittersRegistered() {
        UUID unknownJobId = UUID.randomUUID();

        assertThatCode(() -> progressService.complete(unknownJobId))
                .doesNotThrowAnyException();
    }
}

View File

@@ -0,0 +1,176 @@
package org.raddatz.familienarchiv.service;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.exception.ErrorCode;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.OcrJobRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import java.util.List;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.*;
import static org.springframework.http.HttpStatus.*;
/**
 * Unit tests for {@link OcrService#startOcr} with every collaborator mocked by Mockito.
 *
 * No URL-producing collaborator is stubbed anywhere in this class: per the original inline notes,
 * the service constructs the internal MinIO URL from the document's S3 key itself, so
 * {@code ocrClient.extractBlocks} is simply stubbed with {@code any()} arguments.
 */
@ExtendWith(MockitoExtension.class)
class OcrServiceTest {

    @Mock OcrClient ocrClient;
    @Mock OcrHealthClient ocrHealthClient;
    @Mock DocumentService documentService;
    @Mock TranscriptionService transcriptionService;
    @Mock AnnotationService annotationService;
    @Mock TranscriptionBlockRepository blockRepository;
    @Mock OcrJobRepository ocrJobRepository;

    @InjectMocks OcrService ocrService;

    /** A PLACEHOLDER document is rejected with 400 / OCR_DOCUMENT_NOT_UPLOADED. */
    @Test
    void startOcr_throwsBadRequest_whenDocumentIsPlaceholder() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        Document placeholder = Document.builder()
                .id(documentId).status(DocumentStatus.PLACEHOLDER).build();
        when(documentService.getDocumentById(documentId)).thenReturn(placeholder);

        assertThatThrownBy(() -> ocrService.startOcr(documentId, null, userId))
                .isInstanceOf(DomainException.class)
                .satisfies(thrown -> {
                    DomainException domainException = (DomainException) thrown;
                    assertThat(domainException.getStatus()).isEqualTo(BAD_REQUEST);
                    assertThat(domainException.getCode()).isEqualTo(ErrorCode.OCR_DOCUMENT_NOT_UPLOADED);
                });
    }

    /** An unhealthy OCR backend makes startOcr fail with OCR_SERVICE_UNAVAILABLE. */
    @Test
    void startOcr_throwsServiceUnavailable_whenOcrServiceIsDown() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        // Built inline on purpose: this fixture never calls scriptType(...) on the builder.
        Document uploaded = Document.builder().id(documentId).status(DocumentStatus.UPLOADED)
                .filePath("documents/test.pdf").fileHash("hash123").build();
        when(documentService.getDocumentById(documentId)).thenReturn(uploaded);
        when(ocrHealthClient.isHealthy()).thenReturn(false);

        assertThatThrownBy(() -> ocrService.startOcr(documentId, null, userId))
                .isInstanceOf(DomainException.class)
                .satisfies(thrown -> {
                    DomainException domainException = (DomainException) thrown;
                    assertThat(domainException.getCode()).isEqualTo(ErrorCode.OCR_SERVICE_UNAVAILABLE);
                });
    }

    /** startOcr persists a job and returns the id that the repository assigned to it. */
    @Test
    void startOcr_createsJobAndReturnsJobId() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        UUID assignedJobId = UUID.randomUUID();
        Document uploaded = uploadedDocument(documentId, ScriptType.TYPEWRITER);

        when(documentService.getDocumentById(documentId)).thenReturn(uploaded);
        when(ocrHealthClient.isHealthy()).thenReturn(true);
        when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
        stubJobSaveAssigningId(assignedJobId);

        UUID returnedJobId = ocrService.startOcr(documentId, ScriptType.TYPEWRITER, userId);

        assertThat(returnedJobId).isEqualTo(assignedJobId);
        verify(ocrJobRepository, atLeastOnce()).save(any());
    }

    /** A script type passed to startOcr replaces the one previously set on the document. */
    @Test
    void startOcr_setsScriptTypeOnDocument_whenProvided() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        Document uploaded = uploadedDocument(documentId, ScriptType.UNKNOWN);

        when(documentService.getDocumentById(documentId)).thenReturn(uploaded);
        when(ocrHealthClient.isHealthy()).thenReturn(true);
        when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
        stubJobSaveAssigningRandomId();

        ocrService.startOcr(documentId, ScriptType.HANDWRITING_LATIN, userId);

        assertThat(uploaded.getScriptType()).isEqualTo(ScriptType.HANDWRITING_LATIN);
    }

    /**
     * An existing transcription block is deleted, and that deletion happens before the first
     * annotation for the new OCR result is created.
     */
    @Test
    void startOcr_clearsExistingBlocks_beforeCreatingNew() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        Document uploaded = uploadedDocument(documentId, ScriptType.TYPEWRITER);
        TranscriptionBlock staleBlock = TranscriptionBlock.builder()
                .id(UUID.randomUUID()).documentId(documentId).build();
        DocumentAnnotation annotation = DocumentAnnotation.builder().id(UUID.randomUUID()).build();

        when(documentService.getDocumentById(documentId)).thenReturn(uploaded);
        when(ocrHealthClient.isHealthy()).thenReturn(true);
        when(transcriptionService.listBlocks(documentId)).thenReturn(List.of(staleBlock));
        when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
                new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello")));
        stubJobSaveAssigningRandomId();
        when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any()))
                .thenReturn(annotation);

        ocrService.startOcr(documentId, null, userId);

        // Verify the sequence, not just the call: a plain verify(deleteBlock) would also pass
        // if the service cleared blocks only after creating the new ones.
        org.mockito.InOrder callOrder = inOrder(transcriptionService, annotationService);
        callOrder.verify(transcriptionService).deleteBlock(documentId, staleBlock.getId());
        callOrder.verify(annotationService).createOcrAnnotation(any(), any(), any(), any(), any());
    }

    /** One OCR annotation is created per block returned by the OCR client. */
    @Test
    void startOcr_createsAnnotationAndBlock_forEachOcrResult() {
        UUID documentId = UUID.randomUUID();
        UUID userId = UUID.randomUUID();
        Document uploaded = uploadedDocument(documentId, ScriptType.TYPEWRITER);
        OcrBlockResult firstLine = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Line 1");
        OcrBlockResult secondLine = new OcrBlockResult(0, 0.1, 0.2, 0.8, 0.04, null, "Line 2");
        DocumentAnnotation annotation = DocumentAnnotation.builder().id(UUID.randomUUID()).build();

        when(documentService.getDocumentById(documentId)).thenReturn(uploaded);
        when(ocrHealthClient.isHealthy()).thenReturn(true);
        when(transcriptionService.listBlocks(documentId)).thenReturn(List.of());
        when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(firstLine, secondLine));
        stubJobSaveAssigningRandomId();
        when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any()))
                .thenReturn(annotation);

        ocrService.startOcr(documentId, null, userId);

        verify(annotationService, times(2)).createOcrAnnotation(
                eq(documentId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any());
    }

    /** Builds an UPLOADED document with the shared test file path/hash and the given script type. */
    private static Document uploadedDocument(UUID documentId, ScriptType scriptType) {
        return Document.builder()
                .id(documentId)
                .status(DocumentStatus.UPLOADED)
                .filePath("documents/test.pdf")
                .fileHash("hash123")
                .scriptType(scriptType)
                .build();
    }

    /** Makes {@code ocrJobRepository.save} return its argument after setting the given id on it. */
    private void stubJobSaveAssigningId(UUID jobId) {
        when(ocrJobRepository.save(any())).thenAnswer(invocation -> {
            OcrJob saved = invocation.getArgument(0);
            saved.setId(jobId);
            return saved;
        });
    }

    /** Makes {@code ocrJobRepository.save} return its argument with a fresh random id per call. */
    private void stubJobSaveAssigningRandomId() {
        when(ocrJobRepository.save(any())).thenAnswer(invocation -> {
            OcrJob saved = invocation.getArgument(0);
            saved.setId(UUID.randomUUID());
            return saved;
        });
    }
}