refactor(ocr): route block lifecycle through TranscriptionService

OcrAsyncRunner was bypassing TranscriptionService — building blocks
directly and calling blockRepository.save(), skipping sanitizeText()
and saveVersion(). Also replaced N individual deleteBlock() calls with
a single bulk deleteAllBlocksByDocument() for OCR re-runs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 12:27:01 +02:00
parent 6a0fd25662
commit caae2ead81
4 changed files with 134 additions and 41 deletions

View File

@@ -6,7 +6,6 @@ import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository;
import org.raddatz.familienarchiv.repository.OcrJobRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component;
@@ -26,7 +25,6 @@ public class OcrAsyncRunner {
private final DocumentService documentService;
private final TranscriptionService transcriptionService;
private final AnnotationService annotationService;
private final TranscriptionBlockRepository blockRepository;
private final FileService fileService;
private final OcrJobRepository ocrJobRepository;
private final OcrJobDocumentRepository ocrJobDocumentRepository;
@@ -194,10 +192,7 @@ public class OcrAsyncRunner {
}
private void clearExistingBlocks(UUID documentId) {
List<TranscriptionBlock> existing = transcriptionService.listBlocks(documentId);
for (TranscriptionBlock block : existing) {
transcriptionService.deleteBlock(documentId, block.getId());
}
transcriptionService.deleteAllBlocksByDocument(documentId);
}
private void createTranscriptionBlocks(UUID documentId, List<OcrBlockResult> blocks,
@@ -216,15 +211,7 @@ public class OcrAsyncRunner {
DocumentAnnotation annotation = annotationService.createOcrAnnotation(
documentId, annotationDTO, userId, fileHash, block.polygon());
TranscriptionBlock transcriptionBlock = TranscriptionBlock.builder()
.annotationId(annotation.getId())
.documentId(documentId)
.text(block.text() != null ? block.text() : "")
.sortOrder(sortOrder)
.source(BlockSource.OCR)
.createdBy(userId)
.updatedBy(userId)
.build();
blockRepository.save(transcriptionBlock);
transcriptionService.createOcrBlock(documentId, annotation.getId(),
block.text(), sortOrder, userId);
}
}

View File

@@ -8,6 +8,7 @@ import org.raddatz.familienarchiv.dto.ReorderTranscriptionBlocksDTO;
import org.raddatz.familienarchiv.dto.UpdateTranscriptionBlockDTO;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.exception.ErrorCode;
import org.raddatz.familienarchiv.model.BlockSource;
import org.raddatz.familienarchiv.model.Document;
import org.raddatz.familienarchiv.model.DocumentAnnotation;
import org.raddatz.familienarchiv.model.TranscriptionBlock;
@@ -75,6 +76,24 @@ public class TranscriptionService {
return saved;
}
@Transactional
public TranscriptionBlock createOcrBlock(UUID documentId, UUID annotationId,
String text, int sortOrder, UUID userId) {
String sanitized = sanitizeText(text);
TranscriptionBlock block = TranscriptionBlock.builder()
.annotationId(annotationId)
.documentId(documentId)
.text(sanitized)
.sortOrder(sortOrder)
.source(BlockSource.OCR)
.createdBy(userId)
.updatedBy(userId)
.build();
TranscriptionBlock saved = blockRepository.save(block);
saveVersion(saved, userId);
return saved;
}
@Transactional
public TranscriptionBlock updateBlock(UUID documentId, UUID blockId,
UpdateTranscriptionBlockDTO dto, UUID userId) {
@@ -106,6 +125,21 @@ public class TranscriptionService {
blockId, annotationId, documentId);
}
@Transactional
public void deleteAllBlocksByDocument(UUID documentId) {
List<TranscriptionBlock> blocks = blockRepository.findByDocumentIdOrderBySortOrderAsc(documentId);
if (blocks.isEmpty()) return;
List<UUID> annotationIds = blocks.stream()
.map(TranscriptionBlock::getAnnotationId)
.toList();
blockRepository.deleteAll(blocks);
blockRepository.flush();
annotationRepository.deleteAllById(annotationIds);
log.info("Bulk-deleted {} transcription blocks for document {}", blocks.size(), documentId);
}
@Transactional
public void reorderBlocks(UUID documentId, ReorderTranscriptionBlocksDTO dto) {
List<UUID> blockIds = dto.getBlockIds();

View File

@@ -2,7 +2,6 @@ package org.raddatz.familienarchiv.service;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.ArgumentCaptor;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
@@ -10,7 +9,6 @@ import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository;
import org.raddatz.familienarchiv.repository.OcrJobRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import java.util.ArrayList;
import java.util.List;
@@ -19,8 +17,7 @@ import java.util.UUID;
import java.util.function.Consumer;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.ArgumentMatchers.*;
import static org.mockito.Mockito.*;
@ExtendWith(MockitoExtension.class)
@@ -30,7 +27,6 @@ class OcrAsyncRunnerTest {
@Mock DocumentService documentService;
@Mock TranscriptionService transcriptionService;
@Mock AnnotationService annotationService;
@Mock TranscriptionBlockRepository blockRepository;
@Mock FileService fileService;
@Mock OcrJobRepository ocrJobRepository;
@Mock OcrJobDocumentRepository ocrJobDocumentRepository;
@@ -42,61 +38,59 @@ class OcrAsyncRunnerTest {
void processDocument_clearsExistingBlocks() {
UUID docId = UUID.randomUUID();
UUID userId = UUID.randomUUID();
TranscriptionBlock existing = TranscriptionBlock.builder()
.id(UUID.randomUUID()).documentId(docId).build();
Document doc = Document.builder().id(docId).filePath("test.pdf")
.fileHash("hash").scriptType(ScriptType.TYPEWRITER).build();
when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existing));
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
ocrAsyncRunner.processDocument(docId, doc, userId);
verify(transcriptionService).deleteBlock(docId, existing.getId());
verify(transcriptionService).deleteAllBlocksByDocument(docId);
}
@Test
void processDocument_createsAnnotationAndBlock_forEachResult() {
UUID docId = UUID.randomUUID();
UUID userId = UUID.randomUUID();
UUID annId = UUID.randomUUID();
Document doc = Document.builder().id(docId).filePath("test.pdf")
.fileHash("hash").scriptType(ScriptType.TYPEWRITER).build();
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Line 1"),
new OcrBlockResult(0, 0.1, 0.2, 0.8, 0.04, null, "Line 2")));
DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build();
DocumentAnnotation ann = DocumentAnnotation.builder().id(annId).build();
when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann);
ocrAsyncRunner.processDocument(docId, doc, userId);
verify(annotationService, times(2)).createOcrAnnotation(
eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash"), any());
verify(blockRepository, times(2)).save(any());
verify(transcriptionService, times(2)).createOcrBlock(
eq(docId), eq(annId), any(), anyInt(), eq(userId));
}
@Test
void processDocument_setsBlockSourceToOcr() {
void processDocument_delegatesBlockCreationToTranscriptionService() {
UUID docId = UUID.randomUUID();
UUID userId = UUID.randomUUID();
UUID annId = UUID.randomUUID();
Document doc = Document.builder().id(docId).filePath("test.pdf")
.fileHash("hash").scriptType(ScriptType.TYPEWRITER).build();
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test")));
DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build();
DocumentAnnotation ann = DocumentAnnotation.builder().id(annId).build();
when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann);
ocrAsyncRunner.processDocument(docId, doc, userId);
ArgumentCaptor<TranscriptionBlock> captor = ArgumentCaptor.forClass(TranscriptionBlock.class);
verify(blockRepository).save(captor.capture());
assertThat(captor.getValue().getSource()).isEqualTo(BlockSource.OCR);
verify(transcriptionService).createOcrBlock(docId, annId, "Test", 0, userId);
}
@Test
@@ -115,7 +109,7 @@ class OcrAsyncRunnerTest {
when(ocrJobDocumentRepository.findByJobIdAndDocumentId(jobId, docId))
.thenReturn(Optional.of(jobDoc));
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
doAnswer(inv -> {
Consumer<OcrStreamEvent> handler = inv.getArgument(2);
@@ -146,7 +140,7 @@ class OcrAsyncRunnerTest {
when(ocrJobDocumentRepository.findByJobIdAndDocumentId(jobId, docId))
.thenReturn(Optional.of(jobDoc));
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
doThrow(new RuntimeException("OCR failed")).when(ocrClient).streamBlocks(any(), any(), any());
@@ -174,7 +168,7 @@ class OcrAsyncRunnerTest {
.thenReturn(Optional.of(jobDoc));
when(ocrJobDocumentRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann);
@@ -217,7 +211,7 @@ class OcrAsyncRunnerTest {
.thenReturn(Optional.of(jobDoc));
when(ocrJobDocumentRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
doAnswer(inv -> {
@@ -253,7 +247,7 @@ class OcrAsyncRunnerTest {
.thenReturn(Optional.of(jobDoc));
when(ocrJobDocumentRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(fileService.generatePresignedUrl(any())).thenReturn("http://presigned");
doAnswer(inv -> {

View File

@@ -10,6 +10,7 @@ import org.raddatz.familienarchiv.dto.CreateTranscriptionBlockDTO;
import org.raddatz.familienarchiv.dto.ReorderTranscriptionBlocksDTO;
import org.raddatz.familienarchiv.dto.UpdateTranscriptionBlockDTO;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.model.BlockSource;
import org.raddatz.familienarchiv.model.Document;
import org.raddatz.familienarchiv.model.DocumentAnnotation;
import org.raddatz.familienarchiv.model.TranscriptionBlock;
@@ -26,8 +27,8 @@ import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.*;
import static org.springframework.http.HttpStatus.NOT_FOUND;
@ExtendWith(MockitoExtension.class)
@@ -99,6 +100,50 @@ class TranscriptionServiceTest {
verify(versionRepository).save(any(TranscriptionBlockVersion.class));
}
// ─── createOcrBlock ──────────────────────────────────────────────────────────
@Test
void createOcrBlock_createsBlockWithOcrSourceAndSavesVersion() {
UUID docId = UUID.randomUUID();
UUID annotId = UUID.randomUUID();
UUID userId = UUID.randomUUID();
when(blockRepository.save(any())).thenAnswer(inv -> {
TranscriptionBlock b = inv.getArgument(0);
b.setId(UUID.randomUUID());
return b;
});
TranscriptionBlock result = transcriptionService.createOcrBlock(
docId, annotId, "OCR text", 3, userId);
assertThat(result.getAnnotationId()).isEqualTo(annotId);
assertThat(result.getDocumentId()).isEqualTo(docId);
assertThat(result.getText()).isEqualTo("OCR text");
assertThat(result.getSortOrder()).isEqualTo(3);
assertThat(result.getSource()).isEqualTo(BlockSource.OCR);
assertThat(result.getCreatedBy()).isEqualTo(userId);
verify(versionRepository).save(any(TranscriptionBlockVersion.class));
}
@Test
void createOcrBlock_sanitizesNullText() {
UUID docId = UUID.randomUUID();
UUID annotId = UUID.randomUUID();
UUID userId = UUID.randomUUID();
when(blockRepository.save(any())).thenAnswer(inv -> {
TranscriptionBlock b = inv.getArgument(0);
b.setId(UUID.randomUUID());
return b;
});
TranscriptionBlock result = transcriptionService.createOcrBlock(
docId, annotId, null, 0, userId);
assertThat(result.getText()).isEmpty();
}
// ─── updateBlock ─────────────────────────────────────────────────────────────
@Test
@@ -168,6 +213,39 @@ class TranscriptionServiceTest {
.satisfies(e -> assertThat(((DomainException) e).getStatus()).isEqualTo(NOT_FOUND));
}
// ─── deleteAllBlocksByDocument ─────────────────────────────────────────────
@Test
void deleteAllBlocksByDocument_deletesAllBlocksAndAnnotations() {
UUID docId = UUID.randomUUID();
UUID annId1 = UUID.randomUUID();
UUID annId2 = UUID.randomUUID();
TranscriptionBlock block1 = TranscriptionBlock.builder()
.id(UUID.randomUUID()).documentId(docId).annotationId(annId1).sortOrder(0).build();
TranscriptionBlock block2 = TranscriptionBlock.builder()
.id(UUID.randomUUID()).documentId(docId).annotationId(annId2).sortOrder(1).build();
when(blockRepository.findByDocumentIdOrderBySortOrderAsc(docId))
.thenReturn(List.of(block1, block2));
transcriptionService.deleteAllBlocksByDocument(docId);
verify(blockRepository).deleteAll(List.of(block1, block2));
verify(blockRepository).flush();
verify(annotationRepository).deleteAllById(List.of(annId1, annId2));
}
@Test
void deleteAllBlocksByDocument_doesNothing_whenNoBlocksExist() {
UUID docId = UUID.randomUUID();
when(blockRepository.findByDocumentIdOrderBySortOrderAsc(docId)).thenReturn(List.of());
transcriptionService.deleteAllBlocksByDocument(docId);
verify(blockRepository, never()).deleteAll(any());
}
// ─── reorderBlocks ───────────────────────────────────────────────────────────
@Test