From 3aaec014212176c811b307be5619ff69f61bc1e8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 21:44:51 +0200 Subject: [PATCH] feat(transcription): add source/reviewed fields for training pipeline - BlockSource enum: MANUAL, OCR - V26 migration adds source + reviewed columns to transcription_blocks - OcrService sets source=OCR when creating blocks - TranscriptionService.reviewBlock() toggles the reviewed flag - PUT /api/documents/{id}/transcription-blocks/{blockId}/review endpoint - 5 new tests: reviewBlock toggle/untoggle/notfound, controller, OcrService source=OCR verification The reviewed flag enables the Kraken fine-tuning pipeline: only blocks marked as reviewed by a human are exported as training data. Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- .../TranscriptionBlockController.java | 8 ++++ .../familienarchiv/model/BlockSource.java | 6 +++ .../model/TranscriptionBlock.java | 11 +++++ .../familienarchiv/service/OcrService.java | 1 + .../service/TranscriptionService.java | 7 +++ ...e_and_reviewed_to_transcription_blocks.sql | 2 + .../TranscriptionBlockControllerTest.java | 16 +++++++ .../service/OcrServiceTest.java | 29 +++++++++++++ .../service/TranscriptionServiceTest.java | 43 +++++++++++++++++++ 9 files changed, 123 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java create mode 100644 backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql diff --git a/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java b/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java index 227713d0..fd52d8f4 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java @@ -81,6 +81,14 @@ public class TranscriptionBlockController { return 
transcriptionService.listBlocks(documentId); } + @PutMapping("/{blockId}/review") + @RequirePermission(Permission.WRITE_ALL) + public TranscriptionBlock reviewBlock( + @PathVariable UUID documentId, + @PathVariable UUID blockId) { + return transcriptionService.reviewBlock(documentId, blockId); + } + @GetMapping("/{blockId}/history") @RequirePermission(Permission.READ_ALL) public List getBlockHistory( diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java b/backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java new file mode 100644 index 00000000..eb412e64 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java @@ -0,0 +1,6 @@ +package org.raddatz.familienarchiv.model; + +public enum BlockSource { + MANUAL, + OCR +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java b/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java index 6f1e008e..8f01dbeb 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java @@ -41,6 +41,17 @@ public class TranscriptionBlock { @Schema(requiredMode = Schema.RequiredMode.REQUIRED) private int sortOrder; + @Enumerated(EnumType.STRING) + @Column(nullable = false, length = 10) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private BlockSource source = BlockSource.MANUAL; + + @Column(nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private boolean reviewed = false; + @Version @Column(nullable = false) @Schema(requiredMode = Schema.RequiredMode.REQUIRED) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java index 5ec7a2f1..5587b588 100644 --- 
a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java @@ -107,6 +107,7 @@ public class OcrService { .documentId(documentId) .text(block.text() != null ? block.text() : "") .sortOrder(i) + .source(BlockSource.OCR) .createdBy(userId) .updatedBy(userId) .build(); diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java index 2aff91bb..1f8126c1 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java @@ -116,6 +116,13 @@ public class TranscriptionService { } } + @Transactional + public TranscriptionBlock reviewBlock(UUID documentId, UUID blockId) { + TranscriptionBlock block = getBlock(documentId, blockId); + block.setReviewed(!block.isReviewed()); + return blockRepository.save(block); + } + public List getBlockHistory(UUID documentId, UUID blockId) { getBlock(documentId, blockId); return versionRepository.findByBlockIdOrderByChangedAtDesc(blockId); diff --git a/backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql b/backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql new file mode 100644 index 00000000..de655f91 --- /dev/null +++ b/backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql @@ -0,0 +1,2 @@ +ALTER TABLE transcription_blocks ADD COLUMN source VARCHAR(10) NOT NULL DEFAULT 'MANUAL'; +ALTER TABLE transcription_blocks ADD COLUMN reviewed BOOLEAN NOT NULL DEFAULT FALSE; diff --git a/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java index 
a891413e..54a9be2a 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java @@ -356,4 +356,20 @@ class TranscriptionBlockControllerTest { .andExpect(status().isOk()) .andExpect(jsonPath("$").isEmpty()); } + + // ─── PUT .../review ────────────────────────────────────────────────────── + + @Test + @WithMockUser(authorities = "WRITE_ALL") + void reviewBlock_returns200_withToggledBlock() throws Exception { + TranscriptionBlock reviewed = TranscriptionBlock.builder() + .id(BLOCK_ID).documentId(DOC_ID).annotationId(UUID.randomUUID()) + .text("text").sortOrder(0).reviewed(true).build(); + when(transcriptionService.reviewBlock(DOC_ID, BLOCK_ID)).thenReturn(reviewed); + + mockMvc.perform(put("/api/documents/{documentId}/transcription-blocks/{blockId}/review", + DOC_ID, BLOCK_ID)) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.reviewed").value(true)); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java index 44c598e0..61c62fa3 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java @@ -6,6 +6,7 @@ import org.mockito.ArgumentCaptor; import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import org.mockito.ArgumentCaptor; import org.raddatz.familienarchiv.dto.CreateAnnotationDTO; import org.raddatz.familienarchiv.exception.DomainException; import org.raddatz.familienarchiv.exception.ErrorCode; @@ -173,4 +174,32 @@ class OcrServiceTest { verify(annotationService, times(2)).createOcrAnnotation( eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any()); } + + @Test + void startOcr_setsBlockSourceToOcr() { + 
UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123") + .scriptType(ScriptType.TYPEWRITER).build(); + OcrBlockResult block = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test"); + + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(true); + when(transcriptionService.listBlocks(docId)).thenReturn(List.of()); + when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block)); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(UUID.randomUUID()); + return job; + }); + DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build(); + when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann); + + ocrService.startOcr(docId, null, userId); + + ArgumentCaptor<TranscriptionBlock> captor = ArgumentCaptor.forClass(TranscriptionBlock.class); + verify(blockRepository).save(captor.capture()); + assertThat(captor.getValue().getSource()).isEqualTo(BlockSource.OCR); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java index ebe02d10..f25a884b 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java @@ -243,4 +243,47 @@ class TranscriptionServiceTest { assertThat(transcriptionService.listBlocks(docId)).containsExactly(b); } + + // ─── reviewBlock ───────────────────────────────────────────────────────── + + @Test + void reviewBlock_setsReviewedTrue() { + UUID docId = UUID.randomUUID(); + UUID blockId = UUID.randomUUID(); + TranscriptionBlock block = TranscriptionBlock.builder() + 
.id(blockId).documentId(docId).annotationId(UUID.randomUUID()) + .text("corrected text").sortOrder(0).reviewed(false).build(); + when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block)); + when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId); + + assertThat(result.isReviewed()).isTrue(); + verify(blockRepository).save(block); + } + + @Test + void reviewBlock_togglesReviewedFalse_whenAlreadyReviewed() { + UUID docId = UUID.randomUUID(); + UUID blockId = UUID.randomUUID(); + TranscriptionBlock block = TranscriptionBlock.builder() + .id(blockId).documentId(docId).annotationId(UUID.randomUUID()) + .text("corrected text").sortOrder(0).reviewed(true).build(); + when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block)); + when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId); + + assertThat(result.isReviewed()).isFalse(); + } + + @Test + void reviewBlock_throwsNotFound_whenBlockMissing() { + UUID docId = UUID.randomUUID(); + UUID blockId = UUID.randomUUID(); + when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.empty()); + + assertThatThrownBy(() -> transcriptionService.reviewBlock(docId, blockId)) + .isInstanceOf(DomainException.class); + } }