feat(transcription): add source/reviewed fields for training pipeline
- BlockSource enum: MANUAL, OCR
- V26 migration adds source + reviewed columns to transcription_blocks
- OcrService sets source=OCR when creating blocks
- TranscriptionService.reviewBlock() toggles the reviewed flag
- PUT /api/documents/{id}/transcription-blocks/{blockId}/review endpoint
- 5 new tests: reviewBlock toggle/untoggle/notfound, controller,
OcrService source=OCR verification
The reviewed flag enables the Kraken fine-tuning pipeline: only blocks
marked as reviewed by a human are exported as training data.
Refs #226
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -81,6 +81,14 @@ public class TranscriptionBlockController {
|
||||
return transcriptionService.listBlocks(documentId);
|
||||
}
|
||||
|
||||
@PutMapping("/{blockId}/review")
|
||||
@RequirePermission(Permission.WRITE_ALL)
|
||||
public TranscriptionBlock reviewBlock(
|
||||
@PathVariable UUID documentId,
|
||||
@PathVariable UUID blockId) {
|
||||
return transcriptionService.reviewBlock(documentId, blockId);
|
||||
}
|
||||
|
||||
@GetMapping("/{blockId}/history")
|
||||
@RequirePermission(Permission.READ_ALL)
|
||||
public List<TranscriptionBlockVersion> getBlockHistory(
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
package org.raddatz.familienarchiv.model;
|
||||
|
||||
/** Origin of a transcription block. */
public enum BlockSource {

    /** Default origin: used for every block whose source is not set explicitly. */
    MANUAL,

    /** Set by the OCR service on the blocks it creates. */
    OCR;
}
|
||||
@@ -41,6 +41,17 @@ public class TranscriptionBlock {
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private int sortOrder;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(nullable = false, length = 10)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private BlockSource source = BlockSource.MANUAL;
|
||||
|
||||
@Column(nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private boolean reviewed = false;
|
||||
|
||||
@Version
|
||||
@Column(nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
|
||||
@@ -107,6 +107,7 @@ public class OcrService {
|
||||
.documentId(documentId)
|
||||
.text(block.text() != null ? block.text() : "")
|
||||
.sortOrder(i)
|
||||
.source(BlockSource.OCR)
|
||||
.createdBy(userId)
|
||||
.updatedBy(userId)
|
||||
.build();
|
||||
|
||||
@@ -116,6 +116,13 @@ public class TranscriptionService {
|
||||
}
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public TranscriptionBlock reviewBlock(UUID documentId, UUID blockId) {
|
||||
TranscriptionBlock block = getBlock(documentId, blockId);
|
||||
block.setReviewed(!block.isReviewed());
|
||||
return blockRepository.save(block);
|
||||
}
|
||||
|
||||
public List<TranscriptionBlockVersion> getBlockHistory(UUID documentId, UUID blockId) {
|
||||
getBlock(documentId, blockId);
|
||||
return versionRepository.findByBlockIdOrderByChangedAtDesc(blockId);
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
-- Origin of each block (BlockSource enum name: MANUAL or OCR); rows that already exist receive the default 'MANUAL'.
ALTER TABLE transcription_blocks ADD COLUMN source VARCHAR(10) NOT NULL DEFAULT 'MANUAL';
|
||||
-- Review flag per block; rows that already exist start out as not reviewed (FALSE).
ALTER TABLE transcription_blocks ADD COLUMN reviewed BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
@@ -356,4 +356,20 @@ class TranscriptionBlockControllerTest {
|
||||
.andExpect(status().isOk())
|
||||
.andExpect(jsonPath("$").isEmpty());
|
||||
}
|
||||
|
||||
// ─── PUT .../review ──────────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
@WithMockUser(authorities = "WRITE_ALL")
|
||||
void reviewBlock_returns200_withToggledBlock() throws Exception {
|
||||
TranscriptionBlock reviewed = TranscriptionBlock.builder()
|
||||
.id(BLOCK_ID).documentId(DOC_ID).annotationId(UUID.randomUUID())
|
||||
.text("text").sortOrder(0).reviewed(true).build();
|
||||
when(transcriptionService.reviewBlock(DOC_ID, BLOCK_ID)).thenReturn(reviewed);
|
||||
|
||||
mockMvc.perform(put("/api/documents/{documentId}/transcription-blocks/{blockId}/review",
|
||||
DOC_ID, BLOCK_ID))
|
||||
.andExpect(status().isOk())
|
||||
.andExpect(jsonPath("$.reviewed").value(true));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import org.mockito.ArgumentCaptor;
|
||||
import org.mockito.InjectMocks;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import org.mockito.ArgumentCaptor;
|
||||
import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
|
||||
import org.raddatz.familienarchiv.exception.DomainException;
|
||||
import org.raddatz.familienarchiv.exception.ErrorCode;
|
||||
@@ -173,4 +174,32 @@ class OcrServiceTest {
|
||||
verify(annotationService, times(2)).createOcrAnnotation(
|
||||
eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any());
|
||||
}
|
||||
|
||||
@Test
|
||||
void startOcr_setsBlockSourceToOcr() {
|
||||
UUID docId = UUID.randomUUID();
|
||||
UUID userId = UUID.randomUUID();
|
||||
Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED)
|
||||
.filePath("documents/test.pdf").fileHash("hash123")
|
||||
.scriptType(ScriptType.TYPEWRITER).build();
|
||||
OcrBlockResult block = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test");
|
||||
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block));
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
OcrJob job = inv.getArgument(0);
|
||||
job.setId(UUID.randomUUID());
|
||||
return job;
|
||||
});
|
||||
DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build();
|
||||
when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann);
|
||||
|
||||
ocrService.startOcr(docId, null, userId);
|
||||
|
||||
ArgumentCaptor<TranscriptionBlock> captor = ArgumentCaptor.forClass(TranscriptionBlock.class);
|
||||
verify(blockRepository).save(captor.capture());
|
||||
assertThat(captor.getValue().getSource()).isEqualTo(BlockSource.OCR);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,4 +243,47 @@ class TranscriptionServiceTest {
|
||||
|
||||
assertThat(transcriptionService.listBlocks(docId)).containsExactly(b);
|
||||
}
|
||||
|
||||
// ─── reviewBlock ─────────────────────────────────────────────────────────
|
||||
|
||||
@Test
|
||||
void reviewBlock_setsReviewedTrue() {
|
||||
UUID docId = UUID.randomUUID();
|
||||
UUID blockId = UUID.randomUUID();
|
||||
TranscriptionBlock block = TranscriptionBlock.builder()
|
||||
.id(blockId).documentId(docId).annotationId(UUID.randomUUID())
|
||||
.text("corrected text").sortOrder(0).reviewed(false).build();
|
||||
when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block));
|
||||
when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||
|
||||
TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId);
|
||||
|
||||
assertThat(result.isReviewed()).isTrue();
|
||||
verify(blockRepository).save(block);
|
||||
}
|
||||
|
||||
@Test
|
||||
void reviewBlock_togglesReviewedFalse_whenAlreadyReviewed() {
|
||||
UUID docId = UUID.randomUUID();
|
||||
UUID blockId = UUID.randomUUID();
|
||||
TranscriptionBlock block = TranscriptionBlock.builder()
|
||||
.id(blockId).documentId(docId).annotationId(UUID.randomUUID())
|
||||
.text("corrected text").sortOrder(0).reviewed(true).build();
|
||||
when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block));
|
||||
when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||
|
||||
TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId);
|
||||
|
||||
assertThat(result.isReviewed()).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
void reviewBlock_throwsNotFound_whenBlockMissing() {
|
||||
UUID docId = UUID.randomUUID();
|
||||
UUID blockId = UUID.randomUUID();
|
||||
when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.empty());
|
||||
|
||||
assertThatThrownBy(() -> transcriptionService.reviewBlock(docId, blockId))
|
||||
.isInstanceOf(DomainException.class);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user