feat(transcription): add source/reviewed fields for training pipeline
- BlockSource enum: MANUAL, OCR
- V26 migration adds source + reviewed columns to transcription_blocks
- OcrService sets source=OCR when creating blocks
- TranscriptionService.reviewBlock() toggles the reviewed flag
- PUT /api/documents/{id}/transcription-blocks/{blockId}/review endpoint
- 5 new tests: reviewBlock toggle/untoggle/notfound, controller,
OcrService source=OCR verification
The reviewed flag enables the Kraken fine-tuning pipeline: only blocks
marked as reviewed by a human are exported as training data.
Refs #226
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -81,6 +81,14 @@ public class TranscriptionBlockController {
|
|||||||
return transcriptionService.listBlocks(documentId);
|
return transcriptionService.listBlocks(documentId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@PutMapping("/{blockId}/review")
|
||||||
|
@RequirePermission(Permission.WRITE_ALL)
|
||||||
|
public TranscriptionBlock reviewBlock(
|
||||||
|
@PathVariable UUID documentId,
|
||||||
|
@PathVariable UUID blockId) {
|
||||||
|
return transcriptionService.reviewBlock(documentId, blockId);
|
||||||
|
}
|
||||||
|
|
||||||
@GetMapping("/{blockId}/history")
|
@GetMapping("/{blockId}/history")
|
||||||
@RequirePermission(Permission.READ_ALL)
|
@RequirePermission(Permission.READ_ALL)
|
||||||
public List<TranscriptionBlockVersion> getBlockHistory(
|
public List<TranscriptionBlockVersion> getBlockHistory(
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
package org.raddatz.familienarchiv.model;
|
||||||
|
|
||||||
|
/**
 * Records how a transcription block came into existence.
 *
 * <p>The constant names are persisted as strings, so renaming a constant requires a data
 * migration.
 */
public enum BlockSource {

    /** Default origin; used for every block that is not explicitly marked otherwise. */
    MANUAL,

    /** The block was created from OCR output. */
    OCR
}
|
||||||
@@ -41,6 +41,17 @@ public class TranscriptionBlock {
|
|||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
private int sortOrder;
|
private int sortOrder;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(nullable = false, length = 10)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private BlockSource source = BlockSource.MANUAL;
|
||||||
|
|
||||||
|
@Column(nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private boolean reviewed = false;
|
||||||
|
|
||||||
@Version
|
@Version
|
||||||
@Column(nullable = false)
|
@Column(nullable = false)
|
||||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
|||||||
@@ -107,6 +107,7 @@ public class OcrService {
|
|||||||
.documentId(documentId)
|
.documentId(documentId)
|
||||||
.text(block.text() != null ? block.text() : "")
|
.text(block.text() != null ? block.text() : "")
|
||||||
.sortOrder(i)
|
.sortOrder(i)
|
||||||
|
.source(BlockSource.OCR)
|
||||||
.createdBy(userId)
|
.createdBy(userId)
|
||||||
.updatedBy(userId)
|
.updatedBy(userId)
|
||||||
.build();
|
.build();
|
||||||
|
|||||||
@@ -116,6 +116,13 @@ public class TranscriptionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
public TranscriptionBlock reviewBlock(UUID documentId, UUID blockId) {
|
||||||
|
TranscriptionBlock block = getBlock(documentId, blockId);
|
||||||
|
block.setReviewed(!block.isReviewed());
|
||||||
|
return blockRepository.save(block);
|
||||||
|
}
|
||||||
|
|
||||||
public List<TranscriptionBlockVersion> getBlockHistory(UUID documentId, UUID blockId) {
|
public List<TranscriptionBlockVersion> getBlockHistory(UUID documentId, UUID blockId) {
|
||||||
getBlock(documentId, blockId);
|
getBlock(documentId, blockId);
|
||||||
return versionRepository.findByBlockIdOrderByChangedAtDesc(blockId);
|
return versionRepository.findByBlockIdOrderByChangedAtDesc(blockId);
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
-- Adds provenance (source) and review tracking (reviewed) to transcription blocks.
-- Both columns are NOT NULL with a default, so rows that already exist receive
-- source = 'MANUAL' and reviewed = FALSE.
ALTER TABLE transcription_blocks
    ADD COLUMN source VARCHAR(10) NOT NULL DEFAULT 'MANUAL';

ALTER TABLE transcription_blocks
    ADD COLUMN reviewed BOOLEAN NOT NULL DEFAULT FALSE;
|
||||||
@@ -356,4 +356,20 @@ class TranscriptionBlockControllerTest {
|
|||||||
.andExpect(status().isOk())
|
.andExpect(status().isOk())
|
||||||
.andExpect(jsonPath("$").isEmpty());
|
.andExpect(jsonPath("$").isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── PUT .../review ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@WithMockUser(authorities = "WRITE_ALL")
|
||||||
|
void reviewBlock_returns200_withToggledBlock() throws Exception {
|
||||||
|
TranscriptionBlock reviewed = TranscriptionBlock.builder()
|
||||||
|
.id(BLOCK_ID).documentId(DOC_ID).annotationId(UUID.randomUUID())
|
||||||
|
.text("text").sortOrder(0).reviewed(true).build();
|
||||||
|
when(transcriptionService.reviewBlock(DOC_ID, BLOCK_ID)).thenReturn(reviewed);
|
||||||
|
|
||||||
|
mockMvc.perform(put("/api/documents/{documentId}/transcription-blocks/{blockId}/review",
|
||||||
|
DOC_ID, BLOCK_ID))
|
||||||
|
.andExpect(status().isOk())
|
||||||
|
.andExpect(jsonPath("$.reviewed").value(true));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import org.mockito.ArgumentCaptor;
|
|||||||
import org.mockito.InjectMocks;
|
import org.mockito.InjectMocks;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
|
import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
|
||||||
import org.raddatz.familienarchiv.exception.DomainException;
|
import org.raddatz.familienarchiv.exception.DomainException;
|
||||||
import org.raddatz.familienarchiv.exception.ErrorCode;
|
import org.raddatz.familienarchiv.exception.ErrorCode;
|
||||||
@@ -173,4 +174,32 @@ class OcrServiceTest {
|
|||||||
verify(annotationService, times(2)).createOcrAnnotation(
|
verify(annotationService, times(2)).createOcrAnnotation(
|
||||||
eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any());
|
eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void startOcr_setsBlockSourceToOcr() {
|
||||||
|
UUID docId = UUID.randomUUID();
|
||||||
|
UUID userId = UUID.randomUUID();
|
||||||
|
Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED)
|
||||||
|
.filePath("documents/test.pdf").fileHash("hash123")
|
||||||
|
.scriptType(ScriptType.TYPEWRITER).build();
|
||||||
|
OcrBlockResult block = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test");
|
||||||
|
|
||||||
|
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||||
|
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||||
|
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
|
||||||
|
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block));
|
||||||
|
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||||
|
OcrJob job = inv.getArgument(0);
|
||||||
|
job.setId(UUID.randomUUID());
|
||||||
|
return job;
|
||||||
|
});
|
||||||
|
DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build();
|
||||||
|
when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann);
|
||||||
|
|
||||||
|
ocrService.startOcr(docId, null, userId);
|
||||||
|
|
||||||
|
ArgumentCaptor<TranscriptionBlock> captor = ArgumentCaptor.forClass(TranscriptionBlock.class);
|
||||||
|
verify(blockRepository).save(captor.capture());
|
||||||
|
assertThat(captor.getValue().getSource()).isEqualTo(BlockSource.OCR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -243,4 +243,47 @@ class TranscriptionServiceTest {
|
|||||||
|
|
||||||
assertThat(transcriptionService.listBlocks(docId)).containsExactly(b);
|
assertThat(transcriptionService.listBlocks(docId)).containsExactly(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── reviewBlock ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void reviewBlock_setsReviewedTrue() {
|
||||||
|
UUID docId = UUID.randomUUID();
|
||||||
|
UUID blockId = UUID.randomUUID();
|
||||||
|
TranscriptionBlock block = TranscriptionBlock.builder()
|
||||||
|
.id(blockId).documentId(docId).annotationId(UUID.randomUUID())
|
||||||
|
.text("corrected text").sortOrder(0).reviewed(false).build();
|
||||||
|
when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block));
|
||||||
|
when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
|
||||||
|
TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId);
|
||||||
|
|
||||||
|
assertThat(result.isReviewed()).isTrue();
|
||||||
|
verify(blockRepository).save(block);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void reviewBlock_togglesReviewedFalse_whenAlreadyReviewed() {
|
||||||
|
UUID docId = UUID.randomUUID();
|
||||||
|
UUID blockId = UUID.randomUUID();
|
||||||
|
TranscriptionBlock block = TranscriptionBlock.builder()
|
||||||
|
.id(blockId).documentId(docId).annotationId(UUID.randomUUID())
|
||||||
|
.text("corrected text").sortOrder(0).reviewed(true).build();
|
||||||
|
when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block));
|
||||||
|
when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));
|
||||||
|
|
||||||
|
TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId);
|
||||||
|
|
||||||
|
assertThat(result.isReviewed()).isFalse();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void reviewBlock_throwsNotFound_whenBlockMissing() {
|
||||||
|
UUID docId = UUID.randomUUID();
|
||||||
|
UUID blockId = UUID.randomUUID();
|
||||||
|
when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.empty());
|
||||||
|
|
||||||
|
assertThatThrownBy(() -> transcriptionService.reviewBlock(docId, blockId))
|
||||||
|
.isInstanceOf(DomainException.class);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user