feat(transcription): add source/reviewed fields for training pipeline
Some checks failed
CI / Unit & Component Tests (push) Failing after 1s
CI / Backend Unit Tests (push) Failing after 0s
CI / Unit & Component Tests (pull_request) Failing after 0s
CI / Backend Unit Tests (pull_request) Failing after 1s

- BlockSource enum: MANUAL, OCR
- V26 migration adds source + reviewed columns to transcription_blocks
- OcrService sets source=OCR when creating blocks
- TranscriptionService.reviewBlock() toggles the reviewed flag
- PUT /api/documents/{id}/transcription-blocks/{blockId}/review endpoint
- 5 new tests: reviewBlock toggle/untoggle/notfound, controller,
  OcrService source=OCR verification

The reviewed flag enables the Kraken fine-tuning pipeline: only blocks
marked as reviewed by a human are exported as training data.

Refs #226

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 21:44:51 +02:00
parent f064b27439
commit 3aaec01421
9 changed files with 123 additions and 0 deletions

View File

@@ -356,4 +356,20 @@ class TranscriptionBlockControllerTest {
.andExpect(status().isOk())
.andExpect(jsonPath("$").isEmpty());
}
// ─── PUT .../review ──────────────────────────────────────────────────────
/**
 * PUT on the review endpoint answers 200 and serialises the block returned by
 * {@code transcriptionService.reviewBlock}, including its {@code reviewed} flag.
 */
@Test
@WithMockUser(authorities = "WRITE_ALL")
void reviewBlock_returns200_withToggledBlock() throws Exception {
    // The mocked service hands back a block that is already flagged as reviewed.
    UUID annotationId = UUID.randomUUID();
    TranscriptionBlock toggledBlock = TranscriptionBlock.builder()
            .id(BLOCK_ID)
            .documentId(DOC_ID)
            .annotationId(annotationId)
            .text("text")
            .sortOrder(0)
            .reviewed(true)
            .build();
    when(transcriptionService.reviewBlock(DOC_ID, BLOCK_ID)).thenReturn(toggledBlock);

    String reviewUrl = "/api/documents/{documentId}/transcription-blocks/{blockId}/review";

    mockMvc.perform(put(reviewUrl, DOC_ID, BLOCK_ID))
            .andExpect(status().isOk())
            .andExpect(jsonPath("$.reviewed").value(true));
}
}

View File

@@ -6,6 +6,7 @@ import org.mockito.ArgumentCaptor;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.mockito.ArgumentCaptor;
import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.exception.ErrorCode;
@@ -173,4 +174,32 @@ class OcrServiceTest {
verify(annotationService, times(2)).createOcrAnnotation(
eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any());
}
/**
 * Runs {@code startOcr} with a single extracted block and checks that the
 * block passed to {@code blockRepository.save} carries {@code BlockSource.OCR}.
 */
@Test
void startOcr_setsBlockSourceToOcr() {
    UUID documentId = UUID.randomUUID();
    UUID requestingUserId = UUID.randomUUID();

    Document document = Document.builder()
            .id(documentId)
            .status(DocumentStatus.UPLOADED)
            .filePath("documents/test.pdf")
            .fileHash("hash123")
            .scriptType(ScriptType.TYPEWRITER)
            .build();
    OcrBlockResult extracted = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test");
    DocumentAnnotation annotation = DocumentAnnotation.builder()
            .id(UUID.randomUUID())
            .build();

    // Collaborators: healthy OCR backend, no blocks listed yet, one block extracted.
    when(documentService.getDocumentById(documentId)).thenReturn(document);
    when(ocrHealthClient.isHealthy()).thenReturn(true);
    when(transcriptionService.listBlocks(documentId)).thenReturn(List.of());
    when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(extracted));
    // Saving a job assigns it a fresh id and returns the same instance.
    when(ocrJobRepository.save(any())).thenAnswer(invocation -> {
        OcrJob savedJob = invocation.getArgument(0);
        savedJob.setId(UUID.randomUUID());
        return savedJob;
    });
    when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any()))
            .thenReturn(annotation);

    ocrService.startOcr(documentId, null, requestingUserId);

    ArgumentCaptor<TranscriptionBlock> savedBlock =
            ArgumentCaptor.forClass(TranscriptionBlock.class);
    verify(blockRepository).save(savedBlock.capture());
    TranscriptionBlock persisted = savedBlock.getValue();
    assertThat(persisted.getSource()).isEqualTo(BlockSource.OCR);
}
}

View File

@@ -243,4 +243,47 @@ class TranscriptionServiceTest {
assertThat(transcriptionService.listBlocks(docId)).containsExactly(b);
}
// ─── reviewBlock ─────────────────────────────────────────────────────────
/**
 * A block that starts with {@code reviewed=false} comes back from
 * {@code reviewBlock} with the flag set, and is saved through the repository.
 */
@Test
void reviewBlock_setsReviewedTrue() {
    UUID documentId = UUID.randomUUID();
    UUID targetBlockId = UUID.randomUUID();
    UUID annotationId = UUID.randomUUID();
    TranscriptionBlock unreviewed = TranscriptionBlock.builder()
            .id(targetBlockId)
            .documentId(documentId)
            .annotationId(annotationId)
            .text("corrected text")
            .sortOrder(0)
            .reviewed(false)
            .build();
    when(blockRepository.findByIdAndDocumentId(targetBlockId, documentId))
            .thenReturn(Optional.of(unreviewed));
    // save() echoes back whatever entity it receives.
    when(blockRepository.save(any())).thenAnswer(invocation -> invocation.getArgument(0));

    TranscriptionBlock toggled = transcriptionService.reviewBlock(documentId, targetBlockId);

    assertThat(toggled.isReviewed()).isTrue();
    verify(blockRepository).save(unreviewed);
}
/**
 * A block that starts with {@code reviewed=true} comes back from
 * {@code reviewBlock} with the flag cleared, and the change is saved
 * through the repository.
 */
@Test
void reviewBlock_togglesReviewedFalse_whenAlreadyReviewed() {
    UUID docId = UUID.randomUUID();
    UUID blockId = UUID.randomUUID();
    TranscriptionBlock block = TranscriptionBlock.builder()
            .id(blockId).documentId(docId).annotationId(UUID.randomUUID())
            .text("corrected text").sortOrder(0).reviewed(true).build();
    when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block));
    // save() echoes back whatever entity it receives.
    when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));

    TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId);

    assertThat(result.isReviewed()).isFalse();
    // Mirror the set-to-true test: the un-toggle must be persisted, not just
    // applied to the in-memory instance.
    verify(blockRepository).save(block);
}
/**
 * When the repository finds no block for the given document/block pair,
 * {@code reviewBlock} throws a {@code DomainException}.
 */
@Test
void reviewBlock_throwsNotFound_whenBlockMissing() {
    UUID documentId = UUID.randomUUID();
    UUID unknownBlockId = UUID.randomUUID();
    // The lookup yields nothing for this pair.
    when(blockRepository.findByIdAndDocumentId(unknownBlockId, documentId))
            .thenReturn(Optional.empty());

    assertThatThrownBy(
            () -> transcriptionService.reviewBlock(documentId, unknownBlockId))
            .isInstanceOf(DomainException.class);
}
}