feat(transcription): add source/reviewed fields for training pipeline
Some checks failed
CI / Unit & Component Tests (push) Failing after 1s
CI / Backend Unit Tests (push) Failing after 0s
CI / Unit & Component Tests (pull_request) Failing after 0s
CI / Backend Unit Tests (pull_request) Failing after 1s

- BlockSource enum: MANUAL, OCR
- V26 migration adds source + reviewed columns to transcription_blocks
- OcrService sets source=OCR when creating blocks
- TranscriptionService.reviewBlock() toggles the reviewed flag
- PUT /api/documents/{id}/transcription-blocks/{blockId}/review endpoint
- 5 new tests: reviewBlock toggle/untoggle/notfound, controller,
  OcrService source=OCR verification

The reviewed flag enables the Kraken fine-tuning pipeline: only blocks
marked as reviewed by a human are exported as training data.

Refs #226

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 21:44:51 +02:00
parent f064b27439
commit 3aaec01421
9 changed files with 123 additions and 0 deletions

View File

@@ -81,6 +81,14 @@ public class TranscriptionBlockController {
return transcriptionService.listBlocks(documentId);
}
/**
 * Flips the reviewed flag of a single transcription block and returns the updated block.
 *
 * <p>NOTE(review): the service toggles the flag on every call, so repeating this PUT
 * does not converge on one state — confirm that clients rely on toggle semantics.
 *
 * @param documentId id of the document the block belongs to
 * @param blockId    id of the block whose reviewed flag is flipped
 * @return the block as returned by the transcription service after the toggle
 */
@PutMapping("/{blockId}/review")
@RequirePermission(Permission.WRITE_ALL)
public TranscriptionBlock reviewBlock(
        @PathVariable UUID documentId,
        @PathVariable UUID blockId) {
    TranscriptionBlock toggled = transcriptionService.reviewBlock(documentId, blockId);
    return toggled;
}
@GetMapping("/{blockId}/history")
@RequirePermission(Permission.READ_ALL)
public List<TranscriptionBlockVersion> getBlockHistory(

View File

@@ -0,0 +1,6 @@
package org.raddatz.familienarchiv.model;
/**
 * Origin of a transcription block's text.
 */
public enum BlockSource {
/** Default origin a transcription block gets when no other source is set. */
MANUAL,
/** Origin assigned by OcrService to the blocks it builds from OCR results. */
OCR
}

View File

@@ -41,6 +41,17 @@ public class TranscriptionBlock {
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
private int sortOrder;
/**
 * Origin of this block's text. Persisted as the enum constant's name in a
 * non-null column of at most 10 characters; defaults to {@link BlockSource#MANUAL}
 * when the builder is not given an explicit source.
 */
@Enumerated(EnumType.STRING)
@Column(nullable = false, length = 10)
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
@Builder.Default
private BlockSource source = BlockSource.MANUAL;
/**
 * Whether this block has been marked as reviewed. Stored in a non-null column and
 * defaults to {@code false} when the builder is not given an explicit value.
 */
@Column(nullable = false)
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
@Builder.Default
private boolean reviewed = false;
@Version
@Column(nullable = false)
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)

View File

@@ -107,6 +107,7 @@ public class OcrService {
.documentId(documentId)
.text(block.text() != null ? block.text() : "")
.sortOrder(i)
.source(BlockSource.OCR)
.createdBy(userId)
.updatedBy(userId)
.build();

View File

@@ -116,6 +116,13 @@ public class TranscriptionService {
}
}
/**
 * Inverts the reviewed flag of a block and persists the change.
 *
 * <p>Each call flips the current state: an unreviewed block becomes reviewed and a
 * reviewed block becomes unreviewed again.
 *
 * @param documentId id of the document the block must belong to
 * @param blockId    id of the block to toggle
 * @return the block as returned by the repository after saving
 * @throws DomainException if the block lookup finds no matching block
 */
@Transactional
public TranscriptionBlock reviewBlock(UUID documentId, UUID blockId) {
    TranscriptionBlock target = getBlock(documentId, blockId);
    boolean flipped = !target.isReviewed();
    target.setReviewed(flipped);
    return blockRepository.save(target);
}
public List<TranscriptionBlockVersion> getBlockHistory(UUID documentId, UUID blockId) {
getBlock(documentId, blockId);
return versionRepository.findByBlockIdOrderByChangedAtDesc(blockId);

View File

@@ -0,0 +1,2 @@
-- Origin of each block's text; the column default labels rows that predate this migration as 'MANUAL'.
ALTER TABLE transcription_blocks ADD COLUMN source VARCHAR(10) NOT NULL DEFAULT 'MANUAL';
-- Review flag per block; the column default leaves rows that predate this migration unreviewed.
ALTER TABLE transcription_blocks ADD COLUMN reviewed BOOLEAN NOT NULL DEFAULT FALSE;

View File

@@ -356,4 +356,20 @@ class TranscriptionBlockControllerTest {
.andExpect(status().isOk())
.andExpect(jsonPath("$").isEmpty());
}
// ─── PUT .../review ──────────────────────────────────────────────────────
/** PUT .../review answers 200 and serialises the reviewed flag of the block the service returns. */
@Test
@WithMockUser(authorities = "WRITE_ALL")
void reviewBlock_returns200_withToggledBlock() throws Exception {
    TranscriptionBlock toggledBlock = TranscriptionBlock.builder()
            .id(BLOCK_ID)
            .documentId(DOC_ID)
            .annotationId(UUID.randomUUID())
            .text("text")
            .sortOrder(0)
            .reviewed(true)
            .build();
    when(transcriptionService.reviewBlock(DOC_ID, BLOCK_ID)).thenReturn(toggledBlock);

    String reviewUrl = "/api/documents/{documentId}/transcription-blocks/{blockId}/review";
    mockMvc.perform(put(reviewUrl, DOC_ID, BLOCK_ID))
            .andExpect(status().isOk())
            .andExpect(jsonPath("$.reviewed").value(true));
}
}

View File

@@ -6,6 +6,7 @@ import org.mockito.ArgumentCaptor;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.mockito.ArgumentCaptor;
import org.raddatz.familienarchiv.dto.CreateAnnotationDTO;
import org.raddatz.familienarchiv.exception.DomainException;
import org.raddatz.familienarchiv.exception.ErrorCode;
@@ -173,4 +174,32 @@ class OcrServiceTest {
verify(annotationService, times(2)).createOcrAnnotation(
eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any());
}
/** A block persisted by startOcr carries source = OCR. */
@Test
void startOcr_setsBlockSourceToOcr() {
    UUID docId = UUID.randomUUID();
    UUID userId = UUID.randomUUID();

    // Fixtures: an uploaded typewriter document and a single OCR result block.
    Document document = Document.builder()
            .id(docId)
            .status(DocumentStatus.UPLOADED)
            .filePath("documents/test.pdf")
            .fileHash("hash123")
            .scriptType(ScriptType.TYPEWRITER)
            .build();
    OcrBlockResult ocrResult = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test");
    DocumentAnnotation annotation = DocumentAnnotation.builder().id(UUID.randomUUID()).build();

    // Collaborators: healthy OCR backend, no pre-existing blocks, one extracted block.
    when(documentService.getDocumentById(docId)).thenReturn(document);
    when(ocrHealthClient.isHealthy()).thenReturn(true);
    when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
    when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(ocrResult));
    when(ocrJobRepository.save(any())).thenAnswer(invocation -> {
        OcrJob savedJob = invocation.getArgument(0);
        savedJob.setId(UUID.randomUUID());
        return savedJob;
    });
    when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(annotation);

    ocrService.startOcr(docId, null, userId);

    ArgumentCaptor<TranscriptionBlock> savedBlock = ArgumentCaptor.forClass(TranscriptionBlock.class);
    verify(blockRepository).save(savedBlock.capture());
    BlockSource actualSource = savedBlock.getValue().getSource();
    assertThat(actualSource).isEqualTo(BlockSource.OCR);
}
}

View File

@@ -243,4 +243,47 @@ class TranscriptionServiceTest {
assertThat(transcriptionService.listBlocks(docId)).containsExactly(b);
}
// ─── reviewBlock ─────────────────────────────────────────────────────────
/** Toggling an unreviewed block marks it reviewed and saves it. */
@Test
void reviewBlock_setsReviewedTrue() {
    UUID documentId = UUID.randomUUID();
    UUID targetId = UUID.randomUUID();
    TranscriptionBlock unreviewed = TranscriptionBlock.builder()
            .id(targetId)
            .documentId(documentId)
            .annotationId(UUID.randomUUID())
            .text("corrected text")
            .sortOrder(0)
            .reviewed(false)
            .build();
    when(blockRepository.findByIdAndDocumentId(targetId, documentId)).thenReturn(Optional.of(unreviewed));
    // The repository echoes back whatever it is asked to save.
    when(blockRepository.save(any())).thenAnswer(invocation -> invocation.getArgument(0));

    TranscriptionBlock toggled = transcriptionService.reviewBlock(documentId, targetId);

    assertThat(toggled.isReviewed()).isTrue();
    verify(blockRepository).save(unreviewed);
}
/**
 * Toggling an already reviewed block clears the flag again and persists the change.
 */
@Test
void reviewBlock_togglesReviewedFalse_whenAlreadyReviewed() {
    UUID docId = UUID.randomUUID();
    UUID blockId = UUID.randomUUID();
    TranscriptionBlock block = TranscriptionBlock.builder()
            .id(blockId).documentId(docId).annotationId(UUID.randomUUID())
            .text("corrected text").sortOrder(0).reviewed(true).build();
    when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block));
    // The repository echoes back whatever it is asked to save.
    when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0));

    TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId);

    assertThat(result.isReviewed()).isFalse();
    // Guard the un-review path against silently skipping the save.
    verify(blockRepository).save(block);
}
/** reviewBlock raises a DomainException when the repository finds no block for the given ids. */
@Test
void reviewBlock_throwsNotFound_whenBlockMissing() {
    UUID documentId = UUID.randomUUID();
    UUID missingBlockId = UUID.randomUUID();
    when(blockRepository.findByIdAndDocumentId(missingBlockId, documentId)).thenReturn(Optional.empty());

    assertThatThrownBy(() -> transcriptionService.reviewBlock(documentId, missingBlockId))
            .isInstanceOf(DomainException.class);
}
}