fix(training): only count reviewed blocks as checked text for recognition

Previously all MANUAL blocks counted as eligible training data, even ones
where text was filled in by guided OCR but never explicitly reviewed. This
caused segmentation and recognition counts to always match.

Now only blocks with reviewed=true qualify for recognition training, so the
two counts mean different things: segments = all drawn annotation boxes;
checked text = only boxes whose transcription the user has verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 18:00:59 +02:00
parent 1fd5c31fd1
commit a99afef319
3 changed files with 36 additions and 7 deletions

View File

@@ -22,7 +22,7 @@ public interface TranscriptionBlockRepository extends JpaRepository<Transcriptio
SELECT b FROM TranscriptionBlock b
JOIN DocumentAnnotation a ON a.id = b.annotationId
JOIN Document d ON d.id = b.documentId
WHERE (b.source = 'MANUAL' OR (b.source = 'OCR' AND b.reviewed = true))
WHERE b.reviewed = true
AND 'KURRENT_RECOGNITION' MEMBER OF d.trainingLabels
""")
List<TranscriptionBlock> findEligibleKurrentBlocks();

View File

@@ -52,8 +52,8 @@ class TrainingBlockQueryTest {
}
@Test
void findEligibleKurrentBlocks_includesManualBlock() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
void findEligibleKurrentBlocks_includesReviewedManualBlock() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();
@@ -61,6 +61,16 @@ class TrainingBlockQueryTest {
assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL);
}
@Test
void findEligibleKurrentBlocks_excludesUnreviewedManualBlock() {
    // A MANUAL block saved with reviewed=false (e.g. text filled in by guided OCR
    // but not yet checked) must not be returned as checked text.
    blockRepository.save(
            block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));

    assertThat(blockRepository.findEligibleKurrentBlocks()).isEmpty();
}
@Test
void findEligibleKurrentBlocks_includesReviewedOcrBlock() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
@@ -90,10 +100,11 @@ class TrainingBlockQueryTest {
}
@Test
void findEligibleKurrentBlocks_returnsAllEligibleAcrossBothSources() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false));
void findEligibleKurrentBlocks_returnsAllReviewedBlocksAcrossBothSources() {
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true));
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true));
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); // excluded
blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded
List<TranscriptionBlock> result = blockRepository.findEligibleKurrentBlocks();

View File

@@ -67,6 +67,24 @@ class TrainingDataExportServiceTest {
assertThat(zipEntryNames(zipBytes)).isNotEmpty();
}
@Test
void export_excludesUnreviewedManualBlockFromEnrolledDocument() throws Exception {
    // The only block stored for this enrolled document is MANUAL with reviewed=false,
    // so the exported ZIP is expected to contain no entries.
    UUID documentId = enrolledDoc("unreviewed-manual.pdf");
    UUID annotationId = annotation(documentId);
    blockRepository.save(TranscriptionBlock.builder()
            .annotationId(annotationId)
            .documentId(documentId)
            .text("Liebe Tante")
            .sortOrder(0)
            .source(BlockSource.MANUAL)
            .reviewed(false)
            .build());

    TrainingDataExportService exportService = new TrainingDataExportService(
            blockRepository, annotationRepository, documentRepository, mockFileService());
    StreamingResponseBody zipBody = exportService.exportToZip();

    assertThat(zipEntryNames(stream(zipBody))).isEmpty();
}
@Test
void export_excludesManualBlockFromNonEnrolledDocument() throws Exception {
UUID docId = nonEnrolledDoc("notenrolled.pdf");
@@ -212,7 +230,7 @@ class TrainingDataExportServiceTest {
return TranscriptionBlock.builder()
.annotationId(annotId).documentId(docId)
.text(text).sortOrder(0)
.source(BlockSource.MANUAL).reviewed(false).build();
.source(BlockSource.MANUAL).reviewed(true).build();
}
private FileService mockFileService() {