diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java index 09ac5b60..82834365 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java @@ -22,7 +22,7 @@ public interface TranscriptionBlockRepository extends JpaRepository findEligibleKurrentBlocks(); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java index d7406ba0..57e9f3e1 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java @@ -52,8 +52,8 @@ class TrainingBlockQueryTest { } @Test - void findEligibleKurrentBlocks_includesManualBlock() { - blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); + void findEligibleKurrentBlocks_includesReviewedManualBlock() { + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true)); List result = blockRepository.findEligibleKurrentBlocks(); @@ -61,6 +61,16 @@ class TrainingBlockQueryTest { assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL); } + @Test + void findEligibleKurrentBlocks_excludesUnreviewedManualBlock() { + // MANUAL blocks filled by guided OCR but not yet reviewed should not count as checked text + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); + + List result = blockRepository.findEligibleKurrentBlocks(); + + assertThat(result).isEmpty(); + } + @Test void findEligibleKurrentBlocks_includesReviewedOcrBlock() { blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true)); @@ -90,10 +100,11 @@ class TrainingBlockQueryTest { } @Test - void findEligibleKurrentBlocks_returnsAllEligibleAcrossBothSources() { - blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); + void findEligibleKurrentBlocks_returnsAllReviewedBlocksAcrossBothSources() { + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true)); blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true)); - blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); // excluded + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded List result = blockRepository.findEligibleKurrentBlocks(); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java index f214f989..12523084 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java @@ -67,6 +67,24 @@ class TrainingDataExportServiceTest { assertThat(zipEntryNames(zipBytes)).isNotEmpty(); } + @Test + void export_excludesUnreviewedManualBlockFromEnrolledDocument() throws Exception { + // MANUAL blocks whose text hasn't been verified yet should not count as training data + UUID docId = enrolledDoc("unreviewed-manual.pdf"); + UUID annotId = annotation(docId); + TranscriptionBlock block = TranscriptionBlock.builder() + .annotationId(annotId).documentId(docId) + .text("Liebe Tante").sortOrder(0) + .source(BlockSource.MANUAL).reviewed(false).build(); + blockRepository.save(block); + + FileService fileService = mockFileService(); + TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService); + + StreamingResponseBody body = service.exportToZip(); + assertThat(zipEntryNames(stream(body))).isEmpty(); + } + @Test void export_excludesManualBlockFromNonEnrolledDocument() throws Exception { UUID docId = nonEnrolledDoc("notenrolled.pdf"); @@ -212,7 +230,7 @@ class TrainingDataExportServiceTest { return TranscriptionBlock.builder() .annotationId(annotId).documentId(docId) .text(text).sortOrder(0) - .source(BlockSource.MANUAL).reviewed(false).build(); + .source(BlockSource.MANUAL).reviewed(true).build(); } private FileService mockFileService() {