From a99afef31988c1345c7d46a17443bb851b51a214 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 13 Apr 2026 18:00:59 +0200 Subject: [PATCH] fix(training): only count reviewed blocks as checked text for recognition Previously all MANUAL blocks counted as eligible training data, even ones where text was filled in by guided OCR but never explicitly reviewed. This caused segmentation and recognition counts to always match. Now only reviewed=true blocks qualify for recognition training, so the counts properly reflect: segments = all drawn annotation boxes, checked text = only boxes where the user has verified the transcription. Co-Authored-By: Claude Sonnet 4.6 --- .../TranscriptionBlockRepository.java | 2 +- .../repository/TrainingBlockQueryTest.java | 21 ++++++++++++++----- .../TrainingDataExportServiceTest.java | 20 +++++++++++++++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java index 09ac5b60..82834365 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/TranscriptionBlockRepository.java @@ -22,7 +22,7 @@ public interface TranscriptionBlockRepository extends JpaRepository findEligibleKurrentBlocks(); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java b/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java index d7406ba0..57e9f3e1 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/repository/TrainingBlockQueryTest.java @@ -52,8 +52,8 @@ class TrainingBlockQueryTest { } @Test - void findEligibleKurrentBlocks_includesManualBlock() { - blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); + void findEligibleKurrentBlocks_includesReviewedManualBlock() { + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true)); List result = blockRepository.findEligibleKurrentBlocks(); @@ -61,6 +61,16 @@ class TrainingBlockQueryTest { assertThat(result.get(0).getSource()).isEqualTo(BlockSource.MANUAL); } + @Test + void findEligibleKurrentBlocks_excludesUnreviewedManualBlock() { + // MANUAL blocks filled by guided OCR but not yet reviewed should not count as checked text + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); + + List result = blockRepository.findEligibleKurrentBlocks(); + + assertThat(result).isEmpty(); + } + @Test void findEligibleKurrentBlocks_includesReviewedOcrBlock() { blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true)); @@ -90,10 +100,11 @@ class TrainingBlockQueryTest { } @Test - void findEligibleKurrentBlocks_returnsAllEligibleAcrossBothSources() { - blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); + void findEligibleKurrentBlocks_returnsAllReviewedBlocksAcrossBothSources() { + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, true)); blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, true)); - blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.MANUAL, false)); // excluded + blockRepository.save(block(kurrentDocId, kurrentAnnotationId, BlockSource.OCR, false)); // excluded List result = blockRepository.findEligibleKurrentBlocks(); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java index f214f989..12523084 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TrainingDataExportServiceTest.java @@ -67,6 +67,24 @@ class TrainingDataExportServiceTest { assertThat(zipEntryNames(zipBytes)).isNotEmpty(); } + @Test + void export_excludesUnreviewedManualBlockFromEnrolledDocument() throws Exception { + // MANUAL blocks whose text hasn't been verified yet should not count as training data + UUID docId = enrolledDoc("unreviewed-manual.pdf"); + UUID annotId = annotation(docId); + TranscriptionBlock block = TranscriptionBlock.builder() + .annotationId(annotId).documentId(docId) + .text("Liebe Tante").sortOrder(0) + .source(BlockSource.MANUAL).reviewed(false).build(); + blockRepository.save(block); + + FileService fileService = mockFileService(); + TrainingDataExportService service = new TrainingDataExportService(blockRepository, annotationRepository, documentRepository, fileService); + + StreamingResponseBody body = service.exportToZip(); + assertThat(zipEntryNames(stream(body))).isEmpty(); + } + @Test void export_excludesManualBlockFromNonEnrolledDocument() throws Exception { UUID docId = nonEnrolledDoc("notenrolled.pdf"); @@ -212,7 +230,7 @@ class TrainingDataExportServiceTest { return TranscriptionBlock.builder() .annotationId(annotId).documentId(docId) .text(text).sortOrder(0) - .source(BlockSource.MANUAL).reviewed(false).build(); + .source(BlockSource.MANUAL).reviewed(true).build(); } private FileService mockFileService() {