From ec32d225b59b369124ae67a81842b55d24581906 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:07:46 +0200 Subject: [PATCH 01/74] docs(adr): add ADR-001 (OCR microservice) and ADR-002 (polygon JSONB) ADR-001 documents the decision to use a separate Python container for OCR (Surya + Kraken), the interface contract, and why alternatives like Tess4J were rejected. ADR-002 documents the decision to store polygon annotations as JSONB with a 4-point CHECK constraint, backed by an AttributeConverter. Refs #226, #227 Co-Authored-By: Claude Sonnet 4.6 --- docs/adr/001-ocr-python-microservice.md | 84 +++++++++++++++++++++++++ docs/adr/002-polygon-jsonb-storage.md | 52 +++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 docs/adr/001-ocr-python-microservice.md create mode 100644 docs/adr/002-polygon-jsonb-storage.md diff --git a/docs/adr/001-ocr-python-microservice.md b/docs/adr/001-ocr-python-microservice.md new file mode 100644 index 00000000..869ff950 --- /dev/null +++ b/docs/adr/001-ocr-python-microservice.md @@ -0,0 +1,84 @@ +# ADR-001: OCR Python Microservice + +## Status + +Accepted + +## Context + +The Familienarchiv needs OCR capability to pre-populate transcription blocks from scanned documents. Two OCR engines are required: + +- **Surya** — transformer-based, handles typewritten and modern Latin handwriting +- **Kraken** — historical HTR model support, required for pre-1941 German Kurrent/Suetterlin scripts + +Both engines exist exclusively in the Python ecosystem. There are no production-quality Java bindings for either engine. Tess4J (Tesseract for Java) was considered but rejected: Tesseract has poor accuracy on degraded historical handwriting and no HTR-United model support. + +The server has no GPU. CPU-only inference is the target (16-32 GB system RAM). + +## Decision + +Introduce a separate Python container (`ocr-service`) that exposes a simple HTTP API. Spring Boot calls this service via `RestClient`. 
The Python service is stateless — all job tracking and business logic remain in Spring Boot. + +**Interface contract:** + +Request: +```json +{ + "pdfUrl": "http://minio:9000/archive-documents/abc.pdf?presigned...", + "scriptType": "HANDWRITING_KURRENT", + "language": "de" +} +``` + +Response: +```json +[ + { + "pageNumber": 0, + "x": 0.12, "y": 0.08, "width": 0.76, "height": 0.04, + "polygon": [[0.12,0.08],[0.88,0.09],[0.87,0.12],[0.13,0.11]], + "text": "Sehr geehrter Herr ..." + } +] +``` + +Coordinates are normalized (0-1) relative to page dimensions. + +**Java-side integration:** + +- `OcrClient` interface with `extractBlocks()` method — mockable for unit tests +- `OcrHealthClient` interface with `isHealthy()` — separate concern from block extraction +- `RestClientOcrClient` implements both interfaces +- `OcrService` orchestrates: presigned URL generation, OCR call, block mapping, TranscriptionService delegation + +**Docker networking:** + +- `ocr-service` is on the internal Docker network only — no host port mapping +- Spring Boot reaches it via `http://ocr-service:8000` +- Health check with `start_period: 60s` to account for model loading (~30-60s on CPU) + +## Alternatives Considered + +| Alternative | Why rejected | +|---|---| +| Tess4J (Tesseract in Java) | No HTR-United model support; poor Kurrent accuracy | +| Calling Python via ProcessBuilder | Fragile, no health checks, model reloading on every call | +| Embedding Python via GraalVM | Experimental, complex dependency management for ML libraries | +| External SaaS OCR (Google Vision, AWS Textract) | Data sovereignty concern for private family documents; no Kurrent support | + +## Consequences + +**Easier:** +- Each engine is used via its native Python API — no bridging complexity +- OCR service can be updated independently of the main application +- Models can be swapped via volume mount without code changes + +**Harder:** +- One additional container to operate (memory, health checks, restarts) +- 
Integration tests require WireMock stub — real OCR service is too slow for CI +- Presigned URL TTL must be managed (15-30 min recommended) + +## Future Direction + +- LISTEN/NOTIFY from PostgreSQL to push progress events when scaling to multiple instances +- GPU acceleration if the server is upgraded — only the Docker image needs to change diff --git a/docs/adr/002-polygon-jsonb-storage.md b/docs/adr/002-polygon-jsonb-storage.md new file mode 100644 index 00000000..6383759c --- /dev/null +++ b/docs/adr/002-polygon-jsonb-storage.md @@ -0,0 +1,52 @@ +# ADR-002: Polygon JSONB Storage for Annotations + +## Status + +Accepted + +## Context + +Document annotations currently store axis-aligned bounding boxes (`x, y, width, height`). Kraken OCR outputs polygon boundaries for text lines — historical handwriting (Kurrent, Suetterlin) produces rotated and curved text that axis-aligned rectangles approximate poorly. + +We need to store an optional quadrilateral (4 corner points) per annotation to represent the precise text region. The polygon is display-only — overlap detection and all server-side geometry logic continues to use the AABB fields. + +## Decision + +Add a `polygon JSONB` column to `document_annotations`: + +```sql +ALTER TABLE document_annotations ADD COLUMN polygon JSONB; +ALTER TABLE document_annotations +ADD CONSTRAINT chk_annotation_polygon_quad + CHECK (polygon IS NULL OR jsonb_array_length(polygon) = 4); +``` + +- `null` means rectangle — render using existing `x, y, width, height` fields (fully backward compatible) +- Non-null value is a normalized 4-point quadrilateral: `[[x1,y1],[x2,y2],[x3,y3],[x4,y4]]` with coordinates in the 0-1 range relative to page dimensions + +The existing AABB fields are always populated (even when a polygon is present) and remain the authoritative geometry for overlap detection. + +**Java entity:** `List> polygon` backed by a custom `AttributeConverter>, String>`. 
No new dependency (Hypersistence Utils is not in the project and won't be added for a single column). + +**Semantic invariant:** `polygon`, if present, is a 4-point quadrilateral with coordinates normalized to [0, 1] relative to page dimensions. It may originate from OCR engine output (Kraken) or from a future manual drawing tool. The AABB fields remain the geometry source of truth for server-side logic. + +## Alternatives Considered + +| Alternative | Why rejected | +|---|---| +| 8 `NUMERIC(8,6)` columns (x1,y1,...,x4,y4) | Verbose, no structural enforcement, awkward to query or extend | +| Separate `annotation_polygons` join table | Unnecessary complexity for a 1:1 optional relationship | +| PostGIS geometry column | Adds a heavyweight extension for a display-only field with no spatial queries | +| `String polygon` on the entity | Requires manual parsing at every callsite; error-prone | + +## Consequences + +**Easier:** +- Backward compatible — all existing annotations continue to work unchanged +- Frontend renders `` or `` based on a simple null check +- Schema can accommodate N-point polygons in the future (JSONB is flexible), though the CHECK constraint currently enforces exactly 4 + +**Harder:** +- Cannot express range checks (`0 <= x <= 1`) as database constraints without a PL/pgSQL function — validated at the DTO layer instead +- No server-side geometry queries on polygon coordinates (acceptable — polygon is display-only) +- AttributeConverter adds a small amount of serialization code to maintain -- 2.49.1 From 878a90a86dda00e5f30dfc43df9db7944b4c2926 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:10:35 +0200 Subject: [PATCH 02/74] feat(annotations): add polygon JSONB support for quadrilateral shapes - V23 migration adds polygon JSONB column with 4-point CHECK constraint - PolygonConverter: AttributeConverter for List> <-> JSONB - @UniquePoints custom validator rejects duplicate coordinates - CreateAnnotationDTO: validated optional polygon 
field - DocumentAnnotation entity: polygon field with converter Refs #227 Co-Authored-By: Claude Sonnet 4.6 --- .../dto/CreateAnnotationDTO.java | 21 +++ .../familienarchiv/dto/UniquePoints.java | 16 +++ .../dto/UniquePointsValidator.java | 16 +++ .../model/DocumentAnnotation.java | 5 + .../model/PolygonConverter.java | 36 +++++ .../V23__add_polygon_to_annotations.sql | 8 ++ .../dto/UniquePointsValidatorTest.java | 124 ++++++++++++++++++ .../model/PolygonConverterTest.java | 65 +++++++++ 8 files changed, 291 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePoints.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePointsValidator.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/PolygonConverter.java create mode 100644 backend/src/main/resources/db/migration/V23__add_polygon_to_annotations.sql create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/dto/UniquePointsValidatorTest.java create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/model/PolygonConverterTest.java diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/CreateAnnotationDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/CreateAnnotationDTO.java index db81687f..846d9321 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/dto/CreateAnnotationDTO.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/CreateAnnotationDTO.java @@ -1,9 +1,15 @@ package org.raddatz.familienarchiv.dto; +import jakarta.validation.Valid; +import jakarta.validation.constraints.DecimalMax; +import jakarta.validation.constraints.DecimalMin; +import jakarta.validation.constraints.Size; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; +import java.util.List; + @Data @NoArgsConstructor @AllArgsConstructor @@ -14,4 +20,19 @@ public class CreateAnnotationDTO { private double width; private double height; private String color; 
+ + @Size(min = 4, max = 4, message = "polygon must have exactly 4 points") + @UniquePoints + @Valid + private List<@Size(min = 2, max = 2, message = "each point must have exactly 2 coordinates") + List<@DecimalMin("0.0") @DecimalMax("1.0") Double>> polygon; + + public CreateAnnotationDTO(int pageNumber, double x, double y, double width, double height, String color) { + this.pageNumber = pageNumber; + this.x = x; + this.y = y; + this.width = width; + this.height = height; + this.color = color; + } } diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePoints.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePoints.java new file mode 100644 index 00000000..6e954094 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePoints.java @@ -0,0 +1,16 @@ +package org.raddatz.familienarchiv.dto; + +import jakarta.validation.Constraint; +import jakarta.validation.Payload; + +import java.lang.annotation.*; + +@Documented +@Constraint(validatedBy = UniquePointsValidator.class) +@Target({ElementType.FIELD}) +@Retention(RetentionPolicy.RUNTIME) +public @interface UniquePoints { + String message() default "polygon must contain 4 unique points"; + Class[] groups() default {}; + Class[] payload() default {}; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePointsValidator.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePointsValidator.java new file mode 100644 index 00000000..eac16820 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/UniquePointsValidator.java @@ -0,0 +1,16 @@ +package org.raddatz.familienarchiv.dto; + +import jakarta.validation.ConstraintValidator; +import jakarta.validation.ConstraintValidatorContext; + +import java.util.HashSet; +import java.util.List; + +public class UniquePointsValidator implements ConstraintValidator>> { + + @Override + public boolean isValid(List> polygon, ConstraintValidatorContext context) { + if (polygon == null) 
return true; + return new HashSet<>(polygon).size() == polygon.size(); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java b/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java index 281f88a2..d4e02258 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java @@ -6,6 +6,7 @@ import lombok.*; import org.hibernate.annotations.CreationTimestamp; import java.time.LocalDateTime; +import java.util.List; import java.util.UUID; @Entity @@ -52,6 +53,10 @@ public class DocumentAnnotation { @Column(name = "file_hash", length = 64) private String fileHash; + @Column(columnDefinition = "jsonb") + @Convert(converter = PolygonConverter.class) + private List> polygon; + @Column(name = "created_by") private UUID createdBy; diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/PolygonConverter.java b/backend/src/main/java/org/raddatz/familienarchiv/model/PolygonConverter.java new file mode 100644 index 00000000..28362e8f --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/PolygonConverter.java @@ -0,0 +1,36 @@ +package org.raddatz.familienarchiv.model; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import jakarta.persistence.AttributeConverter; +import jakarta.persistence.Converter; + +import java.util.List; + +@Converter +public class PolygonConverter implements AttributeConverter>, String> { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final TypeReference>> TYPE_REF = new TypeReference<>() {}; + + @Override + public String convertToDatabaseColumn(List> polygon) { + if (polygon == null) return null; + try { + return MAPPER.writeValueAsString(polygon); + } catch (JsonProcessingException e) { + 
throw new IllegalArgumentException("Failed to serialize polygon", e); + } + } + + @Override + public List> convertToEntityAttribute(String json) { + if (json == null || json.isEmpty()) return null; + try { + return MAPPER.readValue(json, TYPE_REF); + } catch (JsonProcessingException e) { + throw new IllegalArgumentException("Failed to deserialize polygon", e); + } + } +} diff --git a/backend/src/main/resources/db/migration/V23__add_polygon_to_annotations.sql b/backend/src/main/resources/db/migration/V23__add_polygon_to_annotations.sql new file mode 100644 index 00000000..74a4d246 --- /dev/null +++ b/backend/src/main/resources/db/migration/V23__add_polygon_to_annotations.sql @@ -0,0 +1,8 @@ +-- Add optional polygon field for quadrilateral annotation shapes (Kraken OCR output). +-- See ADR-002 for the design decision. + +ALTER TABLE document_annotations ADD COLUMN polygon JSONB; + +ALTER TABLE document_annotations +ADD CONSTRAINT chk_annotation_polygon_quad + CHECK (polygon IS NULL OR jsonb_array_length(polygon) = 4); diff --git a/backend/src/test/java/org/raddatz/familienarchiv/dto/UniquePointsValidatorTest.java b/backend/src/test/java/org/raddatz/familienarchiv/dto/UniquePointsValidatorTest.java new file mode 100644 index 00000000..be2690c4 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/dto/UniquePointsValidatorTest.java @@ -0,0 +1,124 @@ +package org.raddatz.familienarchiv.dto; + +import jakarta.validation.ConstraintViolation; +import jakarta.validation.Validation; +import jakarta.validation.Validator; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +class UniquePointsValidatorTest { + + private final Validator validator = Validation.buildDefaultValidatorFactory().getValidator(); + + @Test + void shouldAcceptNull() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(null); + + Set> violations = validator.validate(dto); + + 
assertThat(violations).noneMatch(v -> v.getPropertyPath().toString().equals("polygon")); + } + + @Test + void shouldAcceptFourUniquePoints() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(0.1, 0.1), + List.of(0.9, 0.1), + List.of(0.9, 0.9), + List.of(0.1, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).noneMatch(v -> v.getPropertyPath().toString().equals("polygon")); + } + + @Test + void shouldRejectDuplicatePoints() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(0.1, 0.1), + List.of(0.1, 0.1), + List.of(0.9, 0.9), + List.of(0.1, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).anyMatch(v -> v.getPropertyPath().toString().equals("polygon")); + } + + @Test + void shouldRejectPolygonWithThreePoints() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(0.1, 0.1), + List.of(0.9, 0.1), + List.of(0.9, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).anyMatch(v -> v.getPropertyPath().toString().equals("polygon")); + } + + @Test + void shouldRejectPolygonWithFivePoints() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(0.1, 0.1), + List.of(0.5, 0.1), + List.of(0.9, 0.1), + List.of(0.9, 0.9), + List.of(0.1, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).anyMatch(v -> v.getPropertyPath().toString().equals("polygon")); + } + + @Test + void shouldRejectCoordinateOutOfRange() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(1.5, 0.1), + List.of(0.9, 0.1), + List.of(0.9, 0.9), + List.of(0.1, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).anyMatch(v -> v.getPropertyPath().toString().contains("polygon")); + } + + @Test + void shouldRejectNegativeCoordinate() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(-0.1, 0.1), + List.of(0.9, 0.1), + 
List.of(0.9, 0.9), + List.of(0.1, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).anyMatch(v -> v.getPropertyPath().toString().contains("polygon")); + } + + @Test + void shouldRejectPointWithOneCoordinate() { + var dto = new CreateAnnotationDTO(); + dto.setPolygon(List.of( + List.of(0.1), + List.of(0.9, 0.1), + List.of(0.9, 0.9), + List.of(0.1, 0.9))); + + Set> violations = validator.validate(dto); + + assertThat(violations).anyMatch(v -> v.getPropertyPath().toString().contains("polygon")); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/model/PolygonConverterTest.java b/backend/src/test/java/org/raddatz/familienarchiv/model/PolygonConverterTest.java new file mode 100644 index 00000000..916cfa2f --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/model/PolygonConverterTest.java @@ -0,0 +1,65 @@ +package org.raddatz.familienarchiv.model; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +class PolygonConverterTest { + + private final PolygonConverter converter = new PolygonConverter(); + + @Test + void convertToDatabaseColumn_returnsNull_whenPolygonIsNull() { + assertThat(converter.convertToDatabaseColumn(null)).isNull(); + } + + @Test + void convertToDatabaseColumn_returnsJsonArray_whenPolygonIsValid() { + List> polygon = List.of( + List.of(0.1, 0.2), + List.of(0.9, 0.2), + List.of(0.9, 0.8), + List.of(0.1, 0.8)); + + String json = converter.convertToDatabaseColumn(polygon); + + assertThat(json).isEqualTo("[[0.1,0.2],[0.9,0.2],[0.9,0.8],[0.1,0.8]]"); + } + + @Test + void convertToEntityAttribute_returnsNull_whenJsonIsNull() { + assertThat(converter.convertToEntityAttribute(null)).isNull(); + } + + @Test + void convertToEntityAttribute_returnsNull_whenJsonIsEmpty() { + assertThat(converter.convertToEntityAttribute("")).isNull(); + } + + @Test + void convertToEntityAttribute_returnsPolygon_whenJsonIsValid() { + String 
json = "[[0.1,0.2],[0.9,0.2],[0.9,0.8],[0.1,0.8]]"; + + List> polygon = converter.convertToEntityAttribute(json); + + assertThat(polygon).hasSize(4); + assertThat(polygon.get(0)).containsExactly(0.1, 0.2); + assertThat(polygon.get(3)).containsExactly(0.1, 0.8); + } + + @Test + void roundTrip_preservesValues() { + List> original = List.of( + List.of(0.12, 0.08), + List.of(0.88, 0.09), + List.of(0.87, 0.14), + List.of(0.11, 0.13)); + + String json = converter.convertToDatabaseColumn(original); + List> restored = converter.convertToEntityAttribute(json); + + assertThat(restored).isEqualTo(original); + } +} -- 2.49.1 From c19c41f8122c6962fad40c03270978ed46f30559 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:12:11 +0200 Subject: [PATCH 03/74] feat(annotations): add createOcrAnnotation that skips overlap check OCR creates many adjacent text line annotations that would fail the existing overlap check. createOcrAnnotation() accepts an optional polygon and bypasses overlap detection entirely. 
Refs #227 Co-Authored-By: Claude Sonnet 4.6 --- .../service/AnnotationService.java | 20 ++++++++ .../service/AnnotationServiceTest.java | 49 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/AnnotationService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/AnnotationService.java index f52c70b0..6735ef31 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/AnnotationService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/AnnotationService.java @@ -48,6 +48,26 @@ public class AnnotationService { return annotationRepository.save(annotation); } + @Transactional + public DocumentAnnotation createOcrAnnotation(UUID documentId, CreateAnnotationDTO dto, + UUID userId, String fileHash, + List> polygon) { + DocumentAnnotation annotation = DocumentAnnotation.builder() + .documentId(documentId) + .pageNumber(dto.getPageNumber()) + .x(dto.getX()) + .y(dto.getY()) + .width(dto.getWidth()) + .height(dto.getHeight()) + .color(dto.getColor()) + .fileHash(fileHash) + .createdBy(userId) + .polygon(polygon) + .build(); + + return annotationRepository.save(annotation); + } + @Transactional public void deleteAnnotation(UUID documentId, UUID annotationId, UUID userId) { DocumentAnnotation annotation = annotationRepository diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/AnnotationServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/AnnotationServiceTest.java index 2605cfb1..37652179 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/AnnotationServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/AnnotationServiceTest.java @@ -260,6 +260,55 @@ class AnnotationServiceTest { verify(annotationRepository).save(any()); } + // ─── createOcrAnnotation ────────────────────────────────────────────────── + + @Test + void createOcrAnnotation_skipsOverlapCheck_andSavesWithPolygon() { + 
UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + CreateAnnotationDTO dto = new CreateAnnotationDTO(1, 0.1, 0.1, 0.8, 0.04, "#00C7B1"); + List> polygon = List.of( + List.of(0.1, 0.1), List.of(0.9, 0.11), + List.of(0.89, 0.14), List.of(0.11, 0.13)); + when(annotationRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + DocumentAnnotation result = annotationService.createOcrAnnotation( + docId, dto, userId, "filehash", polygon); + + assertThat(result.getPolygon()).isEqualTo(polygon); + assertThat(result.getDocumentId()).isEqualTo(docId); + verify(annotationRepository).save(any()); + verify(annotationRepository, never()).findByDocumentIdAndPageNumber(any(), any(int.class)); + } + + @Test + void createOcrAnnotation_savesWithNullPolygon_whenPolygonNotProvided() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + CreateAnnotationDTO dto = new CreateAnnotationDTO(1, 0.1, 0.1, 0.8, 0.04, "#00C7B1"); + when(annotationRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + DocumentAnnotation result = annotationService.createOcrAnnotation( + docId, dto, userId, "filehash", null); + + assertThat(result.getPolygon()).isNull(); + verify(annotationRepository).save(any()); + } + + @Test + void createOcrAnnotation_doesNotCheckOverlap_evenWhenOverlappingAnnotationExists() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + CreateAnnotationDTO dto = new CreateAnnotationDTO(1, 0.1, 0.1, 0.3, 0.3, "#00C7B1"); + when(annotationRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + annotationService.createOcrAnnotation(docId, dto, userId, "hash", null); + + verify(annotationRepository, never()).findByDocumentIdAndPageNumber(any(), any(int.class)); + } + + // ─── overlaps — partial overlap cases ──────────────────────────────────── + @Test void createAnnotation_noConflict_whenAnnotationIsAbove() { // x ranges overlap, y ranges don't — existing is ABOVE the new annotation -- 2.49.1 From 
d194b6b22581d4e5ed03f6687393ecfa41acc532 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:13:42 +0200 Subject: [PATCH 04/74] feat(documents): add ScriptType enum and script_type column - ScriptType enum: UNKNOWN, TYPEWRITER, HANDWRITING_LATIN, HANDWRITING_KURRENT - V24 migration adds script_type VARCHAR(30) NOT NULL DEFAULT 'UNKNOWN' - Document entity: scriptType field with @Builder.Default UNKNOWN - DocumentUpdateDTO: optional scriptType field - DocumentService: wires scriptType through update method Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- .../org/raddatz/familienarchiv/dto/DocumentUpdateDTO.java | 2 ++ .../java/org/raddatz/familienarchiv/model/Document.java | 6 ++++++ .../java/org/raddatz/familienarchiv/model/ScriptType.java | 8 ++++++++ .../raddatz/familienarchiv/service/DocumentService.java | 4 ++++ .../db/migration/V24__add_script_type_to_documents.sql | 1 + 5 files changed, 21 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/ScriptType.java create mode 100644 backend/src/main/resources/db/migration/V24__add_script_type_to_documents.sql diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/DocumentUpdateDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/DocumentUpdateDTO.java index 79789f24..2cf39760 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/dto/DocumentUpdateDTO.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/DocumentUpdateDTO.java @@ -5,6 +5,7 @@ import java.util.List; import java.util.UUID; import lombok.Data; +import org.raddatz.familienarchiv.model.ScriptType; @Data public class DocumentUpdateDTO { @@ -18,4 +19,5 @@ public class DocumentUpdateDTO { private List receiverIds; private String tags; private Boolean metadataComplete; + private ScriptType scriptType; } diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/Document.java b/backend/src/main/java/org/raddatz/familienarchiv/model/Document.java index 
f72e3f5e..e5be77a3 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/model/Document.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/Document.java @@ -91,6 +91,12 @@ public class Document { @Builder.Default private boolean metadataComplete = false; + @Enumerated(EnumType.STRING) + @Column(name = "script_type", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private ScriptType scriptType = ScriptType.UNKNOWN; + @ManyToMany(fetch = FetchType.EAGER) @JoinTable(name = "document_receivers", joinColumns = @JoinColumn(name = "document_id"), inverseJoinColumns = @JoinColumn(name = "person_id")) @Builder.Default diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/ScriptType.java b/backend/src/main/java/org/raddatz/familienarchiv/model/ScriptType.java new file mode 100644 index 00000000..b6ff83e4 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/ScriptType.java @@ -0,0 +1,8 @@ +package org.raddatz.familienarchiv.model; + +public enum ScriptType { + UNKNOWN, + TYPEWRITER, + HANDWRITING_LATIN, + HANDWRITING_KURRENT +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java index 7d1bef2b..e3a6aea0 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/DocumentService.java @@ -222,6 +222,10 @@ public class DocumentService { doc.setMetadataComplete(dto.getMetadataComplete()); } + if (dto.getScriptType() != null) { + doc.setScriptType(dto.getScriptType()); + } + // 4. 
Datei austauschen (nur wenn eine neue ausgewählt wurde) if (newFile != null && !newFile.isEmpty()) { FileService.UploadResult upload = fileService.uploadFile(newFile, newFile.getOriginalFilename()); diff --git a/backend/src/main/resources/db/migration/V24__add_script_type_to_documents.sql b/backend/src/main/resources/db/migration/V24__add_script_type_to_documents.sql new file mode 100644 index 00000000..87a48a00 --- /dev/null +++ b/backend/src/main/resources/db/migration/V24__add_script_type_to_documents.sql @@ -0,0 +1 @@ +ALTER TABLE documents ADD COLUMN script_type VARCHAR(30) NOT NULL DEFAULT 'UNKNOWN'; -- 2.49.1 From ff3990710e54f4b2f0f809888758cf8d6bf82be7 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:15:16 +0200 Subject: [PATCH 05/74] feat(ocr): add OCR infrastructure (interfaces, entities, migrations, DTOs) - OcrClient + OcrHealthClient interfaces for testable OCR integration - OcrBlockResult record for OCR engine response mapping - OcrJob + OcrJobDocument entities with status enums - V25 migration creates ocr_jobs and ocr_job_documents tables - Repositories for job and job-document queries - TriggerOcrDTO, BatchOcrDTO (@Size max=500), OcrStatusDTO - ErrorCodes: OCR_SERVICE_UNAVAILABLE, OCR_JOB_NOT_FOUND, OCR_DOCUMENT_NOT_UPLOADED, OCR_PROCESSING_FAILED Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- .../familienarchiv/dto/BatchOcrDTO.java | 19 ++++++ .../familienarchiv/dto/OcrStatusDTO.java | 19 ++++++ .../familienarchiv/dto/TriggerOcrDTO.java | 13 ++++ .../familienarchiv/exception/ErrorCode.java | 10 +++ .../model/OcrDocumentStatus.java | 9 +++ .../raddatz/familienarchiv/model/OcrJob.java | 62 +++++++++++++++++++ .../familienarchiv/model/OcrJobDocument.java | 59 ++++++++++++++++++ .../familienarchiv/model/OcrJobStatus.java | 8 +++ .../repository/OcrJobDocumentRepository.java | 20 ++++++ .../repository/OcrJobRepository.java | 9 +++ .../service/OcrBlockResult.java | 13 ++++ .../familienarchiv/service/OcrClient.java | 9 +++ 
.../service/OcrHealthClient.java | 5 ++ .../db/migration/V25__add_ocr_job_tables.sql | 26 ++++++++ 14 files changed, 281 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java create mode 100644 backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java new file mode 100644 index 00000000..69506437 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java @@ -0,0 +1,19 @@ +package org.raddatz.familienarchiv.dto; + +import jakarta.validation.constraints.NotEmpty; +import jakarta.validation.constraints.Size; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.List; +import java.util.UUID; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public 
class BatchOcrDTO { + @NotEmpty + @Size(max = 500, message = "batch size must not exceed 500 documents") + private List documentIds; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java new file mode 100644 index 00000000..c23ca303 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java @@ -0,0 +1,19 @@ +package org.raddatz.familienarchiv.dto; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.UUID; + +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OcrStatusDTO { + private String status; + private UUID jobId; + private int currentPage; + private int totalPages; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java new file mode 100644 index 00000000..dda443b3 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java @@ -0,0 +1,13 @@ +package org.raddatz.familienarchiv.dto; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.raddatz.familienarchiv.model.ScriptType; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class TriggerOcrDTO { + private ScriptType scriptType; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java index b105df54..e3b0c99c 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java @@ -66,6 +66,16 @@ public enum ErrorCode { /** The notification with the given ID does not exist. 404 */ NOTIFICATION_NOT_FOUND, + // --- OCR --- + /** The OCR service is not available or not healthy. 
503 */ + OCR_SERVICE_UNAVAILABLE, + /** The OCR job with the given ID does not exist. 404 */ + OCR_JOB_NOT_FOUND, + /** The document is not in UPLOADED status and cannot be OCR'd. 400 */ + OCR_DOCUMENT_NOT_UPLOADED, + /** OCR processing failed for the document. 500 */ + OCR_PROCESSING_FAILED, + // --- Generic --- /** Request validation failed (missing or malformed fields). 400 */ VALIDATION_ERROR, diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java new file mode 100644 index 00000000..d96620b3 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java @@ -0,0 +1,9 @@ +package org.raddatz.familienarchiv.model; + +public enum OcrDocumentStatus { + PENDING, + RUNNING, + DONE, + FAILED, + SKIPPED +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java new file mode 100644 index 00000000..81f205fe --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java @@ -0,0 +1,62 @@ +package org.raddatz.familienarchiv.model; + +import io.swagger.v3.oas.annotations.media.Schema; +import jakarta.persistence.*; +import lombok.*; +import org.hibernate.annotations.CreationTimestamp; +import org.hibernate.annotations.UpdateTimestamp; + +import java.time.LocalDateTime; +import java.util.UUID; + +@Entity +@Table(name = "ocr_jobs") +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OcrJob { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID id; + + @Enumerated(EnumType.STRING) + @Column(nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private OcrJobStatus status = OcrJobStatus.PENDING; + + @Column(name = "total_documents", nullable = false) + @Schema(requiredMode = 
Schema.RequiredMode.REQUIRED) + private int totalDocuments; + + @Column(name = "processed_documents", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private int processedDocuments = 0; + + @Column(name = "error_count", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private int errorCount = 0; + + @Column(name = "skipped_count", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private int skippedCount = 0; + + @Column(name = "created_by") + private UUID createdBy; + + @Column(name = "created_at", nullable = false, updatable = false) + @CreationTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime createdAt; + + @Column(name = "updated_at", nullable = false) + @UpdateTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime updatedAt; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java new file mode 100644 index 00000000..c8f3f702 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java @@ -0,0 +1,59 @@ +package org.raddatz.familienarchiv.model; + +import io.swagger.v3.oas.annotations.media.Schema; +import jakarta.persistence.*; +import lombok.*; +import org.hibernate.annotations.CreationTimestamp; +import org.hibernate.annotations.UpdateTimestamp; + +import java.time.LocalDateTime; +import java.util.UUID; + +@Entity +@Table(name = "ocr_job_documents") +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OcrJobDocument { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID id; + + @Column(name = "job_id", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID jobId; + + @Column(name = 
"document_id", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID documentId; + + @Enumerated(EnumType.STRING) + @Column(nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private OcrDocumentStatus status = OcrDocumentStatus.PENDING; + + @Column(name = "error_message") + private String errorMessage; + + @Column(name = "current_page") + @Builder.Default + private int currentPage = 0; + + @Column(name = "total_pages") + @Builder.Default + private int totalPages = 0; + + @Column(name = "created_at", nullable = false, updatable = false) + @CreationTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime createdAt; + + @Column(name = "updated_at", nullable = false) + @UpdateTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime updatedAt; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java new file mode 100644 index 00000000..5f1bf442 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java @@ -0,0 +1,8 @@ +package org.raddatz.familienarchiv.model; + +public enum OcrJobStatus { + PENDING, + RUNNING, + DONE, + FAILED +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java new file mode 100644 index 00000000..3d781804 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java @@ -0,0 +1,20 @@ +package org.raddatz.familienarchiv.repository; + +import org.raddatz.familienarchiv.model.OcrDocumentStatus; +import org.raddatz.familienarchiv.model.OcrJobDocument; +import org.springframework.data.jpa.repository.JpaRepository; + +import java.util.List; +import java.util.Optional; +import java.util.UUID; 
+ +public interface OcrJobDocumentRepository extends JpaRepository { + + List findByJobIdOrderByCreatedAtAsc(UUID jobId); + + List findByJobIdAndStatus(UUID jobId, OcrDocumentStatus status); + + Optional findByJobIdAndDocumentId(UUID jobId, UUID documentId); + + Optional findFirstByDocumentIdAndStatusIn(UUID documentId, List statuses); +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java new file mode 100644 index 00000000..5d319ccf --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java @@ -0,0 +1,9 @@ +package org.raddatz.familienarchiv.repository; + +import org.raddatz.familienarchiv.model.OcrJob; +import org.springframework.data.jpa.repository.JpaRepository; + +import java.util.UUID; + +public interface OcrJobRepository extends JpaRepository { +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java new file mode 100644 index 00000000..e01b7def --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java @@ -0,0 +1,13 @@ +package org.raddatz.familienarchiv.service; + +import java.util.List; + +public record OcrBlockResult( + int pageNumber, + double x, + double y, + double width, + double height, + List> polygon, + String text +) {} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java new file mode 100644 index 00000000..3b33aaf2 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java @@ -0,0 +1,9 @@ +package org.raddatz.familienarchiv.service; + +import org.raddatz.familienarchiv.model.ScriptType; + +import java.util.List; + +public interface OcrClient { + List extractBlocks(String pdfUrl, ScriptType 
scriptType); +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java new file mode 100644 index 00000000..3a62f592 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java @@ -0,0 +1,5 @@ +package org.raddatz.familienarchiv.service; + +public interface OcrHealthClient { + boolean isHealthy(); +} diff --git a/backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql b/backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql new file mode 100644 index 00000000..a9f6945c --- /dev/null +++ b/backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql @@ -0,0 +1,26 @@ +CREATE TABLE ocr_jobs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + status VARCHAR(20) NOT NULL DEFAULT 'PENDING', + total_documents INT NOT NULL, + processed_documents INT NOT NULL DEFAULT 0, + error_count INT NOT NULL DEFAULT 0, + skipped_count INT NOT NULL DEFAULT 0, + created_by UUID, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE TABLE ocr_job_documents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + job_id UUID NOT NULL REFERENCES ocr_jobs(id) ON DELETE CASCADE, + document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, + status VARCHAR(20) NOT NULL DEFAULT 'PENDING', + error_message TEXT, + current_page INT DEFAULT 0, + total_pages INT DEFAULT 0, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE INDEX idx_ocr_job_documents_job_id ON ocr_job_documents(job_id); +CREATE INDEX idx_ocr_job_documents_document_id ON ocr_job_documents(document_id); -- 2.49.1 From aea46c5fd07f73e3cc10663447819057a0d92050 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:24:15 +0200 Subject: [PATCH 06/74] feat(ocr): add OcrService, OcrBatchService, OcrProgressService, 
OcrController - OcrService: single-document OCR (health check, block clearing, presigned URL, annotation + block creation) - OcrBatchService: batch processing with @Async, per-document status tracking, SKIPPED for PLACEHOLDER documents, failure isolation - OcrProgressService: SSE emitter registry per job ID with 5-min timeout - OcrController: POST /api/documents/{id}/ocr (WRITE_ALL), POST /api/ocr/batch (ADMIN), GET /api/ocr/jobs/{id} (READ_ALL), GET /api/ocr/jobs/{id}/progress (SSE), GET /api/documents/{id}/ocr-status 19 tests: 6 OcrService, 4 OcrBatchService, 3 OcrProgressService, 6 OcrController Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- .../controller/OcrController.java | 114 ++++++++++++ .../service/OcrBatchService.java | 114 ++++++++++++ .../service/OcrProgressService.java | 69 +++++++ .../familienarchiv/service/OcrService.java | 120 ++++++++++++ .../controller/OcrControllerTest.java | 138 ++++++++++++++ .../service/OcrBatchServiceTest.java | 142 ++++++++++++++ .../service/OcrProgressServiceTest.java | 33 ++++ .../service/OcrServiceTest.java | 176 ++++++++++++++++++ 8 files changed, 906 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/controller/OcrController.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrBatchService.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrProgressService.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/controller/OcrControllerTest.java create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/service/OcrBatchServiceTest.java create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/service/OcrProgressServiceTest.java create mode 100644 backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java diff --git 
a/backend/src/main/java/org/raddatz/familienarchiv/controller/OcrController.java b/backend/src/main/java/org/raddatz/familienarchiv/controller/OcrController.java new file mode 100644 index 00000000..bd1e41f9 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/controller/OcrController.java @@ -0,0 +1,114 @@ +package org.raddatz.familienarchiv.controller; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.dto.BatchOcrDTO; +import org.raddatz.familienarchiv.dto.OcrStatusDTO; +import org.raddatz.familienarchiv.dto.TriggerOcrDTO; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.model.*; +import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository; +import org.raddatz.familienarchiv.repository.OcrJobRepository; +import org.raddatz.familienarchiv.security.Permission; +import org.raddatz.familienarchiv.security.RequirePermission; +import org.raddatz.familienarchiv.service.OcrBatchService; +import org.raddatz.familienarchiv.service.OcrProgressService; +import org.raddatz.familienarchiv.service.OcrService; +import org.raddatz.familienarchiv.service.UserService; +import org.springframework.http.HttpStatus; +import org.springframework.http.MediaType; +import org.springframework.security.core.Authentication; +import org.springframework.web.bind.annotation.*; +import org.springframework.web.servlet.mvc.method.annotation.SseEmitter; + +import jakarta.validation.Valid; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; + +@RestController +@RequiredArgsConstructor +@Slf4j +public class OcrController { + + private final OcrService ocrService; + private final OcrBatchService ocrBatchService; + private final OcrProgressService ocrProgressService; + private final OcrJobRepository ocrJobRepository; + private final OcrJobDocumentRepository 
ocrJobDocumentRepository; + private final UserService userService; + + @PostMapping("/api/documents/{documentId}/ocr") + @ResponseStatus(HttpStatus.ACCEPTED) + @RequirePermission(Permission.WRITE_ALL) + public Map triggerOcr( + @PathVariable UUID documentId, + @RequestBody TriggerOcrDTO dto, + Authentication authentication) { + UUID userId = resolveUserId(authentication); + UUID jobId = ocrService.startOcr(documentId, dto.getScriptType(), userId); + return Map.of("jobId", jobId); + } + + @PostMapping("/api/ocr/batch") + @ResponseStatus(HttpStatus.ACCEPTED) + @RequirePermission(Permission.ADMIN) + public Map triggerBatch( + @RequestBody @Valid BatchOcrDTO dto, + Authentication authentication) { + UUID userId = resolveUserId(authentication); + UUID jobId = ocrBatchService.startBatch(dto.getDocumentIds(), userId); + return Map.of("jobId", jobId); + } + + @GetMapping("/api/ocr/jobs/{jobId}") + @RequirePermission(Permission.READ_ALL) + public OcrJob getJobStatus(@PathVariable UUID jobId) { + return ocrJobRepository.findById(jobId) + .orElseThrow(() -> DomainException.notFound( + ErrorCode.OCR_JOB_NOT_FOUND, "OCR job not found: " + jobId)); + } + + @GetMapping(value = "/api/ocr/jobs/{jobId}/progress", produces = MediaType.TEXT_EVENT_STREAM_VALUE) + @RequirePermission(Permission.READ_ALL) + public SseEmitter streamProgress(@PathVariable UUID jobId) { + ocrJobRepository.findById(jobId) + .orElseThrow(() -> DomainException.notFound( + ErrorCode.OCR_JOB_NOT_FOUND, "OCR job not found: " + jobId)); + return ocrProgressService.register(jobId); + } + + @GetMapping("/api/documents/{documentId}/ocr-status") + @RequirePermission(Permission.READ_ALL) + public OcrStatusDTO getDocumentOcrStatus(@PathVariable UUID documentId) { + List activeStatuses = List.of( + OcrDocumentStatus.PENDING, OcrDocumentStatus.RUNNING); + + Optional activeJobDoc = ocrJobDocumentRepository + .findFirstByDocumentIdAndStatusIn(documentId, activeStatuses); + + if (activeJobDoc.isEmpty()) { + return 
OcrStatusDTO.builder().status("NONE").build(); + } + + OcrJobDocument jobDoc = activeJobDoc.get(); + return OcrStatusDTO.builder() + .status(jobDoc.getStatus().name()) + .jobId(jobDoc.getJobId()) + .currentPage(jobDoc.getCurrentPage()) + .totalPages(jobDoc.getTotalPages()) + .build(); + } + + private UUID resolveUserId(Authentication authentication) { + if (authentication == null || !authentication.isAuthenticated()) return null; + try { + AppUser user = userService.findByUsername(authentication.getName()); + return user != null ? user.getId() : null; + } catch (Exception e) { + return null; + } + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBatchService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBatchService.java new file mode 100644 index 00000000..52639c36 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBatchService.java @@ -0,0 +1,114 @@ +package org.raddatz.familienarchiv.service; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.model.*; +import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository; +import org.raddatz.familienarchiv.repository.OcrJobRepository; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.util.List; +import java.util.Map; +import java.util.UUID; + +@Service +@RequiredArgsConstructor +@Slf4j +public class OcrBatchService { + + private final OcrService ocrService; + private final OcrHealthClient ocrHealthClient; + private final DocumentService documentService; + private final OcrJobRepository ocrJobRepository; + private final OcrJobDocumentRepository ocrJobDocumentRepository; + private final OcrProgressService ocrProgressService; + + public UUID startBatch(List documentIds, UUID userId) { + if 
(!ocrHealthClient.isHealthy()) { + throw DomainException.internal(ErrorCode.OCR_SERVICE_UNAVAILABLE, + "OCR service is not available"); + } + + OcrJob job = OcrJob.builder() + .totalDocuments(documentIds.size()) + .createdBy(userId) + .status(OcrJobStatus.PENDING) + .build(); + job = ocrJobRepository.save(job); + + for (UUID docId : documentIds) { + OcrJobDocument jobDoc = OcrJobDocument.builder() + .jobId(job.getId()) + .documentId(docId) + .status(OcrDocumentStatus.PENDING) + .build(); + ocrJobDocumentRepository.save(jobDoc); + } + + processBatchAsync(job.getId(), userId); + return job.getId(); + } + + @Async + void processBatchAsync(UUID jobId, UUID userId) { + OcrJob job = ocrJobRepository.findById(jobId).orElse(null); + if (job == null) return; + + job.setStatus(OcrJobStatus.RUNNING); + ocrJobRepository.save(job); + + List jobDocs = ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId); + + for (OcrJobDocument jobDoc : jobDocs) { + Document doc = documentService.getDocumentById(jobDoc.getDocumentId()); + + if (doc.getStatus() == DocumentStatus.PLACEHOLDER) { + jobDoc.setStatus(OcrDocumentStatus.SKIPPED); + ocrJobDocumentRepository.save(jobDoc); + job.setSkippedCount(job.getSkippedCount() + 1); + ocrJobRepository.save(job); + ocrProgressService.emit(jobId, "document", Map.of( + "documentId", jobDoc.getDocumentId(), + "status", "SKIPPED", + "processed", job.getProcessedDocuments(), + "total", job.getTotalDocuments())); + continue; + } + + jobDoc.setStatus(OcrDocumentStatus.RUNNING); + ocrJobDocumentRepository.save(jobDoc); + + try { + ocrService.processDocument(jobDoc.getDocumentId(), doc, userId); + jobDoc.setStatus(OcrDocumentStatus.DONE); + job.setProcessedDocuments(job.getProcessedDocuments() + 1); + } catch (Exception e) { + log.error("OCR batch: failed document {}", jobDoc.getDocumentId(), e); + jobDoc.setStatus(OcrDocumentStatus.FAILED); + jobDoc.setErrorMessage(e.getMessage()); + job.setErrorCount(job.getErrorCount() + 1); + } + + 
ocrJobDocumentRepository.save(jobDoc); + ocrJobRepository.save(job); + + ocrProgressService.emit(jobId, "document", Map.of( + "documentId", jobDoc.getDocumentId(), + "status", jobDoc.getStatus().name(), + "processed", job.getProcessedDocuments(), + "total", job.getTotalDocuments())); + } + + job.setStatus(OcrJobStatus.DONE); + ocrJobRepository.save(job); + + ocrProgressService.emit(jobId, "done", Map.of( + "processed", job.getProcessedDocuments(), + "errors", job.getErrorCount(), + "skipped", job.getSkippedCount())); + ocrProgressService.complete(jobId); + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrProgressService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrProgressService.java new file mode 100644 index 00000000..8b3bc798 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrProgressService.java @@ -0,0 +1,69 @@ +package org.raddatz.familienarchiv.service; + +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import org.springframework.web.servlet.mvc.method.annotation.SseEmitter; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; + +@Service +@Slf4j +public class OcrProgressService { + + private static final long SSE_TIMEOUT = 5 * 60 * 1000L; + + private final ConcurrentHashMap> emitters = new ConcurrentHashMap<>(); + + public SseEmitter register(UUID jobId) { + SseEmitter emitter = new SseEmitter(SSE_TIMEOUT); + emitters.computeIfAbsent(jobId, k -> new CopyOnWriteArrayList<>()).add(emitter); + + emitter.onCompletion(() -> removeEmitter(jobId, emitter)); + emitter.onTimeout(() -> removeEmitter(jobId, emitter)); + emitter.onError(e -> removeEmitter(jobId, emitter)); + + return emitter; + } + + public void emit(UUID jobId, String eventType, Object data) { + List jobEmitters = emitters.get(jobId); + if 
(jobEmitters == null) return; + + for (SseEmitter emitter : jobEmitters) { + try { + emitter.send(SseEmitter.event().name(eventType).data(data)); + } catch (IOException e) { + log.debug("SSE send failed for job {} — removing emitter", jobId); + removeEmitter(jobId, emitter); + } + } + } + + public void complete(UUID jobId) { + List jobEmitters = emitters.remove(jobId); + if (jobEmitters == null) return; + + for (SseEmitter emitter : jobEmitters) { + try { + emitter.complete(); + } catch (Exception e) { + log.debug("SSE complete failed for job {}", jobId); + } + } + } + + private void removeEmitter(UUID jobId, SseEmitter emitter) { + List jobEmitters = emitters.get(jobId); + if (jobEmitters != null) { + jobEmitters.remove(emitter); + if (jobEmitters.isEmpty()) { + emitters.remove(jobId); + } + } + } +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java new file mode 100644 index 00000000..5ec7a2f1 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java @@ -0,0 +1,120 @@ +package org.raddatz.familienarchiv.service; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.dto.CreateAnnotationDTO; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.model.*; +import org.raddatz.familienarchiv.repository.OcrJobRepository; +import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +import java.util.List; +import java.util.UUID; + +@Service +@RequiredArgsConstructor +@Slf4j +public class OcrService { + + private static final String OCR_ANNOTATION_COLOR = "#00C7B1"; + + private final 
OcrClient ocrClient; + private final OcrHealthClient ocrHealthClient; + private final DocumentService documentService; + private final TranscriptionService transcriptionService; + private final AnnotationService annotationService; + private final TranscriptionBlockRepository blockRepository; + private final OcrJobRepository ocrJobRepository; + + @Value("${app.s3.internal-url:http://minio:9000}") + private String s3InternalUrl; + + @Value("${app.s3.bucket}") + private String bucketName; + + @Transactional + public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) { + Document doc = documentService.getDocumentById(documentId); + + if (doc.getStatus() == DocumentStatus.PLACEHOLDER) { + throw DomainException.badRequest(ErrorCode.OCR_DOCUMENT_NOT_UPLOADED, + "Document has no file attached: " + documentId); + } + + if (!ocrHealthClient.isHealthy()) { + throw DomainException.internal(ErrorCode.OCR_SERVICE_UNAVAILABLE, + "OCR service is not available"); + } + + if (scriptTypeOverride != null) { + doc.setScriptType(scriptTypeOverride); + } + + OcrJob job = OcrJob.builder() + .totalDocuments(1) + .createdBy(userId) + .status(OcrJobStatus.RUNNING) + .build(); + job = ocrJobRepository.save(job); + + try { + processDocument(documentId, doc, userId); + job.setStatus(OcrJobStatus.DONE); + job.setProcessedDocuments(1); + } catch (Exception e) { + log.error("OCR processing failed for document {}", documentId, e); + job.setStatus(OcrJobStatus.FAILED); + job.setErrorCount(1); + } + + ocrJobRepository.save(job); + return job.getId(); + } + + void processDocument(UUID documentId, Document doc, UUID userId) { + clearExistingBlocks(documentId); + + String pdfUrl = buildInternalUrl(doc.getFilePath()); + List blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType()); + createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash()); + } + + private void clearExistingBlocks(UUID documentId) { + List existing = 
transcriptionService.listBlocks(documentId); + for (TranscriptionBlock block : existing) { + transcriptionService.deleteBlock(documentId, block.getId()); + } + } + + private void createTranscriptionBlocks(UUID documentId, List blocks, + UUID userId, String fileHash) { + for (int i = 0; i < blocks.size(); i++) { + OcrBlockResult block = blocks.get(i); + + CreateAnnotationDTO annotationDTO = new CreateAnnotationDTO( + block.pageNumber(), block.x(), block.y(), + block.width(), block.height(), OCR_ANNOTATION_COLOR); + + DocumentAnnotation annotation = annotationService.createOcrAnnotation( + documentId, annotationDTO, userId, fileHash, block.polygon()); + + TranscriptionBlock transcriptionBlock = TranscriptionBlock.builder() + .annotationId(annotation.getId()) + .documentId(documentId) + .text(block.text() != null ? block.text() : "") + .sortOrder(i) + .createdBy(userId) + .updatedBy(userId) + .build(); + blockRepository.save(transcriptionBlock); + } + } + + String buildInternalUrl(String filePath) { + return s3InternalUrl + "/" + bucketName + "/" + filePath; + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/controller/OcrControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/controller/OcrControllerTest.java new file mode 100644 index 00000000..aef427ef --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/controller/OcrControllerTest.java @@ -0,0 +1,138 @@ +package org.raddatz.familienarchiv.controller; + +import tools.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.raddatz.familienarchiv.config.SecurityConfig; +import org.raddatz.familienarchiv.dto.BatchOcrDTO; +import org.raddatz.familienarchiv.dto.TriggerOcrDTO; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.model.*; +import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository; +import 
org.raddatz.familienarchiv.repository.OcrJobRepository; +import org.raddatz.familienarchiv.security.PermissionAspect; +import org.raddatz.familienarchiv.service.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.aop.AopAutoConfiguration; +import org.springframework.boot.webmvc.test.autoconfigure.WebMvcTest; +import org.springframework.context.annotation.Import; +import org.springframework.http.MediaType; +import org.springframework.security.test.context.support.WithMockUser; +import org.springframework.test.context.bean.override.mockito.MockitoBean; +import org.springframework.test.web.servlet.MockMvc; + +import java.util.List; +import java.util.Optional; +import java.util.UUID; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.when; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; + +@WebMvcTest(OcrController.class) +@Import({SecurityConfig.class, PermissionAspect.class, AopAutoConfiguration.class}) +class OcrControllerTest { + + @Autowired MockMvc mockMvc; + private final ObjectMapper objectMapper = new ObjectMapper(); + + @MockitoBean OcrService ocrService; + @MockitoBean OcrBatchService ocrBatchService; + @MockitoBean OcrProgressService ocrProgressService; + @MockitoBean OcrJobRepository ocrJobRepository; + @MockitoBean OcrJobDocumentRepository ocrJobDocumentRepository; + @MockitoBean UserService userService; + @MockitoBean CustomUserDetailsService customUserDetailsService; + + @Test + @WithMockUser(authorities = "WRITE_ALL") + void triggerOcr_returns202_withJobId() throws Exception { + UUID docId = 
UUID.randomUUID(); + UUID jobId = UUID.randomUUID(); + TriggerOcrDTO dto = new TriggerOcrDTO(ScriptType.TYPEWRITER); + + when(ocrService.startOcr(eq(docId), eq(ScriptType.TYPEWRITER), any())).thenReturn(jobId); + + mockMvc.perform(post("/api/documents/{id}/ocr", docId) + .contentType(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(dto))) + .andExpect(status().isAccepted()) + .andExpect(jsonPath("$.jobId").value(jobId.toString())); + } + + @Test + @WithMockUser(authorities = "WRITE_ALL") + void triggerOcr_returns400_whenDocumentNotUploaded() throws Exception { + UUID docId = UUID.randomUUID(); + when(ocrService.startOcr(eq(docId), any(), any())) + .thenThrow(DomainException.badRequest(ErrorCode.OCR_DOCUMENT_NOT_UPLOADED, "Not uploaded")); + + mockMvc.perform(post("/api/documents/{id}/ocr", docId) + .contentType(MediaType.APPLICATION_JSON) + .content("{}")) + .andExpect(status().isBadRequest()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getJobStatus_returns404_whenJobNotFound() throws Exception { + UUID jobId = UUID.randomUUID(); + when(ocrJobRepository.findById(jobId)).thenReturn(Optional.empty()); + + mockMvc.perform(get("/api/ocr/jobs/{jobId}", jobId)) + .andExpect(status().isNotFound()); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getJobStatus_returnsJobInfo_whenFound() throws Exception { + UUID jobId = UUID.randomUUID(); + OcrJob job = OcrJob.builder() + .id(jobId) + .status(OcrJobStatus.RUNNING) + .totalDocuments(10) + .processedDocuments(3) + .errorCount(1) + .skippedCount(0) + .build(); + when(ocrJobRepository.findById(jobId)).thenReturn(Optional.of(job)); + + mockMvc.perform(get("/api/ocr/jobs/{jobId}", jobId)) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.status").value("RUNNING")) + .andExpect(jsonPath("$.totalDocuments").value(10)) + .andExpect(jsonPath("$.processedDocuments").value(3)); + } + + @Test + @WithMockUser(authorities = "ADMIN") + void triggerBatch_returns202_withJobId() 
throws Exception { + UUID jobId = UUID.randomUUID(); + List docIds = List.of(UUID.randomUUID(), UUID.randomUUID()); + BatchOcrDTO dto = new BatchOcrDTO(docIds); + + when(ocrBatchService.startBatch(eq(docIds), any())).thenReturn(jobId); + + mockMvc.perform(post("/api/ocr/batch") + .contentType(MediaType.APPLICATION_JSON) + .content(objectMapper.writeValueAsString(dto))) + .andExpect(status().isAccepted()) + .andExpect(jsonPath("$.jobId").value(jobId.toString())); + } + + @Test + @WithMockUser(authorities = "READ_ALL") + void getDocumentOcrStatus_returnsNone_whenNoOcrJobExists() throws Exception { + UUID docId = UUID.randomUUID(); + when(ocrJobDocumentRepository.findFirstByDocumentIdAndStatusIn(eq(docId), any())) + .thenReturn(Optional.empty()); + + mockMvc.perform(get("/api/documents/{id}/ocr-status", docId)) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.status").value("NONE")); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrBatchServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrBatchServiceTest.java new file mode 100644 index 00000000..9640c3b0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrBatchServiceTest.java @@ -0,0 +1,142 @@ +package org.raddatz.familienarchiv.service; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.model.*; +import org.raddatz.familienarchiv.repository.OcrJobDocumentRepository; +import org.raddatz.familienarchiv.repository.OcrJobRepository; + +import java.util.List; +import java.util.Optional; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static 
org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.*; + +@ExtendWith(MockitoExtension.class) +class OcrBatchServiceTest { + + @Mock OcrService ocrService; + @Mock OcrHealthClient ocrHealthClient; + @Mock DocumentService documentService; + @Mock OcrJobRepository ocrJobRepository; + @Mock OcrJobDocumentRepository ocrJobDocumentRepository; + @Mock OcrProgressService ocrProgressService; + + @InjectMocks OcrBatchService ocrBatchService; + + @Test + void startBatch_throwsServiceUnavailable_whenOcrServiceIsDown() { + when(ocrHealthClient.isHealthy()).thenReturn(false); + + assertThatThrownBy(() -> ocrBatchService.startBatch(List.of(UUID.randomUUID()), UUID.randomUUID())) + .isInstanceOf(DomainException.class) + .satisfies(e -> assertThat(((DomainException) e).getCode()) + .isEqualTo(ErrorCode.OCR_SERVICE_UNAVAILABLE)); + } + + @Test + void startBatch_createsJobAndReturnsJobId() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + UUID jobId = UUID.randomUUID(); + + when(ocrHealthClient.isHealthy()).thenReturn(true); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(jobId); + return job; + }); + when(ocrJobDocumentRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(ocrJobRepository.findById(jobId)).thenReturn(Optional.of( + OcrJob.builder().id(jobId).totalDocuments(1).status(OcrJobStatus.PENDING).build())); + when(ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId)).thenReturn(List.of( + OcrJobDocument.builder().jobId(jobId).documentId(docId).status(OcrDocumentStatus.PENDING).build())); + + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("test.pdf").fileHash("hash").scriptType(ScriptType.TYPEWRITER).build(); + when(documentService.getDocumentById(docId)).thenReturn(doc); + + UUID resultJobId = 
ocrBatchService.startBatch(List.of(docId), userId); + + assertThat(resultJobId).isEqualTo(jobId); + verify(ocrService).processDocument(eq(docId), eq(doc), eq(userId)); + } + + @Test + void processBatchAsync_skipsPlaceholderDocuments() { + UUID jobId = UUID.randomUUID(); + UUID uploadedId = UUID.randomUUID(); + UUID placeholderId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + + OcrJob job = OcrJob.builder().id(jobId).totalDocuments(2).status(OcrJobStatus.PENDING).build(); + when(ocrJobRepository.findById(jobId)).thenReturn(Optional.of(job)); + when(ocrJobRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(ocrJobDocumentRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + OcrJobDocument uploadedJobDoc = OcrJobDocument.builder() + .jobId(jobId).documentId(uploadedId).status(OcrDocumentStatus.PENDING).build(); + OcrJobDocument placeholderJobDoc = OcrJobDocument.builder() + .jobId(jobId).documentId(placeholderId).status(OcrDocumentStatus.PENDING).build(); + when(ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId)) + .thenReturn(List.of(uploadedJobDoc, placeholderJobDoc)); + + Document uploaded = Document.builder().id(uploadedId).status(DocumentStatus.UPLOADED) + .filePath("test.pdf").fileHash("hash").scriptType(ScriptType.TYPEWRITER).build(); + Document placeholder = Document.builder().id(placeholderId).status(DocumentStatus.PLACEHOLDER).build(); + when(documentService.getDocumentById(uploadedId)).thenReturn(uploaded); + when(documentService.getDocumentById(placeholderId)).thenReturn(placeholder); + + ocrBatchService.processBatchAsync(jobId, userId); + + verify(ocrService).processDocument(eq(uploadedId), eq(uploaded), eq(userId)); + verify(ocrService, never()).processDocument(eq(placeholderId), any(), any()); + assertThat(placeholderJobDoc.getStatus()).isEqualTo(OcrDocumentStatus.SKIPPED); + } + + @Test + void processBatchAsync_continuesAfterSingleDocumentFailure() { + UUID jobId = UUID.randomUUID(); + UUID failDocId 
= UUID.randomUUID(); + UUID successDocId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + + OcrJob job = OcrJob.builder().id(jobId).totalDocuments(2).status(OcrJobStatus.PENDING).build(); + when(ocrJobRepository.findById(jobId)).thenReturn(Optional.of(job)); + when(ocrJobRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + when(ocrJobDocumentRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + OcrJobDocument failJobDoc = OcrJobDocument.builder() + .jobId(jobId).documentId(failDocId).status(OcrDocumentStatus.PENDING).build(); + OcrJobDocument successJobDoc = OcrJobDocument.builder() + .jobId(jobId).documentId(successDocId).status(OcrDocumentStatus.PENDING).build(); + when(ocrJobDocumentRepository.findByJobIdOrderByCreatedAtAsc(jobId)) + .thenReturn(List.of(failJobDoc, successJobDoc)); + + Document failDoc = Document.builder().id(failDocId).status(DocumentStatus.UPLOADED) + .filePath("fail.pdf").fileHash("hash1").scriptType(ScriptType.TYPEWRITER).build(); + Document successDoc = Document.builder().id(successDocId).status(DocumentStatus.UPLOADED) + .filePath("success.pdf").fileHash("hash2").scriptType(ScriptType.TYPEWRITER).build(); + when(documentService.getDocumentById(failDocId)).thenReturn(failDoc); + when(documentService.getDocumentById(successDocId)).thenReturn(successDoc); + + doThrow(new RuntimeException("OCR failed")).when(ocrService) + .processDocument(eq(failDocId), any(), any()); + + ocrBatchService.processBatchAsync(jobId, userId); + + verify(ocrService).processDocument(eq(successDocId), eq(successDoc), eq(userId)); + assertThat(failJobDoc.getStatus()).isEqualTo(OcrDocumentStatus.FAILED); + assertThat(successJobDoc.getStatus()).isEqualTo(OcrDocumentStatus.DONE); + assertThat(job.getStatus()).isEqualTo(OcrJobStatus.DONE); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrProgressServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrProgressServiceTest.java new file 
mode 100644 index 00000000..44ed276f --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrProgressServiceTest.java @@ -0,0 +1,33 @@ +package org.raddatz.familienarchiv.service; + +import org.junit.jupiter.api.Test; +import org.springframework.web.servlet.mvc.method.annotation.SseEmitter; + +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatCode; + +class OcrProgressServiceTest { + + private final OcrProgressService progressService = new OcrProgressService(); + + @Test + void register_returnsNonNullEmitter() { + UUID jobId = UUID.randomUUID(); + SseEmitter emitter = progressService.register(jobId); + assertThat(emitter).isNotNull(); + } + + @Test + void emit_doesNotThrow_whenNoEmittersRegistered() { + assertThatCode(() -> progressService.emit(UUID.randomUUID(), "test", "data")) + .doesNotThrowAnyException(); + } + + @Test + void complete_doesNotThrow_whenNoEmittersRegistered() { + assertThatCode(() -> progressService.complete(UUID.randomUUID())) + .doesNotThrowAnyException(); + } +} diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java new file mode 100644 index 00000000..44c598e0 --- /dev/null +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java @@ -0,0 +1,176 @@ +package org.raddatz.familienarchiv.service; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.ArgumentCaptor; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.raddatz.familienarchiv.dto.CreateAnnotationDTO; +import org.raddatz.familienarchiv.exception.DomainException; +import org.raddatz.familienarchiv.exception.ErrorCode; +import org.raddatz.familienarchiv.model.*; +import 
org.raddatz.familienarchiv.repository.OcrJobRepository; +import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository; + +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.*; +import static org.springframework.http.HttpStatus.*; + +@ExtendWith(MockitoExtension.class) +class OcrServiceTest { + + @Mock OcrClient ocrClient; + @Mock OcrHealthClient ocrHealthClient; + @Mock DocumentService documentService; + @Mock TranscriptionService transcriptionService; + @Mock AnnotationService annotationService; + @Mock TranscriptionBlockRepository blockRepository; + @Mock OcrJobRepository ocrJobRepository; + + @InjectMocks OcrService ocrService; + + @Test + void startOcr_throwsBadRequest_whenDocumentIsPlaceholder() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.PLACEHOLDER).build(); + when(documentService.getDocumentById(docId)).thenReturn(doc); + + assertThatThrownBy(() -> ocrService.startOcr(docId, null, userId)) + .isInstanceOf(DomainException.class) + .satisfies(e -> { + DomainException de = (DomainException) e; + assertThat(de.getStatus()).isEqualTo(BAD_REQUEST); + assertThat(de.getCode()).isEqualTo(ErrorCode.OCR_DOCUMENT_NOT_UPLOADED); + }); + } + + @Test + void startOcr_throwsServiceUnavailable_whenOcrServiceIsDown() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123").build(); + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(false); + + assertThatThrownBy(() -> ocrService.startOcr(docId, null, userId)) + 
.isInstanceOf(DomainException.class) + .satisfies(e -> { + DomainException de = (DomainException) e; + assertThat(de.getCode()).isEqualTo(ErrorCode.OCR_SERVICE_UNAVAILABLE); + }); + } + + @Test + void startOcr_createsJobAndReturnsJobId() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + UUID jobId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123") + .scriptType(ScriptType.TYPEWRITER).build(); + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(true); + // ocrService constructs the internal MinIO URL from S3 key + when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of()); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(jobId); + return job; + }); + + UUID resultJobId = ocrService.startOcr(docId, ScriptType.TYPEWRITER, userId); + + assertThat(resultJobId).isEqualTo(jobId); + verify(ocrJobRepository, atLeastOnce()).save(any()); + } + + @Test + void startOcr_setsScriptTypeOnDocument_whenProvided() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123") + .scriptType(ScriptType.UNKNOWN).build(); + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(true); + // ocrService constructs the internal MinIO URL from S3 key + when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of()); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(UUID.randomUUID()); + return job; + }); + + ocrService.startOcr(docId, ScriptType.HANDWRITING_LATIN, userId); + + assertThat(doc.getScriptType()).isEqualTo(ScriptType.HANDWRITING_LATIN); + } + + @Test + void 
startOcr_clearsExistingBlocks_beforeCreatingNew() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123") + .scriptType(ScriptType.TYPEWRITER).build(); + TranscriptionBlock existingBlock = TranscriptionBlock.builder() + .id(UUID.randomUUID()).documentId(docId).build(); + + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(true); + // ocrService constructs the internal MinIO URL from S3 key + when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existingBlock)); + when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of( + new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello"))); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(UUID.randomUUID()); + return job; + }); + DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build(); + when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann); + + ocrService.startOcr(docId, null, userId); + + verify(transcriptionService).deleteBlock(docId, existingBlock.getId()); + } + + @Test + void startOcr_createsAnnotationAndBlock_forEachOcrResult() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123") + .scriptType(ScriptType.TYPEWRITER).build(); + + OcrBlockResult block1 = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Line 1"); + OcrBlockResult block2 = new OcrBlockResult(0, 0.1, 0.2, 0.8, 0.04, null, "Line 2"); + + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(true); + // ocrService constructs the internal MinIO URL from S3 key + 
when(transcriptionService.listBlocks(docId)).thenReturn(List.of()); + when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block1, block2)); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(UUID.randomUUID()); + return job; + }); + DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build(); + when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann); + + ocrService.startOcr(docId, null, userId); + + verify(annotationService, times(2)).createOcrAnnotation( + eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any()); + } +} -- 2.49.1 From 6737bd6db5ef8c85a29b8fd3dcf2e19316ab7b0c Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:26:40 +0200 Subject: [PATCH 07/74] feat(ocr): add Python OCR microservice, RestClientOcrClient, Docker Compose Python microservice (ocr-service/): - FastAPI app with /ocr and /health endpoints - Surya engine: transformer-based OCR for typewritten/modern handwriting - Kraken engine: historical HTR for Kurrent/Suetterlin with pure-Python polygon-to-quad approximation (gift wrapping + rotating calipers) - Eager model loading at startup via lifespan context manager - PDF download via httpx, page rendering via pypdfium2 at 300 DPI Java RestClientOcrClient: - Implements OcrClient + OcrHealthClient interfaces - Calls Python service via Spring RestClient - Health check with graceful fallback Docker Compose: - New ocr-service container (mem_limit 6g, no host ports) - Health check with start_period 60s for model loading - ocr_models volume for Kraken model files - Backend depends on ocr-service health Refs #226, #227 Co-Authored-By: Claude Sonnet 4.6 --- .../service/RestClientOcrClient.java | 73 +++++++ docker-compose.yml | 27 +++ ocr-service/Dockerfile | 23 +++ ocr-service/engines/__init__.py | 0 ocr-service/engines/kraken.py | 192 ++++++++++++++++++ ocr-service/engines/surya.py | 66 ++++++ 
ocr-service/main.py | 93 +++++++++ ocr-service/models.py | 20 ++ ocr-service/requirements.txt | 6 + 9 files changed, 500 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java create mode 100644 ocr-service/Dockerfile create mode 100644 ocr-service/engines/__init__.py create mode 100644 ocr-service/engines/kraken.py create mode 100644 ocr-service/engines/surya.py create mode 100644 ocr-service/main.py create mode 100644 ocr-service/models.py create mode 100644 ocr-service/requirements.txt diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java new file mode 100644 index 00000000..0bbb533c --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/RestClientOcrClient.java @@ -0,0 +1,73 @@ +package org.raddatz.familienarchiv.service; + +import com.fasterxml.jackson.annotation.JsonProperty; +import lombok.extern.slf4j.Slf4j; +import org.raddatz.familienarchiv.model.ScriptType; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.core.ParameterizedTypeReference; +import org.springframework.http.MediaType; +import org.springframework.stereotype.Component; +import org.springframework.web.client.RestClient; + +import java.util.List; +import java.util.Map; + +@Component +@Slf4j +public class RestClientOcrClient implements OcrClient, OcrHealthClient { + + private final RestClient restClient; + + public RestClientOcrClient(@Value("${app.ocr.base-url:http://ocr-service:8000}") String baseUrl) { + this.restClient = RestClient.builder().baseUrl(baseUrl).build(); + } + + @Override + public List extractBlocks(String pdfUrl, ScriptType scriptType) { + Map body = Map.of( + "pdfUrl", pdfUrl, + "scriptType", scriptType.name(), + "language", "de"); + + List response = restClient.post() + .uri("/ocr") + .contentType(MediaType.APPLICATION_JSON) + 
.body(body) + .retrieve() + .body(new ParameterizedTypeReference<>() {}); + + if (response == null) return List.of(); + + return response.stream() + .map(OcrBlockJson::toResult) + .toList(); + } + + @Override + public boolean isHealthy() { + try { + restClient.get() + .uri("/health") + .retrieve() + .toBodilessEntity(); + return true; + } catch (Exception e) { + log.warn("OCR service health check failed: {}", e.getMessage()); + return false; + } + } + + record OcrBlockJson( + @JsonProperty("pageNumber") int pageNumber, + double x, + double y, + double width, + double height, + List> polygon, + String text + ) { + OcrBlockResult toResult() { + return new OcrBlockResult(pageNumber, x, y, width, height, polygon, text); + } + } +} diff --git a/docker-compose.yml b/docker-compose.yml index 7ceabc66..0dd05942 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,28 @@ services: networks: - archive-net + # --- OCR: Python microservice (Surya + Kraken) --- + ocr-service: + build: + context: ./ocr-service + dockerfile: Dockerfile + container_name: archive-ocr + restart: unless-stopped + mem_limit: 6g + memswap_limit: 6g + volumes: + - ocr_models:/app/models + environment: + KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel + networks: + - archive-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 60s + # --- Backend: Spring Boot --- backend: build: @@ -89,6 +111,8 @@ services: condition: service_healthy mailpit: condition: service_started + ocr-service: + condition: service_healthy environment: SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB} SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER} @@ -109,6 +133,8 @@ services: # Mailpit needs no auth or STARTTLS; production SMTP overrides these via .env SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false} SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false} + 
APP_OCR_BASE_URL: http://ocr-service:8000 + APP_S3_INTERNAL_URL: http://minio:9000 ports: - "${PORT_BACKEND}:8080" networks: @@ -155,3 +181,4 @@ networks: volumes: frontend_node_modules: maven_cache: + ocr_models: diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile new file mode 100644 index 00000000..24f74be0 --- /dev/null +++ b/ocr-service/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.11-slim + +WORKDIR /app + +# curl for healthcheck; libgomp1 for PyTorch CPU threading +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved) +RUN pip install --no-cache-dir \ + torch==2.5.1 \ + --index-url https://download.pytorch.org/whl/cpu + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/ocr-service/engines/__init__.py b/ocr-service/engines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ocr-service/engines/kraken.py b/ocr-service/engines/kraken.py new file mode 100644 index 00000000..16cb3d0b --- /dev/null +++ b/ocr-service/engines/kraken.py @@ -0,0 +1,192 @@ +"""Kraken OCR engine wrapper — historical HTR model support for Kurrent/Suetterlin.""" + +import logging +import os + +logger = logging.getLogger(__name__) + +_model = None +_model_path = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel") + + +def load_models(): + """Load the Kraken model at startup. 
Skips if model file is not present.""" + global _model + + if not os.path.exists(_model_path): + logger.warning("Kraken model not found at %s — Kurrent OCR will not be available", _model_path) + return + + logger.info("Loading Kraken model from %s...", _model_path) + + from kraken.lib import models as kraken_models + _model = kraken_models.load_any(_model_path) + + logger.info("Kraken model loaded successfully") + + +def is_available() -> bool: + return _model is not None + + +def extract_blocks(images: list, language: str = "de") -> list[dict]: + """Run Kraken segmentation + recognition on a list of PIL images. + + Returns block dicts with pageNumber, x, y, width, height, polygon, text. + Polygon is a 4-point quadrilateral approximation of the baseline polygon. + Coordinates are normalized to [0, 1]. + """ + from kraken import blla, rpred + + if _model is None: + raise RuntimeError("Kraken model is not loaded") + + all_blocks = [] + + for page_idx, image in enumerate(images): + page_w, page_h = image.size + + baseline_seg = blla.segment(image) + + pred_it = rpred.rpred(_model, image, baseline_seg) + + for record in pred_it: + # record.prediction is the recognized text + # record.cuts contains polygon points + # record.line is the baseline polygon + + polygon_pts = record.cuts if hasattr(record, "cuts") else [] + + # Compute AABB from the polygon + if polygon_pts: + xs = [p[0] for p in polygon_pts] + ys = [p[1] for p in polygon_pts] + x1, y1 = min(xs), min(ys) + x2, y2 = max(xs), max(ys) + else: + # Fallback to line baseline + xs = [p[0] for p in record.line] + ys = [p[1] for p in record.line] + x1, y1 = min(xs), min(ys) - 5 + x2, y2 = max(xs), max(ys) + 5 + + # Approximate polygon to quadrilateral + quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None + + all_blocks.append({ + "pageNumber": page_idx, + "x": x1 / page_w, + "y": y1 / page_h, + "width": (x2 - x1) / page_w, + "height": (y2 - y1) / page_h, + "polygon": quad, + "text": 
record.prediction, + }) + + return all_blocks + + +def _approximate_to_quad(points: list[tuple], page_w: float, page_h: float) -> list[list[float]] | None: + """Approximate a polygon to a 4-point quadrilateral using the minimum bounding rectangle. + + Uses gift-wrapping (Jarvis march) for convex hull, then rotating calipers + for the minimum area bounding rectangle. Pure Python, no scipy/numpy. + """ + if len(points) < 3: + return None + + try: + hull = _convex_hull(points) + if len(hull) < 3: + return None + + rect = _min_bounding_rect(hull) + + # Normalize to [0, 1] + return [[p[0] / page_w, p[1] / page_h] for p in rect] + except Exception: + logger.debug("Failed to approximate polygon to quad, returning None") + return None + + +def _convex_hull(points: list[tuple]) -> list[tuple]: + """Jarvis march (gift wrapping) algorithm for 2D convex hull.""" + pts = list(set(points)) + if len(pts) < 3: + return pts + + # Start from leftmost point + start = min(pts, key=lambda p: (p[0], p[1])) + hull = [] + current = start + + while True: + hull.append(current) + candidate = pts[0] + for p in pts[1:]: + if candidate == current: + candidate = p + continue + cross = _cross(current, candidate, p) + if cross < 0: + candidate = p + elif cross == 0: + # Collinear — pick the farther point + if _dist_sq(current, p) > _dist_sq(current, candidate): + candidate = p + + current = candidate + if current == start: + break + + return hull + + +def _min_bounding_rect(hull: list[tuple]) -> list[tuple]: + """Find the minimum area bounding rectangle of a convex hull using rotating calipers.""" + n = len(hull) + if n < 2: + return hull + + min_area = float("inf") + best_rect = None + + for i in range(n): + # Edge vector + edge_x = hull[(i + 1) % n][0] - hull[i][0] + edge_y = hull[(i + 1) % n][1] - hull[i][1] + edge_len = (edge_x ** 2 + edge_y ** 2) ** 0.5 + if edge_len == 0: + continue + + # Unit vectors along and perpendicular to the edge + ux, uy = edge_x / edge_len, edge_y / edge_len + vx, 
vy = -uy, ux + + # Project all hull points onto the edge coordinate system + projs_u = [p[0] * ux + p[1] * uy for p in hull] + projs_v = [p[0] * vx + p[1] * vy for p in hull] + + min_u, max_u = min(projs_u), max(projs_u) + min_v, max_v = min(projs_v), max(projs_v) + + area = (max_u - min_u) * (max_v - min_v) + if area < min_area: + min_area = area + # Reconstruct 4 corners in original coordinates + best_rect = [ + (min_u * ux + min_v * vx, min_u * uy + min_v * vy), + (max_u * ux + min_v * vx, max_u * uy + min_v * vy), + (max_u * ux + max_v * vx, max_u * uy + max_v * vy), + (min_u * ux + max_v * vx, min_u * uy + max_v * vy), + ] + + return best_rect if best_rect else hull[:4] + + +def _cross(o: tuple, a: tuple, b: tuple) -> float: + return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0]) + + +def _dist_sq(a: tuple, b: tuple) -> float: + return (a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2 diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py new file mode 100644 index 00000000..c6cc7768 --- /dev/null +++ b/ocr-service/engines/surya.py @@ -0,0 +1,66 @@ +"""Surya OCR engine wrapper — transformer-based, handles typewritten and modern Latin handwriting.""" + +import logging + +logger = logging.getLogger(__name__) + +# Lazy-loaded at startup via load_models() +_recognition_model = None +_recognition_processor = None +_detection_model = None +_detection_processor = None + + +def load_models(): + """Eagerly load Surya models into memory. 
Called once at container startup.""" + global _recognition_model, _recognition_processor, _detection_model, _detection_processor + + logger.info("Loading Surya models...") + + from surya.model.detection.model import load_model as load_det_model + from surya.model.detection.model import load_processor as load_det_processor + from surya.model.recognition.model import load_model as load_rec_model + from surya.model.recognition.processor import load_processor as load_rec_processor + + _detection_model = load_det_model() + _detection_processor = load_det_processor() + _recognition_model = load_rec_model() + _recognition_processor = load_rec_processor() + + logger.info("Surya models loaded successfully") + + +def extract_blocks(images: list, language: str = "de") -> list[dict]: + """Run Surya OCR on a list of PIL images (one per page). + + Returns a flat list of block dicts with pageNumber, x, y, width, height, text. + Coordinates are normalized to [0, 1] relative to page dimensions. + """ + from surya.detection import batch_text_detection + from surya.recognition import batch_recognition + + all_blocks = [] + + for page_idx, image in enumerate(images): + page_w, page_h = image.size + + det_predictions = batch_text_detection([image], _detection_model, _detection_processor) + rec_predictions = batch_recognition( + [image], det_predictions, _recognition_model, _recognition_processor, [language] + ) + + for line in rec_predictions[0].text_lines: + bbox = line.bbox # [x1, y1, x2, y2] in pixel coordinates + x1, y1, x2, y2 = bbox + + all_blocks.append({ + "pageNumber": page_idx, + "x": x1 / page_w, + "y": y1 / page_h, + "width": (x2 - x1) / page_w, + "height": (y2 - y1) / page_h, + "polygon": None, + "text": line.text, + }) + + return all_blocks diff --git a/ocr-service/main.py b/ocr-service/main.py new file mode 100644 index 00000000..d4e3f957 --- /dev/null +++ b/ocr-service/main.py @@ -0,0 +1,93 @@ +"""OCR microservice — FastAPI app with Surya and Kraken engine support.""" + 
+import io +import logging +from contextlib import asynccontextmanager + +import httpx +import pypdfium2 as pdfium +from fastapi import FastAPI, HTTPException +from PIL import Image + +from engines import kraken as kraken_engine +from engines import surya as surya_engine +from models import OcrBlock, OcrRequest + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +_models_ready = False + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Load all OCR models at startup before accepting requests.""" + global _models_ready + + logger.info("Loading OCR models at startup...") + surya_engine.load_models() + kraken_engine.load_models() + _models_ready = True + logger.info("All OCR models loaded — ready to accept requests") + + yield + + logger.info("Shutting down OCR service") + + +app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan) + + +@app.get("/health") +def health(): + """Health endpoint — returns 200 only after models are loaded.""" + if not _models_ready: + raise HTTPException(status_code=503, detail="Models not loaded yet") + return {"status": "ok", "surya": True, "kraken": kraken_engine.is_available()} + + +@app.post("/ocr", response_model=list[OcrBlock]) +async def run_ocr(request: OcrRequest): + """Run OCR on a PDF document. + + Downloads the PDF from the provided URL, converts pages to images, + and runs the appropriate OCR engine based on scriptType. 
+ """ + if not _models_ready: + raise HTTPException(status_code=503, detail="Models not loaded yet") + + images = await _download_and_convert_pdf(request.pdf_url) + + script_type = request.script_type.upper() + + if script_type == "HANDWRITING_KURRENT": + if not kraken_engine.is_available(): + raise HTTPException( + status_code=400, + detail="Kraken model not available — cannot process Kurrent script", + ) + blocks = kraken_engine.extract_blocks(images, request.language) + else: + # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya + blocks = surya_engine.extract_blocks(images, request.language) + + return [OcrBlock(**b) for b in blocks] + + +async def _download_and_convert_pdf(url: str) -> list[Image.Image]: + """Download a PDF from URL and convert each page to a PIL Image.""" + async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client: + response = await client.get(url) + response.raise_for_status() + + pdf = pdfium.PdfDocument(io.BytesIO(response.content)) + images = [] + + for page_idx in range(len(pdf)): + page = pdf[page_idx] + # Render at 300 DPI for good OCR quality + bitmap = page.render(scale=300 / 72) + pil_image = bitmap.to_pil() + images.append(pil_image) + + return images diff --git a/ocr-service/models.py b/ocr-service/models.py new file mode 100644 index 00000000..0d2c1590 --- /dev/null +++ b/ocr-service/models.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, Field + + +class OcrRequest(BaseModel): + pdf_url: str = Field(..., alias="pdfUrl") + script_type: str = Field("UNKNOWN", alias="scriptType") + language: str = "de" + + +class OcrBlock(BaseModel): + page_number: int = Field(..., alias="pageNumber") + x: float + y: float + width: float + height: float + polygon: list[list[float]] | None = None + text: str + + class Config: + populate_by_name = True diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt new file mode 100644 index 00000000..49bd00e9 --- /dev/null +++ b/ocr-service/requirements.txt @@ -0,0 +1,6 
@@ +fastapi[standard]==0.115.6 +surya-ocr==0.6.3 +kraken==5.2.9 +pillow==11.1.0 +pypdfium2==4.30.0 +httpx==0.28.1 -- 2.49.1 From cf8dc3559fe41321d0e0cf3cc2c7898fd8dcf45a Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:30:27 +0200 Subject: [PATCH 08/74] feat(frontend): extract AnnotationShape component with polygon support - AnnotationShape.svelte: renders a single annotation as either a rectangle or a polygon-clipped div (via CSS clip-path: polygon()) - AnnotationLayer.svelte: refactored to delegate rendering to AnnotationShape, keeping draw logic and hover state management - Annotation type: added optional polygon field ([number, number][] | null) - Polygon coordinates are converted from page-normalized to bounding-box-relative percentages for clip-path All 687 existing frontend tests pass. Refs #227 Co-Authored-By: Claude Sonnet 4.6 --- .../src/lib/components/AnnotationLayer.svelte | 90 ++---------- .../src/lib/components/AnnotationShape.svelte | 136 ++++++++++++++++++ frontend/src/lib/types.ts | 1 + 3 files changed, 147 insertions(+), 80 deletions(-) create mode 100644 frontend/src/lib/components/AnnotationShape.svelte diff --git a/frontend/src/lib/components/AnnotationLayer.svelte b/frontend/src/lib/components/AnnotationLayer.svelte index ec1c29c0..65e87b42 100644 --- a/frontend/src/lib/components/AnnotationLayer.svelte +++ b/frontend/src/lib/components/AnnotationLayer.svelte @@ -1,5 +1,6 @@ + +
{ + if (e.key === 'Enter' || e.key === ' ') onclick(); + }} + onpointerenter={onpointerenter} + onpointerleave={onpointerleave} + style={shapeStyle} +> + {#if !dimmed && blockNumber} +
+ {blockNumber} +
+ {/if} +
+ + diff --git a/frontend/src/lib/types.ts b/frontend/src/lib/types.ts index 7ddfd95c..adb2aec3 100644 --- a/frontend/src/lib/types.ts +++ b/frontend/src/lib/types.ts @@ -49,4 +49,5 @@ export type Annotation = { color: string; createdAt: string; fileHash?: string | null; + polygon?: [number, number][] | null; }; -- 2.49.1 From a4651aa317ffb9c0607e5e1b580d10206cce836b Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:36:00 +0200 Subject: [PATCH 09/74] feat(frontend): add OCR UI components and translations - ScriptTypeSelect: native select for TYPEWRITER/HANDWRITING_LATIN/KURRENT - OcrTrigger: wraps script type select + start button + confirmation dialog - OcrProgress: SSE-based progress display with page counter and progress bar - Paraglide translations for OCR (de/en/es): script types, trigger labels, confirmation dialog, progress messages, error messages - ErrorCode type + getErrorMessage: OCR_SERVICE_UNAVAILABLE, OCR_JOB_NOT_FOUND, OCR_DOCUMENT_NOT_UPLOADED, OCR_PROCESSING_FAILED All 687 frontend tests pass. 
Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- frontend/messages/de.json | 22 ++++- frontend/messages/en.json | 22 ++++- frontend/messages/es.json | 22 ++++- .../src/lib/components/OcrProgress.svelte | 88 +++++++++++++++++++ frontend/src/lib/components/OcrTrigger.svelte | 49 +++++++++++ .../lib/components/ScriptTypeSelect.svelte | 27 ++++++ frontend/src/lib/errors.ts | 12 +++ 7 files changed, 239 insertions(+), 3 deletions(-) create mode 100644 frontend/src/lib/components/OcrProgress.svelte create mode 100644 frontend/src/lib/components/OcrTrigger.svelte create mode 100644 frontend/src/lib/components/ScriptTypeSelect.svelte diff --git a/frontend/messages/de.json b/frontend/messages/de.json index 53f8ed96..886c468a 100644 --- a/frontend/messages/de.json +++ b/frontend/messages/de.json @@ -500,5 +500,25 @@ "person_alias_delete_title": "Alias entfernen?", "person_alias_delete_body": "Dieser Name wird aus der Suche entfernt.", "person_alias_btn_delete": "Entfernen", - "error_alias_not_found": "Der Namensalias wurde nicht gefunden." + "error_alias_not_found": "Der Namensalias wurde nicht gefunden.", + "error_ocr_service_unavailable": "Der OCR-Dienst ist nicht verfügbar.", + "error_ocr_job_not_found": "Der OCR-Auftrag wurde nicht gefunden.", + "error_ocr_document_not_uploaded": "Das Dokument hat keine Datei — OCR ist nicht möglich.", + "error_ocr_processing_failed": "Die OCR-Verarbeitung ist fehlgeschlagen.", + "ocr_script_type_typewriter": "Schreibmaschine", + "ocr_script_type_handwriting_latin": "Handschrift (lateinisch)", + "ocr_script_type_handwriting_kurrent": "Handschrift (Kurrent/Sütterlin)", + "ocr_trigger_label": "Schrifttyp", + "ocr_trigger_select_placeholder": "Schrifttyp wählen…", + "ocr_trigger_btn": "OCR starten", + "ocr_trigger_btn_disabled": "Bitte wählen Sie einen Schrifttyp", + "ocr_confirm_title": "Vorhandene Transkription ersetzen?", + "ocr_confirm_body": "Alle {count} vorhandenen Blöcke werden gelöscht und durch die OCR-Ergebnisse ersetzt. 
Diese Aktion kann nicht rückgängig gemacht werden.", + "ocr_confirm_btn": "Ersetzen", + "ocr_progress_heading": "OCR läuft", + "ocr_progress_page": "Seite {current} von {total}", + "ocr_error_heading": "OCR fehlgeschlagen", + "ocr_error_retry": "Erneut versuchen", + "ocr_batch_running": "OCR läuft · {processed} von {total} Dokumenten abgeschlossen", + "ocr_batch_done": "OCR abgeschlossen · {processed} erfolgreich · {errors} fehlgeschlagen" } diff --git a/frontend/messages/en.json b/frontend/messages/en.json index 7c535417..86777394 100644 --- a/frontend/messages/en.json +++ b/frontend/messages/en.json @@ -500,5 +500,25 @@ "person_alias_delete_title": "Remove alias?", "person_alias_delete_body": "This name will be removed from search results.", "person_alias_btn_delete": "Remove", - "error_alias_not_found": "The name alias was not found." + "error_alias_not_found": "The name alias was not found.", + "error_ocr_service_unavailable": "The OCR service is not available.", + "error_ocr_job_not_found": "The OCR job was not found.", + "error_ocr_document_not_uploaded": "The document has no file — OCR is not possible.", + "error_ocr_processing_failed": "OCR processing failed.", + "ocr_script_type_typewriter": "Typewriter", + "ocr_script_type_handwriting_latin": "Handwriting (Latin)", + "ocr_script_type_handwriting_kurrent": "Handwriting (Kurrent/Sütterlin)", + "ocr_trigger_label": "Script type", + "ocr_trigger_select_placeholder": "Select script type…", + "ocr_trigger_btn": "Start OCR", + "ocr_trigger_btn_disabled": "Please select a script type", + "ocr_confirm_title": "Replace existing transcription?", + "ocr_confirm_body": "All {count} existing blocks will be deleted and replaced with OCR results. 
This action cannot be undone.", + "ocr_confirm_btn": "Replace", + "ocr_progress_heading": "OCR running", + "ocr_progress_page": "Page {current} of {total}", + "ocr_error_heading": "OCR failed", + "ocr_error_retry": "Try again", + "ocr_batch_running": "OCR running · {processed} of {total} documents complete", + "ocr_batch_done": "OCR complete · {processed} successful · {errors} failed" } diff --git a/frontend/messages/es.json b/frontend/messages/es.json index 52502800..6764392c 100644 --- a/frontend/messages/es.json +++ b/frontend/messages/es.json @@ -500,5 +500,25 @@ "person_alias_delete_title": "Eliminar alias?", "person_alias_delete_body": "Este nombre se eliminara de los resultados de busqueda.", "person_alias_btn_delete": "Eliminar", - "error_alias_not_found": "No se encontro el alias de nombre." + "error_alias_not_found": "No se encontro el alias de nombre.", + "error_ocr_service_unavailable": "El servicio OCR no está disponible.", + "error_ocr_job_not_found": "No se encontró el trabajo OCR.", + "error_ocr_document_not_uploaded": "El documento no tiene archivo — OCR no es posible.", + "error_ocr_processing_failed": "El procesamiento OCR ha fallado.", + "ocr_script_type_typewriter": "Máquina de escribir", + "ocr_script_type_handwriting_latin": "Escritura manuscrita (latina)", + "ocr_script_type_handwriting_kurrent": "Escritura manuscrita (Kurrent/Sütterlin)", + "ocr_trigger_label": "Tipo de escritura", + "ocr_trigger_select_placeholder": "Seleccionar tipo de escritura…", + "ocr_trigger_btn": "Iniciar OCR", + "ocr_trigger_btn_disabled": "Por favor seleccione un tipo de escritura", + "ocr_confirm_title": "¿Reemplazar transcripción existente?", + "ocr_confirm_body": "Los {count} bloques existentes serán eliminados y reemplazados con los resultados del OCR. 
Esta acción no se puede deshacer.", + "ocr_confirm_btn": "Reemplazar", + "ocr_progress_heading": "OCR en curso", + "ocr_progress_page": "Página {current} de {total}", + "ocr_error_heading": "OCR fallido", + "ocr_error_retry": "Intentar de nuevo", + "ocr_batch_running": "OCR en curso · {processed} de {total} documentos completados", + "ocr_batch_done": "OCR completado · {processed} exitosos · {errors} fallidos" } diff --git a/frontend/src/lib/components/OcrProgress.svelte b/frontend/src/lib/components/OcrProgress.svelte new file mode 100644 index 00000000..17d60e46 --- /dev/null +++ b/frontend/src/lib/components/OcrProgress.svelte @@ -0,0 +1,88 @@ + + +{#if status === 'running'} +
+

+ {m.ocr_progress_heading()} +

+
+
+
+

+ {m.ocr_progress_page({ current: String(currentPage), total: String(totalPages) })} +

+
+{:else if status === 'error'} +
+

+ {m.ocr_error_heading()} +

+ +
+{/if} diff --git a/frontend/src/lib/components/OcrTrigger.svelte b/frontend/src/lib/components/OcrTrigger.svelte new file mode 100644 index 00000000..45002059 --- /dev/null +++ b/frontend/src/lib/components/OcrTrigger.svelte @@ -0,0 +1,49 @@ + + +
+ + +
diff --git a/frontend/src/lib/components/ScriptTypeSelect.svelte b/frontend/src/lib/components/ScriptTypeSelect.svelte new file mode 100644 index 00000000..9db31811 --- /dev/null +++ b/frontend/src/lib/components/ScriptTypeSelect.svelte @@ -0,0 +1,27 @@ + + +
+ + +
diff --git a/frontend/src/lib/errors.ts b/frontend/src/lib/errors.ts index 1adfaa03..1b8e8876 100644 --- a/frontend/src/lib/errors.ts +++ b/frontend/src/lib/errors.ts @@ -22,6 +22,10 @@ export type ErrorCode = | 'TRANSCRIPTION_BLOCK_NOT_FOUND' | 'TRANSCRIPTION_BLOCK_CONFLICT' | 'COMMENT_NOT_FOUND' + | 'OCR_SERVICE_UNAVAILABLE' + | 'OCR_JOB_NOT_FOUND' + | 'OCR_DOCUMENT_NOT_UPLOADED' + | 'OCR_PROCESSING_FAILED' | 'UNAUTHORIZED' | 'FORBIDDEN' | 'VALIDATION_ERROR' @@ -85,6 +89,14 @@ export function getErrorMessage(code: ErrorCode | string | undefined): string { return m.error_transcription_block_conflict(); case 'COMMENT_NOT_FOUND': return m.error_comment_not_found(); + case 'OCR_SERVICE_UNAVAILABLE': + return m.error_ocr_service_unavailable(); + case 'OCR_JOB_NOT_FOUND': + return m.error_ocr_job_not_found(); + case 'OCR_DOCUMENT_NOT_UPLOADED': + return m.error_ocr_document_not_uploaded(); + case 'OCR_PROCESSING_FAILED': + return m.error_ocr_processing_failed(); case 'UNAUTHORIZED': return m.error_unauthorized(); case 'FORBIDDEN': -- 2.49.1 From 931fbc28e54d4ce2790a47df1967ce16e6295be8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:39:54 +0200 Subject: [PATCH 10/74] fix(annotations): use @JdbcTypeCode(JSON) for polygon JSONB column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace @Convert(PolygonConverter) with Hibernate native @JdbcTypeCode(SqlTypes.JSON) to fix JDBC type mismatch — PostgreSQL requires jsonb type, not varchar. The PolygonConverter is retained as a standalone utility but no longer used on the entity. Hibernate 6 natively handles List> serialization to JSONB. 
Refs #227 Co-Authored-By: Claude Sonnet 4.6 --- .../org/raddatz/familienarchiv/model/DocumentAnnotation.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java b/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java index d4e02258..5aaaff2d 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/DocumentAnnotation.java @@ -4,6 +4,8 @@ import io.swagger.v3.oas.annotations.media.Schema; import jakarta.persistence.*; import lombok.*; import org.hibernate.annotations.CreationTimestamp; +import org.hibernate.annotations.JdbcTypeCode; +import org.hibernate.type.SqlTypes; import java.time.LocalDateTime; import java.util.List; @@ -53,8 +55,8 @@ public class DocumentAnnotation { @Column(name = "file_hash", length = 64) private String fileHash; + @JdbcTypeCode(SqlTypes.JSON) @Column(columnDefinition = "jsonb") - @Convert(converter = PolygonConverter.class) private List> polygon; @Column(name = "created_by") -- 2.49.1 From d49010cd7b13300bb16ea8e4c4b20b17bc709722 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 18:40:46 +0200 Subject: [PATCH 11/74] fix(ocr): relax pillow version to match surya-ocr constraint surya-ocr 0.6.3 requires pillow<11.0.0,>=10.2.0. The previous pin at 11.1.0 caused a dependency resolution failure during Docker build. 
Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index 49bd00e9..d0d141bc 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -1,6 +1,6 @@ fastapi[standard]==0.115.6 surya-ocr==0.6.3 kraken==5.2.9 -pillow==11.1.0 +pillow>=10.2.0,<11.0.0 pypdfium2==4.30.0 httpx==0.28.1 -- 2.49.1 From e29c8650161504d10563f450f07820723b7c394a Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 18:48:14 +0200 Subject: [PATCH 12/74] fix(ocr): upgrade kraken to 6.0.3 for torch>=2.4 compatibility kraken 5.2.9 required torch~=2.1.0, incompatible with surya-ocr's torch>=2.3.0. kraken 6.0.3 requires torch>=2.4.0,<=2.9 which overlaps with surya and our pinned torch==2.5.1. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index d0d141bc..130c76ef 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -1,6 +1,6 @@ fastapi[standard]==0.115.6 surya-ocr==0.6.3 -kraken==5.2.9 +kraken==6.0.3 pillow>=10.2.0,<11.0.0 pypdfium2==4.30.0 httpx==0.28.1 -- 2.49.1 From 49975154d9273559ce1938ae258e8396ab02d8bd Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 18:53:14 +0200 Subject: [PATCH 13/74] feat(ocr): bump to latest surya 0.17.1, kraken 7.0, torch 2.7.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - surya-ocr 0.6.3 → 0.17.1: new predictor API (FoundationPredictor, RecognitionPredictor, DetectionPredictor), native polygon output on text lines (4-point clockwise) - kraken 6.0.3 → 7.0: wider torch range (>=2.4,<=2.10), unpinned numpy - torch 2.5.1 → 2.7.1: satisfies surya's >=2.7.0 requirement - Rewrite engines/surya.py for the 0.17 predictor class API - Surya now outputs polygons natively — no longer 
rectangle-only Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 2 +- ocr-service/engines/surya.py | 52 +++++++++++++++++------------------- ocr-service/requirements.txt | 4 +-- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 24f74be0..57368726 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved) RUN pip install --no-cache-dir \ - torch==2.5.1 \ + torch==2.7.1 \ --index-url https://download.pytorch.org/whl/cpu COPY requirements.txt . diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py index c6cc7768..77a895d9 100644 --- a/ocr-service/engines/surya.py +++ b/ocr-service/engines/surya.py @@ -4,28 +4,23 @@ import logging logger = logging.getLogger(__name__) -# Lazy-loaded at startup via load_models() -_recognition_model = None -_recognition_processor = None -_detection_model = None -_detection_processor = None +_recognition_predictor = None +_detection_predictor = None def load_models(): """Eagerly load Surya models into memory. 
Called once at container startup.""" - global _recognition_model, _recognition_processor, _detection_model, _detection_processor + global _recognition_predictor, _detection_predictor logger.info("Loading Surya models...") - from surya.model.detection.model import load_model as load_det_model - from surya.model.detection.model import load_processor as load_det_processor - from surya.model.recognition.model import load_model as load_rec_model - from surya.model.recognition.processor import load_processor as load_rec_processor + from surya.foundation import FoundationPredictor + from surya.recognition import RecognitionPredictor + from surya.detection import DetectionPredictor - _detection_model = load_det_model() - _detection_processor = load_det_processor() - _recognition_model = load_rec_model() - _recognition_processor = load_rec_processor() + foundation_predictor = FoundationPredictor() + _recognition_predictor = RecognitionPredictor(foundation_predictor) + _detection_predictor = DetectionPredictor() logger.info("Surya models loaded successfully") @@ -33,33 +28,36 @@ def load_models(): def extract_blocks(images: list, language: str = "de") -> list[dict]: """Run Surya OCR on a list of PIL images (one per page). - Returns a flat list of block dicts with pageNumber, x, y, width, height, text. - Coordinates are normalized to [0, 1] relative to page dimensions. + Returns a flat list of block dicts with pageNumber, x, y, width, height, + polygon, text. Coordinates are normalized to [0, 1] relative to page dimensions. + Surya 0.17+ returns polygon (4-point) natively on each text line. 
""" - from surya.detection import batch_text_detection - from surya.recognition import batch_recognition - all_blocks = [] - for page_idx, image in enumerate(images): - page_w, page_h = image.size + predictions = _recognition_predictor(images, det_predictor=_detection_predictor) - det_predictions = batch_text_detection([image], _detection_model, _detection_processor) - rec_predictions = batch_recognition( - [image], det_predictions, _recognition_model, _recognition_processor, [language] - ) + for page_idx, page_pred in enumerate(predictions): + page_w, page_h = images[page_idx].size - for line in rec_predictions[0].text_lines: + for line in page_pred.text_lines: bbox = line.bbox # [x1, y1, x2, y2] in pixel coordinates x1, y1, x2, y2 = bbox + # Surya 0.17 provides polygon as list of (x, y) tuples (4 points, clockwise) + polygon = None + if hasattr(line, "polygon") and line.polygon and len(line.polygon) == 4: + polygon = [ + [p[0] / page_w, p[1] / page_h] + for p in line.polygon + ] + all_blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, "y": y1 / page_h, "width": (x2 - x1) / page_w, "height": (y2 - y1) / page_h, - "polygon": None, + "polygon": polygon, "text": line.text, }) diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index 130c76ef..5a090f8b 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -1,6 +1,6 @@ fastapi[standard]==0.115.6 -surya-ocr==0.6.3 -kraken==6.0.3 +surya-ocr==0.17.1 +kraken==7.0 pillow>=10.2.0,<11.0.0 pypdfium2==4.30.0 httpx==0.28.1 -- 2.49.1 From c74539b04b7a97c06e0f6632b3922e65823976fb Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 19:16:17 +0200 Subject: [PATCH 14/74] feat(ocr): auto-insert [unleserlich] markers for low-confidence words New confidence.py module with two functions: - apply_confidence_markers(): replaces words below threshold with [unleserlich], collapses adjacent markers into one - words_from_characters(): reconstructs word-level confidence from 
Kraken's character-level data Surya 0.17 provides native word-level confidence via line.words. Kraken 7.0 provides per-character confidences via record.confidences. Both engines now pass word+confidence data through main.py, which applies the marker post-processing before returning the API response. Threshold configurable via OCR_CONFIDENCE_THRESHOLD env var (default 0.3). Frontend already renders [unleserlich] markers via transcriptionMarkers.ts. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 1 + ocr-service/confidence.py | 79 +++++++++++++++++ ocr-service/engines/kraken.py | 6 ++ ocr-service/engines/surya.py | 12 +++ ocr-service/main.py | 6 ++ ocr-service/test_confidence.py | 153 +++++++++++++++++++++++++++++++++ 6 files changed, 257 insertions(+) create mode 100644 ocr-service/confidence.py create mode 100644 ocr-service/test_confidence.py diff --git a/docker-compose.yml b/docker-compose.yml index 0dd05942..5e88f381 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -84,6 +84,7 @@ services: - ocr_models:/app/models environment: KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel + OCR_CONFIDENCE_THRESHOLD: "0.3" networks: - archive-net healthcheck: diff --git a/ocr-service/confidence.py b/ocr-service/confidence.py new file mode 100644 index 00000000..092c2892 --- /dev/null +++ b/ocr-service/confidence.py @@ -0,0 +1,79 @@ +"""Confidence-based [unleserlich] marker insertion for OCR output.""" + +import os + +CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3")) + +ILLEGIBLE_MARKER = "[unleserlich]" + + +def apply_confidence_markers(words: list[dict]) -> str: + """Replace low-confidence words with [unleserlich], collapsing adjacent markers. + + Args: + words: list of {"text": str, "confidence": float} dicts + + Returns: + Reconstructed text string with [unleserlich] substitutions. 
+ """ + if not words: + return "" + + result: list[str] = [] + prev_was_marker = False + + for word in words: + if word["confidence"] < CONFIDENCE_THRESHOLD: + if not prev_was_marker: + result.append(ILLEGIBLE_MARKER) + prev_was_marker = True + else: + result.append(word["text"]) + prev_was_marker = False + + return " ".join(result) + + +def words_from_characters(prediction: str, confidences: list[float]) -> list[dict]: + """Reconstruct word-level confidence from character-level data. + + Splits prediction on whitespace, maps characters to their confidences, + computes mean confidence per word. + + Args: + prediction: full line text from Kraken + confidences: per-character confidence list (same length as prediction) + + Returns: + list of {"text": str, "confidence": float} dicts + """ + if not prediction or not prediction.strip(): + return [] + + if len(confidences) != len(prediction): + return [{"text": prediction, "confidence": 1.0}] + + result: list[dict] = [] + current_word: list[str] = [] + current_confs: list[float] = [] + + for char, conf in zip(prediction, confidences): + if char == " ": + if current_word: + result.append({ + "text": "".join(current_word), + "confidence": sum(current_confs) / len(current_confs), + }) + current_word = [] + current_confs = [] + else: + current_word.append(char) + current_confs.append(conf) + + if current_word: + result.append({ + "text": "".join(current_word), + "confidence": sum(current_confs) / len(current_confs), + }) + + return result diff --git a/ocr-service/engines/kraken.py b/ocr-service/engines/kraken.py index 16cb3d0b..a0fec491 100644 --- a/ocr-service/engines/kraken.py +++ b/ocr-service/engines/kraken.py @@ -37,6 +37,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: Coordinates are normalized to [0, 1]. 
""" from kraken import blla, rpred + from confidence import words_from_characters if _model is None: raise RuntimeError("Kraken model is not loaded") @@ -73,6 +74,10 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: # Approximate polygon to quadrilateral quad = _approximate_to_quad(polygon_pts, page_w, page_h) if polygon_pts else None + # Extract word-level confidence for [unleserlich] marking + char_confidences = getattr(record, "confidences", []) + words = words_from_characters(record.prediction, char_confidences) + all_blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, @@ -81,6 +86,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: "height": (y2 - y1) / page_h, "polygon": quad, "text": record.prediction, + "words": words, }) return all_blocks diff --git a/ocr-service/engines/surya.py b/ocr-service/engines/surya.py index 77a895d9..94fc330b 100644 --- a/ocr-service/engines/surya.py +++ b/ocr-service/engines/surya.py @@ -51,6 +51,17 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: for p in line.polygon ] + # Extract word-level confidence for [unleserlich] marking + words = [] + if hasattr(line, "words") and line.words: + for word in line.words: + words.append({ + "text": word.text, + "confidence": word.confidence, + }) + else: + words = [{"text": line.text, "confidence": getattr(line, "confidence", 1.0)}] + all_blocks.append({ "pageNumber": page_idx, "x": x1 / page_w, @@ -59,6 +70,7 @@ def extract_blocks(images: list, language: str = "de") -> list[dict]: "height": (y2 - y1) / page_h, "polygon": polygon, "text": line.text, + "words": words, }) return all_blocks diff --git a/ocr-service/main.py b/ocr-service/main.py index d4e3f957..f87985e6 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -9,6 +9,7 @@ import pypdfium2 as pdfium from fastapi import FastAPI, HTTPException from PIL import Image +from confidence import apply_confidence_markers from engines import kraken as 
kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest @@ -71,6 +72,11 @@ async def run_ocr(request: OcrRequest): # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = surya_engine.extract_blocks(images, request.language) + for block in blocks: + if block.get("words"): + block["text"] = apply_confidence_markers(block["words"]) + block.pop("words", None) + return [OcrBlock(**b) for b in blocks] diff --git a/ocr-service/test_confidence.py b/ocr-service/test_confidence.py new file mode 100644 index 00000000..e1359eb1 --- /dev/null +++ b/ocr-service/test_confidence.py @@ -0,0 +1,153 @@ +"""Tests for confidence-based [unleserlich] marker insertion.""" + +import os +import pytest +from confidence import apply_confidence_markers, words_from_characters + + +# ─── apply_confidence_markers ───────────────────────────────────────────────── + + +def test_all_words_above_threshold_passes_through(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "Lieber Freund" + + +def test_single_low_confidence_word_replaced(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkqz", "confidence": 0.1}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund" + + +def test_adjacent_low_confidence_words_collapsed(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkqz", "confidence": 0.1}, + {"text": "abc", "confidence": 0.05}, + {"text": "yyy", "confidence": 0.2}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "Lieber [unleserlich] Freund" + + +def test_mixed_high_low_each_group_gets_marker(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkqz", "confidence": 0.1}, + {"text": "wie", "confidence": 0.9}, + {"text": "abc", "confidence": 0.05}, + {"text": "dir", "confidence": 0.88}, + ] + 
assert apply_confidence_markers(words) == "Lieber [unleserlich] wie [unleserlich] dir" + + +def test_all_below_threshold_returns_single_marker(): + words = [ + {"text": "xkq", "confidence": 0.1}, + {"text": "zzz", "confidence": 0.05}, + ] + assert apply_confidence_markers(words) == "[unleserlich]" + + +def test_empty_list_returns_empty_string(): + assert apply_confidence_markers([]) == "" + + +def test_single_word_above_threshold(): + words = [{"text": "Hallo", "confidence": 0.9}] + assert apply_confidence_markers(words) == "Hallo" + + +def test_exact_threshold_passes_through(): + """Confidence exactly at threshold should NOT be replaced (strict <).""" + words = [{"text": "Wort", "confidence": 0.3}] + assert apply_confidence_markers(words) == "Wort" + + +def test_just_below_threshold_replaced(): + words = [{"text": "Wort", "confidence": 0.29}] + assert apply_confidence_markers(words) == "[unleserlich]" + + +def test_custom_threshold_via_env(monkeypatch): + monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8") + # Need to reload the module to pick up the new env var + import importlib + import confidence + importlib.reload(confidence) + + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "Freund", "confidence": 0.5}, + ] + assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]" + + # Reset + monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.3") + importlib.reload(confidence) + + +def test_low_confidence_at_start(): + words = [ + {"text": "xkq", "confidence": 0.1}, + {"text": "Freund", "confidence": 0.88}, + ] + assert apply_confidence_markers(words) == "[unleserlich] Freund" + + +def test_low_confidence_at_end(): + words = [ + {"text": "Lieber", "confidence": 0.95}, + {"text": "xkq", "confidence": 0.1}, + ] + assert apply_confidence_markers(words) == "Lieber [unleserlich]" + + +# ─── words_from_characters ──────────────────────────────────────────────────── + + +def test_single_word_matching_confidences(): + words = 
words_from_characters("Hallo", [0.9, 0.8, 0.85, 0.7, 0.95]) + assert len(words) == 1 + assert words[0]["text"] == "Hallo" + assert abs(words[0]["confidence"] - 0.84) < 0.01 + + +def test_multi_word_with_spaces(): + prediction = "Sehr geehrter" + confidences = [0.9, 0.8, 0.7, 0.6, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + words = words_from_characters(prediction, confidences) + assert len(words) == 2 + assert words[0]["text"] == "Sehr" + assert words[1]["text"] == "geehrter" + + +def test_length_mismatch_falls_back_safely(): + words = words_from_characters("Hallo Welt", [0.9, 0.8]) + assert len(words) == 1 + assert words[0]["text"] == "Hallo Welt" + assert words[0]["confidence"] == 1.0 + + +def test_empty_prediction_returns_empty(): + assert words_from_characters("", []) == [] + + +def test_single_character_word(): + words = words_from_characters("A B", [0.9, 0.5, 0.3]) + assert len(words) == 2 + assert words[0]["text"] == "A" + assert words[0]["confidence"] == 0.9 + assert words[1]["text"] == "B" + assert words[1]["confidence"] == 0.3 + + +def test_whitespace_only_prediction(): + words = words_from_characters(" ", [0.5, 0.5, 0.5]) + assert words == [] -- 2.49.1 From 41f92622382fa019d89fb1cd0a0b720bb84d233d Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 19:19:39 +0200 Subject: [PATCH 15/74] feat(ocr): add Kraken model download and evaluation script Runbook script to download both HTR-United Kurrent model candidates (german_kurrent_manu_9, kurrent-de) into the ocr_models Docker volume, test them against sample documents, and activate the winner. 
Usage: ./scripts/download-kraken-models.sh # download both ./scripts/download-kraken-models.sh --activate 1 # pick model 1 Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 89 +++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100755 scripts/download-kraken-models.sh diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh new file mode 100755 index 00000000..1486e66b --- /dev/null +++ b/scripts/download-kraken-models.sh @@ -0,0 +1,89 @@ +#!/bin/bash +set -euo pipefail + +# Downloads Kraken HTR models for German Kurrent/Suetterlin into the ocr_models volume. +# Run this once after first deployment, or whenever you want to switch models. +# +# Usage: +# ./scripts/download-kraken-models.sh # download both candidates +# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) +# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) + +COMPOSE_SERVICE="ocr-service" +MODEL_DIR="/app/models" +ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel" + +MODEL_1_NAME="german_kurrent_manu_9" +MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)" +MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel" + +MODEL_2_NAME="kurrent-de" +MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)" +MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel" + +# ─── Functions ──────────────────────────────────────────────────────────────── + +download_models() { + echo "Downloading Kraken HTR models into the ocr_models volume..." + echo "" + + echo "Model 1: $MODEL_1_NAME" + echo " $MODEL_1_DESC" + docker compose run --rm "$COMPOSE_SERVICE" \ + kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" + echo "" + + echo "Model 2: $MODEL_2_NAME" + echo " $MODEL_2_DESC" + docker compose run --rm "$COMPOSE_SERVICE" \ + kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" + echo "" + + echo "Both models downloaded. 
To test them against a sample document:" + echo "" + echo " # Copy a sample Kurrent scan into the container:" + echo " docker cp sample-kurrent.png archive-ocr:/tmp/sample.png" + echo "" + echo " # Test model 1:" + echo " docker compose exec ocr-service kraken -i /tmp/sample.png /tmp/out1.txt segment -bl ocr -m $MODEL_1_PATH" + echo " docker compose exec ocr-service cat /tmp/out1.txt" + echo "" + echo " # Test model 2:" + echo " docker compose exec ocr-service kraken -i /tmp/sample.png /tmp/out2.txt segment -bl ocr -m $MODEL_2_PATH" + echo " docker compose exec ocr-service cat /tmp/out2.txt" + echo "" + echo "Then activate the better model:" + echo " ./scripts/download-kraken-models.sh --activate 1 # or 2" +} + +activate_model() { + local choice="$1" + case "$choice" in + 1) + echo "Activating model 1: $MODEL_1_NAME" + docker compose run --rm "$COMPOSE_SERVICE" \ + cp "$MODEL_1_PATH" "$ACTIVE_MODEL" + ;; + 2) + echo "Activating model 2: $MODEL_2_NAME" + docker compose run --rm "$COMPOSE_SERVICE" \ + cp "$MODEL_2_PATH" "$ACTIVE_MODEL" + ;; + *) + echo "Error: --activate expects 1 or 2" + exit 1 + ;; + esac + + echo "Active model is now: $ACTIVE_MODEL" + echo "Restart the OCR service to load the new model:" + echo " docker compose restart ocr-service" +} + +# ─── Main ───────────────────────────────────────────────────────────────────── + +if [[ "${1:-}" == "--activate" ]]; then + activate_model "${2:-}" +else + download_models +fi -- 2.49.1 From 6669fffeade9efe91310a3a0de9a67e8383cf105 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 19:34:03 +0200 Subject: [PATCH 16/74] fix(ocr): pin transformers<5.0 and torch==2.7.1 in requirements.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transformers 5.x breaks surya 0.17.1 — SuryaDecoderConfig is missing pad_token_id. Pin to transformers>=4.56.1,<5.0.0. 
Also add torch==2.7.1 to requirements.txt to prevent pip from upgrading it past the CPU-only build installed in the Dockerfile layer. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index 5a090f8b..5337edef 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -1,6 +1,8 @@ fastapi[standard]==0.115.6 surya-ocr==0.17.1 kraken==7.0 +torch==2.7.1 +transformers>=4.56.1,<5.0.0 pillow>=10.2.0,<11.0.0 pypdfium2==4.30.0 httpx==0.28.1 -- 2.49.1 From 0af474967743a936dafcbd253898d25219cabdbc Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 19:41:59 +0200 Subject: [PATCH 17/74] feat(ocr): extend model script with automatic OCR evaluation Downloads both Kraken models, then runs each against 4 sample PDFs from the import folder (Eu-0693, Eu-0692, W-0150, W-0575). Output goes to ocr-model-evaluation//.txt for side-by-side comparison. Usage: ./scripts/download-kraken-models.sh # download + evaluate ./scripts/download-kraken-models.sh --eval-only # re-run evaluation ./scripts/download-kraken-models.sh --activate 1 # pick winner Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 117 ++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 29 deletions(-) diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 1486e66b..18d2cdc7 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -1,13 +1,13 @@ #!/bin/bash set -euo pipefail -# Downloads Kraken HTR models for German Kurrent/Suetterlin into the ocr_models volume. -# Run this once after first deployment, or whenever you want to switch models. +# Downloads Kraken HTR models, runs OCR evaluation on sample PDFs, and activates the winner. 
# # Usage: -# ./scripts/download-kraken-models.sh # download both candidates -# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) -# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) +# ./scripts/download-kraken-models.sh # download models + run evaluation +# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) +# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) +# ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded) COMPOSE_SERVICE="ocr-service" MODEL_DIR="/app/models" @@ -21,6 +21,10 @@ MODEL_2_NAME="kurrent-de" MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)" MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel" +EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf") +IMPORT_DIR="./import" +OUTPUT_BASE="./ocr-model-evaluation" + # ─── Functions ──────────────────────────────────────────────────────────────── download_models() { @@ -29,31 +33,81 @@ download_models() { echo "Model 1: $MODEL_1_NAME" echo " $MODEL_1_DESC" - docker compose run --rm "$COMPOSE_SERVICE" \ - kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" + docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" echo "" echo "Model 2: $MODEL_2_NAME" echo " $MODEL_2_DESC" - docker compose run --rm "$COMPOSE_SERVICE" \ - kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" + docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" echo "" - echo "Both models downloaded. To test them against a sample document:" + echo "Both models downloaded." 
+} + +run_evaluation() { echo "" - echo " # Copy a sample Kurrent scan into the container:" - echo " docker cp sample-kurrent.png archive-ocr:/tmp/sample.png" + echo "═══════════════════════════════════════════════════════" + echo " Running OCR evaluation on ${#EVAL_PDFS[@]} documents" + echo "═══════════════════════════════════════════════════════" echo "" - echo " # Test model 1:" - echo " docker compose exec ocr-service kraken -i /tmp/sample.png /tmp/out1.txt segment -bl ocr -m $MODEL_1_PATH" - echo " docker compose exec ocr-service cat /tmp/out1.txt" + + # Create output directories on the host + local dir_1="$OUTPUT_BASE/$MODEL_1_NAME" + local dir_2="$OUTPUT_BASE/$MODEL_2_NAME" + mkdir -p "$dir_1" "$dir_2" + + for pdf in "${EVAL_PDFS[@]}"; do + local src="$IMPORT_DIR/$pdf" + local basename="${pdf%.pdf}" + + if [[ ! -f "$src" ]]; then + echo "SKIP: $src not found" + continue + fi + + echo "──── $pdf ────" + + # Model 1 + echo " Model 1: $MODEL_1_NAME ..." + docker compose run --rm \ + -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ + -v "$(cd "$dir_1" && pwd):/eval-output" \ + "$COMPOSE_SERVICE" \ + kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ + 2>/dev/null || echo " ⚠ Model 1 failed on $pdf" + + # Model 2 + echo " Model 2: $MODEL_2_NAME ..." + docker compose run --rm \ + -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ + -v "$(cd "$dir_2" && pwd):/eval-output" \ + "$COMPOSE_SERVICE" \ + kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ + 2>/dev/null || echo " ⚠ Model 2 failed on $pdf" + + echo "" + done + + echo "═══════════════════════════════════════════════════════" + echo " Evaluation complete. 
Results:" echo "" - echo " # Test model 2:" - echo " docker compose exec ocr-service kraken -i /tmp/sample.png /tmp/out2.txt segment -bl ocr -m $MODEL_2_PATH" - echo " docker compose exec ocr-service cat /tmp/out2.txt" + echo " Model 1 ($MODEL_1_NAME):" + for f in "$dir_1"/*.txt; do + [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" + done echo "" - echo "Then activate the better model:" - echo " ./scripts/download-kraken-models.sh --activate 1 # or 2" + echo " Model 2 ($MODEL_2_NAME):" + for f in "$dir_2"/*.txt; do + [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" + done + echo "" + echo " Compare the outputs side by side:" + echo " diff $dir_1/Eu-0693.txt $dir_2/Eu-0693.txt" + echo "" + echo " Then activate the better model:" + echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME" + echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME" + echo "═══════════════════════════════════════════════════════" } activate_model() { @@ -61,13 +115,11 @@ activate_model() { case "$choice" in 1) echo "Activating model 1: $MODEL_1_NAME" - docker compose run --rm "$COMPOSE_SERVICE" \ - cp "$MODEL_1_PATH" "$ACTIVE_MODEL" + docker compose run --rm "$COMPOSE_SERVICE" cp "$MODEL_1_PATH" "$ACTIVE_MODEL" ;; 2) echo "Activating model 2: $MODEL_2_NAME" - docker compose run --rm "$COMPOSE_SERVICE" \ - cp "$MODEL_2_PATH" "$ACTIVE_MODEL" + docker compose run --rm "$COMPOSE_SERVICE" cp "$MODEL_2_PATH" "$ACTIVE_MODEL" ;; *) echo "Error: --activate expects 1 or 2" @@ -82,8 +134,15 @@ activate_model() { # ─── Main ───────────────────────────────────────────────────────────────────── -if [[ "${1:-}" == "--activate" ]]; then - activate_model "${2:-}" -else - download_models -fi +case "${1:-}" in + --activate) + activate_model "${2:-}" + ;; + --eval-only) + run_evaluation + ;; + *) + download_models + run_evaluation + ;; +esac -- 2.49.1 From 37abc376ec01847638314fe6f05cc352f559fd24 
Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 19:46:37 +0200 Subject: [PATCH 18/74] fix(ocr): install torchvision from CPU index alongside torch torchvision installed from PyPI expects CUDA torch operator registrations. Installing from the CPU whl index ensures torchvision matches the CPU-only torch build. Fixes 'torchvision::nms does not exist' RuntimeError on startup. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 2 ++ ocr-service/requirements.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 57368726..99ac88fc 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -9,8 +9,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved) +# torchvision must also come from the CPU index to match torch's operator registrations RUN pip install --no-cache-dir \ torch==2.7.1 \ + torchvision==0.22.1 \ --index-url https://download.pytorch.org/whl/cpu COPY requirements.txt . diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index 5337edef..7ee1d67a 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -2,6 +2,7 @@ fastapi[standard]==0.115.6 surya-ocr==0.17.1 kraken==7.0 torch==2.7.1 +torchvision==0.22.1 transformers>=4.56.1,<5.0.0 pillow>=10.2.0,<11.0.0 pypdfium2==4.30.0 -- 2.49.1 From f12b41161ecf088a6f5a954ce74466b53a07178f Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 20:05:29 +0200 Subject: [PATCH 19/74] fix(ocr): update model script for kraken 7 DOI-based downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kraken 7 uses DOIs (not short names) to identify models from Zenodo. 
Updated to use actual DOIs: - 10.5281/zenodo.7933463 — German handwriting HTR - 10.5281/zenodo.13788177 — McCATMuS generic handwritten/printed/typed Added -f pdf flag for PDF input, volume mounts for import dir, and post-download copy from htrmopo cache to the models volume. Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 62 ++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 18d2cdc7..8c9898b1 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -5,20 +5,23 @@ set -euo pipefail # # Usage: # ./scripts/download-kraken-models.sh # download models + run evaluation -# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 (german_kurrent_manu_9) -# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 (kurrent-de) +# ./scripts/download-kraken-models.sh --activate 1 # activate model 1 +# ./scripts/download-kraken-models.sh --activate 2 # activate model 2 # ./scripts/download-kraken-models.sh --eval-only # re-run evaluation (models already downloaded) COMPOSE_SERVICE="ocr-service" MODEL_DIR="/app/models" ACTIVE_MODEL="$MODEL_DIR/german_kurrent.mlmodel" -MODEL_1_NAME="german_kurrent_manu_9" -MODEL_1_DESC="19th-century German administrative Kurrent (HTR-United)" +# Kraken 7 uses DOIs to identify models from HTR-United / Zenodo +MODEL_1_DOI="10.5281/zenodo.7933463" +MODEL_1_NAME="german_handwriting" +MODEL_1_DESC="HTR model for German manuscripts (handwritten text recognition)" MODEL_1_PATH="$MODEL_DIR/$MODEL_1_NAME.mlmodel" -MODEL_2_NAME="kurrent-de" -MODEL_2_DESC="Broad German Kurrent coverage (HTR-United)" +MODEL_2_DOI="10.5281/zenodo.13788177" +MODEL_2_NAME="mccatmus" +MODEL_2_DESC="McCATMuS — generic model for handwritten, printed & typewritten (16th c. 
onward)" MODEL_2_PATH="$MODEL_DIR/$MODEL_2_NAME.mlmodel" EVAL_PDFS=("Eu-0693.pdf" "Eu-0692.pdf" "W-0150.pdf" "W-0575.pdf") @@ -27,18 +30,44 @@ OUTPUT_BASE="./ocr-model-evaluation" # ─── Functions ──────────────────────────────────────────────────────────────── +download_model() { + local doi="$1" + local dest="$2" + local name="$3" + + echo " Downloading $name ($doi)..." + + # kraken get downloads to /root/.local/share/htrmopo// + # We find the .mlmodel file after download and copy it to our volume + docker compose run --rm "$COMPOSE_SERVICE" sh -c " + kraken get $doi 2>&1 + # Find the most recently downloaded .mlmodel and copy to target + FOUND=\$(find /root/.local/share/htrmopo -name '*.mlmodel' -newer /tmp 2>/dev/null | head -1) + if [ -n \"\$FOUND\" ]; then + cp \"\$FOUND\" $dest + echo \"Saved to $dest\" + else + echo 'ERROR: No .mlmodel file found after download' + exit 1 + fi + " +} + download_models() { echo "Downloading Kraken HTR models into the ocr_models volume..." echo "" + # Create a timestamp marker so we can find newly downloaded files + docker compose run --rm "$COMPOSE_SERVICE" touch /tmp/.download-marker + echo "Model 1: $MODEL_1_NAME" echo " $MODEL_1_DESC" - docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_1_NAME" -o "$MODEL_1_PATH" + download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME" echo "" echo "Model 2: $MODEL_2_NAME" echo " $MODEL_2_DESC" - docker compose run --rm "$COMPOSE_SERVICE" kraken get "$MODEL_2_NAME" -o "$MODEL_2_PATH" + download_model "$MODEL_2_DOI" "$MODEL_2_PATH" "$MODEL_2_NAME" echo "" echo "Both models downloaded." @@ -51,7 +80,6 @@ run_evaluation() { echo "═══════════════════════════════════════════════════════" echo "" - # Create output directories on the host local dir_1="$OUTPUT_BASE/$MODEL_1_NAME" local dir_2="$OUTPUT_BASE/$MODEL_2_NAME" mkdir -p "$dir_1" "$dir_2" @@ -67,22 +95,20 @@ run_evaluation() { echo "──── $pdf ────" - # Model 1 echo " Model 1: $MODEL_1_NAME ..." 
docker compose run --rm \ -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ -v "$(cd "$dir_1" && pwd):/eval-output" \ "$COMPOSE_SERVICE" \ - kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ + kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ 2>/dev/null || echo " ⚠ Model 1 failed on $pdf" - # Model 2 echo " Model 2: $MODEL_2_NAME ..." docker compose run --rm \ -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ -v "$(cd "$dir_2" && pwd):/eval-output" \ "$COMPOSE_SERVICE" \ - kraken -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ + kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ 2>/dev/null || echo " ⚠ Model 2 failed on $pdf" echo "" @@ -91,20 +117,22 @@ run_evaluation() { echo "═══════════════════════════════════════════════════════" echo " Evaluation complete. Results:" echo "" - echo " Model 1 ($MODEL_1_NAME):" + echo " Model 1 — $MODEL_1_NAME ($MODEL_1_DOI):" for f in "$dir_1"/*.txt; do [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" done echo "" - echo " Model 2 ($MODEL_2_NAME):" + echo " Model 2 — $MODEL_2_NAME ($MODEL_2_DOI):" for f in "$dir_2"/*.txt; do [[ -f "$f" ]] && echo " $(basename "$f"): $(wc -l < "$f") lines, $(wc -c < "$f") bytes" done echo "" - echo " Compare the outputs side by side:" + echo " Compare outputs:" echo " diff $dir_1/Eu-0693.txt $dir_2/Eu-0693.txt" + echo " # or view individually:" + echo " cat $dir_1/Eu-0693.txt" echo "" - echo " Then activate the better model:" + echo " Activate the better model:" echo " ./scripts/download-kraken-models.sh --activate 1 # $MODEL_1_NAME" echo " ./scripts/download-kraken-models.sh --activate 2 # $MODEL_2_NAME" echo "═══════════════════════════════════════════════════════" -- 2.49.1 From c0004f5e6f0906fb8f2b7af6782966e786aff1c7 Mon Sep 17 00:00:00 2001 From: Marcel Date: 
Sun, 12 Apr 2026 20:09:23 +0200 Subject: [PATCH 20/74] fix(ocr): parse kraken 'Model dir' output to locate downloaded model The previous approach used find across the htrmopo cache which failed because -newer /tmp ran in a separate container. Now parses the 'Model dir: ' line from kraken get output directly. Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 32 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 8c9898b1..6e3d5a47 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -37,29 +37,33 @@ download_model() { echo " Downloading $name ($doi)..." - # kraken get downloads to /root/.local/share/htrmopo// - # We find the .mlmodel file after download and copy it to our volume - docker compose run --rm "$COMPOSE_SERVICE" sh -c " - kraken get $doi 2>&1 - # Find the most recently downloaded .mlmodel and copy to target - FOUND=\$(find /root/.local/share/htrmopo -name '*.mlmodel' -newer /tmp 2>/dev/null | head -1) - if [ -n \"\$FOUND\" ]; then - cp \"\$FOUND\" $dest - echo \"Saved to $dest\" + # kraken get downloads to /root/.local/share/htrmopo//.mlmodel + # Parse the "Model dir: " line from kraken output to locate the file + docker compose run --rm "$COMPOSE_SERVICE" sh -c ' + OUTPUT=$(kraken get '"$doi"' 2>&1) + echo "$OUTPUT" + MODEL_DIR=$(echo "$OUTPUT" | grep -oP "Model dir: \K[^ ]+") + if [ -n "$MODEL_DIR" ] && [ -d "$MODEL_DIR" ]; then + FOUND=$(find "$MODEL_DIR" -name "*.mlmodel" | head -1) + if [ -n "$FOUND" ]; then + cp "$FOUND" '"$dest"' + echo "Saved to '"$dest"'" + else + echo "ERROR: No .mlmodel file in $MODEL_DIR" + ls -la "$MODEL_DIR" + exit 1 + fi else - echo 'ERROR: No .mlmodel file found after download' + echo "ERROR: Could not parse model directory from kraken output" exit 1 fi - " + ' } download_models() { echo "Downloading Kraken HTR models into the ocr_models 
volume..." echo "" - # Create a timestamp marker so we can find newly downloaded files - docker compose run --rm "$COMPOSE_SERVICE" touch /tmp/.download-marker - echo "Model 1: $MODEL_1_NAME" echo " $MODEL_1_DESC" download_model "$MODEL_1_DOI" "$MODEL_1_PATH" "$MODEL_1_NAME" -- 2.49.1 From 31519af1a4d32c9b73191678930569846b01fc4b Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 20:11:14 +0200 Subject: [PATCH 21/74] fix(ocr): add pyvips for kraken PDF input support Kraken 7 requires pyvips (optional dep) for -f pdf mode. Added libvips42 system package and pyvips Python package. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 3 ++- ocr-service/requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 99ac88fc..0c90503c 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -2,10 +2,11 @@ FROM python:3.11-slim WORKDIR /app -# curl for healthcheck; libgomp1 for PyTorch CPU threading +# curl for healthcheck; libgomp1 for PyTorch CPU threading; libvips for kraken PDF support RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ libgomp1 \ + libvips42 \ && rm -rf /var/lib/apt/lists/* # PyTorch CPU-only — separate layer; the whl/cpu index strips all CUDA variants (~2 GB saved) diff --git a/ocr-service/requirements.txt b/ocr-service/requirements.txt index 7ee1d67a..5c3023d8 100644 --- a/ocr-service/requirements.txt +++ b/ocr-service/requirements.txt @@ -6,4 +6,5 @@ torchvision==0.22.1 transformers>=4.56.1,<5.0.0 pillow>=10.2.0,<11.0.0 pypdfium2==4.30.0 +pyvips>=2.2.0 httpx==0.28.1 -- 2.49.1 From dd078d50da7fd36ef8fed45238edafc56e67c542 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 20:37:29 +0200 Subject: [PATCH 22/74] fix(ocr): extract PDF pages as PNGs before running kraken OCR Kraken's -f pdf mode tries to write output next to the input file, which fails on read-only mounts. 
Instead, extract pages as PNGs via pypdfium2 (already installed), then run kraken on each image. Both models run in a single container per PDF to avoid overhead. Co-Authored-By: Claude Sonnet 4.6 --- scripts/download-kraken-models.sh | 45 +++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/scripts/download-kraken-models.sh b/scripts/download-kraken-models.sh index 6e3d5a47..c7387322 100755 --- a/scripts/download-kraken-models.sh +++ b/scripts/download-kraken-models.sh @@ -99,21 +99,44 @@ run_evaluation() { echo "──── $pdf ────" - echo " Model 1: $MODEL_1_NAME ..." + # Run both models inside a single container run: + # 1. Extract PDF pages as PNGs (pypdfium2 is already installed) + # 2. Run kraken on each page image for both models + # 3. Concatenate per-page output into one file per model docker compose run --rm \ -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ - -v "$(cd "$dir_1" && pwd):/eval-output" \ + -v "$(cd "$dir_1" && pwd):/eval-out-1" \ + -v "$(cd "$dir_2" && pwd):/eval-out-2" \ "$COMPOSE_SERVICE" \ - kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_1_PATH" \ - 2>/dev/null || echo " ⚠ Model 1 failed on $pdf" + python3 -c " +import pypdfium2 as pdfium, subprocess, sys, os - echo " Model 2: $MODEL_2_NAME ..." 
- docker compose run --rm \ - -v "$(cd "$IMPORT_DIR" && pwd):/eval-input:ro" \ - -v "$(cd "$dir_2" && pwd):/eval-output" \ - "$COMPOSE_SERVICE" \ - kraken -f pdf -i "/eval-input/$pdf" "/eval-output/${basename}.txt" segment -bl ocr -m "$MODEL_2_PATH" \ - 2>/dev/null || echo " ⚠ Model 2 failed on $pdf" +pdf = pdfium.PdfDocument('/eval-input/$pdf') +pages = [] +for i in range(len(pdf)): + bmp = pdf[i].render(scale=300/72) + path = f'/tmp/page_{i:04d}.png' + bmp.to_pil().save(path) + pages.append(path) +print(f'Extracted {len(pages)} pages') + +for label, model, outdir in [ + ('Model 1', '$MODEL_1_PATH', '/eval-out-1'), + ('Model 2', '$MODEL_2_PATH', '/eval-out-2'), +]: + print(f' {label}...') + combined = '' + for p in pages: + args = ['kraken', '-i', p, '/dev/stdout', 'segment', '-bl', 'ocr', '-m', model] + r = subprocess.run(args, capture_output=True, text=True) + combined += r.stdout + if r.returncode != 0: + print(f' ⚠ failed on {os.path.basename(p)}: {r.stderr[:200]}', file=sys.stderr) + with open(f'{outdir}/${basename}.txt', 'w') as f: + f.write(combined) + lines = combined.count(chr(10)) + print(f' → {lines} lines') +" || echo " ⚠ Failed on $pdf" echo "" done -- 2.49.1 From f064b27439bfe13aff0298608b5f86d9b943251f Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 20:50:59 +0200 Subject: [PATCH 23/74] feat(ocr): per-script-type confidence thresholds Kurrent OCR produces much lower confidence than typewriter/Latin. Separate thresholds allow aggressive filtering for Kurrent (0.5) while keeping typewriter lenient (0.3). 
- OCR_CONFIDENCE_THRESHOLD: default for Surya paths (0.3) - OCR_CONFIDENCE_THRESHOLD_KURRENT: Kraken Kurrent path (0.5) - apply_confidence_markers() now accepts threshold parameter - get_threshold(script_type) selects the right threshold Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 1 + ocr-service/confidence.py | 17 ++++++++++++++--- ocr-service/main.py | 5 +++-- ocr-service/test_confidence.py | 31 +++++++++++++++++++------------ 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5e88f381..6d68e1aa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -85,6 +85,7 @@ services: environment: KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel OCR_CONFIDENCE_THRESHOLD: "0.3" + OCR_CONFIDENCE_THRESHOLD_KURRENT: "0.5" networks: - archive-net healthcheck: diff --git a/ocr-service/confidence.py b/ocr-service/confidence.py index 092c2892..e331443f 100644 --- a/ocr-service/confidence.py +++ b/ocr-service/confidence.py @@ -2,16 +2,24 @@ import os -CONFIDENCE_THRESHOLD = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3")) +THRESHOLD_DEFAULT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD", "0.3")) +THRESHOLD_KURRENT = float(os.environ.get("OCR_CONFIDENCE_THRESHOLD_KURRENT", "0.5")) ILLEGIBLE_MARKER = "[unleserlich]" -def apply_confidence_markers(words: list[dict]) -> str: +def get_threshold(script_type: str) -> float: + if script_type and script_type.upper() == "HANDWRITING_KURRENT": + return THRESHOLD_KURRENT + return THRESHOLD_DEFAULT + + +def apply_confidence_markers(words: list[dict], threshold: float | None = None) -> str: """Replace low-confidence words with [unleserlich], collapsing adjacent markers. Args: words: list of {"text": str, "confidence": float} dicts + threshold: confidence threshold (uses THRESHOLD_DEFAULT if None) Returns: Reconstructed text string with [unleserlich] substitutions. 
@@ -19,11 +27,14 @@ def apply_confidence_markers(words: list[dict]) -> str: if not words: return "" + if threshold is None: + threshold = THRESHOLD_DEFAULT + result: list[str] = [] prev_was_marker = False for word in words: - if word["confidence"] < CONFIDENCE_THRESHOLD: + if word["confidence"] < threshold: if not prev_was_marker: result.append(ILLEGIBLE_MARKER) prev_was_marker = True diff --git a/ocr-service/main.py b/ocr-service/main.py index f87985e6..b1766516 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -9,7 +9,7 @@ import pypdfium2 as pdfium from fastapi import FastAPI, HTTPException from PIL import Image -from confidence import apply_confidence_markers +from confidence import apply_confidence_markers, get_threshold from engines import kraken as kraken_engine from engines import surya as surya_engine from models import OcrBlock, OcrRequest @@ -72,9 +72,10 @@ async def run_ocr(request: OcrRequest): # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya blocks = surya_engine.extract_blocks(images, request.language) + threshold = get_threshold(script_type) for block in blocks: if block.get("words"): - block["text"] = apply_confidence_markers(block["words"]) + block["text"] = apply_confidence_markers(block["words"], threshold) block.pop("words", None) return [OcrBlock(**b) for b in blocks] diff --git a/ocr-service/test_confidence.py b/ocr-service/test_confidence.py index e1359eb1..c89ade9b 100644 --- a/ocr-service/test_confidence.py +++ b/ocr-service/test_confidence.py @@ -2,7 +2,7 @@ import os import pytest -from confidence import apply_confidence_markers, words_from_characters +from confidence import apply_confidence_markers, words_from_characters, get_threshold # ─── apply_confidence_markers ───────────────────────────────────────────────── @@ -75,22 +75,29 @@ def test_just_below_threshold_replaced(): assert apply_confidence_markers(words) == "[unleserlich]" -def test_custom_threshold_via_env(monkeypatch): - 
monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.8") - # Need to reload the module to pick up the new env var - import importlib - import confidence - importlib.reload(confidence) - +def test_custom_threshold_via_parameter(): words = [ {"text": "Lieber", "confidence": 0.95}, {"text": "Freund", "confidence": 0.5}, ] - assert confidence.apply_confidence_markers(words) == "Lieber [unleserlich]" + assert apply_confidence_markers(words, threshold=0.8) == "Lieber [unleserlich]" + assert apply_confidence_markers(words, threshold=0.3) == "Lieber Freund" - # Reset - monkeypatch.setenv("OCR_CONFIDENCE_THRESHOLD", "0.3") - importlib.reload(confidence) + +def test_kurrent_threshold_is_higher_than_default(): + default = get_threshold("TYPEWRITER") + kurrent = get_threshold("HANDWRITING_KURRENT") + assert kurrent > default + + +def test_get_threshold_kurrent(): + assert get_threshold("HANDWRITING_KURRENT") == 0.5 + + +def test_get_threshold_default(): + assert get_threshold("TYPEWRITER") == 0.3 + assert get_threshold("HANDWRITING_LATIN") == 0.3 + assert get_threshold("UNKNOWN") == 0.3 def test_low_confidence_at_start(): -- 2.49.1 From 3aaec014212176c811b307be5619ff69f61bc1e8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 21:44:51 +0200 Subject: [PATCH 24/74] feat(transcription): add source/reviewed fields for training pipeline - BlockSource enum: MANUAL, OCR - V26 migration adds source + reviewed columns to transcription_blocks - OcrService sets source=OCR when creating blocks - TranscriptionService.reviewBlock() toggles the reviewed flag - PUT /api/documents/{id}/transcription-blocks/{blockId}/review endpoint - 5 new tests: reviewBlock toggle/untoggle/notfound, controller, OcrService source=OCR verification The reviewed flag enables the Kraken fine-tuning pipeline: only blocks marked as reviewed by a human are exported as training data. 
Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- .../TranscriptionBlockController.java | 8 ++++ .../familienarchiv/model/BlockSource.java | 6 +++ .../model/TranscriptionBlock.java | 11 +++++ .../familienarchiv/service/OcrService.java | 1 + .../service/TranscriptionService.java | 7 +++ ...e_and_reviewed_to_transcription_blocks.sql | 2 + .../TranscriptionBlockControllerTest.java | 16 +++++++ .../service/OcrServiceTest.java | 29 +++++++++++++ .../service/TranscriptionServiceTest.java | 43 +++++++++++++++++++ 9 files changed, 123 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java create mode 100644 backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql diff --git a/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java b/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java index 227713d0..fd52d8f4 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/controller/TranscriptionBlockController.java @@ -81,6 +81,14 @@ public class TranscriptionBlockController { return transcriptionService.listBlocks(documentId); } + @PutMapping("/{blockId}/review") + @RequirePermission(Permission.WRITE_ALL) + public TranscriptionBlock reviewBlock( + @PathVariable UUID documentId, + @PathVariable UUID blockId) { + return transcriptionService.reviewBlock(documentId, blockId); + } + @GetMapping("/{blockId}/history") @RequirePermission(Permission.READ_ALL) public List getBlockHistory( diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java b/backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java new file mode 100644 index 00000000..eb412e64 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/BlockSource.java @@ -0,0 +1,6 @@ +package 
org.raddatz.familienarchiv.model; + +public enum BlockSource { + MANUAL, + OCR +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java b/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java index 6f1e008e..8f01dbeb 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/TranscriptionBlock.java @@ -41,6 +41,17 @@ public class TranscriptionBlock { @Schema(requiredMode = Schema.RequiredMode.REQUIRED) private int sortOrder; + @Enumerated(EnumType.STRING) + @Column(nullable = false, length = 10) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private BlockSource source = BlockSource.MANUAL; + + @Column(nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private boolean reviewed = false; + @Version @Column(nullable = false) @Schema(requiredMode = Schema.RequiredMode.REQUIRED) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java index 5ec7a2f1..5587b588 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java @@ -107,6 +107,7 @@ public class OcrService { .documentId(documentId) .text(block.text() != null ? 
block.text() : "") .sortOrder(i) + .source(BlockSource.OCR) .createdBy(userId) .updatedBy(userId) .build(); diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java index 2aff91bb..1f8126c1 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/TranscriptionService.java @@ -116,6 +116,13 @@ public class TranscriptionService { } } + @Transactional + public TranscriptionBlock reviewBlock(UUID documentId, UUID blockId) { + TranscriptionBlock block = getBlock(documentId, blockId); + block.setReviewed(!block.isReviewed()); + return blockRepository.save(block); + } + public List getBlockHistory(UUID documentId, UUID blockId) { getBlock(documentId, blockId); return versionRepository.findByBlockIdOrderByChangedAtDesc(blockId); diff --git a/backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql b/backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql new file mode 100644 index 00000000..de655f91 --- /dev/null +++ b/backend/src/main/resources/db/migration/V26__add_source_and_reviewed_to_transcription_blocks.sql @@ -0,0 +1,2 @@ +ALTER TABLE transcription_blocks ADD COLUMN source VARCHAR(10) NOT NULL DEFAULT 'MANUAL'; +ALTER TABLE transcription_blocks ADD COLUMN reviewed BOOLEAN NOT NULL DEFAULT FALSE; diff --git a/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java b/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java index a891413e..54a9be2a 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/controller/TranscriptionBlockControllerTest.java @@ -356,4 +356,20 @@ class 
TranscriptionBlockControllerTest { .andExpect(status().isOk()) .andExpect(jsonPath("$").isEmpty()); } + + // ─── PUT .../review ────────────────────────────────────────────────────── + + @Test + @WithMockUser(authorities = "WRITE_ALL") + void reviewBlock_returns200_withToggledBlock() throws Exception { + TranscriptionBlock reviewed = TranscriptionBlock.builder() + .id(BLOCK_ID).documentId(DOC_ID).annotationId(UUID.randomUUID()) + .text("text").sortOrder(0).reviewed(true).build(); + when(transcriptionService.reviewBlock(DOC_ID, BLOCK_ID)).thenReturn(reviewed); + + mockMvc.perform(put("/api/documents/{documentId}/transcription-blocks/{blockId}/review", + DOC_ID, BLOCK_ID)) + .andExpect(status().isOk()) + .andExpect(jsonPath("$.reviewed").value(true)); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java index 44c598e0..61c62fa3 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java @@ -6,6 +6,7 @@ import org.mockito.ArgumentCaptor; import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import org.mockito.ArgumentCaptor; import org.raddatz.familienarchiv.dto.CreateAnnotationDTO; import org.raddatz.familienarchiv.exception.DomainException; import org.raddatz.familienarchiv.exception.ErrorCode; @@ -173,4 +174,32 @@ class OcrServiceTest { verify(annotationService, times(2)).createOcrAnnotation( eq(docId), any(CreateAnnotationDTO.class), eq(userId), eq("hash123"), any()); } + + @Test + void startOcr_setsBlockSourceToOcr() { + UUID docId = UUID.randomUUID(); + UUID userId = UUID.randomUUID(); + Document doc = Document.builder().id(docId).status(DocumentStatus.UPLOADED) + .filePath("documents/test.pdf").fileHash("hash123") + .scriptType(ScriptType.TYPEWRITER).build(); + OcrBlockResult 
block = new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Test"); + + when(documentService.getDocumentById(docId)).thenReturn(doc); + when(ocrHealthClient.isHealthy()).thenReturn(true); + when(transcriptionService.listBlocks(docId)).thenReturn(List.of()); + when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block)); + when(ocrJobRepository.save(any())).thenAnswer(inv -> { + OcrJob job = inv.getArgument(0); + job.setId(UUID.randomUUID()); + return job; + }); + DocumentAnnotation ann = DocumentAnnotation.builder().id(UUID.randomUUID()).build(); + when(annotationService.createOcrAnnotation(any(), any(), any(), any(), any())).thenReturn(ann); + + ocrService.startOcr(docId, null, userId); + + ArgumentCaptor captor = ArgumentCaptor.forClass(TranscriptionBlock.class); + verify(blockRepository).save(captor.capture()); + assertThat(captor.getValue().getSource()).isEqualTo(BlockSource.OCR); + } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java index ebe02d10..f25a884b 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/TranscriptionServiceTest.java @@ -243,4 +243,47 @@ class TranscriptionServiceTest { assertThat(transcriptionService.listBlocks(docId)).containsExactly(b); } + + // ─── reviewBlock ───────────────────────────────────────────────────────── + + @Test + void reviewBlock_setsReviewedTrue() { + UUID docId = UUID.randomUUID(); + UUID blockId = UUID.randomUUID(); + TranscriptionBlock block = TranscriptionBlock.builder() + .id(blockId).documentId(docId).annotationId(UUID.randomUUID()) + .text("corrected text").sortOrder(0).reviewed(false).build(); + when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block)); + when(blockRepository.save(any())).thenAnswer(inv -> 
inv.getArgument(0)); + + TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId); + + assertThat(result.isReviewed()).isTrue(); + verify(blockRepository).save(block); + } + + @Test + void reviewBlock_togglesReviewedFalse_whenAlreadyReviewed() { + UUID docId = UUID.randomUUID(); + UUID blockId = UUID.randomUUID(); + TranscriptionBlock block = TranscriptionBlock.builder() + .id(blockId).documentId(docId).annotationId(UUID.randomUUID()) + .text("corrected text").sortOrder(0).reviewed(true).build(); + when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.of(block)); + when(blockRepository.save(any())).thenAnswer(inv -> inv.getArgument(0)); + + TranscriptionBlock result = transcriptionService.reviewBlock(docId, blockId); + + assertThat(result.isReviewed()).isFalse(); + } + + @Test + void reviewBlock_throwsNotFound_whenBlockMissing() { + UUID docId = UUID.randomUUID(); + UUID blockId = UUID.randomUUID(); + when(blockRepository.findByIdAndDocumentId(blockId, docId)).thenReturn(Optional.empty()); + + assertThatThrownBy(() -> transcriptionService.reviewBlock(docId, blockId)) + .isInstanceOf(DomainException.class); + } } -- 2.49.1 From 8dc9243add16685af68f1786a4d1f9926e02d1b4 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 22:02:56 +0200 Subject: [PATCH 25/74] feat(frontend): wire OCR trigger + review toggle into transcription panel - OcrTrigger component rendered in the transcription empty state when the document has a file and user has write permission - Review checkmark toggle on each TranscriptionBlock (turquoise when reviewed, muted outline when not). Calls PUT .../review to toggle. - TranscriptionBlockData type: added source + reviewed fields - +page.svelte: triggerOcr() and reviewToggle() functions wired up - Paraglide translations (de/en/es) for review toggle + reviewed count All 687 frontend tests pass. 
Refs #226, #230 Co-Authored-By: Claude Sonnet 4.6 --- frontend/messages/de.json | 5 ++- frontend/messages/en.json | 5 ++- frontend/messages/es.json | 5 ++- .../lib/components/TranscriptionBlock.svelte | 27 ++++++++++++++ .../components/TranscriptionEditView.svelte | 36 ++++++++++++++++--- frontend/src/lib/types.ts | 2 ++ .../src/routes/documents/[id]/+page.svelte | 29 +++++++++++++++ 7 files changed, 102 insertions(+), 7 deletions(-) diff --git a/frontend/messages/de.json b/frontend/messages/de.json index 886c468a..3eac5fd0 100644 --- a/frontend/messages/de.json +++ b/frontend/messages/de.json @@ -520,5 +520,8 @@ "ocr_error_heading": "OCR fehlgeschlagen", "ocr_error_retry": "Erneut versuchen", "ocr_batch_running": "OCR läuft · {processed} von {total} Dokumente abgeschlossen", - "ocr_batch_done": "OCR abgeschlossen · {processed} erfolgreich · {errors} fehlgeschlagen" + "ocr_batch_done": "OCR abgeschlossen · {processed} erfolgreich · {errors} fehlgeschlagen", + "transcription_block_review": "Als geprüft markieren", + "transcription_block_unreview": "Markierung aufheben", + "transcription_reviewed_count": "{reviewed} von {total} geprüft" } diff --git a/frontend/messages/en.json b/frontend/messages/en.json index 86777394..9853b3d7 100644 --- a/frontend/messages/en.json +++ b/frontend/messages/en.json @@ -520,5 +520,8 @@ "ocr_error_heading": "OCR failed", "ocr_error_retry": "Try again", "ocr_batch_running": "OCR running · {processed} of {total} documents complete", - "ocr_batch_done": "OCR complete · {processed} successful · {errors} failed" + "ocr_batch_done": "OCR complete · {processed} successful · {errors} failed", + "transcription_block_review": "Mark as reviewed", + "transcription_block_unreview": "Unmark as reviewed", + "transcription_reviewed_count": "{reviewed} of {total} reviewed" } diff --git a/frontend/messages/es.json b/frontend/messages/es.json index 6764392c..9062c2ed 100644 --- a/frontend/messages/es.json +++ b/frontend/messages/es.json @@ -520,5 +520,8 
@@ "ocr_error_heading": "OCR fallido", "ocr_error_retry": "Intentar de nuevo", "ocr_batch_running": "OCR en curso · {processed} de {total} documentos completados", - "ocr_batch_done": "OCR completado · {processed} exitosos · {errors} fallidos" + "ocr_batch_done": "OCR completado · {processed} exitosos · {errors} fallidos", + "transcription_block_review": "Marcar como revisado", + "transcription_block_unreview": "Desmarcar como revisado", + "transcription_reviewed_count": "{reviewed} de {total} revisados" } diff --git a/frontend/src/lib/components/TranscriptionBlock.svelte b/frontend/src/lib/components/TranscriptionBlock.svelte index ce15ff66..41598b15 100644 --- a/frontend/src/lib/components/TranscriptionBlock.svelte +++ b/frontend/src/lib/components/TranscriptionBlock.svelte @@ -14,6 +14,7 @@ type Props = { text: string; label: string | null; active: boolean; + reviewed: boolean; saveState: SaveState; canComment: boolean; currentUserId: string | null; @@ -21,6 +22,7 @@ type Props = { onFocus: () => void; onDeleteClick: () => void; onRetry: () => void; + onReviewToggle: () => void; onMoveUp?: () => void; onMoveDown?: () => void; isFirst?: boolean; @@ -34,6 +36,7 @@ let { text, label = null, active, + reviewed, saveState, canComment, currentUserId, @@ -41,6 +44,7 @@ let { onFocus, onDeleteClick, onRetry, + onReviewToggle, onMoveUp, onMoveDown, isFirst = false, @@ -239,6 +243,29 @@ function handleTextareaMouseUp() { {/if} + + +