From ff3990710e54f4b2f0f809888758cf8d6bf82be7 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 12 Apr 2026 15:15:16 +0200 Subject: [PATCH] feat(ocr): add OCR infrastructure (interfaces, entities, migrations, DTOs) - OcrClient + OcrHealthClient interfaces for testable OCR integration - OcrBlockResult record for OCR engine response mapping - OcrJob + OcrJobDocument entities with status enums - V25 migration creates ocr_jobs and ocr_job_documents tables - Repositories for job and job-document queries - TriggerOcrDTO, BatchOcrDTO (@Size max=500), OcrStatusDTO - ErrorCodes: OCR_SERVICE_UNAVAILABLE, OCR_JOB_NOT_FOUND, OCR_DOCUMENT_NOT_UPLOADED, OCR_PROCESSING_FAILED Refs #226 Co-Authored-By: Claude Sonnet 4.6 --- .../familienarchiv/dto/BatchOcrDTO.java | 19 ++++++ .../familienarchiv/dto/OcrStatusDTO.java | 19 ++++++ .../familienarchiv/dto/TriggerOcrDTO.java | 13 ++++ .../familienarchiv/exception/ErrorCode.java | 10 +++ .../model/OcrDocumentStatus.java | 9 +++ .../raddatz/familienarchiv/model/OcrJob.java | 62 +++++++++++++++++++ .../familienarchiv/model/OcrJobDocument.java | 59 ++++++++++++++++++ .../familienarchiv/model/OcrJobStatus.java | 8 +++ .../repository/OcrJobDocumentRepository.java | 20 ++++++ .../repository/OcrJobRepository.java | 9 +++ .../service/OcrBlockResult.java | 13 ++++ .../familienarchiv/service/OcrClient.java | 9 +++ .../service/OcrHealthClient.java | 5 ++ .../db/migration/V25__add_ocr_job_tables.sql | 26 ++++++++ 14 files changed, 281 insertions(+) create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java create mode 100644 
backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java create mode 100644 backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java create mode 100644 backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java new file mode 100644 index 00000000..69506437 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/BatchOcrDTO.java @@ -0,0 +1,19 @@ +package org.raddatz.familienarchiv.dto; + +import jakarta.validation.constraints.NotEmpty; +import jakarta.validation.constraints.Size; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.List; +import java.util.UUID; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class BatchOcrDTO { + @NotEmpty + @Size(max = 500, message = "batch size must not exceed 500 documents") + private List<UUID> documentIds; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java new file mode 100644 index 00000000..c23ca303 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/OcrStatusDTO.java @@ -0,0 +1,19 @@ +package org.raddatz.familienarchiv.dto; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + 
+import java.util.UUID; + +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OcrStatusDTO { + private String status; + private UUID jobId; + private int currentPage; + private int totalPages; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java b/backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java new file mode 100644 index 00000000..dda443b3 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/dto/TriggerOcrDTO.java @@ -0,0 +1,13 @@ +package org.raddatz.familienarchiv.dto; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.raddatz.familienarchiv.model.ScriptType; + +@Data +@NoArgsConstructor +@AllArgsConstructor +public class TriggerOcrDTO { + private ScriptType scriptType; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java index b105df54..e3b0c99c 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/exception/ErrorCode.java @@ -66,6 +66,16 @@ public enum ErrorCode { /** The notification with the given ID does not exist. 404 */ NOTIFICATION_NOT_FOUND, + // --- OCR --- + /** The OCR service is not available or not healthy. 503 */ + OCR_SERVICE_UNAVAILABLE, + /** The OCR job with the given ID does not exist. 404 */ + OCR_JOB_NOT_FOUND, + /** The document is not in UPLOADED status and cannot be OCR'd. 400 */ + OCR_DOCUMENT_NOT_UPLOADED, + /** OCR processing failed for the document. 500 */ + OCR_PROCESSING_FAILED, + // --- Generic --- /** Request validation failed (missing or malformed fields). 
400 */ VALIDATION_ERROR, diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java new file mode 100644 index 00000000..d96620b3 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrDocumentStatus.java @@ -0,0 +1,9 @@ +package org.raddatz.familienarchiv.model; + +public enum OcrDocumentStatus { + PENDING, + RUNNING, + DONE, + FAILED, + SKIPPED +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java new file mode 100644 index 00000000..81f205fe --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJob.java @@ -0,0 +1,62 @@ +package org.raddatz.familienarchiv.model; + +import io.swagger.v3.oas.annotations.media.Schema; +import jakarta.persistence.*; +import lombok.*; +import org.hibernate.annotations.CreationTimestamp; +import org.hibernate.annotations.UpdateTimestamp; + +import java.time.LocalDateTime; +import java.util.UUID; + +@Entity +@Table(name = "ocr_jobs") +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OcrJob { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID id; + + @Enumerated(EnumType.STRING) + @Column(nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private OcrJobStatus status = OcrJobStatus.PENDING; + + @Column(name = "total_documents", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private int totalDocuments; + + @Column(name = "processed_documents", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private int processedDocuments = 0; + + @Column(name = "error_count", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private int errorCount 
= 0; + + @Column(name = "skipped_count", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private int skippedCount = 0; + + @Column(name = "created_by") + private UUID createdBy; + + @Column(name = "created_at", nullable = false, updatable = false) + @CreationTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime createdAt; + + @Column(name = "updated_at", nullable = false) + @UpdateTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime updatedAt; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java new file mode 100644 index 00000000..c8f3f702 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobDocument.java @@ -0,0 +1,59 @@ +package org.raddatz.familienarchiv.model; + +import io.swagger.v3.oas.annotations.media.Schema; +import jakarta.persistence.*; +import lombok.*; +import org.hibernate.annotations.CreationTimestamp; +import org.hibernate.annotations.UpdateTimestamp; + +import java.time.LocalDateTime; +import java.util.UUID; + +@Entity +@Table(name = "ocr_job_documents") +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OcrJobDocument { + + @Id + @GeneratedValue(strategy = GenerationType.UUID) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID id; + + @Column(name = "job_id", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID jobId; + + @Column(name = "document_id", nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private UUID documentId; + + @Enumerated(EnumType.STRING) + @Column(nullable = false) + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + @Builder.Default + private OcrDocumentStatus status = OcrDocumentStatus.PENDING; + + @Column(name = "error_message") + private String errorMessage; + + 
@Column(name = "current_page") + @Builder.Default + private int currentPage = 0; + + @Column(name = "total_pages") + @Builder.Default + private int totalPages = 0; + + @Column(name = "created_at", nullable = false, updatable = false) + @CreationTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime createdAt; + + @Column(name = "updated_at", nullable = false) + @UpdateTimestamp + @Schema(requiredMode = Schema.RequiredMode.REQUIRED) + private LocalDateTime updatedAt; +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java new file mode 100644 index 00000000..5f1bf442 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/model/OcrJobStatus.java @@ -0,0 +1,8 @@ +package org.raddatz.familienarchiv.model; + +public enum OcrJobStatus { + PENDING, + RUNNING, + DONE, + FAILED +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java new file mode 100644 index 00000000..3d781804 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobDocumentRepository.java @@ -0,0 +1,20 @@ +package org.raddatz.familienarchiv.repository; + +import org.raddatz.familienarchiv.model.OcrDocumentStatus; +import org.raddatz.familienarchiv.model.OcrJobDocument; +import org.springframework.data.jpa.repository.JpaRepository; + +import java.util.List; +import java.util.Optional; +import java.util.UUID; + +public interface OcrJobDocumentRepository extends JpaRepository<OcrJobDocument, UUID> { + + List<OcrJobDocument> findByJobIdOrderByCreatedAtAsc(UUID jobId); + + List<OcrJobDocument> findByJobIdAndStatus(UUID jobId, OcrDocumentStatus status); + + Optional<OcrJobDocument> findByJobIdAndDocumentId(UUID jobId, UUID documentId); + + Optional<OcrJobDocument> findFirstByDocumentIdAndStatusIn(UUID documentId, List<OcrDocumentStatus> statuses); +} diff --git 
a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java new file mode 100644 index 00000000..5d319ccf --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrJobRepository.java @@ -0,0 +1,9 @@ +package org.raddatz.familienarchiv.repository; + +import org.raddatz.familienarchiv.model.OcrJob; +import org.springframework.data.jpa.repository.JpaRepository; + +import java.util.UUID; + +public interface OcrJobRepository extends JpaRepository<OcrJob, UUID> { +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java new file mode 100644 index 00000000..e01b7def --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrBlockResult.java @@ -0,0 +1,13 @@ +package org.raddatz.familienarchiv.service; + +import java.util.List; + +public record OcrBlockResult( + int pageNumber, + double x, + double y, + double width, + double height, + List<List<Double>> polygon, + String text +) {} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java new file mode 100644 index 00000000..3b33aaf2 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrClient.java @@ -0,0 +1,9 @@ +package org.raddatz.familienarchiv.service; + +import org.raddatz.familienarchiv.model.ScriptType; + +import java.util.List; + +public interface OcrClient { + List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType); +} diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java new file mode 100644 index 00000000..3a62f592 --- /dev/null +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrHealthClient.java @@ -0,0 +1,5 @@ 
+package org.raddatz.familienarchiv.service; + +public interface OcrHealthClient { + boolean isHealthy(); +} diff --git a/backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql b/backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql new file mode 100644 index 00000000..a9f6945c --- /dev/null +++ b/backend/src/main/resources/db/migration/V25__add_ocr_job_tables.sql @@ -0,0 +1,26 @@ +CREATE TABLE ocr_jobs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + status VARCHAR(20) NOT NULL DEFAULT 'PENDING', + total_documents INT NOT NULL, + processed_documents INT NOT NULL DEFAULT 0, + error_count INT NOT NULL DEFAULT 0, + skipped_count INT NOT NULL DEFAULT 0, + created_by UUID, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE TABLE ocr_job_documents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + job_id UUID NOT NULL REFERENCES ocr_jobs(id) ON DELETE CASCADE, + document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, + status VARCHAR(20) NOT NULL DEFAULT 'PENDING', + error_message TEXT, + current_page INT DEFAULT 0, + total_pages INT DEFAULT 0, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE INDEX idx_ocr_job_documents_job_id ON ocr_job_documents(job_id); +CREATE INDEX idx_ocr_job_documents_document_id ON ocr_job_documents(document_id);