feat(ocr): add OCR infrastructure (interfaces, entities, migrations, DTOs)

- OcrClient + OcrHealthClient interfaces for testable OCR integration
- OcrBlockResult record for OCR engine response mapping
- OcrJob + OcrJobDocument entities with status enums
- V25 migration creates ocr_jobs and ocr_job_documents tables
- Repositories for job and job-document queries
- TriggerOcrDTO, BatchOcrDTO (@Size max=500), OcrStatusDTO
- ErrorCodes: OCR_SERVICE_UNAVAILABLE, OCR_JOB_NOT_FOUND,
  OCR_DOCUMENT_NOT_UPLOADED, OCR_PROCESSING_FAILED

Refs #226

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 15:15:16 +02:00
parent d194b6b225
commit ff3990710e
14 changed files with 281 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
package org.raddatz.familienarchiv.dto;
import jakarta.validation.constraints.NotEmpty;
import jakarta.validation.constraints.NotNull;
import jakarta.validation.constraints.Size;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.List;
import java.util.UUID;
/**
 * Request payload naming the documents of one OCR batch.
 *
 * <p>The Bean Validation constraints declared here only take effect where the
 * object is actually validated (e.g. a {@code @Valid} controller parameter —
 * the call site is not part of this file).
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class BatchOcrDTO {

    /**
     * IDs of the documents to process: the list must be present and non-empty,
     * hold at most 500 entries, and contain no {@code null} entry.
     *
     * <p>The element-level {@code @NotNull} closes the gap left by the
     * list-level constraints, which accept a payload such as {@code [null]}.
     */
    @NotEmpty
    @Size(max = 500, message = "batch size must not exceed 500 documents")
    private List<@NotNull UUID> documentIds;
}

View File

@@ -0,0 +1,19 @@
package org.raddatz.familienarchiv.dto;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.UUID;
/**
 * Response payload carrying an OCR status snapshot.
 *
 * <p>Lombok generates accessors, a no-args constructor, an all-args constructor
 * (in field declaration order) and a builder.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class OcrStatusDTO {

    /**
     * Status as free text.
     * NOTE(review): presumably the name of an OCR status enum constant — confirm against the producer.
     */
    private String status;

    /** ID of the OCR job this snapshot refers to; may be {@code null} as far as this class is concerned. */
    private UUID jobId;

    /** Page progress counter; defaults to 0 when not set. */
    private int currentPage;

    /** Page total; defaults to 0 when not set. */
    private int totalPages;
}

View File

@@ -0,0 +1,13 @@
package org.raddatz.familienarchiv.dto;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.raddatz.familienarchiv.model.ScriptType;
/**
 * Request payload for triggering OCR, carrying the script type to use.
 *
 * <p>No validation constraint is declared, so {@link #scriptType} may be
 * {@code null} when the request omits it.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class TriggerOcrDTO {

    /** Script type requested for the OCR run. */
    private ScriptType scriptType;
}

View File

@@ -66,6 +66,16 @@ public enum ErrorCode {
/** The notification with the given ID does not exist. 404 */
NOTIFICATION_NOT_FOUND,
// --- OCR ---
/** The OCR service is not available or not healthy. 503 */
OCR_SERVICE_UNAVAILABLE,
/** The OCR job with the given ID does not exist. 404 */
OCR_JOB_NOT_FOUND,
/** The document is not in UPLOADED status and cannot be OCR'd. 400 */
OCR_DOCUMENT_NOT_UPLOADED,
/** OCR processing failed for the document. 500 */
OCR_PROCESSING_FAILED,
// --- Generic ---
/** Request validation failed (missing or malformed fields). 400 */
VALIDATION_ERROR,

View File

@@ -0,0 +1,9 @@
package org.raddatz.familienarchiv.model;
/**
 * Processing states of a single document inside an OCR job.
 *
 * <p>Constant order is kept stable; do not reorder, since ordinals would shift.
 */
public enum OcrDocumentStatus {

    /** Initial state. */
    PENDING,

    /** NOTE(review): presumably set while the document is being processed — confirm in the service. */
    RUNNING,

    /** NOTE(review): presumably set after successful processing — confirm in the service. */
    DONE,

    /** NOTE(review): presumably set when processing ended with an error — confirm in the service. */
    FAILED,

    /** NOTE(review): presumably set when the document was not processed at all — confirm in the service. */
    SKIPPED
}

View File

@@ -0,0 +1,62 @@
package org.raddatz.familienarchiv.model;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.persistence.*;
import lombok.*;
import org.hibernate.annotations.CreationTimestamp;
import org.hibernate.annotations.UpdateTimestamp;
import java.time.LocalDateTime;
import java.util.UUID;
/**
 * JPA entity mapped to the {@code ocr_jobs} table.
 *
 * <p>Holds the job status plus integer counters for total, processed, failed
 * and skipped documents. Lombok supplies accessors, both constructors and a
 * builder; fields marked {@code @Builder.Default} keep their initializer when
 * the builder does not set them.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Entity
@Table(name = "ocr_jobs")
public class OcrJob {

    /** Primary key, generated as a UUID by the persistence provider. */
    @Id
    @GeneratedValue(strategy = GenerationType.UUID)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    private UUID id;

    /** Job status, persisted by enum name; starts as {@link OcrJobStatus#PENDING}. */
    @Builder.Default
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(nullable = false)
    @Enumerated(EnumType.STRING)
    private OcrJobStatus status = OcrJobStatus.PENDING;

    /** Number of documents in the job. */
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(name = "total_documents", nullable = false)
    private int totalDocuments;

    /** Processed-document counter; starts at 0. */
    @Builder.Default
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(name = "processed_documents", nullable = false)
    private int processedDocuments = 0;

    /** Error counter; starts at 0. */
    @Builder.Default
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(name = "error_count", nullable = false)
    private int errorCount = 0;

    /** Skipped-document counter; starts at 0. */
    @Builder.Default
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(name = "skipped_count", nullable = false)
    private int skippedCount = 0;

    /**
     * Optional creator reference (nullable column).
     * NOTE(review): presumably the ID of the user who started the job — confirm against the caller.
     */
    @Column(name = "created_by")
    private UUID createdBy;

    /** Creation time, filled in by Hibernate on insert and never updated afterwards. */
    @CreationTimestamp
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(name = "created_at", nullable = false, updatable = false)
    private LocalDateTime createdAt;

    /** Last-modification time, maintained by Hibernate. */
    @UpdateTimestamp
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Column(name = "updated_at", nullable = false)
    private LocalDateTime updatedAt;
}

View File

@@ -0,0 +1,59 @@
package org.raddatz.familienarchiv.model;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.persistence.*;
import lombok.*;
import org.hibernate.annotations.CreationTimestamp;
import org.hibernate.annotations.UpdateTimestamp;
import java.time.LocalDateTime;
import java.util.UUID;
/**
 * JPA entity mapped to the {@code ocr_job_documents} table: one row linking a
 * job ID to a document ID together with that document's OCR status.
 *
 * <p>Job and document are referenced by raw UUID columns rather than entity
 * associations. Lombok supplies accessors, both constructors and a builder;
 * fields marked {@code @Builder.Default} keep their initializer when the
 * builder does not set them.
 */
@Entity
@Table(name = "ocr_job_documents")
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class OcrJobDocument {

    /** Primary key, generated as a UUID by the persistence provider. */
    @Id
    @GeneratedValue(strategy = GenerationType.UUID)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    private UUID id;

    /** ID of the owning OCR job. */
    @Column(name = "job_id", nullable = false)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    private UUID jobId;

    /** ID of the document being processed. */
    @Column(name = "document_id", nullable = false)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    private UUID documentId;

    /** Per-document status, persisted by enum name; starts as {@link OcrDocumentStatus#PENDING}. */
    @Enumerated(EnumType.STRING)
    @Column(nullable = false)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Builder.Default
    private OcrDocumentStatus status = OcrDocumentStatus.PENDING;

    /** Optional error text (nullable column). */
    @Column(name = "error_message")
    private String errorMessage;

    /**
     * Page progress counter; starts at 0.
     *
     * <p>A primitive {@code int} cannot hold {@code null}, so the column is
     * declared non-nullable and the property is marked required in the API
     * schema, matching the counters on {@code OcrJob}.
     */
    @Column(name = "current_page", nullable = false)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Builder.Default
    private int currentPage = 0;

    /** Page total; starts at 0. Non-nullable for the same reason as {@link #currentPage}. */
    @Column(name = "total_pages", nullable = false)
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    @Builder.Default
    private int totalPages = 0;

    /** Creation time, filled in by Hibernate on insert and never updated afterwards. */
    @Column(name = "created_at", nullable = false, updatable = false)
    @CreationTimestamp
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    private LocalDateTime createdAt;

    /** Last-modification time, maintained by Hibernate. */
    @Column(name = "updated_at", nullable = false)
    @UpdateTimestamp
    @Schema(requiredMode = Schema.RequiredMode.REQUIRED)
    private LocalDateTime updatedAt;
}

View File

@@ -0,0 +1,8 @@
package org.raddatz.familienarchiv.model;
/**
 * Lifecycle states of an OCR job as a whole.
 *
 * <p>Constant order is kept stable; do not reorder, since ordinals would shift.
 */
public enum OcrJobStatus {

    /** Initial state. */
    PENDING,

    /** NOTE(review): presumably set while the job is being worked on — confirm in the service. */
    RUNNING,

    /** NOTE(review): presumably set once the job has finished — confirm in the service. */
    DONE,

    /** NOTE(review): presumably set when the job as a whole failed — confirm in the service. */
    FAILED
}

View File

@@ -0,0 +1,20 @@
package org.raddatz.familienarchiv.repository;
import org.raddatz.familienarchiv.model.OcrDocumentStatus;
import org.raddatz.familienarchiv.model.OcrJobDocument;
import org.springframework.data.jpa.repository.JpaRepository;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
/**
 * Spring Data repository for {@link OcrJobDocument} rows.
 *
 * <p>All finders are derived queries: Spring Data builds them from the method
 * names, so renaming a method changes the query.
 */
public interface OcrJobDocumentRepository extends JpaRepository<OcrJobDocument, UUID> {

    /** Returns every row of the given job, oldest {@code createdAt} first. */
    List<OcrJobDocument> findByJobIdOrderByCreatedAtAsc(UUID jobId);

    /** Returns the rows of the given job that currently have the given status. */
    List<OcrJobDocument> findByJobIdAndStatus(UUID jobId, OcrDocumentStatus status);

    /**
     * Looks up the row for one document within one job.
     * NOTE(review): an {@code Optional} result assumes at most one row per
     * (job, document) pair — confirm that the writer guarantees this.
     */
    Optional<OcrJobDocument> findByJobIdAndDocumentId(UUID jobId, UUID documentId);

    /**
     * Returns one row for the document whose status is among {@code statuses}, if any.
     * The method name carries no ordering, so which row is returned when several
     * match is not defined here.
     */
    Optional<OcrJobDocument> findFirstByDocumentIdAndStatusIn(
            UUID documentId, List<OcrDocumentStatus> statuses);
}

View File

@@ -0,0 +1,9 @@
package org.raddatz.familienarchiv.repository;
import org.raddatz.familienarchiv.model.OcrJob;
import org.springframework.data.jpa.repository.JpaRepository;
import java.util.UUID;
/**
 * Spring Data repository for {@link OcrJob} entities, keyed by UUID.
 * Declares no queries of its own; only the inherited {@link JpaRepository} operations are available.
 */
public interface OcrJobRepository extends JpaRepository<OcrJob, UUID> {}

View File

@@ -0,0 +1,13 @@
package org.raddatz.familienarchiv.service;
import java.util.List;
/**
 * One text block as reported by the OCR engine.
 *
 * <p>NOTE(review): coordinate system and units of the geometry values are not
 * visible in this file — confirm against the OCR engine's response format.
 * The record stores {@code polygon} as given (no defensive copy, {@code null} allowed).
 *
 * @param pageNumber page the block belongs to (0- or 1-based: TODO confirm)
 * @param x          horizontal position of the block's bounding box
 * @param y          vertical position of the block's bounding box
 * @param width      width of the bounding box
 * @param height     height of the bounding box
 * @param polygon    outline as a list of number lists, presumably {@code [x, y]} points — confirm
 * @param text       text of the block
 */
public record OcrBlockResult(
        int pageNumber,
        double x,
        double y,
        double width,
        double height,
        List<List<Double>> polygon,
        String text) {
}

View File

@@ -0,0 +1,9 @@
package org.raddatz.familienarchiv.service;
import org.raddatz.familienarchiv.model.ScriptType;
import java.util.List;
/**
 * Abstraction over the OCR engine so callers can be tested against a stub.
 */
public interface OcrClient {

    /**
     * Runs OCR on a PDF and returns the recognised blocks.
     *
     * @param pdfUrl     location of the PDF to process (NOTE(review): who must be able to
     *                   resolve this URL is not visible here — confirm)
     * @param scriptType script type to use for recognition
     * @return the blocks reported by the engine
     */
    List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType);
}

View File

@@ -0,0 +1,5 @@
package org.raddatz.familienarchiv.service;
/**
 * Health probe for the OCR service, kept separate from the OCR client so it
 * can be stubbed on its own.
 */
public interface OcrHealthClient {

    /**
     * Reports whether the OCR service is currently considered healthy.
     *
     * @return {@code true} if healthy, {@code false} otherwise
     */
    boolean isHealthy();
}

View File

@@ -0,0 +1,26 @@
-- One row per OCR job: status plus document counters.
-- status holds an enum name as text (the entity maps it with EnumType.STRING).
CREATE TABLE ocr_jobs (
    id                  UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    status              VARCHAR(20) NOT NULL DEFAULT 'PENDING',
    total_documents     INT         NOT NULL,
    processed_documents INT         NOT NULL DEFAULT 0,
    error_count         INT         NOT NULL DEFAULT 0,
    skipped_count       INT         NOT NULL DEFAULT 0,
    created_by          UUID,                              -- nullable, no foreign key
    created_at          TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at          TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- One row per (job, document) pair with that document's OCR status and page progress.
-- Rows are removed automatically when either the job or the document is deleted.
-- NOTE(review): there is no UNIQUE (job_id, document_id) constraint, although the
-- repository looks rows up by that pair expecting at most one result — confirm
-- whether duplicates can occur before adding one.
CREATE TABLE ocr_job_documents (
    id            UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    job_id        UUID        NOT NULL REFERENCES ocr_jobs(id) ON DELETE CASCADE,
    document_id   UUID        NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    status        VARCHAR(20) NOT NULL DEFAULT 'PENDING',
    error_message TEXT,
    -- The entity maps both page counters to primitive int, which cannot hold NULL,
    -- so the columns are NOT NULL like the counters in ocr_jobs.
    current_page  INT         NOT NULL DEFAULT 0,
    total_pages   INT         NOT NULL DEFAULT 0,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Lookup indexes for the two foreign-key columns, which the repository queries filter on.
CREATE INDEX idx_ocr_job_documents_job_id
    ON ocr_job_documents (job_id);

CREATE INDEX idx_ocr_job_documents_document_id
    ON ocr_job_documents (document_id);