feat(ocr): add OCR infrastructure (interfaces, entities, migrations, DTOs)
- OcrClient + OcrHealthClient interfaces for testable OCR integration - OcrBlockResult record for OCR engine response mapping - OcrJob + OcrJobDocument entities with status enums - V25 migration creates ocr_jobs and ocr_job_documents tables - Repositories for job and job-document queries - TriggerOcrDTO, BatchOcrDTO (@Size max=500), OcrStatusDTO - ErrorCodes: OCR_SERVICE_UNAVAILABLE, OCR_JOB_NOT_FOUND, OCR_DOCUMENT_NOT_UPLOADED, OCR_PROCESSING_FAILED Refs #226 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
package org.raddatz.familienarchiv.dto;
|
||||
|
||||
import jakarta.validation.constraints.NotEmpty;
|
||||
import jakarta.validation.constraints.Size;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class BatchOcrDTO {
|
||||
@NotEmpty
|
||||
@Size(max = 500, message = "batch size must not exceed 500 documents")
|
||||
private List<UUID> documentIds;
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package org.raddatz.familienarchiv.dto;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class OcrStatusDTO {
|
||||
private String status;
|
||||
private UUID jobId;
|
||||
private int currentPage;
|
||||
private int totalPages;
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package org.raddatz.familienarchiv.dto;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import org.raddatz.familienarchiv.model.ScriptType;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class TriggerOcrDTO {
|
||||
private ScriptType scriptType;
|
||||
}
|
||||
@@ -66,6 +66,16 @@ public enum ErrorCode {
|
||||
/** The notification with the given ID does not exist. 404 */
|
||||
NOTIFICATION_NOT_FOUND,
|
||||
|
||||
// --- OCR ---
|
||||
/** The OCR service is not available or not healthy. 503 */
|
||||
OCR_SERVICE_UNAVAILABLE,
|
||||
/** The OCR job with the given ID does not exist. 404 */
|
||||
OCR_JOB_NOT_FOUND,
|
||||
/** The document is not in UPLOADED status and cannot be OCR'd. 400 */
|
||||
OCR_DOCUMENT_NOT_UPLOADED,
|
||||
/** OCR processing failed for the document. 500 */
|
||||
OCR_PROCESSING_FAILED,
|
||||
|
||||
// --- Generic ---
|
||||
/** Request validation failed (missing or malformed fields). 400 */
|
||||
VALIDATION_ERROR,
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
package org.raddatz.familienarchiv.model;
|
||||
|
||||
public enum OcrDocumentStatus {
|
||||
PENDING,
|
||||
RUNNING,
|
||||
DONE,
|
||||
FAILED,
|
||||
SKIPPED
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package org.raddatz.familienarchiv.model;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import jakarta.persistence.*;
|
||||
import lombok.*;
|
||||
import org.hibernate.annotations.CreationTimestamp;
|
||||
import org.hibernate.annotations.UpdateTimestamp;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "ocr_jobs")
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class OcrJob {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private UUID id;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private OcrJobStatus status = OcrJobStatus.PENDING;
|
||||
|
||||
@Column(name = "total_documents", nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private int totalDocuments;
|
||||
|
||||
@Column(name = "processed_documents", nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private int processedDocuments = 0;
|
||||
|
||||
@Column(name = "error_count", nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private int errorCount = 0;
|
||||
|
||||
@Column(name = "skipped_count", nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private int skippedCount = 0;
|
||||
|
||||
@Column(name = "created_by")
|
||||
private UUID createdBy;
|
||||
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
@CreationTimestamp
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private LocalDateTime createdAt;
|
||||
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
@UpdateTimestamp
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private LocalDateTime updatedAt;
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package org.raddatz.familienarchiv.model;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import jakarta.persistence.*;
|
||||
import lombok.*;
|
||||
import org.hibernate.annotations.CreationTimestamp;
|
||||
import org.hibernate.annotations.UpdateTimestamp;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.UUID;
|
||||
|
||||
@Entity
|
||||
@Table(name = "ocr_job_documents")
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
public class OcrJobDocument {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy = GenerationType.UUID)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private UUID id;
|
||||
|
||||
@Column(name = "job_id", nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private UUID jobId;
|
||||
|
||||
@Column(name = "document_id", nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private UUID documentId;
|
||||
|
||||
@Enumerated(EnumType.STRING)
|
||||
@Column(nullable = false)
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
@Builder.Default
|
||||
private OcrDocumentStatus status = OcrDocumentStatus.PENDING;
|
||||
|
||||
@Column(name = "error_message")
|
||||
private String errorMessage;
|
||||
|
||||
@Column(name = "current_page")
|
||||
@Builder.Default
|
||||
private int currentPage = 0;
|
||||
|
||||
@Column(name = "total_pages")
|
||||
@Builder.Default
|
||||
private int totalPages = 0;
|
||||
|
||||
@Column(name = "created_at", nullable = false, updatable = false)
|
||||
@CreationTimestamp
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private LocalDateTime createdAt;
|
||||
|
||||
@Column(name = "updated_at", nullable = false)
|
||||
@UpdateTimestamp
|
||||
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||
private LocalDateTime updatedAt;
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
package org.raddatz.familienarchiv.model;
|
||||
|
||||
public enum OcrJobStatus {
|
||||
PENDING,
|
||||
RUNNING,
|
||||
DONE,
|
||||
FAILED
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package org.raddatz.familienarchiv.repository;
|
||||
|
||||
import org.raddatz.familienarchiv.model.OcrDocumentStatus;
|
||||
import org.raddatz.familienarchiv.model.OcrJobDocument;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
|
||||
public interface OcrJobDocumentRepository extends JpaRepository<OcrJobDocument, UUID> {
|
||||
|
||||
List<OcrJobDocument> findByJobIdOrderByCreatedAtAsc(UUID jobId);
|
||||
|
||||
List<OcrJobDocument> findByJobIdAndStatus(UUID jobId, OcrDocumentStatus status);
|
||||
|
||||
Optional<OcrJobDocument> findByJobIdAndDocumentId(UUID jobId, UUID documentId);
|
||||
|
||||
Optional<OcrJobDocument> findFirstByDocumentIdAndStatusIn(UUID documentId, List<OcrDocumentStatus> statuses);
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
package org.raddatz.familienarchiv.repository;
|
||||
|
||||
import org.raddatz.familienarchiv.model.OcrJob;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
public interface OcrJobRepository extends JpaRepository<OcrJob, UUID> {
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record OcrBlockResult(
|
||||
int pageNumber,
|
||||
double x,
|
||||
double y,
|
||||
double width,
|
||||
double height,
|
||||
List<List<Double>> polygon,
|
||||
String text
|
||||
) {}
|
||||
@@ -0,0 +1,9 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
import org.raddatz.familienarchiv.model.ScriptType;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public interface OcrClient {
|
||||
List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
package org.raddatz.familienarchiv.service;
|
||||
|
||||
public interface OcrHealthClient {
|
||||
boolean isHealthy();
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
CREATE TABLE ocr_jobs (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
status VARCHAR(20) NOT NULL DEFAULT 'PENDING',
|
||||
total_documents INT NOT NULL,
|
||||
processed_documents INT NOT NULL DEFAULT 0,
|
||||
error_count INT NOT NULL DEFAULT 0,
|
||||
skipped_count INT NOT NULL DEFAULT 0,
|
||||
created_by UUID,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE TABLE ocr_job_documents (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
job_id UUID NOT NULL REFERENCES ocr_jobs(id) ON DELETE CASCADE,
|
||||
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
||||
status VARCHAR(20) NOT NULL DEFAULT 'PENDING',
|
||||
error_message TEXT,
|
||||
current_page INT DEFAULT 0,
|
||||
total_pages INT DEFAULT 0,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_ocr_job_documents_job_id ON ocr_job_documents(job_id);
|
||||
CREATE INDEX idx_ocr_job_documents_document_id ON ocr_job_documents(document_id);
|
||||
Reference in New Issue
Block a user