feat(ocr): add OCR infrastructure (interfaces, entities, migrations, DTOs)
- OcrClient + OcrHealthClient interfaces for testable OCR integration - OcrBlockResult record for OCR engine response mapping - OcrJob + OcrJobDocument entities with status enums - V25 migration creates ocr_jobs and ocr_job_documents tables - Repositories for job and job-document queries - TriggerOcrDTO, BatchOcrDTO (@Size max=500), OcrStatusDTO - ErrorCodes: OCR_SERVICE_UNAVAILABLE, OCR_JOB_NOT_FOUND, OCR_DOCUMENT_NOT_UPLOADED, OCR_PROCESSING_FAILED Refs #226 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,19 @@
|
|||||||
|
package org.raddatz.familienarchiv.dto;
|
||||||
|
|
||||||
|
import jakarta.validation.constraints.NotEmpty;
|
||||||
|
import jakarta.validation.constraints.Size;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class BatchOcrDTO {
|
||||||
|
@NotEmpty
|
||||||
|
@Size(max = 500, message = "batch size must not exceed 500 documents")
|
||||||
|
private List<UUID> documentIds;
|
||||||
|
}
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
package org.raddatz.familienarchiv.dto;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class OcrStatusDTO {
|
||||||
|
private String status;
|
||||||
|
private UUID jobId;
|
||||||
|
private int currentPage;
|
||||||
|
private int totalPages;
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
package org.raddatz.familienarchiv.dto;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import org.raddatz.familienarchiv.model.ScriptType;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TriggerOcrDTO {
|
||||||
|
private ScriptType scriptType;
|
||||||
|
}
|
||||||
@@ -66,6 +66,16 @@ public enum ErrorCode {
|
|||||||
/** The notification with the given ID does not exist. 404 */
|
/** The notification with the given ID does not exist. 404 */
|
||||||
NOTIFICATION_NOT_FOUND,
|
NOTIFICATION_NOT_FOUND,
|
||||||
|
|
||||||
|
// --- OCR ---
|
||||||
|
/** The OCR service is not available or not healthy. 503 */
|
||||||
|
OCR_SERVICE_UNAVAILABLE,
|
||||||
|
/** The OCR job with the given ID does not exist. 404 */
|
||||||
|
OCR_JOB_NOT_FOUND,
|
||||||
|
/** The document is not in UPLOADED status and cannot be OCR'd. 400 */
|
||||||
|
OCR_DOCUMENT_NOT_UPLOADED,
|
||||||
|
/** OCR processing failed for the document. 500 */
|
||||||
|
OCR_PROCESSING_FAILED,
|
||||||
|
|
||||||
// --- Generic ---
|
// --- Generic ---
|
||||||
/** Request validation failed (missing or malformed fields). 400 */
|
/** Request validation failed (missing or malformed fields). 400 */
|
||||||
VALIDATION_ERROR,
|
VALIDATION_ERROR,
|
||||||
|
|||||||
@@ -0,0 +1,9 @@
|
|||||||
|
package org.raddatz.familienarchiv.model;
|
||||||
|
|
||||||
|
public enum OcrDocumentStatus {
|
||||||
|
PENDING,
|
||||||
|
RUNNING,
|
||||||
|
DONE,
|
||||||
|
FAILED,
|
||||||
|
SKIPPED
|
||||||
|
}
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
package org.raddatz.familienarchiv.model;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
import jakarta.persistence.*;
|
||||||
|
import lombok.*;
|
||||||
|
import org.hibernate.annotations.CreationTimestamp;
|
||||||
|
import org.hibernate.annotations.UpdateTimestamp;
|
||||||
|
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Entity
|
||||||
|
@Table(name = "ocr_jobs")
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class OcrJob {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private OcrJobStatus status = OcrJobStatus.PENDING;
|
||||||
|
|
||||||
|
@Column(name = "total_documents", nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private int totalDocuments;
|
||||||
|
|
||||||
|
@Column(name = "processed_documents", nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private int processedDocuments = 0;
|
||||||
|
|
||||||
|
@Column(name = "error_count", nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private int errorCount = 0;
|
||||||
|
|
||||||
|
@Column(name = "skipped_count", nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private int skippedCount = 0;
|
||||||
|
|
||||||
|
@Column(name = "created_by")
|
||||||
|
private UUID createdBy;
|
||||||
|
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
@CreationTimestamp
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private LocalDateTime createdAt;
|
||||||
|
|
||||||
|
@Column(name = "updated_at", nullable = false)
|
||||||
|
@UpdateTimestamp
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private LocalDateTime updatedAt;
|
||||||
|
}
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
package org.raddatz.familienarchiv.model;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
import jakarta.persistence.*;
|
||||||
|
import lombok.*;
|
||||||
|
import org.hibernate.annotations.CreationTimestamp;
|
||||||
|
import org.hibernate.annotations.UpdateTimestamp;
|
||||||
|
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Entity
|
||||||
|
@Table(name = "ocr_job_documents")
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Builder
|
||||||
|
public class OcrJobDocument {
|
||||||
|
|
||||||
|
@Id
|
||||||
|
@GeneratedValue(strategy = GenerationType.UUID)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private UUID id;
|
||||||
|
|
||||||
|
@Column(name = "job_id", nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private UUID jobId;
|
||||||
|
|
||||||
|
@Column(name = "document_id", nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private UUID documentId;
|
||||||
|
|
||||||
|
@Enumerated(EnumType.STRING)
|
||||||
|
@Column(nullable = false)
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
@Builder.Default
|
||||||
|
private OcrDocumentStatus status = OcrDocumentStatus.PENDING;
|
||||||
|
|
||||||
|
@Column(name = "error_message")
|
||||||
|
private String errorMessage;
|
||||||
|
|
||||||
|
@Column(name = "current_page")
|
||||||
|
@Builder.Default
|
||||||
|
private int currentPage = 0;
|
||||||
|
|
||||||
|
@Column(name = "total_pages")
|
||||||
|
@Builder.Default
|
||||||
|
private int totalPages = 0;
|
||||||
|
|
||||||
|
@Column(name = "created_at", nullable = false, updatable = false)
|
||||||
|
@CreationTimestamp
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private LocalDateTime createdAt;
|
||||||
|
|
||||||
|
@Column(name = "updated_at", nullable = false)
|
||||||
|
@UpdateTimestamp
|
||||||
|
@Schema(requiredMode = Schema.RequiredMode.REQUIRED)
|
||||||
|
private LocalDateTime updatedAt;
|
||||||
|
}
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
package org.raddatz.familienarchiv.model;
|
||||||
|
|
||||||
|
public enum OcrJobStatus {
|
||||||
|
PENDING,
|
||||||
|
RUNNING,
|
||||||
|
DONE,
|
||||||
|
FAILED
|
||||||
|
}
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
package org.raddatz.familienarchiv.repository;
|
||||||
|
|
||||||
|
import org.raddatz.familienarchiv.model.OcrDocumentStatus;
|
||||||
|
import org.raddatz.familienarchiv.model.OcrJobDocument;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public interface OcrJobDocumentRepository extends JpaRepository<OcrJobDocument, UUID> {
|
||||||
|
|
||||||
|
List<OcrJobDocument> findByJobIdOrderByCreatedAtAsc(UUID jobId);
|
||||||
|
|
||||||
|
List<OcrJobDocument> findByJobIdAndStatus(UUID jobId, OcrDocumentStatus status);
|
||||||
|
|
||||||
|
Optional<OcrJobDocument> findByJobIdAndDocumentId(UUID jobId, UUID documentId);
|
||||||
|
|
||||||
|
Optional<OcrJobDocument> findFirstByDocumentIdAndStatusIn(UUID documentId, List<OcrDocumentStatus> statuses);
|
||||||
|
}
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
package org.raddatz.familienarchiv.repository;
|
||||||
|
|
||||||
|
import org.raddatz.familienarchiv.model.OcrJob;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
public interface OcrJobRepository extends JpaRepository<OcrJob, UUID> {
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public record OcrBlockResult(
|
||||||
|
int pageNumber,
|
||||||
|
double x,
|
||||||
|
double y,
|
||||||
|
double width,
|
||||||
|
double height,
|
||||||
|
List<List<Double>> polygon,
|
||||||
|
String text
|
||||||
|
) {}
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
import org.raddatz.familienarchiv.model.ScriptType;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public interface OcrClient {
|
||||||
|
List<OcrBlockResult> extractBlocks(String pdfUrl, ScriptType scriptType);
|
||||||
|
}
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
package org.raddatz.familienarchiv.service;
|
||||||
|
|
||||||
|
public interface OcrHealthClient {
|
||||||
|
boolean isHealthy();
|
||||||
|
}
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
CREATE TABLE ocr_jobs (
|
||||||
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||||
|
status VARCHAR(20) NOT NULL DEFAULT 'PENDING',
|
||||||
|
total_documents INT NOT NULL,
|
||||||
|
processed_documents INT NOT NULL DEFAULT 0,
|
||||||
|
error_count INT NOT NULL DEFAULT 0,
|
||||||
|
skipped_count INT NOT NULL DEFAULT 0,
|
||||||
|
created_by UUID,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE ocr_job_documents (
|
||||||
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||||
|
job_id UUID NOT NULL REFERENCES ocr_jobs(id) ON DELETE CASCADE,
|
||||||
|
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
||||||
|
status VARCHAR(20) NOT NULL DEFAULT 'PENDING',
|
||||||
|
error_message TEXT,
|
||||||
|
current_page INT DEFAULT 0,
|
||||||
|
total_pages INT DEFAULT 0,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_ocr_job_documents_job_id ON ocr_job_documents(job_id);
|
||||||
|
CREATE INDEX idx_ocr_job_documents_document_id ON ocr_job_documents(document_id);
|
||||||
Reference in New Issue
Block a user