@@ -5,6 +5,7 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
|
||||
import software.amazon.awssdk.regions.Region;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.S3Configuration;
|
||||
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.CommandLineRunner;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
@@ -44,6 +45,19 @@ public class MinioConfig {
|
||||
.build();
|
||||
}
|
||||
|
||||
@Bean
|
||||
public S3Presigner s3Presigner() {
|
||||
return S3Presigner.builder()
|
||||
.endpointOverride(URI.create(endpoint))
|
||||
.serviceConfiguration(S3Configuration.builder()
|
||||
.pathStyleAccessEnabled(true)
|
||||
.build())
|
||||
.region(Region.of(region))
|
||||
.credentialsProvider(StaticCredentialsProvider.create(
|
||||
AwsBasicCredentials.create(accessKey, secretKey)))
|
||||
.build();
|
||||
}
|
||||
|
||||
@Bean
|
||||
public CommandLineRunner testS3Connection(S3Client s3Client) {
|
||||
return args -> {
|
||||
|
||||
@@ -4,6 +4,8 @@ import software.amazon.awssdk.core.ResponseInputStream;
|
||||
import software.amazon.awssdk.core.sync.RequestBody;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.model.*;
|
||||
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
|
||||
import software.amazon.awssdk.services.s3.presigner.model.GetObjectPresignRequest;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -16,6 +18,7 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.UUID;
|
||||
|
||||
@Service
|
||||
@@ -24,10 +27,13 @@ public class FileService {
|
||||
private static final Logger log = LoggerFactory.getLogger(FileService.class);
|
||||
|
||||
private final S3Client s3Client;
|
||||
private final S3Presigner s3Presigner;
|
||||
private final String bucketName;
|
||||
|
||||
public FileService(S3Client s3Client, @Value("${app.s3.bucket}") String bucketName) {
|
||||
public FileService(S3Client s3Client, S3Presigner s3Presigner,
|
||||
@Value("${app.s3.bucket}") String bucketName) {
|
||||
this.s3Client = s3Client;
|
||||
this.s3Presigner = s3Presigner;
|
||||
this.bucketName = bucketName;
|
||||
}
|
||||
|
||||
@@ -106,6 +112,24 @@ public class FileService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a presigned URL for downloading an object from S3/MinIO.
|
||||
* Valid for 15 minutes — enough for OCR processing on CPU.
|
||||
*/
|
||||
public String generatePresignedUrl(String s3Key) {
|
||||
GetObjectRequest getObjectRequest = GetObjectRequest.builder()
|
||||
.bucket(bucketName)
|
||||
.key(s3Key)
|
||||
.build();
|
||||
|
||||
GetObjectPresignRequest presignRequest = GetObjectPresignRequest.builder()
|
||||
.signatureDuration(Duration.ofMinutes(15))
|
||||
.getObjectRequest(getObjectRequest)
|
||||
.build();
|
||||
|
||||
return s3Presigner.presignGetObject(presignRequest).url().toString();
|
||||
}
|
||||
|
||||
// ─── private helpers ──────────────────────────────────────────────────────
|
||||
|
||||
private static String sha256Hex(byte[] bytes) {
|
||||
|
||||
@@ -8,7 +8,6 @@ import org.raddatz.familienarchiv.exception.ErrorCode;
|
||||
import org.raddatz.familienarchiv.model.*;
|
||||
import org.raddatz.familienarchiv.repository.OcrJobRepository;
|
||||
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@@ -29,12 +28,7 @@ public class OcrService {
|
||||
private final AnnotationService annotationService;
|
||||
private final TranscriptionBlockRepository blockRepository;
|
||||
private final OcrJobRepository ocrJobRepository;
|
||||
|
||||
@Value("${app.s3.internal-url:http://minio:9000}")
|
||||
private String s3InternalUrl;
|
||||
|
||||
@Value("${app.s3.bucket}")
|
||||
private String bucketName;
|
||||
private final FileService fileService;
|
||||
|
||||
@Transactional
|
||||
public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) {
|
||||
@@ -78,7 +72,7 @@ public class OcrService {
|
||||
void processDocument(UUID documentId, Document doc, UUID userId) {
|
||||
clearExistingBlocks(documentId);
|
||||
|
||||
String pdfUrl = buildInternalUrl(doc.getFilePath());
|
||||
String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath());
|
||||
List<OcrBlockResult> blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
|
||||
createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
|
||||
}
|
||||
@@ -115,7 +109,4 @@ public class OcrService {
|
||||
}
|
||||
}
|
||||
|
||||
String buildInternalUrl(String filePath) {
|
||||
return s3InternalUrl + "/" + bucketName + "/" + filePath;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ class FileServiceTest {
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
s3Client = mock(S3Client.class);
|
||||
fileService = new FileService(s3Client, "test-bucket");
|
||||
fileService = new FileService(s3Client, null, "test-bucket");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
@@ -34,6 +34,7 @@ class OcrServiceTest {
|
||||
@Mock AnnotationService annotationService;
|
||||
@Mock TranscriptionBlockRepository blockRepository;
|
||||
@Mock OcrJobRepository ocrJobRepository;
|
||||
@Mock FileService fileService;
|
||||
|
||||
@InjectMocks OcrService ocrService;
|
||||
|
||||
@@ -80,7 +81,7 @@ class OcrServiceTest {
|
||||
.scriptType(ScriptType.TYPEWRITER).build();
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// fileService is stubbed to return a presigned URL for any S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
OcrJob job = inv.getArgument(0);
|
||||
@@ -103,7 +104,7 @@ class OcrServiceTest {
|
||||
.scriptType(ScriptType.UNKNOWN).build();
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// fileService is stubbed to return a presigned URL for any S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
OcrJob job = inv.getArgument(0);
|
||||
@@ -128,7 +129,7 @@ class OcrServiceTest {
|
||||
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// fileService is stubbed to return a presigned URL for any S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existingBlock));
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
|
||||
new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello")));
|
||||
@@ -158,7 +159,7 @@ class OcrServiceTest {
|
||||
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// fileService is stubbed to return a presigned URL for any S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block1, block2));
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
|
||||
@@ -82,7 +82,7 @@ async def run_ocr(request: OcrRequest):
|
||||
|
||||
|
||||
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
|
||||
"""Download a PDF from URL and convert each page to a PIL Image."""
|
||||
"""Download a PDF from a presigned URL and convert each page to a PIL Image."""
|
||||
async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
Reference in New Issue
Block a user