diff --git a/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java b/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java index a3fb187c..981ddb65 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java @@ -5,6 +5,7 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3Configuration; +import software.amazon.awssdk.services.s3.presigner.S3Presigner; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.CommandLineRunner; import org.springframework.context.annotation.Bean; @@ -44,6 +45,19 @@ public class MinioConfig { .build(); } + @Bean + public S3Presigner s3Presigner() { + return S3Presigner.builder() + .endpointOverride(URI.create(endpoint)) + .serviceConfiguration(S3Configuration.builder() + .pathStyleAccessEnabled(true) + .build()) + .region(Region.of(region)) + .credentialsProvider(StaticCredentialsProvider.create( + AwsBasicCredentials.create(accessKey, secretKey))) + .build(); + } + @Bean public CommandLineRunner testS3Connection(S3Client s3Client) { return args -> { diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java index 57e225c6..acf6f23d 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java @@ -4,6 +4,8 @@ import software.amazon.awssdk.core.ResponseInputStream; import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.*; +import software.amazon.awssdk.services.s3.presigner.S3Presigner; +import software.amazon.awssdk.services.s3.presigner.model.GetObjectPresignRequest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.time.Duration; import java.util.UUID; @Service @@ -24,10 +27,13 @@ public class FileService { private static final Logger log = LoggerFactory.getLogger(FileService.class); private final S3Client s3Client; + private final S3Presigner s3Presigner; private final String bucketName; - public FileService(S3Client s3Client, @Value("${app.s3.bucket}") String bucketName) { + public FileService(S3Client s3Client, S3Presigner s3Presigner, + @Value("${app.s3.bucket}") String bucketName) { this.s3Client = s3Client; + this.s3Presigner = s3Presigner; this.bucketName = bucketName; } @@ -106,6 +112,24 @@ public class FileService { } } + /** + * Generates a presigned URL for downloading an object from S3/MinIO. + * Valid for 15 minutes — enough for OCR processing on CPU. + */ + public String generatePresignedUrl(String s3Key) { + GetObjectRequest getObjectRequest = GetObjectRequest.builder() + .bucket(bucketName) + .key(s3Key) + .build(); + + GetObjectPresignRequest presignRequest = GetObjectPresignRequest.builder() + .signatureDuration(Duration.ofMinutes(15)) + .getObjectRequest(getObjectRequest) + .build(); + + return s3Presigner.presignGetObject(presignRequest).url().toString(); + } + // ─── private helpers ────────────────────────────────────────────────────── private static String sha256Hex(byte[] bytes) { diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java index 5587b588..c92634ee 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java @@ -8,7 +8,6 @@ import org.raddatz.familienarchiv.exception.ErrorCode; import org.raddatz.familienarchiv.model.*; import org.raddatz.familienarchiv.repository.OcrJobRepository; import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository; -import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @@ -29,12 +28,7 @@ public class OcrService { private final AnnotationService annotationService; private final TranscriptionBlockRepository blockRepository; private final OcrJobRepository ocrJobRepository; - - @Value("${app.s3.internal-url:http://minio:9000}") - private String s3InternalUrl; - - @Value("${app.s3.bucket}") - private String bucketName; + private final FileService fileService; @Transactional public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) { @@ -78,7 +72,7 @@ public class OcrService { void processDocument(UUID documentId, Document doc, UUID userId) { clearExistingBlocks(documentId); - String pdfUrl = buildInternalUrl(doc.getFilePath()); + String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath()); List blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType()); createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash()); } @@ -115,7 +109,4 @@ public class OcrService { } } - String buildInternalUrl(String filePath) { - return s3InternalUrl + "/" + bucketName + "/" + filePath; - } } diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java index 187c144e..e043c3b7 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java @@ -32,7 +32,7 @@ class FileServiceTest { @BeforeEach void setUp() { s3Client = mock(S3Client.class); - fileService = new FileService(s3Client, "test-bucket"); + fileService = new FileService(s3Client, null, "test-bucket"); } @Test diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java index 61c62fa3..0c8dc70f 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java @@ -34,6 +34,7 @@ class OcrServiceTest { @Mock AnnotationService annotationService; @Mock TranscriptionBlockRepository blockRepository; @Mock OcrJobRepository ocrJobRepository; + @Mock FileService fileService; @InjectMocks OcrService ocrService; @@ -80,7 +81,7 @@ class OcrServiceTest { .scriptType(ScriptType.TYPEWRITER).build(); when(documentService.getDocumentById(docId)).thenReturn(doc); when(ocrHealthClient.isHealthy()).thenReturn(true); - // ocrService constructs the internal MinIO URL from S3 key + when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned"); when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of()); when(ocrJobRepository.save(any())).thenAnswer(inv -> { OcrJob job = inv.getArgument(0); @@ -103,7 +104,7 @@ class OcrServiceTest { .scriptType(ScriptType.UNKNOWN).build(); when(documentService.getDocumentById(docId)).thenReturn(doc); when(ocrHealthClient.isHealthy()).thenReturn(true); - // ocrService constructs the internal MinIO URL from S3 key + when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned"); when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of()); when(ocrJobRepository.save(any())).thenAnswer(inv -> { OcrJob job = inv.getArgument(0); @@ -128,7 +129,7 @@ class OcrServiceTest { when(documentService.getDocumentById(docId)).thenReturn(doc); when(ocrHealthClient.isHealthy()).thenReturn(true); - // ocrService constructs the internal MinIO URL from S3 key + when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned"); when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existingBlock)); when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of( new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello"))); @@ -158,7 +159,7 @@ class OcrServiceTest { when(documentService.getDocumentById(docId)).thenReturn(doc); when(ocrHealthClient.isHealthy()).thenReturn(true); - // ocrService constructs the internal MinIO URL from S3 key + when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned"); when(transcriptionService.listBlocks(docId)).thenReturn(List.of()); when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block1, block2)); when(ocrJobRepository.save(any())).thenAnswer(inv -> { diff --git a/ocr-service/main.py b/ocr-service/main.py index b1766516..34e996f3 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -82,7 +82,7 @@ async def run_ocr(request: OcrRequest): async def _download_and_convert_pdf(url: str) -> list[Image.Image]: - """Download a PDF from URL and convert each page to a PIL Image.""" + """Download a PDF from a presigned URL and convert each page to a PIL Image.""" async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client: response = await client.get(url) response.raise_for_status()