From 4500c99e403576845c320d70ccafad91f71f21d0 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 12 Apr 2026 22:16:52 +0200
Subject: [PATCH] fix(ocr): use presigned URLs for MinIO access from OCR service

The OCR service was getting 403 Forbidden because it tried to download
PDFs from MinIO using plain internal URLs without authentication.
MinIO buckets are private.

- Add S3Presigner bean to MinioConfig
- FileService.generatePresignedUrl(): generates 15-min presigned URLs
- OcrService uses presigned URLs instead of plain internal URLs
- Remove unused s3InternalUrl / bucketName @Value fields from OcrService

Co-Authored-By: Claude Sonnet 4.6
---
 .../familienarchiv/config/MinioConfig.java    | 14 ++++++++++
 .../familienarchiv/service/FileService.java   | 26 ++++++++++++++++++-
 .../familienarchiv/service/OcrService.java    | 13 ++--------
 .../service/FileServiceTest.java              |  2 +-
 .../service/OcrServiceTest.java               |  9 ++++---
 ocr-service/main.py                           |  2 +-
 6 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java b/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java
index a3fb187c..981ddb65 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/config/MinioConfig.java
@@ -5,6 +5,7 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
 import software.amazon.awssdk.regions.Region;
 import software.amazon.awssdk.services.s3.S3Client;
 import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.presigner.S3Presigner;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.boot.CommandLineRunner;
 import org.springframework.context.annotation.Bean;
@@ -44,6 +45,19 @@ public class MinioConfig {
                 .build();
     }
 
+    @Bean
+    public S3Presigner s3Presigner() {
+        return S3Presigner.builder()
+                .endpointOverride(URI.create(endpoint))
+                .serviceConfiguration(S3Configuration.builder()
+                        .pathStyleAccessEnabled(true)
+                        .build())
+                .region(Region.of(region))
+                .credentialsProvider(StaticCredentialsProvider.create(
+                        AwsBasicCredentials.create(accessKey, secretKey)))
+                .build();
+    }
+
     @Bean
     public CommandLineRunner testS3Connection(S3Client s3Client) {
         return args -> {
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java
index 57e225c6..acf6f23d 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/FileService.java
@@ -4,6 +4,8 @@ import software.amazon.awssdk.core.ResponseInputStream;
 import software.amazon.awssdk.core.sync.RequestBody;
 import software.amazon.awssdk.services.s3.S3Client;
 import software.amazon.awssdk.services.s3.model.*;
+import software.amazon.awssdk.services.s3.presigner.S3Presigner;
+import software.amazon.awssdk.services.s3.presigner.model.GetObjectPresignRequest;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -16,6 +18,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.time.Duration;
 import java.util.UUID;
 
 @Service
@@ -24,10 +27,13 @@ public class FileService {
     private static final Logger log = LoggerFactory.getLogger(FileService.class);
 
     private final S3Client s3Client;
+    private final S3Presigner s3Presigner;
     private final String bucketName;
 
-    public FileService(S3Client s3Client, @Value("${app.s3.bucket}") String bucketName) {
+    public FileService(S3Client s3Client, S3Presigner s3Presigner,
+                       @Value("${app.s3.bucket}") String bucketName) {
         this.s3Client = s3Client;
+        this.s3Presigner = s3Presigner;
         this.bucketName = bucketName;
     }
 
@@ -106,6 +112,24 @@ public class FileService {
         }
     }
 
+    /**
+     * Generates a presigned URL for downloading an object from S3/MinIO.
+     * Valid for 15 minutes — enough for OCR processing on CPU.
+     */
+    public String generatePresignedUrl(String s3Key) {
+        GetObjectRequest getObjectRequest = GetObjectRequest.builder()
+                .bucket(bucketName)
+                .key(s3Key)
+                .build();
+
+        GetObjectPresignRequest presignRequest = GetObjectPresignRequest.builder()
+                .signatureDuration(Duration.ofMinutes(15))
+                .getObjectRequest(getObjectRequest)
+                .build();
+
+        return s3Presigner.presignGetObject(presignRequest).url().toString();
+    }
+
     // ─── private helpers ──────────────────────────────────────────────────────
 
     private static String sha256Hex(byte[] bytes) {
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java
index 5587b588..c92634ee 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrService.java
@@ -8,7 +8,6 @@ import org.raddatz.familienarchiv.exception.ErrorCode;
 import org.raddatz.familienarchiv.model.*;
 import org.raddatz.familienarchiv.repository.OcrJobRepository;
 import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
-import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
 
@@ -29,12 +28,7 @@ public class OcrService {
     private final AnnotationService annotationService;
     private final TranscriptionBlockRepository blockRepository;
     private final OcrJobRepository ocrJobRepository;
-
-    @Value("${app.s3.internal-url:http://minio:9000}")
-    private String s3InternalUrl;
-
-    @Value("${app.s3.bucket}")
-    private String bucketName;
+    private final FileService fileService;
 
     @Transactional
     public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) {
@@ -78,7 +72,7 @@ public class OcrService {
 
     void processDocument(UUID documentId, Document doc, UUID userId) {
         clearExistingBlocks(documentId);
-        String pdfUrl = buildInternalUrl(doc.getFilePath());
+        String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath());
         List blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
         createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
     }
@@ -115,7 +109,4 @@ public class OcrService {
         }
     }
 
-    String buildInternalUrl(String filePath) {
-        return s3InternalUrl + "/" + bucketName + "/" + filePath;
-    }
 }
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java
index 187c144e..e043c3b7 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/FileServiceTest.java
@@ -32,7 +32,7 @@ class FileServiceTest {
     @BeforeEach
     void setUp() {
         s3Client = mock(S3Client.class);
-        fileService = new FileService(s3Client, "test-bucket");
+        fileService = new FileService(s3Client, null, "test-bucket");
     }
 
     @Test
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java
index 61c62fa3..0c8dc70f 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrServiceTest.java
@@ -34,6 +34,7 @@ class OcrServiceTest {
     @Mock AnnotationService annotationService;
     @Mock TranscriptionBlockRepository blockRepository;
     @Mock OcrJobRepository ocrJobRepository;
+    @Mock FileService fileService;
 
     @InjectMocks OcrService ocrService;
 
@@ -80,7 +81,7 @@ class OcrServiceTest {
                 .scriptType(ScriptType.TYPEWRITER).build();
         when(documentService.getDocumentById(docId)).thenReturn(doc);
         when(ocrHealthClient.isHealthy()).thenReturn(true);
-        // ocrService constructs the internal MinIO URL from S3 key
+        when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
         when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
         when(ocrJobRepository.save(any())).thenAnswer(inv -> {
             OcrJob job = inv.getArgument(0);
@@ -103,7 +104,7 @@ class OcrServiceTest {
                 .scriptType(ScriptType.UNKNOWN).build();
         when(documentService.getDocumentById(docId)).thenReturn(doc);
         when(ocrHealthClient.isHealthy()).thenReturn(true);
-        // ocrService constructs the internal MinIO URL from S3 key
+        when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
         when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
         when(ocrJobRepository.save(any())).thenAnswer(inv -> {
             OcrJob job = inv.getArgument(0);
@@ -128,7 +129,7 @@ class OcrServiceTest {
 
         when(documentService.getDocumentById(docId)).thenReturn(doc);
         when(ocrHealthClient.isHealthy()).thenReturn(true);
-        // ocrService constructs the internal MinIO URL from S3 key
+        when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
         when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existingBlock));
         when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
                 new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello")));
@@ -158,7 +159,7 @@ class OcrServiceTest {
 
         when(documentService.getDocumentById(docId)).thenReturn(doc);
         when(ocrHealthClient.isHealthy()).thenReturn(true);
-        // ocrService constructs the internal MinIO URL from S3 key
+        when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
         when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
         when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block1, block2));
         when(ocrJobRepository.save(any())).thenAnswer(inv -> {
diff --git a/ocr-service/main.py b/ocr-service/main.py
index b1766516..34e996f3 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -82,7 +82,7 @@ async def run_ocr(request: OcrRequest):
 
 
 async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
-    """Download a PDF from URL and convert each page to a PIL Image."""
+    """Download a PDF from a presigned URL and convert each page to a PIL Image."""
     async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
         response = await client.get(url)
         response.raise_for_status()