fix(ocr): use presigned URLs for MinIO access from OCR service
Some checks failed
CI / Unit & Component Tests (push) Failing after 2s
CI / Backend Unit Tests (push) Failing after 0s
CI / Unit & Component Tests (pull_request) Failing after 1s
CI / Backend Unit Tests (pull_request) Failing after 1s

The OCR service was getting 403 Forbidden because it tried to
download PDFs from MinIO using plain internal URLs without
authentication. MinIO buckets are private.

- Add S3Presigner bean to MinioConfig
- Add FileService.generatePresignedUrl(), which generates presigned download URLs valid for 15 minutes
- OcrService uses presigned URLs instead of plain internal URLs
- Remove unused s3InternalUrl / bucketName @Value fields from OcrService

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-12 22:16:52 +02:00
parent 7a4da7cb98
commit 4500c99e40
6 changed files with 48 additions and 18 deletions

View File

@@ -5,6 +5,7 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.S3Configuration;
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.CommandLineRunner;
import org.springframework.context.annotation.Bean;
@@ -44,6 +45,19 @@ public class MinioConfig {
.build();
}
/**
 * Creates the {@link S3Presigner} bean used to sign S3/MinIO requests.
 *
 * <p>The presigner targets this configuration's {@code endpoint}, uses path-style
 * addressing, and signs with the static {@code accessKey}/{@code secretKey} pair for
 * the configured {@code region}.
 *
 * @return a fully configured presigner
 */
@Bean
public S3Presigner s3Presigner() {
    S3Presigner.Builder presignerBuilder = S3Presigner.builder();

    // Point the presigner at the configured endpoint instead of the SDK default.
    presignerBuilder.endpointOverride(URI.create(endpoint));

    // Path-style addressing: the bucket goes into the URL path, not the host name.
    S3Configuration pathStyleConfig = S3Configuration.builder()
            .pathStyleAccessEnabled(true)
            .build();
    presignerBuilder.serviceConfiguration(pathStyleConfig);

    presignerBuilder.region(Region.of(region));

    AwsBasicCredentials staticKeys = AwsBasicCredentials.create(accessKey, secretKey);
    presignerBuilder.credentialsProvider(StaticCredentialsProvider.create(staticKeys));

    return presignerBuilder.build();
}
@Bean
public CommandLineRunner testS3Connection(S3Client s3Client) {
return args -> {

View File

@@ -4,6 +4,8 @@ import software.amazon.awssdk.core.ResponseInputStream;
import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.*;
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
import software.amazon.awssdk.services.s3.presigner.model.GetObjectPresignRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -16,6 +18,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.util.UUID;
@Service
@@ -24,10 +27,13 @@ public class FileService {
private static final Logger log = LoggerFactory.getLogger(FileService.class);
private final S3Client s3Client;
private final S3Presigner s3Presigner;
private final String bucketName;
public FileService(S3Client s3Client, @Value("${app.s3.bucket}") String bucketName) {
public FileService(S3Client s3Client, S3Presigner s3Presigner,
@Value("${app.s3.bucket}") String bucketName) {
this.s3Client = s3Client;
this.s3Presigner = s3Presigner;
this.bucketName = bucketName;
}
@@ -106,6 +112,24 @@ public class FileService {
}
}
/**
 * Builds a presigned GET URL for an object stored in the configured S3/MinIO bucket.
 *
 * <p>The signature is valid for 15 minutes, a window chosen to be enough for OCR
 * processing on CPU.
 *
 * @param s3Key key of the object inside the bucket
 * @return the presigned download URL as a string
 */
public String generatePresignedUrl(String s3Key) {
    Duration validity = Duration.ofMinutes(15);

    GetObjectPresignRequest signingRequest = GetObjectPresignRequest.builder()
            .signatureDuration(validity)
            .getObjectRequest(GetObjectRequest.builder()
                    .bucket(bucketName)
                    .key(s3Key)
                    .build())
            .build();

    return s3Presigner.presignGetObject(signingRequest).url().toString();
}
// ─── private helpers ──────────────────────────────────────────────────────
private static String sha256Hex(byte[] bytes) {

View File

@@ -8,7 +8,6 @@ import org.raddatz.familienarchiv.exception.ErrorCode;
import org.raddatz.familienarchiv.model.*;
import org.raddatz.familienarchiv.repository.OcrJobRepository;
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
@@ -29,12 +28,7 @@ public class OcrService {
private final AnnotationService annotationService;
private final TranscriptionBlockRepository blockRepository;
private final OcrJobRepository ocrJobRepository;
@Value("${app.s3.internal-url:http://minio:9000}")
private String s3InternalUrl;
@Value("${app.s3.bucket}")
private String bucketName;
private final FileService fileService;
@Transactional
public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) {
@@ -78,7 +72,7 @@ public class OcrService {
void processDocument(UUID documentId, Document doc, UUID userId) {
clearExistingBlocks(documentId);
String pdfUrl = buildInternalUrl(doc.getFilePath());
String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath());
List<OcrBlockResult> blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
}
@@ -115,7 +109,4 @@ public class OcrService {
}
}
String buildInternalUrl(String filePath) {
return s3InternalUrl + "/" + bucketName + "/" + filePath;
}
}

View File

@@ -32,7 +32,7 @@ class FileServiceTest {
@BeforeEach
void setUp() {
s3Client = mock(S3Client.class);
fileService = new FileService(s3Client, "test-bucket");
fileService = new FileService(s3Client, null, "test-bucket");
}
@Test

View File

@@ -34,6 +34,7 @@ class OcrServiceTest {
@Mock AnnotationService annotationService;
@Mock TranscriptionBlockRepository blockRepository;
@Mock OcrJobRepository ocrJobRepository;
@Mock FileService fileService;
@InjectMocks OcrService ocrService;
@@ -80,7 +81,7 @@ class OcrServiceTest {
.scriptType(ScriptType.TYPEWRITER).build();
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(ocrHealthClient.isHealthy()).thenReturn(true);
// ocrService constructs the internal MinIO URL from S3 key
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
OcrJob job = inv.getArgument(0);
@@ -103,7 +104,7 @@ class OcrServiceTest {
.scriptType(ScriptType.UNKNOWN).build();
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(ocrHealthClient.isHealthy()).thenReturn(true);
// ocrService constructs the internal MinIO URL from S3 key
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
OcrJob job = inv.getArgument(0);
@@ -128,7 +129,7 @@ class OcrServiceTest {
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(ocrHealthClient.isHealthy()).thenReturn(true);
// ocrService constructs the internal MinIO URL from S3 key
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existingBlock));
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello")));
@@ -158,7 +159,7 @@ class OcrServiceTest {
when(documentService.getDocumentById(docId)).thenReturn(doc);
when(ocrHealthClient.isHealthy()).thenReturn(true);
// ocrService constructs the internal MinIO URL from S3 key
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block1, block2));
when(ocrJobRepository.save(any())).thenAnswer(inv -> {

View File

@@ -82,7 +82,7 @@ async def run_ocr(request: OcrRequest):
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
"""Download a PDF from URL and convert each page to a PIL Image."""
"""Download a PDF from a presigned URL and convert each page to a PIL Image."""
async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
response = await client.get(url)
response.raise_for_status()