fix(ocr): use presigned URLs for MinIO access from OCR service
The OCR service was getting 403 Forbidden because it tried to download PDFs from MinIO using plain internal URLs without authentication, and the MinIO buckets are private.

- Add S3Presigner bean to MinioConfig
- Add FileService.generatePresignedUrl(), which generates 15-minute presigned URLs
- OcrService uses presigned URLs instead of plain internal URLs
- Remove unused s3InternalUrl / bucketName @Value fields from OcrService

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
|
||||
import software.amazon.awssdk.regions.Region;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.S3Configuration;
|
||||
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.CommandLineRunner;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
@@ -44,6 +45,19 @@ public class MinioConfig {
|
||||
.build();
|
||||
}
|
||||
|
||||
@Bean
|
||||
public S3Presigner s3Presigner() {
|
||||
return S3Presigner.builder()
|
||||
.endpointOverride(URI.create(endpoint))
|
||||
.serviceConfiguration(S3Configuration.builder()
|
||||
.pathStyleAccessEnabled(true)
|
||||
.build())
|
||||
.region(Region.of(region))
|
||||
.credentialsProvider(StaticCredentialsProvider.create(
|
||||
AwsBasicCredentials.create(accessKey, secretKey)))
|
||||
.build();
|
||||
}
|
||||
|
||||
@Bean
|
||||
public CommandLineRunner testS3Connection(S3Client s3Client) {
|
||||
return args -> {
|
||||
|
||||
@@ -4,6 +4,8 @@ import software.amazon.awssdk.core.ResponseInputStream;
|
||||
import software.amazon.awssdk.core.sync.RequestBody;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.model.*;
|
||||
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
|
||||
import software.amazon.awssdk.services.s3.presigner.model.GetObjectPresignRequest;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -16,6 +18,7 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.UUID;
|
||||
|
||||
@Service
|
||||
@@ -24,10 +27,13 @@ public class FileService {
|
||||
private static final Logger log = LoggerFactory.getLogger(FileService.class);
|
||||
|
||||
private final S3Client s3Client;
|
||||
private final S3Presigner s3Presigner;
|
||||
private final String bucketName;
|
||||
|
||||
public FileService(S3Client s3Client, @Value("${app.s3.bucket}") String bucketName) {
|
||||
public FileService(S3Client s3Client, S3Presigner s3Presigner,
|
||||
@Value("${app.s3.bucket}") String bucketName) {
|
||||
this.s3Client = s3Client;
|
||||
this.s3Presigner = s3Presigner;
|
||||
this.bucketName = bucketName;
|
||||
}
|
||||
|
||||
@@ -106,6 +112,24 @@ public class FileService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a presigned URL for downloading an object from S3/MinIO.
|
||||
* Valid for 15 minutes — enough for OCR processing on CPU.
|
||||
*/
|
||||
public String generatePresignedUrl(String s3Key) {
|
||||
GetObjectRequest getObjectRequest = GetObjectRequest.builder()
|
||||
.bucket(bucketName)
|
||||
.key(s3Key)
|
||||
.build();
|
||||
|
||||
GetObjectPresignRequest presignRequest = GetObjectPresignRequest.builder()
|
||||
.signatureDuration(Duration.ofMinutes(15))
|
||||
.getObjectRequest(getObjectRequest)
|
||||
.build();
|
||||
|
||||
return s3Presigner.presignGetObject(presignRequest).url().toString();
|
||||
}
|
||||
|
||||
// ─── private helpers ──────────────────────────────────────────────────────
|
||||
|
||||
private static String sha256Hex(byte[] bytes) {
|
||||
|
||||
@@ -8,7 +8,6 @@ import org.raddatz.familienarchiv.exception.ErrorCode;
|
||||
import org.raddatz.familienarchiv.model.*;
|
||||
import org.raddatz.familienarchiv.repository.OcrJobRepository;
|
||||
import org.raddatz.familienarchiv.repository.TranscriptionBlockRepository;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
@@ -29,12 +28,7 @@ public class OcrService {
|
||||
private final AnnotationService annotationService;
|
||||
private final TranscriptionBlockRepository blockRepository;
|
||||
private final OcrJobRepository ocrJobRepository;
|
||||
|
||||
@Value("${app.s3.internal-url:http://minio:9000}")
|
||||
private String s3InternalUrl;
|
||||
|
||||
@Value("${app.s3.bucket}")
|
||||
private String bucketName;
|
||||
private final FileService fileService;
|
||||
|
||||
@Transactional
|
||||
public UUID startOcr(UUID documentId, ScriptType scriptTypeOverride, UUID userId) {
|
||||
@@ -78,7 +72,7 @@ public class OcrService {
|
||||
void processDocument(UUID documentId, Document doc, UUID userId) {
|
||||
clearExistingBlocks(documentId);
|
||||
|
||||
String pdfUrl = buildInternalUrl(doc.getFilePath());
|
||||
String pdfUrl = fileService.generatePresignedUrl(doc.getFilePath());
|
||||
List<OcrBlockResult> blocks = ocrClient.extractBlocks(pdfUrl, doc.getScriptType());
|
||||
createTranscriptionBlocks(documentId, blocks, userId, doc.getFileHash());
|
||||
}
|
||||
@@ -115,7 +109,4 @@ public class OcrService {
|
||||
}
|
||||
}
|
||||
|
||||
String buildInternalUrl(String filePath) {
|
||||
return s3InternalUrl + "/" + bucketName + "/" + filePath;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ class FileServiceTest {
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
s3Client = mock(S3Client.class);
|
||||
fileService = new FileService(s3Client, "test-bucket");
|
||||
fileService = new FileService(s3Client, null, "test-bucket");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
@@ -34,6 +34,7 @@ class OcrServiceTest {
|
||||
@Mock AnnotationService annotationService;
|
||||
@Mock TranscriptionBlockRepository blockRepository;
|
||||
@Mock OcrJobRepository ocrJobRepository;
|
||||
@Mock FileService fileService;
|
||||
|
||||
@InjectMocks OcrService ocrService;
|
||||
|
||||
@@ -80,7 +81,7 @@ class OcrServiceTest {
|
||||
.scriptType(ScriptType.TYPEWRITER).build();
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// ocrService constructs the internal MinIO URL from S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
OcrJob job = inv.getArgument(0);
|
||||
@@ -103,7 +104,7 @@ class OcrServiceTest {
|
||||
.scriptType(ScriptType.UNKNOWN).build();
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// ocrService constructs the internal MinIO URL from S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of());
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
OcrJob job = inv.getArgument(0);
|
||||
@@ -128,7 +129,7 @@ class OcrServiceTest {
|
||||
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// ocrService constructs the internal MinIO URL from S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(transcriptionService.listBlocks(docId)).thenReturn(List.of(existingBlock));
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(
|
||||
new OcrBlockResult(0, 0.1, 0.1, 0.8, 0.04, null, "Hello")));
|
||||
@@ -158,7 +159,7 @@ class OcrServiceTest {
|
||||
|
||||
when(documentService.getDocumentById(docId)).thenReturn(doc);
|
||||
when(ocrHealthClient.isHealthy()).thenReturn(true);
|
||||
// ocrService constructs the internal MinIO URL from S3 key
|
||||
when(fileService.generatePresignedUrl(any())).thenReturn("http://minio/presigned");
|
||||
when(transcriptionService.listBlocks(docId)).thenReturn(List.of());
|
||||
when(ocrClient.extractBlocks(any(), any())).thenReturn(List.of(block1, block2));
|
||||
when(ocrJobRepository.save(any())).thenAnswer(inv -> {
|
||||
|
||||
@@ -82,7 +82,7 @@ async def run_ocr(request: OcrRequest):
|
||||
|
||||
|
||||
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
|
||||
"""Download a PDF from URL and convert each page to a PIL Image."""
|
||||
"""Download a PDF from a presigned URL and convert each page to a PIL Image."""
|
||||
async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
Reference in New Issue
Block a user