@startuml
' C4 component diagram of the OCR orchestration slice of the API backend.
' Shows the Spring components inside the backend boundary, the external
' containers they talk to, and the two domain services referenced from
' another diagram (3c).

' The C4 macros used below (Container, ContainerDb, System_Boundary,
' Component, Rel) must be loaded before first use.
' NOTE(review): the include target was missing in the source; restored as the
' PlantUML standard-library C4 component include. Confirm it matches the
' include style used by the project's other diagrams.
!include <C4/C4_Component>

title Component Diagram: API Backend — OCR Orchestration

' --- External containers -------------------------------------------------
Container(frontend, "Web Frontend", "SvelteKit")
ContainerDb(db, "PostgreSQL", "PostgreSQL 16")
ContainerDb(minio, "Object Storage", "MinIO (S3-compatible)")
Container(ocrPy, "OCR Service", "Python FastAPI")

' --- Components inside the API backend -----------------------------------
System_Boundary(backend, "API Backend (Spring Boot)") {
    Component(ocrCtrl, "OcrController", "Spring MVC — /api/ocr", "REST entry point: trigger single or batch OCR jobs, stream progress via SSE, query job status, and manage training runs and per-sender models.")
    Component(ocrSvc, "OcrService", "Spring Service", "Creates OcrJob and OcrJobDocument records, checks Python service health, and delegates async execution to OcrAsyncRunner.")
    Component(ocrBatch, "OcrBatchService", "Spring Service", "Orchestrates multi-document OCR jobs, iterating documents and delegating each to OcrAsyncRunner.")
    Component(ocrAsync, "OcrAsyncRunner", "Spring Component — @Async", "Async worker that streams OCR results from Python page by page, persists transcription blocks and annotations via domain services, and emits progress via SSE.")
    Component(ocrClient, "RestClientOcrClient", "Spring Component", "HTTP client wrapping the Python service: POST /ocr/stream (NDJSON), /train, /segtrain, and /train-sender. Falls back from streaming to batch on 404.")
    Component(ocrTraining, "OcrTrainingService", "Spring Service", "Orchestrates model training: exports training data as ZIP, calls Python /train or /segtrain, persists training metrics in OcrTrainingRunRepository.")
    Component(ocrJobRepo, "OcrJobRepository, OcrJobDocumentRepository", "Spring Data JPA", "Reads and writes OcrJob and OcrJobDocument records. Tracks job status (RUNNING/DONE/FAILED), per-document progress, page counts, and error messages.")
}

' --- Domain services declared outside the backend boundary ---------------
' Their own descriptions point to diagram 3c for details.
Component(transcriptionSvc, "TranscriptionService", "Spring Service", "See diagram 3c. Called by OcrAsyncRunner to persist transcription blocks per page.")
Component(annotationSvc, "AnnotationService", "Spring Service", "See diagram 3c. Called by OcrAsyncRunner to persist OCR-generated annotation regions per page.")

' --- Inbound traffic and controller fan-out ------------------------------
Rel(frontend, ocrCtrl, "OCR trigger, status, and progress requests", "HTTP / JSON / SSE")
Rel(ocrCtrl, ocrSvc, "Single-document jobs")
Rel(ocrCtrl, ocrBatch, "Batch jobs")
Rel(ocrCtrl, ocrTraining, "Training runs")

' --- Delegation to the async worker --------------------------------------
' NOTE(review): OcrService's description says it creates OcrJob records and
' checks Python service health, but no Rel to ocrJobRepo or ocrClient is
' drawn for it. Confirm whether those relations should be added.
Rel(ocrSvc, ocrAsync, "Delegates async execution")
Rel(ocrBatch, ocrAsync, "Delegates async execution")

' --- Calls to the Python OCR service -------------------------------------
Rel(ocrAsync, ocrClient, "Streams OCR results page by page", "HTTP / NDJSON")
Rel(ocrTraining, ocrClient, "Sends training data ZIP", "HTTP / multipart")
Rel(ocrClient, ocrPy, "POST /ocr/stream, /train, /segtrain, /train-sender", "HTTP / REST")

' --- Persistence of OCR results and job state ----------------------------
Rel(ocrAsync, transcriptionSvc, "Saves transcription blocks per page")
Rel(ocrAsync, annotationSvc, "Saves annotation regions per page")
Rel(ocrAsync, ocrJobRepo, "Reads / writes OCR job state")
Rel(ocrJobRepo, db, "SQL queries", "JDBC")

' --- Object storage access -----------------------------------------------
Rel(ocrAsync, minio, "Generates presigned URLs for PDF fetch", "S3 API")
Rel(ocrPy, minio, "Fetches PDF via presigned URL", "HTTP / S3 presigned")

' NOTE(review): OcrTrainingService's description names
' OcrTrainingRunRepository, which is not drawn as a component; the relation
' below goes straight to the database. Confirm this simplification is intended.
Rel(ocrTraining, db, "Persists training run metrics", "JDBC")
@enduml