feat(ocr): add /train endpoint to OCR service and OcrClient.trainModel()

- POST /train in ocr-service with ZIP Slip validation, TemporaryDirectory,
  ketos transfer learning, timestamped backups (keep last 3), in-process reload
- X-Training-Token auth (no-op in dev when TRAINING_TOKEN env is empty)
- trainModel() in OcrClient interface + RestClientOcrClient (10-min timeout,
  multipart upload, forwards X-Training-Token when configured)
- TRAINING_TOKEN env var wired in docker-compose; --workers 2 in Dockerfile
  so /health stays responsive during synchronous training

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 14:40:53 +02:00
parent cfa3c4df67
commit bc97a2dade
6 changed files with 188 additions and 8 deletions

View File

@@ -23,4 +23,4 @@ COPY . .
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]

View File

@@ -1,16 +1,21 @@
"""OCR microservice — FastAPI app with Surya and Kraken engine support."""
import asyncio
import glob
import io
import json
import logging
import os
import shutil
import tempfile
import zipfile
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from urllib.parse import urlparse
import httpx
import pypdfium2 as pdfium
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, Header, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from PIL import Image
@@ -19,6 +24,9 @@ from engines import kraken as kraken_engine
from engines import surya as surya_engine
from models import OcrBlock, OcrRequest
TRAINING_TOKEN = os.environ.get("TRAINING_TOKEN", "")
KRAKEN_MODEL_PATH = os.environ.get("KRAKEN_MODEL_PATH", "/app/models/german_kurrent.mlmodel")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -169,6 +177,107 @@ async def run_ocr_stream(request: OcrRequest):
)
def _check_training_token(x_training_token: str | None) -> None:
"""Validate training token if TRAINING_TOKEN env var is set."""
if TRAINING_TOKEN and x_training_token != TRAINING_TOKEN:
raise HTTPException(status_code=403, detail="Invalid or missing X-Training-Token")
def _validate_zip_entry(name: str, extract_dir: str) -> None:
"""Reject ZIP Slip attacks: path traversal and absolute paths."""
if os.path.isabs(name) or name.startswith(".."):
raise HTTPException(status_code=400, detail=f"Unsafe ZIP entry: {name}")
resolved = os.path.realpath(os.path.join(extract_dir, name))
if not resolved.startswith(os.path.realpath(extract_dir)):
raise HTTPException(status_code=400, detail=f"ZIP Slip detected: {name}")
def _rotate_backups(model_path: str, keep: int = 3) -> None:
"""Keep only the last `keep` timestamped backups of the model."""
pattern = model_path + ".*.bak"
backups = sorted(glob.glob(pattern))
for old in backups[:-keep]:
try:
os.remove(old)
except OSError:
logger.warning("Could not remove old backup: %s", old)
@app.post("/train")
async def train_model(
file: UploadFile,
x_training_token: str | None = Header(default=None),
):
"""Fine-tune the Kurrent recognition model with uploaded training data.
Accepts a ZIP archive containing .png/.gt.txt training pairs exported
by the Java backend. Training mutates in-process model state — not safe
if the service is replicated.
"""
_check_training_token(x_training_token)
if not _models_ready:
raise HTTPException(status_code=503, detail="Models not loaded yet")
zip_bytes = await file.read()
training_run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
log = logging.LoggerAdapter(logger, {"training_run_id": training_run_id})
log.info("Starting training run %s", training_run_id)
def _run_training() -> dict:
with tempfile.TemporaryDirectory() as tmp_dir:
# Extract ZIP with safety checks
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
for entry in zf.namelist():
_validate_zip_entry(entry, tmp_dir)
zf.extractall(tmp_dir)
log.info("Extracted %d ZIP entries to %s", len(os.listdir(tmp_dir)), tmp_dir)
# Run ketos train (transfer learning from existing model)
from kraken import ketos
ground_truth = glob.glob(os.path.join(tmp_dir, "*.gt.txt"))
if not ground_truth:
raise HTTPException(status_code=422, detail="No ground-truth files found in ZIP")
log.info("Training on %d ground-truth pairs", len(ground_truth))
output_model_path = os.path.join(tmp_dir, "fine_tuned.mlmodel")
result = ketos.train(
ground_truth=ground_truth,
load=KRAKEN_MODEL_PATH,
output=output_model_path,
format_type="path",
)
epochs = getattr(result, "epochs", None) or 0
loss = getattr(result, "best_loss", None)
accuracy = getattr(result, "best_accuracy", None)
log.info("Training complete — epochs=%s loss=%s accuracy=%s", epochs, loss, accuracy)
# Backup existing model and replace
if os.path.exists(KRAKEN_MODEL_PATH):
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
backup_path = f"{KRAKEN_MODEL_PATH}.{timestamp}.bak"
shutil.copy2(KRAKEN_MODEL_PATH, backup_path)
log.info("Backed up model to %s", backup_path)
_rotate_backups(KRAKEN_MODEL_PATH, keep=3)
shutil.move(output_model_path, KRAKEN_MODEL_PATH)
log.info("Replaced model at %s", KRAKEN_MODEL_PATH)
# Reload model in-process
kraken_engine.load_models()
log.info("Reloaded Kraken model in-process")
return {"loss": loss, "accuracy": accuracy, "epochs": epochs}
result = await asyncio.to_thread(_run_training)
return result
async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
"""Download a PDF from a presigned URL and convert each page to a PIL Image."""
_validate_url(url)