Compare commits

..

4 Commits

Author SHA1 Message Date
Marcel
581ba01d8d security(ocr): log warning on startup when running as root
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m3s
CI / OCR Service Tests (pull_request) Successful in 18s
CI / Backend Unit Tests (pull_request) Successful in 3m10s
CI / fail2ban Regex (pull_request) Successful in 42s
CI / Semgrep Security Scan (pull_request) Successful in 19s
CI / Compose Bucket Idempotency (pull_request) Successful in 59s
Adds a canary log line if os.getuid() == 0. Produces an observable
signal in container logs if the USER directive is ever removed from
the Dockerfile, without requiring an external audit tool.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-17 16:51:00 +02:00
Marcel
9db42d6cc1 fix(ocr): resolve HTRMOPO_DIR from env var, not ~ expansion
With --no-create-home, os.path.expanduser("~") resolves to "/" causing
kraken get to write to /.local/share/htrmopo. Replace with
os.environ.get("HTRMOPO_DIR", "/app/models/.htrmopo") so the path is
explicit and override-friendly without a home directory.

Adds two tests verifying env-var resolution and ~-free default.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-17 16:49:21 +02:00
Marcel
ab24786d2a security(ocr): harden compose — fix cache volume path, add read_only + cap_drop
Move ocr_cache mount from /root/.cache to /app/cache (correct path for
non-root user). Add HF_HOME so Hugging Face resolves to the same path.
Add runtime hardening: read_only, tmpfs /tmp (512 MB cap), cap_drop ALL,
no-new-privileges.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-17 16:47:18 +02:00
Marcel
1aca4c4a41 security(ocr): add non-root user and set HOME/HF_HOME in Dockerfile
CIS Docker §4.1: run uvicorn as UID 1000 (ocr) instead of root.
Creates /home/ocr and /app/cache with correct ownership so named
volumes inherit ocr:ocr on first Docker mount. Sets HOME and HF_HOME
so ~ expansion resolves to /home/ocr and Hugging Face caching resolves under /app/cache, not /root.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-17 16:46:25 +02:00
5 changed files with 44 additions and 2 deletions

View File

@@ -87,8 +87,9 @@ services:
memswap_limit: 12g
volumes:
- ocr_models:/app/models
- ocr_cache:/root/.cache # Hugging Face / ketos model download cache — prevents re-downloads on container recreate
- ocr_cache:/app/cache
environment:
HF_HOME: /app/cache
KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
OCR_CONFIDENCE_THRESHOLD: "0.3"
@@ -106,6 +107,12 @@ services:
timeout: 5s
retries: 12
start_period: 120s
read_only: true
tmpfs:
- /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images)
cap_drop: [ALL]
security_opt:
- no-new-privileges:true
# --- Backend: Spring Boot ---
backend:

View File

@@ -23,8 +23,16 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY . .
RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1000 ocr \
&& mkdir -p /home/ocr /app/models /app/cache \
&& chown -R ocr:ocr /app /home/ocr
RUN chmod +x /app/entrypoint.sh
ENV HOME=/home/ocr
ENV HF_HOME=/app/cache
USER ocr
EXPOSE 8000
CMD ["/app/entrypoint.sh"]

View File

@@ -24,7 +24,7 @@ log = logging.getLogger(__name__)
BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel")
# DOI for "General segmentation model for print and handwriting" — ketos 7 compatible.
BLLA_MODEL_DOI = "10.5281/zenodo.14602569"
HTRMOPO_DIR = os.path.expanduser("~/.local/share/htrmopo")
HTRMOPO_DIR = os.environ.get("HTRMOPO_DIR", "/app/models/.htrmopo")
def _model_is_loadable(path: str) -> bool:

View File

@@ -56,6 +56,8 @@ async def lifespan(app: FastAPI):
"""Load lightweight models at startup. Surya loads lazily on first request."""
global _models_ready
if os.getuid() == 0:
logger.warning("Running as root — CIS Docker §4.1 violation")
logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...")
kraken_engine.load_models()
load_spell_checker()

View File

@@ -1,10 +1,35 @@
"""Unit tests for ensure_blla_model.main()."""
import importlib
import os
from unittest.mock import MagicMock, call, patch
import ensure_blla_model
# ─── HTRMOPO_DIR env var resolution ──────────────────────────────────────────
def test_htrmopo_dir_reads_from_env_var():
    """HTRMOPO_DIR mirrors the HTRMOPO_DIR environment variable when it is set."""
    with patch.dict(os.environ, HTRMOPO_DIR="/custom/htrmopo"):
        # Reload so the module-level assignment is re-evaluated under the
        # patched environment; reload() returns the refreshed module object.
        observed = importlib.reload(ensure_blla_model).HTRMOPO_DIR

    # Reload once more with the real environment restored so the patched
    # value does not leak into the tests that follow.
    importlib.reload(ensure_blla_model)

    assert observed == "/custom/htrmopo"
def test_htrmopo_dir_default_is_fixed_path():
    """With the env var absent, HTRMOPO_DIR is a fixed path that does not rely on ~."""
    # patch.dict snapshots os.environ and restores it on exit, so removing
    # the variable inside the block is undone automatically afterwards.
    with patch.dict(os.environ):
        os.environ.pop("HTRMOPO_DIR", None)
        default_dir = importlib.reload(ensure_blla_model).HTRMOPO_DIR

    # Re-evaluate the module against the restored environment.
    importlib.reload(ensure_blla_model)

    assert "~" not in default_dir
    # A path starting with "/." would indicate "~" having expanded to "/".
    assert not default_dir.startswith("/.")
# ─── Model already loadable ───────────────────────────────────────────────────