From 1aca4c4a41fa294f5715382aaad4154d29c772c0 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 16:46:25 +0200 Subject: [PATCH 01/12] security(ocr): add non-root user and set HOME/HF_HOME in Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CIS Docker §4.1: run uvicorn as UID 1000 (ocr) instead of root. Creates /home/ocr and /app/cache with correct ownership so named volumes inherit ocr:ocr on first Docker mount. Sets HOME and HF_HOME so ~ expansion and Hugging Face caching resolve under /app, not /root. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 2de1d862..e6d60360 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -23,8 +23,16 @@ RUN pip install --no-cache-dir -r requirements.txt COPY . . +RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1000 ocr \ + && mkdir -p /home/ocr /app/models /app/cache \ + && chown -R ocr:ocr /app /home/ocr RUN chmod +x /app/entrypoint.sh +ENV HOME=/home/ocr +ENV HF_HOME=/app/cache + +USER ocr + EXPOSE 8000 CMD ["/app/entrypoint.sh"] -- 2.49.1 From ab24786d2acaa727870946a6b6e77dfcabeeb105 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 16:47:18 +0200 Subject: [PATCH 02/12] =?UTF-8?q?security(ocr):=20harden=20compose=20?= =?UTF-8?q?=E2=80=94=20fix=20cache=20volume=20path,=20add=20read=5Fonly=20?= =?UTF-8?q?+=20cap=5Fdrop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move ocr_cache mount from /root/.cache to /app/cache (correct path for non-root user). Add HF_HOME so Hugging Face resolves to the same path. Add runtime hardening: read_only, tmpfs /tmp (512 MB cap), cap_drop ALL, no-new-privileges. 
Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2a3b7407..c256fa39 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -87,8 +87,9 @@ services: memswap_limit: 12g volumes: - ocr_models:/app/models - - ocr_cache:/root/.cache # Hugging Face / ketos model download cache — prevents re-downloads on container recreate + - ocr_cache:/app/cache environment: + HF_HOME: /app/cache KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}" OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -106,6 +107,12 @@ services: timeout: 5s retries: 12 start_period: 120s + read_only: true + tmpfs: + - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + cap_drop: [ALL] + security_opt: + - no-new-privileges:true # --- Backend: Spring Boot --- backend: -- 2.49.1 From 9db42d6cc131e7365fa32ee9c45342f6ce613a10 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 16:49:21 +0200 Subject: [PATCH 03/12] fix(ocr): resolve HTRMOPO_DIR from env var, not ~ expansion With --no-create-home, os.path.expanduser("~") resolves to "/" causing kraken get to write to /.local/share/htrmopo. Replace with os.environ.get("HTRMOPO_DIR", "/app/models/.htrmopo") so the path is explicit and override-friendly without a home directory. Adds two tests verifying env-var resolution and ~-free default. 
Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/ensure_blla_model.py | 2 +- ocr-service/test_ensure_blla_model.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/ocr-service/ensure_blla_model.py b/ocr-service/ensure_blla_model.py index d0fa0305..0328c0cb 100644 --- a/ocr-service/ensure_blla_model.py +++ b/ocr-service/ensure_blla_model.py @@ -24,7 +24,7 @@ log = logging.getLogger(__name__) BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel") # DOI for "General segmentation model for print and handwriting" — ketos 7 compatible. BLLA_MODEL_DOI = "10.5281/zenodo.14602569" -HTRMOPO_DIR = os.path.expanduser("~/.local/share/htrmopo") +HTRMOPO_DIR = os.environ.get("HTRMOPO_DIR", "/app/models/.htrmopo") def _model_is_loadable(path: str) -> bool: diff --git a/ocr-service/test_ensure_blla_model.py b/ocr-service/test_ensure_blla_model.py index cc171e92..a3fa0698 100644 --- a/ocr-service/test_ensure_blla_model.py +++ b/ocr-service/test_ensure_blla_model.py @@ -1,10 +1,35 @@ """Unit tests for ensure_blla_model.main().""" +import importlib +import os from unittest.mock import MagicMock, call, patch import ensure_blla_model +# ─── HTRMOPO_DIR env var resolution ────────────────────────────────────────── + + +def test_htrmopo_dir_reads_from_env_var(): + """HTRMOPO_DIR uses the HTRMOPO_DIR env var when set, not ~ expansion.""" + with patch.dict(os.environ, {"HTRMOPO_DIR": "/custom/htrmopo"}): + importlib.reload(ensure_blla_model) + result = ensure_blla_model.HTRMOPO_DIR + importlib.reload(ensure_blla_model) + assert result == "/custom/htrmopo" + + +def test_htrmopo_dir_default_is_fixed_path(): + """Default HTRMOPO_DIR is a fixed path not derived from ~ (no-create-home safe).""" + clean_env = {k: v for k, v in os.environ.items() if k != "HTRMOPO_DIR"} + with patch.dict(os.environ, clean_env, clear=True): + importlib.reload(ensure_blla_model) + result = ensure_blla_model.HTRMOPO_DIR + 
importlib.reload(ensure_blla_model) + assert "~" not in result + assert not result.startswith("/.") + + # ─── Model already loadable ─────────────────────────────────────────────────── -- 2.49.1 From 581ba01d8d2a5ea54ca510bea92df3399163ffb7 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 16:51:00 +0200 Subject: [PATCH 04/12] security(ocr): log warning on startup when running as root Adds a canary log line if os.getuid() == 0. Produces an observable signal in container logs if the USER directive is ever removed from the Dockerfile, without requiring an external audit tool. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocr-service/main.py b/ocr-service/main.py index bc541c78..783bf224 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -56,6 +56,8 @@ async def lifespan(app: FastAPI): """Load lightweight models at startup. Surya loads lazily on first request.""" global _models_ready + if os.getuid() == 0: + logger.warning("Running as root — CIS Docker §4.1 violation") logger.info("Loading Kraken model at startup (Surya loads lazily on first OCR request)...") kraken_engine.load_models() load_spell_checker() -- 2.49.1 From 53bd57466059e65966394be5239fce5a71cb0d3e Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 17:26:58 +0200 Subject: [PATCH 05/12] test(ocr): replace vacuous startswith assertion with equality check Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_ensure_blla_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocr-service/test_ensure_blla_model.py b/ocr-service/test_ensure_blla_model.py index a3fa0698..cb5c6f7d 100644 --- a/ocr-service/test_ensure_blla_model.py +++ b/ocr-service/test_ensure_blla_model.py @@ -26,8 +26,7 @@ def test_htrmopo_dir_default_is_fixed_path(): importlib.reload(ensure_blla_model) result = ensure_blla_model.HTRMOPO_DIR importlib.reload(ensure_blla_model) - assert "~" not in result - assert not 
result.startswith("/.") + assert result == "/app/models/.htrmopo" # ─── Model already loadable ─────────────────────────────────────────────────── -- 2.49.1 From eb63df200059e79b2c1f729e28b58a70ce2a8d6e Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 17:29:47 +0200 Subject: [PATCH 06/12] test(ocr): add startup root canary tests for main.py lifespan Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/test_main.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 ocr-service/test_main.py diff --git a/ocr-service/test_main.py b/ocr-service/test_main.py new file mode 100644 index 00000000..a5eca819 --- /dev/null +++ b/ocr-service/test_main.py @@ -0,0 +1,36 @@ +"""Tests for main.py — startup behavior.""" + +import logging +from unittest.mock import patch + +import pytest +from httpx import ASGITransport, AsyncClient + +from main import app + + +# ─── Root canary ────────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_startup_logs_warning_when_running_as_root(caplog): + """Lifespan emits a WARNING when the process uid is 0 (running as root).""" + with patch("main.os.getuid", return_value=0), \ + patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + caplog.at_level(logging.WARNING, logger="main"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test"): + pass + assert "Running as root" in caplog.text + + +@pytest.mark.asyncio +async def test_startup_does_not_warn_when_running_as_non_root(caplog): + """Lifespan does not emit a root warning when running as a non-root user.""" + with patch("main.os.getuid", return_value=1000), \ + patch("main.kraken_engine.load_models"), \ + patch("main.load_spell_checker"), \ + caplog.at_level(logging.WARNING, logger="main"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test"): + pass + assert "Running as root" not in caplog.text -- 2.49.1 From 
fc8b4b164b0c242b5a6add454fc2e6bc49577873 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 17:30:39 +0200 Subject: [PATCH 07/12] security(ocr): redirect XDG cache and Torch home away from read-only HOME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents PyTorch/Matplotlib/Ketos from writing to /home/ocr which is on the read-only container filesystem — fixes Nora's blocker. Also restores the explanatory comment on the ocr_cache volume mount. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 4 +++- ocr-service/Dockerfile | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index c256fa39..53a1cf97 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -87,9 +87,11 @@ services: memswap_limit: 12g volumes: - ocr_models:/app/models - - ocr_cache:/app/cache + - ocr_cache:/app/cache # HuggingFace / ketos cache — prevents re-downloads on recreate (HF_HOME) environment: HF_HOME: /app/cache + XDG_CACHE_HOME: /app/cache + TORCH_HOME: /app/models/torch KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}" OCR_CONFIDENCE_THRESHOLD: "0.3" diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index e6d60360..9ad75f5c 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -30,6 +30,8 @@ RUN chmod +x /app/entrypoint.sh ENV HOME=/home/ocr ENV HF_HOME=/app/cache +ENV XDG_CACHE_HOME=/app/cache +ENV TORCH_HOME=/app/models/torch USER ocr -- 2.49.1 From 38973a014e177443619ea772e21ed1418d6264cf Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 17:32:02 +0200 Subject: [PATCH 08/12] docs: add XDG_CACHE_HOME/TORCH_HOME to OCR env table and upgrade notes for PR #611 Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 0b852e4d..9da72276 100644 --- a/docs/DEPLOYMENT.md +++ 
b/docs/DEPLOYMENT.md @@ -19,6 +19,7 @@ This doc is the Day-1 checklist and operational reference. It links to the canon 5. [Backup + recovery](#5-backup--recovery) 6. [Common operational tasks](#6-common-operational-tasks) 7. [Known limitations](#7-known-limitations) +8. [Upgrade notes](#8-upgrade-notes) --- @@ -140,6 +141,8 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | `KRAKEN_MODEL_PATH` | Directory containing Kraken HTR models (populated by `download-kraken-models.sh`) | `/app/models/` | — | — | | `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — | | `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — | +| `XDG_CACHE_HOME` | XDG cache base dir — redirects Matplotlib and other XDG-aware libraries away from the read-only `HOME` (`/home/ocr`) to the writable cache volume | `/app/cache` | — | — | +| `TORCH_HOME` | PyTorch model cache — redirects `~/.cache/torch` to the writable models volume | `/app/models/torch` | — | — | ### Observability stack (`docker-compose.observability.yml`) @@ -554,3 +557,21 @@ bash scripts/download-kraken-models.sh | **No multi-region** | Single PostgreSQL + MinIO instance; no replication or failover | Deliberate scope decision | | **Max upload size** | 50 MB per file (500 MB per request for multi-file) | Configurable in `application.yaml` (`spring.servlet.multipart`) | | **No automated backup** | Phase 5 of Production v1 milestone is not yet implemented | See §5 above | + +--- + +## 8. Upgrade notes + +Version-specific one-time steps that must be run before or after upgrading to a given release. Each subsection is safe to skip on a fresh install. + +### Upgrading to PR #611 — non-root OCR container + +The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). 
The existing `ocr_cache` volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup. + +**Before starting the updated container stack**, drop the old root-owned volume: + +```bash +docker volume rm familienarchiv_ocr_cache +``` + +The volume is recreated automatically on `docker compose up`. The OCR service will re-download its model cache on first startup (approximately 1–2 GB, one-time cost). -- 2.49.1 From 74ca5ee35ff1a037076d2e41e33c6013849c8f52 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 17:33:06 +0200 Subject: [PATCH 09/12] =?UTF-8?q?docs(adr):=20ADR-019=20=E2=80=94=20contai?= =?UTF-8?q?ner=20hardening=20baseline=20(non-root=20+=20read-only)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- docs/adr/019-container-hardening-baseline.md | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 docs/adr/019-container-hardening-baseline.md diff --git a/docs/adr/019-container-hardening-baseline.md b/docs/adr/019-container-hardening-baseline.md new file mode 100644 index 00000000..6bd0c53b --- /dev/null +++ b/docs/adr/019-container-hardening-baseline.md @@ -0,0 +1,94 @@ +# ADR-019 — Container hardening baseline: non-root user + read-only filesystem + +**Status:** Accepted +**Date:** 2026-05-17 +**PR:** #611 + +--- + +## Context + +The OCR service ran as `root` inside its container by default. This violated CIS Docker Benchmark §4.1 and CIS §4.6, and meant that any exploit in the OCR pipeline (untrusted PDF content, model deserialization, ZIP handling) could write to or execute anything inside the container without restriction. + +The following risks were present before this baseline: + +- A path-traversal in the ZIP-based training endpoint could overwrite arbitrary paths on the container filesystem (including Python source files and model files). 
+- A compromised dependency running at startup could persist itself to the image layers or model volumes. +- Misconfigured model downloads could overwrite `/etc/passwd` or similar via path-traversal — possible because root can write everywhere. + +--- + +## Decision + +All containers in this project that have no operational need for elevated privileges **must** apply the following hardening baseline: + +### 1. Non-root user + +Create a dedicated user with a fixed UID and no login shell: + +```dockerfile +RUN useradd --no-create-home --shell /usr/sbin/nologin --uid 1000 ocr +``` + +Set `HOME` explicitly to a path owned by this user. Do not rely on `~` expansion for any path resolution in application code. + +### 2. Read-only container filesystem + +```yaml +read_only: true +``` + +All paths the application writes to at runtime must be explicitly declared as either a named volume or a `tmpfs` mount. This turns any unexpected write attempt into an immediate, visible `PermissionError` rather than a silent success. + +### 3. Per-path write carve-outs + +Declare only the paths that are actually written at runtime: + +```yaml +volumes: + - ocr_models:/app/models # persistent model storage + - ocr_cache:/app/cache # HuggingFace / ketos download cache +tmpfs: + - /tmp:size=512m # transient scratch space (ZIP extraction etc.) +``` + +Do not mount the home directory as a volume unless necessary — use `XDG_CACHE_HOME` and `TORCH_HOME` env vars to redirect library cache writes to the declared writable paths instead. + +### 4. Dropped capabilities and privilege escalation prevention + +```yaml +cap_drop: [ALL] +security_opt: + - no-new-privileges:true +``` + +A Python/FastAPI service on port 8000+ requires no Linux capabilities. Dropping all and blocking privilege escalation via setuid prevents any capability regain even if a dependency contains a SUID binary. + +### 5. Startup root canary + +Log a warning during startup if the process is running as root. 
This catches misconfiguration (e.g., `USER` directive accidentally removed in a future Dockerfile edit) before it becomes a silent vulnerability: + +```python +if os.getuid() == 0: + logger.warning("Running as root — CIS Docker §4.1 violation") +``` + +--- + +## Consequences + +**Positive:** +- Any exploit that achieves code execution inside the container is confined: it cannot write outside the declared volumes, cannot acquire new capabilities, and cannot persist to the image filesystem. +- `PermissionError` on startup is an explicit, diagnosable failure rather than a silent privilege misuse. +- The startup canary catches accidental regressions in the non-root setup. + +**Negative / operational cost:** +- Every new feature that writes to a new path (e.g., a new model cache directory, a new scratch path) must add a volume or tmpfs mount. The `read_only: true` flag makes this a hard constraint, not a suggestion. +- Library dependencies that write to `HOME` without respecting `XDG_CACHE_HOME` must be identified and redirected explicitly (see `TORCH_HOME`, `XDG_CACHE_HOME`, `HF_HOME` in `docker-compose.yml`). +- Existing named volumes written by root (pre-baseline) must be dropped and recreated before upgrading. See [DEPLOYMENT.md §8](../DEPLOYMENT.md#8-upgrade-notes). + +--- + +## Applicability + +This baseline applies to the OCR service (PR #611). It should be applied to any new container added to the project unless there is a documented, specific operational reason a capability or writable filesystem is required. 
-- 2.49.1 From 7769dbc9f46a574e073ebe00add1bf263419d679 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 17:43:18 +0200 Subject: [PATCH 10/12] security(ocr): apply container hardening baseline to docker-compose.prod.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the CIS Docker §4.1/§4.6 hardening from docker-compose.yml to the production/staging compose file, which is standalone (not an overlay). - Fix cache volume mount path: ocr-cache:/root/.cache → /app/cache (matches the non-root user's HF_HOME/XDG_CACHE_HOME, avoids PermissionError) - Add HF_HOME, XDG_CACHE_HOME, TORCH_HOME env vars so HuggingFace, ketos, and PyTorch all write to the declared writable volumes, not HOME - Add read_only: true, tmpfs (/tmp:512m), cap_drop: [ALL], no-new-privileges:true — matching the dev baseline Also extend DEPLOYMENT.md §8 upgrade notes to cover all three environments (dev/production/staging), each with its correct project-namespaced volume name. 
Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.prod.yml | 12 +++++++++++- docs/DEPLOYMENT.md | 11 +++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 9fcb453f..dbae6e9a 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -142,8 +142,11 @@ services: memswap_limit: ${OCR_MEM_LIMIT:-12g} volumes: - ocr-models:/app/models - - ocr-cache:/root/.cache + - ocr-cache:/app/cache # HuggingFace / ketos cache — prevents re-downloads on recreate (HF_HOME) environment: + HF_HOME: /app/cache + XDG_CACHE_HOME: /app/cache + TORCH_HOME: /app/models/torch KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel TRAINING_TOKEN: ${OCR_TRAINING_TOKEN} OCR_CONFIDENCE_THRESHOLD: "0.3" @@ -161,6 +164,13 @@ services: timeout: 5s retries: 12 start_period: 120s + read_only: true + tmpfs: + - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) + cap_drop: + - ALL + security_opt: + - no-new-privileges:true backend: image: familienarchiv/backend:${TAG:-nightly} diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 9da72276..aaba04e2 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -566,12 +566,19 @@ Version-specific one-time steps that must be run before or after upgrading to a ### Upgrading to PR #611 — non-root OCR container -The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing `ocr_cache` volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup. +The OCR cache volume path changed from `/root/.cache` to `/app/cache` (PR #611 — CIS Docker §4.1 hardening). The existing volume was written as root and is inaccessible to the new non-root `ocr` user, causing a `PermissionError` on startup. 
-**Before starting the updated container stack**, drop the old root-owned volume: +**Before starting the updated container stack**, drop the old root-owned volume. The volume name depends on the compose project name: ```bash +# Dev (docker-compose.yml — project name: familienarchiv) docker volume rm familienarchiv_ocr_cache + +# Production (docker-compose.prod.yml -p archiv-production) +docker volume rm archiv-production_ocr-cache + +# Staging (docker-compose.prod.yml -p archiv-staging) +docker volume rm archiv-staging_ocr-cache ``` The volume is recreated automatically on `docker compose up`. The OCR service will re-download its model cache on first startup (approximately 1–2 GB, one-time cost). -- 2.49.1 From bead6f1811de9f9d4b1327a1d4b084d23c51a2f5 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 18:53:26 +0200 Subject: [PATCH 11/12] fix(ocr): handle empty-string HTRMOPO_DIR env var with or-fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit os.environ.get(key, default) returns "" when the key exists but is blank — the default is only used when the key is absent. The or-fallback treats both absence and blank values as "use the default". Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/ensure_blla_model.py | 2 +- ocr-service/test_ensure_blla_model.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ocr-service/ensure_blla_model.py b/ocr-service/ensure_blla_model.py index 0328c0cb..399e8a80 100644 --- a/ocr-service/ensure_blla_model.py +++ b/ocr-service/ensure_blla_model.py @@ -24,7 +24,7 @@ log = logging.getLogger(__name__) BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel") # DOI for "General segmentation model for print and handwriting" — ketos 7 compatible. 
BLLA_MODEL_DOI = "10.5281/zenodo.14602569" -HTRMOPO_DIR = os.environ.get("HTRMOPO_DIR", "/app/models/.htrmopo") +HTRMOPO_DIR = os.environ.get("HTRMOPO_DIR") or "/app/models/.htrmopo" def _model_is_loadable(path: str) -> bool: diff --git a/ocr-service/test_ensure_blla_model.py b/ocr-service/test_ensure_blla_model.py index cb5c6f7d..c3c65bd0 100644 --- a/ocr-service/test_ensure_blla_model.py +++ b/ocr-service/test_ensure_blla_model.py @@ -29,6 +29,15 @@ def test_htrmopo_dir_default_is_fixed_path(): assert result == "/app/models/.htrmopo" +def test_htrmopo_dir_falls_back_to_default_when_set_to_empty_string(): + """HTRMOPO_DIR='' must not produce an empty path — get() returns '' for blank env vars.""" + with patch.dict(os.environ, {"HTRMOPO_DIR": ""}): + importlib.reload(ensure_blla_model) + result = ensure_blla_model.HTRMOPO_DIR + importlib.reload(ensure_blla_model) + assert result != "" + + # ─── Model already loadable ─────────────────────────────────────────────────── -- 2.49.1 From f1e0b92f4793f955e37ead79abd2c89e15decb0a Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 17 May 2026 18:54:24 +0200 Subject: [PATCH 12/12] style(ocr): normalize cap_drop to block notation in docker-compose.yml Aligns with the block sequence style used in docker-compose.prod.yml and the rest of the compose file, removing the inline [ALL] inconsistency. Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 53a1cf97..91f8bbda 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -112,7 +112,8 @@ services: read_only: true tmpfs: - /tmp:size=512m # training endpoints write ZIPs to /tmp; 512 MB covers typical batches (20–50 images) - cap_drop: [ALL] + cap_drop: + - ALL security_opt: - no-new-privileges:true -- 2.49.1