diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index 36d5298a..ce517f25 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -133,7 +133,9 @@ app: ollama: base-url: http://ollama:11434 model: qwen2.5:7b-instruct-q4_K_M - timeout-seconds: 30 + # CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the + # first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it. + timeout-seconds: 60 health-check-timeout-seconds: 2 nl-search: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 26e07442..9c60b3bf 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -50,6 +50,7 @@ volumes: minio-data: ocr-models: ocr-cache: + ollama-models: services: db: @@ -200,6 +201,73 @@ services: security_opt: - no-new-privileges:true + # --- Ollama: Model init (one-shot pull) --- + # Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on + # first start; exits quickly on subsequent starts (model already cached). + # The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT + # curl, so the entrypoint is overridden to a shell and readiness is probed with + # `ollama list` (not curl). The pull is guarded by a `grep` on the cached model + # list so a model already on the volume exits clean WITHOUT a registry round-trip + # — a host reboot during a registry/network blip can no longer fail init (which + # would block the ollama service via service_completed_successfully). + # Backend degrades gracefully (503) if Ollama is absent. 
+ ollama-model-init: + image: ollama/ollama:0.30.6 + restart: "no" + entrypoint: ["/bin/sh", "-c"] + command: + - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)" + networks: + - archiv-net + volumes: + - ollama-models:/root/.ollama + mem_limit: 2g + read_only: true + tmpfs: + - /tmp:size=512m + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + + # --- Ollama: LLM inference server --- + # Serves the pre-pulled model for NL search inference. Backend reaches it at + # http://ollama:11434 (application.yaml default; no env override required). + # Healthcheck uses `ollama list` because the image has no curl. + ollama: + image: ollama/ollama:0.30.6 + restart: unless-stopped + expose: + - "11434" + networks: + - archiv-net + volumes: + - ollama-models:/root/.ollama + environment: + # Pin the model in memory (no idle unload). Without this, Ollama evicts + # the model after ~5 min idle and the next query pays a cold-load penalty + # that exceeds the backend read timeout → NL search 503 after idle. + OLLAMA_KEEP_ALIVE: "-1" + cpus: "${OLLAMA_CPU_LIMIT:-4.0}" + mem_limit: "${OLLAMA_MEM_LIMIT:-8g}" + memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}" + read_only: true + tmpfs: + - /tmp:size=512m + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + healthcheck: + test: ["CMD", "ollama", "list"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s + depends_on: + ollama-model-init: + condition: service_completed_successfully + backend: image: familienarchiv/backend:${TAG:-nightly} build: diff --git a/docker-compose.yml b/docker-compose.yml index 78ac969a..f9e618ea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -161,8 +161,13 @@ services: - ALL security_opt: - no-new-privileges:true - command: > - sh -c "ollama serve & SERVE_PID=$$! 
&& until curl -sf http://localhost:11434/api/tags; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M && kill $$SERVE_PID" + # The image ENTRYPOINT is `ollama`, so override it to a shell; the image has + # no curl, so readiness is probed with `ollama list` instead of a curl loop. + # The pull is guarded by a `grep` on the cached model list so an already-cached + # model exits clean without a registry round-trip (offline-safe re-up). + entrypoint: ["/bin/sh", "-c"] + command: + - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)" # --- Ollama: LLM inference server --- # Serves the pre-pulled model for NL search inference. @@ -180,6 +185,9 @@ services: - ollama_models:/root/.ollama environment: OLLAMA_API_KEY: "${OLLAMA_API_KEY}" + # Pin the model in memory (no idle unload): otherwise the first query after an + # idle period pays a cold-load penalty that exceeds the backend read timeout → NL search 503. + OLLAMA_KEEP_ALIVE: "-1" cpus: "${OLLAMA_CPU_LIMIT:-4.0}" mem_limit: "${OLLAMA_MEM_LIMIT:-8g}" memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}" @@ -191,7 +199,9 @@ services: security_opt: - no-new-privileges:true healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"] + # `ollama list` hits the local API and exits non-zero if the server is + # down — used instead of curl, which the image does not ship. + test: ["CMD", "ollama", "list"] interval: 30s timeout: 10s retries: 5 diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 5c2580de..f8523515 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -613,7 +613,7 @@ Expected output includes `qwen2.5:7b-instruct-q4_K_M`.
|---|---|---| | `app.ollama.base-url` | `http://ollama:11434` | Ollama service URL (dev: `http://localhost:11434`) | | `app.ollama.model` | `qwen2.5:7b-instruct-q4_K_M` | Model to use for inference | -| `app.ollama.timeout-seconds` | `30` | Read timeout for inference calls | +| `app.ollama.timeout-seconds` | `60` | Read timeout for inference calls (absorbs cold model load on the first query after an Ollama restart) | | `app.nl-search.rate-limit.max-requests-per-minute` | `5` | Per-user rate limit | ### Upgrade the Ollama model @@ -625,7 +625,7 @@ To switch to a newer model version (e.g. a future release of `qwen2.5`): ```bash docker volume rm familienarchiv_ollama_models ``` - (In production the volume name is prefixed with the compose project: `archiv-production_ollama_models`.) + (In production the volume is `archiv-production_ollama-models`: the compose project prefix plus the prod volume key `ollama-models`, which is hyphenated, unlike dev's `ollama_models`.) 3. Restart the stack: ```bash docker compose up -d diff --git a/docs/adr/034-ollama-production-deployment-and-keep-alive.md b/docs/adr/034-ollama-production-deployment-and-keep-alive.md new file mode 100644 index 00000000..0ff4a790 --- /dev/null +++ b/docs/adr/034-ollama-production-deployment-and-keep-alive.md @@ -0,0 +1,125 @@ +# ADR-034: Ollama in production — deployment, keep-alive pinning, and corrected init recipe + +**Date:** 2026-06-06 +**Status:** Accepted +**Deciders:** Marcel Raddatz +**Relates to:** #758 (bug), #759 (fix), #737 (NL search infrastructure) +**Corrects:** ADR-028 §10–§11 (init recipe and readiness probe) + +--- + +## Context + +ADR-028 introduced Ollama as a Docker Compose service for NL search and documented +its topology, graceful-degradation contract, and memory budget. Two defects survived +that work and only surfaced once NL search reached staging (#758): + +1. **Ollama was added only to the dev `docker-compose.yml`.** Staging and production + deploy from the self-contained `docker-compose.prod.yml`, which had no `ollama` + service.
The backend defaults to `app.ollama.base-url: http://ollama:11434`, so its + client bean was active and resolved to a non-existent host → `ResourceAccessException` + → HTTP 503 on every NL search. +2. **The init recipe documented in ADR-028 §10 never worked.** The `ollama/ollama` image + `ENTRYPOINT` is `ollama`, so a bare `command: sh -c "…"` ran as `ollama sh -c "…"` + (`unknown command "sh"`), and the image ships **no curl**, so the curl-based readiness + loop and the curl healthcheck could never pass. + +This ADR records the production deployment decision and the corrected operational +contract. It is also the durable record of *why* `OLLAMA_KEEP_ALIVE=-1` is set, so a +future maintainer does not "optimize" it away and reintroduce the cold-load 503. + +--- + +## Decisions + +### 1. Ollama is a first-class production service + +`docker-compose.prod.yml` now defines `ollama` + `ollama-model-init` + the +`ollama-models` volume, mirroring the dev stack. The graceful-degradation contract from +ADR-028 §3 is preserved: `backend` has **no** hard `depends_on` on `ollama`, so an absent +or unhealthy Ollama still yields a clean 503 rather than blocking backend startup. + +### 2. Corrected init recipe (supersedes ADR-028 §10) + +The init container overrides the image entrypoint to a shell and probes readiness with +`ollama list` (not curl, which the image lacks): + +```sh +ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && \ + (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M) +``` + +```yaml +entrypoint: ["/bin/sh", "-c"] +``` + +The pull is **guarded by a grep on the cached model list**. A model already on the volume +exits clean without any registry round-trip. 
This makes re-up offline-safe: a host reboot +during a registry/network blip can no longer fail init (which, via +`condition: service_completed_successfully`, would otherwise block the `ollama` service +and take NL search down until the registry was reachable again). The same recipe is used +in dev and prod — one mental model. + +### 3. Healthcheck uses `ollama list` (supersedes ADR-028 §11 probe) + +```yaml +healthcheck: + test: ["CMD", "ollama", "list"] +``` + +`ollama list` hits the local API and exits non-zero when the server is down — the correct +probe for a curl-less image. The `start_period: 60s` rationale from ADR-028 §11 still holds. + +### 4. `OLLAMA_KEEP_ALIVE=-1` — pin the model in memory + +```yaml +environment: + OLLAMA_KEEP_ALIVE: "-1" +``` + +By default Ollama evicts an idle model after ~5 minutes. The next query then pays a +cold-load penalty that exceeds the backend read timeout, producing an NL search 503 after +any idle period. Pinning the model (`-1` = never unload) keeps warm-path latency +predictable (~18 s on CPU). **Do not remove this:** doing so re-introduces the post-idle +cold-load 503. + +### 5. Read timeout raised 30 → 60 s + +`app.ollama.timeout-seconds` is raised from 30 to 60 (`application.yaml`, mirrored in +`DEPLOYMENT.md`). Warm CPU inference is ~18 s; the higher ceiling absorbs the one cold +model load on the first query after an Ollama (re)start, before §4's pin takes hold. + +**Implicit NFR made explicit:** NL search shall return a result or a 503 within 60 s; the +cold-start path immediately after an Ollama restart is the only path that approaches this +ceiling. + +### 6. Hard-OOM trade-off (refines ADR-028 §2) + +`memswap_limit == mem_limit` (both `${OLLAMA_MEM_LIMIT:-8g}`) disables swap for the +container. Combined with §4's pinned model, a memory-pressure event is a **hard OOM-kill, +not graceful latency degradation**.
This is deliberate — swap-thrashing an LLM is worse +than a clean restart — but it means the 8 GB envelope is a real ceiling. `qwen2.5-7B-q4` +plus its KV cache under load sits close enough to 8 GB that this needs a Prometheus +memory alert on the `ollama` container before it bites in production (tracked as +observability follow-up, not in this PR). + +--- + +## Consequences + +### Positive + +- NL search works on staging/production, not just dev — the actual deploy artifact now + matches the documented architecture. +- Re-up is offline-safe: a cached model never depends on registry reachability. +- The keep-alive pin and timeout ceiling make NL search latency predictable on CPU. + +### Risks and operational implications + +- **Hard OOM under memory pressure** (§6): a Prometheus alert on `ollama` container memory + is required before this is load-bearing in prod. Tracked as an observability follow-up. +- **Unauthenticated inference** relies entirely on `archiv-net` isolation (ADR-028 §7/§12, + unchanged). Sending an `Authorization` header from `RestClientOllamaClient` is a separate + durable hardening item, tracked outside this PR. +- ADR-028 §10–§11 describe a recipe that never functioned; this ADR is the authoritative + init/healthcheck contract going forward. diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index 2d471dd9..b8630001 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -17,7 +17,6 @@ System_Boundary(archiv, "Familienarchiv (Docker Compose)") { ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.") ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). 
Backend uses a bucket-scoped service account (archiv-app), not MinIO root.") Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.") - Container(ollama, "Ollama", "Ollama / port 11434", "Local LLM inference server. Hosts qwen2.5:7b-instruct-q4_K_M for natural-language query parsing (NL Search). CPU-only; GPU not required.") } System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") { @@ -49,7 +48,6 @@ Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API") Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)") Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus") Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics") -Rel(backend, ollama, "NL search inference requests", "HTTP / REST / JSON") Rel(prometheus, ollama, "Scrapes LLM request metrics", "HTTP 11434 /metrics") Rel(grafana, prometheus, "Queries metrics", "HTTP 9090") Rel(grafana, loki, "Queries logs", "HTTP 3100")