fix(infra): deploy Ollama to prod/staging compose + fix broken model-init recipe #759

Merged
marcel merged 8 commits from fix/issue-758-ollama-prod-compose into main 2026-06-06 20:30:35 +02:00
6 changed files with 211 additions and 8 deletions

View File

@@ -133,7 +133,9 @@ app:
ollama: ollama:
base-url: http://ollama:11434 base-url: http://ollama:11434
model: qwen2.5:7b-instruct-q4_K_M model: qwen2.5:7b-instruct-q4_K_M
timeout-seconds: 30 # CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the
# first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it.
timeout-seconds: 60
health-check-timeout-seconds: 2 health-check-timeout-seconds: 2
nl-search: nl-search:

View File

@@ -50,6 +50,7 @@ volumes:
minio-data: minio-data:
ocr-models: ocr-models:
ocr-cache: ocr-cache:
ollama-models:
services: services:
db: db:
@@ -200,6 +201,73 @@ services:
security_opt: security_opt:
- no-new-privileges:true - no-new-privileges:true
# --- Ollama: Model init (one-shot pull) ---
# Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on
# first start; exits quickly on subsequent starts (model already cached).
# The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT
# curl, so the entrypoint is overridden to a shell and readiness is probed with
# `ollama list` (not curl). The pull is guarded by a `grep` on the cached model
# list so a model already on the volume exits clean WITHOUT a registry round-trip
# — a host reboot during a registry/network blip can no longer fail init (which
# would block the ollama service via service_completed_successfully).
# Backend degrades gracefully (503) if Ollama is absent.
ollama-model-init:
image: ollama/ollama:0.30.6
restart: "no"
entrypoint: ["/bin/sh", "-c"]
command:
- "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
networks:
- archiv-net
volumes:
- ollama-models:/root/.ollama
mem_limit: 2g
read_only: true
tmpfs:
- /tmp:size=512m
cap_drop:
- ALL
security_opt:
- no-new-privileges:true
# --- Ollama: LLM inference server ---
# Serves the pre-pulled model for NL search inference. Backend reaches it at
# http://ollama:11434 (application.yaml default; no env override required).
# Healthcheck uses `ollama list` because the image has no curl.
ollama:
image: ollama/ollama:0.30.6
restart: unless-stopped
expose:
- "11434"
networks:
- archiv-net
volumes:
- ollama-models:/root/.ollama
environment:
# Pin the model in memory (no idle unload). Without this, Ollama evicts
# the model after ~5 min idle and the next query pays a cold-load penalty
# that exceeds the backend read timeout → NL search 503 after idle.
OLLAMA_KEEP_ALIVE: "-1"
cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
read_only: true
tmpfs:
- /tmp:size=512m
cap_drop:
- ALL
security_opt:
- no-new-privileges:true
healthcheck:
test: ["CMD", "ollama", "list"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
depends_on:
ollama-model-init:
condition: service_completed_successfully
backend: backend:
image: familienarchiv/backend:${TAG:-nightly} image: familienarchiv/backend:${TAG:-nightly}
build: build:

View File

@@ -161,8 +161,13 @@ services:
- ALL - ALL
security_opt: security_opt:
- no-new-privileges:true - no-new-privileges:true
command: > # The image ENTRYPOINT is `ollama`, so override it to a shell; the image has
sh -c "ollama serve & SERVE_PID=$$! && until curl -sf http://localhost:11434/api/tags; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M && kill $$SERVE_PID" # no curl, so readiness is probed with `ollama list` instead of a curl loop.
# The pull is guarded by a `grep` on the cached model list so an already-cached
# model exits clean without a registry round-trip (offline-safe re-up).
entrypoint: ["/bin/sh", "-c"]
command:
- "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
# --- Ollama: LLM inference server --- # --- Ollama: LLM inference server ---
# Serves the pre-pulled model for NL search inference. # Serves the pre-pulled model for NL search inference.
@@ -180,6 +185,9 @@ services:
- ollama_models:/root/.ollama - ollama_models:/root/.ollama
environment: environment:
OLLAMA_API_KEY: "${OLLAMA_API_KEY}" OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
# Pin the model in memory (no idle unload) so a query after an idle period does
# not pay a cold-load penalty that exceeds the backend read timeout (→ NL search 503).
# The first query after an Ollama (re)start still pays one cold load.
OLLAMA_KEEP_ALIVE: "-1"
cpus: "${OLLAMA_CPU_LIMIT:-4.0}" cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}" mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}" memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
@@ -191,7 +199,9 @@ services:
security_opt: security_opt:
- no-new-privileges:true - no-new-privileges:true
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"] # `ollama list` hits the local API and exits non-zero if the server is
# down — used instead of curl, which the image does not ship.
test: ["CMD", "ollama", "list"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 5 retries: 5

View File

@@ -613,7 +613,7 @@ Expected output includes `qwen2.5:7b-instruct-q4_K_M`.
|---|---|---| |---|---|---|
| `app.ollama.base-url` | `http://ollama:11434` | Ollama service URL (dev: `http://localhost:11434`) | | `app.ollama.base-url` | `http://ollama:11434` | Ollama service URL (dev: `http://localhost:11434`) |
| `app.ollama.model` | `qwen2.5:7b-instruct-q4_K_M` | Model to use for inference | | `app.ollama.model` | `qwen2.5:7b-instruct-q4_K_M` | Model to use for inference |
| `app.ollama.timeout-seconds` | `30` | Read timeout for inference calls | | `app.ollama.timeout-seconds` | `60` | Read timeout for inference calls (absorbs cold model load on the first query after an Ollama restart) |
| `app.nl-search.rate-limit.max-requests-per-minute` | `5` | Per-user rate limit | | `app.nl-search.rate-limit.max-requests-per-minute` | `5` | Per-user rate limit |
### Upgrade the Ollama model ### Upgrade the Ollama model
@@ -625,7 +625,7 @@ To switch to a newer model version (e.g. a future release of `qwen2.5`):
```bash ```bash
docker volume rm familienarchiv_ollama_models docker volume rm familienarchiv_ollama_models
``` ```
(In production the volume name is prefixed with the compose project: `archiv-production_ollama_models`.) (In production the volume name is prefixed with the compose project: `archiv-production_ollama-models`.)
3. Restart the stack: 3. Restart the stack:
```bash ```bash
docker compose up -d docker compose up -d

View File

@@ -0,0 +1,125 @@
# ADR-034: Ollama in production — deployment, keep-alive pinning, and corrected init recipe
**Date:** 2026-06-06
**Status:** Accepted
**Deciders:** Marcel Raddatz
**Relates to:** #758 (bug), #759 (fix), #737 (NL search infrastructure)
**Corrects:** ADR-028 §10 and §11 (init recipe and readiness probe)
---
## Context
ADR-028 introduced Ollama as a Docker Compose service for NL search and documented
its topology, graceful-degradation contract, and memory budget. Two defects survived
that work and only surfaced once NL search reached staging (#758):
1. **Ollama was added only to the dev `docker-compose.yml`.** Staging and production
deploy from the self-contained `docker-compose.prod.yml`, which had no `ollama`
service. The backend defaults to `app.ollama.base-url: http://ollama:11434`, so its
client bean was active and resolved to a non-existent host → `ResourceAccessException`
→ HTTP 503 on every NL search.
2. **The init recipe documented in ADR-028 §10 never worked.** The `ollama/ollama` image
`ENTRYPOINT` is `ollama`, so a bare `command: sh -c "…"` ran as `ollama sh -c "…"`
(`unknown command "sh"`), and the image ships **no curl**, so the curl-based readiness
loop and the curl healthcheck could never pass.
This ADR records the production deployment decision and the corrected operational
contract. It is also the durable record of *why* `OLLAMA_KEEP_ALIVE=-1` is set, so a
future maintainer does not "optimize" it away and reintroduce the cold-load 503.
---
## Decisions
### 1. Ollama is a first-class production service
`docker-compose.prod.yml` now defines `ollama` + `ollama-model-init` + the
`ollama-models` volume, mirroring the dev stack. The graceful-degradation contract from
ADR-028 §3 is preserved: `backend` has **no** hard `depends_on` on `ollama`, so an absent
or unhealthy Ollama still yields a clean 503 rather than blocking backend startup.
### 2. Corrected init recipe (supersedes ADR-028 §10)
The init container overrides the image entrypoint to a shell and probes readiness with
`ollama list` (not curl, which the image lacks):
```sh
ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && \
(ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)
```
```yaml
entrypoint: ["/bin/sh", "-c"]
```
The pull is **guarded by a grep on the cached model list**. A model already on the volume
exits clean without any registry round-trip. This makes re-up offline-safe: a host reboot
during a registry/network blip can no longer fail init (which, via
`condition: service_completed_successfully`, would otherwise block the `ollama` service
and take NL search down until the registry was reachable again). The same recipe is used
in dev and prod — one mental model.
### 3. Healthcheck uses `ollama list` (supersedes ADR-028 §11 probe)
```yaml
healthcheck:
test: ["CMD", "ollama", "list"]
```
`ollama list` hits the local API and exits non-zero when the server is down — the correct
probe for a curl-less image. The `start_period: 60s` rationale from ADR-028 §11 still holds.
### 4. `OLLAMA_KEEP_ALIVE=-1` — pin the model in memory
```yaml
environment:
OLLAMA_KEEP_ALIVE: "-1"
```
By default Ollama evicts an idle model after ~5 minutes. The next query then pays a
cold-load penalty that exceeds the backend read timeout, producing an NL search 503 after
any idle period. Pinning the model (`-1` = never unload) keeps warm-path latency
predictable (~18 s on CPU). **Do not remove this** — doing so re-introduces the post-idle
cold-load 503.
### 5. Read timeout raised 30 → 60 s
`app.ollama.timeout-seconds` is raised from 30 to 60 (`application.yaml`, mirrored in
`DEPLOYMENT.md`). Warm CPU inference is ~18 s; the higher ceiling absorbs the one cold
model load on the first query after an Ollama (re)start, before §4's pin takes hold.
**Implicit NFR made explicit:** NL search shall return a result or a 503 within 60 s; the
cold-start path immediately after an Ollama restart is the only path that approaches this
ceiling.
### 6. Hard-OOM trade-off (refines ADR-028 §2)
`memswap_limit == mem_limit` (both `${OLLAMA_MEM_LIMIT:-8g}`) disables swap for the
container. Combined with §4's pinned model, a memory-pressure event is a **hard OOM-kill,
not graceful latency degradation**. This is deliberate — swap-thrashing an LLM is worse
than a clean restart — but it means the 8 GB envelope is a real ceiling. The
`qwen2.5:7b-instruct-q4_K_M` model
plus its KV cache under load sits close enough to 8 GB that this needs a Prometheus
memory alert on the `ollama` container before it bites in production (tracked as
observability follow-up, not in this PR).
---
## Consequences
### Positive
- NL search works on staging/production, not just dev — the actual deploy artifact now
matches the documented architecture.
- Re-up is offline-safe: a cached model never depends on registry reachability.
- The keep-alive pin and timeout ceiling make NL search latency predictable on CPU.
### Risks and operational implications
- **Hard OOM under memory pressure** (§6): a Prometheus alert on `ollama` container memory
is required before this is load-bearing in prod. Tracked as an observability follow-up.
- **Unauthenticated inference** relies entirely on `archiv-net` isolation (ADR-028 §7/§12,
unchanged). Sending an `Authorization` header from `RestClientOllamaClient` is a separate
durable hardening item, tracked outside this PR.
- ADR-028 §10 and §11 describe a recipe that never functioned; this ADR is the authoritative
init/healthcheck contract going forward.

View File

@@ -17,7 +17,6 @@ System_Boundary(archiv, "Familienarchiv (Docker Compose)") {
ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.") ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.")
ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.") ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.")
Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.") Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.")
Container(ollama, "Ollama", "Ollama / port 11434", "Local LLM inference server. Hosts qwen2.5:7b-instruct-q4_K_M for natural-language query parsing (NL Search). CPU-only; GPU not required.")
} }
System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") { System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") {
@@ -49,7 +48,6 @@ Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)") Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus") Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus")
Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics") Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics")
Rel(backend, ollama, "NL search inference requests", "HTTP / REST / JSON")
Rel(prometheus, ollama, "Scrapes LLM request metrics", "HTTP 11434 /metrics") Rel(prometheus, ollama, "Scrapes LLM request metrics", "HTTP 11434 /metrics")
Rel(grafana, prometheus, "Queries metrics", "HTTP 9090") Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
Rel(grafana, loki, "Queries logs", "HTTP 3100") Rel(grafana, loki, "Queries logs", "HTTP 3100")