fix(infra): deploy Ollama to prod/staging compose + fix broken model-init recipe #759
@@ -133,7 +133,9 @@ app:
|
|||||||
ollama:
|
ollama:
|
||||||
base-url: http://ollama:11434
|
base-url: http://ollama:11434
|
||||||
model: qwen2.5:7b-instruct-q4_K_M
|
model: qwen2.5:7b-instruct-q4_K_M
|
||||||
timeout-seconds: 30
|
# CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the
|
||||||
|
# first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it.
|
||||||
|
timeout-seconds: 60
|
||||||
health-check-timeout-seconds: 2
|
health-check-timeout-seconds: 2
|
||||||
|
|
||||||
nl-search:
|
nl-search:
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ volumes:
|
|||||||
minio-data:
|
minio-data:
|
||||||
ocr-models:
|
ocr-models:
|
||||||
ocr-cache:
|
ocr-cache:
|
||||||
|
ollama-models:
|
||||||
|
|
||||||
services:
|
services:
|
||||||
db:
|
db:
|
||||||
@@ -200,6 +201,73 @@ services:
|
|||||||
security_opt:
|
security_opt:
|
||||||
- no-new-privileges:true
|
- no-new-privileges:true
|
||||||
|
|
||||||
|
# --- Ollama: Model init (one-shot pull) ---
|
||||||
|
# Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on
|
||||||
|
# first start; exits quickly on subsequent starts (model already cached).
|
||||||
|
# The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT
|
||||||
|
# curl, so the entrypoint is overridden to a shell and readiness is probed with
|
||||||
|
# `ollama list` (not curl). The pull is guarded by a `grep` on the cached model
|
||||||
|
# list so a model already on the volume exits clean WITHOUT a registry round-trip
|
||||||
|
# — a host reboot during a registry/network blip can no longer fail init (which
|
||||||
|
# would block the ollama service via service_completed_successfully).
|
||||||
|
# Backend degrades gracefully (503) if Ollama is absent.
|
||||||
|
ollama-model-init:
|
||||||
|
image: ollama/ollama:0.30.6
|
||||||
|
restart: "no"
|
||||||
|
entrypoint: ["/bin/sh", "-c"]
|
||||||
|
command:
|
||||||
|
- "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
|
||||||
|
networks:
|
||||||
|
- archiv-net
|
||||||
|
volumes:
|
||||||
|
- ollama-models:/root/.ollama
|
||||||
|
mem_limit: 2g
|
||||||
|
read_only: true
|
||||||
|
tmpfs:
|
||||||
|
- /tmp:size=512m
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
security_opt:
|
||||||
|
- no-new-privileges:true
|
||||||
|
|
||||||
|
# --- Ollama: LLM inference server ---
|
||||||
|
# Serves the pre-pulled model for NL search inference. Backend reaches it at
|
||||||
|
# http://ollama:11434 (application.yaml default; no env override required).
|
||||||
|
# Healthcheck uses `ollama list` because the image has no curl.
|
||||||
|
ollama:
|
||||||
|
image: ollama/ollama:0.30.6
|
||||||
|
restart: unless-stopped
|
||||||
|
expose:
|
||||||
|
- "11434"
|
||||||
|
networks:
|
||||||
|
- archiv-net
|
||||||
|
volumes:
|
||||||
|
- ollama-models:/root/.ollama
|
||||||
|
environment:
|
||||||
|
# Pin the model in memory (no idle unload). Without this, Ollama evicts
|
||||||
|
# the model after ~5 min idle and the next query pays a cold-load penalty
|
||||||
|
# that exceeds the backend read timeout → NL search 503 after idle.
|
||||||
|
OLLAMA_KEEP_ALIVE: "-1"
|
||||||
|
cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
|
||||||
|
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
|
||||||
|
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
|
||||||
|
read_only: true
|
||||||
|
tmpfs:
|
||||||
|
- /tmp:size=512m
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
security_opt:
|
||||||
|
- no-new-privileges:true
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "ollama", "list"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 5
|
||||||
|
start_period: 60s
|
||||||
|
depends_on:
|
||||||
|
ollama-model-init:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
|
||||||
backend:
|
backend:
|
||||||
image: familienarchiv/backend:${TAG:-nightly}
|
image: familienarchiv/backend:${TAG:-nightly}
|
||||||
build:
|
build:
|
||||||
|
|||||||
@@ -161,8 +161,13 @@ services:
|
|||||||
- ALL
|
- ALL
|
||||||
security_opt:
|
security_opt:
|
||||||
- no-new-privileges:true
|
- no-new-privileges:true
|
||||||
command: >
|
# The image ENTRYPOINT is `ollama`, so override it to a shell; the image has
|
||||||
sh -c "ollama serve & SERVE_PID=$$! && until curl -sf http://localhost:11434/api/tags; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M && kill $$SERVE_PID"
|
# no curl, so readiness is probed with `ollama list` instead of a curl loop.
|
||||||
|
# The pull is guarded by a `grep` on the cached model list so an already-cached
|
||||||
|
# model exits clean without a registry round-trip (offline-safe re-up).
|
||||||
|
entrypoint: ["/bin/sh", "-c"]
|
||||||
|
command:
|
||||||
|
- "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
|
||||||
|
|
||||||
# --- Ollama: LLM inference server ---
|
# --- Ollama: LLM inference server ---
|
||||||
# Serves the pre-pulled model for NL search inference.
|
# Serves the pre-pulled model for NL search inference.
|
||||||
@@ -180,6 +185,9 @@ services:
|
|||||||
- ollama_models:/root/.ollama
|
- ollama_models:/root/.ollama
|
||||||
environment:
|
environment:
|
||||||
OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
|
OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
|
||||||
|
# Pin the model in memory (no idle unload) so queries never pay a cold-load
|
||||||
|
# penalty that exceeds the backend read timeout → NL search 503 after idle.
|
||||||
|
OLLAMA_KEEP_ALIVE: "-1"
|
||||||
cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
|
cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
|
||||||
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
|
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
|
||||||
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
|
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
|
||||||
@@ -191,7 +199,9 @@ services:
|
|||||||
security_opt:
|
security_opt:
|
||||||
- no-new-privileges:true
|
- no-new-privileges:true
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
|
# `ollama list` hits the local API and exits non-zero if the server is
|
||||||
|
# down — used instead of curl, which the image does not ship.
|
||||||
|
test: ["CMD", "ollama", "list"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|||||||
@@ -613,7 +613,7 @@ Expected output includes `qwen2.5:7b-instruct-q4_K_M`.
|
|||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `app.ollama.base-url` | `http://ollama:11434` | Ollama service URL (dev: `http://localhost:11434`) |
|
| `app.ollama.base-url` | `http://ollama:11434` | Ollama service URL (dev: `http://localhost:11434`) |
|
||||||
| `app.ollama.model` | `qwen2.5:7b-instruct-q4_K_M` | Model to use for inference |
|
| `app.ollama.model` | `qwen2.5:7b-instruct-q4_K_M` | Model to use for inference |
|
||||||
| `app.ollama.timeout-seconds` | `30` | Read timeout for inference calls |
|
| `app.ollama.timeout-seconds` | `60` | Read timeout for inference calls (absorbs cold model load on the first query after an Ollama restart) |
|
||||||
| `app.nl-search.rate-limit.max-requests-per-minute` | `5` | Per-user rate limit |
|
| `app.nl-search.rate-limit.max-requests-per-minute` | `5` | Per-user rate limit |
|
||||||
|
|
||||||
### Upgrade the Ollama model
|
### Upgrade the Ollama model
|
||||||
@@ -625,7 +625,7 @@ To switch to a newer model version (e.g. a future release of `qwen2.5`):
|
|||||||
```bash
|
```bash
|
||||||
docker volume rm familienarchiv_ollama_models
|
docker volume rm familienarchiv_ollama_models
|
||||||
```
|
```
|
||||||
(In production the volume name is prefixed with the compose project: `archiv-production_ollama_models`.)
|
(In production the volume name is prefixed with the compose project: `archiv-production_ollama-models`.)
|
||||||
3. Restart the stack:
|
3. Restart the stack:
|
||||||
```bash
|
```bash
|
||||||
docker compose up -d
|
docker compose up -d
|
||||||
|
|||||||
125
docs/adr/034-ollama-production-deployment-and-keep-alive.md
Normal file
125
docs/adr/034-ollama-production-deployment-and-keep-alive.md
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
# ADR-034: Ollama in production — deployment, keep-alive pinning, and corrected init recipe
|
||||||
|
|
||||||
|
**Date:** 2026-06-06
|
||||||
|
**Status:** Accepted
|
||||||
|
**Deciders:** Marcel Raddatz
|
||||||
|
**Relates to:** #758 (bug), #759 (fix), #737 (NL search infrastructure)
|
||||||
|
**Corrects:** ADR-028 §10–§11 (init recipe and readiness probe)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
ADR-028 introduced Ollama as a Docker Compose service for NL search and documented
|
||||||
|
its topology, graceful-degradation contract, and memory budget. Two defects survived
|
||||||
|
that work and only surfaced once NL search reached staging (#758):
|
||||||
|
|
||||||
|
1. **Ollama was added only to the dev `docker-compose.yml`.** Staging and production
|
||||||
|
deploy from the self-contained `docker-compose.prod.yml`, which had no `ollama`
|
||||||
|
service. The backend defaults to `app.ollama.base-url: http://ollama:11434`, so its
|
||||||
|
client bean was active and resolved to a non-existent host → `ResourceAccessException`
|
||||||
|
→ HTTP 503 on every NL search.
|
||||||
|
2. **The init recipe documented in ADR-028 §10 never worked.** The `ollama/ollama` image
|
||||||
|
`ENTRYPOINT` is `ollama`, so a bare `command: sh -c "…"` ran as `ollama sh -c "…"`
|
||||||
|
(`unknown command "sh"`), and the image ships **no curl**, so the curl-based readiness
|
||||||
|
loop and the curl healthcheck could never pass.
|
||||||
|
|
||||||
|
This ADR records the production deployment decision and the corrected operational
|
||||||
|
contract. It is also the durable record of *why* `OLLAMA_KEEP_ALIVE=-1` is set, so a
|
||||||
|
future maintainer does not "optimize" it away and reintroduce the cold-load 503.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
### 1. Ollama is a first-class production service
|
||||||
|
|
||||||
|
`docker-compose.prod.yml` now defines `ollama` + `ollama-model-init` + the
|
||||||
|
`ollama-models` volume, mirroring the dev stack. The graceful-degradation contract from
|
||||||
|
ADR-028 §3 is preserved: `backend` has **no** hard `depends_on` on `ollama`, so an absent
|
||||||
|
or unhealthy Ollama still yields a clean 503 rather than blocking backend startup.
|
||||||
|
|
||||||
|
### 2. Corrected init recipe (supersedes ADR-028 §10)
|
||||||
|
|
||||||
|
The init container overrides the image entrypoint to a shell and probes readiness with
|
||||||
|
`ollama list` (not curl, which the image lacks):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && \
|
||||||
|
(ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)
|
||||||
|
```
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
entrypoint: ["/bin/sh", "-c"]
|
||||||
|
```
|
||||||
|
|
||||||
|
The pull is **guarded by a grep on the cached model list**. A model already on the volume
|
||||||
|
exits clean without any registry round-trip. This makes re-up offline-safe: a host reboot
|
||||||
|
during a registry/network blip can no longer fail init (which, via
|
||||||
|
`condition: service_completed_successfully`, would otherwise block the `ollama` service
|
||||||
|
and take NL search down until the registry was reachable again). The same recipe is used
|
||||||
|
in dev and prod — one mental model.
|
||||||
|
|
||||||
|
### 3. Healthcheck uses `ollama list` (supersedes ADR-028 §11 probe)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "ollama", "list"]
|
||||||
|
```
|
||||||
|
|
||||||
|
`ollama list` hits the local API and exits non-zero when the server is down — the correct
|
||||||
|
probe for a curl-less image. The `start_period: 60s` rationale from ADR-028 §11 still holds.
|
||||||
|
|
||||||
|
### 4. `OLLAMA_KEEP_ALIVE=-1` — pin the model in memory
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
environment:
|
||||||
|
OLLAMA_KEEP_ALIVE: "-1"
|
||||||
|
```
|
||||||
|
|
||||||
|
By default Ollama evicts an idle model after ~5 minutes. The next query then pays a
|
||||||
|
cold-load penalty that exceeds the backend read timeout, producing an NL search 503 after
|
||||||
|
any idle period. Pinning the model (`-1` = never unload) keeps warm-path latency
|
||||||
|
predictable (~18 s on CPU). **Do not remove this** — doing so re-introduces the post-idle
|
||||||
|
cold-load 503.
|
||||||
|
|
||||||
|
### 5. Read timeout raised 30 → 60 s
|
||||||
|
|
||||||
|
`app.ollama.timeout-seconds` is raised from 30 to 60 (`application.yaml`, mirrored in
|
||||||
|
`DEPLOYMENT.md`). Warm CPU inference is ~18 s; the higher ceiling absorbs the one cold
|
||||||
|
model load on the first query after an Ollama (re)start, before §4's pin takes hold.
|
||||||
|
|
||||||
|
**Implicit non-functional requirement (NFR) made explicit:** NL search shall return a result or a 503 within 60 s; the
|
||||||
|
cold-start path immediately after an Ollama restart is the only path that approaches this
|
||||||
|
ceiling.
|
||||||
|
|
||||||
|
### 6. Hard-OOM trade-off (refines ADR-028 §2)
|
||||||
|
|
||||||
|
`memswap_limit == mem_limit` (both `${OLLAMA_MEM_LIMIT:-8g}`) disables swap for the
|
||||||
|
container. Combined with §4's pinned model, a memory-pressure event is a **hard OOM-kill,
|
||||||
|
not graceful latency degradation**. This is deliberate — swap-thrashing an LLM is worse
|
||||||
|
than a clean restart — but it means the 8 GB envelope is a real ceiling. `qwen2.5-7B-q4`
|
||||||
|
plus its KV cache under load sits close enough to 8 GB that this needs a Prometheus
|
||||||
|
memory alert on the `ollama` container before it bites in production (tracked as an
|
||||||
|
observability follow-up, not in this PR).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- NL search works on staging/production, not just dev — the actual deploy artifact now
|
||||||
|
matches the documented architecture.
|
||||||
|
- Re-up is offline-safe: a cached model never depends on registry reachability.
|
||||||
|
- The keep-alive pin and timeout ceiling make NL search latency predictable on CPU.
|
||||||
|
|
||||||
|
### Risks and operational implications
|
||||||
|
|
||||||
|
- **Hard OOM under memory pressure** (§6): a Prometheus alert on `ollama` container memory
|
||||||
|
is required before this is load-bearing in prod. Tracked as an observability follow-up.
|
||||||
|
- **Unauthenticated inference** relies entirely on `archiv-net` isolation (ADR-028 §7/§12,
|
||||||
|
unchanged). Sending an `Authorization` header from `RestClientOllamaClient` is a separate
|
||||||
|
durable hardening item, tracked outside this PR.
|
||||||
|
- ADR-028 §10–§11 describe a recipe that never functioned; this ADR is the authoritative
|
||||||
|
init/healthcheck contract going forward.
|
||||||
@@ -17,7 +17,6 @@ System_Boundary(archiv, "Familienarchiv (Docker Compose)") {
|
|||||||
ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.")
|
ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.")
|
||||||
ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.")
|
ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.")
|
||||||
Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.")
|
Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.")
|
||||||
Container(ollama, "Ollama", "Ollama / port 11434", "Local LLM inference server. Hosts qwen2.5:7b-instruct-q4_K_M for natural-language query parsing (NL Search). CPU-only; GPU not required.")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") {
|
System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") {
|
||||||
@@ -49,7 +48,6 @@ Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
|
|||||||
Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
|
Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
|
||||||
Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus")
|
Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus")
|
||||||
Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics")
|
Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics")
|
||||||
Rel(backend, ollama, "NL search inference requests", "HTTP / REST / JSON")
|
|
||||||
Rel(prometheus, ollama, "Scrapes LLM request metrics", "HTTP 11434 /metrics")
|
Rel(prometheus, ollama, "Scrapes LLM request metrics", "HTTP 11434 /metrics")
|
||||||
Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
|
Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
|
||||||
Rel(grafana, loki, "Queries logs", "HTTP 3100")
|
Rel(grafana, loki, "Queries logs", "HTTP 3100")
|
||||||
|
|||||||
Reference in New Issue
Block a user