From b665e1132d2a5d86131e151598c3df33035b2484 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 19:20:22 +0200
Subject: [PATCH 1/8] fix(infra): deploy Ollama to prod/staging compose + fix
 broken model-init recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NL search returned 503 (SMART_SEARCH_UNAVAILABLE / "Intelligente Suche
nicht verfügbar") on staging because Ollama was never reachable.

Two defects, both downstream of #737:

1. Ollama was added only to the dev docker-compose.yml. Staging/prod
   deploy from the self-contained docker-compose.prod.yml, which had no
   ollama service — so the backend (defaulting to http://ollama:11434)
   hit a non-existent host (ResourceAccessException -> 503).

2. The merged model-init recipe never worked: the ollama/ollama image
   ENTRYPOINT is `ollama` (so `command: sh -c ...` ran as `ollama sh ...`
   -> "unknown command sh"), and the image ships no curl (so both the
   readiness loop and the healthcheck could never pass).

- docker-compose.prod.yml: add ollama-model-init + ollama services and
  the ollama-models volume, with the corrected recipe (entrypoint
  override to /bin/sh -c, `ollama list` for readiness and healthcheck).
- docker-compose.yml: fix the same broken entrypoint/command and the
  curl healthcheck so the dev stack actually starts Ollama.

Verified on staging end-to-end: model-init exits 0, ollama healthy,
backend reaches /api/tags, inference succeeds within the 8g limit.

Refs #758

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docker-compose.prod.yml | 59 +++++++++++++++++++++++++++++++++++++++++
 docker-compose.yml      | 11 +++++---
 2 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 26e07442..9aa8f80c 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -50,6 +50,7 @@ volumes:
   minio-data:
   ocr-models:
   ocr-cache:
+  ollama-models:
 
 services:
   db:
@@ -200,6 +201,64 @@ services:
     security_opt:
       - no-new-privileges:true
 
+  # --- Ollama: Model init (one-shot pull) ---
+  # Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on
+  # first start; exits quickly on subsequent starts (model already cached).
+  # The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT
+  # curl, so the entrypoint is overridden to a shell and readiness is probed with
+  # `ollama list` (not curl). Backend degrades gracefully (503) if Ollama is absent.
+  ollama-model-init:
+    image: ollama/ollama:0.30.6
+    restart: "no"
+    entrypoint: ["/bin/sh", "-c"]
+    command:
+      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M"
+    networks:
+      - archiv-net
+    volumes:
+      - ollama-models:/root/.ollama
+    mem_limit: 2g
+    read_only: true
+    tmpfs:
+      - /tmp:size=512m
+    cap_drop:
+      - ALL
+    security_opt:
+      - no-new-privileges:true
+
+  # --- Ollama: LLM inference server ---
+  # Serves the pre-pulled model for NL search inference. Backend reaches it at
+  # http://ollama:11434 (application.yaml default; no env override required).
+  # Healthcheck uses `ollama list` because the image has no curl.
+  ollama:
+    image: ollama/ollama:0.30.6
+    restart: unless-stopped
+    expose:
+      - "11434"
+    networks:
+      - archiv-net
+    volumes:
+      - ollama-models:/root/.ollama
+    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
+    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+    read_only: true
+    tmpfs:
+      - /tmp:size=512m
+    cap_drop:
+      - ALL
+    security_opt:
+      - no-new-privileges:true
+    healthcheck:
+      test: ["CMD", "ollama", "list"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 60s
+    depends_on:
+      ollama-model-init:
+        condition: service_completed_successfully
+
   backend:
     image: familienarchiv/backend:${TAG:-nightly}
     build:
diff --git a/docker-compose.yml b/docker-compose.yml
index 78ac969a..bd54432f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -161,8 +161,11 @@ services:
       - ALL
     security_opt:
       - no-new-privileges:true
-    command: >
-      sh -c "ollama serve & SERVE_PID=$$! && until curl -sf http://localhost:11434/api/tags; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M && kill $$SERVE_PID"
+    # The image ENTRYPOINT is `ollama`, so override it to a shell; the image has
+    # no curl, so readiness is probed with `ollama list` instead of a curl loop.
+    entrypoint: ["/bin/sh", "-c"]
+    command:
+      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M"
 
   # --- Ollama: LLM inference server ---
   # Serves the pre-pulled model for NL search inference.
@@ -191,7 +194,9 @@ services:
     security_opt:
       - no-new-privileges:true
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
+      # `ollama list` hits the local API and exits non-zero if the server is
+      # down — used instead of curl, which the image does not ship.
+      test: ["CMD", "ollama", "list"]
       interval: 30s
       timeout: 10s
       retries: 5
-- 
2.49.1


From 9e97687d0fadcf14c46bb9c275b6a531345fc9ff Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 19:27:02 +0200
Subject: [PATCH 2/8] fix(search): pin Ollama model in memory + raise read
 timeout

NL search recovered after deploy but went 503 again after a few minutes:
Ollama unloads the model after its default ~5 min keep-alive, so the next
query cold-loads the 4.7 GB model and exceeds the backend's 30s read
timeout (ResourceAccessException -> SMART_SEARCH_UNAVAILABLE). Warm
inference is ~18s; the cold load after idle is what timed out.

- docker-compose{,.prod}.yml: set OLLAMA_KEEP_ALIVE=-1 on the ollama
  service so the model stays resident and never pays a cold-load penalty
  during normal operation (verified on staging: `ollama ps` -> UNTIL
  "Forever"; host has 47 GB free).
- application.yaml: raise app.ollama.timeout-seconds 30 -> 60 so the one
  unavoidable cold load (first query after an Ollama restart, before the
  model is pinned) completes instead of timing out.

Refs #758

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/src/main/resources/application.yaml | 4 +++-
 docker-compose.prod.yml                     | 5 +++++
 docker-compose.yml                          | 3 +++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml
index 36d5298a..ce517f25 100644
--- a/backend/src/main/resources/application.yaml
+++ b/backend/src/main/resources/application.yaml
@@ -133,7 +133,9 @@ app:
   ollama:
     base-url: http://ollama:11434
     model: qwen2.5:7b-instruct-q4_K_M
-    timeout-seconds: 30
+    # CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the
+    # first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it.
+    timeout-seconds: 60
     health-check-timeout-seconds: 2
 
   nl-search:
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 9aa8f80c..7008c41b 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -239,6 +239,11 @@ services:
       - archiv-net
     volumes:
       - ollama-models:/root/.ollama
+    environment:
+      # Pin the model in memory (no idle unload). Without this, Ollama evicts
+      # the model after ~5 min idle and the next query pays a cold-load penalty
+      # that exceeds the backend read timeout → NL search 503 after idle.
+      OLLAMA_KEEP_ALIVE: "-1"
     cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
     mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
     memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
diff --git a/docker-compose.yml b/docker-compose.yml
index bd54432f..94df5bd7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -183,6 +183,9 @@ services:
       - ollama_models:/root/.ollama
     environment:
       OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
+      # Pin the model in memory (no idle unload) so queries never pay a cold-load
+      # penalty that exceeds the backend read timeout → NL search 503 after idle.
+      OLLAMA_KEEP_ALIVE: "-1"
     cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
     mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
     memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
-- 
2.49.1


From 2a0863cf3eb99e296bfaf4197c1b1fb28f31b704 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 20:08:55 +0200
Subject: [PATCH 3/8] docs(deploy): correct Ollama read timeout default to 60s

application.yaml sets app.ollama.timeout-seconds: 60 (raised from 30 to
absorb the cold model load on the first query after an Ollama restart),
but DEPLOYMENT.md still documented 30. A doc that contradicts the shipped
value is a traceability defect.

Addresses #759 review (Markus, Felix, Elicit).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/DEPLOYMENT.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
index 5c2580de..d5f6e1c1 100644
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -613,7 +613,7 @@ Expected output includes `qwen2.5:7b-instruct-q4_K_M`.
 |---|---|---|
 | `app.ollama.base-url` | `http://ollama:11434` | Ollama service URL (dev: `http://localhost:11434`) |
 | `app.ollama.model` | `qwen2.5:7b-instruct-q4_K_M` | Model to use for inference |
-| `app.ollama.timeout-seconds` | `30` | Read timeout for inference calls |
+| `app.ollama.timeout-seconds` | `60` | Read timeout for inference calls (absorbs cold model load on the first query after an Ollama restart) |
 | `app.nl-search.rate-limit.max-requests-per-minute` | `5` | Per-user rate limit |
 
 ### Upgrade the Ollama model
-- 
2.49.1


From f22a1a1cfa2c65f795beceb69bfad423e56d70d4 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 20:09:48 +0200
Subject: [PATCH 4/8] docs(deploy): fix prod Ollama volume name to match
 hyphenated compose volume

docker-compose.prod.yml declares the volume as `ollama-models` (hyphen),
so the compose-project-prefixed name is `archiv-production_ollama-models`,
not the underscored `archiv-production_ollama_models` the model-upgrade
guide documented. The documented `docker volume rm` would not have matched
the real volume.

Addresses #759 review (Tobias #2).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/DEPLOYMENT.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
index d5f6e1c1..f8523515 100644
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -625,7 +625,7 @@ To switch to a newer model version (e.g. a future release of `qwen2.5`):
    ```bash
    docker volume rm familienarchiv_ollama_models
    ```
-   (In production the volume name is prefixed with the compose project: `archiv-production_ollama_models`.)
+   (In production the volume name is prefixed with the compose project: `archiv-production_ollama-models`.)
 3. Restart the stack:
    ```bash
    docker compose up -d
-- 
2.49.1


From a2f37f85a6bdbba520b5ad665e4e9ba7d6b64877 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 20:12:21 +0200
Subject: [PATCH 5/8] fix(infra): make prod Ollama model-init offline-safe

The init command unconditionally ran `ollama pull`, which contacts the
registry to verify the manifest digest even when the model is already on
the volume. A host reboot during a registry/upstream-network blip would
then fail init non-zero, the `service_completed_successfully` gate would
never be met, and the ollama service (hence NL search) would stay down
until the registry was reachable again.

Guard the pull with `ollama list | grep -q <model>` so a cached model
exits clean without any registry round-trip.

Addresses #759 review (Tobias #1).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docker-compose.prod.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 7008c41b..9c60b3bf 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -206,13 +206,17 @@ services:
   # first start; exits quickly on subsequent starts (model already cached).
   # The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT
   # curl, so the entrypoint is overridden to a shell and readiness is probed with
-  # `ollama list` (not curl). Backend degrades gracefully (503) if Ollama is absent.
+  # `ollama list` (not curl). The pull is guarded by a `grep` on the cached model
+  # list so a model already on the volume exits clean WITHOUT a registry round-trip
+  # — a host reboot during a registry/network blip can no longer fail init (which
+  # would block the ollama service via service_completed_successfully).
+  # Backend degrades gracefully (503) if Ollama is absent.
   ollama-model-init:
     image: ollama/ollama:0.30.6
     restart: "no"
     entrypoint: ["/bin/sh", "-c"]
     command:
-      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M"
+      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
     networks:
       - archiv-net
     volumes:
-- 
2.49.1


From d7d6d0638ca8ab6e481313cf86dd20af1bce2096 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 20:13:19 +0200
Subject: [PATCH 6/8] fix(infra): make dev Ollama model-init offline-safe

Mirror the prod hardening in the dev stack: guard the model pull with
`ollama list | grep -q <model>` so an already-cached model exits clean
without a registry round-trip. Keeps dev and prod on one recipe.

Addresses #759 review (Tobias #1).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docker-compose.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 94df5bd7..f9e618ea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -163,9 +163,11 @@ services:
       - no-new-privileges:true
     # The image ENTRYPOINT is `ollama`, so override it to a shell; the image has
     # no curl, so readiness is probed with `ollama list` instead of a curl loop.
+    # The pull is guarded by a `grep` on the cached model list so an already-cached
+    # model exits clean without a registry round-trip (offline-safe re-up).
     entrypoint: ["/bin/sh", "-c"]
     command:
-      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M"
+      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
 
   # --- Ollama: LLM inference server ---
   # Serves the pre-pulled model for NL search inference.
-- 
2.49.1


From db87a64cc0c13b6d76780a1322e8aa85a6e29d4d Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 20:14:26 +0200
Subject: [PATCH 7/8] docs(c4): de-duplicate Ollama container in l2-containers
 diagram
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The diagram declared Container(ollama, ...) twice — an alias collision that
renders a duplicate box. It also declared the backend->ollama relationship
twice. Keep the richer 'Ollama LLM Service' declaration and the more
specific 'NL query parsing (POST /api/generate)' relationship; drop the
duplicates.

Addresses #759 review (Markus #2).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/architecture/c4/l2-containers.puml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml
index 2d471dd9..b8630001 100644
--- a/docs/architecture/c4/l2-containers.puml
+++ b/docs/architecture/c4/l2-containers.puml
@@ -17,7 +17,6 @@ System_Boundary(archiv, "Familienarchiv (Docker Compose)") {
     ContainerDb(db, "Relational Database", "PostgreSQL 16", "Stores document metadata, persons, users, permission groups, tags, transcription blocks, audit log, and Spring Session data.")
     ContainerDb(storage, "Object Storage", "MinIO (S3-compatible)", "Stores the actual document files (PDFs, scans). Backend uses a bucket-scoped service account (archiv-app), not MinIO root.")
     Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.")
-    Container(ollama, "Ollama", "Ollama / port 11434", "Local LLM inference server. Hosts qwen2.5:7b-instruct-q4_K_M for natural-language query parsing (NL Search). CPU-only; GPU not required.")
 }
 
 System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") {
@@ -49,7 +48,6 @@ Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
 Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
 Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus")
 Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics")
-Rel(backend, ollama, "NL search inference requests", "HTTP / REST / JSON")
 Rel(prometheus, ollama, "Scrapes LLM request metrics", "HTTP 11434 /metrics")
 Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
 Rel(grafana, loki, "Queries logs", "HTTP 3100")
-- 
2.49.1


From ed98729f7502cfee7b6032da9186cf642c133db8 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 20:16:03 +0200
Subject: [PATCH 8/8] docs(adr): record prod Ollama deployment + keep-alive
 decision (ADR-034)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Capture the why behind deploying Ollama to prod/staging compose: the
corrected init recipe (supersedes ADR-028 §10's never-functional curl
loop), the OLLAMA_KEEP_ALIVE=-1 pin (so a future maintainer doesn't
optimize it away and reintroduce the post-idle cold-load 503), the
30->60s timeout NFR, and the memswap==mem hard-OOM trade-off.

Addresses #759 review (Markus #3, Nora #2).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ...ma-production-deployment-and-keep-alive.md | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 docs/adr/034-ollama-production-deployment-and-keep-alive.md

diff --git a/docs/adr/034-ollama-production-deployment-and-keep-alive.md b/docs/adr/034-ollama-production-deployment-and-keep-alive.md
new file mode 100644
index 00000000..0ff4a790
--- /dev/null
+++ b/docs/adr/034-ollama-production-deployment-and-keep-alive.md
@@ -0,0 +1,125 @@
+# ADR-034: Ollama in production — deployment, keep-alive pinning, and corrected init recipe
+
+**Date:** 2026-06-06
+**Status:** Accepted
+**Deciders:** Marcel Raddatz
+**Relates to:** #758 (bug), #759 (fix), #737 (NL search infrastructure)
+**Corrects:** ADR-028 §10–§11 (init recipe and readiness probe)
+
+---
+
+## Context
+
+ADR-028 introduced Ollama as a Docker Compose service for NL search and documented
+its topology, graceful-degradation contract, and memory budget. Two defects survived
+that work and only surfaced once NL search reached staging (#758):
+
+1. **Ollama was added only to the dev `docker-compose.yml`.** Staging and production
+   deploy from the self-contained `docker-compose.prod.yml`, which had no `ollama`
+   service. The backend defaults to `app.ollama.base-url: http://ollama:11434`, so its
+   client bean was active and resolved to a non-existent host → `ResourceAccessException`
+   → HTTP 503 on every NL search.
+2. **The init recipe documented in ADR-028 §10 never worked.** The `ollama/ollama` image
+   `ENTRYPOINT` is `ollama`, so a bare `command: sh -c "…"` ran as `ollama sh -c "…"`
+   (`unknown command "sh"`), and the image ships **no curl**, so the curl-based readiness
+   loop and the curl healthcheck could never pass.
+
+This ADR records the production deployment decision and the corrected operational
+contract. It is also the durable record of *why* `OLLAMA_KEEP_ALIVE=-1` is set, so a
+future maintainer does not "optimize" it away and reintroduce the cold-load 503.
+
+---
+
+## Decisions
+
+### 1. Ollama is a first-class production service
+
+`docker-compose.prod.yml` now defines `ollama` + `ollama-model-init` + the
+`ollama-models` volume, mirroring the dev stack. The graceful-degradation contract from
+ADR-028 §3 is preserved: `backend` has **no** hard `depends_on` on `ollama`, so an absent
+or unhealthy Ollama still yields a clean 503 rather than blocking backend startup.
+
+### 2. Corrected init recipe (supersedes ADR-028 §10)
+
+The init container overrides the image entrypoint to a shell and probes readiness with
+`ollama list` (not curl, which the image lacks):
+
+```sh
+ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && \
+  (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)
+```
+
+```yaml
+entrypoint: ["/bin/sh", "-c"]
+```
+
+The pull is **guarded by a grep on the cached model list**: when the model is already on
+the volume, init exits cleanly, no registry round-trip. This makes re-up offline-safe: a host reboot
+during a registry/network blip can no longer fail init (which, via
+`condition: service_completed_successfully`, would otherwise block the `ollama` service
+and take NL search down until the registry was reachable again). The same recipe is used
+in dev and prod — one mental model.
+
+### 3. Healthcheck uses `ollama list` (supersedes ADR-028 §11 probe)
+
+```yaml
+healthcheck:
+  test: ["CMD", "ollama", "list"]
+```
+
+`ollama list` hits the local API and exits non-zero when the server is down — the correct
+probe for a curl-less image. The `start_period: 60s` rationale from ADR-028 §11 still holds.
+
+### 4. `OLLAMA_KEEP_ALIVE=-1` — pin the model in memory
+
+```yaml
+environment:
+  OLLAMA_KEEP_ALIVE: "-1"
+```
+
+By default Ollama evicts an idle model after ~5 minutes. The next query then pays a
+cold-load penalty that exceeds the backend read timeout, producing an NL search 503 after
+any idle period. Pinning the model (`-1` = never unload) keeps warm-path latency
+predictable (~18 s on CPU). **Do not remove this**: doing so re-introduces the post-idle
+cold-load 503.
+
+### 5. Read timeout raised 30 → 60 s
+
+`app.ollama.timeout-seconds` is raised from 30 to 60 (`application.yaml`, mirrored in
+`DEPLOYMENT.md`). Warm CPU inference is ~18 s; the higher ceiling absorbs the one cold
+model load on the first query after an Ollama (re)start, before §4's pin takes hold.
+
+**Implicit NFR made explicit:** NL search shall return a result or a 503 within 60 s; the
+cold-start path immediately after an Ollama restart is the only path that approaches this
+ceiling.
+
+### 6. Hard-OOM trade-off (refines ADR-028 §2)
+
+`memswap_limit == mem_limit` (both `${OLLAMA_MEM_LIMIT:-8g}`) disables swap for the
+container. Combined with §4's pinned model, a memory-pressure event is a **hard OOM-kill,
+not graceful latency degradation**. This is deliberate — swap-thrashing an LLM is worse
+than a clean restart — but it means the 8 GB envelope is a real ceiling. `qwen2.5-7B-q4`
+plus its KV cache under load sits close enough to 8 GB that this needs a Prometheus
+memory alert on the `ollama` container before it bites in production (tracked as
+observability follow-up, not in this PR).
+
+---
+
+## Consequences
+
+### Positive
+
+- NL search works on staging/production, not just dev — the actual deploy artifact now
+  matches the documented architecture.
+- Re-up is offline-safe: a cached model never depends on registry reachability.
+- The keep-alive pin and timeout ceiling make NL search latency predictable on CPU.
+
+### Risks and operational implications
+
+- **Hard OOM under memory pressure** (§6): a Prometheus alert on `ollama` container memory
+  is required before this is load-bearing in prod. Tracked as an observability follow-up.
+- **Unauthenticated inference** relies entirely on `archiv-net` isolation (ADR-028 §7/§12,
+  unchanged). Sending an `Authorization` header from `RestClientOllamaClient` is a separate
+  durable hardening item, tracked outside this PR.
+- ADR-028 §10–§11 describe a recipe that never functioned; this ADR is the authoritative
+  init/healthcheck contract going forward.
-- 
2.49.1