From b665e1132d2a5d86131e151598c3df33035b2484 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 19:20:22 +0200
Subject: [PATCH] fix(infra): deploy Ollama to prod/staging compose + fix
 broken model-init recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NL search returned 503 (SMART_SEARCH_UNAVAILABLE / "Intelligente Suche
nicht verfügbar") on staging because Ollama was never reachable.

Two defects, both downstream of #737:

1. Ollama was added only to the dev docker-compose.yml. Staging/prod
   deploy from the self-contained docker-compose.prod.yml, which had no
   ollama service — so the backend (defaulting to http://ollama:11434)
   hit a non-existent host (ResourceAccessException -> 503).

2. The merged model-init recipe never worked: the ollama/ollama image
   ENTRYPOINT is `ollama` (so `command: sh -c ...` ran as `ollama sh ...`
   -> "unknown command sh"), and the image ships no curl (so both the
   readiness loop and the healthcheck could never pass).

- docker-compose.prod.yml: add ollama-model-init + ollama services and
  the ollama-models volume, with the corrected recipe (entrypoint
  override to /bin/sh -c, `ollama list` for readiness and healthcheck).
- docker-compose.yml: fix the same broken entrypoint/command and the
  curl healthcheck so the dev stack actually starts Ollama.

Verified on staging end-to-end: model-init exits 0, ollama healthy,
backend reaches /api/tags, inference succeeds within the 8g limit.

Refs #758

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docker-compose.prod.yml | 59 +++++++++++++++++++++++++++++++++++++++++
 docker-compose.yml      | 11 +++++---
 2 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 26e07442..9aa8f80c 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -50,6 +50,7 @@ volumes:
   minio-data:
   ocr-models:
   ocr-cache:
+  ollama-models:
 
 services:
   db:
@@ -200,6 +201,64 @@ services:
     security_opt:
       - no-new-privileges:true
 
+  # --- Ollama: Model init (one-shot pull) ---
+  # Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on
+  # first start; exits quickly on subsequent starts (model already cached).
+  # The image's ENTRYPOINT is `ollama` and it ships no curl, so the entrypoint is
+  # a shell and readiness is probed with `ollama list`, giving up after ~120 tries
+  # (`$$` = literal `$` in Compose). Backend degrades to 503 if Ollama is absent.
+  ollama-model-init:
+    image: ollama/ollama:0.30.6
+    restart: "no"
+    entrypoint: ["/bin/sh", "-c"]
+    command:
+      - "ollama serve & n=0; until ollama list >/dev/null 2>&1; do n=$$((n+1)); [ $$n -lt 120 ] || exit 1; sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M"
+    networks:
+      - archiv-net
+    volumes:
+      - ollama-models:/root/.ollama
+    mem_limit: 2g
+    read_only: true
+    tmpfs:
+      - /tmp:size=512m
+    cap_drop:
+      - ALL
+    security_opt:
+      - no-new-privileges:true
+
+  # --- Ollama: LLM inference server ---
+  # Answers NL search inference requests using the model pulled beforehand. The
+  # backend's application.yaml default (http://ollama:11434) already points here,
+  # so no env override is needed. The image lacks curl: health = `ollama list`.
+  ollama:
+    image: ollama/ollama:0.30.6
+    restart: unless-stopped
+    depends_on:
+      ollama-model-init:
+        condition: service_completed_successfully
+    networks:
+      - archiv-net
+    expose:
+      - "11434"
+    volumes:
+      - ollama-models:/root/.ollama
+    healthcheck:
+      test: ["CMD", "ollama", "list"]
+      start_period: 60s
+      interval: 30s
+      timeout: 10s
+      retries: 5
+    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
+    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+    read_only: true
+    tmpfs:
+      - /tmp:size=512m
+    cap_drop:
+      - ALL
+    security_opt:
+      - no-new-privileges:true
+
   backend:
     image: familienarchiv/backend:${TAG:-nightly}
     build:
diff --git a/docker-compose.yml b/docker-compose.yml
index 78ac969a..bd54432f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -161,8 +161,11 @@ services:
       - ALL
     security_opt:
       - no-new-privileges:true
-    command: >
-      sh -c "ollama serve & SERVE_PID=$$! && until curl -sf http://localhost:11434/api/tags; do sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M && kill $$SERVE_PID"
+    # The image ENTRYPOINT is `ollama`, so override it to a shell; the image has
+    # no curl, so readiness is probed with `ollama list` (gives up after ~120 tries).
+    entrypoint: ["/bin/sh", "-c"]
+    command:
+      - "ollama serve & n=0; until ollama list >/dev/null 2>&1; do n=$$((n+1)); [ $$n -lt 120 ] || exit 1; sleep 1; done && ollama pull qwen2.5:7b-instruct-q4_K_M"
 
   # --- Ollama: LLM inference server ---
   # Serves the pre-pulled model for NL search inference.
@@ -191,7 +194,9 @@ services:
     security_opt:
       - no-new-privileges:true
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
+      # `ollama list` hits the local API and exits non-zero if the server is
+      # down — used instead of curl, which the image does not ship.
+      test: ["CMD", "ollama", "list"]
       interval: 30s
       timeout: 10s
       retries: 5