From 9e97687d0fadcf14c46bb9c275b6a531345fc9ff Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Sat, 6 Jun 2026 19:27:02 +0200
Subject: [PATCH] fix(search): pin Ollama model in memory + raise read timeout

NL search recovered after deploy but went 503 again after a few minutes:
Ollama unloads the model after its default ~5 min keep-alive, so the next
query cold-loads the 4.7 GB model and exceeds the backend's 30s read
timeout (ResourceAccessException -> SMART_SEARCH_UNAVAILABLE). Warm
inference is ~18s; the cold load after idle is what timed out.

- docker-compose{,.prod}.yml: set OLLAMA_KEEP_ALIVE=-1 on the ollama
  service so the model stays resident and never pays a cold-load penalty
  during normal operation (verified on staging: `ollama ps` -> UNTIL
  "Forever"; host has 47 GB free).
- application.yaml: raise app.ollama.timeout-seconds 30 -> 60 so the one
  unavoidable cold load (first query after an Ollama restart, before the
  model is pinned) completes instead of timing out.

Refs #758

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/src/main/resources/application.yaml | 4 +++-
 docker-compose.prod.yml                     | 5 +++++
 docker-compose.yml                          | 3 +++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml
index 36d5298a..ce517f25 100644
--- a/backend/src/main/resources/application.yaml
+++ b/backend/src/main/resources/application.yaml
@@ -133,7 +133,9 @@ app:
   ollama:
     base-url: http://ollama:11434
     model: qwen2.5:7b-instruct-q4_K_M
-    timeout-seconds: 30
+    # CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the
+    # first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it.
+    timeout-seconds: 60
     health-check-timeout-seconds: 2
 
   nl-search:
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 9aa8f80c..7008c41b 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -239,6 +239,11 @@ services:
       - archiv-net
     volumes:
       - ollama-models:/root/.ollama
+    environment:
+      # Pin the model in memory (no idle unload). Without this, Ollama evicts
+      # the model after ~5 min idle and the next query pays a cold-load penalty
+      # that exceeds the backend read timeout → NL search 503 after idle.
+      OLLAMA_KEEP_ALIVE: "-1"
     cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
     mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
     memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
diff --git a/docker-compose.yml b/docker-compose.yml
index bd54432f..94df5bd7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -183,6 +183,9 @@ services:
       - ollama_models:/root/.ollama
     environment:
       OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
+      # Pin the model in memory (no idle unload). Once loaded, queries skip the
+      # cold-load penalty that exceeds the backend read timeout (NL search 503).
+      OLLAMA_KEEP_ALIVE: "-1"
     cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
     mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
     memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"