2026-06-06 20:30:35 +02:00
3 changed files with 11 additions and 1 deletions
--- a/backend/src/main/resources/application.yaml
+++ b/backend/src/main/resources/application.yaml
@@ -133,7 +133,9 @@ app:
  ollama:
    base-url: http://ollama:11434
    model: qwen2.5:7b-instruct-q4_K_M
-    timeout-seconds: 30
+    # CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the
+    # first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it.
+    timeout-seconds: 60
    health-check-timeout-seconds: 2

  nl-search:
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -239,6 +239,11 @@ services:
      - archiv-net
    volumes:
      - ollama-models:/root/.ollama
+    environment:
+      # Pin the model in memory (no idle unload). Without this, Ollama evicts
+      # the model after ~5 min idle and the next query pays a cold-load penalty
+      # that exceeds the backend read timeout → NL search 503 after idle.
+      OLLAMA_KEEP_ALIVE: "-1"
    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -183,6 +183,9 @@ services:
      - ollama_models:/root/.ollama
    environment:
      OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
+      # Pin the model in memory (no idle unload) so a query after an idle period does
+      # not hit a cold load that exceeds the backend read timeout → NL search 503.
+      OLLAMA_KEEP_ALIVE: "-1"
    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"