fix(infra): deploy Ollama to prod/staging compose + fix broken model-init recipe #759

Merged
marcel merged 8 commits from fix/issue-758-ollama-prod-compose into main 2026-06-06 20:30:35 +02:00
3 changed files with 11 additions and 1 deletion
Showing only changes of commit 9e97687d0f - Show all commits

View File

@@ -133,7 +133,9 @@ app:
ollama:
base-url: http://ollama:11434
model: qwen2.5:7b-instruct-q4_K_M
timeout-seconds: 30
# CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the
# first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it.
timeout-seconds: 60
health-check-timeout-seconds: 2
nl-search:

View File

@@ -239,6 +239,11 @@ services:
- archiv-net
volumes:
- ollama-models:/root/.ollama
environment:
# Pin the model in memory (no idle unload). Without this, Ollama evicts
# the model after ~5 min idle and the next query pays a cold-load penalty
# that exceeds the backend read timeout → NL search 503 after idle.
OLLAMA_KEEP_ALIVE: "-1"
cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"

View File

@@ -183,6 +183,9 @@ services:
- ollama_models:/root/.ollama
environment:
OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
# Pin the model in memory (no idle unload). Otherwise the first query after an
# idle unload pays a cold-load penalty that exceeds the backend read timeout,
# which surfaces as an NL search 503 after idle.
OLLAMA_KEEP_ALIVE: "-1"
cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"