diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml index 36d5298a..ce517f25 100644 --- a/backend/src/main/resources/application.yaml +++ b/backend/src/main/resources/application.yaml @@ -133,7 +133,9 @@ app: ollama: base-url: http://ollama:11434 model: qwen2.5:7b-instruct-q4_K_M - timeout-seconds: 30 + # CPU inference: ~18s warm. Higher ceiling absorbs the cold model load on the + # first query after an Ollama (re)start before OLLAMA_KEEP_ALIVE pins it. + timeout-seconds: 60 health-check-timeout-seconds: 2 nl-search: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 9aa8f80c..7008c41b 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -239,6 +239,11 @@ services: - archiv-net volumes: - ollama-models:/root/.ollama + environment: + # Pin the model in memory (no idle unload). Without this, Ollama evicts + # the model after ~5 min idle and the next query pays a cold-load penalty + # that exceeds the backend read timeout → NL search 503 after idle. + OLLAMA_KEEP_ALIVE: "-1" cpus: "${OLLAMA_CPU_LIMIT:-4.0}" mem_limit: "${OLLAMA_MEM_LIMIT:-8g}" memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}" diff --git a/docker-compose.yml b/docker-compose.yml index bd54432f..94df5bd7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -183,6 +183,9 @@ services: - ollama_models:/root/.ollama environment: OLLAMA_API_KEY: "${OLLAMA_API_KEY}" + # Pin the model in memory (no idle unload) so queries after an idle period avoid + # the cold load that exceeds the backend read timeout → NL search 503 after idle. + OLLAMA_KEEP_ALIVE: "-1" cpus: "${OLLAMA_CPU_LIMIT:-4.0}" mem_limit: "${OLLAMA_MEM_LIMIT:-8g}" memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"