diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 9c60b3bf..9a328b36 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -50,7 +50,6 @@ volumes:
   minio-data:
   ocr-models:
   ocr-cache:
-  ollama-models:
 
 services:
   db:
@@ -201,72 +200,45 @@ services:
     security_opt:
       - no-new-privileges:true
 
-  # --- Ollama: Model init (one-shot pull) ---
-  # Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on
-  # first start; exits quickly on subsequent starts (model already cached).
-  # The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT
-  # curl, so the entrypoint is overridden to a shell and readiness is probed with
-  # `ollama list` (not curl). The pull is guarded by a `grep` on the cached model
-  # list so a model already on the volume exits clean WITHOUT a registry round-trip
-  # — a host reboot during a registry/network blip can no longer fail init (which
-  # would block the ollama service via service_completed_successfully).
-  # Backend degrades gracefully (503) if Ollama is absent.
-  ollama-model-init:
-    image: ollama/ollama:0.30.6
-    restart: "no"
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
-    networks:
-      - archiv-net
-    volumes:
-      - ollama-models:/root/.ollama
-    mem_limit: 2g
-    read_only: true
-    tmpfs:
-      - /tmp:size=512m
-    cap_drop:
-      - ALL
-    security_opt:
-      - no-new-privileges:true
-
-  # --- Ollama: LLM inference server ---
-  # Serves the pre-pulled model for NL search inference. Backend reaches it at
-  # http://ollama:11434 (application.yaml default; no env override required).
-  # Healthcheck uses `ollama list` because the image has no curl.
-  ollama:
-    image: ollama/ollama:0.30.6
+  # --- NLP service: rule-based NL query parser ---
+  # Lightweight FastAPI service; replaces Ollama for smart search query parsing.
+  # Connects to the DB at startup to build person/tag lookup tables.
+  # Backend reaches it at http://nlp-service:8001 (APP_NLP_BASE_URL) and waits
+  # for this service's healthcheck to pass before starting (depends_on).
+  nlp-service:
+    build:
+      context: ./nlp-service
     restart: unless-stopped
     expose:
-      - "11434"
+      - "8001"
     networks:
       - archiv-net
-    volumes:
-      - ollama-models:/root/.ollama
     environment:
-      # Pin the model in memory (no idle unload). Without this, Ollama evicts
-      # the model after ~5 min idle and the next query pays a cold-load penalty
-      # that exceeds the backend read timeout → NL search 503 after idle.
-      OLLAMA_KEEP_ALIVE: "-1"
-    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
-    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
-    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+      # NOTE(review): the password is interpolated into the URL verbatim, so
+      # URL-reserved characters in it (@ : / ? #) would corrupt the URL — confirm.
+      DATABASE_URL: "postgresql://archiv:${POSTGRES_PASSWORD}@db:5432/archiv"
+      NLP_FUZZY_THRESHOLD: "${NLP_FUZZY_THRESHOLD:-80}"
+    mem_limit: 256m
+    memswap_limit: 256m
     read_only: true
     tmpfs:
-      - /tmp:size=512m
+      - /tmp:size=32m
     cap_drop:
       - ALL
     security_opt:
       - no-new-privileges:true
     healthcheck:
-      test: ["CMD", "ollama", "list"]
-      interval: 30s
-      timeout: 10s
+      # Probe with the Python interpreter instead of curl: curl exists only if
+      # the nlp-service Dockerfile installs it. urlopen raises on connection
+      # failure or HTTP >= 400. Assumes python3 is on PATH — verify in Dockerfile.
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8001/health', timeout=3)"]
+      interval: 10s
+      timeout: 5s
       retries: 5
-      start_period: 60s
+      start_period: 15s
     depends_on:
-      ollama-model-init:
-        condition: service_completed_successfully
+      db:
+        condition: service_healthy
 
   backend:
     image: familienarchiv/backend:${TAG:-nightly}
@@ -286,6 +258,8 @@ services:
       # is a one-shot that must complete successfully. See #510.
       create-buckets:
         condition: service_completed_successfully
+      nlp-service:
+        condition: service_healthy
     # Bound to localhost only — Caddy fronts external traffic.
     ports:
       - "127.0.0.1:${PORT_BACKEND}:8080"
@@ -320,6 +294,7 @@ services:
       APP_ADMIN_PASSWORD: ${APP_ADMIN_PASSWORD}
       APP_OCR_BASE_URL: http://ocr-service:8000
       APP_OCR_TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
+      APP_NLP_BASE_URL: http://nlp-service:8001
       MAIL_HOST: ${MAIL_HOST}
       MAIL_PORT: ${MAIL_PORT:-587}
       MAIL_USERNAME: ${MAIL_USERNAME:-}