diff --git a/docker-compose.yml b/docker-compose.yml
index f9e618ea..eb0a75ce 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -141,74 +141,41 @@ services:
     security_opt:
       - no-new-privileges:true
 
-  # --- Ollama: Model init (one-shot pull) ---
-  # Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama_models volume on first start.
-  # On subsequent starts (model already in volume), exits quickly without re-downloading.
+  # --- NLP service: rule-based NL query parser ---
+  # FastAPI Python service; replaces Ollama for smart search query parsing.
   # Not started in CI — CI uses explicit service selection
   # (docker-compose.ci.yml: db minio create-buckets)
-  ollama-model-init:
-    image: ollama/ollama:0.30.6
-    restart: "no"
-    networks:
-      - archiv-net
-    volumes:
-      - ollama_models:/root/.ollama
-    mem_limit: 2g
-    read_only: true
-    tmpfs:
-      - /tmp:size=512m
-    cap_drop:
-      - ALL
-    security_opt:
-      - no-new-privileges:true
-    # The image ENTRYPOINT is `ollama`, so override it to a shell; the image has
-    # no curl, so readiness is probed with `ollama list` instead of a curl loop.
-    # The pull is guarded by a `grep` on the cached model list so an already-cached
-    # model exits clean without a registry round-trip (offline-safe re-up).
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
-
-  # --- Ollama: LLM inference server ---
-  # Serves the pre-pulled model for NL search inference.
-  # Not started in CI — CI uses explicit service selection
-  # (docker-compose.ci.yml: db minio create-buckets)
-  ollama:
-    image: ollama/ollama:0.30.6
-    container_name: archive-ollama
+  nlp-service:
+    build:
+      context: ./nlp-service
+      dockerfile: Dockerfile
+    container_name: archive-nlp
     restart: unless-stopped
     expose:
-      - "11434"
+      - "8001"
     networks:
       - archiv-net
-    volumes:
-      - ollama_models:/root/.ollama
     environment:
-      OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
-      # Pin the model in memory (no idle unload) so queries never pay a cold-load
-      # penalty that exceeds the backend read timeout → NL search 503 after idle.
-      OLLAMA_KEEP_ALIVE: "-1"
-    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
-    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
-    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+      DATABASE_URL: "postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB}"
+      NLP_FUZZY_THRESHOLD: "${NLP_FUZZY_THRESHOLD:-80}"
+    mem_limit: 256m
+    memswap_limit: 256m
     read_only: true
     tmpfs:
-      - /tmp:size=512m
+      - /tmp:size=32m
     cap_drop:
       - ALL
     security_opt:
       - no-new-privileges:true
     healthcheck:
-      # `ollama list` hits the local API and exits non-zero if the server is
-      # down — used instead of curl, which the image does not ship.
-      test: ["CMD", "ollama", "list"]
-      interval: 30s
-      timeout: 10s
+      test: ["CMD", "python", "-c", "import urllib.request as u; u.urlopen('http://localhost:8001/health', timeout=3)"]  # stdlib probe: exits non-zero on connect error or HTTP >= 400; no curl needed in the slim image
+      interval: 10s
+      timeout: 5s
       retries: 5
-      start_period: 60s  # model weights are pre-loaded by ollama-model-init; service only needs to bind port
+      start_period: 15s
     depends_on:
-      ollama-model-init:
-        condition: service_completed_successfully
+      db:
+        condition: service_healthy
 
   # --- Backend: Spring Boot ---
   backend:
@@ -228,6 +195,8 @@ services:
         condition: service_started
       ocr-service:
         condition: service_started
+      nlp-service:
+        condition: service_started
     environment:
       SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
       SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
@@ -253,8 +222,7 @@ services:
       SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
       APP_OCR_BASE_URL: http://ocr-service:8000
       APP_OCR_TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
-      APP_OLLAMA_BASE_URL: "${APP_OLLAMA_BASE_URL:-http://ollama:11434}"
-      APP_OLLAMA_API_KEY: "${OLLAMA_API_KEY}"
+      APP_NLP_BASE_URL: "http://nlp-service:8001"
       SENTRY_DSN: ${SENTRY_DSN:-}
       SENTRY_TRACES_SAMPLE_RATE: ${SENTRY_TRACES_SAMPLE_RATE:-1.0}
       # Observability: send traces to Tempo inside archiv-net (OTLP gRPC port 4317)
@@ -318,4 +286,3 @@ volumes:
   frontend_node_modules:
   ocr_models:
   ocr_cache:
-  ollama_models:
diff --git a/nlp-service/Dockerfile b/nlp-service/Dockerfile
index 61c723b0..ccb9e7e6 100644
--- a/nlp-service/Dockerfile
+++ b/nlp-service/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-slim
+FROM python:3.11.12-slim
 
 WORKDIR /app