diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 9c60b3bf..9a328b36 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -50,7 +50,6 @@ volumes:
   minio-data:
   ocr-models:
   ocr-cache:
-  ollama-models:
 
 services:
   db:
@@ -201,72 +200,38 @@ services:
     security_opt:
       - no-new-privileges:true
 
-  # --- Ollama: Model init (one-shot pull) ---
-  # Pulls qwen2.5:7b-instruct-q4_K_M (~4.7 GB) into the ollama-models volume on
-  # first start; exits quickly on subsequent starts (model already cached).
-  # The ollama/ollama image's ENTRYPOINT is `ollama` and the image ships WITHOUT
-  # curl, so the entrypoint is overridden to a shell and readiness is probed with
-  # `ollama list` (not curl). The pull is guarded by a `grep` on the cached model
-  # list so a model already on the volume exits clean WITHOUT a registry round-trip
-  # — a host reboot during a registry/network blip can no longer fail init (which
-  # would block the ollama service via service_completed_successfully).
-  # Backend degrades gracefully (503) if Ollama is absent.
-  ollama-model-init:
-    image: ollama/ollama:0.30.6
-    restart: "no"
-    entrypoint: ["/bin/sh", "-c"]
-    command:
-      - "ollama serve & until ollama list >/dev/null 2>&1; do sleep 1; done && (ollama list | grep -q 'qwen2.5:7b-instruct-q4_K_M' || ollama pull qwen2.5:7b-instruct-q4_K_M)"
-    networks:
-      - archiv-net
-    volumes:
-      - ollama-models:/root/.ollama
-    mem_limit: 2g
-    read_only: true
-    tmpfs:
-      - /tmp:size=512m
-    cap_drop:
-      - ALL
-    security_opt:
-      - no-new-privileges:true
-
-  # --- Ollama: LLM inference server ---
-  # Serves the pre-pulled model for NL search inference. Backend reaches it at
-  # http://ollama:11434 (application.yaml default; no env override required).
-  # Healthcheck uses `ollama list` because the image has no curl.
-  ollama:
-    image: ollama/ollama:0.30.6
+  # --- NLP service: rule-based NL query parser (replaces Ollama for smart search) ---
+  # Lightweight FastAPI service; connects to the DB at startup to build person/tag lookup tables.
+  # Healthcheck probes /health via python3 so it needs no curl in the image (assumes python3 on PATH — confirm).
+  nlp-service:
+    build:
+      context: ./nlp-service
     restart: unless-stopped
     expose:
-      - "11434"
+      - "8001"  # internal only; backend reaches it via APP_NLP_BASE_URL (http://nlp-service:8001)
     networks:
       - archiv-net
-    volumes:
-      - ollama-models:/root/.ollama
     environment:
-      # Pin the model in memory (no idle unload). Without this, Ollama evicts
-      # the model after ~5 min idle and the next query pays a cold-load penalty
-      # that exceeds the backend read timeout → NL search 503 after idle.
-      OLLAMA_KEEP_ALIVE: "-1"
-    cpus: "${OLLAMA_CPU_LIMIT:-4.0}"
-    mem_limit: "${OLLAMA_MEM_LIMIT:-8g}"
-    memswap_limit: "${OLLAMA_MEM_LIMIT:-8g}"
+      DATABASE_URL: "postgresql://archiv:${POSTGRES_PASSWORD}@db:5432/archiv"
+      NLP_FUZZY_THRESHOLD: "${NLP_FUZZY_THRESHOLD:-80}"  # defaults to 80 when unset
+    mem_limit: 256m
+    memswap_limit: 256m  # equal to mem_limit, so the container gets no swap
     read_only: true
     tmpfs:
-      - /tmp:size=512m
+      - /tmp:size=32m  # writable scratch space; the root filesystem is read-only
     cap_drop:
       - ALL
     security_opt:
       - no-new-privileges:true
     healthcheck:
-      test: ["CMD", "ollama", "list"]
-      interval: 30s
-      timeout: 10s
+      test: ["CMD", "python3", "-c", "import urllib.request as u; u.urlopen('http://localhost:8001/health', timeout=4)"]
+      interval: 10s
+      timeout: 5s  # urlopen's own 4s timeout stays below this so the probe exits on its own
       retries: 5
-      start_period: 60s
+      start_period: 15s
     depends_on:
-      ollama-model-init:
-        condition: service_completed_successfully
+      db:
+        condition: service_healthy  # start only after the db healthcheck passes
 
   backend:
     image: familienarchiv/backend:${TAG:-nightly}
@@ -286,6 +251,8 @@ services:
       # is a one-shot that must complete successfully. See #510.
       create-buckets:
         condition: service_completed_successfully
+      nlp-service:
+        condition: service_healthy
     # Bound to localhost only — Caddy fronts external traffic.
     ports:
       - "127.0.0.1:${PORT_BACKEND}:8080"
@@ -320,6 +287,7 @@ services:
       APP_ADMIN_PASSWORD: ${APP_ADMIN_PASSWORD}
       APP_OCR_BASE_URL: http://ocr-service:8000
       APP_OCR_TRAINING_TOKEN: ${OCR_TRAINING_TOKEN}
+      APP_NLP_BASE_URL: http://nlp-service:8001
       MAIL_HOST: ${MAIL_HOST}
       MAIL_PORT: ${MAIL_PORT:-587}
       MAIL_USERNAME: ${MAIL_USERNAME:-}