# Observability stack — Grafana LGTM + GlitchTip
#
# Requires the main stack to be running first:
#   docker compose up -d                                        # creates archiv-net
#   docker compose -f docker-compose.observability.yml up -d
#
# To validate without starting:
#   docker compose -f docker-compose.observability.yml config

services:
  # --- Metrics: Prometheus ---
  prometheus:
    image: prom/prometheus:v3.4.0
    container_name: obs-prometheus
    restart: unless-stopped
    volumes:
      - ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
      # Bound to loopback only — reach it via SSH tunnel or a reverse proxy.
      - "127.0.0.1:${PORT_PROMETHEUS:-9090}:9090"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3
    networks:
      - archiv-net  # scrapes application containers by name
      - obs-net

  node-exporter:
    image: prom/node-exporter:v1.9.0
    container_name: obs-node-exporter
    restart: unless-stopped
    # pid: host — required for process-level CPU/memory metrics; cgroup isolation applies
    pid: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ is Compose escaping for a literal $ in the regex alternation.
      # NOTE(review): flag renamed from the deprecated
      # --collector.filesystem.ignored-mount-points (deprecated since
      # node_exporter v1.3.0); mount-points-exclude is the supported name on v1.9.0.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    expose:
      - "9100"
    networks:
      - obs-net

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.52.1
    container_name: obs-cadvisor
    restart: unless-stopped
    # privileged: true — required for cgroup and namespace metrics, see cAdvisor docs.
    # Accepted risk: cAdvisor is pinned, on Renovate, and not exposed outside obs-net.
    privileged: true
    volumes:
      - /:/rootfs:ro
      # /var/run/docker.sock mounted read-only — sufficient for container metadata discovery
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
    expose:
      - "8080"
    networks:
      - obs-net

  # --- Logs: Loki + Promtail ---
  loki:
    image: grafana/loki:3.4.2
    container_name: obs-loki
    restart: unless-stopped
    volumes:
      - ./infra/observability/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
      - loki_data:/loki
    command: -config.file=/etc/loki/loki-config.yml
    expose:
      - "3100"
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready | grep -q ready || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    networks:
      - obs-net

  promtail:
    image: grafana/promtail:3.4.2
    container_name: obs-promtail
    restart: unless-stopped
    volumes:
      - ./infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # :ro restricts file-system access but NOT Docker API permissions — a compromised
      # Promtail has full daemon access. Accepted risk on single-operator self-hosted archive.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - promtail_positions:/tmp  # persists positions.yaml across restarts — avoids duplicate log ingestion
    command: -config.file=/etc/promtail/promtail-config.yml
    networks:
      - archiv-net  # label discovery from application containers via Docker socket
      - obs-net     # log shipping to Loki
    depends_on:
      loki:
        condition: service_healthy

  # --- Traces: Tempo ---
  tempo:
    image: grafana/tempo:2.7.2
    container_name: obs-tempo
    restart: unless-stopped
    volumes:
      - ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
      - tempo_data:/var/tempo
    command: -config.file=/etc/tempo.yml
    expose:
      - "3200"  # Grafana queries Tempo on this port (obs-net only)
      - "4317"  # OTLP gRPC — backend sends traces here (archiv-net)
      - "4318"  # OTLP HTTP — alternative transport (archiv-net)
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 15s
    networks:
      - archiv-net  # backend (archive-backend) reaches tempo:4317 over this network
      - obs-net     # Grafana reaches tempo:3200 over this network

  # --- Dashboards: Grafana ---
  obs-grafana:
    image: grafana/grafana-oss:11.6.1
    container_name: obs-grafana
    restart: unless-stopped
    ports:
      # Loopback only; default host port 3001 avoids clashing with the backend on 3000.
      - "127.0.0.1:${PORT_GRAFANA:-3001}:3000"
    environment:
      # Override GRAFANA_ADMIN_PASSWORD in .env — the fallback is a placeholder only.
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
      GF_USERS_ALLOW_SIGN_UP: "false"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro
    depends_on:
      - prometheus
      - loki
      - tempo
    networks:
      - obs-net

  # --- Error Tracking: GlitchTip ---
  # glitchtip: (see future issue)

networks:
  # Shared network created by the main docker-compose.yml.
  # The observability stack joins as a peer so Prometheus can scrape
  # archive-backend by container name. The observability stack must NOT
  # attempt to create this network — it will fail with a clear error if
  # the main stack is not running yet.
  archiv-net:
    external: true
  # Internal network for observability-service-to-service traffic
  # (e.g. Grafana → Prometheus, Grafana → Loki, Grafana → Tempo).
  obs-net:
    driver: bridge

volumes:
  prometheus_data:
  loki_data:
  promtail_positions:
  tempo_data:
  grafana_data:
  # Pre-declared for the future GlitchTip service (see the commented stub above);
  # currently unused — Compose will create it but nothing mounts it yet.
  glitchtip_data: