GlitchTip 6.x moved its internal listen port from 8080 to 8000. The ports
mapping was still forwarding to the old port, so host traffic never reached
the app, and the healthcheck probed 8080 with wget, which is not present in
the image, leaving the container permanently unhealthy. Fix: map to port 8000
and probe with bash's built-in /dev/tcp (no external tools needed; bash ships
with the Python base image).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
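A quick way to verify the new probe by hand once the container is up (same
/dev/tcp trick the healthcheck uses; container name as defined in the file):

    docker exec obs-glitchtip bash -c 'echo > /dev/tcp/localhost/8000 && echo listening'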
# Observability stack — Grafana LGTM + GlitchTip
#
# Requires the main stack to be running first:
#   docker compose up -d                                       # creates archiv-net
#   docker compose -f docker-compose.observability.yml up -d
#
# To validate without starting:
#   docker compose -f docker-compose.observability.yml config
services:

  # --- Metrics: Prometheus ---

  prometheus:
    image: prom/prometheus:v3.4.0
    container_name: obs-prometheus
    restart: unless-stopped
    volumes:
      - ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
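      # --web.enable-lifecycle allows reloading the Prometheus config at runtime
      # without a restart, e.g.: curl -X POST http://127.0.0.1:9090/-/reload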
    ports:
      - "127.0.0.1:${PORT_PROMETHEUS:-9090}:9090"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3
    networks:
      - archiv-net
      - obs-net

  node-exporter:
    image: prom/node-exporter:v1.9.0
    container_name: obs-node-exporter
    restart: unless-stopped
    # pid: host — required for process-level CPU/memory metrics; cgroup isolation applies
    pid: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ is Compose interpolation escaping for a literal $ in the regex alternation;
      # mount-points-exclude replaces the deprecated ignored-mount-points flag.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    expose:
      - "9100"
    networks:
      - obs-net

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.52.1
    container_name: obs-cadvisor
    restart: unless-stopped
    # privileged: true — required for cgroup and namespace metrics, see cAdvisor docs.
    # Accepted risk: cAdvisor is pinned, on Renovate, and not exposed outside obs-net.
    privileged: true
    volumes:
      - /:/rootfs:ro
      # /var/run/docker.sock mounted read-only — sufficient for container metadata discovery
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
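    # cAdvisor serves its web UI and the Prometheus /metrics endpoint on 8080 (obs-net only).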
    expose:
      - "8080"
    networks:
      - obs-net

  # --- Logs: Loki + Promtail ---

  loki:
    image: grafana/loki:3.4.2
    container_name: obs-loki
    restart: unless-stopped
    volumes:
      - ./infra/observability/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
      - loki_data:/loki
    command: -config.file=/etc/loki/loki-config.yml
    expose:
      - "3100"
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready | grep -q ready || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    networks:
      - obs-net

  promtail:
    image: grafana/promtail:3.4.2
    container_name: obs-promtail
    restart: unless-stopped
    volumes:
      - ./infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # :ro restricts file-system access but NOT Docker API permissions — a compromised
      # Promtail has full daemon access. Accepted risk on a single-operator,
      # self-hosted archive.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - promtail_positions:/tmp  # persists positions.yaml across restarts — avoids duplicate log ingestion
    command: -config.file=/etc/promtail/promtail-config.yml
    networks:
      - archiv-net  # label discovery from application containers via Docker socket
      - obs-net     # log shipping to Loki
    depends_on:
      loki:
        condition: service_healthy

  # --- Traces: Tempo ---

  tempo:
    image: grafana/tempo:2.7.2
    container_name: obs-tempo
    restart: unless-stopped
    volumes:
      - ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
      - tempo_data:/var/tempo
    command: -config.file=/etc/tempo.yml
    expose:
      - "3200"  # Grafana queries Tempo on this port (obs-net only)
      - "4317"  # OTLP gRPC — backend sends traces here (archiv-net)
      - "4318"  # OTLP HTTP — alternative transport (archiv-net)
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 15s
    networks:
      - archiv-net  # backend (archive-backend) reaches tempo:4317 over this network
      - obs-net     # Grafana reaches tempo:3200 over this network

  # --- Dashboards: Grafana ---

  obs-grafana:
    image: grafana/grafana-oss:11.6.1
    container_name: obs-grafana
    restart: unless-stopped
    ports:
      - "127.0.0.1:${PORT_GRAFANA:-3003}:3000"
    environment:
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
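      # Set GRAFANA_ADMIN_PASSWORD in .env; "changeme" is only a last-resort default.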
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL:-http://localhost:3003}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health | grep -q ok || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
      prometheus:
        condition: service_healthy
      loki:
        condition: service_healthy
      tempo:
        condition: service_healthy
    networks:
      - obs-net

  # --- Error Tracking: GlitchTip ---

  obs-redis:
    image: redis:7-alpine
    container_name: obs-redis
    restart: unless-stopped
    volumes:
      - glitchtip_data:/data
    expose:
      - "6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - obs-net

  obs-glitchtip:
    image: glitchtip/glitchtip:6.1.6
    container_name: obs-glitchtip
    restart: unless-stopped
    depends_on:
      obs-redis:
        condition: service_healthy
      obs-glitchtip-db-init:
        condition: service_completed_successfully
    environment:
      DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST:-archive-db}:5432/glitchtip
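      # Reuses the main stack's Postgres (archive-db); the glitchtip database itself
      # is created by obs-glitchtip-db-init below.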
      REDIS_URL: redis://obs-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:3002}
      DEFAULT_FROM_EMAIL: ${APP_MAIL_FROM:-noreply@familienarchiv.local}
      EMAIL_URL: smtp://mailpit:1025
      GLITCHTIP_MAX_EVENT_LIFE_DAYS: 90
    ports:
      - "127.0.0.1:${PORT_GLITCHTIP:-3002}:8000"
    healthcheck:
      test: ["CMD", "bash", "-c", "echo > /dev/tcp/localhost/8000"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
    networks:
      - archiv-net
      - obs-net

  obs-glitchtip-worker:
    image: glitchtip/glitchtip:6.1.6
    container_name: obs-glitchtip-worker
    restart: unless-stopped
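    # Runs the Celery worker together with the beat scheduler (GlitchTip's bundled script).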
    command: ./bin/run-celery-with-beat.sh
    depends_on:
      obs-redis:
        condition: service_healthy
    environment:
      DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST:-archive-db}:5432/glitchtip
      REDIS_URL: redis://obs-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
    networks:
      - archiv-net
      - obs-net

  obs-glitchtip-db-init:
    image: postgres:16-alpine
    container_name: obs-glitchtip-db-init
    restart: "no"
    environment:
      PGPASSWORD: ${POSTGRES_PASSWORD}
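    # Idempotent one-shot: creates the glitchtip database only if it does not exist yet.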
    command: >
      sh -c "psql -h ${POSTGRES_HOST:-archive-db} -U ${POSTGRES_USER} -tc
      \"SELECT 1 FROM pg_database WHERE datname = 'glitchtip'\" |
      grep -q 1 ||
      psql -h ${POSTGRES_HOST:-archive-db} -U ${POSTGRES_USER} -c \"CREATE DATABASE glitchtip;\""
    networks:
      - archiv-net

networks:
  # Shared network created by the main docker-compose.yml.
  # The observability stack joins as a peer so Prometheus can scrape
  # archive-backend by container name. The observability stack must NOT
  # attempt to create this network — it will fail with a clear error if
  # the main stack is not running yet.
  archiv-net:
    external: true

  # Internal network for observability service-to-service traffic
  # (e.g. Grafana → Prometheus, Grafana → Loki, Grafana → Tempo).
  obs-net:
    driver: bridge

volumes:
  prometheus_data:
  loki_data:
  promtail_positions:
  tempo_data:
  grafana_data:
  glitchtip_data: