Files
familienarchiv/docker-compose.observability.yml
Marcel cd715029eb feat(observability): add GlitchTip error tracking to observability stack
Adds obs-glitchtip, obs-glitchtip-worker, obs-redis, and obs-glitchtip-db-init
services to docker-compose.observability.yml. The one-shot db-init container
creates the dedicated glitchtip database on the existing archive-db PostgreSQL
instance automatically on first stack start.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 04:38:06 +02:00

260 lines
7.9 KiB
YAML

# Observability stack — Grafana LGTM + GlitchTip
#
# Requires the main stack to be running first:
# docker compose up -d # creates archiv-net
# docker compose -f docker-compose.observability.yml up -d
#
# To validate without starting:
# docker compose -f docker-compose.observability.yml config
services:
# --- Metrics: Prometheus ---
prometheus:
image: prom/prometheus:v3.4.0
container_name: obs-prometheus
restart: unless-stopped
volumes:
- ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
ports:
- "127.0.0.1:${PORT_PROMETHEUS:-9090}:9090"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
networks:
- archiv-net
- obs-net
node-exporter:
image: prom/node-exporter:v1.9.0
container_name: obs-node-exporter
restart: unless-stopped
# pid: host — required for process-level CPU/memory metrics; cgroup isolation applies
pid: host
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
# $$ is YAML Compose escaping for a literal $ in the regex alternation
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
expose:
- "9100"
networks:
- obs-net
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.52.1
container_name: obs-cadvisor
restart: unless-stopped
# privileged: true — required for cgroup and namespace metrics, see cAdvisor docs.
# Accepted risk: cAdvisor is pinned, on Renovate, and not exposed outside obs-net.
privileged: true
volumes:
- /:/rootfs:ro
# /var/run/docker.sock mounted read-only — sufficient for container metadata discovery
- /var/run/docker.sock:/var/run/docker.sock:ro
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
expose:
- "8080"
networks:
- obs-net
# --- Logs: Loki + Promtail ---
loki:
image: grafana/loki:3.4.2
container_name: obs-loki
restart: unless-stopped
volumes:
- ./infra/observability/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
- loki_data:/loki
command: -config.file=/etc/loki/loki-config.yml
expose:
- "3100"
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready | grep -q ready || exit 1"]
interval: 10s
timeout: 5s
retries: 5
start_period: 30s
networks:
- obs-net
promtail:
image: grafana/promtail:3.4.2
container_name: obs-promtail
restart: unless-stopped
volumes:
- ./infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
# :ro restricts file-system access but NOT Docker API permissions — a compromised Promtail has full daemon access. Accepted risk on single-operator self-hosted archive.
- /var/run/docker.sock:/var/run/docker.sock:ro
- promtail_positions:/tmp # persists positions.yaml across restarts — avoids duplicate log ingestion
command: -config.file=/etc/promtail/promtail-config.yml
networks:
- archiv-net # label discovery from application containers via Docker socket
- obs-net # log shipping to Loki
depends_on:
loki:
condition: service_healthy
# --- Traces: Tempo ---
tempo:
image: grafana/tempo:2.7.2
container_name: obs-tempo
restart: unless-stopped
volumes:
- ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
- tempo_data:/var/tempo
command: -config.file=/etc/tempo.yml
expose:
- "3200" # Grafana queries Tempo on this port (obs-net only)
- "4317" # OTLP gRPC — backend sends traces here (archiv-net)
- "4318" # OTLP HTTP — alternative transport (archiv-net)
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"]
interval: 10s
timeout: 5s
retries: 5
start_period: 15s
networks:
- archiv-net # backend (archive-backend) reaches tempo:4317 over this network
- obs-net # Grafana reaches tempo:3200 over this network
# --- Dashboards: Grafana ---
obs-grafana:
image: grafana/grafana-oss:11.6.1
container_name: obs-grafana
restart: unless-stopped
ports:
- "127.0.0.1:${PORT_GRAFANA:-3001}:3000"
environment:
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
GF_USERS_ALLOW_SIGN_UP: "false"
volumes:
- grafana_data:/var/lib/grafana
- ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health | grep -q ok || exit 1"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
depends_on:
prometheus:
condition: service_healthy
loki:
condition: service_healthy
tempo:
condition: service_healthy
networks:
- obs-net
# --- Error Tracking: GlitchTip ---
obs-redis:
image: redis:7-alpine
container_name: obs-redis
restart: unless-stopped
volumes:
- glitchtip_data:/data
expose:
- "6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
networks:
- obs-net
obs-glitchtip:
image: glitchtip/glitchtip:v4
container_name: obs-glitchtip
restart: unless-stopped
depends_on:
obs-redis:
condition: service_healthy
obs-glitchtip-db-init:
condition: service_completed_successfully
environment:
DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@archive-db:5432/glitchtip
REDIS_URL: redis://obs-redis:6379/0
SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:3002}
DEFAULT_FROM_EMAIL: ${APP_MAIL_FROM:-noreply@familienarchiv.local}
EMAIL_URL: smtp://mailpit:1025
GLITCHTIP_MAX_EVENT_LIFE_DAYS: 90
ports:
- "127.0.0.1:${PORT_GLITCHTIP:-3002}:8080"
networks:
- archiv-net
- obs-net
obs-glitchtip-worker:
image: glitchtip/glitchtip:v4
container_name: obs-glitchtip-worker
restart: unless-stopped
command: ./bin/run-celery-with-beat.sh
depends_on:
obs-redis:
condition: service_healthy
environment:
DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@archive-db:5432/glitchtip
REDIS_URL: redis://obs-redis:6379/0
SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
networks:
- archiv-net
- obs-net
obs-glitchtip-db-init:
image: postgres:16-alpine
container_name: obs-glitchtip-db-init
restart: "no"
environment:
PGPASSWORD: ${POSTGRES_PASSWORD}
command: >
sh -c "psql -h archive-db -U ${POSTGRES_USER} -tc
\"SELECT 1 FROM pg_database WHERE datname = 'glitchtip'\" |
grep -q 1 ||
psql -h archive-db -U ${POSTGRES_USER} -c \"CREATE DATABASE glitchtip;\""
networks:
- archiv-net
networks:
# Shared network created by the main docker-compose.yml.
# The observability stack joins as a peer so Prometheus can scrape
# archive-backend by container name. The observability stack must NOT
# attempt to create this network — it will fail with a clear error if
# the main stack is not running yet.
archiv-net:
external: true
# Internal network for observability-service-to-service traffic
# (e.g. Grafana → Prometheus, Grafana → Loki, Grafana → Tempo).
obs-net:
driver: bridge
volumes:
prometheus_data:
loki_data:
promtail_positions:
tempo_data:
grafana_data:
glitchtip_data: