familienarchiv/docker-compose.observability.yml
Marcel 6145a25fe2
fix(obs): correct GlitchTip port and healthcheck for v6.x
GlitchTip 6.x moved its internal listen port from 8080 to 8000.
The ports mapping was forwarding to the wrong port (host traffic
never reached the app), and the healthcheck was probing 8080 with
wget (not present in the image), causing the container to stay
permanently unhealthy.

Fix: map to port 8000 and probe it with bash's /dev/tcp (no external
tools needed; bash is present in the Python base image).
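
For reference, the new probe can be exercised by hand once the container is
running (container name taken from this compose file):

    docker exec obs-glitchtip bash -c 'echo > /dev/tcp/localhost/8000' \
      && echo "port 8000 reachable"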

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 10:31:07 +02:00

# Observability stack — Grafana LGTM + GlitchTip
#
# Requires the main stack to be running first:
# docker compose up -d # creates archiv-net
# docker compose -f docker-compose.observability.yml up -d
#
# To validate without starting:
# docker compose -f docker-compose.observability.yml config
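#
# Environment variables referenced below (typically set in the same .env as
# the main stack; the values shown are the in-file defaults or placeholders,
# not authoritative):
#   PORT_PROMETHEUS=9090, PORT_GRAFANA=3003, PORT_GLITCHTIP=3002
#   GRAFANA_ADMIN_PASSWORD, GF_SERVER_ROOT_URL (default http://localhost:3003)
#   GLITCHTIP_SECRET_KEY, GLITCHTIP_DOMAIN (default http://localhost:3002)
#   POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST (default archive-db)
#   APP_MAIL_FROM (default noreply@familienarchiv.local)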
services:
# --- Metrics: Prometheus ---
prometheus:
image: prom/prometheus:v3.4.0
container_name: obs-prometheus
restart: unless-stopped
volumes:
- ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
ports:
- "127.0.0.1:${PORT_PROMETHEUS:-9090}:9090"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
networks:
- archiv-net
- obs-net
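# A minimal sketch of what prometheus.yml could contain for this setup
# (an assumption for illustration; the real file lives in
# ./infra/observability/prometheus/ and the archive-backend metrics port
# below is a placeholder):
#
#   global:
#     scrape_interval: 15s
#   scrape_configs:
#     - job_name: prometheus
#       static_configs: [{ targets: ['localhost:9090'] }]
#     - job_name: node
#       static_configs: [{ targets: ['node-exporter:9100'] }]
#     - job_name: cadvisor
#       static_configs: [{ targets: ['cadvisor:8080'] }]
#     - job_name: archive-backend
#       static_configs: [{ targets: ['archive-backend:8000'] }]  # port is a guess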
node-exporter:
image: prom/node-exporter:v1.9.0
container_name: obs-node-exporter
restart: unless-stopped
# pid: host is required for process-level CPU/memory metrics; the container's cgroup isolation still applies
pid: host
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
# $$ is Compose interpolation escaping for a literal $ in the regex alternation
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
expose:
- "9100"
networks:
- obs-net
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.52.1
container_name: obs-cadvisor
restart: unless-stopped
# privileged: true — required for cgroup and namespace metrics, see cAdvisor docs.
# Accepted risk: cAdvisor is pinned, on Renovate, and not exposed outside obs-net.
privileged: true
volumes:
- /:/rootfs:ro
# /var/run/docker.sock mounted read-only — sufficient for container metadata discovery
- /var/run/docker.sock:/var/run/docker.sock:ro
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
expose:
- "8080"
networks:
- obs-net
# --- Logs: Loki + Promtail ---
loki:
image: grafana/loki:3.4.2
container_name: obs-loki
restart: unless-stopped
volumes:
- ./infra/observability/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
- loki_data:/loki
command: -config.file=/etc/loki/loki-config.yml
expose:
- "3100"
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready | grep -q ready || exit 1"]
interval: 10s
timeout: 5s
retries: 5
start_period: 30s
networks:
- obs-net
promtail:
image: grafana/promtail:3.4.2
container_name: obs-promtail
restart: unless-stopped
volumes:
- ./infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
# :ro restricts file-system access but NOT Docker API permissions; a compromised
# Promtail still has full daemon access. Accepted risk on a single-operator self-hosted archive.
- /var/run/docker.sock:/var/run/docker.sock:ro
- promtail_positions:/tmp # persists positions.yaml across restarts — avoids duplicate log ingestion
command: -config.file=/etc/promtail/promtail-config.yml
networks:
- archiv-net # label discovery from application containers via Docker socket
- obs-net # log shipping to Loki
depends_on:
loki:
condition: service_healthy
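# Sketch of the promtail-config.yml shape implied by the mounts above
# (an assumption for illustration; the real file lives in
# ./infra/observability/promtail/):
#
#   positions:
#     filename: /tmp/positions.yaml        # matches the promtail_positions volume
#   clients:
#     - url: http://loki:3100/loki/api/v1/push
#   scrape_configs:
#     - job_name: docker
#       docker_sd_configs:
#         - host: unix:///var/run/docker.sock
#       relabel_configs:
#         - source_labels: ['__meta_docker_container_name']
#           regex: '/(.*)'
#           target_label: container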
# --- Traces: Tempo ---
tempo:
image: grafana/tempo:2.7.2
container_name: obs-tempo
restart: unless-stopped
volumes:
- ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
- tempo_data:/var/tempo
command: -config.file=/etc/tempo.yml
expose:
- "3200" # Grafana queries Tempo on this port (obs-net only)
- "4317" # OTLP gRPC — backend sends traces here (archiv-net)
- "4318" # OTLP HTTP — alternative transport (archiv-net)
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"]
interval: 10s
timeout: 5s
retries: 5
start_period: 15s
networks:
- archiv-net # backend (archive-backend) reaches tempo:4317 over this network
- obs-net # Grafana reaches tempo:3200 over this network
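# The backend only needs its OTLP exporter pointed at this service. With a
# stock OpenTelemetry SDK that is typically configured via the standard
# environment variables (whether the backend actually reads them is an
# assumption here):
#
#   OTEL_EXPORTER_OTLP_ENDPOINT=http://tempo:4317   # gRPC
#   OTEL_EXPORTER_OTLP_PROTOCOL=grpc
#   # or http://tempo:4318 with OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf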
# --- Dashboards: Grafana ---
obs-grafana:
image: grafana/grafana-oss:11.6.1
container_name: obs-grafana
restart: unless-stopped
ports:
- "127.0.0.1:${PORT_GRAFANA:-3003}:3000"
environment:
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
GF_USERS_ALLOW_SIGN_UP: "false"
GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL:-http://localhost:3003}
volumes:
- grafana_data:/var/lib/grafana
- ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health | grep -q ok || exit 1"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
depends_on:
prometheus:
condition: service_healthy
loki:
condition: service_healthy
tempo:
condition: service_healthy
networks:
- obs-net
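# The provisioning mount above is expected to hold a datasource file roughly
# like this (a sketch; the actual files live in
# ./infra/observability/grafana/provisioning/datasources/):
#
#   apiVersion: 1
#   datasources:
#     - name: Prometheus
#       type: prometheus
#       url: http://prometheus:9090
#     - name: Loki
#       type: loki
#       url: http://loki:3100
#     - name: Tempo
#       type: tempo
#       url: http://tempo:3200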
# --- Error Tracking: GlitchTip ---
obs-redis:
image: redis:7-alpine
container_name: obs-redis
restart: unless-stopped
volumes:
- glitchtip_data:/data
expose:
- "6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
networks:
- obs-net
obs-glitchtip:
image: glitchtip/glitchtip:6.1.6
container_name: obs-glitchtip
restart: unless-stopped
depends_on:
obs-redis:
condition: service_healthy
obs-glitchtip-db-init:
condition: service_completed_successfully
environment:
DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST:-archive-db}:5432/glitchtip
REDIS_URL: redis://obs-redis:6379/0
SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:3002}
DEFAULT_FROM_EMAIL: ${APP_MAIL_FROM:-noreply@familienarchiv.local}
EMAIL_URL: smtp://mailpit:1025
GLITCHTIP_MAX_EVENT_LIFE_DAYS: 90
ports:
- "127.0.0.1:${PORT_GLITCHTIP:-3002}:8000"
healthcheck:
test: ["CMD", "bash", "-c", "echo > /dev/tcp/localhost/8000"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
networks:
- archiv-net
- obs-net
obs-glitchtip-worker:
image: glitchtip/glitchtip:6.1.6
container_name: obs-glitchtip-worker
restart: unless-stopped
command: ./bin/run-celery-with-beat.sh
depends_on:
obs-redis:
condition: service_healthy
environment:
DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST:-archive-db}:5432/glitchtip
REDIS_URL: redis://obs-redis:6379/0
SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
networks:
- archiv-net
- obs-net
obs-glitchtip-db-init:
image: postgres:16-alpine
container_name: obs-glitchtip-db-init
restart: "no"
environment:
PGPASSWORD: ${POSTGRES_PASSWORD}
command: >
sh -c "psql -h ${POSTGRES_HOST:-archive-db} -U ${POSTGRES_USER} -tc
\"SELECT 1 FROM pg_database WHERE datname = 'glitchtip'\" |
grep -q 1 ||
psql -h ${POSTGRES_HOST:-archive-db} -U ${POSTGRES_USER} -c \"CREATE DATABASE glitchtip;\""
networks:
- archiv-net
networks:
# Shared network created by the main docker-compose.yml.
# The observability stack joins as a peer so Prometheus can scrape
# archive-backend by container name. The observability stack must NOT
# attempt to create this network — it will fail with a clear error if
# the main stack is not running yet.
archiv-net:
external: true
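# Quick pre-flight check: `docker network inspect archiv-net` succeeds only
# after the main stack has created the network.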
# Internal network for observability-service-to-service traffic
# (e.g. Grafana → Prometheus, Grafana → Loki, Grafana → Tempo).
obs-net:
driver: bridge
volumes:
prometheus_data:
loki_data:
promtail_positions:
tempo_data:
grafana_data:
glitchtip_data: