Files
familienarchiv/docker-compose.observability.yml
Marcel 0c9973fdff
All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m22s
CI / OCR Service Tests (pull_request) Successful in 16s
CI / Backend Unit Tests (pull_request) Successful in 4m40s
CI / fail2ban Regex (pull_request) Successful in 39s
CI / Compose Bucket Idempotency (pull_request) Successful in 57s
devops(observability): add Prometheus + Node Exporter + cAdvisor for host and container metrics
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 01:47:07 +02:00

108 lines
3.1 KiB
YAML

# Observability stack — Grafana LGTM + GlitchTip
# (currently defined: Prometheus, Node Exporter, cAdvisor; the remaining
# services are placeholders further down in this file)
#
# Requires the main stack to be running first:
#   docker compose up -d    # creates archiv-net
#   docker compose -f docker-compose.observability.yml up -d
#
# To validate without starting:
#   docker compose -f docker-compose.observability.yml config
services:
  # --- Metrics: Prometheus ---
  # Scrapes and stores metrics. Attached to archiv-net (to reach the
  # application containers) and to obs-net (to reach the exporters below).
  prometheus:
    image: prom/prometheus:v3.4.0
    container_name: obs-prometheus
    restart: unless-stopped
    volumes:
      # Scrape configuration, mounted read-only from the repository.
      - ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # Named volume backing the TSDB path configured below.
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # Enables the HTTP lifecycle endpoints (POST /-/reload and /-/quit).
      - '--web.enable-lifecycle'
    ports:
      # Host port is overridable via PORT_PROMETHEUS (default 9090).
      # NOTE(review): this publishes Prometheus on every host interface while
      # the lifecycle endpoints above are enabled; consider binding to
      # 127.0.0.1 if remote access is not required.
      - "${PORT_PROMETHEUS:-9090}:9090"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3
    networks:
      - archiv-net
      - obs-net

  # --- Metrics: Node Exporter (host metrics) ---
  node-exporter:
    image: prom/node-exporter:v1.9.0
    container_name: obs-node-exporter
    restart: unless-stopped
    # pid: host — required for process-level CPU/memory metrics; cgroup isolation applies
    pid: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Tell the exporter where the host root filesystem is mounted (see the
      # /:/rootfs volume above); without this flag that mount is unused.
      - '--path.rootfs=/rootfs'
      # mount-points-exclude replaces the deprecated ignored-mount-points flag
      # (renamed in node_exporter 1.3.0).
      # $$ is Compose's interpolation escape for a literal $ in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    # Only reachable from other containers on obs-net; not published on the host.
    expose:
      - "9100"
    networks:
      - obs-net

  # --- Metrics: cAdvisor (container metrics) ---
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.52.1
    container_name: obs-cadvisor
    restart: unless-stopped
    # privileged: true — required for cgroup and namespace metrics, see cAdvisor docs.
    # Accepted risk: cAdvisor is pinned, on Renovate, and not exposed outside obs-net.
    privileged: true
    volumes:
      - /:/rootfs:ro
      # /var/run/docker.sock mounted read-only — sufficient for container metadata discovery.
      # NOTE(review): a read-only bind mount of the socket does not by itself
      # limit which Docker API calls can be made through it — confirm this is
      # covered by the accepted risk above.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
    # Only reachable from other containers on obs-net; not published on the host.
    expose:
      - "8080"
    networks:
      - obs-net

  # --- Logs: Loki + Promtail ---
  # loki: (see issue #574)
  # promtail: (see issue #575)
  #
  # --- Traces: Tempo ---
  # tempo: (see future issue)
  #
  # --- Dashboards: Grafana ---
  # grafana: (see future issue)
  #
  # --- Error Tracking: GlitchTip ---
  # glitchtip: (see future issue)
networks:
  # archiv-net is owned by the main docker-compose.yml and is only joined
  # here, never created. Sharing it lets Prometheus scrape archive-backend by
  # container name. Because the network is declared external, bringing this
  # stack up before the main stack has created it fails with a clear error.
  archiv-net:
    name: archiv-net
    external: true

  # obs-net carries traffic between the observability services themselves
  # (e.g. Grafana → Prometheus, Grafana → Loki, Grafana → Tempo).
  obs-net:
    driver: bridge
volumes:
  # Mounted by the prometheus service at /prometheus (TSDB storage).
  prometheus_data: {}

  # Not referenced by any service in this file yet; reserved for the
  # Loki, Tempo, Grafana and GlitchTip services listed as placeholders
  # under `services`.
  loki_data: {}
  tempo_data: {}
  grafana_data: {}
  glitchtip_data: {}