All checks were successful
CI / Unit & Component Tests (pull_request) Successful in 3m22s
CI / OCR Service Tests (pull_request) Successful in 16s
CI / Backend Unit Tests (pull_request) Successful in 4m33s
CI / fail2ban Regex (pull_request) Successful in 38s
CI / Compose Bucket Idempotency (pull_request) Successful in 56s
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
144 lines
4.5 KiB
YAML
144 lines
4.5 KiB
YAML
# Observability stack — Grafana LGTM + GlitchTip
#
# Requires the main stack to be running first:
#   docker compose up -d                                       # creates archiv-net
#   docker compose -f docker-compose.observability.yml up -d
#
# To validate without starting:
#   docker compose -f docker-compose.observability.yml config
|
|
|
|
services:
|
|
|
|
# --- Metrics: Prometheus ---
|
|
|
|
prometheus:
  # Metrics store. The scrape configuration is bind-mounted read-only and the
  # TSDB lives in the prometheus_data named volume.
  container_name: obs-prometheus
  image: 'prom/prometheus:v3.4.0'
  restart: unless-stopped
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--storage.tsdb.retention.time=30d'
    - '--web.enable-lifecycle'
  volumes:
    - './infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro'
    - 'prometheus_data:/prometheus'
  ports:
    # Published on the host loopback interface only; the host port can be
    # overridden with PORT_PROMETHEUS (defaults to 9090).
    - '127.0.0.1:${PORT_PROMETHEUS:-9090}:9090'
  networks:
    - archiv-net
    - obs-net
  healthcheck:
    # Exec form (no shell): wget fetches the /-/healthy endpoint.
    test:
      - CMD
      - wget
      - '-qO-'
      - 'http://localhost:9090/-/healthy'
    interval: 30s
    timeout: 5s
    retries: 3
|
|
|
|
node-exporter:
  # Host metrics exporter. It reads the host's /proc, /sys and / through the
  # read-only bind mounts below and is reachable only on obs-net (port 9100 is
  # exposed, not published).
  image: prom/node-exporter:v1.9.0
  container_name: obs-node-exporter
  restart: unless-stopped
  # pid: host — required for process-level CPU/memory metrics; cgroup isolation applies
  pid: host
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
    # Must match the container-side path of the host root bind mount above, so
    # the filesystem collector stats the host's mount points rather than the
    # container's own.
    - '--path.rootfs=/rootfs'
    # mount-points-exclude supersedes the deprecated ignored-mount-points flag.
    # $$ is Compose interpolation escaping for a literal $ in the regex alternation.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  expose:
    - "9100"
  networks:
    - obs-net
|
|
|
|
cadvisor:
  # Container metrics exporter. Port 8080 is exposed to obs-net only and is
  # never published on the host.
  container_name: obs-cadvisor
  image: 'gcr.io/cadvisor/cadvisor:v0.52.1'
  restart: unless-stopped
  # Runs privileged because cgroup and namespace metrics require it (see the
  # cAdvisor docs). Accepted risk: the image is pinned, tracked by Renovate,
  # and not exposed outside obs-net.
  privileged: true
  networks:
    - obs-net
  expose:
    - '8080'
  volumes:
    - '/:/rootfs:ro'
    # Docker socket mounted read-only — sufficient for container metadata discovery.
    - '/var/run/docker.sock:/var/run/docker.sock:ro'
    - '/sys:/sys:ro'
    - '/var/lib/docker:/var/lib/docker:ro'
|
|
|
|
# --- Logs: Loki + Promtail ---
|
|
|
|
loki:
  # Log store. Configuration is bind-mounted read-only; data persists in the
  # loki_data named volume. Port 3100 is exposed to obs-net only.
  container_name: obs-loki
  image: 'grafana/loki:3.4.2'
  restart: unless-stopped
  command: '-config.file=/etc/loki/loki-config.yml'
  networks:
    - obs-net
  expose:
    - '3100'
  volumes:
    - './infra/observability/loki/loki-config.yml:/etc/loki/loki-config.yml:ro'
    - 'loki_data:/loki'
  healthcheck:
    # Healthy once the /ready endpoint returns a body containing "ready".
    # promtail waits on this via depends_on: condition: service_healthy.
    test:
      - CMD-SHELL
      - 'wget -qO- http://localhost:3100/ready | grep -q ready || exit 1'
    start_period: 30s
    interval: 10s
    timeout: 5s
    retries: 5
|
|
|
|
promtail:
  # Log shipper: reads Docker container log files and ships them to Loki.
  # Started only after the loki service reports healthy.
  container_name: obs-promtail
  image: 'grafana/promtail:3.4.2'
  restart: unless-stopped
  command: '-config.file=/etc/promtail/promtail-config.yml'
  depends_on:
    loki:
      condition: service_healthy
  networks:
    # Label discovery from application containers via Docker socket.
    - archiv-net
    # Log shipping to Loki.
    - obs-net
  volumes:
    - './infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro'
    - '/var/lib/docker/containers:/var/lib/docker/containers:ro'
    # :ro restricts file-system access but NOT Docker API permissions — a
    # compromised Promtail has full daemon access. Accepted risk on a
    # single-operator self-hosted archive.
    - '/var/run/docker.sock:/var/run/docker.sock:ro'
    # Persists positions.yaml across restarts — avoids duplicate log ingestion.
    - 'promtail_positions:/tmp'
|
|
|
|
# --- Traces: Tempo ---
# tempo: (see future issue)
#
# --- Dashboards: Grafana ---
# grafana: (see future issue)
#
# --- Error Tracking: GlitchTip ---
# glitchtip: (see future issue)
|
|
|
|
networks:
  # archiv-net is created by the main docker-compose.yml. This stack joins it
  # as a peer so that Prometheus can scrape archive-backend by container name.
  # It is declared external so that this stack never attempts to create it;
  # if the main stack is not running yet, startup fails with a clear error.
  archiv-net:
    external: true

  # obs-net is the internal network for traffic between the observability
  # services themselves (e.g. Grafana → Prometheus, Grafana → Loki,
  # Grafana → Tempo).
  obs-net:
    driver: bridge
|
|
|
|
volumes:
  # Named volumes mounted by the services defined in this file. An empty
  # mapping means "default driver, default options".
  prometheus_data: {}
  loki_data: {}
  promtail_positions: {}

  # Not mounted by any service currently defined here; presumably reserved for
  # the Tempo, Grafana and GlitchTip services that are still commented out.
  tempo_data: {}
  grafana_data: {}
  glitchtip_data: {}
|