diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml
index 461f2905..216e46f6 100644
--- a/docker-compose.observability.yml
+++ b/docker-compose.observability.yml
@@ -7,24 +7,87 @@
 # To validate without starting:
 #   docker compose -f docker-compose.observability.yml config
 
-# No services defined yet — added in subsequent issues:
-#
-# --- Metrics: Prometheus ---
-#   prometheus: (see issue #573)
-#
-# --- Logs: Loki + Promtail ---
-#   loki: (see issue #574)
-#   promtail: (see issue #575)
-#
-# --- Traces: Tempo ---
-#   tempo: (see future issue)
-#
-# --- Dashboards: Grafana ---
-#   grafana: (see future issue)
-#
-# --- Error Tracking: GlitchTip ---
-#   glitchtip: (see future issue)
-services: {}
+services:
+
+  # --- Metrics: Prometheus ---
+
+  prometheus:
+    image: prom/prometheus:v3.4.0
+    container_name: obs-prometheus
+    restart: unless-stopped
+    volumes:
+      - ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      # NOTE(review): named volume — confirm a top-level `volumes:` entry declares prometheus_data.
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=30d'
+      - '--web.enable-lifecycle'
+    ports:
+      - "127.0.0.1:${PORT_PROMETHEUS:-9090}:9090"
+    healthcheck:
+      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    networks:
+      - archiv-net
+      - obs-net
+
+  node-exporter:
+    image: prom/node-exporter:v1.9.0
+    container_name: obs-node-exporter
+    restart: unless-stopped
+    # pid: host — required for process-level CPU/memory metrics; cgroup isolation applies
+    pid: host
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      # Without --path.rootfs the /rootfs mount above is unused and filesystem metrics describe the container.
+      - '--path.rootfs=/rootfs'
+      # $$ is Compose interpolation escaping for a literal $ in the regex alternation
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+    expose:
+      - "9100"
+    networks:
+      - obs-net
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:v0.52.1
+    container_name: obs-cadvisor
+    restart: unless-stopped
+    # privileged: true — required for cgroup and namespace metrics, see cAdvisor docs.
+    # Accepted risk: cAdvisor is pinned, on Renovate, and not exposed outside obs-net.
+    privileged: true
+    volumes:
+      - /:/rootfs:ro
+      # Docker socket for container metadata discovery. NOTE: ':ro' only protects the
+      # socket file itself; it does not restrict which Docker API calls are accepted.
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+    expose:
+      - "8080"
+    networks:
+      - obs-net
+
+  # --- Logs: Loki + Promtail ---
+  #   loki: (see issue #574)
+  #   promtail: (see issue #575)
+  #
+  # --- Traces: Tempo ---
+  #   tempo: (see future issue)
+  #
+  # --- Dashboards: Grafana ---
+  #   grafana: (see future issue)
+  #
+  # --- Error Tracking: GlitchTip ---
+  #   glitchtip: (see future issue)
 
 networks:
   # Shared network created by the main docker-compose.yml.
diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
index 1bfb49e1..c4df74aa 100644
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -43,6 +43,7 @@ graph TD
 - SSE notifications transit Caddy (browser → Caddy → backend); the backend is never reachable directly from the public internet. The SvelteKit SSR layer is bypassed for SSE, but Caddy is not.
 - The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy.
 - Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001).
+- An optional observability stack (Prometheus, Node Exporter, cAdvisor) runs as a separate compose file: `docker compose -f docker-compose.observability.yml up -d`. It joins `archiv-net` and scrapes the backend's management port (`:8081`). Configuration lives under `infra/observability/`.
 
 ### OCR memory requirements
 
@@ -134,6 +135,12 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
 | `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — |
 | `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — |
 
+### Observability stack (`docker-compose.observability.yml`)
+
+| Variable | Purpose | Default | Required? | Sensitive? |
+|---|---|---|---|---|
+| `PORT_PROMETHEUS` | Host port for the Prometheus UI (bound to `127.0.0.1` only) | `9090` | — | — |
+
 ---
 
 ## 3. Bootstrap from scratch
diff --git a/infra/observability/prometheus/prometheus.yml b/infra/observability/prometheus/prometheus.yml
new file mode 100644
index 00000000..38a0f8d6
--- /dev/null
+++ b/infra/observability/prometheus/prometheus.yml
@@ -0,0 +1,28 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: node
+    static_configs:
+      - targets: ['node-exporter:9100']
+
+  - job_name: cadvisor
+    static_configs:
+      - targets: ['cadvisor:8080']
+
+  - job_name: spring-boot
+    metrics_path: /actuator/prometheus
+    static_configs:
+      # Uses the Docker service name (not container_name) for reliable DNS resolution.
+      # Target will show as DOWN until backend instrumentation issue adds
+      # micrometer-registry-prometheus and exposes the endpoint — this is expected.
+      - targets: ['backend:8081']
+
+  - job_name: ocr-service
+    metrics_path: /metrics
+    static_configs:
+      # TODO: either remove this job or add prometheus-client to ocr-service.
+      # The Python OCR service does not currently expose Prometheus metrics.
+      # This target will show as DOWN until prometheus-client is added to ocr-service.
+      - targets: ['ocr:8000']