devops(observability): add Prometheus + Node Exporter + cAdvisor for host and container metrics #585
@@ -7,24 +7,83 @@
|
|||||||
# To validate without starting:
|
# To validate without starting:
|
||||||
# docker compose -f docker-compose.observability.yml config
|
# docker compose -f docker-compose.observability.yml config
|
||||||
|
|
||||||
# No services defined yet — added in subsequent issues:
|
services:
|
||||||
#
|
|
||||||
# --- Metrics: Prometheus ---
|
# --- Metrics: Prometheus ---
|
||||||
# prometheus: (see issue #573)
|
|
||||||
#
|
prometheus:
|
||||||
# --- Logs: Loki + Promtail ---
|
image: prom/prometheus:v3.4.0
|
||||||
# loki: (see issue #574)
|
container_name: obs-prometheus
|
||||||
# promtail: (see issue #575)
|
restart: unless-stopped
|
||||||
#
|
volumes:
|
||||||
# --- Traces: Tempo ---
|
- ./infra/observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
# tempo: (see future issue)
|
- prometheus_data:/prometheus
|
||||||
#
|
command:
|
||||||
# --- Dashboards: Grafana ---
|
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||||
# grafana: (see future issue)
|
- '--storage.tsdb.path=/prometheus'
|
||||||
#
|
- '--storage.tsdb.retention.time=30d'
|
||||||
# --- Error Tracking: GlitchTip ---
|
- '--web.enable-lifecycle'
|
||||||
# glitchtip: (see future issue)
|
ports:
|
||||||
services: {}
|
- "127.0.0.1:${PORT_PROMETHEUS:-9090}:9090"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
networks:
|
||||||
|
- archiv-net
|
||||||
|
- obs-net
|
||||||
|
|
||||||
|
node-exporter:
  # Host metrics exporter; scraped by the `node` job at node-exporter:9100.
  image: prom/node-exporter:v1.9.0
  container_name: obs-node-exporter
  restart: unless-stopped
  # Share the host PID namespace, as the node_exporter Docker instructions
  # recommend, so the mounted /proc describes the host rather than the container.
  pid: host
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
    # Tell the exporter where the host root is mounted; without this the
    # filesystem collector stats paths inside the container instead of the host.
    - '--path.rootfs=/rootfs'
    # mount-points-exclude is the current name of the deprecated
    # ignored-mount-points flag (renamed in node_exporter 1.3.0).
    # $$ is Docker Compose's interpolation escape: it yields a literal $ in the regex.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  # Container-internal port only; nothing is published on the host.
  expose:
    - "9100"
  networks:
    - obs-net
|
||||||
|
|
||||||
|
cadvisor:
  # Container metrics exporter; scraped by the `cadvisor` job at cadvisor:8080.
  container_name: obs-cadvisor
  image: gcr.io/cadvisor/cadvisor:v0.52.1
  restart: unless-stopped
  # Privileged mode is needed for cgroup and namespace metrics (see cAdvisor docs).
  # Accepted risk: the image is version-pinned, tracked by Renovate, and
  # reachable only on obs-net (no host port is published).
  privileged: true
  networks:
    - obs-net
  expose:
    - '8080'
  volumes:
    - /:/rootfs:ro
    - /sys:/sys:ro
    - /var/lib/docker:/var/lib/docker:ro
    # Docker socket, used for container metadata discovery.
    # NOTE(review): `:ro` makes the bind mount read-only but does not restrict
    # Docker API calls made over the socket — confirm this matches the accepted risk.
    - /var/run/docker.sock:/var/run/docker.sock:ro
|
||||||
|
|
||||||
|
# --- Logs: Loki + Promtail ---
|
||||||
|
# loki: (see issue #574)
|
||||||
|
# promtail: (see issue #575)
|
||||||
|
#
|
||||||
|
# --- Traces: Tempo ---
|
||||||
|
# tempo: (see future issue)
|
||||||
|
#
|
||||||
|
# --- Dashboards: Grafana ---
|
||||||
|
# grafana: (see future issue)
|
||||||
|
#
|
||||||
|
# --- Error Tracking: GlitchTip ---
|
||||||
|
# glitchtip: (see future issue)
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
# Shared network created by the main docker-compose.yml.
|
# Shared network created by the main docker-compose.yml.
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ graph TD
|
|||||||
- SSE notifications transit Caddy (browser → Caddy → backend); the backend is never reachable directly from the public internet. The SvelteKit SSR layer is bypassed for SSE, but Caddy is not.
|
- SSE notifications transit Caddy (browser → Caddy → backend); the backend is never reachable directly from the public internet. The SvelteKit SSR layer is bypassed for SSE, but Caddy is not.
|
||||||
- The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy.
|
- The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy.
|
||||||
- Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001).
|
- Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001).
|
||||||
|
- An optional observability stack (Prometheus, Node Exporter, cAdvisor) is defined in a separate compose file and started with `docker compose -f docker-compose.observability.yml up -d`. Prometheus joins `archiv-net` and scrapes the backend's management port (`:8081`) over the docker network; Node Exporter and cAdvisor are attached only to `obs-net`. Configuration lives under `infra/observability/`.
|
||||||
|
|
||||||
### OCR memory requirements
|
### OCR memory requirements
|
||||||
|
|
||||||
@@ -134,6 +135,12 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
|
|||||||
| `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — |
|
| `BLLA_MODEL_PATH` | Kraken baseline layout analysis model path | `/app/models/blla.mlmodel` | — | — |
|
||||||
| `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — |
|
| `OCR_MEM_LIMIT` | Container memory cap for ocr-service in `docker-compose.prod.yml`. Set to `6g` on CX32 hosts; leave unset on CX42+ to use the 12g default | `12g` (prod compose default) | — | — |
|
||||||
|
|
||||||
|
### Observability stack (`docker-compose.observability.yml`)
|
||||||
|
|
||||||
|
| Variable | Purpose | Default | Required? | Sensitive? |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| `PORT_PROMETHEUS` | Host port for the Prometheus UI (bound to `127.0.0.1` only) | `9090` | — | — |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 3. Bootstrap from scratch
|
## 3. Bootstrap from scratch
|
||||||
|
|||||||
28
infra/observability/prometheus/prometheus.yml
Normal file
28
infra/observability/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
global:
  # Scrape targets and evaluate rules on the same 15-second cadence.
  evaluation_interval: 15s
  scrape_interval: 15s

scrape_configs:
  # Host metrics from the node-exporter service.
  - job_name: node
    static_configs:
      - targets:
          - 'node-exporter:9100'

  # Container metrics from the cadvisor service.
  - job_name: cadvisor
    static_configs:
      - targets:
          - 'cadvisor:8080'

  # Backend metrics via the Spring Boot actuator endpoint.
  - job_name: spring-boot
    metrics_path: /actuator/prometheus
    static_configs:
      # The target is the Docker service name, not the container_name, so that
      # DNS resolution is reliable.
      # Expected to report DOWN until the backend instrumentation issue adds
      # micrometer-registry-prometheus and exposes this endpoint.
      - targets:
          - 'backend:8081'

  # Python OCR service.
  - job_name: ocr-service
    metrics_path: /metrics
    static_configs:
      # TODO: either add prometheus-client to ocr-service or remove this job.
      # The OCR service does not expose Prometheus metrics yet, so this target
      # reports DOWN until prometheus-client is added.
      - targets:
          - 'ocr:8000'
|
||||||
Reference in New Issue
Block a user