diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml
index 82025222..1e7393e5 100644
--- a/docker-compose.observability.yml
+++ b/docker-compose.observability.yml
@@ -112,8 +112,29 @@ services:
         condition: service_healthy
 
   # --- Traces: Tempo ---
-  # tempo: (see future issue)
-  #
+
+  tempo:
+    image: grafana/tempo:2.7.2
+    container_name: obs-tempo
+    restart: unless-stopped
+    volumes:
+      - ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
+      - tempo_data:/var/tempo  # NOTE(review): tempo_data is not declared in this hunk — confirm it exists under top-level volumes:
+    command: -config.file=/etc/tempo.yml
+    expose:
+      - "3200"  # Tempo HTTP/query API — Grafana connects here via obs-net
+      - "4317"  # OTLP gRPC — backend sends traces here via archiv-net
+      - "4318"  # OTLP HTTP — alternative transport, also via archiv-net
+    healthcheck:
+      test: ["CMD-SHELL", "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 15s
+    networks:  # no per-port filtering: all exposed ports are reachable from BOTH networks
+      - archiv-net  # backend (archive-backend) reaches tempo:4317 over this network
+      - obs-net     # Grafana reaches tempo:3200 over this network
+
   # --- Dashboards: Grafana ---
   # grafana: (see future issue)
   #
diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
index 4eb12fbd..8b96b3fd 100644
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -281,6 +281,7 @@ Current services:
 | `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics |
 | `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). |
 | `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, and `compose_project` labels |
+| `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP receivers on ports 4317 (gRPC) and 4318 (HTTP), used by the backend over `archiv-net`; query API on port 3200, used by Grafana over `obs-net`. All ports are `expose`-only (not host-bound), but Tempo is attached to both networks, so every port is reachable from either one. |
 
 **Loki quick checks** (after ~60 s, run from inside the `obs-loki` container):
 
diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml
index 56968766..16bf5c5f 100644
--- a/docs/architecture/c4/l2-containers.puml
+++ b/docs/architecture/c4/l2-containers.puml
@@ -21,7 +21,8 @@ System_Boundary(observability, "Observability Stack (docker-compose.observabilit
     Container(prometheus, "Prometheus", "prom/prometheus", "Scrapes metrics from backend management port 8081 (/actuator/prometheus). Retention and alert rules TBD — see issue #581.")
     Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.")
     Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD")
-    Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki. Wiring TBD — see issue #581.")
+    Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP receivers on ports 4317 (gRPC) and 4318 (HTTP), used by the backend over archiv-net. Query API on port 3200, used by Grafana over obs-net. No ports published to the host.")
+    Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki + Tempo. Wiring TBD — see issue #581.")
 }
 
 Rel(user, caddy, "HTTPS", "TLS 1.2/1.3")
@@ -36,5 +37,6 @@ Rel(backend, mail, "Sends notification and password-reset emails (optional)", "S
 Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
 Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
 Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
+Rel(backend, tempo, "Sends distributed traces via OTLP", "gRPC / OTLP / port 4317 (archiv-net)")
 
 @enduml
diff --git a/infra/observability/tempo/tempo.yml b/infra/observability/tempo/tempo.yml
new file mode 100644
index 00000000..f09a28b6
--- /dev/null
+++ b/infra/observability/tempo/tempo.yml
@@ -0,0 +1,55 @@
+# SECURITY: every Tempo endpoint here is unauthenticated — the HTTP/query API
+# (3200) and both OTLP receivers (4317 gRPC, 4318 HTTP). The container joins
+# archiv-net and obs-net, and Docker networks do not filter by port, so any
+# container on either network can reach all three ports. Intended callers are
+# Grafana (3200, via obs-net) and the backend (4317/4318, via archiv-net).
+server:
+  http_listen_port: 3200
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+ingester:
+  max_block_duration: 5m
+
+compactor:
+  compaction:
+    # 720h = 30 days, chosen to match Loki retention. The compactor enforces
+    # this automatically; no manual cleanup is needed under normal trace volumes.
+    block_retention: 720h
+
+storage:
+  trace:
+    # Local filesystem storage — single-VPS deployment, no S3 backend needed.
+    # Blocks and WAL both live under /var/tempo, i.e. on the same named Docker
+    # volume (tempo_data), so they survive container restarts together.
+    backend: local
+    local:
+      path: /var/tempo/blocks
+    wal:
+      path: /var/tempo/wal
+
+metrics_generator:
+  registry:
+    external_labels:
+      source: tempo
+  storage:
+    path: /var/tempo/generator/wal
+    # NOTE(review): no remote_write target is set here, so the generated
+    # metrics presumably are not shipped to Prometheus yet — confirm whether
+    # that wiring is planned.
+
+# Processors are enabled through overrides, not under metrics_generator itself:
+# that block has no `processors` key, and an unknown key can make Tempo reject the config.
+overrides:
+  defaults:
+    metrics_generator:
+      processors:
+        - service-graphs
+        - span-metrics