Merge pull request 'devops(observability): add Tempo for distributed trace storage (OTLP receiver)' (#587) from feat/issue-575-tempo into main
All checks were successful
All checks were successful
devops(observability): add Tempo for distributed trace storage (#587)
This commit was merged in pull request #587.
This commit is contained in:
@@ -112,8 +112,29 @@ services:
|
|||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
||||||
# --- Traces: Tempo ---
|
# --- Traces: Tempo ---
|
||||||
# tempo: (see future issue)
|
|
||||||
#
|
tempo:
|
||||||
|
image: grafana/tempo:2.7.2
|
||||||
|
container_name: obs-tempo
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
|
||||||
|
- tempo_data:/var/tempo
|
||||||
|
command: -config.file=/etc/tempo.yml
|
||||||
|
expose:
|
||||||
|
- "3200" # Grafana queries Tempo on this port via obs-net (not host-published; also reachable from archiv-net, since Tempo joins both networks)
|
||||||
|
- "4317" # OTLP gRPC — backend sends traces here (archiv-net)
|
||||||
|
- "4318" # OTLP HTTP — alternative transport (archiv-net)
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 5
|
||||||
|
start_period: 15s
|
||||||
|
networks:
|
||||||
|
- archiv-net # backend (archive-backend) reaches tempo:4317 over this network
|
||||||
|
- obs-net # Grafana reaches tempo:3200 over this network
|
||||||
|
|
||||||
# --- Dashboards: Grafana ---
|
# --- Dashboards: Grafana ---
|
||||||
# grafana: (see future issue)
|
# grafana: (see future issue)
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -281,6 +281,7 @@ Current services:
|
|||||||
| `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics |
|
| `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics |
|
||||||
| `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). |
|
| `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). |
|
||||||
| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, and `compose_project` labels |
|
| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, and `compose_project` labels |
|
||||||
|
| `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP gRPC receiver on port 4317 and OTLP HTTP on port 4318 (used by the backend over `archiv-net`). Grafana queries traces on port 3200 (over `obs-net`). All ports are `expose`-only (not host-bound); because Tempo is attached to both networks, every port is reachable from either network. |
|
||||||
|
|
||||||
**Loki quick checks** (after ~60 s, run from inside the `obs-loki` container):
|
**Loki quick checks** (after ~60 s, run from inside the `obs-loki` container):
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,8 @@ System_Boundary(observability, "Observability Stack (docker-compose.observabilit
|
|||||||
Container(prometheus, "Prometheus", "prom/prometheus", "Scrapes metrics from backend management port 8081 (/actuator/prometheus). Retention and alert rules TBD — see issue #581.")
|
Container(prometheus, "Prometheus", "prom/prometheus", "Scrapes metrics from backend management port 8081 (/actuator/prometheus). Retention and alert rules TBD — see issue #581.")
|
||||||
Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.")
|
Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.")
|
||||||
Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD")
|
Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD")
|
||||||
Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki. Wiring TBD — see issue #581.")
|
Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP gRPC receiver on port 4317 (archiv-net). Grafana queries traces on port 3200 (obs-net). All ports internal only.")
|
||||||
|
Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki + Tempo. Wiring TBD — see issue #581.")
|
||||||
}
|
}
|
||||||
|
|
||||||
Rel(user, caddy, "HTTPS", "TLS 1.2/1.3")
|
Rel(user, caddy, "HTTPS", "TLS 1.2/1.3")
|
||||||
@@ -36,5 +37,6 @@ Rel(backend, mail, "Sends notification and password-reset emails (optional)", "S
|
|||||||
Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
|
Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
|
||||||
Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
|
Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
|
||||||
Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
|
Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
|
||||||
|
Rel(backend, tempo, "Sends distributed traces via OTLP", "gRPC / OTLP / port 4317 (archiv-net)")
|
||||||
|
|
||||||
@enduml
|
@enduml
|
||||||
|
|||||||
51
infra/observability/tempo/tempo.yml
Normal file
51
infra/observability/tempo/tempo.yml
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
server:
|
||||||
|
http_listen_port: 3200
|
||||||
|
|
||||||
|
distributor:
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: 0.0.0.0:4317
|
||||||
|
http:
|
||||||
|
endpoint: 0.0.0.0:4318
|
||||||
|
|
||||||
|
ingester:
|
||||||
|
max_block_duration: 5m
|
||||||
|
|
||||||
|
compactor:
|
||||||
|
compaction:
|
||||||
|
# 30 days — matches Loki retention. Compactor enforces this automatically;
|
||||||
|
# no manual intervention needed under normal trace volumes.
|
||||||
|
block_retention: 720h
|
||||||
|
|
||||||
|
storage:
|
||||||
|
trace:
|
||||||
|
# Local filesystem storage — single-VPS deployment, no S3 backend needed.
|
||||||
|
# Both paths are on the same named Docker volume (tempo_data) so they
|
||||||
|
# survive container restarts without split-brain between WAL and blocks.
|
||||||
|
backend: local
|
||||||
|
local:
|
||||||
|
path: /var/tempo/blocks
|
||||||
|
wal:
|
||||||
|
path: /var/tempo/wal
|
||||||
|
|
||||||
|
metrics_generator:
|
||||||
|
registry:
|
||||||
|
external_labels:
|
||||||
|
source: tempo
|
||||||
|
storage:
|
||||||
|
path: /var/tempo/generator/wal
|
||||||
|
processors:
|
||||||
|
- service-graphs
|
||||||
|
- span-metrics
|
||||||
|
|
||||||
|
# Tempo HTTP API (port 3200) is unauthenticated. Access is controlled entirely
|
||||||
|
# by network isolation: only Grafana (on obs-net) should reach this port.
|
||||||
|
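# The OTLP receivers (4317 gRPC, 4318 HTTP) are meant for the backend on archiv-net. They bind to 0.0.0.0 and Tempo joins both networks, so isolation is per container, not per port.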
|
||||||
|
overrides:
|
||||||
|
defaults:
|
||||||
|
metrics_generator:
|
||||||
|
processors:
|
||||||
|
- service-graphs
|
||||||
|
- span-metrics
|
||||||
Reference in New Issue
Block a user