Merge pull request 'devops(observability): add Tempo for distributed trace storage (OTLP receiver)' (#587) from feat/issue-575-tempo into main
All checks were successful
CI / Unit & Component Tests (push) Successful in 3m21s
CI / OCR Service Tests (push) Successful in 16s
CI / Backend Unit Tests (push) Successful in 4m38s
CI / fail2ban Regex (push) Successful in 40s
CI / Compose Bucket Idempotency (push) Successful in 57s

devops(observability): add Tempo for distributed trace storage (#587)
This commit was merged in pull request #587.
This commit is contained in:
2026-05-15 03:21:11 +02:00
4 changed files with 78 additions and 3 deletions

View File

@@ -112,8 +112,29 @@ services:
condition: service_healthy
# --- Traces: Tempo ---
# tempo: (see future issue)
#
tempo:
  # Grafana Tempo — trace storage backend for the observability stack.
  # Receives OTLP traces and serves trace queries over its HTTP API.
  container_name: obs-tempo
  image: grafana/tempo:2.7.2
  restart: unless-stopped
  command:
    - "-config.file=/etc/tempo.yml"
  volumes:
    # Configuration is mounted read-only; trace data lives on a named volume.
    - ./infra/observability/tempo/tempo.yml:/etc/tempo.yml:ro
    - tempo_data:/var/tempo
  # All ports are container-internal (`expose`), none are bound on the host.
  # NOTE: `expose` does not restrict a port to one network — every listening
  # port is reachable from peers on any network this container is attached to.
  expose:
    # Tempo HTTP API — intended for Grafana, which queries it over obs-net
    - "3200"
    # OTLP gRPC — the backend sends traces here over archiv-net
    - "4317"
    # OTLP HTTP — alternative transport, also intended for archiv-net
    - "4318"
  networks:
    # backend (archive-backend) reaches tempo:4317 over this network
    - archiv-net
    # Grafana reaches tempo:3200 over this network
    - obs-net
  healthcheck:
    # Healthy once the /ready endpoint answers with a body containing "ready".
    test:
      - "CMD-SHELL"
      - "wget -qO- http://localhost:3200/ready | grep -q ready || exit 1"
    start_period: 15s
    interval: 10s
    timeout: 5s
    retries: 5
# --- Dashboards: Grafana ---
# grafana: (see future issue)
#

View File

@@ -281,6 +281,7 @@ Current services:
| `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics |
| `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). |
| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, and `compose_project` labels |
| `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP gRPC receiver on port 4317, OTLP HTTP on port 4318 (both `archiv-net`-internal). Grafana queries traces on port 3200 (`obs-net`-internal). All ports are `expose`-only (not host-bound). |
**Loki quick checks** (after ~60 s, run from inside the `obs-loki` container):

View File

@@ -21,7 +21,8 @@ System_Boundary(observability, "Observability Stack (docker-compose.observabilit
Container(prometheus, "Prometheus", "prom/prometheus", "Scrapes metrics from backend management port 8081 (/actuator/prometheus). Retention and alert rules TBD — see issue #581.")
Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.")
Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD")
Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki. Wiring TBD — see issue #581.")
Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP gRPC receiver on port 4317 (archiv-net). Grafana queries traces on port 3200 (obs-net). All ports internal only.")
Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki + Tempo. Wiring TBD — see issue #581.")
}
Rel(user, caddy, "HTTPS", "TLS 1.2/1.3")
@@ -36,5 +37,6 @@ Rel(backend, mail, "Sends notification and password-reset emails (optional)", "S
Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
Rel(backend, tempo, "Sends distributed traces via OTLP", "gRPC / OTLP / port 4317 (archiv-net)")
@enduml

View File

@@ -0,0 +1,51 @@
# Grafana Tempo configuration — single-binary deployment with local storage.
# Mounted into the container as /etc/tempo.yml.

server:
  # Tempo HTTP API (query + /ready). The API is unauthenticated: access is
  # controlled entirely by network isolation. Only Grafana (on obs-net)
  # should reach this port.
  http_listen_port: 3200

distributor:
  receivers:
    # OTLP receivers — meant to be reached from archiv-net only.
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

ingester:
  # Upper bound on how long a block stays open before it is cut.
  max_block_duration: 5m

compactor:
  compaction:
    # 30 days — matches Loki retention. Compactor enforces this automatically;
    # no manual intervention needed under normal trace volumes.
    block_retention: 720h

storage:
  trace:
    # Local filesystem storage — single-VPS deployment, no S3 backend needed.
    # WAL and blocks both live under /var/tempo (the tempo_data named volume),
    # so they survive container restarts together.
    backend: local
    local:
      path: /var/tempo/blocks
    wal:
      path: /var/tempo/wal

metrics_generator:
  # Only registry and storage settings belong here. The list of enabled
  # processors is NOT a field of this block; it is set per tenant under
  # `overrides.defaults.metrics_generator.processors` below. Tempo parses its
  # config strictly, so an unknown `processors` key here would abort startup.
  registry:
    external_labels:
      source: tempo
  storage:
    path: /var/tempo/generator/wal
    # NOTE(review): no `remote_write` target is configured, so generated
    # service-graph / span metrics are not shipped to any metrics backend —
    # confirm whether that wiring is planned separately.

overrides:
  defaults:
    metrics_generator:
      # Processors enabled for the default tenant.
      processors:
        - service-graphs
        - span-metrics