fix(obs): Tempo processors schema fix + configurable Postgres host and Grafana port #602

Merged
marcel merged 27 commits from fix/issue-601-obs-stack-permanent into main 2026-05-16 09:46:12 +02:00
Showing only changes of commit f628ab6435 - Show all commits

View File

@@ -115,7 +115,24 @@ jobs:
POSTGRES_HOST=archiv-production-db-1
EOF
- name: Validate observability compose config
# Dry-run: resolves all variable substitutions and reports any missing
# required keys before containers start. Catches undefined variables and
# YAML errors in config files updated by the previous step.
# Keep in sync with the equivalent step in nightly.yml.
run: |
docker compose \
-f /opt/familienarchiv/docker-compose.observability.yml \
--env-file /opt/familienarchiv/infra/observability/obs.env \
--env-file /opt/familienarchiv/obs-secrets.env \
config --quiet
- name: Start observability stack
# Runs with absolute paths so bind mounts resolve to stable host paths
# that survive workspace wipes between runs (see ADR-016).
# Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
# (written fresh from Gitea secrets above).
# Keep in sync with the equivalent step in nightly.yml.
run: |
docker compose \
-f /opt/familienarchiv/docker-compose.observability.yml \
@@ -123,6 +140,26 @@ jobs:
--env-file /opt/familienarchiv/obs-secrets.env \
up -d --wait --remove-orphans
- name: Assert observability stack health
# docker compose up --wait covers services WITH healthcheck directives only.
# obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
# no healthcheck — they are considered "started" as soon as the process runs.
# This step explicitly asserts the four healthchecked critical services are
# healthy before the smoke test proceeds.
# Keep in sync with the equivalent step in nightly.yml.
run: |
set -e
unhealthy=""
for svc in obs-loki obs-prometheus obs-grafana obs-tempo; do
status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
if [ "$status" != "healthy" ]; then
echo "::error::$svc is not healthy (status: $status)"
unhealthy="$unhealthy $svc"
fi
done
[ -z "$unhealthy" ] || exit 1
echo "All critical observability services are healthy"
- name: Reload Caddy
# See nightly.yml — same rationale and mechanism: DooD job containers
# cannot call systemctl directly; nsenter via a privileged sibling