devops: extract composite actions for obs stack deploy steps (#603) #715
127
.gitea/actions/deploy-obs/action.yml
Normal file
127
.gitea/actions/deploy-obs/action.yml
Normal file
@@ -0,0 +1,127 @@
|
||||
name: Deploy observability stack
description: >-
  Deploy observability configs + secrets to /opt/familienarchiv, validate the
  compose config, start the stack, and assert the five healthchecked services
  are healthy. Per-environment values arrive as inputs.

inputs:
  grafana_admin_password:
    description: Grafana admin password (secret)
    required: true
  grafana_db_password:
    # Quoted: in a plain scalar " #651" would start a YAML comment and
    # silently truncate the description.
    description: 'Read-only grafana_reader DB role password (secret, issue #651)'
    required: true
  glitchtip_secret_key:
    description: GlitchTip Django secret key (secret)
    required: true
  postgres_password:
    description: PostgreSQL password for the environment (secret)
    required: true
  postgres_host:
    description: >-
      Compose project + service hostname, e.g. archiv-staging-db-1. Derived
      from the Compose project name and service name — a project rename
      requires updating the caller's value. Plain input, not a secret.
    required: true

runs:
  using: composite
  steps:
    - name: Deploy observability configs
      shell: bash
      # Copies the compose file and config tree from the workspace checkout
      # into /opt/familienarchiv/ — the permanent location that persists
      # between CI runs. Containers started in a later step bind-mount
      # from there, so a future workspace wipe cannot corrupt a running
      # config file.
      #
      # obs-secrets.env is written fresh from Gitea secrets on every run so
      # Gitea is always the single source of truth for secret rotation.
      # Non-secret config lives in infra/observability/obs.env (tracked in git).
      #
      # secrets.* is NOT available inside a composite action, so the values
      # arrive as inputs mapped to env: below and are referenced as $VAR in
      # the heredoc. The delimiter MUST stay unquoted (<<EOF, not <<'EOF') so
      # the shell expands $VAR — a quoted delimiter would write the literal
      # string "$GRAFANA_ADMIN_PASSWORD" and `config --quiet` would still pass
      # (the var is present, just wrong). Do not stage these into intermediate
      # variables either, or Gitea log masking can be lost.
      env:
        GRAFANA_ADMIN_PASSWORD: ${{ inputs.grafana_admin_password }}
        GRAFANA_DB_PASSWORD: ${{ inputs.grafana_db_password }}
        GLITCHTIP_SECRET_KEY: ${{ inputs.glitchtip_secret_key }}
        POSTGRES_PASSWORD: ${{ inputs.postgres_password }}
        POSTGRES_HOST: ${{ inputs.postgres_host }}
      run: |
        set -euo pipefail
        rm -rf /opt/familienarchiv/infra/observability
        mkdir -p /opt/familienarchiv/infra/observability
        cp -r infra/observability/. /opt/familienarchiv/infra/observability/
        cp docker-compose.observability.yml /opt/familienarchiv/
        # Lock the secrets file down BEFORE any secret byte is written.
        # `cat >` creates a missing file with the process umask (commonly 022,
        # i.e. world-readable) and keeps the mode of an existing one, so a
        # trailing chmod alone leaves a readable window. umask 077 covers the
        # create case, the early chmod covers a pre-existing file with a wider
        # mode. The umask is set only after the cp calls above so the modes of
        # the copied config tree are not affected.
        umask 077
        : >> /opt/familienarchiv/obs-secrets.env
        chmod 600 /opt/familienarchiv/obs-secrets.env
        cat > /opt/familienarchiv/obs-secrets.env <<EOF
        GRAFANA_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD
        GRAFANA_DB_PASSWORD=$GRAFANA_DB_PASSWORD
        GLITCHTIP_SECRET_KEY=$GLITCHTIP_SECRET_KEY
        POSTGRES_PASSWORD=$POSTGRES_PASSWORD
        POSTGRES_HOST=$POSTGRES_HOST
        EOF
        # Five-key non-empty guard: a bare presence check matches an empty
        # `KEY=` line, so assert each key has a value. Fail loudly on any
        # missing/empty key rather than starting the stack with broken auth.
        for key in GRAFANA_ADMIN_PASSWORD GRAFANA_DB_PASSWORD GLITCHTIP_SECRET_KEY POSTGRES_PASSWORD POSTGRES_HOST; do
          grep -Eq "^${key}=.+" /opt/familienarchiv/obs-secrets.env \
            || { echo "::error::obs-secrets.env missing or empty: ${key}"; exit 1; }
        done
        # Final chmod is kept as a belt-and-braces assertion of the end state;
        # the file is already 0600 from the umask/chmod above.
        chmod 600 /opt/familienarchiv/obs-secrets.env

    - name: Validate observability compose config
      shell: bash
      # Dry-run: resolves all variable substitutions and reports any missing
      # required keys before containers start. Catches undefined variables and
      # YAML errors in config files updated by the previous step.
      # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
      # second (CI-written secrets). Later files win on duplicate keys. POSTGRES_HOST
      # is environment-specific and supplied only by obs-secrets.env — obs.env
      # documents it but deliberately does not set a value.
      run: |
        docker compose \
          -f /opt/familienarchiv/docker-compose.observability.yml \
          --env-file /opt/familienarchiv/infra/observability/obs.env \
          --env-file /opt/familienarchiv/obs-secrets.env \
          config --quiet

    - name: Start observability stack
      shell: bash
      # Runs with absolute paths so bind mounts resolve to stable host paths
      # that survive workspace wipes between runs (see ADR-016).
      # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
      # (written fresh from Gitea secrets above). --env-file order: obs.env first,
      # obs-secrets.env second — later file wins on duplicate keys.
      run: |
        docker compose \
          -f /opt/familienarchiv/docker-compose.observability.yml \
          --env-file /opt/familienarchiv/infra/observability/obs.env \
          --env-file /opt/familienarchiv/obs-secrets.env \
          up -d --wait --remove-orphans

    - name: Assert observability stack health
      shell: bash
      # docker compose up --wait covers services WITH healthcheck directives only.
      # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
      # no healthcheck — they are considered "started" as soon as the process runs.
      # This step explicitly asserts the five healthchecked critical services are
      # healthy before the smoke test proceeds.
      run: |
        set -e
        unhealthy=""
        for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
          # A container that does not exist (or has no health state) makes
          # docker inspect fail; report that as "missing" instead of aborting
          # so every unhealthy service is listed in one run.
          status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
          if [ "$status" != "healthy" ]; then
            echo "::error::$svc is not healthy (status: $status)"
            unhealthy="$unhealthy $svc"
          fi
        done
        [ -z "$unhealthy" ] || exit 1
        echo "All critical observability services are healthy"
|
||||
41
.gitea/actions/reload-caddy/action.yml
Normal file
41
.gitea/actions/reload-caddy/action.yml
Normal file
@@ -0,0 +1,41 @@
|
||||
name: Reload Caddy
description: >-
  Reload the host Caddy service from a DooD job container via a privileged
  sibling container and nsenter. No inputs.

runs:
  using: composite
  steps:
    - name: Reload Caddy
      shell: bash
      # Why this step exists: a committed Caddyfile change is not live until
      # Caddy is reloaded. Reloading here, before the smoke test, means the
      # smoke test checks the current config rather than a stale header or a
      # still-proxied /actuator route.
      #
      # How it reaches the host: job steps run inside Docker containers
      # (DooD), where `systemctl` is absent and the host's systemd is not
      # directly reachable. The Docker socket (mounted into job containers
      # via runner-config.yaml) is used to start a privileged sibling
      # container in the host PID namespace; nsenter then joins the
      # namespaces of PID 1 so systemctl talks to the host's systemd.
      # No sudoers entry is needed — socket access is already
      # root-equivalent on the host.
      #
      # Image choice: Alpine, pinned by digest so an upstream change needs an
      # explicit bump PR. nsenter comes from util-linux, installed at run time.
      #
      # `reload`, not `restart`: the running service re-reads its config
      # instead of being stopped and started, so in-flight requests are not
      # dropped.
      #
      # If Caddy is not running, this step fails fast — before the smoke test
      # reports a misleading "port 443 refused".
      run: |
        docker run \
          --rm \
          --privileged \
          --pid=host \
          alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
          sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
|
||||
58
.gitea/actions/smoke-test/action.yml
Normal file
58
.gitea/actions/smoke-test/action.yml
Normal file
@@ -0,0 +1,58 @@
|
||||
name: Smoke test
description: >-
  Verify the deployed public surface (login reachable, HSTS pinned,
  Permissions-Policy present, /actuator blocked) against a given vhost.

inputs:
  host:
    description: Public vhost to smoke-test, e.g. staging.raddatz.cloud
    required: true

runs:
  using: composite
  steps:
    - name: Smoke test deployed environment
      shell: bash
      # Healthchecks confirm containers are healthy; they do NOT confirm the
      # public surface works. This step catches: Caddy not reloaded, HSTS
      # header dropped, /actuator block bypassed.
      #
      # --resolve pins the public host to the Docker bridge gateway IP
      # (the host) so we do NOT depend on hairpin NAT on the host router.
      # 127.0.0.1 cannot be used: job containers run in bridge network mode
      # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
      # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
      # and is therefore reachable from the container via that IP.
      # SNI still uses the public hostname so the TLS cert validates correctly.
      #
      # --resolve is stored as a Bash array so "${RESOLVE[@]}" expands to two
      # separate arguments; a quoted string would pass the flag and its value
      # as one token and curl would reject it as an unknown option.
      #
      # Gateway detection reads /proc/net/route (always present, no package
      # required) instead of `ip route` to avoid a dependency on iproute2.
      # Field $2=="00000000" is the default route; field $3 is the gateway as
      # a little-endian 32-bit hex value. awk only extracts that field; the
      # hex-to-dotted-decimal decode is done with bash's printf, because
      # awk's strtonum() is a gawk extension and is missing from mawk and
      # BusyBox awk.
      env:
        HOST: ${{ inputs.host }}
      run: |
        set -e
        # `required: true` is not enforced for composite-action inputs, so an
        # omitted host would otherwise surface as a confusing curl error.
        [ -n "${HOST:-}" ] || { echo "::error::smoke-test input 'host' is empty"; exit 1; }
        URL="https://$HOST"
        GW_HEX=$(awk 'NR>1 && $2=="00000000"{print $3; exit}' /proc/net/route)
        # Exactly 8 hex digits expected; anything else (including no default
        # route at all) is reported as a detection failure.
        [[ "$GW_HEX" =~ ^[0-9A-Fa-f]{8}$ ]] || { echo "::error::could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
        # Little-endian: the last byte pair is the first octet.
        HOST_IP=$(printf '%d.%d.%d.%d' "0x${GW_HEX:6:2}" "0x${GW_HEX:4:2}" "0x${GW_HEX:2:2}" "0x${GW_HEX:0:2}")
        RESOLVE=(--resolve "$HOST:443:$HOST_IP")
        echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
        curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
        # Pin the preload-list-eligible HSTS value, not just header presence:
        # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
        # fail this check rather than pass it silently.
        curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
          | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
        # Permissions-Policy denies APIs the app does not use (camera,
        # microphone, geolocation). A regression that loosens or drops the
        # header now fails the smoke step.
        curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
          | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
        # No -f here on purpose: a 404 is the expected outcome and must not
        # make curl exit non-zero.
        status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
        [ "$status" = "404" ] || { echo "::error::expected 404 from /actuator/health, got $status"; exit 1; }
        echo "All smoke checks passed"
|
||||
@@ -108,6 +108,32 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Assert deploy-obs writes obs-secrets.env via an unquoted heredoc (#603)
  shell: bash
  run: |
    # Inside a composite action, secrets arrive as $VAR from env: (secrets.*
    # is unavailable there), so the obs-secrets.env heredoc MUST use an
    # unquoted delimiter (<<EOF) for $VAR to expand. A quoted delimiter
    # would write the literal string "$GRAFANA_ADMIN_PASSWORD",
    # and the action's five-key non-empty guard would STILL pass (the line
    # is present, just wrong). This guard enforces the invariant in CI so a
    # future re-quote cannot ship broken obs auth green. See ADR-029 / #603.
    #
    # Bash disables expansion when ANY part of the delimiter is quoted, so
    # the regex covers all three spellings: single quote (\x27), double
    # quote (\x22) and a backslash escape (\\).
    action='.gitea/actions/deploy-obs/action.yml'
    quoted='obs-secrets\.env\s*<<-?\s*[\x27\x22\\]'
    # Self-test: the regex must catch every quoted form and ignore the unquoted one.
    printf "obs-secrets.env <<'EOF'\n" | grep -qP "$quoted" \
      || { echo "FAIL: guard self-test — regex missed the quoted <<'EOF' form"; exit 1; }
    printf 'obs-secrets.env <<"EOF"\n' | grep -qP "$quoted" \
      || { echo "FAIL: guard self-test — regex missed the double-quoted heredoc form"; exit 1; }
    printf 'obs-secrets.env <<\\EOF\n' | grep -qP "$quoted" \
      || { echo "FAIL: guard self-test — regex missed the backslash-quoted heredoc form"; exit 1; }
    printf 'obs-secrets.env <<EOF\n' | grep -qvP "$quoted" \
      || { echo "FAIL: guard self-test — regex wrongly flagged the unquoted <<EOF form"; exit 1; }
    # Positive: the unquoted heredoc must be present at all.
    grep -qP 'obs-secrets\.env\s*<<-?EOF\b' "$action" \
      || { echo "::error::$action no longer writes obs-secrets.env via an unquoted <<EOF heredoc (ADR-029 / #603)"; exit 1; }
    # Negative: never a quoted delimiter on the obs-secrets.env heredoc.
    # -n prints the offending line number into the job log.
    if grep -nP "$quoted" "$action"; then
      echo "::error::$action writes obs-secrets.env with a quoted heredoc delimiter — secrets would be written as literal \$VAR strings. Use unquoted <<EOF (ADR-029 / #603)."
      exit 1
    fi
|
||||
|
||||
- name: Run unit and component tests with coverage
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
@@ -23,6 +23,11 @@ name: nightly
|
||||
# - host ports: backend 8081, frontend 3001
|
||||
# - profile: staging (starts mailpit instead of a real SMTP relay)
|
||||
#
|
||||
# The obs-stack deploy, Caddy reload, and smoke test are shared with
|
||||
# release.yml via the composite actions under .gitea/actions/ (ADR-029).
|
||||
# actions/checkout MUST stay the first step: a local `uses: ./…` action
|
||||
# only exists on disk after checkout.
|
||||
#
|
||||
# Required Gitea secrets:
|
||||
# STAGING_POSTGRES_PASSWORD
|
||||
# STAGING_MINIO_PASSWORD
|
||||
@@ -55,6 +60,8 @@ jobs:
|
||||
# for the same repo is within that boundary.
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
# MUST be first: the composite actions below live under .gitea/actions/
|
||||
# and only exist on disk once the repo is checked out (ADR-029).
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Write staging env file
|
||||
@@ -92,6 +99,7 @@ jobs:
|
||||
# `compose config` renders both shorthand and longform mounts as
|
||||
# `target: /import` + `read_only: true`, so we assert against
|
||||
# the rendered form rather than the raw source YAML.
|
||||
# App-compose check (not obs), nightly-only — stays inline.
|
||||
run: |
|
||||
set -e
|
||||
docker compose \
|
||||
@@ -128,150 +136,21 @@ jobs:
|
||||
--profile staging \
|
||||
up -d --wait --remove-orphans
|
||||
|
||||
- name: Deploy observability configs
|
||||
# Copies the compose file and config tree from the workspace checkout
|
||||
# into /opt/familienarchiv/ — the permanent location that persists
|
||||
# between CI runs. Containers started in the next step bind-mount
|
||||
# from there, so a future workspace wipe cannot corrupt a running
|
||||
# config file.
|
||||
#
|
||||
# obs-secrets.env is written fresh from Gitea secrets on every run so
|
||||
# Gitea is always the single source of truth for secret rotation.
|
||||
# Non-secret config lives in infra/observability/obs.env (tracked in git).
|
||||
run: |
|
||||
rm -rf /opt/familienarchiv/infra/observability
|
||||
mkdir -p /opt/familienarchiv/infra/observability
|
||||
cp -r infra/observability/. /opt/familienarchiv/infra/observability/
|
||||
cp docker-compose.observability.yml /opt/familienarchiv/
|
||||
cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
|
||||
GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
|
||||
GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
|
||||
GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
|
||||
POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
|
||||
POSTGRES_HOST=archiv-staging-db-1
|
||||
EOF
|
||||
# Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging)
|
||||
# and service name (db). A project rename requires updating this value.
|
||||
chmod 600 /opt/familienarchiv/obs-secrets.env
|
||||
# POSTGRES_HOST is derived from the Compose project name (archiv-staging)
|
||||
# and service name (db). A project rename requires updating this value.
|
||||
- uses: ./.gitea/actions/deploy-obs
|
||||
with:
|
||||
grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
|
||||
grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
|
||||
glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
|
||||
postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
|
||||
postgres_host: archiv-staging-db-1
|
||||
|
||||
- name: Validate observability compose config
|
||||
# Dry-run: resolves all variable substitutions and reports any missing
|
||||
# required keys before containers start. Catches undefined variables and
|
||||
# YAML errors in config files updated by the previous step.
|
||||
# --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
|
||||
# second (CI-written secrets). Later files win on duplicate keys, so
|
||||
# obs-secrets.env overrides POSTGRES_HOST set in obs.env.
|
||||
run: |
|
||||
docker compose \
|
||||
-f /opt/familienarchiv/docker-compose.observability.yml \
|
||||
--env-file /opt/familienarchiv/infra/observability/obs.env \
|
||||
--env-file /opt/familienarchiv/obs-secrets.env \
|
||||
config --quiet
|
||||
- uses: ./.gitea/actions/reload-caddy
|
||||
|
||||
- name: Start observability stack
|
||||
# Runs with absolute paths so bind mounts resolve to stable host paths
|
||||
# that survive workspace wipes between nightly runs (see ADR-016).
|
||||
# Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
|
||||
# (written fresh from Gitea secrets above). --env-file order: obs.env first,
|
||||
# obs-secrets.env second — later file wins on duplicate keys.
|
||||
run: |
|
||||
docker compose \
|
||||
-f /opt/familienarchiv/docker-compose.observability.yml \
|
||||
--env-file /opt/familienarchiv/infra/observability/obs.env \
|
||||
--env-file /opt/familienarchiv/obs-secrets.env \
|
||||
up -d --wait --remove-orphans
|
||||
|
||||
- name: Assert observability stack health
|
||||
# docker compose up --wait covers services WITH healthcheck directives only.
|
||||
# obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
|
||||
# no healthcheck — they are considered "started" as soon as the process runs.
|
||||
# This step explicitly asserts the five healthchecked critical services are
|
||||
# healthy before the smoke test proceeds.
|
||||
run: |
|
||||
set -e
|
||||
unhealthy=""
|
||||
for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
|
||||
status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
|
||||
if [ "$status" != "healthy" ]; then
|
||||
echo "::error::$svc is not healthy (status: $status)"
|
||||
unhealthy="$unhealthy $svc"
|
||||
fi
|
||||
done
|
||||
[ -z "$unhealthy" ] || exit 1
|
||||
echo "All critical observability services are healthy"
|
||||
|
||||
- name: Reload Caddy
|
||||
# Apply any committed Caddyfile changes before smoke-testing the
|
||||
# public surface. Without this step, a Caddyfile edit lands in the
|
||||
# repo but Caddy keeps serving the previous config until someone
|
||||
# reloads it manually — the smoke test would then catch a stale
|
||||
# header or a still-proxied /actuator route rather than confirming
|
||||
# the current config is live.
|
||||
#
|
||||
# The runner executes job steps inside Docker containers (DooD).
|
||||
# `systemctl` is not present in container images and cannot reach
|
||||
# the host's systemd directly. We use the Docker socket (mounted
|
||||
# into every job container via runner-config.yaml) to spin up a
|
||||
# privileged sibling container in the host PID namespace; nsenter
|
||||
# then enters the host's namespaces so systemctl talks to the real
|
||||
# host systemd daemon. No sudoers entry is required — the Docker
|
||||
# socket already grants root-equivalent host access.
|
||||
#
|
||||
# Alpine is used: ~5 MB vs ~70 MB for ubuntu, no unnecessary
|
||||
# tooling, and the digest is pinned so any upstream change requires
|
||||
# an explicit bump PR. util-linux (which ships nsenter) is installed
|
||||
# at run time; apk add takes ~1 s on the warm VPS cache.
|
||||
#
|
||||
# `reload` not `restart`: reload sends SIGHUP so Caddy re-reads its
|
||||
# config in-process without dropping TLS connections. `restart`
|
||||
# would briefly stop the service, losing in-flight requests.
|
||||
#
|
||||
# If Caddy is not running this step fails fast before the smoke test
|
||||
# issues a misleading "port 443 refused" error.
|
||||
run: |
|
||||
docker run --rm --privileged --pid=host \
|
||||
alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
|
||||
sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
|
||||
|
||||
- name: Smoke test deployed environment
|
||||
# Healthchecks confirm containers are healthy; they do NOT confirm the
|
||||
# public surface works. This step catches: Caddy not reloaded, HSTS
|
||||
# header dropped, /actuator block bypassed.
|
||||
#
|
||||
# --resolve pins staging.raddatz.cloud to the Docker bridge gateway IP
|
||||
# (the host) so we do NOT depend on hairpin NAT on the host router.
|
||||
# 127.0.0.1 cannot be used: job containers run in bridge network mode
|
||||
# (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
|
||||
# the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
|
||||
# and is therefore reachable from the container via that IP.
|
||||
# SNI still uses the public hostname so the TLS cert validates correctly.
|
||||
#
|
||||
# Gateway detection reads /proc/net/route (always present, no package
|
||||
# required) instead of `ip route` to avoid a dependency on iproute2.
|
||||
# Field $2=="00000000" is the default route; field $3 is the gateway as
|
||||
# a little-endian 32-bit hex value which awk decodes to dotted-decimal.
|
||||
run: |
|
||||
set -e
|
||||
HOST="staging.raddatz.cloud"
|
||||
URL="https://$HOST"
|
||||
HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
|
||||
[ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
|
||||
RESOLVE=(--resolve "$HOST:443:$HOST_IP")
|
||||
echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
|
||||
curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
|
||||
# Pin the preload-list-eligible HSTS value, not just header presence:
|
||||
# a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
|
||||
# fail this check rather than pass it silently.
|
||||
curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
|
||||
| grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
|
||||
# Permissions-Policy denies APIs the app does not use (camera,
|
||||
# microphone, geolocation). A regression that loosens or drops the
|
||||
# header now fails the smoke step.
|
||||
curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
|
||||
| grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
|
||||
status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
|
||||
[ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
|
||||
echo "All smoke checks passed"
|
||||
- uses: ./.gitea/actions/smoke-test
|
||||
with:
|
||||
host: staging.raddatz.cloud
|
||||
|
||||
- name: Cleanup env file
|
||||
# LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
|
||||
|
||||
@@ -23,6 +23,11 @@ name: release
|
||||
# - host ports: backend 8080, frontend 3000
|
||||
# - profile: (none) — mailpit is excluded; real SMTP relay is used
|
||||
#
|
||||
# The obs-stack deploy, Caddy reload, and smoke test are shared with
|
||||
# nightly.yml via the composite actions under .gitea/actions/ (ADR-029).
|
||||
# actions/checkout MUST stay the first step: a local `uses: ./…` action
|
||||
# only exists on disk after checkout.
|
||||
#
|
||||
# Required Gitea secrets:
|
||||
# PROD_POSTGRES_PASSWORD
|
||||
# PROD_MINIO_PASSWORD
|
||||
@@ -53,6 +58,8 @@ jobs:
|
||||
# advertised label of our single-tenant self-hosted runner.
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
# MUST be first: the composite actions below live under .gitea/actions/
|
||||
# and only exist on disk once the repo is checked out (ADR-029).
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Write production env file
|
||||
@@ -100,117 +107,21 @@ jobs:
|
||||
--env-file .env.production \
|
||||
up -d --wait --remove-orphans
|
||||
|
||||
- name: Deploy observability configs
|
||||
# Mirrors the nightly approach: copies obs compose file and config tree
|
||||
# to /opt/familienarchiv/ (permanent path, survives workspace wipes — ADR-016),
|
||||
# then writes obs-secrets.env fresh from Gitea secrets.
|
||||
# Non-secret config lives in infra/observability/obs.env (tracked in git).
|
||||
run: |
|
||||
rm -rf /opt/familienarchiv/infra/observability
|
||||
mkdir -p /opt/familienarchiv/infra/observability
|
||||
cp -r infra/observability/. /opt/familienarchiv/infra/observability/
|
||||
cp docker-compose.observability.yml /opt/familienarchiv/
|
||||
cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
|
||||
GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
|
||||
GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
|
||||
GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
|
||||
POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
|
||||
POSTGRES_HOST=archiv-production-db-1
|
||||
EOF
|
||||
# Note: POSTGRES_HOST is derived from the Compose project name (archiv-production)
|
||||
# and service name (db). A project rename requires updating this value.
|
||||
chmod 600 /opt/familienarchiv/obs-secrets.env
|
||||
# POSTGRES_HOST is derived from the Compose project name (archiv-production)
|
||||
# and service name (db). A project rename requires updating this value.
|
||||
- uses: ./.gitea/actions/deploy-obs
|
||||
with:
|
||||
grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
|
||||
grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
|
||||
glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
|
||||
postgres_password: ${{ secrets.PROD_POSTGRES_PASSWORD }}
|
||||
postgres_host: archiv-production-db-1
|
||||
|
||||
- name: Validate observability compose config
|
||||
# Dry-run: resolves all variable substitutions and reports any missing
|
||||
# required keys before containers start. Catches undefined variables and
|
||||
# YAML errors in config files updated by the previous step.
|
||||
# --env-file order: obs.env first (git-tracked defaults), obs-secrets.env
|
||||
# second (CI-written secrets). Later files win on duplicate keys, so
|
||||
# obs-secrets.env overrides POSTGRES_HOST set in obs.env.
|
||||
# Keep in sync with the equivalent step in nightly.yml (#603).
|
||||
run: |
|
||||
docker compose \
|
||||
-f /opt/familienarchiv/docker-compose.observability.yml \
|
||||
--env-file /opt/familienarchiv/infra/observability/obs.env \
|
||||
--env-file /opt/familienarchiv/obs-secrets.env \
|
||||
config --quiet
|
||||
- uses: ./.gitea/actions/reload-caddy
|
||||
|
||||
- name: Start observability stack
|
||||
# Runs with absolute paths so bind mounts resolve to stable host paths
|
||||
# that survive workspace wipes between runs (see ADR-016).
|
||||
# Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env
|
||||
# (written fresh from Gitea secrets above). --env-file order: obs.env first,
|
||||
# obs-secrets.env second — later file wins on duplicate keys.
|
||||
# Keep in sync with the equivalent step in nightly.yml (#603).
|
||||
run: |
|
||||
docker compose \
|
||||
-f /opt/familienarchiv/docker-compose.observability.yml \
|
||||
--env-file /opt/familienarchiv/infra/observability/obs.env \
|
||||
--env-file /opt/familienarchiv/obs-secrets.env \
|
||||
up -d --wait --remove-orphans
|
||||
|
||||
- name: Assert observability stack health
|
||||
# docker compose up --wait covers services WITH healthcheck directives only.
|
||||
# obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have
|
||||
# no healthcheck — they are considered "started" as soon as the process runs.
|
||||
# This step explicitly asserts the five healthchecked critical services are
|
||||
# healthy before the smoke test proceeds.
|
||||
# Keep in sync with the equivalent step in nightly.yml (#603).
|
||||
run: |
|
||||
set -e
|
||||
unhealthy=""
|
||||
for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
|
||||
status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
|
||||
if [ "$status" != "healthy" ]; then
|
||||
echo "::error::$svc is not healthy (status: $status)"
|
||||
unhealthy="$unhealthy $svc"
|
||||
fi
|
||||
done
|
||||
[ -z "$unhealthy" ] || exit 1
|
||||
echo "All critical observability services are healthy"
|
||||
|
||||
- name: Reload Caddy
|
||||
# See nightly.yml — same rationale and mechanism: DooD job containers
|
||||
# cannot call systemctl directly; nsenter via a privileged sibling
|
||||
# container reaches the host systemd. Must run after deploy (so the
|
||||
# latest Caddyfile is on disk) and before the smoke test (so the
|
||||
# public surface reflects the current config). Alpine with pinned
|
||||
# digest; reload not restart — see nightly.yml for full rationale.
|
||||
run: |
|
||||
docker run --rm --privileged --pid=host \
|
||||
alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
|
||||
sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
|
||||
|
||||
- name: Smoke test deployed environment
|
||||
# See nightly.yml — same three checks, against the prod vhost.
|
||||
# --resolve stored as a Bash array so "${RESOLVE[@]}" expands to two
|
||||
# separate arguments; a quoted string would pass the flag and its value
|
||||
# as one token and curl would reject it as an unknown option.
|
||||
# Gateway detection via /proc/net/route — no iproute2 dependency.
|
||||
# See nightly.yml for the full network topology explanation.
|
||||
run: |
|
||||
set -e
|
||||
HOST="archiv.raddatz.cloud"
|
||||
URL="https://$HOST"
|
||||
HOST_IP=$(awk 'NR>1 && $2=="00000000"{h=$3;printf "%d.%d.%d.%d\n",strtonum("0x"substr(h,7,2)),strtonum("0x"substr(h,5,2)),strtonum("0x"substr(h,3,2)),strtonum("0x"substr(h,1,2));exit}' /proc/net/route)
|
||||
[ -n "$HOST_IP" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
|
||||
RESOLVE=(--resolve "$HOST:443:$HOST_IP")
|
||||
echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
|
||||
curl -fsS "${RESOLVE[@]}" --max-time 10 "$URL/login" -o /dev/null
|
||||
# Pin the preload-list-eligible HSTS value, not just header presence:
|
||||
# a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
|
||||
# fail this check rather than pass it silently.
|
||||
curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
|
||||
| grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
|
||||
# Permissions-Policy denies APIs the app does not use (camera,
|
||||
# microphone, geolocation). A regression that loosens or drops the
|
||||
# header now fails the smoke step.
|
||||
curl -fsS "${RESOLVE[@]}" --max-time 10 -I "$URL/" \
|
||||
| grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
|
||||
status=$(curl -s "${RESOLVE[@]}" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
|
||||
[ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
|
||||
echo "All smoke checks passed"
|
||||
- uses: ./.gitea/actions/smoke-test
|
||||
with:
|
||||
host: archiv.raddatz.cloud
|
||||
|
||||
- name: Cleanup env file
|
||||
# LOAD-BEARING: `if: always()` is the linchpin of the ADR-011
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
# ADR-029: Composite actions for cross-workflow deploy logic
|
||||
|
||||
## Status
|
||||
|
||||
Accepted
|
||||
|
||||
## Context
|
||||
|
||||
The `nightly.yml` (staging) and `release.yml` (production) workflows shared three
|
||||
blocks of deploy logic verbatim: the four observability-stack steps (deploy configs,
|
||||
validate, start, assert health), the Caddy reload step, and the public-surface smoke
|
||||
test. The only per-environment differences were secret names (`STAGING_*` vs `PROD_*`),
|
||||
the `POSTGRES_HOST` value, and the smoke-test hostname.
|
||||
|
||||
This duplication was held together by `# Keep in sync with nightly.yml` comments — an
|
||||
honour-system invariant. Any change (a new healthchecked service, a different rsync flag,
|
||||
a new secret) had to be applied in two places, and nothing enforced that it was. Issue #603
|
||||
documents a real instance: the obs secret set had grown to five keys while a refactor draft
|
||||
listed only four.
|
||||
|
||||
### Decision drivers
|
||||
|
||||
1. Cross-workflow deploy logic must have a single definition, enforced — not a
|
||||
discipline-based "keep in sync" promise.
|
||||
2. Per-environment variation must be expressed as explicit, typed inputs, not by forking
|
||||
the whole step block.
|
||||
3. The mechanism must work on the existing single-tenant self-hosted Gitea runner with no
|
||||
new infrastructure.
|
||||
|
||||
### Alternatives considered
|
||||
|
||||
**A: Reusable workflow (`workflow_call`)** — Gitea supports called workflows. Rejected for
|
||||
this case: reusable workflows run as a separate job with their own runner context, which
|
||||
breaks the in-job, sequential `deploy → reload → smoke` ordering these steps rely on and
|
||||
complicates passing the already-checked-out workspace. Composite actions run inline in the
|
||||
calling job, preserving step order and the workspace.
|
||||
|
||||
**B: Shared shell script invoked from both workflows** — e.g. `scripts/deploy-obs.sh`.
|
||||
Rejected: loses the typed-input contract and per-step CI log sections, and reintroduces
|
||||
manual argument threading that is as error-prone as the duplication it replaces.
|
||||
|
||||
**C: Keep the `# Keep in sync` comments** — status quo. Rejected: unenforced; issue #603
|
||||
is direct evidence it fails.
|
||||
|
||||
## Decision
|
||||
|
||||
Extract the shared logic into three single-responsibility Gitea composite actions under
|
||||
`.gitea/actions/`: `deploy-obs` (five inputs), `reload-caddy` (no inputs), and `smoke-test`
|
||||
(`host` input). Both workflows invoke each via a single `uses: ./.gitea/actions/<name>` call,
|
||||
passing per-environment values as `with:` inputs. This is the repository's first composite
|
||||
action and sets the convention; `docs/infrastructure/ci-gitea.md` documents it.
|
||||
|
||||
## Consequences
|
||||
|
||||
**Positive:**
|
||||
- Shared deploy logic has one enforced definition; a change lands once and both
|
||||
environments get it. The `# Keep in sync` comments are deleted.
|
||||
- Per-environment variation is a typed input contract, not a forked block.
|
||||
- Runs inline on the existing runner — no reusable-workflow job context, no new
|
||||
infrastructure.
|
||||
|
||||
**Negative / constraints:**
|
||||
- Workflows now depend on a checked-out `.gitea/actions/` tree: `actions/checkout` MUST run
|
||||
before the first `uses: ./…` (a local action does not exist on disk until checkout).
|
||||
- Secrets cannot be read from the `secrets.*` context inside a composite action; they must
|
||||
be passed as inputs and mapped to `env:`. The `obs-secrets.env` heredoc therefore uses an
|
||||
unquoted delimiter so `$VAR` expands at the shell layer.
|
||||
- The `reload-caddy` pinned alpine digest now lives in the action, not the workflow file —
|
||||
it must be added to Renovate's watch list (the privileged-digest rule's `matchPaths` must include `.gitea/actions/**`) so it does not go stale.
|
||||
@@ -68,6 +68,8 @@ Job containers are unprivileged and do not share the host's PID/mount/network na
|
||||
|
||||
Alpine is used instead of Ubuntu: ~5 MB vs ~70 MB, and the digest is pinned to a specific sha256 so any upstream change requires an explicit Renovate bump PR. `util-linux` (which ships `nsenter`) is not part of the Alpine base image but is installed at run time in ~1 s from the warm VPS cache.
|
||||
|
||||
This exact step now lives in the `reload-caddy` composite action (see [Composite actions](#composite-actions) below); both deploy workflows call it via `uses: ./.gitea/actions/reload-caddy`. The pinned digest moved with it, so Renovate's privileged-digest watch covers `.gitea/actions/**` as well as `.gitea/workflows/**`.
|
||||
|
||||
#### Why not `sudo systemctl` in the job container?
|
||||
|
||||
Job containers run as root inside an unprivileged Docker namespace. There is no systemd PID 1 inside the container — `systemctl` would attempt to reach a socket that does not exist. `sudo` is not present in container images and would not help even if it were.
|
||||
@@ -170,6 +172,72 @@ See `docs/DEPLOYMENT.md §3.1` and ADR-015 for the full setup rationale.
|
||||
|
||||
---
|
||||
|
||||
## Composite actions
|
||||
|
||||
The `nightly.yml` (staging) and `release.yml` (production) deploy workflows share their observability-stack deploy, Caddy reload, and smoke-test logic through three single-responsibility composite actions under `.gitea/actions/` (ADR-029). Before this, the shared logic was duplicated in both workflows and held together by `# Keep in sync with nightly.yml` comments — an unenforced honour-system invariant.
|
||||
|
||||
| Action | Inputs | Purpose |
|
||||
|---|---|---|
|
||||
| `deploy-obs` | `grafana_admin_password`, `grafana_db_password`, `glitchtip_secret_key`, `postgres_password`, `postgres_host` | Deploy obs configs + secrets to `/opt/familienarchiv`, validate the compose config, start the stack, assert the five healthchecked services are healthy |
|
||||
| `reload-caddy` | — | Reload host Caddy via the privileged-sibling + nsenter pattern |
|
||||
| `smoke-test` | `host` | Verify the public surface (login reachable, HSTS pinned, Permissions-Policy denying camera/microphone/geolocation, `/actuator/health` → 404) |
|
||||
|
||||
A workflow calls them by relative path, passing per-environment values as `with:` inputs:
|
||||
|
||||
```yaml
|
||||
- uses: ./.gitea/actions/deploy-obs
|
||||
with:
|
||||
grafana_admin_password: ${{ secrets.GRAFANA_ADMIN_PASSWORD }}
|
||||
grafana_db_password: ${{ secrets.GRAFANA_DB_PASSWORD }}
|
||||
glitchtip_secret_key: ${{ secrets.GLITCHTIP_SECRET_KEY }}
|
||||
postgres_password: ${{ secrets.STAGING_POSTGRES_PASSWORD }}
|
||||
postgres_host: archiv-staging-db-1
|
||||
- uses: ./.gitea/actions/reload-caddy
|
||||
- uses: ./.gitea/actions/smoke-test
|
||||
with:
|
||||
host: staging.raddatz.cloud
|
||||
```
|
||||
|
||||
### Checkout-first ordering rule
|
||||
|
||||
A local composite action (`uses: ./…`) only exists on disk **after** the repo is checked out. `actions/checkout@v4` MUST therefore be the **first step** of any job that calls one — if a future reorder moves checkout later, every `uses: ./.gitea/actions/…` call fails because the action file is not yet on disk. Both deploy workflows pin checkout as step 1 for exactly this reason.
|
||||
|
||||
### Secrets inside composite actions
|
||||
|
||||
The `secrets.*` context is **not** available inside a composite action. Secrets are passed in as `inputs`, mapped to an `env:` block, and referenced as `$VAR`:
|
||||
|
||||
```yaml
|
||||
inputs:
|
||||
grafana_admin_password:
|
||||
required: true # no default — a missing secret must fail loudly, never fall back to empty
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- shell: bash # composite steps do NOT default the shell — always declare it
|
||||
env:
|
||||
GRAFANA_ADMIN_PASSWORD: ${{ inputs.grafana_admin_password }}
|
||||
run: |
|
||||
cat > obs-secrets.env <<EOF # unquoted EOF — $VAR expands at the shell layer
|
||||
GRAFANA_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD
|
||||
EOF
|
||||
```
|
||||
|
||||
Two load-bearing details:
|
||||
|
||||
- **Unquoted heredoc delimiter (`<<EOF`, not `<<'EOF'`).** With a quoted delimiter the shell writes the literal string `$GRAFANA_ADMIN_PASSWORD`, and `docker compose config --quiet` still passes (the variable is *present, just wrong*). The `deploy-obs` action guards against this with a five-key **non-empty** check (`grep -Eq "^KEY=.+"`) immediately after writing `obs-secrets.env`. `chmod 600` is the action's final operation so the file is never world-readable.
|
||||
- **Every `run:` step declares `shell: bash`.** Composite actions do not inherit the workflow's default shell; a step without it fails to run.
|
||||
|
||||
### Adding an input to an action
|
||||
|
||||
To thread a new per-environment value (e.g. a new secret) through `deploy-obs`:
|
||||
|
||||
1. Add it under `inputs:` in `.gitea/actions/deploy-obs/action.yml` with `required: true` and **no `default:`**.
|
||||
2. Map it in the relevant step's `env:` block: `NEW_KEY: ${{ inputs.new_key }}`.
|
||||
3. Reference it as `$NEW_KEY` in the `run:` script — add a `NEW_KEY=$NEW_KEY` line to the heredoc **and** a matching entry to the non-empty guard loop (currently five keys), so the new key is checked too.
|
||||
4. Pass it from **both** workflows' `with:` blocks. That is the whole point of the action: the contract lives in one place, so neither environment can silently drift.
|
||||
|
||||
---
|
||||
|
||||
## Gitea vs GitHub Actions Differences
|
||||
|
||||
### Context Variable Names
|
||||
|
||||
@@ -14,8 +14,8 @@
|
||||
"automerge": false
|
||||
},
|
||||
{
|
||||
"description": "Digest bumps for images used in privileged CI steps (--privileged --pid=host) must be reviewed manually — a compromised image has root-equivalent host access.",
|
||||
"matchPaths": [".gitea/workflows/**"],
|
||||
"description": "Digest bumps for images used in privileged CI steps (--privileged --pid=host) must be reviewed manually — a compromised image has root-equivalent host access. Covers .gitea/actions/** too: the reload-caddy alpine digest now lives in a composite action (#603).",
|
||||
"matchPaths": [".gitea/workflows/**", ".gitea/actions/**"],
|
||||
"matchUpdateTypes": ["digest"],
|
||||
"automerge": false,
|
||||
"reviewersFromCodeOwners": false
|
||||
|
||||
Reference in New Issue
Block a user