281 lines
14 KiB
YAML
281 lines
14 KiB
YAML
name: nightly
|
||
|
||
# Builds and deploys the staging environment from main every night.
|
||
# Runs on the self-hosted runner using Docker-out-of-Docker (the docker
|
||
# socket is mounted in), so `docker compose build` produces images on
|
||
# the host daemon and `docker compose up` consumes them directly — no
|
||
# registry hop.
|
||
#
|
||
# Operational assumptions (see docs/DEPLOYMENT.md §3 for the full setup):
|
||
#
|
||
# 1. Single-tenant self-hosted runner. The "Write staging env file" step
|
||
# writes every secret to .env.staging on the runner filesystem; the
|
||
# `if: always()` cleanup step removes it. A multi-tenant runner
|
||
# would need to switch to docker compose --env-file <(stdin) instead.
|
||
#
|
||
# 2. Host docker layer cache is authoritative. There is no
|
||
# actions/cache; we rely on the host daemon to keep Maven and npm
|
||
# layers warm between runs. A `docker system prune` on the host
|
||
# will cause the next nightly build to be cold (5–10 min slower).
|
||
#
|
||
# Staging environment isolation:
|
||
# - project name: archiv-staging
|
||
# - host ports: backend 8081, frontend 3001
|
||
# - profile: staging (starts mailpit instead of a real SMTP relay)
|
||
#
|
||
# Required Gitea secrets:
|
||
# STAGING_POSTGRES_PASSWORD
|
||
# STAGING_MINIO_PASSWORD
|
||
# STAGING_MINIO_APP_PASSWORD
|
||
# STAGING_OCR_TRAINING_TOKEN
|
||
# STAGING_APP_ADMIN_USERNAME
|
||
# STAGING_APP_ADMIN_PASSWORD
|
||
# GRAFANA_ADMIN_PASSWORD
|
||
# GLITCHTIP_SECRET_KEY
|
||
# SENTRY_DSN (set after GlitchTip first-run; empty = Sentry disabled)
|
||
|
||
on:
  # Scheduled run: minute 0 of hour 2, every day.
  schedule:
    - cron: '0 2 * * *'
  # Also allows the workflow to be started manually.
  workflow_dispatch:
|
||
|
||
env:
  # Enables BuildKit so the `RUN --mount=type=cache` instructions in the
  # backend Dockerfile take effect and the Maven cache is kept between runs.
  DOCKER_BUILDKIT: '1'
|
||
|
||
jobs:
  deploy-staging:
    # The self-hosted runner advertises the labels ubuntu-latest,
    # ubuntu-24.04 and ubuntu-22.04 — it does NOT advertise `self-hosted`.
    # Targeting `self-hosted` would therefore leave the job queued forever,
    # because no runner would ever pick it up. Using this one runner for
    # both CI and deploys of the same repository stays inside the
    # repo-level "single-tenant" boundary promised by ADR-011.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
|
||
|
||
- name: Write staging env file
  # Renders .env.staging, which the later `docker compose --env-file`
  # calls in this job consume.
  #
  # The secret expressions below are substituted by the workflow engine
  # before the shell ever sees the script, so the heredoc delimiter is
  # quoted ('EOF'): the shell then copies the body verbatim and a secret
  # containing `$`, a backtick or a backslash is not expanded or mangled.
  # This matches the obs-secrets.env heredoc further down.
  #
  # `umask 077` makes the file owner-only from the moment it is created
  # (every value in it is plain text until the cleanup step runs).
  #
  # NOTE(review): docker compose may still interpolate `$` inside
  # unquoted env-file values — confirm if a secret ever contains `$`.
  run: |
    umask 077
    cat > .env.staging <<'EOF'
    TAG=nightly
    PORT_BACKEND=8081
    PORT_FRONTEND=3001
    APP_DOMAIN=staging.raddatz.cloud
    POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
    MINIO_PASSWORD=${{ secrets.STAGING_MINIO_PASSWORD }}
    MINIO_APP_PASSWORD=${{ secrets.STAGING_MINIO_APP_PASSWORD }}
    OCR_TRAINING_TOKEN=${{ secrets.STAGING_OCR_TRAINING_TOKEN }}
    APP_ADMIN_USERNAME=${{ secrets.STAGING_APP_ADMIN_USERNAME }}
    APP_ADMIN_PASSWORD=${{ secrets.STAGING_APP_ADMIN_PASSWORD }}
    MAIL_HOST=mailpit
    MAIL_PORT=1025
    MAIL_USERNAME=
    MAIL_PASSWORD=
    MAIL_SMTP_AUTH=false
    MAIL_STARTTLS_ENABLE=false
    APP_MAIL_FROM=noreply@staging.raddatz.cloud
    IMPORT_HOST_DIR=/srv/familienarchiv-staging/import
    POSTGRES_USER=archiv
    SENTRY_DSN=${{ secrets.SENTRY_DSN }}
    EOF
|
||
|
||
- name: Verify backend /import:ro mount is wired
  # Regression guard for #526: the /admin/system mass-import card only
  # works when the backend service mounts the host import payload at
  # /import (read-only). If a future "compose cleanup" PR drops the
  # volumes block, mass import silently breaks again.
  #
  # `compose config` renders both shorthand and longform mounts as
  # `target: /import` + `read_only: true`, so the assertions run against
  # the rendered form rather than the raw source YAML.
  #
  # The rendered config has every variable from .env.staging substituted
  # in, i.e. it contains the secrets in plain text. It is therefore
  # written to a private `mktemp` file and removed by an EXIT trap on
  # both success and failure, instead of being left at a fixed /tmp path.
  run: |
    set -e
    rendered=$(mktemp)
    trap 'rm -f "$rendered"' EXIT
    docker compose \
      -f docker-compose.prod.yml \
      -p archiv-staging \
      --env-file .env.staging \
      --profile staging \
      config > "$rendered"
    # 1) the mount must exist at all
    grep -q '^[[:space:]]*target: /import$' "$rendered" \
      || { echo "::error::backend is missing the /import bind mount (see #526)"; exit 1; }
    # 2) and `read_only: true` must appear within two lines after it
    grep -A2 '^[[:space:]]*target: /import$' "$rendered" \
      | grep -q 'read_only: true' \
      || { echo "::error::backend /import mount is not read-only (see #526)"; exit 1; }
|
||
|
||
- name: Build images
  # `--pull` re-fetches the pinned base images on every build. When a tag
  # such as node:20.19.0-alpine3.21 or postgres:16-alpine is re-published
  # with a CVE fix, the new image is used instead of the stale copy in
  # the host's Docker layer cache.
  #
  # The folded scalar below joins into a single command line.
  run: >-
    docker compose
    -f docker-compose.prod.yml
    -p archiv-staging
    --env-file .env.staging
    --profile staging
    build --pull
|
||
|
||
- name: Deploy staging
  # Brings the archiv-staging project up in the background using the
  # images built in the previous step. The folded scalar below joins
  # into a single command line.
  run: >-
    docker compose
    -f docker-compose.prod.yml
    -p archiv-staging
    --env-file .env.staging
    --profile staging
    up -d --wait --remove-orphans
|
||
|
||
- name: Deploy observability configs
  # Copies the compose file and config tree from the workspace checkout
  # into /opt/familienarchiv/ — the permanent location that persists
  # between CI runs. Containers started in a later step bind-mount from
  # there, so a future workspace wipe cannot corrupt a running config
  # file.
  #
  # obs-secrets.env is written fresh from Gitea secrets on every run so
  # Gitea is always the single source of truth for secret rotation.
  # Non-secret config lives in infra/observability/obs.env (tracked in git).
  #
  # The heredoc delimiter is quoted ('EOF') so the shell writes the
  # already-substituted secret values verbatim.
  run: |
    rm -rf /opt/familienarchiv/infra/observability
    mkdir -p /opt/familienarchiv/infra/observability
    cp -r infra/observability/. /opt/familienarchiv/infra/observability/
    cp docker-compose.observability.yml /opt/familienarchiv/
    # Tighten the umask only around the secrets file so it is never
    # created with broader permissions than 600. The config tree copied
    # above deliberately keeps the default umask.
    previous_umask=$(umask)
    umask 077
    cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
    GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
    GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
    POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
    POSTGRES_HOST=archiv-staging-db-1
    EOF
    umask "$previous_umask"
    # Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging)
    # and service name (db). A project rename requires updating this value.
    #
    # chmod still runs: `>` keeps the mode of a file left over from an
    # earlier run, so the umask alone would not tighten a pre-existing file.
    chmod 600 /opt/familienarchiv/obs-secrets.env
|
||
|
||
- name: Validate observability compose config
  # Dry run before any container is started: `config --quiet` resolves
  # every variable substitution and fails on missing required keys or on
  # YAML errors in the config files refreshed by the previous step.
  #
  # Order of the --env-file flags matters: obs.env (git-tracked defaults)
  # comes first, obs-secrets.env (written by CI) second. On duplicate
  # keys the later file wins, so obs-secrets.env overrides the
  # POSTGRES_HOST set in obs.env.
  #
  # The folded scalar below joins into a single command line.
  run: >-
    docker compose
    -f /opt/familienarchiv/docker-compose.observability.yml
    --env-file /opt/familienarchiv/infra/observability/obs.env
    --env-file /opt/familienarchiv/obs-secrets.env
    config --quiet
|
||
|
||
- name: Start observability stack
  # Absolute paths are used throughout so bind mounts resolve to stable
  # host locations that survive workspace wipes between nightly runs
  # (see ADR-016).
  #
  # Non-secret settings come from the git-tracked obs.env; secrets come
  # from obs-secrets.env, which an earlier step rewrites from Gitea
  # secrets. obs.env is passed first and obs-secrets.env second, so the
  # secrets file wins on duplicate keys.
  #
  # The folded scalar below joins into a single command line.
  run: >-
    docker compose
    -f /opt/familienarchiv/docker-compose.observability.yml
    --env-file /opt/familienarchiv/infra/observability/obs.env
    --env-file /opt/familienarchiv/obs-secrets.env
    up -d --wait --remove-orphans
|
||
|
||
- name: Assert observability stack health
  # `docker compose up --wait` only waits on services that declare a
  # healthcheck. obs-promtail, obs-cadvisor, obs-node-exporter and
  # obs-glitchtip-worker have none — they count as "started" as soon as
  # the process runs. This step therefore asserts explicitly that the
  # five healthchecked critical services report `healthy` before the
  # smoke test proceeds.
  #
  # Every failing container is reported before the step exits, so one
  # run shows the complete list rather than only the first failure.
  run: |
    set -e
    failed=""
    for container in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do
      # A failing `docker inspect` is reported as status "missing".
      health=$(docker inspect "$container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing")
      if [ "$health" = "healthy" ]; then
        continue
      fi
      echo "::error::$container is not healthy (status: $health)"
      failed="$failed $container"
    done
    if [ -n "$failed" ]; then
      exit 1
    fi
    echo "All critical observability services are healthy"
|
||
|
||
- name: Reload Caddy
  # Why this step exists
  #   Committed Caddyfile changes must be live before the public surface
  #   is smoke-tested. Otherwise Caddy keeps serving the previous config
  #   until someone reloads it by hand, and the smoke test would trip
  #   over a stale header or a still-proxied /actuator route instead of
  #   confirming that the current config is active.
  #
  # How the host's systemd is reached
  #   Job steps run inside Docker containers (DooD). Container images
  #   ship no `systemctl`, and a container cannot talk to the host's
  #   systemd directly. The Docker socket (mounted into every job
  #   container via runner-config.yaml) is used to start a privileged
  #   sibling container in the host PID namespace; `nsenter` then enters
  #   the host's namespaces so `systemctl` addresses the real host
  #   systemd. No sudoers entry is needed — the Docker socket already
  #   grants root-equivalent host access.
  #
  # Why Alpine
  #   ~5 MB instead of ~70 MB for ubuntu and no unnecessary tooling. The
  #   digest is pinned, so any upstream change requires an explicit bump
  #   PR. util-linux (which provides nsenter) is installed at run time;
  #   `apk add` takes ~1 s on the warm VPS cache.
  #
  # Why `reload` and not `restart`
  #   `reload` lets the running Caddy pick up its new config in-process
  #   without dropping TLS connections, whereas `restart` would briefly
  #   stop the service and lose in-flight requests.
  #
  # If Caddy is not running, this step fails fast — before the smoke test
  # could report a misleading "port 443 refused" error.
  run: |
    docker run \
      --rm \
      --privileged \
      --pid=host \
      alpine:3.21@sha256:48b0309ca019d89d40f670aa1bc06e426dc0931948452e8491e3d65087abc07d \
      sh -c 'apk add --no-cache util-linux -q && nsenter -t 1 -m -u -n -p -i -- /bin/systemctl reload caddy'
|
||
|
||
- name: Smoke test deployed environment
  # Healthchecks confirm containers are healthy; they do NOT confirm the
  # public surface works. This step catches: Caddy not reloaded, HSTS
  # header dropped, /actuator block bypassed.
  #
  # --resolve pins staging.raddatz.cloud to the Docker bridge gateway IP
  # (the host) so we do NOT depend on hairpin NAT on the host router.
  # 127.0.0.1 cannot be used: job containers run in bridge network mode
  # (runner-config.yaml), so 127.0.0.1 is the container's loopback, not
  # the host's. The bridge gateway IS the host; Caddy binds 0.0.0.0:443
  # and is therefore reachable from the container via that IP.
  # SNI still uses the public hostname so the TLS cert validates correctly.
  #
  # RESOLVE holds only the host:port:addr VALUE and is passed as
  # `--resolve "$RESOLVE"`. Putting flag and value into one quoted
  # variable hands curl a single argv word, which it rejects as an
  # unknown option.
  #
  # Gateway detection reads /proc/net/route (always present, no package
  # required) instead of `ip route` to avoid a dependency on iproute2.
  # Field $2=="00000000" is the default route; field $3 is the gateway as
  # a little-endian 32-bit hex value. The four bytes are sliced with
  # `cut` and converted with `printf %d`, so no awk extension is needed
  # (`strtonum` is gawk-only).
  run: |
    set -e
    HOST="staging.raddatz.cloud"
    URL="https://$HOST"
    GW_HEX=$(awk 'NR>1 && $2=="00000000"{print $3; exit}' /proc/net/route)
    [ -n "$GW_HEX" ] || { echo "ERROR: could not detect Docker bridge gateway via /proc/net/route"; exit 1; }
    # Little-endian: the last hex byte is the first octet of the address.
    HOST_IP=$(printf '%d.%d.%d.%d' \
      "0x$(echo "$GW_HEX" | cut -c7-8)" \
      "0x$(echo "$GW_HEX" | cut -c5-6)" \
      "0x$(echo "$GW_HEX" | cut -c3-4)" \
      "0x$(echo "$GW_HEX" | cut -c1-2)")
    RESOLVE="$HOST:443:$HOST_IP"
    echo "Smoke test: $URL (pinned to $HOST_IP via bridge gateway)"
    curl -fsS --resolve "$RESOLVE" --max-time 10 "$URL/login" -o /dev/null
    # Pin the preload-list-eligible HSTS value, not just header presence:
    # a degraded `max-age=1` or a dropped `includeSubDomains; preload` must
    # fail this check rather than pass it silently.
    curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
      | grep -Eqi 'strict-transport-security:[[:space:]]*max-age=31536000.*includeSubDomains.*preload'
    # Permissions-Policy denies APIs the app does not use (camera,
    # microphone, geolocation). A regression that loosens or drops the
    # header now fails the smoke step.
    curl -fsS --resolve "$RESOLVE" --max-time 10 -I "$URL/" \
      | grep -Eqi 'permissions-policy:[[:space:]]*camera=\(\),[[:space:]]*microphone=\(\),[[:space:]]*geolocation=\(\)'
    # /actuator must not be reachable from the public surface.
    status=$(curl -s --resolve "$RESOLVE" -o /dev/null -w "%{http_code}" --max-time 10 "$URL/actuator/health")
    [ "$status" = "404" ] || { echo "expected 404 from /actuator/health, got $status"; exit 1; }
    echo "All smoke checks passed"
|
||
|
||
- name: Cleanup env file
  # LOAD-BEARING — do not drop `if: always()`.
  # The ADR-011 single-tenant runner trust model hinges on it: until this
  # step runs, every secret in .env.staging sits on the runner filesystem
  # in plain text. Without `if: always()` a failed deploy would skip the
  # cleanup and leave the env file behind. Re-evaluate ADR-011 before
  # removing or weakening this conditional.
  if: always()
  run: |
    rm -f .env.staging
|