From 458968ded547f625a4801ff60badf005a8c81a9f Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 21:45:49 +0200 Subject: [PATCH 01/27] fix(obs): remove invalid processors block from tempo metrics_generator Tempo 2.7.2 removed `processors` from the top-level metrics_generator config; the field is only valid under `overrides.defaults.metrics_generator`. The setting was already present there, so this only removes the now-rejected duplicate at the top level. Closes part of #601 Co-Authored-By: Claude Sonnet 4.6 --- infra/observability/tempo/tempo.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/infra/observability/tempo/tempo.yml b/infra/observability/tempo/tempo.yml index f09a28b6..58c9ac15 100644 --- a/infra/observability/tempo/tempo.yml +++ b/infra/observability/tempo/tempo.yml @@ -36,9 +36,6 @@ metrics_generator: source: tempo storage: path: /var/tempo/generator/wal - processors: - - service-graphs - - span-metrics # Tempo HTTP API (port 3200) is unauthenticated. Access is controlled entirely # by network isolation: only Grafana (on obs-net) should reach this port. -- 2.49.1 From 1181b97f9487ce6604c4010fba5b55066794fbcb Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 21:46:11 +0200 Subject: [PATCH 02/27] fix(obs): make Postgres host configurable and fix PORT_GRAFANA default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POSTGRES_HOST variable (default: archive-db) lets the observability stack connect to a different Postgres container — needed when only the staging stack is running (container name: archiv-staging-db-1). PORT_GRAFANA default changed from 3001 to 3003 to avoid collision with the staging frontend which occupies 3001. 
Closes #601 Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.observability.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml index b83cb439..7c30bcca 100644 --- a/docker-compose.observability.yml +++ b/docker-compose.observability.yml @@ -142,7 +142,7 @@ services: container_name: obs-grafana restart: unless-stopped ports: - - "127.0.0.1:${PORT_GRAFANA:-3001}:3000" + - "127.0.0.1:${PORT_GRAFANA:-3003}:3000" environment: GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme} GF_USERS_ALLOW_SIGN_UP: "false" @@ -193,7 +193,7 @@ services: obs-glitchtip-db-init: condition: service_completed_successfully environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@archive-db:5432/glitchtip + DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST:-archive-db}:5432/glitchtip REDIS_URL: redis://obs-redis:6379/0 SECRET_KEY: ${GLITCHTIP_SECRET_KEY} GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:3002} @@ -215,7 +215,7 @@ services: obs-redis: condition: service_healthy environment: - DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@archive-db:5432/glitchtip + DATABASE_URL: postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST:-archive-db}:5432/glitchtip REDIS_URL: redis://obs-redis:6379/0 SECRET_KEY: ${GLITCHTIP_SECRET_KEY} networks: @@ -229,10 +229,10 @@ services: environment: PGPASSWORD: ${POSTGRES_PASSWORD} command: > - sh -c "psql -h archive-db -U ${POSTGRES_USER} -tc + sh -c "psql -h ${POSTGRES_HOST:-archive-db} -U ${POSTGRES_USER} -tc \"SELECT 1 FROM pg_database WHERE datname = 'glitchtip'\" | grep -q 1 || - psql -h archive-db -U ${POSTGRES_USER} -c \"CREATE DATABASE glitchtip;\"" + psql -h ${POSTGRES_HOST:-archive-db} -U ${POSTGRES_USER} -c \"CREATE DATABASE glitchtip;\"" networks: - archiv-net -- 2.49.1 From 7e524948802b50af55ae75796c5a55a57db3fc62 Mon Sep 17 00:00:00 2001 From: 
Marcel Date: Fri, 15 May 2026 21:59:23 +0200 Subject: [PATCH 03/27] fix(ci): deploy obs configs to /opt/familienarchiv/ before starting stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The observability stack's bind-mount sources pointed to workspace-relative paths. When CI wiped the workspace between runs, containers kept running but their config files disappeared — causing Docker to auto-create directories at the missing paths and crash the services on next restart. Fix: mount /opt/familienarchiv/ into CI job containers via runner-config.yaml, then copy infra/observability/ and docker-compose.observability.yml there before docker compose up. Compose runs from the permanent path, so bind mounts resolve to stable host paths that survive workspace wipes. Docker Compose reads /opt/familienarchiv/.env automatically (no --env-file flag), which is managed on the server and persists between CI runs. Closes #601 Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 18 ++++++++++++++++-- runner-config.yaml | 5 ++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 81cf885c..0764bc03 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -131,11 +131,25 @@ jobs: --profile staging \ up -d --wait --remove-orphans + - name: Deploy observability configs + # Copies the compose file and config tree from the workspace checkout + # into /opt/familienarchiv/ — the permanent location that persists + # between CI runs. Containers started in the next step bind-mount + # from there, so a future workspace wipe cannot corrupt a running + # config file. Secrets are read from /opt/familienarchiv/.env (managed + # separately on the server; not written or deleted by CI). 
+ run: | + mkdir -p /opt/familienarchiv/infra + cp -r infra/observability /opt/familienarchiv/infra/ + cp docker-compose.observability.yml /opt/familienarchiv/ + - name: Start observability stack + # Runs from /opt/familienarchiv/ so bind mounts resolve to stable + # host paths that survive workspace wipes between nightly runs. + # Docker Compose reads /opt/familienarchiv/.env automatically. run: | docker compose \ - -f docker-compose.observability.yml \ - --env-file .env.staging \ + -f /opt/familienarchiv/docker-compose.observability.yml \ up -d --wait --remove-orphans - name: Reload Caddy diff --git a/runner-config.yaml b/runner-config.yaml index 23bef458..2df22cf3 100644 --- a/runner-config.yaml +++ b/runner-config.yaml @@ -15,12 +15,15 @@ container: valid_volumes: - "/var/run/docker.sock" - "/srv/gitea-workspace" + - "/opt/familienarchiv" # appended to `docker run` when the runner spawns a job container # SECURITY: Mounting the Docker socket grants job containers root-equivalent # access to the host Docker daemon. Acceptable here because only trusted code # from this private repo runs on this runner. Do NOT use on a runner that # accepts untrusted PRs from external contributors. - options: "-v /var/run/docker.sock:/var/run/docker.sock -v /srv/gitea-workspace:/srv/gitea-workspace" + # /opt/familienarchiv is mounted so the nightly job can deploy observability + # configs to the permanent location without needing ssh or nsenter. 
+ options: "-v /var/run/docker.sock:/var/run/docker.sock -v /srv/gitea-workspace:/srv/gitea-workspace -v /opt/familienarchiv:/opt/familienarchiv" # keep network mode default (bridge) — Testcontainers handles its own networking force_pull: false -- 2.49.1 From 448c3cdcdbd70d31d343e2faf9176aa58218fbe8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 23:57:31 +0200 Subject: [PATCH 04/27] docs(obs): update .env.example for PORT_GRAFANA 3003, POSTGRES_HOST, $$ escaping Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index 97d677e9..7593d997 100644 --- a/.env.example +++ b/.env.example @@ -29,16 +29,17 @@ OCR_TRAINING_TOKEN=change-me-in-production # --- Observability --- # Optional stack — start with: docker compose -f docker-compose.observability.yml up -d # Requires the main stack to already be running (docker compose up -d creates archiv-net). +# In production the stack is managed from /opt/familienarchiv/ (see docs/DEPLOYMENT.md §4). # Ports for host access -PORT_GRAFANA=3001 +PORT_GRAFANA=3003 PORT_GLITCHTIP=3002 PORT_PROMETHEUS=9090 # Grafana admin password — change this before exposing Grafana beyond localhost GRAFANA_ADMIN_PASSWORD=changeme -# GlitchTip domain — production: use https://grafana.raddatz.cloud (must match Caddy vhost) +# GlitchTip domain — production: use https://glitchtip.archiv.raddatz.cloud (must match Caddy vhost) GLITCHTIP_DOMAIN=http://localhost:3002 # GlitchTip secret key — Django SECRET_KEY equivalent, used to sign sessions and tokens. @@ -47,6 +48,15 @@ GLITCHTIP_DOMAIN=http://localhost:3002 # Generate with: python3 -c "import secrets; print(secrets.token_hex(50))" GLITCHTIP_SECRET_KEY=changeme-generate-a-real-secret +# PostgreSQL hostname for GlitchTip's db-init job and workers. +# Override when only the staging stack is running (container name differs from archive-db). 
+# Default (archive-db) is correct for production with the full stack up. +POSTGRES_HOST=archive-db + +# $$ escaping note: passwords in /opt/familienarchiv/.env that contain a literal '$' must +# use '$$' so Docker Compose does not expand them as variable references. +# Example: a password 'p@$word' must be written as 'p@$$word' in the .env file. + # Error reporting DSNs — leave empty to disable the SDK (safe default). # SENTRY_DSN: backend (Spring Boot) — used by the GlitchTip/Sentry Java SDK SENTRY_DSN= -- 2.49.1 From 7b7d0c92a8f2e998134e7afdc5841e55ea499bb9 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 23:58:42 +0200 Subject: [PATCH 05/27] docs(obs): update DEPLOYMENT.md with /opt/familienarchiv/ ops section, env keys, runner restart Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 53 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index b906d66f..6f6c2e52 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -43,7 +43,7 @@ graph TD - SSE notifications transit Caddy (browser → Caddy → backend); the backend is never reachable directly from the public internet. The SvelteKit SSR layer is bypassed for SSE, but Caddy is not. - The Caddyfile responds `404` on `/actuator/*` (defense in depth). Internal monitoring scrapes the backend on the docker network, not through Caddy. - Production and staging cohabit on the same host via docker compose project names: `archiv-production` (ports 8080/3000) and `archiv-staging` (ports 8081/3001). -- An optional observability stack (Prometheus, Node Exporter, cAdvisor) runs as a separate compose file: `docker compose -f docker-compose.observability.yml up -d`. It joins `archiv-net` and scrapes the backend's management port (`:8081`). Configuration lives under `infra/observability/`. 
+- An optional observability stack (Prometheus, Node Exporter, cAdvisor, Loki, Tempo, Grafana, GlitchTip) runs as a separate compose file. Configuration lives under `infra/observability/`. In production and CI, the stack is managed from `/opt/familienarchiv/` (CI copies it there on every nightly run) so bind mounts survive workspace wipes — see §4 for the ops procedure. ### OCR memory requirements @@ -142,7 +142,8 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back | Variable | Purpose | Default | Required? | Sensitive? | |---|---|---|---|---| | `PORT_PROMETHEUS` | Host port for the Prometheus UI (bound to `127.0.0.1` only) | `9090` | — | — | -| `PORT_GRAFANA` | Host port for the Grafana UI (bound to `127.0.0.1` only) | `3001` | — | — | +| `PORT_GRAFANA` | Host port for the Grafana UI (bound to `127.0.0.1` only) | `3003` | — | — | +| `POSTGRES_HOST` | PostgreSQL hostname for GlitchTip's db-init job and workers. Override when only the staging stack is running and `archive-db` is not resolvable by that name. | `archive-db` | — | — | | `GRAFANA_ADMIN_PASSWORD` | Grafana `admin` user password | `changeme` | YES (prod) | YES | | `PORT_GLITCHTIP` | Host port for the GlitchTip UI (bound to `127.0.0.1` only) | `3002` | — | — | | `GLITCHTIP_DOMAIN` | Public-facing base URL for GlitchTip (used in email links and CORS) | `http://localhost:3002` | YES (prod) | — | @@ -202,6 +203,18 @@ mkdir -p /srv/gitea-workspace # volumes: # - /srv/gitea-workspace:/srv/gitea-workspace # See runner-config.yaml (workdir_parent + valid_volumes + options) and ADR-015. + +# Observability config permanent directory — the nightly CI job copies +# docker-compose.observability.yml and infra/observability/ here on every run. +# The obs stack is always started from this path, not from the workspace. +# See ADR-016 for why this directory is used instead of a server-pull approach. 
+mkdir -p /opt/familienarchiv/infra + +# ⚠ IMPORTANT: after any change to runner-config.yaml (valid_volumes, options, workdir_parent), +# restart the Gitea Act runner on the host for the new config to take effect: +# systemctl restart gitea-runner +# Until restarted, job containers are spawned with the old config and any new bind mounts +# (e.g. /opt/familienarchiv) will not be available inside job steps. ``` ### 3.2 DNS records @@ -284,13 +297,43 @@ docker compose logs --tail=200 ### Observability stack -An observability stack is available via `docker-compose.observability.yml`. Configuration lives under `infra/observability/`. Start it after the main stack is up (which creates `archiv-net`): +An observability stack is available via `docker-compose.observability.yml`. Configuration lives under `infra/observability/`. + +#### Dev — start from the workspace ```bash docker compose up -d # creates archiv-net docker compose -f docker-compose.observability.yml up -d ``` +#### Production — managed from `/opt/familienarchiv/` + +The nightly CI job copies `docker-compose.observability.yml` and `infra/observability/` to `/opt/familienarchiv/` on every run, then starts the stack from there. Bind mounts in the compose file resolve to `/opt/familienarchiv/infra/observability/…` on the host, which survives workspace wipes between CI runs (see ADR-016). + +The obs stack reads secrets from `/opt/familienarchiv/.env` (Docker Compose auto-reads this file when launched from that directory). This file is managed by the operator — CI does **not** write or delete it. 
+ +**Required keys in `/opt/familienarchiv/.env`:** + +| Key | Example / notes | +|---|---| +| `GRAFANA_ADMIN_PASSWORD` | Strong unique password | +| `GLITCHTIP_SECRET_KEY` | `python3 -c "import secrets; print(secrets.token_hex(50))"` | +| `GLITCHTIP_DOMAIN` | `https://glitchtip.archiv.raddatz.cloud` — must match the Caddy vhost | +| `POSTGRES_USER` | Must match the `archiv` user set in `.env.staging` / `.env.production` | +| `POSTGRES_PASSWORD` | Must match the running PostgreSQL container's password | +| `PORT_GRAFANA` | `3003` (default; 3001 is taken by the staging frontend) | +| `PORT_GLITCHTIP` | `3002` | +| `PORT_PROMETHEUS` | `9090` | +| `SENTRY_DSN` | Set after GlitchTip first-run; leave empty to disable | + +**`$$` escaping rule:** passwords that contain a literal `$` must use `$$` in this file so Docker Compose does not expand them as variable references. Example: a password `p@$word` must be written as `p@$$word`. Failure to escape produces a silently truncated password — Grafana or GlitchTip will start but reject logins. + +To start or restart the obs stack manually on the server: + +```bash +docker compose -f /opt/familienarchiv/docker-compose.observability.yml up -d --wait --remove-orphans +``` + Current services: | Service | Image | Purpose | @@ -311,7 +354,7 @@ Current services: | Item | Value | |---|---| -| URL | `http://localhost:3001` (or `http://localhost:$PORT_GRAFANA`) | +| URL | `http://localhost:3003` (or `http://localhost:$PORT_GRAFANA`) | | Username | `admin` | | Password | `$GRAFANA_ADMIN_PASSWORD` (default: `changeme` — **change before exposing to a network**) | @@ -341,7 +384,7 @@ docker exec obs-loki wget -qO- \ **Prefer `compose_service` over `container_name` in LogQL queries** — `container_name` differs between dev (`archive-backend`) and prod (`archiv-production-backend-1`), while `compose_service` is stable (`backend`, `db`, `minio`, etc.). -Prometheus port `9090` and Grafana port `3001` are bound to `127.0.0.1` on the host. 
No other observability ports are host-bound. +Prometheus port `9090` and Grafana port `3003` (default; configurable via `PORT_GRAFANA`) are bound to `127.0.0.1` on the host. No other observability ports are host-bound. #### GlitchTip -- 2.49.1 From dec6b8139b7bb4f62e77574279d2b439db205893 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 23:59:11 +0200 Subject: [PATCH 06/27] docs(c4): update l2-containers obs boundary to show /opt/familienarchiv/ permanent path Co-Authored-By: Claude Sonnet 4.6 --- docs/architecture/c4/l2-containers.puml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index 2122a225..96741ac8 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -17,7 +17,7 @@ System_Boundary(archiv, "Familienarchiv (Docker Compose)") { Container(mc, "Bucket / Service-Account Init", "MinIO Client (mc)", "One-shot container on startup. Idempotent: creates the archive bucket, the archiv-app service account, and attaches the readwrite policy.") } -System_Boundary(observability, "Observability Stack (docker-compose.observability.yml)") { +System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-compose.observability.yml)") { Container(prometheus, "Prometheus", "prom/prometheus:v3.4.0", "Scrapes metrics from backend management port 8081 (/actuator/prometheus), node-exporter, and cAdvisor. 
Retention: 30 days.") Container(node_exporter, "Node Exporter", "prom/node-exporter:v1.9.0", "Host-level CPU, memory, disk, and network metrics.") Container(cadvisor, "cAdvisor", "gcr.io/cadvisor/cadvisor:v0.52.1", "Per-container resource metrics.") -- 2.49.1 From 4e94d85d7e84994785a8a7957bd3e6f192335a7b Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:00:07 +0200 Subject: [PATCH 07/27] docs(adr): add ADR-016 for obs stack co-location and CI-push config sync Co-Authored-By: Claude Sonnet 4.6 --- docs/adr/016-obs-stack-co-location-ci-push.md | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 docs/adr/016-obs-stack-co-location-ci-push.md diff --git a/docs/adr/016-obs-stack-co-location-ci-push.md b/docs/adr/016-obs-stack-co-location-ci-push.md new file mode 100644 index 00000000..839d08f6 --- /dev/null +++ b/docs/adr/016-obs-stack-co-location-ci-push.md @@ -0,0 +1,52 @@ +# ADR-016: Observability stack co-location at `/opt/familienarchiv/` with CI-push config sync + +## Status + +Accepted + +## Context + +Issue #601 established that the observability stack must survive Gitea CI workspace wipes between nightly runs. When the nightly job completes, act_runner deletes the job workspace. Any Docker container that bind-mounts a config file from a workspace path (`/srv/gitea-workspace/…/infra/observability/prometheus/prometheus.yml`) then references a path that no longer exists on the host. On the next nightly run, Docker Compose either auto-creates an empty directory in its place (causing the container to fail to start because a file mount receives a directory) or finds a stale file from a previous run if the workspace happened to land at the same path. + +ADR-015 solved the workspace bind-mount resolution problem: job workspaces are stored at `/srv/gitea-workspace` so `$(pwd)` inside the job container maps to a real host path. 
But it did not address persistence: the workspace is still wiped after the job, so bind mounts from workspace-relative paths remain fragile across runs. + +### Decision drivers + +1. Bind-mount sources must point to a host path that persists indefinitely, not to a path that disappears after each CI run. +2. Config files must reflect the committed state of the repo after every nightly run (no manual sync steps). +3. Secrets must not be written to the workspace or to any path managed by CI; they must survive independently of deployments. +4. The solution must not introduce new infrastructure dependencies (no SSH access from CI, no external registry, no additional server-side daemon). + +### Alternatives considered + +**A: Server-pull model** — a systemd timer or cron job on the server does `git pull` from the repo into `/opt/familienarchiv/` and then runs `docker compose up`. Rejected because: (1) requires git credentials on the server and a registered deploy key, (2) adds a second deployment mechanism that diverges from the CI-push model used for the main app stack, (3) timing coupling — the server pull must complete before CI's health checks run, requiring polling or a webhook. + +**B: Separate directory (e.g. `/opt/obs/`)** — keeps obs configs isolated from the app stack. Rejected because: (1) the main app compose files are already in `/opt/familienarchiv/` (managed the same way), and (2) GlitchTip shares the `archive-db` PostgreSQL instance and `archiv-net` Docker network — it is architecturally part of the same deployment unit, not a separate one. Co-location reflects the actual coupling. + +**C: Named Docker configs (Swarm)** — Docker Swarm supports first-class config objects that persist in the cluster. Rejected because the project does not use Swarm and introducing it solely for config persistence is a disproportionate dependency. 
+ +## Decision + +The observability stack is co-located with the main application deployment at `/opt/familienarchiv/`: + +- `docker-compose.observability.yml` → `/opt/familienarchiv/docker-compose.observability.yml` +- `infra/observability/` → `/opt/familienarchiv/infra/observability/` + +The nightly CI job (`nightly.yml`) copies these files from the workspace checkout to `/opt/familienarchiv/` using `cp -r` on every run (CI-push model). Containers always read config from the permanent location; a workspace wipe has no effect on running containers. + +Secrets are stored in `/opt/familienarchiv/.env` on the server. This file is managed by the operator — CI does not write or delete it. Docker Compose auto-reads it when started from `/opt/familienarchiv/`. The required key inventory is documented in `docs/DEPLOYMENT.md §4`. + +The CI runner mounts `/opt/familienarchiv` as a bind mount into job containers (see `runner-config.yaml`). This requires a one-time `mkdir -p /opt/familienarchiv/infra` on the server and a runner restart after updating `runner-config.yaml` (see ADR-015 and `docs/DEPLOYMENT.md §3.1`). + +## Consequences + +**Positive:** +- Bind-mount sources survive workspace wipes by definition — they are on a persistent host path. +- Config is always in sync with the repo after each nightly run. +- No new infrastructure dependencies; the CI-push model mirrors how the main app stack is deployed. +- Secrets (`/opt/familienarchiv/.env`) are decoupled from CI — a deployment cannot accidentally overwrite them. + +**Negative:** +- `cp -r` does not remove deleted files; a config file removed from the repo persists in `/opt/familienarchiv/infra/observability/` until manually deleted. Acceptable for this project's change frequency. A `rsync -a --delete` would give a clean mirror if this becomes a problem. 
+- Mounting `/opt/familienarchiv/` into CI job containers expands the blast radius of a compromised workflow step — a malicious step could overwrite app compose files and Caddy config. Acceptable because the runner is single-tenant (trusted code only). See `runner-config.yaml` security comment. +- Runner must be restarted (`systemctl restart gitea-runner`) after any change to `runner-config.yaml` for the new mount to take effect. -- 2.49.1 From c7d2eeb3f05938f8c873b34ab43e1aad3a778588 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:00:44 +0200 Subject: [PATCH 08/27] docs(ci): harden runner-config.yaml security comment for /opt/familienarchiv/ write access Co-Authored-By: Claude Sonnet 4.6 --- runner-config.yaml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/runner-config.yaml b/runner-config.yaml index 2df22cf3..07cad8d5 100644 --- a/runner-config.yaml +++ b/runner-config.yaml @@ -17,12 +17,16 @@ container: - "/srv/gitea-workspace" - "/opt/familienarchiv" # appended to `docker run` when the runner spawns a job container - # SECURITY: Mounting the Docker socket grants job containers root-equivalent - # access to the host Docker daemon. Acceptable here because only trusted code - # from this private repo runs on this runner. Do NOT use on a runner that - # accepts untrusted PRs from external contributors. - # /opt/familienarchiv is mounted so the nightly job can deploy observability - # configs to the permanent location without needing ssh or nsenter. + # SECURITY WARNING: This mount configuration grants CI job containers: + # 1. Root-equivalent access to the host Docker daemon (via the socket). + # 2. Read/write access to /opt/familienarchiv/ — including the main app's + # compose files, Caddy config, and observability configs. A malicious + # workflow step could overwrite any file in that directory. 
+ # Both are acceptable ONLY because this runner is single-tenant: it executes + # code exclusively from this private repo with a fixed set of trusted authors. + # WARNING: Do NOT add this runner to any repo with external contributors or + # untrusted PRs — the blast radius includes the entire production deployment. + # See ADR-016 for the reasoning behind the /opt/familienarchiv mount. options: "-v /var/run/docker.sock:/var/run/docker.sock -v /srv/gitea-workspace:/srv/gitea-workspace -v /opt/familienarchiv:/opt/familienarchiv" # keep network mode default (bridge) — Testcontainers handles its own networking force_pull: false -- 2.49.1 From df37113d382a3f446e23c0c1f6a2eaf764d8017e Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:01:17 +0200 Subject: [PATCH 09/27] ci(obs): add compose config dry-run before obs stack up to catch .env substitution errors Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 0764bc03..97643ee3 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -143,6 +143,16 @@ jobs: cp -r infra/observability /opt/familienarchiv/infra/ cp docker-compose.observability.yml /opt/familienarchiv/ + - name: Validate observability compose config + # Dry-run: renders the compose file, resolving variable substitutions from + # /opt/familienarchiv/.env, and fails on compose syntax or schema errors before + # containers start. Unset variables (e.g. a password truncated by missing $$ + # escaping) only produce warnings; mounted service configs are not parsed. + run: | + docker compose \ + -f /opt/familienarchiv/docker-compose.observability.yml \ + config --quiet + - name: Start observability stack # Runs from /opt/familienarchiv/ so bind mounts resolve to stable # host paths that survive workspace wipes between nightly runs. 
-- 2.49.1 From 79735e23e08594f50456533b8f646736f028346d Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:01:48 +0200 Subject: [PATCH 10/27] ci(obs): assert obs-loki/prometheus/grafana/tempo are healthy after stack up Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 97643ee3..d4af264d 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -162,6 +162,25 @@ jobs: -f /opt/familienarchiv/docker-compose.observability.yml \ up -d --wait --remove-orphans + - name: Assert observability stack health + # docker compose up --wait covers services WITH healthcheck directives only. + # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have + # no healthcheck — they are considered "started" as soon as the process runs. + # This step explicitly asserts the four healthchecked critical services are + # healthy before the smoke test proceeds. + run: | + set -e + unhealthy="" + for svc in obs-loki obs-prometheus obs-grafana obs-tempo; do + status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing") + if [ "$status" != "healthy" ]; then + echo "::error::$svc is not healthy (status: $status)" + unhealthy="$unhealthy $svc" + fi + done + [ -z "$unhealthy" ] || exit 1 + echo "All critical observability services are healthy" + - name: Reload Caddy # Apply any committed Caddyfile changes before smoke-testing the # public surface. 
Without this step, a Caddyfile edit lands in the -- 2.49.1 From b67bd201b2f18e366bf57c436e79bf8d4a775576 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:17:07 +0200 Subject: [PATCH 11/27] feat(obs): add obs.env with non-secret config tracked in git Co-Authored-By: Claude Sonnet 4.6 --- infra/observability/obs.env | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 infra/observability/obs.env diff --git a/infra/observability/obs.env b/infra/observability/obs.env new file mode 100644 index 00000000..8200b235 --- /dev/null +++ b/infra/observability/obs.env @@ -0,0 +1,20 @@ +# Non-secret observability stack configuration — tracked in git. +# Secret values (passwords, keys) are injected by CI from Gitea secrets +# into /opt/familienarchiv/obs-secrets.env at deploy time. +# +# For local dev the main .env file supplies these values instead; +# this file is only used in the CI/production path. + +# Host ports (all bound to 127.0.0.1 — Caddy is the external entry point) +PORT_GRAFANA=3003 +PORT_GLITCHTIP=3002 +PORT_PROMETHEUS=9090 + +# Public URLs — used for internal redirects, alert email links, OAuth callbacks +GF_SERVER_ROOT_URL=https://grafana.archiv.raddatz.cloud +GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud + +# PostgreSQL hostname for GlitchTip db-init and workers. +# archive-db is the production default (full stack running). +# Override in obs-secrets.env when a different stack is active. 
+POSTGRES_HOST=archive-db -- 2.49.1 From f9baf02b8668934f71699d072120b61ea39cec1c Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:17:47 +0200 Subject: [PATCH 12/27] feat(obs): add GF_SERVER_ROOT_URL to Grafana service Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.observability.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml index 7c30bcca..ad05213c 100644 --- a/docker-compose.observability.yml +++ b/docker-compose.observability.yml @@ -146,6 +146,7 @@ services: environment: GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme} GF_USERS_ALLOW_SIGN_UP: "false" + GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL:-http://localhost:3003} volumes: - grafana_data:/var/lib/grafana - ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro -- 2.49.1 From c5139851b8c87c633ceecb725ee5793eca0c1213 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:18:38 +0200 Subject: [PATCH 13/27] =?UTF-8?q?ci(obs):=20GitOps=20obs=20env=20split=20i?= =?UTF-8?q?n=20nightly=20=E2=80=94=20obs.env=20in=20git,=20secrets=20fresh?= =?UTF-8?q?=20from=20Gitea?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 37 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index d4af264d..ce45d991 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -78,12 +78,6 @@ jobs: APP_MAIL_FROM=noreply@staging.raddatz.cloud IMPORT_HOST_DIR=/srv/familienarchiv-staging/import POSTGRES_USER=archiv - PORT_GRAFANA=3003 - PORT_GLITCHTIP=3002 - PORT_PROMETHEUS=9090 - GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} - GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} - GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud SENTRY_DSN=${{ 
secrets.SENTRY_DSN }} EOF @@ -136,30 +130,43 @@ jobs: # into /opt/familienarchiv/ — the permanent location that persists # between CI runs. Containers started in the next step bind-mount # from there, so a future workspace wipe cannot corrupt a running - # config file. Secrets are read from /opt/familienarchiv/.env (managed - # separately on the server; not written or deleted by CI). + # config file. + # + # obs-secrets.env is written fresh from Gitea secrets on every run so + # Gitea is always the single source of truth for secret rotation. + # Non-secret config lives in infra/observability/obs.env (tracked in git). run: | mkdir -p /opt/familienarchiv/infra cp -r infra/observability /opt/familienarchiv/infra/ cp docker-compose.observability.yml /opt/familienarchiv/ + cat > /opt/familienarchiv/obs-secrets.env < Date: Sat, 16 May 2026 00:19:12 +0200 Subject: [PATCH 14/27] =?UTF-8?q?ci(obs):=20GitOps=20obs=20env=20split=20i?= =?UTF-8?q?n=20release=20=E2=80=94=20deploy=20to=20/opt/familienarchiv/,?= =?UTF-8?q?=20secrets=20fresh=20from=20Gitea?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/release.yml | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 2645dc15..bd22cb63 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -76,12 +76,6 @@ jobs: APP_MAIL_FROM=noreply@raddatz.cloud IMPORT_HOST_DIR=/srv/familienarchiv-production/import POSTGRES_USER=archiv - PORT_GRAFANA=3003 - PORT_GLITCHTIP=3002 - PORT_PROMETHEUS=9090 - GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} - GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} - GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud SENTRY_DSN=${{ secrets.SENTRY_DSN }} EOF @@ -104,11 +98,28 @@ jobs: --env-file .env.production \ up -d --wait --remove-orphans + - name: Deploy 
observability configs + # Mirrors the nightly approach: copies obs compose file and config tree + # to /opt/familienarchiv/ (permanent path, survives workspace wipes — ADR-016), + # then writes obs-secrets.env fresh from Gitea secrets. + # Non-secret config lives in infra/observability/obs.env (tracked in git). + run: | + mkdir -p /opt/familienarchiv/infra + cp -r infra/observability /opt/familienarchiv/infra/ + cp docker-compose.observability.yml /opt/familienarchiv/ + cat > /opt/familienarchiv/obs-secrets.env < Date: Sat, 16 May 2026 00:20:21 +0200 Subject: [PATCH 15/27] docs(obs): document obs vs main stack env model, obs.env + obs-secrets.env approach Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 56 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 6f6c2e52..333a36e0 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -306,32 +306,54 @@ docker compose up -d # creates archiv-net docker compose -f docker-compose.observability.yml up -d ``` +#### Why the obs stack is managed differently from the main app stack + +The main app stack (`docker-compose.prod.yml`) has no config-file bind mounts — its containers read config from env vars and image defaults. The workspace is wiped after each CI run but that does not affect running containers, because they hold no references to workspace paths. + +The obs stack is different: `prometheus.yml`, `tempo.yml`, Loki config, Grafana provisioning files, and Promtail config are all bind-mounted from the host filesystem into their containers. If those source paths disappear (workspace wipe), the containers can restart fine until a `docker compose up` is run again — at that point Docker tries to re-resolve the bind-mount source and fails because the workspace path no longer exists. 
+ +The fix is to keep the obs compose file and config tree at a **permanent path** that CI copies to on every run but which survives between runs: `/opt/familienarchiv/` (see ADR-016). + #### Production — managed from `/opt/familienarchiv/` -The nightly CI job copies `docker-compose.observability.yml` and `infra/observability/` to `/opt/familienarchiv/` on every run, then starts the stack from there. Bind mounts in the compose file resolve to `/opt/familienarchiv/infra/observability/…` on the host, which survives workspace wipes between CI runs (see ADR-016). +Every CI run (nightly + release) copies `docker-compose.observability.yml` and `infra/observability/` to `/opt/familienarchiv/` before starting the stack. Bind mounts then resolve to `/opt/familienarchiv/infra/observability/…` — a stable path that outlasts any workspace wipe. -The obs stack reads secrets from `/opt/familienarchiv/.env` (Docker Compose auto-reads this file when launched from that directory). This file is managed by the operator — CI does **not** write or delete it. +**Environment variables** follow the same two-source model as the main stack: -**Required keys in `/opt/familienarchiv/.env`:** +| Source | What it contains | Managed by | +|---|---|---| +| `infra/observability/obs.env` | All non-secret config (ports, URLs, hostnames) | Git — reviewed in PRs | +| `/opt/familienarchiv/obs-secrets.env` | Passwords and secret keys only | CI — written fresh from Gitea secrets on every deploy | -| Key | Example / notes | +Both files are passed explicitly via `--env-file` to the compose command, so there is no implicit auto-read `.env` and no operator-managed file to keep in sync. 
+ +**Non-secret config** (`infra/observability/obs.env`): + +| Key | Value | Notes | +|---|---|---| +| `PORT_GRAFANA` | `3003` | Avoids collision with staging frontend on port 3001 | +| `PORT_GLITCHTIP` | `3002` | | +| `PORT_PROMETHEUS` | `9090` | | +| `GF_SERVER_ROOT_URL` | `https://grafana.archiv.raddatz.cloud` | Required for alert email links and OAuth redirects | +| `GLITCHTIP_DOMAIN` | `https://glitchtip.archiv.raddatz.cloud` | Must match the Caddy vhost | +| `POSTGRES_HOST` | `archive-db` | Override if only the staging stack is running | + +**Secret keys** (set in Gitea secrets, injected by CI into `obs-secrets.env`): + +| Gitea secret | Notes | |---|---| -| `GRAFANA_ADMIN_PASSWORD` | Strong unique password | -| `GLITCHTIP_SECRET_KEY` | `python3 -c "import secrets; print(secrets.token_hex(32))"` | -| `GLITCHTIP_DOMAIN` | `https://glitchtip.archiv.raddatz.cloud` — must match the Caddy vhost | -| `POSTGRES_USER` | Must match the `archiv` user set in `.env.staging` / `.env.production` | -| `POSTGRES_PASSWORD` | Must match the running PostgreSQL container's password | -| `PORT_GRAFANA` | `3003` (staging default; 3001 was used by staging frontend) | -| `PORT_GLITCHTIP` | `3002` | -| `PORT_PROMETHEUS` | `9090` | -| `SENTRY_DSN` | Set after GlitchTip first-run; leave empty to disable | +| `GRAFANA_ADMIN_PASSWORD` | Strong unique password; shared by nightly and release | +| `GLITCHTIP_SECRET_KEY` | `openssl rand -hex 32`; shared by nightly and release | +| `STAGING_POSTGRES_PASSWORD` / `PROD_POSTGRES_PASSWORD` | Must match the running PostgreSQL container | -**`$$` escaping rule:** passwords that contain a literal `$` must use `$$` in this file so Docker Compose does not expand them as variable references. Example: a password `p@$word` must be written as `p@$$word`. Failure to escape produces a silently truncated password — Grafana or GlitchTip will start but reject logins. 
- -To start or restart the obs stack manually on the server: +To start or restart the obs stack manually on the server (after CI has run at least once): ```bash -docker compose -f /opt/familienarchiv/docker-compose.observability.yml up -d --wait --remove-orphans +docker compose \ + -f /opt/familienarchiv/docker-compose.observability.yml \ + --env-file /opt/familienarchiv/infra/observability/obs.env \ + --env-file /opt/familienarchiv/obs-secrets.env \ + up -d --wait --remove-orphans ``` Current services: -- 2.49.1 From 53cf1837b202527fe2f2b2ad45418c0967eb0083 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 00:21:53 +0200 Subject: [PATCH 16/27] =?UTF-8?q?fix(obs):=20set=20POSTGRES=5FHOST=20per?= =?UTF-8?q?=20environment=20=E2=80=94=20staging/prod=20use=20compose=20aut?= =?UTF-8?q?o-names=20not=20archive-db?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 1 + .gitea/workflows/release.yml | 1 + infra/observability/obs.env | 8 +++++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index ce45d991..50219afc 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -144,6 +144,7 @@ jobs: GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} POSTGRES_USER=archiv POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} + POSTGRES_HOST=archiv-staging-db-1 EOF - name: Validate observability compose config diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index bd22cb63..48d61147 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -112,6 +112,7 @@ jobs: GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} POSTGRES_USER=archiv POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }} + POSTGRES_HOST=archiv-production-db-1 EOF - name: Start observability stack diff --git a/infra/observability/obs.env 
b/infra/observability/obs.env index 8200b235..ace1da3a 100644 --- a/infra/observability/obs.env +++ b/infra/observability/obs.env @@ -15,6 +15,8 @@ GF_SERVER_ROOT_URL=https://grafana.archiv.raddatz.cloud GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud # PostgreSQL hostname for GlitchTip db-init and workers. -# archive-db is the production default (full stack running). -# Override in obs-secrets.env when a different stack is active. -POSTGRES_HOST=archive-db +# The actual value depends on the Compose project name — it is not a fixed string. +# CI sets POSTGRES_HOST in obs-secrets.env per environment: +# staging: archiv-staging-db-1 (project archiv-staging + service db) +# production: archiv-production-db-1 (project archiv-production + service db) +# For local dev, set POSTGRES_HOST in your .env file (defaults to archive-db there). -- 2.49.1 From 4c5ee96e36772d5be56fa5e6fb9357333672c85a Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 08:52:42 +0200 Subject: [PATCH 17/27] docs(adr): correct ADR-016 Decision section to match two-source env model The Decision section described an operator-managed /opt/familienarchiv/.env that CI does not touch. The actual implementation is a two-source model: obs.env (git-tracked, non-secret config) + obs-secrets.env (CI-written fresh from Gitea secrets on every deploy). Also updates the Consequences bullet that incorrectly stated secrets are decoupled from CI. 
Co-Authored-By: Claude Sonnet 4.6 --- docs/adr/016-obs-stack-co-location-ci-push.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/adr/016-obs-stack-co-location-ci-push.md b/docs/adr/016-obs-stack-co-location-ci-push.md index 839d08f6..0e181597 100644 --- a/docs/adr/016-obs-stack-co-location-ci-push.md +++ b/docs/adr/016-obs-stack-co-location-ci-push.md @@ -32,9 +32,14 @@ The observability stack is co-located with the main application deployment at `/ - `docker-compose.observability.yml` → `/opt/familienarchiv/docker-compose.observability.yml` - `infra/observability/` → `/opt/familienarchiv/infra/observability/` -The nightly CI job (`nightly.yml`) copies these files from the workspace checkout to `/opt/familienarchiv/` using `cp -r` on every run (CI-push model). Containers always read config from the permanent location; a workspace wipe has no effect on running containers. +Both the nightly CI job (`nightly.yml`) and the release job (`release.yml`) copy these files from the workspace checkout to `/opt/familienarchiv/` using `cp -r` on every run (CI-push model). Containers always read config from the permanent location; a workspace wipe has no effect on running containers. -Secrets are stored in `/opt/familienarchiv/.env` on the server. This file is managed by the operator — CI does not write or delete it. Docker Compose auto-reads it when started from `/opt/familienarchiv/`. The required key inventory is documented in `docs/DEPLOYMENT.md §4`. +Environment variables follow a two-source model: + +- `infra/observability/obs.env` (git-tracked, non-secret): all non-sensitive config — host ports, public URLs (`GLITCHTIP_DOMAIN`, `GF_SERVER_ROOT_URL`), and the default `POSTGRES_HOST`. Changes go through PR review. No credentials. 
+- `/opt/familienarchiv/obs-secrets.env` (CI-written, per-deploy): passwords and secret keys only (`GRAFANA_ADMIN_PASSWORD`, `GLITCHTIP_SECRET_KEY`, `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_HOST`), injected fresh from Gitea secrets on every nightly and release deploy. Gitea is the single source of truth for secrets — rotating a secret takes effect on the next deploy without manual server action. + +Both files are passed explicitly via `--env-file` to every obs compose command (config dry-run and `up`). There is no implicit auto-read `.env`. The required key inventory is documented in `docs/DEPLOYMENT.md §4`. The CI runner mounts `/opt/familienarchiv` as a bind mount into job containers (see `runner-config.yaml`). This requires a one-time `mkdir -p /opt/familienarchiv/infra` on the server and a runner restart after updating `runner-config.yaml` (see ADR-015 and `docs/DEPLOYMENT.md §3.1`). @@ -44,7 +49,7 @@ The CI runner mounts `/opt/familienarchiv` as a bind mount into job containers ( - Bind-mount sources survive workspace wipes by definition — they are on a persistent host path. - Config is always in sync with the repo after each nightly run. - No new infrastructure dependencies; the CI-push model mirrors how the main app stack is deployed. -- Secrets (`/opt/familienarchiv/.env`) are decoupled from CI — a deployment cannot accidentally overwrite them. +- Secret rotation requires no manual server action — Gitea secrets are the authoritative store; `obs-secrets.env` is rewritten from scratch on every deploy so a secret change takes effect on the next nightly or release run. **Negative:** - `cp -r` does not remove deleted files; a config file removed from the repo persists in `/opt/familienarchiv/infra/observability/` until manually deleted. Acceptable for this project's change frequency. A `rsync -a --delete` would give a clean mirror if this becomes a problem. 
-- 2.49.1 From f628ab643577b37b3a7bb6f243b11407ea85814f Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 08:53:18 +0200 Subject: [PATCH 18/27] ci(obs): add validate + health assertion steps to release.yml nightly.yml had two observability gates that release.yml lacked: - "Validate observability compose config" (docker compose config --quiet) catches missing env vars and YAML errors before any containers start - "Assert observability stack health" checks obs-loki/prometheus/grafana/tempo are healthy after up --wait, covering services without healthcheck directives Mirrors the nightly.yml steps verbatim so the production deploy path is at least as well-verified as the nightly staging path. Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/release.yml | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 48d61147..5e4d12a2 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -115,7 +115,24 @@ jobs: POSTGRES_HOST=archiv-production-db-1 EOF + - name: Validate observability compose config + # Dry-run: resolves all variable substitutions and reports any missing + # required keys before containers start. Catches undefined variables and + # YAML errors in config files updated by the previous step. + # Keep in sync with the equivalent step in nightly.yml. + run: | + docker compose \ + -f /opt/familienarchiv/docker-compose.observability.yml \ + --env-file /opt/familienarchiv/infra/observability/obs.env \ + --env-file /opt/familienarchiv/obs-secrets.env \ + config --quiet + - name: Start observability stack + # Runs with absolute paths so bind mounts resolve to stable host paths + # that survive workspace wipes between runs (see ADR-016). + # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env + # (written fresh from Gitea secrets above). + # Keep in sync with the equivalent step in nightly.yml. 
run: | docker compose \ -f /opt/familienarchiv/docker-compose.observability.yml \ @@ -123,6 +140,26 @@ jobs: --env-file /opt/familienarchiv/obs-secrets.env \ up -d --wait --remove-orphans + - name: Assert observability stack health + # docker compose up --wait covers services WITH healthcheck directives only. + # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have + # no healthcheck — they are considered "started" as soon as the process runs. + # This step explicitly asserts the four healthchecked critical services are + # healthy before the smoke test proceeds. + # Keep in sync with the equivalent step in nightly.yml. + run: | + set -e + unhealthy="" + for svc in obs-loki obs-prometheus obs-grafana obs-tempo; do + status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing") + if [ "$status" != "healthy" ]; then + echo "::error::$svc is not healthy (status: $status)" + unhealthy="$unhealthy $svc" + fi + done + [ -z "$unhealthy" ] || exit 1 + echo "All critical observability services are healthy" + - name: Reload Caddy # See nightly.yml — same rationale and mechanism: DooD job containers # cannot call systemctl directly; nsenter via a privileged sibling -- 2.49.1 From dec0001bd10ac4d8cecec621ed2ae6d081219cbb Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 08:53:49 +0200 Subject: [PATCH 19/27] ci(obs): chmod 600 obs-secrets.env after creation in both workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The heredoc creates the file with default umask permissions (644 — world-readable). Setting 600 immediately after creation prevents other processes on the host from reading the Grafana, GlitchTip, and Postgres credentials. Defence-in-depth for the single-tenant VPS. 
Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 1 + .gitea/workflows/release.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 50219afc..4ed4bb23 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -146,6 +146,7 @@ jobs: POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} POSTGRES_HOST=archiv-staging-db-1 EOF + chmod 600 /opt/familienarchiv/obs-secrets.env - name: Validate observability compose config # Dry-run: resolves all variable substitutions and reports any missing diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 5e4d12a2..a45f48a7 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -114,6 +114,7 @@ jobs: POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }} POSTGRES_HOST=archiv-production-db-1 EOF + chmod 600 /opt/familienarchiv/obs-secrets.env - name: Validate observability compose config # Dry-run: resolves all variable substitutions and reports any missing -- 2.49.1 From f5c7be932b55f093046494c41cb42aa267e8fcff Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 08:54:17 +0200 Subject: [PATCH 20/27] ci(obs): document POSTGRES_HOST derivation from Compose project name The container names archiv-staging-db-1 and archiv-production-db-1 are derived from the Compose project name + service name. A project rename silently breaks the obs stack DB connection. Add a comment at the point of definition so the dependency is obvious when someone changes it. 
Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 2 ++ .gitea/workflows/release.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 4ed4bb23..0d706d0b 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -146,6 +146,8 @@ jobs: POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} POSTGRES_HOST=archiv-staging-db-1 EOF + # Note: POSTGRES_HOST is derived from the Compose project name (archiv-staging) + # and service name (db). A project rename requires updating this value. chmod 600 /opt/familienarchiv/obs-secrets.env - name: Validate observability compose config diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index a45f48a7..7b34728e 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -114,6 +114,8 @@ jobs: POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }} POSTGRES_HOST=archiv-production-db-1 EOF + # Note: POSTGRES_HOST is derived from the Compose project name (archiv-production) + # and service name (db). A project rename requires updating this value. chmod 600 /opt/familienarchiv/obs-secrets.env - name: Validate observability compose config -- 2.49.1 From 9662ff5f8cdd82899a57bdd4f119033c0396525b Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 09:03:46 +0200 Subject: [PATCH 21/27] ci(obs): quote heredoc delimiter in nightly obs-secrets.env write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents shell from expanding '$' in Gitea-rendered secret values. Without the quote, a password like 'P@$s5w0rd' has '$s5w0rd' silently expanded to '' — writing a truncated value to obs-secrets.env. '<<'EOF'' suppresses shell expansion; Gitea's '${{ }}' template rendering already ran before the shell sees the script. 
Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 0d706d0b..ab0ee276 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -139,7 +139,7 @@ jobs: mkdir -p /opt/familienarchiv/infra cp -r infra/observability /opt/familienarchiv/infra/ cp docker-compose.observability.yml /opt/familienarchiv/ - cat > /opt/familienarchiv/obs-secrets.env <<EOF + cat > /opt/familienarchiv/obs-secrets.env <<'EOF' GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} POSTGRES_USER=archiv -- 2.49.1 From 25062be657e8d86f4a9427dd89557eb7dc69016b Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 09:04:12 +0200 Subject: [PATCH 22/27] ci(obs): quote heredoc delimiter in release obs-secrets.env write Same fix as nightly.yml: prevents shell expansion of '$' in secret values after Gitea renders them. Keep in sync with nightly.yml.
Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 7b34728e..b180b667 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -107,7 +107,7 @@ jobs: mkdir -p /opt/familienarchiv/infra cp -r infra/observability /opt/familienarchiv/infra/ cp docker-compose.observability.yml /opt/familienarchiv/ - cat > /opt/familienarchiv/obs-secrets.env <<EOF + cat > /opt/familienarchiv/obs-secrets.env <<'EOF' GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} POSTGRES_USER=archiv -- 2.49.1 From a7f60ebed8ad19473aa948ab8c5e379ccc89739c Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 09:04:41 +0200 Subject: [PATCH 23/27] docs(obs): add cp-r stale-file cleanup note to DEPLOYMENT.md CI uses 'cp -r' which does not remove deleted files. Documents the manual cleanup step for config files removed from git. Co-Authored-By: Claude Sonnet 4.6 --- docs/DEPLOYMENT.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 333a36e0..159beb7d 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -356,6 +356,11 @@ docker compose \ up -d --wait --remove-orphans ``` +> **Note:** `cp -r` does not remove deleted files.
If a config file is removed from the repo, its stale copy persists at `/opt/familienarchiv/infra/observability/` until manually deleted: +> ```bash +> rm /opt/familienarchiv/infra/observability/<path-to-removed-file> +> ``` + Current services: | Service | Image | Purpose | -- 2.49.1 From 6720a5aeb2b85d9fb895daecb0aadc4f4762e0d3 Mon Sep 17 00:00:00 2001 From: Marcel Date: Sat, 16 May 2026 09:20:08 +0200 Subject: [PATCH 24/27] chore(obs): improve deploy maintainability from review feedback - Move POSTGRES_USER to obs.env (non-secret, constant across envs) - Replace cp -r with rsync -a --delete so removed config files are purged from /opt/familienarchiv on next deploy instead of lingering - Document --env-file ordering contract in validate + start steps: obs.env first (defaults), obs-secrets.env second (wins on dupes) Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/nightly.yml | 11 +++++++---- .gitea/workflows/release.yml | 11 +++++++---- infra/observability/obs.env | 2 ++ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index ab0ee276..63ec0b03 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -136,13 +136,12 @@ jobs: # Gitea is always the single source of truth for secret rotation. # Non-secret config lives in infra/observability/obs.env (tracked in git).
run: | - mkdir -p /opt/familienarchiv/infra - cp -r infra/observability /opt/familienarchiv/infra/ + mkdir -p /opt/familienarchiv/infra/observability + rsync -a --delete infra/observability/ /opt/familienarchiv/infra/observability/ cp docker-compose.observability.yml /opt/familienarchiv/ cat > /opt/familienarchiv/obs-secrets.env <<'EOF' GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} - POSTGRES_USER=archiv POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} POSTGRES_HOST=archiv-staging-db-1 EOF @@ -154,6 +153,9 @@ jobs: # Dry-run: resolves all variable substitutions and reports any missing # required keys before containers start. Catches undefined variables and # YAML errors in config files updated by the previous step. + # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env + # second (CI-written secrets). Later files win on duplicate keys, so + # obs-secrets.env overrides POSTGRES_HOST set in obs.env. run: | docker compose \ -f /opt/familienarchiv/docker-compose.observability.yml \ @@ -165,7 +167,8 @@ jobs: # Runs with absolute paths so bind mounts resolve to stable host paths # that survive workspace wipes between nightly runs (see ADR-016). # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env - # (written fresh from Gitea secrets above). + # (written fresh from Gitea secrets above). --env-file order: obs.env first, + # obs-secrets.env second — later file wins on duplicate keys. run: | docker compose \ -f /opt/familienarchiv/docker-compose.observability.yml \ diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index b180b667..041ffa09 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -104,13 +104,12 @@ jobs: # then writes obs-secrets.env fresh from Gitea secrets. # Non-secret config lives in infra/observability/obs.env (tracked in git). 
run: | - mkdir -p /opt/familienarchiv/infra - cp -r infra/observability /opt/familienarchiv/infra/ + mkdir -p /opt/familienarchiv/infra/observability + rsync -a --delete infra/observability/ /opt/familienarchiv/infra/observability/ cp docker-compose.observability.yml /opt/familienarchiv/ cat > /opt/familienarchiv/obs-secrets.env <<'EOF' GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }} GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }} - POSTGRES_USER=archiv POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }} POSTGRES_HOST=archiv-production-db-1 EOF @@ -122,6 +121,9 @@ jobs: # Dry-run: resolves all variable substitutions and reports any missing # required keys before containers start. Catches undefined variables and # YAML errors in config files updated by the previous step. + # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env + # second (CI-written secrets). Later files win on duplicate keys, so + # obs-secrets.env overrides POSTGRES_HOST set in obs.env. # Keep in sync with the equivalent step in nightly.yml. run: | docker compose \ @@ -134,7 +136,8 @@ jobs: # Runs with absolute paths so bind mounts resolve to stable host paths # that survive workspace wipes between runs (see ADR-016). # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env - # (written fresh from Gitea secrets above). + # (written fresh from Gitea secrets above). --env-file order: obs.env first, + # obs-secrets.env second — later file wins on duplicate keys. # Keep in sync with the equivalent step in nightly.yml. run: | docker compose \ diff --git a/infra/observability/obs.env b/infra/observability/obs.env index ace1da3a..1c46a8fe 100644 --- a/infra/observability/obs.env +++ b/infra/observability/obs.env @@ -14,6 +14,8 @@ PORT_PROMETHEUS=9090 GF_SERVER_ROOT_URL=https://grafana.archiv.raddatz.cloud GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud +POSTGRES_USER=archiv + # PostgreSQL hostname for GlitchTip db-init and workers. 
# The actual value depends on the Compose project name — it is not a fixed string. # CI sets POSTGRES_HOST in obs-secrets.env per environment: -- 2.49.1 From b194b565f64f2b87ae366a1531d03171f6f40b55 Mon Sep 17 00:00:00 2001 From: marcel Date: Sat, 16 May 2026 09:35:43 +0200 Subject: [PATCH 25/27] ci(obs): reference #603 in keep-in-sync comments; add obs-glitchtip to health assertion --- .gitea/workflows/release.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 041ffa09..e2dcab3e 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -124,7 +124,7 @@ jobs: # --env-file order: obs.env first (git-tracked defaults), obs-secrets.env # second (CI-written secrets). Later files win on duplicate keys, so # obs-secrets.env overrides POSTGRES_HOST set in obs.env. - # Keep in sync with the equivalent step in nightly.yml. + # Keep in sync with the equivalent step in nightly.yml (#603). run: | docker compose \ -f /opt/familienarchiv/docker-compose.observability.yml \ @@ -138,7 +138,7 @@ jobs: # Non-secret config from obs.env (git-tracked); secrets from obs-secrets.env # (written fresh from Gitea secrets above). --env-file order: obs.env first, # obs-secrets.env second — later file wins on duplicate keys. - # Keep in sync with the equivalent step in nightly.yml. + # Keep in sync with the equivalent step in nightly.yml (#603). run: | docker compose \ -f /opt/familienarchiv/docker-compose.observability.yml \ @@ -150,13 +150,13 @@ jobs: # docker compose up --wait covers services WITH healthcheck directives only. # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have # no healthcheck — they are considered "started" as soon as the process runs. - # This step explicitly asserts the four healthchecked critical services are + # This step explicitly asserts the five healthchecked critical services are # healthy before the smoke test proceeds. 
- # Keep in sync with the equivalent step in nightly.yml. + # Keep in sync with the equivalent step in nightly.yml (#603). run: | set -e unhealthy="" - for svc in obs-loki obs-prometheus obs-grafana obs-tempo; do + for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing") if [ "$status" != "healthy" ]; then echo "::error::$svc is not healthy (status: $status)" -- 2.49.1 From 0bb0a314ad621b4628509c1093906558d6ddfb67 Mon Sep 17 00:00:00 2001 From: marcel Date: Sat, 16 May 2026 09:36:37 +0200 Subject: [PATCH 26/27] ci(obs): add obs-glitchtip to health assertion loop (now has /_health/ healthcheck) --- .gitea/workflows/nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/nightly.yml b/.gitea/workflows/nightly.yml index 63ec0b03..488efcfb 100644 --- a/.gitea/workflows/nightly.yml +++ b/.gitea/workflows/nightly.yml @@ -180,12 +180,12 @@ jobs: # docker compose up --wait covers services WITH healthcheck directives only. # obs-promtail, obs-cadvisor, obs-node-exporter, and obs-glitchtip-worker have # no healthcheck — they are considered "started" as soon as the process runs. - # This step explicitly asserts the four healthchecked critical services are + # This step explicitly asserts the five healthchecked critical services are # healthy before the smoke test proceeds. 
run: | set -e unhealthy="" - for svc in obs-loki obs-prometheus obs-grafana obs-tempo; do + for svc in obs-loki obs-prometheus obs-grafana obs-tempo obs-glitchtip; do status=$(docker inspect "$svc" --format '{{.State.Health.Status}}' 2>/dev/null || echo "missing") if [ "$status" != "healthy" ]; then echo "::error::$svc is not healthy (status: $status)" -- 2.49.1 From 3658733003e681b4578c4e58aab5571c9a84ceed Mon Sep 17 00:00:00 2001 From: marcel Date: Sat, 16 May 2026 09:37:17 +0200 Subject: [PATCH 27/27] fix(obs): add GlitchTip healthcheck on /_health/ (port 8080) --- docker-compose.observability.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml index ad05213c..ed60ee66 100644 --- a/docker-compose.observability.yml +++ b/docker-compose.observability.yml @@ -203,6 +203,12 @@ services: GLITCHTIP_MAX_EVENT_LIFE_DAYS: 90 ports: - "127.0.0.1:${PORT_GLITCHTIP:-3002}:8080" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8080/_health/ || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s networks: - archiv-net - obs-net -- 2.49.1