feat(observability): PO Overview Grafana dashboard (#651) #659

Merged
marcel merged 13 commits from feat/issue-651-grafana-po-overview into main 2026-05-22 18:37:03 +02:00
18 changed files with 1082 additions and 0 deletions

View File

@@ -39,6 +39,12 @@ PORT_PROMETHEUS=9090
# Grafana admin password — change this before exposing Grafana beyond localhost
GRAFANA_ADMIN_PASSWORD=changeme
# Password for the read-only grafana_reader PostgreSQL role used by the PO
# Overview dashboard. Consumed by Flyway V68 (to set the role's password) and
# by Grafana's PostgreSQL datasource (to connect). REQUIRED in production —
# generate with: openssl rand -hex 32
GRAFANA_DB_PASSWORD=changeme-generate-with-openssl-rand-hex-32
# GlitchTip domain — production: use https://glitchtip.archiv.raddatz.cloud (must match Caddy vhost)
GLITCHTIP_DOMAIN=http://localhost:3002

View File

@@ -31,6 +31,7 @@ name: nightly
# STAGING_APP_ADMIN_USERNAME
# STAGING_APP_ADMIN_PASSWORD
# GRAFANA_ADMIN_PASSWORD
# GRAFANA_DB_PASSWORD (read-only grafana_reader DB role, issue #651)
# GLITCHTIP_SECRET_KEY
# SENTRY_DSN (set after GlitchTip first-run; empty = Sentry disabled)
@@ -80,6 +81,7 @@ jobs:
POSTGRES_USER=archiv
SENTRY_DSN=${{ secrets.SENTRY_DSN }}
VITE_SENTRY_DSN=${{ secrets.VITE_SENTRY_DSN }}
GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
EOF
- name: Verify backend /import:ro mount is wired
@@ -143,6 +145,7 @@ jobs:
cp docker-compose.observability.yml /opt/familienarchiv/
cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
POSTGRES_HOST=archiv-staging-db-1

View File

@@ -35,6 +35,7 @@ name: release
# MAIL_USERNAME
# MAIL_PASSWORD
# GRAFANA_ADMIN_PASSWORD
# GRAFANA_DB_PASSWORD (read-only grafana_reader DB role, issue #651)
# GLITCHTIP_SECRET_KEY
# SENTRY_DSN (set after GlitchTip first-run; empty = Sentry disabled)
@@ -77,6 +78,7 @@ jobs:
IMPORT_HOST_DIR=/srv/familienarchiv-production/import
POSTGRES_USER=archiv
SENTRY_DSN=${{ secrets.SENTRY_DSN }}
GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
EOF
- name: Build images
@@ -110,6 +112,7 @@ jobs:
cp docker-compose.observability.yml /opt/familienarchiv/
cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
POSTGRES_HOST=archiv-production-db-1

View File

@@ -5,8 +5,10 @@ import lombok.extern.slf4j.Slf4j;
import org.flywaydb.core.Flyway;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.env.Environment;
import javax.sql.DataSource;
import java.util.Map;
@Configuration
@RequiredArgsConstructor
@@ -14,6 +16,7 @@ import javax.sql.DataSource;
public class FlywayConfig {
private final DataSource dataSource;
private final Environment environment;
@Bean(name = "flyway")
public Flyway flyway() {
@@ -21,6 +24,7 @@ public class FlywayConfig {
Flyway flyway = Flyway.configure()
.dataSource(dataSource)
.locations("classpath:db/migration")
.placeholders(Map.of("grafanaDbPassword", resolveGrafanaDbPassword()))
.baselineOnMigrate(true)
.baselineVersion("4")
.load();
@@ -28,4 +32,22 @@ public class FlywayConfig {
log.info("Flyway: {} migration(s) applied.", result.migrationsExecuted);
return flyway;
}
// Fail-closed: refuse to boot when GRAFANA_DB_PASSWORD is unset. The
// grafana_reader role's password is (re)set on every boot by
// R__grafana_reader_password.sql, so a missing env var means we'd either
// skip the rotation silently or — with a hardcoded fallback — publish a
// well-known credential for a role with SELECT on audit_log, documents,
// and transcription_blocks. Same shape as UserDataInitializer's refusal
// to seed default admin credentials outside dev/test/e2e.
String resolveGrafanaDbPassword() {
String value = environment.getProperty("GRAFANA_DB_PASSWORD");
if (value == null || value.isBlank()) {
throw new IllegalStateException(
"GRAFANA_DB_PASSWORD is required: it is consumed by "
+ "R__grafana_reader_password.sql to (re)set the grafana_reader "
+ "role's password on every boot. Generate with: openssl rand -hex 32");
}
return value;
}
}

View File

@@ -0,0 +1,14 @@
-- Repeatable migration: sets the grafana_reader role's password from the
-- ${grafanaDbPassword} placeholder (resolved by FlywayConfig from the
-- GRAFANA_DB_PASSWORD environment variable). Flyway computes the checksum on
-- the resolved migration content, so any change to GRAFANA_DB_PASSWORD changes
-- the checksum and re-applies this migration on the next boot. That makes
-- password rotation a "change env var + restart" operation — no manual psql.
--
-- V68 created the role itself (without a usable password). This file owns the
-- password lifecycle; nothing else writes it.
DO $$
BEGIN
EXECUTE format('ALTER ROLE grafana_reader WITH PASSWORD %L', '${grafanaDbPassword}');
END
$$;

View File

@@ -0,0 +1,17 @@
-- Read-only role used by the Grafana PostgreSQL datasource for the PO Overview
-- dashboard (issue #651). The role is created here without a usable password
-- (LOGIN-capable but no password set); R__grafana_reader_password.sql sets the
-- password from GRAFANA_DB_PASSWORD on every boot, so rotation is just "bump
-- the env var and restart the backend" — see docs/adr/024-* and the rotation
-- runbook in docs/DEPLOYMENT.md.
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_catalog.pg_roles WHERE rolname = 'grafana_reader') THEN
CREATE ROLE grafana_reader WITH LOGIN;
END IF;
END
$$;
GRANT CONNECT ON DATABASE ${flyway:database} TO grafana_reader;
GRANT USAGE ON SCHEMA public TO grafana_reader;
GRANT SELECT ON audit_log, documents, transcription_blocks TO grafana_reader;

View File

@@ -0,0 +1,37 @@
package org.raddatz.familienarchiv.config;
import org.junit.jupiter.api.Test;
import org.springframework.mock.env.MockEnvironment;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
class FlywayConfigTest {
@Test
void resolveGrafanaDbPassword_throws_when_env_unset() {
FlywayConfig config = new FlywayConfig(null, new MockEnvironment());
assertThatThrownBy(config::resolveGrafanaDbPassword)
.isInstanceOf(IllegalStateException.class)
.hasMessageContaining("GRAFANA_DB_PASSWORD is required");
}
@Test
void resolveGrafanaDbPassword_throws_when_env_blank() {
MockEnvironment env = new MockEnvironment().withProperty("GRAFANA_DB_PASSWORD", " ");
FlywayConfig config = new FlywayConfig(null, env);
assertThatThrownBy(config::resolveGrafanaDbPassword)
.isInstanceOf(IllegalStateException.class)
.hasMessageContaining("GRAFANA_DB_PASSWORD is required");
}
@Test
void resolveGrafanaDbPassword_returns_value_when_env_set() {
MockEnvironment env = new MockEnvironment().withProperty("GRAFANA_DB_PASSWORD", "abc");
FlywayConfig config = new FlywayConfig(null, env);
assertThat(config.resolveGrafanaDbPassword()).isEqualTo("abc");
}
}

View File

@@ -0,0 +1,89 @@
package org.raddatz.familienarchiv.config;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import org.raddatz.familienarchiv.PostgresContainerConfig;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
import org.springframework.context.annotation.Import;
import org.springframework.jdbc.core.JdbcTemplate;
import static org.assertj.core.api.Assertions.assertThat;
// GRAFANA_DB_PASSWORD is supplied via the global test default in
// src/test/resources/application.properties — FlywayConfig fails closed
// when it is unset, so all tests that load the migration path need it.
@DataJpaTest
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
@Import({PostgresContainerConfig.class, FlywayConfig.class})
class GrafanaReaderRoleIntegrationTest {
@Autowired JdbcTemplate jdbc;
// --- positive grants (SELECT on the three explicitly granted tables) ---
@Test
void grafana_reader_has_select_on_audit_log() {
assertThat(hasPrivilege("audit_log", "SELECT")).isTrue();
}
@Test
void grafana_reader_has_select_on_documents() {
assertThat(hasPrivilege("documents", "SELECT")).isTrue();
}
@Test
void grafana_reader_has_select_on_transcription_blocks() {
assertThat(hasPrivilege("transcription_blocks", "SELECT")).isTrue();
}
// --- write-deny on the granted tables: SELECT-only means SELECT-only.
// A future migration that GRANTs INSERT/UPDATE/DELETE on any of these
// would fail these tests, even though the original positive grants still
// pass. Locks the boundary in both directions.
@Test
void grafana_reader_has_no_INSERT_on_documents() {
assertThat(hasPrivilege("documents", "INSERT")).isFalse();
}
@Test
void grafana_reader_has_no_UPDATE_on_audit_log() {
assertThat(hasPrivilege("audit_log", "UPDATE")).isFalse();
}
@Test
void grafana_reader_has_no_DELETE_on_transcription_blocks() {
assertThat(hasPrivilege("transcription_blocks", "DELETE")).isFalse();
}
// --- negative grants: PII / sensitive tables MUST NOT be readable.
// The parameterized form catches the "someone widened the grant to
// ALL TABLES IN SCHEMA public" footgun — three specific positive grants
// would still pass while this sweep turns red.
@ParameterizedTest
@ValueSource(strings = {
"app_users",
"user_groups",
"persons",
"notifications",
"document_comments",
"document_annotations",
"geschichten"
})
void grafana_reader_has_no_SELECT_on_protected_table(String table) {
assertThat(hasPrivilege(table, "SELECT")).isFalse();
}
private boolean hasPrivilege(String table, String privilege) {
Boolean result = jdbc.queryForObject(
"SELECT has_table_privilege('grafana_reader', ?, ?)",
Boolean.class,
table,
privilege);
return Boolean.TRUE.equals(result);
}
}

View File

@@ -1,2 +1,8 @@
logging.level.root=WARN
logging.level.org.raddatz=INFO
# Default test value so FlywayConfig's fail-closed check passes without each
# test having to set GRAFANA_DB_PASSWORD explicitly. The actual value is
# irrelevant in tests — Flyway only uses it to set the grafana_reader role's
# password, which no test connects with.
GRAFANA_DB_PASSWORD=test-grafana-reader-password

View File

@@ -147,6 +147,9 @@ services:
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
GF_USERS_ALLOW_SIGN_UP: "false"
GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL:-http://localhost:3003}
# Read-only password for the grafana_reader PostgreSQL role; interpolated
# into the provisioned PostgreSQL datasource (see datasources.yml).
GRAFANA_DB_PASSWORD: ${GRAFANA_DB_PASSWORD}
volumes:
- grafana_data:/var/lib/grafana
- ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro
@@ -165,6 +168,7 @@ services:
condition: service_healthy
networks:
- obs-net
- archiv-net # PO Overview dashboard queries archive-db via the grafana_reader role
# --- Error Tracking: GlitchTip ---

View File

@@ -227,6 +227,9 @@ services:
SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv
SPRING_DATASOURCE_USERNAME: archiv
SPRING_DATASOURCE_PASSWORD: ${POSTGRES_PASSWORD}
# Consumed by Flyway V68 via the ${grafanaDbPassword} placeholder to set
# the read-only grafana_reader role's password.
GRAFANA_DB_PASSWORD: ${GRAFANA_DB_PASSWORD}
# Application uses the bucket-scoped service account, not MinIO root.
S3_ENDPOINT: http://minio:9000
S3_ACCESS_KEY: archiv-app

View File

@@ -163,6 +163,9 @@ services:
SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
SPRING_DATASOURCE_PASSWORD: ${POSTGRES_PASSWORD}
# Consumed by Flyway V68 via the ${grafanaDbPassword} placeholder to set
# the read-only grafana_reader role's password.
GRAFANA_DB_PASSWORD: ${GRAFANA_DB_PASSWORD}
S3_ENDPOINT: http://minio:9000
S3_ACCESS_KEY: ${MINIO_ROOT_USER}
S3_SECRET_KEY: ${MINIO_ROOT_PASSWORD}

View File

@@ -152,6 +152,7 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
| `PORT_GRAFANA` | Host port for the Grafana UI (bound to `127.0.0.1` only) | `3003` | — | — |
| `POSTGRES_HOST` | PostgreSQL hostname for GlitchTip's db-init job and workers. Override when only the staging stack is running and `archive-db` is not resolvable by that name. | `archive-db` | — | — |
| `GRAFANA_ADMIN_PASSWORD` | Grafana `admin` user password | `changeme` | YES (prod) | YES |
| `GRAFANA_DB_PASSWORD` | Password for the read-only `grafana_reader` PostgreSQL role used by the PO Overview dashboard (issue #651). Consumed by Flyway V68 and the Grafana PostgreSQL datasource. Generate with `openssl rand -hex 32`. | — | YES (prod) | YES |
| `PORT_GLITCHTIP` | Host port for the GlitchTip UI (bound to `127.0.0.1` only) | `3002` | — | — |
| `GLITCHTIP_DOMAIN` | Public-facing base URL for GlitchTip (used in email links and CORS) | `http://localhost:3002` | YES (prod) | — |
| `GLITCHTIP_SECRET_KEY` | Django secret key for GlitchTip — generate with `python3 -c "import secrets; print(secrets.token_hex(32))"` | — | YES | YES |
@@ -256,6 +257,7 @@ git.raddatz.cloud A <server IP>
| `MAIL_USERNAME` | release.yml | SMTP user |
| `MAIL_PASSWORD` | release.yml | SMTP password |
| `GRAFANA_ADMIN_PASSWORD` | both | Grafana `admin` login — generate a strong password |
| `GRAFANA_DB_PASSWORD` | both | Read-only `grafana_reader` role password — `openssl rand -hex 32` |
| `GLITCHTIP_SECRET_KEY` | both | Django secret key — `openssl rand -hex 32` |
| `SENTRY_DSN` | both | GlitchTip project DSN — set after first-run (§4); leave empty to keep Sentry disabled |
| `VITE_SENTRY_DSN` | both | GlitchTip frontend project DSN — set after first-run (§4); leave empty to keep Sentry disabled |
@@ -357,6 +359,7 @@ Both files are passed explicitly via `--env-file` to the compose command, so the
| Gitea secret | Notes |
|---|---|
| `GRAFANA_ADMIN_PASSWORD` | Strong unique password; shared by nightly and release |
| `GRAFANA_DB_PASSWORD` | `openssl rand -hex 32`; shared by nightly and release — read-only DB role for the PO Overview dashboard |
| `GLITCHTIP_SECRET_KEY` | `openssl rand -hex 32`; shared by nightly and release |
| `STAGING_POSTGRES_PASSWORD` / `PROD_POSTGRES_PASSWORD` | Must match the running PostgreSQL container |
@@ -427,6 +430,31 @@ docker exec obs-loki wget -qO- \
Prometheus port `9090` and Grafana port `3003` (default; configurable via `PORT_GRAFANA`) are bound to `127.0.0.1` on the host. No other observability ports are host-bound.
##### Rotate the `grafana_reader` DB password
The PO Overview dashboard reads `audit_log`, `documents`, and `transcription_blocks` through the SELECT-only `grafana_reader` PostgreSQL role (issue #651, ADR-024). The role's password is owned by `R__grafana_reader_password.sql` — a Flyway *repeatable* migration that re-runs whenever the resolved `${grafanaDbPassword}` placeholder changes. That makes rotation a two-restart operation, no manual `psql` required.
```bash
# 1. Generate a new value
openssl rand -hex 32
# 2. Update both sides:
# - Gitea secret GRAFANA_DB_PASSWORD (nightly + release workflows pick it up)
# - Local .env on the server / dev machine
# 3. Restart the backend. Flyway sees that R__'s resolved checksum changed and
# re-applies it, issuing ALTER ROLE grafana_reader WITH PASSWORD '<new>'.
docker compose restart backend
# 4. Restart obs-grafana so the provisioned datasource picks up the new env value.
docker compose -f docker-compose.observability.yml restart obs-grafana
# 5. Verify the dashboard loads — PO Overview's Postgres panels should populate
# instead of "Data source error".
```
If `GRAFANA_DB_PASSWORD` is unset, the backend **refuses to start** (`IllegalStateException`). That is deliberate — see `FlywayConfig.resolveGrafanaDbPassword()` and the rationale in ADR-024.
#### GlitchTip
| Item | Value |

View File

@@ -0,0 +1,123 @@
# ADR-024: Grafana reads archive-db via a bridged network and a SELECT-only role
## Status
Accepted
## Context
Issue #651 (the PO Overview Grafana dashboard) needs aggregates over three
tables in the main application database — `audit_log`, `documents`, and
`transcription_blocks` — to answer the operator's four weekly questions: is
everything working, are people using it, is the archive making progress, is
OCR working well.
Until now, `obs-grafana` and the rest of the observability stack lived on
their own Docker network (`obs-net`) and never touched `archiv-net`, where
`archive-db` runs. The two were intentionally isolated: a compromise of any
observability container could not pivot to the application database.
The PO Overview's archive-progress and user-activity panels need rolling
7-day SQL aggregates that cannot be served by Prometheus or Loki. That
forces a connection from `obs-grafana` to `archive-db` for the first time.
Two implementation requirements shaped the design:
1. **Least privilege on the database side.** The Spring Boot application
role (`archiv`) has full read/write on every table. Letting Grafana
connect with that role would mean a Grafana compromise becomes an
application compromise. The dashboard only needs SELECT on three
tables; the role must reflect that and nothing more.
2. **Operational simplicity of secret rotation.** The role's password is
shared between the migration that sets it and the Grafana datasource
that uses it. A first version of this work put the password in a
versioned Flyway migration (V68), which Flyway only applies once —
leaving rotation as an out-of-band `psql ALTER ROLE` step that no
runbook documented. The shape must support rotation without manual
SQL.
## Decision
- Provision a dedicated PostgreSQL role `grafana_reader` with `LOGIN` plus
`GRANT SELECT` on `audit_log`, `documents`, `transcription_blocks` only.
No INSERT/UPDATE/DELETE on any table, no access to any other table —
enforced by the database, locked in by both positive and parameterized
negative tests in `GrafanaReaderRoleIntegrationTest`.
- Split the role's lifecycle across two migrations:
- `V68__add_grafana_reader_role.sql` — versioned, immutable, idempotent.
Creates the role and applies the grants. Runs exactly once per
database, like every other versioned migration.
- `R__grafana_reader_password.sql` — Flyway *repeatable* migration that
issues `ALTER ROLE grafana_reader WITH PASSWORD '${grafanaDbPassword}'`.
Flyway computes the checksum on the resolved content, so any change
to `GRAFANA_DB_PASSWORD` flips the checksum and re-applies the
migration on the next boot. Rotation becomes "bump env var, restart
backend, restart obs-grafana" — see the runbook in
`docs/DEPLOYMENT.md §4 → Rotate the grafana_reader DB password`.
- Resolve the password through Spring's `Environment` rather than a raw
`System.getenv()` call, so tests inject via `application.properties`
and the resolver is unit-testable with `MockEnvironment`. Fail closed
with `IllegalStateException` when the variable is unset — no fallback
string. Same shape as `UserDataInitializer`'s refusal to seed default
admin credentials outside dev/test/e2e.
- Join `obs-grafana` to `archiv-net` in addition to `obs-net`. Only the
Grafana container crosses the boundary; Loki, Tempo, Prometheus,
GlitchTip, and the worker containers remain `obs-net`-only.
## Consequences
**Positive**
- Database-level least privilege: a Grafana compromise gains SELECT on
three tables. Cannot write, cannot read PII tables like `app_users`,
`persons`, `notifications`, `document_comments`, `geschichten`. The
parameterized PII negative sweep in `GrafanaReaderRoleIntegrationTest`
is the regression gate; new sensitive tables get added to that list.
- Rotation is documented, idempotent, and survives operator turnover.
No "the password set on day 1 is the password forever" failure mode.
- Tests pin down both sides of the boundary: positive grants must hold,
write-deny must hold, and the PII negative list must stay empty.
**Negative / trade-offs**
- `obs-net` is no longer fully isolated from `archiv-net`. A Grafana RCE
(e.g. via a future Grafana CVE) gains a TCP path to `archive-db`
contained, but not impossible. The least-privilege role is the
mitigation; we accept that mitigation as sufficient for a single
bridged container.
- The backend must hold `GRAFANA_DB_PASSWORD` in its environment forever,
so Flyway can resolve the placeholder on every boot. A backend RCE
therefore also leaks the Grafana datasource password. Acceptable
because that password's blast radius is itself bounded by the
least-privilege grants on `grafana_reader`.
## Alternatives considered
- **Prometheus PostgreSQL exporter, no direct connection.** Loses ad-hoc
SQL aggregates — the dashboard would need every metric pre-defined as
an exporter query, with a redeploy to add a new one. The PO Overview
is the type of dashboard that grows panels over time; pre-defining
every aggregate is the wrong shape.
- **Read replica or logical-replication slot dedicated to Grafana.**
Real operational cost (extra Postgres instance, replication monitoring,
storage doubled) disproportionate to a weekly PO glance.
- **Versioned migration with `flyway repair` for rotation.** Rejected:
conflates schema lifecycle with credential lifecycle, requires manual
intervention to rotate, and the repair command's semantics are
surprising to operators unfamiliar with Flyway internals.
- **Hardcoded fallback password when env var is unset.** Rejected as a
security blocker: publishes a known credential for a role with read
access to user activity and full letter text. The fail-closed
behavior is the explicit defense.
## References
- Issue #651 — PO Overview Grafana dashboard
- `backend/src/main/resources/db/migration/V68__add_grafana_reader_role.sql`
- `backend/src/main/resources/db/migration/R__grafana_reader_password.sql`
- `backend/src/main/java/org/raddatz/familienarchiv/config/FlywayConfig.java`
- `backend/src/test/java/org/raddatz/familienarchiv/config/GrafanaReaderRoleIntegrationTest.java`
- `infra/observability/grafana/provisioning/datasources/datasources.yml`
- `docker-compose.observability.yml``archiv-net` bridge on `obs-grafana`
- `docs/DEPLOYMENT.md §4` — rotation runbook

View File

@@ -48,6 +48,7 @@ Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics")
Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
Rel(grafana, loki, "Queries logs", "HTTP 3100")
Rel(grafana, tempo, "Queries traces", "HTTP 3200")
Rel(grafana, db, "Read-only dashboard queries via grafana_reader role", "PostgreSQL / archiv-net")
Rel(glitchtip, db, "Stores error events in glitchtip DB", "PostgreSQL / archiv-net")
Rel(obs_glitchtip_worker, obs_redis, "Processes Celery tasks", "Redis / obs-net")

View File

@@ -0,0 +1,702 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "grafana", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Product owner overview — system health, user activity, archive progress, and OCR quality at a weekly glance.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"title": "System Health",
"type": "row",
"panels": []
},
{
"id": 1,
"title": "Backend Status",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"targets": [
{
"expr": "up{job=\"spring-boot\"}",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value"
}
},
{
"id": 2,
"title": "Server Errors (5xx)",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"targets": [
{
"expr": "sum(increase(http_server_requests_seconds_count{status=~\"5..\"}[$__range]))",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 6 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 3,
"title": "Response Time (p95)",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket[$__range])) by (le))",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"decimals": 2,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 2 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 4,
"title": "Error Log Count",
"type": "stat",
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"targets": [
{
"expr": "sum(count_over_time({compose_service=\"backend\"} | json | level=\"ERROR\" [$__range]))",
"queryType": "instant",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 10 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 5,
"title": "CPU Usage",
"type": "bargauge",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 5, "w": 8, "x": 0, "y": 5 },
"targets": [
{
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showUnfilled": true
}
},
{
"id": 6,
"title": "Memory Usage",
"type": "bargauge",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 5, "w": 8, "x": 8, "y": 5 },
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showUnfilled": true
}
},
{
"id": 7,
"title": "Disk Usage",
"type": "bargauge",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 5, "w": 8, "x": 16, "y": 5 },
"targets": [
{
"expr": "(1 - (node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"})) * 100",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 80 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showUnfilled": true
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 },
"id": 101,
"title": "User Activity",
"type": "row",
"panels": []
},
{
"id": 8,
"title": "Active Users",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 11 },
"targets": [
{
"rawSql": "SELECT COUNT(DISTINCT actor_id) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'LOGIN_SUCCESS'",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 9,
"title": "Total Logins",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 11 },
"targets": [
{
"rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'LOGIN_SUCCESS'",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 10,
"title": "Failed Login Attempts",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 11 },
"targets": [
{
"rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind IN ('LOGIN_FAILED', 'LOGIN_RATE_LIMITED')",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 4 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 11,
"title": "Daily Logins (last 7 days)",
"type": "barchart",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 7, "w": 24, "x": 0, "y": 15 },
"targets": [
{
"rawSql": "SELECT DATE_TRUNC('day', happened_at) AS time, COUNT(*) AS logins FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'LOGIN_SUCCESS' GROUP BY 1 ORDER BY 1",
"format": "time_series",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"legend": { "displayMode": "hidden" },
"orientation": "auto",
"showValue": "auto",
"stacking": "none",
"xTickLabelRotation": 0,
"xTickLabelSpacing": 0
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
"id": 102,
"title": "Archive Progress",
"type": "row",
"panels": []
},
{
"id": 12,
"title": "Transcription Coverage",
"type": "bargauge",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 5, "w": 24, "x": 0, "y": 23 },
"targets": [
{
"rawSql": "SELECT (COUNT(*) FILTER (WHERE text IS NOT NULL AND text <> ''))::float * 100.0 / NULLIF(COUNT(*), 0) AS percent_complete FROM transcription_blocks",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"decimals": 1,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 25 },
{ "color": "green", "value": 75 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showUnfilled": true
}
},
{
"id": 13,
"title": "Total Documents",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 28 },
"targets": [
{
"rawSql": "SELECT COUNT(*) AS value FROM documents WHERE status <> 'PLACEHOLDER'",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 14,
"title": "Uploads This Week",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 28 },
"targets": [
{
"rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'FILE_UPLOADED'",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 15,
"title": "Blocks Transcribed This Week",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 28 },
"targets": [
{
"rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'TEXT_SAVED'",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 16,
"title": "Blocks Reviewed This Week",
"type": "stat",
"datasource": { "type": "postgres", "uid": "postgres" },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 28 },
"targets": [
{
"rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'BLOCK_REVIEWED'",
"format": "table",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
"id": 103,
"title": "OCR Health",
"type": "row",
"panels": []
},
{
"id": 17,
"title": "OCR Jobs",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 33 },
"targets": [
{
"expr": "sum(increase(ocr_jobs_total[$__range]))",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0,
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 18,
"title": "OCR Page Error Rate",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 33 },
"targets": [
{
"expr": "sum(increase(ocr_skipped_pages_total[$__range])) / clamp_min(sum(increase(ocr_pages_total[$__range])), 1)",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"decimals": 1,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 19,
"title": "Illegible Word Rate",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 33 },
"targets": [
{
"expr": "sum(increase(ocr_illegible_words_total[$__range])) / clamp_min(sum(increase(ocr_words_total[$__range])), 1)",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"decimals": 1,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 0.25 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
}
},
{
"id": 20,
"title": "OCR Service Status",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 33 },
"targets": [
{
"expr": "ocr_models_ready",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": { "0": { "text": "NOT READY", "color": "red" } } },
{ "type": "value", "options": { "1": { "text": "READY", "color": "green" } } }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value"
}
}
],
"refresh": "",
"schemaVersion": 39,
"tags": ["po-overview", "familienarchiv"],
"templating": { "list": [] },
"time": { "from": "now-7d", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "PO Overview",
"uid": "po-overview",
"version": 1,
"weekStart": ""
}

View File

@@ -36,3 +36,19 @@ datasources:
datasourceUid: prometheus
nodeGraph:
enabled: true
# Read-only PostgreSQL datasource for the PO Overview dashboard (issue #651).
# Uses the grafana_reader role provisioned by Flyway V68. Traffic stays inside
# archiv-net, so sslmode=disable is the deliberate, accepted setting.
- name: PostgreSQL
type: postgres
uid: postgres
url: archive-db:5432
user: grafana_reader
editable: false
secureJsonData:
password: ${GRAFANA_DB_PASSWORD}
jsonData:
database: ${POSTGRES_DB}
sslmode: disable
postgresVersion: 1600

View File

@@ -16,6 +16,11 @@ GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud
POSTGRES_USER=archiv
# Note: GRAFANA_DB_PASSWORD is a secret and is injected by CI from
# obs-secrets.env (see .env.example for the local-dev declaration).
# It is consumed by both archive-backend (Flyway V68 placeholder) and
# obs-grafana (PostgreSQL datasource).
# PostgreSQL hostname for GlitchTip db-init and workers.
# The actual value depends on the Compose project name — it is not a fixed string.
# CI sets POSTGRES_HOST in obs-secrets.env per environment: