fix(obs): wire Prometheus metrics and Loki job label for Grafana dashboards #606

Merged
marcel merged 9 commits from worktree-fix+issue-604-obs-wiring into main 2026-05-16 16:00:36 +02:00
12 changed files with 366 additions and 13 deletions

View File

@@ -299,6 +299,10 @@ Run via `docker-compose.observability.yml` — requires the main stack to be run
| `SENTRY_DSN` | GlitchTip/Sentry DSN for the backend (Spring Boot) — leave empty to disable | | `SENTRY_DSN` | GlitchTip/Sentry DSN for the backend (Spring Boot) — leave empty to disable |
| `VITE_SENTRY_DSN` | GlitchTip/Sentry DSN for the frontend (SvelteKit) — injected at build time via Vite | | `VITE_SENTRY_DSN` | GlitchTip/Sentry DSN for the frontend (SvelteKit) — injected at build time via Vite |
## Observability
→ See [docs/OBSERVABILITY.md](./docs/OBSERVABILITY.md) — where to look for logs, traces, metrics, and errors.
## API Testing ## API Testing
HTTP test files are in `backend/api_tests/` for use with the VS Code REST Client extension. HTTP test files are in `backend/api_tests/` for use with the VS Code REST Client extension.

View File

@@ -48,6 +48,11 @@
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId> <artifactId>spring-boot-starter-actuator</artifactId>
</dependency> </dependency>
<!-- Spring Boot 4.0 splits Micrometer metrics export (incl. Prometheus scrape endpoint) into its own starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-micrometer-metrics</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-validation</artifactId> <artifactId>spring-boot-starter-validation</artifactId>

View File

@@ -3,13 +3,16 @@ package org.raddatz.familienarchiv.security;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.raddatz.familienarchiv.user.CustomUserDetailsService; import org.raddatz.familienarchiv.user.CustomUserDetailsService;
import jakarta.servlet.http.HttpServletResponse;
import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import org.springframework.core.annotation.Order;
import org.springframework.core.env.Environment; import org.springframework.core.env.Environment;
import org.springframework.security.authentication.dao.DaoAuthenticationProvider; import org.springframework.security.authentication.dao.DaoAuthenticationProvider;
import org.springframework.security.config.Customizer; import org.springframework.security.config.Customizer;
import org.springframework.security.config.annotation.web.builders.HttpSecurity; import org.springframework.security.config.annotation.web.builders.HttpSecurity;
import org.springframework.security.config.annotation.web.configuration.EnableWebSecurity; import org.springframework.security.config.annotation.web.configuration.EnableWebSecurity;
import org.springframework.security.config.annotation.web.configurers.AbstractHttpConfigurer;
import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder; import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder;
import org.springframework.security.crypto.password.PasswordEncoder; import org.springframework.security.crypto.password.PasswordEncoder;
import org.springframework.security.web.SecurityFilterChain; import org.springframework.security.web.SecurityFilterChain;
@@ -34,6 +37,28 @@ public class SecurityConfig {
return authProvider; return authProvider;
} }
@Bean
@Order(1)
public SecurityFilterChain managementFilterChain(HttpSecurity http) throws Exception {
http
.securityMatcher("/actuator/**")
.authorizeHttpRequests(auth -> {
// Health and Prometheus are open — Docker health checks and Prometheus scraping need no credentials.
auth.requestMatchers("/actuator/health", "/actuator/prometheus").permitAll();
// All other actuator endpoints (metrics, info, env, heapdump…) require authentication.
auth.anyRequest().authenticated();
})
// Explicitly return 401 for any unauthenticated actuator request.
// Without this override, Spring Security's DelegatingAuthenticationEntryPoint
// would redirect browser-like clients to the form-login page (302 → /login),
// making it impossible to distinguish "not authenticated" from "not found" in tests.
.exceptionHandling(ex -> ex.authenticationEntryPoint(
(req, res, e) -> res.setStatus(HttpServletResponse.SC_UNAUTHORIZED)))
.formLogin(AbstractHttpConfigurer::disable)
.csrf(AbstractHttpConfigurer::disable);
return http.build();
}
@Bean @Bean
public SecurityFilterChain securityFilterChain(HttpSecurity http) throws Exception { public SecurityFilterChain securityFilterChain(HttpSecurity http) throws Exception {
http http
@@ -54,8 +79,10 @@ public class SecurityConfig {
.csrf(csrf -> csrf.disable()) .csrf(csrf -> csrf.disable())
.authorizeHttpRequests(auth -> { .authorizeHttpRequests(auth -> {
// Health endpoint must be open so CI/Docker health checks work without credentials // Actuator endpoints are governed by managementFilterChain (@Order(1)) above.
auth.requestMatchers("/actuator/health").permitAll(); // The permitAll() lines here are a belt-and-suspenders fallback in case any
// actuator path escapes that chain's securityMatcher. See docs/adr/017.
auth.requestMatchers("/actuator/health", "/actuator/prometheus").permitAll();
// Password reset endpoints are unauthenticated by nature // Password reset endpoints are unauthenticated by nature
auth.requestMatchers("/api/auth/forgot-password", "/api/auth/reset-password").permitAll(); auth.requestMatchers("/api/auth/forgot-password", "/api/auth/reset-password").permitAll();
// Invite-based registration endpoints are public // Invite-based registration endpoints are public

View File

@@ -49,7 +49,8 @@ management:
# Management port is separate from the app port so that: # Management port is separate from the app port so that:
# (a) Caddy never proxies /actuator/* (it only routes :8080 → the app port) # (a) Caddy never proxies /actuator/* (it only routes :8080 → the app port)
# (b) Prometheus scrapes backend:8081 directly inside archiv-net, not via Caddy # (b) Prometheus scrapes backend:8081 directly inside archiv-net, not via Caddy
# (c) Spring Security's session-authenticated filter chain on :8080 never sees actuator requests # Note: in Spring Boot 4.0 the management port shares the security filter chain; /actuator/health
# and /actuator/prometheus must be explicitly permitted in SecurityConfig — see SecurityConfig.java.
port: 8081 port: 8081
endpoints: endpoints:
web: web:
@@ -58,6 +59,16 @@ management:
endpoint: endpoint:
prometheus: prometheus:
enabled: true enabled: true
# Spring Boot 4.0: metrics export is disabled by default — explicitly opt in for Prometheus
prometheus:
metrics:
export:
enabled: true
metrics:
tags:
# Common tag applied to every metric so Grafana's Spring Boot dashboard can filter by application name.
# Override via MANAGEMENT_METRICS_TAGS_APPLICATION env var.
application: ${spring.application.name}
health: health:
mail: mail:
enabled: false enabled: false
@@ -66,13 +77,18 @@ management:
probability: 1.0 # 100% in dev; override via MANAGEMENT_TRACING_SAMPLING_PROBABILITY in prod compose probability: 1.0 # 100% in dev; override via MANAGEMENT_TRACING_SAMPLING_PROBABILITY in prod compose
# OpenTelemetry trace export — failures are non-fatal (app starts cleanly without Tempo running) # OpenTelemetry trace export — failures are non-fatal (app starts cleanly without Tempo running)
# The default http://localhost:4317 ensures CI compatibility when no observability stack is present. # Port 4318 = OTLP HTTP (the default transport for Spring Boot's HttpExporter).
# Port 4317 is gRPC-only; sending HTTP/1.1 to it produces "Connection reset".
otel: otel:
service: service:
name: familienarchiv-backend name: familienarchiv-backend
exporter: exporter:
otlp: otlp:
endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:http://localhost:4317} endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:http://localhost:4318}
logs:
exporter: none # Promtail captures Docker logs; disable OTLP log export (Tempo only accepts traces)
metrics:
exporter: none # Prometheus scrapes /actuator/prometheus; disable OTLP metric export to Tempo
springdoc: springdoc:
api-docs: api-docs:

View File

@@ -0,0 +1,63 @@
package org.raddatz.familienarchiv;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.web.server.LocalManagementPort;
import org.springframework.context.annotation.Import;
import org.springframework.http.ResponseEntity;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.bean.override.mockito.MockitoBean;
import org.springframework.web.client.DefaultResponseErrorHandler;
import org.springframework.web.client.RestTemplate;
import software.amazon.awssdk.services.s3.S3Client;
import java.io.IOException;
import static org.assertj.core.api.Assertions.assertThat;
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ActiveProfiles("test")
@Import(PostgresContainerConfig.class)
class ActuatorPrometheusIT {
@LocalManagementPort
private int managementPort;
@MockitoBean
S3Client s3Client;
@Test
void prometheus_endpoint_returns_200_without_credentials() {
ResponseEntity<String> response = noThrowTemplate().getForEntity(
"http://localhost:" + managementPort + "/actuator/prometheus", String.class);
assertThat(response.getStatusCode().value()).isEqualTo(200);
}
@Test
void prometheus_endpoint_returns_jvm_metrics() {
ResponseEntity<String> response = noThrowTemplate().getForEntity(
"http://localhost:" + managementPort + "/actuator/prometheus", String.class);
assertThat(response.getBody()).contains("jvm_memory_used_bytes");
}
@Test
void actuator_metrics_requires_authentication() {
ResponseEntity<String> response = noThrowTemplate().getForEntity(
"http://localhost:" + managementPort + "/actuator/metrics", String.class);
assertThat(response.getStatusCode().value()).isEqualTo(401);
}
private RestTemplate noThrowTemplate() {
RestTemplate template = new RestTemplate();
template.setErrorHandler(new DefaultResponseErrorHandler() {
@Override
public boolean hasError(org.springframework.http.client.ClientHttpResponse response) throws IOException {
return false;
}
});
return template;
}
}

View File

@@ -213,7 +213,11 @@ services:
APP_MAIL_FROM: ${APP_MAIL_FROM:-noreply@raddatz.cloud} APP_MAIL_FROM: ${APP_MAIL_FROM:-noreply@raddatz.cloud}
SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true} SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true}
SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true} SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true}
OTEL_EXPORTER_OTLP_ENDPOINT: http://tempo:4317 OTEL_EXPORTER_OTLP_ENDPOINT: http://tempo:4318
OTEL_LOGS_EXPORTER: none
OTEL_METRICS_EXPORTER: none
MANAGEMENT_METRICS_TAGS_APPLICATION: Familienarchiv
MANAGEMENT_TRACING_SAMPLING_PROBABILITY: ${MANAGEMENT_TRACING_SAMPLING_PROBABILITY:-0.1}
networks: networks:
- archiv-net - archiv-net
healthcheck: healthcheck:

View File

@@ -107,7 +107,10 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
| `MAIL_SMTP_AUTH` | SMTP auth enabled | `false` (dev) | YES (prod) | — | | `MAIL_SMTP_AUTH` | SMTP auth enabled | `false` (dev) | YES (prod) | — |
| `MAIL_STARTTLS_ENABLE` | STARTTLS enabled | `false` (dev) | YES (prod) | — | | `MAIL_STARTTLS_ENABLE` | STARTTLS enabled | `false` (dev) | YES (prod) | — |
| `SPRING_PROFILES_ACTIVE` | Spring profile | `dev,e2e` (compose) | YES | — | | `SPRING_PROFILES_ACTIVE` | Spring profile | `dev,e2e` (compose) | YES | — |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP gRPC endpoint for distributed traces (Tempo). Set to `http://tempo:4317` via compose. | `http://localhost:4317` | — | — | | `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP HTTP endpoint for distributed traces (Tempo). Port 4318 = HTTP transport; port 4317 is gRPC-only and causes "Connection reset" with Spring Boot's HttpExporter. | `http://localhost:4318` | — | — |
| `OTEL_LOGS_EXPORTER` | Disable OTLP log export — Promtail captures Docker logs via the logging driver; Tempo does not accept logs. | `none` | — | — |
| `OTEL_METRICS_EXPORTER` | Disable OTLP metric export — Prometheus scrapes `/actuator/prometheus` via pull model; Tempo does not accept metrics. | `none` | — | — |
| `MANAGEMENT_METRICS_TAGS_APPLICATION` | Common tag added to every Micrometer metric. Required by Grafana's Spring Boot Observability dashboard (ID 17175) `label_values(application)` template variable. | `Familienarchiv` | — | — |
| `MANAGEMENT_TRACING_SAMPLING_PROBABILITY` | Micrometer tracing sample rate; overridden to `0.0` in test profile. | `0.1` (compose) / `1.0` (dev) | — | — | | `MANAGEMENT_TRACING_SAMPLING_PROBABILITY` | Micrometer tracing sample rate; overridden to `0.0` in test profile. | `0.1` (compose) / `1.0` (dev) | — | — |
| `SENTRY_DSN` | GlitchTip / Sentry DSN for backend error reporting. Leave empty to disable the SDK. Set after GlitchTip first-run (§4). | — | — | YES | | `SENTRY_DSN` | GlitchTip / Sentry DSN for backend error reporting. Leave empty to disable the SDK. Set after GlitchTip first-run (§4). | — | — | YES |
@@ -280,6 +283,9 @@ Before the first deploy: rotate `PROD_APP_ADMIN_PASSWORD` to a strong value. Aft
## 4. Logs + observability ## 4. Logs + observability
> **Developer guide (where to look for what, LogQL queries, trace exploration) → [docs/OBSERVABILITY.md](./OBSERVABILITY.md).**
> This section covers the ops side: starting the stack, env vars, and CI wiring.
### First-response commands ### First-response commands
```bash ```bash
@@ -374,8 +380,8 @@ Current services:
| `obs-node-exporter` | `prom/node-exporter:v1.9.0` | Host-level CPU / memory / disk / network metrics | | `obs-node-exporter` | `prom/node-exporter:v1.9.0` | Host-level CPU / memory / disk / network metrics |
| `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics | | `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics |
| `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). | | `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). |
| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, and `compose_project` labels | | `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, `compose_project`, and `job` labels. The `job` label is mapped from the Docker Compose service name (`com.docker.compose.service`) so that Grafana Loki dashboard queries (`{job="backend"}`, `{job="frontend"}`) work out of the box and the "App" variable dropdown is populated. |
| `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP gRPC receiver on port 4317, OTLP HTTP on port 4318 (both `archiv-net`-internal). Grafana queries traces on port 3200 (`obs-net`-internal). All ports are `expose`-only (not host-bound). | | `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP HTTP receiver on port 4318 (`archiv-net`-internal; backend sends traces here). Grafana queries traces on port 3200 (`obs-net`-internal). All ports are `expose`-only (not host-bound). |
| `obs-grafana` | `grafana/grafana-oss:11.6.1` | Unified observability UI — metrics dashboards, log exploration, trace viewer. Bound to `127.0.0.1:${PORT_GRAFANA:-3003}` on the host. | | `obs-grafana` | `grafana/grafana-oss:11.6.1` | Unified observability UI — metrics dashboards, log exploration, trace viewer. Bound to `127.0.0.1:${PORT_GRAFANA:-3003}` on the host. |
| `obs-glitchtip` | `glitchtip/glitchtip:6.1.6` | Sentry-compatible error tracker. Receives frontend + backend error events, groups by fingerprint, provides issue UI with stack traces. Bound to `127.0.0.1:${PORT_GLITCHTIP:-3002}`. | | `obs-glitchtip` | `glitchtip/glitchtip:6.1.6` | Sentry-compatible error tracker. Receives frontend + backend error events, groups by fingerprint, provides issue UI with stack traces. Bound to `127.0.0.1:${PORT_GLITCHTIP:-3002}`. |
| `obs-glitchtip-worker` | `glitchtip/glitchtip:6.1.6` | Celery + beat worker — processes async GlitchTip tasks (event ingestion, notifications, cleanup). | | `obs-glitchtip-worker` | `glitchtip/glitchtip:6.1.6` | Celery + beat worker — processes async GlitchTip tasks (event ingestion, notifications, cleanup). |

180
docs/OBSERVABILITY.md Normal file
View File

@@ -0,0 +1,180 @@
# Observability Guide
> **Ops reference (starting the stack, env vars, CI wiring) → [DEPLOYMENT.md §4](./DEPLOYMENT.md#4-logs--observability).**
> This file is for developers: what signal lives where, how to reach it, and what to look for.
## Where to look for what
| I want to… | Go to |
|---|---|
| See the last N log lines from the backend | `docker compose logs --tail=100 backend` |
| Search logs by keyword across time | Grafana → Explore → Loki |
| Understand why an HTTP request failed | Grafana → Explore → Loki → filter by `traceId` → follow link to Tempo |
| See a full distributed trace (DB queries, HTTP calls) | Grafana → Explore → Tempo → search by service or trace ID |
| Check JVM heap / GC / thread count | Grafana → Dashboards → Spring Boot Observability |
| Check HTTP error rate or p95 latency | Grafana → Dashboards → Spring Boot Observability |
| Check host CPU / memory / disk | Grafana → Dashboards → Node Exporter Full |
| See grouped application errors with stack traces | GlitchTip |
| Check if the backend is healthy | `curl http://localhost:8081/actuator/health` (on the server) |
| Check what Prometheus is scraping | `curl http://localhost:9090/api/v1/targets` (on the server) |
## Access
| Tool | External URL | Who it's for |
|---|---|---|
| Grafana | `https://grafana.archiv.raddatz.cloud` | Logs, metrics, traces — the primary observability UI |
| GlitchTip | `https://glitchtip.archiv.raddatz.cloud` | Grouped errors with stack traces and release tracking |
Loki, Tempo, and Prometheus have no external URL. They are internal services, accessible only through Grafana (or via SSH tunnel — see below).
## Logs (Loki)
Logs reach Loki via Promtail, which reads all Docker container logs from the Docker socket and ships them with labels derived from Docker Compose metadata.
### Labels available in every log line
| Label | What it contains | Example |
|---|---|---|
| `job` | Compose service name | `backend`, `frontend`, `db` |
| `compose_service` | Same as `job` | `backend` |
| `compose_project` | Compose project name | `archiv-staging`, `archiv-production` |
| `container_name` | Docker container name | `archiv-staging-backend-1` |
| `filename` | Docker log source | `/var/lib/docker/containers/…` |
**Use `job` in LogQL queries** — it is stable across dev, staging, and production. `container_name` changes between environments.
### Common LogQL queries in Grafana Explore
```logql
# All backend logs
{job="backend"}
# Backend ERROR and WARN lines only
{job="backend"} |= "ERROR" or {job="backend"} |= "WARN"
# All logs for a specific request (paste a traceId from a log line)
{job="backend"} |= "3fa85f64-5717-4562-b3fc-2c963f66afa6"
# Log lines containing a specific exception class
{job="backend"} |~ "DomainException|NullPointerException"
# Frontend logs
{job="frontend"}
# Database (slow query log, if enabled)
{job="db"}
```
### Log → Trace correlation
Spring Boot writes the active `traceId` into every log line when a request is being processed:
```
2026-05-16 ... INFO [Familienarchiv,3fa85f64...,1b2c3d4e] o.r.f.document.DocumentService : ...
```
In Grafana Explore → Loki, log lines with a `traceId` field show a **Tempo** link. Clicking it opens the full trace in Explore → Tempo without copying and pasting IDs.
This linking is configured in the Loki datasource provisioning via the `traceId` derived field regex. No manual setup required.
## Traces (Tempo)
The backend sends traces to Tempo via OTLP HTTP (port 4318). Every inbound HTTP request and every JPA query produces a span. Spans are linked into traces by the propagated `traceId` header.
### Finding a trace in Grafana
**Option A — from a log line:**
1. Grafana → Explore → select *Loki* datasource
2. Query `{job="backend"}` and find the failing request
3. Click the **Tempo** link in the log line (appears when `traceId` is present)
**Option B — by service:**
1. Grafana → Explore → select *Tempo* datasource
2. Query type: **Search**
3. Service name: `familienarchiv-backend`
4. Filter by HTTP status, duration, or operation name as needed
**Option C — by trace ID:**
1. Grafana → Explore → select *Tempo* datasource
2. Query type: **TraceQL** or **Trace ID**
3. Paste the trace ID
### What each span type tells you
| Root span name pattern | What it covers |
|---|---|
| `GET /api/documents`, `POST /api/documents` | Full HTTP request lifecycle |
| `SELECT archiv.*` | A single JPA/JDBC query inside that request |
| `HikariPool.getConnection` | Connection pool wait time |
A slow `SELECT` span inside an otherwise fast HTTP span pinpoints a missing index. A slow `HikariPool.getConnection` span indicates connection pool exhaustion.
### Sampling rate
- **Dev**: 100% of requests are traced (`management.tracing.sampling.probability: 1.0` in `application.yaml`)
- **Staging / Production**: 10% (`MANAGEMENT_TRACING_SAMPLING_PROBABILITY=0.1` in `docker-compose.prod.yml`)
To find a trace for a specific request in staging/production, either increase the sampling rate temporarily or trigger the request multiple times.
## Metrics (Prometheus → Grafana)
Prometheus scrapes the backend management endpoint every 15 s:
```
Target: backend:8081/actuator/prometheus
Labels: job="spring-boot", application="Familienarchiv"
```
All Spring Boot metrics carry the `application="Familienarchiv"` tag, which is how the Grafana Spring Boot Observability dashboard (ID 17175) filters to this service.
### Useful Prometheus queries (run on the server or via Grafana Explore → Prometheus)
```promql
# HTTP error rate (5xx) as a fraction of all requests
sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
/ sum(rate(http_server_requests_seconds_count[5m]))
# p95 response time
histogram_quantile(0.95, sum by (le) (
rate(http_server_requests_seconds_bucket[5m])
))
# JVM heap used
jvm_memory_used_bytes{area="heap", application="Familienarchiv"}
# Active DB connections
hikaricp_connections_active
```
## Errors (GlitchTip)
GlitchTip receives errors from both the backend (via Sentry Java SDK) and the frontend (via Sentry JavaScript SDK). It groups events by fingerprint, tracks first/last seen times, and links to the release that introduced the error.
GlitchTip complements Loki: use GlitchTip when you need **grouped, de-duplicated errors with stack traces and release attribution**; use Loki when you need **raw log lines with full context** or want to search across all log levels.
## Direct API access (debugging only)
Loki and Tempo bind no host ports. To reach them directly from your laptop, use an SSH tunnel through the server:
```bash
# Loki API on localhost:3100 (then query via curl or logcli)
ssh -L 3100:172.20.0.x:3100 root@raddatz.cloud
# Replace 172.20.0.x with the obs-loki container IP:
# docker inspect obs-loki --format '{{.NetworkSettings.Networks.archiv-obs-net.IPAddress}}'
# Tempo API on localhost:3200 (then query via curl or tempo-cli)
ssh -L 3200:172.20.0.x:3200 root@raddatz.cloud
```
In practice, Grafana Explore covers all common debugging workflows without needing direct API access.
## Signal summary
| Signal | Source | Transport | Storage | UI |
|---|---|---|---|---|
| Application logs | Spring Boot stdout → Docker log driver | Promtail → Loki push API | Loki | Grafana Explore → Loki |
| Distributed traces | Spring Boot OTel agent | OTLP HTTP → Tempo:4318 | Tempo | Grafana Explore → Tempo |
| JVM + HTTP metrics | Spring Actuator `/actuator/prometheus` | Prometheus pull (15 s) | Prometheus | Grafana dashboards |
| Host metrics | node-exporter | Prometheus pull | Prometheus | Grafana → Node Exporter Full |
| Container metrics | cAdvisor | Prometheus pull | Prometheus | Grafana (via Prometheus datasource) |
| Application errors | Sentry SDK | HTTP POST → GlitchTip ingest | GlitchTip DB | GlitchTip UI |

View File

@@ -0,0 +1,48 @@
# ADR-017: Spring Boot 4.0 management port shares the main security filter chain
## Status
Accepted
## Context
The Familienarchiv backend runs Spring Boot Actuator on a dedicated management port (8081) so that Caddy never proxies `/actuator/*` requests and Prometheus can reach the scrape endpoint directly inside `archiv-net`.
In earlier Spring Boot versions (< 4.0), the management server ran in an isolated child application context whose security was governed independently by `ManagementWebSecurityAutoConfiguration`. The main app's `SecurityConfig` filter chain (port 8080) never intercepted requests arriving on port 8081.
In Spring Boot 4.0 with Jetty, this isolation was removed. The management server now traverses the **same** Spring Security `FilterChainProxy` as the main application. Concretely:
- Any `SecurityFilterChain` bean in the application context is evaluated for requests arriving on the management port.
- There is no longer a separate "management security" child context.
This was discovered when Prometheus began receiving HTTP 401 responses from `/actuator/prometheus` despite the endpoint being exposed and the `micrometer-registry-prometheus` dependency being present. Prometheus rejected these responses with `received unsupported Content-Type "text/html"` because the main filter chain's form-login `DelegatingAuthenticationEntryPoint` was redirecting unauthenticated requests to `/login` (302 → HTML).
A secondary issue: Spring Boot 4.0 no longer auto-enables Prometheus metrics export — `management.prometheus.metrics.export.enabled` must be set explicitly, and the Prometheus scrape endpoint requires `spring-boot-starter-micrometer-metrics` (a new starter that was split out in Spring Boot 4.0).
## Decision
1. **Dedicated management `SecurityFilterChain`** scoped to `/actuator/**` at `@Order(1)` (highest precedence). This chain:
- `permitAll()` for `/actuator/health` and `/actuator/prometheus` — required for Docker health checks and unauthenticated Prometheus scraping.
- `authenticated()` for all other actuator endpoints — blocks `/actuator/metrics`, `/actuator/info`, etc. without credentials.
- Uses an explicit `401` entry point (not form-login redirect) so that API clients — including Prometheus — receive a machine-readable status code rather than an HTML redirect.
- No CSRF, no form login.
2. **Belt-and-suspenders `permitAll()` in the main `SecurityFilterChain`** for `/actuator/health` and `/actuator/prometheus`, in case a future configuration change causes these paths to escape the management chain's `securityMatcher`.
3. **Network isolation as the outer defense boundary.** Port 8081 is not published in `docker-compose.yml` and is not routed through Caddy. Only services inside `archiv-net` (primarily Prometheus and the Docker health checker) can reach the management port.
## Alternatives rejected
- **Exclude `ManagementWebSecurityAutoConfiguration`:** This auto-configuration no longer exists in Spring Boot 4.0. Exclusion is not applicable.
- **Keep `SecurityConfig` as the sole filter chain without `@Order(1)` management chain:** The main chain's form-login `DelegatingAuthenticationEntryPoint` redirects browser-like clients to `/login` (302). Prometheus and automated health check clients cannot follow this redirect, so the endpoint would be unreachable without a dedicated chain that returns plain 401 or 200.
- **Per-endpoint `@Order(1)` filter chain using `EndpointRequest.toAnyEndpoint()`:** The `spring-boot-security` artifact that provides `EndpointRequest` is not a transitive dependency of `spring-boot-starter-actuator` in Spring Boot 4.0. Using a path-based `securityMatcher("/actuator/**")` achieves the same scoping without an extra dependency.
## Consequences
- All actuator endpoints on port 8081 that are not explicitly `permitAll()`-ed require HTTP Basic credentials. Without valid credentials, the response is 401 (not a redirect).
- Adding a new actuator endpoint to `management.endpoints.web.exposure.include` implicitly protects it via `anyRequest().authenticated()` in the management chain — no additional `permitAll()` needed unless intentional.
- A regression test (`ActuatorPrometheusIT`) verifies:
- `/actuator/prometheus` returns 200 without credentials.
- `/actuator/metrics` returns 401 without credentials.
- Prometheus metric names are present in the response body.
- If port 8081 is ever accidentally published in `docker-compose.yml`, actuator endpoints other than health and prometheus are still protected by HTTP Basic. This reduces (but does not eliminate) the risk of inadvertent exposure.

View File

@@ -23,7 +23,7 @@ System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-
Container(cadvisor, "cAdvisor", "gcr.io/cadvisor/cadvisor:v0.52.1", "Per-container resource metrics.") Container(cadvisor, "cAdvisor", "gcr.io/cadvisor/cadvisor:v0.52.1", "Per-container resource metrics.")
Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.") Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.")
Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD.") Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD.")
Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP gRPC receiver on port 4317 (archiv-net). Grafana queries traces on port 3200 (obs-net). All ports internal only.") Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP HTTP receiver on port 4318 (archiv-net). Grafana queries traces on port 3200 (obs-net). All ports internal only.")
Container(grafana, "Grafana", "grafana/grafana-oss:11.6.1", "Unified observability UI — dashboards, logs, traces. Datasources (Prometheus, Loki, Tempo) and three dashboards are auto-provisioned.") Container(grafana, "Grafana", "grafana/grafana-oss:11.6.1", "Unified observability UI — dashboards, logs, traces. Datasources (Prometheus, Loki, Tempo) and three dashboards are auto-provisioned.")
Container(glitchtip, "GlitchTip", "glitchtip/glitchtip:6.1.6", "Sentry-compatible error tracker — web process. Receives frontend + backend error events, groups by fingerprint, provides issue UI with stack traces.") Container(glitchtip, "GlitchTip", "glitchtip/glitchtip:6.1.6", "Sentry-compatible error tracker — web process. Receives frontend + backend error events, groups by fingerprint, provides issue UI with stack traces.")
Container(obs_glitchtip_worker, "GlitchTip Worker", "glitchtip/glitchtip:6.1.6", "Celery + beat worker — async event ingestion, notifications, cleanup.") Container(obs_glitchtip_worker, "GlitchTip Worker", "glitchtip/glitchtip:6.1.6", "Celery + beat worker — async event ingestion, notifications, cleanup.")
@@ -42,7 +42,7 @@ Rel(backend, mail, "Sends notification and password-reset emails (optional)", "S
Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned") Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI") Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API") Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
Rel(backend, tempo, "Sends distributed traces via OTLP", "gRPC / OTLP / port 4317 (archiv-net)") Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
Rel(grafana, prometheus, "Queries metrics", "HTTP 9090") Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
Rel(grafana, loki, "Queries logs", "HTTP 3100") Rel(grafana, loki, "Queries logs", "HTTP 3100")
Rel(grafana, tempo, "Queries traces", "HTTP 3200") Rel(grafana, tempo, "Queries traces", "HTTP 3200")

View File

@@ -15,8 +15,6 @@ scrape_configs:
metrics_path: /actuator/prometheus metrics_path: /actuator/prometheus
static_configs: static_configs:
# Uses the Docker service name (not container_name) for reliable DNS resolution. # Uses the Docker service name (not container_name) for reliable DNS resolution.
# Target will show as DOWN until backend instrumentation issue adds
# micrometer-registry-prometheus and exposes the endpoint — this is expected.
- targets: ['backend:8081'] - targets: ['backend:8081']
- job_name: ocr-service - job_name: ocr-service

View File

@@ -28,3 +28,5 @@ scrape_configs:
target_label: 'compose_project' target_label: 'compose_project'
- source_labels: ['__meta_docker_container_log_stream'] - source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream' target_label: 'logstream'
- source_labels: ['__meta_docker_container_label_com_docker_compose_service']
target_label: 'job'