diff --git a/CLAUDE.md b/CLAUDE.md
index 88db450b..d3acc1a2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -299,6 +299,10 @@ Run via `docker-compose.observability.yml` — requires the main stack to be run
| `SENTRY_DSN` | GlitchTip/Sentry DSN for the backend (Spring Boot) — leave empty to disable |
| `VITE_SENTRY_DSN` | GlitchTip/Sentry DSN for the frontend (SvelteKit) — injected at build time via Vite |
+## Observability
+
+→ See [docs/OBSERVABILITY.md](./docs/OBSERVABILITY.md) — where to look for logs, traces, metrics, and errors.
+
## API Testing
HTTP test files are in `backend/api_tests/` for use with the VS Code REST Client extension.
diff --git a/backend/pom.xml b/backend/pom.xml
index dd0bc03c..0dd83185 100644
--- a/backend/pom.xml
+++ b/backend/pom.xml
@@ -48,6 +48,11 @@
org.springframework.boot
spring-boot-starter-actuator
+
+
+ org.springframework.boot
+ spring-boot-starter-micrometer-metrics
+
org.springframework.boot
spring-boot-starter-validation
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/security/SecurityConfig.java b/backend/src/main/java/org/raddatz/familienarchiv/security/SecurityConfig.java
index 298d9fa6..8b1a45ac 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/security/SecurityConfig.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/security/SecurityConfig.java
@@ -3,13 +3,16 @@ package org.raddatz.familienarchiv.security;
import lombok.RequiredArgsConstructor;
import org.raddatz.familienarchiv.user.CustomUserDetailsService;
+import jakarta.servlet.http.HttpServletResponse;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
+import org.springframework.core.annotation.Order;
import org.springframework.core.env.Environment;
import org.springframework.security.authentication.dao.DaoAuthenticationProvider;
import org.springframework.security.config.Customizer;
import org.springframework.security.config.annotation.web.builders.HttpSecurity;
import org.springframework.security.config.annotation.web.configuration.EnableWebSecurity;
+import org.springframework.security.config.annotation.web.configurers.AbstractHttpConfigurer;
import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder;
import org.springframework.security.crypto.password.PasswordEncoder;
import org.springframework.security.web.SecurityFilterChain;
@@ -34,6 +37,28 @@ public class SecurityConfig {
return authProvider;
}
+    /** Security chain for actuator paths only; {@code @Order(1)} places it ahead of the application chain. */
+    @Bean
+    @Order(1)
+    public SecurityFilterChain managementFilterChain(HttpSecurity http) throws Exception {
+        http
+            .securityMatcher("/actuator/**")
+            .authorizeHttpRequests(auth -> auth
+                // Health and Prometheus are open — Docker health checks and Prometheus scraping need no credentials.
+                .requestMatchers("/actuator/health", "/actuator/prometheus").permitAll()
+                // All other actuator endpoints (metrics, info, env, heapdump…) require authentication.
+                .anyRequest().authenticated())
+            // HTTP Basic is the credential mechanism for those endpoints (form login is disabled on this chain).
+            .httpBasic(Customizer.withDefaults())
+            // Answer unauthenticated requests with a bare 401 rather than a 302 → /login redirect,
+            // so tests and API clients can distinguish "not authenticated" from "not found".
+            .exceptionHandling(ex -> ex.authenticationEntryPoint(
+                (req, res, e) -> res.setStatus(HttpServletResponse.SC_UNAUTHORIZED)))
+            .formLogin(AbstractHttpConfigurer::disable)
+            .csrf(AbstractHttpConfigurer::disable);
+        return http.build();
+    }
+
@Bean
public SecurityFilterChain securityFilterChain(HttpSecurity http) throws Exception {
http
@@ -54,8 +79,10 @@ public class SecurityConfig {
.csrf(csrf -> csrf.disable())
.authorizeHttpRequests(auth -> {
- // Health endpoint must be open so CI/Docker health checks work without credentials
- auth.requestMatchers("/actuator/health").permitAll();
+ // Actuator endpoints are governed by managementFilterChain (@Order(1)) above.
+ // The permitAll() lines here are a belt-and-suspenders fallback in case any
+ // actuator path escapes that chain's securityMatcher. See docs/adr/017.
+ auth.requestMatchers("/actuator/health", "/actuator/prometheus").permitAll();
// Password reset endpoints are unauthenticated by nature
auth.requestMatchers("/api/auth/forgot-password", "/api/auth/reset-password").permitAll();
// Invite-based registration endpoints are public
diff --git a/backend/src/main/resources/application.yaml b/backend/src/main/resources/application.yaml
index ead3d9e8..776b2ab1 100644
--- a/backend/src/main/resources/application.yaml
+++ b/backend/src/main/resources/application.yaml
@@ -49,7 +49,8 @@ management:
# Management port is separate from the app port so that:
# (a) Caddy never proxies /actuator/* (it only routes :8080 → the app port)
# (b) Prometheus scrapes backend:8081 directly inside archiv-net, not via Caddy
- # (c) Spring Security's session-authenticated filter chain on :8080 never sees actuator requests
+ # Note: in Spring Boot 4.0 the management port shares the security filter chain; /actuator/health
+ # and /actuator/prometheus must be explicitly permitted in SecurityConfig — see SecurityConfig.java.
port: 8081
endpoints:
web:
@@ -58,6 +59,16 @@ management:
endpoint:
prometheus:
enabled: true
+ # Spring Boot 4.0: metrics export is disabled by default — explicitly opt in for Prometheus
+ prometheus:
+ metrics:
+ export:
+ enabled: true
+ metrics:
+ tags:
+ # Common tag applied to every metric so Grafana's Spring Boot dashboard can filter by application name.
+ # Override via MANAGEMENT_METRICS_TAGS_APPLICATION env var.
+ application: ${spring.application.name}
health:
mail:
enabled: false
@@ -66,13 +77,18 @@ management:
probability: 1.0 # 100% in dev; override via MANAGEMENT_TRACING_SAMPLING_PROBABILITY in prod compose
# OpenTelemetry trace export — failures are non-fatal (app starts cleanly without Tempo running)
-# The default http://localhost:4317 ensures CI compatibility when no observability stack is present.
+# Port 4318 = OTLP HTTP (the default transport for Spring Boot's HttpExporter).
+# Port 4317 is gRPC-only; sending HTTP/1.1 to it produces "Connection reset".
otel:
service:
name: familienarchiv-backend
exporter:
otlp:
- endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:http://localhost:4317}
+ endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:http://localhost:4318}
+ logs:
+ exporter: none # Promtail captures Docker logs; disable OTLP log export (Tempo only accepts traces)
+ metrics:
+ exporter: none # Prometheus scrapes /actuator/prometheus; disable OTLP metric export to Tempo
springdoc:
api-docs:
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/ActuatorPrometheusIT.java b/backend/src/test/java/org/raddatz/familienarchiv/ActuatorPrometheusIT.java
new file mode 100644
index 00000000..e68ccfc7
--- /dev/null
+++ b/backend/src/test/java/org/raddatz/familienarchiv/ActuatorPrometheusIT.java
@@ -0,0 +1,63 @@
+package org.raddatz.familienarchiv;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.boot.test.web.server.LocalManagementPort;
+import org.springframework.context.annotation.Import;
+import org.springframework.http.ResponseEntity;
+import org.springframework.test.context.ActiveProfiles;
+import org.springframework.test.context.bean.override.mockito.MockitoBean;
+import org.springframework.web.client.DefaultResponseErrorHandler;
+import org.springframework.web.client.RestTemplate;
+import software.amazon.awssdk.services.s3.S3Client;
+
+import java.io.IOException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Integration test: on the management port, /actuator/prometheus is open and /actuator/metrics answers 401. */
+@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
+@ActiveProfiles("test")
+@Import(PostgresContainerConfig.class)
+class ActuatorPrometheusIT {
+
+    @LocalManagementPort
+    private int managementPort;
+
+    // Replaces the S3 client bean with a Mockito mock; these tests only call actuator endpoints.
+    @MockitoBean
+    S3Client s3Client;
+
+    @Test
+    void prometheus_endpoint_returns_200_without_credentials() {
+        assertThat(getActuator("/prometheus").getStatusCode().value()).isEqualTo(200);
+    }
+
+    @Test
+    void prometheus_endpoint_returns_jvm_metrics() {
+        assertThat(getActuator("/prometheus").getBody()).contains("jvm_memory_used_bytes");
+    }
+
+    @Test
+    void actuator_metrics_requires_authentication() {
+        assertThat(getActuator("/metrics").getStatusCode().value()).isEqualTo(401);
+    }
+
+    /** Sends a credential-free GET to "/actuator" + path on the management port; the body is read as text. */
+    private ResponseEntity<String> getActuator(String path) {
+        return noThrowTemplate().getForEntity(
+                "http://localhost:" + managementPort + "/actuator" + path, String.class);
+    }
+
+    /** Builds a RestTemplate whose error handler reports no errors, so 4xx/5xx responses are returned, not thrown. */
+    private RestTemplate noThrowTemplate() {
+        RestTemplate template = new RestTemplate();
+        template.setErrorHandler(new DefaultResponseErrorHandler() {
+            @Override
+            public boolean hasError(org.springframework.http.client.ClientHttpResponse response) throws IOException {
+                return false;
+            }
+        });
+        return template;
+    }
+}
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 28730f7e..9fcb453f 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -213,7 +213,11 @@ services:
APP_MAIL_FROM: ${APP_MAIL_FROM:-noreply@raddatz.cloud}
SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-true}
SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-true}
- OTEL_EXPORTER_OTLP_ENDPOINT: http://tempo:4317
+ OTEL_EXPORTER_OTLP_ENDPOINT: http://tempo:4318
+ OTEL_LOGS_EXPORTER: none
+ OTEL_METRICS_EXPORTER: none
+ MANAGEMENT_METRICS_TAGS_APPLICATION: Familienarchiv
+ MANAGEMENT_TRACING_SAMPLING_PROBABILITY: ${MANAGEMENT_TRACING_SAMPLING_PROBABILITY:-0.1}
networks:
- archiv-net
healthcheck:
diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
index e959d504..0b852e4d 100644
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -107,7 +107,10 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
| `MAIL_SMTP_AUTH` | SMTP auth enabled | `false` (dev) | YES (prod) | — |
| `MAIL_STARTTLS_ENABLE` | STARTTLS enabled | `false` (dev) | YES (prod) | — |
| `SPRING_PROFILES_ACTIVE` | Spring profile | `dev,e2e` (compose) | YES | — |
-| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP gRPC endpoint for distributed traces (Tempo). Set to `http://tempo:4317` via compose. | `http://localhost:4317` | — | — |
+| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP HTTP endpoint for distributed traces (Tempo). Port 4318 = HTTP transport; port 4317 is gRPC-only and causes "Connection reset" with Spring Boot's HttpExporter. | `http://localhost:4318` | — | — |
+| `OTEL_LOGS_EXPORTER` | Disable OTLP log export — Promtail captures Docker logs via the logging driver; Tempo does not accept logs. | `none` | — | — |
+| `OTEL_METRICS_EXPORTER` | Disable OTLP metric export — Prometheus scrapes `/actuator/prometheus` via pull model; Tempo does not accept metrics. | `none` | — | — |
+| `MANAGEMENT_METRICS_TAGS_APPLICATION` | Common tag added to every Micrometer metric. Required by Grafana's Spring Boot Observability dashboard (ID 17175) `label_values(application)` template variable. | `Familienarchiv` | — | — |
| `MANAGEMENT_TRACING_SAMPLING_PROBABILITY` | Micrometer tracing sample rate; overridden to `0.0` in test profile. | `0.1` (compose) / `1.0` (dev) | — | — |
| `SENTRY_DSN` | GlitchTip / Sentry DSN for backend error reporting. Leave empty to disable the SDK. Set after GlitchTip first-run (§4). | — | — | YES |
@@ -280,6 +283,9 @@ Before the first deploy: rotate `PROD_APP_ADMIN_PASSWORD` to a strong value. Aft
## 4. Logs + observability
+> **Developer guide (where to look for what, LogQL queries, trace exploration) → [docs/OBSERVABILITY.md](./OBSERVABILITY.md).**
+> This section covers the ops side: starting the stack, env vars, and CI wiring.
+
### First-response commands
```bash
@@ -374,8 +380,8 @@ Current services:
| `obs-node-exporter` | `prom/node-exporter:v1.9.0` | Host-level CPU / memory / disk / network metrics |
| `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics |
| `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). |
-| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, and `compose_project` labels |
-| `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP gRPC receiver on port 4317, OTLP HTTP on port 4318 (both `archiv-net`-internal). Grafana queries traces on port 3200 (`obs-net`-internal). All ports are `expose`-only (not host-bound). |
+| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, `compose_project`, and `job` labels. The `job` label is mapped from the Docker Compose service name (`com.docker.compose.service`) so that Grafana Loki dashboard queries (`{job="backend"}`, `{job="frontend"}`) work out of the box and the "App" variable dropdown is populated. |
+| `obs-tempo` | `grafana/tempo:2.7.2` | Distributed trace storage — OTLP HTTP receiver on port 4318 (`archiv-net`-internal; backend sends traces here). Grafana queries traces on port 3200 (`obs-net`-internal). All ports are `expose`-only (not host-bound). |
| `obs-grafana` | `grafana/grafana-oss:11.6.1` | Unified observability UI — metrics dashboards, log exploration, trace viewer. Bound to `127.0.0.1:${PORT_GRAFANA:-3003}` on the host. |
| `obs-glitchtip` | `glitchtip/glitchtip:6.1.6` | Sentry-compatible error tracker. Receives frontend + backend error events, groups by fingerprint, provides issue UI with stack traces. Bound to `127.0.0.1:${PORT_GLITCHTIP:-3002}`. |
| `obs-glitchtip-worker` | `glitchtip/glitchtip:6.1.6` | Celery + beat worker — processes async GlitchTip tasks (event ingestion, notifications, cleanup). |
diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md
new file mode 100644
index 00000000..b895e849
--- /dev/null
+++ b/docs/OBSERVABILITY.md
@@ -0,0 +1,180 @@
+# Observability Guide
+
+> **Ops reference (starting the stack, env vars, CI wiring) → [DEPLOYMENT.md §4](./DEPLOYMENT.md#4-logs--observability).**
+> This file is for developers: what signal lives where, how to reach it, and what to look for.
+
+## Where to look for what
+
+| I want to… | Go to |
+|---|---|
+| See the last N log lines from the backend | `docker compose logs --tail=100 backend` |
+| Search logs by keyword across time | Grafana → Explore → Loki |
+| Understand why an HTTP request failed | Grafana → Explore → Loki → filter by `traceId` → follow link to Tempo |
+| See a full distributed trace (DB queries, HTTP calls) | Grafana → Explore → Tempo → search by service or trace ID |
+| Check JVM heap / GC / thread count | Grafana → Dashboards → Spring Boot Observability |
+| Check HTTP error rate or p95 latency | Grafana → Dashboards → Spring Boot Observability |
+| Check host CPU / memory / disk | Grafana → Dashboards → Node Exporter Full |
+| See grouped application errors with stack traces | GlitchTip |
+| Check if the backend is healthy | `curl http://localhost:8081/actuator/health` (on the server) |
+| Check what Prometheus is scraping | `curl http://localhost:9090/api/v1/targets` (on the server) |
+
+## Access
+
+| Tool | External URL | Who it's for |
+|---|---|---|
+| Grafana | `https://grafana.archiv.raddatz.cloud` | Logs, metrics, traces — the primary observability UI |
+| GlitchTip | `https://glitchtip.archiv.raddatz.cloud` | Grouped errors with stack traces and release tracking |
+
+Loki, Tempo, and Prometheus have no external URL. They are internal services, accessible only through Grafana (or via SSH tunnel — see below).
+
+## Logs (Loki)
+
+Logs reach Loki via Promtail, which reads all Docker container logs from the Docker socket and ships them with labels derived from Docker Compose metadata.
+
+### Labels available in every log line
+
+| Label | What it contains | Example |
+|---|---|---|
+| `job` | Compose service name | `backend`, `frontend`, `db` |
+| `compose_service` | Same as `job` | `backend` |
+| `compose_project` | Compose project name | `archiv-staging`, `archiv-production` |
+| `container_name` | Docker container name | `archiv-staging-backend-1` |
+| `filename` | Docker log source | `/var/lib/docker/containers/…` |
+
+**Use `job` in LogQL queries** — it is stable across dev, staging, and production. `container_name` changes between environments.
+
+### Common LogQL queries in Grafana Explore
+
+```logql
+# All backend logs
+{job="backend"}
+
+# Backend ERROR and WARN lines only
+{job="backend"} |~ "ERROR|WARN"
+
+# All logs for a specific request (paste a traceId from a log line)
+{job="backend"} |= "3fa85f6457174562b3fc2c963f66afa6"
+
+# Log lines containing a specific exception class
+{job="backend"} |~ "DomainException|NullPointerException"
+
+# Frontend logs
+{job="frontend"}
+
+# Database (slow query log, if enabled)
+{job="db"}
+```
+
+### Log → Trace correlation
+
+Spring Boot writes the active `traceId` into every log line when a request is being processed:
+
+```
+2026-05-16 ... INFO [Familienarchiv,3fa85f64...,1b2c3d4e] o.r.f.document.DocumentService : ...
+```
+
+In Grafana Explore → Loki, log lines with a `traceId` field show a **Tempo** link. Clicking it opens the full trace in Explore → Tempo without copying and pasting IDs.
+
+This linking is configured in the Loki datasource provisioning via the `traceId` derived field regex. No manual setup required.
+
+## Traces (Tempo)
+
+The backend sends traces to Tempo via OTLP HTTP (port 4318). Every inbound HTTP request and every JPA query produces a span. Spans are linked into one trace by their shared trace ID, which is propagated between services in the W3C `traceparent` header.
+
+### Finding a trace in Grafana
+
+**Option A — from a log line:**
+1. Grafana → Explore → select *Loki* datasource
+2. Query `{job="backend"}` and find the failing request
+3. Click the **Tempo** link in the log line (appears when `traceId` is present)
+
+**Option B — by service:**
+1. Grafana → Explore → select *Tempo* datasource
+2. Query type: **Search**
+3. Service name: `familienarchiv-backend`
+4. Filter by HTTP status, duration, or operation name as needed
+
+**Option C — by trace ID:**
+1. Grafana → Explore → select *Tempo* datasource
+2. Query type: **TraceQL** or **Trace ID**
+3. Paste the trace ID
+
+### What each span type tells you
+
+| Root span name pattern | What it covers |
+|---|---|
+| `GET /api/documents`, `POST /api/documents` | Full HTTP request lifecycle |
+| `SELECT archiv.*` | A single JPA/JDBC query inside that request |
+| `HikariPool.getConnection` | Connection pool wait time |
+
+A slow `SELECT` span inside an otherwise fast HTTP span pinpoints a missing index. A slow `HikariPool.getConnection` span indicates connection pool exhaustion.
+
+### Sampling rate
+
+- **Dev**: 100% of requests are traced (`management.tracing.sampling.probability: 1.0` in `application.yaml`)
+- **Staging / Production**: 10% (`MANAGEMENT_TRACING_SAMPLING_PROBABILITY=0.1` in `docker-compose.prod.yml`)
+
+To find a trace for a specific request in staging/production, either increase the sampling rate temporarily or trigger the request multiple times.
+
+## Metrics (Prometheus → Grafana)
+
+Prometheus scrapes the backend management endpoint every 15 s:
+
+```
+Target: backend:8081/actuator/prometheus
+Labels: job="spring-boot", application="Familienarchiv"
+```
+
+All Spring Boot metrics carry the `application="Familienarchiv"` tag, which is how the Grafana Spring Boot Observability dashboard (ID 17175) filters to this service.
+
+### Useful Prometheus queries (run on the server or via Grafana Explore → Prometheus)
+
+```promql
+# HTTP error rate (5xx) as a fraction of all requests
+sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
+ / sum(rate(http_server_requests_seconds_count[5m]))
+
+# p95 response time
+histogram_quantile(0.95, sum by (le) (
+ rate(http_server_requests_seconds_bucket[5m])
+))
+
+# JVM heap used
+jvm_memory_used_bytes{area="heap", application="Familienarchiv"}
+
+# Active DB connections
+hikaricp_connections_active
+```
+
+## Errors (GlitchTip)
+
+GlitchTip receives errors from both the backend (via Sentry Java SDK) and the frontend (via Sentry JavaScript SDK). It groups events by fingerprint, tracks first/last seen times, and links to the release that introduced the error.
+
+GlitchTip complements Loki: use GlitchTip when you need **grouped, de-duplicated errors with stack traces and release attribution**; use Loki when you need **raw log lines with full context** or want to search across all log levels.
+
+## Direct API access (debugging only)
+
+Loki and Tempo bind no host ports. To reach them directly from your laptop, use an SSH tunnel through the server:
+
+```bash
+# Loki API on localhost:3100 (then query via curl or logcli)
+ssh -L 3100:172.20.0.x:3100 root@raddatz.cloud
+# Replace 172.20.0.x with the obs-loki container IP:
+#   docker inspect obs-loki --format '{{(index .NetworkSettings.Networks "archiv-obs-net").IPAddress}}'
+
+# Tempo API on localhost:3200 (then query via curl or tempo-cli)
+ssh -L 3200:172.20.0.x:3200 root@raddatz.cloud
+```
+
+In practice, Grafana Explore covers all common debugging workflows without needing direct API access.
+
+## Signal summary
+
+| Signal | Source | Transport | Storage | UI |
+|---|---|---|---|---|
+| Application logs | Spring Boot stdout → Docker log driver | Promtail → Loki push API | Loki | Grafana Explore → Loki |
+| Distributed traces | Spring Boot OTel agent | OTLP HTTP → Tempo:4318 | Tempo | Grafana Explore → Tempo |
+| JVM + HTTP metrics | Spring Actuator `/actuator/prometheus` | Prometheus pull (15 s) | Prometheus | Grafana dashboards |
+| Host metrics | node-exporter | Prometheus pull | Prometheus | Grafana → Node Exporter Full |
+| Container metrics | cAdvisor | Prometheus pull | Prometheus | Grafana (via Prometheus datasource) |
+| Application errors | Sentry SDK | HTTP POST → GlitchTip ingest | GlitchTip DB | GlitchTip UI |
diff --git a/docs/adr/017-management-port-security.md b/docs/adr/017-management-port-security.md
new file mode 100644
index 00000000..ed60150c
--- /dev/null
+++ b/docs/adr/017-management-port-security.md
@@ -0,0 +1,48 @@
+# ADR-017: Spring Boot 4.0 management port shares the main security filter chain
+
+## Status
+
+Accepted
+
+## Context
+
+The Familienarchiv backend runs Spring Boot Actuator on a dedicated management port (8081) so that Caddy never proxies `/actuator/*` requests and Prometheus can reach the scrape endpoint directly inside `archiv-net`.
+
+In earlier Spring Boot versions (< 4.0), the management server ran in an isolated child application context whose security was governed independently by `ManagementWebSecurityAutoConfiguration`. The main app's `SecurityConfig` filter chain (port 8080) never intercepted requests arriving on port 8081.
+
+In Spring Boot 4.0 with Jetty, this isolation was removed. The management server now traverses the **same** Spring Security `FilterChainProxy` as the main application. Concretely:
+
+- Any `SecurityFilterChain` bean in the application context is evaluated for requests arriving on the management port.
+- There is no longer a separate "management security" child context.
+
+This was discovered when Prometheus began receiving HTTP 401 responses from `/actuator/prometheus` despite the endpoint being exposed and the `micrometer-registry-prometheus` dependency being present. Prometheus rejected these responses with `received unsupported Content-Type "text/html"` because the main filter chain's form-login `DelegatingAuthenticationEntryPoint` was redirecting unauthenticated requests to `/login` (302 → HTML).
+
+A secondary issue: Spring Boot 4.0 no longer auto-enables Prometheus metrics export — `management.prometheus.metrics.export.enabled` must be set explicitly, and the Prometheus scrape endpoint requires `spring-boot-starter-micrometer-metrics` (a new starter that was split out in Spring Boot 4.0).
+
+## Decision
+
+1. **Dedicated management `SecurityFilterChain`** scoped to `/actuator/**` at `@Order(1)`, so it is consulted before the main chain (which declares no `@Order` and therefore sorts after it). This chain:
+ - `permitAll()` for `/actuator/health` and `/actuator/prometheus` — required for Docker health checks and unauthenticated Prometheus scraping.
+ - `authenticated()` for all other actuator endpoints — blocks `/actuator/metrics`, `/actuator/info`, etc. without credentials.
+ - Uses an explicit `401` entry point (not form-login redirect) so that API clients — including Prometheus — receive a machine-readable status code rather than an HTML redirect.
+ - No CSRF, no form login.
+
+2. **Belt-and-suspenders `permitAll()` in the main `SecurityFilterChain`** for `/actuator/health` and `/actuator/prometheus`, in case a future configuration change causes these paths to escape the management chain's `securityMatcher`.
+
+3. **Network isolation as the outer defense boundary.** Port 8081 is not published in `docker-compose.yml` and is not routed through Caddy. Only services inside `archiv-net` (primarily Prometheus and the Docker health checker) can reach the management port.
+
+## Alternatives rejected
+
+- **Exclude `ManagementWebSecurityAutoConfiguration`:** This auto-configuration no longer exists in Spring Boot 4.0. Exclusion is not applicable.
+- **Keep `SecurityConfig` as the sole filter chain without `@Order(1)` management chain:** The main chain's form-login `DelegatingAuthenticationEntryPoint` redirects browser-like clients to `/login` (302). Prometheus and automated health check clients cannot follow this redirect, so the endpoint would be unreachable without a dedicated chain that returns plain 401 or 200.
+- **Per-endpoint `@Order(1)` filter chain using `EndpointRequest.toAnyEndpoint()`:** The `spring-boot-security` artifact that provides `EndpointRequest` is not a transitive dependency of `spring-boot-starter-actuator` in Spring Boot 4.0. Using a path-based `securityMatcher("/actuator/**")` achieves the same scoping without an extra dependency.
+
+## Consequences
+
+- All actuator endpoints on port 8081 that are not explicitly `permitAll()`-ed require HTTP Basic credentials. Without valid credentials, the response is 401 (not a redirect).
+- Adding a new actuator endpoint to `management.endpoints.web.exposure.include` implicitly protects it via `anyRequest().authenticated()` in the management chain — no additional `permitAll()` needed unless intentional.
+- A regression test (`ActuatorPrometheusIT`) verifies:
+ - `/actuator/prometheus` returns 200 without credentials.
+ - `/actuator/metrics` returns 401 without credentials.
+ - Prometheus metric names are present in the response body.
+- If port 8081 is ever accidentally published in `docker-compose.yml`, actuator endpoints other than health and prometheus are still protected by HTTP Basic. This reduces (but does not eliminate) the risk of inadvertent exposure.
diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml
index 02167cdb..346efe75 100644
--- a/docs/architecture/c4/l2-containers.puml
+++ b/docs/architecture/c4/l2-containers.puml
@@ -23,7 +23,7 @@ System_Boundary(observability, "Observability Stack (/opt/familienarchiv/docker-
Container(cadvisor, "cAdvisor", "gcr.io/cadvisor/cadvisor:v0.52.1", "Per-container resource metrics.")
Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.")
Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD.")
- Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP gRPC receiver on port 4317 (archiv-net). Grafana queries traces on port 3200 (obs-net). All ports internal only.")
+ Container(tempo, "Tempo", "grafana/tempo:2.7.2", "Distributed trace storage. OTLP HTTP receiver on port 4318 (archiv-net). Grafana queries traces on port 3200 (obs-net). All ports internal only.")
Container(grafana, "Grafana", "grafana/grafana-oss:11.6.1", "Unified observability UI — dashboards, logs, traces. Datasources (Prometheus, Loki, Tempo) and three dashboards are auto-provisioned.")
Container(glitchtip, "GlitchTip", "glitchtip/glitchtip:6.1.6", "Sentry-compatible error tracker — web process. Receives frontend + backend error events, groups by fingerprint, provides issue UI with stack traces.")
Container(obs_glitchtip_worker, "GlitchTip Worker", "glitchtip/glitchtip:6.1.6", "Celery + beat worker — async event ingestion, notifications, cleanup.")
@@ -42,7 +42,7 @@ Rel(backend, mail, "Sends notification and password-reset emails (optional)", "S
Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
-Rel(backend, tempo, "Sends distributed traces via OTLP", "gRPC / OTLP / port 4317 (archiv-net)")
+Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
Rel(grafana, loki, "Queries logs", "HTTP 3100")
Rel(grafana, tempo, "Queries traces", "HTTP 3200")
diff --git a/infra/observability/prometheus/prometheus.yml b/infra/observability/prometheus/prometheus.yml
index 38a0f8d6..a29cc75b 100644
--- a/infra/observability/prometheus/prometheus.yml
+++ b/infra/observability/prometheus/prometheus.yml
@@ -15,8 +15,6 @@ scrape_configs:
metrics_path: /actuator/prometheus
static_configs:
# Uses the Docker service name (not container_name) for reliable DNS resolution.
- # Target will show as DOWN until backend instrumentation issue adds
- # micrometer-registry-prometheus and exposes the endpoint — this is expected.
- targets: ['backend:8081']
- job_name: ocr-service
diff --git a/infra/observability/promtail/promtail-config.yml b/infra/observability/promtail/promtail-config.yml
index b569c22f..b31781a4 100644
--- a/infra/observability/promtail/promtail-config.yml
+++ b/infra/observability/promtail/promtail-config.yml
@@ -28,3 +28,5 @@ scrape_configs:
target_label: 'compose_project'
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream'
+ - source_labels: ['__meta_docker_container_label_com_docker_compose_service']
+ target_label: 'job'