From 22e1b25398315bd13190d9b1a1d0299c2eca6424 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 02:18:22 +0200 Subject: [PATCH 1/2] devops(observability): add Loki + Promtail for centralised container log aggregation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add obs-loki (grafana/loki:3.4.2) to docker-compose.observability.yml with healthcheck (wget /ready), expose-only port 3100, named volume loki_data - Add obs-promtail (grafana/promtail:3.4.2) bridging archiv-net + obs-net, depends_on loki service_healthy, docker.sock:ro, promtail_positions volume for restart-safe position tracking - Create infra/observability/loki/loki-config.yml: single-node TSDB schema v13, 30-day retention, auth disabled (obs-net only), telemetry off - Create infra/observability/promtail/promtail-config.yml: Docker SD scrape, container_name / compose_service / compose_project / logstream labels - Update docs/DEPLOYMENT.md §4 with service table and Loki quick-check commands Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.observability.yml | 45 +++++++++++++++++-- docs/DEPLOYMENT.md | 35 ++++++++++++++- infra/observability/loki/.gitkeep | 0 infra/observability/loki/loki-config.yml | 32 +++++++++++++ infra/observability/promtail/.gitkeep | 0 .../promtail/promtail-config.yml | 30 +++++++++++++ 6 files changed, 138 insertions(+), 4 deletions(-) delete mode 100644 infra/observability/loki/.gitkeep create mode 100644 infra/observability/loki/loki-config.yml delete mode 100644 infra/observability/promtail/.gitkeep create mode 100644 infra/observability/promtail/promtail-config.yml diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml index 216e46f6..9e186dec 100644 --- a/docker-compose.observability.yml +++ b/docker-compose.observability.yml @@ -73,9 +73,47 @@ services: - obs-net # --- Logs: Loki + Promtail --- - # loki: (see issue #574) - # promtail: (see issue #575) - # + + loki: + image: 
grafana/loki:3.4.2 + container_name: obs-loki + restart: unless-stopped + volumes: + - ./infra/observability/loki/loki-config.yml:/etc/loki/loki-config.yml:ro + - loki_data:/loki + command: -config.file=/etc/loki/loki-config.yml + expose: + - "3100" + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready | grep -q ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + networks: + - obs-net + + promtail: + image: grafana/promtail:3.4.2 + container_name: obs-promtail + restart: unless-stopped + volumes: + - ./infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + # /var/run/docker.sock gives Promtail container-name discovery. Trade-off: any + # process that can write to this socket can control the Docker daemon (container + # escape). Acceptable on a single-operator archive; review if multi-user access + # to the host is ever introduced. + - /var/run/docker.sock:/var/run/docker.sock:ro + - promtail_positions:/tmp # persists positions.yaml across restarts — avoids duplicate log ingestion + command: -config.file=/etc/promtail/promtail-config.yml + networks: + - archiv-net # label discovery from application containers via Docker socket + - obs-net # log shipping to Loki + depends_on: + loki: + condition: service_healthy + # --- Traces: Tempo --- # tempo: (see future issue) # @@ -102,6 +140,7 @@ networks: volumes: prometheus_data: loki_data: + promtail_positions: tempo_data: grafana_data: glitchtip_data: diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index c4df74aa..4eb12fbd 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -265,7 +265,40 @@ docker compose logs --tail=200 ### Observability stack -An observability stack (Prometheus + Loki + Grafana) is available via `docker-compose.observability.yml` and configuration lives under `infra/observability/`. 
It joins the `archiv-net` Docker network to scrape the backend's management port. Full wiring and runbook documentation is tracked in issue #581. +An observability stack is available via `docker-compose.observability.yml`. Configuration lives under `infra/observability/`. Start it after the main stack is up (which creates `archiv-net`): + +```bash +docker compose up -d # creates archiv-net +docker compose -f docker-compose.observability.yml up -d +``` + +Current services: + +| Service | Image | Purpose | +|---|---|---| +| `obs-prometheus` | `prom/prometheus:v3.4.0` | Scrapes metrics from backend management port 8081 (`/actuator/prometheus`), node-exporter, and cAdvisor | +| `obs-node-exporter` | `prom/node-exporter:v1.9.0` | Host-level CPU / memory / disk / network metrics | +| `obs-cadvisor` | `gcr.io/cadvisor/cadvisor:v0.52.1` | Per-container resource metrics | +| `obs-loki` | `grafana/loki:3.4.2` | Log aggregation — receives log streams from Promtail. Port 3100 is `expose`-only (not host-bound). | +| `obs-promtail` | `grafana/promtail:3.4.2` | Log shipping agent — reads all Docker container logs via the Docker socket and forwards them to Loki with `container_name`, `compose_service`, `compose_project`, and `logstream` labels | + +**Loki quick checks** (after ~60 s; run from the host — `docker exec` executes each command inside the `obs-loki` container): + +```bash +# Loki health +docker exec obs-loki wget -qO- http://localhost:3100/ready + +# List labels +docker exec obs-loki wget -qO- 'http://localhost:3100/loki/api/v1/labels' + +# Query logs by service (stable across dev and prod environments) +docker exec obs-loki wget -qO- \ + 'http://localhost:3100/loki/api/v1/query_range?query=%7Bcompose_service%3D%22backend%22%7D&limit=5' +``` + +**Prefer `compose_service` over `container_name` in LogQL queries** — `container_name` differs between dev (`archive-backend`) and prod (`archiv-production-backend-1`), while `compose_service` is stable (`backend`, `db`, `minio`, etc.). 
+ +Prometheus port `9090` is bound to `127.0.0.1:${PORT_PROMETHEUS:-9090}` on the host. No other observability ports are host-bound. Full wiring and Grafana dashboards are tracked in issue #581. --- diff --git a/infra/observability/loki/.gitkeep b/infra/observability/loki/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/observability/loki/loki-config.yml b/infra/observability/loki/loki-config.yml new file mode 100644 index 00000000..36abdd07 --- /dev/null +++ b/infra/observability/loki/loki-config.yml @@ -0,0 +1,32 @@ +auth_enabled: false # safe — loki is not exposed beyond obs-net. Add auth before binding port 3100 to host. + +server: + http_listen_port: 3100 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory # correct for single-node — no etcd/consul needed here + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 720h # 30 days — low-volume family archive; revisit if log volume grows + +analytics: + reporting_enabled: false # no telemetry sent to Grafana Labs diff --git a/infra/observability/promtail/.gitkeep b/infra/observability/promtail/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/infra/observability/promtail/promtail-config.yml b/infra/observability/promtail/promtail-config.yml new file mode 100644 index 00000000..b569c22f --- /dev/null +++ b/infra/observability/promtail/promtail-config.yml @@ -0,0 +1,30 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 # 0 = bind a random free port (gRPC is NOT disabled); the gRPC listener is not used by this single-node deployment + +positions: + filename: /tmp/positions.yaml # /tmp is a named volume (promtail_positions) — persists across restarts + +clients: + - url: http://loki:3100/loki/api/v1/push + # Loki HTTP 
API is unauthenticated internally. Any container on obs-net can push logs. + # Acceptable: only trusted application containers join this network. + +scrape_configs: + - job_name: docker-containers + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container_name' + # Note: container_name differs between dev (archive-backend) and prod + # (archiv-production-backend-1). Prefer compose_service for stable LogQL + # queries across environments — it is stable: backend, db, minio, etc. + - source_labels: ['__meta_docker_container_label_com_docker_compose_service'] + target_label: 'compose_service' + - source_labels: ['__meta_docker_container_label_com_docker_compose_project'] + target_label: 'compose_project' + - source_labels: ['__meta_docker_container_log_stream'] + target_label: 'logstream' From c1406a32f1a749bf27d436d553b0e9d8ee084d05 Mon Sep 17 00:00:00 2001 From: Marcel Date: Fri, 15 May 2026 02:25:34 +0200 Subject: [PATCH 2/2] devops(observability): fix C4 diagram, security comment, and add Loki compactor block Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.observability.yml | 5 +---- docs/architecture/c4/l2-containers.puml | 4 +++- infra/observability/loki/loki-config.yml | 8 ++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml index 9e186dec..82025222 100644 --- a/docker-compose.observability.yml +++ b/docker-compose.observability.yml @@ -100,10 +100,7 @@ services: volumes: - ./infra/observability/promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro - /var/lib/docker/containers:/var/lib/docker/containers:ro - # /var/run/docker.sock gives Promtail container-name discovery. Trade-off: any - # process that can write to this socket can control the Docker daemon (container - # escape). 
Acceptable on a single-operator archive; review if multi-user access - # to the host is ever introduced. + # :ro restricts file-system access but NOT Docker API permissions — a compromised Promtail has full daemon access. Accepted risk on single-operator self-hosted archive. - /var/run/docker.sock:/var/run/docker.sock:ro - promtail_positions:/tmp # persists positions.yaml across restarts — avoids duplicate log ingestion command: -config.file=/etc/promtail/promtail-config.yml diff --git a/docs/architecture/c4/l2-containers.puml b/docs/architecture/c4/l2-containers.puml index f27eda69..56968766 100644 --- a/docs/architecture/c4/l2-containers.puml +++ b/docs/architecture/c4/l2-containers.puml @@ -19,7 +19,8 @@ System_Boundary(archiv, "Familienarchiv (Docker Compose)") { System_Boundary(observability, "Observability Stack (docker-compose.observability.yml / archiv-net)") { Container(prometheus, "Prometheus", "prom/prometheus", "Scrapes metrics from backend management port 8081 (/actuator/prometheus). Retention and alert rules TBD — see issue #581.") - Container(loki, "Loki", "grafana/loki", "Log aggregation. Receives structured logs from the stack. Wiring TBD — see issue #581.") + Container(loki, "Loki", "grafana/loki:3.4.2", "Stores log streams from all containers.") + Container(promtail, "Promtail", "grafana/promtail:3.4.2", "Ships Docker container logs to Loki via Docker SD") Container(grafana, "Grafana", "grafana/grafana", "Dashboards and alerting UI. Data sources: Prometheus + Loki. 
Wiring TBD — see issue #581.") } @@ -34,5 +35,6 @@ Rel(backend, ocr, "OCR job requests with presigned MinIO URL", "HTTP / REST / JS Rel(backend, mail, "Sends notification and password-reset emails (optional)", "SMTP") Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned") Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI") +Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API") @enduml diff --git a/infra/observability/loki/loki-config.yml b/infra/observability/loki/loki-config.yml index 36abdd07..b5d84796 100644 --- a/infra/observability/loki/loki-config.yml +++ b/infra/observability/loki/loki-config.yml @@ -28,5 +28,13 @@ schema_config: limits_config: retention_period: 720h # 30 days — low-volume family archive; revisit if log volume grows +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem + analytics: reporting_enabled: false # no telemetry sent to Grafana Labs