docs(observability): ADR-024 + rotation runbook for grafana_reader

ADR-024 records the deliberate cross-domain link (obs-grafana joins archiv-net to query archive-db via the SELECT-only grafana_reader role), the rejected alternatives (Prometheus exporter, read replica, versioned migration + flyway repair, hardcoded fallback), and the consequences — specifically that a Grafana compromise gains TCP reach to archive-db but is bounded by the role's least-privilege grants. The DEPLOYMENT.md runbook documents the rotation procedure that R__grafana_reader_password.sql now enables: bump GRAFANA_DB_PASSWORD, restart backend (Flyway re-applies because the resolved checksum changed), restart obs-grafana (datasource picks up the new env var). Also calls out the fail-closed startup behavior so operators who hit IllegalStateException know it is deliberate. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
test(observability): expand grafana_reader coverage with write-deny + PII negatives
2026-05-22 17:21:27 +02:00 · 2026-05-22 17:21:01 +02:00 · 2026-05-22 17:20:35 +02:00 · 2026-05-22 17:20:09 +02:00 · 2026-05-21 20:21:27 +02:00 · 2026-05-21 20:21:05 +02:00
68 changed files with 3486 additions and 123 deletions
--- a/.env.example
+++ b/.env.example
@@ -39,6 +39,12 @@ PORT_PROMETHEUS=9090
 # Grafana admin password — change this before exposing Grafana beyond localhost
 GRAFANA_ADMIN_PASSWORD=changeme
 # Password for the read-only grafana_reader PostgreSQL role used by the PO
 # Overview dashboard. Consumed by the backend's Flyway repeatable migration
 # R__grafana_reader_password.sql (to set the role's password) and by Grafana's
 # PostgreSQL datasource (to connect). REQUIRED — the backend refuses to start
 # when it is unset or blank. Generate with: openssl rand -hex 32
 GRAFANA_DB_PASSWORD=changeme-generate-with-openssl-rand-hex-32
 # GlitchTip domain — production: use https://glitchtip.archiv.raddatz.cloud (must match Caddy vhost)
 GLITCHTIP_DOMAIN=http://localhost:3002
--- a/.gitea/workflows/nightly.yml
+++ b/.gitea/workflows/nightly.yml
@@ -31,6 +31,7 @@ name: nightly
 #   STAGING_APP_ADMIN_USERNAME
 #   STAGING_APP_ADMIN_PASSWORD
 #   GRAFANA_ADMIN_PASSWORD
 #   GRAFANA_DB_PASSWORD           (read-only grafana_reader DB role, issue #651)
 #   GLITCHTIP_SECRET_KEY
 #   SENTRY_DSN                  (set after GlitchTip first-run; empty = Sentry disabled)
@@ -79,6 +80,8 @@ jobs:
          IMPORT_HOST_DIR=/srv/familienarchiv-staging/import
          POSTGRES_USER=archiv
          SENTRY_DSN=${{ secrets.SENTRY_DSN }}
          VITE_SENTRY_DSN=${{ secrets.VITE_SENTRY_DSN }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          EOF
      - name: Verify backend /import:ro mount is wired
@@ -142,6 +145,7 @@ jobs:
          cp docker-compose.observability.yml /opt/familienarchiv/
          cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
          POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }}
          POSTGRES_HOST=archiv-staging-db-1
--- a/.gitea/workflows/release.yml
+++ b/.gitea/workflows/release.yml
@@ -35,6 +35,7 @@ name: release
 #   MAIL_USERNAME
 #   MAIL_PASSWORD
 #   GRAFANA_ADMIN_PASSWORD
 #   GRAFANA_DB_PASSWORD           (read-only grafana_reader DB role, issue #651)
 #   GLITCHTIP_SECRET_KEY
 #   SENTRY_DSN                    (set after GlitchTip first-run; empty = Sentry disabled)
@@ -77,6 +78,7 @@ jobs:
          IMPORT_HOST_DIR=/srv/familienarchiv-production/import
          POSTGRES_USER=archiv
          SENTRY_DSN=${{ secrets.SENTRY_DSN }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          EOF
      - name: Build images
@@ -110,6 +112,7 @@ jobs:
          cp docker-compose.observability.yml /opt/familienarchiv/
          cat > /opt/familienarchiv/obs-secrets.env <<'EOF'
          GRAFANA_ADMIN_PASSWORD=${{ secrets.GRAFANA_ADMIN_PASSWORD }}
          GRAFANA_DB_PASSWORD=${{ secrets.GRAFANA_DB_PASSWORD }}
          GLITCHTIP_SECRET_KEY=${{ secrets.GLITCHTIP_SECRET_KEY }}
          POSTGRES_PASSWORD=${{ secrets.PROD_POSTGRES_PASSWORD }}
          POSTGRES_HOST=archiv-production-db-1
--- a/backend/src/main/java/org/raddatz/familienarchiv/config/FlywayConfig.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/config/FlywayConfig.java
@@ -5,8 +5,10 @@ import lombok.extern.slf4j.Slf4j;
 import org.flywaydb.core.Flyway;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
 import org.springframework.core.env.Environment;
 import javax.sql.DataSource;
 import java.util.Map;
@Configuration
@RequiredArgsConstructor
@@ -14,6 +16,7 @@ import javax.sql.DataSource;
 public class FlywayConfig {
    private final DataSource dataSource;
    private final Environment environment;
    @Bean(name = "flyway")
    public Flyway flyway() {
@@ -21,6 +24,7 @@ public class FlywayConfig {
        Flyway flyway = Flyway.configure()
                .dataSource(dataSource)
                .locations("classpath:db/migration")
                .placeholders(Map.of("grafanaDbPassword", resolveGrafanaDbPassword()))
                .baselineOnMigrate(true)
                .baselineVersion("4")
                .load();
@@ -28,4 +32,22 @@ public class FlywayConfig {
        log.info("Flyway: {} migration(s) applied.", result.migrationsExecuted);
        return flyway;
    }
    /**
     * Looks up the password that Flyway receives as the {@code grafanaDbPassword}
     * placeholder.
     *
     * <p>Fail-closed by design: a missing or blank {@code GRAFANA_DB_PASSWORD}
     * aborts startup. R__grafana_reader_password.sql applies this value to the
     * grafana_reader role, so without it we would either leave the role's
     * password untouched without anyone noticing or — if a hardcoded fallback
     * existed — publish a well-known credential for a role that has SELECT on
     * audit_log, documents, and transcription_blocks. This mirrors
     * UserDataInitializer's refusal to seed default admin credentials outside
     * dev/test/e2e.
     *
     * @return the non-blank value of the {@code GRAFANA_DB_PASSWORD} property
     * @throws IllegalStateException when the property is absent or blank
     */
    String resolveGrafanaDbPassword() {
        final String password = environment.getProperty("GRAFANA_DB_PASSWORD");
        final boolean usable = password != null && !password.isBlank();
        if (usable) {
            return password;
        }
        throw new IllegalStateException(
                "GRAFANA_DB_PASSWORD is required: it is consumed by "
                + "R__grafana_reader_password.sql to (re)set the grafana_reader "
                + "role's password on every boot. Generate with: openssl rand -hex 32");
    }
 }
--- a/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/importing/MassImportService.java
@@ -56,9 +56,17 @@ public class MassImportService {
    public enum State { IDLE, RUNNING, DONE, FAILED }
    /** Machine-readable reason attached to a {@code SkippedFile} when an import row is not imported. */
    public enum SkipReason {
        /** The row's filename was rejected by {@code isValidImportFilename} before any file lookup. */
        INVALID_FILENAME_PATH_TRAVERSAL,
        /** The file was found on disk but does not start with the {@code %PDF-} signature. */
        INVALID_PDF_SIGNATURE,
        /** An {@code IOException} occurred while reading the file's magic bytes. */
        FILE_READ_ERROR,
        /** A document with the same original filename already exists and is not a placeholder. */
        ALREADY_EXISTS,
        /** Uploading the file to S3 threw an exception. */
        S3_UPLOAD_FAILED
    }
    public record SkippedFile(
            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String filename,
-            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) String reason
+            @Schema(requiredMode = Schema.RequiredMode.REQUIRED) SkipReason reason
    ) {}
    public record ImportStatus(
@@ -291,6 +299,11 @@ public class MassImportService {
            if (index.isBlank()) continue;
            String filename = index.contains(".") ? index : index + ".pdf";
            if (!isValidImportFilename(filename)) {
                log.warn("Skipping import row {}: filename rejected — {}", i, filename);
                skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_FILENAME_PATH_TRAVERSAL));
                continue;
            }
            Optional<File> fileOnDisk = findFileRecursive(filename);
            if (fileOnDisk.isEmpty()) {
                log.warn("Datei nicht gefunden, importiere nur Metadaten: {}", filename);
@@ -300,17 +313,17 @@ public class MassImportService {
                try {
                    if (!isPdfMagicBytes(fileOnDisk.get())) {
                        log.warn("Überspringe {}: Datei beginnt nicht mit %PDF-Signatur", filename);
-                        skippedFiles.add(new SkippedFile(filename, "INVALID_PDF_SIGNATURE"));
+                        skippedFiles.add(new SkippedFile(filename, SkipReason.INVALID_PDF_SIGNATURE));
                        continue;
                    }
                } catch (IOException e) {
                    log.error("Fehler beim Prüfen der Magic-Bytes für {}", filename, e);
-                    skippedFiles.add(new SkippedFile(filename, "FILE_READ_ERROR"));
+                    skippedFiles.add(new SkippedFile(filename, SkipReason.FILE_READ_ERROR));
                    continue;
                }
            }
-            Optional<String> skipReason = importSingleDocument(cells, fileOnDisk, filename, index);
+            Optional<SkipReason> skipReason = importSingleDocument(cells, fileOnDisk, filename, index);
            if (skipReason.isPresent()) {
                skippedFiles.add(new SkippedFile(filename, skipReason.get()));
            } else {
@@ -320,6 +333,23 @@ public class MassImportService {
        return new ProcessResult(processed, skippedFiles);
    }
    /**
     * Checks that a filename taken from an import row is a plain basename that
     * cannot be used to leave the import directory.
     *
     * <p>Rejected: {@code null}, blank, exactly {@code "."}, anything containing a
     * path separator (ASCII or one of the listed Unicode look-alikes), a
     * {@code ".."} sequence, a NUL character, and anything the platform treats
     * as an absolute path.
     *
     * @param filename candidate filename; may be {@code null}
     * @return {@code true} only when none of the rejection rules match
     */
    private boolean isValidImportFilename(String filename) {
        if (filename == null || filename.isBlank() || filename.equals(".")) {
            return false;
        }
        final String[] forbiddenFragments = {
                "/",
                "\\",
                "∕",  // U+2215 DIVISION SLASH
                "／",  // U+FF0F FULLWIDTH SOLIDUS
                "⧵",  // U+29F5 REVERSE SOLIDUS OPERATOR
                "..",
                "\0",
        };
        for (String fragment : forbiddenFragments) {
            if (filename.contains(fragment)) {
                return false;
            }
        }
        // Paths.get() is safe here on Linux for all inputs that passed the checks above;
        // it may throw InvalidPathException for OS-specific illegal chars on Windows,
        // but those are not reachable in production.
        return !Paths.get(filename).isAbsolute();
    }
    // package-private: Mockito spy in tests can override to inject IOException
    InputStream openFileStream(File file) throws IOException {
        return new FileInputStream(file);
@@ -342,11 +372,11 @@ public class MassImportService {
     * @return empty Optional on success; an Optional containing the skip reason on failure/skip.
     */
    @Transactional
-    protected Optional<String> importSingleDocument(List<String> cells, Optional<File> file, String originalFilename, String index) {
+    protected Optional<SkipReason> importSingleDocument(List<String> cells, Optional<File> file, String originalFilename, String index) {
        Optional<Document> existing = documentService.findByOriginalFilename(originalFilename);
        if (existing.isPresent() && existing.get().getStatus() != DocumentStatus.PLACEHOLDER) {
            log.info("Dokument {} existiert bereits, überspringe.", originalFilename);
-            return Optional.of("ALREADY_EXISTS");
+            return Optional.of(SkipReason.ALREADY_EXISTS);
        }
        String archiveBox    = getCell(cells, colBox);
@@ -382,7 +412,7 @@ public class MassImportService {
                status = DocumentStatus.UPLOADED;
            } catch (Exception e) {
                log.error("S3 Upload Fehler für {}", file.get().getName(), e);
-                return Optional.of("S3_UPLOAD_FAILED");
+                return Optional.of(SkipReason.S3_UPLOAD_FAILED);
            }
        }
@@ -460,11 +490,18 @@ public class MassImportService {
    }
    private Optional<File> findFileRecursive(String filename) {
-        try (Stream<Path> walk = Files.walk(Paths.get(importDir))) {
+        File baseDir = new File(importDir);
-            return walk.filter(p -> !Files.isDirectory(p))
+        try (Stream<Path> walk = Files.walk(baseDir.toPath())) {
            Optional<Path> match = walk.filter(p -> !Files.isDirectory(p))
                    .filter(p -> p.getFileName().toString().equals(filename))
                    .map(Path::toFile)
                    .findFirst();
            if (match.isEmpty()) return Optional.empty();
            File candidate = match.get().toFile();
            String baseDirCanonical = baseDir.getCanonicalPath();
            if (!candidate.getCanonicalPath().startsWith(baseDirCanonical + File.separator)) {
                throw DomainException.internal(ErrorCode.INTERNAL_ERROR, "Path escape detected: " + candidate);
            }
            return Optional.of(candidate);
        } catch (IOException e) {
            return Optional.empty();
        }
--- a/backend/src/main/resources/db/migration/R__grafana_reader_password.sql
+++ b/backend/src/main/resources/db/migration/R__grafana_reader_password.sql
@@ -0,0 +1,14 @@
 -- Repeatable migration: sets the grafana_reader role's password from the
 -- ${grafanaDbPassword} placeholder (resolved by FlywayConfig from the
 -- GRAFANA_DB_PASSWORD environment variable). Flyway computes the checksum on
 -- the resolved migration content, so any change to GRAFANA_DB_PASSWORD changes
 -- the checksum and re-applies this migration on the next boot. That makes
 -- password rotation a "change env var + restart" operation — no manual psql.
 --
 -- V68 created the role itself (without a usable password). This file owns the
 -- password lifecycle; nothing else writes it.
 --
 -- NOTE(review): the placeholder value is spliced into a single-quoted literal
 -- inside a $$-quoted block before the statement is sent to PostgreSQL, so the
 -- password presumably must not contain a single quote or "$$". Output of
 -- `openssl rand -hex 32` is hex-only and therefore safe — confirm before
 -- allowing arbitrary passwords.
 DO $$
 BEGIN
    -- ALTER ROLE ... PASSWORD does not accept a bind parameter, hence the
    -- dynamic statement; %L quotes the value as a SQL literal.
    EXECUTE format('ALTER ROLE grafana_reader WITH PASSWORD %L', '${grafanaDbPassword}');
 END
 $$;
--- a/backend/src/main/resources/db/migration/V68__add_grafana_reader_role.sql
+++ b/backend/src/main/resources/db/migration/V68__add_grafana_reader_role.sql
@@ -0,0 +1,17 @@
 -- Read-only role used by the Grafana PostgreSQL datasource for the PO Overview
 -- dashboard (issue #651). The role is created here without a usable password
 -- (LOGIN-capable but no password set); R__grafana_reader_password.sql sets the
 -- password from GRAFANA_DB_PASSWORD and is re-applied whenever that value
 -- changes, so rotation is just "bump the env var and restart the backend" —
 -- see docs/adr/024-* and the rotation runbook in docs/DEPLOYMENT.md.
 DO $$
 BEGIN
    -- Guarded so the migration also succeeds when the role already exists.
    IF NOT EXISTS (SELECT 1 FROM pg_catalog.pg_roles WHERE rolname = 'grafana_reader') THEN
        CREATE ROLE grafana_reader WITH LOGIN;
    END IF;
 END
 $$;
 -- Least privilege: connect to this database, use the public schema, and read
 -- exactly three tables. No write privileges and no other tables are granted.
 GRANT CONNECT ON DATABASE ${flyway:database} TO grafana_reader;
 GRANT USAGE  ON SCHEMA   public               TO grafana_reader;
 GRANT SELECT ON audit_log, documents, transcription_blocks TO grafana_reader;
--- a/backend/src/test/java/org/raddatz/familienarchiv/config/FlywayConfigTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/config/FlywayConfigTest.java
@@ -0,0 +1,37 @@
 package org.raddatz.familienarchiv.config;
 import org.junit.jupiter.api.Test;
 import org.springframework.mock.env.MockEnvironment;
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatThrownBy;
 /**
  * Unit tests for {@code FlywayConfig.resolveGrafanaDbPassword()}: the lookup
  * must fail closed when GRAFANA_DB_PASSWORD is missing or blank and otherwise
  * return the configured value unchanged. No DataSource is needed, so
  * {@code null} is passed for it.
  */
 class FlywayConfigTest {
    private static final String PROPERTY = "GRAFANA_DB_PASSWORD";
    private static final String REQUIRED_MESSAGE = "GRAFANA_DB_PASSWORD is required";

    /** Builds the config under test on top of the given environment. */
    private static FlywayConfig configFor(MockEnvironment environment) {
        return new FlywayConfig(null, environment);
    }

    @Test
    void resolveGrafanaDbPassword_throws_when_env_unset() {
        FlywayConfig underTest = configFor(new MockEnvironment());

        assertThatThrownBy(() -> underTest.resolveGrafanaDbPassword())
                .isInstanceOf(IllegalStateException.class)
                .hasMessageContaining(REQUIRED_MESSAGE);
    }

    @Test
    void resolveGrafanaDbPassword_throws_when_env_blank() {
        FlywayConfig underTest = configFor(new MockEnvironment().withProperty(PROPERTY, "   "));

        assertThatThrownBy(() -> underTest.resolveGrafanaDbPassword())
                .isInstanceOf(IllegalStateException.class)
                .hasMessageContaining(REQUIRED_MESSAGE);
    }

    @Test
    void resolveGrafanaDbPassword_returns_value_when_env_set() {
        FlywayConfig underTest = configFor(new MockEnvironment().withProperty(PROPERTY, "abc"));

        String resolved = underTest.resolveGrafanaDbPassword();

        assertThat(resolved).isEqualTo("abc");
    }
 }
--- a/backend/src/test/java/org/raddatz/familienarchiv/config/GrafanaReaderRoleIntegrationTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/config/GrafanaReaderRoleIntegrationTest.java
@@ -0,0 +1,89 @@
 package org.raddatz.familienarchiv.config;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.raddatz.familienarchiv.PostgresContainerConfig;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.boot.data.jpa.test.autoconfigure.DataJpaTest;
 import org.springframework.boot.jdbc.test.autoconfigure.AutoConfigureTestDatabase;
 import org.springframework.context.annotation.Import;
 import org.springframework.jdbc.core.JdbcTemplate;
 import static org.assertj.core.api.Assertions.assertThat;
 // GRAFANA_DB_PASSWORD is supplied via the global test default in
 // src/test/resources/application.properties — FlywayConfig fails closed
 // when it is unset, so all tests that load the migration path need it.
/**
 * Verifies the privilege boundary of the {@code grafana_reader} role after the
 * Flyway migrations have run: SELECT on the three granted tables, no write
 * privileges on them, and no SELECT on the listed protected tables.
 *
 * <p>{@code Replace.NONE} keeps the configured datasource instead of swapping
 * in an embedded database; {@code FlywayConfig} is imported so the migrations
 * are applied. Presumably {@code PostgresContainerConfig} supplies a real
 * PostgreSQL instance — confirm against that class.
 */
@DataJpaTest
@AutoConfigureTestDatabase(replace = AutoConfigureTestDatabase.Replace.NONE)
@Import({PostgresContainerConfig.class, FlywayConfig.class})
 class GrafanaReaderRoleIntegrationTest {
    @Autowired JdbcTemplate jdbc;
    // --- positive grants (SELECT on the three explicitly granted tables) ---
    @Test
    void grafana_reader_has_select_on_audit_log() {
        assertThat(hasPrivilege("audit_log", "SELECT")).isTrue();
    }
    @Test
    void grafana_reader_has_select_on_documents() {
        assertThat(hasPrivilege("documents", "SELECT")).isTrue();
    }
    @Test
    void grafana_reader_has_select_on_transcription_blocks() {
        assertThat(hasPrivilege("transcription_blocks", "SELECT")).isTrue();
    }
    // --- write-deny on the granted tables: SELECT-only means SELECT-only.
    // A future migration that GRANTs INSERT/UPDATE/DELETE on any of these
    // would fail these tests, even though the original positive grants still
    // pass. Locks the boundary in both directions.
    @Test
    void grafana_reader_has_no_INSERT_on_documents() {
        assertThat(hasPrivilege("documents", "INSERT")).isFalse();
    }
    @Test
    void grafana_reader_has_no_UPDATE_on_audit_log() {
        assertThat(hasPrivilege("audit_log", "UPDATE")).isFalse();
    }
    @Test
    void grafana_reader_has_no_DELETE_on_transcription_blocks() {
        assertThat(hasPrivilege("transcription_blocks", "DELETE")).isFalse();
    }
    // --- negative grants: PII / sensitive tables MUST NOT be readable.
    // The parameterized form catches the "someone widened the grant to
    // ALL TABLES IN SCHEMA public" footgun — three specific positive grants
    // would still pass while this sweep turns red.
    @ParameterizedTest
    @ValueSource(strings = {
            "app_users",
            "user_groups",
            "persons",
            "notifications",
            "document_comments",
            "document_annotations",
            "geschichten"
    })
    void grafana_reader_has_no_SELECT_on_protected_table(String table) {
        assertThat(hasPrivilege(table, "SELECT")).isFalse();
    }
    /**
     * Asks PostgreSQL whether {@code grafana_reader} holds the given privilege
     * on the given table, via {@code has_table_privilege}.
     *
     * @param table     table name passed to {@code has_table_privilege}
     * @param privilege privilege name, e.g. {@code SELECT} or {@code INSERT}
     * @return {@code true} only when the query returns {@code TRUE}; a
     *         {@code null} result counts as {@code false}
     */
    private boolean hasPrivilege(String table, String privilege) {
        Boolean result = jdbc.queryForObject(
                "SELECT has_table_privilege('grafana_reader', ?, ?)",
                Boolean.class,
                table,
                privilege);
        return Boolean.TRUE.equals(result);
    }
 }
--- a/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/importing/MassImportServiceTest.java
@@ -154,10 +154,10 @@ class MassImportServiceTest {
                .build();
        when(documentService.findByOriginalFilename("doc001.pdf")).thenReturn(Optional.of(existing));
-        Optional<String> result = service.importSingleDocument(minimalCells("doc001.pdf"), Optional.empty(), "doc001.pdf", "doc001");
+        Optional<MassImportService.SkipReason> result = service.importSingleDocument(minimalCells("doc001.pdf"), Optional.empty(), "doc001.pdf", "doc001");
        verify(documentService, never()).save(any());
-        assertThat(result).isPresent().contains("ALREADY_EXISTS");
+        assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS);
    }
    // ─── importSingleDocument — already-exists guard fires before file I/O ─────
@@ -179,10 +179,10 @@ class MassImportServiceTest {
        byte[] pdfHeader = {0x25, 0x50, 0x44, 0x46, 0x2D}; // %PDF-
        Files.write(physicalFile, pdfHeader);
-        Optional<String> result = service.importSingleDocument(
+        Optional<MassImportService.SkipReason> result = service.importSingleDocument(
                minimalCells("present.pdf"), Optional.of(physicalFile.toFile()), "present.pdf", "present");
-        assertThat(result).isPresent().contains("ALREADY_EXISTS");
+        assertThat(result).isPresent().contains(MassImportService.SkipReason.ALREADY_EXISTS);
        verify(s3Client, never()).putObject(any(PutObjectRequest.class), any(RequestBody.class));
        verify(documentService, never()).save(any());
    }
@@ -204,7 +204,7 @@ class MassImportServiceTest {
        assertThat(service.getStatus().skipped()).isEqualTo(1);
        assertThat(service.getStatus().skippedFiles())
                .extracting(MassImportService.SkippedFile::filename, MassImportService.SkippedFile::reason)
-                .containsExactly(org.assertj.core.groups.Tuple.tuple("upload_fail.pdf", "S3_UPLOAD_FAILED"));
+                .containsExactly(org.assertj.core.groups.Tuple.tuple("upload_fail.pdf", MassImportService.SkipReason.S3_UPLOAD_FAILED));
    }
    @Test
@@ -223,7 +223,7 @@ class MassImportServiceTest {
        assertThat(service.getStatus().skipped()).isEqualTo(1);
        assertThat(service.getStatus().skippedFiles())
                .extracting(MassImportService.SkippedFile::reason)
-                .containsExactly("ALREADY_EXISTS");
+                .containsExactly(MassImportService.SkipReason.ALREADY_EXISTS);
    }
    // ─── importSingleDocument — create new document (metadata only) ───────────
@@ -283,11 +283,11 @@ class MassImportServiceTest {
        doThrow(new RuntimeException("S3 error"))
                .when(s3Client).putObject(any(PutObjectRequest.class), any(RequestBody.class));
-        Optional<String> result = service.importSingleDocument(
+        Optional<MassImportService.SkipReason> result = service.importSingleDocument(
                minimalCells("fail.pdf"), Optional.of(tempFile.toFile()), "fail.pdf", "fail");
        verify(documentService, never()).save(any());
-        assertThat(result).isPresent().contains("S3_UPLOAD_FAILED");
+        assertThat(result).isPresent().contains(MassImportService.SkipReason.S3_UPLOAD_FAILED);
    }
    // ─── importSingleDocument — sender handling ───────────────────────────────
@@ -438,6 +438,110 @@ class MassImportServiceTest {
        verify(documentService).findByOriginalFilename("doc002.pdf");
    }
    // ─── isValidImportFilename — security regression — do not remove ─────────
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameIsNull() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", (String) null);
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameIsBlank() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "   ");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsForwardSlash() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "etc/passwd");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsBackslash() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "..\\etc\\passwd");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsDotDot() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "doc..evil.pdf");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameIsDotDot() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "..");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameIsAbsolutePath() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "/etc/passwd");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsNullByte() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "file\0.pdf");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsTrue_whenFilenameIsPlainBasename() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "document.pdf");
        assertThat(result).isTrue();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeDivisionSlash() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo∕bar.pdf");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsFullwidthSlash() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo／bar.pdf");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsFalse_whenFilenameContainsUnicodeReverseSolidus() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "foo⧵bar.pdf");
        assertThat(result).isFalse();
    }
    @Test
    void isValidImportFilename_returnsTrue_whenFilenameHasLeadingDot() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", ".hidden.pdf");
        assertThat(result).isTrue();
    }
    @Test
    void isValidImportFilename_returnsTrue_whenFilenameHasSpaces() {
        boolean result = ReflectionTestUtils.invokeMethod(service, "isValidImportFilename", "Brief an Oma.pdf");
        assertThat(result).isTrue();
    }
    @Test
    void processRows_skipsRowAndContinues_whenFilenameIsPathTraversal() {
        when(documentService.findByOriginalFilename("legitimate.pdf")).thenReturn(Optional.empty());
        when(documentService.save(any())).thenAnswer(inv -> inv.getArgument(0));
        List<List<String>> rows = List.of(
                List.of("header"),
                minimalCells("../evil"),       // row 1: path traversal — should be skipped
                minimalCells("legitimate.pdf") // row 2: valid — should be processed
        );
        MassImportService.ProcessResult result = ReflectionTestUtils.invokeMethod(service, "processRows", rows);
        assertThat(result.processed()).isEqualTo(1);
        assertThat(result.skippedFiles())
                .extracting(MassImportService.SkippedFile::reason)
                .containsExactly(MassImportService.SkipReason.INVALID_FILENAME_PATH_TRAVERSAL);
    }
    // ─── importSingleDocument — non-blank optional fields ────────────────────
    @Test
@@ -651,7 +755,22 @@ class MassImportServiceTest {
        assertThat(spyService.getStatus().skipped()).isEqualTo(1);
        assertThat(spyService.getStatus().skippedFiles())
                .extracting(MassImportService.SkippedFile::reason)
-                .containsExactly("FILE_READ_ERROR");
+                .containsExactly(MassImportService.SkipReason.FILE_READ_ERROR);
    }
    // ─── findFileRecursive — symlink escape security regression — do not remove ─
    @Test
    void findFileRecursive_throwsDomainException_whenSymlinkEscapesImportDir(
            @TempDir Path importDirPath, @TempDir Path outsideDir) throws Exception {
        Path outsideFile = outsideDir.resolve("secret.pdf");
        Files.writeString(outsideFile, "sensitive content");
        Files.createSymbolicLink(importDirPath.resolve("secret.pdf"), outsideFile);
        ReflectionTestUtils.setField(service, "importDir", importDirPath.toString());
        assertThatThrownBy(() -> ReflectionTestUtils.invokeMethod(service, "findFileRecursive", "secret.pdf"))
                .isInstanceOf(DomainException.class);
    }
    // ─── readOds — XXE security regression ───────────────────────────────────
--- a/backend/src/test/resources/application.properties
+++ b/backend/src/test/resources/application.properties
@@ -1,2 +1,8 @@
 logging.level.root=WARN
 logging.level.org.raddatz=INFO
 # Default test value so FlywayConfig's fail-closed check passes without each
 # test having to set GRAFANA_DB_PASSWORD explicitly. The actual value is
 # irrelevant in tests — Flyway only uses it to set the grafana_reader role's
 # password, which no test connects with.
 GRAFANA_DB_PASSWORD=test-grafana-reader-password
--- a/docker-compose.observability.yml
+++ b/docker-compose.observability.yml
@@ -147,6 +147,9 @@ services:
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL:-http://localhost:3003}
      # Read-only password for the grafana_reader PostgreSQL role; interpolated
      # into the provisioned PostgreSQL datasource (see datasources.yml).
      GRAFANA_DB_PASSWORD: ${GRAFANA_DB_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./infra/observability/grafana/provisioning:/etc/grafana/provisioning:ro
@@ -165,6 +168,7 @@ services:
        condition: service_healthy
    networks:
      - obs-net
      - archiv-net   # PO Overview dashboard queries archive-db via the grafana_reader role
  # --- Error Tracking: GlitchTip ---
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -227,6 +227,9 @@ services:
      SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/archiv
      SPRING_DATASOURCE_USERNAME: archiv
      SPRING_DATASOURCE_PASSWORD: ${POSTGRES_PASSWORD}
      # Consumed by Flyway's R__grafana_reader_password.sql via the
      # ${grafanaDbPassword} placeholder to set the read-only grafana_reader
      # role's password (V68 only creates the role).
      GRAFANA_DB_PASSWORD: ${GRAFANA_DB_PASSWORD}
      # Application uses the bucket-scoped service account, not MinIO root.
      S3_ENDPOINT: http://minio:9000
      S3_ACCESS_KEY: archiv-app
@@ -252,6 +255,8 @@ services:
      OTEL_METRICS_EXPORTER: none
      MANAGEMENT_METRICS_TAGS_APPLICATION: Familienarchiv
      MANAGEMENT_TRACING_SAMPLING_PROBABILITY: ${MANAGEMENT_TRACING_SAMPLING_PROBABILITY:-0.1}
      SENTRY_DSN: ${SENTRY_DSN:-}
      LOGGING_STRUCTURED_FORMAT_CONSOLE: ecs
    networks:
      - archiv-net
    healthcheck:
@@ -266,6 +271,10 @@ services:
    build:
      context: ./frontend
      target: production
      args:
        # Vite build-time variable — baked into the JS bundle at build time.
        # Empty default so deploys succeed before the secret is configured.
        VITE_SENTRY_DSN: ${VITE_SENTRY_DSN:-}
    restart: unless-stopped
    depends_on:
      backend:
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -163,6 +163,9 @@ services:
      SPRING_DATASOURCE_URL: jdbc:postgresql://db:5432/${POSTGRES_DB}
      SPRING_DATASOURCE_USERNAME: ${POSTGRES_USER}
      SPRING_DATASOURCE_PASSWORD: ${POSTGRES_PASSWORD}
      # Consumed by Flyway's R__grafana_reader_password.sql via the
      # ${grafanaDbPassword} placeholder to set the read-only grafana_reader
      # role's password (V68 only creates the role).
      GRAFANA_DB_PASSWORD: ${GRAFANA_DB_PASSWORD}
      S3_ENDPOINT: http://minio:9000
      S3_ACCESS_KEY: ${MINIO_ROOT_USER}
      S3_SECRET_KEY: ${MINIO_ROOT_PASSWORD}
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -152,6 +152,7 @@ All vars are set in `.env` at the repo root (copy from `.env.example`). The back
 | `PORT_GRAFANA` | Host port for the Grafana UI (bound to `127.0.0.1` only) | `3003` | — | — |
 | `POSTGRES_HOST` | PostgreSQL hostname for GlitchTip's db-init job and workers. Override when only the staging stack is running and `archive-db` is not resolvable by that name. | `archive-db` | — | — |
 | `GRAFANA_ADMIN_PASSWORD` | Grafana `admin` user password | `changeme` | YES (prod) | YES |
 | `GRAFANA_DB_PASSWORD` | Password for the read-only `grafana_reader` PostgreSQL role used by the PO Overview dashboard (issue #651). Consumed by the Flyway repeatable migration `R__grafana_reader_password.sql` (which sets the role's password) and by the Grafana PostgreSQL datasource. Generate with `openssl rand -hex 32`. | — | YES (prod) | YES |
 | `PORT_GLITCHTIP` | Host port for the GlitchTip UI (bound to `127.0.0.1` only) | `3002` | — | — |
 | `GLITCHTIP_DOMAIN` | Public-facing base URL for GlitchTip (used in email links and CORS) | `http://localhost:3002` | YES (prod) | — |
 | `GLITCHTIP_SECRET_KEY` | Django secret key for GlitchTip — generate with `python3 -c "import secrets; print(secrets.token_hex(32))"` | — | YES | YES |
@@ -256,6 +257,7 @@ git.raddatz.cloud      A   <server IP>
 | `MAIL_USERNAME` | release.yml | SMTP user |
 | `MAIL_PASSWORD` | release.yml | SMTP password |
 | `GRAFANA_ADMIN_PASSWORD` | both | Grafana `admin` login — generate a strong password |
 | `GRAFANA_DB_PASSWORD` | both | Read-only `grafana_reader` role password — `openssl rand -hex 32` |
 | `GLITCHTIP_SECRET_KEY` | both | Django secret key — `openssl rand -hex 32` |
 | `SENTRY_DSN` | both | GlitchTip project DSN — set after first-run (§4); leave empty to keep Sentry disabled |
 | `VITE_SENTRY_DSN` | both | GlitchTip frontend project DSN — set after first-run (§4); leave empty to keep Sentry disabled |
@@ -357,6 +359,7 @@ Both files are passed explicitly via `--env-file` to the compose command, so the
 | Gitea secret | Notes |
 |---|---|
 | `GRAFANA_ADMIN_PASSWORD` | Strong unique password; shared by nightly and release |
 | `GRAFANA_DB_PASSWORD` | `openssl rand -hex 32`; shared by nightly and release — read-only DB role for the PO Overview dashboard |
 | `GLITCHTIP_SECRET_KEY` | `openssl rand -hex 32`; shared by nightly and release |
 | `STAGING_POSTGRES_PASSWORD` / `PROD_POSTGRES_PASSWORD` | Must match the running PostgreSQL container |
@@ -427,6 +430,31 @@ docker exec obs-loki wget -qO- \
 Prometheus port `9090` and Grafana port `3003` (default; configurable via `PORT_GRAFANA`) are bound to `127.0.0.1` on the host. No other observability ports are host-bound.
 ##### Rotate the `grafana_reader` DB password
 The PO Overview dashboard reads `audit_log`, `documents`, and `transcription_blocks` through the SELECT-only `grafana_reader` PostgreSQL role (issue #651, ADR-024). The role's password is owned by `R__grafana_reader_password.sql` — a Flyway *repeatable* migration that re-runs whenever the resolved `${grafanaDbPassword}` placeholder changes. That makes rotation a two-restart operation, no manual `psql` required.
 ```bash
 # 1. Generate a new value
 openssl rand -hex 32
 # 2. Update both sides:
 #    - Gitea secret GRAFANA_DB_PASSWORD (nightly + release workflows pick it up)
 #    - Local .env on the server / dev machine
 # 3. Recreate the backend container (`docker compose restart` keeps the old
 #    environment). Flyway sees that R__'s resolved checksum changed and re-applies it, issuing ALTER ROLE grafana_reader WITH PASSWORD '<new>'.
 docker compose up -d backend
 # 4. Recreate obs-grafana so the provisioned datasource picks up the new env value.
 docker compose -f docker-compose.observability.yml up -d obs-grafana
 # 5. Verify the dashboard loads — PO Overview's Postgres panels should populate
 #    instead of "Data source error".
 ```
 If `GRAFANA_DB_PASSWORD` is unset, the backend **refuses to start** (`IllegalStateException`). That is deliberate — see `FlywayConfig.resolveGrafanaDbPassword()` and the rationale in ADR-024.
 #### GlitchTip
 | Item | Value |
--- a/docs/GLOSSARY.md
+++ b/docs/GLOSSARY.md
@@ -80,6 +80,14 @@ _See also [DocumentStatus lifecycle](#documentstatus-lifecycle)._
 **Sütterlin** — A specific standardized style of Kurrent taught in German schools from 1915 to 1941.
 **Illegible word** — a word whose recognition confidence falls below the configured threshold; replaced with the literal token `[unleserlich]` in the rendered block text and counted in the `ocr_illegible_words_total` Prometheus counter.
 **Models-ready gauge** — the `ocr_models_ready` Prometheus gauge, flipped from `0` to `1` once the FastAPI lifespan startup has finished loading the Kraken model and the spell-checker. Used both for the `/health` endpoint and as the supervised signal for the `ocr_models_ready < 1 for 2m` alert.
 **Recognition model accuracy** — the accuracy reported by `ketos train` for the recognition (text-line) model, exposed as `ocr_model_accuracy{kind="recognition"}`. Sourced from `_parse_best_checkpoint` on the highest-scoring checkpoint after training.
 **Segmentation model accuracy** — the accuracy reported by `ketos segtrain` for the baseline layout analysis (`blla`) model, exposed as `ocr_model_accuracy{kind="segmentation"}`. Distinct from recognition accuracy because the two models are trained and improved independently.
 ---
 ## Other Domain Terms
--- a/docs/OBSERVABILITY.md
+++ b/docs/OBSERVABILITY.md
@@ -118,11 +118,14 @@ To find a trace for a specific request in staging/production, either increase th
 ## Metrics (Prometheus → Grafana)
-Prometheus scrapes the backend management endpoint every 15 s:
+Prometheus scrapes two targets every 15 s:
 ```
 Target: backend:8081/actuator/prometheus
 Labels: job="spring-boot", application="Familienarchiv"
 Target: ocr:8000/metrics
 Labels: job="ocr-service"
 ```
 All Spring Boot metrics carry the `application="Familienarchiv"` tag, which is how the Grafana Spring Boot Observability dashboard (ID 17175) filters to this service.
@@ -146,6 +149,70 @@ jvm_memory_used_bytes{area="heap", application="Familienarchiv"}
 hikaricp_connections_active
 ```
 ### OCR-service custom metrics
 Exposed at `ocr:8000/metrics` by `prometheus-fastapi-instrumentator`. The
 `http_*` metrics describe the FastAPI request layer; the `ocr_*` series are
 domain-specific. **Never label these with PII or document content** — labels
 have unbounded cardinality risk and are visible to anyone with Grafana access.
 | Metric | Type | Labels | Unit | What it tracks |
 |---|---|---|---|---|
 | `ocr_jobs_total` | Counter | `engine` (`surya`/`kraken`), `script_type` | jobs | OCR jobs that started after a successful PDF download |
 | `ocr_pages_total` | Counter | `engine` | pages | Successfully OCR'd pages in the streaming generator |
 | `ocr_skipped_pages_total` | Counter | — | pages | Pages skipped because the engine raised on them |
 | `ocr_words_total` | Counter | — | words | Recognized words summed across every block |
 | `ocr_illegible_words_total` | Counter | — | words | Words below the confidence threshold (rendered as `[unleserlich]`) |
 | `ocr_processing_seconds` | Histogram | `engine` | seconds | Per-page (stream) or per-document (`/ocr`) engine time, excluding preprocessing |
 | `ocr_training_runs_total` | Counter | `kind` (`recognition`/`segmentation`), `outcome` (`success`/`error`) | runs | Completed training runs |
 | `ocr_model_accuracy` | Gauge | `kind` | ratio (0–1) | Latest accuracy reported by a successful training run |
 | `ocr_models_ready` | Gauge | — | 0\|1 | 1 once the lifespan startup has finished loading models |
 Canonical example queries (the same ones referenced in issue #652):
 ```promql
 # OCR throughput by engine
 sum by (engine) (rate(ocr_pages_total[5m]))
 # Share of words rendered as [unleserlich]
 sum(rate(ocr_illegible_words_total[5m]))
  / sum(rate(ocr_words_total[5m]))
 # p95 page processing time per engine
 histogram_quantile(0.95, sum by (engine, le) (
  rate(ocr_processing_seconds_bucket[5m])
 ))
 # Training error rate
 sum(rate(ocr_training_runs_total{outcome="error"}[1h]))
  / sum(rate(ocr_training_runs_total[1h]))
 # Latest recognition vs segmentation accuracy
 ocr_model_accuracy
 ```
 ### Internal-only endpoints
 `/metrics` is exposed by the OCR service over plain HTTP without
 authentication. The container is reachable only on the internal Docker
 network — Caddy never proxies to it directly. If the service is ever
 exposed (e.g. a `ports:` mapping is added), block the endpoint at the
 reverse proxy:
 ```caddy
 ocr.example.com {
    @internal_only path /metrics /health
    respond @internal_only 404
    reverse_proxy ocr:8000
 }
 ```
 The `MetricsPathFilter` in `ocr-service/main.py` suppresses uvicorn's
 **stdout** access log lines for `/metrics` and `/health` so the container
 console stays focused on real OCR traffic. Promtail/Loki still receive
 access lines from any other source. Treat the filter as console
 noise-control, not an audit-suppression mechanism.
 ## Errors (GlitchTip)
 GlitchTip receives errors from both the backend (via Sentry Java SDK) and the frontend (via Sentry JavaScript SDK). It groups events by fingerprint, tracks first/last seen times, and links to the release that introduced the error.
--- a/docs/adr/023-prometheus-instrumentator-and-metrics-registry-injection.md
+++ b/docs/adr/023-prometheus-instrumentator-and-metrics-registry-injection.md
@@ -0,0 +1,94 @@
 # ADR-023: Prometheus Instrumentator and Metrics Registry Injection
 ## Status
 Accepted
 ## Context
 Until issue #652 the OCR service exposed no `/metrics` endpoint. The
 observability stack already scrapes the Spring Boot backend's actuator
 endpoint, but it had nothing to scrape on the Python side. Without HTTP-
 and domain-level metrics from `ocr-service` we cannot answer questions
 like "what is the share of words rendered as `[unleserlich]`" or
 "is the training error rate above its budget" from Grafana.
 Two implementation requirements influenced the design:
 1. **Counter / gauge isolation in tests.** `prometheus_client` collectors
   are module-level singletons keyed by name on the global `REGISTRY`.
   Re-importing or naively re-instantiating them raises a duplicated-
   collector error and cross-test state leaks (a `.inc()` in test A is
   still readable by test B). A test harness needs a way to swap the
   active container for a fresh per-test instance.
 2. **Minimal blast radius on the request path.** We did not want to
   hand-instrument every endpoint with FastAPI middleware. The
   `prometheus-fastapi-instrumentator` library already provides
   `http_requests_total`, `http_request_duration_seconds`, and the
   `/metrics` exposition route, all idiomatic Prometheus names.
 ## Decision
 - Add `prometheus-fastapi-instrumentator==7.0.0` and pin its transitive
  dependency `prometheus-client==0.25.0` explicitly in
  `ocr-service/requirements.txt`.
 - Mount the instrumentator once at module load:
  `Instrumentator(excluded_handlers=["/health", "/metrics"]).instrument(app).expose(app)`.
  This adds `/metrics` and an HTTP-level dashboard surface without
  changing any endpoint code.
 - Define every domain metric (`ocr_jobs_total`, `ocr_pages_total`,
  `ocr_processing_seconds`, …) inside a `build_metrics(registry)`
  factory in `ocr-service/metrics.py` that returns a frozen `OcrMetrics`
  dataclass. Production code binds the container to the default
  `REGISTRY` once: `metrics: OcrMetrics = build_metrics(REGISTRY)`.
 - Tests use a `fresh_metrics` fixture that builds a new
  `CollectorRegistry()` per test and monkeypatches `main.metrics` with
  a container bound to it. The endpoint code keeps reading
  `metrics.<name>` without knowing whether it is talking to the global
  registry or a per-test one.
 ## Consequences
 **Positive**
 - One reusable factory captures the metric definitions; future metrics
  go in one place.
 - Tests run with full counter isolation. Cross-test state leakage is
  impossible because each test sees its own dataclass instance.
 - The instrumentator gives us `http_*` metrics for free, including a
  Grafana-ready histogram that pairs with the Spring Boot one.
 **Negative**
 - One extra level of indirection: any test that asserts on metric
  values must remember to monkeypatch `main.metrics`, not the registry
  directly. Rebinding through the registry is harmless but useless —
  the dataclass holds references to the original collectors.
 - `prometheus-client` is now pinned. Upgrading it requires an explicit
  bump and re-checking the instrumentator's compatibility range.
 - `/metrics` is exposed unauthenticated and relies on the Docker
  internal network for confidentiality. See
  [docs/OBSERVABILITY.md §Internal-only endpoints](../OBSERVABILITY.md)
  for the Caddy snippet that must be added if the service ever gets a
  host-side port mapping.
 ## Alternatives considered
 - **Hand-roll the `/metrics` endpoint.** Rejected: would have meant
  duplicating what `prometheus-fastapi-instrumentator` ships, plus
  middleware for the HTTP histograms.
 - **Skip the factory; pass `registry` as a function argument
  everywhere.** Rejected: clutters every endpoint signature and breaks
  the symmetry with the Spring Boot side, which also relies on a
  process-global Micrometer registry.
 - **Use a `pytest` autouse fixture that resets `REGISTRY` between
  tests.** Rejected: `prometheus_client` does not expose a clean
  "unregister all" hook, and we would be relying on private APIs.
 ## References
 - Issue: [#652](https://git.raddatz.cloud/marcel/familienarchiv/issues/652)
 - Library: <https://github.com/trallnag/prometheus-fastapi-instrumentator>
 - Code: `ocr-service/metrics.py`, `ocr-service/main.py`,
  `ocr-service/test_metrics.py`
--- a/docs/adr/024-grafana-reads-archive-db-via-bridged-network.md
+++ b/docs/adr/024-grafana-reads-archive-db-via-bridged-network.md
@@ -0,0 +1,123 @@
 # ADR-024: Grafana reads archive-db via a bridged network and a SELECT-only role
 ## Status
 Accepted
 ## Context
 Issue #651 (the PO Overview Grafana dashboard) needs aggregates over three
 tables in the main application database — `audit_log`, `documents`, and
 `transcription_blocks` — to answer the operator's four weekly questions: is
 everything working, are people using it, is the archive making progress, is
 OCR working well.
 Until now, `obs-grafana` and the rest of the observability stack lived on
 their own Docker network (`obs-net`) and never touched `archiv-net`, where
 `archive-db` runs. The two were intentionally isolated: a compromise of any
 observability container could not pivot to the application database.
 The PO Overview's archive-progress and user-activity panels need rolling
 7-day SQL aggregates that cannot be served by Prometheus or Loki. That
 forces a connection from `obs-grafana` to `archive-db` for the first time.
 Two implementation requirements shaped the design:
 1. **Least privilege on the database side.** The Spring Boot application
   role (`archiv`) has full read/write on every table. Letting Grafana
   connect with that role would mean a Grafana compromise becomes an
   application compromise. The dashboard only needs SELECT on three
   tables; the role must reflect that and nothing more.
 2. **Operational simplicity of secret rotation.** The role's password is
   shared between the migration that sets it and the Grafana datasource
   that uses it. A first version of this work put the password in a
   versioned Flyway migration (V68), which Flyway only applies once —
   leaving rotation as an out-of-band `psql ALTER ROLE` step that no
   runbook documented. The shape must support rotation without manual
   SQL.
 ## Decision
 - Provision a dedicated PostgreSQL role `grafana_reader` with `LOGIN` plus
  `GRANT SELECT` on `audit_log`, `documents`, `transcription_blocks` only.
  No INSERT/UPDATE/DELETE on any table, no access to any other table —
  enforced by the database, locked in by both positive and parameterized
  negative tests in `GrafanaReaderRoleIntegrationTest`.
 - Split the role's lifecycle across two migrations:
  - `V68__add_grafana_reader_role.sql` — versioned, immutable, idempotent.
    Creates the role and applies the grants. Runs exactly once per
    database, like every other versioned migration.
  - `R__grafana_reader_password.sql` — Flyway *repeatable* migration that
    issues `ALTER ROLE grafana_reader WITH PASSWORD '${grafanaDbPassword}'`.
    Flyway computes the checksum on the resolved content, so any change
    to `GRAFANA_DB_PASSWORD` flips the checksum and re-applies the
    migration on the next boot. Rotation becomes "bump env var, restart
    backend, restart obs-grafana" — see the runbook in
    `docs/DEPLOYMENT.md §4 → Rotate the grafana_reader DB password`.
 - Resolve the password through Spring's `Environment` rather than a raw
  `System.getenv()` call, so tests inject via `application.properties`
  and the resolver is unit-testable with `MockEnvironment`. Fail closed
  with `IllegalStateException` when the variable is unset — no fallback
  string. Same shape as `UserDataInitializer`'s refusal to seed default
  admin credentials outside dev/test/e2e.
 - Join `obs-grafana` to `archiv-net` in addition to `obs-net`. Only the
  Grafana container crosses the boundary; Loki, Tempo, Prometheus,
  GlitchTip, and the worker containers remain `obs-net`-only.
 ## Consequences
 **Positive**
 - Database-level least privilege: a Grafana compromise gains SELECT on
  three tables. Cannot write, cannot read PII tables like `app_users`,
  `persons`, `notifications`, `document_comments`, `geschichten`. The
  parameterized PII negative sweep in `GrafanaReaderRoleIntegrationTest`
  is the regression gate; new sensitive tables get added to that list.
 - Rotation is documented, idempotent, and survives operator turnover.
  No "the password set on day 1 is the password forever" failure mode.
 - Tests pin down both sides of the boundary: positive grants must hold,
  write-deny must hold, and every table on the PII negative list must stay unreadable.
 **Negative / trade-offs**
 - `obs-net` is no longer fully isolated from `archiv-net`. A Grafana RCE
  (e.g. via a future Grafana CVE) gains a TCP path to `archive-db` — a
  pivot that is contained, but no longer impossible. The least-privilege role is the
  mitigation; we accept that mitigation as sufficient for a single
  bridged container.
 - The backend must hold `GRAFANA_DB_PASSWORD` in its environment forever,
  so Flyway can resolve the placeholder on every boot. A backend RCE
  therefore also leaks the Grafana datasource password. Acceptable
  because that password's blast radius is itself bounded by the
  least-privilege grants on `grafana_reader`.
 ## Alternatives considered
 - **Prometheus PostgreSQL exporter, no direct connection.** Loses ad-hoc
  SQL aggregates — the dashboard would need every metric pre-defined as
  an exporter query, with a redeploy to add a new one. The PO Overview
  is the type of dashboard that grows panels over time; pre-defining
  every aggregate is the wrong shape.
 - **Read replica or logical-replication slot dedicated to Grafana.**
  Real operational cost (extra Postgres instance, replication monitoring,
  storage doubled) disproportionate to a weekly PO glance.
 - **Versioned migration with `flyway repair` for rotation.** Rejected:
  conflates schema lifecycle with credential lifecycle, requires manual
  intervention to rotate, and the repair command's semantics are
  surprising to operators unfamiliar with Flyway internals.
 - **Hardcoded fallback password when env var is unset.** Rejected as a
  security blocker: publishes a known credential for a role with read
  access to user activity and full letter text. The fail-closed
  behavior is the explicit defense.
 ## References
 - Issue #651 — PO Overview Grafana dashboard
 - `backend/src/main/resources/db/migration/V68__add_grafana_reader_role.sql`
 - `backend/src/main/resources/db/migration/R__grafana_reader_password.sql`
 - `backend/src/main/java/org/raddatz/familienarchiv/config/FlywayConfig.java`
 - `backend/src/test/java/org/raddatz/familienarchiv/config/GrafanaReaderRoleIntegrationTest.java`
 - `infra/observability/grafana/provisioning/datasources/datasources.yml`
 - `docker-compose.observability.yml` — `archiv-net` bridge on `obs-grafana`
 - `docs/DEPLOYMENT.md §4` — rotation runbook
--- a/docs/architecture/c4/l2-containers.puml
+++ b/docs/architecture/c4/l2-containers.puml
@@ -43,9 +43,12 @@ Rel(ocr, storage, "Fetches PDF via presigned URL", "HTTP / S3 presigned")
 Rel(mc, storage, "Bootstraps bucket + service account on startup", "MinIO Client CLI")
 Rel(promtail, loki, "Pushes log streams", "HTTP/Loki push API")
 Rel(backend, tempo, "Sends distributed traces via OTLP", "HTTP / OTLP / port 4318 (archiv-net)")
 Rel(prometheus, backend, "Scrapes JVM + HTTP metrics", "HTTP 8081 /actuator/prometheus")
 Rel(prometheus, ocr, "Scrapes OCR + http_* metrics", "HTTP 8000 /metrics")
 Rel(grafana, prometheus, "Queries metrics", "HTTP 9090")
 Rel(grafana, loki, "Queries logs", "HTTP 3100")
 Rel(grafana, tempo, "Queries traces", "HTTP 3200")
 Rel(grafana, db, "Read-only dashboard queries via grafana_reader role", "PostgreSQL / archiv-net")
 Rel(glitchtip, db, "Stores error events in glitchtip DB", "PostgreSQL / archiv-net")
 Rel(obs_glitchtip_worker, obs_redis, "Processes Celery tasks", "Redis / obs-net")
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -16,6 +16,10 @@ CMD ["npm", "run", "dev"]
 # Compiles the SvelteKit Node-adapter output to /app/build.
 FROM node:20.19.0-alpine3.21 AS build
 WORKDIR /app
 # VITE_SENTRY_DSN is a build-time variable — Vite bakes it into the bundle.
 # Passed via docker-compose build.args; empty string disables the SDK.
 ARG VITE_SENTRY_DSN
 ENV VITE_SENTRY_DSN=$VITE_SENTRY_DSN
 COPY package.json package-lock.json ./
 RUN npm ci
 COPY . .
--- a/frontend/eslint.config.js
+++ b/frontend/eslint.config.js
@@ -106,6 +106,31 @@ export default defineConfig(
 			]
 		}
 	},
 	{
 		// Forbid test fixtures (*.test-fixture.svelte) from being imported by
 		// production code. Tree-shaking keeps them out of the production bundle
 		// today (no route reaches them), but a lint rule makes the boundary
 		// explicit so an accidental autocomplete import in a route or component
 		// fails fast. Test files (*.spec.ts / *.test.ts) and the fixtures
 		// themselves are exempt — see the next block. Nora #2 on PR #629
 		// round 3.
 		files: ['**/*.svelte', '**/*.svelte.ts', '**/*.svelte.js', '**/*.ts'],
 		ignores: ['**/*.spec.ts', '**/*.test.ts', '**/*.test-fixture.svelte'],
 		rules: {
 			'no-restricted-imports': [
 				'error',
 				{
 					patterns: [
 						{
 							group: ['**/*.test-fixture.svelte'],
 							message:
 								'Test fixtures (*.test-fixture.svelte) are test-only — do not import from production code. Tracked by #637.'
 						}
 					]
 				}
 			]
 		}
 	},
 	{
 		plugins: { boundaries },
 		settings: {
--- a/frontend/messages/de.json
+++ b/frontend/messages/de.json
@@ -445,8 +445,12 @@
 	"person_mention_load_error": "Person konnte nicht geladen werden.",
 	"person_mention_loading": "Lade Person…",
 	"person_mention_popup_empty": "Keine Personen gefunden",
 	"person_mention_search_label": "Person suchen",
 	"person_mention_search_prompt": "Namen eingeben…",
 	"person_mention_btn_label": "Person verlinken",
 	"person_mention_create_new": "Neue Person anlegen",
 	"person_mention_results_count_singular": "1 Person gefunden",
 	"person_mention_results_count_plural": "{count} Personen gefunden",
 	"transcription_editor_aria_label": "Transkriptionstext",
 	"person_born_name_prefix": "geb.",
 	"page_title_home": "Archiv",
@@ -634,6 +638,9 @@
 	"transcription_block_review": "Als geprüft markieren",
 	"transcription_block_unreview": "Markierung aufheben",
 	"transcription_reviewed_count": "{reviewed} von {total} geprüft",
 	"transcription_mark_all_reviewed": "Alle als fertig markieren",
 	"transcription_mark_all_reviewed_disabled": "Alle Blöcke sind bereits als fertig markiert",
 	"transcription_mark_all_reviewed_error": "Markierung fehlgeschlagen. Bitte versuchen Sie es erneut.",
 	"training_ocr_heading": "Kurrent-Erkennung trainieren",
 	"training_ocr_description": "Starte ein neues Training mit den bisher geprüften OCR-Blöcken, um die Erkennungsgenauigkeit für Kurrentschrift zu verbessern.",
 	"training_ocr_blocks_ready": "{blocks} geprüfte Blöcke bereit / {docs} Dokumente",
--- a/frontend/messages/en.json
+++ b/frontend/messages/en.json
@@ -445,8 +445,12 @@
 	"person_mention_load_error": "Could not load person.",
 	"person_mention_loading": "Loading person…",
 	"person_mention_popup_empty": "No persons found",
 	"person_mention_search_label": "Search for a person",
 	"person_mention_search_prompt": "Enter a name…",
 	"person_mention_btn_label": "Link person",
 	"person_mention_create_new": "Create new person",
 	"person_mention_results_count_singular": "1 person found",
 	"person_mention_results_count_plural": "{count} persons found",
 	"transcription_editor_aria_label": "Transcription text",
 	"person_born_name_prefix": "née",
 	"page_title_home": "Archive",
@@ -634,6 +638,9 @@
 	"transcription_block_review": "Mark as reviewed",
 	"transcription_block_unreview": "Unmark as reviewed",
 	"transcription_reviewed_count": "{reviewed} of {total} reviewed",
 	"transcription_mark_all_reviewed": "Mark all as reviewed",
 	"transcription_mark_all_reviewed_disabled": "All blocks are already marked as reviewed",
 	"transcription_mark_all_reviewed_error": "Failed to mark all as reviewed. Please try again.",
 	"training_ocr_heading": "Train Kurrent recognition",
 	"training_ocr_description": "Start a new training run using the reviewed OCR blocks to improve recognition accuracy for Kurrent script.",
 	"training_ocr_blocks_ready": "{blocks} reviewed blocks ready / {docs} documents",
--- a/frontend/messages/es.json
+++ b/frontend/messages/es.json
@@ -445,8 +445,12 @@
 	"person_mention_load_error": "No se pudo cargar la persona.",
 	"person_mention_loading": "Cargando persona…",
 	"person_mention_popup_empty": "No se encontraron personas",
 	"person_mention_search_label": "Buscar persona",
 	"person_mention_search_prompt": "Escribe un nombre…",
 	"person_mention_btn_label": "Vincular persona",
 	"person_mention_create_new": "Crear nueva persona",
 	"person_mention_results_count_singular": "1 persona encontrada",
 	"person_mention_results_count_plural": "{count} personas encontradas",
 	"transcription_editor_aria_label": "Texto de transcripción",
 	"person_born_name_prefix": "n.",
 	"page_title_home": "Archivo",
@@ -634,6 +638,9 @@
 	"transcription_block_review": "Marcar como revisado",
 	"transcription_block_unreview": "Desmarcar como revisado",
 	"transcription_reviewed_count": "{reviewed} de {total} revisados",
 	"transcription_mark_all_reviewed": "Marcar todo como revisado",
 	"transcription_mark_all_reviewed_disabled": "Todos los bloques ya están marcados como revisados",
 	"transcription_mark_all_reviewed_error": "Error al marcar como revisado. Intente de nuevo.",
 	"training_ocr_heading": "Entrenar reconocimiento Kurrent",
 	"training_ocr_description": "Inicia un nuevo entrenamiento con los bloques OCR revisados para mejorar la precisión de reconocimiento del script Kurrent.",
 	"training_ocr_blocks_ready": "{blocks} bloques revisados listos / {docs} documentos",
--- a/frontend/src/lib/document/transcription/TranscriptionBlock.svelte.spec.ts
+++ b/frontend/src/lib/document/transcription/TranscriptionBlock.svelte.spec.ts
@@ -1,7 +1,7 @@
 import { describe, it, expect, vi, afterEach } from 'vitest';
 import { cleanup, render } from 'vitest-browser-svelte';
 import { page } from 'vitest/browser';
-import TranscriptionBlockHost from './TranscriptionBlock.test-host.svelte';
+import TranscriptionBlockHost from './TranscriptionBlock.test-fixture.svelte';
 import type { ConfirmService } from '$lib/shared/services/confirm.svelte.js';
 afterEach(cleanup);
--- a/frontend/src/lib/document/transcription/TranscriptionBlock.test-fixture.svelte
+++ b/frontend/src/lib/document/transcription/TranscriptionBlock.test-fixture.svelte
--- a/frontend/src/lib/document/transcription/TranscriptionEditView.svelte
+++ b/frontend/src/lib/document/transcription/TranscriptionEditView.svelte
@@ -50,6 +50,7 @@ let activeBlockId: string | null = $state(null);
 let localLabels: string[] = $derived.by(() => [...trainingLabels]);
 let listEl: HTMLElement | null = $state(null);
 let markingAllReviewed = $state(false);
 let markAllError = $state<string | null>(null);
 const sortedBlocks = $derived([...blocks].sort((a, b) => a.sortOrder - b.sortOrder));
 const hasBlocks = $derived(blocks.length > 0);
@@ -68,8 +69,11 @@ $effect(() => {
 async function handleMarkAllReviewed() {
 	if (!onMarkAllReviewed) return;
 	markingAllReviewed = true;
 	markAllError = null;
 	try {
 		await onMarkAllReviewed();
 	} catch {
 		markAllError = m.transcription_mark_all_reviewed_error();
 	} finally {
 		markingAllReviewed = false;
 	}
@@ -173,7 +177,7 @@ async function handleLabelToggle(label: string) {
 					<button
 						onclick={handleMarkAllReviewed}
 						disabled={allReviewed || markingAllReviewed}
-						title={allReviewed ? 'Alle Blöcke sind bereits als fertig markiert' : undefined}
+						title={allReviewed ? m.transcription_mark_all_reviewed_disabled() : undefined}
 						class="flex min-h-[44px] items-center gap-1.5 rounded-sm px-3 font-sans text-xs font-medium text-brand-navy/80 transition-colors hover:text-brand-navy focus-visible:ring-2 focus-visible:ring-brand-navy disabled:opacity-40"
 					>
 						{#if markingAllReviewed}
@@ -211,7 +215,7 @@ async function handleLabelToggle(label: string) {
 								<path stroke-linecap="round" stroke-linejoin="round" d="M5 13l4 4L19 7" />
 							</svg>
 						{/if}
-						Alle als fertig markieren
+						{m.transcription_mark_all_reviewed()}
 					</button>
 				{/if}
 			</div>
@@ -221,6 +225,31 @@ async function handleLabelToggle(label: string) {
 					style="width: {reviewProgress}%"
 				></div>
 			</div>
 			{#if markAllError}
 				<div
 					role="alert"
 					class="mt-1.5 flex items-center gap-2 rounded-sm border border-red-200 bg-red-50 px-3 py-2 font-sans text-sm text-red-700"
 				>
 					<span class="flex-1">{markAllError}</span>
 					<button
 						onclick={() => (markAllError = null)}
 						aria-label={m.comp_dismiss()}
 						class="flex min-h-[44px] min-w-[44px] items-center justify-center rounded text-red-600 hover:text-red-700 focus-visible:ring-2 focus-visible:ring-red-500"
 					>
 						<svg
 							class="h-4 w-4"
 							fill="none"
 							stroke="currentColor"
 							stroke-width="2"
 							viewBox="0 0 24 24"
 							xmlns="http://www.w3.org/2000/svg"
 							aria-hidden="true"
 						>
 							<path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
 						</svg>
 					</button>
 				</div>
 			{/if}
 		</div>
 		<div class="p-4">
 			<!-- svelte-ignore a11y_no_static_element_interactions -->
--- a/frontend/src/lib/document/transcription/TranscriptionEditView.svelte.spec.ts
+++ b/frontend/src/lib/document/transcription/TranscriptionEditView.svelte.spec.ts
@@ -3,6 +3,7 @@ import { cleanup, render } from 'vitest-browser-svelte';
 import { page, userEvent } from 'vitest/browser';
 import TranscriptionEditView from './TranscriptionEditView.svelte';
 import { createConfirmService, CONFIRM_KEY } from '$lib/shared/services/confirm.svelte.js';
 import { m } from '$lib/paraglide/messages.js';
 afterEach(cleanup);
@@ -312,14 +313,14 @@ describe('TranscriptionEditView — mark all reviewed', () => {
 			onMarkAllReviewed: vi.fn().mockResolvedValue(undefined)
 		});
 		await expect
-			.element(page.getByRole('button', { name: /Alle als fertig markieren/ }))
+			.element(page.getByRole('button', { name: m.transcription_mark_all_reviewed() }))
 			.toBeInTheDocument();
 	});
 	it('does not show "Alle als fertig markieren" button when onMarkAllReviewed is not provided', async () => {
 		renderView({ blocks: [unreviewedBlock1, unreviewedBlock2] });
 		await expect
-			.element(page.getByRole('button', { name: /Alle als fertig markieren/ }))
+			.element(page.getByRole('button', { name: m.transcription_mark_all_reviewed() }))
 			.not.toBeInTheDocument();
 	});
@@ -329,7 +330,7 @@ describe('TranscriptionEditView — mark all reviewed', () => {
 			onMarkAllReviewed: vi.fn().mockResolvedValue(undefined)
 		});
 		await expect
-			.element(page.getByRole('button', { name: /Alle als fertig markieren/ }))
+			.element(page.getByRole('button', { name: m.transcription_mark_all_reviewed() }))
 			.toBeDisabled();
 	});
@@ -343,7 +344,7 @@ describe('TranscriptionEditView — mark all reviewed', () => {
 		// userEvent.click() via Playwright CDP doesn't reliably trigger Svelte 5 onclick
 		// handlers when a TipTap editor is mounted in the same component tree.
 		const btn = (await page
-			.getByRole('button', { name: /Alle als fertig markieren/ })
+			.getByRole('button', { name: m.transcription_mark_all_reviewed() })
 			.element()) as HTMLButtonElement;
 		btn.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
 		await vi.waitFor(() => expect(onMarkAllReviewed).toHaveBeenCalledTimes(1));
@@ -361,12 +362,83 @@ describe('TranscriptionEditView — mark all reviewed', () => {
 		// Same CDP click workaround: dispatch from browser JS to reliably fire Svelte 5 onclick
 		const btnEl = (await page
-			.getByRole('button', { name: /Alle als fertig markieren/ })
+			.getByRole('button', { name: m.transcription_mark_all_reviewed() })
 			.element()) as HTMLButtonElement;
 		btnEl.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
 		await expect
-			.element(page.getByRole('button', { name: /Alle als fertig markieren/ }))
+			.element(page.getByRole('button', { name: m.transcription_mark_all_reviewed() }))
 			.toBeDisabled();
 		resolveMarkAll();
 	});
 	it('shows error message when onMarkAllReviewed callback rejects', async () => {
 		// A rejecting callback drives the component down its failure path.
 		const onMarkAllReviewed = vi.fn().mockRejectedValue(new Error('INTERNAL_ERROR'));
 		renderView({ blocks: [unreviewedBlock1, unreviewedBlock2], onMarkAllReviewed });
 		// Click by dispatching a DOM MouseEvent on the resolved element instead of
 		// going through userEvent.click().
 		const markAllButton = page.getByRole('button', {
 			name: m.transcription_mark_all_reviewed()
 		});
 		const markAllEl = (await markAllButton.element()) as HTMLButtonElement;
 		markAllEl.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
 		// The alert must appear and carry the localized error copy.
 		const alert = page.getByRole('alert');
 		await expect.element(alert).toBeInTheDocument();
 		await expect.element(alert).toHaveTextContent(m.transcription_mark_all_reviewed_error());
 	});
 	it('clears error when dismiss button is clicked', async () => {
 		// Dispatches a fresh bubbling, cancelable click on the given element.
 		const fireClick = (el: HTMLElement) =>
 			el.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
 		const onMarkAllReviewed = vi.fn().mockRejectedValue(new Error('INTERNAL_ERROR'));
 		renderView({ blocks: [unreviewedBlock1, unreviewedBlock2], onMarkAllReviewed });
 		// Step 1: trigger the failing mark-all so the error alert is rendered.
 		const markAllButton = page.getByRole('button', {
 			name: m.transcription_mark_all_reviewed()
 		});
 		fireClick((await markAllButton.element()) as HTMLButtonElement);
 		const alert = page.getByRole('alert');
 		await expect.element(alert).toBeInTheDocument();
 		// Step 2: the dismiss button only exists once the alert is shown, so it is
 		// resolved after the assertion above.
 		const dismissButton = page.getByRole('button', { name: m.comp_dismiss() });
 		fireClick((await dismissButton.element()) as HTMLButtonElement);
 		await expect.element(alert).not.toBeInTheDocument();
 	});
 	it('clears error on next successful markAllReviewed call', async () => {
 		// First invocation rejects, every later invocation resolves.
 		const onMarkAllReviewed = vi.fn();
 		onMarkAllReviewed.mockRejectedValueOnce(new Error('INTERNAL_ERROR'));
 		onMarkAllReviewed.mockResolvedValue(undefined);
 		renderView({ blocks: [unreviewedBlock1, unreviewedBlock2], onMarkAllReviewed });
 		const markAllButton = page.getByRole('button', {
 			name: m.transcription_mark_all_reviewed()
 		});
 		const alert = page.getByRole('alert');
 		// The same resolved element is clicked twice, so it is captured once up front.
 		const markAllEl = (await markAllButton.element()) as HTMLButtonElement;
 		const clickMarkAll = () =>
 			markAllEl.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
 		clickMarkAll();
 		await expect.element(alert).toBeInTheDocument();
 		// Wait for the button to be re-enabled before the second click — ensures the first
 		// async rejection has fully settled and Svelte has flushed state changes
 		await expect.element(markAllButton).not.toBeDisabled();
 		clickMarkAll();
 		await expect.element(alert).not.toBeInTheDocument();
 	});
 	it('re-enables button after markAllReviewed failure', async () => {
 		const onMarkAllReviewed = vi.fn().mockRejectedValue(new Error('INTERNAL_ERROR'));
 		renderView({ blocks: [unreviewedBlock1, unreviewedBlock2], onMarkAllReviewed });
 		const markAllButton = page.getByRole('button', {
 			name: m.transcription_mark_all_reviewed()
 		});
 		const markAllEl = (await markAllButton.element()) as HTMLButtonElement;
 		markAllEl.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
 		// Once the failure has surfaced as an alert, the button must be usable again.
 		await expect.element(page.getByRole('alert')).toBeInTheDocument();
 		await expect.element(markAllButton).not.toBeDisabled();
 	});
 });
--- a/frontend/src/lib/document/transcription/useTranscriptionBlocks.svelte.test.ts
+++ b/frontend/src/lib/document/transcription/useTranscriptionBlocks.svelte.test.ts
@@ -259,12 +259,15 @@ describe('createTranscriptionBlocks.markAllReviewed', () => {
 		expect(ctrl.blocks.every((b) => b.reviewed)).toBe(true);
 	});
-	it('is a no-op when PUT returns non-OK', async () => {
+	it('throws and leaves blocks unchanged when PUT returns non-OK', async () => {
 		const fetchImpl = vi.fn(async (url: RequestInfo | URL, init?: RequestInit) => {
 			const u = url.toString();
 			const method = init?.method ?? 'GET';
 			if (u.includes('/review-all') && method === 'PUT') {
-				return new Response('', { status: 500 });
+				return new Response(JSON.stringify({ code: 'INTERNAL_ERROR' }), {
 					status: 500,
 					headers: { 'Content-Type': 'application/json' }
 				});
 			}
 			return new Response(JSON.stringify([baseBlock({ id: 'b-1', reviewed: false })]), {
 				status: 200,
@@ -274,7 +277,26 @@ describe('createTranscriptionBlocks.markAllReviewed', () => {
 		const ctrl = createTranscriptionBlocks({ documentId: () => 'doc-1', fetchImpl });
 		await ctrl.load();
-		await ctrl.markAllReviewed();
+		await expect(ctrl.markAllReviewed()).rejects.toThrow('INTERNAL_ERROR');
 		expect(ctrl.blocks[0].reviewed).toBe(false);
 	});
 	it('throws INTERNAL_ERROR when PUT returns non-JSON body (e.g. nginx 502)', async () => {
 		// Fake fetch: the review-all PUT answers with a plain-text 502; every other
 		// request answers with a JSON list holding one unreviewed block.
 		const fetchImpl = vi.fn(async (url: RequestInfo | URL, init?: RequestInit) => {
 			const isReviewAllPut =
 				url.toString().includes('/review-all') && (init?.method ?? 'GET') === 'PUT';
 			if (!isReviewAllPut) {
 				const payload = JSON.stringify([baseBlock({ id: 'b-1', reviewed: false })]);
 				return new Response(payload, {
 					status: 200,
 					headers: { 'Content-Type': 'application/json' }
 				});
 			}
 			return new Response('Bad Gateway', { status: 502 });
 		});
 		const ctrl = createTranscriptionBlocks({ documentId: () => 'doc-1', fetchImpl });
 		await ctrl.load();
 		// The call must reject with the generic code and leave local state untouched.
 		await expect(ctrl.markAllReviewed()).rejects.toThrow('INTERNAL_ERROR');
 		expect(ctrl.blocks[0].reviewed).toBe(false);
 	});
 });
--- a/frontend/src/lib/document/transcription/useTranscriptionBlocks.svelte.ts
+++ b/frontend/src/lib/document/transcription/useTranscriptionBlocks.svelte.ts
@@ -120,7 +120,11 @@ export function createTranscriptionBlocks(
 		const res = await fetchImpl(`/api/documents/${documentId()}/transcription-blocks/review-all`, {
 			method: 'PUT'
 		});
-		if (!res.ok) return;
+		if (!res.ok) {
 			const body = await res.json().catch(() => ({}));
 			// Never render body.message — route through getErrorMessage() to prevent leaking backend internals
 			throw new Error((body as { code?: string })?.code ?? 'INTERNAL_ERROR');
 		}
 		const updated = (await res.json()) as { id: string; reviewed: boolean }[];
 		for (const b of updated) {
 			const existing = blocks.find((x) => x.id === b.id);
--- a/frontend/src/lib/shared/api.server.ts
+++ b/frontend/src/lib/shared/api.server.ts
@@ -26,7 +26,6 @@ export function createApiClient(fetch: typeof globalThis.fetch) {
 export interface ApiError {
 	code?: string;
 	message?: string;
 }
 export function extractErrorCode(error: unknown): string | undefined {
--- a/frontend/src/lib/shared/discussion/MentionDropdown.svelte
+++ b/frontend/src/lib/shared/discussion/MentionDropdown.svelte
@@ -2,7 +2,18 @@
 import type { components } from '$lib/generated/api';
 // eslint-disable-next-line boundaries/dependencies -- mention dropdown needs person date formatting; extract to shared if it becomes reusable
 import { formatLifeDateRange } from '$lib/person/personLifeDates';
 import { untrack } from 'svelte';
 import { m } from '$lib/paraglide/messages.js';
 // Layered defence cap on the @mention search query length (CWE-400
 // amplification). The <input maxlength> attribute below caps direct
 // user edits, but the editor-mirror path (Tiptap contenteditable -> mirror
 // $effect -> searchQuery) is not covered by `maxlength` since the
 // contenteditable has no such enforcement. Clipping at the mirror keeps
 // the cap honest from both paths. Tracked server-side separately.
 // Nora #1 on PR #629. Hoisted to mentionConstants.ts so the host editor
 // (PersonMentionEditor) can clip the inserted displayName to the same cap
 // — see Felix #3 on PR #629.
 import { MAX_QUERY_LENGTH } from './mentionConstants';
 type Person = components['schemas']['Person'];
@@ -17,7 +28,46 @@ type DropdownState = {
 	clientRect: (() => DOMRect | null) | null;
 };
-let { model }: { model: DropdownState } = $props();
+let {
 	model,
 	editorQuery = '',
 	onSearch = () => {}
 }: {
 	model: DropdownState;
 	/** Text typed after `@` in the host editor. Mirrors into the search input
 	 *  until the user takes manual ownership by typing into the input itself. */
 	editorQuery?: string;
 	onSearch?: (query: string) => void;
 } = $props();
 let searchQuery = $state(untrack(() => editorQuery.slice(0, MAX_QUERY_LENGTH)));
 let userHasEdited = $state(false);
 // Intent-revealing alias used by both the persistent aria-live announcer and
 // the visible empty-state copy. Folding the duplicated rule into one $derived
 // keeps the two branches in lockstep. Felix #3 on PR #629 round 4.
 const isQueryEmpty = $derived(searchQuery.trim() === '');
 // Mirror the editor's typed text until the user takes ownership.
 //
 // Why `$state + $effect` (not `$derived`): `searchQuery` is also written by
 // `bind:value` on the <input> below, so it needs to be a mutable `$state`.
 // A `$derived` would be read-only and would clobber direct user edits on
 // every editor keystroke. The `userHasEdited` latch pins ownership once the
 // user types into the input. Felix #1 on PR #629.
 $effect(() => {
 	if (!userHasEdited) {
 		searchQuery = editorQuery.slice(0, MAX_QUERY_LENGTH);
 	}
 });
 // Fire onSearch whenever the effective query changes — covers both the
 // editor mirror and direct input edits. This is the only place onSearch
 // fires; when the dropdown is unmounted, the effect is disposed and no
 // further fetches occur.
 $effect(() => {
 	onSearch(searchQuery);
 });
 // highlightedIndex must be both writable (keyboard handler mutates it) and
 // reset when `items` changes (so it never points past the end of a new list).
@@ -112,16 +162,70 @@ function selectItem(item: Person) {
 	unauthenticated users.
 -->
 <div
-	class="fixed z-50 w-72 overflow-hidden rounded-sm border border-line bg-surface shadow-lg"
+	class="fixed z-50 w-72 max-w-[calc(100vw-1rem)] overflow-hidden rounded-sm border border-line bg-surface shadow-lg"
 	role="listbox"
 	aria-label={m.person_mention_btn_label()}
 	style:top={position.top}
 	style:bottom={position.bottom}
 	style:left={position.left}
 >
 	<div class="border-b border-line px-3 py-2">
 		<label class="sr-only" for="mention-search">{m.person_mention_search_label()}</label>
 		<div class="flex items-center gap-2">
 			<svg
 				aria-hidden="true"
 				viewBox="0 0 24 24"
 				fill="none"
 				stroke="currentColor"
 				stroke-width="2"
 				class="h-5 w-5 shrink-0 text-ink-2"
 			>
 				<circle cx="11" cy="11" r="7" />
 				<path d="m20 20-3.5-3.5" stroke-linecap="round" />
 			</svg>
 			<input
 				id="mention-search"
 				type="search"
 				data-test-search-input
 				maxlength={MAX_QUERY_LENGTH}
 				class="min-h-[44px] w-full bg-transparent font-sans text-base text-ink placeholder:text-ink-3 focus:outline-none focus-visible:ring-2 focus-visible:ring-brand-navy focus-visible:ring-inset"
 				placeholder={m.person_mention_search_prompt()}
 				bind:value={searchQuery}
 				oninput={() => {
 					userHasEdited = true;
 				}}
 				onmousedown={(e) => e.stopPropagation()}
 			/>
 		</div>
 	</div>
 	<!--
 		Persistent aria-live region — lives ABOVE the conditional branches so the
 		element never unmounts when items transition between empty and populated.
 		VoiceOver in particular swallows announcements from freshly-mounted live
 		regions, and the previous (conditional-inside) markup silently dropped
 		the "N persons found" announcement when results populated. Leonie #3 on
 		PR #629 round 3.
 	-->
 	<p class="sr-only" aria-live="polite">
 		{#if model.items.length === 0}
 			{isQueryEmpty ? m.person_mention_search_prompt() : m.person_mention_popup_empty()}
 		{:else if model.items.length === 1}
 			{m.person_mention_results_count_singular()}
 		{:else}
 			{m.person_mention_results_count_plural({ count: model.items.length })}
 		{/if}
 	</p>
 	{#if model.items.length === 0}
-		<p class="px-3 py-2.5 font-sans text-sm text-ink-3">
+		<!--
-			{m.person_mention_popup_empty()}
+			Visible empty-state copy — visual-only. The persistent sr-only <p>
 			above is the sole AT announcer; this one is hidden from screen readers
 			via aria-hidden="true" so VoiceOver does not double-announce
 			(NVDA de-dups, VoiceOver does not). Leonie S-2 on PR #629 round 4.
 			Do NOT add an aria-live attribute here — that would re-introduce
 			the duplicate announcement.
 		-->
 		<p aria-hidden="true" class="px-3 py-2.5 font-sans text-sm text-ink-3">
 			{isQueryEmpty ? m.person_mention_search_prompt() : m.person_mention_popup_empty()}
 		</p>
 		<!--
 			Empty-state escape hatch — without it the transcriber has to close
@@ -132,7 +236,7 @@ function selectItem(item: Person) {
 		<a
 			href="/persons/new"
 			target="_blank"
-			rel="noopener"
+			rel="noopener noreferrer"
 			class="flex min-h-[44px] items-center gap-2 border-t border-line px-3 py-2.5 font-sans text-sm font-medium text-brand-navy hover:bg-canvas focus:bg-canvas focus:outline-none"
 			onmousedown={(e) => e.preventDefault()}
 		>
--- a/frontend/src/lib/shared/discussion/MentionDropdown.svelte.test.ts
+++ b/frontend/src/lib/shared/discussion/MentionDropdown.svelte.test.ts
@@ -1,22 +1,37 @@
 import { describe, it, expect, vi, afterEach } from 'vitest';
 import { cleanup, render } from 'vitest-browser-svelte';
-import { page } from 'vitest/browser';
+import { page, userEvent } from 'vitest/browser';
 import { flushSync, mount, tick, unmount } from 'svelte';
 import MentionDropdown from './MentionDropdown.svelte';
 import MentionDropdownFixture from './MentionDropdown.test-fixture.svelte';
 import { m } from '$lib/paraglide/messages.js';
 import type { components } from '$lib/generated/api';
 type Person = components['schemas']['Person'];
 afterEach(cleanup);
-const makePerson = (id: string, name: string, overrides: Record<string, unknown> = {}) => ({
+const makePerson = (id: string, name: string, overrides: Partial<Person> = {}): Person => {
-	id,
+	const parts = name.split(' ');
-	firstName: name.split(' ')[0] ?? null,
+	return {
-	lastName: name.split(' ').slice(1).join(' ') || name,
+		id,
-	displayName: name,
+		firstName: parts[0],
-	birthYear: null as number | null,
+		lastName: parts.slice(1).join(' ') || name,
-	deathYear: null as number | null,
+		displayName: name,
-	...overrides
+		personType: 'PERSON',
-});
+		familyMember: false,
 		...overrides
 	};
 };
-const baseModel = (overrides: Record<string, unknown> = {}) => ({
+type DropdownState = {
-	items: [] as ReturnType<typeof makePerson>[],
+	items: Person[];
 	command: (item: Person) => void;
 	clientRect: (() => DOMRect | null) | null;
 };
 const baseModel = (overrides: Partial<DropdownState> = {}): DropdownState => ({
 	items: [],
 	command: vi.fn(),
 	clientRect: () => new DOMRect(100, 100, 0, 24),
 	...overrides
@@ -29,14 +44,32 @@ describe('MentionDropdown', () => {
 		await expect.element(page.getByRole('listbox', { name: /person verlinken/i })).toBeVisible();
 	});
-	it('renders the empty placeholder when items is empty', async () => {
+	it('shows the "enter a name" prompt when the search field is empty', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
-		await expect.element(page.getByText('Keine Personen gefunden')).toBeVisible();
+		// Scope to the visible empty-state <p> (text-ink-3) — the persistent
 		// sr-only aria-live region above also contains the same prompt copy.
 		const visibleEmptyP = document.querySelector(
 			'[role="listbox"] p.text-ink-3'
 		) as HTMLElement | null;
 		expect(visibleEmptyP).not.toBeNull();
 		expect(visibleEmptyP!.textContent ?? '').toContain(m.person_mention_search_prompt());
 		expect(visibleEmptyP!.textContent ?? '').not.toContain(m.person_mention_popup_empty());
 	});
 	it('shows "no persons found" when the search has a query but the list is empty', async () => {
 		render(MentionDropdown, { props: { model: baseModel(), editorQuery: 'WdG' } });
 		const visibleEmptyP = document.querySelector(
 			'[role="listbox"] p.text-ink-3'
 		) as HTMLElement | null;
 		expect(visibleEmptyP).not.toBeNull();
 		expect(visibleEmptyP!.textContent ?? '').toContain(m.person_mention_popup_empty());
 		expect(visibleEmptyP!.textContent ?? '').not.toContain(m.person_mention_search_prompt());
 	});
 	it('shows the create-new escape hatch link in the empty state', async () => {
-		render(MentionDropdown, { props: { model: baseModel() } });
+		render(MentionDropdown, { props: { model: baseModel(), editorQuery: 'unknown' } });
 		const link = (await page
 			.getByRole('link', { name: /neue person anlegen/i })
@@ -44,6 +77,7 @@ describe('MentionDropdown', () => {
 		expect(link.href).toContain('/persons/new');
 		expect(link.target).toBe('_blank');
 		expect(link.rel).toContain('noopener');
 		expect(link.rel).toContain('noreferrer');
 	});
 	it('renders one option per item when populated', async () => {
@@ -104,3 +138,315 @@ describe('MentionDropdown', () => {
 		expect(dropdown.style.left).toBe('123px');
 	});
 });
 // ─── Search input — Issue #380 ────────────────────────────────────────────────
 describe('MentionDropdown — search input', () => {
 	// DOM lookups shared by the tests below. Each returns null when the element
 	// is absent so the calling test can assert on presence explicitly.
 	// The search <input>, found through its stable E2E hook.
 	const searchInput = () =>
 		document.querySelector('[data-test-search-input]') as HTMLInputElement | null;
 	// The persistent aria-live announcer inside the listbox.
 	const liveRegion = () =>
 		document.querySelector('[role="listbox"] p[aria-live="polite"]') as HTMLElement | null;
 	// The visible empty-state paragraph inside the listbox (the announcer above
 	// does not carry the text-ink-3 class, so the two never collide).
 	const visibleEmptyState = () =>
 		document.querySelector('[role="listbox"] p.text-ink-3') as HTMLElement | null;
 	it('renders a search input pre-filled with the editorQuery prop', async () => {
 		render(MentionDropdown, {
 			props: { model: baseModel(), editorQuery: 'WdG' }
 		});
 		await expect.element(page.getByRole('searchbox')).toHaveValue('WdG');
 	});
 	it('exposes a data-test-search-input attribute for E2E selectors', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		const input = searchInput();
 		expect(input).not.toBeNull();
 		expect(input!.type).toBe('search');
 	});
 	it('search input wrapper meets the 44px touch target (WCAG 2.2 AA)', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		const input = searchInput();
 		expect(input).not.toBeNull();
 		expect(input!.className).toContain('min-h-[44px]');
 	});
 	it('renders a persistent aria-live="polite" region (does not remount on items transition; Leonie #3 on PR #629)', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		const live = liveRegion();
 		expect(live).not.toBeNull();
 		// Empty + empty-query → "Namen eingeben…" prompt
 		expect(live!.textContent ?? '').toContain(m.person_mention_search_prompt());
 	});
 	it('announces the result count in the persistent live region when items populate (Leonie #3 on PR #629)', async () => {
 		render(MentionDropdown, {
 			props: {
 				model: baseModel({
 					items: [
 						makePerson('p1', 'Anna Schmidt'),
 						makePerson('p2', 'Bert Meier'),
 						makePerson('p3', 'Carl Vogel')
 					]
 				})
 			}
 		});
 		const live = liveRegion();
 		expect(live).not.toBeNull();
 		// Plural branch — assert the full localized message for count = 3 rather
 		// than a bare '3', which any unrelated text containing that digit would
 		// also satisfy. Mirrors the singular test below.
 		expect(live!.textContent ?? '').toContain(
 			m.person_mention_results_count_plural({ count: 3 })
 		);
 	});
 	it('announces the singular form when exactly one item is present (Sara #4 on PR #629)', async () => {
 		render(MentionDropdown, {
 			props: {
 				model: baseModel({
 					items: [makePerson('p1', 'Anna Schmidt')]
 				})
 			}
 		});
 		const live = liveRegion();
 		expect(live).not.toBeNull();
 		// Singular branch — "1 Person gefunden" / "1 person found" / "1 persona encontrada"
 		// (locale-dependent; resolved via the Paraglide message helper).
 		expect(live!.textContent ?? '').toContain(m.person_mention_results_count_singular());
 	});
 	it('keeps the visible empty-state copy without its own aria-live and hides it from AT (Leonie #3 on PR #629 round 3; Leonie S-2 round 4)', async () => {
 		render(MentionDropdown, { props: { model: baseModel(), editorQuery: 'WdG' } });
 		// Visible empty-state <p> exists with the empty-result copy. The lookup is
 		// scoped to the listbox so an unrelated p.text-ink-3 elsewhere in the
 		// document can never be picked up instead ...
 		const empty = visibleEmptyState();
 		expect(empty).not.toBeNull();
 		expect(empty!.textContent ?? '').toContain(m.person_mention_popup_empty());
 		// ... but it must NOT carry its own aria-live (the persistent sr-only
 		// region above the conditional is the announcer now).
 		expect(empty!.hasAttribute('aria-live')).toBe(false);
 		// ... and it MUST be hidden from screen readers via aria-hidden="true"
 		// so VoiceOver does not double-announce (the persistent sr-only region
 		// is the sole AT source of truth). Leonie S-2 on PR #629 round 4.
 		expect(empty!.getAttribute('aria-hidden')).toBe('true');
 	});
 	it('renders the magnifier icon at h-5 w-5 with text-ink-2 (Leonie BLOCKER on PR #629)', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		// The icon is the element immediately preceding the input in the markup.
 		const icon = searchInput()?.previousElementSibling as SVGElement | null;
 		expect(icon).not.toBeNull();
 		expect(icon!.tagName.toLowerCase()).toBe('svg');
 		const iconClass = icon!.getAttribute('class') ?? '';
 		expect(iconClass).toContain('h-5');
 		expect(iconClass).toContain('w-5');
 		expect(iconClass).toContain('text-ink-2');
 	});
 	it('caps the search input at maxlength=100 (CWE-400 amplification — Nora on PR #629)', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		const input = searchInput();
 		expect(input).not.toBeNull();
 		expect(input!.maxLength).toBe(100);
 	});
 	it('clips a long editorQuery mirror to 100 chars (CWE-400 layered — Nora #1 on PR #629)', async () => {
 		const longQuery = 'A'.repeat(200);
 		render(MentionDropdown, { props: { model: baseModel(), editorQuery: longQuery } });
 		const input = searchInput();
 		expect(input).not.toBeNull();
 		expect(input!.value.length).toBe(100);
 		expect(input!.value).toBe('A'.repeat(100));
 	});
 	it('caps the listbox width to the viewport (320 px reflow guard — Leonie FINDING-MENTION-005)', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		const listbox = document.querySelector('[role="listbox"]') as HTMLElement;
 		expect(listbox).not.toBeNull();
 		expect(listbox.className).toContain('max-w-[calc(100vw-1rem)]');
 	});
 	it('renders the @mention search input at text-base (16 px senior-audience floor — Leonie FINDING-MENTION-006)', async () => {
 		render(MentionDropdown, { props: { model: baseModel() } });
 		const input = searchInput();
 		expect(input).not.toBeNull();
 		expect(input!.className).toContain('text-base');
 		expect(input!.className).not.toContain('text-sm');
 	});
 	it('invokes onSearch with the current value whenever the user types', async () => {
 		const onSearch = vi.fn();
 		render(MentionDropdown, { props: { model: baseModel(), onSearch } });
 		await userEvent.type(page.getByRole('searchbox'), 'Walter');
 		await vi.waitFor(() => {
 			expect(onSearch).toHaveBeenCalled();
 			expect(onSearch).toHaveBeenLastCalledWith('Walter');
 		});
 	});
 	it('keeps the user-edited search value when editorQuery changes after the takeover (Felix on PR #629)', async () => {
 		let setEditorQuery!: (q: string) => void;
 		render(MentionDropdownFixture, {
 			model: baseModel(),
 			initialEditorQuery: 'WdG',
 			onReady: (s: (q: string) => void) => {
 				setEditorQuery = s;
 			}
 		});
 		await expect.element(page.getByRole('searchbox')).toHaveValue('WdG');
 		await page.getByRole('searchbox').fill('Walter');
 		await expect.element(page.getByRole('searchbox')).toHaveValue('Walter');
 		setEditorQuery('WdGruyter');
 		// Flush pending Svelte reactivity so any (non-)update from the mirror
 		// $effect has landed before we assert. expect.element already polls, so
 		// no fixed-timeout fallback is needed. Sara on PR #629 round 3.
 		await tick();
 		await expect.element(page.getByRole('searchbox')).toHaveValue('Walter');
 	});
 });
 // ─── Keyboard handling via exported onKeyDown (Sara #3 on PR #629) ──────────
 //
 // In production, Tiptap intercepts ArrowDown/ArrowUp/Enter at the editor level
 // and forwards them to the dropdown via its exported onKeyDown(event) function
 // — the dropdown itself has no DOM keydown listener. These tests exercise the
 // same export so a regression in highlightedIndex/selection logic is caught
 // at the unit level. The full E2E focus-chain test is deferred to a separate
 // issue (Playwright).
 //
 // These unit tests directly invoke the exported `onKeyDown` to pin its
 // behaviour in isolation. They do NOT exercise the Tiptap forwarding
 // chain (PersonMentionEditor.suggestion.render() returning { onKeyDown })
 // — that integration is covered by the 'ArrowDown moves the highlight'
 // test in PersonMentionEditor.svelte.spec.ts. Sara on PR #629 round 3.
 describe('MentionDropdown — onKeyDown forwarding', () => {
 	// Shape of the function the component exports for the host editor to call.
 	type KeyDownExports = { onKeyDown: (e: KeyboardEvent) => boolean };
 	type MountedDropdown = {
 		/** Detached host element the dropdown was mounted into. */
 		container: HTMLElement;
 		/**
 		 * Sends a keydown with the given `key` through the exported onKeyDown and
 		 * returns what it returned. Yields `undefined` only if the handler never
 		 * ran, so both `toBe(true)` and `toBe(false)` prove a real return value.
 		 */
 		press: (key: string) => boolean | undefined;
 	};
 	// Mounts the dropdown imperatively, runs `run` against it, and always tears
 	// the instance and its container down again — even when an assertion throws.
 	//
 	// Every key press goes through flushSync so Svelte reactivity propagation
 	// completes before the caller asserts; keeping that in one place makes the
 	// wrapping uniform across all key tests. Felix #1 suggestion on PR #629
 	// round 3.
 	const withMountedDropdown = (
 		model: ReturnType<typeof baseModel>,
 		run: (mounted: MountedDropdown) => void
 	): void => {
 		const container = document.createElement('div');
 		document.body.appendChild(container);
 		const instance = mount(MentionDropdown, { target: container, props: { model } });
 		try {
 			const exports = instance as unknown as KeyDownExports;
 			const press = (key: string): boolean | undefined => {
 				let consumed: boolean | undefined;
 				flushSync(() => {
 					consumed = exports.onKeyDown(new KeyboardEvent('keydown', { key }));
 				});
 				return consumed;
 			};
 			run({ container, press });
 		} finally {
 			unmount(instance);
 			container.remove();
 		}
 	};
 	// Looks up a rendered option by the person id it was rendered for.
 	const optionFor = (container: HTMLElement, id: string) =>
 		container.querySelector(`[data-test-person-id="${id}"]`) as HTMLElement;
 	it('ArrowDown advances aria-selected to the next option in the listbox', async () => {
 		const model = baseModel({
 			items: [makePerson('p1', 'Anna Schmidt'), makePerson('p2', 'Bert Meier')]
 		});
 		withMountedDropdown(model, ({ container, press }) => {
 			// First option starts highlighted.
 			const first = optionFor(container, 'p1');
 			const second = optionFor(container, 'p2');
 			expect(first.getAttribute('aria-selected')).toBe('true');
 			expect(second.getAttribute('aria-selected')).toBe('false');
 			expect(press('ArrowDown')).toBe(true);
 			expect(first.getAttribute('aria-selected')).toBe('false');
 			expect(second.getAttribute('aria-selected')).toBe('true');
 		});
 	});
 	it('ArrowUp wraps from the first option to the last', async () => {
 		const model = baseModel({
 			items: [makePerson('p1', 'Anna Schmidt'), makePerson('p2', 'Bert Meier')]
 		});
 		withMountedDropdown(model, ({ container, press }) => {
 			expect(press('ArrowUp')).toBe(true);
 			expect(optionFor(container, 'p2').getAttribute('aria-selected')).toBe('true');
 		});
 	});
 	it('Enter invokes model.command with the currently highlighted item', async () => {
 		const command = vi.fn();
 		const model = baseModel({
 			items: [makePerson('p1', 'Anna Schmidt'), makePerson('p2', 'Bert Meier')],
 			command
 		});
 		withMountedDropdown(model, ({ press }) => {
 			expect(press('Enter')).toBe(true);
 			expect(command).toHaveBeenCalledTimes(1);
 			expect(command.mock.calls[0][0].id).toBe('p1');
 		});
 	});
 	it('Escape returns false so the suggestion plugin can handle it', async () => {
 		const model = baseModel({ items: [makePerson('p1', 'Anna Schmidt')] });
 		withMountedDropdown(model, ({ press }) => {
 			expect(press('Escape')).toBe(false);
 		});
 	});
 });
--- a/frontend/src/lib/shared/discussion/MentionDropdown.test-fixture.svelte
+++ b/frontend/src/lib/shared/discussion/MentionDropdown.test-fixture.svelte
@@ -0,0 +1,32 @@
 <script lang="ts">
 // Test-only host for MentionDropdown. It owns `editorQuery` as local state so a
 // test can change the value after mount through the setter handed to `onReady`,
 // and forwards `model` / `onSearch` to the dropdown unchanged.
 import { untrack } from 'svelte';
 import MentionDropdown from './MentionDropdown.svelte';
 import type { components } from '$lib/generated/api';
 type Person = components['schemas']['Person'];
 // Same shape the dropdown accepts for its `model` prop.
 type DropdownState = {
 	items: Person[];
 	command: (item: Person) => void;
 	clientRect: (() => DOMRect | null) | null;
 };
 type Props = {
 	model: DropdownState;
 	/** Value `editorQuery` starts with; later changes go through the `onReady` setter. */
 	initialEditorQuery: string;
 	/** Test hook: receives a setter for editorQuery so the test can mutate it. */
 	onReady?: (setEditorQuery: (q: string) => void) => void;
 	/** Passed straight through to the dropdown; defaults to a no-op. */
 	onSearch?: (q: string) => void;
 };
 let { model, initialEditorQuery, onReady, onSearch = () => {} }: Props = $props();
 // Seeded from the initial prop value inside untrack(); after that the only
 // writer in this file is the setter created in the effect below.
 let editorQuery = $state(untrack(() => initialEditorQuery));
 // Hands the setter to the test when the effect runs. `onReady` is optional, so
 // the call is skipped when no hook was supplied.
 $effect(() => {
 	onReady?.((q) => {
 		editorQuery = q;
 	});
 });
 </script>
 <MentionDropdown model={model} editorQuery={editorQuery} onSearch={onSearch} />
--- a/frontend/src/lib/shared/discussion/PersonMentionEditor.svelte
+++ b/frontend/src/lib/shared/discussion/PersonMentionEditor.svelte
@@ -7,7 +7,9 @@ import { m } from '$lib/paraglide/messages.js';
 import type { components } from '$lib/generated/api';
 import type { PersonMention } from '$lib/shared/types';
 import { deserialize, serialize } from '$lib/shared/discussion/mentionSerializer';
 import { debounce } from '$lib/shared/utils/debounce';
 import MentionDropdown from './MentionDropdown.svelte';
 import { MAX_QUERY_LENGTH, SEARCH_DEBOUNCE_MS, SEARCH_RESULT_LIMIT } from './mentionConstants';
 type Person = components['schemas']['Person'];
@@ -33,6 +35,13 @@ let {
 let editorEl: HTMLDivElement;
 let editor: Editor | null = null;
 // Hoisted so onDestroy can guarantee the imperatively-mounted dropdown is
 // torn down even if Tiptap's suggestion plugin onExit didn't fire (e.g. when
 // the host component is unmounted while the dropdown is still open).
 let mountedDropdown: object | null = null;
 // Hoisted so onDestroy can cancel any pending fetch — otherwise a trailing
 // debounced search can fire after the editor is gone and pollute later tests.
 let cancelPendingSearch: (() => void) | null = null;
 // Single reactive state object shared with MentionDropdown. Mutating these
 // fields propagates to the mounted dropdown via Svelte's $state proxy —
@@ -42,10 +51,12 @@ let dropdownState = $state<{
 	items: Person[];
 	command: (item: Person) => void;
 	clientRect: (() => DOMRect | null) | null;
 	editorQuery: string;
 }>({
 	items: [],
 	command: () => {},
-	clientRect: null
+	clientRect: null,
 	editorQuery: ''
 });
 type DropdownExports = {
@@ -138,16 +149,13 @@ onMount(() => {
 					// Nora #5618 #3 — separate issue tracks the GET /api/persons
 					// response-shape audit (PersonSummaryDTO leaks `notes`).
 					// ─────────────────────────────────────────────────────────────
-					items: async ({ query }: { query: string }) => {
+					// Tiptap's suggestion plugin requires an `items()` callback to keep
-						if (!query) return [];
+					// the dropdown alive, but the actual fetch is owned by `runSearch`
-						try {
+					// below — routed through the dropdown's search input via the
-							const res = await fetch(`/api/persons?q=${encodeURIComponent(query)}`);
+					// debounced `onSearch` channel. Returning `[]` here keeps Tiptap
-							if (!res.ok) return [];
+					// happy without firing a duplicate per-keystroke fetch.
-							return ((await res.json()) as Person[]).slice(0, 5);
+					// Markus #5616 / Felix / Nora / Sara on PR #629.
-						} catch {
+					items: async () => [],
 							return [];
 						}
 					},
 					// AC-1 fix: insert the typed query as displayName, not person.displayName.
 					command({ editor: ed, range, props }) {
 						const p = props as unknown as { personId: string; displayName: string };
@@ -165,7 +173,6 @@ onMount(() => {
 							.run();
 					},
 					render() {
 						let component: object | null = null;
 						let exports: DropdownExports | null = null;
 						// Tiptap's SuggestionProps types `command` against the default
@@ -178,25 +185,84 @@ onMount(() => {
 							clientRect?: (() => DOMRect | null) | null;
 						};
 						// Request-token guard: every onSearch invocation bumps `requestId`;
 						// runSearch captures the id active when its fetch starts and discards
 						// the response if a newer onSearch has fired since. Without this, a
 						// late response can repopulate the dropdown after the user cleared
 						// the search input. Sara on PR #629.
 						let requestId = 0;
 						// Fetches persons matching `query` and stores at most
 						// SEARCH_RESULT_LIMIT of them in `dropdownState.items`. A non-OK
 						// response or a thrown error stores an empty list instead. Every
 						// write is skipped when `requestId` no longer equals the value
 						// captured at the start of this call.
 						const runSearch = async (query: string) => {
 							// Token of this invocation; compared again after each await.
 							const id = requestId;
 							try {
 								// Defensive client-side cap — server-side enforcement is tracked
 								// separately. Markus on PR #629.
 								const res = await fetch(
 									`/api/persons?q=${encodeURIComponent(query)}&limit=${SEARCH_RESULT_LIMIT}`
 								);
 								// A newer onSearch fired while the request was pending.
 								if (id !== requestId) return;
 								if (!res.ok) {
 									dropdownState.items = [];
 									return;
 								}
 								const data = (await res.json()) as Person[];
 								// Body parsing is a second await, so re-check the token.
 								if (id !== requestId) return;
 								dropdownState.items = data.slice(0, SEARCH_RESULT_LIMIT);
 							} catch {
 								// Failures of a superseded request must not clear newer results.
 								if (id !== requestId) return;
 								dropdownState.items = [];
 							}
 						};
 						// Trailing-edge debounce around runSearch; `.cancel()` drops the
 						// pending call without running it.
 						const debouncedSearch = debounce(runSearch, SEARCH_DEBOUNCE_MS);
 						// Teardown hook invoked by the host's onDestroy. Clearing the timer
 						// alone is not enough: a fetch that has already started would still
 						// pass runSearch's token guard and write `dropdownState.items` after
 						// the editor is gone. Bumping `requestId` makes runSearch discard
 						// that in-flight response as well.
 						cancelPendingSearch = () => {
 							requestId++;
 							debouncedSearch.cancel();
 						};
 						// Handler passed to MentionDropdown for search-input changes.
 						// Every call invalidates earlier requests by bumping `requestId`.
 						// A blank (whitespace-only) query clears the list immediately and
 						// schedules nothing; any other query is searched after the
 						// debounce delay.
 						const onSearch = (query: string) => {
 							requestId++;
 							if (query.trim() === '') {
 								debouncedSearch.cancel();
 								dropdownState.items = [];
 								return;
 							}
 							debouncedSearch(query);
 						};
 						const updateState = (renderProps: LooseRenderProps) => {
-							dropdownState.items = renderProps.items as Person[];
+							// Clip once here so both the inserted displayName and the
 							// dropdown's editor-mirror see the same value. The dropdown
 							// already clips the mirror (Nora #1 CWE-400), but without
 							// clipping at the command boundary an unclipped query would
 							// still flow through as the inserted displayName — visible
 							// UI divergence between "what I searched" and "what was
 							// inserted". Felix #3 on PR #629.
 							const clippedQuery = renderProps.query.slice(0, MAX_QUERY_LENGTH);
 							// AC-1: pass typed query as displayName, not person.displayName
 							dropdownState.command = (item: Person) =>
 								renderProps.command({
 									personId: item.id,
-									displayName: renderProps.query
+									displayName: clippedQuery
 								});
 							dropdownState.clientRect = renderProps.clientRect ?? null;
 							dropdownState.editorQuery = clippedQuery;
 						};
 						return {
 							onStart(renderProps) {
-								updateState(renderProps as unknown as LooseRenderProps);
+								const loose = renderProps as unknown as LooseRenderProps;
 								updateState(loose);
 								// MentionDropdown reads `editorQuery` off the shared state
 								// proxy via its `editorQuery` prop binding below — this is
 								// the same pattern as `model.items`. We do not pass it as a
 								// separate prop because Svelte 5's mount() does not expose
 								// settable prop accessors, so we route through the proxy.
 								const mounted = mount(MentionDropdown, {
 									target: document.body,
-									props: { model: dropdownState }
+									props: {
 										model: dropdownState,
 										get editorQuery() {
 											return dropdownState.editorQuery;
 										},
 										onSearch
 									}
 								});
-								component = mounted as object;
+								mountedDropdown = mounted as object;
 								exports = mounted as unknown as DropdownExports;
 							},
 							onUpdate(renderProps) {
@@ -208,9 +274,16 @@ onMount(() => {
 								return exports?.onKeyDown(event) ?? false;
 							},
 							onExit() {
-								if (component) {
+								// Cancel any pending debounce so a closed dropdown's trailing
-									unmount(component);
+								// runSearch cannot fire against the *next* dropdown's state.
-									component = null;
+								// The hoisted `cancelPendingSearch` would be overwritten by
 								// the next render()'s onStart before the trailing call fires,
 								// so we cancel locally via the closure-scoped debouncedSearch.
 								// Felix #1 on PR #629.
 								debouncedSearch.cancel();
 								if (mountedDropdown) {
 									unmount(mountedDropdown);
 									mountedDropdown = null;
 									exports = null;
 								}
 							}
@@ -253,7 +326,15 @@ onMount(() => {
 });
 onDestroy(() => {
 	// Order: stop the scheduled search, tear down the editor, then remove a
 	// dropdown that is still mounted.
 	cancelPendingSearch?.();
 	editor?.destroy();
 	// Normally the suggestion plugin's onExit has already unmounted the
 	// dropdown; nothing is left to do in that case.
 	if (!mountedDropdown) return;
 	// The host was destroyed while a suggestion was active, so the dropdown
 	// would otherwise outlive the editor.
 	unmount(mountedDropdown);
 	mountedDropdown = null;
 });
 // Keep the data-placeholder attribute in sync with actual emptiness so the
--- a/frontend/src/lib/shared/discussion/PersonMentionEditor.svelte.spec.ts
+++ b/frontend/src/lib/shared/discussion/PersonMentionEditor.svelte.spec.ts
@@ -8,29 +8,45 @@
 import { describe, it, expect, vi, afterEach } from 'vitest';
 import { cleanup, render } from 'vitest-browser-svelte';
 import { page, userEvent } from 'vitest/browser';
-import PersonMentionEditorHost from './PersonMentionEditor.test-host.svelte';
+import { tick } from 'svelte';
 import PersonMentionEditorHost from './PersonMentionEditor.test-fixture.svelte';
 import type { components } from '$lib/generated/api';
 import { m } from '$lib/paraglide/messages.js';
 // Single source of truth for the debounce window — imported from the shared
 // module so the test cannot drift from production. Sara on PR #629 round 3.
 import { SEARCH_DEBOUNCE_MS } from './mentionConstants';
 type Person = components['schemas']['Person'];
 type PersonMention = components['schemas']['PersonMention'];
 /**
 * Headroom above SEARCH_DEBOUNCE_MS for the debounce-window wait
 * assertions in this file. 350 ms is calibrated against CI-runner jitter
 * we observed pre-#629; dropping it below ~200 ms reintroduces flake.
 * See PR #629 round-2 review comment #10935 (Sara).
 */
 const POST_DEBOUNCE_SLACK_MS = 350;
 const AUGUSTE: Person = {
 	id: 'p-aug',
 	firstName: 'Auguste',
 	lastName: 'Raddatz',
 	displayName: 'Auguste Raddatz',
 	personType: 'PERSON',
 	familyMember: false,
 	birthYear: 1882,
 	deathYear: 1944
-} as unknown as Person;
+};
 const ANNA: Person = {
 	id: 'p-anna',
 	firstName: 'Anna',
 	lastName: 'Schmidt',
 	displayName: 'Anna Schmidt',
 	personType: 'PERSON',
 	familyMember: false,
 	birthYear: 1860
-} as unknown as Person;
+};
 function mockFetchWithPersons(persons: Person[] = [AUGUSTE, ANNA]) {
 	vi.stubGlobal(
@@ -125,6 +141,20 @@ describe('PersonMentionEditor — typeahead', () => {
 		});
 	});
 	it('appends &limit=5 to the fetch URL (defensive client-side cap, Markus on PR #629)', async () => {
 		// Stub fetch with a successful single-person response.
 		const okResponse = { ok: true, json: vi.fn().mockResolvedValue([AUGUSTE]) };
 		const fetchSpy = vi.fn().mockResolvedValue(okResponse);
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		const editorBox = page.getByRole('textbox');
 		await userEvent.type(editorBox, '@Aug');
 		// The request URL must carry the result cap as a query parameter.
 		const urlWithLimit = expect.stringContaining('limit=5');
 		await vi.waitFor(() => {
 			expect(fetchSpy).toHaveBeenCalledWith(urlWithLimit);
 		});
 	});
 	it('shows life dates next to the name in the dropdown', async () => {
 		mockFetchWithPersons();
 		renderHost();
@@ -142,8 +172,15 @@ describe('PersonMentionEditor — typeahead', () => {
 		await userEvent.type(page.getByRole('textbox'), '@xyz');
-		await vi.waitFor(async () => {
+		// The visible empty-state <p> (text-ink-3) shows the copy. The persistent
-			await expect.element(page.getByText('Keine Personen gefunden')).toBeInTheDocument();
+		// sr-only aria-live region also contains the same copy, so we scope to the
 		// visible element to avoid a multi-match resolution in expect.element.
 		await vi.waitFor(() => {
 			const visibleEmptyP = document.querySelector(
 				'[role="listbox"] p.text-ink-3'
 			) as HTMLElement | null;
 			expect(visibleEmptyP).not.toBeNull();
 			expect(visibleEmptyP!.textContent ?? '').toContain('Keine Personen gefunden');
 		});
 	});
@@ -161,6 +198,254 @@ describe('PersonMentionEditor — typeahead', () => {
 	});
 });
 // ─── AC-2/3: search input drives the person fetch (debounced) ───────────────
 describe('PersonMentionEditor — AC-2/3: search input drives fetch', () => {
 	it('editing the search input fires a debounced fetch with the new query', async () => {
 		const fetchSpy = vi.fn().mockResolvedValue({
 			ok: true,
 			json: vi.fn().mockResolvedValue([AUGUSTE])
 		});
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		// A bare `@` opens the dropdown, which makes the search input reachable.
 		await userEvent.type(page.getByRole('textbox'), '@');
 		const searchbox = page.getByRole('searchbox');
 		await vi.waitFor(async () => {
 			await expect.element(searchbox).toBeVisible();
 		});
 		// Baseline: only fetches issued from here on are attributed to the search.
 		const baseline = fetchSpy.mock.calls.length;
 		// fill() delivers the final value as a single input event, so the
 		// per-keystroke timing of userEvent.type cannot influence the count:
 		// one input event must collapse into exactly one debounced fetch.
 		await searchbox.fill('Walter');
 		await vi.waitFor(
 			() => {
 				expect(fetchSpy).toHaveBeenCalledWith(expect.stringContaining('q=Walter'));
 			},
 			{ timeout: 1000 }
 		);
 		expect(fetchSpy.mock.calls.length - baseline).toBe(1);
 	});
 	it('fires exactly one /api/persons fetch when the user searches for Walter (regression guard)', async () => {
 		// Regression guard against the duplicated Tiptap `items()` callback that
 		// used to fetch on every keystroke in addition to the debounced
 		// search-input fetch (Markus & Felix round-1). No baseline is
 		// subtracted on purpose: every fetch since render counts.
 		// Sara on PR #629 round 3.
 		const fetchSpy = vi.fn().mockResolvedValue({
 			ok: true,
 			json: vi.fn().mockResolvedValue([AUGUSTE])
 		});
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		// Open the dropdown, then drive the search input through fill() to
 		// avoid the per-keystroke timing of userEvent.type (Sara, round 2).
 		await userEvent.type(page.getByRole('textbox'), '@');
 		const searchbox = page.getByRole('searchbox');
 		await vi.waitFor(async () => {
 			await expect.element(searchbox).toBeVisible();
 		});
 		await searchbox.fill('Walter');
 		const settleMs = SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS;
 		await new Promise((resolve) => setTimeout(resolve, settleMs));
 		// If the legacy per-keystroke items() fetch came back, typing `@` would
 		// already produce one request and fill('Walter') a second one.
 		const isPersonsCall = ([url]: unknown[]) =>
 			typeof url === 'string' && url.startsWith('/api/persons');
 		const personsCalls = fetchSpy.mock.calls.filter(isPersonsCall);
 		expect(personsCalls.length).toBe(1);
 	});
 	it('clearing the search input clears the list without firing a fetch', async () => {
 		const fetchSpy = vi.fn().mockResolvedValue({
 			ok: true,
 			json: vi.fn().mockResolvedValue([AUGUSTE])
 		});
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		await userEvent.type(page.getByRole('textbox'), '@Aug');
 		const augusteEntry = page.getByText('Auguste Raddatz');
 		await vi.waitFor(async () => {
 			await expect.element(augusteEntry).toBeInTheDocument();
 		});
 		const callsBeforeClear = fetchSpy.mock.calls.length;
 		await userEvent.clear(page.getByRole('searchbox'));
 		// Negative assertion: sit out the whole debounce window so a trailing
 		// fetch would have had time to fire. Dropping this wait would hide a
 		// re-introduced keystroke-driven items() fetch.
 		const settleMs = SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS;
 		await new Promise((resolve) => setTimeout(resolve, settleMs));
 		expect(fetchSpy.mock.calls.length).toBe(callsBeforeClear);
 		await expect.element(augusteEntry).not.toBeInTheDocument();
 	});
 });
 // ─── Whitespace-only query (Elicit AC-4 ambiguity on PR #629) ───────────────
 describe('PersonMentionEditor — whitespace-only query', () => {
 	it('keeps the "Namen eingeben…" prompt and fires no fetch when @ is followed only by spaces', async () => {
 		const fetchSpy = vi.fn().mockResolvedValue({
 			ok: true,
 			json: vi.fn().mockResolvedValue([])
 		});
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		await userEvent.type(page.getByRole('textbox'), '@   ');
 		await vi.waitFor(async () => {
 			await expect.element(page.getByRole('searchbox')).toBeVisible();
 		});
 		// Give a wrongly scheduled search the full debounce window to fire.
 		const settleMs = SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS;
 		await new Promise((resolve) => setTimeout(resolve, settleMs));
 		// Target the visible <p> (text-ink-3) only: the persistent sr-only
 		// aria-live region carries the same copy.
 		const promptP = document.querySelector<HTMLElement>('[role="listbox"] p.text-ink-3');
 		expect(promptP).not.toBeNull();
 		expect(promptP!.textContent ?? '').toContain(m.person_mention_search_prompt());
 		expect(fetchSpy).not.toHaveBeenCalled();
 	});
 });
 // ─── Stale-response race (Sara on PR #629) ───────────────────────────────────
 describe('PersonMentionEditor — stale-response race', () => {
 	it('discards a stale response that resolves after the search has been cleared', async () => {
 		type FakeResponse = { ok: boolean; json: () => Promise<Person[]> };
 		// Deferred response: the test decides when the pending fetch settles.
 		let settleFetch!: (response: FakeResponse) => void;
 		const inFlight = new Promise<FakeResponse>((resolve) => {
 			settleFetch = resolve;
 		});
 		const fetchSpy = vi.fn().mockReturnValue(inFlight);
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		// Open the dropdown and wait until the debounced fetch is in flight.
 		await userEvent.type(page.getByRole('textbox'), '@Aug');
 		await vi.waitFor(() => {
 			expect(fetchSpy).toHaveBeenCalledWith(expect.stringContaining('/api/persons?q=Aug'));
 		});
 		// Clear the search input while the request is still pending.
 		const searchbox = page.getByRole('searchbox');
 		await userEvent.clear(searchbox);
 		await expect.element(searchbox).toHaveValue('');
 		// Now let the stale request resolve with a person. The list must stay empty.
 		settleFetch({ ok: true, json: () => Promise.resolve([AUGUSTE]) });
 		// Flush Svelte reactivity before asserting; expect.element polls on its
 		// own, so no fixed timeout is needed. Sara on PR #629 round 4.
 		await tick();
 		await expect.element(page.getByText('Auguste Raddatz')).not.toBeInTheDocument();
 	});
 });
 // ─── Server failure characterization (Sara #2 on PR #629) ───────────────────
 describe('PersonMentionEditor — server failure', () => {
 	// Shared flow for both failure modes: install the given fetch stub, type a
 	// query, wait past the debounce window, then require the visible
 	// empty-state copy. This pins the current silent-failure behaviour; once a
 	// distinct error UX (toast / "Suche fehlgeschlagen" copy) exists, these
 	// tests go red and the assertion has to be updated.
 	async function expectEmptyStateAfterSearch(fetchSpy: ReturnType<typeof vi.fn>) {
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		await userEvent.type(page.getByRole('textbox'), '@Aug');
 		const settleMs = SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS;
 		await new Promise((resolve) => setTimeout(resolve, settleMs));
 		// Target the visible <p> (text-ink-3) only: the persistent sr-only live
 		// region carries the same copy.
 		const emptyP = document.querySelector<HTMLElement>('[role="listbox"] p.text-ink-3');
 		expect(emptyP).not.toBeNull();
 		expect(emptyP!.textContent ?? '').toContain(m.person_mention_popup_empty());
 	}
 	it('on 500 response keeps the dropdown open with the empty-state copy (silent failure pinned; distinct error UX tracked separately)', async () => {
 		const serverError = { ok: false, status: 500, json: vi.fn().mockResolvedValue({}) };
 		await expectEmptyStateAfterSearch(vi.fn().mockResolvedValue(serverError));
 	});
 	it('on a fetch reject (network failure) keeps the dropdown open with the empty-state copy', async () => {
 		const networkFailure = new TypeError('NetworkError');
 		await expectEmptyStateAfterSearch(vi.fn().mockRejectedValue(networkFailure));
 	});
 });
 // ─── onExit cancels pending debounce (Felix #1 on PR #629) ───────────────────
 describe('PersonMentionEditor — onExit cancels pending debounce', () => {
 	it('cancels the pending debounced fetch when Escape closes the dropdown before the debounce fires', async () => {
 		const fetchSpy = vi.fn().mockResolvedValue({
 			ok: true,
 			json: vi.fn().mockResolvedValue([])
 		});
 		vi.stubGlobal('fetch', fetchSpy);
 		renderHost();
 		const settleMs = SEARCH_DEBOUNCE_MS + POST_DEBOUNCE_SLACK_MS;
 		const editorBox = page.getByRole('textbox');
 		const searchbox = page.getByRole('searchbox');
 		// Typing `@` plus a query in the editor opens the dropdown.
 		await userEvent.type(editorBox, '@A');
 		await vi.waitFor(async () => {
 			await expect.element(searchbox).toBeVisible();
 		});
 		// Let any fetch caused by opening the dropdown settle first.
 		await new Promise((resolve) => setTimeout(resolve, settleMs));
 		const callsBeforeEscape = fetchSpy.mock.calls.length;
 		// Queue a new debounced search (runSearch is scheduled after
 		// SEARCH_DEBOUNCE_MS), then press Escape right away with focus back in
 		// the editor, so Tiptap's suggestion handler runs onExit before the
 		// debounce fires. Without the cancel in onExit, runSearch would execute
 		// against the state of the already unmounted dropdown.
 		await searchbox.fill('Walter');
 		// Escape must land on Tiptap's suggestion handler, hence editor focus.
 		(editorBox.element() as HTMLElement).focus();
 		await userEvent.keyboard('{Escape}');
 		// Sit out the debounce window: an uncancelled search would fetch
 		// q=Walter during this wait.
 		await new Promise((resolve) => setTimeout(resolve, settleMs));
 		const callsAfterEscape = fetchSpy.mock.calls.slice(callsBeforeEscape);
 		const walterCalls = callsAfterEscape.filter(
 			([url]) => typeof url === 'string' && url.includes('q=Walter')
 		);
 		expect(walterCalls.length).toBe(0);
 	});
 });
 // ─── AC-1: search input prefilled with text typed after @ ───────────────────
 describe('PersonMentionEditor — AC-1: search input prefill', () => {
 	it('prefills the dropdown search input with the text typed after @', async () => {
 		mockFetchEmpty();
 		renderHost();
 		const editorBox = page.getByRole('textbox');
 		await userEvent.type(editorBox, '@WdG');
 		// The searchbox must mirror what was typed after the `@` trigger.
 		const searchbox = page.getByRole('searchbox');
 		await vi.waitFor(async () => {
 			await expect.element(searchbox).toHaveValue('WdG');
 		});
 	});
 });
 // ─── AC-1: typed text becomes displayName, not DB name ───────────────────────
 describe('PersonMentionEditor — AC-1: typed text as displayName', () => {
@@ -229,6 +514,39 @@ describe('PersonMentionEditor — AC-1: typed text as displayName', () => {
 		});
 	});
 	it('clips the inserted displayName to MAX_QUERY_LENGTH=100 chars (Felix #3 on PR #629)', async () => {
 		// CWE-400 amplification: the dropdown caps its search input and mirror at
 		// 100 chars (Nora #1), but the host editor used to forward the unclipped
 		// renderProps.query as displayName. A 105-char @-suffix could therefore
 		// insert a 105-char displayName into the sidecar although only the first
 		// 100 chars were searched.
 		mockFetchWithPersons();
 		const host = renderHost();
 		// Drive the contenteditable, not the searchbox: the query handed to the
 		// command callback derives from the editor text after `@`.
 		const overlongMention = '@' + 'A'.repeat(105);
 		await userEvent.type(page.getByRole('textbox'), overlongMention);
 		// The mocked /api/persons answers every query with AUGUSTE.
 		const augusteOption = page.getByRole('option', { name: /Auguste Raddatz/ });
 		await vi.waitFor(async () => {
 			await expect.element(augusteOption).toBeVisible();
 		});
 		const optionEl = (await augusteOption.element()) as HTMLElement;
 		const pick = new MouseEvent('mousedown', { bubbles: true, cancelable: true });
 		optionEl.dispatchEvent(pick);
 		await vi.waitFor(() => {
 			const mentions = host.snapshot.mentionedPersons;
 			expect(mentions).toHaveLength(1);
 			// Exact length on purpose: 105 chars go in and the cap is 100, so
 			// `toHaveLength(100)` also catches a cap weakened to e.g. 95.
 			// Sara on PR #629 round 4.
 			expect(mentions[0].displayName).toHaveLength(100);
 		});
 	});
 	it('does not duplicate the sidecar entry when the same person is selected twice', async () => {
 		mockFetchWithPersons();
 		const host = renderHost({
--- a/frontend/src/lib/shared/discussion/PersonMentionEditor.test-fixture.svelte
+++ b/frontend/src/lib/shared/discussion/PersonMentionEditor.test-fixture.svelte
--- a/frontend/src/lib/shared/discussion/mentionConstants.ts
+++ b/frontend/src/lib/shared/discussion/mentionConstants.ts
@@ -0,0 +1,10 @@
 /** Shared knobs for the @mention typeahead. Single source of truth for
 *  the dropdown component and the host editor — keeps the layered length
 *  cap and the debounce window consistent across both files. */
 /** Maximum number of characters of the typed @-query that is kept; the
 *  host editor slices the query to this length before using it. */
 export const MAX_QUERY_LENGTH = 100;
 /** Delay in milliseconds handed to `debounce` for the person search. */
 export const SEARCH_DEBOUNCE_MS = 150;
 /** Defensive client-side cap on the result list. Single consumer today
 *  (PersonMentionEditor), kept here for symmetry with the other limit
 *  knobs so the @mention configuration lives in one place. Felix #1 on
 *  PR #629 round 4. */
 export const SEARCH_RESULT_LIMIT = 5;
--- a/frontend/src/lib/shared/services/confirm.svelte.test.ts
+++ b/frontend/src/lib/shared/services/confirm.svelte.test.ts
@@ -1,7 +1,7 @@
 import { describe, it, expect, afterEach } from 'vitest';
 import { cleanup, render } from 'vitest-browser-svelte';
 import { page, userEvent } from 'vitest/browser';
-import TestHost from './confirm.test-host.svelte';
+import TestHost from './confirm.test-fixture.svelte';
 import type { ConfirmService } from './confirm.svelte.js';
 afterEach(cleanup);
--- a/frontend/src/lib/shared/services/confirm.test-fixture.svelte
+++ b/frontend/src/lib/shared/services/confirm.test-fixture.svelte
--- a/frontend/src/lib/shared/utils/debounce.ts
+++ b/frontend/src/lib/shared/utils/debounce.ts
@@ -1,12 +1,25 @@
 /**
 * Returns a debounced version of fn that delays invocation until after
- * `delay` ms have elapsed since the last call.
+ * `delay` ms have elapsed since the last call. The returned function
 * exposes a `cancel()` method that DROPS (does not flush) the pending
 * trailing invocation — essential when the host context (a destroyed
 * component, an unmounted editor) shouldn't fire the trailing call.
 */
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
-export function debounce<T extends (...args: any[]) => void>(fn: T, delay: number): T {
+export function debounce<T extends (...args: any[]) => void>(
-	let timer: ReturnType<typeof setTimeout>;
+	fn: T,
-	return ((...args: Parameters<T>) => {
+	delay: number
-		clearTimeout(timer);
+): T & { cancel: () => void } {
 	let timer: ReturnType<typeof setTimeout> | undefined;
 	const wrapped = ((...args: Parameters<T>) => {
 		if (timer !== undefined) clearTimeout(timer);
 		timer = setTimeout(() => fn(...args), delay);
-	}) as T;
+	}) as T & { cancel: () => void };
 	wrapped.cancel = () => {
 		if (timer !== undefined) {
 			clearTimeout(timer);
 			timer = undefined;
 		}
 	};
 	return wrapped;
 }
--- a/frontend/src/routes/admin/groups/[id]/page.server.spec.ts
+++ b/frontend/src/routes/admin/groups/[id]/page.server.spec.ts
@@ -7,7 +7,8 @@ const mockApi = {
 };
 vi.mock('$lib/shared/api.server', () => ({
-	createApiClient: () => mockApi
+	createApiClient: () => mockApi,
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 beforeEach(() => vi.clearAllMocks());
--- a/frontend/src/routes/admin/groups/layout.server.spec.ts
+++ b/frontend/src/routes/admin/groups/layout.server.spec.ts
@@ -1,7 +1,10 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 import { load } from './+layout.server';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/admin/layout.server.spec.ts
+++ b/frontend/src/routes/admin/layout.server.spec.ts
@@ -1,7 +1,10 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 import { load } from './+layout.server';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/admin/ocr/[personId]/page.server.spec.ts
+++ b/frontend/src/routes/admin/ocr/[personId]/page.server.spec.ts
@@ -3,7 +3,10 @@ import { load } from './+page.server';
 const mockApi = { GET: vi.fn() };
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: () => mockApi }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: () => mockApi,
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 beforeEach(() => vi.clearAllMocks());
--- a/frontend/src/routes/admin/ocr/global/page.server.spec.ts
+++ b/frontend/src/routes/admin/ocr/global/page.server.spec.ts
@@ -3,7 +3,10 @@ import { load } from './+page.server';
 const mockApi = { GET: vi.fn() };
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: () => mockApi }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: () => mockApi,
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 beforeEach(() => vi.clearAllMocks());
--- a/frontend/src/routes/admin/ocr/page.server.spec.ts
+++ b/frontend/src/routes/admin/ocr/page.server.spec.ts
@@ -3,7 +3,10 @@ import { load } from './+page.server';
 const mockApi = { GET: vi.fn() };
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: () => mockApi }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: () => mockApi,
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 beforeEach(() => vi.clearAllMocks());
--- a/frontend/src/routes/admin/tags/[id]/page.server.spec.ts
+++ b/frontend/src/routes/admin/tags/[id]/page.server.spec.ts
@@ -8,7 +8,8 @@ const mockApi = {
 };
 vi.mock('$lib/shared/api.server', () => ({
-	createApiClient: () => mockApi
+	createApiClient: () => mockApi,
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 beforeEach(() => vi.clearAllMocks());
--- a/frontend/src/routes/admin/tags/layout.server.spec.ts
+++ b/frontend/src/routes/admin/tags/layout.server.spec.ts
@@ -1,7 +1,10 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 import { load } from './+layout.server';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/admin/users/[id]/page.server.spec.ts
+++ b/frontend/src/routes/admin/users/[id]/page.server.spec.ts
@@ -4,7 +4,10 @@ vi.mock('$env/dynamic/private', () => ({
 	env: { API_INTERNAL_URL: 'http://localhost:8080' }
 }));
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { load, actions } from './+page.server';
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/admin/users/layout.server.spec.ts
+++ b/frontend/src/routes/admin/users/layout.server.spec.ts
@@ -1,7 +1,10 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 import { load } from './+layout.server';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/aktivitaeten/+page.server.ts
+++ b/frontend/src/routes/aktivitaeten/+page.server.ts
@@ -1,5 +1,5 @@
 import { fail } from '@sveltejs/kit';
-import { createApiClient, extractErrorCode } from '$lib/shared/api.server';
+import { createApiClient } from '$lib/shared/api.server';
 import { getErrorMessage } from '$lib/shared/errors';
 import type { components, operations } from '$lib/generated/api';
@@ -79,9 +79,8 @@ export const actions = {
 			params: { path: { id: notificationId } }
 		});
 		if (!result.response.ok) {
-			return fail(result.response.status, {
+			const code = (result.error as unknown as { code?: string })?.code;
-				error: getErrorMessage(extractErrorCode(result.error))
+			return fail(result.response.status, { error: getErrorMessage(code) });
 			});
 		}
 		return { success: true };
 	},
@@ -90,9 +89,8 @@ export const actions = {
 		const api = createApiClient(fetch);
 		const result = await api.POST('/api/notifications/read-all');
 		if (!result.response.ok) {
-			return fail(result.response.status, {
+			const code = (result.error as unknown as { code?: string })?.code;
-				error: getErrorMessage(extractErrorCode(result.error))
+			return fail(result.response.status, { error: getErrorMessage(code) });
 			});
 		}
 		return { success: true };
 	}
--- a/frontend/src/routes/aktivitaeten/page.server.spec.ts
+++ b/frontend/src/routes/aktivitaeten/page.server.spec.ts
@@ -8,7 +8,8 @@ const mockApi = {
 };
 vi.mock('$lib/shared/api.server', () => ({
-	createApiClient: () => mockApi
+	createApiClient: () => mockApi,
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 function buildUrl(search = ''): URL {
--- a/frontend/src/routes/briefwechsel/page.server.spec.ts
+++ b/frontend/src/routes/briefwechsel/page.server.spec.ts
@@ -1,7 +1,10 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 import { load } from './+page.server';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 vi.mock('$lib/shared/errors', () => ({
 	getErrorMessage: (code: string) => code ?? 'Unknown error'
 }));
--- a/frontend/src/routes/documents/[id]/page.server.spec.ts
+++ b/frontend/src/routes/documents/[id]/page.server.spec.ts
@@ -1,6 +1,9 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 vi.mock('$env/dynamic/private', () => ({ env: { API_INTERNAL_URL: 'http://test-backend:8080' } }));
 import { load } from './+page.server';
--- a/frontend/src/routes/documents/page.server.spec.ts
+++ b/frontend/src/routes/documents/page.server.spec.ts
@@ -1,6 +1,9 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { load } from './+page.server';
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/page.server.spec.ts
+++ b/frontend/src/routes/page.server.spec.ts
@@ -1,6 +1,9 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { load } from './+page.server';
 import { createApiClient } from '$lib/shared/api.server';
--- a/frontend/src/routes/persons/[id]/page.server.spec.ts
+++ b/frontend/src/routes/persons/[id]/page.server.spec.ts
@@ -1,7 +1,10 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 import { load } from './+page.server';
-vi.mock('$lib/shared/api.server', () => ({ createApiClient: vi.fn() }));
+vi.mock('$lib/shared/api.server', () => ({
 	createApiClient: vi.fn(),
 	extractErrorCode: (e: unknown) => (e as { code?: string } | undefined)?.code
 }));
 import { createApiClient } from '$lib/shared/api.server';
--- a/infra/observability/grafana/provisioning/dashboards/loki-logs.json
+++ b/infra/observability/grafana/provisioning/dashboards/loki-logs.json
@@ -196,7 +196,7 @@
      },
      "targets": [
        {
-          "expr": "{job=\"$app\"} |= \"$search\" | logfmt",
+          "expr": "{job=\"$app\"} |= \"$search\" | json",
          "hide": false,
          "legendFormat": "",
          "refId": "A"
--- a/infra/observability/grafana/provisioning/dashboards/po-overview.json
+++ b/infra/observability/grafana/provisioning/dashboards/po-overview.json
@@ -0,0 +1,702 @@
 {
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": { "type": "grafana", "uid": "grafana" },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description": "Product owner overview — system health, user activity, archive progress, and OCR quality at a weekly glance.",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
      "id": 100,
      "title": "System Health",
      "type": "row",
      "panels": []
    },
    {
      "id": 1,
      "title": "Backend Status",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
      "targets": [
        {
          "expr": "up{job=\"spring-boot\"}",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "mappings": [
            { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
            { "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "red", "value": null },
              { "color": "green", "value": 1 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "textMode": "value"
      }
    },
    {
      "id": 2,
      "title": "Server Errors (5xx)",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
      "targets": [
        {
          "expr": "sum(increase(http_server_requests_seconds_count{status=~\"5..\"}[$__range]))",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 6 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 3,
      "title": "Response Time (p95)",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket[$__range])) by (le))",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "decimals": 2,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 0.5 },
              { "color": "red", "value": 2 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 4,
      "title": "Error Log Count",
      "type": "stat",
      "datasource": { "type": "loki", "uid": "loki" },
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
      "targets": [
        {
          "expr": "sum(count_over_time({compose_service=\"backend\"} | json | level=\"ERROR\" [$__range]))",
          "queryType": "instant",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 10 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 5,
      "title": "CPU Usage",
      "type": "bargauge",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 5, "w": 8, "x": 0, "y": 5 },
      "targets": [
        {
          "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "decimals": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 70 },
              { "color": "red", "value": 85 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "showUnfilled": true
      }
    },
    {
      "id": 6,
      "title": "Memory Usage",
      "type": "bargauge",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 5, "w": 8, "x": 8, "y": 5 },
      "targets": [
        {
          "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "decimals": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 70 },
              { "color": "red", "value": 85 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "showUnfilled": true
      }
    },
    {
      "id": 7,
      "title": "Disk Usage",
      "type": "bargauge",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 5, "w": 8, "x": 16, "y": 5 },
      "targets": [
        {
          "expr": "(1 - (node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"})) * 100",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "decimals": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 70 },
              { "color": "red", "value": 80 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "showUnfilled": true
      }
    },
    {
      "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 },
      "id": 101,
      "title": "User Activity",
      "type": "row",
      "panels": []
    },
    {
      "id": 8,
      "title": "Active Users",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 8, "x": 0, "y": 11 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(DISTINCT actor_id) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'LOGIN_SUCCESS'",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 9,
      "title": "Total Logins",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 8, "x": 8, "y": 11 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'LOGIN_SUCCESS'",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 10,
      "title": "Failed Login Attempts",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 8, "x": 16, "y": 11 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind IN ('LOGIN_FAILED', 'LOGIN_RATE_LIMITED')",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 4 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 11,
      "title": "Daily Logins (last 7 days)",
      "type": "barchart",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 7, "w": 24, "x": 0, "y": 15 },
      "targets": [
        {
          "rawSql": "SELECT DATE_TRUNC('day', happened_at) AS time, COUNT(*) AS logins FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'LOGIN_SUCCESS' GROUP BY 1 ORDER BY 1",
          "format": "time_series",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "legend": { "displayMode": "hidden" },
        "orientation": "auto",
        "showValue": "auto",
        "stacking": "none",
        "xTickLabelRotation": 0,
        "xTickLabelSpacing": 0
      }
    },
    {
      "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
      "id": 102,
      "title": "Archive Progress",
      "type": "row",
      "panels": []
    },
    {
      "id": 12,
      "title": "Transcription Coverage",
      "type": "bargauge",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 5, "w": 24, "x": 0, "y": 23 },
      "targets": [
        {
          "rawSql": "SELECT (COUNT(*) FILTER (WHERE text IS NOT NULL AND text <> ''))::float * 100.0 / NULLIF(COUNT(*), 0) AS percent_complete FROM transcription_blocks",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "decimals": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "red", "value": null },
              { "color": "yellow", "value": 25 },
              { "color": "green", "value": 75 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "showUnfilled": true
      }
    },
    {
      "id": 13,
      "title": "Total Documents",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 28 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(*) AS value FROM documents WHERE status <> 'PLACEHOLDER'",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 14,
      "title": "Uploads This Week",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 28 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'FILE_UPLOADED'",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 15,
      "title": "Blocks Transcribed This Week",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 28 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'TEXT_SAVED'",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 16,
      "title": "Blocks Reviewed This Week",
      "type": "stat",
      "datasource": { "type": "postgres", "uid": "postgres" },
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 28 },
      "targets": [
        {
          "rawSql": "SELECT COUNT(*) AS value FROM audit_log WHERE happened_at >= NOW() - INTERVAL '7 days' AND kind = 'BLOCK_REVIEWED'",
          "format": "table",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
      "id": 103,
      "title": "OCR Health",
      "type": "row",
      "panels": []
    },
    {
      "id": 17,
      "title": "OCR Jobs",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 33 },
      "targets": [
        {
          "expr": "sum(increase(ocr_jobs_total[$__range]))",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "decimals": 0,
          "color": { "mode": "fixed", "fixedColor": "blue" }
        }
      },
      "options": {
        "colorMode": "value",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 18,
      "title": "OCR Page Error Rate",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 33 },
      "targets": [
        {
          "expr": "sum(increase(ocr_skipped_pages_total[$__range])) / clamp_min(sum(increase(ocr_pages_total[$__range])), 1)",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit",
          "decimals": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 0.01 },
              { "color": "red", "value": 0.05 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 19,
      "title": "Illegible Word Rate",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 33 },
      "targets": [
        {
          "expr": "sum(increase(ocr_illegible_words_total[$__range])) / clamp_min(sum(increase(ocr_words_total[$__range])), 1)",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit",
          "decimals": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 0.1 },
              { "color": "red", "value": 0.25 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
      }
    },
    {
      "id": 20,
      "title": "OCR Service Status",
      "type": "stat",
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 33 },
      "targets": [
        {
          "expr": "ocr_models_ready",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "mappings": [
            { "type": "value", "options": { "0": { "text": "NOT READY", "color": "red" } } },
            { "type": "value", "options": { "1": { "text": "READY", "color": "green" } } }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "red", "value": null },
              { "color": "green", "value": 1 }
            ]
          },
          "color": { "mode": "thresholds" }
        }
      },
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "textMode": "value"
      }
    }
  ],
  "refresh": "",
  "schemaVersion": 39,
  "tags": ["po-overview", "familienarchiv"],
  "templating": { "list": [] },
  "time": { "from": "now-7d", "to": "now" },
  "timepicker": {},
  "timezone": "browser",
  "title": "PO Overview",
  "uid": "po-overview",
  "version": 1,
  "weekStart": ""
 }
--- a/infra/observability/grafana/provisioning/datasources/datasources.yml
+++ b/infra/observability/grafana/provisioning/datasources/datasources.yml
@@ -36,3 +36,19 @@ datasources:
        datasourceUid: prometheus
      nodeGraph:
        enabled: true
  # Read-only PostgreSQL datasource for the PO Overview dashboard (issue #651).
  # Uses the grafana_reader role provisioned by Flyway V68. Traffic stays inside
  # archiv-net, so sslmode=disable is the deliberate, accepted setting.
  - name: PostgreSQL
    type: postgres
    uid: postgres
    url: archive-db:5432
    user: grafana_reader
    editable: false
    secureJsonData:
      password: ${GRAFANA_DB_PASSWORD}
    jsonData:
      database: ${POSTGRES_DB}
      sslmode: disable
      postgresVersion: 1600
--- a/infra/observability/obs.env
+++ b/infra/observability/obs.env
@@ -16,6 +16,11 @@ GLITCHTIP_DOMAIN=https://glitchtip.archiv.raddatz.cloud
 POSTGRES_USER=archiv
 # Note: GRAFANA_DB_PASSWORD is a secret and is injected by CI from
 # obs-secrets.env (see .env.example for the local-dev declaration).
 # It is consumed by both archive-backend (Flyway V68 placeholder) and
 # obs-grafana (PostgreSQL datasource).
 # PostgreSQL hostname for GlitchTip db-init and workers.
 # The actual value depends on the Compose project name — it is not a fixed string.
 # CI sets POSTGRES_HOST in obs-secrets.env per environment:
--- a/infra/observability/prometheus/prometheus.yml
+++ b/infra/observability/prometheus/prometheus.yml
@@ -20,7 +20,4 @@ scrape_configs:
  - job_name: ocr-service
    metrics_path: /metrics
    static_configs:
      # TODO: remove or add prometheus-client to ocr-service.
      # The Python OCR service does not currently expose Prometheus metrics.
      # This target will show as DOWN until prometheus-client is added to ocr-service.
      - targets: ['ocr:8000']
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -2,6 +2,7 @@
 import asyncio
 import glob
 import inspect
 import io
 import json
 import logging
@@ -10,9 +11,11 @@ import re
 import shutil
 import subprocess
 import tempfile
 import time
 import zipfile
 from contextlib import asynccontextmanager
 from datetime import datetime, timezone
 from typing import Awaitable, Callable
 from urllib.parse import urlparse
 import httpx
@@ -20,8 +23,11 @@ import pypdfium2 as pdfium
 from fastapi import FastAPI, Form, Header, HTTPException, UploadFile
 from fastapi.responses import StreamingResponse
 from PIL import Image
 from prometheus_client import REGISTRY
 from prometheus_fastapi_instrumentator import Instrumentator
 from confidence import apply_confidence_markers, get_threshold
 from metrics import OcrMetrics, build_metrics
 from spell_check import correct_text, load_spell_checker
 from engines import kraken as kraken_engine
 from engines import surya as surya_engine
@@ -37,6 +43,12 @@ logger = logging.getLogger(__name__)
 _models_ready = False
 # One-shot import-time binding to the default REGISTRY. Tests that need a
 # clean counter state must monkeypatch `main.metrics` with a container built
 # from a fresh CollectorRegistry — rebinding through the registry directly
 # will not retarget the references stored in the OcrMetrics dataclass.
 metrics: OcrMetrics = build_metrics(REGISTRY)
 ALLOWED_PDF_HOSTS = set(
    h.strip() for h in os.getenv("ALLOWED_PDF_HOSTS", "minio,localhost,127.0.0.1").split(",")
 )
@@ -44,6 +56,42 @@ ALLOWED_PDF_HOSTS = set(
 _SPELL_CHECK_SCRIPT_TYPES = {"HANDWRITING_KURRENT", "HANDWRITING_LATIN"}
 async def _record_training(
     runner: Callable[[], Awaitable[dict] | dict],
     kind: str,
 ) -> dict:
     """Execute a training runner and publish its outcome and accuracy metrics.
     `runner` may be a plain callable or return an awaitable; an awaitable
     result is awaited before it is inspected. If the runner raises an
     `Exception`, `ocr_training_runs_total` is incremented with
     outcome="error" for `kind` and the exception is re-raised unchanged.
     Otherwise the counter is incremented with outcome="success", and the
     `ocr_model_accuracy` gauge for `kind` is set when the result's
     `accuracy` entry is not None. The runner's result is returned as-is.
     """
     try:
         outcome = runner()
         if inspect.isawaitable(outcome):
             outcome = await outcome
     except Exception:
         # Count the failure first, then let the original exception propagate.
         metrics.ocr_training_runs_total.labels(kind=kind, outcome="error").inc()
         raise
     else:
         metrics.ocr_training_runs_total.labels(kind=kind, outcome="success").inc()
     accuracy = outcome.get("accuracy")
     if accuracy is not None:
         # A missing or None accuracy leaves the gauge untouched.
         metrics.ocr_model_accuracy.labels(kind=kind).set(accuracy)
     return outcome
 def _observe_block_words(words: list[dict], threshold: float) -> None:
     """Add one block's word tally to the OCR word counters.
     Increments `ocr_words_total` by the number of words, then
     `ocr_illegible_words_total` by how many of those words carry a
     `confidence` strictly below `threshold`.
     Pre: callers pass a non-empty `words` list; this helper does not check.
     """
     metrics.ocr_words_total.inc(len(words))
     illegible = [word for word in words if word["confidence"] < threshold]
     metrics.ocr_illegible_words_total.inc(len(illegible))
 def _validate_url(url: str) -> None:
    """Validate that the PDF URL points to an allowed host (SSRF protection)."""
    parsed = urlparse(url)
@@ -63,6 +111,7 @@ async def lifespan(app: FastAPI):
    kraken_engine.load_models()
    load_spell_checker()
    _models_ready = True
    metrics.ocr_models_ready.set(1)
    logger.info("Startup complete — ready to accept requests")
    yield
@@ -72,6 +121,28 @@ async def lifespan(app: FastAPI):
 app = FastAPI(title="Familienarchiv OCR Service", lifespan=lifespan)
 # /metrics is unauthenticated — relies on Docker-internal-network exposure
 # only (CWE-200 risk if `ports:` ever maps 8000 to host). See
 # docs/OBSERVABILITY.md §Internal-only endpoints for the Caddy block snippet.
 Instrumentator(excluded_handlers=["/health", "/metrics"]).instrument(app).expose(app)
 class MetricsPathFilter(logging.Filter):
     """Drop uvicorn.access entries for /metrics and /health to keep logs focused.
     The filter inspects the third positional log argument, which in the
     uvicorn.access format '%s - "%s %s HTTP/%s" %d' is the request target.
     Any query string is ignored when matching, so probes such as
     `/health?probe=1` are suppressed as well. Records whose args are not a
     positional sequence of at least three items are always kept.
     """
     _SUPPRESSED_PATHS = {"/metrics", "/health"}
     def filter(self, record: logging.LogRecord) -> bool:
         """Return False to drop the record, True to let it through."""
         args = record.args
         # Mapping-style args (logger.info("%(a)s", {...})) have no position 2;
         # indexing them would raise inside the logging machinery, so keep them.
         if not isinstance(args, (tuple, list)) or len(args) < 3:
             return True
         target = args[2]
         if not isinstance(target, str):
             return True
         # The logged request target includes the query string; match on the
         # path portion only so "/metrics?x=1" is treated like "/metrics".
         path = target.split("?", 1)[0]
         return path not in self._SUPPRESSED_PATHS
 logging.getLogger("uvicorn.access").addFilter(MetricsPathFilter())
@app.get("/health")
 def health():
@@ -99,7 +170,9 @@ async def run_ocr(request: OcrRequest):
        del img
    script_type = request.scriptType.upper()
    engine_name = "kraken" if script_type == "HANDWRITING_KURRENT" else "surya"
    extract_started = time.monotonic()
    if script_type == "HANDWRITING_KURRENT":
        if not kraken_engine.is_available():
            raise HTTPException(
@@ -111,11 +184,18 @@ async def run_ocr(request: OcrRequest):
    else:
        # TYPEWRITER, HANDWRITING_LATIN, UNKNOWN — all use Surya
        blocks = await asyncio.to_thread(surya_engine.extract_blocks, images, request.language)
    metrics.ocr_processing_seconds.labels(engine=engine_name).observe(
        time.monotonic() - extract_started
    )
    metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc()
    threshold = get_threshold(script_type)
    for block in blocks:
-        if block.get("words"):
+        words = block.get("words") or []
-            block["text"] = apply_confidence_markers(block["words"], threshold)
+        if words:
            _observe_block_words(words, threshold)
            block["text"] = apply_confidence_markers(words, threshold)
        block.pop("words", None)
        if script_type in _SPELL_CHECK_SCRIPT_TYPES:
            block["text"] = correct_text(block["text"])
@@ -146,6 +226,9 @@ async def run_ocr_stream(request: OcrRequest):
        )
    engine = kraken_engine if use_kraken else surya_engine
    engine_name = "kraken" if use_kraken else "surya"
    metrics.ocr_jobs_total.labels(engine=engine_name, script_type=script_type).inc()
    if request.regions:
        # Guided mode: recognize only the user-drawn annotation regions
@@ -176,12 +259,15 @@ async def run_ocr_stream(request: OcrRequest):
                    image = await asyncio.to_thread(preprocess_page, image)
                    blocks = []
                    sender_path = request.senderModelPath if use_kraken else None
                    engine_seconds = 0.0
                    for region in page_regions:
                        region_started = time.monotonic()
                        text = await asyncio.to_thread(
                            engine.extract_region_text, image,
                            region.x, region.y, region.width, region.height,
                            sender_path,
                        )
                        engine_seconds += time.monotonic() - region_started
                        if script_type in _SPELL_CHECK_SCRIPT_TYPES:
                            text = correct_text(text)
                        blocks.append({
@@ -195,7 +281,11 @@ async def run_ocr_stream(request: OcrRequest):
                            "annotationId": region.annotationId,
                        })
                    metrics.ocr_processing_seconds.labels(engine=engine_name).observe(
                        engine_seconds
                    )
                    total_blocks += len(blocks)
                    metrics.ocr_pages_total.labels(engine=engine_name).inc()
                    yield json.dumps({
                        "type": "page",
                        "pageNumber": page_idx,
@@ -205,6 +295,7 @@ async def run_ocr_stream(request: OcrRequest):
                except Exception:
                    logger.exception("Guided OCR failed on page %d", page_idx)
                    skipped_pages += 1
                    metrics.ocr_skipped_pages_total.inc()
                    yield json.dumps({
                        "type": "error",
                        "pageNumber": page_idx,
@@ -238,18 +329,25 @@ async def run_ocr_stream(request: OcrRequest):
                yield json.dumps({"type": "preprocessing", "pageNumber": page_idx}) + "\n"
                image = await asyncio.to_thread(preprocess_page, image)
                sender_path = request.senderModelPath if use_kraken else None
                page_started = time.monotonic()
                blocks = await asyncio.to_thread(
                    engine.extract_page_blocks, image, page_idx, request.language, sender_path
                )
                metrics.ocr_processing_seconds.labels(engine=engine_name).observe(
                    time.monotonic() - page_started
                )
                for block in blocks:
-                    if block.get("words"):
+                    words = block.get("words") or []
-                        block["text"] = apply_confidence_markers(block["words"], threshold)
+                    if words:
                        _observe_block_words(words, threshold)
                        block["text"] = apply_confidence_markers(words, threshold)
                    block.pop("words", None)
                    if script_type in _SPELL_CHECK_SCRIPT_TYPES:
                        block["text"] = correct_text(block["text"])
                total_blocks += len(blocks)
                metrics.ocr_pages_total.labels(engine=engine_name).inc()
                yield json.dumps({
                    "type": "page",
                    "pageNumber": page_idx,
@@ -259,6 +357,7 @@ async def run_ocr_stream(request: OcrRequest):
            except Exception:
                logger.exception("OCR failed on page %d", page_idx)
                skipped_pages += 1
                metrics.ocr_skipped_pages_total.inc()
                yield json.dumps({
                    "type": "error",
                    "pageNumber": page_idx,
@@ -438,8 +537,7 @@ async def train_model(
            return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs}
-    result = await asyncio.to_thread(_run_training)
+    return await _record_training(lambda: asyncio.to_thread(_run_training), kind="recognition")
    return result
@app.post("/train-sender")
@@ -518,8 +616,9 @@ async def train_sender_model(
            return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs}
-    result = await asyncio.to_thread(_run_sender_training)
+    return await _record_training(
-    return result
+        lambda: asyncio.to_thread(_run_sender_training), kind="recognition"
    )
@app.post("/segtrain")
@@ -628,8 +727,7 @@ async def segtrain_model(
            return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs}
-    result = await asyncio.to_thread(_run_segtrain)
+    return await _record_training(lambda: asyncio.to_thread(_run_segtrain), kind="segmentation")
    return result
 async def _download_and_convert_pdf(url: str) -> list[Image.Image]:
--- a/ocr-service/metrics.py
+++ b/ocr-service/metrics.py
@@ -0,0 +1,92 @@
 """Prometheus metric definitions for the OCR service.
 `build_metrics(registry)` returns a fresh `OcrMetrics` instance bound to the
 given `CollectorRegistry`. Production code calls it once at module load with
 the default `REGISTRY`; tests pass a per-test `CollectorRegistry()` to keep
 counter values isolated between cases (decision #3 on issue #652).
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
@dataclass(frozen=True)
 class OcrMetrics:
    """Container for every custom OCR metric.
    Each field is a reference to a `prometheus_client` collector (counters,
    gauges and one histogram). Mutating a collector (`.inc()`, `.observe()`,
    `.set()`) is safe; rebinding a field is not possible because the
    dataclass is frozen — use `build_metrics` to get a new container.
    """
    # OCR jobs processed; labels: engine, script_type.
    ocr_jobs_total: Counter
    # Pages successfully OCR'd; label: engine.
    ocr_pages_total: Counter
    # Pages skipped because the OCR engine raised.
    ocr_skipped_pages_total: Counter
    # Words recognized across all OCR blocks.
    ocr_words_total: Counter
    # Words below the confidence threshold.
    ocr_illegible_words_total: Counter
    # Processing time per page (streaming) or per document (non-streaming); label: engine.
    ocr_processing_seconds: Histogram
    # Training runs; labels: kind, outcome.
    ocr_training_runs_total: Counter
    # Latest accuracy reported by a successful training run; label: kind.
    ocr_model_accuracy: Gauge
    # 1 once lifespan startup has finished loading models, 0 before.
    ocr_models_ready: Gauge
 def build_metrics(registry: CollectorRegistry) -> OcrMetrics:
    """Build a fresh `OcrMetrics` whose collectors are all registered on `registry`.
    The collectors are created one by one, in field order, and then bundled
    into the container that is returned.
    """
    jobs = Counter(
        "ocr_jobs_total",
        "Number of OCR jobs processed, labelled by engine and script type.",
        labelnames=["engine", "script_type"],
        registry=registry,
    )
    pages = Counter(
        "ocr_pages_total",
        "Number of pages successfully OCR'd, labelled by engine.",
        labelnames=["engine"],
        registry=registry,
    )
    skipped_pages = Counter(
        "ocr_skipped_pages_total",
        "Number of pages skipped because the OCR engine raised.",
        registry=registry,
    )
    words = Counter(
        "ocr_words_total",
        "Number of words recognized across all OCR blocks.",
        registry=registry,
    )
    illegible_words = Counter(
        "ocr_illegible_words_total",
        "Number of words below the confidence threshold (replaced with [unleserlich]).",
        registry=registry,
    )
    processing_seconds = Histogram(
        "ocr_processing_seconds",
        "OCR processing time per page (streaming) or per document (non-streaming).",
        labelnames=["engine"],
        registry=registry,
    )
    training_runs = Counter(
        "ocr_training_runs_total",
        "Number of training runs, labelled by kind (recognition|segmentation) and outcome (success|error).",
        labelnames=["kind", "outcome"],
        registry=registry,
    )
    model_accuracy = Gauge(
        "ocr_model_accuracy",
        "Latest model accuracy reported by a successful training run.",
        labelnames=["kind"],
        registry=registry,
    )
    models_ready = Gauge(
        "ocr_models_ready",
        "1 once the lifespan startup has finished loading models, 0 before.",
        registry=registry,
    )
    return OcrMetrics(
        ocr_jobs_total=jobs,
        ocr_pages_total=pages,
        ocr_skipped_pages_total=skipped_pages,
        ocr_words_total=words,
        ocr_illegible_words_total=illegible_words,
        ocr_processing_seconds=processing_seconds,
        ocr_training_runs_total=training_runs,
        ocr_model_accuracy=model_accuracy,
        ocr_models_ready=models_ready,
    )
--- a/ocr-service/requirements.txt
+++ b/ocr-service/requirements.txt
@@ -10,3 +10,5 @@ pyvips>=2.2.0
 httpx==0.28.1
 pyspellchecker==0.9.0
 opencv-python-headless==4.11.0.86
 prometheus-fastapi-instrumentator==7.0.0
 prometheus-client==0.25.0
--- a/ocr-service/test_metrics.py
+++ b/ocr-service/test_metrics.py
@@ -0,0 +1,638 @@
 """Tests for Prometheus metrics exposed by the OCR service.
 Each test that asserts on a counter/gauge value uses a fresh CollectorRegistry
 (see decision #3 on issue #652) to keep the metrics isolated between tests.
 """
 import contextlib
 import io
 import zipfile
 from unittest.mock import AsyncMock, patch
 import pytest
 from httpx import ASGITransport, AsyncClient
 from PIL import Image
 from prometheus_client import CollectorRegistry
 from main import app
 from metrics import build_metrics
@contextlib.asynccontextmanager
async def ocr_client(*, raise_app_exceptions: bool = True):
    """Yield an AsyncClient with model-loaders patched and _models_ready forced on.
    The shared setup for almost every metrics test: stub the heavy lifecycle
    hooks (kraken_engine.load_models, load_spell_checker), flip the readiness
    flag so request handlers do not 503, and put the flag back to the value it
    had on entry once the caller is done.
    `raise_app_exceptions` is forwarded to `ASGITransport` unchanged.
    """
    with patch("main.kraken_engine.load_models"), \
         patch("main.load_spell_checker"):
        transport = ASGITransport(app=app, raise_app_exceptions=raise_app_exceptions)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            import main as main_module
            # Remember the current value so the flag is restored, not merely cleared.
            previous_ready = main_module._models_ready
            main_module._models_ready = True
            try:
                yield client
            finally:
                main_module._models_ready = previous_ready
 def _minimal_zip() -> bytes:
    """Return a ZIP containing one fake .xml so endpoint validation passes."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("page_01.xml", "<PcGts/>")
    return buf.getvalue()
 def _fake_training_result(accuracy: float = 0.91) -> dict:
    return {"loss": None, "accuracy": accuracy, "cer": round(1 - accuracy, 4), "epochs": 5}
@pytest.fixture
def fresh_metrics(monkeypatch):
    """Swap `main.metrics` for a container built on its own CollectorRegistry and return it."""
    isolated = build_metrics(CollectorRegistry())
    monkeypatch.setattr("main.metrics", isolated)
    return isolated
@pytest.mark.asyncio
async def test_metrics_endpoint_returns_200():
    """`GET /metrics` answers 200 with a text/plain (Prometheus exposition) body.
    Deliberately runs against the global REGISTRY and does NOT take the
    `fresh_metrics` fixture: `/metrics` is wired by
    `prometheus-fastapi-instrumentator`, which binds to the default REGISTRY
    when the app is constructed, so swapping `main.metrics` would not change
    what the endpoint exposes. Only the response shape (status code and
    content-type substring) is checked, never a counter value, so state
    leaked by other tests cannot affect the outcome.
    """
    with patch("main.kraken_engine.load_models"):
        with patch("main.load_spell_checker"):
            asgi_transport = ASGITransport(app=app)
            async with AsyncClient(transport=asgi_transport, base_url="http://test") as http:
                scrape = await http.get("/metrics")
    assert scrape.status_code == 200
    content_type = scrape.headers.get("content-type", "")
    assert "text/plain" in content_type
@pytest.mark.asyncio
async def test_metrics_includes_http_request_metrics_after_ocr_call():
    """Once /ocr has been called, `/metrics` lists the auto-instrumented http_* families.
    Deliberately runs against the global REGISTRY and does NOT take the
    `fresh_metrics` fixture: `http_requests_total` and
    `http_request_duration_seconds` live on the instrumentator's default
    REGISTRY (not on `main.metrics`), so a fresh CollectorRegistry would never
    see them. Only substring presence in the exposition body is asserted,
    never a counter value, so state leaked by other tests cannot affect it.
    """
    single_page = [Image.new("RGB", (100, 100))]
    wordless_block = {
        "pageNumber": 1,
        "x": 0.0,
        "y": 0.0,
        "width": 1.0,
        "height": 1.0,
        "polygon": None,
        "text": "hi",
        "words": [],
    }
    ocr_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=single_page):
        with patch("main.preprocess_page", side_effect=lambda img: img):
            with patch("main.surya_engine.extract_blocks", return_value=[wordless_block]):
                async with ocr_client() as client:
                    ocr_response = await client.post("/ocr", json=ocr_payload)
                    assert ocr_response.status_code == 200, ocr_response.text
                    metrics_response = await client.get("/metrics")
    exposition = metrics_response.text
    for family in ("http_requests_total", "http_request_duration_seconds"):
        assert family in exposition
 def test_build_metrics_registers_all_custom_metrics_on_given_registry():
    """Every custom metric family shows up on the registry handed to `build_metrics`."""
    registry = CollectorRegistry()
    container = build_metrics(registry)
    registered = {family.name for family in registry.collect()}
    # Names as reported by the registry's collected metric families.
    missing = {
        "ocr_jobs",
        "ocr_pages",
        "ocr_skipped_pages",
        "ocr_words",
        "ocr_illegible_words",
        "ocr_processing_seconds",
        "ocr_training_runs",
        "ocr_model_accuracy",
        "ocr_models_ready",
    } - registered
    assert not missing, f"missing: {missing}"
    # Building against another registry must hand back a distinct container.
    assert container is not build_metrics(CollectorRegistry())
 async def _drive_ocr(client: AsyncClient, *, script_type: str) -> None:
    """POST a fixed document URL to /ocr with `script_type` and require a 200 reply."""
    payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": script_type,
        "language": "de",
    }
    reply = await client.post("/ocr", json=payload)
    assert reply.status_code == 200, reply.text
@pytest.mark.asyncio
async def test_ocr_jobs_total_incremented_with_kraken_engine_label_for_kurrent(fresh_metrics):
    """One HANDWRITING_KURRENT /ocr call bumps ocr_jobs_total for engine=kraken exactly once."""
    single_page = [Image.new("RGB", (100, 100))]
    wordless_block = {
        "pageNumber": 1,
        "x": 0.0,
        "y": 0.0,
        "width": 1.0,
        "height": 1.0,
        "polygon": None,
        "text": "hi",
        "words": [],
    }
    patchers = (
        patch("main.correct_text", side_effect=lambda t: t),
        patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=single_page),
        patch("main.preprocess_page", side_effect=lambda img: img),
        patch("main.kraken_engine.is_available", return_value=True),
        patch("main.kraken_engine.extract_blocks", return_value=[wordless_block]),
    )
    with contextlib.ExitStack() as active_patches:
        # Entered in declaration order, unwound in reverse on exit.
        for patcher in patchers:
            active_patches.enter_context(patcher)
        async with ocr_client() as client:
            await _drive_ocr(client, script_type="HANDWRITING_KURRENT")
    kraken_jobs = fresh_metrics.ocr_jobs_total.labels(
        engine="kraken", script_type="HANDWRITING_KURRENT"
    )
    assert kraken_jobs._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_jobs_total_incremented_with_surya_engine_label_for_typewriter(fresh_metrics):
    """One TYPEWRITER /ocr call bumps ocr_jobs_total for engine=surya exactly once."""
    single_page = [Image.new("RGB", (100, 100))]
    wordless_block = {
        "pageNumber": 1,
        "x": 0.0,
        "y": 0.0,
        "width": 1.0,
        "height": 1.0,
        "polygon": None,
        "text": "hi",
        "words": [],
    }
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=single_page):
        with patch("main.preprocess_page", side_effect=lambda img: img):
            with patch("main.surya_engine.extract_blocks", return_value=[wordless_block]):
                async with ocr_client() as client:
                    await _drive_ocr(client, script_type="TYPEWRITER")
    surya_jobs = fresh_metrics.ocr_jobs_total.labels(engine="surya", script_type="TYPEWRITER")
    assert surya_jobs._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_pages_total_incremented_once_per_page_in_stream(fresh_metrics):
    """Streaming a three-page document leaves ocr_pages_total{engine=surya} at 3."""
    three_pages = [Image.new("RGB", (100, 100)) for _ in range(3)]
    wordless_block = {
        "pageNumber": 1,
        "x": 0.0,
        "y": 0.0,
        "width": 1.0,
        "height": 1.0,
        "polygon": None,
        "text": "hi",
        "words": [],
    }
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=three_pages):
        with patch("main.preprocess_page", side_effect=lambda img: img):
            with patch("main.surya_engine.extract_page_blocks", return_value=[wordless_block]):
                async with ocr_client() as client:
                    async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                        assert response.status_code == 200
                        # Consume every line; the per-page increments happen as the stream is read.
                        async for _line in response.aiter_lines():
                            continue
    surya_pages = fresh_metrics.ocr_pages_total.labels(engine="surya")
    assert surya_pages._value.get() == 3.0
@pytest.mark.asyncio
async def test_ocr_skipped_pages_total_incremented_when_engine_raises_for_a_page(fresh_metrics):
    """A page whose engine call raises is counted as skipped while the stream carries on."""
    two_pages = [Image.new("RGB", (100, 100)) for _ in range(2)]
    good_blocks = [{
        "pageNumber": 1,
        "x": 0.0,
        "y": 0.0,
        "width": 1.0,
        "height": 1.0,
        "polygon": None,
        "text": "ok",
        "words": [],
    }]
    calls_seen = 0
    def fail_first_then_succeed(*args, **kwargs):
        # Only the very first engine call blows up; later calls return blocks.
        nonlocal calls_seen
        calls_seen += 1
        if calls_seen > 1:
            return good_blocks
        raise RuntimeError("synthetic engine failure")
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=two_pages):
        with patch("main.preprocess_page", side_effect=lambda img: img):
            with patch("main.surya_engine.extract_page_blocks", side_effect=fail_first_then_succeed):
                async with ocr_client() as client:
                    async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                        assert response.status_code == 200
                        error_lines = [
                            line async for line in response.aiter_lines()
                            if line and '"type": "error"' in line
                        ]
                        assert error_lines
    assert fresh_metrics.ocr_skipped_pages_total._value.get() == 1.0
    # The page after the failing one is still processed and counted.
    assert fresh_metrics.ocr_pages_total.labels(engine="surya")._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_words_and_illegible_words_total_sum_across_blocks(fresh_metrics):
    """Word counters add up over all blocks of a single request.
    Non-Kurrent scripts use THRESHOLD_DEFAULT (0.3). The two blocks together
    hold five words: three above the threshold and two below it.
    """
    def block_with(words):
        # Geometry and text are irrelevant here; only the word list matters.
        return {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0,
                "polygon": None, "text": "ignored", "words": words}
    first_block = block_with([
        {"text": "Lieber", "confidence": 0.9},
        {"text": "Freund", "confidence": 0.1},
    ])
    second_block = block_with([
        {"text": "Gruss", "confidence": 0.8},
        {"text": "verschmiert", "confidence": 0.05},
        {"text": "Karl", "confidence": 0.95},
    ])
    single_page = [Image.new("RGB", (100, 100))]
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=single_page):
        with patch("main.preprocess_page", side_effect=lambda img: img):
            with patch("main.surya_engine.extract_blocks", return_value=[first_block, second_block]):
                async with ocr_client() as client:
                    await _drive_ocr(client, script_type="TYPEWRITER")
    total_words = fresh_metrics.ocr_words_total._value.get()
    assert total_words == 5.0
    illegible_words = fresh_metrics.ocr_illegible_words_total._value.get()
    assert illegible_words == 2.0
 def _histogram_count_sum(histogram, **labels) -> tuple[float, float]:
    """Read the per-label-set _count and _sum from a prometheus_client Histogram."""
    child = histogram.labels(**labels)
    return child._sum.get(), sum(b.get() for b in child._buckets)
@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_observed_per_page_in_stream(fresh_metrics):
    """Streaming two pages records two ocr_processing_seconds observations for surya."""
    two_pages = [Image.new("RGB", (100, 100)) for _ in range(2)]
    wordless_block = {
        "pageNumber": 1,
        "x": 0.0,
        "y": 0.0,
        "width": 1.0,
        "height": 1.0,
        "polygon": None,
        "text": "ok",
        "words": [],
    }
    stream_payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }
    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=two_pages):
        with patch("main.preprocess_page", side_effect=lambda img: img):
            with patch("main.surya_engine.extract_page_blocks", return_value=[wordless_block]):
                async with ocr_client() as client:
                    async with client.stream("POST", "/ocr/stream", json=stream_payload) as response:
                        assert response.status_code == 200
                        async for _line in response.aiter_lines():
                            continue
    # The helper hands back the observed sum first, then the observation count.
    total_seconds, observations = _histogram_count_sum(
        fresh_metrics.ocr_processing_seconds, engine="surya"
    )
    assert observations == 2.0
    assert total_seconds >= 0.0
@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_recognition_success_label(fresh_metrics):
    """A successful /train call counts one run under kind=recognition, outcome=success."""
    async def instant_training(func, *args, **kwargs):
        # Stands in for asyncio.to_thread: skips the runner and reports a canned result.
        return _fake_training_result()
    with patch("main.TRAINING_TOKEN", "secret-token"):
        with patch("main._models_ready", True):
            with patch("main.asyncio.to_thread", side_effect=instant_training):
                async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                    upload = {"file": ("training.zip", _minimal_zip(), "application/zip")}
                    auth = {"X-Training-Token": "secret-token"}
                    response = await client.post("/train", files=upload, headers=auth)
    assert response.status_code == 200
    successes = fresh_metrics.ocr_training_runs_total.labels(
        kind="recognition", outcome="success"
    )
    assert successes._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_recognition_error_label(fresh_metrics):
    """A non-zero ketos exit counts one error run and surfaces as a 500.
    The seam is `subprocess.run` returning a failing CompletedProcess rather
    than a stub at the asyncio.to_thread boundary, so the real _run_training
    error path is what gets exercised.
    """
    from subprocess import CompletedProcess
    ketos_failure = CompletedProcess(
        args=["ketos"],
        returncode=1,
        stdout="",
        stderr="synthetic ketos failure",
    )
    with patch("main.TRAINING_TOKEN", "secret-token"):
        with patch("main._models_ready", True):
            with patch("main.subprocess.run", return_value=ketos_failure):
                # Let the app turn the exception into a response instead of re-raising it here.
                lenient_transport = ASGITransport(app=app, raise_app_exceptions=False)
                async with AsyncClient(transport=lenient_transport, base_url="http://test") as client:
                    upload = {"file": ("training.zip", _minimal_zip(), "application/zip")}
                    auth = {"X-Training-Token": "secret-token"}
                    response = await client.post("/train", files=upload, headers=auth)
    assert response.status_code == 500
    failures = fresh_metrics.ocr_training_runs_total.labels(
        kind="recognition", outcome="error"
    )
    assert failures._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_segmentation_success_label(fresh_metrics):
    """A successful /segtrain call counts one run under kind=segmentation, outcome=success."""
    async def instant_training(func, *args, **kwargs):
        # Stands in for asyncio.to_thread: skips the runner and reports a canned result.
        return _fake_training_result(accuracy=0.83)
    with patch("main.TRAINING_TOKEN", "secret-token"):
        with patch("main._models_ready", True):
            with patch("main.asyncio.to_thread", side_effect=instant_training):
                async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                    upload = {"file": ("training.zip", _minimal_zip(), "application/zip")}
                    auth = {"X-Training-Token": "secret-token"}
                    response = await client.post("/segtrain", files=upload, headers=auth)
    assert response.status_code == 200
    successes = fresh_metrics.ocr_training_runs_total.labels(
        kind="segmentation", outcome="success"
    )
    assert successes._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_training_runs_total_incremented_with_recognition_success_label_for_train_sender(fresh_metrics):
    """A successful /train-sender call counts one run under kind=recognition, outcome=success."""
    async def instant_training(func, *args, **kwargs):
        # Stands in for asyncio.to_thread: skips the runner and reports a canned result.
        return _fake_training_result()
    with patch("main.TRAINING_TOKEN", "secret-token"):
        with patch("main._models_ready", True):
            with patch("main.asyncio.to_thread", side_effect=instant_training):
                async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                    upload = {"file": ("training.zip", _minimal_zip(), "application/zip")}
                    form_fields = {"output_model_path": "/app/models/sender_test.mlmodel"}
                    auth = {"X-Training-Token": "secret-token"}
                    response = await client.post(
                        "/train-sender", files=upload, data=form_fields, headers=auth
                    )
    assert response.status_code == 200, response.text
    successes = fresh_metrics.ocr_training_runs_total.labels(
        kind="recognition", outcome="success"
    )
    assert successes._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_model_accuracy_gauge_stays_default_when_training_returns_no_accuracy(fresh_metrics):
    """A training result with accuracy=None must leave ocr_model_accuracy at its default 0."""
    result_without_accuracy = {"loss": None, "accuracy": None, "cer": None, "epochs": 5}
    async def instant_training(func, *args, **kwargs):
        # Stands in for asyncio.to_thread: skips the runner and reports a canned result.
        return result_without_accuracy
    with patch("main.TRAINING_TOKEN", "secret-token"):
        with patch("main._models_ready", True):
            with patch("main.asyncio.to_thread", side_effect=instant_training):
                async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                    upload = {"file": ("training.zip", _minimal_zip(), "application/zip")}
                    auth = {"X-Training-Token": "secret-token"}
                    response = await client.post("/train", files=upload, headers=auth)
    assert response.status_code == 200
    # No .set() happened; asking for the label child creates it at the default 0.0.
    recognition_gauge = fresh_metrics.ocr_model_accuracy.labels(kind="recognition")
    assert recognition_gauge._value.get() == 0.0
@pytest.mark.asyncio
async def test_ocr_model_accuracy_gauge_set_per_kind_after_successful_training(fresh_metrics):
    """After /train and /segtrain both succeed, each kind's gauge holds its own accuracy."""
    recognition_accuracy = 0.917
    segmentation_accuracy = 0.834
    def runner_reporting(accuracy):
        # Builds an asyncio.to_thread stand-in that reports a canned result for `accuracy`.
        async def instant_training(func, *args, **kwargs):
            return _fake_training_result(accuracy=accuracy)
        return instant_training
    auth = {"X-Training-Token": "secret-token"}
    with patch("main.TRAINING_TOKEN", "secret-token"):
        with patch("main._models_ready", True):
            async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
                with patch("main.asyncio.to_thread", side_effect=runner_reporting(recognition_accuracy)):
                    rec_resp = await client.post(
                        "/train",
                        files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                        headers=auth,
                    )
                assert rec_resp.status_code == 200
                with patch("main.asyncio.to_thread", side_effect=runner_reporting(segmentation_accuracy)):
                    seg_resp = await client.post(
                        "/segtrain",
                        files={"file": ("training.zip", _minimal_zip(), "application/zip")},
                        headers=auth,
                    )
                assert seg_resp.status_code == 200
    accuracy_gauge = fresh_metrics.ocr_model_accuracy
    assert accuracy_gauge.labels(kind="recognition")._value.get() == pytest.approx(recognition_accuracy)
    assert accuracy_gauge.labels(kind="segmentation")._value.get() == pytest.approx(segmentation_accuracy)
 def test_ocr_models_ready_gauge_defaults_to_zero():
    """Straight out of `build_metrics`, ocr_models_ready reads 0 (no lifespan has run)."""
    ready_gauge = build_metrics(CollectorRegistry()).ocr_models_ready
    assert ready_gauge._value.get() == 0.0
@pytest.mark.asyncio
async def test_ocr_models_ready_gauge_is_one_after_lifespan_startup(fresh_metrics):
    """ocr_models_ready goes from 0 to 1 once the lifespan startup has run.
    ASGITransport does not run lifespan by default, so the app's lifespan
    context manager is entered directly to exercise the startup path, with
    the model and spell-checker loaders stubbed out.
    """
    ready_gauge = fresh_metrics.ocr_models_ready
    assert ready_gauge._value.get() == 0.0
    with patch("main.kraken_engine.load_models"):
        with patch("main.load_spell_checker"):
            async with app.router.lifespan_context(app):
                assert ready_gauge._value.get() == 1.0
@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_observed_per_page_in_guided_stream(fresh_metrics):
    """Check that a guided /ocr/stream run records one histogram sample per page.

    Two mocked page images and two regions (one on each page) are streamed
    with scriptType TYPEWRITER; afterwards the ``engine="surya"`` series of
    ``ocr_processing_seconds`` must hold exactly two observations.
    """
    mock_images = [Image.new("RGB", (100, 100)), Image.new("RGB", (100, 100))]
    first_region = {
        "pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1",
    }
    second_region = {
        "pageNumber": 2, "x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0, "annotationId": "a2",
    }
    payload = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
        "regions": [first_region, second_region],
    }

    download_patch = patch(
        "main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images
    )
    preprocess_patch = patch("main.preprocess_page", side_effect=lambda img: img)
    extract_patch = patch("main.surya_engine.extract_region_text", return_value="text")

    with download_patch, preprocess_patch, extract_patch:
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json=payload) as response:
                assert response.status_code == 200
                # Drain the stream so the generator runs to completion.
                async for _line in response.aiter_lines():
                    pass

    sum_seconds, count = _histogram_count_sum(
        fresh_metrics.ocr_processing_seconds, engine="surya"
    )
    assert count == 2.0
    assert sum_seconds >= 0.0
@pytest.mark.asyncio
async def test_ocr_processing_seconds_histogram_excludes_spell_check_time_in_guided_stream(fresh_metrics):
    """The guided observation must time engine work only, not the spell-check pass.

    Wall-clock bound rather than a structural `patch("main.time.monotonic")`:
    the patched attribute is the *global* `time.monotonic`, which httpx and
    asyncio also consume — they exhaust the deterministic sequence before the
    request reaches the engine loop. Bound is sized against the failure mode,
    not the noise floor: spell-check sleeps 0.05s × 2 regions = 0.1s, so a
    timer that accidentally wrapped `correct_text` would observe >= 0.1s. The
    0.09s ceiling catches that bug while leaving ~90ms of slack for slow CI
    runners (engine work is instantaneous under the mock).

    The test also requires at least one observation under ``engine="kraken"``:
    without that guard an unobserved series (sum == 0.0) would satisfy the
    ceiling vacuously and the test could never fail.
    """
    mock_images = [Image.new("RGB", (100, 100))]
    # Two regions on the same page, so the slow spell-check runs twice.
    regions = [
        {"pageNumber": 1, "x": 0.0, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a1"},
        {"pageNumber": 1, "x": 0.5, "y": 0.0, "width": 0.5, "height": 0.5, "annotationId": "a2"},
    ]

    def slow_correct(text):
        # Stand-in for correct_text that burns 50ms and returns the text unchanged.
        import time as _time
        _time.sleep(0.05)
        return text

    with patch("main._download_and_convert_pdf", new_callable=AsyncMock, return_value=mock_images), \
         patch("main.preprocess_page", side_effect=lambda img: img), \
         patch("main.kraken_engine.is_available", return_value=True), \
         patch("main.kraken_engine.extract_region_text", return_value="text"), \
         patch("main.correct_text", side_effect=slow_correct):
        async with ocr_client() as client:
            async with client.stream("POST", "/ocr/stream", json={
                "pdfUrl": "http://minio/doc.pdf",
                "scriptType": "HANDWRITING_KURRENT",
                "language": "de",
                "regions": regions,
            }) as response:
                assert response.status_code == 200
                # Drain the stream so the generator runs to completion.
                async for _ in response.aiter_lines():
                    pass

    sum_seconds, count = _histogram_count_sum(
        fresh_metrics.ocr_processing_seconds, engine="kraken"
    )
    # Guard against a vacuous pass: the ceiling below is only meaningful if
    # the kraken series was actually observed.
    assert count >= 1.0, f"expected at least one kraken observation; got count={count}"
    assert sum_seconds < 0.09, f"timing must exclude spell-check; got sum={sum_seconds}"
@pytest.mark.asyncio
async def test_ocr_jobs_total_not_incremented_when_pdf_download_fails_in_stream(fresh_metrics):
    """Check that a failing PDF download leaves ocr_jobs_total untouched.

    ``_download_and_convert_pdf`` is swapped for a coroutine that always
    raises; the request must answer 500 and the surya/TYPEWRITER counter must
    still read 0. Per the original author's note this mirrors the /ocr
    endpoint: only jobs that actually started OCR work are counted.
    """
    async def fail_download(url):
        raise RuntimeError("synthetic download failure")

    request_body = {
        "pdfUrl": "http://minio/doc.pdf",
        "scriptType": "TYPEWRITER",
        "language": "de",
    }

    with patch("main._download_and_convert_pdf", new=fail_download):
        # raise_app_exceptions=False so the failure surfaces as an HTTP
        # response instead of propagating into the test.
        async with ocr_client(raise_app_exceptions=False) as client:
            response = await client.post("/ocr/stream", json=request_body)

    assert response.status_code == 500

    jobs_counter = fresh_metrics.ocr_jobs_total.labels(
        engine="surya", script_type="TYPEWRITER"
    )
    assert jobs_counter._value.get() == 0.0
 def test_uvicorn_access_log_filter_fails_open_on_short_or_missing_args():
    """Check that MetricsPathFilter keeps records whose args are None or too short.

    This pins the fail-open contract: a record that does not look like the
    expected access-log shape is passed on to the handler rather than
    silently dropped.
    """
    import logging as _logging
    from main import MetricsPathFilter

    def make_record(msg, args):
        # All records share the same logger name, level and location fields.
        return _logging.LogRecord(
            name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0,
            msg=msg, args=args, exc_info=None,
        )

    access_filter = MetricsPathFilter()
    record_without_args = make_record("some message", None)
    record_with_two_args = make_record("%s %s", ("a", "b"))

    assert access_filter.filter(record_without_args) is True
    assert access_filter.filter(record_with_two_args) is True
 def test_uvicorn_access_log_filter_skips_metrics_path():
    """The MetricsPathFilter drops uvicorn.access log records that target /metrics.

    Three records in the uvicorn access-log shape are checked: the /metrics
    request must be rejected, while the /health and /ocr requests act as
    controls and must be kept.
    """
    import logging as _logging
    from main import MetricsPathFilter

    def access_record(method, path):
        # Build a record with the five-argument access-log format used below.
        return _logging.LogRecord(
            name="uvicorn.access", level=_logging.INFO, pathname="", lineno=0,
            msg='%s - "%s %s HTTP/%s" %d',
            args=("127.0.0.1:1234", method, path, "1.1", 200),
            exc_info=None,
        )

    filt = MetricsPathFilter()
    metrics_record = access_record("GET", "/metrics")
    health_record = access_record("GET", "/health")
    ocr_record = access_record("POST", "/ocr")

    assert filt.filter(metrics_record) is False
    # Controls: only /metrics is documented as suppressed, so every other
    # path must still reach the handler.
    assert filt.filter(health_record) is True
    assert filt.filter(ocr_record) is True