From 3f55af46e677aacef635a5220fe35af43bdad252 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 13:06:12 +0200
Subject: [PATCH 01/13] fix(ocr-service): add entrypoint that validates blla
 model format on startup

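Adds ensure_blla_model.py, which on every container start tries to load
the blla segmentation model via kraken's VGSL loader
(kraken.lib.vgsl.TorchVGSLModel.load_model). If the model file is missing,
or exists but fails to load (e.g. the legacy PyTorch ZIP format, which is
incompatible with ketos 7), the script fetches the CoreML protobuf model
from Zenodo (DOI 10.5281/zenodo.14602569) with `kraken get` and installs
it; an unloadable file is first renamed to <path>.incompatible.
The Dockerfile now uses entrypoint.sh which runs this check before
starting uvicorn.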

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ocr-service/Dockerfile           |  4 +-
 ocr-service/ensure_blla_model.py | 77 ++++++++++++++++++++++++++++++++
 ocr-service/entrypoint.sh        |  9 ++++
 3 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 ocr-service/ensure_blla_model.py
 create mode 100644 ocr-service/entrypoint.sh

diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile
index 01b8ebfa..a8ec48df 100644
--- a/ocr-service/Dockerfile
+++ b/ocr-service/Dockerfile
@@ -21,6 +21,8 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 COPY . .
 
+RUN chmod +x /app/entrypoint.sh
+
 EXPOSE 8000
 
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
+CMD ["/app/entrypoint.sh"]
diff --git a/ocr-service/ensure_blla_model.py b/ocr-service/ensure_blla_model.py
new file mode 100644
index 00000000..a8aed449
--- /dev/null
+++ b/ocr-service/ensure_blla_model.py
@@ -0,0 +1,77 @@
+"""Validates the blla segmentation base model and downloads it if needed.
+
+Run at container startup before uvicorn. ketos 7 requires the model in
+CoreML protobuf or safetensors format — legacy PyTorch ZIP archives
+(torch.save output from kraken <4) are not loadable and will be replaced.
+
+Exits non-zero on failure so the container fails to start rather
+than silently starting with a broken model.
+"""
+
+import glob
+import logging
+import os
+import shutil
+import subprocess
+import sys
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(levelname)s:ensure_blla_model:%(message)s",
+)
+log = logging.getLogger(__name__)
+
+BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel")
+# DOI for "General segmentation model for print and handwriting" — ketos 7 compatible.
+BLLA_MODEL_DOI = "10.5281/zenodo.14602569"
+HTRMOPO_DIR = os.path.expanduser("~/.local/share/htrmopo")
+
+
+def _model_is_loadable(path: str) -> bool:
+    try:
+        from kraken.lib import vgsl
+
+        vgsl.TorchVGSLModel.load_model(path)
+        return True
+    except Exception as e:
+        log.warning("Model at %s failed to load: %s", path, e)
+        return False
+
+
+def _download_blla() -> str:
+    log.info("Downloading blla model (DOI %s) ...", BLLA_MODEL_DOI)
+    result = subprocess.run(
+        ["kraken", "get", BLLA_MODEL_DOI],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        log.error("kraken get failed: %s", result.stderr)
+        sys.exit(1)
+
+    candidates = sorted(glob.glob(os.path.join(HTRMOPO_DIR, "*/blla.mlmodel")))
+    if not candidates:
+        log.error("Downloaded blla.mlmodel not found under %s", HTRMOPO_DIR)
+        sys.exit(1)
+
+    return candidates[-1]
+
+
+def main() -> None:
+    if os.path.exists(BLLA_MODEL_PATH):
+        if _model_is_loadable(BLLA_MODEL_PATH):
+            log.info("blla model OK: %s", BLLA_MODEL_PATH)
+            return
+        log.warning(
+            "blla model at %s is in an incompatible format — replacing", BLLA_MODEL_PATH
+        )
+        os.rename(BLLA_MODEL_PATH, BLLA_MODEL_PATH + ".incompatible")
+
+    os.makedirs(os.path.dirname(BLLA_MODEL_PATH), exist_ok=True)
+    downloaded = _download_blla()
+    shutil.copy2(downloaded, BLLA_MODEL_PATH)
+    log.info("Installed blla model at %s", BLLA_MODEL_PATH)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ocr-service/entrypoint.sh b/ocr-service/entrypoint.sh
new file mode 100644
index 00000000..ec6892a8
--- /dev/null
+++ b/ocr-service/entrypoint.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -euo pipefail
+
+# Validate the blla segmentation base model and download it if missing or
+# incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives —
+# this ensures the volume always holds a loadable CoreML protobuf model.
+python3 /app/ensure_blla_model.py
+
+exec uvicorn main:app --host 0.0.0.0 --port 8000 --workers 1
-- 
2.49.1


From 9ca3e92387c1e96ea7c00620643c002c8c854d31 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 13:06:44 +0200
Subject: [PATCH 02/13] fix(ocr-service): fix ketos 7 segtrain compatibility
 and prevent OOM

Three issues fixed:

1. --resize both was removed in ketos 7; replaced with --resize union
   which extends the model's class mapping to include training data classes.

2. ketos ignores -s when -i is present, so the 1800px blla model caused
   7+ GB peak RAM and OOM-killed the host (no swap, 5 GB free).
   Now checks the loaded model's input height: only uses the base model
   when it was already fine-tuned at 800px; otherwise trains from scratch
   at 800px (~200 MB peak). After the first run the trained 800px model
   becomes the base for all subsequent fine-tuning runs.

3. segtrain now computes and returns cer = 1 - accuracy, matching the
   recognition training path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ocr-service/main.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/ocr-service/main.py b/ocr-service/main.py
index 0f656aaa..63473737 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -472,16 +472,35 @@ async def segtrain_model(
                 "-q", "fixed",
                 "-N", "10",
             ]
+            # Train at 800px height. The default blla model uses 1800px, which peaks at
+            # ~7+ GB on CPU and kills the host (ketos ignores -s when -i is present, so
+            # we cannot override the height of an existing model).
+            # Strategy: only use the base model if it is already at 800px (i.e. was
+            # produced by a previous fine-tuning run here). Otherwise train from scratch —
+            # the first run bootstraps a 800px model; all subsequent runs fine-tune it.
+            seg_spec = (
+                "[1,800,0,3 Cr7,7,64,2,2 Gn32 Cr3,3,128,2,2 Gn32 Cr3,3,128 Gn32 "
+                "Cr3,3,256 Gn32 Cr3,3,256 Gn32 Lbx32 Lby32 Cr1,1,32 Gn32 Lby32 Lbx32]"
+            )
+            use_base_model = False
             if os.path.exists(blla_model_path):
-                cmd += ["-i", blla_model_path, "--resize", "both"]
+                try:
+                    from kraken.lib import vgsl as _vgsl
+                    _m = _vgsl.TorchVGSLModel.load_model(blla_model_path)
+                    use_base_model = _m.input[2] == 800  # input is (batch, channels, H, W)
+                    if not use_base_model:
+                        log.info(
+                            "Base model height is %dpx — skipping -i to avoid OOM; "
+                            "will train from scratch at 800px",
+                            _m.input[2],
+                        )
+                except Exception:
+                    pass
+
+            if use_base_model:
+                cmd += ["-i", blla_model_path, "--resize", "union", "-s", seg_spec]
             else:
-                # No pretrained model — train from scratch with reduced height (800px)
-                # to keep peak RAM under ~200 MB on CPU (default 1800px uses ~500 MB+)
-                cmd += [
-                    "-s",
-                    "[1,800,0,3 Cr7,7,64,2,2 Gn32 Cr3,3,128,2,2 Gn32 Cr3,3,128 Gn32 "
-                    "Cr3,3,256 Gn32 Cr3,3,256 Gn32 Lbx32 Lby32 Cr1,1,32 Gn32 Lby32 Lbx32]",
-                ]
+                cmd += ["-s", seg_spec]
             cmd += xml_files
 
             log.info("Running: %s", " ".join(cmd[:5]) + " ...")
@@ -493,7 +512,8 @@ async def segtrain_model(
                 raise RuntimeError(f"ketos segtrain failed (exit {proc.returncode}): {proc.stderr[-500:]}")
 
             accuracy, epochs = _parse_best_checkpoint(checkpoint_dir)
-            log.info("Segmentation training complete — epochs=%s accuracy=%s", epochs, accuracy)
+            cer = round(1.0 - accuracy, 4) if accuracy is not None else None
+            log.info("Segmentation training complete — epochs=%s accuracy=%s cer=%s", epochs, accuracy, cer)
 
             best_model = _find_best_model(checkpoint_dir)
             if best_model is None:
@@ -508,7 +528,7 @@ async def segtrain_model(
             shutil.copy2(best_model, blla_model_path)
             log.info("Replaced blla model at %s", blla_model_path)
 
-            return {"loss": None, "accuracy": accuracy, "cer": None, "epochs": epochs}
+            return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs}
 
     result = await asyncio.to_thread(_run_segtrain)
     return result
-- 
2.49.1


From 4108cda520eff854469535d1dba406f9be2fac0c Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 13:07:11 +0200
Subject: [PATCH 03/13] fix(deploy): wire OCR training token to backend and
 raise container memory limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Pass OCR_TRAINING_TOKEN through to the backend container as
  APP_OCR_TRAINING_TOKEN so RestClientOcrClient sends the X-Training-Token
  header when calling /train and /segtrain.
- Raise mem_limit/memswap_limit from 8g to 12g to give segtrain headroom
  on hosts with more available RAM.
- Uncomment OCR_TRAINING_TOKEN in .env.example — it is now required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .env.example       | 6 +++---
 docker-compose.yml | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.env.example b/.env.example
index 9011b4a5..6ba5dcf9 100644
--- a/.env.example
+++ b/.env.example
@@ -21,9 +21,9 @@ PORT_FRONTEND=5173
 PORT_MAILPIT_UI=8100
 PORT_MAILPIT_SMTP=1025
 
-# OCR Training — set a secret token to protect the /train and /segtrain endpoints on the
-# Python OCR microservice. Leave empty to disable token authentication (development only).
-# OCR_TRAINING_TOKEN=change-me-in-production
+# OCR Training — secret token required to call /train and /segtrain on the OCR service.
+# Also set in the backend so it can pass the token through. Must not be empty in production.
+OCR_TRAINING_TOKEN=change-me-in-production
 
 # Production SMTP — uncomment and fill in to send real emails instead of catching them
 # APP_BASE_URL=https://your-domain.example.com
diff --git a/docker-compose.yml b/docker-compose.yml
index bf57501a..35660e0f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -83,8 +83,8 @@ services:
     restart: unless-stopped
     expose:
       - "8000"
-    mem_limit: 8g
-    memswap_limit: 8g
+    mem_limit: 12g
+    memswap_limit: 12g
     volumes:
       - ocr_models:/app/models
       - ocr_cache:/root/.cache
@@ -145,6 +145,7 @@ services:
       SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false}
       SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false}
       APP_OCR_BASE_URL: http://ocr-service:8000
+      APP_OCR_TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
     ports:
       - "${PORT_BACKEND}:8080"
     networks:
-- 
2.49.1


From ff565353c05009a6e9f950794a439034632d32b2 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 13:07:39 +0200
Subject: [PATCH 04/13] fix(backend): store error rate for segmentation
 training runs

setCer() was called for recognition training but not for segmentation.
The OCR service now returns cer = 1 - accuracy for segtrain; persist it
so the admin panel can display Fehlerrate for both training types.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../org/raddatz/familienarchiv/service/OcrTrainingService.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
index 9c8f4e5d..820ab2f5 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
@@ -162,11 +162,12 @@ public class OcrTrainingService {
             return Objects.requireNonNull(txTemplate.execute(status -> {
                 run.setStatus(TrainingStatus.DONE);
                 run.setCompletedAt(Instant.now());
+                run.setCer(result.cer());
                 run.setLoss(result.loss());
                 run.setAccuracy(result.accuracy());
                 run.setEpochs(result.epochs());
                 OcrTrainingRun updated = trainingRunRepository.save(run);
-                log.info("[trainingRun={}] Segmentation training completed — epochs={}", runId, result.epochs());
+                log.info("[trainingRun={}] Segmentation training completed — cer={} epochs={}", runId, result.cer(), result.epochs());
                 return updated;
             }));
         } catch (Exception e) {
-- 
2.49.1


From 9ee39efb8b33cc2ceb02865beb84ccd7069e2021 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 13:08:08 +0200
Subject: [PATCH 05/13] feat(frontend): limit training history to 3 runs with
 expand toggle

Both training panels (OCR and segmentation) share TrainingHistory.
Show only the 3 most recent runs by default; render a Mehr/Weniger
anzeigen button when there are more.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../src/lib/components/TrainingHistory.svelte | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/frontend/src/lib/components/TrainingHistory.svelte b/frontend/src/lib/components/TrainingHistory.svelte
index ea194cd4..3409cf59 100644
--- a/frontend/src/lib/components/TrainingHistory.svelte
+++ b/frontend/src/lib/components/TrainingHistory.svelte
@@ -20,6 +20,12 @@ interface Props {
 
 let { runs }: Props = $props();
 
+const COLLAPSED_COUNT = 3;
+let expanded = $state(false);
+
+const visibleRuns = $derived(expanded ? runs : runs.slice(0, COLLAPSED_COUNT));
+const hasMore = $derived(runs.length > COLLAPSED_COUNT);
+
 const dateFormatter = new Intl.DateTimeFormat('de-DE', {
 	day: 'numeric',
 	month: 'short',
@@ -54,7 +60,7 @@ function formatCer(cer: number | undefined | null): string {
 				</td>
 			</tr>
 		{:else}
-			{#each runs as run (run.id)}
+			{#each visibleRuns as run (run.id)}
 				<tr class="border-b border-line/50 last:border-0">
 					<td class="py-2 text-ink-2">{formatDate(run.createdAt)}</td>
 					<td class="py-2">
@@ -117,3 +123,15 @@ function formatCer(cer: number | undefined | null): string {
 		{/if}
 	</tbody>
 </table>
+
+{#if hasMore}
+	<div class="mt-2 text-center">
+		<button
+			type="button"
+			class="text-xs font-medium text-ink-3 transition-colors hover:text-ink"
+			onclick={() => (expanded = !expanded)}
+		>
+			{expanded ? m.comp_expandable_show_less() : m.comp_expandable_show_more()}
+		</button>
+	</div>
+{/if}
-- 
2.49.1


From a1694090ff272902b56d1d56a9c706d7f7be2463 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:17:57 +0200
Subject: [PATCH 06/13] refactor(ocr): extract assertNoRunningTraining() to
 eliminate duplicate guard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../service/OcrTrainingService.java             | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
index 820ab2f5..dc6ff043 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
@@ -45,6 +45,13 @@ public class OcrTrainingService {
             List<OcrTrainingRun> runs
     ) {}
 
+    private void assertNoRunningTraining() {
+        if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) {
+            throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING,
+                    "A training run is already in progress");
+        }
+    }
+
     // Not safe for horizontal scaling: training reloads the Kraken model in-process on the
     // Python OCR service after each run. The DB-level RUNNING constraint (V30 partial unique
     // index) prevents concurrent training API calls, but cannot prevent two OCR service replicas
@@ -53,10 +60,7 @@ public class OcrTrainingService {
         // Short transaction: guard check + create RUNNING row, then commit immediately.
         // The DB connection is released before the OCR HTTP call, which can take several minutes.
         OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> {
-            if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) {
-                throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING,
-                        "A training run is already in progress");
-            }
+            assertNoRunningTraining();
 
             var eligibleBlocks = trainingDataExportService.queryEligibleBlocks();
             if (eligibleBlocks.size() < 5) {
@@ -120,10 +124,7 @@ public class OcrTrainingService {
     public OcrTrainingRun triggerSegTraining(UUID triggeredBy) {
         // Same pattern as triggerTraining: narrow transactions around DB writes only.
         OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> {
-            if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) {
-                throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING,
-                        "A training run is already in progress");
-            }
+            assertNoRunningTraining();
 
             var segBlocks = segmentationTrainingExportService.querySegmentationBlocks();
             if (segBlocks.size() < 5) {
-- 
2.49.1


From 1eaae2ca09c9881c48a27be6a94a828f5d82cccc Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:19:04 +0200
Subject: [PATCH 07/13] =?UTF-8?q?test(ocr):=20add=20unit=20tests=20for=20t?=
 =?UTF-8?q?riggerSegTraining()=20=E2=80=94=20conflict,=20threshold,=20happ?=
 =?UTF-8?q?y=20path,=20failure?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../service/OcrTrainingServiceTest.java       | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java
index 7d83188f..1d1e47e9 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java
@@ -146,6 +146,90 @@ class OcrTrainingServiceTest {
                 run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null));
     }
 
+    // ─── triggerSegTraining ───────────────────────────────────────────────────
+
+    @Test
+    void triggerSegTraining_throws409_whenRunningRunExists() {
+        when(runRepository.findFirstByStatus(TrainingStatus.RUNNING))
+                .thenReturn(Optional.of(OcrTrainingRun.builder()
+                        .id(UUID.randomUUID()).status(TrainingStatus.RUNNING)
+                        .blockCount(5).documentCount(2).modelName("blla").build()));
+
+        assertThatThrownBy(() -> service.triggerSegTraining(null))
+                .isInstanceOf(DomainException.class)
+                .extracting("status")
+                .satisfies(s -> assertThat(s.toString()).contains("409"));
+    }
+
+    @Test
+    void triggerSegTraining_throws422_whenFewerThan5Segments() {
+        when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty());
+        when(segExportService.querySegmentationBlocks()).thenReturn(List.of(
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(UUID.randomUUID()).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(UUID.randomUUID()).build()
+        ));
+
+        assertThatThrownBy(() -> service.triggerSegTraining(null))
+                .isInstanceOf(DomainException.class);
+    }
+
+    @Test
+    void triggerSegTraining_createsRunWithBlla_andMarksDoneWithCer() throws Exception {
+        when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty());
+
+        UUID docA = UUID.randomUUID();
+        UUID docB = UUID.randomUUID();
+        List<TranscriptionBlock> segs = List.of(
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docB).build()
+        );
+        when(segExportService.querySegmentationBlocks()).thenReturn(segs);
+        when(segExportService.exportToZip()).thenReturn(out -> {});
+        when(ocrClient.segtrainModel(any())).thenReturn(new OcrClient.TrainingResult(null, 0.92, 0.08, 5));
+
+        OcrTrainingRun saved = OcrTrainingRun.builder()
+                .id(UUID.randomUUID()).status(TrainingStatus.RUNNING)
+                .blockCount(5).documentCount(2).modelName("blla").build();
+        when(runRepository.save(any())).thenReturn(saved);
+
+        service.triggerSegTraining(null);
+
+        verify(runRepository, atLeastOnce()).save(argThat(run ->
+                run.getStatus() == TrainingStatus.DONE
+                        && "blla".equals(run.getModelName())
+                        && run.getCer() != null));
+    }
+
+    @Test
+    void triggerSegTraining_marksRunFailed_whenOcrClientThrows() throws Exception {
+        when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty());
+
+        UUID docA = UUID.randomUUID();
+        List<TranscriptionBlock> segs = List.of(
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(),
+                TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build()
+        );
+        when(segExportService.querySegmentationBlocks()).thenReturn(segs);
+        when(segExportService.exportToZip()).thenReturn(out -> {});
+        when(ocrClient.segtrainModel(any())).thenThrow(new RuntimeException("seg timeout"));
+
+        OcrTrainingRun saved = OcrTrainingRun.builder()
+                .id(UUID.randomUUID()).status(TrainingStatus.RUNNING)
+                .blockCount(5).documentCount(1).modelName("blla").build();
+        when(runRepository.save(any())).thenReturn(saved);
+
+        service.triggerSegTraining(null);
+
+        verify(runRepository, atLeastOnce()).save(argThat(run ->
+                run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null));
+    }
+
     // ─── Orphan recovery ──────────────────────────────────────────────────────
 
     @Test
-- 
2.49.1


From 9b6b6f4f7e91751b7a886dacffe2cdafba93b145 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:20:11 +0200
Subject: [PATCH 08/13] refactor(ocr): rename findTop5 to findTop10 for
 headroom as frontend shows 3 by default

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../familienarchiv/repository/OcrTrainingRunRepository.java     | 2 +-
 .../org/raddatz/familienarchiv/service/OcrTrainingService.java  | 2 +-
 .../raddatz/familienarchiv/service/OcrTrainingServiceTest.java  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java
index 0bab0e99..fe7d61d1 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java
@@ -12,5 +12,5 @@ public interface OcrTrainingRunRepository extends JpaRepository<OcrTrainingRun,
 
     Optional<OcrTrainingRun> findFirstByStatus(TrainingStatus status);
 
-    List<OcrTrainingRun> findTop5ByOrderByCreatedAtDesc();
+    List<OcrTrainingRun> findTop10ByOrderByCreatedAtDesc();
 }
diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
index dc6ff043..65828999 100644
--- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
+++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java
@@ -195,7 +195,7 @@ public class OcrTrainingService {
         int totalOcrBlocks = (int) blockRepository.count();
         int availableSegBlocks = segmentationTrainingExportService.querySegmentationBlocks().size();
 
-        List<OcrTrainingRun> recentRuns = trainingRunRepository.findTop5ByOrderByCreatedAtDesc();
+        List<OcrTrainingRun> recentRuns = trainingRunRepository.findTop10ByOrderByCreatedAtDesc();
         OcrTrainingRun lastRun = recentRuns.isEmpty() ? null : recentRuns.get(0);
 
         return new TrainingInfoResponse(
diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java
index 1d1e47e9..62ef00f6 100644
--- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java
+++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java
@@ -53,7 +53,7 @@ class OcrTrainingServiceTest {
         service = new OcrTrainingService(runRepository, exportService, segExportService, ocrClient, healthClient, blockRepository, txTemplate);
 
         when(blockRepository.count()).thenReturn(0L);
-        when(runRepository.findTop5ByOrderByCreatedAtDesc()).thenReturn(List.of());
+        when(runRepository.findTop10ByOrderByCreatedAtDesc()).thenReturn(List.of());
         when(segExportService.querySegmentationBlocks()).thenReturn(List.of());
     }
 
-- 
2.49.1


From fdae60a52832dcbed9229182ece41d21c6cdd4f2 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:24:04 +0200
Subject: [PATCH 09/13] fix(ocr): narrow exception handling and add unit tests
 for ensure_blla_model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

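- _model_is_loadable: narrow `except Exception` to (RuntimeError, OSError,
  ValueError) for the expected load failures, which are still logged at
  WARNING. Any other exception (e.g. a missing kraken install or an
  AttributeError on vgsl) is still caught and treated as "not loadable", but
  is logged separately with a traceback. Note: that fallback logs at DEBUG,
  so it is not emitted at the script's configured INFO level.
- _run_segtrain: replace `except Exception: pass` with log.warning so the
  height-check fallback is visible in container logs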
- New test_ensure_blla_model.py: covers model-OK early return, incompatible
  model rename+replace, and missing model download paths

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ocr-service/ensure_blla_model.py      |  5 +-
 ocr-service/main.py                   |  4 +-
 ocr-service/test_ensure_blla_model.py | 69 +++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 ocr-service/test_ensure_blla_model.py

diff --git a/ocr-service/ensure_blla_model.py b/ocr-service/ensure_blla_model.py
index a8aed449..d0fa0305 100644
--- a/ocr-service/ensure_blla_model.py
+++ b/ocr-service/ensure_blla_model.py
@@ -33,9 +33,12 @@ def _model_is_loadable(path: str) -> bool:
 
         vgsl.TorchVGSLModel.load_model(path)
         return True
-    except Exception as e:
+    except (RuntimeError, OSError, ValueError) as e:
         log.warning("Model at %s failed to load: %s", path, e)
         return False
+    except Exception:
+        log.debug("Unexpected error loading model at %s", path, exc_info=True)
+        return False
 
 
 def _download_blla() -> str:
diff --git a/ocr-service/main.py b/ocr-service/main.py
index 63473737..3545f6a4 100644
--- a/ocr-service/main.py
+++ b/ocr-service/main.py
@@ -494,8 +494,8 @@ async def segtrain_model(
                             "will train from scratch at 800px",
                             _m.input[2],
                         )
-                except Exception:
-                    pass
+                except Exception as exc:
+                    log.warning("Could not inspect base model height, training from scratch: %s", exc)
 
             if use_base_model:
                 cmd += ["-i", blla_model_path, "--resize", "union", "-s", seg_spec]
diff --git a/ocr-service/test_ensure_blla_model.py b/ocr-service/test_ensure_blla_model.py
new file mode 100644
index 00000000..cc171e92
--- /dev/null
+++ b/ocr-service/test_ensure_blla_model.py
@@ -0,0 +1,69 @@
+"""Unit tests for ensure_blla_model.main()."""
+
+from unittest.mock import MagicMock, call, patch
+
+import ensure_blla_model
+
+
+# ─── Model already loadable ───────────────────────────────────────────────────
+
+
+def test_main_returns_early_when_model_is_loadable():
+    """When the model exists and loads cleanly, no download or rename occurs."""
+    with (
+        patch("os.path.exists", return_value=True),
+        patch.object(ensure_blla_model, "_model_is_loadable", return_value=True),
+        patch.object(ensure_blla_model, "_download_blla") as mock_download,
+        patch("os.rename") as mock_rename,
+    ):
+        ensure_blla_model.main()
+
+    mock_download.assert_not_called()
+    mock_rename.assert_not_called()
+
+
+# ─── Model exists but is incompatible ─────────────────────────────────────────
+
+
+def test_main_replaces_incompatible_model():
+    """An incompatible model is renamed and replaced with a fresh download."""
+    fake_path = "/app/models/blla.mlmodel"
+    downloaded_path = "/tmp/downloaded.mlmodel"
+
+    with (
+        patch.object(ensure_blla_model, "BLLA_MODEL_PATH", fake_path),
+        patch("os.path.exists", return_value=True),
+        patch.object(ensure_blla_model, "_model_is_loadable", return_value=False),
+        patch.object(ensure_blla_model, "_download_blla", return_value=downloaded_path),
+        patch("os.rename") as mock_rename,
+        patch("shutil.copy2") as mock_copy,
+        patch("os.makedirs"),
+    ):
+        ensure_blla_model.main()
+
+    mock_rename.assert_called_once_with(fake_path, fake_path + ".incompatible")
+    mock_copy.assert_called_once_with(downloaded_path, fake_path)
+
+
+# ─── Model missing ────────────────────────────────────────────────────────────
+
+
+def test_main_downloads_when_model_missing():
+    """When the model file doesn't exist at all, it is downloaded without rename."""
+    fake_path = "/app/models/blla.mlmodel"
+    downloaded_path = "/tmp/downloaded.mlmodel"
+
+    with (
+        patch.object(ensure_blla_model, "BLLA_MODEL_PATH", fake_path),
+        patch("os.path.exists", return_value=False),
+        patch.object(ensure_blla_model, "_model_is_loadable") as mock_loadable,
+        patch.object(ensure_blla_model, "_download_blla", return_value=downloaded_path),
+        patch("os.rename") as mock_rename,
+        patch("shutil.copy2") as mock_copy,
+        patch("os.makedirs"),
+    ):
+        ensure_blla_model.main()
+
+    mock_loadable.assert_not_called()
+    mock_rename.assert_not_called()
+    mock_copy.assert_called_once_with(downloaded_path, fake_path)
-- 
2.49.1


From 29b44e3f48cbc398b3d0a06ad502f4bb57256ae4 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:24:44 +0200
Subject: [PATCH 10/13] fix(ocr): pin Dockerfile base image to
 python:3.11.9-slim for reproducible builds

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ocr-service/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile
index a8ec48df..25d383a4 100644
--- a/ocr-service/Dockerfile
+++ b/ocr-service/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-slim
+FROM python:3.11.9-slim
 
 WORKDIR /app
 
-- 
2.49.1


From 83900de787bd0d73b2b51a21c228f1064759d96a Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:26:55 +0200
Subject: [PATCH 11/13] fix(frontend): accessibility fixes for TrainingHistory
 expand/collapse and FAILED badge

- Add aria-expanded + aria-controls to expand button (WCAG 4.1.2)
- Add id="training-history-rows" to tbody for aria-controls target
- Replace title= tooltip on FAILED badge with details/summary for keyboard
  and touch accessibility; add training_error_detail_label i18n key
- Use motion-safe:animate-pulse on RUNNING badge for prefers-reduced-motion

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 frontend/messages/de.json                         |  1 +
 frontend/messages/en.json                         |  1 +
 frontend/messages/es.json                         |  1 +
 .../src/lib/components/TrainingHistory.svelte     | 15 ++++++++++++---
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/frontend/messages/de.json b/frontend/messages/de.json
index 48135368..cfb4578d 100644
--- a/frontend/messages/de.json
+++ b/frontend/messages/de.json
@@ -554,6 +554,7 @@
 	"training_history_col_cer": "Fehlerrate",
 	"training_status_done": "Fertig",
 	"training_status_failed": "Fehler",
+	"training_error_detail_label": "Fehlerdetails",
 	"training_status_running": "Läuft…",
 	"training_seg_heading": "Segmentierung trainieren",
 	"training_seg_description": "Starte ein neues Training mit annotierten Segmentierungsbereichen, um die Texterkennung zu verbessern.",
diff --git a/frontend/messages/en.json b/frontend/messages/en.json
index 8f55b111..887b4091 100644
--- a/frontend/messages/en.json
+++ b/frontend/messages/en.json
@@ -554,6 +554,7 @@
 	"training_history_col_cer": "Error Rate",
 	"training_status_done": "Done",
 	"training_status_failed": "Failed",
+	"training_error_detail_label": "Error details",
 	"training_status_running": "Running…",
 	"training_seg_heading": "Train segmentation",
 	"training_seg_description": "Start a new training run using annotated segmentation regions to improve text detection.",
diff --git a/frontend/messages/es.json b/frontend/messages/es.json
index b4b0ba65..ab357f76 100644
--- a/frontend/messages/es.json
+++ b/frontend/messages/es.json
@@ -554,6 +554,7 @@
 	"training_history_col_cer": "Tasa de error",
 	"training_status_done": "Listo",
 	"training_status_failed": "Error",
+	"training_error_detail_label": "Detalles del error",
 	"training_status_running": "Ejecutando…",
 	"training_seg_heading": "Entrenar segmentación",
 	"training_seg_description": "Inicia un nuevo entrenamiento con regiones de segmentación anotadas para mejorar la detección de texto.",
diff --git a/frontend/src/lib/components/TrainingHistory.svelte b/frontend/src/lib/components/TrainingHistory.svelte
index 3409cf59..586fee32 100644
--- a/frontend/src/lib/components/TrainingHistory.svelte
+++ b/frontend/src/lib/components/TrainingHistory.svelte
@@ -52,7 +52,7 @@ function formatCer(cer: number | undefined | null): string {
 			<th class="hidden pb-2 text-right md:table-cell">{m.training_history_col_cer()}</th>
 		</tr>
 	</thead>
-	<tbody>
+	<tbody id="training-history-rows">
 		{#if runs.length === 0}
 			<tr>
 				<td colspan="5" class="py-4 text-center text-sm text-ink-2">
@@ -85,7 +85,6 @@ function formatCer(cer: number | undefined | null): string {
 						{:else if run.status === 'FAILED'}
 							<span
 								class="inline-flex items-center gap-1 rounded-sm bg-red-100 px-1.5 py-0.5 text-xs font-medium text-red-700"
-								title={run.errorMessage}
 							>
 								<svg
 									aria-hidden="true"
@@ -101,13 +100,21 @@ function formatCer(cer: number | undefined | null): string {
 								</svg>
 								{m.training_status_failed()}
 							</span>
+							{#if run.errorMessage}
+								<details class="mt-0.5">
+									<summary class="cursor-pointer text-xs text-red-700 underline">
+										{m.training_error_detail_label()}
+									</summary>
+									<p class="mt-1 text-xs text-red-600">{run.errorMessage}</p>
+								</details>
+							{/if}
 						{:else}
 							<span
 								class="inline-flex items-center gap-1 rounded-sm bg-yellow-100 px-1.5 py-0.5 text-xs font-medium text-yellow-700"
 							>
 								<span
 									aria-hidden="true"
-									class="h-1.5 w-1.5 animate-pulse rounded-full bg-yellow-500"
+									class="h-1.5 w-1.5 rounded-full bg-yellow-500 motion-safe:animate-pulse"
 								></span>
 								{m.training_status_running()}
 							</span>
@@ -128,6 +135,8 @@ function formatCer(cer: number | undefined | null): string {
 	<div class="mt-2 text-center">
 		<button
 			type="button"
+			aria-expanded={expanded}
+			aria-controls="training-history-rows"
 			class="text-xs font-medium text-ink-3 transition-colors hover:text-ink"
 			onclick={() => (expanded = !expanded)}
 		>
-- 
2.49.1


From 06e3ae141ca44a9537f56b954f23430d300b68bc Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:28:15 +0200
Subject: [PATCH 12/13] test(frontend): add Vitest component tests for
 TrainingHistory expand/collapse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../components/TrainingHistory.svelte.spec.ts | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 frontend/src/lib/components/TrainingHistory.svelte.spec.ts

diff --git a/frontend/src/lib/components/TrainingHistory.svelte.spec.ts b/frontend/src/lib/components/TrainingHistory.svelte.spec.ts
new file mode 100644
index 00000000..728b167a
--- /dev/null
+++ b/frontend/src/lib/components/TrainingHistory.svelte.spec.ts
@@ -0,0 +1,52 @@
+import { afterEach, describe, expect, it } from 'vitest';
+import { cleanup, render } from 'vitest-browser-svelte';
+import { page } from 'vitest/browser';
+import TrainingHistory from './TrainingHistory.svelte';
+
+afterEach(cleanup);
+
+function makeRun(i: number) {
+	return {
+		id: `run-${i}`,
+		status: 'DONE' as const,
+		blockCount: 10,
+		documentCount: 2,
+		modelName: 'german_kurrent',
+		createdAt: `2026-01-0${i + 1}T12:00:00Z`,
+		completedAt: `2026-01-0${i + 1}T12:05:00Z`
+	};
+}
+
+const fiveRuns = Array.from({ length: 5 }, (_, i) => makeRun(i));
+const twoRuns = Array.from({ length: 2 }, (_, i) => makeRun(i));
+
+describe('TrainingHistory — expand/collapse', () => {
+	it('shows only 3 runs initially when more than 3 exist', async () => {
+		render(TrainingHistory, { runs: fiveRuns });
+
+		const rows = page.getByRole('row');
+		// 1 header + 3 data rows = 4 rows; nth() is zero-based, so nth(3) is the last one
+		await expect.element(rows.nth(3)).toBeInTheDocument();
+		await expect.element(rows.nth(4)).not.toBeInTheDocument();
+
+		await expect.element(page.getByRole('button', { name: /Mehr anzeigen/i })).toBeInTheDocument();
+	});
+
+	it('shows all runs after clicking the expand button', async () => {
+		render(TrainingHistory, { runs: fiveRuns });
+
+		await page.getByRole('button', { name: /Mehr anzeigen/i }).click();
+
+		const rows = page.getByRole('row');
+		// 1 header + 5 data rows = 6 rows; nth() is zero-based, so nth(5) is the last one
+		await expect.element(rows.nth(5)).toBeInTheDocument();
+	});
+
+	it('hides the toggle button when 3 or fewer runs exist', async () => {
+		render(TrainingHistory, { runs: twoRuns });
+
+		await expect
+			.element(page.getByRole('button', { name: /Mehr anzeigen/i }))
+			.not.toBeInTheDocument();
+	});
+});
-- 
2.49.1


From 6d7469e9b81ff91b9848bc7dd7091cbaa89362a4 Mon Sep 17 00:00:00 2001
From: Marcel <marcel@familienarchiv>
Date: Tue, 14 Apr 2026 15:29:28 +0200
Subject: [PATCH 13/13] fix(deploy): increase OCR healthcheck start_period,
 comment ocr_cache volume, add token hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- start_period 60s → 120s: Zenodo download on cold start can exceed 60s on slow connections
- ocr_cache volume comment: documents what the cache stores for future operators
- .env.example: add token generation command to prevent weak placeholder in production

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .env.example       | 1 +
 docker-compose.yml | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.env.example b/.env.example
index 6ba5dcf9..5b928337 100644
--- a/.env.example
+++ b/.env.example
@@ -23,6 +23,7 @@ PORT_MAILPIT_SMTP=1025
 
 # OCR Training — secret token required to call /train and /segtrain on the OCR service.
 # Also set in the backend so it can pass the token through. Must not be empty in production.
+# Generate with: python3 -c "import secrets; print(secrets.token_hex(32))"
 OCR_TRAINING_TOKEN=change-me-in-production
 
 # Production SMTP — uncomment and fill in to send real emails instead of catching them
diff --git a/docker-compose.yml b/docker-compose.yml
index 35660e0f..0637d417 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -87,7 +87,7 @@ services:
     memswap_limit: 12g
     volumes:
       - ocr_models:/app/models
-      - ocr_cache:/root/.cache
+      - ocr_cache:/root/.cache  # Hugging Face / ketos model download cache — prevents re-downloads on container recreate
     environment:
       KRAKEN_MODEL_PATH: /app/models/german_kurrent.mlmodel
       TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}"
@@ -102,7 +102,7 @@ services:
       interval: 10s
       timeout: 5s
       retries: 12
-      start_period: 60s
+      start_period: 120s
 
   # --- Backend: Spring Boot ---
   backend:
-- 
2.49.1