From 3f55af46e677aacef635a5220fe35af43bdad252 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 13:06:12 +0200 Subject: [PATCH 01/13] fix(ocr-service): add entrypoint that validates blla model format on startup Adds ensure_blla_model.py which loads the blla segmentation model with ketos on every container start. If the model is missing or in the legacy PyTorch ZIP format (incompatible with ketos 7), it re-downloads the correct CoreML protobuf model from Zenodo (DOI 10.5281/zenodo.14602569). The Dockerfile now uses entrypoint.sh which runs this check before starting uvicorn. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 4 +- ocr-service/ensure_blla_model.py | 77 ++++++++++++++++++++++++++++++++ ocr-service/entrypoint.sh | 9 ++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 ocr-service/ensure_blla_model.py create mode 100644 ocr-service/entrypoint.sh diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index 01b8ebfa..a8ec48df 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -21,6 +21,8 @@ RUN pip install --no-cache-dir -r requirements.txt COPY . . +RUN chmod +x /app/entrypoint.sh + EXPOSE 8000 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"] +CMD ["/app/entrypoint.sh"] diff --git a/ocr-service/ensure_blla_model.py b/ocr-service/ensure_blla_model.py new file mode 100644 index 00000000..a8aed449 --- /dev/null +++ b/ocr-service/ensure_blla_model.py @@ -0,0 +1,77 @@ +"""Validates the blla segmentation base model and downloads it if needed. + +Run at container startup before uvicorn. ketos 7 requires the model in +CoreML protobuf or safetensors format — legacy PyTorch ZIP archives +(torch.save output from kraken <4) are not loadable and will be replaced. + +Exits non-zero on failure so Docker marks the container unhealthy rather +than silently starting with a broken model. +""" + +import glob +import logging +import os +import shutil +import subprocess +import sys + +logging.basicConfig( + level=logging.INFO, + format="%(levelname)s:ensure_blla_model:%(message)s", +) +log = logging.getLogger(__name__) + +BLLA_MODEL_PATH = os.environ.get("BLLA_MODEL_PATH", "/app/models/blla.mlmodel") +# DOI for "General segmentation model for print and handwriting" — ketos 7 compatible. +BLLA_MODEL_DOI = "10.5281/zenodo.14602569" +HTRMOPO_DIR = os.path.expanduser("~/.local/share/htrmopo") + + +def _model_is_loadable(path: str) -> bool: + try: + from kraken.lib import vgsl + + vgsl.TorchVGSLModel.load_model(path) + return True + except Exception as e: + log.warning("Model at %s failed to load: %s", path, e) + return False + + +def _download_blla() -> str: + log.info("Downloading blla model (DOI %s) ...", BLLA_MODEL_DOI) + result = subprocess.run( + ["kraken", "get", BLLA_MODEL_DOI], + capture_output=True, + text=True, + ) + if result.returncode != 0: + log.error("kraken get failed: %s", result.stderr) + sys.exit(1) + + candidates = sorted(glob.glob(os.path.join(HTRMOPO_DIR, "*/blla.mlmodel"))) + if not candidates: + log.error("Downloaded blla.mlmodel not found under %s", HTRMOPO_DIR) + sys.exit(1) + + return candidates[-1] + + +def main() -> None: + if os.path.exists(BLLA_MODEL_PATH): + if _model_is_loadable(BLLA_MODEL_PATH): + log.info("blla model OK: %s", BLLA_MODEL_PATH) + return + log.warning( + "blla model at %s is in an incompatible format — replacing", BLLA_MODEL_PATH + ) + os.rename(BLLA_MODEL_PATH, BLLA_MODEL_PATH + ".incompatible") + + os.makedirs(os.path.dirname(BLLA_MODEL_PATH), exist_ok=True) + downloaded = _download_blla() + shutil.copy2(downloaded, BLLA_MODEL_PATH) + log.info("Installed blla model at %s", BLLA_MODEL_PATH) + + +if __name__ == "__main__": + main() diff --git a/ocr-service/entrypoint.sh b/ocr-service/entrypoint.sh new file mode 100644 index 00000000..ec6892a8 --- /dev/null +++ b/ocr-service/entrypoint.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -euo pipefail + +# Validate the blla segmentation base model and download it if missing or +# incompatible. ketos 7 dropped support for legacy PyTorch ZIP archives — +# this ensures the volume always holds a loadable CoreML protobuf model. +python3 /app/ensure_blla_model.py + +exec uvicorn main:app --host 0.0.0.0 --port 8000 --workers 1 -- 2.49.1 From 9ca3e92387c1e96ea7c00620643c002c8c854d31 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 13:06:44 +0200 Subject: [PATCH 02/13] fix(ocr-service): fix ketos 7 segtrain compatibility and prevent OOM Three issues fixed: 1. --resize both was removed in ketos 7; replaced with --resize union which extends the model's class mapping to include training data classes. 2. ketos ignores -s when -i is present, so the 1800px blla model caused 7+ GB peak RAM and OOM-killed the host (no swap, 5 GB free). Now checks the loaded model's input height: only uses the base model when it was already fine-tuned at 800px; otherwise trains from scratch at 800px (~200 MB peak). After the first run the trained 800px model becomes the base for all subsequent fine-tuning runs. 3. segtrain now computes and returns cer = 1 - accuracy, matching the recognition training path. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index 0f656aaa..63473737 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -472,16 +472,35 @@ async def segtrain_model( "-q", "fixed", "-N", "10", ] + # Train at 800px height. The default blla model uses 1800px, which peaks at + # ~7+ GB on CPU and kills the host (ketos ignores -s when -i is present, so + # we cannot override the height of an existing model). + # Strategy: only use the base model if it is already at 800px (i.e. was + # produced by a previous fine-tuning run here). Otherwise train from scratch — + # the first run bootstraps a 800px model; all subsequent runs fine-tune it. + seg_spec = ( + "[1,800,0,3 Cr7,7,64,2,2 Gn32 Cr3,3,128,2,2 Gn32 Cr3,3,128 Gn32 " + "Cr3,3,256 Gn32 Cr3,3,256 Gn32 Lbx32 Lby32 Cr1,1,32 Gn32 Lby32 Lbx32]" + ) + use_base_model = False if os.path.exists(blla_model_path): - cmd += ["-i", blla_model_path, "--resize", "both"] + try: + from kraken.lib import vgsl as _vgsl + _m = _vgsl.TorchVGSLModel.load_model(blla_model_path) + use_base_model = _m.input[2] == 800 # input is (batch, channels, H, W) + if not use_base_model: + log.info( + "Base model height is %dpx — skipping -i to avoid OOM; " + "will train from scratch at 800px", + _m.input[2], + ) + except Exception: + pass + + if use_base_model: + cmd += ["-i", blla_model_path, "--resize", "union", "-s", seg_spec] else: - # No pretrained model — train from scratch with reduced height (800px) - # to keep peak RAM under ~200 MB on CPU (default 1800px uses ~500 MB+) - cmd += [ - "-s", - "[1,800,0,3 Cr7,7,64,2,2 Gn32 Cr3,3,128,2,2 Gn32 Cr3,3,128 Gn32 " - "Cr3,3,256 Gn32 Cr3,3,256 Gn32 Lbx32 Lby32 Cr1,1,32 Gn32 Lby32 Lbx32]", - ] + cmd += ["-s", seg_spec] cmd += xml_files log.info("Running: %s", " ".join(cmd[:5]) + " ...") @@ -493,7 +512,8 @@ async def segtrain_model( raise RuntimeError(f"ketos segtrain failed (exit {proc.returncode}): {proc.stderr[-500:]}") accuracy, epochs = _parse_best_checkpoint(checkpoint_dir) - log.info("Segmentation training complete — epochs=%s accuracy=%s", epochs, accuracy) + cer = round(1.0 - accuracy, 4) if accuracy is not None else None + log.info("Segmentation training complete — epochs=%s accuracy=%s cer=%s", epochs, accuracy, cer) best_model = _find_best_model(checkpoint_dir) if best_model is None: @@ -508,7 +528,7 @@ async def segtrain_model( shutil.copy2(best_model, blla_model_path) log.info("Replaced blla model at %s", blla_model_path) - return {"loss": None, "accuracy": accuracy, "cer": None, "epochs": epochs} + return {"loss": None, "accuracy": accuracy, "cer": cer, "epochs": epochs} result = await asyncio.to_thread(_run_segtrain) return result -- 2.49.1 From 4108cda520eff854469535d1dba406f9be2fac0c Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 13:07:11 +0200 Subject: [PATCH 03/13] fix(deploy): wire OCR training token to backend and raise container memory limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pass OCR_TRAINING_TOKEN through to the backend container as APP_OCR_TRAINING_TOKEN so RestClientOcrClient sends the X-Training-Token header when calling /train and /segtrain. - Raise mem_limit/memswap_limit from 8g to 12g to give segtrain headroom on hosts with more available RAM. - Uncomment OCR_TRAINING_TOKEN in .env.example — it is now required. Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 6 +++--- docker-compose.yml | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.env.example b/.env.example index 9011b4a5..6ba5dcf9 100644 --- a/.env.example +++ b/.env.example @@ -21,9 +21,9 @@ PORT_FRONTEND=5173 PORT_MAILPIT_UI=8100 PORT_MAILPIT_SMTP=1025 -# OCR Training — set a secret token to protect the /train and /segtrain endpoints on the -# Python OCR microservice. Leave empty to disable token authentication (development only). -# OCR_TRAINING_TOKEN=change-me-in-production +# OCR Training — secret token required to call /train and /segtrain on the OCR service. +# Also set in the backend so it can pass the token through. Must not be empty in production. +OCR_TRAINING_TOKEN=change-me-in-production # Production SMTP — uncomment and fill in to send real emails instead of catching them # APP_BASE_URL=https://your-domain.example.com diff --git a/docker-compose.yml b/docker-compose.yml index bf57501a..35660e0f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -83,8 +83,8 @@ services: restart: unless-stopped expose: - "8000" - mem_limit: 8g - memswap_limit: 8g + mem_limit: 12g + memswap_limit: 12g volumes: - ocr_models:/app/models - ocr_cache:/root/.cache @@ -145,6 +145,7 @@ services: SPRING_MAIL_PROPERTIES_MAIL_SMTP_AUTH: ${MAIL_SMTP_AUTH:-false} SPRING_MAIL_PROPERTIES_MAIL_SMTP_STARTTLS_ENABLE: ${MAIL_STARTTLS_ENABLE:-false} APP_OCR_BASE_URL: http://ocr-service:8000 + APP_OCR_TRAINING_TOKEN: "${OCR_TRAINING_TOKEN:-}" ports: - "${PORT_BACKEND}:8080" networks: -- 2.49.1 From ff565353c05009a6e9f950794a439034632d32b2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 13:07:39 +0200 Subject: [PATCH 04/13] fix(backend): store error rate for segmentation training runs setCer() was called for recognition training but not for segmentation. The OCR service now returns cer = 1 - accuracy for segtrain; persist it so the admin panel can display Fehlerrate for both training types. Co-Authored-By: Claude Sonnet 4.6 --- .../org/raddatz/familienarchiv/service/OcrTrainingService.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java index 9c8f4e5d..820ab2f5 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java @@ -162,11 +162,12 @@ public class OcrTrainingService { return Objects.requireNonNull(txTemplate.execute(status -> { run.setStatus(TrainingStatus.DONE); run.setCompletedAt(Instant.now()); + run.setCer(result.cer()); run.setLoss(result.loss()); run.setAccuracy(result.accuracy()); run.setEpochs(result.epochs()); OcrTrainingRun updated = trainingRunRepository.save(run); - log.info("[trainingRun={}] Segmentation training completed — epochs={}", runId, result.epochs()); + log.info("[trainingRun={}] Segmentation training completed — cer={} epochs={}", runId, result.cer(), result.epochs()); return updated; })); } catch (Exception e) { -- 2.49.1 From 9ee39efb8b33cc2ceb02865beb84ccd7069e2021 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 13:08:08 +0200 Subject: [PATCH 05/13] feat(frontend): limit training history to 3 runs with expand toggle Both training panels (OCR and segmentation) share TrainingHistory. Show only the 3 most recent runs by default; render a Mehr/Weniger anzeigen button when there are more. Co-Authored-By: Claude Sonnet 4.6 --- .../src/lib/components/TrainingHistory.svelte | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/frontend/src/lib/components/TrainingHistory.svelte b/frontend/src/lib/components/TrainingHistory.svelte index ea194cd4..3409cf59 100644 --- a/frontend/src/lib/components/TrainingHistory.svelte +++ b/frontend/src/lib/components/TrainingHistory.svelte @@ -20,6 +20,12 @@ interface Props { let { runs }: Props = $props(); +const COLLAPSED_COUNT = 3; +let expanded = $state(false); + +const visibleRuns = $derived(expanded ? runs : runs.slice(0, COLLAPSED_COUNT)); +const hasMore = $derived(runs.length > COLLAPSED_COUNT); + const dateFormatter = new Intl.DateTimeFormat('de-DE', { day: 'numeric', month: 'short', @@ -54,7 +60,7 @@ function formatCer(cer: number | undefined | null): string { {:else} - {#each runs as run (run.id)} + {#each visibleRuns as run (run.id)} {formatDate(run.createdAt)} @@ -117,3 +123,15 @@ function formatCer(cer: number | undefined | null): string { {/if} + +{#if hasMore} +
+ +
+{/if} -- 2.49.1 From a1694090ff272902b56d1d56a9c706d7f7be2463 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 15:17:57 +0200 Subject: [PATCH 06/13] refactor(ocr): extract assertNoRunningTraining() to eliminate duplicate guard Co-Authored-By: Claude Sonnet 4.6 --- .../service/OcrTrainingService.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java index 820ab2f5..dc6ff043 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java @@ -45,6 +45,13 @@ public class OcrTrainingService { List runs ) {} + private void assertNoRunningTraining() { + if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) { + throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING, + "A training run is already in progress"); + } + } + // Not safe for horizontal scaling: training reloads the Kraken model in-process on the // Python OCR service after each run. The DB-level RUNNING constraint (V30 partial unique // index) prevents concurrent training API calls, but cannot prevent two OCR service replicas @@ -53,10 +60,7 @@ public class OcrTrainingService { // Short transaction: guard check + create RUNNING row, then commit immediately. // The DB connection is released before the OCR HTTP call, which can take several minutes. OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> { - if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) { - throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING, - "A training run is already in progress"); - } + assertNoRunningTraining(); var eligibleBlocks = trainingDataExportService.queryEligibleBlocks(); if (eligibleBlocks.size() < 5) { @@ -120,10 +124,7 @@ public class OcrTrainingService { public OcrTrainingRun triggerSegTraining(UUID triggeredBy) { // Same pattern as triggerTraining: narrow transactions around DB writes only. OcrTrainingRun run = Objects.requireNonNull(txTemplate.execute(status -> { - if (trainingRunRepository.findFirstByStatus(TrainingStatus.RUNNING).isPresent()) { - throw DomainException.conflict(ErrorCode.TRAINING_ALREADY_RUNNING, - "A training run is already in progress"); - } + assertNoRunningTraining(); var segBlocks = segmentationTrainingExportService.querySegmentationBlocks(); if (segBlocks.size() < 5) { -- 2.49.1 From 1eaae2ca09c9881c48a27be6a94a828f5d82cccc Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 15:19:04 +0200 Subject: [PATCH 07/13] =?UTF-8?q?test(ocr):=20add=20unit=20tests=20for=20t?= =?UTF-8?q?riggerSegTraining()=20=E2=80=94=20conflict,=20threshold,=20happ?= =?UTF-8?q?y=20path,=20failure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../service/OcrTrainingServiceTest.java | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java index 7d83188f..1d1e47e9 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java @@ -146,6 +146,90 @@ class OcrTrainingServiceTest { run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null)); } + // ─── triggerSegTraining ─────────────────────────────────────────────────── + + @Test + void triggerSegTraining_throws409_whenRunningRunExists() { + when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)) + .thenReturn(Optional.of(OcrTrainingRun.builder() + .id(UUID.randomUUID()).status(TrainingStatus.RUNNING) + .blockCount(5).documentCount(2).modelName("blla").build())); + + assertThatThrownBy(() -> service.triggerSegTraining(null)) + .isInstanceOf(DomainException.class) + .extracting("status") + .satisfies(s -> assertThat(s.toString()).contains("409")); + } + + @Test + void triggerSegTraining_throws422_whenFewerThan5Segments() { + when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty()); + when(segExportService.querySegmentationBlocks()).thenReturn(List.of( + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(UUID.randomUUID()).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(UUID.randomUUID()).build() + )); + + assertThatThrownBy(() -> service.triggerSegTraining(null)) + .isInstanceOf(DomainException.class); + } + + @Test + void triggerSegTraining_createsRunWithBlla_andMarksDoneWithCer() throws Exception { + when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty()); + + UUID docA = UUID.randomUUID(); + UUID docB = UUID.randomUUID(); + List segs = List.of( + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docB).build() + ); + when(segExportService.querySegmentationBlocks()).thenReturn(segs); + when(segExportService.exportToZip()).thenReturn(out -> {}); + when(ocrClient.segtrainModel(any())).thenReturn(new OcrClient.TrainingResult(null, 0.92, 0.08, 5)); + + OcrTrainingRun saved = OcrTrainingRun.builder() + .id(UUID.randomUUID()).status(TrainingStatus.RUNNING) + .blockCount(5).documentCount(2).modelName("blla").build(); + when(runRepository.save(any())).thenReturn(saved); + + service.triggerSegTraining(null); + + verify(runRepository, atLeastOnce()).save(argThat(run -> + run.getStatus() == TrainingStatus.DONE + && "blla".equals(run.getModelName()) + && run.getCer() != null)); + } + + @Test + void triggerSegTraining_marksRunFailed_whenOcrClientThrows() throws Exception { + when(runRepository.findFirstByStatus(TrainingStatus.RUNNING)).thenReturn(Optional.empty()); + + UUID docA = UUID.randomUUID(); + List segs = List.of( + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build(), + TranscriptionBlock.builder().id(UUID.randomUUID()).documentId(docA).build() + ); + when(segExportService.querySegmentationBlocks()).thenReturn(segs); + when(segExportService.exportToZip()).thenReturn(out -> {}); + when(ocrClient.segtrainModel(any())).thenThrow(new RuntimeException("seg timeout")); + + OcrTrainingRun saved = OcrTrainingRun.builder() + .id(UUID.randomUUID()).status(TrainingStatus.RUNNING) + .blockCount(5).documentCount(1).modelName("blla").build(); + when(runRepository.save(any())).thenReturn(saved); + + service.triggerSegTraining(null); + + verify(runRepository, atLeastOnce()).save(argThat(run -> + run.getStatus() == TrainingStatus.FAILED && run.getErrorMessage() != null)); + } + // ─── Orphan recovery ────────────────────────────────────────────────────── @Test -- 2.49.1 From 9b6b6f4f7e91751b7a886dacffe2cdafba93b145 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 15:20:11 +0200 Subject: [PATCH 08/13] refactor(ocr): rename findTop5 to findTop10 for headroom as frontend shows 3 by default Co-Authored-By: Claude Sonnet 4.6 --- .../familienarchiv/repository/OcrTrainingRunRepository.java | 2 +- .../org/raddatz/familienarchiv/service/OcrTrainingService.java | 2 +- .../raddatz/familienarchiv/service/OcrTrainingServiceTest.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java index 0bab0e99..fe7d61d1 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/repository/OcrTrainingRunRepository.java @@ -12,5 +12,5 @@ public interface OcrTrainingRunRepository extends JpaRepository findFirstByStatus(TrainingStatus status); - List findTop5ByOrderByCreatedAtDesc(); + List findTop10ByOrderByCreatedAtDesc(); } diff --git a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java index dc6ff043..65828999 100644 --- a/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java +++ b/backend/src/main/java/org/raddatz/familienarchiv/service/OcrTrainingService.java @@ -195,7 +195,7 @@ public class OcrTrainingService { int totalOcrBlocks = (int) blockRepository.count(); int availableSegBlocks = segmentationTrainingExportService.querySegmentationBlocks().size(); - List recentRuns = trainingRunRepository.findTop5ByOrderByCreatedAtDesc(); + List recentRuns = trainingRunRepository.findTop10ByOrderByCreatedAtDesc(); OcrTrainingRun lastRun = recentRuns.isEmpty() ? null : recentRuns.get(0); return new TrainingInfoResponse( diff --git a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java index 1d1e47e9..62ef00f6 100644 --- a/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java +++ b/backend/src/test/java/org/raddatz/familienarchiv/service/OcrTrainingServiceTest.java @@ -53,7 +53,7 @@ class OcrTrainingServiceTest { service = new OcrTrainingService(runRepository, exportService, segExportService, ocrClient, healthClient, blockRepository, txTemplate); when(blockRepository.count()).thenReturn(0L); - when(runRepository.findTop5ByOrderByCreatedAtDesc()).thenReturn(List.of()); + when(runRepository.findTop10ByOrderByCreatedAtDesc()).thenReturn(List.of()); when(segExportService.querySegmentationBlocks()).thenReturn(List.of()); } -- 2.49.1 From fdae60a52832dcbed9229182ece41d21c6cdd4f2 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 15:24:04 +0200 Subject: [PATCH 09/13] fix(ocr): narrow exception handling and add unit tests for ensure_blla_model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _model_is_loadable: narrow bare except to (RuntimeError, OSError, ValueError) with DEBUG-level fallback for unexpected exceptions — prevents silent masking of missing kraken install or AttributeError on vgsl - _run_segtrain: replace bare except:pass with log.warning so height-check fallback is visible in container logs - New test_ensure_blla_model.py: covers model-OK early return, incompatible model rename+replace, and missing model download paths Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/ensure_blla_model.py | 5 +- ocr-service/main.py | 4 +- ocr-service/test_ensure_blla_model.py | 69 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 ocr-service/test_ensure_blla_model.py diff --git a/ocr-service/ensure_blla_model.py b/ocr-service/ensure_blla_model.py index a8aed449..d0fa0305 100644 --- a/ocr-service/ensure_blla_model.py +++ b/ocr-service/ensure_blla_model.py @@ -33,9 +33,12 @@ def _model_is_loadable(path: str) -> bool: vgsl.TorchVGSLModel.load_model(path) return True - except Exception as e: + except (RuntimeError, OSError, ValueError) as e: log.warning("Model at %s failed to load: %s", path, e) return False + except Exception: + log.debug("Unexpected error loading model at %s", path, exc_info=True) + return False def _download_blla() -> str: diff --git a/ocr-service/main.py b/ocr-service/main.py index 63473737..3545f6a4 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -494,8 +494,8 @@ async def segtrain_model( "will train from scratch at 800px", _m.input[2], ) - except Exception: - pass + except Exception as exc: + log.warning("Could not inspect base model height, training from scratch: %s", exc) if use_base_model: cmd += ["-i", blla_model_path, "--resize", "union", "-s", seg_spec] diff --git a/ocr-service/test_ensure_blla_model.py b/ocr-service/test_ensure_blla_model.py new file mode 100644 index 00000000..cc171e92 --- /dev/null +++ b/ocr-service/test_ensure_blla_model.py @@ -0,0 +1,69 @@ +"""Unit tests for ensure_blla_model.main().""" + +from unittest.mock import MagicMock, call, patch + +import ensure_blla_model + + +# ─── Model already loadable ─────────────────────────────────────────────────── + + +def test_main_returns_early_when_model_is_loadable(): + """When the model exists and loads cleanly, no download or rename occurs.""" + with ( + patch("os.path.exists", return_value=True), + patch.object(ensure_blla_model, "_model_is_loadable", return_value=True), + patch.object(ensure_blla_model, "_download_blla") as mock_download, + patch("os.rename") as mock_rename, + ): + ensure_blla_model.main() + + mock_download.assert_not_called() + mock_rename.assert_not_called() + + +# ─── Model exists but is incompatible ───────────────────────────────────────── + + +def test_main_replaces_incompatible_model(): + """An incompatible model is renamed and replaced with a fresh download.""" + fake_path = "/app/models/blla.mlmodel" + downloaded_path = "/tmp/downloaded.mlmodel" + + with ( + patch.object(ensure_blla_model, "BLLA_MODEL_PATH", fake_path), + patch("os.path.exists", return_value=True), + patch.object(ensure_blla_model, "_model_is_loadable", return_value=False), + patch.object(ensure_blla_model, "_download_blla", return_value=downloaded_path), + patch("os.rename") as mock_rename, + patch("shutil.copy2") as mock_copy, + patch("os.makedirs"), + ): + ensure_blla_model.main() + + mock_rename.assert_called_once_with(fake_path, fake_path + ".incompatible") + mock_copy.assert_called_once_with(downloaded_path, fake_path) + + +# ─── Model missing ──────────────────────────────────────────────────────────── + + +def test_main_downloads_when_model_missing(): + """When the model file doesn't exist at all, it is downloaded without rename.""" + fake_path = "/app/models/blla.mlmodel" + downloaded_path = "/tmp/downloaded.mlmodel" + + with ( + patch.object(ensure_blla_model, "BLLA_MODEL_PATH", fake_path), + patch("os.path.exists", return_value=False), + patch.object(ensure_blla_model, "_model_is_loadable") as mock_loadable, + patch.object(ensure_blla_model, "_download_blla", return_value=downloaded_path), + patch("os.rename") as mock_rename, + patch("shutil.copy2") as mock_copy, + patch("os.makedirs"), + ): + ensure_blla_model.main() + + mock_loadable.assert_not_called() + mock_rename.assert_not_called() + mock_copy.assert_called_once_with(downloaded_path, fake_path) -- 2.49.1 From 29b44e3f48cbc398b3d0a06ad502f4bb57256ae4 Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 15:24:44 +0200 Subject: [PATCH 10/13] fix(ocr): pin Dockerfile base image to python:3.11.9-slim for reproducible builds Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service/Dockerfile b/ocr-service/Dockerfile index a8ec48df..25d383a4 100644 --- a/ocr-service/Dockerfile +++ b/ocr-service/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11-slim +FROM python:3.11.9-slim WORKDIR /app -- 2.49.1 From 83900de787bd0d73b2b51a21c228f1064759d96a Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 14 Apr 2026 15:26:55 +0200 Subject: [PATCH 11/13] fix(frontend): accessibility fixes for TrainingHistory expand/collapse and FAILED badge - Add aria-expanded + aria-controls to expand button (WCAG 4.1.2) - Add id="training-history-rows" to tbody for aria-controls target - Replace title= tooltip on FAILED badge with details/summary for keyboard and touch accessibility; add training_error_detail_label i18n key - Use motion-safe:animate-pulse on RUNNING badge for prefers-reduced-motion Co-Authored-By: Claude Sonnet 4.6 --- frontend/messages/de.json | 1 + frontend/messages/en.json | 1 + frontend/messages/es.json | 1 + .../src/lib/components/TrainingHistory.svelte | 15 ++++++++++++--- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/frontend/messages/de.json b/frontend/messages/de.json index 48135368..cfb4578d 100644 --- a/frontend/messages/de.json +++ b/frontend/messages/de.json @@ -554,6 +554,7 @@ "training_history_col_cer": "Fehlerrate", "training_status_done": "Fertig", "training_status_failed": "Fehler", + "training_error_detail_label": "Fehlerdetails", "training_status_running": "Läuft…", "training_seg_heading": "Segmentierung trainieren", "training_seg_description": "Starte ein neues Training mit annotierten Segmentierungsbereichen, um die Texterkennung zu verbessern.", diff --git a/frontend/messages/en.json b/frontend/messages/en.json index 8f55b111..887b4091 100644 --- a/frontend/messages/en.json +++ b/frontend/messages/en.json @@ -554,6 +554,7 @@ "training_history_col_cer": "Error Rate", "training_status_done": "Done", "training_status_failed": "Failed", + "training_error_detail_label": "Error details", "training_status_running": "Running…", "training_seg_heading": "Train segmentation", "training_seg_description": "Start a new training run using annotated segmentation regions to improve text detection.", diff --git a/frontend/messages/es.json b/frontend/messages/es.json index b4b0ba65..ab357f76 100644 --- a/frontend/messages/es.json +++ b/frontend/messages/es.json @@ -554,6 +554,7 @@ "training_history_col_cer": "Tasa de error", "training_status_done": "Listo", "training_status_failed": "Error", + "training_error_detail_label": "Detalles del error", "training_status_running": "Ejecutando…", "training_seg_heading": "Entrenar segmentación", "training_seg_description": "Inicia un nuevo entrenamiento con regiones de segmentación anotadas para mejorar la detección de texto.", diff --git a/frontend/src/lib/components/TrainingHistory.svelte b/frontend/src/lib/components/TrainingHistory.svelte index 3409cf59..586fee32 100644 --- a/frontend/src/lib/components/TrainingHistory.svelte +++ b/frontend/src/lib/components/TrainingHistory.svelte @@ -52,7 +52,7 @@ function formatCer(cer: number | undefined | null): string { {m.training_history_col_cer()} - + {#if runs.length === 0} @@ -85,7 +85,6 @@ function formatCer(cer: number | undefined | null): string { {:else if run.status === 'FAILED'} + {#if run.errorMessage} +
+ + {m.training_error_detail_label()} + +

{run.errorMessage}

+
+ {/if} {:else} {m.training_status_running()} @@ -128,6 +135,8 @@ function formatCer(cer: number | undefined | null): string {