From 89a18c430e5ba140041afc75983aed78cee006f8 Mon Sep 17 00:00:00 2001 From: Marcel Date: Mon, 13 Apr 2026 21:09:13 +0200 Subject: [PATCH] fix(training): limit CPU threads and epochs to prevent RAM exhaustion Force CPU-only training (--device cpu), cap OpenMP/BLAS thread pool at 2 (--threads 2), and reduce epochs from 50 to 10 (-N 10). Recognition training (ketos train) additionally passes -B 1 (batch size 1); segtrain does not. 50 epochs on a laptop OOM-killed the container. 10 epochs is sufficient for incremental fine-tuning runs; more data is added over time and training is re-run. Co-Authored-By: Claude Sonnet 4.6 --- ocr-service/main.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ocr-service/main.py b/ocr-service/main.py index 8e4d2288..3a2c084e 100644 --- a/ocr-service/main.py +++ b/ocr-service/main.py @@ -366,12 +366,13 @@ async def train_model( os.makedirs(checkpoint_dir, exist_ok=True) cmd = [ - "ketos", "--workers", "0", + "ketos", "--workers", "0", "--device", "cpu", "--threads", "2", "train", "-f", "path", "-o", checkpoint_dir, "-q", "fixed", - "-N", "50", + "-N", "10", + "-B", "1", ] if os.path.exists(KRAKEN_MODEL_PATH): cmd += ["-i", KRAKEN_MODEL_PATH] @@ -457,11 +458,11 @@ async def segtrain_model( os.makedirs(checkpoint_dir, exist_ok=True) cmd = [ - "ketos", "--workers", "0", + "ketos", "--workers", "0", "--device", "cpu", "--threads", "2", "segtrain", "-o", checkpoint_dir, "-q", "fixed", - "-N", "50", + "-N", "10", ] if os.path.exists(blla_model_path): cmd += ["-i", blla_model_path]