fix(training): limit CPU threads and epochs to prevent RAM exhaustion

Force CPU-only training (--device cpu), cap the OpenMP/BLAS thread pool
at 2 (--threads 2), and reduce epochs from 50 to 10 (-N 10) for both
`ketos train` and `ketos segtrain`. The `ketos train` command also
passes -B 1 explicitly. Running 50 epochs on a laptop got the container
OOM-killed. 10 epochs is sufficient for incremental fine-tuning runs,
since more data is added over time and training is re-run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 21:09:13 +02:00
parent 8dec5b5976
commit 89a18c430e

View File

@@ -366,12 +366,13 @@ async def train_model(
os.makedirs(checkpoint_dir, exist_ok=True) os.makedirs(checkpoint_dir, exist_ok=True)
cmd = [ cmd = [
"ketos", "--workers", "0", "ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
"train", "train",
"-f", "path", "-f", "path",
"-o", checkpoint_dir, "-o", checkpoint_dir,
"-q", "fixed", "-q", "fixed",
"-N", "50", "-N", "10",
"-B", "1",
] ]
if os.path.exists(KRAKEN_MODEL_PATH): if os.path.exists(KRAKEN_MODEL_PATH):
cmd += ["-i", KRAKEN_MODEL_PATH] cmd += ["-i", KRAKEN_MODEL_PATH]
@@ -457,11 +458,11 @@ async def segtrain_model(
os.makedirs(checkpoint_dir, exist_ok=True) os.makedirs(checkpoint_dir, exist_ok=True)
cmd = [ cmd = [
"ketos", "--workers", "0", "ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
"segtrain", "segtrain",
"-o", checkpoint_dir, "-o", checkpoint_dir,
"-q", "fixed", "-q", "fixed",
"-N", "50", "-N", "10",
] ]
if os.path.exists(blla_model_path): if os.path.exists(blla_model_path):
cmd += ["-i", blla_model_path] cmd += ["-i", blla_model_path]