fix(training): limit CPU threads and epochs to prevent RAM exhaustion

Force CPU-only training (--device cpu), cap the OpenMP/BLAS thread pool at 2
(--threads 2), and reduce epochs from 50 to 10 (-N 10). A 50-epoch run on a
laptop got the container OOM-killed. 10 epochs is sufficient for incremental
fine-tuning runs, since more data is added over time and training is re-run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-04-13 21:09:13 +02:00
parent 8dec5b5976
commit 89a18c430e

View File

@@ -366,12 +366,13 @@ async def train_model(
os.makedirs(checkpoint_dir, exist_ok=True)
cmd = [
"ketos", "--workers", "0",
"ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
"train",
"-f", "path",
"-o", checkpoint_dir,
"-q", "fixed",
"-N", "50",
"-N", "10",
"-B", "1",
]
if os.path.exists(KRAKEN_MODEL_PATH):
cmd += ["-i", KRAKEN_MODEL_PATH]
@@ -457,11 +458,11 @@ async def segtrain_model(
os.makedirs(checkpoint_dir, exist_ok=True)
cmd = [
"ketos", "--workers", "0",
"ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
"segtrain",
"-o", checkpoint_dir,
"-q", "fixed",
"-N", "50",
"-N", "10",
]
if os.path.exists(blla_model_path):
cmd += ["-i", blla_model_path]