fix(training): limit CPU threads and epochs to prevent RAM exhaustion
Force CPU-only training (--device cpu), cap the OpenMP/BLAS thread pool at 2 (--threads 2), and reduce epochs from 50 to 10 (-N 10).

A 50-epoch run on a laptop got the container OOM-killed. 10 epochs is sufficient for incremental fine-tuning runs, since more data is added over time and training is re-run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -366,12 +366,13 @@ async def train_model(
|
||||
os.makedirs(checkpoint_dir, exist_ok=True)
|
||||
|
||||
cmd = [
|
||||
- "ketos", "--workers", "0",
|
||||
+ "ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
|
||||
"train",
|
||||
"-f", "path",
|
||||
"-o", checkpoint_dir,
|
||||
"-q", "fixed",
|
||||
- "-N", "50",
|
||||
+ "-N", "10",
|
||||
"-B", "1",
|
||||
]
|
||||
if os.path.exists(KRAKEN_MODEL_PATH):
|
||||
cmd += ["-i", KRAKEN_MODEL_PATH]
|
||||
@@ -457,11 +458,11 @@ async def segtrain_model(
|
||||
os.makedirs(checkpoint_dir, exist_ok=True)
|
||||
|
||||
cmd = [
|
||||
- "ketos", "--workers", "0",
|
||||
+ "ketos", "--workers", "0", "--device", "cpu", "--threads", "2",
|
||||
"segtrain",
|
||||
"-o", checkpoint_dir,
|
||||
"-q", "fixed",
|
||||
- "-N", "50",
|
||||
+ "-N", "10",
|
||||
]
|
||||
if os.path.exists(blla_model_path):
|
||||
cmd += ["-i", blla_model_path]
|
||||
|
||||
Reference in New Issue
Block a user