HaileyStorm
committed on
Upload chess-mamba-vs-xformer/config/Mamba/250M.py with huggingface_hub
chess-mamba-vs-xformer/config/Mamba/250M.py
CHANGED
@@ -2,35 +2,35 @@ import numpy as np
 import math
 
 beta1 = 0.9
-beta2 = 0.
+beta2 = 0.925
 weight_decay = 4.5e-3
 grad_clip = 0.5
 auto_clip = True
 auto_clip_max = 0.5
-auto_clip_min =
+auto_clip_min = 1e-3
 grad_clip_start_size = 100
 grad_clip_max_size = 400
 grad_clip_percentile = 10 #7.5 (try it at 10, tested @7.75)
 max_seq_len = 1536
 
 # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
-base_batch_size =
+base_batch_size = 100
 
-batch_size =
-gradient_accumulation_steps =
+batch_size = 100
+gradient_accumulation_steps = 1
 effective_batch_size = batch_size * gradient_accumulation_steps
 
 always_save_checkpoint = True
-eval_interval =
-eval_iters =
-log_interval =
-train_file_update_interval =
+eval_interval = 150
+eval_iters = 8
+log_interval = 1
+train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
 
-warmup_iters =
-learning_rate = 2.
-min_lr = 1.
+warmup_iters = 1280 # not super necessary potentially
+learning_rate = 2.5e-4
+min_lr = 1.6667e-5
 # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
-max_iters =
+max_iters = 2000000
 
 # # # # #
 
@@ -51,17 +51,17 @@ print(f'Eval interval: {eval_interval}')
 print(f'Log interval: {log_interval}')
 
 wandb_log = True
-wandb_project = 'chess-mamba-
+wandb_project = 'chess-mamba-YOLO'
 wandb_run_name = 'Mamba-250M'
 
 dataset = 'stable'
 
-#
+# 250??M param
 model_type = 'mamba'
-n_layer =
-d_model =
-d_state =
-dt_rank =
+n_layer = 36
+d_model = 928
+d_state = 48
+dt_rank = 'auto'
 move_num_in_gamestate = False
 
 init_from = 'scratch'