HaileyStorm committed
Commit 9be6028 · Parent: a4a8fea
Upload chess-mamba-vs-xformer/config/Mamba/50m.py with huggingface_hub
chess-mamba-vs-xformer/config/Mamba/50m.py ADDED
@@ -0,0 +1,70 @@
+import numpy as np
+import math
+
+beta1 = 0.9
+beta2 = 0.95
+weight_decay = 4.5e-3
+grad_clip = 0.5
+auto_clip = True
+auto_clip_max = 0.5
+auto_clip_min = 3.333e-3
+grad_clip_start_size = 100
+grad_clip_max_size = 400
+grad_clip_percentile = 10  # 7.5 (try it at 10, tested @7.75)
+max_seq_len = 1536
+
+# Batch-size-dependent values below are based on this; when the actual batch size is adjusted, they are rescaled automatically.
+base_batch_size = 100
+
+batch_size = 100
+gradient_accumulation_steps = 1
+effective_batch_size = batch_size * gradient_accumulation_steps
+
+always_save_checkpoint = True
+eval_interval = 150
+eval_iters = 8
+log_interval = 1
+train_file_update_interval = 1  # 23 was the original ... 7 definitely crashes (maybe try 10 on Lambda)
+
+warmup_iters = 1280  # possibly not strictly necessary
+learning_rate = 3e-4
+min_lr = 2e-5
+# max_iters is for auto-stopping at the end of the stable phase. Reported %-complete progress is w.r.t. this (i.e., % complete doesn't include the anneal).
+max_iters = 1000000
+
+# # # # #
+
+warmup_iters = int(warmup_iters * (base_batch_size / effective_batch_size))
+learning_rate = learning_rate * np.sqrt(effective_batch_size / base_batch_size)  # with baby networks we can afford to go a bit higher
+max_iters = int(max_iters * (base_batch_size / effective_batch_size))
+min_lr = min_lr * np.sqrt(effective_batch_size / base_batch_size)  # learning_rate / 10 usually
+
+out_dir = 'out/Mamba/50M'
+eval_interval = int(eval_interval * (base_batch_size / effective_batch_size))  # keep frequent because we'll overfit
+eval_iters = int(eval_iters * (base_batch_size / batch_size))  # intentionally scaled by batch_size instead of effective_batch_size
+log_interval = int(math.ceil(log_interval * (base_batch_size / effective_batch_size)))  # don't print too often
+
+print(f'warmup iters: {warmup_iters}')
+print(f'Max iters: {max_iters} ({max_iters * effective_batch_size} games)')
+print(f'Eval iters: {eval_iters}')
+print(f'Eval interval: {eval_interval}')
+print(f'Log interval: {log_interval}')
+
+wandb_log = True
+wandb_project = 'chess-mamba-v2'
+wandb_run_name = 'Mamba-50M'
+
+dataset = 'stable'
+
+# ~50M params
+model_type = 'mamba'
+n_layer = 36
+d_model = 512
+d_state = 32
+dt_rank = 'auto'
+move_num_in_gamestate = False
+
+init_from = 'resume'
+
+device = 'cuda'  # 'cpu' to run on CPU only
+compile = False  # do not torch.compile the model
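
The rescaling block between the `# # # # #` marker and the print statements keeps the schedule consistent when the batch size changes: iteration counts (warmup_iters, max_iters, eval_interval, log_interval) scale inversely with the effective batch size, while the learning rates scale with its square root. With the values actually set in this config (batch_size = 100, gradient_accumulation_steps = 1) every factor is 1 and nothing changes. A minimal standalone sketch of the same arithmetic, using a hypothetical batch_size of 400 purely for illustration:

import numpy as np

# Reference values copied from the config above.
base_batch_size = 100
warmup_iters, max_iters = 1280, 1_000_000
learning_rate, min_lr = 3e-4, 2e-5

# Hypothetical larger run (not in the config): batch_size 400, no accumulation.
batch_size, gradient_accumulation_steps = 400, 1
effective_batch_size = batch_size * gradient_accumulation_steps

scale = base_batch_size / effective_batch_size              # 0.25
lr_scale = np.sqrt(effective_batch_size / base_batch_size)  # 2.0

warmup_iters = int(warmup_iters * scale)   # 1280 -> 320
max_iters = int(max_iters * scale)         # 1000000 -> 250000
learning_rate = learning_rate * lr_scale   # 3e-4 -> 6e-4
min_lr = min_lr * lr_scale                 # 2e-5 -> 4e-5

print(warmup_iters, max_iters, learning_rate, min_lr)
# 320 250000 0.0006 4e-05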
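
The auto_clip settings near the top (auto_clip, auto_clip_max, auto_clip_min, grad_clip_start_size, grad_clip_max_size, grad_clip_percentile) configure adaptive gradient clipping, but the training loop that consumes them is not part of this commit. The sketch below is a generic percentile-based scheme that matches what those keys suggest: track a rolling window of recent gradient norms and clip at a low percentile of that window, bounded between auto_clip_min and auto_clip_max. The class name and wiring are assumptions for illustration, not the repository's actual implementation.

from collections import deque

import numpy as np
import torch

class AutoClip:
    """Hypothetical percentile-based gradient clipping (illustration only)."""

    def __init__(self, percentile=10, start_size=100, max_size=400,
                 clip_min=3.333e-3, clip_max=0.5, static_clip=0.5):
        self.norms = deque(maxlen=max_size)  # rolling window of recent grad norms
        self.percentile = percentile
        self.start_size = start_size         # fall back to static_clip until warmed up
        self.clip_min, self.clip_max = clip_min, clip_max
        self.static_clip = static_clip       # the plain grad_clip value

    def __call__(self, model):
        # Record the current global grad norm; max_norm=inf performs no clipping here.
        norm = float(torch.nn.utils.clip_grad_norm_(model.parameters(), float('inf')))
        self.norms.append(norm)
        if len(self.norms) < self.start_size:
            threshold = self.static_clip
        else:
            threshold = float(np.clip(np.percentile(self.norms, self.percentile),
                                      self.clip_min, self.clip_max))
        # Clip to the chosen threshold before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), threshold)
        return threshold

# Usage sketch: clipper = AutoClip(); ... loss.backward(); clipper(model); optimizer.step()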