HaileyStorm
committed on
Upload chess-mamba-vs-xformer/config/Mamba/250M.py with huggingface_hub
chess-mamba-vs-xformer/config/Mamba/250M.py
CHANGED
@@ -2,33 +2,33 @@ import numpy as np
 import math
 
 beta1 = 0.9
-beta2 = 0.925
-weight_decay = 4.5e-3
+beta2 = 0.905 #0.9125 # 0.925
+weight_decay = 1e-4 #1.25e-4 # 4.5e-3
 grad_clip = 0.5
 auto_clip = True
-auto_clip_max = 0.
+auto_clip_max = 0.1
 auto_clip_min = 1e-3
 grad_clip_start_size = 100
 grad_clip_max_size = 400
-grad_clip_percentile =
+grad_clip_percentile = 9 #9.25 # 10
 max_seq_len = 1536
 
 # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
 base_batch_size = 100
 
-batch_size =
-gradient_accumulation_steps =
+batch_size = 18
+gradient_accumulation_steps = 8
 effective_batch_size = batch_size * gradient_accumulation_steps
 
 always_save_checkpoint = True
-eval_interval =
-eval_iters = 8
-log_interval =
+eval_interval = 420 #500
+eval_iters = 8.0 # 7.5
+log_interval = 2
 train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
 
 warmup_iters = 1280 # not super necessary potentially
-learning_rate = 2.5e-4
-min_lr = 1.
+learning_rate = 1.5e-4 # 1.75e-4 # 2.5e-4
+min_lr = 1e-5 # 1.16667e-5
 # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
 max_iters = 2000000
 
@@ -52,19 +52,19 @@ print(f'Log interval: {log_interval}')
 
 wandb_log = True
 wandb_project = 'chess-mamba-YOLO'
-wandb_run_name = 'Mamba-
+wandb_run_name = 'Mamba-280M'
 
-dataset = '
+dataset = 'stable2'
 
-#
+# 279.8M param
 model_type = 'mamba'
-n_layer =
-d_model =
-d_state =
-dt_rank = 'auto'
+n_layer = 40
+d_model = 1024
+d_state = 64
+dt_rank = 72 #'auto'
 move_num_in_gamestate = False
 
-init_from = '
+init_from = 'resume'
 
 device = 'cuda' # run on cpu only
 compile = False # do not torch compile the model