HaileyStorm committed on
Commit
9be6028
1 Parent(s): a4a8fea

Upload chess-mamba-vs-xformer/config/Mamba/50m.py with huggingface_hub

Browse files
chess-mamba-vs-xformer/config/Mamba/50m.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import math
3
+
4
+ beta1 = 0.9
5
+ beta2 = 0.95
6
+ weight_decay = 4.5e-3
7
+ grad_clip = 0.5
8
+ auto_clip = True
9
+ auto_clip_max = 0.5
10
+ auto_clip_min = 3.333e-3
11
+ grad_clip_start_size = 100
12
+ grad_clip_max_size = 400
13
+ grad_clip_percentile = 10 #7.5 (try it at 10, tested @7.75)
14
+ max_seq_len = 1536
15
+
16
+ # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
17
+ base_batch_size = 100
18
+
19
+ batch_size = 100
20
+ gradient_accumulation_steps = 1
21
+ effective_batch_size = batch_size * gradient_accumulation_steps
22
+
23
+ always_save_checkpoint = True
24
+ eval_interval = 150
25
+ eval_iters = 8
26
+ log_interval = 1
27
+ train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
28
+
29
+ warmup_iters = 1280 # not super necessary potentially
30
+ learning_rate = 3e-4
31
+ min_lr = 2e-5
32
+ # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
33
+ max_iters = 1000000
34
+
35
+ # # # # #
36
+
37
+ warmup_iters = int(warmup_iters * (base_batch_size / effective_batch_size))
38
+ learning_rate = learning_rate * np.sqrt(effective_batch_size / base_batch_size) # with baby networks can afford to go a bit higher
39
+ max_iters = int(max_iters * (base_batch_size / effective_batch_size))
40
+ min_lr = min_lr * np.sqrt(effective_batch_size / base_batch_size) # learning_rate / 10 usually
41
+
42
+ out_dir = 'out/Mamba/50M'
43
+ eval_interval = int(eval_interval * (base_batch_size / effective_batch_size)) # keep frequent because we'll overfit
44
+ eval_iters = int(eval_iters * (base_batch_size / batch_size)) # intentionally scaled by batch_size instead of effective_batch_size
45
+ log_interval = int(math.ceil(log_interval * (base_batch_size / effective_batch_size))) # don't print too too often
46
+
47
+ print(f'warmup iters: {warmup_iters}')
48
+ print(f'Max iters: {max_iters} ({max_iters * effective_batch_size} games)')
49
+ print(f'Eval iters: {eval_iters}')
50
+ print(f'Eval interval: {eval_interval}')
51
+ print(f'Log interval: {log_interval}')
52
+
53
+ wandb_log = True
54
+ wandb_project = 'chess-mamba-v2'
55
+ wandb_run_name = 'Mamba-50M'
56
+
57
+ dataset = 'stable'
58
+
59
+ # 50??M param
60
+ model_type = 'mamba'
61
+ n_layer = 36
62
+ d_model = 512
63
+ d_state = 32
64
+ dt_rank = 'auto'
65
+ move_num_in_gamestate = False
66
+
67
+ init_from = 'resume'
68
+
69
+ device = 'cuda' # run on cpu only
70
+ compile = False # do not torch compile the model