HaileyStorm committed
Commit a56e045
1 Parent(s): 9be6028

Delete chess-mamba-vs-xformer/config/Mamba/50M.py

chess-mamba-vs-xformer/config/Mamba/50M.py DELETED
@@ -1,70 +0,0 @@
- import numpy as np
- import math
-
- beta1 = 0.9
- beta2 = 0.95
- weight_decay = 4.5e-3
- grad_clip = 0.5
- auto_clip = True
- auto_clip_max = 0.5
- auto_clip_min = 3.333e-3
- grad_clip_start_size = 100
- grad_clip_max_size = 400
- grad_clip_percentile = 10 #7.5 (try it at 10, tested @7.75)
- max_seq_len = 1536
-
- # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
- base_batch_size = 100
-
- batch_size = 100
- gradient_accumulation_steps = 1
- effective_batch_size = batch_size * gradient_accumulation_steps
-
- always_save_checkpoint = True
- eval_interval = 150
- eval_iters = 8
- log_interval = 1
- train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
-
- warmup_iters = 1280 # not super necessary potentially
- learning_rate = 3e-4
- min_lr = 2e-5
- # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
- max_iters = 1000000
-
- # # # # #
-
- warmup_iters = int(warmup_iters * (base_batch_size / effective_batch_size))
- learning_rate = learning_rate * np.sqrt(effective_batch_size / base_batch_size) # with baby networks can afford to go a bit higher
- max_iters = int(max_iters * (base_batch_size / effective_batch_size))
- min_lr = min_lr * np.sqrt(effective_batch_size / base_batch_size) # learning_rate / 10 usually
-
- out_dir = 'out/Mamba/50M'
- eval_interval = int(eval_interval * (base_batch_size / effective_batch_size)) # keep frequent because we'll overfit
- eval_iters = int(eval_iters * (base_batch_size / batch_size)) # intentionally scaled by batch_size instead of effective_batch_size
- log_interval = int(math.ceil(log_interval * (base_batch_size / effective_batch_size))) # don't print too too often
-
- print(f'warmup iters: {warmup_iters}')
- print(f'Max iters: {max_iters} ({max_iters * effective_batch_size} games)')
- print(f'Eval iters: {eval_iters}')
- print(f'Eval interval: {eval_interval}')
- print(f'Log interval: {log_interval}')
-
- wandb_log = True
- wandb_project = 'chess-mamba-v2'
- wandb_run_name = 'Mamba-50M'
-
- dataset = 'stable'
-
- # 50??M param
- model_type = 'mamba'
- n_layer = 36
- d_model = 512
- d_state = 32
- dt_rank = 'auto'
- move_num_in_gamestate = False
-
- init_from = 'resume'
-
- device = 'cuda' # 'cuda' to train on the GPU; set to 'cpu' to run on CPU only
- compile = False # do not torch compile the model
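
For context, the deleted config scaled its schedule hyperparameters relative to `base_batch_size`, so the same file could be reused at other batch sizes: iteration counts shrink linearly with the effective batch size, while learning rates grow with the square root of the ratio. A minimal sketch of that arithmetic, using the file's reference values but a hypothetical `effective_batch_size` of 400 (not from the original file) to make the scaling visible:

```python
import math
import numpy as np

# Reference values from the deleted config.
base_batch_size = 100
warmup_iters, max_iters = 1280, 1000000
learning_rate, min_lr = 3e-4, 2e-5
eval_interval, log_interval = 150, 1

# Hypothetical larger run: 4x the base batch size.
effective_batch_size = 400

# Iteration counts scale down linearly, so roughly the same number of games is seen...
warmup_iters = int(warmup_iters * (base_batch_size / effective_batch_size))    # 320
max_iters = int(max_iters * (base_batch_size / effective_batch_size))          # 250000
eval_interval = int(eval_interval * (base_batch_size / effective_batch_size))  # 37
log_interval = int(math.ceil(log_interval * (base_batch_size / effective_batch_size)))  # 1

# ...while learning rates scale up with the square root of the batch-size ratio.
learning_rate = learning_rate * np.sqrt(effective_batch_size / base_batch_size)  # 6e-4
min_lr = min_lr * np.sqrt(effective_batch_size / base_batch_size)                # 4e-5
```

With the values actually in the file (batch_size = 100, gradient_accumulation_steps = 1), the effective batch size equals the base, so every scale factor is 1 and the adjustments are no-ops.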