HaileyStorm
committed on
Commit • a56e045
1 Parent(s): 9be6028
Delete chess-mamba-vs-xformer/config/Mamba/50M.py
chess-mamba-vs-xformer/config/Mamba/50M.py
DELETED
@@ -1,70 +0,0 @@
-import numpy as np
-import math
-
-beta1 = 0.9
-beta2 = 0.95
-weight_decay = 4.5e-3
-grad_clip = 0.5
-auto_clip = True
-auto_clip_max = 0.5
-auto_clip_min = 3.333e-3
-grad_clip_start_size = 100
-grad_clip_max_size = 400
-grad_clip_percentile = 10  # 7.5 (try it at 10, tested @7.75)
-max_seq_len = 1536
-
-# The batch-size-dependent values below are based on this. When the actual batch size is adjusted, they are adjusted automatically.
-base_batch_size = 100
-
-batch_size = 100
-gradient_accumulation_steps = 1
-effective_batch_size = batch_size * gradient_accumulation_steps
-
-always_save_checkpoint = True
-eval_interval = 150
-eval_iters = 8
-log_interval = 1
-train_file_update_interval = 1  # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
-
-warmup_iters = 1280  # not super necessary potentially
-learning_rate = 3e-4
-min_lr = 2e-5
-# max_iters is for auto-stopping at the end of the stable phase. Reported % complete progress is w.r.t. this (i.e., % complete doesn't include the anneal).
-max_iters = 1000000
-
-# # # # #
-
-warmup_iters = int(warmup_iters * (base_batch_size / effective_batch_size))
-learning_rate = learning_rate * np.sqrt(effective_batch_size / base_batch_size)  # with baby networks we can afford to go a bit higher
-max_iters = int(max_iters * (base_batch_size / effective_batch_size))
-min_lr = min_lr * np.sqrt(effective_batch_size / base_batch_size)  # learning_rate / 10 usually
-
-out_dir = 'out/Mamba/50M'
-eval_interval = int(eval_interval * (base_batch_size / effective_batch_size))  # keep frequent because we'll overfit
-eval_iters = int(eval_iters * (base_batch_size / batch_size))  # intentionally scaled by batch_size instead of effective_batch_size
-log_interval = int(math.ceil(log_interval * (base_batch_size / effective_batch_size)))  # don't print too often
-
-print(f'warmup iters: {warmup_iters}')
-print(f'Max iters: {max_iters} ({max_iters * effective_batch_size} games)')
-print(f'Eval iters: {eval_iters}')
-print(f'Eval interval: {eval_interval}')
-print(f'Log interval: {log_interval}')
-
-wandb_log = True
-wandb_project = 'chess-mamba-v2'
-wandb_run_name = 'Mamba-50M'
-
-dataset = 'stable'
-
-# 50??M param
-model_type = 'mamba'
-n_layer = 36
-d_model = 512
-d_state = 32
-dt_rank = 'auto'
-move_num_in_gamestate = False
-
-init_from = 'resume'
-
-device = 'cuda'  # set to 'cpu' to run on cpu only
-compile = False  # do not torch.compile the model