HaileyStorm committed
Commit 9be6028 · Parent: a4a8fea
Upload chess-mamba-vs-xformer/config/Mamba/50m.py with huggingface_hub
chess-mamba-vs-xformer/config/Mamba/50m.py ADDED
@@ -0,0 +1,70 @@
+import numpy as np
+import math
+
+beta1 = 0.9
+beta2 = 0.95
+weight_decay = 4.5e-3
+grad_clip = 0.5
+auto_clip = True
+auto_clip_max = 0.5
+auto_clip_min = 3.333e-3
+grad_clip_start_size = 100
+grad_clip_max_size = 400
+grad_clip_percentile = 10  # 7.5 (try it at 10, tested @7.75)
+max_seq_len = 1536
+
+# Batch-size-dependent values below are based on this; when the actual batch size is adjusted, they are rescaled automatically.
+base_batch_size = 100
+
+batch_size = 100
+gradient_accumulation_steps = 1
+effective_batch_size = batch_size * gradient_accumulation_steps
+
+always_save_checkpoint = True
+eval_interval = 150
+eval_iters = 8
+log_interval = 1
+train_file_update_interval = 1  # 23 was the original ... 7 definitely crashes (maybe try 10 on Lambda)
+
+warmup_iters = 1280  # possibly not strictly necessary
+learning_rate = 3e-4
+min_lr = 2e-5
+# max_iters is for auto-stopping at the end of the stable phase. Reported %-complete progress is w.r.t. this (i.e., % complete doesn't include the anneal).
+max_iters = 1000000
+
+# # # # #
+
+warmup_iters = int(warmup_iters * (base_batch_size / effective_batch_size))
+learning_rate = learning_rate * np.sqrt(effective_batch_size / base_batch_size)  # with baby networks we can afford to go a bit higher
+max_iters = int(max_iters * (base_batch_size / effective_batch_size))
+min_lr = min_lr * np.sqrt(effective_batch_size / base_batch_size)  # learning_rate / 10 usually
+
+out_dir = 'out/Mamba/50M'
+eval_interval = int(eval_interval * (base_batch_size / effective_batch_size))  # keep frequent because we'll overfit
+eval_iters = int(eval_iters * (base_batch_size / batch_size))  # intentionally scaled by batch_size instead of effective_batch_size
+log_interval = int(math.ceil(log_interval * (base_batch_size / effective_batch_size)))  # don't print too often
+
+print(f'warmup iters: {warmup_iters}')
+print(f'Max iters: {max_iters} ({max_iters * effective_batch_size} games)')
+print(f'Eval iters: {eval_iters}')
+print(f'Eval interval: {eval_interval}')
+print(f'Log interval: {log_interval}')
+
+wandb_log = True
+wandb_project = 'chess-mamba-v2'
+wandb_run_name = 'Mamba-50M'
+
+dataset = 'stable'
+
+# ~50M params
+model_type = 'mamba'
+n_layer = 36
+d_model = 512
+d_state = 32
+dt_rank = 'auto'
+move_num_in_gamestate = False
+
+init_from = 'resume'
+
+device = 'cuda'  # 'cpu' to run on CPU only
+compile = False  # do not torch.compile the model
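
The rescaling block between the `# # # # #` marker and the print statements keeps the schedule consistent when the batch size changes: iteration counts (warmup_iters, max_iters, eval_interval, log_interval) scale inversely with the effective batch size, while the learning rates scale with its square root. With the values actually set in this config (batch_size = 100, gradient_accumulation_steps = 1) every factor is 1 and nothing changes. A minimal standalone sketch of the same arithmetic, using a hypothetical batch_size of 400 purely for illustration:

import numpy as np

# Reference values copied from the config above.
base_batch_size = 100
warmup_iters, max_iters = 1280, 1_000_000
learning_rate, min_lr = 3e-4, 2e-5

# Hypothetical larger run (not in the config): batch_size 400, no accumulation.
batch_size, gradient_accumulation_steps = 400, 1
effective_batch_size = batch_size * gradient_accumulation_steps

scale = base_batch_size / effective_batch_size              # 0.25
lr_scale = np.sqrt(effective_batch_size / base_batch_size)  # 2.0

warmup_iters = int(warmup_iters * scale)   # 1280 -> 320
max_iters = int(max_iters * scale)         # 1000000 -> 250000
learning_rate = learning_rate * lr_scale   # 3e-4 -> 6e-4
min_lr = min_lr * lr_scale                 # 2e-5 -> 4e-5

print(warmup_iters, max_iters, learning_rate, min_lr)
# 320 250000 0.0006 4e-05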
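
The auto_clip settings near the top (auto_clip, auto_clip_max, auto_clip_min, grad_clip_start_size, grad_clip_max_size, grad_clip_percentile) configure adaptive gradient clipping, but the training loop that consumes them is not part of this commit. The sketch below is a generic percentile-based scheme that matches what those keys suggest: track a rolling window of recent gradient norms and clip at a low percentile of that window, bounded between auto_clip_min and auto_clip_max. The class name and wiring are assumptions for illustration, not the repository's actual implementation.

from collections import deque

import numpy as np
import torch

class AutoClip:
    """Hypothetical percentile-based gradient clipping (illustration only)."""

    def __init__(self, percentile=10, start_size=100, max_size=400,
                 clip_min=3.333e-3, clip_max=0.5, static_clip=0.5):
        self.norms = deque(maxlen=max_size)  # rolling window of recent grad norms
        self.percentile = percentile
        self.start_size = start_size         # fall back to static_clip until warmed up
        self.clip_min, self.clip_max = clip_min, clip_max
        self.static_clip = static_clip       # the plain grad_clip value

    def __call__(self, model):
        # Record the current global grad norm; max_norm=inf performs no clipping here.
        norm = float(torch.nn.utils.clip_grad_norm_(model.parameters(), float('inf')))
        self.norms.append(norm)
        if len(self.norms) < self.start_size:
            threshold = self.static_clip
        else:
            threshold = float(np.clip(np.percentile(self.norms, self.percentile),
                                      self.clip_min, self.clip_max))
        # Clip to the chosen threshold before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), threshold)
        return threshold

# Usage sketch: clipper = AutoClip(); ... loss.backward(); clipper(model); optimizer.step()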