HaileyStorm
committed on
Upload chess-mamba-vs-xformer/config/Mamba/250M.py with huggingface_hub
chess-mamba-vs-xformer/config/Mamba/250M.py
CHANGED
@@ -2,33 +2,33 @@ import numpy as np
 import math
 
 beta1 = 0.9
-beta2 = 0.925
-weight_decay = 4.5e-3
+beta2 = 0.905 #0.9125 # 0.925
+weight_decay = 1e-4 #1.25e-4 # 4.5e-3
 grad_clip = 0.5
 auto_clip = True
-auto_clip_max = 0.
+auto_clip_max = 0.1
 auto_clip_min = 1e-3
 grad_clip_start_size = 100
 grad_clip_max_size = 400
-grad_clip_percentile =
+grad_clip_percentile = 9 #9.25 # 10
 max_seq_len = 1536
 
 # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
 base_batch_size = 100
 
-batch_size =
-gradient_accumulation_steps =
+batch_size = 18
+gradient_accumulation_steps = 8
 effective_batch_size = batch_size * gradient_accumulation_steps
 
 always_save_checkpoint = True
-eval_interval =
-eval_iters = 8
-log_interval =
+eval_interval = 420 #500
+eval_iters = 8.0 # 7.5
+log_interval = 2
 train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
 
 warmup_iters = 1280 # not super necessary potentially
-learning_rate = 2.5e-4
-min_lr = 1.
+learning_rate = 1.5e-4 # 1.75e-4 # 2.5e-4
+min_lr = 1e-5 # 1.16667e-5
 # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
 max_iters = 2000000
 
@@ -52,19 +52,19 @@ print(f'Log interval: {log_interval}')
 
 wandb_log = True
 wandb_project = 'chess-mamba-YOLO'
-wandb_run_name = 'Mamba-
+wandb_run_name = 'Mamba-280M'
 
-dataset = '
+dataset = 'stable2'
 
-#
+# 279.8M param
 model_type = 'mamba'
-n_layer =
-d_model =
-d_state =
-dt_rank = 'auto'
+n_layer = 40
+d_model = 1024
+d_state = 64
+dt_rank = 72 #'auto'
 move_num_in_gamestate = False
 
-init_from = '
+init_from = 'resume'
 
 device = 'cuda' # run on cpu only
 compile = False # do not torch compile the model