HaileyStorm
committed on
Upload chess-mamba-vs-xformer/config/Mamba/250M.py with huggingface_hub
chess-mamba-vs-xformer/config/Mamba/250M.py
CHANGED
@@ -2,35 +2,35 @@ import numpy as np
 import math
 
 beta1 = 0.9
-beta2 = 0.
+beta2 = 0.925
 weight_decay = 4.5e-3
 grad_clip = 0.5
 auto_clip = True
 auto_clip_max = 0.5
-auto_clip_min =
+auto_clip_min = 1e-3
 grad_clip_start_size = 100
 grad_clip_max_size = 400
 grad_clip_percentile = 10 #7.5 (try it at 10, tested @7.75)
 max_seq_len = 1536
 
 # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
-base_batch_size =
+base_batch_size = 100
 
-batch_size =
-gradient_accumulation_steps =
+batch_size = 100
+gradient_accumulation_steps = 1
 effective_batch_size = batch_size * gradient_accumulation_steps
 
 always_save_checkpoint = True
-eval_interval =
-eval_iters =
-log_interval =
-train_file_update_interval =
+eval_interval = 150
+eval_iters = 8
+log_interval = 1
+train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
 
-warmup_iters =
-learning_rate = 2.
-min_lr = 1.
+warmup_iters = 1280 # not super necessary potentially
+learning_rate = 2.5e-4
+min_lr = 1.6667e-5
 # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
-max_iters =
+max_iters = 2000000
 
 # # # # #
 
@@ -51,17 +51,17 @@ print(f'Eval interval: {eval_interval}')
 print(f'Log interval: {log_interval}')
 
 wandb_log = True
-wandb_project = 'chess-mamba-
+wandb_project = 'chess-mamba-YOLO'
 wandb_run_name = 'Mamba-250M'
 
 dataset = 'stable'
 
-#
+# 250??M param
 model_type = 'mamba'
-n_layer =
-d_model =
-d_state =
-dt_rank =
+n_layer = 36
+d_model = 928
+d_state = 48
+dt_rank = 'auto'
 move_num_in_gamestate = False
 
 init_from = 'scratch'