HaileyStorm committed (verified)
Commit 062c52f · 1 Parent(s): 3ef6e42

Upload chess-mamba-vs-xformer/config/Mamba/250M.py with huggingface_hub

chess-mamba-vs-xformer/config/Mamba/250M.py CHANGED
@@ -2,35 +2,35 @@ import numpy as np
 import math
 
 beta1 = 0.9
-beta2 = 0.95
+beta2 = 0.925
 weight_decay = 4.5e-3
 grad_clip = 0.5
 auto_clip = True
 auto_clip_max = 0.5
-auto_clip_min = 3.333e-3
+auto_clip_min = 1e-3
 grad_clip_start_size = 100
 grad_clip_max_size = 400
 grad_clip_percentile = 10 #7.5 (try it at 10, tested @7.75)
 max_seq_len = 1536
 
 # batch size below values are based on this. When actual batch size adjusted, the below are adjusted automatically
-base_batch_size = 256
+base_batch_size = 100
 
-batch_size = 10
-gradient_accumulation_steps = 10
+batch_size = 100
+gradient_accumulation_steps = 1
 effective_batch_size = batch_size * gradient_accumulation_steps
 
 always_save_checkpoint = True
-eval_interval = 300
-eval_iters = 33
-log_interval = 75
-train_file_update_interval = 10 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
+eval_interval = 150
+eval_iters = 8
+log_interval = 1
+train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
 
-warmup_iters = 500 # not super necessary potentially
-learning_rate = 2.0e-3 # tested 1.5e-3 from 112k-156k, before that 3.5e-3 #8e-3
-min_lr = 1.3333e-4
+warmup_iters = 1280 # not super necessary potentially
+learning_rate = 2.5e-4
+min_lr = 1.6667e-5
 # max_iters is for auto-stopping end of stable phase. Reported %complete progress is wrt this (that is, % complete doesn't include anneal).
-max_iters = 400000 #~=102M games
+max_iters = 2000000
 
 # # # # #
 
@@ -51,17 +51,17 @@ print(f'Eval interval: {eval_interval}')
 print(f'Log interval: {log_interval}')
 
 wandb_log = True
-wandb_project = 'chess-mamba-v2'
+wandb_project = 'chess-mamba-YOLO'
 wandb_run_name = 'Mamba-250M'
 
 dataset = 'stable'
 
-# 251M param
+# 250??M param
 model_type = 'mamba'
-n_layer = 96
-d_model = 578
-d_state = 56
-dt_rank = 176
+n_layer = 36
+d_model = 928
+d_state = 48
+dt_rank = 'auto'
 move_num_in_gamestate = False
 
 init_from = 'scratch'
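
The base_batch_size comment says the values below it are tuned for that batch size and are adjusted automatically when the actual batch size changes. The training script is not part of this commit, so the following is only a minimal sketch of one common way such an adjustment could work (scaling the learning rate linearly with the effective/base ratio and the iteration-based intervals inversely); the function name adjust_for_batch_size and the choice of which fields to scale are illustrative assumptions, not taken from the repo.

# Hypothetical illustration of the "adjusted automatically" comment in the config.
# Assumes values tuned at base_batch_size are rescaled by effective/base; the
# actual logic in the repo's train.py may differ.
def adjust_for_batch_size(cfg: dict) -> dict:
    scale = (cfg['batch_size'] * cfg['gradient_accumulation_steps']) / cfg['base_batch_size']
    out = dict(cfg)
    out['learning_rate'] = cfg['learning_rate'] * scale                 # linear LR scaling (assumption)
    out['min_lr'] = cfg['min_lr'] * scale
    out['eval_interval'] = max(1, round(cfg['eval_interval'] / scale))  # keep data seen between evals constant
    out['log_interval'] = max(1, round(cfg['log_interval'] / scale))
    out['max_iters'] = round(cfg['max_iters'] / scale)                  # keep total games seen constant
    return out

cfg = dict(base_batch_size=100, batch_size=100, gradient_accumulation_steps=1,
           learning_rate=2.5e-4, min_lr=1.6667e-5,
           eval_interval=150, log_interval=1, max_iters=2_000_000)
print(adjust_for_batch_size(cfg))  # scale == 1.0 for the new config, so nothing changes

With the committed values the effective batch size equals base_batch_size, so this hypothetical adjustment would be a no-op.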
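The learning-rate settings together with the max_iters comment ("auto-stopping end of stable phase", with an anneal that is not counted in the reported % complete) read like a warmup, then stable, then anneal schedule. Below is a rough sketch under that assumption; the anneal_iters knob and the linear warmup/anneal shapes are my additions, not values from the config.

warmup_iters = 1280
learning_rate = 2.5e-4
min_lr = 1.6667e-5
max_iters = 2_000_000

def get_lr(it: int, anneal_iters: int = 100_000) -> float:
    """Hypothetical warmup/stable/anneal schedule matching the config comments."""
    if it < warmup_iters:                       # linear warmup to the peak LR
        return learning_rate * (it + 1) / warmup_iters
    if it < max_iters:                          # stable phase at the peak LR
        return learning_rate
    # anneal phase, run after the stable phase "auto-stops" at max_iters
    frac = min((it - max_iters) / anneal_iters, 1.0)
    return learning_rate + frac * (min_lr - learning_rate)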
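The old model shape carries a "# 251M param" comment and the new one "# 250??M param". A back-of-the-envelope estimate can be derived from n_layer, d_model, d_state, and dt_rank if one assumes a standard Mamba block layout (expand factor 2, depthwise conv width 4, dt_rank='auto' resolving to ceil(d_model/16)) and ignores the embedding and LM head; the repo's actual block may differ, so treat these numbers as approximations only.

import math

def mamba_block_params(n_layer, d_model, d_state, dt_rank, expand=2, d_conv=4):
    """Rough Mamba parameter estimate (blocks only; embeddings and LM head excluded)."""
    if dt_rank == 'auto':
        dt_rank = math.ceil(d_model / 16)        # common convention for 'auto'
    d_inner = expand * d_model
    per_layer = (
        d_model * 2 * d_inner                    # in_proj (x and gate branches)
        + d_inner * d_conv + d_inner             # depthwise conv1d weight + bias
        + d_inner * (dt_rank + 2 * d_state)      # x_proj -> (dt, B, C)
        + dt_rank * d_inner + d_inner            # dt_proj weight + bias
        + d_inner * d_state                      # A_log
        + d_inner                                # D
        + d_inner * d_model                      # out_proj
        + d_model                                # per-block norm
    )
    return n_layer * per_layer

print(f"old: {mamba_block_params(96, 578, 56, 176) / 1e6:.0f}M")     # ~251M
print(f"new: {mamba_block_params(36, 928, 48, 'auto') / 1e6:.0f}M")  # ~204M

Under these assumptions the removed 96-layer, d_model=578 shape lands near the 251M noted in its comment, while the new 36-layer, d_model=928 shape comes out somewhat lower, which may be why the new comment hedges with question marks.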