HaileyStorm committed on
Commit
d82f87e
·
verified ·
1 Parent(s): b79e9b5

Upload chess-mamba-vs-xformer/config/Mamba/250M.py with huggingface_hub

Browse files
chess-mamba-vs-xformer/config/Mamba/250M.py CHANGED
@@ -2,33 +2,33 @@ import numpy as np
2
  import math
3
 
4
  beta1 = 0.9
5
- beta2 = 0.925
6
- weight_decay = 4.5e-3
7
  grad_clip = 0.5
8
  auto_clip = True
9
- auto_clip_max = 0.5
10
  auto_clip_min = 1e-3
11
  grad_clip_start_size = 100
12
  grad_clip_max_size = 400
13
- grad_clip_percentile = 10 #7.5 (try it at 10, tested @7.75)
14
  max_seq_len = 1536
15
 
16
  # The batch-size-dependent values below are based on this base batch size. When the actual batch size is changed, they are adjusted automatically.
17
  base_batch_size = 100
18
 
19
- batch_size = 100
20
- gradient_accumulation_steps = 1
21
  effective_batch_size = batch_size * gradient_accumulation_steps
22
 
23
  always_save_checkpoint = True
24
- eval_interval = 150
25
- eval_iters = 8
26
- log_interval = 1
27
  train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
28
 
29
  warmup_iters = 1280 # not super necessary potentially
30
- learning_rate = 2.5e-4
31
- min_lr = 1.6667e-5
32
  # max_iters auto-stops training at the end of the stable phase. The reported % complete is relative to this value (that is, it does not include the anneal).
33
  max_iters = 2000000
34
 
@@ -52,19 +52,19 @@ print(f'Log interval: {log_interval}')
52
 
53
  wandb_log = True
54
  wandb_project = 'chess-mamba-YOLO'
55
- wandb_run_name = 'Mamba-250M'
56
 
57
- dataset = 'stable'
58
 
59
- # 250??M param
60
  model_type = 'mamba'
61
- n_layer = 36
62
- d_model = 928
63
- d_state = 48
64
- dt_rank = 'auto'
65
  move_num_in_gamestate = False
66
 
67
- init_from = 'scratch'
68
 
69
  device = 'cuda' # run on the CUDA device (set to 'cpu' to run on CPU only)
70
  compile = False # do not torch compile the model
 
2
  import math
3
 
4
  beta1 = 0.9
5
+ beta2 = 0.905 #0.9125 # 0.925
6
+ weight_decay = 1e-4 #1.25e-4 # 4.5e-3
7
  grad_clip = 0.5
8
  auto_clip = True
9
+ auto_clip_max = 0.1
10
  auto_clip_min = 1e-3
11
  grad_clip_start_size = 100
12
  grad_clip_max_size = 400
13
+ grad_clip_percentile = 9 #9.25 # 10
14
  max_seq_len = 1536
15
 
16
  # The batch-size-dependent values below are based on this base batch size. When the actual batch size is changed, they are adjusted automatically.
17
  base_batch_size = 100
18
 
19
+ batch_size = 18
20
+ gradient_accumulation_steps = 8
21
  effective_batch_size = batch_size * gradient_accumulation_steps
22
 
23
  always_save_checkpoint = True
24
+ eval_interval = 420 #500
25
+ eval_iters = 8.0 # 7.5
26
+ log_interval = 2
27
  train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
28
 
29
  warmup_iters = 1280 # not super necessary potentially
30
+ learning_rate = 1.5e-4 # 1.75e-4 # 2.5e-4
31
+ min_lr = 1e-5 # 1.16667e-5
32
  # max_iters auto-stops training at the end of the stable phase. The reported % complete is relative to this value (that is, it does not include the anneal).
33
  max_iters = 2000000
34
 
 
52
 
53
  wandb_log = True
54
  wandb_project = 'chess-mamba-YOLO'
55
+ wandb_run_name = 'Mamba-280M'
56
 
57
+ dataset = 'stable2'
58
 
59
+ # 279.8M param
60
  model_type = 'mamba'
61
+ n_layer = 40
62
+ d_model = 1024
63
+ d_state = 64
64
+ dt_rank = 72 #'auto'
65
  move_num_in_gamestate = False
66
 
67
+ init_from = 'resume'
68
 
69
  device = 'cuda' # run on the CUDA device (set to 'cpu' to run on CPU only)
70
  compile = False # do not torch compile the model