sophia_opt.SophiaG
Browse files
- scripts/pretrain-model.yaml +22 -11
- scripts/requirements.in +2 -0
scripts/pretrain-model.yaml
CHANGED
@@ -115,21 +115,32 @@ eval:
|
|
115 |
final_validation: true
|
116 |
|
117 |
# Optimizer-related arguments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
optimizer:
|
119 |
-
|
120 |
-
class_path: grokadamw.GrokAdamW
|
121 |
|
122 |
init_args:
|
123 |
-
|
124 |
-
lr: 4e-04
|
125 |
-
|
126 |
-
# (type: float, default: 0.01)
|
127 |
-
weight_decay: 0.1
|
128 |
-
|
129 |
-
# (type: tuple, default: (0.9,0.999))
|
130 |
betas:
|
131 |
-
- 0.9
|
132 |
-
- 0.95
|
|
|
|
|
133 |
|
134 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
135 |
devices: auto
|
|
|
115 |
final_validation: true
|
116 |
|
117 |
# Optimizer-related arguments
|
118 |
+
# optimizer:
|
119 |
+
# # class_path: torch.optim.AdamW
|
120 |
+
# class_path: grokadamw.GrokAdamW
|
121 |
+
#
|
122 |
+
# init_args:
|
123 |
+
# # (type: float, default: 0.001)
|
124 |
+
# lr: 4e-04
|
125 |
+
#
|
126 |
+
# # (type: float, default: 0.01)
|
127 |
+
# weight_decay: 0.1
|
128 |
+
#
|
129 |
+
# # (type: tuple, default: (0.9,0.999))
|
130 |
+
# betas:
|
131 |
+
# - 0.9
|
132 |
+
# - 0.95
|
133 |
+
|
134 |
optimizer:
|
135 |
+
class_path: sophia_opt.SophiaG
|
|
|
136 |
|
137 |
init_args:
|
138 |
+
lr: 2e-4
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
betas:
|
140 |
+
- 0.965
|
141 |
+
- 0.99
|
142 |
+
rho: 0.01
|
143 |
+
weight_decay: 1e-1
|
144 |
|
145 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
146 |
devices: auto
|
scripts/requirements.in
CHANGED
@@ -17,3 +17,5 @@ grokadamw
|
|
17 |
# pyzstd
|
18 |
# zstd
|
19 |
Pillow
|
|
|
|
|
|
17 |
# pyzstd
|
18 |
# zstd
|
19 |
Pillow
|
20 |
+
|
21 |
+
sophia-opt
|