ZhangShenao committed
Commit 24bfe31 · verified · 1 Parent(s): d528f1c

Model save

README.md CHANGED
@@ -38,18 +38,18 @@ More information needed
 
  The following hyperparameters were used during training:
  - learning_rate: 2e-05
- - train_batch_size: 8
+ - train_batch_size: 4
  - eval_batch_size: 2
  - seed: 42
  - distributed_type: multi-GPU
- - num_devices: 3
- - gradient_accumulation_steps: 4
- - total_train_batch_size: 96
- - total_eval_batch_size: 6
+ - num_devices: 2
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 4
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_ratio: 0.1
- - num_epochs: 5
+ - num_epochs: 7
 
  ### Training results
 
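A quick cross-check of the changed hyperparameters: the total train batch size is the per-device batch size times the gradient accumulation steps times the number of devices, which is consistent with both the old and new values above. A minimal sketch (the helper name is illustrative, not from the training script):

```python
# Effective (total) train batch size =
#   per-device batch size x gradient accumulation steps x number of devices.
def total_train_batch_size(per_device: int, grad_accum: int, num_devices: int) -> int:
    return per_device * grad_accum * num_devices

assert total_train_batch_size(8, 4, 3) == 96  # previous configuration
assert total_train_batch_size(4, 8, 2) == 64  # new configuration
```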
all_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 4.615384615384615,
- "total_flos": 9343567134720.0,
- "train_loss": 1.0503198325634002,
- "train_runtime": 473.9669,
+ "epoch": 6.461538461538462,
+ "total_flos": 13164745850880.0,
+ "train_loss": 0.6425543285551525,
+ "train_runtime": 766.0702,
  "train_samples": 7473,
- "train_samples_per_second": 6.519,
- "train_steps_per_second": 0.063
+ "train_samples_per_second": 5.647,
+ "train_steps_per_second": 0.082
  }
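As a rough sanity check on these aggregates, train_steps_per_second matches the final global step divided by train_runtime (the step counts of 30 and 63 are taken from trainer_state.json below); a small sketch, assuming that is how the metric is derived:

```python
# train_steps_per_second ~= global_step / train_runtime (seconds), rounded to 3 decimals.
print(round(30 / 473.9669, 3))  # 0.063 -> previous run
print(round(63 / 766.0702, 3))  # 0.082 -> this run
```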
final_checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ca64b968a06f69bcc06bd2d890b91670134b547700a758c879e4088eee568d1c
+ oid sha256:f2318c52462a65d896bc745ea3dc20db5d92d18d1fe3c1f7a5e98cf811fec084
  size 4976698672
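The entries above and below are Git LFS pointer files (spec version, sha256 oid, byte size), not the shard contents themselves. A minimal sketch for verifying a downloaded shard against its pointer, assuming the file has been pulled to a local path (the path shown is illustrative):

```python
import hashlib
import os

def verify_lfs_object(path: str, expected_sha256: str, expected_size: int) -> bool:
    """Check a downloaded file against the oid and size from its LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256

# oid and size taken from the new pointer above; the local path is hypothetical.
print(verify_lfs_object(
    "final_checkpoint/model-00001-of-00004.safetensors",
    "f2318c52462a65d896bc745ea3dc20db5d92d18d1fe3c1f7a5e98cf811fec084",
    4976698672,
))
```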
final_checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9a8694067ac64052128f791d7ff03286ca98766b71ff57ce6ed9aa134a1b50d6
+ oid sha256:5ba206f098d4bd3d54e9a1efd9f08c461edbc7eb794c4a0025d5bf8ba8e5dafc
  size 4999802720
final_checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e4816d8cdb34b70df0c6b8eeab67571315b017bc2eac9e1ed00543d94bf3dda7
+ oid sha256:b1e0c6855ff9e939027d8b2dc19384cd8cd5503d89622d4a841d40f2ad5719ca
  size 4915916176
final_checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:85a008e424c01a32b5a5e132ddf3ba1106e787cace0b85304b8f2e9b0633df6c
+ oid sha256:969fc7245d7821e96253fb3f655605605c1c0f17b002e6894fffdcde65345a83
  size 1168138808
final_checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c33a19196bfe2bad2613a328bf8b5d72e314960f54e408ef1737430f48f21b92
+ oid sha256:f7b36c9a7dd8915e25f0506dc589443d0eda498aaac22f0528d243752246a0c1
  size 7608
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ca64b968a06f69bcc06bd2d890b91670134b547700a758c879e4088eee568d1c
+ oid sha256:f2318c52462a65d896bc745ea3dc20db5d92d18d1fe3c1f7a5e98cf811fec084
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9a8694067ac64052128f791d7ff03286ca98766b71ff57ce6ed9aa134a1b50d6
+ oid sha256:5ba206f098d4bd3d54e9a1efd9f08c461edbc7eb794c4a0025d5bf8ba8e5dafc
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e4816d8cdb34b70df0c6b8eeab67571315b017bc2eac9e1ed00543d94bf3dda7
+ oid sha256:b1e0c6855ff9e939027d8b2dc19384cd8cd5503d89622d4a841d40f2ad5719ca
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:85a008e424c01a32b5a5e132ddf3ba1106e787cace0b85304b8f2e9b0633df6c
+ oid sha256:969fc7245d7821e96253fb3f655605605c1c0f17b002e6894fffdcde65345a83
  size 1168138808
runs/Jan07_07-47-21_n124-173-214/events.out.tfevents.1736236065.n124-173-214.79071.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c378998777bee2ecb1f4c7054cc3029610471c66555f0e91b14ac3c56344ff1b
+ size 5865
runs/Jan07_07-49-44_n124-173-214/events.out.tfevents.1736236207.n124-173-214.81356.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:178d9ced32b4ab7a0adb6268525779d59f52f3e594c7853d19d42cbbdace9c44
+ size 5658
runs/Jan07_07-51-24_n124-173-214/events.out.tfevents.1736236306.n124-173-214.83171.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bc3ae23e7444a7e21841fe9ddb6561cd22875cefb22b33d8ab2cafebc769e0d
+ size 19047
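The three event files added under runs/ hold the TensorBoard scalars logged during training. A sketch for reading them with the tensorboard package; the scalar tag name ("train/loss") is an assumption about how the Trainer logged, so the available tags are listed first:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# One of the run directories added in this commit.
acc = EventAccumulator("runs/Jan07_07-51-24_n124-173-214")
acc.Reload()

print(acc.Tags()["scalars"])             # see which scalar tags are actually present
for event in acc.Scalars("train/loss"):  # assumed tag name
    print(event.step, event.value)
```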
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 4.615384615384615,
- "total_flos": 9343567134720.0,
- "train_loss": 1.0503198325634002,
- "train_runtime": 473.9669,
+ "epoch": 6.461538461538462,
+ "total_flos": 13164745850880.0,
+ "train_loss": 0.6425543285551525,
+ "train_runtime": 766.0702,
  "train_samples": 7473,
- "train_samples_per_second": 6.519,
- "train_steps_per_second": 0.063
+ "train_samples_per_second": 5.647,
+ "train_steps_per_second": 0.082
  }
trainer_state.json CHANGED
@@ -1,237 +1,468 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.615384615384615,
5
  "eval_steps": 500,
6
- "global_step": 30,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.15384615384615385,
13
- "grad_norm": 45.207840048303645,
14
- "learning_rate": 6.666666666666667e-06,
15
- "loss": 1.7659,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.3076923076923077,
20
- "grad_norm": 45.675403154272956,
21
- "learning_rate": 1.3333333333333333e-05,
22
- "loss": 1.7642,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.46153846153846156,
27
- "grad_norm": 20.868641882690532,
28
- "learning_rate": 2e-05,
29
- "loss": 1.3575,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.6153846153846154,
34
- "grad_norm": 34.06850705215301,
35
- "learning_rate": 1.9932383577419432e-05,
36
- "loss": 1.9345,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.7692307692307693,
41
- "grad_norm": 37.41867508840468,
42
- "learning_rate": 1.973044870579824e-05,
43
- "loss": 1.6787,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.9230769230769231,
48
- "grad_norm": 53.46088058831564,
49
- "learning_rate": 1.9396926207859085e-05,
50
- "loss": 1.488,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 1.0769230769230769,
55
- "grad_norm": 7.4627580332441275,
56
- "learning_rate": 1.8936326403234125e-05,
57
- "loss": 1.2373,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 1.2307692307692308,
62
- "grad_norm": 3.9518851165618223,
63
- "learning_rate": 1.8354878114129368e-05,
64
- "loss": 1.0725,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 1.3846153846153846,
69
- "grad_norm": 7.177914493789473,
70
- "learning_rate": 1.766044443118978e-05,
71
- "loss": 1.0964,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 1.5384615384615383,
76
- "grad_norm": 5.589669105212754,
77
- "learning_rate": 1.686241637868734e-05,
78
- "loss": 1.0627,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 1.6923076923076923,
83
- "grad_norm": 5.081123298359917,
84
- "learning_rate": 1.5971585917027864e-05,
85
- "loss": 1.0228,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 1.8461538461538463,
90
- "grad_norm": 5.35114385581476,
91
- "learning_rate": 1.5000000000000002e-05,
92
- "loss": 1.0027,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 2.0,
97
- "grad_norm": 1.8251196525450355,
98
- "learning_rate": 1.396079766039157e-05,
99
- "loss": 0.9503,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 2.1538461538461537,
104
- "grad_norm": 2.8409999360336515,
105
- "learning_rate": 1.2868032327110904e-05,
106
- "loss": 0.9229,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 2.3076923076923075,
111
- "grad_norm": 1.5687190110468443,
112
- "learning_rate": 1.1736481776669307e-05,
113
- "loss": 0.9072,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 2.4615384615384617,
118
- "grad_norm": 2.0183470286877503,
119
- "learning_rate": 1.0581448289104759e-05,
120
- "loss": 0.9059,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 2.6153846153846154,
125
- "grad_norm": 1.5424054889975665,
126
- "learning_rate": 9.418551710895243e-06,
127
- "loss": 0.8844,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 2.769230769230769,
132
- "grad_norm": 1.4748838136546052,
133
- "learning_rate": 8.263518223330698e-06,
134
- "loss": 0.8603,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 2.9230769230769234,
139
- "grad_norm": 1.2236878389423478,
140
- "learning_rate": 7.131967672889101e-06,
141
- "loss": 0.8656,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 3.076923076923077,
146
- "grad_norm": 1.1955478413282419,
147
- "learning_rate": 6.039202339608432e-06,
148
- "loss": 0.8482,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 3.230769230769231,
153
- "grad_norm": 0.7738899384102409,
154
- "learning_rate": 5.000000000000003e-06,
155
- "loss": 0.8156,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 3.3846153846153846,
160
- "grad_norm": 1.0901149643020054,
161
- "learning_rate": 4.028414082972141e-06,
162
- "loss": 0.8103,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 3.5384615384615383,
167
- "grad_norm": 0.8933210291425059,
168
- "learning_rate": 3.1375836213126653e-06,
169
- "loss": 0.7943,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 3.6923076923076925,
174
- "grad_norm": 0.6465350340912998,
175
- "learning_rate": 2.339555568810221e-06,
176
- "loss": 0.801,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 3.8461538461538463,
181
- "grad_norm": 0.7231924138082919,
182
- "learning_rate": 1.6451218858706374e-06,
183
- "loss": 0.7893,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 4.0,
188
- "grad_norm": 0.7957272625370657,
189
- "learning_rate": 1.0636735967658785e-06,
190
- "loss": 0.7955,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 4.153846153846154,
195
- "grad_norm": 0.7102065034979811,
196
- "learning_rate": 6.030737921409169e-07,
197
- "loss": 0.7631,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 4.3076923076923075,
202
- "grad_norm": 0.7243607288304533,
203
- "learning_rate": 2.6955129420176193e-07,
204
- "loss": 0.7785,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 4.461538461538462,
209
- "grad_norm": 0.687719390104318,
210
- "learning_rate": 6.761642258056977e-08,
211
- "loss": 0.7678,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 4.615384615384615,
216
- "grad_norm": 0.6115431503277491,
217
- "learning_rate": 0.0,
218
- "loss": 0.7664,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 4.615384615384615,
223
- "step": 30,
224
- "total_flos": 9343567134720.0,
225
- "train_loss": 1.0503198325634002,
226
- "train_runtime": 473.9669,
227
- "train_samples_per_second": 6.519,
228
- "train_steps_per_second": 0.063
229
  }
230
  ],
231
  "logging_steps": 1,
232
- "max_steps": 30,
233
  "num_input_tokens_seen": 0,
234
- "num_train_epochs": 5,
235
  "save_steps": 999999,
236
  "stateful_callbacks": {
237
  "TrainerControl": {
@@ -245,8 +476,8 @@
245
  "attributes": {}
246
  }
247
  },
248
- "total_flos": 9343567134720.0,
249
- "train_batch_size": 8,
250
  "trial_name": null,
251
  "trial_params": null
252
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.461538461538462,
5
  "eval_steps": 500,
6
+ "global_step": 63,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.10256410256410256,
13
+ "grad_norm": 45.74676831817011,
14
+ "learning_rate": 2.8571428571428573e-06,
15
+ "loss": 1.7736,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.20512820512820512,
20
+ "grad_norm": 43.8997476738742,
21
+ "learning_rate": 5.7142857142857145e-06,
22
+ "loss": 1.7436,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.3076923076923077,
27
+ "grad_norm": 33.1407433311641,
28
+ "learning_rate": 8.571428571428571e-06,
29
+ "loss": 1.5701,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.41025641025641024,
34
+ "grad_norm": 15.347997497580668,
35
+ "learning_rate": 1.1428571428571429e-05,
36
+ "loss": 1.3552,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.5128205128205128,
41
+ "grad_norm": 42.292432586275865,
42
+ "learning_rate": 1.4285714285714287e-05,
43
+ "loss": 1.2627,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.6153846153846154,
48
+ "grad_norm": 36.376951471898245,
49
+ "learning_rate": 1.7142857142857142e-05,
50
+ "loss": 1.2198,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.717948717948718,
55
+ "grad_norm": 7.974991090529373,
56
+ "learning_rate": 2e-05,
57
+ "loss": 1.1377,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.8205128205128205,
62
+ "grad_norm": 16.501115024701512,
63
+ "learning_rate": 1.998426815017817e-05,
64
+ "loss": 1.1813,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.9230769230769231,
69
+ "grad_norm": 7.188262087017176,
70
+ "learning_rate": 1.9937122098932428e-05,
71
+ "loss": 1.1565,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 1.0256410256410255,
76
+ "grad_norm": 4.140303367679906,
77
+ "learning_rate": 1.985871018518236e-05,
78
+ "loss": 1.03,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 1.1282051282051282,
83
+ "grad_norm": 2.728109728903398,
84
+ "learning_rate": 1.9749279121818235e-05,
85
+ "loss": 0.9516,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 1.2307692307692308,
90
+ "grad_norm": 5.943582967422805,
91
+ "learning_rate": 1.9609173219450998e-05,
92
+ "loss": 0.9511,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 1.3333333333333333,
97
+ "grad_norm": 2.215718821444342,
98
+ "learning_rate": 1.9438833303083677e-05,
99
+ "loss": 0.927,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 1.435897435897436,
104
+ "grad_norm": 3.7476553781843758,
105
+ "learning_rate": 1.9238795325112867e-05,
106
+ "loss": 0.9357,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 1.5384615384615383,
111
+ "grad_norm": 2.3784620064293667,
112
+ "learning_rate": 1.900968867902419e-05,
113
+ "loss": 0.9271,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 1.641025641025641,
118
+ "grad_norm": 1.921424026693961,
119
+ "learning_rate": 1.8752234219087538e-05,
120
+ "loss": 0.9106,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 1.7435897435897436,
125
+ "grad_norm": 1.74875556144991,
126
+ "learning_rate": 1.8467241992282842e-05,
127
+ "loss": 0.8924,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 1.8461538461538463,
132
+ "grad_norm": 1.5465709392794136,
133
+ "learning_rate": 1.8155608689592604e-05,
134
+ "loss": 0.8985,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 1.9487179487179487,
139
+ "grad_norm": 1.6088170768182135,
140
+ "learning_rate": 1.78183148246803e-05,
141
+ "loss": 0.8731,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 2.051282051282051,
146
+ "grad_norm": 1.550794028416961,
147
+ "learning_rate": 1.7456421648831658e-05,
148
+ "loss": 0.8108,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 2.1538461538461537,
153
+ "grad_norm": 1.1891604452468383,
154
+ "learning_rate": 1.7071067811865477e-05,
155
+ "loss": 0.7693,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 2.2564102564102564,
160
+ "grad_norm": 1.7874206012739782,
161
+ "learning_rate": 1.6663465779520042e-05,
162
+ "loss": 0.7545,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 2.358974358974359,
167
+ "grad_norm": 1.546137200179085,
168
+ "learning_rate": 1.6234898018587336e-05,
169
+ "loss": 0.7505,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 2.4615384615384617,
174
+ "grad_norm": 1.5723321757671156,
175
+ "learning_rate": 1.578671296179806e-05,
176
+ "loss": 0.7592,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 2.564102564102564,
181
+ "grad_norm": 1.147889286365701,
182
+ "learning_rate": 1.5320320765153367e-05,
183
+ "loss": 0.737,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 2.6666666666666665,
188
+ "grad_norm": 1.2373533333897078,
189
+ "learning_rate": 1.4837188871052399e-05,
190
+ "loss": 0.7165,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 2.769230769230769,
195
+ "grad_norm": 1.1523919084408827,
196
+ "learning_rate": 1.4338837391175582e-05,
197
+ "loss": 0.6998,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 2.871794871794872,
202
+ "grad_norm": 1.1820135421697542,
203
+ "learning_rate": 1.3826834323650899e-05,
204
+ "loss": 0.7086,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 2.9743589743589745,
209
+ "grad_norm": 1.163468789617511,
210
+ "learning_rate": 1.3302790619551673e-05,
211
+ "loss": 0.7175,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 3.076923076923077,
216
+ "grad_norm": 1.7767072503264743,
217
+ "learning_rate": 1.2768355114248493e-05,
218
+ "loss": 0.593,
219
  "step": 30
220
  },
221
+ {
222
+ "epoch": 3.1794871794871793,
223
+ "grad_norm": 1.81233019892104,
224
+ "learning_rate": 1.2225209339563144e-05,
225
+ "loss": 0.5519,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 3.282051282051282,
230
+ "grad_norm": 2.1736744456228694,
231
+ "learning_rate": 1.1675062233047365e-05,
232
+ "loss": 0.5437,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 3.3846153846153846,
237
+ "grad_norm": 2.523447078328654,
238
+ "learning_rate": 1.1119644761033079e-05,
239
+ "loss": 0.5334,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 3.4871794871794872,
244
+ "grad_norm": 1.736445380863662,
245
+ "learning_rate": 1.0560704472371919e-05,
246
+ "loss": 0.5068,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 3.58974358974359,
251
+ "grad_norm": 1.4506284420650815,
252
+ "learning_rate": 1e-05,
253
+ "loss": 0.5092,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 3.6923076923076925,
258
+ "grad_norm": 1.4702124114670763,
259
+ "learning_rate": 9.439295527628083e-06,
260
+ "loss": 0.4965,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 3.7948717948717947,
265
+ "grad_norm": 1.460168348735874,
266
+ "learning_rate": 8.880355238966923e-06,
267
+ "loss": 0.4902,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 3.8974358974358974,
272
+ "grad_norm": 1.2827679925297792,
273
+ "learning_rate": 8.324937766952638e-06,
274
+ "loss": 0.4923,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 4.0,
279
+ "grad_norm": 1.3138838652676037,
280
+ "learning_rate": 7.774790660436857e-06,
281
+ "loss": 0.4731,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 4.102564102564102,
286
+ "grad_norm": 2.82702405107918,
287
+ "learning_rate": 7.2316448857515076e-06,
288
+ "loss": 0.377,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 4.205128205128205,
293
+ "grad_norm": 2.2541347262792897,
294
+ "learning_rate": 6.697209380448333e-06,
295
+ "loss": 0.3562,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 4.3076923076923075,
300
+ "grad_norm": 2.5922750621707293,
301
+ "learning_rate": 6.173165676349103e-06,
302
+ "loss": 0.3482,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 4.410256410256411,
307
+ "grad_norm": 3.552401632591804,
308
+ "learning_rate": 5.66116260882442e-06,
309
+ "loss": 0.3332,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 4.512820512820513,
314
+ "grad_norm": 2.5996491260559615,
315
+ "learning_rate": 5.1628111289476025e-06,
316
+ "loss": 0.3292,
317
+ "step": 44
318
+ },
319
  {
320
  "epoch": 4.615384615384615,
321
+ "grad_norm": 2.0225922587821805,
322
+ "learning_rate": 4.679679234846636e-06,
323
+ "loss": 0.3134,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 4.717948717948718,
328
+ "grad_norm": 1.794698781237276,
329
+ "learning_rate": 4.213287038201943e-06,
330
+ "loss": 0.3144,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 4.82051282051282,
335
+ "grad_norm": 1.78570038657741,
336
+ "learning_rate": 3.7651019814126656e-06,
337
+ "loss": 0.3054,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 4.923076923076923,
342
+ "grad_norm": 1.855808340157979,
343
+ "learning_rate": 3.3365342204799613e-06,
344
+ "loss": 0.3,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 5.0256410256410255,
349
+ "grad_norm": 1.9551127034348588,
350
+ "learning_rate": 2.9289321881345257e-06,
351
+ "loss": 0.2895,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 5.128205128205128,
356
+ "grad_norm": 1.8579864077346009,
357
+ "learning_rate": 2.5435783511683444e-06,
358
+ "loss": 0.2339,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 5.230769230769231,
363
+ "grad_norm": 1.624477309859244,
364
+ "learning_rate": 2.1816851753197023e-06,
365
+ "loss": 0.2185,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 5.333333333333333,
370
+ "grad_norm": 1.7017286954162238,
371
+ "learning_rate": 1.8443913104073984e-06,
372
+ "loss": 0.2273,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 5.435897435897436,
377
+ "grad_norm": 1.612088436018407,
378
+ "learning_rate": 1.5327580077171589e-06,
379
+ "loss": 0.2157,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 5.538461538461538,
384
+ "grad_norm": 1.6667785081621305,
385
+ "learning_rate": 1.2477657809124632e-06,
386
+ "loss": 0.2163,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 5.641025641025641,
391
+ "grad_norm": 1.5501897496769197,
392
+ "learning_rate": 9.903113209758098e-07,
393
+ "loss": 0.2134,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 5.743589743589744,
398
+ "grad_norm": 1.6228058819139755,
399
+ "learning_rate": 7.612046748871327e-07,
400
+ "loss": 0.2247,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 5.846153846153846,
405
+ "grad_norm": 1.4413136220349998,
406
+ "learning_rate": 5.611666969163243e-07,
407
+ "loss": 0.2157,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 5.948717948717949,
412
+ "grad_norm": 1.3872967872162942,
413
+ "learning_rate": 3.908267805490051e-07,
414
+ "loss": 0.2111,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 6.051282051282051,
419
+ "grad_norm": 1.5157677115577846,
420
+ "learning_rate": 2.507208781817638e-07,
421
+ "loss": 0.1947,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 6.153846153846154,
426
+ "grad_norm": 1.5908969907868264,
427
+ "learning_rate": 1.4128981481764115e-07,
428
+ "loss": 0.1848,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 6.256410256410256,
433
+ "grad_norm": 1.5844350731835435,
434
+ "learning_rate": 6.287790106757396e-08,
435
+ "loss": 0.1888,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 6.358974358974359,
440
+ "grad_norm": 1.6757546348969776,
441
+ "learning_rate": 1.5731849821833955e-08,
442
+ "loss": 0.1826,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 6.461538461538462,
447
+ "grad_norm": 1.5614630318772702,
448
+ "learning_rate": 0.0,
449
+ "loss": 0.1757,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 6.461538461538462,
454
+ "step": 63,
455
+ "total_flos": 13164745850880.0,
456
+ "train_loss": 0.6425543285551525,
457
+ "train_runtime": 766.0702,
458
+ "train_samples_per_second": 5.647,
459
+ "train_steps_per_second": 0.082
460
  }
461
  ],
462
  "logging_steps": 1,
463
+ "max_steps": 63,
464
  "num_input_tokens_seen": 0,
465
+ "num_train_epochs": 7,
466
  "save_steps": 999999,
467
  "stateful_callbacks": {
468
  "TrainerControl": {
 
476
  "attributes": {}
477
  }
478
  },
479
+ "total_flos": 13164745850880.0,
480
+ "train_batch_size": 4,
481
  "trial_name": null,
482
  "trial_params": null
483
  }
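The logged learning_rate values in the new log_history are consistent with a linear warmup over ceil(0.1 * 63) = 7 steps followed by cosine decay to zero at step 63. A sketch of that schedule (this is a reconstruction from the logged values, not the scheduler code itself):

```python
import math

base_lr = 2e-05
max_steps = 63
warmup_steps = math.ceil(0.1 * max_steps)  # lr_scheduler_warmup_ratio: 0.1 -> 7 steps

def cosine_with_warmup(step: int) -> float:
    """Learning rate after `step` optimizer steps: linear warmup, then cosine decay."""
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

print(cosine_with_warmup(1))   # ~2.857e-06, as logged at step 1
print(cosine_with_warmup(7))   # 2e-05 (peak), as logged at step 7
print(cosine_with_warmup(8))   # ~1.99843e-05, as logged at step 8
print(cosine_with_warmup(63))  # 0.0, as logged at the final step
```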
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c33a19196bfe2bad2613a328bf8b5d72e314960f54e408ef1737430f48f21b92
+ oid sha256:f7b36c9a7dd8915e25f0506dc589443d0eda498aaac22f0528d243752246a0c1
  size 7608
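training_args.bin is the pickled TrainingArguments object saved by the Trainer alongside the checkpoints. A sketch for inspecting it locally (requires a compatible transformers version to be importable; weights_only=False is needed on newer PyTorch because this is a full pickle, not a tensor file):

```python
import torch

# Hypothetical local path to the file from this commit.
args = torch.load("training_args.bin", weights_only=False)
print(args.per_device_train_batch_size,
      args.gradient_accumulation_steps,
      args.num_train_epochs,
      args.learning_rate)
```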