mtzig commited on
Commit
dab7347
·
verified ·
1 Parent(s): aded198

Training in progress, step 100, checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ last-checkpoint/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
37
+ last-checkpoint/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ last-checkpoint/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ last-checkpoint/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
last-checkpoint/optimizer_0/.metadata ADDED
Binary file (369 kB). View file
 
last-checkpoint/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62cbbb2b3d4f31c0a3413df2eaabce947e7719fd0714df8a5fab22393f53e219
3
+ size 13934748
last-checkpoint/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dcda1040311414dc0a2d44a05e5cb35e7c3038170d8e17543a4332cb366e191
3
+ size 13999412
last-checkpoint/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df998ad924c5b62f90019cbb88fd62b3e4e64d88b228130d251792bf7deab033
3
+ size 13990904
last-checkpoint/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23a5670376370f1d6ada74f967c2248f323eac4ca9690d09f922137342c62f2a
3
+ size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata ADDED
Binary file (135 kB). View file
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1705c8193a4631578a089db2d70fd2c71d0505a2f3d764fe46d1c24b2a070eeb
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc95d73e7987c5d7d832cf8226eb09bd9e7f7be58ec455e6bb2af988ae5d69aa
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00c32232ddb18801082f4fe4b153458b3dc5c37925e551cbcfed6e39be0485e5
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99c11425ca4111acf116243f564b369521900c6d6ccd8a56608c8343daf67d67
3
+ size 6966784
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e67c2bec7d86c4f6210325ca670c1a767d63ea7097a338fac8d4332930e740d6
3
+ size 14960
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7128968a26346cae27935bd130c910b7855033e1601547200dbc0f94356ba770
3
+ size 14960
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d956842b2ce7b3ecd63e4eecaf16e30235bcc33f9f434a1d5a9ad735729148b6
3
+ size 14960
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58df2eeb2aeb3e7ff65838d74d9b8fdd9bdafa1a418b60d36797cdf8924dfc1c
3
+ size 14960
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:005d0b07ecb0e6cdb0df3ee6d6ccfde8718b0ebbfe5a6ffbd39e3b172fc51813
3
+ size 1064
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,805 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.06765899864682003,
5
+ "eval_steps": 20,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0,
13
+ "eval_accuracy": 0.726605504587156,
14
+ "eval_f1": 0.11834319526627218,
15
+ "eval_loss": 0.6266470551490784,
16
+ "eval_precision": 0.5,
17
+ "eval_recall": 0.06711409395973154,
18
+ "eval_runtime": 53.0676,
19
+ "eval_samples_per_second": 5.615,
20
+ "eval_steps_per_second": 0.188,
21
+ "step": 0
22
+ },
23
+ {
24
+ "epoch": 0.0006765899864682003,
25
+ "grad_norm": 1.9097040891647339,
26
+ "learning_rate": 1.3513513513513515e-07,
27
+ "loss": 0.5346,
28
+ "step": 1
29
+ },
30
+ {
31
+ "epoch": 0.0013531799729364006,
32
+ "grad_norm": 2.262101173400879,
33
+ "learning_rate": 2.702702702702703e-07,
34
+ "loss": 0.6432,
35
+ "step": 2
36
+ },
37
+ {
38
+ "epoch": 0.0020297699594046007,
39
+ "grad_norm": 2.2351596355438232,
40
+ "learning_rate": 4.0540540540540546e-07,
41
+ "loss": 0.6418,
42
+ "step": 3
43
+ },
44
+ {
45
+ "epoch": 0.0027063599458728013,
46
+ "grad_norm": 2.1407454013824463,
47
+ "learning_rate": 5.405405405405406e-07,
48
+ "loss": 0.681,
49
+ "step": 4
50
+ },
51
+ {
52
+ "epoch": 0.0033829499323410014,
53
+ "grad_norm": 1.836843729019165,
54
+ "learning_rate": 6.756756756756758e-07,
55
+ "loss": 0.6663,
56
+ "step": 5
57
+ },
58
+ {
59
+ "epoch": 0.0040595399188092015,
60
+ "grad_norm": 2.4660489559173584,
61
+ "learning_rate": 8.108108108108109e-07,
62
+ "loss": 0.6643,
63
+ "step": 6
64
+ },
65
+ {
66
+ "epoch": 0.004736129905277402,
67
+ "grad_norm": 2.2095065116882324,
68
+ "learning_rate": 9.459459459459461e-07,
69
+ "loss": 0.6107,
70
+ "step": 7
71
+ },
72
+ {
73
+ "epoch": 0.005412719891745603,
74
+ "grad_norm": 2.3385086059570312,
75
+ "learning_rate": 1.0810810810810812e-06,
76
+ "loss": 0.6332,
77
+ "step": 8
78
+ },
79
+ {
80
+ "epoch": 0.006089309878213802,
81
+ "grad_norm": 2.0470025539398193,
82
+ "learning_rate": 1.2162162162162164e-06,
83
+ "loss": 0.6645,
84
+ "step": 9
85
+ },
86
+ {
87
+ "epoch": 0.006765899864682003,
88
+ "grad_norm": 2.1129884719848633,
89
+ "learning_rate": 1.3513513513513515e-06,
90
+ "loss": 0.5937,
91
+ "step": 10
92
+ },
93
+ {
94
+ "epoch": 0.007442489851150203,
95
+ "grad_norm": 2.343991994857788,
96
+ "learning_rate": 1.4864864864864868e-06,
97
+ "loss": 0.6274,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 0.008119079837618403,
102
+ "grad_norm": 2.254518508911133,
103
+ "learning_rate": 1.6216216216216219e-06,
104
+ "loss": 0.6133,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 0.008795669824086604,
109
+ "grad_norm": 2.3268182277679443,
110
+ "learning_rate": 1.756756756756757e-06,
111
+ "loss": 0.5994,
112
+ "step": 13
113
+ },
114
+ {
115
+ "epoch": 0.009472259810554804,
116
+ "grad_norm": 2.1147611141204834,
117
+ "learning_rate": 1.8918918918918922e-06,
118
+ "loss": 0.6043,
119
+ "step": 14
120
+ },
121
+ {
122
+ "epoch": 0.010148849797023005,
123
+ "grad_norm": 3.140791654586792,
124
+ "learning_rate": 2.0270270270270273e-06,
125
+ "loss": 0.6447,
126
+ "step": 15
127
+ },
128
+ {
129
+ "epoch": 0.010825439783491205,
130
+ "grad_norm": 2.154975175857544,
131
+ "learning_rate": 2.1621621621621623e-06,
132
+ "loss": 0.6472,
133
+ "step": 16
134
+ },
135
+ {
136
+ "epoch": 0.011502029769959404,
137
+ "grad_norm": 2.405954599380493,
138
+ "learning_rate": 2.297297297297298e-06,
139
+ "loss": 0.6622,
140
+ "step": 17
141
+ },
142
+ {
143
+ "epoch": 0.012178619756427604,
144
+ "grad_norm": 1.8810043334960938,
145
+ "learning_rate": 2.432432432432433e-06,
146
+ "loss": 0.6463,
147
+ "step": 18
148
+ },
149
+ {
150
+ "epoch": 0.012855209742895805,
151
+ "grad_norm": 2.251763105392456,
152
+ "learning_rate": 2.5675675675675675e-06,
153
+ "loss": 0.6118,
154
+ "step": 19
155
+ },
156
+ {
157
+ "epoch": 0.013531799729364006,
158
+ "grad_norm": 2.2010996341705322,
159
+ "learning_rate": 2.702702702702703e-06,
160
+ "loss": 0.6566,
161
+ "step": 20
162
+ },
163
+ {
164
+ "epoch": 0.013531799729364006,
165
+ "eval_accuracy": 0.726605504587156,
166
+ "eval_f1": 0.11834319526627218,
167
+ "eval_loss": 0.6250319480895996,
168
+ "eval_precision": 0.5,
169
+ "eval_recall": 0.06711409395973154,
170
+ "eval_runtime": 51.8026,
171
+ "eval_samples_per_second": 5.753,
172
+ "eval_steps_per_second": 0.193,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 0.014208389715832206,
177
+ "grad_norm": 2.1348178386688232,
178
+ "learning_rate": 2.837837837837838e-06,
179
+ "loss": 0.6593,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 0.014884979702300407,
184
+ "grad_norm": 2.461346387863159,
185
+ "learning_rate": 2.9729729729729736e-06,
186
+ "loss": 0.5665,
187
+ "step": 22
188
+ },
189
+ {
190
+ "epoch": 0.015561569688768605,
191
+ "grad_norm": 1.7864395380020142,
192
+ "learning_rate": 3.1081081081081082e-06,
193
+ "loss": 0.6044,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 0.016238159675236806,
198
+ "grad_norm": 2.120920419692993,
199
+ "learning_rate": 3.2432432432432437e-06,
200
+ "loss": 0.6494,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 0.016914749661705007,
205
+ "grad_norm": 2.293957233428955,
206
+ "learning_rate": 3.3783783783783788e-06,
207
+ "loss": 0.6729,
208
+ "step": 25
209
+ },
210
+ {
211
+ "epoch": 0.017591339648173207,
212
+ "grad_norm": 1.9928455352783203,
213
+ "learning_rate": 3.513513513513514e-06,
214
+ "loss": 0.606,
215
+ "step": 26
216
+ },
217
+ {
218
+ "epoch": 0.018267929634641408,
219
+ "grad_norm": 1.8565198183059692,
220
+ "learning_rate": 3.648648648648649e-06,
221
+ "loss": 0.571,
222
+ "step": 27
223
+ },
224
+ {
225
+ "epoch": 0.018944519621109608,
226
+ "grad_norm": 1.8976123332977295,
227
+ "learning_rate": 3.7837837837837844e-06,
228
+ "loss": 0.5702,
229
+ "step": 28
230
+ },
231
+ {
232
+ "epoch": 0.01962110960757781,
233
+ "grad_norm": 2.2150862216949463,
234
+ "learning_rate": 3.918918918918919e-06,
235
+ "loss": 0.5535,
236
+ "step": 29
237
+ },
238
+ {
239
+ "epoch": 0.02029769959404601,
240
+ "grad_norm": 2.0916941165924072,
241
+ "learning_rate": 4.0540540540540545e-06,
242
+ "loss": 0.6707,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.02097428958051421,
247
+ "grad_norm": 2.0436134338378906,
248
+ "learning_rate": 4.189189189189189e-06,
249
+ "loss": 0.5966,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.02165087956698241,
254
+ "grad_norm": 1.8890984058380127,
255
+ "learning_rate": 4.324324324324325e-06,
256
+ "loss": 0.5533,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.022327469553450607,
261
+ "grad_norm": 2.0738587379455566,
262
+ "learning_rate": 4.45945945945946e-06,
263
+ "loss": 0.6128,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.023004059539918808,
268
+ "grad_norm": 1.9424076080322266,
269
+ "learning_rate": 4.594594594594596e-06,
270
+ "loss": 0.5763,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.02368064952638701,
275
+ "grad_norm": 1.7840420007705688,
276
+ "learning_rate": 4.72972972972973e-06,
277
+ "loss": 0.5632,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.02435723951285521,
282
+ "grad_norm": 2.2191755771636963,
283
+ "learning_rate": 4.864864864864866e-06,
284
+ "loss": 0.6482,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.02503382949932341,
289
+ "grad_norm": 1.925732970237732,
290
+ "learning_rate": 5e-06,
291
+ "loss": 0.6266,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.02571041948579161,
296
+ "grad_norm": 1.7854461669921875,
297
+ "learning_rate": 5.135135135135135e-06,
298
+ "loss": 0.5505,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.02638700947225981,
303
+ "grad_norm": 1.9672614336013794,
304
+ "learning_rate": 5.2702702702702705e-06,
305
+ "loss": 0.5851,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.02706359945872801,
310
+ "grad_norm": 1.713619589805603,
311
+ "learning_rate": 5.405405405405406e-06,
312
+ "loss": 0.5066,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.02706359945872801,
317
+ "eval_accuracy": 0.728440366972477,
318
+ "eval_f1": 0.11904761904761904,
319
+ "eval_loss": 0.6119223237037659,
320
+ "eval_precision": 0.5263157894736842,
321
+ "eval_recall": 0.06711409395973154,
322
+ "eval_runtime": 52.1134,
323
+ "eval_samples_per_second": 5.718,
324
+ "eval_steps_per_second": 0.192,
325
+ "step": 40
326
+ },
327
+ {
328
+ "epoch": 0.02774018944519621,
329
+ "grad_norm": 2.512800455093384,
330
+ "learning_rate": 5.540540540540541e-06,
331
+ "loss": 0.6359,
332
+ "step": 41
333
+ },
334
+ {
335
+ "epoch": 0.028416779431664412,
336
+ "grad_norm": 2.311678647994995,
337
+ "learning_rate": 5.675675675675676e-06,
338
+ "loss": 0.5823,
339
+ "step": 42
340
+ },
341
+ {
342
+ "epoch": 0.029093369418132613,
343
+ "grad_norm": 1.8111237287521362,
344
+ "learning_rate": 5.810810810810811e-06,
345
+ "loss": 0.5194,
346
+ "step": 43
347
+ },
348
+ {
349
+ "epoch": 0.029769959404600813,
350
+ "grad_norm": 2.3231632709503174,
351
+ "learning_rate": 5.945945945945947e-06,
352
+ "loss": 0.6335,
353
+ "step": 44
354
+ },
355
+ {
356
+ "epoch": 0.030446549391069014,
357
+ "grad_norm": 1.9767159223556519,
358
+ "learning_rate": 6.081081081081082e-06,
359
+ "loss": 0.5406,
360
+ "step": 45
361
+ },
362
+ {
363
+ "epoch": 0.03112313937753721,
364
+ "grad_norm": 1.8099788427352905,
365
+ "learning_rate": 6.2162162162162164e-06,
366
+ "loss": 0.5191,
367
+ "step": 46
368
+ },
369
+ {
370
+ "epoch": 0.031799729364005415,
371
+ "grad_norm": 1.9105194807052612,
372
+ "learning_rate": 6.351351351351351e-06,
373
+ "loss": 0.5575,
374
+ "step": 47
375
+ },
376
+ {
377
+ "epoch": 0.03247631935047361,
378
+ "grad_norm": 1.8297271728515625,
379
+ "learning_rate": 6.486486486486487e-06,
380
+ "loss": 0.51,
381
+ "step": 48
382
+ },
383
+ {
384
+ "epoch": 0.033152909336941816,
385
+ "grad_norm": 1.8884862661361694,
386
+ "learning_rate": 6.621621621621622e-06,
387
+ "loss": 0.5755,
388
+ "step": 49
389
+ },
390
+ {
391
+ "epoch": 0.03382949932341001,
392
+ "grad_norm": 2.0803935527801514,
393
+ "learning_rate": 6.7567567567567575e-06,
394
+ "loss": 0.5915,
395
+ "step": 50
396
+ },
397
+ {
398
+ "epoch": 0.03450608930987822,
399
+ "grad_norm": 2.036954164505005,
400
+ "learning_rate": 6.891891891891892e-06,
401
+ "loss": 0.5394,
402
+ "step": 51
403
+ },
404
+ {
405
+ "epoch": 0.035182679296346414,
406
+ "grad_norm": 2.0037217140197754,
407
+ "learning_rate": 7.027027027027028e-06,
408
+ "loss": 0.4967,
409
+ "step": 52
410
+ },
411
+ {
412
+ "epoch": 0.03585926928281461,
413
+ "grad_norm": 1.6572487354278564,
414
+ "learning_rate": 7.162162162162163e-06,
415
+ "loss": 0.5458,
416
+ "step": 53
417
+ },
418
+ {
419
+ "epoch": 0.036535859269282815,
420
+ "grad_norm": 1.8542054891586304,
421
+ "learning_rate": 7.297297297297298e-06,
422
+ "loss": 0.4571,
423
+ "step": 54
424
+ },
425
+ {
426
+ "epoch": 0.03721244925575101,
427
+ "grad_norm": 1.6970975399017334,
428
+ "learning_rate": 7.4324324324324324e-06,
429
+ "loss": 0.5125,
430
+ "step": 55
431
+ },
432
+ {
433
+ "epoch": 0.037889039242219216,
434
+ "grad_norm": 1.8225724697113037,
435
+ "learning_rate": 7.567567567567569e-06,
436
+ "loss": 0.549,
437
+ "step": 56
438
+ },
439
+ {
440
+ "epoch": 0.03856562922868741,
441
+ "grad_norm": 1.5912785530090332,
442
+ "learning_rate": 7.702702702702704e-06,
443
+ "loss": 0.4843,
444
+ "step": 57
445
+ },
446
+ {
447
+ "epoch": 0.03924221921515562,
448
+ "grad_norm": 1.694573998451233,
449
+ "learning_rate": 7.837837837837838e-06,
450
+ "loss": 0.5804,
451
+ "step": 58
452
+ },
453
+ {
454
+ "epoch": 0.039918809201623814,
455
+ "grad_norm": 1.6933585405349731,
456
+ "learning_rate": 7.972972972972974e-06,
457
+ "loss": 0.5306,
458
+ "step": 59
459
+ },
460
+ {
461
+ "epoch": 0.04059539918809202,
462
+ "grad_norm": 1.7225837707519531,
463
+ "learning_rate": 8.108108108108109e-06,
464
+ "loss": 0.4866,
465
+ "step": 60
466
+ },
467
+ {
468
+ "epoch": 0.04059539918809202,
469
+ "eval_accuracy": 0.7376146788990826,
470
+ "eval_f1": 0.2011173184357542,
471
+ "eval_loss": 0.581759512424469,
472
+ "eval_precision": 0.6,
473
+ "eval_recall": 0.12080536912751678,
474
+ "eval_runtime": 51.4731,
475
+ "eval_samples_per_second": 5.789,
476
+ "eval_steps_per_second": 0.194,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.041271989174560215,
481
+ "grad_norm": 1.9804434776306152,
482
+ "learning_rate": 8.243243243243245e-06,
483
+ "loss": 0.5489,
484
+ "step": 61
485
+ },
486
+ {
487
+ "epoch": 0.04194857916102842,
488
+ "grad_norm": 2.3419950008392334,
489
+ "learning_rate": 8.378378378378378e-06,
490
+ "loss": 0.5551,
491
+ "step": 62
492
+ },
493
+ {
494
+ "epoch": 0.04262516914749662,
495
+ "grad_norm": 2.275982618331909,
496
+ "learning_rate": 8.513513513513514e-06,
497
+ "loss": 0.5127,
498
+ "step": 63
499
+ },
500
+ {
501
+ "epoch": 0.04330175913396482,
502
+ "grad_norm": 2.507098913192749,
503
+ "learning_rate": 8.64864864864865e-06,
504
+ "loss": 0.5736,
505
+ "step": 64
506
+ },
507
+ {
508
+ "epoch": 0.04397834912043302,
509
+ "grad_norm": 1.8046241998672485,
510
+ "learning_rate": 8.783783783783785e-06,
511
+ "loss": 0.4755,
512
+ "step": 65
513
+ },
514
+ {
515
+ "epoch": 0.044654939106901215,
516
+ "grad_norm": 1.8296290636062622,
517
+ "learning_rate": 8.91891891891892e-06,
518
+ "loss": 0.4999,
519
+ "step": 66
520
+ },
521
+ {
522
+ "epoch": 0.04533152909336942,
523
+ "grad_norm": 2.3316869735717773,
524
+ "learning_rate": 9.054054054054054e-06,
525
+ "loss": 0.4797,
526
+ "step": 67
527
+ },
528
+ {
529
+ "epoch": 0.046008119079837616,
530
+ "grad_norm": 1.6778762340545654,
531
+ "learning_rate": 9.189189189189191e-06,
532
+ "loss": 0.5238,
533
+ "step": 68
534
+ },
535
+ {
536
+ "epoch": 0.04668470906630582,
537
+ "grad_norm": 1.8217062950134277,
538
+ "learning_rate": 9.324324324324325e-06,
539
+ "loss": 0.526,
540
+ "step": 69
541
+ },
542
+ {
543
+ "epoch": 0.04736129905277402,
544
+ "grad_norm": 2.7135376930236816,
545
+ "learning_rate": 9.45945945945946e-06,
546
+ "loss": 0.5899,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.04803788903924222,
551
+ "grad_norm": 1.841891884803772,
552
+ "learning_rate": 9.594594594594594e-06,
553
+ "loss": 0.5312,
554
+ "step": 71
555
+ },
556
+ {
557
+ "epoch": 0.04871447902571042,
558
+ "grad_norm": 1.9096564054489136,
559
+ "learning_rate": 9.729729729729732e-06,
560
+ "loss": 0.5277,
561
+ "step": 72
562
+ },
563
+ {
564
+ "epoch": 0.04939106901217862,
565
+ "grad_norm": 3.7141664028167725,
566
+ "learning_rate": 9.864864864864865e-06,
567
+ "loss": 0.5468,
568
+ "step": 73
569
+ },
570
+ {
571
+ "epoch": 0.05006765899864682,
572
+ "grad_norm": 2.147271156311035,
573
+ "learning_rate": 1e-05,
574
+ "loss": 0.4658,
575
+ "step": 74
576
+ },
577
+ {
578
+ "epoch": 0.05074424898511502,
579
+ "grad_norm": 3.2354440689086914,
580
+ "learning_rate": 1.0135135135135136e-05,
581
+ "loss": 0.4915,
582
+ "step": 75
583
+ },
584
+ {
585
+ "epoch": 0.05142083897158322,
586
+ "grad_norm": 2.6529741287231445,
587
+ "learning_rate": 1.027027027027027e-05,
588
+ "loss": 0.5009,
589
+ "step": 76
590
+ },
591
+ {
592
+ "epoch": 0.052097428958051424,
593
+ "grad_norm": 1.9220309257507324,
594
+ "learning_rate": 1.0405405405405407e-05,
595
+ "loss": 0.4614,
596
+ "step": 77
597
+ },
598
+ {
599
+ "epoch": 0.05277401894451962,
600
+ "grad_norm": 2.6269216537475586,
601
+ "learning_rate": 1.0540540540540541e-05,
602
+ "loss": 0.4909,
603
+ "step": 78
604
+ },
605
+ {
606
+ "epoch": 0.05345060893098782,
607
+ "grad_norm": 2.8617451190948486,
608
+ "learning_rate": 1.0675675675675677e-05,
609
+ "loss": 0.5087,
610
+ "step": 79
611
+ },
612
+ {
613
+ "epoch": 0.05412719891745602,
614
+ "grad_norm": 2.258033275604248,
615
+ "learning_rate": 1.0810810810810812e-05,
616
+ "loss": 0.4434,
617
+ "step": 80
618
+ },
619
+ {
620
+ "epoch": 0.05412719891745602,
621
+ "eval_accuracy": 0.7412844036697248,
622
+ "eval_f1": 0.3922413793103448,
623
+ "eval_loss": 0.5494486689567566,
624
+ "eval_precision": 0.5481927710843374,
625
+ "eval_recall": 0.3053691275167785,
626
+ "eval_runtime": 52.2043,
627
+ "eval_samples_per_second": 5.708,
628
+ "eval_steps_per_second": 0.192,
629
+ "step": 80
630
+ },
631
+ {
632
+ "epoch": 0.05480378890392422,
633
+ "grad_norm": 3.6041858196258545,
634
+ "learning_rate": 1.0945945945945946e-05,
635
+ "loss": 0.4269,
636
+ "step": 81
637
+ },
638
+ {
639
+ "epoch": 0.05548037889039242,
640
+ "grad_norm": 2.4709510803222656,
641
+ "learning_rate": 1.1081081081081081e-05,
642
+ "loss": 0.5329,
643
+ "step": 82
644
+ },
645
+ {
646
+ "epoch": 0.05615696887686062,
647
+ "grad_norm": 2.8416366577148438,
648
+ "learning_rate": 1.1216216216216219e-05,
649
+ "loss": 0.4599,
650
+ "step": 83
651
+ },
652
+ {
653
+ "epoch": 0.056833558863328824,
654
+ "grad_norm": 2.6396408081054688,
655
+ "learning_rate": 1.1351351351351352e-05,
656
+ "loss": 0.4452,
657
+ "step": 84
658
+ },
659
+ {
660
+ "epoch": 0.05751014884979702,
661
+ "grad_norm": 1.7931419610977173,
662
+ "learning_rate": 1.1486486486486488e-05,
663
+ "loss": 0.4034,
664
+ "step": 85
665
+ },
666
+ {
667
+ "epoch": 0.058186738836265225,
668
+ "grad_norm": 2.2836318016052246,
669
+ "learning_rate": 1.1621621621621622e-05,
670
+ "loss": 0.3732,
671
+ "step": 86
672
+ },
673
+ {
674
+ "epoch": 0.05886332882273342,
675
+ "grad_norm": 2.0475215911865234,
676
+ "learning_rate": 1.1756756756756757e-05,
677
+ "loss": 0.4186,
678
+ "step": 87
679
+ },
680
+ {
681
+ "epoch": 0.05953991880920163,
682
+ "grad_norm": 2.0375993251800537,
683
+ "learning_rate": 1.1891891891891894e-05,
684
+ "loss": 0.3456,
685
+ "step": 88
686
+ },
687
+ {
688
+ "epoch": 0.060216508795669824,
689
+ "grad_norm": 3.458310604095459,
690
+ "learning_rate": 1.2027027027027028e-05,
691
+ "loss": 0.3599,
692
+ "step": 89
693
+ },
694
+ {
695
+ "epoch": 0.06089309878213803,
696
+ "grad_norm": 2.087979555130005,
697
+ "learning_rate": 1.2162162162162164e-05,
698
+ "loss": 0.3591,
699
+ "step": 90
700
+ },
701
+ {
702
+ "epoch": 0.061569688768606225,
703
+ "grad_norm": 2.4800474643707275,
704
+ "learning_rate": 1.2297297297297299e-05,
705
+ "loss": 0.3947,
706
+ "step": 91
707
+ },
708
+ {
709
+ "epoch": 0.06224627875507442,
710
+ "grad_norm": 3.9390594959259033,
711
+ "learning_rate": 1.2432432432432433e-05,
712
+ "loss": 0.4404,
713
+ "step": 92
714
+ },
715
+ {
716
+ "epoch": 0.06292286874154263,
717
+ "grad_norm": 3.231876850128174,
718
+ "learning_rate": 1.2567567567567568e-05,
719
+ "loss": 0.4116,
720
+ "step": 93
721
+ },
722
+ {
723
+ "epoch": 0.06359945872801083,
724
+ "grad_norm": 5.661862373352051,
725
+ "learning_rate": 1.2702702702702702e-05,
726
+ "loss": 0.4991,
727
+ "step": 94
728
+ },
729
+ {
730
+ "epoch": 0.06427604871447902,
731
+ "grad_norm": 3.7746121883392334,
732
+ "learning_rate": 1.283783783783784e-05,
733
+ "loss": 0.5173,
734
+ "step": 95
735
+ },
736
+ {
737
+ "epoch": 0.06495263870094722,
738
+ "grad_norm": 2.9691073894500732,
739
+ "learning_rate": 1.2972972972972975e-05,
740
+ "loss": 0.377,
741
+ "step": 96
742
+ },
743
+ {
744
+ "epoch": 0.06562922868741543,
745
+ "grad_norm": 2.5602574348449707,
746
+ "learning_rate": 1.3108108108108109e-05,
747
+ "loss": 0.3232,
748
+ "step": 97
749
+ },
750
+ {
751
+ "epoch": 0.06630581867388363,
752
+ "grad_norm": 3.1697347164154053,
753
+ "learning_rate": 1.3243243243243244e-05,
754
+ "loss": 0.3596,
755
+ "step": 98
756
+ },
757
+ {
758
+ "epoch": 0.06698240866035182,
759
+ "grad_norm": 5.4793877601623535,
760
+ "learning_rate": 1.3378378378378381e-05,
761
+ "loss": 0.3252,
762
+ "step": 99
763
+ },
764
+ {
765
+ "epoch": 0.06765899864682003,
766
+ "grad_norm": 3.7010715007781982,
767
+ "learning_rate": 1.3513513513513515e-05,
768
+ "loss": 0.264,
769
+ "step": 100
770
+ },
771
+ {
772
+ "epoch": 0.06765899864682003,
773
+ "eval_accuracy": 0.7568807339449541,
774
+ "eval_f1": 0.40979955456570155,
775
+ "eval_loss": 0.5758041143417358,
776
+ "eval_precision": 0.609271523178808,
777
+ "eval_recall": 0.3087248322147651,
778
+ "eval_runtime": 51.8245,
779
+ "eval_samples_per_second": 5.75,
780
+ "eval_steps_per_second": 0.193,
781
+ "step": 100
782
+ }
783
+ ],
784
+ "logging_steps": 1,
785
+ "max_steps": 1478,
786
+ "num_input_tokens_seen": 0,
787
+ "num_train_epochs": 1,
788
+ "save_steps": 100,
789
+ "stateful_callbacks": {
790
+ "TrainerControl": {
791
+ "args": {
792
+ "should_epoch_stop": false,
793
+ "should_evaluate": false,
794
+ "should_log": false,
795
+ "should_save": true,
796
+ "should_training_stop": false
797
+ },
798
+ "attributes": {}
799
+ }
800
+ },
801
+ "total_flos": 3.03754272309248e+16,
802
+ "train_batch_size": 8,
803
+ "trial_name": null,
804
+ "trial_params": null
805
+ }