mtzig commited on
Commit
ce8da3e
·
verified ·
1 Parent(s): ff98a4a

Training in progress, step 100, checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ last-checkpoint/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
37
+ last-checkpoint/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ last-checkpoint/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ last-checkpoint/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
last-checkpoint/optimizer_0/.metadata ADDED
Binary file (369 kB). View file
 
last-checkpoint/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a11f6eb61ebc4c827a2a0cabb53ba5861bf77948b4ec298fdcdb20939fffd586
3
+ size 13934748
last-checkpoint/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb48712c0b06852263ef03ab6b5d5d77801a410ac7ea14d92e5a0505c76b4e1
3
+ size 13999412
last-checkpoint/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc26c42132b11d3ecb59970adc5b7b5bb6609373d86200a69c8f53b17217d249
3
+ size 13990904
last-checkpoint/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e789cfde514e8455ea76491fbbef336342b7dfcc6a63c4e8d0ffe6285129c3
3
+ size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata ADDED
Binary file (135 kB). View file
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e298fd40301ba45413b283973cd8d8e5621aaf4e04fbcc6457a7d3aa1ee192f7
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bf14a2fb616c05d999cebdf8868df70dae195fd394e777f0252eff0a697a810
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4ceb6dac62a82a3ca694dafdd2ce1c65b970e0b228dbc4edf9f5a4477f91591
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc9dbbb69cf055042499b53359622524319ec9aa9fc08cce8dbce5002e876f0e
3
+ size 6966784
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac6e12494d9f7beb5ad6444fb7b42c6b4b9d47ff7b7ca2046a4c699d82e5ceab
3
+ size 14960
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f24c20bed7146e8592ee7be19541201d084650afd67df0962abfe8cb93aa74cb
3
+ size 14960
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e27bc7dfff1fc3d0a8365aa488679bcf8ed7a0f5020e7b9fd6882657a1101ed
3
+ size 14960
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0858549c02f50b168ee3460d47e117e21644685b378f1f8120021848c7536ba
3
+ size 14960
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86c2b2c481d06007bb7d9563886a1f85c531ffc044fd1a193b9a050c24445082
3
+ size 1064
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,805 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.24752475247524752,
5
+ "eval_steps": 20,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0,
13
+ "eval_accuracy": 0.7339246119733924,
14
+ "eval_f1": 0.24528301886792453,
15
+ "eval_loss": 0.6025775074958801,
16
+ "eval_precision": 0.6,
17
+ "eval_recall": 0.1541501976284585,
18
+ "eval_runtime": 48.5969,
19
+ "eval_samples_per_second": 5.679,
20
+ "eval_steps_per_second": 0.185,
21
+ "step": 0
22
+ },
23
+ {
24
+ "epoch": 0.0024752475247524753,
25
+ "grad_norm": 2.356825113296509,
26
+ "learning_rate": 4.878048780487805e-07,
27
+ "loss": 0.6028,
28
+ "step": 1
29
+ },
30
+ {
31
+ "epoch": 0.0049504950495049506,
32
+ "grad_norm": 1.4082318544387817,
33
+ "learning_rate": 9.75609756097561e-07,
34
+ "loss": 0.5236,
35
+ "step": 2
36
+ },
37
+ {
38
+ "epoch": 0.007425742574257425,
39
+ "grad_norm": 1.3720033168792725,
40
+ "learning_rate": 1.4634146341463414e-06,
41
+ "loss": 0.5333,
42
+ "step": 3
43
+ },
44
+ {
45
+ "epoch": 0.009900990099009901,
46
+ "grad_norm": 1.050480842590332,
47
+ "learning_rate": 1.951219512195122e-06,
48
+ "loss": 0.5034,
49
+ "step": 4
50
+ },
51
+ {
52
+ "epoch": 0.012376237623762377,
53
+ "grad_norm": 1.6233748197555542,
54
+ "learning_rate": 2.4390243902439027e-06,
55
+ "loss": 0.552,
56
+ "step": 5
57
+ },
58
+ {
59
+ "epoch": 0.01485148514851485,
60
+ "grad_norm": 1.229498267173767,
61
+ "learning_rate": 2.926829268292683e-06,
62
+ "loss": 0.5009,
63
+ "step": 6
64
+ },
65
+ {
66
+ "epoch": 0.017326732673267328,
67
+ "grad_norm": 1.7190988063812256,
68
+ "learning_rate": 3.414634146341464e-06,
69
+ "loss": 0.593,
70
+ "step": 7
71
+ },
72
+ {
73
+ "epoch": 0.019801980198019802,
74
+ "grad_norm": 1.6073822975158691,
75
+ "learning_rate": 3.902439024390244e-06,
76
+ "loss": 0.5551,
77
+ "step": 8
78
+ },
79
+ {
80
+ "epoch": 0.022277227722772276,
81
+ "grad_norm": 1.680944800376892,
82
+ "learning_rate": 4.390243902439025e-06,
83
+ "loss": 0.5735,
84
+ "step": 9
85
+ },
86
+ {
87
+ "epoch": 0.024752475247524754,
88
+ "grad_norm": 1.4425185918807983,
89
+ "learning_rate": 4.8780487804878055e-06,
90
+ "loss": 0.5361,
91
+ "step": 10
92
+ },
93
+ {
94
+ "epoch": 0.027227722772277228,
95
+ "grad_norm": 1.679691195487976,
96
+ "learning_rate": 5.365853658536586e-06,
97
+ "loss": 0.5455,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 0.0297029702970297,
102
+ "grad_norm": 1.3832247257232666,
103
+ "learning_rate": 5.853658536585366e-06,
104
+ "loss": 0.5082,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 0.03217821782178218,
109
+ "grad_norm": 1.7135882377624512,
110
+ "learning_rate": 6.341463414634147e-06,
111
+ "loss": 0.5421,
112
+ "step": 13
113
+ },
114
+ {
115
+ "epoch": 0.034653465346534656,
116
+ "grad_norm": 2.4003047943115234,
117
+ "learning_rate": 6.829268292682928e-06,
118
+ "loss": 0.6302,
119
+ "step": 14
120
+ },
121
+ {
122
+ "epoch": 0.03712871287128713,
123
+ "grad_norm": 1.6961004734039307,
124
+ "learning_rate": 7.317073170731707e-06,
125
+ "loss": 0.5604,
126
+ "step": 15
127
+ },
128
+ {
129
+ "epoch": 0.039603960396039604,
130
+ "grad_norm": 1.8884222507476807,
131
+ "learning_rate": 7.804878048780489e-06,
132
+ "loss": 0.5602,
133
+ "step": 16
134
+ },
135
+ {
136
+ "epoch": 0.04207920792079208,
137
+ "grad_norm": 1.4523755311965942,
138
+ "learning_rate": 8.292682926829268e-06,
139
+ "loss": 0.5568,
140
+ "step": 17
141
+ },
142
+ {
143
+ "epoch": 0.04455445544554455,
144
+ "grad_norm": 1.5286188125610352,
145
+ "learning_rate": 8.78048780487805e-06,
146
+ "loss": 0.5672,
147
+ "step": 18
148
+ },
149
+ {
150
+ "epoch": 0.04702970297029703,
151
+ "grad_norm": 1.2915096282958984,
152
+ "learning_rate": 9.268292682926831e-06,
153
+ "loss": 0.5099,
154
+ "step": 19
155
+ },
156
+ {
157
+ "epoch": 0.04950495049504951,
158
+ "grad_norm": 1.2785226106643677,
159
+ "learning_rate": 9.756097560975611e-06,
160
+ "loss": 0.5522,
161
+ "step": 20
162
+ },
163
+ {
164
+ "epoch": 0.04950495049504951,
165
+ "eval_accuracy": 0.7394678492239468,
166
+ "eval_f1": 0.38961038961038963,
167
+ "eval_loss": 0.5767667293548584,
168
+ "eval_precision": 0.5681818181818182,
169
+ "eval_recall": 0.2964426877470356,
170
+ "eval_runtime": 48.4333,
171
+ "eval_samples_per_second": 5.699,
172
+ "eval_steps_per_second": 0.186,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 0.05198019801980198,
177
+ "grad_norm": 1.2662467956542969,
178
+ "learning_rate": 1.024390243902439e-05,
179
+ "loss": 0.5133,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 0.054455445544554455,
184
+ "grad_norm": 1.4190106391906738,
185
+ "learning_rate": 1.0731707317073172e-05,
186
+ "loss": 0.5608,
187
+ "step": 22
188
+ },
189
+ {
190
+ "epoch": 0.05693069306930693,
191
+ "grad_norm": 1.5969791412353516,
192
+ "learning_rate": 1.1219512195121953e-05,
193
+ "loss": 0.5697,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 0.0594059405940594,
198
+ "grad_norm": 1.6572206020355225,
199
+ "learning_rate": 1.1707317073170731e-05,
200
+ "loss": 0.5746,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 0.06188118811881188,
205
+ "grad_norm": 1.3676258325576782,
206
+ "learning_rate": 1.2195121951219513e-05,
207
+ "loss": 0.5155,
208
+ "step": 25
209
+ },
210
+ {
211
+ "epoch": 0.06435643564356436,
212
+ "grad_norm": 1.7536897659301758,
213
+ "learning_rate": 1.2682926829268294e-05,
214
+ "loss": 0.4858,
215
+ "step": 26
216
+ },
217
+ {
218
+ "epoch": 0.06683168316831684,
219
+ "grad_norm": 1.7013200521469116,
220
+ "learning_rate": 1.3170731707317076e-05,
221
+ "loss": 0.5188,
222
+ "step": 27
223
+ },
224
+ {
225
+ "epoch": 0.06930693069306931,
226
+ "grad_norm": 1.9813508987426758,
227
+ "learning_rate": 1.3658536585365855e-05,
228
+ "loss": 0.4945,
229
+ "step": 28
230
+ },
231
+ {
232
+ "epoch": 0.07178217821782178,
233
+ "grad_norm": 1.3483078479766846,
234
+ "learning_rate": 1.4146341463414635e-05,
235
+ "loss": 0.508,
236
+ "step": 29
237
+ },
238
+ {
239
+ "epoch": 0.07425742574257425,
240
+ "grad_norm": 1.829410433769226,
241
+ "learning_rate": 1.4634146341463415e-05,
242
+ "loss": 0.4897,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.07673267326732673,
247
+ "grad_norm": 1.6702966690063477,
248
+ "learning_rate": 1.5121951219512196e-05,
249
+ "loss": 0.5073,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.07920792079207921,
254
+ "grad_norm": 1.5164490938186646,
255
+ "learning_rate": 1.5609756097560978e-05,
256
+ "loss": 0.502,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.08168316831683169,
261
+ "grad_norm": 1.3430652618408203,
262
+ "learning_rate": 1.6097560975609757e-05,
263
+ "loss": 0.484,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.08415841584158416,
268
+ "grad_norm": 1.8384352922439575,
269
+ "learning_rate": 1.6585365853658537e-05,
270
+ "loss": 0.507,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.08663366336633663,
275
+ "grad_norm": 1.4320778846740723,
276
+ "learning_rate": 1.7073170731707317e-05,
277
+ "loss": 0.503,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.0891089108910891,
282
+ "grad_norm": 1.949724793434143,
283
+ "learning_rate": 1.75609756097561e-05,
284
+ "loss": 0.4447,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.09158415841584158,
289
+ "grad_norm": 1.4103795289993286,
290
+ "learning_rate": 1.804878048780488e-05,
291
+ "loss": 0.4747,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.09405940594059406,
296
+ "grad_norm": 1.0967051982879639,
297
+ "learning_rate": 1.8536585365853663e-05,
298
+ "loss": 0.4207,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.09653465346534654,
303
+ "grad_norm": 1.5804564952850342,
304
+ "learning_rate": 1.902439024390244e-05,
305
+ "loss": 0.469,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.09900990099009901,
310
+ "grad_norm": 1.444751501083374,
311
+ "learning_rate": 1.9512195121951222e-05,
312
+ "loss": 0.4818,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.09900990099009901,
317
+ "eval_accuracy": 0.7760532150776053,
318
+ "eval_f1": 0.4846938775510204,
319
+ "eval_loss": 0.48587334156036377,
320
+ "eval_precision": 0.6834532374100719,
321
+ "eval_recall": 0.37549407114624506,
322
+ "eval_runtime": 48.205,
323
+ "eval_samples_per_second": 5.726,
324
+ "eval_steps_per_second": 0.187,
325
+ "step": 40
326
+ },
327
+ {
328
+ "epoch": 0.10148514851485149,
329
+ "grad_norm": 1.3895819187164307,
330
+ "learning_rate": 2e-05,
331
+ "loss": 0.3838,
332
+ "step": 41
333
+ },
334
+ {
335
+ "epoch": 0.10396039603960396,
336
+ "grad_norm": 1.7078603506088257,
337
+ "learning_rate": 1.9999625498303936e-05,
338
+ "loss": 0.409,
339
+ "step": 42
340
+ },
341
+ {
342
+ "epoch": 0.10643564356435643,
343
+ "grad_norm": 1.5501896142959595,
344
+ "learning_rate": 1.999850202126604e-05,
345
+ "loss": 0.4403,
346
+ "step": 43
347
+ },
348
+ {
349
+ "epoch": 0.10891089108910891,
350
+ "grad_norm": 1.7278761863708496,
351
+ "learning_rate": 1.9996629653035128e-05,
352
+ "loss": 0.4193,
353
+ "step": 44
354
+ },
355
+ {
356
+ "epoch": 0.11138613861386139,
357
+ "grad_norm": 1.9645085334777832,
358
+ "learning_rate": 1.999400853385221e-05,
359
+ "loss": 0.3859,
360
+ "step": 45
361
+ },
362
+ {
363
+ "epoch": 0.11386138613861387,
364
+ "grad_norm": 1.9028494358062744,
365
+ "learning_rate": 1.9990638860040007e-05,
366
+ "loss": 0.3904,
367
+ "step": 46
368
+ },
369
+ {
370
+ "epoch": 0.11633663366336634,
371
+ "grad_norm": 2.2867023944854736,
372
+ "learning_rate": 1.9986520883988233e-05,
373
+ "loss": 0.4071,
374
+ "step": 47
375
+ },
376
+ {
377
+ "epoch": 0.1188118811881188,
378
+ "grad_norm": 2.1702394485473633,
379
+ "learning_rate": 1.9981654914134684e-05,
380
+ "loss": 0.3845,
381
+ "step": 48
382
+ },
383
+ {
384
+ "epoch": 0.12128712871287128,
385
+ "grad_norm": 2.050438642501831,
386
+ "learning_rate": 1.9976041314942156e-05,
387
+ "loss": 0.3667,
388
+ "step": 49
389
+ },
390
+ {
391
+ "epoch": 0.12376237623762376,
392
+ "grad_norm": 2.091747522354126,
393
+ "learning_rate": 1.9969680506871138e-05,
394
+ "loss": 0.4383,
395
+ "step": 50
396
+ },
397
+ {
398
+ "epoch": 0.12623762376237624,
399
+ "grad_norm": 2.5285873413085938,
400
+ "learning_rate": 1.99625729663483e-05,
401
+ "loss": 0.3441,
402
+ "step": 51
403
+ },
404
+ {
405
+ "epoch": 0.12871287128712872,
406
+ "grad_norm": 2.5812807083129883,
407
+ "learning_rate": 1.9954719225730847e-05,
408
+ "loss": 0.3605,
409
+ "step": 52
410
+ },
411
+ {
412
+ "epoch": 0.1311881188118812,
413
+ "grad_norm": 2.215435028076172,
414
+ "learning_rate": 1.9946119873266615e-05,
415
+ "loss": 0.3959,
416
+ "step": 53
417
+ },
418
+ {
419
+ "epoch": 0.13366336633663367,
420
+ "grad_norm": 2.313823938369751,
421
+ "learning_rate": 1.9936775553050017e-05,
422
+ "loss": 0.3524,
423
+ "step": 54
424
+ },
425
+ {
426
+ "epoch": 0.13613861386138615,
427
+ "grad_norm": 2.2460765838623047,
428
+ "learning_rate": 1.9926686964973813e-05,
429
+ "loss": 0.3696,
430
+ "step": 55
431
+ },
432
+ {
433
+ "epoch": 0.13861386138613863,
434
+ "grad_norm": 2.4624035358428955,
435
+ "learning_rate": 1.9915854864676665e-05,
436
+ "loss": 0.3482,
437
+ "step": 56
438
+ },
439
+ {
440
+ "epoch": 0.14108910891089108,
441
+ "grad_norm": 3.2622599601745605,
442
+ "learning_rate": 1.9904280063486563e-05,
443
+ "loss": 0.3012,
444
+ "step": 57
445
+ },
446
+ {
447
+ "epoch": 0.14356435643564355,
448
+ "grad_norm": 3.04482364654541,
449
+ "learning_rate": 1.9891963428360043e-05,
450
+ "loss": 0.3131,
451
+ "step": 58
452
+ },
453
+ {
454
+ "epoch": 0.14603960396039603,
455
+ "grad_norm": 2.944631576538086,
456
+ "learning_rate": 1.9878905881817254e-05,
457
+ "loss": 0.291,
458
+ "step": 59
459
+ },
460
+ {
461
+ "epoch": 0.1485148514851485,
462
+ "grad_norm": 2.953406572341919,
463
+ "learning_rate": 1.9865108401872856e-05,
464
+ "loss": 0.3892,
465
+ "step": 60
466
+ },
467
+ {
468
+ "epoch": 0.1485148514851485,
469
+ "eval_accuracy": 0.7982261640798226,
470
+ "eval_f1": 0.5991189427312775,
471
+ "eval_loss": 0.4218328297138214,
472
+ "eval_precision": 0.6766169154228856,
473
+ "eval_recall": 0.5375494071146245,
474
+ "eval_runtime": 48.1494,
475
+ "eval_samples_per_second": 5.732,
476
+ "eval_steps_per_second": 0.187,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.15099009900990099,
481
+ "grad_norm": 3.373743772506714,
482
+ "learning_rate": 1.9850572021962788e-05,
483
+ "loss": 0.3091,
484
+ "step": 61
485
+ },
486
+ {
487
+ "epoch": 0.15346534653465346,
488
+ "grad_norm": 3.0653586387634277,
489
+ "learning_rate": 1.9835297830866827e-05,
490
+ "loss": 0.3295,
491
+ "step": 62
492
+ },
493
+ {
494
+ "epoch": 0.15594059405940594,
495
+ "grad_norm": 4.514039516448975,
496
+ "learning_rate": 1.9819286972627066e-05,
497
+ "loss": 0.3336,
498
+ "step": 63
499
+ },
500
+ {
501
+ "epoch": 0.15841584158415842,
502
+ "grad_norm": 4.89851713180542,
503
+ "learning_rate": 1.980254064646223e-05,
504
+ "loss": 0.2786,
505
+ "step": 64
506
+ },
507
+ {
508
+ "epoch": 0.1608910891089109,
509
+ "grad_norm": 3.4198689460754395,
510
+ "learning_rate": 1.9785060106677818e-05,
511
+ "loss": 0.2681,
512
+ "step": 65
513
+ },
514
+ {
515
+ "epoch": 0.16336633663366337,
516
+ "grad_norm": 3.666618824005127,
517
+ "learning_rate": 1.976684666257219e-05,
518
+ "loss": 0.3517,
519
+ "step": 66
520
+ },
521
+ {
522
+ "epoch": 0.16584158415841585,
523
+ "grad_norm": 2.8845083713531494,
524
+ "learning_rate": 1.9747901678338496e-05,
525
+ "loss": 0.264,
526
+ "step": 67
527
+ },
528
+ {
529
+ "epoch": 0.16831683168316833,
530
+ "grad_norm": 3.752220630645752,
531
+ "learning_rate": 1.9728226572962474e-05,
532
+ "loss": 0.3364,
533
+ "step": 68
534
+ },
535
+ {
536
+ "epoch": 0.1707920792079208,
537
+ "grad_norm": 2.955590009689331,
538
+ "learning_rate": 1.9707822820116193e-05,
539
+ "loss": 0.2992,
540
+ "step": 69
541
+ },
542
+ {
543
+ "epoch": 0.17326732673267325,
544
+ "grad_norm": 3.232114553451538,
545
+ "learning_rate": 1.9686691948047665e-05,
546
+ "loss": 0.2776,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.17574257425742573,
551
+ "grad_norm": 3.167881727218628,
552
+ "learning_rate": 1.966483553946637e-05,
553
+ "loss": 0.2817,
554
+ "step": 71
555
+ },
556
+ {
557
+ "epoch": 0.1782178217821782,
558
+ "grad_norm": 2.4919188022613525,
559
+ "learning_rate": 1.964225523142473e-05,
560
+ "loss": 0.2218,
561
+ "step": 72
562
+ },
563
+ {
564
+ "epoch": 0.1806930693069307,
565
+ "grad_norm": 5.347751617431641,
566
+ "learning_rate": 1.9618952715195476e-05,
567
+ "loss": 0.409,
568
+ "step": 73
569
+ },
570
+ {
571
+ "epoch": 0.18316831683168316,
572
+ "grad_norm": 2.9692351818084717,
573
+ "learning_rate": 1.9594929736144978e-05,
574
+ "loss": 0.2817,
575
+ "step": 74
576
+ },
577
+ {
578
+ "epoch": 0.18564356435643564,
579
+ "grad_norm": 3.427264928817749,
580
+ "learning_rate": 1.9570188093602512e-05,
581
+ "loss": 0.2858,
582
+ "step": 75
583
+ },
584
+ {
585
+ "epoch": 0.18811881188118812,
586
+ "grad_norm": 3.0406219959259033,
587
+ "learning_rate": 1.95447296407255e-05,
588
+ "loss": 0.2773,
589
+ "step": 76
590
+ },
591
+ {
592
+ "epoch": 0.1905940594059406,
593
+ "grad_norm": 1.9700223207473755,
594
+ "learning_rate": 1.9518556284360696e-05,
595
+ "loss": 0.2827,
596
+ "step": 77
597
+ },
598
+ {
599
+ "epoch": 0.19306930693069307,
600
+ "grad_norm": 2.567990303039551,
601
+ "learning_rate": 1.9491669984901377e-05,
602
+ "loss": 0.2649,
603
+ "step": 78
604
+ },
605
+ {
606
+ "epoch": 0.19554455445544555,
607
+ "grad_norm": 2.3955078125,
608
+ "learning_rate": 1.9464072756140487e-05,
609
+ "loss": 0.252,
610
+ "step": 79
611
+ },
612
+ {
613
+ "epoch": 0.19801980198019803,
614
+ "grad_norm": 2.5083565711975098,
615
+ "learning_rate": 1.9435766665119823e-05,
616
+ "loss": 0.2916,
617
+ "step": 80
618
+ },
619
+ {
620
+ "epoch": 0.19801980198019803,
621
+ "eval_accuracy": 0.8237250554323725,
622
+ "eval_f1": 0.6276346604215457,
623
+ "eval_loss": 0.37471064925193787,
624
+ "eval_precision": 0.7701149425287356,
625
+ "eval_recall": 0.5296442687747036,
626
+ "eval_runtime": 47.5247,
627
+ "eval_samples_per_second": 5.808,
628
+ "eval_steps_per_second": 0.189,
629
+ "step": 80
630
+ },
631
+ {
632
+ "epoch": 0.2004950495049505,
633
+ "grad_norm": 1.8391790390014648,
634
+ "learning_rate": 1.9406753831975202e-05,
635
+ "loss": 0.2714,
636
+ "step": 81
637
+ },
638
+ {
639
+ "epoch": 0.20297029702970298,
640
+ "grad_norm": 2.8943684101104736,
641
+ "learning_rate": 1.9377036429777673e-05,
642
+ "loss": 0.3109,
643
+ "step": 82
644
+ },
645
+ {
646
+ "epoch": 0.20544554455445543,
647
+ "grad_norm": 4.071371555328369,
648
+ "learning_rate": 1.934661668437073e-05,
649
+ "loss": 0.2958,
650
+ "step": 83
651
+ },
652
+ {
653
+ "epoch": 0.2079207920792079,
654
+ "grad_norm": 2.5396947860717773,
655
+ "learning_rate": 1.9315496874203637e-05,
656
+ "loss": 0.2573,
657
+ "step": 84
658
+ },
659
+ {
660
+ "epoch": 0.2103960396039604,
661
+ "grad_norm": 2.732813596725464,
662
+ "learning_rate": 1.9283679330160726e-05,
663
+ "loss": 0.2796,
664
+ "step": 85
665
+ },
666
+ {
667
+ "epoch": 0.21287128712871287,
668
+ "grad_norm": 2.0551230907440186,
669
+ "learning_rate": 1.9251166435386837e-05,
670
+ "loss": 0.2601,
671
+ "step": 86
672
+ },
673
+ {
674
+ "epoch": 0.21534653465346534,
675
+ "grad_norm": 2.191950798034668,
676
+ "learning_rate": 1.921796062510882e-05,
677
+ "loss": 0.2444,
678
+ "step": 87
679
+ },
680
+ {
681
+ "epoch": 0.21782178217821782,
682
+ "grad_norm": 2.024583101272583,
683
+ "learning_rate": 1.9184064386453127e-05,
684
+ "loss": 0.2398,
685
+ "step": 88
686
+ },
687
+ {
688
+ "epoch": 0.2202970297029703,
689
+ "grad_norm": 1.9806023836135864,
690
+ "learning_rate": 1.9149480258259535e-05,
691
+ "loss": 0.2618,
692
+ "step": 89
693
+ },
694
+ {
695
+ "epoch": 0.22277227722772278,
696
+ "grad_norm": 2.0127272605895996,
697
+ "learning_rate": 1.911421083089097e-05,
698
+ "loss": 0.3024,
699
+ "step": 90
700
+ },
701
+ {
702
+ "epoch": 0.22524752475247525,
703
+ "grad_norm": 2.1638479232788086,
704
+ "learning_rate": 1.907825874603951e-05,
705
+ "loss": 0.2853,
706
+ "step": 91
707
+ },
708
+ {
709
+ "epoch": 0.22772277227722773,
710
+ "grad_norm": 2.1442925930023193,
711
+ "learning_rate": 1.9041626696528503e-05,
712
+ "loss": 0.2803,
713
+ "step": 92
714
+ },
715
+ {
716
+ "epoch": 0.2301980198019802,
717
+ "grad_norm": 2.100146770477295,
718
+ "learning_rate": 1.9004317426110888e-05,
719
+ "loss": 0.1991,
720
+ "step": 93
721
+ },
722
+ {
723
+ "epoch": 0.23267326732673269,
724
+ "grad_norm": 2.472929000854492,
725
+ "learning_rate": 1.8966333729263674e-05,
726
+ "loss": 0.2588,
727
+ "step": 94
728
+ },
729
+ {
730
+ "epoch": 0.23514851485148514,
731
+ "grad_norm": 2.93789005279541,
732
+ "learning_rate": 1.892767845097864e-05,
733
+ "loss": 0.2359,
734
+ "step": 95
735
+ },
736
+ {
737
+ "epoch": 0.2376237623762376,
738
+ "grad_norm": 3.479299545288086,
739
+ "learning_rate": 1.8888354486549238e-05,
740
+ "loss": 0.3254,
741
+ "step": 96
742
+ },
743
+ {
744
+ "epoch": 0.2400990099009901,
745
+ "grad_norm": 3.940781593322754,
746
+ "learning_rate": 1.8848364781353744e-05,
747
+ "loss": 0.2681,
748
+ "step": 97
749
+ },
750
+ {
751
+ "epoch": 0.24257425742574257,
752
+ "grad_norm": 2.455169916152954,
753
+ "learning_rate": 1.8807712330634645e-05,
754
+ "loss": 0.2624,
755
+ "step": 98
756
+ },
757
+ {
758
+ "epoch": 0.24504950495049505,
759
+ "grad_norm": 2.5762863159179688,
760
+ "learning_rate": 1.8766400179274287e-05,
761
+ "loss": 0.1947,
762
+ "step": 99
763
+ },
764
+ {
765
+ "epoch": 0.24752475247524752,
766
+ "grad_norm": 2.557237148284912,
767
+ "learning_rate": 1.8724431421566822e-05,
768
+ "loss": 0.2191,
769
+ "step": 100
770
+ },
771
+ {
772
+ "epoch": 0.24752475247524752,
773
+ "eval_accuracy": 0.8303769401330376,
774
+ "eval_f1": 0.6466512702078522,
775
+ "eval_loss": 0.35379520058631897,
776
+ "eval_precision": 0.7777777777777778,
777
+ "eval_recall": 0.5533596837944664,
778
+ "eval_runtime": 48.1818,
779
+ "eval_samples_per_second": 5.728,
780
+ "eval_steps_per_second": 0.187,
781
+ "step": 100
782
+ }
783
+ ],
784
+ "logging_steps": 1,
785
+ "max_steps": 404,
786
+ "num_input_tokens_seen": 0,
787
+ "num_train_epochs": 1,
788
+ "save_steps": 100,
789
+ "stateful_callbacks": {
790
+ "TrainerControl": {
791
+ "args": {
792
+ "should_epoch_stop": false,
793
+ "should_evaluate": false,
794
+ "should_log": false,
795
+ "should_save": true,
796
+ "should_training_stop": false
797
+ },
798
+ "attributes": {}
799
+ }
800
+ },
801
+ "total_flos": 6.099415795027149e+16,
802
+ "train_batch_size": 8,
803
+ "trial_name": null,
804
+ "trial_params": null
805
+ }