plip committed on
Commit
bc6cb0f
1 Parent(s): 0145f9c

Training in progress, step 20000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a9b46ed6af836e8924481b9ef3b8ac908f671a8de3334b85575e607c66e5a72
3
  size 202193937
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65debe850847edf2f62bba0deb29483af297ffa34c6cdcd66a55832044fcc8ec
3
  size 202193937
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fa1850686699db6f7d38adef4e1f7555c082f72c52d9543757d96d6e7dfe575
3
  size 102501541
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6f55f7fb7116bd42203f7b9313d6b9f08f6a023278c3f67be7ce3fba873b5dc
3
  size 102501541
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f63b6e7591877af348d0715da8e499a40da21a9ac10ea61d3d17f2882b7374e
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:006a29e4d6e5dacaa8f0d3ba56263cd7f1775a1129713c79ccc62a2c04246894
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4931c97be633a6e3808391ac03b792837805ded64ff2a578babfa79e60dc22a7
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c84272d9e8a15cd78c706d16b77833d53dbfbd182e8ad79e3cd658ef6c3eaaf6
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.23299704093758009,
5
- "global_step": 10000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -206,11 +206,211 @@
206
  "eval_samples_per_second": 1737.882,
207
  "eval_steps_per_second": 27.673,
208
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  }
210
  ],
211
  "max_steps": 500000,
212
  "num_train_epochs": 12,
213
- "total_flos": 3.194871387745e+20,
214
  "trial_name": null,
215
  "trial_params": null
216
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.46599408187516017,
5
+ "global_step": 20000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
206
  "eval_samples_per_second": 1737.882,
207
  "eval_steps_per_second": 27.673,
208
  "step": 10000
209
+ },
210
+ {
211
+ "epoch": 0.24,
212
+ "learning_rate": 0.00012599999999999997,
213
+ "loss": 0.5516,
214
+ "step": 10500
215
+ },
216
+ {
217
+ "epoch": 0.26,
218
+ "learning_rate": 0.00013199999999999998,
219
+ "loss": 0.5428,
220
+ "step": 11000
221
+ },
222
+ {
223
+ "epoch": 0.26,
224
+ "eval_loss": 0.5300613045692444,
225
+ "eval_runtime": 1.2658,
226
+ "eval_samples_per_second": 1736.485,
227
+ "eval_steps_per_second": 27.651,
228
+ "step": 11000
229
+ },
230
+ {
231
+ "epoch": 0.27,
232
+ "learning_rate": 0.000138,
233
+ "loss": 0.5346,
234
+ "step": 11500
235
+ },
236
+ {
237
+ "epoch": 0.28,
238
+ "learning_rate": 0.00014399999999999998,
239
+ "loss": 0.527,
240
+ "step": 12000
241
+ },
242
+ {
243
+ "epoch": 0.28,
244
+ "eval_loss": 0.5140743255615234,
245
+ "eval_runtime": 1.2609,
246
+ "eval_samples_per_second": 1743.192,
247
+ "eval_steps_per_second": 27.758,
248
+ "step": 12000
249
+ },
250
+ {
251
+ "epoch": 0.29,
252
+ "learning_rate": 0.00015,
253
+ "loss": 0.5203,
254
+ "step": 12500
255
+ },
256
+ {
257
+ "epoch": 0.3,
258
+ "learning_rate": 0.000156,
259
+ "loss": 0.5142,
260
+ "step": 13000
261
+ },
262
+ {
263
+ "epoch": 0.3,
264
+ "eval_loss": 0.5022566318511963,
265
+ "eval_runtime": 1.2481,
266
+ "eval_samples_per_second": 1761.066,
267
+ "eval_steps_per_second": 28.042,
268
+ "step": 13000
269
+ },
270
+ {
271
+ "epoch": 0.31,
272
+ "learning_rate": 0.000162,
273
+ "loss": 0.5079,
274
+ "step": 13500
275
+ },
276
+ {
277
+ "epoch": 0.33,
278
+ "learning_rate": 0.000168,
279
+ "loss": 0.5002,
280
+ "step": 14000
281
+ },
282
+ {
283
+ "epoch": 0.33,
284
+ "eval_loss": 0.4841987192630768,
285
+ "eval_runtime": 1.3176,
286
+ "eval_samples_per_second": 1668.178,
287
+ "eval_steps_per_second": 26.563,
288
+ "step": 14000
289
+ },
290
+ {
291
+ "epoch": 0.34,
292
+ "learning_rate": 0.00017399999999999997,
293
+ "loss": 0.4916,
294
+ "step": 14500
295
+ },
296
+ {
297
+ "epoch": 0.35,
298
+ "learning_rate": 0.00017999999999999998,
299
+ "loss": 0.4852,
300
+ "step": 15000
301
+ },
302
+ {
303
+ "epoch": 0.35,
304
+ "eval_loss": 0.47066375613212585,
305
+ "eval_runtime": 1.3026,
306
+ "eval_samples_per_second": 1687.422,
307
+ "eval_steps_per_second": 26.87,
308
+ "step": 15000
309
+ },
310
+ {
311
+ "epoch": 0.36,
312
+ "learning_rate": 0.000186,
313
+ "loss": 0.4778,
314
+ "step": 15500
315
+ },
316
+ {
317
+ "epoch": 0.37,
318
+ "learning_rate": 0.00019199999999999998,
319
+ "loss": 0.4697,
320
+ "step": 16000
321
+ },
322
+ {
323
+ "epoch": 0.37,
324
+ "eval_loss": 0.4562186300754547,
325
+ "eval_runtime": 1.2895,
326
+ "eval_samples_per_second": 1704.503,
327
+ "eval_steps_per_second": 27.142,
328
+ "step": 16000
329
+ },
330
+ {
331
+ "epoch": 0.38,
332
+ "learning_rate": 0.000198,
333
+ "loss": 0.4636,
334
+ "step": 16500
335
+ },
336
+ {
337
+ "epoch": 0.4,
338
+ "learning_rate": 0.000204,
339
+ "loss": 0.459,
340
+ "step": 17000
341
+ },
342
+ {
343
+ "epoch": 0.4,
344
+ "eval_loss": 0.44295796751976013,
345
+ "eval_runtime": 1.2516,
346
+ "eval_samples_per_second": 1756.093,
347
+ "eval_steps_per_second": 27.963,
348
+ "step": 17000
349
+ },
350
+ {
351
+ "epoch": 0.41,
352
+ "learning_rate": 0.00020999999999999998,
353
+ "loss": 0.4544,
354
+ "step": 17500
355
+ },
356
+ {
357
+ "epoch": 0.42,
358
+ "learning_rate": 0.00021599999999999996,
359
+ "loss": 0.45,
360
+ "step": 18000
361
+ },
362
+ {
363
+ "epoch": 0.42,
364
+ "eval_loss": 0.4343169629573822,
365
+ "eval_runtime": 1.327,
366
+ "eval_samples_per_second": 1656.351,
367
+ "eval_steps_per_second": 26.375,
368
+ "step": 18000
369
+ },
370
+ {
371
+ "epoch": 0.43,
372
+ "learning_rate": 0.00022199999999999998,
373
+ "loss": 0.4458,
374
+ "step": 18500
375
+ },
376
+ {
377
+ "epoch": 0.44,
378
+ "learning_rate": 0.00022799999999999999,
379
+ "loss": 0.4411,
380
+ "step": 19000
381
+ },
382
+ {
383
+ "epoch": 0.44,
384
+ "eval_loss": 0.42496559023857117,
385
+ "eval_runtime": 1.3285,
386
+ "eval_samples_per_second": 1654.492,
387
+ "eval_steps_per_second": 26.345,
388
+ "step": 19000
389
+ },
390
+ {
391
+ "epoch": 0.45,
392
+ "learning_rate": 0.000234,
393
+ "loss": 0.4368,
394
+ "step": 19500
395
+ },
396
+ {
397
+ "epoch": 0.47,
398
+ "learning_rate": 0.00023999999999999998,
399
+ "loss": 0.4322,
400
+ "step": 20000
401
+ },
402
+ {
403
+ "epoch": 0.47,
404
+ "eval_loss": 0.4132905900478363,
405
+ "eval_runtime": 1.3158,
406
+ "eval_samples_per_second": 1670.425,
407
+ "eval_steps_per_second": 26.599,
408
+ "step": 20000
409
  }
410
  ],
411
  "max_steps": 500000,
412
  "num_train_epochs": 12,
413
+ "total_flos": 6.38974277549e+20,
414
  "trial_name": null,
415
  "trial_params": null
416
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fa1850686699db6f7d38adef4e1f7555c082f72c52d9543757d96d6e7dfe575
3
  size 102501541
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6f55f7fb7116bd42203f7b9313d6b9f08f6a023278c3f67be7ce3fba873b5dc
3
  size 102501541