mika5883 commited on
Commit
f1a9de1
1 Parent(s): 20df67e

Training in progress, step 34500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e92f9d6fa8f3943fa6d1539e3fc9c8082440671fbbfe767f34bd7b0b0b7bbcb
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0931cfc0502a489316d773e7e3fbc87d1bf87d22980010d8e5debce1a389ab5a
3
  size 891644712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b52a6542aae59b9e8dbc260d927ca0f2fdc030aeecaf2628fab24bffbce8b859
3
  size 1783444357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68488d7f6b13bf4345eed8b0bddb03bc654697dc1a0ce2e2c44855609a96ee79
3
  size 1783444357
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05a1c8d1b4a5f5df3241839fbe5d279cd2d58633aba6dc35472c1b5348efdb84
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2668009309a225b4528d1c2be158a46d8643edce60db33568885f84d9153d0f5
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:473385105464e09d9e2cd49bbce42350523e82d12bca7455cb9d145ff112729d
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c659c1c793579d80447241ae65c0f0bd61aec30b0f2e6c845263cb4857f12c85
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.1152,
5
  "eval_steps": 500,
6
- "global_step": 18000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -259,6 +259,237 @@
259
  "learning_rate": 4.712096e-05,
260
  "loss": 0.412,
261
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  }
263
  ],
264
  "logging_steps": 500,
@@ -278,7 +509,7 @@
278
  "attributes": {}
279
  }
280
  },
281
- "total_flos": 8.768993624064e+16,
282
  "train_batch_size": 64,
283
  "trial_name": null,
284
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2208,
5
  "eval_steps": 500,
6
+ "global_step": 34500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
259
  "learning_rate": 4.712096e-05,
260
  "loss": 0.412,
261
  "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.1184,
265
+ "grad_norm": 0.9078772664070129,
266
+ "learning_rate": 4.704096e-05,
267
+ "loss": 0.4043,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.1216,
272
+ "grad_norm": 1.082939624786377,
273
+ "learning_rate": 4.696112e-05,
274
+ "loss": 0.4045,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 0.1248,
279
+ "grad_norm": 0.9159390926361084,
280
+ "learning_rate": 4.688112e-05,
281
+ "loss": 0.4098,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 0.128,
286
+ "grad_norm": 0.8420547842979431,
287
+ "learning_rate": 4.680128e-05,
288
+ "loss": 0.4033,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 0.1312,
293
+ "grad_norm": 0.7658286094665527,
294
+ "learning_rate": 4.672128e-05,
295
+ "loss": 0.4002,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 0.1344,
300
+ "grad_norm": 0.9074057340621948,
301
+ "learning_rate": 4.664128e-05,
302
+ "loss": 0.3964,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 0.1376,
307
+ "grad_norm": 0.6065025329589844,
308
+ "learning_rate": 4.656128e-05,
309
+ "loss": 0.3984,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 0.1408,
314
+ "grad_norm": 0.7523757219314575,
315
+ "learning_rate": 4.6481280000000004e-05,
316
+ "loss": 0.3959,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 0.144,
321
+ "grad_norm": 0.807826042175293,
322
+ "learning_rate": 4.6401280000000004e-05,
323
+ "loss": 0.3921,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 0.1472,
328
+ "grad_norm": 0.8530682325363159,
329
+ "learning_rate": 4.632128e-05,
330
+ "loss": 0.4002,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 0.1504,
335
+ "grad_norm": 0.8661518692970276,
336
+ "learning_rate": 4.6241280000000006e-05,
337
+ "loss": 0.3856,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 0.1536,
342
+ "grad_norm": 0.7473235130310059,
343
+ "learning_rate": 4.616144e-05,
344
+ "loss": 0.3854,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 0.1568,
349
+ "grad_norm": 0.7954819202423096,
350
+ "learning_rate": 4.6081440000000005e-05,
351
+ "loss": 0.3871,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 0.16,
356
+ "grad_norm": 0.8758727312088013,
357
+ "learning_rate": 4.600144e-05,
358
+ "loss": 0.3842,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 0.1632,
363
+ "grad_norm": 0.8430293798446655,
364
+ "learning_rate": 4.592144000000001e-05,
365
+ "loss": 0.3886,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 0.1664,
370
+ "grad_norm": 0.6557173728942871,
371
+ "learning_rate": 4.584144e-05,
372
+ "loss": 0.3854,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 0.1696,
377
+ "grad_norm": 0.7791888117790222,
378
+ "learning_rate": 4.576144e-05,
379
+ "loss": 0.3796,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 0.1728,
384
+ "grad_norm": 0.736084520816803,
385
+ "learning_rate": 4.56816e-05,
386
+ "loss": 0.3806,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 0.176,
391
+ "grad_norm": 0.7714269161224365,
392
+ "learning_rate": 4.56016e-05,
393
+ "loss": 0.3781,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 0.1792,
398
+ "grad_norm": 0.766144335269928,
399
+ "learning_rate": 4.552176e-05,
400
+ "loss": 0.3766,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 0.1824,
405
+ "grad_norm": 0.7035301923751831,
406
+ "learning_rate": 4.544176e-05,
407
+ "loss": 0.3737,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 0.1856,
412
+ "grad_norm": 0.7573793530464172,
413
+ "learning_rate": 4.536176e-05,
414
+ "loss": 0.3753,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 0.1888,
419
+ "grad_norm": 0.8799508213996887,
420
+ "learning_rate": 4.528176e-05,
421
+ "loss": 0.373,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 0.192,
426
+ "grad_norm": 0.8543264269828796,
427
+ "learning_rate": 4.520176e-05,
428
+ "loss": 0.3735,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 0.1952,
433
+ "grad_norm": 0.6768947243690491,
434
+ "learning_rate": 4.512176e-05,
435
+ "loss": 0.3697,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 0.1984,
440
+ "grad_norm": 0.8239702582359314,
441
+ "learning_rate": 4.504176e-05,
442
+ "loss": 0.3675,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 0.2016,
447
+ "grad_norm": 0.8310449123382568,
448
+ "learning_rate": 4.4961760000000004e-05,
449
+ "loss": 0.3695,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 0.2048,
454
+ "grad_norm": 0.8459475040435791,
455
+ "learning_rate": 4.488176e-05,
456
+ "loss": 0.3694,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 0.208,
461
+ "grad_norm": 0.7346063852310181,
462
+ "learning_rate": 4.4801760000000006e-05,
463
+ "loss": 0.3646,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 0.2112,
468
+ "grad_norm": 0.6958354115486145,
469
+ "learning_rate": 4.472176e-05,
470
+ "loss": 0.3704,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 0.2144,
475
+ "grad_norm": 0.8244686722755432,
476
+ "learning_rate": 4.464176000000001e-05,
477
+ "loss": 0.3647,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 0.2176,
482
+ "grad_norm": 0.7559502124786377,
483
+ "learning_rate": 4.456192e-05,
484
+ "loss": 0.3665,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 0.2208,
489
+ "grad_norm": 0.9046504497528076,
490
+ "learning_rate": 4.4481920000000007e-05,
491
+ "loss": 0.3637,
492
+ "step": 34500
493
  }
494
  ],
495
  "logging_steps": 500,
 
509
  "attributes": {}
510
  }
511
  },
512
+ "total_flos": 1.6807237779456e+17,
513
  "train_batch_size": 64,
514
  "trial_name": null,
515
  "trial_params": null