mtzig committed on
Commit 5ec3d1b · verified · 1 parent: fffe2c1

Training in progress, step 1700, checkpoint

last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:55b3067cadc9a6b6288d648e729308b59c8205582769114a2174ac7793a1c0f7
+ oid sha256:ca9ad4c45f6ed9ff141594db885ce7f7936b72c33bf7831ea51061b751c035d0
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8073125026f2108e54015e375aa7b740d2d75884fc1c727a72246964566922ea
+ oid sha256:0005a7a0bf83524bb14a93793c3fdab26d6c2653ecf1f287deeb08e8e78ca1fd
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3e5abbde4ca3822f1f7f4a8e1bc6bc07dee8a4408705c977218063313754aed8
+ oid sha256:7ed9ba22411531d762cf848c2d9daddff6ee7f29ca806d5aef7f5ba9813947f0
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e0347fac0faf901c20b49c9a8da56dfe6a2d4fc4a8d1d17d2653b3fdf314373c
+ oid sha256:371c2c2e2799dc1b7de0b141a212b857758839245977f9c58714ec11a1162c7e
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0adf8dfb4f071dbd258f6b78e089dbae582d016b34020a7a5258f77704d0bb72
+ oid sha256:4c2bdb37a902663861f07a52281ef1995bcfda4e8830c535faae292fabb659b6
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:123607e08a6efb56d07d9b0176bbefe77dbc0ba8afe0ea10c7cca368cea46f3e
+ oid sha256:478afb018e67b6963a405f6bfecae60632c4d7b580db98fb4a37e4698026d54a
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:455965ff8e08d76926b4a5db2181f14085f56488fae539b6ff0a35a22650710e
+ oid sha256:d2f9589c38a3685a3a7913c666aa2459077a853b4e8f8a5230bce75fa99b9825
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4a04e762cbd4d47aee16732cda8f4c7502c27308609f7ce076b4e2f0d3fb4e5d
+ oid sha256:6419cd9723247df1fda540548fd8769dc6b91a2aa84ee458e9c056ee561c4042
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:48d247744d91b0cf3a238f968b2951ba5f6bd23f26407678b401da8a2acbb383
+ oid sha256:b5d4b484a25f92c99275105a25a5abc87d9965b9b7b7ca782045935178f7d615
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:16ba34a14be23116f11a9748f367721ca69d7011e51824e858c2c479e347dcd2
+ oid sha256:7c549c91cf2b0439baf2468c247f8e2109889f720a27e0d09c9b7d5f695e49a5
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:95569b2a398b0a1ffd69ece8b6a4563b909b698504e65a48d908f9a52eef793b
+ oid sha256:81f7b75ebe10eb5c6ecc97c93cde36ee0b594c67c95103dbdcabab169117e465
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3939aff3aee9b7ef5e206c8fc125283e081f9551035e6c540260c6004f5ebe67
+ oid sha256:f9ada658c72a7f1c0eace40e44824bfa74094a719f3408a314ecbea87cf54304
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fdc30bf95193bb74ac994ed7ae22377e2ad2041f81720e59ece18a966aa1e5ab
+ oid sha256:5c8c47081e0cd48c8e1647d14b0cfbcdb2a632b234bf18bf1a619d30eef11321
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.7584735719364778,
+ "epoch": 0.8058781701825077,
  "eval_steps": 20,
- "global_step": 1600,
+ "global_step": 1700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -12179,6 +12179,766 @@
  "eval_samples_per_second": 5.387,
  "eval_steps_per_second": 0.179,
  "step": 1600
+ },
+ {
+ "epoch": 0.7589476179189382,
+ "grad_norm": 2.9690921306610107,
+ "learning_rate": 3.3316870025959693e-06,
+ "loss": 0.1082,
+ "step": 1601
+ },
+ {
+ "epoch": 0.7594216639013984,
+ "grad_norm": 5.340085506439209,
+ "learning_rate": 3.3193613578646633e-06,
+ "loss": 0.1429,
+ "step": 1602
+ },
+ {
+ "epoch": 0.7598957098838587,
+ "grad_norm": 3.9515483379364014,
+ "learning_rate": 3.307054016256912e-06,
+ "loss": 0.1083,
+ "step": 1603
+ },
+ {
+ "epoch": 0.760369755866319,
+ "grad_norm": 6.481595993041992,
+ "learning_rate": 3.2947650114914587e-06,
+ "loss": 0.189,
+ "step": 1604
+ },
+ {
+ "epoch": 0.7608438018487793,
+ "grad_norm": 5.499702453613281,
+ "learning_rate": 3.2824943772368213e-06,
+ "loss": 0.1637,
+ "step": 1605
+ },
+ {
+ "epoch": 0.7613178478312397,
+ "grad_norm": 6.298553943634033,
+ "learning_rate": 3.270242147111182e-06,
+ "loss": 0.1212,
+ "step": 1606
+ },
+ {
+ "epoch": 0.7617918938136999,
+ "grad_norm": 3.0482068061828613,
+ "learning_rate": 3.258008354682303e-06,
+ "loss": 0.1139,
+ "step": 1607
+ },
+ {
+ "epoch": 0.7622659397961602,
+ "grad_norm": 8.920551300048828,
+ "learning_rate": 3.2457930334674304e-06,
+ "loss": 0.1548,
+ "step": 1608
+ },
+ {
+ "epoch": 0.7627399857786206,
+ "grad_norm": 7.286499500274658,
+ "learning_rate": 3.233596216933206e-06,
+ "loss": 0.1776,
+ "step": 1609
+ },
+ {
+ "epoch": 0.7632140317610808,
+ "grad_norm": 12.77665901184082,
+ "learning_rate": 3.2214179384955713e-06,
+ "loss": 0.1825,
+ "step": 1610
+ },
+ {
+ "epoch": 0.7636880777435411,
+ "grad_norm": 5.8278374671936035,
+ "learning_rate": 3.209258231519682e-06,
+ "loss": 0.1913,
+ "step": 1611
+ },
+ {
+ "epoch": 0.7641621237260015,
+ "grad_norm": 3.0561583042144775,
+ "learning_rate": 3.197117129319808e-06,
+ "loss": 0.1343,
+ "step": 1612
+ },
+ {
+ "epoch": 0.7646361697084617,
+ "grad_norm": 6.679983139038086,
+ "learning_rate": 3.1849946651592532e-06,
+ "loss": 0.1593,
+ "step": 1613
+ },
+ {
+ "epoch": 0.765110215690922,
+ "grad_norm": 4.746762275695801,
+ "learning_rate": 3.172890872250254e-06,
+ "loss": 0.2468,
+ "step": 1614
+ },
+ {
+ "epoch": 0.7655842616733823,
+ "grad_norm": 3.5384531021118164,
+ "learning_rate": 3.1608057837538976e-06,
+ "loss": 0.0998,
+ "step": 1615
+ },
+ {
+ "epoch": 0.7660583076558426,
+ "grad_norm": 3.744356870651245,
+ "learning_rate": 3.1487394327800156e-06,
+ "loss": 0.1393,
+ "step": 1616
+ },
+ {
+ "epoch": 0.766532353638303,
+ "grad_norm": 3.5314719676971436,
+ "learning_rate": 3.136691852387116e-06,
+ "loss": 0.0888,
+ "step": 1617
+ },
+ {
+ "epoch": 0.7670063996207632,
+ "grad_norm": 5.413354396820068,
+ "learning_rate": 3.1246630755822703e-06,
+ "loss": 0.1746,
+ "step": 1618
+ },
+ {
+ "epoch": 0.7674804456032235,
+ "grad_norm": 5.721497535705566,
+ "learning_rate": 3.1126531353210456e-06,
+ "loss": 0.1132,
+ "step": 1619
+ },
+ {
+ "epoch": 0.7679544915856839,
+ "grad_norm": 6.063429355621338,
+ "learning_rate": 3.1006620645073925e-06,
+ "loss": 0.1388,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7679544915856839,
+ "eval_accuracy": 0.9951690821256038,
+ "eval_f1": 0.9454545454545454,
+ "eval_loss": 0.012482204474508762,
+ "eval_precision": 0.896551724137931,
+ "eval_recall": 1.0,
+ "eval_runtime": 49.5878,
+ "eval_samples_per_second": 5.465,
+ "eval_steps_per_second": 0.181,
+ "step": 1620
+ },
+ {
+ "epoch": 0.7684285375681441,
+ "grad_norm": 5.663280487060547,
+ "learning_rate": 3.0886898959935663e-06,
+ "loss": 0.1339,
+ "step": 1621
+ },
+ {
+ "epoch": 0.7689025835506044,
+ "grad_norm": 3.009401321411133,
+ "learning_rate": 3.0767366625800366e-06,
+ "loss": 0.1137,
+ "step": 1622
+ },
+ {
+ "epoch": 0.7693766295330647,
+ "grad_norm": 4.703526973724365,
+ "learning_rate": 3.064802397015394e-06,
+ "loss": 0.2366,
+ "step": 1623
+ },
+ {
+ "epoch": 0.769850675515525,
+ "grad_norm": 3.2940542697906494,
+ "learning_rate": 3.052887131996267e-06,
+ "loss": 0.1395,
+ "step": 1624
+ },
+ {
+ "epoch": 0.7703247214979853,
+ "grad_norm": 3.261302947998047,
+ "learning_rate": 3.040990900167219e-06,
+ "loss": 0.1505,
+ "step": 1625
+ },
+ {
+ "epoch": 0.7707987674804456,
+ "grad_norm": 3.4305295944213867,
+ "learning_rate": 3.0291137341206755e-06,
+ "loss": 0.1372,
+ "step": 1626
+ },
+ {
+ "epoch": 0.7712728134629059,
+ "grad_norm": 8.65300178527832,
+ "learning_rate": 3.0172556663968254e-06,
+ "loss": 0.1821,
+ "step": 1627
+ },
+ {
+ "epoch": 0.7717468594453663,
+ "grad_norm": 5.62878942489624,
+ "learning_rate": 3.0054167294835314e-06,
+ "loss": 0.1512,
+ "step": 1628
+ },
+ {
+ "epoch": 0.7722209054278265,
+ "grad_norm": 5.76574182510376,
+ "learning_rate": 2.993596955816244e-06,
+ "loss": 0.1573,
+ "step": 1629
+ },
+ {
+ "epoch": 0.7726949514102868,
+ "grad_norm": 7.997915267944336,
+ "learning_rate": 2.9817963777779124e-06,
+ "loss": 0.2725,
+ "step": 1630
+ },
+ {
+ "epoch": 0.7731689973927471,
+ "grad_norm": 3.254222869873047,
+ "learning_rate": 2.970015027698895e-06,
+ "loss": 0.1247,
+ "step": 1631
+ },
+ {
+ "epoch": 0.7736430433752074,
+ "grad_norm": 8.073678016662598,
+ "learning_rate": 2.958252937856869e-06,
+ "loss": 0.1538,
+ "step": 1632
+ },
+ {
+ "epoch": 0.7741170893576677,
+ "grad_norm": 2.6469109058380127,
+ "learning_rate": 2.946510140476747e-06,
+ "loss": 0.0928,
+ "step": 1633
+ },
+ {
+ "epoch": 0.774591135340128,
+ "grad_norm": 6.9095869064331055,
+ "learning_rate": 2.9347866677305814e-06,
+ "loss": 0.1415,
+ "step": 1634
+ },
+ {
+ "epoch": 0.7750651813225883,
+ "grad_norm": 3.802766799926758,
+ "learning_rate": 2.923082551737484e-06,
+ "loss": 0.1323,
+ "step": 1635
+ },
+ {
+ "epoch": 0.7755392273050485,
+ "grad_norm": 4.053550720214844,
+ "learning_rate": 2.911397824563533e-06,
+ "loss": 0.1498,
+ "step": 1636
+ },
+ {
+ "epoch": 0.7760132732875089,
+ "grad_norm": 5.973599910736084,
+ "learning_rate": 2.899732518221685e-06,
+ "loss": 0.149,
+ "step": 1637
+ },
+ {
+ "epoch": 0.7764873192699692,
+ "grad_norm": 3.402735710144043,
+ "learning_rate": 2.888086664671693e-06,
+ "loss": 0.1312,
+ "step": 1638
+ },
+ {
+ "epoch": 0.7769613652524295,
+ "grad_norm": 6.684436798095703,
+ "learning_rate": 2.8764602958200096e-06,
+ "loss": 0.1108,
+ "step": 1639
+ },
+ {
+ "epoch": 0.7774354112348898,
+ "grad_norm": 3.762352466583252,
+ "learning_rate": 2.8648534435197086e-06,
+ "loss": 0.1221,
+ "step": 1640
+ },
+ {
+ "epoch": 0.7774354112348898,
+ "eval_accuracy": 0.9959742351046699,
+ "eval_f1": 0.9532710280373832,
+ "eval_loss": 0.009969827719032764,
+ "eval_precision": 0.9272727272727272,
+ "eval_recall": 0.9807692307692307,
+ "eval_runtime": 50.0594,
+ "eval_samples_per_second": 5.414,
+ "eval_steps_per_second": 0.18,
+ "step": 1640
+ },
+ {
+ "epoch": 0.77790945721735,
+ "grad_norm": 5.541801452636719,
+ "learning_rate": 2.853266139570391e-06,
+ "loss": 0.1781,
+ "step": 1641
+ },
+ {
+ "epoch": 0.7783835031998104,
+ "grad_norm": 5.2935638427734375,
+ "learning_rate": 2.841698415718103e-06,
+ "loss": 0.1746,
+ "step": 1642
+ },
+ {
+ "epoch": 0.7788575491822707,
+ "grad_norm": 3.5511698722839355,
+ "learning_rate": 2.8301503036552446e-06,
+ "loss": 0.1303,
+ "step": 1643
+ },
+ {
+ "epoch": 0.7793315951647309,
+ "grad_norm": 2.210439682006836,
+ "learning_rate": 2.8186218350204865e-06,
+ "loss": 0.1052,
+ "step": 1644
+ },
+ {
+ "epoch": 0.7798056411471913,
+ "grad_norm": 3.1148386001586914,
+ "learning_rate": 2.8071130413986814e-06,
+ "loss": 0.0829,
+ "step": 1645
+ },
+ {
+ "epoch": 0.7802796871296516,
+ "grad_norm": 7.042520999908447,
+ "learning_rate": 2.795623954320781e-06,
+ "loss": 0.2299,
+ "step": 1646
+ },
+ {
+ "epoch": 0.7807537331121118,
+ "grad_norm": 4.106062889099121,
+ "learning_rate": 2.7841546052637346e-06,
+ "loss": 0.119,
+ "step": 1647
+ },
+ {
+ "epoch": 0.7812277790945722,
+ "grad_norm": 2.969593048095703,
+ "learning_rate": 2.7727050256504295e-06,
+ "loss": 0.0684,
+ "step": 1648
+ },
+ {
+ "epoch": 0.7817018250770325,
+ "grad_norm": 6.737387180328369,
+ "learning_rate": 2.761275246849582e-06,
+ "loss": 0.1164,
+ "step": 1649
+ },
+ {
+ "epoch": 0.7821758710594928,
+ "grad_norm": 6.33607292175293,
+ "learning_rate": 2.7498653001756615e-06,
+ "loss": 0.1104,
+ "step": 1650
+ },
+ {
+ "epoch": 0.7826499170419531,
+ "grad_norm": 3.347256898880005,
+ "learning_rate": 2.738475216888802e-06,
+ "loss": 0.1036,
+ "step": 1651
+ },
+ {
+ "epoch": 0.7831239630244133,
+ "grad_norm": 3.709547281265259,
+ "learning_rate": 2.7271050281947165e-06,
+ "loss": 0.1436,
+ "step": 1652
+ },
+ {
+ "epoch": 0.7835980090068737,
+ "grad_norm": 3.4499459266662598,
+ "learning_rate": 2.7157547652446193e-06,
+ "loss": 0.1515,
+ "step": 1653
+ },
+ {
+ "epoch": 0.784072054989334,
+ "grad_norm": 2.6657423973083496,
+ "learning_rate": 2.704424459135123e-06,
+ "loss": 0.1087,
+ "step": 1654
+ },
+ {
+ "epoch": 0.7845461009717942,
+ "grad_norm": 6.451166152954102,
+ "learning_rate": 2.6931141409081753e-06,
+ "loss": 0.2029,
+ "step": 1655
+ },
+ {
+ "epoch": 0.7850201469542546,
+ "grad_norm": 4.049078464508057,
+ "learning_rate": 2.681823841550947e-06,
+ "loss": 0.1342,
+ "step": 1656
+ },
+ {
+ "epoch": 0.7854941929367149,
+ "grad_norm": 5.632473468780518,
+ "learning_rate": 2.6705535919957772e-06,
+ "loss": 0.1467,
+ "step": 1657
+ },
+ {
+ "epoch": 0.7859682389191751,
+ "grad_norm": 3.3033530712127686,
+ "learning_rate": 2.6593034231200664e-06,
+ "loss": 0.1404,
+ "step": 1658
+ },
+ {
+ "epoch": 0.7864422849016355,
+ "grad_norm": 3.3128445148468018,
+ "learning_rate": 2.648073365746204e-06,
+ "loss": 0.1129,
+ "step": 1659
+ },
+ {
+ "epoch": 0.7869163308840957,
+ "grad_norm": 5.318967342376709,
+ "learning_rate": 2.6368634506414757e-06,
+ "loss": 0.1571,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7869163308840957,
+ "eval_accuracy": 0.9959742351046699,
+ "eval_f1": 0.9532710280373832,
+ "eval_loss": 0.010810844600200653,
+ "eval_precision": 0.9272727272727272,
+ "eval_recall": 0.9807692307692307,
+ "eval_runtime": 49.9177,
+ "eval_samples_per_second": 5.429,
+ "eval_steps_per_second": 0.18,
+ "step": 1660
+ },
+ {
+ "epoch": 0.7873903768665561,
+ "grad_norm": 6.077727317810059,
+ "learning_rate": 2.6256737085179852e-06,
+ "loss": 0.1892,
+ "step": 1661
+ },
+ {
+ "epoch": 0.7878644228490164,
+ "grad_norm": 5.929904460906982,
+ "learning_rate": 2.614504170032567e-06,
+ "loss": 0.1609,
+ "step": 1662
+ },
+ {
+ "epoch": 0.7883384688314766,
+ "grad_norm": 12.54429817199707,
+ "learning_rate": 2.6033548657867013e-06,
+ "loss": 0.149,
+ "step": 1663
+ },
+ {
+ "epoch": 0.788812514813937,
+ "grad_norm": 3.4696834087371826,
+ "learning_rate": 2.5922258263264366e-06,
+ "loss": 0.1037,
+ "step": 1664
+ },
+ {
+ "epoch": 0.7892865607963973,
+ "grad_norm": 3.9441494941711426,
+ "learning_rate": 2.581117082142296e-06,
+ "loss": 0.1487,
+ "step": 1665
+ },
+ {
+ "epoch": 0.7897606067788575,
+ "grad_norm": 3.3771462440490723,
+ "learning_rate": 2.570028663669204e-06,
+ "loss": 0.0966,
+ "step": 1666
+ },
+ {
+ "epoch": 0.7902346527613179,
+ "grad_norm": 5.6400604248046875,
+ "learning_rate": 2.5589606012863968e-06,
+ "loss": 0.1358,
+ "step": 1667
+ },
+ {
+ "epoch": 0.7907086987437781,
+ "grad_norm": 3.4519641399383545,
+ "learning_rate": 2.547912925317334e-06,
+ "loss": 0.0834,
+ "step": 1668
+ },
+ {
+ "epoch": 0.7911827447262384,
+ "grad_norm": 7.2654242515563965,
+ "learning_rate": 2.5368856660296327e-06,
+ "loss": 0.1244,
+ "step": 1669
+ },
+ {
+ "epoch": 0.7916567907086988,
+ "grad_norm": 6.323776721954346,
+ "learning_rate": 2.5258788536349622e-06,
+ "loss": 0.1153,
+ "step": 1670
+ },
+ {
+ "epoch": 0.792130836691159,
+ "grad_norm": 8.622234344482422,
+ "learning_rate": 2.514892518288988e-06,
+ "loss": 0.2104,
+ "step": 1671
+ },
+ {
+ "epoch": 0.7926048826736194,
+ "grad_norm": 3.370286703109741,
+ "learning_rate": 2.503926690091263e-06,
+ "loss": 0.0609,
+ "step": 1672
+ },
+ {
+ "epoch": 0.7930789286560797,
+ "grad_norm": 5.871740818023682,
+ "learning_rate": 2.492981399085157e-06,
+ "loss": 0.1789,
+ "step": 1673
+ },
+ {
+ "epoch": 0.7935529746385399,
+ "grad_norm": 5.285881519317627,
+ "learning_rate": 2.482056675257776e-06,
+ "loss": 0.1565,
+ "step": 1674
+ },
+ {
+ "epoch": 0.7940270206210003,
+ "grad_norm": 6.630995273590088,
+ "learning_rate": 2.471152548539876e-06,
+ "loss": 0.176,
+ "step": 1675
+ },
+ {
+ "epoch": 0.7945010666034605,
+ "grad_norm": 2.7057905197143555,
+ "learning_rate": 2.4602690488057836e-06,
+ "loss": 0.0897,
+ "step": 1676
+ },
+ {
+ "epoch": 0.7949751125859208,
+ "grad_norm": 3.194324493408203,
+ "learning_rate": 2.4494062058733157e-06,
+ "loss": 0.1121,
+ "step": 1677
+ },
+ {
+ "epoch": 0.7954491585683812,
+ "grad_norm": 7.977220058441162,
+ "learning_rate": 2.438564049503688e-06,
+ "loss": 0.1833,
+ "step": 1678
+ },
+ {
+ "epoch": 0.7959232045508414,
+ "grad_norm": 4.833785057067871,
+ "learning_rate": 2.4277426094014457e-06,
+ "loss": 0.1875,
+ "step": 1679
+ },
+ {
+ "epoch": 0.7963972505333017,
+ "grad_norm": 5.799574375152588,
+ "learning_rate": 2.416941915214377e-06,
+ "loss": 0.1472,
+ "step": 1680
+ },
+ {
+ "epoch": 0.7963972505333017,
+ "eval_accuracy": 0.9943639291465378,
+ "eval_f1": 0.9357798165137615,
+ "eval_loss": 0.011454065330326557,
+ "eval_precision": 0.8947368421052632,
+ "eval_recall": 0.9807692307692307,
+ "eval_runtime": 49.6644,
+ "eval_samples_per_second": 5.457,
+ "eval_steps_per_second": 0.181,
+ "step": 1680
+ },
+ {
+ "epoch": 0.796871296515762,
+ "grad_norm": 5.203197956085205,
+ "learning_rate": 2.4061619965334314e-06,
+ "loss": 0.1582,
+ "step": 1681
+ },
+ {
+ "epoch": 0.7973453424982223,
+ "grad_norm": 8.293927192687988,
+ "learning_rate": 2.395402882892639e-06,
+ "loss": 0.2625,
+ "step": 1682
+ },
+ {
+ "epoch": 0.7978193884806827,
+ "grad_norm": 4.733770847320557,
+ "learning_rate": 2.3846646037690304e-06,
+ "loss": 0.1162,
+ "step": 1683
+ },
+ {
+ "epoch": 0.7982934344631429,
+ "grad_norm": 3.6557698249816895,
+ "learning_rate": 2.3739471885825536e-06,
+ "loss": 0.142,
+ "step": 1684
+ },
+ {
+ "epoch": 0.7987674804456032,
+ "grad_norm": 5.944900989532471,
+ "learning_rate": 2.363250666695999e-06,
+ "loss": 0.1202,
+ "step": 1685
+ },
+ {
+ "epoch": 0.7992415264280636,
+ "grad_norm": 3.3309900760650635,
+ "learning_rate": 2.3525750674149094e-06,
+ "loss": 0.1227,
+ "step": 1686
+ },
+ {
+ "epoch": 0.7997155724105238,
+ "grad_norm": 5.317230224609375,
+ "learning_rate": 2.34192041998751e-06,
+ "loss": 0.1406,
+ "step": 1687
+ },
+ {
+ "epoch": 0.8001896183929841,
+ "grad_norm": 4.319701671600342,
+ "learning_rate": 2.331286753604621e-06,
+ "loss": 0.1916,
+ "step": 1688
+ },
+ {
+ "epoch": 0.8006636643754445,
+ "grad_norm": 4.4361982345581055,
+ "learning_rate": 2.3206740973995823e-06,
+ "loss": 0.1844,
+ "step": 1689
+ },
+ {
+ "epoch": 0.8011377103579047,
+ "grad_norm": 3.2999582290649414,
+ "learning_rate": 2.3100824804481703e-06,
+ "loss": 0.0952,
+ "step": 1690
+ },
+ {
+ "epoch": 0.801611756340365,
+ "grad_norm": 7.211174964904785,
+ "learning_rate": 2.29951193176852e-06,
+ "loss": 0.1072,
+ "step": 1691
+ },
+ {
+ "epoch": 0.8020858023228253,
+ "grad_norm": 5.33006477355957,
+ "learning_rate": 2.2889624803210453e-06,
+ "loss": 0.1978,
+ "step": 1692
+ },
+ {
+ "epoch": 0.8025598483052856,
+ "grad_norm": 5.028670787811279,
+ "learning_rate": 2.2784341550083577e-06,
+ "loss": 0.0922,
+ "step": 1693
+ },
+ {
+ "epoch": 0.803033894287746,
+ "grad_norm": 5.079577445983887,
+ "learning_rate": 2.2679269846751915e-06,
+ "loss": 0.1134,
+ "step": 1694
+ },
+ {
+ "epoch": 0.8035079402702062,
+ "grad_norm": 3.310760974884033,
+ "learning_rate": 2.2574409981083224e-06,
+ "loss": 0.0928,
+ "step": 1695
+ },
+ {
+ "epoch": 0.8039819862526665,
+ "grad_norm": 5.977758884429932,
+ "learning_rate": 2.2469762240364847e-06,
+ "loss": 0.1011,
+ "step": 1696
+ },
+ {
+ "epoch": 0.8044560322351268,
+ "grad_norm": 6.029415607452393,
+ "learning_rate": 2.236532691130299e-06,
+ "loss": 0.1699,
+ "step": 1697
+ },
+ {
+ "epoch": 0.8049300782175871,
+ "grad_norm": 9.231821060180664,
+ "learning_rate": 2.2261104280021937e-06,
+ "loss": 0.2549,
+ "step": 1698
+ },
+ {
+ "epoch": 0.8054041242000474,
+ "grad_norm": 2.8385801315307617,
+ "learning_rate": 2.215709463206316e-06,
+ "loss": 0.0953,
+ "step": 1699
+ },
+ {
+ "epoch": 0.8058781701825077,
+ "grad_norm": 6.947047233581543,
+ "learning_rate": 2.205329825238467e-06,
+ "loss": 0.1236,
+ "step": 1700
+ },
+ {
+ "epoch": 0.8058781701825077,
+ "eval_accuracy": 0.9935587761674718,
+ "eval_f1": 0.9272727272727272,
+ "eval_loss": 0.012126692570745945,
+ "eval_precision": 0.8793103448275862,
+ "eval_recall": 0.9807692307692307,
+ "eval_runtime": 49.2509,
+ "eval_samples_per_second": 5.502,
+ "eval_steps_per_second": 0.183,
+ "step": 1700
  }
  ],
  "logging_steps": 1,
 
@@ -12198,7 +12958,7 @@
  "attributes": {}
  }
  },
- "total_flos": 4.276067349792031e+17,
+ "total_flos": 4.5441970776047616e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null