mtzig committed
Commit fb6695d
Parent: ad5f102

Training in progress, step 2100, checkpoint

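The files below are the Trainer's last-checkpoint directory as of global step 2100: FSDP-sharded optimizer and model state (one .distcp shard per rank), per-rank RNG states, the LR scheduler, and trainer_state.json. A minimal sketch of resuming from such a checkpoint follows; the base model, dataset, and TrainingArguments are placeholders (the training script is not part of this commit), and since the shards are numbered 0-3 it would be launched across 4 ranks, e.g. with torchrun --nproc_per_node 4.

# Hypothetical sketch: resume training from this checkpoint with transformers.Trainer.
# Model name and dataset are placeholders; batch size, eval_steps and logging_steps
# mirror the values visible in trainer_state.json below.
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

name = "distilbert-base-uncased"  # assumed base model, not recorded in this commit
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

ds = Dataset.from_dict({"text": ["positive example", "negative example"], "label": [1, 0]})
ds = ds.map(lambda ex: tok(ex["text"], truncation=True, padding="max_length", max_length=32))

args = TrainingArguments(output_dir="out", per_device_train_batch_size=8,
                         logging_steps=1, eval_steps=20, fsdp="full_shard")
trainer = Trainer(model=model, args=args, train_dataset=ds, eval_dataset=ds)

# resume_from_checkpoint restores the sharded model/optimizer state, the per-rank RNG
# states, scheduler.pt and trainer_state.json, then continues from step 2100.
trainer.train(resume_from_checkpoint="last-checkpoint")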
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:97e5f90b02b18eee0439efcd1e11c562003887a0a8341c65f3c61afc97e6ce91
+ oid sha256:050e4db55e69664bf6d9c834522ec2206b36b64c8d2f6ed4d5d17b4cf9da2f4e
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fa33fe523d912fae3cb37eeb6b60af785266354c6c31911ecc4617df910b0be2
+ oid sha256:adb3a674abc7da7a23279462f8cae294d8ecdec98362fed586fc3bccef1a61d4
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fe1bbd6e31aea6726660dc7dac9d7b7b788a128231286077750bd3b7ceeb5a97
+ oid sha256:d1e641d2b2e349a4c213409e52cf62d25bc236ac15c9791b7bc804909f7f92c3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7cefe39a14475612351b6fccd8db8eec85a931549215bf24bafd93144edce8a5
+ oid sha256:866ec72c28b8ea1e8a4c76f5ed42b739d69875ec24137c268880795bd767ba9b
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:57079b1ad6dfda7f50c73be4cc9a2461ca37b66b4a9e6186c57fa89a2fbb32dc
+ oid sha256:4f5d80504b530d1236d869d6a0431889ce3c16ca369fab9ca79aef572e1e676f
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b90c40fdfe265353374604f556a9c76615bc263d7688eb1dc6fa1733158babe8
+ oid sha256:1b0148021e09d7b7a64e41765bf2c33e45d25853d9e709eca7c135e74bee54b7
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:834bf46634f0752bdb674694ee8a0f7d157d699667caf2b5dc77591f5ada58ec
+ oid sha256:cda280986df81c923c89a0a8a61df0a1484f3b11f668604be6beb240af22c140
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:20a45e516450ff75e5f30798a9fd5c55d60506aebd3e02c1c8b581ae0fd8ecb1
+ oid sha256:8bb78205b0b38be64245705e3d63c368f26e81d439c05fe7f4f6ee459319648f
  size 6966784
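
The __N_0.distcp shards plus the .metadata file above have the layout of a torch.distributed.checkpoint (DCP) directory, which is how accelerate/Trainer typically persist FSDP state. Assuming that is the format here, recent PyTorch releases (roughly 2.2 and later) can consolidate the shards into a single state dict offline; the output filename below is arbitrary.

# Hypothetical sketch: merge the sharded FSDP weights into one torch.save file,
# assuming pytorch_model_fsdp_0 is a torch.distributed.checkpoint directory.
import torch
from torch.distributed.checkpoint.format_utils import dcp_to_torch_save

dcp_dir = "last-checkpoint/pytorch_model_fsdp_0"  # sharded checkpoint from this repo
out_path = "pytorch_model_consolidated.pt"        # arbitrary output name

dcp_to_torch_save(dcp_dir, out_path)              # reads .metadata plus the *.distcp shards

state_dict = torch.load(out_path, map_location="cpu")
print(f"{len(state_dict)} entries, e.g. {sorted(state_dict)[:3]}")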
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f258b75154d2aee1a76c51ba8c53eb9ff1afc1684f65be22d906efc966e2f31d
+ oid sha256:68bc9217b6e9ab217f22aba698dbeddd344df01c6c8d3bf496373786b4d6b46f
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1a2cd1c1229272286316fc487e083e3c0dbb26b851fd444bc5cfa3906d05744d
+ oid sha256:72a20a9702c2689058ab5d5d2baeb8c7227e34d68571334f92805043bd9e18eb
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:54bd2f8ba2fbed41edcaf0b31a7cc52ace7dc5e888e79b744825e45b024f9c0c
+ oid sha256:8533b05acf81e2a8c388c137bc99083b4a5fc4f3554fc80f7b0497d2e0eca05f
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:20daa4d654ef46df708f18dbbf7bc707be5815cfc90479bf1752f4b1f5183f51
+ oid sha256:f4e0b82e92d540a47961438b15ece197574d010671ffe40e6c7ee07f5dac4307
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d0c332a71d8fb512346f2df9841021fb4baac7da78dd4eb8a3c1b75157d59e96
+ oid sha256:5ac1b330f53ae14ab4a2bb829af8af4d5e4c909474cfca651cf822672c87529f
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.9480919649205973,
+ "epoch": 0.9954965631666272,
  "eval_steps": 20,
- "global_step": 2000,
+ "global_step": 2100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -15219,6 +15219,766 @@
  "eval_samples_per_second": 5.375,
  "eval_steps_per_second": 0.179,
  "step": 2000
15222
+ },
15223
+ {
15224
+ "epoch": 0.9485660109030576,
15225
+ "grad_norm": 2.924222707748413,
15226
+ "learning_rate": 1.5935584725626062e-07,
15227
+ "loss": 0.0924,
15228
+ "step": 2001
15229
+ },
15230
+ {
15231
+ "epoch": 0.9490400568855178,
15232
+ "grad_norm": 5.106085300445557,
15233
+ "learning_rate": 1.5642615756586765e-07,
15234
+ "loss": 0.1919,
15235
+ "step": 2002
15236
+ },
15237
+ {
15238
+ "epoch": 0.9495141028679782,
15239
+ "grad_norm": 5.821203708648682,
15240
+ "learning_rate": 1.5352343657680234e-07,
15241
+ "loss": 0.1551,
15242
+ "step": 2003
15243
+ },
15244
+ {
15245
+ "epoch": 0.9499881488504385,
15246
+ "grad_norm": 4.752243518829346,
15247
+ "learning_rate": 1.506476922417266e-07,
15248
+ "loss": 0.16,
15249
+ "step": 2004
15250
+ },
15251
+ {
15252
+ "epoch": 0.9504621948328988,
15253
+ "grad_norm": 4.044118404388428,
15254
+ "learning_rate": 1.4779893243939358e-07,
15255
+ "loss": 0.1228,
15256
+ "step": 2005
15257
+ },
15258
+ {
15259
+ "epoch": 0.9509362408153591,
15260
+ "grad_norm": 5.809322834014893,
15261
+ "learning_rate": 1.4497716497462676e-07,
15262
+ "loss": 0.1309,
15263
+ "step": 2006
15264
+ },
15265
+ {
15266
+ "epoch": 0.9514102867978194,
15267
+ "grad_norm": 5.9313459396362305,
15268
+ "learning_rate": 1.4218239757829656e-07,
15269
+ "loss": 0.1126,
15270
+ "step": 2007
15271
+ },
15272
+ {
15273
+ "epoch": 0.9518843327802797,
15274
+ "grad_norm": 5.524699687957764,
15275
+ "learning_rate": 1.3941463790730248e-07,
15276
+ "loss": 0.0932,
15277
+ "step": 2008
15278
+ },
15279
+ {
15280
+ "epoch": 0.95235837876274,
15281
+ "grad_norm": 3.8316197395324707,
15282
+ "learning_rate": 1.3667389354454997e-07,
15283
+ "loss": 0.1288,
15284
+ "step": 2009
15285
+ },
15286
+ {
15287
+ "epoch": 0.9528324247452002,
15288
+ "grad_norm": 4.261562347412109,
15289
+ "learning_rate": 1.3396017199892808e-07,
15290
+ "loss": 0.0725,
15291
+ "step": 2010
15292
+ },
15293
+ {
15294
+ "epoch": 0.9533064707276606,
15295
+ "grad_norm": 3.045381546020508,
15296
+ "learning_rate": 1.312734807052929e-07,
15297
+ "loss": 0.1336,
15298
+ "step": 2011
15299
+ },
15300
+ {
15301
+ "epoch": 0.9537805167101209,
15302
+ "grad_norm": 4.582825183868408,
15303
+ "learning_rate": 1.2861382702444304e-07,
15304
+ "loss": 0.1122,
15305
+ "step": 2012
15306
+ },
15307
+ {
15308
+ "epoch": 0.9542545626925811,
15309
+ "grad_norm": 5.358804702758789,
15310
+ "learning_rate": 1.2598121824310305e-07,
15311
+ "loss": 0.2103,
15312
+ "step": 2013
15313
+ },
15314
+ {
15315
+ "epoch": 0.9547286086750415,
15316
+ "grad_norm": 8.944177627563477,
15317
+ "learning_rate": 1.2337566157390124e-07,
15318
+ "loss": 0.2294,
15319
+ "step": 2014
15320
+ },
15321
+ {
15322
+ "epoch": 0.9552026546575018,
15323
+ "grad_norm": 3.860495090484619,
15324
+ "learning_rate": 1.2079716415534958e-07,
15325
+ "loss": 0.1725,
15326
+ "step": 2015
15327
+ },
15328
+ {
15329
+ "epoch": 0.9556767006399621,
15330
+ "grad_norm": 5.890530586242676,
15331
+ "learning_rate": 1.1824573305182829e-07,
15332
+ "loss": 0.1347,
15333
+ "step": 2016
15334
+ },
15335
+ {
15336
+ "epoch": 0.9561507466224224,
15337
+ "grad_norm": 4.890679359436035,
15338
+ "learning_rate": 1.1572137525356019e-07,
15339
+ "loss": 0.1632,
15340
+ "step": 2017
15341
+ },
15342
+ {
15343
+ "epoch": 0.9566247926048826,
15344
+ "grad_norm": 3.409152030944824,
15345
+ "learning_rate": 1.1322409767659526e-07,
15346
+ "loss": 0.1673,
15347
+ "step": 2018
15348
+ },
15349
+ {
15350
+ "epoch": 0.957098838587343,
15351
+ "grad_norm": 2.9978771209716797,
15352
+ "learning_rate": 1.1075390716279167e-07,
15353
+ "loss": 0.0933,
15354
+ "step": 2019
15355
+ },
15356
+ {
15357
+ "epoch": 0.9575728845698033,
15358
+ "grad_norm": 4.279489994049072,
15359
+ "learning_rate": 1.0831081047979585e-07,
15360
+ "loss": 0.1072,
15361
+ "step": 2020
15362
+ },
15363
+ {
15364
+ "epoch": 0.9575728845698033,
15365
+ "eval_accuracy": 0.9935587761674718,
15366
+ "eval_f1": 0.9272727272727272,
15367
+ "eval_loss": 0.012736320495605469,
15368
+ "eval_precision": 0.8793103448275862,
15369
+ "eval_recall": 0.9807692307692307,
15370
+ "eval_runtime": 50.4208,
15371
+ "eval_samples_per_second": 5.375,
15372
+ "eval_steps_per_second": 0.178,
15373
+ "step": 2020
15374
+ },
15375
+ {
15376
+ "epoch": 0.9580469305522635,
15377
+ "grad_norm": 3.377288579940796,
15378
+ "learning_rate": 1.0589481432102588e-07,
15379
+ "loss": 0.1007,
15380
+ "step": 2021
15381
+ },
15382
+ {
15383
+ "epoch": 0.9585209765347239,
15384
+ "grad_norm": 4.943248271942139,
15385
+ "learning_rate": 1.0350592530564919e-07,
15386
+ "loss": 0.1345,
15387
+ "step": 2022
15388
+ },
15389
+ {
15390
+ "epoch": 0.9589950225171842,
15391
+ "grad_norm": 3.178915500640869,
15392
+ "learning_rate": 1.0114414997856814e-07,
15393
+ "loss": 0.1501,
15394
+ "step": 2023
15395
+ },
15396
+ {
15397
+ "epoch": 0.9594690684996444,
15398
+ "grad_norm": 2.851790428161621,
15399
+ "learning_rate": 9.880949481040347e-08,
15400
+ "loss": 0.1128,
15401
+ "step": 2024
15402
+ },
15403
+ {
15404
+ "epoch": 0.9599431144821048,
15405
+ "grad_norm": 7.474143981933594,
15406
+ "learning_rate": 9.650196619747088e-08,
15407
+ "loss": 0.2338,
15408
+ "step": 2025
15409
+ },
15410
+ {
15411
+ "epoch": 0.960417160464565,
15412
+ "grad_norm": 4.426879405975342,
15413
+ "learning_rate": 9.422157046176772e-08,
15414
+ "loss": 0.1695,
15415
+ "step": 2026
15416
+ },
15417
+ {
15418
+ "epoch": 0.9608912064470254,
15419
+ "grad_norm": 4.276393890380859,
15420
+ "learning_rate": 9.19683138509564e-08,
15421
+ "loss": 0.1695,
15422
+ "step": 2027
15423
+ },
15424
+ {
15425
+ "epoch": 0.9613652524294857,
15426
+ "grad_norm": 4.4484357833862305,
15427
+ "learning_rate": 8.974220253834209e-08,
15428
+ "loss": 0.1489,
15429
+ "step": 2028
15430
+ },
15431
+ {
15432
+ "epoch": 0.9618392984119459,
15433
+ "grad_norm": 6.785750389099121,
15434
+ "learning_rate": 8.754324262286284e-08,
15435
+ "loss": 0.1951,
15436
+ "step": 2029
15437
+ },
15438
+ {
15439
+ "epoch": 0.9623133443944063,
15440
+ "grad_norm": 5.472995281219482,
15441
+ "learning_rate": 8.537144012906728e-08,
15442
+ "loss": 0.1067,
15443
+ "step": 2030
15444
+ },
15445
+ {
15446
+ "epoch": 0.9627873903768666,
15447
+ "grad_norm": 7.181637287139893,
15448
+ "learning_rate": 8.322680100710023e-08,
15449
+ "loss": 0.161,
15450
+ "step": 2031
15451
+ },
15452
+ {
15453
+ "epoch": 0.9632614363593268,
15454
+ "grad_norm": 2.844578504562378,
15455
+ "learning_rate": 8.110933113268604e-08,
15456
+ "loss": 0.1443,
15457
+ "step": 2032
15458
+ },
15459
+ {
15460
+ "epoch": 0.9637354823417872,
15461
+ "grad_norm": 4.869422435760498,
15462
+ "learning_rate": 7.901903630711416e-08,
15463
+ "loss": 0.0947,
15464
+ "step": 2033
15465
+ },
15466
+ {
15467
+ "epoch": 0.9642095283242474,
15468
+ "grad_norm": 8.479473114013672,
15469
+ "learning_rate": 7.695592225722137e-08,
15470
+ "loss": 0.1779,
15471
+ "step": 2034
15472
+ },
15473
+ {
15474
+ "epoch": 0.9646835743067077,
15475
+ "grad_norm": 6.677021503448486,
15476
+ "learning_rate": 7.491999463537403e-08,
15477
+ "loss": 0.2122,
15478
+ "step": 2035
15479
+ },
15480
+ {
15481
+ "epoch": 0.9651576202891681,
15482
+ "grad_norm": 6.709725379943848,
15483
+ "learning_rate": 7.291125901946027e-08,
15484
+ "loss": 0.2262,
15485
+ "step": 2036
15486
+ },
15487
+ {
15488
+ "epoch": 0.9656316662716283,
15489
+ "grad_norm": 9.639774322509766,
15490
+ "learning_rate": 7.092972091286454e-08,
15491
+ "loss": 0.1651,
15492
+ "step": 2037
15493
+ },
15494
+ {
15495
+ "epoch": 0.9661057122540887,
15496
+ "grad_norm": 4.166116714477539,
15497
+ "learning_rate": 6.897538574445972e-08,
15498
+ "loss": 0.0947,
15499
+ "step": 2038
15500
+ },
15501
+ {
15502
+ "epoch": 0.966579758236549,
15503
+ "grad_norm": 5.3391876220703125,
15504
+ "learning_rate": 6.704825886858946e-08,
15505
+ "loss": 0.1904,
15506
+ "step": 2039
15507
+ },
15508
+ {
15509
+ "epoch": 0.9670538042190092,
15510
+ "grad_norm": 4.7893595695495605,
15511
+ "learning_rate": 6.5148345565057e-08,
15512
+ "loss": 0.2165,
15513
+ "step": 2040
15514
+ },
15515
+ {
15516
+ "epoch": 0.9670538042190092,
15517
+ "eval_accuracy": 0.9935587761674718,
15518
+ "eval_f1": 0.9272727272727272,
15519
+ "eval_loss": 0.012882479466497898,
15520
+ "eval_precision": 0.8793103448275862,
15521
+ "eval_recall": 0.9807692307692307,
15522
+ "eval_runtime": 50.0769,
15523
+ "eval_samples_per_second": 5.412,
15524
+ "eval_steps_per_second": 0.18,
15525
+ "step": 2040
15526
+ },
15527
+ {
15528
+ "epoch": 0.9675278502014696,
15529
+ "grad_norm": 4.419187545776367,
15530
+ "learning_rate": 6.327565103910193e-08,
15531
+ "loss": 0.168,
15532
+ "step": 2041
15533
+ },
15534
+ {
15535
+ "epoch": 0.9680018961839298,
15536
+ "grad_norm": 10.345693588256836,
15537
+ "learning_rate": 6.143018042139903e-08,
15538
+ "loss": 0.1948,
15539
+ "step": 2042
15540
+ },
15541
+ {
15542
+ "epoch": 0.9684759421663901,
15543
+ "grad_norm": 8.49881649017334,
15544
+ "learning_rate": 5.96119387680294e-08,
15545
+ "loss": 0.1262,
15546
+ "step": 2043
15547
+ },
15548
+ {
15549
+ "epoch": 0.9689499881488505,
15550
+ "grad_norm": 3.0430965423583984,
15551
+ "learning_rate": 5.782093106048159e-08,
15552
+ "loss": 0.1379,
15553
+ "step": 2044
15554
+ },
15555
+ {
15556
+ "epoch": 0.9694240341313107,
15557
+ "grad_norm": 3.5971128940582275,
15558
+ "learning_rate": 5.605716220562385e-08,
15559
+ "loss": 0.1537,
15560
+ "step": 2045
15561
+ },
15562
+ {
15563
+ "epoch": 0.969898080113771,
15564
+ "grad_norm": 2.2294721603393555,
15565
+ "learning_rate": 5.4320637035704114e-08,
15566
+ "loss": 0.0722,
15567
+ "step": 2046
15568
+ },
15569
+ {
15570
+ "epoch": 0.9703721260962314,
15571
+ "grad_norm": 6.108776569366455,
15572
+ "learning_rate": 5.2611360308323364e-08,
15573
+ "loss": 0.1432,
15574
+ "step": 2047
15575
+ },
15576
+ {
15577
+ "epoch": 0.9708461720786916,
15578
+ "grad_norm": 4.834316730499268,
15579
+ "learning_rate": 5.092933670643452e-08,
15580
+ "loss": 0.1153,
15581
+ "step": 2048
15582
+ },
15583
+ {
15584
+ "epoch": 0.971320218061152,
15585
+ "grad_norm": 3.5349068641662598,
15586
+ "learning_rate": 4.9274570838322436e-08,
15587
+ "loss": 0.09,
15588
+ "step": 2049
15589
+ },
15590
+ {
15591
+ "epoch": 0.9717942640436122,
15592
+ "grad_norm": 5.70138692855835,
15593
+ "learning_rate": 4.764706723759172e-08,
15594
+ "loss": 0.1562,
15595
+ "step": 2050
15596
+ },
15597
+ {
15598
+ "epoch": 0.9722683100260725,
15599
+ "grad_norm": 7.175850868225098,
15600
+ "learning_rate": 4.604683036316004e-08,
15601
+ "loss": 0.3258,
15602
+ "step": 2051
15603
+ },
15604
+ {
15605
+ "epoch": 0.9727423560085329,
15606
+ "grad_norm": 4.018371105194092,
15607
+ "learning_rate": 4.4473864599235975e-08,
15608
+ "loss": 0.2188,
15609
+ "step": 2052
15610
+ },
15611
+ {
15612
+ "epoch": 0.9732164019909931,
15613
+ "grad_norm": 5.978484153747559,
15614
+ "learning_rate": 4.29281742553167e-08,
15615
+ "loss": 0.1379,
15616
+ "step": 2053
15617
+ },
15618
+ {
15619
+ "epoch": 0.9736904479734534,
15620
+ "grad_norm": 3.8246636390686035,
15621
+ "learning_rate": 4.1409763566172544e-08,
15622
+ "loss": 0.1241,
15623
+ "step": 2054
15624
+ },
15625
+ {
15626
+ "epoch": 0.9741644939559138,
15627
+ "grad_norm": 3.782214641571045,
15628
+ "learning_rate": 3.991863669183138e-08,
15629
+ "loss": 0.1725,
15630
+ "step": 2055
15631
+ },
15632
+ {
15633
+ "epoch": 0.974638539938374,
15634
+ "grad_norm": 5.974038124084473,
15635
+ "learning_rate": 3.845479771757532e-08,
15636
+ "loss": 0.0927,
15637
+ "step": 2056
15638
+ },
15639
+ {
15640
+ "epoch": 0.9751125859208343,
15641
+ "grad_norm": 4.958864688873291,
15642
+ "learning_rate": 3.701825065392184e-08,
15643
+ "loss": 0.1666,
15644
+ "step": 2057
15645
+ },
15646
+ {
15647
+ "epoch": 0.9755866319032946,
15648
+ "grad_norm": 7.515510559082031,
15649
+ "learning_rate": 3.560899943661822e-08,
15650
+ "loss": 0.1924,
15651
+ "step": 2058
15652
+ },
15653
+ {
15654
+ "epoch": 0.9760606778857549,
15655
+ "grad_norm": 6.320629596710205,
15656
+ "learning_rate": 3.422704792662601e-08,
15657
+ "loss": 0.1618,
15658
+ "step": 2059
15659
+ },
15660
+ {
15661
+ "epoch": 0.9765347238682153,
15662
+ "grad_norm": 3.3863184452056885,
15663
+ "learning_rate": 3.2872399910115484e-08,
15664
+ "loss": 0.1417,
15665
+ "step": 2060
15666
+ },
15667
+ {
15668
+ "epoch": 0.9765347238682153,
15669
+ "eval_accuracy": 0.9935587761674718,
15670
+ "eval_f1": 0.9272727272727272,
15671
+ "eval_loss": 0.012739640660583973,
15672
+ "eval_precision": 0.8793103448275862,
15673
+ "eval_recall": 0.9807692307692307,
15674
+ "eval_runtime": 49.7841,
15675
+ "eval_samples_per_second": 5.444,
15676
+ "eval_steps_per_second": 0.181,
15677
+ "step": 2060
15678
+ },
15679
+ {
15680
+ "epoch": 0.9770087698506755,
15681
+ "grad_norm": 4.818965911865234,
15682
+ "learning_rate": 3.154505909845229e-08,
15683
+ "loss": 0.1236,
15684
+ "step": 2061
15685
+ },
15686
+ {
15687
+ "epoch": 0.9774828158331358,
15688
+ "grad_norm": 6.437606334686279,
15689
+ "learning_rate": 3.024502912818528e-08,
15690
+ "loss": 0.2105,
15691
+ "step": 2062
15692
+ },
15693
+ {
15694
+ "epoch": 0.9779568618155962,
15695
+ "grad_norm": 4.611502647399902,
15696
+ "learning_rate": 2.897231356104424e-08,
15697
+ "loss": 0.1779,
15698
+ "step": 2063
15699
+ },
15700
+ {
15701
+ "epoch": 0.9784309077980564,
15702
+ "grad_norm": 5.7043843269348145,
15703
+ "learning_rate": 2.7726915883919958e-08,
15704
+ "loss": 0.1738,
15705
+ "step": 2064
15706
+ },
15707
+ {
15708
+ "epoch": 0.9789049537805167,
15709
+ "grad_norm": 3.3945627212524414,
15710
+ "learning_rate": 2.6508839508861963e-08,
15711
+ "loss": 0.1066,
15712
+ "step": 2065
15713
+ },
15714
+ {
15715
+ "epoch": 0.979378999762977,
15716
+ "grad_norm": 3.7300400733947754,
15717
+ "learning_rate": 2.5318087773066325e-08,
15718
+ "loss": 0.1186,
15719
+ "step": 2066
15720
+ },
15721
+ {
15722
+ "epoch": 0.9798530457454373,
15723
+ "grad_norm": 5.509089469909668,
15724
+ "learning_rate": 2.4154663938867894e-08,
15725
+ "loss": 0.1847,
15726
+ "step": 2067
15727
+ },
15728
+ {
15729
+ "epoch": 0.9803270917278976,
15730
+ "grad_norm": 3.7570600509643555,
15731
+ "learning_rate": 2.3018571193729188e-08,
15732
+ "loss": 0.1604,
15733
+ "step": 2068
15734
+ },
15735
+ {
15736
+ "epoch": 0.9808011377103579,
15737
+ "grad_norm": 3.3540408611297607,
15738
+ "learning_rate": 2.190981265023373e-08,
15739
+ "loss": 0.0865,
15740
+ "step": 2069
15741
+ },
15742
+ {
15743
+ "epoch": 0.9812751836928182,
15744
+ "grad_norm": 3.976696252822876,
15745
+ "learning_rate": 2.082839134607828e-08,
15746
+ "loss": 0.1681,
15747
+ "step": 2070
15748
+ },
15749
+ {
15750
+ "epoch": 0.9817492296752786,
15751
+ "grad_norm": 5.1151838302612305,
15752
+ "learning_rate": 1.9774310244059512e-08,
15753
+ "loss": 0.1638,
15754
+ "step": 2071
15755
+ },
15756
+ {
15757
+ "epoch": 0.9822232756577388,
15758
+ "grad_norm": 6.2366414070129395,
15759
+ "learning_rate": 1.874757223207291e-08,
15760
+ "loss": 0.1142,
15761
+ "step": 2072
15762
+ },
15763
+ {
15764
+ "epoch": 0.9826973216401991,
15765
+ "grad_norm": 3.962942361831665,
15766
+ "learning_rate": 1.7748180123100535e-08,
15767
+ "loss": 0.136,
15768
+ "step": 2073
15769
+ },
15770
+ {
15771
+ "epoch": 0.9831713676226594,
15772
+ "grad_norm": 9.428374290466309,
15773
+ "learning_rate": 1.677613665520106e-08,
15774
+ "loss": 0.1083,
15775
+ "step": 2074
15776
+ },
15777
+ {
15778
+ "epoch": 0.9836454136051197,
15779
+ "grad_norm": 6.933211326599121,
15780
+ "learning_rate": 1.583144449150975e-08,
15781
+ "loss": 0.1716,
15782
+ "step": 2075
15783
+ },
15784
+ {
15785
+ "epoch": 0.98411945958758,
15786
+ "grad_norm": 5.4883575439453125,
15787
+ "learning_rate": 1.4914106220225156e-08,
15788
+ "loss": 0.1931,
15789
+ "step": 2076
15790
+ },
15791
+ {
15792
+ "epoch": 0.9845935055700403,
15793
+ "grad_norm": 3.674689531326294,
15794
+ "learning_rate": 1.402412435460132e-08,
15795
+ "loss": 0.167,
15796
+ "step": 2077
15797
+ },
15798
+ {
15799
+ "epoch": 0.9850675515525006,
15800
+ "grad_norm": 3.3293275833129883,
15801
+ "learning_rate": 1.3161501332947802e-08,
15802
+ "loss": 0.1224,
15803
+ "step": 2078
15804
+ },
15805
+ {
15806
+ "epoch": 0.9855415975349608,
15807
+ "grad_norm": 5.863772392272949,
15808
+ "learning_rate": 1.2326239518614114e-08,
15809
+ "loss": 0.1418,
15810
+ "step": 2079
15811
+ },
15812
+ {
15813
+ "epoch": 0.9860156435174212,
15814
+ "grad_norm": 4.600866317749023,
15815
+ "learning_rate": 1.1518341199989735e-08,
15816
+ "loss": 0.101,
15817
+ "step": 2080
15818
+ },
15819
+ {
15820
+ "epoch": 0.9860156435174212,
15821
+ "eval_accuracy": 0.9935587761674718,
15822
+ "eval_f1": 0.9272727272727272,
15823
+ "eval_loss": 0.012615163810551167,
15824
+ "eval_precision": 0.8793103448275862,
15825
+ "eval_recall": 0.9807692307692307,
15826
+ "eval_runtime": 50.0081,
15827
+ "eval_samples_per_second": 5.419,
15828
+ "eval_steps_per_second": 0.18,
15829
+ "step": 2080
15830
+ },
15831
+ {
15832
+ "epoch": 0.9864896894998815,
15833
+ "grad_norm": 6.645082950592041,
15834
+ "learning_rate": 1.0737808590495225e-08,
15835
+ "loss": 0.1798,
15836
+ "step": 2081
15837
+ },
15838
+ {
15839
+ "epoch": 0.9869637354823418,
15840
+ "grad_norm": 3.026750087738037,
15841
+ "learning_rate": 9.984643828576669e-09,
15842
+ "loss": 0.1094,
15843
+ "step": 2082
15844
+ },
15845
+ {
15846
+ "epoch": 0.9874377814648021,
15847
+ "grad_norm": 4.144604206085205,
15848
+ "learning_rate": 9.25884897770013e-09,
15849
+ "loss": 0.1389,
15850
+ "step": 2083
15851
+ },
15852
+ {
15853
+ "epoch": 0.9879118274472624,
15854
+ "grad_norm": 5.202576637268066,
15855
+ "learning_rate": 8.560426026343881e-09,
15856
+ "loss": 0.2081,
15857
+ "step": 2084
15858
+ },
15859
+ {
15860
+ "epoch": 0.9883858734297227,
15861
+ "grad_norm": 3.02374005317688,
15862
+ "learning_rate": 7.889376887997291e-09,
15863
+ "loss": 0.1092,
15864
+ "step": 2085
15865
+ },
15866
+ {
15867
+ "epoch": 0.988859919412183,
15868
+ "grad_norm": 2.9011049270629883,
15869
+ "learning_rate": 7.245703401149717e-09,
15870
+ "loss": 0.1357,
15871
+ "step": 2086
15872
+ },
15873
+ {
15874
+ "epoch": 0.9893339653946432,
15875
+ "grad_norm": 6.7899250984191895,
15876
+ "learning_rate": 6.629407329292736e-09,
15877
+ "loss": 0.1352,
15878
+ "step": 2087
15879
+ },
15880
+ {
15881
+ "epoch": 0.9898080113771036,
15882
+ "grad_norm": 3.2638795375823975,
15883
+ "learning_rate": 6.0404903609068146e-09,
15884
+ "loss": 0.1101,
15885
+ "step": 2088
15886
+ },
15887
+ {
15888
+ "epoch": 0.9902820573595639,
15889
+ "grad_norm": 5.045032501220703,
15890
+ "learning_rate": 5.47895410946575e-09,
15891
+ "loss": 0.1703,
15892
+ "step": 2089
15893
+ },
15894
+ {
15895
+ "epoch": 0.9907561033420241,
15896
+ "grad_norm": 3.866666078567505,
15897
+ "learning_rate": 4.9448001134233536e-09,
15898
+ "loss": 0.1551,
15899
+ "step": 2090
15900
+ },
15901
+ {
15902
+ "epoch": 0.9912301493244845,
15903
+ "grad_norm": 3.6094112396240234,
15904
+ "learning_rate": 4.438029836216773e-09,
15905
+ "loss": 0.1368,
15906
+ "step": 2091
15907
+ },
15908
+ {
15909
+ "epoch": 0.9917041953069448,
15910
+ "grad_norm": 4.1954779624938965,
15911
+ "learning_rate": 3.958644666257616e-09,
15912
+ "loss": 0.1503,
15913
+ "step": 2092
15914
+ },
15915
+ {
15916
+ "epoch": 0.9921782412894051,
15917
+ "grad_norm": 3.4606990814208984,
15918
+ "learning_rate": 3.5066459169297294e-09,
15919
+ "loss": 0.1193,
15920
+ "step": 2093
15921
+ },
15922
+ {
15923
+ "epoch": 0.9926522872718654,
15924
+ "grad_norm": 3.768021821975708,
15925
+ "learning_rate": 3.082034826586977e-09,
15926
+ "loss": 0.171,
15927
+ "step": 2094
15928
+ },
15929
+ {
15930
+ "epoch": 0.9931263332543256,
15931
+ "grad_norm": 3.0747859477996826,
15932
+ "learning_rate": 2.684812558547689e-09,
15933
+ "loss": 0.0801,
15934
+ "step": 2095
15935
+ },
15936
+ {
15937
+ "epoch": 0.993600379236786,
15938
+ "grad_norm": 6.338437080383301,
15939
+ "learning_rate": 2.3149802010913323e-09,
15940
+ "loss": 0.1644,
15941
+ "step": 2096
15942
+ },
15943
+ {
15944
+ "epoch": 0.9940744252192463,
15945
+ "grad_norm": 3.9809048175811768,
15946
+ "learning_rate": 1.9725387674585095e-09,
15947
+ "loss": 0.1284,
15948
+ "step": 2097
15949
+ },
15950
+ {
15951
+ "epoch": 0.9945484712017065,
15952
+ "grad_norm": 4.982059955596924,
15953
+ "learning_rate": 1.6574891958442973e-09,
15954
+ "loss": 0.1434,
15955
+ "step": 2098
15956
+ },
15957
+ {
15958
+ "epoch": 0.9950225171841669,
15959
+ "grad_norm": 2.924060583114624,
15960
+ "learning_rate": 1.3698323493993582e-09,
15961
+ "loss": 0.0911,
15962
+ "step": 2099
15963
+ },
15964
+ {
15965
+ "epoch": 0.9954965631666272,
15966
+ "grad_norm": 7.759647846221924,
15967
+ "learning_rate": 1.1095690162243878e-09,
15968
+ "loss": 0.2136,
15969
+ "step": 2100
15970
+ },
15971
+ {
15972
+ "epoch": 0.9954965631666272,
15973
+ "eval_accuracy": 0.9935587761674718,
15974
+ "eval_f1": 0.9272727272727272,
15975
+ "eval_loss": 0.012555374763906002,
15976
+ "eval_precision": 0.8793103448275862,
15977
+ "eval_recall": 0.9807692307692307,
15978
+ "eval_runtime": 50.017,
15979
+ "eval_samples_per_second": 5.418,
15980
+ "eval_steps_per_second": 0.18,
15981
+ "step": 2100
15982
  }
15983
  ],
15984
  "logging_steps": 1,
 
15998
  "attributes": {}
15999
  }
16000
  },
16001
+ "total_flos": 5.6093907946255155e+17,
16002
  "train_batch_size": 8,
16003
  "trial_name": null,
16004
  "trial_params": null