mtzig committed
Commit 6d384dd · verified · 1 Parent(s): 0f6cfd8

Training in progress, step 2100, checkpoint

last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0d7192018c64bdccff774afcd22cbcd76059fa27194291eeaacbc76ea524aa63
+ oid sha256:c626d88cecaf6f37244c24626ee31bda254de73e335860f886b2be28c4358d97
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8ca22c33f8080251d5f5d97f28d33c8cc18a4767e5f5cff87416545e5fb185fb
+ oid sha256:b98f82f21939a42caf63b3e60fa8693d044a5bada470fb4c47ca564bc1aa2906
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:66623e6446f8054aee5d492b054a9455f9bb8adbad530ba6f6465ee2f5929c58
+ oid sha256:d1bccfb3da16edb9ca2352f991e7e2c84949c2cebb82bdfe6dff4edb7588812b
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:54b62f8c309871fa5d8782861ae4bf92a5dceb25d023cd3a9d768be5615f069a
+ oid sha256:2f1b5e474c9b591c523f4c4558a63e2fdd86f92990aa17d39609578b1c9d025a
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:84e430f88a0a44f989953bfa3d00715c78a1e556790668bcf94552bbb3132bcd
+ oid sha256:84eef1c8a398e669a09b130c39c3f146f2a1df5c8f58186431773f03716ad0dd
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2c41f1e9710327af3eb1dbcd995f1c8c4728f5def02217750c574e1151cb857f
+ oid sha256:85c2c1bfcfbe43cb98961bcf7bbee9910700d60cc94ea9e559cdcc0bfcaf1d3a
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8f3d47d510d184b2506f02c98ec0193d42ca3e28479c7d3b1251b62aeee8ed8a
+ oid sha256:65404a56baaeb38eea09621cc68aa2f31f268f0657702a26eb129038b9b80d1b
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9ee788f6b1cf391d52047972d3a7d27b4d29af32d1a4ba0a7f601e15b6e16d3a
+ oid sha256:f3c2b908498addeec6c50ef933c786ada650e8ffdacabaf686c730cc90d5e9dd
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:893faec3b6c926fa56067e55d73350b0ed9727d9be736c36b2925e7f1e74fab7
+ oid sha256:75d7eee0983d654dc4f4d9d0aeab1c0cc99847a413b7ee9122cbe6f31278739d
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eb66081da603be4d12e8cfebe24115be6094a48664c48955e07784d22997190a
+ oid sha256:f6b89b5ae016f3558d6cf4489eb242de8fea1141c77af78593bebef95e5e45eb
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4b1dff40192627c15279da223a06773cb9569b8accd66f51d83aab7cd1a1d6ae
+ oid sha256:9375cbe9615de32a9bfeb48c97d58f16a884f450ceae1c1433fd9c53f512214c
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e4bff99e2375f4a7846a05096a3ce957abe9a2c562cd9e7982d628fafd52f87e
+ oid sha256:13c4476d4d3e749b45bb7cf5bd672971013f9e7d9039dbfad26020d82e32caff
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0d73ccbddd9878e2801e4e223113f627d05857e225082b8462f35474c4ac4809
+ oid sha256:20531ddcffa25460cb7198bef6ec4382015b394eaa7700ad1ffe8c13cee7ce9f
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.9456264775413712,
+ "epoch": 0.9929078014184397,
  "eval_steps": 20,
- "global_step": 2000,
+ "global_step": 2100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -15219,6 +15219,766 @@
  "eval_samples_per_second": 5.641,
  "eval_steps_per_second": 0.184,
  "step": 2000
15222
+ },
15223
+ {
15224
+ "epoch": 0.9460992907801419,
15225
+ "grad_norm": 5.928063869476318,
15226
+ "learning_rate": 1.7657111276051852e-07,
15227
+ "loss": 0.2758,
15228
+ "step": 2001
15229
+ },
15230
+ {
15231
+ "epoch": 0.9465721040189126,
15232
+ "grad_norm": 5.081968784332275,
15233
+ "learning_rate": 1.734959100204281e-07,
15234
+ "loss": 0.1877,
15235
+ "step": 2002
15236
+ },
15237
+ {
15238
+ "epoch": 0.9470449172576832,
15239
+ "grad_norm": 5.424426078796387,
15240
+ "learning_rate": 1.704474879300766e-07,
15241
+ "loss": 0.216,
15242
+ "step": 2003
15243
+ },
15244
+ {
15245
+ "epoch": 0.9475177304964539,
15246
+ "grad_norm": 5.300611972808838,
15247
+ "learning_rate": 1.6742585479747388e-07,
15248
+ "loss": 0.2141,
15249
+ "step": 2004
15250
+ },
15251
+ {
15252
+ "epoch": 0.9479905437352246,
15253
+ "grad_norm": 7.5446858406066895,
15254
+ "learning_rate": 1.6443101885762812e-07,
15255
+ "loss": 0.2932,
15256
+ "step": 2005
15257
+ },
15258
+ {
15259
+ "epoch": 0.9484633569739953,
15260
+ "grad_norm": 6.073637008666992,
15261
+ "learning_rate": 1.614629882725094e-07,
15262
+ "loss": 0.2036,
15263
+ "step": 2006
15264
+ },
15265
+ {
15266
+ "epoch": 0.948936170212766,
15267
+ "grad_norm": 7.519749164581299,
15268
+ "learning_rate": 1.5852177113103606e-07,
15269
+ "loss": 0.2765,
15270
+ "step": 2007
15271
+ },
15272
+ {
15273
+ "epoch": 0.9494089834515367,
15274
+ "grad_norm": 6.116303443908691,
15275
+ "learning_rate": 1.5560737544905058e-07,
15276
+ "loss": 0.2913,
15277
+ "step": 2008
15278
+ },
15279
+ {
15280
+ "epoch": 0.9498817966903074,
15281
+ "grad_norm": 5.81624174118042,
15282
+ "learning_rate": 1.5271980916929497e-07,
15283
+ "loss": 0.2321,
15284
+ "step": 2009
15285
+ },
15286
+ {
15287
+ "epoch": 0.950354609929078,
15288
+ "grad_norm": 5.760371208190918,
15289
+ "learning_rate": 1.498590801613975e-07,
15290
+ "loss": 0.2134,
15291
+ "step": 2010
15292
+ },
15293
+ {
15294
+ "epoch": 0.9508274231678487,
15295
+ "grad_norm": 5.03253698348999,
15296
+ "learning_rate": 1.4702519622184053e-07,
15297
+ "loss": 0.2093,
15298
+ "step": 2011
15299
+ },
15300
+ {
15301
+ "epoch": 0.9513002364066194,
15302
+ "grad_norm": 4.581620693206787,
15303
+ "learning_rate": 1.4421816507394605e-07,
15304
+ "loss": 0.2063,
15305
+ "step": 2012
15306
+ },
15307
+ {
15308
+ "epoch": 0.9517730496453901,
15309
+ "grad_norm": 5.890350818634033,
15310
+ "learning_rate": 1.4143799436785233e-07,
15311
+ "loss": 0.2267,
15312
+ "step": 2013
15313
+ },
15314
+ {
15315
+ "epoch": 0.9522458628841608,
15316
+ "grad_norm": 6.05654764175415,
15317
+ "learning_rate": 1.3868469168049403e-07,
15318
+ "loss": 0.2326,
15319
+ "step": 2014
15320
+ },
15321
+ {
15322
+ "epoch": 0.9527186761229315,
15323
+ "grad_norm": 3.6070337295532227,
15324
+ "learning_rate": 1.3595826451558214e-07,
15325
+ "loss": 0.1469,
15326
+ "step": 2015
15327
+ },
15328
+ {
15329
+ "epoch": 0.9531914893617022,
15330
+ "grad_norm": 7.624080181121826,
15331
+ "learning_rate": 1.3325872030357955e-07,
15332
+ "loss": 0.2893,
15333
+ "step": 2016
15334
+ },
15335
+ {
15336
+ "epoch": 0.9536643026004729,
15337
+ "grad_norm": 6.688779354095459,
15338
+ "learning_rate": 1.3058606640168558e-07,
15339
+ "loss": 0.2668,
15340
+ "step": 2017
15341
+ },
15342
+ {
15343
+ "epoch": 0.9541371158392435,
15344
+ "grad_norm": 6.714046001434326,
15345
+ "learning_rate": 1.279403100938148e-07,
15346
+ "loss": 0.2095,
15347
+ "step": 2018
15348
+ },
15349
+ {
15350
+ "epoch": 0.9546099290780142,
15351
+ "grad_norm": 3.696683406829834,
15352
+ "learning_rate": 1.25321458590576e-07,
15353
+ "loss": 0.1431,
15354
+ "step": 2019
15355
+ },
15356
+ {
15357
+ "epoch": 0.9550827423167849,
15358
+ "grad_norm": 6.133592128753662,
15359
+ "learning_rate": 1.2272951902925211e-07,
15360
+ "loss": 0.3241,
15361
+ "step": 2020
15362
+ },
15363
+ {
15364
+ "epoch": 0.9550827423167849,
15365
+ "eval_accuracy": 0.8647450110864745,
15366
+ "eval_f1": 0.7162790697674418,
15367
+ "eval_loss": 0.2992999255657196,
15368
+ "eval_precision": 0.8700564971751412,
15369
+ "eval_recall": 0.6086956521739131,
15370
+ "eval_runtime": 48.4915,
15371
+ "eval_samples_per_second": 5.692,
15372
+ "eval_steps_per_second": 0.186,
15373
+ "step": 2020
15374
+ },
15375
+ {
15376
+ "epoch": 0.9555555555555556,
15377
+ "grad_norm": 7.198812007904053,
15378
+ "learning_rate": 1.201644984737804e-07,
15379
+ "loss": 0.2988,
15380
+ "step": 2021
15381
+ },
15382
+ {
15383
+ "epoch": 0.9560283687943263,
15384
+ "grad_norm": 4.9037322998046875,
15385
+ "learning_rate": 1.1762640391473901e-07,
15386
+ "loss": 0.2401,
15387
+ "step": 2022
15388
+ },
15389
+ {
15390
+ "epoch": 0.956501182033097,
15391
+ "grad_norm": 4.425469398498535,
15392
+ "learning_rate": 1.1511524226931914e-07,
15393
+ "loss": 0.1406,
15394
+ "step": 2023
15395
+ },
15396
+ {
15397
+ "epoch": 0.9569739952718677,
15398
+ "grad_norm": 5.938382625579834,
15399
+ "learning_rate": 1.126310203813108e-07,
15400
+ "loss": 0.2148,
15401
+ "step": 2024
15402
+ },
15403
+ {
15404
+ "epoch": 0.9574468085106383,
15405
+ "grad_norm": 7.644670486450195,
15406
+ "learning_rate": 1.1017374502108713e-07,
15407
+ "loss": 0.2778,
15408
+ "step": 2025
15409
+ },
15410
+ {
15411
+ "epoch": 0.957919621749409,
15412
+ "grad_norm": 5.539424896240234,
15413
+ "learning_rate": 1.0774342288557892e-07,
15414
+ "loss": 0.2106,
15415
+ "step": 2026
15416
+ },
15417
+ {
15418
+ "epoch": 0.9583924349881797,
15419
+ "grad_norm": 6.603002548217773,
15420
+ "learning_rate": 1.053400605982613e-07,
15421
+ "loss": 0.2815,
15422
+ "step": 2027
15423
+ },
15424
+ {
15425
+ "epoch": 0.9588652482269504,
15426
+ "grad_norm": 4.729203701019287,
15427
+ "learning_rate": 1.0296366470913477e-07,
15428
+ "loss": 0.2226,
15429
+ "step": 2028
15430
+ },
15431
+ {
15432
+ "epoch": 0.9593380614657211,
15433
+ "grad_norm": 7.116330623626709,
15434
+ "learning_rate": 1.0061424169470646e-07,
15435
+ "loss": 0.299,
15436
+ "step": 2029
15437
+ },
15438
+ {
15439
+ "epoch": 0.9598108747044918,
15440
+ "grad_norm": 6.153399467468262,
15441
+ "learning_rate": 9.829179795797339e-08,
15442
+ "loss": 0.2681,
15443
+ "step": 2030
15444
+ },
15445
+ {
15446
+ "epoch": 0.9602836879432625,
15447
+ "grad_norm": 4.379301071166992,
15448
+ "learning_rate": 9.599633982840362e-08,
15449
+ "loss": 0.1883,
15450
+ "step": 2031
15451
+ },
15452
+ {
15453
+ "epoch": 0.9607565011820332,
15454
+ "grad_norm": 5.625801086425781,
15455
+ "learning_rate": 9.372787356192181e-08,
15456
+ "loss": 0.1923,
15457
+ "step": 2032
15458
+ },
15459
+ {
15460
+ "epoch": 0.9612293144208038,
15461
+ "grad_norm": 4.8772077560424805,
15462
+ "learning_rate": 9.148640534089037e-08,
15463
+ "loss": 0.1565,
15464
+ "step": 2033
15465
+ },
15466
+ {
15467
+ "epoch": 0.9617021276595744,
15468
+ "grad_norm": 6.87009334564209,
15469
+ "learning_rate": 8.927194127408945e-08,
15470
+ "loss": 0.2341,
15471
+ "step": 2034
15472
+ },
15473
+ {
15474
+ "epoch": 0.9621749408983451,
15475
+ "grad_norm": 4.184564113616943,
15476
+ "learning_rate": 8.708448739670805e-08,
15477
+ "loss": 0.1848,
15478
+ "step": 2035
15479
+ },
15480
+ {
15481
+ "epoch": 0.9626477541371158,
15482
+ "grad_norm": 4.61867094039917,
15483
+ "learning_rate": 8.492404967031853e-08,
15484
+ "loss": 0.175,
15485
+ "step": 2036
15486
+ },
15487
+ {
15488
+ "epoch": 0.9631205673758865,
15489
+ "grad_norm": 3.9743919372558594,
15490
+ "learning_rate": 8.27906339828688e-08,
15491
+ "loss": 0.1485,
15492
+ "step": 2037
15493
+ },
15494
+ {
15495
+ "epoch": 0.9635933806146572,
15496
+ "grad_norm": 6.921072959899902,
15497
+ "learning_rate": 8.0684246148659e-08,
15498
+ "loss": 0.2734,
15499
+ "step": 2038
15500
+ },
15501
+ {
15502
+ "epoch": 0.9640661938534278,
15503
+ "grad_norm": 4.7037129402160645,
15504
+ "learning_rate": 7.860489190833043e-08,
15505
+ "loss": 0.1407,
15506
+ "step": 2039
15507
+ },
15508
+ {
15509
+ "epoch": 0.9645390070921985,
15510
+ "grad_norm": 5.145064353942871,
15511
+ "learning_rate": 7.655257692884998e-08,
15512
+ "loss": 0.2289,
15513
+ "step": 2040
15514
+ },
15515
+ {
15516
+ "epoch": 0.9645390070921985,
15517
+ "eval_accuracy": 0.8658536585365854,
15518
+ "eval_f1": 0.7192575406032483,
15519
+ "eval_loss": 0.29763469099998474,
15520
+ "eval_precision": 0.8707865168539326,
15521
+ "eval_recall": 0.6126482213438735,
15522
+ "eval_runtime": 48.2853,
15523
+ "eval_samples_per_second": 5.716,
15524
+ "eval_steps_per_second": 0.186,
15525
+ "step": 2040
15526
+ },
15527
+ {
15528
+ "epoch": 0.9650118203309692,
15529
+ "grad_norm": 5.289119243621826,
15530
+ "learning_rate": 7.452730680349019e-08,
15531
+ "loss": 0.2251,
15532
+ "step": 2041
15533
+ },
15534
+ {
15535
+ "epoch": 0.9654846335697399,
15536
+ "grad_norm": 7.4958624839782715,
15537
+ "learning_rate": 7.252908705181805e-08,
15538
+ "loss": 0.2453,
15539
+ "step": 2042
15540
+ },
15541
+ {
15542
+ "epoch": 0.9659574468085106,
15543
+ "grad_norm": 5.394641876220703,
15544
+ "learning_rate": 7.055792311967958e-08,
15545
+ "loss": 0.2879,
15546
+ "step": 2043
15547
+ },
15548
+ {
15549
+ "epoch": 0.9664302600472813,
15550
+ "grad_norm": 4.002281665802002,
15551
+ "learning_rate": 6.861382037918418e-08,
15552
+ "loss": 0.1805,
15553
+ "step": 2044
15554
+ },
15555
+ {
15556
+ "epoch": 0.966903073286052,
15557
+ "grad_norm": 5.974024295806885,
15558
+ "learning_rate": 6.669678412868919e-08,
15559
+ "loss": 0.2024,
15560
+ "step": 2045
15561
+ },
15562
+ {
15563
+ "epoch": 0.9673758865248226,
15564
+ "grad_norm": 5.801767349243164,
15565
+ "learning_rate": 6.480681959278645e-08,
15566
+ "loss": 0.2164,
15567
+ "step": 2046
15568
+ },
15569
+ {
15570
+ "epoch": 0.9678486997635933,
15571
+ "grad_norm": 4.779239177703857,
15572
+ "learning_rate": 6.29439319222891e-08,
15573
+ "loss": 0.1936,
15574
+ "step": 2047
15575
+ },
15576
+ {
15577
+ "epoch": 0.968321513002364,
15578
+ "grad_norm": 4.674015522003174,
15579
+ "learning_rate": 6.11081261942148e-08,
15580
+ "loss": 0.2035,
15581
+ "step": 2048
15582
+ },
15583
+ {
15584
+ "epoch": 0.9687943262411347,
15585
+ "grad_norm": 6.905233860015869,
15586
+ "learning_rate": 5.929940741177476e-08,
15587
+ "loss": 0.2818,
15588
+ "step": 2049
15589
+ },
15590
+ {
15591
+ "epoch": 0.9692671394799054,
15592
+ "grad_norm": 9.568391799926758,
15593
+ "learning_rate": 5.751778050435808e-08,
15594
+ "loss": 0.32,
15595
+ "step": 2050
15596
+ },
15597
+ {
15598
+ "epoch": 0.9697399527186761,
15599
+ "grad_norm": 5.665557384490967,
15600
+ "learning_rate": 5.5763250327518505e-08,
15601
+ "loss": 0.2695,
15602
+ "step": 2051
15603
+ },
15604
+ {
15605
+ "epoch": 0.9702127659574468,
15606
+ "grad_norm": 4.919648170471191,
15607
+ "learning_rate": 5.4035821662963285e-08,
15608
+ "loss": 0.2343,
15609
+ "step": 2052
15610
+ },
15611
+ {
15612
+ "epoch": 0.9706855791962175,
15613
+ "grad_norm": 3.9685451984405518,
15614
+ "learning_rate": 5.233549921853876e-08,
15615
+ "loss": 0.18,
15616
+ "step": 2053
15617
+ },
15618
+ {
15619
+ "epoch": 0.9711583924349881,
15620
+ "grad_norm": 5.1178131103515625,
15621
+ "learning_rate": 5.066228762821479e-08,
15622
+ "loss": 0.1903,
15623
+ "step": 2054
15624
+ },
15625
+ {
15626
+ "epoch": 0.9716312056737588,
15627
+ "grad_norm": 6.247317314147949,
15628
+ "learning_rate": 4.901619145207703e-08,
15629
+ "loss": 0.1892,
15630
+ "step": 2055
15631
+ },
15632
+ {
15633
+ "epoch": 0.9721040189125295,
15634
+ "grad_norm": 3.8373396396636963,
15635
+ "learning_rate": 4.7397215176311354e-08,
15636
+ "loss": 0.1359,
15637
+ "step": 2056
15638
+ },
15639
+ {
15640
+ "epoch": 0.9725768321513002,
15641
+ "grad_norm": 6.623259544372559,
15642
+ "learning_rate": 4.580536321319273e-08,
15643
+ "loss": 0.23,
15644
+ "step": 2057
15645
+ },
15646
+ {
15647
+ "epoch": 0.9730496453900709,
15648
+ "grad_norm": 5.989914894104004,
15649
+ "learning_rate": 4.424063990107308e-08,
15650
+ "loss": 0.2538,
15651
+ "step": 2058
15652
+ },
15653
+ {
15654
+ "epoch": 0.9735224586288416,
15655
+ "grad_norm": 4.51497745513916,
15656
+ "learning_rate": 4.270304950436788e-08,
15657
+ "loss": 0.1994,
15658
+ "step": 2059
15659
+ },
15660
+ {
15661
+ "epoch": 0.9739952718676123,
15662
+ "grad_norm": 4.718496799468994,
15663
+ "learning_rate": 4.119259621354843e-08,
15664
+ "loss": 0.1593,
15665
+ "step": 2060
15666
+ },
15667
+ {
15668
+ "epoch": 0.9739952718676123,
15669
+ "eval_accuracy": 0.8636363636363636,
15670
+ "eval_f1": 0.7132867132867133,
15671
+ "eval_loss": 0.29835787415504456,
15672
+ "eval_precision": 0.8693181818181818,
15673
+ "eval_recall": 0.6047430830039525,
15674
+ "eval_runtime": 48.4979,
15675
+ "eval_samples_per_second": 5.691,
15676
+ "eval_steps_per_second": 0.186,
15677
+ "step": 2060
15678
+ },
15679
+ {
15680
+ "epoch": 0.9744680851063829,
15681
+ "grad_norm": 4.2522358894348145,
15682
+ "learning_rate": 3.9709284145125205e-08,
15683
+ "loss": 0.2072,
15684
+ "step": 2061
15685
+ },
15686
+ {
15687
+ "epoch": 0.9749408983451536,
15688
+ "grad_norm": 6.090972900390625,
15689
+ "learning_rate": 3.825311734164116e-08,
15690
+ "loss": 0.227,
15691
+ "step": 2062
15692
+ },
15693
+ {
15694
+ "epoch": 0.9754137115839243,
15695
+ "grad_norm": 5.209742546081543,
15696
+ "learning_rate": 3.682409977165957e-08,
15697
+ "loss": 0.214,
15698
+ "step": 2063
15699
+ },
15700
+ {
15701
+ "epoch": 0.975886524822695,
15702
+ "grad_norm": 5.365957260131836,
15703
+ "learning_rate": 3.5422235329751756e-08,
15704
+ "loss": 0.1831,
15705
+ "step": 2064
15706
+ },
15707
+ {
15708
+ "epoch": 0.9763593380614657,
15709
+ "grad_norm": 9.389203071594238,
15710
+ "learning_rate": 3.4047527836483793e-08,
15711
+ "loss": 0.2723,
15712
+ "step": 2065
15713
+ },
15714
+ {
15715
+ "epoch": 0.9768321513002364,
15716
+ "grad_norm": 7.358561038970947,
15717
+ "learning_rate": 3.269998103841765e-08,
15718
+ "loss": 0.2694,
15719
+ "step": 2066
15720
+ },
15721
+ {
15722
+ "epoch": 0.9773049645390071,
15723
+ "grad_norm": 5.198401927947998,
15724
+ "learning_rate": 3.137959860808448e-08,
15725
+ "loss": 0.29,
15726
+ "step": 2067
15727
+ },
15728
+ {
15729
+ "epoch": 0.9777777777777777,
15730
+ "grad_norm": 5.073206901550293,
15731
+ "learning_rate": 3.008638414398801e-08,
15732
+ "loss": 0.2165,
15733
+ "step": 2068
15734
+ },
15735
+ {
15736
+ "epoch": 0.9782505910165484,
15737
+ "grad_norm": 5.652972221374512,
15738
+ "learning_rate": 2.882034117058896e-08,
15739
+ "loss": 0.2447,
15740
+ "step": 2069
15741
+ },
15742
+ {
15743
+ "epoch": 0.9787234042553191,
15744
+ "grad_norm": 5.199291229248047,
15745
+ "learning_rate": 2.7581473138296177e-08,
15746
+ "loss": 0.2055,
15747
+ "step": 2070
15748
+ },
15749
+ {
15750
+ "epoch": 0.9791962174940898,
15751
+ "grad_norm": 4.334774017333984,
15752
+ "learning_rate": 2.636978342345553e-08,
15753
+ "loss": 0.1535,
15754
+ "step": 2071
15755
+ },
15756
+ {
15757
+ "epoch": 0.9796690307328605,
15758
+ "grad_norm": 5.554661750793457,
15759
+ "learning_rate": 2.518527532834436e-08,
15760
+ "loss": 0.239,
15761
+ "step": 2072
15762
+ },
15763
+ {
15764
+ "epoch": 0.9801418439716312,
15765
+ "grad_norm": 5.669870853424072,
15766
+ "learning_rate": 2.402795208116149e-08,
15767
+ "loss": 0.2128,
15768
+ "step": 2073
15769
+ },
15770
+ {
15771
+ "epoch": 0.9806146572104019,
15772
+ "grad_norm": 5.936855316162109,
15773
+ "learning_rate": 2.2897816836014996e-08,
15774
+ "loss": 0.275,
15775
+ "step": 2074
15776
+ },
15777
+ {
15778
+ "epoch": 0.9810874704491725,
15779
+ "grad_norm": 10.341303825378418,
15780
+ "learning_rate": 2.179487267291891e-08,
15781
+ "loss": 0.3173,
15782
+ "step": 2075
15783
+ },
15784
+ {
15785
+ "epoch": 0.9815602836879432,
15786
+ "grad_norm": 6.050800323486328,
15787
+ "learning_rate": 2.071912259777875e-08,
15788
+ "loss": 0.2196,
15789
+ "step": 2076
15790
+ },
15791
+ {
15792
+ "epoch": 0.9820330969267139,
15793
+ "grad_norm": 5.055636882781982,
15794
+ "learning_rate": 1.967056954238933e-08,
15795
+ "loss": 0.181,
15796
+ "step": 2077
15797
+ },
15798
+ {
15799
+ "epoch": 0.9825059101654846,
15800
+ "grad_norm": 7.4767632484436035,
15801
+ "learning_rate": 1.864921636442252e-08,
15802
+ "loss": 0.201,
15803
+ "step": 2078
15804
+ },
15805
+ {
15806
+ "epoch": 0.9829787234042553,
15807
+ "grad_norm": 6.8587493896484375,
15808
+ "learning_rate": 1.7655065847423935e-08,
15809
+ "loss": 0.3132,
15810
+ "step": 2079
15811
+ },
15812
+ {
15813
+ "epoch": 0.983451536643026,
15814
+ "grad_norm": 7.90069580078125,
15815
+ "learning_rate": 1.6688120700798505e-08,
15816
+ "loss": 0.2018,
15817
+ "step": 2080
15818
+ },
15819
+ {
15820
+ "epoch": 0.983451536643026,
15821
+ "eval_accuracy": 0.8647450110864745,
15822
+ "eval_f1": 0.7175925925925926,
15823
+ "eval_loss": 0.29836517572402954,
15824
+ "eval_precision": 0.8659217877094972,
15825
+ "eval_recall": 0.6126482213438735,
15826
+ "eval_runtime": 48.9715,
15827
+ "eval_samples_per_second": 5.636,
15828
+ "eval_steps_per_second": 0.184,
15829
+ "step": 2080
15830
+ },
15831
+ {
15832
+ "epoch": 0.9839243498817967,
15833
+ "grad_norm": 6.26698637008667,
15834
+ "learning_rate": 1.5748383559809345e-08,
15835
+ "loss": 0.2399,
15836
+ "step": 2081
15837
+ },
15838
+ {
15839
+ "epoch": 0.9843971631205674,
15840
+ "grad_norm": 6.140974044799805,
15841
+ "learning_rate": 1.4835856985568887e-08,
15842
+ "loss": 0.2634,
15843
+ "step": 2082
15844
+ },
15845
+ {
15846
+ "epoch": 0.984869976359338,
15847
+ "grad_norm": 4.758864879608154,
15848
+ "learning_rate": 1.3950543465027777e-08,
15849
+ "loss": 0.2022,
15850
+ "step": 2083
15851
+ },
15852
+ {
15853
+ "epoch": 0.9853427895981087,
15854
+ "grad_norm": 6.061093330383301,
15855
+ "learning_rate": 1.3092445410977094e-08,
15856
+ "loss": 0.2611,
15857
+ "step": 2084
15858
+ },
15859
+ {
15860
+ "epoch": 0.9858156028368794,
15861
+ "grad_norm": 5.0369696617126465,
15862
+ "learning_rate": 1.2261565162030586e-08,
15863
+ "loss": 0.242,
15864
+ "step": 2085
15865
+ },
15866
+ {
15867
+ "epoch": 0.9862884160756501,
15868
+ "grad_norm": 4.759927272796631,
15869
+ "learning_rate": 1.1457904982627998e-08,
15870
+ "loss": 0.2424,
15871
+ "step": 2086
15872
+ },
15873
+ {
15874
+ "epoch": 0.9867612293144208,
15875
+ "grad_norm": 4.427268028259277,
15876
+ "learning_rate": 1.0681467063022866e-08,
15877
+ "loss": 0.1903,
15878
+ "step": 2087
15879
+ },
15880
+ {
15881
+ "epoch": 0.9872340425531915,
15882
+ "grad_norm": 5.498013496398926,
15883
+ "learning_rate": 9.932253519280289e-09,
15884
+ "loss": 0.1198,
15885
+ "step": 2088
15886
+ },
15887
+ {
15888
+ "epoch": 0.9877068557919622,
15889
+ "grad_norm": 5.413758754730225,
15890
+ "learning_rate": 9.210266393266942e-09,
15891
+ "loss": 0.231,
15892
+ "step": 2089
15893
+ },
15894
+ {
15895
+ "epoch": 0.9881796690307328,
15896
+ "grad_norm": 7.1858134269714355,
15897
+ "learning_rate": 8.515507652649968e-09,
15898
+ "loss": 0.26,
15899
+ "step": 2090
15900
+ },
15901
+ {
15902
+ "epoch": 0.9886524822695035,
15903
+ "grad_norm": 4.840980052947998,
15904
+ "learning_rate": 7.84797919089031e-09,
15905
+ "loss": 0.2581,
15906
+ "step": 2091
15907
+ },
15908
+ {
15909
+ "epoch": 0.9891252955082742,
15910
+ "grad_norm": 5.378105640411377,
15911
+ "learning_rate": 7.20768282723383e-09,
15912
+ "loss": 0.2107,
15913
+ "step": 2092
15914
+ },
15915
+ {
15916
+ "epoch": 0.9895981087470449,
15917
+ "grad_norm": 8.181370735168457,
15918
+ "learning_rate": 6.5946203067135395e-09,
15919
+ "loss": 0.2245,
15920
+ "step": 2093
15921
+ },
15922
+ {
15923
+ "epoch": 0.9900709219858156,
15924
+ "grad_norm": 5.936405181884766,
15925
+ "learning_rate": 6.008793300136262e-09,
15926
+ "loss": 0.1958,
15927
+ "step": 2094
15928
+ },
15929
+ {
15930
+ "epoch": 0.9905437352245863,
15931
+ "grad_norm": 6.984827995300293,
15932
+ "learning_rate": 5.450203404087084e-09,
15933
+ "loss": 0.2338,
15934
+ "step": 2095
15935
+ },
15936
+ {
15937
+ "epoch": 0.991016548463357,
15938
+ "grad_norm": 5.687265872955322,
15939
+ "learning_rate": 4.918852140916031e-09,
15940
+ "loss": 0.2498,
15941
+ "step": 2096
15942
+ },
15943
+ {
15944
+ "epoch": 0.9914893617021276,
15945
+ "grad_norm": 8.568177223205566,
15946
+ "learning_rate": 4.414740958742503e-09,
15947
+ "loss": 0.3252,
15948
+ "step": 2097
15949
+ },
15950
+ {
15951
+ "epoch": 0.9919621749408983,
15952
+ "grad_norm": 4.833063125610352,
15953
+ "learning_rate": 3.937871231444179e-09,
15954
+ "loss": 0.1798,
15955
+ "step": 2098
15956
+ },
15957
+ {
15958
+ "epoch": 0.992434988179669,
15959
+ "grad_norm": 4.7450056076049805,
15960
+ "learning_rate": 3.4882442586570143e-09,
15961
+ "loss": 0.1758,
15962
+ "step": 2099
15963
+ },
15964
+ {
15965
+ "epoch": 0.9929078014184397,
15966
+ "grad_norm": 5.54990291595459,
15967
+ "learning_rate": 3.0658612657730182e-09,
15968
+ "loss": 0.2018,
15969
+ "step": 2100
15970
+ },
15971
+ {
15972
+ "epoch": 0.9929078014184397,
15973
+ "eval_accuracy": 0.8647450110864745,
15974
+ "eval_f1": 0.7162790697674418,
15975
+ "eval_loss": 0.2974694073200226,
15976
+ "eval_precision": 0.8700564971751412,
15977
+ "eval_recall": 0.6086956521739131,
15978
+ "eval_runtime": 47.9735,
15979
+ "eval_samples_per_second": 5.753,
15980
+ "eval_steps_per_second": 0.188,
15981
+ "step": 2100
  }
  ],
  "logging_steps": 1,
@@ -15238,7 +15998,7 @@
  "attributes": {}
  }
  },
- "total_flos": 5.1185465136093594e+17,
+ "total_flos": 5.377331196550185e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null