mtzig commited on
Commit
8bfdc48
·
verified ·
1 Parent(s): 43113f1

Training in progress, step 6700, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d4fccc41669d8adadb54f68349f74f89ffff09966ac60dcb53a6e48cd78c003
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965566f8b9a741a6f2801dc78e4fbc5ac70240c8d6d7b5570ba0182bcd9674e9
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67369eaffaaf23fcc57a3579d2b350eecf84593e088e012b88be2cddfbf73336
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7212ed89906b0804da8eba1f5c500d042a2a31b594b63c7afc77b7fca62b4f05
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b50419c39b978efc4f0a7211e73d09aa76109771056a53f0af1043bfa2a908e
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93ad7abb665289229475a0dc55018b7ca6c10b70ef45f15c0b9b8f137cc5c291
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:558bd7a1550e1f29246bbb3508f6e1aeea579c63ac91e9658afafb526206e361
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a28d9e945552a66feca51fc9780b294ee621de58c9db83d3aefe7462105d0d49
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b3c3c85375cb3b52f1d532892946383bc9042f73634efc9351ea34228856e5f
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49977e9fb46265ba81ad5ce120a7b938b5fafa454d7bb632a57a63f975e9f54a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc71fae38f9e58f7ed5e1e8ac6eae4e0afc3c45a3119840f87936211ac808bef
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a633db66552fb787cb1151b9a3e2e30b0293e84603ef7d545351fc947c5f219
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8ed0d0c7fd248cf46be28fe84a80281716dee0a1579c90e502dfbf7a133a4db
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2168060d5d243a5dda1e0bc7482749ed6c7fc4cb39ff029c8a95d29643dcf6
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:318d0617050b5302b7b9fd244c0bcdb8dedde6e6db48bf8d3bfab29c9662237c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:581e13951882957811a470d66e41e45bbc9bb66544ca2d6e3568683cc9866887
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8b1b52eaa5cc0adbc5ad547706bdc14a1c79b929a785b296eb1b0d394f8b5e5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:091b7cd663405f9e474cf640b71ae20df31b45b8cceb2d74232e5c4232ae67f5
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01f0a58e9f4a9804440e8394c58ad8351def40b4f77ca1177f17b91d40c5e86b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:951da6c961efeea8abd4771cf6f335146152fd6e811aedd9376cfbaf0b5c2661
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bcc5c1e32fe134cab8ae52b6ee4359379c0b414157c020ab3e06d21256e51f1
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48de50c66a37a5de2f7b1873acf38375f58754d859c5eb82d7fe707070cddd0c
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90a32c65375a0b35f1aa52aca5fe27b9247b98c2cd81ac883e623d8b0225929b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2136ce865bd31b51bdee33783218e662d324835501f13ef2cf89d65f472e3f07
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab30ef4bf6ec4e411aa77a20b4b6abd224f83b1f055386091808c7312483b117
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f3d454f05c0bda87b3125802c8738baab69763f7e63757668c9f80a78618863
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9738104020656584,
5
  "eval_steps": 20,
6
- "global_step": 6600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -50179,6 +50179,766 @@
50179
  "eval_samples_per_second": 5.723,
50180
  "eval_steps_per_second": 0.197,
50181
  "step": 6600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50182
  }
50183
  ],
50184
  "logging_steps": 1,
@@ -50198,7 +50958,7 @@
50198
  "attributes": {}
50199
  }
50200
  },
50201
- "total_flos": 2.0335814788555735e+18,
50202
  "train_batch_size": 8,
50203
  "trial_name": null,
50204
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9885651051272594,
5
  "eval_steps": 20,
6
+ "global_step": 6700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
50179
  "eval_samples_per_second": 5.723,
50180
  "eval_steps_per_second": 0.197,
50181
  "step": 6600
50182
+ },
50183
+ {
50184
+ "epoch": 0.9739579490962744,
50185
+ "grad_norm": 4.089635372161865,
50186
+ "learning_rate": 4.106580592637577e-08,
50187
+ "loss": 0.0861,
50188
+ "step": 6601
50189
+ },
50190
+ {
50191
+ "epoch": 0.9741054961268905,
50192
+ "grad_norm": 3.2651960849761963,
50193
+ "learning_rate": 4.060078990829719e-08,
50194
+ "loss": 0.0572,
50195
+ "step": 6602
50196
+ },
50197
+ {
50198
+ "epoch": 0.9742530431575065,
50199
+ "grad_norm": 1.9765956401824951,
50200
+ "learning_rate": 4.0138416393955545e-08,
50201
+ "loss": 0.0234,
50202
+ "step": 6603
50203
+ },
50204
+ {
50205
+ "epoch": 0.9744005901881224,
50206
+ "grad_norm": 3.121824264526367,
50207
+ "learning_rate": 3.967868550602827e-08,
50208
+ "loss": 0.0691,
50209
+ "step": 6604
50210
+ },
50211
+ {
50212
+ "epoch": 0.9745481372187385,
50213
+ "grad_norm": 2.891749620437622,
50214
+ "learning_rate": 3.922159736649889e-08,
50215
+ "loss": 0.0814,
50216
+ "step": 6605
50217
+ },
50218
+ {
50219
+ "epoch": 0.9746956842493545,
50220
+ "grad_norm": 3.361797332763672,
50221
+ "learning_rate": 3.8767152096641504e-08,
50222
+ "loss": 0.0982,
50223
+ "step": 6606
50224
+ },
50225
+ {
50226
+ "epoch": 0.9748432312799705,
50227
+ "grad_norm": 2.2206366062164307,
50228
+ "learning_rate": 3.831534981703522e-08,
50229
+ "loss": 0.0196,
50230
+ "step": 6607
50231
+ },
50232
+ {
50233
+ "epoch": 0.9749907783105864,
50234
+ "grad_norm": 1.349016785621643,
50235
+ "learning_rate": 3.7866190647554145e-08,
50236
+ "loss": 0.0523,
50237
+ "step": 6608
50238
+ },
50239
+ {
50240
+ "epoch": 0.9751383253412025,
50241
+ "grad_norm": 2.72078537940979,
50242
+ "learning_rate": 3.7419674707374064e-08,
50243
+ "loss": 0.089,
50244
+ "step": 6609
50245
+ },
50246
+ {
50247
+ "epoch": 0.9752858723718185,
50248
+ "grad_norm": 1.52629816532135,
50249
+ "learning_rate": 3.697580211496798e-08,
50250
+ "loss": 0.0607,
50251
+ "step": 6610
50252
+ },
50253
+ {
50254
+ "epoch": 0.9754334194024346,
50255
+ "grad_norm": 1.2776182889938354,
50256
+ "learning_rate": 3.6534572988106144e-08,
50257
+ "loss": 0.0283,
50258
+ "step": 6611
50259
+ },
50260
+ {
50261
+ "epoch": 0.9755809664330506,
50262
+ "grad_norm": 1.8487846851348877,
50263
+ "learning_rate": 3.6095987443860445e-08,
50264
+ "loss": 0.0396,
50265
+ "step": 6612
50266
+ },
50267
+ {
50268
+ "epoch": 0.9757285134636665,
50269
+ "grad_norm": 1.9192994832992554,
50270
+ "learning_rate": 3.5660045598597814e-08,
50271
+ "loss": 0.0598,
50272
+ "step": 6613
50273
+ },
50274
+ {
50275
+ "epoch": 0.9758760604942825,
50276
+ "grad_norm": 1.6062878370285034,
50277
+ "learning_rate": 3.522674756798794e-08,
50278
+ "loss": 0.0665,
50279
+ "step": 6614
50280
+ },
50281
+ {
50282
+ "epoch": 0.9760236075248986,
50283
+ "grad_norm": 1.8371132612228394,
50284
+ "learning_rate": 3.479609346699553e-08,
50285
+ "loss": 0.0509,
50286
+ "step": 6615
50287
+ },
50288
+ {
50289
+ "epoch": 0.9761711545555146,
50290
+ "grad_norm": 2.38232159614563,
50291
+ "learning_rate": 3.4368083409885844e-08,
50292
+ "loss": 0.0964,
50293
+ "step": 6616
50294
+ },
50295
+ {
50296
+ "epoch": 0.9763187015861305,
50297
+ "grad_norm": 2.736990451812744,
50298
+ "learning_rate": 3.394271751021916e-08,
50299
+ "loss": 0.141,
50300
+ "step": 6617
50301
+ },
50302
+ {
50303
+ "epoch": 0.9764662486167466,
50304
+ "grad_norm": 6.989748954772949,
50305
+ "learning_rate": 3.351999588085963e-08,
50306
+ "loss": 0.0612,
50307
+ "step": 6618
50308
+ },
50309
+ {
50310
+ "epoch": 0.9766137956473626,
50311
+ "grad_norm": 1.7669501304626465,
50312
+ "learning_rate": 3.309991863396644e-08,
50313
+ "loss": 0.0488,
50314
+ "step": 6619
50315
+ },
50316
+ {
50317
+ "epoch": 0.9767613426779787,
50318
+ "grad_norm": 1.4926352500915527,
50319
+ "learning_rate": 3.2682485880997096e-08,
50320
+ "loss": 0.0275,
50321
+ "step": 6620
50322
+ },
50323
+ {
50324
+ "epoch": 0.9767613426779787,
50325
+ "eval_accuracy": 0.9797395079594791,
50326
+ "eval_f1": 0.9653465346534653,
50327
+ "eval_loss": 0.055932920426130295,
50328
+ "eval_precision": 0.9848484848484849,
50329
+ "eval_recall": 0.9466019417475728,
50330
+ "eval_runtime": 49.2316,
50331
+ "eval_samples_per_second": 5.911,
50332
+ "eval_steps_per_second": 0.203,
50333
+ "step": 6620
50334
+ },
50335
+ {
50336
+ "epoch": 0.9769088897085946,
50337
+ "grad_norm": 3.745084047317505,
50338
+ "learning_rate": 3.226769773270855e-08,
50339
+ "loss": 0.0572,
50340
+ "step": 6621
50341
+ },
50342
+ {
50343
+ "epoch": 0.9770564367392106,
50344
+ "grad_norm": 1.6596072912216187,
50345
+ "learning_rate": 3.1855554299156096e-08,
50346
+ "loss": 0.0188,
50347
+ "step": 6622
50348
+ },
50349
+ {
50350
+ "epoch": 0.9772039837698266,
50351
+ "grad_norm": 2.8496854305267334,
50352
+ "learning_rate": 3.1446055689690056e-08,
50353
+ "loss": 0.0525,
50354
+ "step": 6623
50355
+ },
50356
+ {
50357
+ "epoch": 0.9773515308004427,
50358
+ "grad_norm": 2.801842451095581,
50359
+ "learning_rate": 3.103920201296462e-08,
50360
+ "loss": 0.0328,
50361
+ "step": 6624
50362
+ },
50363
+ {
50364
+ "epoch": 0.9774990778310586,
50365
+ "grad_norm": 2.546543598175049,
50366
+ "learning_rate": 3.063499337692788e-08,
50367
+ "loss": 0.0721,
50368
+ "step": 6625
50369
+ },
50370
+ {
50371
+ "epoch": 0.9776466248616746,
50372
+ "grad_norm": 2.072725534439087,
50373
+ "learning_rate": 3.023342988882849e-08,
50374
+ "loss": 0.037,
50375
+ "step": 6626
50376
+ },
50377
+ {
50378
+ "epoch": 0.9777941718922907,
50379
+ "grad_norm": 2.3779847621917725,
50380
+ "learning_rate": 2.983451165521123e-08,
50381
+ "loss": 0.0952,
50382
+ "step": 6627
50383
+ },
50384
+ {
50385
+ "epoch": 0.9779417189229067,
50386
+ "grad_norm": 2.976325035095215,
50387
+ "learning_rate": 2.9438238781921424e-08,
50388
+ "loss": 0.071,
50389
+ "step": 6628
50390
+ },
50391
+ {
50392
+ "epoch": 0.9780892659535226,
50393
+ "grad_norm": 3.863071918487549,
50394
+ "learning_rate": 2.9044611374099418e-08,
50395
+ "loss": 0.1468,
50396
+ "step": 6629
50397
+ },
50398
+ {
50399
+ "epoch": 0.9782368129841387,
50400
+ "grad_norm": 4.173577785491943,
50401
+ "learning_rate": 2.8653629536187222e-08,
50402
+ "loss": 0.0564,
50403
+ "step": 6630
50404
+ },
50405
+ {
50406
+ "epoch": 0.9783843600147547,
50407
+ "grad_norm": 3.290264844894409,
50408
+ "learning_rate": 2.8265293371922965e-08,
50409
+ "loss": 0.0889,
50410
+ "step": 6631
50411
+ },
50412
+ {
50413
+ "epoch": 0.9785319070453707,
50414
+ "grad_norm": 1.5761719942092896,
50415
+ "learning_rate": 2.7879602984342002e-08,
50416
+ "loss": 0.0203,
50417
+ "step": 6632
50418
+ },
50419
+ {
50420
+ "epoch": 0.9786794540759867,
50421
+ "grad_norm": 3.7511749267578125,
50422
+ "learning_rate": 2.7496558475778035e-08,
50423
+ "loss": 0.0736,
50424
+ "step": 6633
50425
+ },
50426
+ {
50427
+ "epoch": 0.9788270011066027,
50428
+ "grad_norm": 4.070005893707275,
50429
+ "learning_rate": 2.7116159947865318e-08,
50430
+ "loss": 0.0997,
50431
+ "step": 6634
50432
+ },
50433
+ {
50434
+ "epoch": 0.9789745481372187,
50435
+ "grad_norm": 2.2428393363952637,
50436
+ "learning_rate": 2.6738407501533113e-08,
50437
+ "loss": 0.064,
50438
+ "step": 6635
50439
+ },
50440
+ {
50441
+ "epoch": 0.9791220951678348,
50442
+ "grad_norm": 1.4023271799087524,
50443
+ "learning_rate": 2.636330123701014e-08,
50444
+ "loss": 0.0415,
50445
+ "step": 6636
50446
+ },
50447
+ {
50448
+ "epoch": 0.9792696421984508,
50449
+ "grad_norm": 1.616129755973816,
50450
+ "learning_rate": 2.599084125382123e-08,
50451
+ "loss": 0.0531,
50452
+ "step": 6637
50453
+ },
50454
+ {
50455
+ "epoch": 0.9794171892290667,
50456
+ "grad_norm": 4.515521049499512,
50457
+ "learning_rate": 2.5621027650790664e-08,
50458
+ "loss": 0.2382,
50459
+ "step": 6638
50460
+ },
50461
+ {
50462
+ "epoch": 0.9795647362596828,
50463
+ "grad_norm": 2.131122589111328,
50464
+ "learning_rate": 2.5253860526042173e-08,
50465
+ "loss": 0.0389,
50466
+ "step": 6639
50467
+ },
50468
+ {
50469
+ "epoch": 0.9797122832902988,
50470
+ "grad_norm": 1.6050862073898315,
50471
+ "learning_rate": 2.4889339976992277e-08,
50472
+ "loss": 0.0358,
50473
+ "step": 6640
50474
+ },
50475
+ {
50476
+ "epoch": 0.9797122832902988,
50477
+ "eval_accuracy": 0.9782923299565847,
50478
+ "eval_f1": 0.9629629629629629,
50479
+ "eval_loss": 0.05516430363059044,
50480
+ "eval_precision": 0.9798994974874372,
50481
+ "eval_recall": 0.9466019417475728,
50482
+ "eval_runtime": 49.5399,
50483
+ "eval_samples_per_second": 5.874,
50484
+ "eval_steps_per_second": 0.202,
50485
+ "step": 6640
50486
+ },
50487
+ {
50488
+ "epoch": 0.9798598303209148,
50489
+ "grad_norm": 1.8880443572998047,
50490
+ "learning_rate": 2.4527466100360277e-08,
50491
+ "loss": 0.0747,
50492
+ "step": 6641
50493
+ },
50494
+ {
50495
+ "epoch": 0.9800073773515308,
50496
+ "grad_norm": 5.500354290008545,
50497
+ "learning_rate": 2.4168238992160477e-08,
50498
+ "loss": 0.0474,
50499
+ "step": 6642
50500
+ },
50501
+ {
50502
+ "epoch": 0.9801549243821468,
50503
+ "grad_norm": 2.404766321182251,
50504
+ "learning_rate": 2.3811658747705525e-08,
50505
+ "loss": 0.0494,
50506
+ "step": 6643
50507
+ },
50508
+ {
50509
+ "epoch": 0.9803024714127628,
50510
+ "grad_norm": 2.824960947036743,
50511
+ "learning_rate": 2.3457725461607518e-08,
50512
+ "loss": 0.074,
50513
+ "step": 6644
50514
+ },
50515
+ {
50516
+ "epoch": 0.9804500184433789,
50517
+ "grad_norm": 1.472124457359314,
50518
+ "learning_rate": 2.3106439227773558e-08,
50519
+ "loss": 0.0277,
50520
+ "step": 6645
50521
+ },
50522
+ {
50523
+ "epoch": 0.9805975654739948,
50524
+ "grad_norm": 0.9315122365951538,
50525
+ "learning_rate": 2.27578001394102e-08,
50526
+ "loss": 0.0097,
50527
+ "step": 6646
50528
+ },
50529
+ {
50530
+ "epoch": 0.9807451125046108,
50531
+ "grad_norm": 2.713543176651001,
50532
+ "learning_rate": 2.241180828902012e-08,
50533
+ "loss": 0.0622,
50534
+ "step": 6647
50535
+ },
50536
+ {
50537
+ "epoch": 0.9808926595352269,
50538
+ "grad_norm": 5.194150447845459,
50539
+ "learning_rate": 2.2068463768405435e-08,
50540
+ "loss": 0.0851,
50541
+ "step": 6648
50542
+ },
50543
+ {
50544
+ "epoch": 0.9810402065658429,
50545
+ "grad_norm": 5.96819543838501,
50546
+ "learning_rate": 2.1727766668664385e-08,
50547
+ "loss": 0.0849,
50548
+ "step": 6649
50549
+ },
50550
+ {
50551
+ "epoch": 0.9811877535964588,
50552
+ "grad_norm": 1.1071208715438843,
50553
+ "learning_rate": 2.138971708019355e-08,
50554
+ "loss": 0.0268,
50555
+ "step": 6650
50556
+ },
50557
+ {
50558
+ "epoch": 0.9813353006270749,
50559
+ "grad_norm": 2.806211471557617,
50560
+ "learning_rate": 2.105431509268563e-08,
50561
+ "loss": 0.0916,
50562
+ "step": 6651
50563
+ },
50564
+ {
50565
+ "epoch": 0.9814828476576909,
50566
+ "grad_norm": 3.1690165996551514,
50567
+ "learning_rate": 2.0721560795133876e-08,
50568
+ "loss": 0.0993,
50569
+ "step": 6652
50570
+ },
50571
+ {
50572
+ "epoch": 0.9816303946883069,
50573
+ "grad_norm": 2.2227795124053955,
50574
+ "learning_rate": 2.0391454275827673e-08,
50575
+ "loss": 0.0388,
50576
+ "step": 6653
50577
+ },
50578
+ {
50579
+ "epoch": 0.9817779417189229,
50580
+ "grad_norm": 0.5616309642791748,
50581
+ "learning_rate": 2.0063995622350287e-08,
50582
+ "loss": 0.0045,
50583
+ "step": 6654
50584
+ },
50585
+ {
50586
+ "epoch": 0.9819254887495389,
50587
+ "grad_norm": 2.450514316558838,
50588
+ "learning_rate": 1.9739184921588885e-08,
50589
+ "loss": 0.0688,
50590
+ "step": 6655
50591
+ },
50592
+ {
50593
+ "epoch": 0.9820730357801549,
50594
+ "grad_norm": 2.0356853008270264,
50595
+ "learning_rate": 1.9417022259723418e-08,
50596
+ "loss": 0.0511,
50597
+ "step": 6656
50598
+ },
50599
+ {
50600
+ "epoch": 0.982220582810771,
50601
+ "grad_norm": 2.293266773223877,
50602
+ "learning_rate": 1.9097507722231068e-08,
50603
+ "loss": 0.0289,
50604
+ "step": 6657
50605
+ },
50606
+ {
50607
+ "epoch": 0.982368129841387,
50608
+ "grad_norm": 2.306947708129883,
50609
+ "learning_rate": 1.8780641393890685e-08,
50610
+ "loss": 0.0559,
50611
+ "step": 6658
50612
+ },
50613
+ {
50614
+ "epoch": 0.9825156768720029,
50615
+ "grad_norm": 0.8441616296768188,
50616
+ "learning_rate": 1.84664233587728e-08,
50617
+ "loss": 0.0107,
50618
+ "step": 6659
50619
+ },
50620
+ {
50621
+ "epoch": 0.982663223902619,
50622
+ "grad_norm": 1.6219745874404907,
50623
+ "learning_rate": 1.815485370025072e-08,
50624
+ "loss": 0.0266,
50625
+ "step": 6660
50626
+ },
50627
+ {
50628
+ "epoch": 0.982663223902619,
50629
+ "eval_accuracy": 0.9782923299565847,
50630
+ "eval_f1": 0.9629629629629629,
50631
+ "eval_loss": 0.05498597025871277,
50632
+ "eval_precision": 0.9798994974874372,
50633
+ "eval_recall": 0.9466019417475728,
50634
+ "eval_runtime": 49.5621,
50635
+ "eval_samples_per_second": 5.871,
50636
+ "eval_steps_per_second": 0.202,
50637
+ "step": 6660
50638
+ },
50639
+ {
50640
+ "epoch": 0.982810770933235,
50641
+ "grad_norm": 1.976530909538269,
50642
+ "learning_rate": 1.784593250099054e-08,
50643
+ "loss": 0.0253,
50644
+ "step": 6661
50645
+ },
50646
+ {
50647
+ "epoch": 0.982958317963851,
50648
+ "grad_norm": 2.217996120452881,
50649
+ "learning_rate": 1.7539659842957803e-08,
50650
+ "loss": 0.0555,
50651
+ "step": 6662
50652
+ },
50653
+ {
50654
+ "epoch": 0.983105864994467,
50655
+ "grad_norm": 2.010887861251831,
50656
+ "learning_rate": 1.7236035807416397e-08,
50657
+ "loss": 0.0421,
50658
+ "step": 6663
50659
+ },
50660
+ {
50661
+ "epoch": 0.983253412025083,
50662
+ "grad_norm": 0.6405054926872253,
50663
+ "learning_rate": 1.6935060474926323e-08,
50664
+ "loss": 0.0071,
50665
+ "step": 6664
50666
+ },
50667
+ {
50668
+ "epoch": 0.983400959055699,
50669
+ "grad_norm": 2.444506883621216,
50670
+ "learning_rate": 1.6636733925342595e-08,
50671
+ "loss": 0.033,
50672
+ "step": 6665
50673
+ },
50674
+ {
50675
+ "epoch": 0.983548506086315,
50676
+ "grad_norm": 1.0735312700271606,
50677
+ "learning_rate": 1.6341056237820784e-08,
50678
+ "loss": 0.0151,
50679
+ "step": 6666
50680
+ },
50681
+ {
50682
+ "epoch": 0.983696053116931,
50683
+ "grad_norm": 2.435049533843994,
50684
+ "learning_rate": 1.6048027490812577e-08,
50685
+ "loss": 0.0543,
50686
+ "step": 6667
50687
+ },
50688
+ {
50689
+ "epoch": 0.983843600147547,
50690
+ "grad_norm": 2.4513931274414062,
50691
+ "learning_rate": 1.5757647762065786e-08,
50692
+ "loss": 0.0621,
50693
+ "step": 6668
50694
+ },
50695
+ {
50696
+ "epoch": 0.983991147178163,
50697
+ "grad_norm": 1.8004716634750366,
50698
+ "learning_rate": 1.5469917128626554e-08,
50699
+ "loss": 0.025,
50700
+ "step": 6669
50701
+ },
50702
+ {
50703
+ "epoch": 0.9841386942087791,
50704
+ "grad_norm": 1.50918710231781,
50705
+ "learning_rate": 1.518483566683826e-08,
50706
+ "loss": 0.0401,
50707
+ "step": 6670
50708
+ },
50709
+ {
50710
+ "epoch": 0.984286241239395,
50711
+ "grad_norm": 2.1539971828460693,
50712
+ "learning_rate": 1.4902403452339287e-08,
50713
+ "loss": 0.0664,
50714
+ "step": 6671
50715
+ },
50716
+ {
50717
+ "epoch": 0.984433788270011,
50718
+ "grad_norm": 7.559150218963623,
50719
+ "learning_rate": 1.4622620560069688e-08,
50720
+ "loss": 0.0937,
50721
+ "step": 6672
50722
+ },
50723
+ {
50724
+ "epoch": 0.9845813353006271,
50725
+ "grad_norm": 1.530104637145996,
50726
+ "learning_rate": 1.4345487064260089e-08,
50727
+ "loss": 0.0648,
50728
+ "step": 6673
50729
+ },
50730
+ {
50731
+ "epoch": 0.9847288823312431,
50732
+ "grad_norm": 1.3213176727294922,
50733
+ "learning_rate": 1.4071003038443887e-08,
50734
+ "loss": 0.0344,
50735
+ "step": 6674
50736
+ },
50737
+ {
50738
+ "epoch": 0.984876429361859,
50739
+ "grad_norm": 1.8271011114120483,
50740
+ "learning_rate": 1.3799168555449494e-08,
50741
+ "loss": 0.0243,
50742
+ "step": 6675
50743
+ },
50744
+ {
50745
+ "epoch": 0.9850239763924751,
50746
+ "grad_norm": 1.226176142692566,
50747
+ "learning_rate": 1.3529983687400328e-08,
50748
+ "loss": 0.0178,
50749
+ "step": 6676
50750
+ },
50751
+ {
50752
+ "epoch": 0.9851715234230911,
50753
+ "grad_norm": 0.6308827996253967,
50754
+ "learning_rate": 1.3263448505720366e-08,
50755
+ "loss": 0.007,
50756
+ "step": 6677
50757
+ },
50758
+ {
50759
+ "epoch": 0.9853190704537071,
50760
+ "grad_norm": 2.996870517730713,
50761
+ "learning_rate": 1.2999563081127486e-08,
50762
+ "loss": 0.0786,
50763
+ "step": 6678
50764
+ },
50765
+ {
50766
+ "epoch": 0.9854666174843232,
50767
+ "grad_norm": 2.7150681018829346,
50768
+ "learning_rate": 1.2738327483639013e-08,
50769
+ "loss": 0.0394,
50770
+ "step": 6679
50771
+ },
50772
+ {
50773
+ "epoch": 0.9856141645149391,
50774
+ "grad_norm": 2.043134927749634,
50775
+ "learning_rate": 1.2479741782566168e-08,
50776
+ "loss": 0.0759,
50777
+ "step": 6680
50778
+ },
50779
+ {
50780
+ "epoch": 0.9856141645149391,
50781
+ "eval_accuracy": 0.9782923299565847,
50782
+ "eval_f1": 0.9629629629629629,
50783
+ "eval_loss": 0.05593600869178772,
50784
+ "eval_precision": 0.9798994974874372,
50785
+ "eval_recall": 0.9466019417475728,
50786
+ "eval_runtime": 49.7146,
50787
+ "eval_samples_per_second": 5.853,
50788
+ "eval_steps_per_second": 0.201,
50789
+ "step": 6680
50790
+ },
50791
+ {
50792
+ "epoch": 0.9857617115455551,
50793
+ "grad_norm": 1.8694920539855957,
50794
+ "learning_rate": 1.2223806046520737e-08,
50795
+ "loss": 0.0362,
50796
+ "step": 6681
50797
+ },
50798
+ {
50799
+ "epoch": 0.9859092585761712,
50800
+ "grad_norm": 2.6727139949798584,
50801
+ "learning_rate": 1.1970520343408398e-08,
50802
+ "loss": 0.073,
50803
+ "step": 6682
50804
+ },
50805
+ {
50806
+ "epoch": 0.9860568056067872,
50807
+ "grad_norm": 0.7778927683830261,
50808
+ "learning_rate": 1.1719884740433174e-08,
50809
+ "loss": 0.0056,
50810
+ "step": 6683
50811
+ },
50812
+ {
50813
+ "epoch": 0.9862043526374031,
50814
+ "grad_norm": 2.3464653491973877,
50815
+ "learning_rate": 1.1471899304095202e-08,
50816
+ "loss": 0.0314,
50817
+ "step": 6684
50818
+ },
50819
+ {
50820
+ "epoch": 0.9863518996680192,
50821
+ "grad_norm": 0.8709948658943176,
50822
+ "learning_rate": 1.122656410019296e-08,
50823
+ "loss": 0.0199,
50824
+ "step": 6685
50825
+ },
50826
+ {
50827
+ "epoch": 0.9864994466986352,
50828
+ "grad_norm": 6.606779098510742,
50829
+ "learning_rate": 1.0983879193819936e-08,
50830
+ "loss": 0.108,
50831
+ "step": 6686
50832
+ },
50833
+ {
50834
+ "epoch": 0.9866469937292512,
50835
+ "grad_norm": 4.287250995635986,
50836
+ "learning_rate": 1.074384464936684e-08,
50837
+ "loss": 0.0716,
50838
+ "step": 6687
50839
+ },
50840
+ {
50841
+ "epoch": 0.9867945407598672,
50842
+ "grad_norm": 0.7073714733123779,
50843
+ "learning_rate": 1.0506460530521622e-08,
50844
+ "loss": 0.0188,
50845
+ "step": 6688
50846
+ },
50847
+ {
50848
+ "epoch": 0.9869420877904832,
50849
+ "grad_norm": 4.2220563888549805,
50850
+ "learning_rate": 1.0271726900269452e-08,
50851
+ "loss": 0.0769,
50852
+ "step": 6689
50853
+ },
50854
+ {
50855
+ "epoch": 0.9870896348210992,
50856
+ "grad_norm": 1.6127564907073975,
50857
+ "learning_rate": 1.003964382089162e-08,
50858
+ "loss": 0.0457,
50859
+ "step": 6690
50860
+ },
50861
+ {
50862
+ "epoch": 0.9872371818517153,
50863
+ "grad_norm": 2.2320802211761475,
50864
+ "learning_rate": 9.810211353965537e-09,
50865
+ "loss": 0.047,
50866
+ "step": 6691
50867
+ },
50868
+ {
50869
+ "epoch": 0.9873847288823312,
50870
+ "grad_norm": 3.913719654083252,
50871
+ "learning_rate": 9.583429560365843e-09,
50872
+ "loss": 0.0715,
50873
+ "step": 6692
50874
+ },
50875
+ {
50876
+ "epoch": 0.9875322759129472,
50877
+ "grad_norm": 2.9218332767486572,
50878
+ "learning_rate": 9.359298500264402e-09,
50879
+ "loss": 0.0513,
50880
+ "step": 6693
50881
+ },
50882
+ {
50883
+ "epoch": 0.9876798229435633,
50884
+ "grad_norm": 1.7875134944915771,
50885
+ "learning_rate": 9.137818233129203e-09,
50886
+ "loss": 0.0406,
50887
+ "step": 6694
50888
+ },
50889
+ {
50890
+ "epoch": 0.9878273699741793,
50891
+ "grad_norm": 2.7455263137817383,
50892
+ "learning_rate": 8.91898881772657e-09,
50893
+ "loss": 0.0704,
50894
+ "step": 6695
50895
+ },
50896
+ {
50897
+ "epoch": 0.9879749170047952,
50898
+ "grad_norm": 0.6625596880912781,
50899
+ "learning_rate": 8.702810312115618e-09,
50900
+ "loss": 0.006,
50901
+ "step": 6696
50902
+ },
50903
+ {
50904
+ "epoch": 0.9881224640354113,
50905
+ "grad_norm": 1.6851662397384644,
50906
+ "learning_rate": 8.489282773656016e-09,
50907
+ "loss": 0.0527,
50908
+ "step": 6697
50909
+ },
50910
+ {
50911
+ "epoch": 0.9882700110660273,
50912
+ "grad_norm": 2.4347875118255615,
50913
+ "learning_rate": 8.278406259001337e-09,
50914
+ "loss": 0.0673,
50915
+ "step": 6698
50916
+ },
50917
+ {
50918
+ "epoch": 0.9884175580966433,
50919
+ "grad_norm": 5.950766563415527,
50920
+ "learning_rate": 8.07018082410349e-09,
50921
+ "loss": 0.091,
50922
+ "step": 6699
50923
+ },
50924
+ {
50925
+ "epoch": 0.9885651051272594,
50926
+ "grad_norm": 1.7334251403808594,
50927
+ "learning_rate": 7.864606524211616e-09,
50928
+ "loss": 0.0396,
50929
+ "step": 6700
50930
+ },
50931
+ {
50932
+ "epoch": 0.9885651051272594,
50933
+ "eval_accuracy": 0.9797395079594791,
50934
+ "eval_f1": 0.9653465346534653,
50935
+ "eval_loss": 0.05492059141397476,
50936
+ "eval_precision": 0.9848484848484849,
50937
+ "eval_recall": 0.9466019417475728,
50938
+ "eval_runtime": 49.3051,
50939
+ "eval_samples_per_second": 5.902,
50940
+ "eval_steps_per_second": 0.203,
50941
+ "step": 6700
50942
  }
50943
  ],
50944
  "logging_steps": 1,
 
50958
  "attributes": {}
50959
  }
50960
  },
50961
+ "total_flos": 2.0639708098351596e+18,
50962
  "train_batch_size": 8,
50963
  "trial_name": null,
50964
  "trial_params": null