mtzig commited on
Commit
dd76f93
1 Parent(s): b7d353c

Training in progress, step 776, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e243eb5a2e392baf6fbbd34e2880b92b192e8406253f7dbfdcc5bf34a27d48d
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f43f1400398e4c8505720effcc16af684c412b5638dba7f3a1aebf4a06cc201
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35be897abb2edd59e7e14cd584897fafab6ebcbe9d7b044ec3c0f24465e96641
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:020d542507719a4eb2ab83beed83011776a0481d28e239449ece4a1132799518
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:756519000cbd96a62a18f4b2593ac1531235ff43f81a97c750d5d508e0bb4895
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f40acb7ba2ebf0ad18f4cab0b9f0d53f945042effd89e47fb9a8b7ccab730bd
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9503408f5508841046560c3ab2f47fba69b29194960b3e332e210fb40344d3dc
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e04417cf7b8930a23dde2fa70a0f93b67ae1d9250b82485eb0de6d821acbc32d
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:526dcd0450898f0723267644199cfb9a8075bd3c8768053837a8471c3071e1f8
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f3439420f05cede144ac4d240b8b0b215387dbe8143269602efa71be4361358
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b8a091054d4117d69c92fe61df180bdff12c693387e2bf9406cb7a0c62fb550
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:944dc60c3baa584c4911758be20c6546b7d8e81f6a0cbb615f3b61e9148cf32b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97f4e093aada0ef569e1477e89853408435b9af062279297db8b59881fcbb194
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef3c9204dba34dd140c5701148d798c401bac1df0b7b31998ff88e7e10ae2b1
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69c3c82ad5b7f6e2fcf2155f310ae31260fa6abcafc63890af31474a7296f19d
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37e7acdf2f27b8bdd158b1883d758f2c93dd6ce84922dd99b3cdac6a5117b556
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6a53af4113d719a74be72b9b96e06ddfcf589f1c186482a58b06d91bc3e4d34
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecbf30739d47702ff1dc62fda61a8b804166ddf0362f853a15764a0f6ca810ca
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3f79573df50dcf97b1bf70e5483cf9597da1b8a05214eb274fc3c3833cf3ad5
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62ea2caeae1f19550ff4c866c756827368743f8de236623a1b0db1ace7686574
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f80c1cfb5ea86276cd48dd4a859d36d824a2fd5563d437a8a0c4be3eb6200b32
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c459f1b116355fb0f3bdc3f65b9adc53657df08c616f375c6b5ee9a3f2a5eb
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:714110b482b507e1fb6219bf62e8ec1001b632b03d77db23a105560f7b48af04
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e129f010772342f4e564c483d7381dc6b23561fca445914ea8c159bc5d7a2ed
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b38c3c47d3e1cfb698a7e53778b8d216b765546387a9e6bd3f149bea8249062
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836d724299dd150551400174e98221fb5b90ef0e07f83661da8beb5333f31043
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9020618556701031,
5
  "eval_steps": 20,
6
- "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5339,6 +5339,574 @@
5339
  "eval_samples_per_second": 5.311,
5340
  "eval_steps_per_second": 0.175,
5341
  "step": 700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5342
  }
5343
  ],
5344
  "logging_steps": 1,
@@ -5353,12 +5921,12 @@
5353
  "should_evaluate": false,
5354
  "should_log": false,
5355
  "should_save": true,
5356
- "should_training_stop": false
5357
  },
5358
  "attributes": {}
5359
  }
5360
  },
5361
- "total_flos": 2.3348483995572634e+17,
5362
  "train_batch_size": 8,
5363
  "trial_name": null,
5364
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 20,
6
+ "global_step": 776,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5339
  "eval_samples_per_second": 5.311,
5340
  "eval_steps_per_second": 0.175,
5341
  "step": 700
5342
+ },
5343
+ {
5344
+ "epoch": 0.9033505154639175,
5345
+ "grad_norm": 3.9940011501312256,
5346
+ "learning_rate": 5.643565211494285e-07,
5347
+ "loss": 0.0374,
5348
+ "step": 701
5349
+ },
5350
+ {
5351
+ "epoch": 0.904639175257732,
5352
+ "grad_norm": 0.029992813244462013,
5353
+ "learning_rate": 5.495458148283505e-07,
5354
+ "loss": 0.0003,
5355
+ "step": 702
5356
+ },
5357
+ {
5358
+ "epoch": 0.9059278350515464,
5359
+ "grad_norm": 0.03322442248463631,
5360
+ "learning_rate": 5.349265521053603e-07,
5361
+ "loss": 0.0002,
5362
+ "step": 703
5363
+ },
5364
+ {
5365
+ "epoch": 0.9072164948453608,
5366
+ "grad_norm": 1.478757381439209,
5367
+ "learning_rate": 5.204990291317535e-07,
5368
+ "loss": 0.0026,
5369
+ "step": 704
5370
+ },
5371
+ {
5372
+ "epoch": 0.9085051546391752,
5373
+ "grad_norm": 3.334146499633789,
5374
+ "learning_rate": 5.062635381746362e-07,
5375
+ "loss": 0.0251,
5376
+ "step": 705
5377
+ },
5378
+ {
5379
+ "epoch": 0.9097938144329897,
5380
+ "grad_norm": 0.7413277626037598,
5381
+ "learning_rate": 4.92220367611006e-07,
5382
+ "loss": 0.0037,
5383
+ "step": 706
5384
+ },
5385
+ {
5386
+ "epoch": 0.9110824742268041,
5387
+ "grad_norm": 0.4254130423069,
5388
+ "learning_rate": 4.783698019219118e-07,
5389
+ "loss": 0.0014,
5390
+ "step": 707
5391
+ },
5392
+ {
5393
+ "epoch": 0.9123711340206185,
5394
+ "grad_norm": 2.775209426879883,
5395
+ "learning_rate": 4.647121216866857e-07,
5396
+ "loss": 0.026,
5397
+ "step": 708
5398
+ },
5399
+ {
5400
+ "epoch": 0.913659793814433,
5401
+ "grad_norm": 1.4021016359329224,
5402
+ "learning_rate": 4.512476035772628e-07,
5403
+ "loss": 0.0094,
5404
+ "step": 709
5405
+ },
5406
+ {
5407
+ "epoch": 0.9149484536082474,
5408
+ "grad_norm": 0.19824525713920593,
5409
+ "learning_rate": 4.3797652035257544e-07,
5410
+ "loss": 0.0016,
5411
+ "step": 710
5412
+ },
5413
+ {
5414
+ "epoch": 0.9162371134020618,
5415
+ "grad_norm": 1.9552204608917236,
5416
+ "learning_rate": 4.248991408530279e-07,
5417
+ "loss": 0.0136,
5418
+ "step": 711
5419
+ },
5420
+ {
5421
+ "epoch": 0.9175257731958762,
5422
+ "grad_norm": 0.1556915044784546,
5423
+ "learning_rate": 4.1201572999505e-07,
5424
+ "loss": 0.0004,
5425
+ "step": 712
5426
+ },
5427
+ {
5428
+ "epoch": 0.9188144329896907,
5429
+ "grad_norm": 4.923846244812012,
5430
+ "learning_rate": 3.9932654876573164e-07,
5431
+ "loss": 0.0145,
5432
+ "step": 713
5433
+ },
5434
+ {
5435
+ "epoch": 0.9201030927835051,
5436
+ "grad_norm": 0.12035495787858963,
5437
+ "learning_rate": 3.8683185421753313e-07,
5438
+ "loss": 0.0005,
5439
+ "step": 714
5440
+ },
5441
+ {
5442
+ "epoch": 0.9213917525773195,
5443
+ "grad_norm": 1.2554630041122437,
5444
+ "learning_rate": 3.74531899463082e-07,
5445
+ "loss": 0.0086,
5446
+ "step": 715
5447
+ },
5448
+ {
5449
+ "epoch": 0.9226804123711341,
5450
+ "grad_norm": 0.042812976986169815,
5451
+ "learning_rate": 3.6242693367004365e-07,
5452
+ "loss": 0.0002,
5453
+ "step": 716
5454
+ },
5455
+ {
5456
+ "epoch": 0.9239690721649485,
5457
+ "grad_norm": 0.1420414000749588,
5458
+ "learning_rate": 3.5051720205606877e-07,
5459
+ "loss": 0.0009,
5460
+ "step": 717
5461
+ },
5462
+ {
5463
+ "epoch": 0.9252577319587629,
5464
+ "grad_norm": 0.12338154017925262,
5465
+ "learning_rate": 3.38802945883836e-07,
5466
+ "loss": 0.001,
5467
+ "step": 718
5468
+ },
5469
+ {
5470
+ "epoch": 0.9265463917525774,
5471
+ "grad_norm": 1.5074167251586914,
5472
+ "learning_rate": 3.2728440245615724e-07,
5473
+ "loss": 0.004,
5474
+ "step": 719
5475
+ },
5476
+ {
5477
+ "epoch": 0.9278350515463918,
5478
+ "grad_norm": 4.496931076049805,
5479
+ "learning_rate": 3.1596180511117235e-07,
5480
+ "loss": 0.0144,
5481
+ "step": 720
5482
+ },
5483
+ {
5484
+ "epoch": 0.9278350515463918,
5485
+ "eval_accuracy": 0.9975173783515392,
5486
+ "eval_f1": 0.9557522123893806,
5487
+ "eval_loss": 0.013383620418608189,
5488
+ "eval_precision": 0.9642857142857143,
5489
+ "eval_recall": 0.9473684210526315,
5490
+ "eval_runtime": 85.8622,
5491
+ "eval_samples_per_second": 5.299,
5492
+ "eval_steps_per_second": 0.175,
5493
+ "step": 720
5494
+ },
5495
+ {
5496
+ "epoch": 0.9291237113402062,
5497
+ "grad_norm": 3.5359578132629395,
5498
+ "learning_rate": 3.048353832176221e-07,
5499
+ "loss": 0.0237,
5500
+ "step": 721
5501
+ },
5502
+ {
5503
+ "epoch": 0.9304123711340206,
5504
+ "grad_norm": 0.27863824367523193,
5505
+ "learning_rate": 2.939053621702015e-07,
5506
+ "loss": 0.0012,
5507
+ "step": 722
5508
+ },
5509
+ {
5510
+ "epoch": 0.9317010309278351,
5511
+ "grad_norm": 2.338470935821533,
5512
+ "learning_rate": 2.83171963384995e-07,
5513
+ "loss": 0.0099,
5514
+ "step": 723
5515
+ },
5516
+ {
5517
+ "epoch": 0.9329896907216495,
5518
+ "grad_norm": 4.0552592277526855,
5519
+ "learning_rate": 2.7263540429498747e-07,
5520
+ "loss": 0.0294,
5521
+ "step": 724
5522
+ },
5523
+ {
5524
+ "epoch": 0.9342783505154639,
5525
+ "grad_norm": 1.4125486612319946,
5526
+ "learning_rate": 2.6229589834566807e-07,
5527
+ "loss": 0.0324,
5528
+ "step": 725
5529
+ },
5530
+ {
5531
+ "epoch": 0.9355670103092784,
5532
+ "grad_norm": 3.3217031955718994,
5533
+ "learning_rate": 2.5215365499069446e-07,
5534
+ "loss": 0.0062,
5535
+ "step": 726
5536
+ },
5537
+ {
5538
+ "epoch": 0.9368556701030928,
5539
+ "grad_norm": 0.8493993878364563,
5540
+ "learning_rate": 2.4220887968765873e-07,
5541
+ "loss": 0.002,
5542
+ "step": 727
5543
+ },
5544
+ {
5545
+ "epoch": 0.9381443298969072,
5546
+ "grad_norm": 3.687810182571411,
5547
+ "learning_rate": 2.3246177389392388e-07,
5548
+ "loss": 0.037,
5549
+ "step": 728
5550
+ },
5551
+ {
5552
+ "epoch": 0.9394329896907216,
5553
+ "grad_norm": 0.4572630822658539,
5554
+ "learning_rate": 2.229125350625394e-07,
5555
+ "loss": 0.0012,
5556
+ "step": 729
5557
+ },
5558
+ {
5559
+ "epoch": 0.9407216494845361,
5560
+ "grad_norm": 0.3448236882686615,
5561
+ "learning_rate": 2.1356135663824328e-07,
5562
+ "loss": 0.0017,
5563
+ "step": 730
5564
+ },
5565
+ {
5566
+ "epoch": 0.9420103092783505,
5567
+ "grad_norm": 1.1979801654815674,
5568
+ "learning_rate": 2.0440842805354522e-07,
5569
+ "loss": 0.0174,
5570
+ "step": 731
5571
+ },
5572
+ {
5573
+ "epoch": 0.9432989690721649,
5574
+ "grad_norm": 0.3525691330432892,
5575
+ "learning_rate": 1.9545393472488738e-07,
5576
+ "loss": 0.0019,
5577
+ "step": 732
5578
+ },
5579
+ {
5580
+ "epoch": 0.9445876288659794,
5581
+ "grad_norm": 1.4202477931976318,
5582
+ "learning_rate": 1.866980580488842e-07,
5583
+ "loss": 0.0269,
5584
+ "step": 733
5585
+ },
5586
+ {
5587
+ "epoch": 0.9458762886597938,
5588
+ "grad_norm": 1.2961419820785522,
5589
+ "learning_rate": 1.7814097539865626e-07,
5590
+ "loss": 0.0023,
5591
+ "step": 734
5592
+ },
5593
+ {
5594
+ "epoch": 0.9471649484536082,
5595
+ "grad_norm": 0.17165932059288025,
5596
+ "learning_rate": 1.6978286012023225e-07,
5597
+ "loss": 0.0009,
5598
+ "step": 735
5599
+ },
5600
+ {
5601
+ "epoch": 0.9484536082474226,
5602
+ "grad_norm": 0.12149068713188171,
5603
+ "learning_rate": 1.6162388152903498e-07,
5604
+ "loss": 0.0005,
5605
+ "step": 736
5606
+ },
5607
+ {
5608
+ "epoch": 0.9497422680412371,
5609
+ "grad_norm": 0.8597332835197449,
5610
+ "learning_rate": 1.5366420490645738e-07,
5611
+ "loss": 0.0028,
5612
+ "step": 737
5613
+ },
5614
+ {
5615
+ "epoch": 0.9510309278350515,
5616
+ "grad_norm": 0.13404878973960876,
5617
+ "learning_rate": 1.4590399149650769e-07,
5618
+ "loss": 0.0004,
5619
+ "step": 738
5620
+ },
5621
+ {
5622
+ "epoch": 0.9523195876288659,
5623
+ "grad_norm": 1.938827633857727,
5624
+ "learning_rate": 1.3834339850254952e-07,
5625
+ "loss": 0.0083,
5626
+ "step": 739
5627
+ },
5628
+ {
5629
+ "epoch": 0.9536082474226805,
5630
+ "grad_norm": 1.4186664819717407,
5631
+ "learning_rate": 1.309825790841146e-07,
5632
+ "loss": 0.0284,
5633
+ "step": 740
5634
+ },
5635
+ {
5636
+ "epoch": 0.9536082474226805,
5637
+ "eval_accuracy": 0.9980139026812314,
5638
+ "eval_f1": 0.9642857142857143,
5639
+ "eval_loss": 0.013383138924837112,
5640
+ "eval_precision": 0.9818181818181818,
5641
+ "eval_recall": 0.9473684210526315,
5642
+ "eval_runtime": 85.8161,
5643
+ "eval_samples_per_second": 5.302,
5644
+ "eval_steps_per_second": 0.175,
5645
+ "step": 740
5646
+ },
5647
+ {
5648
+ "epoch": 0.9548969072164949,
5649
+ "grad_norm": 1.9752976894378662,
5650
+ "learning_rate": 1.2382168235379742e-07,
5651
+ "loss": 0.0028,
5652
+ "step": 741
5653
+ },
5654
+ {
5655
+ "epoch": 0.9561855670103093,
5656
+ "grad_norm": 4.70041036605835,
5657
+ "learning_rate": 1.1686085337423991e-07,
5658
+ "loss": 0.0102,
5659
+ "step": 742
5660
+ },
5661
+ {
5662
+ "epoch": 0.9574742268041238,
5663
+ "grad_norm": 3.6995646953582764,
5664
+ "learning_rate": 1.1010023315518592e-07,
5665
+ "loss": 0.0218,
5666
+ "step": 743
5667
+ },
5668
+ {
5669
+ "epoch": 0.9587628865979382,
5670
+ "grad_norm": 2.3631069660186768,
5671
+ "learning_rate": 1.0353995865063138e-07,
5672
+ "loss": 0.0306,
5673
+ "step": 744
5674
+ },
5675
+ {
5676
+ "epoch": 0.9600515463917526,
5677
+ "grad_norm": 0.07328186929225922,
5678
+ "learning_rate": 9.718016275604759e-08,
5679
+ "loss": 0.0004,
5680
+ "step": 745
5681
+ },
5682
+ {
5683
+ "epoch": 0.961340206185567,
5684
+ "grad_norm": 0.09281091392040253,
5685
+ "learning_rate": 9.10209743056889e-08,
5686
+ "loss": 0.0006,
5687
+ "step": 746
5688
+ },
5689
+ {
5690
+ "epoch": 0.9626288659793815,
5691
+ "grad_norm": 2.129312753677368,
5692
+ "learning_rate": 8.506251806997934e-08,
5693
+ "loss": 0.0418,
5694
+ "step": 747
5695
+ },
5696
+ {
5697
+ "epoch": 0.9639175257731959,
5698
+ "grad_norm": 0.45759478211402893,
5699
+ "learning_rate": 7.930491475299229e-08,
5700
+ "loss": 0.001,
5701
+ "step": 748
5702
+ },
5703
+ {
5704
+ "epoch": 0.9652061855670103,
5705
+ "grad_norm": 0.9310470819473267,
5706
+ "learning_rate": 7.37482809900003e-08,
5707
+ "loss": 0.0023,
5708
+ "step": 749
5709
+ },
5710
+ {
5711
+ "epoch": 0.9664948453608248,
5712
+ "grad_norm": 0.8306396007537842,
5713
+ "learning_rate": 6.839272934511143e-08,
5714
+ "loss": 0.0038,
5715
+ "step": 750
5716
+ },
5717
+ {
5718
+ "epoch": 0.9677835051546392,
5719
+ "grad_norm": 0.3015538156032562,
5720
+ "learning_rate": 6.323836830899321e-08,
5721
+ "loss": 0.001,
5722
+ "step": 751
5723
+ },
5724
+ {
5725
+ "epoch": 0.9690721649484536,
5726
+ "grad_norm": 2.0582327842712402,
5727
+ "learning_rate": 5.828530229667228e-08,
5728
+ "loss": 0.0127,
5729
+ "step": 752
5730
+ },
5731
+ {
5732
+ "epoch": 0.970360824742268,
5733
+ "grad_norm": 0.13917666673660278,
5734
+ "learning_rate": 5.353363164541825e-08,
5735
+ "loss": 0.001,
5736
+ "step": 753
5737
+ },
5738
+ {
5739
+ "epoch": 0.9716494845360825,
5740
+ "grad_norm": 1.766170620918274,
5741
+ "learning_rate": 4.898345261271531e-08,
5742
+ "loss": 0.0033,
5743
+ "step": 754
5744
+ },
5745
+ {
5746
+ "epoch": 0.9729381443298969,
5747
+ "grad_norm": 2.295456647872925,
5748
+ "learning_rate": 4.463485737430606e-08,
5749
+ "loss": 0.0045,
5750
+ "step": 755
5751
+ },
5752
+ {
5753
+ "epoch": 0.9742268041237113,
5754
+ "grad_norm": 3.3458142280578613,
5755
+ "learning_rate": 4.0487934022328533e-08,
5756
+ "loss": 0.0081,
5757
+ "step": 756
5758
+ },
5759
+ {
5760
+ "epoch": 0.9755154639175257,
5761
+ "grad_norm": 3.1655807495117188,
5762
+ "learning_rate": 3.654276656353206e-08,
5763
+ "loss": 0.0101,
5764
+ "step": 757
5765
+ },
5766
+ {
5767
+ "epoch": 0.9768041237113402,
5768
+ "grad_norm": 0.16411763429641724,
5769
+ "learning_rate": 3.27994349175742e-08,
5770
+ "loss": 0.0005,
5771
+ "step": 758
5772
+ },
5773
+ {
5774
+ "epoch": 0.9780927835051546,
5775
+ "grad_norm": 3.4274473190307617,
5776
+ "learning_rate": 2.9258014915399813e-08,
5777
+ "loss": 0.0037,
5778
+ "step": 759
5779
+ },
5780
+ {
5781
+ "epoch": 0.979381443298969,
5782
+ "grad_norm": 1.7726308107376099,
5783
+ "learning_rate": 2.591857829770672e-08,
5784
+ "loss": 0.0066,
5785
+ "step": 760
5786
+ },
5787
+ {
5788
+ "epoch": 0.979381443298969,
5789
+ "eval_accuracy": 0.9975173783515392,
5790
+ "eval_f1": 0.9557522123893806,
5791
+ "eval_loss": 0.013445839285850525,
5792
+ "eval_precision": 0.9642857142857143,
5793
+ "eval_recall": 0.9473684210526315,
5794
+ "eval_runtime": 86.1309,
5795
+ "eval_samples_per_second": 5.283,
5796
+ "eval_steps_per_second": 0.174,
5797
+ "step": 760
5798
+ },
5799
+ {
5800
+ "epoch": 0.9806701030927835,
5801
+ "grad_norm": 3.986452579498291,
5802
+ "learning_rate": 2.278119271349466e-08,
5803
+ "loss": 0.0265,
5804
+ "step": 761
5805
+ },
5806
+ {
5807
+ "epoch": 0.9819587628865979,
5808
+ "grad_norm": 1.644910454750061,
5809
+ "learning_rate": 1.984592171869082e-08,
5810
+ "loss": 0.0039,
5811
+ "step": 762
5812
+ },
5813
+ {
5814
+ "epoch": 0.9832474226804123,
5815
+ "grad_norm": 0.0761791542172432,
5816
+ "learning_rate": 1.711282477486642e-08,
5817
+ "loss": 0.0003,
5818
+ "step": 763
5819
+ },
5820
+ {
5821
+ "epoch": 0.9845360824742269,
5822
+ "grad_norm": 0.8887882828712463,
5823
+ "learning_rate": 1.4581957248026579e-08,
5824
+ "loss": 0.0047,
5825
+ "step": 764
5826
+ },
5827
+ {
5828
+ "epoch": 0.9858247422680413,
5829
+ "grad_norm": 1.5930033922195435,
5830
+ "learning_rate": 1.2253370407495636e-08,
5831
+ "loss": 0.0062,
5832
+ "step": 765
5833
+ },
5834
+ {
5835
+ "epoch": 0.9871134020618557,
5836
+ "grad_norm": 0.7668823599815369,
5837
+ "learning_rate": 1.0127111424872437e-08,
5838
+ "loss": 0.0027,
5839
+ "step": 766
5840
+ },
5841
+ {
5842
+ "epoch": 0.9884020618556701,
5843
+ "grad_norm": 3.0065221786499023,
5844
+ "learning_rate": 8.203223373078883e-09,
5845
+ "loss": 0.0152,
5846
+ "step": 767
5847
+ },
5848
+ {
5849
+ "epoch": 0.9896907216494846,
5850
+ "grad_norm": 0.5820819735527039,
5851
+ "learning_rate": 6.481745225485059e-09,
5852
+ "loss": 0.0016,
5853
+ "step": 768
5854
+ },
5855
+ {
5856
+ "epoch": 0.990979381443299,
5857
+ "grad_norm": 0.7048105597496033,
5858
+ "learning_rate": 4.962711855120983e-09,
5859
+ "loss": 0.0027,
5860
+ "step": 769
5861
+ },
5862
+ {
5863
+ "epoch": 0.9922680412371134,
5864
+ "grad_norm": 2.509854555130005,
5865
+ "learning_rate": 3.6461540339682855e-09,
5866
+ "loss": 0.0188,
5867
+ "step": 770
5868
+ },
5869
+ {
5870
+ "epoch": 0.9935567010309279,
5871
+ "grad_norm": 0.2154129147529602,
5872
+ "learning_rate": 2.532098432341812e-09,
5873
+ "loss": 0.0008,
5874
+ "step": 771
5875
+ },
5876
+ {
5877
+ "epoch": 0.9948453608247423,
5878
+ "grad_norm": 2.393842935562134,
5879
+ "learning_rate": 1.6205676183411733e-09,
5880
+ "loss": 0.0079,
5881
+ "step": 772
5882
+ },
5883
+ {
5884
+ "epoch": 0.9961340206185567,
5885
+ "grad_norm": 2.176377058029175,
5886
+ "learning_rate": 9.115800574022171e-10,
5887
+ "loss": 0.013,
5888
+ "step": 773
5889
+ },
5890
+ {
5891
+ "epoch": 0.9974226804123711,
5892
+ "grad_norm": 2.1431355476379395,
5893
+ "learning_rate": 4.0515011191621933e-10,
5894
+ "loss": 0.0134,
5895
+ "step": 774
5896
+ },
5897
+ {
5898
+ "epoch": 0.9987113402061856,
5899
+ "grad_norm": 1.2513763904571533,
5900
+ "learning_rate": 1.0128804094233779e-10,
5901
+ "loss": 0.0034,
5902
+ "step": 775
5903
+ },
5904
+ {
5905
+ "epoch": 1.0,
5906
+ "grad_norm": 2.7385754585266113,
5907
+ "learning_rate": 0.0,
5908
+ "loss": 0.0062,
5909
+ "step": 776
5910
  }
5911
  ],
5912
  "logging_steps": 1,
 
5921
  "should_evaluate": false,
5922
  "should_log": false,
5923
  "should_save": true,
5924
+ "should_training_stop": true
5925
  },
5926
  "attributes": {}
5927
  }
5928
  },
5929
+ "total_flos": 2.5857289592242176e+17,
5930
  "train_batch_size": 8,
5931
  "trial_name": null,
5932
  "trial_params": null