mtzig commited on
Commit
43edcbd
·
verified ·
1 Parent(s): 6568e48

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cb3b644050ed772b7473bed1a7223464b5adddad4a96b1e2f174c7375d90aca
3
  size 13648688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15078b350dceb966b20c8709542ebf0e64b3e9a4c0e2319cdaec4f9c5530bac6
3
  size 13648688
last-checkpoint/global_step700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d753f240c6d44b4bbe0556059d07f11e525bb5e9db9c3f9f93ad5e62c7229d8b
3
+ size 20450800
last-checkpoint/global_step700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14db1f831000826974ad5f792ab00cc773e4701c955f34d50943cc8bf79f0528
3
+ size 20450800
last-checkpoint/global_step700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79468a097847f015aa4935f8c165d90349322e51d0721720db234d01ed6b2d13
3
+ size 20450800
last-checkpoint/global_step700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36d4083d52ad8ada7eb47d0557d374a79a41dacbad7e5613ad40f9ee07870048
3
+ size 20450800
last-checkpoint/global_step700/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9000f9e4de8903bf33637a3164d7047c80d16b19389259040cc5dc4f48da333d
3
+ size 152238
last-checkpoint/global_step700/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eda5edf08baa6742371faab8836bbaaaefb59c558bf7648e07471d1f9cb94572
3
+ size 152238
last-checkpoint/global_step700/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:066effbb5600bcd5a6257c2386143b9784c2e3055c47b8e8155cda1fef9ad1b2
3
+ size 152238
last-checkpoint/global_step700/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:510c92c97451d4269f524888663e8c58c9f65608affe7d5aefed5707dfabece1
3
+ size 152238
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step600
 
1
+ global_step700
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b891feb40e4494a2f4339c4a6c2396fe8789003482bccb878b16a84fd49972d2
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08ee93655f035f40cef98d94e21df0215201bfd9c2fd009c63503f74d4bd0676
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58b2bf57c8acfc6560630987a9b234d67d256e62c79fb6301ceb72e476851c06
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f9350b4bfefd5190b618e0103ff8128fab616f2df08e300e5789f194a7e25b8
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:714ec8d81c6c369124166420e509178826aa2a10b37b4d55bda2151ad2f6106f
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0a399dabcc87f1904a1f24d9d7781d4c2d3c109c95dd2958fca743902bd75c
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:073d7e3faa703403bd3d6a14715495db3bc0ef77ab8523513ff8bd83de272df4
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e90189ce66cbbdd26dcd499b49b05660c650805c2cfc5e25340f61c20bbb952
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a56472912fbe406df77e6f0ecfb06e43ede87be214ec32eebe03c6969c7328f2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed875039ee3baaee6a245c8988a3754c26fb7f9e800cc58167646a8642969266
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7377805102981863,
5
  "eval_steps": 40,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4399,6 +4399,730 @@
4399
  "eval_samples_per_second": 2.127,
4400
  "eval_steps_per_second": 0.17,
4401
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4402
  }
4403
  ],
4404
  "logging_steps": 1,
@@ -4418,7 +5142,7 @@
4418
  "attributes": {}
4419
  }
4420
  },
4421
- "total_flos": 588849383505920.0,
4422
  "train_batch_size": 4,
4423
  "trial_name": null,
4424
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8607439286812173,
5
  "eval_steps": 40,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4399
  "eval_samples_per_second": 2.127,
4400
  "eval_steps_per_second": 0.17,
4401
  "step": 600
4402
+ },
4403
+ {
4404
+ "epoch": 0.7390101444820166,
4405
+ "grad_norm": 0.2943806410067254,
4406
+ "learning_rate": 3.8712690512345555e-06,
4407
+ "loss": 0.1728,
4408
+ "step": 601
4409
+ },
4410
+ {
4411
+ "epoch": 0.7402397786658469,
4412
+ "grad_norm": 0.42925890423626006,
4413
+ "learning_rate": 3.837366411839114e-06,
4414
+ "loss": 0.1948,
4415
+ "step": 602
4416
+ },
4417
+ {
4418
+ "epoch": 0.7414694128496773,
4419
+ "grad_norm": 0.3527421902997394,
4420
+ "learning_rate": 3.8035775956118416e-06,
4421
+ "loss": 0.1413,
4422
+ "step": 603
4423
+ },
4424
+ {
4425
+ "epoch": 0.7426990470335075,
4426
+ "grad_norm": 0.4175876107718813,
4427
+ "learning_rate": 3.7699032266284863e-06,
4428
+ "loss": 0.2727,
4429
+ "step": 604
4430
+ },
4431
+ {
4432
+ "epoch": 0.7439286812173378,
4433
+ "grad_norm": 0.4187920324268778,
4434
+ "learning_rate": 3.736343926850954e-06,
4435
+ "loss": 0.1588,
4436
+ "step": 605
4437
+ },
4438
+ {
4439
+ "epoch": 0.7451583154011682,
4440
+ "grad_norm": 0.3852760473136735,
4441
+ "learning_rate": 3.702900316115836e-06,
4442
+ "loss": 0.174,
4443
+ "step": 606
4444
+ },
4445
+ {
4446
+ "epoch": 0.7463879495849984,
4447
+ "grad_norm": 0.42823407651531814,
4448
+ "learning_rate": 3.6695730121229734e-06,
4449
+ "loss": 0.1938,
4450
+ "step": 607
4451
+ },
4452
+ {
4453
+ "epoch": 0.7476175837688288,
4454
+ "grad_norm": 0.3509868875989032,
4455
+ "learning_rate": 3.6363626304240185e-06,
4456
+ "loss": 0.1475,
4457
+ "step": 608
4458
+ },
4459
+ {
4460
+ "epoch": 0.7488472179526591,
4461
+ "grad_norm": 0.2971798682387744,
4462
+ "learning_rate": 3.6032697844110896e-06,
4463
+ "loss": 0.1767,
4464
+ "step": 609
4465
+ },
4466
+ {
4467
+ "epoch": 0.7500768521364894,
4468
+ "grad_norm": 0.6072125452251376,
4469
+ "learning_rate": 3.5702950853054284e-06,
4470
+ "loss": 0.1699,
4471
+ "step": 610
4472
+ },
4473
+ {
4474
+ "epoch": 0.7513064863203197,
4475
+ "grad_norm": 0.42164026185503256,
4476
+ "learning_rate": 3.5374391421461273e-06,
4477
+ "loss": 0.1412,
4478
+ "step": 611
4479
+ },
4480
+ {
4481
+ "epoch": 0.75253612050415,
4482
+ "grad_norm": 0.3486983507433236,
4483
+ "learning_rate": 3.5047025617788578e-06,
4484
+ "loss": 0.1936,
4485
+ "step": 612
4486
+ },
4487
+ {
4488
+ "epoch": 0.7537657546879803,
4489
+ "grad_norm": 0.4729724505869417,
4490
+ "learning_rate": 3.4720859488446744e-06,
4491
+ "loss": 0.2232,
4492
+ "step": 613
4493
+ },
4494
+ {
4495
+ "epoch": 0.7549953888718106,
4496
+ "grad_norm": 0.4712358882570717,
4497
+ "learning_rate": 3.4395899057688575e-06,
4498
+ "loss": 0.1957,
4499
+ "step": 614
4500
+ },
4501
+ {
4502
+ "epoch": 0.756225023055641,
4503
+ "grad_norm": 0.30705322198688406,
4504
+ "learning_rate": 3.407215032749763e-06,
4505
+ "loss": 0.1771,
4506
+ "step": 615
4507
+ },
4508
+ {
4509
+ "epoch": 0.7574546572394713,
4510
+ "grad_norm": 0.4492814208789423,
4511
+ "learning_rate": 3.374961927747751e-06,
4512
+ "loss": 0.2017,
4513
+ "step": 616
4514
+ },
4515
+ {
4516
+ "epoch": 0.7586842914233015,
4517
+ "grad_norm": 0.4380941744123555,
4518
+ "learning_rate": 3.342831186474149e-06,
4519
+ "loss": 0.2032,
4520
+ "step": 617
4521
+ },
4522
+ {
4523
+ "epoch": 0.7599139256071319,
4524
+ "grad_norm": 0.3711476811320281,
4525
+ "learning_rate": 3.31082340238023e-06,
4526
+ "loss": 0.139,
4527
+ "step": 618
4528
+ },
4529
+ {
4530
+ "epoch": 0.7611435597909622,
4531
+ "grad_norm": 0.4366197359235773,
4532
+ "learning_rate": 3.27893916664626e-06,
4533
+ "loss": 0.1726,
4534
+ "step": 619
4535
+ },
4536
+ {
4537
+ "epoch": 0.7623731939747925,
4538
+ "grad_norm": 0.3831073094979708,
4539
+ "learning_rate": 3.2471790681705928e-06,
4540
+ "loss": 0.1734,
4541
+ "step": 620
4542
+ },
4543
+ {
4544
+ "epoch": 0.7636028281586228,
4545
+ "grad_norm": 0.3832625976759797,
4546
+ "learning_rate": 3.215543693558769e-06,
4547
+ "loss": 0.1326,
4548
+ "step": 621
4549
+ },
4550
+ {
4551
+ "epoch": 0.7648324623424532,
4552
+ "grad_norm": 0.4637885564290134,
4553
+ "learning_rate": 3.1840336271126935e-06,
4554
+ "loss": 0.213,
4555
+ "step": 622
4556
+ },
4557
+ {
4558
+ "epoch": 0.7660620965262834,
4559
+ "grad_norm": 0.5509391377682509,
4560
+ "learning_rate": 3.152649450819852e-06,
4561
+ "loss": 0.202,
4562
+ "step": 623
4563
+ },
4564
+ {
4565
+ "epoch": 0.7672917307101137,
4566
+ "grad_norm": 0.4604352454314464,
4567
+ "learning_rate": 3.1213917443425456e-06,
4568
+ "loss": 0.2395,
4569
+ "step": 624
4570
+ },
4571
+ {
4572
+ "epoch": 0.7685213648939441,
4573
+ "grad_norm": 0.5005650818328251,
4574
+ "learning_rate": 3.0902610850071922e-06,
4575
+ "loss": 0.1712,
4576
+ "step": 625
4577
+ },
4578
+ {
4579
+ "epoch": 0.7697509990777743,
4580
+ "grad_norm": 0.3297795229391836,
4581
+ "learning_rate": 3.0592580477936606e-06,
4582
+ "loss": 0.1249,
4583
+ "step": 626
4584
+ },
4585
+ {
4586
+ "epoch": 0.7709806332616047,
4587
+ "grad_norm": 0.37133417357695125,
4588
+ "learning_rate": 3.0283832053246644e-06,
4589
+ "loss": 0.1496,
4590
+ "step": 627
4591
+ },
4592
+ {
4593
+ "epoch": 0.772210267445435,
4594
+ "grad_norm": 1.0851806228661502,
4595
+ "learning_rate": 2.99763712785516e-06,
4596
+ "loss": 0.1834,
4597
+ "step": 628
4598
+ },
4599
+ {
4600
+ "epoch": 0.7734399016292653,
4601
+ "grad_norm": 0.5871194480383413,
4602
+ "learning_rate": 2.967020383261834e-06,
4603
+ "loss": 0.2054,
4604
+ "step": 629
4605
+ },
4606
+ {
4607
+ "epoch": 0.7746695358130956,
4608
+ "grad_norm": 0.5149728889777226,
4609
+ "learning_rate": 2.9365335370326143e-06,
4610
+ "loss": 0.1972,
4611
+ "step": 630
4612
+ },
4613
+ {
4614
+ "epoch": 0.7758991699969259,
4615
+ "grad_norm": 0.37527398302282,
4616
+ "learning_rate": 2.9061771522562143e-06,
4617
+ "loss": 0.1492,
4618
+ "step": 631
4619
+ },
4620
+ {
4621
+ "epoch": 0.7771288041807562,
4622
+ "grad_norm": 0.4284583342223879,
4623
+ "learning_rate": 2.875951789611734e-06,
4624
+ "loss": 0.1937,
4625
+ "step": 632
4626
+ },
4627
+ {
4628
+ "epoch": 0.7783584383645865,
4629
+ "grad_norm": 0.4328792148070332,
4630
+ "learning_rate": 2.8458580073583262e-06,
4631
+ "loss": 0.1905,
4632
+ "step": 633
4633
+ },
4634
+ {
4635
+ "epoch": 0.7795880725484169,
4636
+ "grad_norm": 0.4067822771383594,
4637
+ "learning_rate": 2.8158963613248437e-06,
4638
+ "loss": 0.2048,
4639
+ "step": 634
4640
+ },
4641
+ {
4642
+ "epoch": 0.7808177067322472,
4643
+ "grad_norm": 0.5475925840409395,
4644
+ "learning_rate": 2.7860674048996174e-06,
4645
+ "loss": 0.2014,
4646
+ "step": 635
4647
+ },
4648
+ {
4649
+ "epoch": 0.7820473409160774,
4650
+ "grad_norm": 0.3714863801891058,
4651
+ "learning_rate": 2.756371689020214e-06,
4652
+ "loss": 0.1597,
4653
+ "step": 636
4654
+ },
4655
+ {
4656
+ "epoch": 0.7832769750999078,
4657
+ "grad_norm": 0.45403846500036404,
4658
+ "learning_rate": 2.7268097621632473e-06,
4659
+ "loss": 0.1588,
4660
+ "step": 637
4661
+ },
4662
+ {
4663
+ "epoch": 0.784506609283738,
4664
+ "grad_norm": 0.2750476426300895,
4665
+ "learning_rate": 2.697382170334275e-06,
4666
+ "loss": 0.1456,
4667
+ "step": 638
4668
+ },
4669
+ {
4670
+ "epoch": 0.7857362434675684,
4671
+ "grad_norm": 0.4122155448314921,
4672
+ "learning_rate": 2.6680894570577042e-06,
4673
+ "loss": 0.165,
4674
+ "step": 639
4675
+ },
4676
+ {
4677
+ "epoch": 0.7869658776513987,
4678
+ "grad_norm": 0.44104871745668295,
4679
+ "learning_rate": 2.638932163366742e-06,
4680
+ "loss": 0.1883,
4681
+ "step": 640
4682
+ },
4683
+ {
4684
+ "epoch": 0.7869658776513987,
4685
+ "eval_accuracy": 0.8021390374331551,
4686
+ "eval_f1": 0.5066666666666667,
4687
+ "eval_loss": 0.42875000834465027,
4688
+ "eval_precision": 0.76,
4689
+ "eval_recall": 0.38,
4690
+ "eval_runtime": 22.3064,
4691
+ "eval_samples_per_second": 2.242,
4692
+ "eval_steps_per_second": 0.179,
4693
+ "step": 640
4694
+ },
4695
+ {
4696
+ "epoch": 0.7881955118352291,
4697
+ "grad_norm": 0.38537966631812437,
4698
+ "learning_rate": 2.6099108277934105e-06,
4699
+ "loss": 0.1942,
4700
+ "step": 641
4701
+ },
4702
+ {
4703
+ "epoch": 0.7894251460190593,
4704
+ "grad_norm": 0.47302017581744826,
4705
+ "learning_rate": 2.581025986358602e-06,
4706
+ "loss": 0.2733,
4707
+ "step": 642
4708
+ },
4709
+ {
4710
+ "epoch": 0.7906547802028896,
4711
+ "grad_norm": 0.4006638675446945,
4712
+ "learning_rate": 2.5522781725621814e-06,
4713
+ "loss": 0.1905,
4714
+ "step": 643
4715
+ },
4716
+ {
4717
+ "epoch": 0.79188441438672,
4718
+ "grad_norm": 0.4264868084266065,
4719
+ "learning_rate": 2.523667917373125e-06,
4720
+ "loss": 0.2047,
4721
+ "step": 644
4722
+ },
4723
+ {
4724
+ "epoch": 0.7931140485705502,
4725
+ "grad_norm": 0.3954441386492838,
4726
+ "learning_rate": 2.4951957492197097e-06,
4727
+ "loss": 0.1377,
4728
+ "step": 645
4729
+ },
4730
+ {
4731
+ "epoch": 0.7943436827543806,
4732
+ "grad_norm": 0.39481889488214283,
4733
+ "learning_rate": 2.4668621939797745e-06,
4734
+ "loss": 0.1402,
4735
+ "step": 646
4736
+ },
4737
+ {
4738
+ "epoch": 0.7955733169382109,
4739
+ "grad_norm": 0.5271696297567287,
4740
+ "learning_rate": 2.438667774970981e-06,
4741
+ "loss": 0.2091,
4742
+ "step": 647
4743
+ },
4744
+ {
4745
+ "epoch": 0.7968029511220412,
4746
+ "grad_norm": 0.40581144727582685,
4747
+ "learning_rate": 2.4106130129411608e-06,
4748
+ "loss": 0.1898,
4749
+ "step": 648
4750
+ },
4751
+ {
4752
+ "epoch": 0.7980325853058715,
4753
+ "grad_norm": 0.4102532645005857,
4754
+ "learning_rate": 2.3826984260587084e-06,
4755
+ "loss": 0.2066,
4756
+ "step": 649
4757
+ },
4758
+ {
4759
+ "epoch": 0.7992622194897018,
4760
+ "grad_norm": 0.388703790445828,
4761
+ "learning_rate": 2.354924529902978e-06,
4762
+ "loss": 0.1987,
4763
+ "step": 650
4764
+ },
4765
+ {
4766
+ "epoch": 0.8004918536735321,
4767
+ "grad_norm": 0.4906618445456134,
4768
+ "learning_rate": 2.327291837454799e-06,
4769
+ "loss": 0.1837,
4770
+ "step": 651
4771
+ },
4772
+ {
4773
+ "epoch": 0.8017214878573624,
4774
+ "grad_norm": 0.37536494595757913,
4775
+ "learning_rate": 2.2998008590869838e-06,
4776
+ "loss": 0.1657,
4777
+ "step": 652
4778
+ },
4779
+ {
4780
+ "epoch": 0.8029511220411928,
4781
+ "grad_norm": 0.3812431916923574,
4782
+ "learning_rate": 2.2724521025548828e-06,
4783
+ "loss": 0.1008,
4784
+ "step": 653
4785
+ },
4786
+ {
4787
+ "epoch": 0.804180756225023,
4788
+ "grad_norm": 0.3734890292027527,
4789
+ "learning_rate": 2.245246072987045e-06,
4790
+ "loss": 0.1343,
4791
+ "step": 654
4792
+ },
4793
+ {
4794
+ "epoch": 0.8054103904088533,
4795
+ "grad_norm": 0.4423063838480555,
4796
+ "learning_rate": 2.2181832728758635e-06,
4797
+ "loss": 0.2222,
4798
+ "step": 655
4799
+ },
4800
+ {
4801
+ "epoch": 0.8066400245926837,
4802
+ "grad_norm": 0.3896545849527162,
4803
+ "learning_rate": 2.191264202068286e-06,
4804
+ "loss": 0.1766,
4805
+ "step": 656
4806
+ },
4807
+ {
4808
+ "epoch": 0.807869658776514,
4809
+ "grad_norm": 0.6024032080378133,
4810
+ "learning_rate": 2.1644893577566118e-06,
4811
+ "loss": 0.231,
4812
+ "step": 657
4813
+ },
4814
+ {
4815
+ "epoch": 0.8090992929603443,
4816
+ "grad_norm": 0.43861748495389236,
4817
+ "learning_rate": 2.137859234469286e-06,
4818
+ "loss": 0.2467,
4819
+ "step": 658
4820
+ },
4821
+ {
4822
+ "epoch": 0.8103289271441746,
4823
+ "grad_norm": 0.37033226791746354,
4824
+ "learning_rate": 2.1113743240617668e-06,
4825
+ "loss": 0.1337,
4826
+ "step": 659
4827
+ },
4828
+ {
4829
+ "epoch": 0.811558561328005,
4830
+ "grad_norm": 0.6398820179734428,
4831
+ "learning_rate": 2.08503511570746e-06,
4832
+ "loss": 0.1954,
4833
+ "step": 660
4834
+ },
4835
+ {
4836
+ "epoch": 0.8127881955118352,
4837
+ "grad_norm": 0.4504933775118792,
4838
+ "learning_rate": 2.058842095888658e-06,
4839
+ "loss": 0.18,
4840
+ "step": 661
4841
+ },
4842
+ {
4843
+ "epoch": 0.8140178296956655,
4844
+ "grad_norm": 0.361212739042047,
4845
+ "learning_rate": 2.0327957483875693e-06,
4846
+ "loss": 0.1489,
4847
+ "step": 662
4848
+ },
4849
+ {
4850
+ "epoch": 0.8152474638794959,
4851
+ "grad_norm": 0.307913369177724,
4852
+ "learning_rate": 2.006896554277388e-06,
4853
+ "loss": 0.1572,
4854
+ "step": 663
4855
+ },
4856
+ {
4857
+ "epoch": 0.8164770980633261,
4858
+ "grad_norm": 0.25426740831645195,
4859
+ "learning_rate": 1.981144991913392e-06,
4860
+ "loss": 0.12,
4861
+ "step": 664
4862
+ },
4863
+ {
4864
+ "epoch": 0.8177067322471565,
4865
+ "grad_norm": 0.3663288109181175,
4866
+ "learning_rate": 1.9555415369241228e-06,
4867
+ "loss": 0.1571,
4868
+ "step": 665
4869
+ },
4870
+ {
4871
+ "epoch": 0.8189363664309868,
4872
+ "grad_norm": 0.41662449029107057,
4873
+ "learning_rate": 1.930086662202589e-06,
4874
+ "loss": 0.1873,
4875
+ "step": 666
4876
+ },
4877
+ {
4878
+ "epoch": 0.820166000614817,
4879
+ "grad_norm": 0.40845173743188795,
4880
+ "learning_rate": 1.9047808378975485e-06,
4881
+ "loss": 0.1534,
4882
+ "step": 667
4883
+ },
4884
+ {
4885
+ "epoch": 0.8213956347986474,
4886
+ "grad_norm": 0.6212434671550456,
4887
+ "learning_rate": 1.8796245314048046e-06,
4888
+ "loss": 0.2374,
4889
+ "step": 668
4890
+ },
4891
+ {
4892
+ "epoch": 0.8226252689824777,
4893
+ "grad_norm": 0.3337054400199707,
4894
+ "learning_rate": 1.8546182073585828e-06,
4895
+ "loss": 0.184,
4896
+ "step": 669
4897
+ },
4898
+ {
4899
+ "epoch": 0.823854903166308,
4900
+ "grad_norm": 0.37408116822647747,
4901
+ "learning_rate": 1.829762327622958e-06,
4902
+ "loss": 0.1627,
4903
+ "step": 670
4904
+ },
4905
+ {
4906
+ "epoch": 0.8250845373501383,
4907
+ "grad_norm": 0.41291954814345744,
4908
+ "learning_rate": 1.805057351283307e-06,
4909
+ "loss": 0.1426,
4910
+ "step": 671
4911
+ },
4912
+ {
4913
+ "epoch": 0.8263141715339687,
4914
+ "grad_norm": 0.6232928915412197,
4915
+ "learning_rate": 1.7805037346378384e-06,
4916
+ "loss": 0.1939,
4917
+ "step": 672
4918
+ },
4919
+ {
4920
+ "epoch": 0.827543805717799,
4921
+ "grad_norm": 0.43962963164293384,
4922
+ "learning_rate": 1.756101931189169e-06,
4923
+ "loss": 0.2049,
4924
+ "step": 673
4925
+ },
4926
+ {
4927
+ "epoch": 0.8287734399016292,
4928
+ "grad_norm": 0.3747672424266052,
4929
+ "learning_rate": 1.7318523916359376e-06,
4930
+ "loss": 0.1644,
4931
+ "step": 674
4932
+ },
4933
+ {
4934
+ "epoch": 0.8300030740854596,
4935
+ "grad_norm": 0.4713865050667868,
4936
+ "learning_rate": 1.7077555638644838e-06,
4937
+ "loss": 0.2924,
4938
+ "step": 675
4939
+ },
4940
+ {
4941
+ "epoch": 0.8312327082692899,
4942
+ "grad_norm": 0.5391745289921438,
4943
+ "learning_rate": 1.6838118929405856e-06,
4944
+ "loss": 0.1767,
4945
+ "step": 676
4946
+ },
4947
+ {
4948
+ "epoch": 0.8324623424531202,
4949
+ "grad_norm": 0.35807178811591905,
4950
+ "learning_rate": 1.660021821101222e-06,
4951
+ "loss": 0.1718,
4952
+ "step": 677
4953
+ },
4954
+ {
4955
+ "epoch": 0.8336919766369505,
4956
+ "grad_norm": 0.5700152695384362,
4957
+ "learning_rate": 1.6363857877464161e-06,
4958
+ "loss": 0.1505,
4959
+ "step": 678
4960
+ },
4961
+ {
4962
+ "epoch": 0.8349216108207809,
4963
+ "grad_norm": 0.521349273286693,
4964
+ "learning_rate": 1.6129042294311227e-06,
4965
+ "loss": 0.1893,
4966
+ "step": 679
4967
+ },
4968
+ {
4969
+ "epoch": 0.8361512450046111,
4970
+ "grad_norm": 0.4881174981503527,
4971
+ "learning_rate": 1.5895775798571523e-06,
4972
+ "loss": 0.2403,
4973
+ "step": 680
4974
+ },
4975
+ {
4976
+ "epoch": 0.8361512450046111,
4977
+ "eval_accuracy": 0.8021390374331551,
4978
+ "eval_f1": 0.5066666666666667,
4979
+ "eval_loss": 0.42875000834465027,
4980
+ "eval_precision": 0.76,
4981
+ "eval_recall": 0.38,
4982
+ "eval_runtime": 23.134,
4983
+ "eval_samples_per_second": 2.161,
4984
+ "eval_steps_per_second": 0.173,
4985
+ "step": 680
4986
+ },
4987
+ {
4988
+ "epoch": 0.8373808791884414,
4989
+ "grad_norm": 0.43157618057929154,
4990
+ "learning_rate": 1.5664062698651706e-06,
4991
+ "loss": 0.1824,
4992
+ "step": 681
4993
+ },
4994
+ {
4995
+ "epoch": 0.8386105133722718,
4996
+ "grad_norm": 0.5760272230077988,
4997
+ "learning_rate": 1.5433907274267357e-06,
4998
+ "loss": 0.2397,
4999
+ "step": 682
5000
+ },
5001
+ {
5002
+ "epoch": 0.839840147556102,
5003
+ "grad_norm": 0.5350905991023048,
5004
+ "learning_rate": 1.5205313776364028e-06,
5005
+ "loss": 0.1892,
5006
+ "step": 683
5007
+ },
5008
+ {
5009
+ "epoch": 0.8410697817399324,
5010
+ "grad_norm": 0.61137934990804,
5011
+ "learning_rate": 1.4978286427038602e-06,
5012
+ "loss": 0.2348,
5013
+ "step": 684
5014
+ },
5015
+ {
5016
+ "epoch": 0.8422994159237627,
5017
+ "grad_norm": 0.4331644305139785,
5018
+ "learning_rate": 1.4752829419461357e-06,
5019
+ "loss": 0.1937,
5020
+ "step": 685
5021
+ },
5022
+ {
5023
+ "epoch": 0.8435290501075929,
5024
+ "grad_norm": 0.3640781076289279,
5025
+ "learning_rate": 1.4528946917798603e-06,
5026
+ "loss": 0.1962,
5027
+ "step": 686
5028
+ },
5029
+ {
5030
+ "epoch": 0.8447586842914233,
5031
+ "grad_norm": 0.4244637100420945,
5032
+ "learning_rate": 1.4306643057135638e-06,
5033
+ "loss": 0.193,
5034
+ "step": 687
5035
+ },
5036
+ {
5037
+ "epoch": 0.8459883184752536,
5038
+ "grad_norm": 0.27253213925489794,
5039
+ "learning_rate": 1.4085921943400416e-06,
5040
+ "loss": 0.1582,
5041
+ "step": 688
5042
+ },
5043
+ {
5044
+ "epoch": 0.847217952659084,
5045
+ "grad_norm": 0.7026492760941759,
5046
+ "learning_rate": 1.3866787653287804e-06,
5047
+ "loss": 0.2727,
5048
+ "step": 689
5049
+ },
5050
+ {
5051
+ "epoch": 0.8484475868429142,
5052
+ "grad_norm": 0.3357057600160637,
5053
+ "learning_rate": 1.3649244234184157e-06,
5054
+ "loss": 0.1395,
5055
+ "step": 690
5056
+ },
5057
+ {
5058
+ "epoch": 0.8496772210267446,
5059
+ "grad_norm": 0.38849185683759185,
5060
+ "learning_rate": 1.3433295704092586e-06,
5061
+ "loss": 0.1367,
5062
+ "step": 691
5063
+ },
5064
+ {
5065
+ "epoch": 0.8509068552105749,
5066
+ "grad_norm": 0.5532934868131949,
5067
+ "learning_rate": 1.3218946051558867e-06,
5068
+ "loss": 0.2007,
5069
+ "step": 692
5070
+ },
5071
+ {
5072
+ "epoch": 0.8521364893944051,
5073
+ "grad_norm": 0.4093414023233572,
5074
+ "learning_rate": 1.3006199235597628e-06,
5075
+ "loss": 0.199,
5076
+ "step": 693
5077
+ },
5078
+ {
5079
+ "epoch": 0.8533661235782355,
5080
+ "grad_norm": 0.5800657790788337,
5081
+ "learning_rate": 1.279505918561923e-06,
5082
+ "loss": 0.1786,
5083
+ "step": 694
5084
+ },
5085
+ {
5086
+ "epoch": 0.8545957577620658,
5087
+ "grad_norm": 0.5604353644860381,
5088
+ "learning_rate": 1.2585529801357377e-06,
5089
+ "loss": 0.2597,
5090
+ "step": 695
5091
+ },
5092
+ {
5093
+ "epoch": 0.8558253919458961,
5094
+ "grad_norm": 0.4944214492031985,
5095
+ "learning_rate": 1.2377614952796825e-06,
5096
+ "loss": 0.1578,
5097
+ "step": 696
5098
+ },
5099
+ {
5100
+ "epoch": 0.8570550261297264,
5101
+ "grad_norm": 0.3580298395044867,
5102
+ "learning_rate": 1.217131848010209e-06,
5103
+ "loss": 0.145,
5104
+ "step": 697
5105
+ },
5106
+ {
5107
+ "epoch": 0.8582846603135568,
5108
+ "grad_norm": 0.49696207588289626,
5109
+ "learning_rate": 1.196664419354644e-06,
5110
+ "loss": 0.1847,
5111
+ "step": 698
5112
+ },
5113
+ {
5114
+ "epoch": 0.859514294497387,
5115
+ "grad_norm": 0.5676831498828142,
5116
+ "learning_rate": 1.176359587344158e-06,
5117
+ "loss": 0.2467,
5118
+ "step": 699
5119
+ },
5120
+ {
5121
+ "epoch": 0.8607439286812173,
5122
+ "grad_norm": 0.4791316046608471,
5123
+ "learning_rate": 1.1562177270067766e-06,
5124
+ "loss": 0.2128,
5125
+ "step": 700
5126
  }
5127
  ],
5128
  "logging_steps": 1,
 
5142
  "attributes": {}
5143
  }
5144
  },
5145
+ "total_flos": 687762207244288.0,
5146
  "train_batch_size": 4,
5147
  "trial_name": null,
5148
  "trial_params": null