mtzig commited on
Commit
9d4ca34
·
verified ·
1 Parent(s): 67de92a

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f4e1f5f1b463bbf7a2ef4d4af3eb26c728852abec7c6787de609ac33d09a95c
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933b1f63d90c319542a4fd47759174fed83fc881d8cb4cb2a83a713fabd5a6a8
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90a4eef445695e00c0448191c471e20fc90fac55088f64ebc374faeb67378993
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25fa0e6ce8d54d0ff171f555366adec8729dd853388ab8d8c9f00eb8bd9371dd
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c69c54a9cccf4257b34f1b979d5d539b31b5218794be3960611b7d2d897e994a
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21407e44a54ae327fa01bf921b6d09dfdd5516385f912101dd79966b49088d89
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ed25b5e3653278282ac873c3af5310841446a9e13773d544889afce31024bcb
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f41da44c9f99411f6202714d8e5e84be2ba87c642f032b48d842df5f62d4222
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39de24dbe6f2f3a1b4e34daf6b8e7473a3a38ea40a91769099a82e8f4ebd1d0b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b57b77c4bb1d00af9ed39553b6d0ea59b3e5d5141d103e1b5721d70f1c1075
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98fb28188ad6ed7eb480ef9e981c73e6e5e156423f75a203ab35ebe4c0ee7122
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d1c230b287be50383ea4060312fe31d090370284705ec75e143fb62864c38fd
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8f0f9b2d5716e1b93fa3c48662c835a10bad645dcd88050a14008f0e7777a56
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ada10e4fc29560cc70bbb7f18e8d12a735aff41ce11272efee5b1a4fe85f4c4
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a1a3d5f6e07161b2ee73578e4b8d161f40891058e3f83f813289b5c369f350a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9fc5c2486e572dc7f0881c6dd684dbcdd44bc14f5a416e106e39724f7005e6d
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c0996004e3280ba2b8c5308142e245e93b9a3d5870de383914360145085a647
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae91bbe4bb3448f9fe588b5f12c6d570f98cbeb7f79c6b4c021fdd413e35a673
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b99dae60d08ae089466b878474ef297a0b281547cd1097ea214ecee77244b16
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e2bdfc538b632be76938bdc369215d9c9e9696454b505d6d5c099a19d59619
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c3acfb47638e30fe1106672a6fd0db74c9187c94c19467e9d22bd366fbb5472
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654f94a53cbd3a4c0aa96462f7eefb36cea6a40f65967f82f41333fe8d59b3e6
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9723827a668573edbd596a65e0f225b208491adf853284b8da3f11b792077fdc
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24ee86115e5e887f663c435c280ac37373efa53275c443e874691073017d1363
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a124d1e9d8a7b4a76d7294be394802bfec19da05b0209e12c8dc6b8ab250293
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85ee37d9a532de8cdb09f3a64e5b2fe9e638521f567e2b493ae4f1f2c3b0617
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6443298969072165,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.451,
3820
  "eval_steps_per_second": 0.18,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.6687462625574912e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7731958762886598,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.451,
3820
  "eval_steps_per_second": 0.18,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.645618556701031,
3825
+ "grad_norm": 0.3875870704650879,
3826
+ "learning_rate": 6.730609277866644e-06,
3827
+ "loss": 0.0013,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.6469072164948454,
3832
+ "grad_norm": 0.7030169367790222,
3833
+ "learning_rate": 6.688107468527297e-06,
3834
+ "loss": 0.0044,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.6481958762886598,
3839
+ "grad_norm": 0.05920355021953583,
3840
+ "learning_rate": 6.645672750209216e-06,
3841
+ "loss": 0.0007,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.6494845360824743,
3846
+ "grad_norm": 2.848557472229004,
3847
+ "learning_rate": 6.603305982538295e-06,
3848
+ "loss": 0.014,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.6507731958762887,
3853
+ "grad_norm": 0.3917801082134247,
3854
+ "learning_rate": 6.561008023763915e-06,
3855
+ "loss": 0.0009,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.6520618556701031,
3860
+ "grad_norm": 1.1397738456726074,
3861
+ "learning_rate": 6.518779730741555e-06,
3862
+ "loss": 0.0072,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.6533505154639175,
3867
+ "grad_norm": 0.10615069419145584,
3868
+ "learning_rate": 6.476621958915426e-06,
3869
+ "loss": 0.0007,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.654639175257732,
3874
+ "grad_norm": 0.2596324384212494,
3875
+ "learning_rate": 6.434535562301153e-06,
3876
+ "loss": 0.0006,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.6559278350515464,
3881
+ "grad_norm": 1.1918329000473022,
3882
+ "learning_rate": 6.392521393468471e-06,
3883
+ "loss": 0.0025,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.6572164948453608,
3888
+ "grad_norm": 2.2419281005859375,
3889
+ "learning_rate": 6.350580303523947e-06,
3890
+ "loss": 0.006,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.6585051546391752,
3895
+ "grad_norm": 1.821906328201294,
3896
+ "learning_rate": 6.308713142093749e-06,
3897
+ "loss": 0.025,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.6597938144329897,
3902
+ "grad_norm": 2.59908390045166,
3903
+ "learning_rate": 6.266920757306429e-06,
3904
+ "loss": 0.0225,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.6610824742268041,
3909
+ "grad_norm": 2.4867560863494873,
3910
+ "learning_rate": 6.225203995775746e-06,
3911
+ "loss": 0.0097,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.6623711340206185,
3916
+ "grad_norm": 0.33032119274139404,
3917
+ "learning_rate": 6.183563702583506e-06,
3918
+ "loss": 0.0012,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.663659793814433,
3923
+ "grad_norm": 0.6683783531188965,
3924
+ "learning_rate": 6.1420007212624584e-06,
3925
+ "loss": 0.0014,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.6649484536082474,
3930
+ "grad_norm": 0.08879516273736954,
3931
+ "learning_rate": 6.100515893779188e-06,
3932
+ "loss": 0.0006,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.6662371134020618,
3937
+ "grad_norm": 1.5069953203201294,
3938
+ "learning_rate": 6.05911006051708e-06,
3939
+ "loss": 0.0062,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.6675257731958762,
3944
+ "grad_norm": 1.1832886934280396,
3945
+ "learning_rate": 6.01778406025928e-06,
3946
+ "loss": 0.0051,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.6688144329896907,
3951
+ "grad_norm": 1.7542977333068848,
3952
+ "learning_rate": 5.976538730171708e-06,
3953
+ "loss": 0.0072,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.6701030927835051,
3958
+ "grad_norm": 2.53532338142395,
3959
+ "learning_rate": 5.935374905786102e-06,
3960
+ "loss": 0.0054,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.6701030927835051,
3965
+ "eval_accuracy": 0.997020854021847,
3966
+ "eval_f1": 0.9473684210526315,
3967
+ "eval_loss": 0.014478031545877457,
3968
+ "eval_precision": 0.9473684210526315,
3969
+ "eval_recall": 0.9473684210526315,
3970
+ "eval_runtime": 85.5952,
3971
+ "eval_samples_per_second": 5.316,
3972
+ "eval_steps_per_second": 0.175,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.6713917525773195,
3977
+ "grad_norm": 3.9269418716430664,
3978
+ "learning_rate": 5.89429342098309e-06,
3979
+ "loss": 0.0174,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.6726804123711341,
3984
+ "grad_norm": 1.222317099571228,
3985
+ "learning_rate": 5.8532951079752895e-06,
3986
+ "loss": 0.0046,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.6739690721649485,
3991
+ "grad_norm": 0.17730024456977844,
3992
+ "learning_rate": 5.812380797290465e-06,
3993
+ "loss": 0.0008,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.6752577319587629,
3998
+ "grad_norm": 0.8336971998214722,
3999
+ "learning_rate": 5.771551317754691e-06,
4000
+ "loss": 0.0014,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.6765463917525774,
4005
+ "grad_norm": 0.8036553859710693,
4006
+ "learning_rate": 5.730807496475568e-06,
4007
+ "loss": 0.0033,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.6778350515463918,
4012
+ "grad_norm": 0.5665665864944458,
4013
+ "learning_rate": 5.690150158825462e-06,
4014
+ "loss": 0.0016,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.6791237113402062,
4019
+ "grad_norm": 0.08514845371246338,
4020
+ "learning_rate": 5.649580128424792e-06,
4021
+ "loss": 0.0013,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.6804123711340206,
4026
+ "grad_norm": 0.22260144352912903,
4027
+ "learning_rate": 5.609098227125334e-06,
4028
+ "loss": 0.001,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.6817010309278351,
4033
+ "grad_norm": 0.6056246161460876,
4034
+ "learning_rate": 5.568705274993584e-06,
4035
+ "loss": 0.0013,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.6829896907216495,
4040
+ "grad_norm": 0.2608482539653778,
4041
+ "learning_rate": 5.528402090294142e-06,
4042
+ "loss": 0.0013,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.6842783505154639,
4047
+ "grad_norm": 2.119140386581421,
4048
+ "learning_rate": 5.488189489473133e-06,
4049
+ "loss": 0.0264,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.6855670103092784,
4054
+ "grad_norm": 0.605993390083313,
4055
+ "learning_rate": 5.448068287141663e-06,
4056
+ "loss": 0.0025,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.6868556701030928,
4061
+ "grad_norm": 0.04999390244483948,
4062
+ "learning_rate": 5.4080392960593355e-06,
4063
+ "loss": 0.0003,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.6881443298969072,
4068
+ "grad_norm": 1.6986360549926758,
4069
+ "learning_rate": 5.368103327117768e-06,
4070
+ "loss": 0.0199,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.6894329896907216,
4075
+ "grad_norm": 1.8997451066970825,
4076
+ "learning_rate": 5.328261189324166e-06,
4077
+ "loss": 0.0236,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.6907216494845361,
4082
+ "grad_norm": 0.4543597102165222,
4083
+ "learning_rate": 5.288513689784951e-06,
4084
+ "loss": 0.0013,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.6920103092783505,
4089
+ "grad_norm": 0.3688147962093353,
4090
+ "learning_rate": 5.2488616336893915e-06,
4091
+ "loss": 0.001,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.6932989690721649,
4096
+ "grad_norm": 1.8557827472686768,
4097
+ "learning_rate": 5.209305824293307e-06,
4098
+ "loss": 0.0086,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.6945876288659794,
4103
+ "grad_norm": 2.0368287563323975,
4104
+ "learning_rate": 5.1698470629027845e-06,
4105
+ "loss": 0.0127,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.6958762886597938,
4110
+ "grad_norm": 1.7883585691452026,
4111
+ "learning_rate": 5.130486148857952e-06,
4112
+ "loss": 0.0293,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.6958762886597938,
4117
+ "eval_accuracy": 0.997020854021847,
4118
+ "eval_f1": 0.9473684210526315,
4119
+ "eval_loss": 0.014807779341936111,
4120
+ "eval_precision": 0.9473684210526315,
4121
+ "eval_recall": 0.9473684210526315,
4122
+ "eval_runtime": 84.982,
4123
+ "eval_samples_per_second": 5.354,
4124
+ "eval_steps_per_second": 0.177,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.6971649484536082,
4129
+ "grad_norm": 1.6709312200546265,
4130
+ "learning_rate": 5.0912238795167845e-06,
4131
+ "loss": 0.022,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.6984536082474226,
4136
+ "grad_norm": 0.772537350654602,
4137
+ "learning_rate": 5.05206105023895e-06,
4138
+ "loss": 0.0029,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.6997422680412371,
4143
+ "grad_norm": 4.051438331604004,
4144
+ "learning_rate": 5.012998454369701e-06,
4145
+ "loss": 0.038,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.7010309278350515,
4150
+ "grad_norm": 1.2733999490737915,
4151
+ "learning_rate": 4.974036883223798e-06,
4152
+ "loss": 0.0065,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.7023195876288659,
4157
+ "grad_norm": 0.21695715188980103,
4158
+ "learning_rate": 4.935177126069485e-06,
4159
+ "loss": 0.0006,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.7036082474226805,
4164
+ "grad_norm": 0.9881150722503662,
4165
+ "learning_rate": 4.896419970112499e-06,
4166
+ "loss": 0.0061,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.7048969072164949,
4171
+ "grad_norm": 0.4101882576942444,
4172
+ "learning_rate": 4.857766200480115e-06,
4173
+ "loss": 0.0012,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.7061855670103093,
4178
+ "grad_norm": 0.4901997745037079,
4179
+ "learning_rate": 4.819216600205254e-06,
4180
+ "loss": 0.0022,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.7074742268041238,
4185
+ "grad_norm": 1.6338658332824707,
4186
+ "learning_rate": 4.780771950210616e-06,
4187
+ "loss": 0.0074,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.7087628865979382,
4192
+ "grad_norm": 0.9421409964561462,
4193
+ "learning_rate": 4.742433029292856e-06,
4194
+ "loss": 0.0023,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.7100515463917526,
4199
+ "grad_norm": 0.20757536590099335,
4200
+ "learning_rate": 4.704200614106813e-06,
4201
+ "loss": 0.0012,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.711340206185567,
4206
+ "grad_norm": 2.018266201019287,
4207
+ "learning_rate": 4.6660754791497755e-06,
4208
+ "loss": 0.0096,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.7126288659793815,
4213
+ "grad_norm": 2.6476552486419678,
4214
+ "learning_rate": 4.628058396745787e-06,
4215
+ "loss": 0.0053,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.7139175257731959,
4220
+ "grad_norm": 1.7703890800476074,
4221
+ "learning_rate": 4.590150137030009e-06,
4222
+ "loss": 0.0071,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.7152061855670103,
4227
+ "grad_norm": 1.2769412994384766,
4228
+ "learning_rate": 4.552351467933115e-06,
4229
+ "loss": 0.0036,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.7164948453608248,
4234
+ "grad_norm": 1.8354310989379883,
4235
+ "learning_rate": 4.514663155165731e-06,
4236
+ "loss": 0.008,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.7177835051546392,
4241
+ "grad_norm": 0.896404504776001,
4242
+ "learning_rate": 4.477085962202931e-06,
4243
+ "loss": 0.0028,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.7190721649484536,
4248
+ "grad_norm": 0.33429154753685,
4249
+ "learning_rate": 4.439620650268771e-06,
4250
+ "loss": 0.0013,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.720360824742268,
4255
+ "grad_norm": 1.1864862442016602,
4256
+ "learning_rate": 4.402267978320854e-06,
4257
+ "loss": 0.0035,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.7216494845360825,
4262
+ "grad_norm": 2.4220573902130127,
4263
+ "learning_rate": 4.365028703034976e-06,
4264
+ "loss": 0.0133,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.7216494845360825,
4269
+ "eval_accuracy": 0.997020854021847,
4270
+ "eval_f1": 0.9473684210526315,
4271
+ "eval_loss": 0.013733865693211555,
4272
+ "eval_precision": 0.9473684210526315,
4273
+ "eval_recall": 0.9473684210526315,
4274
+ "eval_runtime": 85.2875,
4275
+ "eval_samples_per_second": 5.335,
4276
+ "eval_steps_per_second": 0.176,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.7229381443298969,
4281
+ "grad_norm": 2.1442863941192627,
4282
+ "learning_rate": 4.327903578789785e-06,
4283
+ "loss": 0.0307,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.7242268041237113,
4288
+ "grad_norm": 1.1676955223083496,
4289
+ "learning_rate": 4.290893357651502e-06,
4290
+ "loss": 0.002,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.7255154639175257,
4295
+ "grad_norm": 1.461906909942627,
4296
+ "learning_rate": 4.253998789358683e-06,
4297
+ "loss": 0.0105,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.7268041237113402,
4302
+ "grad_norm": 2.029210090637207,
4303
+ "learning_rate": 4.217220621307043e-06,
4304
+ "loss": 0.0066,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.7280927835051546,
4309
+ "grad_norm": 0.26991185545921326,
4310
+ "learning_rate": 4.180559598534297e-06,
4311
+ "loss": 0.0009,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.729381443298969,
4316
+ "grad_norm": 2.1972944736480713,
4317
+ "learning_rate": 4.144016463705081e-06,
4318
+ "loss": 0.0074,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.7306701030927835,
4323
+ "grad_norm": 1.7855631113052368,
4324
+ "learning_rate": 4.107591957095903e-06,
4325
+ "loss": 0.0234,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.7319587628865979,
4330
+ "grad_norm": 0.13372205197811127,
4331
+ "learning_rate": 4.071286816580142e-06,
4332
+ "loss": 0.0011,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.7332474226804123,
4337
+ "grad_norm": 0.3758986294269562,
4338
+ "learning_rate": 4.035101777613113e-06,
4339
+ "loss": 0.0009,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.7345360824742269,
4344
+ "grad_norm": 4.052021026611328,
4345
+ "learning_rate": 3.999037573217157e-06,
4346
+ "loss": 0.031,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.7358247422680413,
4351
+ "grad_norm": 3.024075508117676,
4352
+ "learning_rate": 3.963094933966797e-06,
4353
+ "loss": 0.0191,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.7371134020618557,
4358
+ "grad_norm": 0.10660507529973984,
4359
+ "learning_rate": 3.927274587973935e-06,
4360
+ "loss": 0.0004,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.7384020618556701,
4365
+ "grad_norm": 0.7237541079521179,
4366
+ "learning_rate": 3.8915772608731066e-06,
4367
+ "loss": 0.0015,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.7396907216494846,
4372
+ "grad_norm": 3.8813493251800537,
4373
+ "learning_rate": 3.856003675806777e-06,
4374
+ "loss": 0.0142,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.740979381443299,
4379
+ "grad_norm": 0.08904914557933807,
4380
+ "learning_rate": 3.820554553410693e-06,
4381
+ "loss": 0.0006,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.7422680412371134,
4386
+ "grad_norm": 2.8645918369293213,
4387
+ "learning_rate": 3.78523061179929e-06,
4388
+ "loss": 0.0151,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.7435567010309279,
4393
+ "grad_norm": 0.8430268168449402,
4394
+ "learning_rate": 3.7500325665511337e-06,
4395
+ "loss": 0.0031,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.7448453608247423,
4400
+ "grad_norm": 0.0855301171541214,
4401
+ "learning_rate": 3.7149611306944356e-06,
4402
+ "loss": 0.0007,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.7461340206185567,
4407
+ "grad_norm": 1.7717701196670532,
4408
+ "learning_rate": 3.680017014692604e-06,
4409
+ "loss": 0.0075,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.7474226804123711,
4414
+ "grad_norm": 1.216423749923706,
4415
+ "learning_rate": 3.645200926429844e-06,
4416
+ "loss": 0.0028,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.7474226804123711,
4421
+ "eval_accuracy": 0.9980139026812314,
4422
+ "eval_f1": 0.9642857142857143,
4423
+ "eval_loss": 0.014089370146393776,
4424
+ "eval_precision": 0.9818181818181818,
4425
+ "eval_recall": 0.9473684210526315,
4426
+ "eval_runtime": 85.7572,
4427
+ "eval_samples_per_second": 5.306,
4428
+ "eval_steps_per_second": 0.175,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.7487113402061856,
4433
+ "grad_norm": 0.18626463413238525,
4434
+ "learning_rate": 3.610513571196832e-06,
4435
+ "loss": 0.0008,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.75,
4440
+ "grad_norm": 0.16629698872566223,
4441
+ "learning_rate": 3.5759556516764205e-06,
4442
+ "loss": 0.001,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.7512886597938144,
4447
+ "grad_norm": 0.0471065454185009,
4448
+ "learning_rate": 3.541527867929403e-06,
4449
+ "loss": 0.0003,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.7525773195876289,
4454
+ "grad_norm": 0.925058901309967,
4455
+ "learning_rate": 3.507230917380332e-06,
4456
+ "loss": 0.0022,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.7538659793814433,
4461
+ "grad_norm": 5.241347312927246,
4462
+ "learning_rate": 3.4730654948033957e-06,
4463
+ "loss": 0.0038,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.7551546391752577,
4468
+ "grad_norm": 5.135495662689209,
4469
+ "learning_rate": 3.4390322923083385e-06,
4470
+ "loss": 0.0154,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.7564432989690721,
4475
+ "grad_norm": 0.30281150341033936,
4476
+ "learning_rate": 3.4051319993264397e-06,
4477
+ "loss": 0.0009,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.7577319587628866,
4482
+ "grad_norm": 0.08247953653335571,
4483
+ "learning_rate": 3.3713653025965544e-06,
4484
+ "loss": 0.0006,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.759020618556701,
4489
+ "grad_norm": 0.1323813498020172,
4490
+ "learning_rate": 3.3377328861511927e-06,
4491
+ "loss": 0.0005,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.7603092783505154,
4496
+ "grad_norm": 1.5231373310089111,
4497
+ "learning_rate": 3.3042354313026702e-06,
4498
+ "loss": 0.0051,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.7615979381443299,
4503
+ "grad_norm": 0.08996398001909256,
4504
+ "learning_rate": 3.2708736166293064e-06,
4505
+ "loss": 0.0004,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.7628865979381443,
4510
+ "grad_norm": 0.5507305264472961,
4511
+ "learning_rate": 3.237648117961665e-06,
4512
+ "loss": 0.001,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.7641752577319587,
4517
+ "grad_norm": 3.912440061569214,
4518
+ "learning_rate": 3.2045596083688814e-06,
4519
+ "loss": 0.0169,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.7654639175257731,
4524
+ "grad_norm": 1.7454997301101685,
4525
+ "learning_rate": 3.1716087581450193e-06,
4526
+ "loss": 0.0051,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.7667525773195877,
4531
+ "grad_norm": 2.3474819660186768,
4532
+ "learning_rate": 3.1387962347954936e-06,
4533
+ "loss": 0.0101,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.7680412371134021,
4538
+ "grad_norm": 0.4886447787284851,
4539
+ "learning_rate": 3.1061227030235442e-06,
4540
+ "loss": 0.0009,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.7693298969072165,
4545
+ "grad_norm": 2.3838088512420654,
4546
+ "learning_rate": 3.073588824716777e-06,
4547
+ "loss": 0.0057,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.770618556701031,
4552
+ "grad_norm": 0.4210747480392456,
4553
+ "learning_rate": 3.041195258933749e-06,
4554
+ "loss": 0.0026,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.7719072164948454,
4559
+ "grad_norm": 1.517642855644226,
4560
+ "learning_rate": 3.008942661890627e-06,
4561
+ "loss": 0.0033,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.7731958762886598,
4566
+ "grad_norm": 0.5007296800613403,
4567
+ "learning_rate": 2.976831686947884e-06,
4568
+ "loss": 0.0012,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.7731958762886598,
4573
+ "eval_accuracy": 0.9980139026812314,
4574
+ "eval_f1": 0.9642857142857143,
4575
+ "eval_loss": 0.01421260554343462,
4576
+ "eval_precision": 0.9818181818181818,
4577
+ "eval_recall": 0.9473684210526315,
4578
+ "eval_runtime": 85.1923,
4579
+ "eval_samples_per_second": 5.341,
4580
+ "eval_steps_per_second": 0.176,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 2.004089890144256e+17,
4602
  "train_batch_size": 8,
4603
  "trial_name": null,
4604
  "trial_params": null