mtzig commited on
Commit
eae153f
·
verified ·
1 Parent(s): e0631a2

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec0b23fb29b3168d711126e63c390cefd28562954b8b8ef4840f478aa2aec88c
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d887d13337c308c5ca733d6323a9864415d8d7fdb689e7054b8b65fc58af94a6
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:719e2f980220aec49e7260281d780c4b705e08e44da393275851a1c0ada4a677
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b06daf412411fb1721690a1827cb6e1038a59c2d2dc7e7999d0561604a9799f2
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9348a9304f5360f4c21ad45a44bbc0168ae2b80712245674a9a769f6eb7aa152
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7aff80990d5d3dea0a8ba059b313ef8993ab8cddb3b3079270f3d150397961f
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9e3bcd9ed5fbe41d6c66f117fd9b6d9de2950c64c39151e84cf8161db4402f2
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b2cfeefe6c41b4091cb2d5b0fdfa5ff9456b7db0f81ce256bdc2e5c5cfd9de5
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b902873a146629a1ece23ec06ea7f89258be805f622edd3f56bb4e27b370d1f
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6e7f5bf95c7024205a8119ce9bcf03a4fdd51aae9f436a0a35acb375bb3884b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90be759819f75fd58133ca2a31ffb7e3abdb5e55026b34e76783d4cbdb7645cc
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b48bfb689ef000812053e33e2dc970f52c5dd0b8bea9cda6777c2c298951a80
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bac79d5ff2444d45cca1c402876704174077427c6b7d2902ab84bdd3aeb6a4c1
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:516752f1fbf9ee4f98d0697eb89075aa40a9ed64fdd213cf582c9b1a40cf8d2d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36fa53207112e96cf1d931008a46af86708bec88e31fc02618c631b73b238844
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b0bde13b911b7337906544cf9c5bd2107b176fc7783ebbbad8f40ece936477
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:002fb55933219d3afc15cf13593cee3b4cc68a24a920a24f43ed82f5a081cc35
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cedc08212b07c1673536f40c11159242b59a9f0aadf57db9bad6f13ac81a6af
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af32edbfbf253ca5324b65d305f359aaf2d7238a6c9110be03e0839d25660469
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cbab2b6a44c6ac01a6a8e4c5859b2a55eaebd5654b8efdeeaf69b4dbf690320
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.3382949932341001,
5
  "eval_steps": 20,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3819,6 +3819,766 @@
3819
  "eval_samples_per_second": 5.626,
3820
  "eval_steps_per_second": 0.189,
3821
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3822
  }
3823
  ],
3824
  "logging_steps": 1,
@@ -3838,7 +4598,7 @@
3838
  "attributes": {}
3839
  }
3840
  },
3841
- "total_flos": 1.513912347435991e+17,
3842
  "train_batch_size": 8,
3843
  "trial_name": null,
3844
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4059539918809202,
5
  "eval_steps": 20,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3819
  "eval_samples_per_second": 5.626,
3820
  "eval_steps_per_second": 0.189,
3821
  "step": 500
3822
+ },
3823
+ {
3824
+ "epoch": 0.33897158322056836,
3825
+ "grad_norm": 4.322254180908203,
3826
+ "learning_rate": 1.672051050494526e-05,
3827
+ "loss": 0.2227,
3828
+ "step": 501
3829
+ },
3830
+ {
3831
+ "epoch": 0.33964817320703655,
3832
+ "grad_norm": 3.6232991218566895,
3833
+ "learning_rate": 1.67030003102464e-05,
3834
+ "loss": 0.1609,
3835
+ "step": 502
3836
+ },
3837
+ {
3838
+ "epoch": 0.34032476319350474,
3839
+ "grad_norm": 6.037874221801758,
3840
+ "learning_rate": 1.6685452716072946e-05,
3841
+ "loss": 0.144,
3842
+ "step": 503
3843
+ },
3844
+ {
3845
+ "epoch": 0.34100135317997293,
3846
+ "grad_norm": 3.3179101943969727,
3847
+ "learning_rate": 1.6667867820331927e-05,
3848
+ "loss": 0.1325,
3849
+ "step": 504
3850
+ },
3851
+ {
3852
+ "epoch": 0.3416779431664411,
3853
+ "grad_norm": 3.1885428428649902,
3854
+ "learning_rate": 1.6650245721138483e-05,
3855
+ "loss": 0.1493,
3856
+ "step": 505
3857
+ },
3858
+ {
3859
+ "epoch": 0.3423545331529093,
3860
+ "grad_norm": 3.5949137210845947,
3861
+ "learning_rate": 1.6632586516815346e-05,
3862
+ "loss": 0.1273,
3863
+ "step": 506
3864
+ },
3865
+ {
3866
+ "epoch": 0.34303112313937756,
3867
+ "grad_norm": 2.8679418563842773,
3868
+ "learning_rate": 1.6614890305892266e-05,
3869
+ "loss": 0.0887,
3870
+ "step": 507
3871
+ },
3872
+ {
3873
+ "epoch": 0.34370771312584575,
3874
+ "grad_norm": 2.384528160095215,
3875
+ "learning_rate": 1.6597157187105475e-05,
3876
+ "loss": 0.0974,
3877
+ "step": 508
3878
+ },
3879
+ {
3880
+ "epoch": 0.34438430311231394,
3881
+ "grad_norm": 3.2372498512268066,
3882
+ "learning_rate": 1.657938725939713e-05,
3883
+ "loss": 0.1175,
3884
+ "step": 509
3885
+ },
3886
+ {
3887
+ "epoch": 0.34506089309878213,
3888
+ "grad_norm": 2.4635872840881348,
3889
+ "learning_rate": 1.6561580621914764e-05,
3890
+ "loss": 0.0602,
3891
+ "step": 510
3892
+ },
3893
+ {
3894
+ "epoch": 0.3457374830852503,
3895
+ "grad_norm": 4.7463531494140625,
3896
+ "learning_rate": 1.6543737374010742e-05,
3897
+ "loss": 0.1404,
3898
+ "step": 511
3899
+ },
3900
+ {
3901
+ "epoch": 0.34641407307171856,
3902
+ "grad_norm": 3.910125255584717,
3903
+ "learning_rate": 1.6525857615241686e-05,
3904
+ "loss": 0.1732,
3905
+ "step": 512
3906
+ },
3907
+ {
3908
+ "epoch": 0.34709066305818675,
3909
+ "grad_norm": 3.2249362468719482,
3910
+ "learning_rate": 1.6507941445367935e-05,
3911
+ "loss": 0.1706,
3912
+ "step": 513
3913
+ },
3914
+ {
3915
+ "epoch": 0.34776725304465494,
3916
+ "grad_norm": 3.5670406818389893,
3917
+ "learning_rate": 1.648998896435299e-05,
3918
+ "loss": 0.1288,
3919
+ "step": 514
3920
+ },
3921
+ {
3922
+ "epoch": 0.34844384303112313,
3923
+ "grad_norm": 2.954425096511841,
3924
+ "learning_rate": 1.6472000272362937e-05,
3925
+ "loss": 0.1691,
3926
+ "step": 515
3927
+ },
3928
+ {
3929
+ "epoch": 0.3491204330175913,
3930
+ "grad_norm": 3.0924575328826904,
3931
+ "learning_rate": 1.6453975469765913e-05,
3932
+ "loss": 0.1445,
3933
+ "step": 516
3934
+ },
3935
+ {
3936
+ "epoch": 0.3497970230040595,
3937
+ "grad_norm": 3.242204427719116,
3938
+ "learning_rate": 1.643591465713153e-05,
3939
+ "loss": 0.113,
3940
+ "step": 517
3941
+ },
3942
+ {
3943
+ "epoch": 0.35047361299052776,
3944
+ "grad_norm": 3.513796806335449,
3945
+ "learning_rate": 1.6417817935230318e-05,
3946
+ "loss": 0.1342,
3947
+ "step": 518
3948
+ },
3949
+ {
3950
+ "epoch": 0.35115020297699595,
3951
+ "grad_norm": 3.459606409072876,
3952
+ "learning_rate": 1.6399685405033168e-05,
3953
+ "loss": 0.167,
3954
+ "step": 519
3955
+ },
3956
+ {
3957
+ "epoch": 0.35182679296346414,
3958
+ "grad_norm": 3.4279625415802,
3959
+ "learning_rate": 1.6381517167710757e-05,
3960
+ "loss": 0.1466,
3961
+ "step": 520
3962
+ },
3963
+ {
3964
+ "epoch": 0.35182679296346414,
3965
+ "eval_accuracy": 0.8027522935779816,
3966
+ "eval_f1": 0.48687350835322196,
3967
+ "eval_loss": 0.4396270513534546,
3968
+ "eval_precision": 0.8429752066115702,
3969
+ "eval_recall": 0.3422818791946309,
3970
+ "eval_runtime": 53.1809,
3971
+ "eval_samples_per_second": 5.604,
3972
+ "eval_steps_per_second": 0.188,
3973
+ "step": 520
3974
+ },
3975
+ {
3976
+ "epoch": 0.35250338294993233,
3977
+ "grad_norm": 3.181802988052368,
3978
+ "learning_rate": 1.6363313324632995e-05,
3979
+ "loss": 0.1381,
3980
+ "step": 521
3981
+ },
3982
+ {
3983
+ "epoch": 0.3531799729364005,
3984
+ "grad_norm": 2.620626449584961,
3985
+ "learning_rate": 1.6345073977368455e-05,
3986
+ "loss": 0.1523,
3987
+ "step": 522
3988
+ },
3989
+ {
3990
+ "epoch": 0.35385656292286877,
3991
+ "grad_norm": 4.116923809051514,
3992
+ "learning_rate": 1.6326799227683806e-05,
3993
+ "loss": 0.0602,
3994
+ "step": 523
3995
+ },
3996
+ {
3997
+ "epoch": 0.35453315290933696,
3998
+ "grad_norm": 3.4836175441741943,
3999
+ "learning_rate": 1.630848917754324e-05,
4000
+ "loss": 0.0969,
4001
+ "step": 524
4002
+ },
4003
+ {
4004
+ "epoch": 0.35520974289580515,
4005
+ "grad_norm": 3.9089815616607666,
4006
+ "learning_rate": 1.629014392910791e-05,
4007
+ "loss": 0.1509,
4008
+ "step": 525
4009
+ },
4010
+ {
4011
+ "epoch": 0.35588633288227334,
4012
+ "grad_norm": 3.1335699558258057,
4013
+ "learning_rate": 1.6271763584735373e-05,
4014
+ "loss": 0.1366,
4015
+ "step": 526
4016
+ },
4017
+ {
4018
+ "epoch": 0.3565629228687415,
4019
+ "grad_norm": 3.3636960983276367,
4020
+ "learning_rate": 1.625334824697898e-05,
4021
+ "loss": 0.1233,
4022
+ "step": 527
4023
+ },
4024
+ {
4025
+ "epoch": 0.3572395128552097,
4026
+ "grad_norm": 3.7551486492156982,
4027
+ "learning_rate": 1.6234898018587336e-05,
4028
+ "loss": 0.1541,
4029
+ "step": 528
4030
+ },
4031
+ {
4032
+ "epoch": 0.35791610284167796,
4033
+ "grad_norm": 6.929388046264648,
4034
+ "learning_rate": 1.6216413002503736e-05,
4035
+ "loss": 0.2057,
4036
+ "step": 529
4037
+ },
4038
+ {
4039
+ "epoch": 0.35859269282814615,
4040
+ "grad_norm": 4.0752763748168945,
4041
+ "learning_rate": 1.619789330186555e-05,
4042
+ "loss": 0.1008,
4043
+ "step": 530
4044
+ },
4045
+ {
4046
+ "epoch": 0.35926928281461434,
4047
+ "grad_norm": 3.1588234901428223,
4048
+ "learning_rate": 1.6179339020003685e-05,
4049
+ "loss": 0.1454,
4050
+ "step": 531
4051
+ },
4052
+ {
4053
+ "epoch": 0.35994587280108253,
4054
+ "grad_norm": 6.536987781524658,
4055
+ "learning_rate": 1.616075026044199e-05,
4056
+ "loss": 0.1461,
4057
+ "step": 532
4058
+ },
4059
+ {
4060
+ "epoch": 0.3606224627875507,
4061
+ "grad_norm": 3.1867458820343018,
4062
+ "learning_rate": 1.6142127126896682e-05,
4063
+ "loss": 0.182,
4064
+ "step": 533
4065
+ },
4066
+ {
4067
+ "epoch": 0.36129905277401897,
4068
+ "grad_norm": 3.9853105545043945,
4069
+ "learning_rate": 1.6123469723275766e-05,
4070
+ "loss": 0.1525,
4071
+ "step": 534
4072
+ },
4073
+ {
4074
+ "epoch": 0.36197564276048716,
4075
+ "grad_norm": 2.4770116806030273,
4076
+ "learning_rate": 1.6104778153678467e-05,
4077
+ "loss": 0.1789,
4078
+ "step": 535
4079
+ },
4080
+ {
4081
+ "epoch": 0.36265223274695535,
4082
+ "grad_norm": 4.895524024963379,
4083
+ "learning_rate": 1.6086052522394625e-05,
4084
+ "loss": 0.1909,
4085
+ "step": 536
4086
+ },
4087
+ {
4088
+ "epoch": 0.36332882273342354,
4089
+ "grad_norm": 7.819604873657227,
4090
+ "learning_rate": 1.6067292933904144e-05,
4091
+ "loss": 0.1793,
4092
+ "step": 537
4093
+ },
4094
+ {
4095
+ "epoch": 0.36400541271989173,
4096
+ "grad_norm": 5.251774311065674,
4097
+ "learning_rate": 1.6048499492876378e-05,
4098
+ "loss": 0.1829,
4099
+ "step": 538
4100
+ },
4101
+ {
4102
+ "epoch": 0.3646820027063599,
4103
+ "grad_norm": 5.605532646179199,
4104
+ "learning_rate": 1.602967230416957e-05,
4105
+ "loss": 0.273,
4106
+ "step": 539
4107
+ },
4108
+ {
4109
+ "epoch": 0.36535859269282817,
4110
+ "grad_norm": 3.302903175354004,
4111
+ "learning_rate": 1.6010811472830253e-05,
4112
+ "loss": 0.1608,
4113
+ "step": 540
4114
+ },
4115
+ {
4116
+ "epoch": 0.36535859269282817,
4117
+ "eval_accuracy": 0.8165137614678899,
4118
+ "eval_f1": 0.5412844036697247,
4119
+ "eval_loss": 0.4029388427734375,
4120
+ "eval_precision": 0.855072463768116,
4121
+ "eval_recall": 0.3959731543624161,
4122
+ "eval_runtime": 54.3008,
4123
+ "eval_samples_per_second": 5.488,
4124
+ "eval_steps_per_second": 0.184,
4125
+ "step": 540
4126
+ },
4127
+ {
4128
+ "epoch": 0.36603518267929636,
4129
+ "grad_norm": 3.057288408279419,
4130
+ "learning_rate": 1.5991917104092677e-05,
4131
+ "loss": 0.1671,
4132
+ "step": 541
4133
+ },
4134
+ {
4135
+ "epoch": 0.36671177266576455,
4136
+ "grad_norm": 4.837218761444092,
4137
+ "learning_rate": 1.5972989303378207e-05,
4138
+ "loss": 0.1425,
4139
+ "step": 542
4140
+ },
4141
+ {
4142
+ "epoch": 0.36738836265223274,
4143
+ "grad_norm": 2.922201633453369,
4144
+ "learning_rate": 1.595402817629475e-05,
4145
+ "loss": 0.2097,
4146
+ "step": 543
4147
+ },
4148
+ {
4149
+ "epoch": 0.3680649526387009,
4150
+ "grad_norm": 8.20699691772461,
4151
+ "learning_rate": 1.593503382863615e-05,
4152
+ "loss": 0.1657,
4153
+ "step": 544
4154
+ },
4155
+ {
4156
+ "epoch": 0.36874154262516917,
4157
+ "grad_norm": 3.043370246887207,
4158
+ "learning_rate": 1.591600636638161e-05,
4159
+ "loss": 0.1568,
4160
+ "step": 545
4161
+ },
4162
+ {
4163
+ "epoch": 0.36941813261163736,
4164
+ "grad_norm": 6.523357391357422,
4165
+ "learning_rate": 1.589694589569509e-05,
4166
+ "loss": 0.1299,
4167
+ "step": 546
4168
+ },
4169
+ {
4170
+ "epoch": 0.37009472259810555,
4171
+ "grad_norm": 3.4266302585601807,
4172
+ "learning_rate": 1.5877852522924733e-05,
4173
+ "loss": 0.1608,
4174
+ "step": 547
4175
+ },
4176
+ {
4177
+ "epoch": 0.37077131258457374,
4178
+ "grad_norm": 4.111809253692627,
4179
+ "learning_rate": 1.5858726354602248e-05,
4180
+ "loss": 0.1975,
4181
+ "step": 548
4182
+ },
4183
+ {
4184
+ "epoch": 0.37144790257104193,
4185
+ "grad_norm": 3.3651816844940186,
4186
+ "learning_rate": 1.5839567497442338e-05,
4187
+ "loss": 0.171,
4188
+ "step": 549
4189
+ },
4190
+ {
4191
+ "epoch": 0.3721244925575101,
4192
+ "grad_norm": 3.0030105113983154,
4193
+ "learning_rate": 1.5820376058342077e-05,
4194
+ "loss": 0.1365,
4195
+ "step": 550
4196
+ },
4197
+ {
4198
+ "epoch": 0.37280108254397837,
4199
+ "grad_norm": 3.5873923301696777,
4200
+ "learning_rate": 1.5801152144380353e-05,
4201
+ "loss": 0.1745,
4202
+ "step": 551
4203
+ },
4204
+ {
4205
+ "epoch": 0.37347767253044656,
4206
+ "grad_norm": 3.0994861125946045,
4207
+ "learning_rate": 1.578189586281723e-05,
4208
+ "loss": 0.1407,
4209
+ "step": 552
4210
+ },
4211
+ {
4212
+ "epoch": 0.37415426251691475,
4213
+ "grad_norm": 2.788184642791748,
4214
+ "learning_rate": 1.5762607321093368e-05,
4215
+ "loss": 0.134,
4216
+ "step": 553
4217
+ },
4218
+ {
4219
+ "epoch": 0.37483085250338294,
4220
+ "grad_norm": 2.5775389671325684,
4221
+ "learning_rate": 1.5743286626829437e-05,
4222
+ "loss": 0.1075,
4223
+ "step": 554
4224
+ },
4225
+ {
4226
+ "epoch": 0.37550744248985113,
4227
+ "grad_norm": 3.5334537029266357,
4228
+ "learning_rate": 1.5723933887825492e-05,
4229
+ "loss": 0.1165,
4230
+ "step": 555
4231
+ },
4232
+ {
4233
+ "epoch": 0.3761840324763194,
4234
+ "grad_norm": 3.544222116470337,
4235
+ "learning_rate": 1.5704549212060383e-05,
4236
+ "loss": 0.1739,
4237
+ "step": 556
4238
+ },
4239
+ {
4240
+ "epoch": 0.37686062246278756,
4241
+ "grad_norm": 3.89497709274292,
4242
+ "learning_rate": 1.568513270769115e-05,
4243
+ "loss": 0.1549,
4244
+ "step": 557
4245
+ },
4246
+ {
4247
+ "epoch": 0.37753721244925575,
4248
+ "grad_norm": 2.996244192123413,
4249
+ "learning_rate": 1.5665684483052425e-05,
4250
+ "loss": 0.1742,
4251
+ "step": 558
4252
+ },
4253
+ {
4254
+ "epoch": 0.37821380243572394,
4255
+ "grad_norm": 4.149686336517334,
4256
+ "learning_rate": 1.564620464665582e-05,
4257
+ "loss": 0.1655,
4258
+ "step": 559
4259
+ },
4260
+ {
4261
+ "epoch": 0.37889039242219213,
4262
+ "grad_norm": 3.9218225479125977,
4263
+ "learning_rate": 1.5626693307189334e-05,
4264
+ "loss": 0.1472,
4265
+ "step": 560
4266
+ },
4267
+ {
4268
+ "epoch": 0.37889039242219213,
4269
+ "eval_accuracy": 0.8064220183486238,
4270
+ "eval_f1": 0.5104408352668214,
4271
+ "eval_loss": 0.44546324014663696,
4272
+ "eval_precision": 0.8270676691729323,
4273
+ "eval_recall": 0.3691275167785235,
4274
+ "eval_runtime": 52.4956,
4275
+ "eval_samples_per_second": 5.677,
4276
+ "eval_steps_per_second": 0.19,
4277
+ "step": 560
4278
+ },
4279
+ {
4280
+ "epoch": 0.3795669824086603,
4281
+ "grad_norm": 7.877570629119873,
4282
+ "learning_rate": 1.560715057351673e-05,
4283
+ "loss": 0.0964,
4284
+ "step": 561
4285
+ },
4286
+ {
4287
+ "epoch": 0.38024357239512857,
4288
+ "grad_norm": 3.73523211479187,
4289
+ "learning_rate": 1.5587576554676927e-05,
4290
+ "loss": 0.1444,
4291
+ "step": 562
4292
+ },
4293
+ {
4294
+ "epoch": 0.38092016238159676,
4295
+ "grad_norm": 10.825215339660645,
4296
+ "learning_rate": 1.556797135988342e-05,
4297
+ "loss": 0.151,
4298
+ "step": 563
4299
+ },
4300
+ {
4301
+ "epoch": 0.38159675236806495,
4302
+ "grad_norm": 6.863844871520996,
4303
+ "learning_rate": 1.5548335098523634e-05,
4304
+ "loss": 0.2423,
4305
+ "step": 564
4306
+ },
4307
+ {
4308
+ "epoch": 0.38227334235453314,
4309
+ "grad_norm": 3.01707124710083,
4310
+ "learning_rate": 1.5528667880158338e-05,
4311
+ "loss": 0.1483,
4312
+ "step": 565
4313
+ },
4314
+ {
4315
+ "epoch": 0.38294993234100133,
4316
+ "grad_norm": 3.50577712059021,
4317
+ "learning_rate": 1.5508969814521026e-05,
4318
+ "loss": 0.1359,
4319
+ "step": 566
4320
+ },
4321
+ {
4322
+ "epoch": 0.3836265223274696,
4323
+ "grad_norm": 3.558225631713867,
4324
+ "learning_rate": 1.5489241011517303e-05,
4325
+ "loss": 0.0951,
4326
+ "step": 567
4327
+ },
4328
+ {
4329
+ "epoch": 0.38430311231393777,
4330
+ "grad_norm": 7.069665431976318,
4331
+ "learning_rate": 1.5469481581224274e-05,
4332
+ "loss": 0.0979,
4333
+ "step": 568
4334
+ },
4335
+ {
4336
+ "epoch": 0.38497970230040596,
4337
+ "grad_norm": 4.208998680114746,
4338
+ "learning_rate": 1.5449691633889924e-05,
4339
+ "loss": 0.1451,
4340
+ "step": 569
4341
+ },
4342
+ {
4343
+ "epoch": 0.38565629228687415,
4344
+ "grad_norm": 3.042346477508545,
4345
+ "learning_rate": 1.5429871279932514e-05,
4346
+ "loss": 0.1555,
4347
+ "step": 570
4348
+ },
4349
+ {
4350
+ "epoch": 0.38633288227334234,
4351
+ "grad_norm": 6.0393595695495605,
4352
+ "learning_rate": 1.5410020629939966e-05,
4353
+ "loss": 0.1965,
4354
+ "step": 571
4355
+ },
4356
+ {
4357
+ "epoch": 0.3870094722598105,
4358
+ "grad_norm": 3.022724151611328,
4359
+ "learning_rate": 1.5390139794669225e-05,
4360
+ "loss": 0.1219,
4361
+ "step": 572
4362
+ },
4363
+ {
4364
+ "epoch": 0.3876860622462788,
4365
+ "grad_norm": 2.8513102531433105,
4366
+ "learning_rate": 1.5370228885045662e-05,
4367
+ "loss": 0.1634,
4368
+ "step": 573
4369
+ },
4370
+ {
4371
+ "epoch": 0.38836265223274696,
4372
+ "grad_norm": 3.42635440826416,
4373
+ "learning_rate": 1.535028801216245e-05,
4374
+ "loss": 0.1874,
4375
+ "step": 574
4376
+ },
4377
+ {
4378
+ "epoch": 0.38903924221921515,
4379
+ "grad_norm": 6.154781341552734,
4380
+ "learning_rate": 1.533031728727994e-05,
4381
+ "loss": 0.1979,
4382
+ "step": 575
4383
+ },
4384
+ {
4385
+ "epoch": 0.38971583220568334,
4386
+ "grad_norm": 4.227107524871826,
4387
+ "learning_rate": 1.531031682182504e-05,
4388
+ "loss": 0.2125,
4389
+ "step": 576
4390
+ },
4391
+ {
4392
+ "epoch": 0.39039242219215153,
4393
+ "grad_norm": 3.389040946960449,
4394
+ "learning_rate": 1.5290286727390604e-05,
4395
+ "loss": 0.1554,
4396
+ "step": 577
4397
+ },
4398
+ {
4399
+ "epoch": 0.3910690121786198,
4400
+ "grad_norm": 3.34859299659729,
4401
+ "learning_rate": 1.527022711573479e-05,
4402
+ "loss": 0.1859,
4403
+ "step": 578
4404
+ },
4405
+ {
4406
+ "epoch": 0.39174560216508797,
4407
+ "grad_norm": 2.8738038539886475,
4408
+ "learning_rate": 1.5250138098780456e-05,
4409
+ "loss": 0.105,
4410
+ "step": 579
4411
+ },
4412
+ {
4413
+ "epoch": 0.39242219215155616,
4414
+ "grad_norm": 3.0525081157684326,
4415
+ "learning_rate": 1.5230019788614527e-05,
4416
+ "loss": 0.1437,
4417
+ "step": 580
4418
+ },
4419
+ {
4420
+ "epoch": 0.39242219215155616,
4421
+ "eval_accuracy": 0.8036697247706422,
4422
+ "eval_f1": 0.4928909952606635,
4423
+ "eval_loss": 0.4023875296115875,
4424
+ "eval_precision": 0.8387096774193549,
4425
+ "eval_recall": 0.348993288590604,
4426
+ "eval_runtime": 53.2583,
4427
+ "eval_samples_per_second": 5.595,
4428
+ "eval_steps_per_second": 0.188,
4429
+ "step": 580
4430
+ },
4431
+ {
4432
+ "epoch": 0.39309878213802435,
4433
+ "grad_norm": 2.1808993816375732,
4434
+ "learning_rate": 1.5209872297487365e-05,
4435
+ "loss": 0.158,
4436
+ "step": 581
4437
+ },
4438
+ {
4439
+ "epoch": 0.39377537212449254,
4440
+ "grad_norm": 2.2843339443206787,
4441
+ "learning_rate": 1.5189695737812153e-05,
4442
+ "loss": 0.0944,
4443
+ "step": 582
4444
+ },
4445
+ {
4446
+ "epoch": 0.3944519621109608,
4447
+ "grad_norm": 3.0277621746063232,
4448
+ "learning_rate": 1.5169490222164255e-05,
4449
+ "loss": 0.1253,
4450
+ "step": 583
4451
+ },
4452
+ {
4453
+ "epoch": 0.395128552097429,
4454
+ "grad_norm": 3.6869609355926514,
4455
+ "learning_rate": 1.5149255863280607e-05,
4456
+ "loss": 0.1293,
4457
+ "step": 584
4458
+ },
4459
+ {
4460
+ "epoch": 0.39580514208389717,
4461
+ "grad_norm": 2.9757912158966064,
4462
+ "learning_rate": 1.5128992774059063e-05,
4463
+ "loss": 0.1344,
4464
+ "step": 585
4465
+ },
4466
+ {
4467
+ "epoch": 0.39648173207036536,
4468
+ "grad_norm": 3.4651436805725098,
4469
+ "learning_rate": 1.5108701067557787e-05,
4470
+ "loss": 0.1131,
4471
+ "step": 586
4472
+ },
4473
+ {
4474
+ "epoch": 0.39715832205683355,
4475
+ "grad_norm": 3.6160385608673096,
4476
+ "learning_rate": 1.5088380856994608e-05,
4477
+ "loss": 0.1679,
4478
+ "step": 587
4479
+ },
4480
+ {
4481
+ "epoch": 0.39783491204330174,
4482
+ "grad_norm": 2.459801197052002,
4483
+ "learning_rate": 1.50680322557464e-05,
4484
+ "loss": 0.0886,
4485
+ "step": 588
4486
+ },
4487
+ {
4488
+ "epoch": 0.39851150202977,
4489
+ "grad_norm": 3.2416129112243652,
4490
+ "learning_rate": 1.504765537734844e-05,
4491
+ "loss": 0.1534,
4492
+ "step": 589
4493
+ },
4494
+ {
4495
+ "epoch": 0.39918809201623817,
4496
+ "grad_norm": 2.9892609119415283,
4497
+ "learning_rate": 1.5027250335493771e-05,
4498
+ "loss": 0.0943,
4499
+ "step": 590
4500
+ },
4501
+ {
4502
+ "epoch": 0.39986468200270636,
4503
+ "grad_norm": 5.113293647766113,
4504
+ "learning_rate": 1.5006817244032589e-05,
4505
+ "loss": 0.1717,
4506
+ "step": 591
4507
+ },
4508
+ {
4509
+ "epoch": 0.40054127198917455,
4510
+ "grad_norm": 6.7918195724487305,
4511
+ "learning_rate": 1.4986356216971583e-05,
4512
+ "loss": 0.1747,
4513
+ "step": 592
4514
+ },
4515
+ {
4516
+ "epoch": 0.40121786197564274,
4517
+ "grad_norm": 3.726599931716919,
4518
+ "learning_rate": 1.4965867368473308e-05,
4519
+ "loss": 0.1416,
4520
+ "step": 593
4521
+ },
4522
+ {
4523
+ "epoch": 0.401894451962111,
4524
+ "grad_norm": 6.388960361480713,
4525
+ "learning_rate": 1.4945350812855555e-05,
4526
+ "loss": 0.1946,
4527
+ "step": 594
4528
+ },
4529
+ {
4530
+ "epoch": 0.4025710419485792,
4531
+ "grad_norm": 3.8928306102752686,
4532
+ "learning_rate": 1.4924806664590702e-05,
4533
+ "loss": 0.1622,
4534
+ "step": 595
4535
+ },
4536
+ {
4537
+ "epoch": 0.40324763193504737,
4538
+ "grad_norm": 3.4860315322875977,
4539
+ "learning_rate": 1.4904235038305084e-05,
4540
+ "loss": 0.1549,
4541
+ "step": 596
4542
+ },
4543
+ {
4544
+ "epoch": 0.40392422192151556,
4545
+ "grad_norm": 4.641502857208252,
4546
+ "learning_rate": 1.4883636048778347e-05,
4547
+ "loss": 0.1789,
4548
+ "step": 597
4549
+ },
4550
+ {
4551
+ "epoch": 0.40460081190798375,
4552
+ "grad_norm": 4.543725967407227,
4553
+ "learning_rate": 1.4863009810942814e-05,
4554
+ "loss": 0.1651,
4555
+ "step": 598
4556
+ },
4557
+ {
4558
+ "epoch": 0.40527740189445194,
4559
+ "grad_norm": 10.844779014587402,
4560
+ "learning_rate": 1.4842356439882841e-05,
4561
+ "loss": 0.2244,
4562
+ "step": 599
4563
+ },
4564
+ {
4565
+ "epoch": 0.4059539918809202,
4566
+ "grad_norm": 7.078522205352783,
4567
+ "learning_rate": 1.4821676050834166e-05,
4568
+ "loss": 0.2055,
4569
+ "step": 600
4570
+ },
4571
+ {
4572
+ "epoch": 0.4059539918809202,
4573
+ "eval_accuracy": 0.8229357798165138,
4574
+ "eval_f1": 0.5758241758241758,
4575
+ "eval_loss": 0.40693244338035583,
4576
+ "eval_precision": 0.8343949044585988,
4577
+ "eval_recall": 0.4395973154362416,
4578
+ "eval_runtime": 53.515,
4579
+ "eval_samples_per_second": 5.569,
4580
+ "eval_steps_per_second": 0.187,
4581
+ "step": 600
4582
  }
4583
  ],
4584
  "logging_steps": 1,
 
4598
  "attributes": {}
4599
  }
4600
  },
4601
+ "total_flos": 1.81775286403072e+17,
4602
  "train_batch_size": 8,
4603
  "trial_name": null,
4604
  "trial_params": null