mtzig commited on
Commit
168605c
1 Parent(s): 44e8c30

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8dbe76f35e5656136a51bded0139fef27a2028b00f8f726fd0d386bb3522e13
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d943c6bfd2dc2b761b4d682134e0a0fc60ac1cb4096855e5091cc3393184aa64
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b66c1ebc853e6845470c2c1d9d04f694dccd7d0e852c4bc0fff9a7f3b72ba092
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:376a9db1af520346798c868246148f3564e7f951f971cfad89c922d341bf7f29
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ac9e1924ab634b10849be3ecb1321e5393dbc84ae65beaffd307850b3ae9f82
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9178eab5bc585c22cd46ed2fc1e92f4fdda57d7a3fa8d58230990de0c4d1f153
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc2b2b52b7b5b1c43d8786246ce0c4845f1a27260d756e383ebd9ee1be107e16
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:664f4c51ad8b8db2543ac5506c908df582362cb40fdd8ba94c8d4d17fd478154
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89bc93c64b991b306a939f0419250bf9841787d18646263da1e9b2c8779f9699
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21d1ac001b88f8a5c52ed311d48d65c35c0b16a38d8e46e3f8f798f890a0ff73
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ca40ef1084fb0572972d0d791f24456333f4e2bb411fb46f1d9fd3067b04bb8
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce135a57bb7016f162e8a5a5cf147734c4de738983a8be7d0e78e3767402b122
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6eacdb71077b64054c4a56453b8184802582c4895bcb7585409a5be89035fca7
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b143b6fca120ff625503c29f4c425415e9b350b6c85048892c81f6d44c3563a
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:293394047fb4a3a8ea9a2c352bdfb1e609e58a84c1d1613313fea1af7bf3513c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec8c8fdc32c70be31edda43085207cfa5ae9a7dbb023c61d9fae6f55d607e9e
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b9a1f1e453a43f83aa53b56d67647ccab7a6102d29ca677d252db57c6d84112
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6151a2afa91abecb8de37b99e4409d6e56f16b75b43592d3da5abbb3ee272563
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f37b096e91cd08927cf4ae4abd0d391ce5ada891c3fc1b2de21881502f3589eb
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d72bd74ab0857553e9460d81b4abc084b39a8189791c68c03d4ede2cfc8a8c60
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e41f09d05169607eee8dce8b84f8f78818000c06c0c89cf2ba601fd24a650bd
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c5f15126da64891eca473dd1e04d5b41141f581a035a13b14aee5904e6e3f7e
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:418fcb9eed9f4b34f4356e235c9f424e1c20f8f4d59e678e6ebeeb8a33e83523
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:135e2ac2b5b60d2eeef0012629f402a00ab445fa2c678e7dedd20b300813acb6
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afecba1fad91cfcc309b7789abc8c48e2a84100fb8c489eb3925241bd70b9c9d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6f67a0e885145319f81ed1f8c4c49622761e3f92d5ce81c356bbb700855e8e6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.12886597938144329,
5
  "eval_steps": 20,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -779,6 +779,766 @@
779
  "eval_samples_per_second": 5.346,
780
  "eval_steps_per_second": 0.176,
781
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  }
783
  ],
784
  "logging_steps": 1,
@@ -798,7 +1558,7 @@
798
  "attributes": {}
799
  }
800
  },
801
- "total_flos": 3.347688371467059e+16,
802
  "train_batch_size": 8,
803
  "trial_name": null,
804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.25773195876288657,
5
  "eval_steps": 20,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
779
  "eval_samples_per_second": 5.346,
780
  "eval_steps_per_second": 0.176,
781
  "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.13015463917525774,
785
+ "grad_norm": 2.9534921646118164,
786
+ "learning_rate": 1.994646636835458e-05,
787
+ "loss": 0.0741,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.13144329896907217,
792
+ "grad_norm": 2.0482945442199707,
793
+ "learning_rate": 1.9941714697703333e-05,
794
+ "loss": 0.0596,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.1327319587628866,
799
+ "grad_norm": 0.8915924429893494,
800
+ "learning_rate": 1.9936761631691007e-05,
801
+ "loss": 0.0271,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.13402061855670103,
806
+ "grad_norm": 3.5569581985473633,
807
+ "learning_rate": 1.993160727065489e-05,
808
+ "loss": 0.097,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.13530927835051546,
813
+ "grad_norm": 1.0290688276290894,
814
+ "learning_rate": 1.992625171901e-05,
815
+ "loss": 0.0309,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.13659793814432988,
820
+ "grad_norm": 3.104780673980713,
821
+ "learning_rate": 1.9920695085247012e-05,
822
+ "loss": 0.0466,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.13788659793814434,
827
+ "grad_norm": 1.300478458404541,
828
+ "learning_rate": 1.991493748193002e-05,
829
+ "loss": 0.035,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.13917525773195877,
834
+ "grad_norm": 1.9571739435195923,
835
+ "learning_rate": 1.9908979025694312e-05,
836
+ "loss": 0.0432,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.1404639175257732,
841
+ "grad_norm": 0.9955072402954102,
842
+ "learning_rate": 1.9902819837243954e-05,
843
+ "loss": 0.0182,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.14175257731958762,
848
+ "grad_norm": 1.2352385520935059,
849
+ "learning_rate": 1.989646004134937e-05,
850
+ "loss": 0.0338,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.14304123711340205,
855
+ "grad_norm": 2.855053663253784,
856
+ "learning_rate": 1.9889899766844817e-05,
857
+ "loss": 0.0701,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.14432989690721648,
862
+ "grad_norm": 2.372802495956421,
863
+ "learning_rate": 1.9883139146625763e-05,
864
+ "loss": 0.0386,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.14561855670103094,
869
+ "grad_norm": 1.9221031665802002,
870
+ "learning_rate": 1.9876178317646203e-05,
871
+ "loss": 0.0277,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.14690721649484537,
876
+ "grad_norm": 0.9431936144828796,
877
+ "learning_rate": 1.9869017420915888e-05,
878
+ "loss": 0.0188,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.1481958762886598,
883
+ "grad_norm": 1.950210690498352,
884
+ "learning_rate": 1.9861656601497452e-05,
885
+ "loss": 0.0302,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.14948453608247422,
890
+ "grad_norm": 3.239633560180664,
891
+ "learning_rate": 1.9854096008503495e-05,
892
+ "loss": 0.0416,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.15077319587628865,
897
+ "grad_norm": 3.1708860397338867,
898
+ "learning_rate": 1.9846335795093547e-05,
899
+ "loss": 0.0688,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.15206185567010308,
904
+ "grad_norm": 0.6930286288261414,
905
+ "learning_rate": 1.9838376118470965e-05,
906
+ "loss": 0.0141,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.15335051546391754,
911
+ "grad_norm": 2.929121971130371,
912
+ "learning_rate": 1.9830217139879768e-05,
913
+ "loss": 0.034,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.15463917525773196,
918
+ "grad_norm": 1.3847970962524414,
919
+ "learning_rate": 1.9821859024601345e-05,
920
+ "loss": 0.03,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.15463917525773196,
925
+ "eval_accuracy": 0.9821251241310824,
926
+ "eval_f1": 0.7391304347826086,
927
+ "eval_loss": 0.04716553911566734,
928
+ "eval_precision": 0.6296296296296297,
929
+ "eval_recall": 0.8947368421052632,
930
+ "eval_runtime": 83.825,
931
+ "eval_samples_per_second": 5.428,
932
+ "eval_steps_per_second": 0.179,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.1559278350515464,
937
+ "grad_norm": 2.072525978088379,
938
+ "learning_rate": 1.981330194195112e-05,
939
+ "loss": 0.016,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.15721649484536082,
944
+ "grad_norm": 3.0791800022125244,
945
+ "learning_rate": 1.9804546065275116e-05,
946
+ "loss": 0.0618,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.15850515463917525,
951
+ "grad_norm": 2.1992335319519043,
952
+ "learning_rate": 1.9795591571946454e-05,
953
+ "loss": 0.0276,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.15979381443298968,
958
+ "grad_norm": 2.476609706878662,
959
+ "learning_rate": 1.978643864336176e-05,
960
+ "loss": 0.0207,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.16108247422680413,
965
+ "grad_norm": 2.674210786819458,
966
+ "learning_rate": 1.9777087464937464e-05,
967
+ "loss": 0.0378,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.16237113402061856,
972
+ "grad_norm": 2.6775150299072266,
973
+ "learning_rate": 1.9767538226106078e-05,
974
+ "loss": 0.0312,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.163659793814433,
979
+ "grad_norm": 2.105435848236084,
980
+ "learning_rate": 1.9757791120312344e-05,
981
+ "loss": 0.0239,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.16494845360824742,
986
+ "grad_norm": 1.7885074615478516,
987
+ "learning_rate": 1.9747846345009306e-05,
988
+ "loss": 0.0402,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.16623711340206185,
993
+ "grad_norm": 4.384532451629639,
994
+ "learning_rate": 1.9737704101654335e-05,
995
+ "loss": 0.0674,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.16752577319587628,
1000
+ "grad_norm": 0.733161211013794,
1001
+ "learning_rate": 1.9727364595705012e-05,
1002
+ "loss": 0.0109,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.16881443298969073,
1007
+ "grad_norm": 2.310255765914917,
1008
+ "learning_rate": 1.9716828036615006e-05,
1009
+ "loss": 0.0245,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.17010309278350516,
1014
+ "grad_norm": 2.1358768939971924,
1015
+ "learning_rate": 1.9706094637829797e-05,
1016
+ "loss": 0.0506,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.1713917525773196,
1021
+ "grad_norm": 1.873978853225708,
1022
+ "learning_rate": 1.9695164616782378e-05,
1023
+ "loss": 0.0239,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.17268041237113402,
1028
+ "grad_norm": 3.210780620574951,
1029
+ "learning_rate": 1.9684038194888827e-05,
1030
+ "loss": 0.0453,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.17396907216494845,
1035
+ "grad_norm": 2.6000077724456787,
1036
+ "learning_rate": 1.9672715597543845e-05,
1037
+ "loss": 0.0222,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.17525773195876287,
1042
+ "grad_norm": 0.8902448415756226,
1043
+ "learning_rate": 1.9661197054116165e-05,
1044
+ "loss": 0.0114,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.17654639175257733,
1049
+ "grad_norm": 2.048377513885498,
1050
+ "learning_rate": 1.964948279794393e-05,
1051
+ "loss": 0.0299,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.17783505154639176,
1056
+ "grad_norm": 0.35185545682907104,
1057
+ "learning_rate": 1.963757306632996e-05,
1058
+ "loss": 0.0062,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.1791237113402062,
1063
+ "grad_norm": 0.8665434122085571,
1064
+ "learning_rate": 1.962546810053692e-05,
1065
+ "loss": 0.0122,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.18041237113402062,
1070
+ "grad_norm": 0.7568170428276062,
1071
+ "learning_rate": 1.9613168145782468e-05,
1072
+ "loss": 0.0109,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.18041237113402062,
1077
+ "eval_accuracy": 0.9910625620655412,
1078
+ "eval_f1": 0.8448275862068966,
1079
+ "eval_loss": 0.03413279354572296,
1080
+ "eval_precision": 0.8305084745762712,
1081
+ "eval_recall": 0.8596491228070176,
1082
+ "eval_runtime": 83.9067,
1083
+ "eval_samples_per_second": 5.423,
1084
+ "eval_steps_per_second": 0.179,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.18170103092783504,
1089
+ "grad_norm": 2.2702317237854004,
1090
+ "learning_rate": 1.960067345123427e-05,
1091
+ "loss": 0.0247,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.18298969072164947,
1096
+ "grad_norm": 3.507333755493164,
1097
+ "learning_rate": 1.958798427000495e-05,
1098
+ "loss": 0.0297,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.18427835051546393,
1103
+ "grad_norm": 0.5789155960083008,
1104
+ "learning_rate": 1.9575100859146974e-05,
1105
+ "loss": 0.013,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.18556701030927836,
1110
+ "grad_norm": 1.9476535320281982,
1111
+ "learning_rate": 1.956202347964743e-05,
1112
+ "loss": 0.0208,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.18685567010309279,
1117
+ "grad_norm": 0.855241060256958,
1118
+ "learning_rate": 1.954875239642274e-05,
1119
+ "loss": 0.0071,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.18814432989690721,
1124
+ "grad_norm": 2.169466495513916,
1125
+ "learning_rate": 1.9535287878313315e-05,
1126
+ "loss": 0.0191,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.18943298969072164,
1131
+ "grad_norm": 1.1874339580535889,
1132
+ "learning_rate": 1.952163019807809e-05,
1133
+ "loss": 0.0086,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.19072164948453607,
1138
+ "grad_norm": 3.9380855560302734,
1139
+ "learning_rate": 1.9507779632388997e-05,
1140
+ "loss": 0.0264,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.19201030927835053,
1145
+ "grad_norm": 2.052539587020874,
1146
+ "learning_rate": 1.9493736461825366e-05,
1147
+ "loss": 0.0126,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.19329896907216496,
1152
+ "grad_norm": 2.4338552951812744,
1153
+ "learning_rate": 1.947950097086825e-05,
1154
+ "loss": 0.0426,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.19458762886597938,
1159
+ "grad_norm": 1.8210889101028442,
1160
+ "learning_rate": 1.946507344789464e-05,
1161
+ "loss": 0.0088,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.1958762886597938,
1166
+ "grad_norm": 0.9345032572746277,
1167
+ "learning_rate": 1.945045418517165e-05,
1168
+ "loss": 0.01,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.19716494845360824,
1173
+ "grad_norm": 2.274660587310791,
1174
+ "learning_rate": 1.9435643478850573e-05,
1175
+ "loss": 0.0208,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.19845360824742267,
1180
+ "grad_norm": 1.3613721132278442,
1181
+ "learning_rate": 1.9420641628960897e-05,
1182
+ "loss": 0.0136,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.19974226804123713,
1187
+ "grad_norm": 0.8850100040435791,
1188
+ "learning_rate": 1.9405448939404215e-05,
1189
+ "loss": 0.009,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.20103092783505155,
1194
+ "grad_norm": 0.5833643078804016,
1195
+ "learning_rate": 1.9390065717948084e-05,
1196
+ "loss": 0.0046,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.20231958762886598,
1201
+ "grad_norm": 0.42478522658348083,
1202
+ "learning_rate": 1.9374492276219776e-05,
1203
+ "loss": 0.0052,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.2036082474226804,
1208
+ "grad_norm": 1.2607591152191162,
1209
+ "learning_rate": 1.9358728929699966e-05,
1210
+ "loss": 0.0101,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.20489690721649484,
1215
+ "grad_norm": 1.5455127954483032,
1216
+ "learning_rate": 1.9342775997716357e-05,
1217
+ "loss": 0.0051,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.20618556701030927,
1222
+ "grad_norm": 5.292853832244873,
1223
+ "learning_rate": 1.9326633803437197e-05,
1224
+ "loss": 0.043,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.20618556701030927,
1229
+ "eval_accuracy": 0.9915590863952334,
1230
+ "eval_f1": 0.8547008547008547,
1231
+ "eval_loss": 0.033666037023067474,
1232
+ "eval_precision": 0.8333333333333334,
1233
+ "eval_recall": 0.8771929824561403,
1234
+ "eval_runtime": 83.7677,
1235
+ "eval_samples_per_second": 5.432,
1236
+ "eval_steps_per_second": 0.179,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.20747422680412372,
1241
+ "grad_norm": 5.327892303466797,
1242
+ "learning_rate": 1.9310302673864724e-05,
1243
+ "loss": 0.057,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.20876288659793815,
1248
+ "grad_norm": 2.6782376766204834,
1249
+ "learning_rate": 1.929378293982857e-05,
1250
+ "loss": 0.0288,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.21005154639175258,
1255
+ "grad_norm": 1.8482961654663086,
1256
+ "learning_rate": 1.9277074935979034e-05,
1257
+ "loss": 0.0087,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.211340206185567,
1262
+ "grad_norm": 0.3108800947666168,
1263
+ "learning_rate": 1.926017900078031e-05,
1264
+ "loss": 0.002,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.21262886597938144,
1269
+ "grad_norm": 6.560524940490723,
1270
+ "learning_rate": 1.924309547650363e-05,
1271
+ "loss": 0.0385,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.21391752577319587,
1276
+ "grad_norm": 1.7873457670211792,
1277
+ "learning_rate": 1.922582470922034e-05,
1278
+ "loss": 0.006,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.21520618556701032,
1283
+ "grad_norm": 4.115209102630615,
1284
+ "learning_rate": 1.9208367048794878e-05,
1285
+ "loss": 0.0095,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.21649484536082475,
1290
+ "grad_norm": 3.2223434448242188,
1291
+ "learning_rate": 1.9190722848877683e-05,
1292
+ "loss": 0.0151,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.21778350515463918,
1297
+ "grad_norm": 4.802370071411133,
1298
+ "learning_rate": 1.9172892466898047e-05,
1299
+ "loss": 0.0576,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.2190721649484536,
1304
+ "grad_norm": 2.843043327331543,
1305
+ "learning_rate": 1.9154876264056863e-05,
1306
+ "loss": 0.0116,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.22036082474226804,
1311
+ "grad_norm": 1.8300056457519531,
1312
+ "learning_rate": 1.9136674605319304e-05,
1313
+ "loss": 0.0048,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.22164948453608246,
1318
+ "grad_norm": 0.7112641930580139,
1319
+ "learning_rate": 1.911828785940745e-05,
1320
+ "loss": 0.0029,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.22293814432989692,
1325
+ "grad_norm": 3.5936992168426514,
1326
+ "learning_rate": 1.9099716398792788e-05,
1327
+ "loss": 0.0335,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.22422680412371135,
1332
+ "grad_norm": 2.8544235229492188,
1333
+ "learning_rate": 1.908096059968869e-05,
1334
+ "loss": 0.0207,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.22551546391752578,
1339
+ "grad_norm": 3.7631168365478516,
1340
+ "learning_rate": 1.906202084204279e-05,
1341
+ "loss": 0.0212,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.2268041237113402,
1346
+ "grad_norm": 1.2712973356246948,
1347
+ "learning_rate": 1.904289750952928e-05,
1348
+ "loss": 0.0084,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.22809278350515463,
1353
+ "grad_norm": 2.580491542816162,
1354
+ "learning_rate": 1.9023590989541126e-05,
1355
+ "loss": 0.0151,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.22938144329896906,
1360
+ "grad_norm": 6.0741777420043945,
1361
+ "learning_rate": 1.900410167318226e-05,
1362
+ "loss": 0.0616,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.23067010309278352,
1367
+ "grad_norm": 1.9606350660324097,
1368
+ "learning_rate": 1.8984429955259607e-05,
1369
+ "loss": 0.0305,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.23195876288659795,
1374
+ "grad_norm": 4.825283527374268,
1375
+ "learning_rate": 1.8964576234275123e-05,
1376
+ "loss": 0.0233,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.23195876288659795,
1381
+ "eval_accuracy": 0.9925521350546177,
1382
+ "eval_f1": 0.8760330578512396,
1383
+ "eval_loss": 0.027217118069529533,
1384
+ "eval_precision": 0.828125,
1385
+ "eval_recall": 0.9298245614035088,
1386
+ "eval_runtime": 84.1193,
1387
+ "eval_samples_per_second": 5.409,
1388
+ "eval_steps_per_second": 0.178,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.23324742268041238,
1393
+ "grad_norm": 3.7470309734344482,
1394
+ "learning_rate": 1.894454091241771e-05,
1395
+ "loss": 0.0375,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.2345360824742268,
1400
+ "grad_norm": 5.566728115081787,
1401
+ "learning_rate": 1.8924324395555066e-05,
1402
+ "loss": 0.0397,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.23582474226804123,
1407
+ "grad_norm": 4.115679740905762,
1408
+ "learning_rate": 1.8903927093225474e-05,
1409
+ "loss": 0.0318,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.23711340206185566,
1414
+ "grad_norm": 2.0655646324157715,
1415
+ "learning_rate": 1.8883349418629487e-05,
1416
+ "loss": 0.0502,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.23840206185567012,
1421
+ "grad_norm": 3.514209270477295,
1422
+ "learning_rate": 1.8862591788621572e-05,
1423
+ "loss": 0.034,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.23969072164948454,
1428
+ "grad_norm": 2.274663209915161,
1429
+ "learning_rate": 1.8841654623701673e-05,
1430
+ "loss": 0.0105,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.24097938144329897,
1435
+ "grad_norm": 1.3190113306045532,
1436
+ "learning_rate": 1.8820538348006666e-05,
1437
+ "loss": 0.0099,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.2422680412371134,
1442
+ "grad_norm": 1.9200594425201416,
1443
+ "learning_rate": 1.8799243389301796e-05,
1444
+ "loss": 0.0087,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.24355670103092783,
1449
+ "grad_norm": 3.5742523670196533,
1450
+ "learning_rate": 1.877777017897199e-05,
1451
+ "loss": 0.0383,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.24484536082474226,
1456
+ "grad_norm": 2.926935911178589,
1457
+ "learning_rate": 1.8756119152013134e-05,
1458
+ "loss": 0.0198,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.24613402061855671,
1463
+ "grad_norm": 4.095611095428467,
1464
+ "learning_rate": 1.873429074702324e-05,
1465
+ "loss": 0.0151,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.24742268041237114,
1470
+ "grad_norm": 1.0907986164093018,
1471
+ "learning_rate": 1.8712285406193585e-05,
1472
+ "loss": 0.0059,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.24871134020618557,
1477
+ "grad_norm": 1.646490454673767,
1478
+ "learning_rate": 1.8690103575299754e-05,
1479
+ "loss": 0.0262,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.25,
1484
+ "grad_norm": 0.9283900856971741,
1485
+ "learning_rate": 1.866774570369257e-05,
1486
+ "loss": 0.0071,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.25128865979381443,
1491
+ "grad_norm": 1.8307346105575562,
1492
+ "learning_rate": 1.8645212244289047e-05,
1493
+ "loss": 0.0246,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.25257731958762886,
1498
+ "grad_norm": 1.3150577545166016,
1499
+ "learning_rate": 1.8622503653563173e-05,
1500
+ "loss": 0.0198,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.2538659793814433,
1505
+ "grad_norm": 3.4825661182403564,
1506
+ "learning_rate": 1.8599620391536682e-05,
1507
+ "loss": 0.0136,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.2551546391752577,
1512
+ "grad_norm": 5.4773077964782715,
1513
+ "learning_rate": 1.8576562921769727e-05,
1514
+ "loss": 0.0223,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.25644329896907214,
1519
+ "grad_norm": 3.3178765773773193,
1520
+ "learning_rate": 1.8553331711351502e-05,
1521
+ "loss": 0.0392,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.25773195876288657,
1526
+ "grad_norm": 4.358588218688965,
1527
+ "learning_rate": 1.8529927230890757e-05,
1528
+ "loss": 0.029,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.25773195876288657,
1533
+ "eval_accuracy": 0.9920556107249255,
1534
+ "eval_f1": 0.8666666666666667,
1535
+ "eval_loss": 0.02330821380019188,
1536
+ "eval_precision": 0.8253968253968254,
1537
+ "eval_recall": 0.9122807017543859,
1538
+ "eval_runtime": 84.2136,
1539
+ "eval_samples_per_second": 5.403,
1540
+ "eval_steps_per_second": 0.178,
1541
+ "step": 200
1542
  }
1543
  ],
1544
  "logging_steps": 1,
 
1558
  "attributes": {}
1559
  }
1560
  },
1561
+ "total_flos": 6.685140289008435e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null