mtzig commited on
Commit
69975d3
·
verified ·
1 Parent(s): cfb1295

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:751cf08e904d158afe5d5c376833ae0e82507272c605040cee2892e5b08babd7
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cd9f4e1d0bb326b818db1b6faa552753bc4a3328ac93e01b3631a83d08e1c95
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06a90cd633a16bc937002e25bf17189511b59b8e8d6bd000662a0b07bbf80e65
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c1ce66274008394f36d101e20d4378dd480a6f7db7387a58eed60435a8f39a7
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a2a1314f17bbfab7f1e0d63c5f4ad16ed3c81d2546ede535c552f819fd2b3ab
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b3e7a569d804afc7e9af01c045d344bcf8aa04435a748d8f22d80f77f68191f
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61ce145ff3942d4b7afa3864d5060ecb372e5343ad7abd8681f9ef04e3996a27
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84f72d90d6d6f96ffde5e12766b8aa3f0ebf70484ff977b4cc1380cfd2635d82
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef11d2a12800199b371850d31a6f25caf9ed6263ace1a113ebdb48708b504181
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ec10706acfd7aebf2e0313a26ad47f112db6494baa4011866a112fa6459782
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:146b0c5dbeeaf44008b0996c6e5385da4bd543d15c17518bdbadad49738050cf
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d45fd8c4f5cac20eb0715bd7c3583b8b9d6d50be52eb3b819ead289c264bf4c
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce829e48185824cbdd9332cfc8db3ea7a52a07e0542cfb9ceb1a123cf238c986
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc7bbedf822084a972aaf7dbfdc31778a6b5afdff5f9d51666b28397948c4cf6
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ebc6b5b3bf3ff4db9103e1f6c2a534b04905118e03682dcfd5b2651cf5023a9
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc56dd27c16979078189d0168509b3491fac9a7018e2acd5413b0b5bfb9e62b8
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de551c8b1c7165a88a996c1e2c9ccef2b9c716ed5dd27dbead3aed497f875baf
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbefc2a9b5877ac52b5c278c40b832840a445a83b4f45552eae9c8d8fd7025ab
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d7ac6617d0f4bedb4a49c51f3499536a49a714fbf47c2bf0f3ef31d2fc421c6
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f7ee2bc06c634de7d668e8f27eb2c655185598b0005a48f28db9b8c13871cf8
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52e8353d56a8d408800aeacfb35920a2641f5b231344cfc6e9685c5fff1b02a0
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e89f6ffe398cd010021cbea856f31e9f12c086dc22192dd94cd4139ed13bc428
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e436a7cd1ba66004593b56f4b7ea09bd56b9555226a359189f1b562635d9c755
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cac7e1aa01f996ea4ccf65c0edbca9c2218b27d0fee393e5dadf9e12f0a4ac0
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7754cc7953f3b6df72a3313e3aea13311d99a52571c658951fa15bc741d1fcff
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca19ec64a3f37f86c1a9f3bd1615be54fe5912d912de79a7d3f808a593a2192d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7627118644067796,
5
  "eval_steps": 20,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6859,6 +6859,766 @@
6859
  "eval_samples_per_second": 5.876,
6860
  "eval_steps_per_second": 0.201,
6861
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6862
  }
6863
  ],
6864
  "logging_steps": 1,
@@ -6878,7 +7638,7 @@
6878
  "attributes": {}
6879
  }
6880
  },
6881
- "total_flos": 2.771703818092544e+17,
6882
  "train_batch_size": 8,
6883
  "trial_name": null,
6884
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.847457627118644,
5
  "eval_steps": 20,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6859
  "eval_samples_per_second": 5.876,
6860
  "eval_steps_per_second": 0.201,
6861
  "step": 900
6862
+ },
6863
+ {
6864
+ "epoch": 0.7635593220338983,
6865
+ "grad_norm": 1.5581014156341553,
6866
+ "learning_rate": 3.216881637303839e-06,
6867
+ "loss": 0.0083,
6868
+ "step": 901
6869
+ },
6870
+ {
6871
+ "epoch": 0.764406779661017,
6872
+ "grad_norm": 1.8738924264907837,
6873
+ "learning_rate": 3.1951753680566143e-06,
6874
+ "loss": 0.0215,
6875
+ "step": 902
6876
+ },
6877
+ {
6878
+ "epoch": 0.7652542372881356,
6879
+ "grad_norm": 0.4267842173576355,
6880
+ "learning_rate": 3.1735286468303563e-06,
6881
+ "loss": 0.0016,
6882
+ "step": 903
6883
+ },
6884
+ {
6885
+ "epoch": 0.7661016949152543,
6886
+ "grad_norm": 1.4631012678146362,
6887
+ "learning_rate": 3.151941663052345e-06,
6888
+ "loss": 0.0058,
6889
+ "step": 904
6890
+ },
6891
+ {
6892
+ "epoch": 0.7669491525423728,
6893
+ "grad_norm": 0.23579372465610504,
6894
+ "learning_rate": 3.130414605627102e-06,
6895
+ "loss": 0.0017,
6896
+ "step": 905
6897
+ },
6898
+ {
6899
+ "epoch": 0.7677966101694915,
6900
+ "grad_norm": 1.0443428754806519,
6901
+ "learning_rate": 3.1089476629347494e-06,
6902
+ "loss": 0.0078,
6903
+ "step": 906
6904
+ },
6905
+ {
6906
+ "epoch": 0.7686440677966102,
6907
+ "grad_norm": 0.8802245259284973,
6908
+ "learning_rate": 3.087541022829347e-06,
6909
+ "loss": 0.0052,
6910
+ "step": 907
6911
+ },
6912
+ {
6913
+ "epoch": 0.7694915254237288,
6914
+ "grad_norm": 0.9820923805236816,
6915
+ "learning_rate": 3.066194872637258e-06,
6916
+ "loss": 0.0022,
6917
+ "step": 908
6918
+ },
6919
+ {
6920
+ "epoch": 0.7703389830508475,
6921
+ "grad_norm": 0.40738704800605774,
6922
+ "learning_rate": 3.04490939915551e-06,
6923
+ "loss": 0.001,
6924
+ "step": 909
6925
+ },
6926
+ {
6927
+ "epoch": 0.7711864406779662,
6928
+ "grad_norm": 2.081660032272339,
6929
+ "learning_rate": 3.023684788650154e-06,
6930
+ "loss": 0.0101,
6931
+ "step": 910
6932
+ },
6933
+ {
6934
+ "epoch": 0.7720338983050847,
6935
+ "grad_norm": 1.3725014925003052,
6936
+ "learning_rate": 3.002521226854641e-06,
6937
+ "loss": 0.0069,
6938
+ "step": 911
6939
+ },
6940
+ {
6941
+ "epoch": 0.7728813559322034,
6942
+ "grad_norm": 2.1171929836273193,
6943
+ "learning_rate": 2.981418898968186e-06,
6944
+ "loss": 0.0139,
6945
+ "step": 912
6946
+ },
6947
+ {
6948
+ "epoch": 0.773728813559322,
6949
+ "grad_norm": 1.6483219861984253,
6950
+ "learning_rate": 2.9603779896541705e-06,
6951
+ "loss": 0.0092,
6952
+ "step": 913
6953
+ },
6954
+ {
6955
+ "epoch": 0.7745762711864407,
6956
+ "grad_norm": 0.36683687567710876,
6957
+ "learning_rate": 2.939398683038497e-06,
6958
+ "loss": 0.0012,
6959
+ "step": 914
6960
+ },
6961
+ {
6962
+ "epoch": 0.7754237288135594,
6963
+ "grad_norm": 1.9361350536346436,
6964
+ "learning_rate": 2.918481162707999e-06,
6965
+ "loss": 0.0093,
6966
+ "step": 915
6967
+ },
6968
+ {
6969
+ "epoch": 0.7762711864406779,
6970
+ "grad_norm": 0.6846543550491333,
6971
+ "learning_rate": 2.89762561170882e-06,
6972
+ "loss": 0.0035,
6973
+ "step": 916
6974
+ },
6975
+ {
6976
+ "epoch": 0.7771186440677966,
6977
+ "grad_norm": 1.052035927772522,
6978
+ "learning_rate": 2.8768322125448265e-06,
6979
+ "loss": 0.0123,
6980
+ "step": 917
6981
+ },
6982
+ {
6983
+ "epoch": 0.7779661016949152,
6984
+ "grad_norm": 0.6025975942611694,
6985
+ "learning_rate": 2.856101147175998e-06,
6986
+ "loss": 0.0035,
6987
+ "step": 918
6988
+ },
6989
+ {
6990
+ "epoch": 0.7788135593220339,
6991
+ "grad_norm": 1.8254081010818481,
6992
+ "learning_rate": 2.8354325970168483e-06,
6993
+ "loss": 0.0175,
6994
+ "step": 919
6995
+ },
6996
+ {
6997
+ "epoch": 0.7796610169491526,
6998
+ "grad_norm": 0.6324992179870605,
6999
+ "learning_rate": 2.814826742934823e-06,
7000
+ "loss": 0.0027,
7001
+ "step": 920
7002
+ },
7003
+ {
7004
+ "epoch": 0.7796610169491526,
7005
+ "eval_accuracy": 1.0,
7006
+ "eval_f1": 1.0,
7007
+ "eval_loss": 7.932856533443555e-05,
7008
+ "eval_precision": 1.0,
7009
+ "eval_recall": 1.0,
7010
+ "eval_runtime": 50.5195,
7011
+ "eval_samples_per_second": 5.78,
7012
+ "eval_steps_per_second": 0.198,
7013
+ "step": 920
7014
+ },
7015
+ {
7016
+ "epoch": 0.7805084745762711,
7017
+ "grad_norm": 4.134251117706299,
7018
+ "learning_rate": 2.794283765248722e-06,
7019
+ "loss": 0.0218,
7020
+ "step": 921
7021
+ },
7022
+ {
7023
+ "epoch": 0.7813559322033898,
7024
+ "grad_norm": 1.057350754737854,
7025
+ "learning_rate": 2.7738038437271288e-06,
7026
+ "loss": 0.0032,
7027
+ "step": 922
7028
+ },
7029
+ {
7030
+ "epoch": 0.7822033898305085,
7031
+ "grad_norm": 0.7094781994819641,
7032
+ "learning_rate": 2.7533871575868275e-06,
7033
+ "loss": 0.0028,
7034
+ "step": 923
7035
+ },
7036
+ {
7037
+ "epoch": 0.7830508474576271,
7038
+ "grad_norm": 2.3617732524871826,
7039
+ "learning_rate": 2.733033885491241e-06,
7040
+ "loss": 0.0126,
7041
+ "step": 924
7042
+ },
7043
+ {
7044
+ "epoch": 0.7838983050847458,
7045
+ "grad_norm": 0.1944715678691864,
7046
+ "learning_rate": 2.7127442055488617e-06,
7047
+ "loss": 0.0007,
7048
+ "step": 925
7049
+ },
7050
+ {
7051
+ "epoch": 0.7847457627118644,
7052
+ "grad_norm": 0.6528817415237427,
7053
+ "learning_rate": 2.6925182953117022e-06,
7054
+ "loss": 0.0046,
7055
+ "step": 926
7056
+ },
7057
+ {
7058
+ "epoch": 0.785593220338983,
7059
+ "grad_norm": 0.31304916739463806,
7060
+ "learning_rate": 2.67235633177373e-06,
7061
+ "loss": 0.0016,
7062
+ "step": 927
7063
+ },
7064
+ {
7065
+ "epoch": 0.7864406779661017,
7066
+ "grad_norm": 0.75702303647995,
7067
+ "learning_rate": 2.6522584913693295e-06,
7068
+ "loss": 0.0047,
7069
+ "step": 928
7070
+ },
7071
+ {
7072
+ "epoch": 0.7872881355932203,
7073
+ "grad_norm": 1.600816011428833,
7074
+ "learning_rate": 2.6322249499717477e-06,
7075
+ "loss": 0.0062,
7076
+ "step": 929
7077
+ },
7078
+ {
7079
+ "epoch": 0.788135593220339,
7080
+ "grad_norm": 1.592640995979309,
7081
+ "learning_rate": 2.6122558828915647e-06,
7082
+ "loss": 0.0064,
7083
+ "step": 930
7084
+ },
7085
+ {
7086
+ "epoch": 0.7889830508474577,
7087
+ "grad_norm": 2.1126153469085693,
7088
+ "learning_rate": 2.5923514648751537e-06,
7089
+ "loss": 0.0265,
7090
+ "step": 931
7091
+ },
7092
+ {
7093
+ "epoch": 0.7898305084745763,
7094
+ "grad_norm": 1.4339178800582886,
7095
+ "learning_rate": 2.572511870103149e-06,
7096
+ "loss": 0.0054,
7097
+ "step": 932
7098
+ },
7099
+ {
7100
+ "epoch": 0.7906779661016949,
7101
+ "grad_norm": 2.253162145614624,
7102
+ "learning_rate": 2.55273727218894e-06,
7103
+ "loss": 0.0321,
7104
+ "step": 933
7105
+ },
7106
+ {
7107
+ "epoch": 0.7915254237288135,
7108
+ "grad_norm": 1.1612133979797363,
7109
+ "learning_rate": 2.533027844177123e-06,
7110
+ "loss": 0.0062,
7111
+ "step": 934
7112
+ },
7113
+ {
7114
+ "epoch": 0.7923728813559322,
7115
+ "grad_norm": 1.0363982915878296,
7116
+ "learning_rate": 2.5133837585420084e-06,
7117
+ "loss": 0.0053,
7118
+ "step": 935
7119
+ },
7120
+ {
7121
+ "epoch": 0.7932203389830509,
7122
+ "grad_norm": 1.3332302570343018,
7123
+ "learning_rate": 2.4938051871861046e-06,
7124
+ "loss": 0.0072,
7125
+ "step": 936
7126
+ },
7127
+ {
7128
+ "epoch": 0.7940677966101695,
7129
+ "grad_norm": 0.3061300218105316,
7130
+ "learning_rate": 2.4742923014386154e-06,
7131
+ "loss": 0.0015,
7132
+ "step": 937
7133
+ },
7134
+ {
7135
+ "epoch": 0.7949152542372881,
7136
+ "grad_norm": 2.649893045425415,
7137
+ "learning_rate": 2.4548452720539375e-06,
7138
+ "loss": 0.0238,
7139
+ "step": 938
7140
+ },
7141
+ {
7142
+ "epoch": 0.7957627118644067,
7143
+ "grad_norm": 0.9358623623847961,
7144
+ "learning_rate": 2.435464269210167e-06,
7145
+ "loss": 0.0036,
7146
+ "step": 939
7147
+ },
7148
+ {
7149
+ "epoch": 0.7966101694915254,
7150
+ "grad_norm": 1.4924583435058594,
7151
+ "learning_rate": 2.4161494625076164e-06,
7152
+ "loss": 0.0105,
7153
+ "step": 940
7154
+ },
7155
+ {
7156
+ "epoch": 0.7966101694915254,
7157
+ "eval_accuracy": 1.0,
7158
+ "eval_f1": 1.0,
7159
+ "eval_loss": 0.00010792797547765076,
7160
+ "eval_precision": 1.0,
7161
+ "eval_recall": 1.0,
7162
+ "eval_runtime": 49.837,
7163
+ "eval_samples_per_second": 5.859,
7164
+ "eval_steps_per_second": 0.201,
7165
+ "step": 940
7166
+ },
7167
+ {
7168
+ "epoch": 0.7974576271186441,
7169
+ "grad_norm": 0.9415515661239624,
7170
+ "learning_rate": 2.3969010209673215e-06,
7171
+ "loss": 0.0031,
7172
+ "step": 941
7173
+ },
7174
+ {
7175
+ "epoch": 0.7983050847457627,
7176
+ "grad_norm": 1.4553923606872559,
7177
+ "learning_rate": 2.3777191130295673e-06,
7178
+ "loss": 0.008,
7179
+ "step": 942
7180
+ },
7181
+ {
7182
+ "epoch": 0.7991525423728814,
7183
+ "grad_norm": 0.9974135160446167,
7184
+ "learning_rate": 2.3586039065524113e-06,
7185
+ "loss": 0.0037,
7186
+ "step": 943
7187
+ },
7188
+ {
7189
+ "epoch": 0.8,
7190
+ "grad_norm": 1.052581548690796,
7191
+ "learning_rate": 2.339555568810221e-06,
7192
+ "loss": 0.0057,
7193
+ "step": 944
7194
+ },
7195
+ {
7196
+ "epoch": 0.8008474576271186,
7197
+ "grad_norm": 0.27318713068962097,
7198
+ "learning_rate": 2.3205742664922006e-06,
7199
+ "loss": 0.0011,
7200
+ "step": 945
7201
+ },
7202
+ {
7203
+ "epoch": 0.8016949152542373,
7204
+ "grad_norm": 2.6839377880096436,
7205
+ "learning_rate": 2.3016601657009364e-06,
7206
+ "loss": 0.0192,
7207
+ "step": 946
7208
+ },
7209
+ {
7210
+ "epoch": 0.8025423728813559,
7211
+ "grad_norm": 0.8619096279144287,
7212
+ "learning_rate": 2.282813431950952e-06,
7213
+ "loss": 0.0026,
7214
+ "step": 947
7215
+ },
7216
+ {
7217
+ "epoch": 0.8033898305084746,
7218
+ "grad_norm": 2.3613054752349854,
7219
+ "learning_rate": 2.264034230167246e-06,
7220
+ "loss": 0.0161,
7221
+ "step": 948
7222
+ },
7223
+ {
7224
+ "epoch": 0.8042372881355933,
7225
+ "grad_norm": 1.840660572052002,
7226
+ "learning_rate": 2.245322724683854e-06,
7227
+ "loss": 0.0191,
7228
+ "step": 949
7229
+ },
7230
+ {
7231
+ "epoch": 0.8050847457627118,
7232
+ "grad_norm": 1.5182996988296509,
7233
+ "learning_rate": 2.2266790792424096e-06,
7234
+ "loss": 0.0083,
7235
+ "step": 950
7236
+ },
7237
+ {
7238
+ "epoch": 0.8059322033898305,
7239
+ "grad_norm": 1.8400460481643677,
7240
+ "learning_rate": 2.208103456990719e-06,
7241
+ "loss": 0.0136,
7242
+ "step": 951
7243
+ },
7244
+ {
7245
+ "epoch": 0.8067796610169492,
7246
+ "grad_norm": 1.5054808855056763,
7247
+ "learning_rate": 2.1895960204813194e-06,
7248
+ "loss": 0.0101,
7249
+ "step": 952
7250
+ },
7251
+ {
7252
+ "epoch": 0.8076271186440678,
7253
+ "grad_norm": 1.5928698778152466,
7254
+ "learning_rate": 2.1711569316700774e-06,
7255
+ "loss": 0.0118,
7256
+ "step": 953
7257
+ },
7258
+ {
7259
+ "epoch": 0.8084745762711865,
7260
+ "grad_norm": 1.162479281425476,
7261
+ "learning_rate": 2.1527863519147474e-06,
7262
+ "loss": 0.0068,
7263
+ "step": 954
7264
+ },
7265
+ {
7266
+ "epoch": 0.809322033898305,
7267
+ "grad_norm": 1.07491135597229,
7268
+ "learning_rate": 2.1344844419735757e-06,
7269
+ "loss": 0.0025,
7270
+ "step": 955
7271
+ },
7272
+ {
7273
+ "epoch": 0.8101694915254237,
7274
+ "grad_norm": 0.22395382821559906,
7275
+ "learning_rate": 2.116251362003887e-06,
7276
+ "loss": 0.0007,
7277
+ "step": 956
7278
+ },
7279
+ {
7280
+ "epoch": 0.8110169491525424,
7281
+ "grad_norm": 0.4018426239490509,
7282
+ "learning_rate": 2.098087271560687e-06,
7283
+ "loss": 0.0026,
7284
+ "step": 957
7285
+ },
7286
+ {
7287
+ "epoch": 0.811864406779661,
7288
+ "grad_norm": 1.074708104133606,
7289
+ "learning_rate": 2.079992329595263e-06,
7290
+ "loss": 0.0035,
7291
+ "step": 958
7292
+ },
7293
+ {
7294
+ "epoch": 0.8127118644067797,
7295
+ "grad_norm": 1.0309704542160034,
7296
+ "learning_rate": 2.0619666944537954e-06,
7297
+ "loss": 0.0041,
7298
+ "step": 959
7299
+ },
7300
+ {
7301
+ "epoch": 0.8135593220338984,
7302
+ "grad_norm": 2.1775588989257812,
7303
+ "learning_rate": 2.044010523875969e-06,
7304
+ "loss": 0.0157,
7305
+ "step": 960
7306
+ },
7307
+ {
7308
+ "epoch": 0.8135593220338984,
7309
+ "eval_accuracy": 1.0,
7310
+ "eval_f1": 1.0,
7311
+ "eval_loss": 9.212108125211671e-05,
7312
+ "eval_precision": 1.0,
7313
+ "eval_recall": 1.0,
7314
+ "eval_runtime": 49.5926,
7315
+ "eval_samples_per_second": 5.888,
7316
+ "eval_steps_per_second": 0.202,
7317
+ "step": 960
7318
+ },
7319
+ {
7320
+ "epoch": 0.8144067796610169,
7321
+ "grad_norm": 0.222603902220726,
7322
+ "learning_rate": 2.0261239749935966e-06,
7323
+ "loss": 0.0009,
7324
+ "step": 961
7325
+ },
7326
+ {
7327
+ "epoch": 0.8152542372881356,
7328
+ "grad_norm": 0.21753355860710144,
7329
+ "learning_rate": 2.0083072043292406e-06,
7330
+ "loss": 0.0007,
7331
+ "step": 962
7332
+ },
7333
+ {
7334
+ "epoch": 0.8161016949152542,
7335
+ "grad_norm": 1.3669072389602661,
7336
+ "learning_rate": 1.9905603677948425e-06,
7337
+ "loss": 0.0065,
7338
+ "step": 963
7339
+ },
7340
+ {
7341
+ "epoch": 0.8169491525423729,
7342
+ "grad_norm": 2.4227099418640137,
7343
+ "learning_rate": 1.972883620690366e-06,
7344
+ "loss": 0.0253,
7345
+ "step": 964
7346
+ },
7347
+ {
7348
+ "epoch": 0.8177966101694916,
7349
+ "grad_norm": 0.42630961537361145,
7350
+ "learning_rate": 1.955277117702424e-06,
7351
+ "loss": 0.0013,
7352
+ "step": 965
7353
+ },
7354
+ {
7355
+ "epoch": 0.8186440677966101,
7356
+ "grad_norm": 1.9701416492462158,
7357
+ "learning_rate": 1.9377410129029407e-06,
7358
+ "loss": 0.011,
7359
+ "step": 966
7360
+ },
7361
+ {
7362
+ "epoch": 0.8194915254237288,
7363
+ "grad_norm": 2.1445109844207764,
7364
+ "learning_rate": 1.920275459747796e-06,
7365
+ "loss": 0.0132,
7366
+ "step": 967
7367
+ },
7368
+ {
7369
+ "epoch": 0.8203389830508474,
7370
+ "grad_norm": 1.7752200365066528,
7371
+ "learning_rate": 1.902880611075477e-06,
7372
+ "loss": 0.0069,
7373
+ "step": 968
7374
+ },
7375
+ {
7376
+ "epoch": 0.8211864406779661,
7377
+ "grad_norm": 0.9991908669471741,
7378
+ "learning_rate": 1.8855566191057538e-06,
7379
+ "loss": 0.0043,
7380
+ "step": 969
7381
+ },
7382
+ {
7383
+ "epoch": 0.8220338983050848,
7384
+ "grad_norm": 1.4875959157943726,
7385
+ "learning_rate": 1.868303635438332e-06,
7386
+ "loss": 0.0118,
7387
+ "step": 970
7388
+ },
7389
+ {
7390
+ "epoch": 0.8228813559322034,
7391
+ "grad_norm": 0.8871830701828003,
7392
+ "learning_rate": 1.8511218110515428e-06,
7393
+ "loss": 0.0042,
7394
+ "step": 971
7395
+ },
7396
+ {
7397
+ "epoch": 0.823728813559322,
7398
+ "grad_norm": 1.4304015636444092,
7399
+ "learning_rate": 1.8340112963009993e-06,
7400
+ "loss": 0.0085,
7401
+ "step": 972
7402
+ },
7403
+ {
7404
+ "epoch": 0.8245762711864407,
7405
+ "grad_norm": 1.7509040832519531,
7406
+ "learning_rate": 1.81697224091831e-06,
7407
+ "loss": 0.0181,
7408
+ "step": 973
7409
+ },
7410
+ {
7411
+ "epoch": 0.8254237288135593,
7412
+ "grad_norm": 0.6783941388130188,
7413
+ "learning_rate": 1.8000047940097453e-06,
7414
+ "loss": 0.0039,
7415
+ "step": 974
7416
+ },
7417
+ {
7418
+ "epoch": 0.826271186440678,
7419
+ "grad_norm": 0.9287757873535156,
7420
+ "learning_rate": 1.7831091040549397e-06,
7421
+ "loss": 0.0097,
7422
+ "step": 975
7423
+ },
7424
+ {
7425
+ "epoch": 0.8271186440677966,
7426
+ "grad_norm": 1.7914001941680908,
7427
+ "learning_rate": 1.7662853189055951e-06,
7428
+ "loss": 0.0152,
7429
+ "step": 976
7430
+ },
7431
+ {
7432
+ "epoch": 0.8279661016949152,
7433
+ "grad_norm": 0.4140421450138092,
7434
+ "learning_rate": 1.7495335857841855e-06,
7435
+ "loss": 0.0023,
7436
+ "step": 977
7437
+ },
7438
+ {
7439
+ "epoch": 0.8288135593220339,
7440
+ "grad_norm": 0.8546230792999268,
7441
+ "learning_rate": 1.7328540512826664e-06,
7442
+ "loss": 0.0077,
7443
+ "step": 978
7444
+ },
7445
+ {
7446
+ "epoch": 0.8296610169491525,
7447
+ "grad_norm": 1.1925534009933472,
7448
+ "learning_rate": 1.7162468613611937e-06,
7449
+ "loss": 0.0077,
7450
+ "step": 979
7451
+ },
7452
+ {
7453
+ "epoch": 0.8305084745762712,
7454
+ "grad_norm": 1.0941600799560547,
7455
+ "learning_rate": 1.699712161346846e-06,
7456
+ "loss": 0.0082,
7457
+ "step": 980
7458
+ },
7459
+ {
7460
+ "epoch": 0.8305084745762712,
7461
+ "eval_accuracy": 1.0,
7462
+ "eval_f1": 1.0,
7463
+ "eval_loss": 0.00011068069579778239,
7464
+ "eval_precision": 1.0,
7465
+ "eval_recall": 1.0,
7466
+ "eval_runtime": 49.9161,
7467
+ "eval_samples_per_second": 5.85,
7468
+ "eval_steps_per_second": 0.2,
7469
+ "step": 980
7470
+ },
7471
+ {
7472
+ "epoch": 0.8313559322033899,
7473
+ "grad_norm": 3.0363481044769287,
7474
+ "learning_rate": 1.6832500959323605e-06,
7475
+ "loss": 0.0313,
7476
+ "step": 981
7477
+ },
7478
+ {
7479
+ "epoch": 0.8322033898305085,
7480
+ "grad_norm": 1.8849022388458252,
7481
+ "learning_rate": 1.6668608091748495e-06,
7482
+ "loss": 0.007,
7483
+ "step": 982
7484
+ },
7485
+ {
7486
+ "epoch": 0.8330508474576271,
7487
+ "grad_norm": 0.2518068253993988,
7488
+ "learning_rate": 1.6505444444945584e-06,
7489
+ "loss": 0.0009,
7490
+ "step": 983
7491
+ },
7492
+ {
7493
+ "epoch": 0.8338983050847457,
7494
+ "grad_norm": 0.548155665397644,
7495
+ "learning_rate": 1.6343011446735925e-06,
7496
+ "loss": 0.0024,
7497
+ "step": 984
7498
+ },
7499
+ {
7500
+ "epoch": 0.8347457627118644,
7501
+ "grad_norm": 1.4121159315109253,
7502
+ "learning_rate": 1.6181310518546856e-06,
7503
+ "loss": 0.0082,
7504
+ "step": 985
7505
+ },
7506
+ {
7507
+ "epoch": 0.8355932203389831,
7508
+ "grad_norm": 3.7406160831451416,
7509
+ "learning_rate": 1.6020343075399425e-06,
7510
+ "loss": 0.0086,
7511
+ "step": 986
7512
+ },
7513
+ {
7514
+ "epoch": 0.8364406779661017,
7515
+ "grad_norm": 0.4382129907608032,
7516
+ "learning_rate": 1.5860110525896143e-06,
7517
+ "loss": 0.0032,
7518
+ "step": 987
7519
+ },
7520
+ {
7521
+ "epoch": 0.8372881355932204,
7522
+ "grad_norm": 1.0554977655410767,
7523
+ "learning_rate": 1.5700614272208492e-06,
7524
+ "loss": 0.0042,
7525
+ "step": 988
7526
+ },
7527
+ {
7528
+ "epoch": 0.838135593220339,
7529
+ "grad_norm": 0.5351442694664001,
7530
+ "learning_rate": 1.5541855710064757e-06,
7531
+ "loss": 0.0021,
7532
+ "step": 989
7533
+ },
7534
+ {
7535
+ "epoch": 0.8389830508474576,
7536
+ "grad_norm": 1.2155871391296387,
7537
+ "learning_rate": 1.5383836228737815e-06,
7538
+ "loss": 0.0059,
7539
+ "step": 990
7540
+ },
7541
+ {
7542
+ "epoch": 0.8398305084745763,
7543
+ "grad_norm": 1.8322945833206177,
7544
+ "learning_rate": 1.522655721103291e-06,
7545
+ "loss": 0.0069,
7546
+ "step": 991
7547
+ },
7548
+ {
7549
+ "epoch": 0.8406779661016949,
7550
+ "grad_norm": 1.3039281368255615,
7551
+ "learning_rate": 1.5070020033275655e-06,
7552
+ "loss": 0.0102,
7553
+ "step": 992
7554
+ },
7555
+ {
7556
+ "epoch": 0.8415254237288136,
7557
+ "grad_norm": 1.6748837232589722,
7558
+ "learning_rate": 1.4914226065299886e-06,
7559
+ "loss": 0.0059,
7560
+ "step": 993
7561
+ },
7562
+ {
7563
+ "epoch": 0.8423728813559322,
7564
+ "grad_norm": 0.4845666289329529,
7565
+ "learning_rate": 1.475917667043575e-06,
7566
+ "loss": 0.0019,
7567
+ "step": 994
7568
+ },
7569
+ {
7570
+ "epoch": 0.8432203389830508,
7571
+ "grad_norm": 0.8964245915412903,
7572
+ "learning_rate": 1.4604873205497727e-06,
7573
+ "loss": 0.002,
7574
+ "step": 995
7575
+ },
7576
+ {
7577
+ "epoch": 0.8440677966101695,
7578
+ "grad_norm": 1.370054841041565,
7579
+ "learning_rate": 1.445131702077277e-06,
7580
+ "loss": 0.0086,
7581
+ "step": 996
7582
+ },
7583
+ {
7584
+ "epoch": 0.8449152542372881,
7585
+ "grad_norm": 2.0046818256378174,
7586
+ "learning_rate": 1.4298509460008491e-06,
7587
+ "loss": 0.0311,
7588
+ "step": 997
7589
+ },
7590
+ {
7591
+ "epoch": 0.8457627118644068,
7592
+ "grad_norm": 1.3406736850738525,
7593
+ "learning_rate": 1.4146451860401445e-06,
7594
+ "loss": 0.0075,
7595
+ "step": 998
7596
+ },
7597
+ {
7598
+ "epoch": 0.8466101694915255,
7599
+ "grad_norm": 0.8433687090873718,
7600
+ "learning_rate": 1.3995145552585321e-06,
7601
+ "loss": 0.0047,
7602
+ "step": 999
7603
+ },
7604
+ {
7605
+ "epoch": 0.847457627118644,
7606
+ "grad_norm": 2.1373324394226074,
7607
+ "learning_rate": 1.3844591860619382e-06,
7608
+ "loss": 0.0084,
7609
+ "step": 1000
7610
+ },
7611
+ {
7612
+ "epoch": 0.847457627118644,
7613
+ "eval_accuracy": 1.0,
7614
+ "eval_f1": 1.0,
7615
+ "eval_loss": 0.0001222841819981113,
7616
+ "eval_precision": 1.0,
7617
+ "eval_recall": 1.0,
7618
+ "eval_runtime": 50.0901,
7619
+ "eval_samples_per_second": 5.83,
7620
+ "eval_steps_per_second": 0.2,
7621
+ "step": 1000
7622
  }
7623
  ],
7624
  "logging_steps": 1,
 
7638
  "attributes": {}
7639
  }
7640
  },
7641
+ "total_flos": 3.076975196163277e+17,
7642
  "train_batch_size": 8,
7643
  "trial_name": null,
7644
  "trial_params": null