Delta-Vector commited on
Commit
4c21acd
·
verified ·
1 Parent(s): 42210b1

Training in progress, step 285, checkpoint

Browse files
Files changed (25) hide show
  1. checkpoint-285/global_step284/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
  2. checkpoint-285/global_step284/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
  3. checkpoint-285/global_step284/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
  4. checkpoint-285/global_step284/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
  5. checkpoint-285/global_step284/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
  6. checkpoint-285/global_step284/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
  7. checkpoint-285/global_step284/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
  8. checkpoint-285/global_step284/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
  9. checkpoint-285/model-00001-of-00014.safetensors +1 -1
  10. checkpoint-285/model-00002-of-00014.safetensors +1 -1
  11. checkpoint-285/model-00003-of-00014.safetensors +1 -1
  12. checkpoint-285/model-00004-of-00014.safetensors +1 -1
  13. checkpoint-285/model-00005-of-00014.safetensors +1 -1
  14. checkpoint-285/model-00006-of-00014.safetensors +1 -1
  15. checkpoint-285/model-00007-of-00014.safetensors +1 -1
  16. checkpoint-285/model-00008-of-00014.safetensors +1 -1
  17. checkpoint-285/model-00009-of-00014.safetensors +1 -1
  18. checkpoint-285/model-00010-of-00014.safetensors +1 -1
  19. checkpoint-285/model-00011-of-00014.safetensors +1 -1
  20. checkpoint-285/model-00012-of-00014.safetensors +1 -1
  21. checkpoint-285/model-00013-of-00014.safetensors +1 -1
  22. checkpoint-285/model-00014-of-00014.safetensors +1 -1
  23. checkpoint-285/scheduler.pt +1 -1
  24. checkpoint-285/trainer_state.json +853 -853
  25. checkpoint-285/training_args.bin +1 -1
checkpoint-285/global_step284/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e313e6e58e9d7cc9981cff1d6c99d37e9c380fab3b300a1d6192707eb85c2d2
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de31ab4cc55b5a62a02311287c1c4727c533b8f311e2a9d2dfc62cc9bc0e13d5
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0914b5489b926c3dda5cab10e5f6a86e334b09c2a40a9c1851d4090541a4011
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4b18d2a78e83956f01de73a12f6e4073ee73a7aecaade2daedc7a9846265f2c
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae091b7ba1f3ee9613567fee6ebd5c9fbbe6caad8d4e357de34b197eea1c14ef
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da7ffd025ed28a5619ead7166b1c8f0c355478aeeb1cd72606519f0bb0c9e8f9
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92b22d8330b661f3f8ad5c49ead6dcb7636d303857fcc55e77beb694f00e6997
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7743641d34754453647cb84c53bc0e4ac5dcbeb0d3bbd2e76863cc9f02a55cf0
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1bf338c3c6f37bdc4cb69f378b263ae3a8919557ab946837a39137d5190b8b6
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1501dda8d48b87a7768c14609361d9ef225fdf4e044114b6901eedcda389ba8d
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edd136028008377466f9e4a32cae5bea5fe1cd040fc91b058cec6be9744ca06e
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2fa2f137899bedcb1e4c119d894eb991a11d22e3083de538541d7d7ebbdae57
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c26d27103a860890516b978ccd73a795d8215578158607db8bb49604190dedae
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6c13d22c1fd3ac104de66b63a8439eff7253e8c8638787dd9de04cb833fb693
3
  size 24702834279
checkpoint-285/global_step284/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f38ae7e2aaf50473a127aebe2d0c5aeeefb0a3d2330d15ac44bd5b67668372d3
3
  size 24702834279
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f42819fba647e363429895c0a5b1c2b06a9a9ec455d353be37c37a503a56ebe
3
  size 24702834279
checkpoint-285/model-00001-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42346fcee4c84b21716f181c28a0c2fdc62a9fadf4b258f468bdc20c9c87ce50
3
  size 4891730992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ea61edbcc041ff874873784b2ea0ed8c3a8e61039e048d21944f403711ba32d
3
  size 4891730992
checkpoint-285/model-00002-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2bded2abc95e816bbfd41d45fdaf02c472b93e39b186c2c67e64c1ef584090c
3
  size 4876059352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6344c83acb079116781c108e03221a8f14aaa0c4cdfecb38d5a31a0fe1d46c
3
  size 4876059352
checkpoint-285/model-00003-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1381b8a8d48b624aea2de2d3e7dcf7e53a5d4dc237f757b8c263eef405faa9d
3
  size 4876059384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd504b64c920ca644ef03455badf3bef40c8f07909e704f1749699769b132792
3
  size 4876059384
checkpoint-285/model-00004-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac15677e790996976ec66bda89206c21574de8ea8939648e68c9c5e5def35123
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4582cc3002f1f8ff564b58a8f2d0be4b54ec8aec6f712f8ef6fb7cee92f8349c
3
  size 4876059416
checkpoint-285/model-00005-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:565a75ee8993f80a7040893b826028fc6daf5578d3ba851232548dd5d51862bf
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6673e14622d95b673ff9ac0e8a50aaf111a50dd1a2e623cea6a27b4c0a3aeb5e
3
  size 4876059416
checkpoint-285/model-00006-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9efb5296f2182c7153f643050a69d2ab537e27fc15b488055775b6ef40f5159
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3733f1990dd750cde99801b3977e22d405b70360761216f9026297e4439d75dd
3
  size 4876059416
checkpoint-285/model-00007-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abfe635f0c526f713e09761d51c5305bd3685d47a4669e02588c3e792d5a1d71
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d6ff20d4c0007e797b00c8621f4a605c61019456d2f2b14642eb703fd478134
3
  size 4876059416
checkpoint-285/model-00008-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c86b874ee5475c8e4698541d2a33a4e89c2a4bb9e78aa3b3175f3e12afc7851e
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e803fcc649cbd9981515476d92049bb420330da7f1ea2e75447e51bb05ec20c1
3
  size 4876059416
checkpoint-285/model-00009-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0993083a57a47d22c37e184fb1fa7dda6ad6c176afb0cad4dbd41951b76a96b7
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5d06042deae94dbaf66a368317cbba0a0e94606aabdf427a36f4504e59eb808
3
  size 4876059416
checkpoint-285/model-00010-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d40d444856e5594ff19f8dd60cc68c223ade5e49a24aa5247616897a01bc473a
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f594cbd3d30f4012a27083a171fd5cf89d94a62324444378425bc97f777313
3
  size 4876059416
checkpoint-285/model-00011-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:296c0eb8bc9910e7f8c211fe26b54269a2e83b438d1a9e2e4967aba5ff9a11e0
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e54aa15ff3f38d10e456a9cdb0e18de707067412dd846593f5e8340fb3ea19e7
3
  size 4876059416
checkpoint-285/model-00012-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:472fb6923b910b57bb3701a378b63e2cea67b082eeb75015675686cbf4340919
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e368c270123c34cb6b7a0666d8bd590dc54d1ea4ccd1ed978816de306af6b4b
3
  size 4876059416
checkpoint-285/model-00013-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cf6936306c154f4646fbef8f831d6945fb42944d4c9491c1b28ae5d4cd73e74
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ec1c581c74495ee32834a958994e33ea0ded5add67ccc7dceb06eda98815f6e
3
  size 4876059416
checkpoint-285/model-00014-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2b3a32254b03b9773e446fd45437e3f723cdc245ac93955ef1773a93c5b0335
3
  size 2123397800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c128cac91acee62f711921c9bd02d95fc475f3bad50fbc3a7f854b1d3c2deb03
3
  size 2123397800
checkpoint-285/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9fe10da9ed8e0631515115e774f569e4b1576f2b9267c4a7ac821a418a1fa16
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2293e17d174bd961d6dabc3bd55c0c178258aa07f87daf3a13f6b70c674f144
3
  size 1064
checkpoint-285/trainer_state.json CHANGED
@@ -10,1997 +10,1997 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.005249343832020997,
13
- "grad_norm": 1.134754623075341,
14
- "learning_rate": 1.0000000000000002e-06,
15
  "loss": 1.1087,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.010498687664041995,
20
- "grad_norm": 1.1234145683168772,
21
- "learning_rate": 2.0000000000000003e-06,
22
  "loss": 1.1356,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.015748031496062992,
27
- "grad_norm": 1.0799860590372758,
28
- "learning_rate": 3e-06,
29
- "loss": 1.1152,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.02099737532808399,
34
- "grad_norm": 0.9984297481710986,
35
- "learning_rate": 4.000000000000001e-06,
36
- "loss": 1.0953,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.026246719160104987,
41
- "grad_norm": 0.8302026280344834,
42
- "learning_rate": 5e-06,
43
- "loss": 1.0617,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.031496062992125984,
48
- "grad_norm": 0.8911823807745126,
49
- "learning_rate": 6e-06,
50
- "loss": 1.1297,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.03674540682414698,
55
- "grad_norm": 0.686211615667355,
56
- "learning_rate": 7e-06,
57
- "loss": 1.0705,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.04199475065616798,
62
- "grad_norm": 0.9091855799181295,
63
- "learning_rate": 8.000000000000001e-06,
64
- "loss": 1.065,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.047244094488188976,
69
- "grad_norm": 0.8934722980371054,
70
- "learning_rate": 9e-06,
71
- "loss": 1.0767,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.05249343832020997,
76
- "grad_norm": 0.8688110393935611,
77
- "learning_rate": 1e-05,
78
- "loss": 1.0303,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.05774278215223097,
83
- "grad_norm": 0.9920393807379069,
84
- "learning_rate": 1.1000000000000001e-05,
85
- "loss": 1.0855,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.06299212598425197,
90
- "grad_norm": 0.9220245541797021,
91
- "learning_rate": 1.2e-05,
92
- "loss": 1.0531,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.06824146981627296,
97
- "grad_norm": 0.736886642754733,
98
- "learning_rate": 1.3000000000000001e-05,
99
- "loss": 1.0456,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.07349081364829396,
104
- "grad_norm": 0.771339891024354,
105
- "learning_rate": 1.4e-05,
106
- "loss": 1.0671,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.07874015748031496,
111
- "grad_norm": 0.7161080553611359,
112
- "learning_rate": 1.5000000000000002e-05,
113
- "loss": 1.0521,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.08398950131233596,
118
- "grad_norm": 0.6788342613059561,
119
- "learning_rate": 1.6000000000000003e-05,
120
- "loss": 1.0674,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.08923884514435695,
125
- "grad_norm": 0.7102848455414168,
126
- "learning_rate": 1.7e-05,
127
- "loss": 1.0459,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.09448818897637795,
132
- "grad_norm": 0.6425246555654909,
133
- "learning_rate": 1.8e-05,
134
- "loss": 1.0093,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.09973753280839895,
139
- "grad_norm": 2.099748819540086,
140
- "learning_rate": 1.9e-05,
141
- "loss": 1.0301,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.10498687664041995,
146
- "grad_norm": 0.6691987921672391,
147
- "learning_rate": 2e-05,
148
- "loss": 1.0199,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.11023622047244094,
153
- "grad_norm": 0.5883655485426926,
154
- "learning_rate": 2.1000000000000002e-05,
155
- "loss": 1.0085,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.11548556430446194,
160
- "grad_norm": 0.5443706631485103,
161
- "learning_rate": 2.2000000000000003e-05,
162
- "loss": 1.0432,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.12073490813648294,
167
- "grad_norm": 0.593023936793411,
168
- "learning_rate": 2.3e-05,
169
- "loss": 1.0196,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.12598425196850394,
174
- "grad_norm": 0.5618656915734137,
175
- "learning_rate": 2.4e-05,
176
- "loss": 1.0386,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.13123359580052493,
181
- "grad_norm": 0.46871710098096486,
182
- "learning_rate": 2.5e-05,
183
- "loss": 0.9611,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.13648293963254593,
188
- "grad_norm": 0.5700902276763852,
189
- "learning_rate": 2.6000000000000002e-05,
190
- "loss": 1.0045,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.14173228346456693,
195
- "grad_norm": 0.603692765386866,
196
- "learning_rate": 2.7000000000000002e-05,
197
- "loss": 1.019,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.14698162729658792,
202
- "grad_norm": 0.48456720859923497,
203
- "learning_rate": 2.8e-05,
204
- "loss": 0.9892,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.15223097112860892,
209
- "grad_norm": 0.45729475661677665,
210
- "learning_rate": 2.9e-05,
211
- "loss": 0.9645,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.15748031496062992,
216
- "grad_norm": 0.5439846777665153,
217
- "learning_rate": 3.0000000000000004e-05,
218
- "loss": 0.9497,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.16272965879265092,
223
- "grad_norm": 0.4965459941185334,
224
- "learning_rate": 3.1e-05,
225
- "loss": 0.9882,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.1679790026246719,
230
- "grad_norm": 0.4656328962534996,
231
- "learning_rate": 3.2000000000000005e-05,
232
- "loss": 1.0057,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.1732283464566929,
237
- "grad_norm": 0.5241601609773927,
238
- "learning_rate": 3.3e-05,
239
- "loss": 1.0033,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1784776902887139,
244
- "grad_norm": 0.5062226992393802,
245
- "learning_rate": 3.4e-05,
246
- "loss": 1.0166,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.1837270341207349,
251
- "grad_norm": 0.43771829747985674,
252
- "learning_rate": 3.5000000000000004e-05,
253
- "loss": 1.0102,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.1889763779527559,
258
- "grad_norm": 0.48092156639697076,
259
- "learning_rate": 3.6e-05,
260
- "loss": 1.018,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1942257217847769,
265
- "grad_norm": 0.48115559949514536,
266
- "learning_rate": 3.7000000000000005e-05,
267
- "loss": 1.0079,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1994750656167979,
272
- "grad_norm": 0.4777546937622387,
273
- "learning_rate": 3.8e-05,
274
- "loss": 1.0085,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.2047244094488189,
279
- "grad_norm": 0.44755392669080185,
280
- "learning_rate": 3.9e-05,
281
- "loss": 0.9825,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.2099737532808399,
286
- "grad_norm": 0.44510881962201315,
287
- "learning_rate": 4e-05,
288
- "loss": 0.9848,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.2152230971128609,
293
- "grad_norm": 0.4746290969046573,
294
- "learning_rate": 3.999914623406736e-05,
295
- "loss": 0.9888,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.2204724409448819,
300
- "grad_norm": 0.5953130701884418,
301
- "learning_rate": 3.9996585009161056e-05,
302
- "loss": 0.9882,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.22572178477690288,
307
- "grad_norm": 0.4251472611705547,
308
- "learning_rate": 3.999231654394975e-05,
309
- "loss": 0.9958,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.23097112860892388,
314
- "grad_norm": 0.44690799367073597,
315
- "learning_rate": 3.9986341202860467e-05,
316
- "loss": 0.9543,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.23622047244094488,
321
- "grad_norm": 0.5976579403936895,
322
- "learning_rate": 3.9978659496047456e-05,
323
- "loss": 0.9762,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.24146981627296588,
328
- "grad_norm": 0.3962092871428472,
329
- "learning_rate": 3.9969272079348685e-05,
330
- "loss": 0.9605,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.24671916010498687,
335
- "grad_norm": 0.43362883575028716,
336
- "learning_rate": 3.995817975422981e-05,
337
- "loss": 0.9456,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.25196850393700787,
342
- "grad_norm": 0.4139776793240363,
343
- "learning_rate": 3.994538346771576e-05,
344
- "loss": 0.9165,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.2572178477690289,
349
- "grad_norm": 0.3940723609427906,
350
- "learning_rate": 3.9930884312309894e-05,
351
- "loss": 0.9071,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.26246719160104987,
356
- "grad_norm": 0.4016006422322008,
357
- "learning_rate": 3.991468352590069e-05,
358
- "loss": 0.9668,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.2677165354330709,
363
- "grad_norm": 0.9528446542157881,
364
- "learning_rate": 3.989678249165612e-05,
365
- "loss": 1.0431,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.27296587926509186,
370
- "grad_norm": 0.41600529189619084,
371
- "learning_rate": 3.987718273790548e-05,
372
- "loss": 0.9464,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2782152230971129,
377
- "grad_norm": 1.1382476752327089,
378
- "learning_rate": 3.9855885938008986e-05,
379
- "loss": 1.0186,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.28346456692913385,
384
- "grad_norm": 0.44849148754190465,
385
- "learning_rate": 3.983289391021486e-05,
386
- "loss": 0.9981,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.2887139107611549,
391
- "grad_norm": 0.4296819710357216,
392
- "learning_rate": 3.9808208617504106e-05,
393
- "loss": 0.9124,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.29396325459317585,
398
- "grad_norm": 1.4708100276334197,
399
- "learning_rate": 3.9781832167422926e-05,
400
- "loss": 1.0627,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2992125984251969,
405
- "grad_norm": 0.436502847615945,
406
- "learning_rate": 3.9753766811902756e-05,
407
- "loss": 0.9399,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.30446194225721784,
412
- "grad_norm": 0.41131082586189677,
413
- "learning_rate": 3.972401494706805e-05,
414
- "loss": 0.9381,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.30971128608923887,
419
- "grad_norm": 0.42792569998778285,
420
- "learning_rate": 3.969257911303167e-05,
421
- "loss": 0.9426,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.31496062992125984,
426
- "grad_norm": 1.0484985550985957,
427
- "learning_rate": 3.965946199367804e-05,
428
- "loss": 1.0745,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.32020997375328086,
433
- "grad_norm": 0.45563925287513607,
434
- "learning_rate": 3.962466641643398e-05,
435
- "loss": 1.0085,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.32545931758530183,
440
- "grad_norm": 0.4216131864169055,
441
- "learning_rate": 3.958819535202732e-05,
442
- "loss": 0.9533,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.33070866141732286,
447
- "grad_norm": 0.47284588975540814,
448
- "learning_rate": 3.9550051914233314e-05,
449
- "loss": 0.9727,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.3359580052493438,
454
- "grad_norm": 0.4112493584955737,
455
- "learning_rate": 3.951023935960874e-05,
456
- "loss": 0.9408,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.34120734908136485,
461
- "grad_norm": 0.44123500755805545,
462
- "learning_rate": 3.9468761087213864e-05,
463
- "loss": 0.9547,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.3464566929133858,
468
- "grad_norm": 0.4160767709488051,
469
- "learning_rate": 3.942562063832228e-05,
470
- "loss": 0.9862,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.35170603674540685,
475
- "grad_norm": 0.40282812591350464,
476
- "learning_rate": 3.9380821696118556e-05,
477
- "loss": 0.9301,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.3569553805774278,
482
- "grad_norm": 0.42252313457664165,
483
- "learning_rate": 3.933436808538375e-05,
484
- "loss": 0.9751,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.36220472440944884,
489
- "grad_norm": 0.4084367556454159,
490
- "learning_rate": 3.92862637721689e-05,
491
- "loss": 0.9838,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.3674540682414698,
496
- "grad_norm": 0.39446053200993564,
497
- "learning_rate": 3.923651286345638e-05,
498
- "loss": 0.9237,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.37270341207349084,
503
- "grad_norm": 0.43051114259650114,
504
- "learning_rate": 3.9185119606809305e-05,
505
- "loss": 0.9543,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3779527559055118,
510
- "grad_norm": 0.41527447901851827,
511
- "learning_rate": 3.913208839000882e-05,
512
- "loss": 0.9688,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.38320209973753283,
517
- "grad_norm": 0.4033220715509175,
518
- "learning_rate": 3.907742374067956e-05,
519
- "loss": 0.9401,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.3884514435695538,
524
- "grad_norm": 0.4039636146150166,
525
- "learning_rate": 3.9021130325903076e-05,
526
- "loss": 0.9621,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3937007874015748,
531
- "grad_norm": 0.3896809489063709,
532
- "learning_rate": 3.896321295181932e-05,
533
- "loss": 0.986,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3989501312335958,
538
- "grad_norm": 0.7547382513819603,
539
- "learning_rate": 3.89036765632164e-05,
540
- "loss": 1.0528,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.4041994750656168,
545
- "grad_norm": 0.42422582617937166,
546
- "learning_rate": 3.8842526243108326e-05,
547
- "loss": 0.9541,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.4094488188976378,
552
- "grad_norm": 0.41581388939730257,
553
- "learning_rate": 3.877976721230114e-05,
554
- "loss": 0.9711,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.4146981627296588,
559
- "grad_norm": 0.4326138308224312,
560
- "learning_rate": 3.8715404828947055e-05,
561
- "loss": 0.9261,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.4199475065616798,
566
- "grad_norm": 0.38852695749391314,
567
- "learning_rate": 3.864944458808712e-05,
568
- "loss": 0.9648,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.4251968503937008,
573
- "grad_norm": 0.3897195092049238,
574
- "learning_rate": 3.8581892121181984e-05,
575
- "loss": 0.9397,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.4304461942257218,
580
- "grad_norm": 0.43934794613481915,
581
- "learning_rate": 3.851275319563113e-05,
582
- "loss": 0.9905,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.4356955380577428,
587
- "grad_norm": 0.5323662587576004,
588
- "learning_rate": 3.844203371428049e-05,
589
- "loss": 0.9896,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.4409448818897638,
594
- "grad_norm": 0.38441956539336747,
595
- "learning_rate": 3.836973971491847e-05,
596
- "loss": 0.9385,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.4461942257217848,
601
- "grad_norm": 0.38662975914153885,
602
- "learning_rate": 3.8295877369760426e-05,
603
- "loss": 0.9586,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.45144356955380577,
608
- "grad_norm": 0.41009140101075614,
609
- "learning_rate": 3.822045298492177e-05,
610
- "loss": 0.9667,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.4566929133858268,
615
- "grad_norm": 0.4258642992742759,
616
- "learning_rate": 3.814347299987953e-05,
617
- "loss": 0.954,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.46194225721784776,
622
- "grad_norm": 0.40527142541860056,
623
- "learning_rate": 3.806494398692258e-05,
624
- "loss": 0.9351,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.4671916010498688,
629
- "grad_norm": 0.3743850574341336,
630
- "learning_rate": 3.7984872650590516e-05,
631
- "loss": 0.9498,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.47244094488188976,
636
- "grad_norm": 0.4151867667600151,
637
- "learning_rate": 3.790326582710125e-05,
638
- "loss": 0.9466,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.4776902887139108,
643
- "grad_norm": 0.4448011376311795,
644
- "learning_rate": 3.782013048376736e-05,
645
- "loss": 1.0266,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.48293963254593175,
650
- "grad_norm": 0.38192124855359877,
651
- "learning_rate": 3.773547371840124e-05,
652
- "loss": 0.978,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.4881889763779528,
657
- "grad_norm": 0.4235778210861527,
658
- "learning_rate": 3.764930275870912e-05,
659
- "loss": 0.9827,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.49343832020997375,
664
- "grad_norm": 0.4051195260626496,
665
- "learning_rate": 3.756162496167396e-05,
666
- "loss": 0.963,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.49868766404199477,
671
- "grad_norm": 0.40700055373961197,
672
- "learning_rate": 3.7472447812927395e-05,
673
- "loss": 0.9437,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.5039370078740157,
678
- "grad_norm": 0.38712614108502513,
679
- "learning_rate": 3.738177892611057e-05,
680
- "loss": 0.955,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.5091863517060368,
685
- "grad_norm": 0.4099596350735423,
686
- "learning_rate": 3.728962604222416e-05,
687
- "loss": 0.9741,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.5144356955380578,
692
- "grad_norm": 0.40040635119594403,
693
- "learning_rate": 3.719599702896745e-05,
694
- "loss": 0.9528,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.5196850393700787,
699
- "grad_norm": 0.4136053200425271,
700
- "learning_rate": 3.710089988006662e-05,
701
- "loss": 0.9466,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.5249343832020997,
706
- "grad_norm": 0.41412239719227456,
707
- "learning_rate": 3.700434271459229e-05,
708
- "loss": 0.9242,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.5301837270341208,
713
- "grad_norm": 0.4309979528684408,
714
- "learning_rate": 3.690633377626628e-05,
715
- "loss": 0.9861,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.5354330708661418,
720
- "grad_norm": 0.4064293156979199,
721
- "learning_rate": 3.680688143275786e-05,
722
- "loss": 0.931,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.5406824146981627,
727
- "grad_norm": 0.4463450853160405,
728
- "learning_rate": 3.670599417496931e-05,
729
- "loss": 0.9084,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.5459317585301837,
734
- "grad_norm": 0.4542877579158036,
735
- "learning_rate": 3.6603680616311013e-05,
736
- "loss": 0.9561,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.5511811023622047,
741
- "grad_norm": 0.4606576229715047,
742
- "learning_rate": 3.6499949491966046e-05,
743
- "loss": 0.9424,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.5564304461942258,
748
- "grad_norm": 1.6662857295077933,
749
- "learning_rate": 3.639480965814443e-05,
750
- "loss": 1.0371,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.5616797900262467,
755
- "grad_norm": 0.42684188670392853,
756
- "learning_rate": 3.628827009132697e-05,
757
- "loss": 0.9635,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.5669291338582677,
762
- "grad_norm": 1.2208350090054685,
763
- "learning_rate": 3.6180339887498953e-05,
764
- "loss": 0.9917,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.5721784776902887,
769
- "grad_norm": 0.4294502318914682,
770
- "learning_rate": 3.6071028261373474e-05,
771
- "loss": 0.9446,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.5774278215223098,
776
- "grad_norm": 0.3937562720593612,
777
- "learning_rate": 3.5960344545604796e-05,
778
- "loss": 0.9278,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.5826771653543307,
783
- "grad_norm": 1.4854854417438403,
784
- "learning_rate": 3.584829818999148e-05,
785
- "loss": 1.0161,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.5879265091863517,
790
- "grad_norm": 0.4240627994414154,
791
- "learning_rate": 3.573489876066967e-05,
792
- "loss": 0.9483,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.5931758530183727,
797
- "grad_norm": 0.3995864923040328,
798
- "learning_rate": 3.5620155939296314e-05,
799
- "loss": 0.9426,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.5984251968503937,
804
- "grad_norm": 0.4085167442197417,
805
- "learning_rate": 3.55040795222226e-05,
806
- "loss": 0.9189,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.6036745406824147,
811
- "grad_norm": 0.411605976954782,
812
- "learning_rate": 3.538667941965758e-05,
813
- "loss": 0.9406,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.6089238845144357,
818
- "grad_norm": 0.4510885035850897,
819
- "learning_rate": 3.526796565482206e-05,
820
- "loss": 0.9609,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.6141732283464567,
825
- "grad_norm": 0.39711542861711363,
826
- "learning_rate": 3.514794836309286e-05,
827
- "loss": 0.9353,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.6194225721784777,
832
- "grad_norm": 0.3860750426711258,
833
- "learning_rate": 3.502663779113747e-05,
834
- "loss": 0.9168,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.6246719160104987,
839
- "grad_norm": 0.4324143866853257,
840
- "learning_rate": 3.490404429603925e-05,
841
- "loss": 0.9412,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.6299212598425197,
846
- "grad_norm": 0.42486288700695524,
847
- "learning_rate": 3.478017834441319e-05,
848
- "loss": 0.9967,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.6351706036745407,
853
- "grad_norm": 0.42059534343716903,
854
- "learning_rate": 3.4655050511512236e-05,
855
- "loss": 0.9042,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.6404199475065617,
860
- "grad_norm": 0.375540386715667,
861
- "learning_rate": 3.452867148032449e-05,
862
- "loss": 0.9261,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.6456692913385826,
867
- "grad_norm": 0.38698966212541075,
868
- "learning_rate": 3.44010520406611e-05,
869
- "loss": 0.9252,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.6509186351706037,
874
- "grad_norm": 0.41709615104288367,
875
- "learning_rate": 3.427220308823505e-05,
876
- "loss": 0.9253,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.6561679790026247,
881
- "grad_norm": 0.4293707133542124,
882
- "learning_rate": 3.4142135623730954e-05,
883
- "loss": 0.9545,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.6614173228346457,
888
- "grad_norm": 0.40563024635145306,
889
- "learning_rate": 3.401086075186582e-05,
890
- "loss": 0.9424,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.6666666666666666,
895
- "grad_norm": 0.47226124502094396,
896
- "learning_rate": 3.3878389680440995e-05,
897
- "loss": 0.9408,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.6719160104986877,
902
- "grad_norm": 0.3921360030995963,
903
- "learning_rate": 3.374473371938526e-05,
904
- "loss": 0.9309,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.6771653543307087,
909
- "grad_norm": 0.4188603496902975,
910
- "learning_rate": 3.3609904279789235e-05,
911
- "loss": 0.9625,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.6824146981627297,
916
- "grad_norm": 0.40729320283126413,
917
- "learning_rate": 3.347391287293115e-05,
918
- "loss": 0.9222,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.6876640419947506,
923
- "grad_norm": 0.43355828675253894,
924
- "learning_rate": 3.333677110929403e-05,
925
- "loss": 0.9245,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.6929133858267716,
930
- "grad_norm": 0.40875412645403303,
931
- "learning_rate": 3.319849069757446e-05,
932
- "loss": 0.9416,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.6981627296587927,
937
- "grad_norm": 0.4184583102080097,
938
- "learning_rate": 3.305908344368289e-05,
939
- "loss": 0.9575,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.7034120734908137,
944
- "grad_norm": 0.37949729176161695,
945
- "learning_rate": 3.291856124973575e-05,
946
- "loss": 0.9283,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.7086614173228346,
951
- "grad_norm": 0.4359197990076154,
952
- "learning_rate": 3.277693611303922e-05,
953
- "loss": 0.9591,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.7139107611548556,
958
- "grad_norm": 0.4127988509227564,
959
- "learning_rate": 3.263422012506502e-05,
960
- "loss": 0.9507,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.7191601049868767,
965
- "grad_norm": 0.4119681718108907,
966
- "learning_rate": 3.249042547041799e-05,
967
- "loss": 0.9252,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.7244094488188977,
972
- "grad_norm": 0.4155554867266832,
973
- "learning_rate": 3.234556442579586e-05,
974
- "loss": 0.9263,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.7296587926509186,
979
- "grad_norm": 0.37277040517135684,
980
- "learning_rate": 3.219964935894114e-05,
981
- "loss": 0.9544,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.7349081364829396,
986
- "grad_norm": 0.41745861140292206,
987
- "learning_rate": 3.205269272758513e-05,
988
- "loss": 0.9213,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.7401574803149606,
993
- "grad_norm": 0.41788351218514774,
994
- "learning_rate": 3.190470707838438e-05,
995
- "loss": 0.9429,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.7454068241469817,
1000
- "grad_norm": 0.3994620013935183,
1001
- "learning_rate": 3.1755705045849465e-05,
1002
- "loss": 0.9065,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.7506561679790026,
1007
- "grad_norm": 0.4006844018528632,
1008
- "learning_rate": 3.160569935126632e-05,
1009
- "loss": 0.9064,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.7559055118110236,
1014
- "grad_norm": 0.44223134289541643,
1015
- "learning_rate": 3.145470280161011e-05,
1016
- "loss": 0.9247,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.7611548556430446,
1021
- "grad_norm": 0.41494829719611687,
1022
- "learning_rate": 3.130272828845184e-05,
1023
- "loss": 0.9126,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.7664041994750657,
1028
- "grad_norm": 0.38947944768031434,
1029
- "learning_rate": 3.114978878685771e-05,
1030
- "loss": 0.8928,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.7716535433070866,
1035
- "grad_norm": 0.3945361927140775,
1036
- "learning_rate": 3.0995897354281347e-05,
1037
- "loss": 0.889,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.7769028871391076,
1042
- "grad_norm": 0.39978716157020916,
1043
- "learning_rate": 3.084106712944899e-05,
1044
- "loss": 0.9227,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.7821522309711286,
1049
- "grad_norm": 0.39603745657551037,
1050
- "learning_rate": 3.068531133123777e-05,
1051
- "loss": 0.8969,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.7874015748031497,
1056
- "grad_norm": 1.3504046960889928,
1057
- "learning_rate": 3.052864325754712e-05,
1058
- "loss": 1.0631,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.7926509186351706,
1063
- "grad_norm": 0.4310383398496922,
1064
- "learning_rate": 3.0371076284163442e-05,
1065
- "loss": 0.9262,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.7979002624671916,
1070
- "grad_norm": 0.41699772424137066,
1071
- "learning_rate": 3.021262386361814e-05,
1072
- "loss": 0.9352,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.8031496062992126,
1077
- "grad_norm": 0.4056852584293386,
1078
- "learning_rate": 3.0053299524039077e-05,
1079
- "loss": 0.8957,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.8083989501312336,
1084
- "grad_norm": 0.4308645558537417,
1085
- "learning_rate": 2.9893116867995583e-05,
1086
- "loss": 0.9137,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.8136482939632546,
1091
- "grad_norm": 0.39136699559712107,
1092
- "learning_rate": 2.9732089571337126e-05,
1093
- "loss": 0.9392,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.8188976377952756,
1098
- "grad_norm": 0.39692286867805615,
1099
- "learning_rate": 2.9570231382025732e-05,
1100
- "loss": 0.9319,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.8241469816272966,
1105
- "grad_norm": 0.389760753952324,
1106
- "learning_rate": 2.9407556118962192e-05,
1107
- "loss": 0.9328,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.8293963254593176,
1112
- "grad_norm": 0.40644738344754366,
1113
- "learning_rate": 2.924407767080627e-05,
1114
- "loss": 0.9511,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.8346456692913385,
1119
- "grad_norm": 0.4235598803780184,
1120
- "learning_rate": 2.9079809994790937e-05,
1121
- "loss": 0.9443,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.8398950131233596,
1126
- "grad_norm": 0.39469735698768543,
1127
- "learning_rate": 2.891476711553077e-05,
1128
- "loss": 0.9353,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.8451443569553806,
1133
- "grad_norm": 0.4231486830962651,
1134
- "learning_rate": 2.8748963123824532e-05,
1135
- "loss": 0.9598,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.8503937007874016,
1140
- "grad_norm": 0.4016499332546737,
1141
- "learning_rate": 2.858241217545218e-05,
1142
- "loss": 0.9182,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.8556430446194225,
1147
- "grad_norm": 0.7416569697844047,
1148
- "learning_rate": 2.8415128489966308e-05,
1149
- "loss": 1.017,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.8608923884514436,
1154
- "grad_norm": 0.4049886957012087,
1155
- "learning_rate": 2.8247126349478073e-05,
1156
- "loss": 0.9377,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.8661417322834646,
1161
- "grad_norm": 0.4240641122781046,
1162
- "learning_rate": 2.80784200974379e-05,
1163
- "loss": 0.936,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.8713910761154856,
1168
- "grad_norm": 0.4010428790320475,
1169
- "learning_rate": 2.790902413741085e-05,
1170
- "loss": 0.9076,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.8766404199475065,
1175
- "grad_norm": 0.40515062001849617,
1176
- "learning_rate": 2.773895293184691e-05,
1177
- "loss": 0.9144,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.8818897637795275,
1182
- "grad_norm": 0.4171752905975984,
1183
- "learning_rate": 2.756822100084621e-05,
1184
- "loss": 0.9302,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.8871391076115486,
1189
- "grad_norm": 0.4018514009140958,
1190
- "learning_rate": 2.7396842920919384e-05,
1191
- "loss": 0.9208,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.8923884514435696,
1196
- "grad_norm": 0.39277733117068253,
1197
- "learning_rate": 2.7224833323743064e-05,
1198
- "loss": 0.9116,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.8976377952755905,
1203
- "grad_norm": 0.6692602521355003,
1204
- "learning_rate": 2.7052206894910653e-05,
1205
- "loss": 1.0122,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.9028871391076115,
1210
- "grad_norm": 0.40843018046933677,
1211
- "learning_rate": 2.6878978372678567e-05,
1212
- "loss": 0.9014,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.9081364829396326,
1217
- "grad_norm": 0.3862093081092539,
1218
- "learning_rate": 2.670516254670788e-05,
1219
- "loss": 0.9367,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.9133858267716536,
1224
- "grad_norm": 0.39106222738031376,
1225
- "learning_rate": 2.6530774256801666e-05,
1226
- "loss": 0.9253,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.9186351706036745,
1231
- "grad_norm": 0.409656789286683,
1232
- "learning_rate": 2.6355828391638036e-05,
1233
- "loss": 0.9259,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.9238845144356955,
1238
- "grad_norm": 0.41048542482358136,
1239
- "learning_rate": 2.618033988749895e-05,
1240
- "loss": 0.9151,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.9291338582677166,
1245
- "grad_norm": 0.39649048041899354,
1246
- "learning_rate": 2.6004323726995057e-05,
1247
- "loss": 0.9197,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.9343832020997376,
1252
- "grad_norm": 0.4041163682720282,
1253
- "learning_rate": 2.5827794937786497e-05,
1254
- "loss": 0.9184,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.9396325459317585,
1259
- "grad_norm": 0.40988870079100986,
1260
- "learning_rate": 2.5650768591299905e-05,
1261
- "loss": 0.9376,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.9448818897637795,
1266
- "grad_norm": 0.3918596025187431,
1267
- "learning_rate": 2.5473259801441663e-05,
1268
- "loss": 0.9102,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.9501312335958005,
1273
- "grad_norm": 0.39082215171197704,
1274
- "learning_rate": 2.5295283723307517e-05,
1275
- "loss": 0.9025,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.9553805774278216,
1280
- "grad_norm": 0.38010414440929924,
1281
- "learning_rate": 2.5116855551888715e-05,
1282
- "loss": 0.9354,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.9606299212598425,
1287
- "grad_norm": 0.4141554447250008,
1288
- "learning_rate": 2.4937990520774664e-05,
1289
- "loss": 0.8782,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.9658792650918635,
1294
- "grad_norm": 0.38201600299774646,
1295
- "learning_rate": 2.4758703900852376e-05,
1296
- "loss": 0.9008,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.9711286089238845,
1301
- "grad_norm": 0.42204019171609175,
1302
- "learning_rate": 2.4579010999002683e-05,
1303
- "loss": 0.8856,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.9763779527559056,
1308
- "grad_norm": 0.4270135581368761,
1309
- "learning_rate": 2.4398927156793376e-05,
1310
- "loss": 0.9205,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.9816272965879265,
1315
- "grad_norm": 0.4150364763728507,
1316
- "learning_rate": 2.42184677491694e-05,
1317
- "loss": 0.8947,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.9868766404199475,
1322
- "grad_norm": 0.51571681852072,
1323
- "learning_rate": 2.4037648183140205e-05,
1324
- "loss": 0.9929,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.9921259842519685,
1329
- "grad_norm": 0.38917079851953085,
1330
- "learning_rate": 2.385648389646434e-05,
1331
- "loss": 0.9121,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.9973753280839895,
1336
- "grad_norm": 0.45530540311855816,
1337
- "learning_rate": 2.367499035633141e-05,
1338
- "loss": 0.9113,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 1.0,
1343
- "grad_norm": 0.45530540311855816,
1344
- "learning_rate": 2.3493183058041578e-05,
1345
- "loss": 0.9347,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 1.005249343832021,
1350
- "grad_norm": 0.7905516160996418,
1351
- "learning_rate": 2.33110775236826e-05,
1352
- "loss": 0.6378,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 1.010498687664042,
1357
- "grad_norm": 0.5781672455733289,
1358
- "learning_rate": 2.312868930080462e-05,
1359
- "loss": 0.639,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 1.015748031496063,
1364
- "grad_norm": 1.0363184255291382,
1365
- "learning_rate": 2.2946033961092754e-05,
1366
- "loss": 0.6442,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 1.020997375328084,
1371
- "grad_norm": 0.4833222962879287,
1372
- "learning_rate": 2.2763127099037646e-05,
1373
- "loss": 0.6441,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 1.026246719160105,
1378
- "grad_norm": 0.5713369686471519,
1379
- "learning_rate": 2.257998433060407e-05,
1380
- "loss": 0.6667,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 1.031496062992126,
1385
- "grad_norm": 0.4764665095808476,
1386
- "learning_rate": 2.2396621291897666e-05,
1387
- "loss": 0.6407,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 1.036745406824147,
1392
- "grad_norm": 0.4588405259893301,
1393
- "learning_rate": 2.2213053637830016e-05,
1394
- "loss": 0.6146,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 1.041994750656168,
1399
- "grad_norm": 0.42224522477353676,
1400
- "learning_rate": 2.2029297040782063e-05,
1401
- "loss": 0.6108,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 1.047244094488189,
1406
- "grad_norm": 0.43495470036978195,
1407
- "learning_rate": 2.184536718926604e-05,
1408
- "loss": 0.6091,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 1.05249343832021,
1413
- "grad_norm": 0.3908883372852395,
1414
- "learning_rate": 2.166127978658608e-05,
1415
- "loss": 0.618,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 1.057742782152231,
1420
- "grad_norm": 0.41668658316901935,
1421
- "learning_rate": 2.147705054949748e-05,
1422
- "loss": 0.6345,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 1.0629921259842519,
1427
- "grad_norm": 0.38162610574666733,
1428
- "learning_rate": 2.1292695206864887e-05,
1429
- "loss": 0.6077,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 1.068241469816273,
1434
- "grad_norm": 0.359243479327879,
1435
- "learning_rate": 2.11082294983194e-05,
1436
- "loss": 0.6008,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 1.073490813648294,
1441
- "grad_norm": 0.338388070214759,
1442
- "learning_rate": 2.0923669172914796e-05,
1443
- "loss": 0.6198,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 1.078740157480315,
1448
- "grad_norm": 0.3736442857329487,
1449
- "learning_rate": 2.0739029987782903e-05,
1450
- "loss": 0.6038,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 1.083989501312336,
1455
- "grad_norm": 0.38343041994799376,
1456
- "learning_rate": 2.055432770678833e-05,
1457
- "loss": 0.6283,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 1.0892388451443569,
1462
- "grad_norm": 0.3839775235441468,
1463
- "learning_rate": 2.03695780991826e-05,
1464
- "loss": 0.621,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 1.094488188976378,
1469
- "grad_norm": 0.4026675034831997,
1470
- "learning_rate": 2.018479693825782e-05,
1471
- "loss": 0.5967,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 1.099737532808399,
1476
- "grad_norm": 0.354604482258031,
1477
- "learning_rate": 2e-05,
1478
- "loss": 0.5747,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 1.10498687664042,
1483
- "grad_norm": 0.37448103017465234,
1484
- "learning_rate": 1.9815203061742188e-05,
1485
- "loss": 0.6207,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 1.110236220472441,
1490
- "grad_norm": 0.3731173545168172,
1491
- "learning_rate": 1.9630421900817407e-05,
1492
- "loss": 0.6658,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 1.1154855643044619,
1497
- "grad_norm": 0.4440821296075456,
1498
- "learning_rate": 1.9445672293211675e-05,
1499
- "loss": 0.6147,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 1.120734908136483,
1504
- "grad_norm": 0.37062388694479104,
1505
- "learning_rate": 1.9260970012217107e-05,
1506
- "loss": 0.6235,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 1.125984251968504,
1511
- "grad_norm": 0.352789345485834,
1512
- "learning_rate": 1.9076330827085214e-05,
1513
- "loss": 0.6186,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 1.1312335958005248,
1518
- "grad_norm": 0.3510249370323435,
1519
- "learning_rate": 1.8891770501680602e-05,
1520
- "loss": 0.5919,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 1.136482939632546,
1525
- "grad_norm": 0.36641452500356003,
1526
- "learning_rate": 1.8707304793135117e-05,
1527
- "loss": 0.6325,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 1.141732283464567,
1532
- "grad_norm": 0.46538331418150264,
1533
- "learning_rate": 1.8522949450502522e-05,
1534
- "loss": 0.6314,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 1.1469816272965878,
1539
- "grad_norm": 0.35158983297625845,
1540
- "learning_rate": 1.8338720213413924e-05,
1541
- "loss": 0.6171,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 1.152230971128609,
1546
- "grad_norm": 0.34039327733619856,
1547
- "learning_rate": 1.815463281073396e-05,
1548
- "loss": 0.5929,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 1.1574803149606299,
1553
- "grad_norm": 0.3583390822597362,
1554
- "learning_rate": 1.7970702959217944e-05,
1555
- "loss": 0.5666,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 1.162729658792651,
1560
- "grad_norm": 0.3328969811354483,
1561
- "learning_rate": 1.7786946362169987e-05,
1562
- "loss": 0.6165,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 1.167979002624672,
1567
- "grad_norm": 0.35808562196234894,
1568
- "learning_rate": 1.760337870810234e-05,
1569
- "loss": 0.5907,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 1.1732283464566928,
1574
- "grad_norm": 0.3330883120593054,
1575
- "learning_rate": 1.742001566939594e-05,
1576
- "loss": 0.6139,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 1.178477690288714,
1581
- "grad_norm": 0.3477344689492317,
1582
- "learning_rate": 1.7236872900962364e-05,
1583
- "loss": 0.5772,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 1.1837270341207349,
1588
- "grad_norm": 0.3636049536281939,
1589
- "learning_rate": 1.705396603890725e-05,
1590
- "loss": 0.6101,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 1.188976377952756,
1595
- "grad_norm": 0.5140813534017206,
1596
- "learning_rate": 1.687131069919538e-05,
1597
- "loss": 0.6455,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 1.194225721784777,
1602
- "grad_norm": 0.3445915498918541,
1603
- "learning_rate": 1.66889224763174e-05,
1604
- "loss": 0.6253,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 1.1994750656167978,
1609
- "grad_norm": 0.3817825400306628,
1610
- "learning_rate": 1.6506816941958425e-05,
1611
- "loss": 0.6264,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 1.204724409448819,
1616
- "grad_norm": 0.3871021865440479,
1617
- "learning_rate": 1.6325009643668592e-05,
1618
- "loss": 0.5875,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 1.20997375328084,
1623
- "grad_norm": 0.4003951939816748,
1624
- "learning_rate": 1.6143516103535666e-05,
1625
- "loss": 0.6068,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 1.2152230971128608,
1630
- "grad_norm": 0.377400380096938,
1631
- "learning_rate": 1.59623518168598e-05,
1632
- "loss": 0.6216,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.220472440944882,
1637
- "grad_norm": 0.3523802116183249,
1638
- "learning_rate": 1.578153225083061e-05,
1639
- "loss": 0.5773,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.2257217847769029,
1644
- "grad_norm": 0.3774158319806266,
1645
- "learning_rate": 1.5601072843206634e-05,
1646
- "loss": 0.6485,
1647
  "step": 234
1648
  },
1649
  {
1650
  "epoch": 1.2309711286089238,
1651
- "grad_norm": 0.35618682756111997,
1652
- "learning_rate": 1.5420989000997324e-05,
1653
- "loss": 0.57,
1654
  "step": 235
1655
  },
1656
  {
1657
  "epoch": 1.236220472440945,
1658
- "grad_norm": 0.329939498867074,
1659
- "learning_rate": 1.524129609914763e-05,
1660
- "loss": 0.5922,
1661
  "step": 236
1662
  },
1663
  {
1664
  "epoch": 1.2414698162729658,
1665
- "grad_norm": 0.3694137451232253,
1666
- "learning_rate": 1.5062009479225336e-05,
1667
- "loss": 0.614,
1668
  "step": 237
1669
  },
1670
  {
1671
  "epoch": 1.246719160104987,
1672
- "grad_norm": 0.33955880982317405,
1673
- "learning_rate": 1.4883144448111288e-05,
1674
- "loss": 0.5734,
1675
  "step": 238
1676
  },
1677
  {
1678
  "epoch": 1.2519685039370079,
1679
- "grad_norm": 0.34083473110033147,
1680
- "learning_rate": 1.4704716276692483e-05,
1681
- "loss": 0.5838,
1682
  "step": 239
1683
  },
1684
  {
1685
  "epoch": 1.257217847769029,
1686
- "grad_norm": 0.33904875318136807,
1687
- "learning_rate": 1.4526740198558345e-05,
1688
- "loss": 0.5721,
1689
  "step": 240
1690
  },
1691
  {
1692
  "epoch": 1.26246719160105,
1693
- "grad_norm": 0.3452861169118219,
1694
- "learning_rate": 1.43492314087001e-05,
1695
- "loss": 0.5978,
1696
  "step": 241
1697
  },
1698
  {
1699
  "epoch": 1.2677165354330708,
1700
- "grad_norm": 0.36203138569672677,
1701
- "learning_rate": 1.417220506221351e-05,
1702
- "loss": 0.6061,
1703
  "step": 242
1704
  },
1705
  {
1706
  "epoch": 1.272965879265092,
1707
- "grad_norm": 0.3423141141716414,
1708
- "learning_rate": 1.3995676273004948e-05,
1709
- "loss": 0.617,
1710
  "step": 243
1711
  },
1712
  {
1713
  "epoch": 1.2782152230971129,
1714
- "grad_norm": 0.35751648695826527,
1715
- "learning_rate": 1.3819660112501054e-05,
1716
- "loss": 0.6393,
1717
  "step": 244
1718
  },
1719
  {
1720
  "epoch": 1.2834645669291338,
1721
- "grad_norm": 0.3703820503916198,
1722
- "learning_rate": 1.364417160836197e-05,
1723
- "loss": 0.6767,
1724
  "step": 245
1725
  },
1726
  {
1727
  "epoch": 1.288713910761155,
1728
- "grad_norm": 0.48807956046543455,
1729
- "learning_rate": 1.3469225743198337e-05,
1730
- "loss": 0.6156,
1731
  "step": 246
1732
  },
1733
  {
1734
  "epoch": 1.2939632545931758,
1735
- "grad_norm": 0.3687500146311602,
1736
- "learning_rate": 1.329483745329213e-05,
1737
- "loss": 0.6587,
1738
  "step": 247
1739
  },
1740
  {
1741
  "epoch": 1.2992125984251968,
1742
- "grad_norm": 0.5835964568778699,
1743
- "learning_rate": 1.3121021627321438e-05,
1744
- "loss": 0.5912,
1745
  "step": 248
1746
  },
1747
  {
1748
  "epoch": 1.304461942257218,
1749
- "grad_norm": 0.35312287301928247,
1750
- "learning_rate": 1.2947793105089347e-05,
1751
- "loss": 0.622,
1752
  "step": 249
1753
  },
1754
  {
1755
  "epoch": 1.3097112860892388,
1756
- "grad_norm": 0.3442037878519564,
1757
- "learning_rate": 1.2775166676256942e-05,
1758
- "loss": 0.5905,
1759
  "step": 250
1760
  },
1761
  {
1762
  "epoch": 1.3149606299212597,
1763
- "grad_norm": 0.3407336842366384,
1764
- "learning_rate": 1.260315707908062e-05,
1765
- "loss": 0.6023,
1766
  "step": 251
1767
  },
1768
  {
1769
  "epoch": 1.3202099737532809,
1770
- "grad_norm": 0.3532792670283708,
1771
- "learning_rate": 1.2431778999153796e-05,
1772
- "loss": 0.5994,
1773
  "step": 252
1774
  },
1775
  {
1776
  "epoch": 1.3254593175853018,
1777
- "grad_norm": 0.33944334283473854,
1778
- "learning_rate": 1.2261047068153098e-05,
1779
- "loss": 0.6136,
1780
  "step": 253
1781
  },
1782
  {
1783
  "epoch": 1.330708661417323,
1784
- "grad_norm": 0.32837150028469475,
1785
- "learning_rate": 1.2090975862589151e-05,
1786
- "loss": 0.6655,
1787
  "step": 254
1788
  },
1789
  {
1790
  "epoch": 1.3359580052493438,
1791
- "grad_norm": 0.4777003027087777,
1792
- "learning_rate": 1.1921579902562103e-05,
1793
- "loss": 0.6005,
1794
  "step": 255
1795
  },
1796
  {
1797
  "epoch": 1.341207349081365,
1798
- "grad_norm": 0.3348911333128937,
1799
- "learning_rate": 1.1752873650521934e-05,
1800
- "loss": 0.5908,
1801
  "step": 256
1802
  },
1803
  {
1804
  "epoch": 1.3464566929133859,
1805
- "grad_norm": 0.32219847234981497,
1806
- "learning_rate": 1.1584871510033707e-05,
1807
- "loss": 0.599,
1808
  "step": 257
1809
  },
1810
  {
1811
  "epoch": 1.3517060367454068,
1812
- "grad_norm": 0.32820186480956876,
1813
- "learning_rate": 1.1417587824547822e-05,
1814
- "loss": 0.5949,
1815
  "step": 258
1816
  },
1817
  {
1818
  "epoch": 1.356955380577428,
1819
- "grad_norm": 0.35813565851987533,
1820
- "learning_rate": 1.1251036876175476e-05,
1821
- "loss": 0.6273,
1822
  "step": 259
1823
  },
1824
  {
1825
  "epoch": 1.3622047244094488,
1826
- "grad_norm": 0.34029610416797407,
1827
- "learning_rate": 1.1085232884469236e-05,
1828
- "loss": 0.6026,
1829
  "step": 260
1830
  },
1831
  {
1832
  "epoch": 1.3674540682414698,
1833
- "grad_norm": 0.3319785571663989,
1834
- "learning_rate": 1.0920190005209066e-05,
1835
- "loss": 0.5993,
1836
  "step": 261
1837
  },
1838
  {
1839
  "epoch": 1.372703412073491,
1840
- "grad_norm": 0.3368980390679111,
1841
- "learning_rate": 1.0755922329193739e-05,
1842
- "loss": 0.5763,
1843
  "step": 262
1844
  },
1845
  {
1846
  "epoch": 1.3779527559055118,
1847
- "grad_norm": 0.32898129554439637,
1848
- "learning_rate": 1.0592443881037816e-05,
1849
- "loss": 0.5687,
1850
  "step": 263
1851
  },
1852
  {
1853
  "epoch": 1.3832020997375327,
1854
- "grad_norm": 0.3518104139897649,
1855
- "learning_rate": 1.0429768617974271e-05,
1856
- "loss": 0.5934,
1857
  "step": 264
1858
  },
1859
  {
1860
  "epoch": 1.3884514435695539,
1861
- "grad_norm": 0.329339944599712,
1862
- "learning_rate": 1.0267910428662878e-05,
1863
- "loss": 0.6384,
1864
  "step": 265
1865
  },
1866
  {
1867
  "epoch": 1.3937007874015748,
1868
- "grad_norm": 0.3385813426254148,
1869
- "learning_rate": 1.0106883132004428e-05,
1870
- "loss": 0.6055,
1871
  "step": 266
1872
  },
1873
  {
1874
  "epoch": 1.3989501312335957,
1875
- "grad_norm": 0.36895054605374145,
1876
- "learning_rate": 9.946700475960933e-06,
1877
- "loss": 0.6145,
1878
  "step": 267
1879
  },
1880
  {
1881
  "epoch": 1.4041994750656168,
1882
- "grad_norm": 0.3996589842241198,
1883
- "learning_rate": 9.787376136381866e-06,
1884
- "loss": 0.5953,
1885
  "step": 268
1886
  },
1887
  {
1888
  "epoch": 1.4094488188976377,
1889
- "grad_norm": 0.32114805305907074,
1890
- "learning_rate": 9.628923715836558e-06,
1891
- "loss": 0.5807,
1892
  "step": 269
1893
  },
1894
  {
1895
  "epoch": 1.4146981627296589,
1896
- "grad_norm": 0.3463735106041919,
1897
- "learning_rate": 9.471356742452881e-06,
1898
- "loss": 0.5991,
1899
  "step": 270
1900
  },
1901
  {
1902
  "epoch": 1.4199475065616798,
1903
- "grad_norm": 0.3218652005510028,
1904
- "learning_rate": 9.314688668762232e-06,
1905
- "loss": 0.615,
1906
  "step": 271
1907
  },
1908
  {
1909
  "epoch": 1.425196850393701,
1910
- "grad_norm": 0.35660358239424667,
1911
- "learning_rate": 9.158932870551012e-06,
1912
- "loss": 0.5915,
1913
  "step": 272
1914
  },
1915
  {
1916
  "epoch": 1.4304461942257218,
1917
- "grad_norm": 0.328623251389403,
1918
- "learning_rate": 9.004102645718655e-06,
1919
- "loss": 0.594,
1920
  "step": 273
1921
  },
1922
  {
1923
  "epoch": 1.4356955380577427,
1924
- "grad_norm": 0.3269276651095935,
1925
- "learning_rate": 8.85021121314229e-06,
1926
- "loss": 0.6032,
1927
  "step": 274
1928
  },
1929
  {
1930
  "epoch": 1.4409448818897639,
1931
- "grad_norm": 0.3563617148746268,
1932
- "learning_rate": 8.697271711548163e-06,
1933
- "loss": 0.5727,
1934
  "step": 275
1935
  },
1936
  {
1937
  "epoch": 1.4461942257217848,
1938
- "grad_norm": 0.3379353625795505,
1939
- "learning_rate": 8.545297198389896e-06,
1940
- "loss": 0.572,
1941
  "step": 276
1942
  },
1943
  {
1944
  "epoch": 1.4514435695538057,
1945
- "grad_norm": 0.32800047402633126,
1946
- "learning_rate": 8.394300648733688e-06,
1947
- "loss": 0.5784,
1948
  "step": 277
1949
  },
1950
  {
1951
  "epoch": 1.4566929133858268,
1952
- "grad_norm": 0.353294123585671,
1953
- "learning_rate": 8.24429495415054e-06,
1954
- "loss": 0.5948,
1955
  "step": 278
1956
  },
1957
  {
1958
  "epoch": 1.4619422572178478,
1959
- "grad_norm": 0.33905615123641586,
1960
- "learning_rate": 8.095292921615628e-06,
1961
- "loss": 0.6164,
1962
  "step": 279
1963
  },
1964
  {
1965
  "epoch": 1.4671916010498687,
1966
- "grad_norm": 0.34568179548405675,
1967
- "learning_rate": 7.947307272414874e-06,
1968
- "loss": 0.587,
1969
  "step": 280
1970
  },
1971
  {
1972
  "epoch": 1.4724409448818898,
1973
- "grad_norm": 0.31900836965125545,
1974
- "learning_rate": 7.800350641058867e-06,
1975
- "loss": 0.5829,
1976
  "step": 281
1977
  },
1978
  {
1979
  "epoch": 1.4776902887139107,
1980
- "grad_norm": 0.33258497484716926,
1981
- "learning_rate": 7.654435574204145e-06,
1982
- "loss": 0.5891,
1983
  "step": 282
1984
  },
1985
  {
1986
  "epoch": 1.4829396325459316,
1987
- "grad_norm": 0.32711726879832903,
1988
- "learning_rate": 7.509574529582022e-06,
1989
- "loss": 0.5915,
1990
  "step": 283
1991
  },
1992
  {
1993
  "epoch": 1.4881889763779528,
1994
- "grad_norm": 0.3332370756155197,
1995
- "learning_rate": 7.365779874934987e-06,
1996
- "loss": 0.5925,
1997
  "step": 284
1998
  },
1999
  {
2000
  "epoch": 1.4934383202099737,
2001
- "grad_norm": 0.3454414557061501,
2002
- "learning_rate": 7.223063886960779e-06,
2003
- "loss": 0.5729,
2004
  "step": 285
2005
  }
2006
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.005249343832020997,
13
+ "grad_norm": 1.1348930782232016,
14
+ "learning_rate": 1.5000000000000002e-07,
15
  "loss": 1.1087,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.010498687664041995,
20
+ "grad_norm": 1.123696373079589,
21
+ "learning_rate": 3.0000000000000004e-07,
22
  "loss": 1.1356,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.015748031496062992,
27
+ "grad_norm": 1.0989081863562118,
28
+ "learning_rate": 4.5e-07,
29
+ "loss": 1.1158,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.02099737532808399,
34
+ "grad_norm": 1.0628548113414964,
35
+ "learning_rate": 6.000000000000001e-07,
36
+ "loss": 1.0986,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.026246719160104987,
41
+ "grad_norm": 1.0629069543612368,
42
+ "learning_rate": 7.5e-07,
43
+ "loss": 1.0727,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.031496062992125984,
48
+ "grad_norm": 1.1219311917213644,
49
+ "learning_rate": 9e-07,
50
+ "loss": 1.1513,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.03674540682414698,
55
+ "grad_norm": 1.068318638334139,
56
+ "learning_rate": 1.05e-06,
57
+ "loss": 1.0978,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.04199475065616798,
62
+ "grad_norm": 1.0335025624008565,
63
+ "learning_rate": 1.2000000000000002e-06,
64
+ "loss": 1.0932,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.047244094488188976,
69
+ "grad_norm": 0.9514112971268772,
70
+ "learning_rate": 1.35e-06,
71
+ "loss": 1.1046,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.05249343832020997,
76
+ "grad_norm": 0.8944230714776324,
77
+ "learning_rate": 1.5e-06,
78
+ "loss": 1.0638,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.05774278215223097,
83
+ "grad_norm": 0.8720343077794245,
84
+ "learning_rate": 1.65e-06,
85
+ "loss": 1.1132,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.06299212598425197,
90
+ "grad_norm": 0.7519518665820406,
91
+ "learning_rate": 1.8e-06,
92
+ "loss": 1.0788,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.06824146981627296,
97
+ "grad_norm": 0.7768466543241798,
98
+ "learning_rate": 1.95e-06,
99
+ "loss": 1.0795,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.07349081364829396,
104
+ "grad_norm": 0.7109922479048013,
105
+ "learning_rate": 2.1e-06,
106
+ "loss": 1.1012,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.07874015748031496,
111
+ "grad_norm": 0.6312078880187205,
112
+ "learning_rate": 2.25e-06,
113
+ "loss": 1.0851,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.08398950131233596,
118
+ "grad_norm": 0.5514473048370377,
119
+ "learning_rate": 2.4000000000000003e-06,
120
+ "loss": 1.1041,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.08923884514435695,
125
+ "grad_norm": 0.6271281070432462,
126
+ "learning_rate": 2.55e-06,
127
+ "loss": 1.0855,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.09448818897637795,
132
+ "grad_norm": 0.7059888078645049,
133
+ "learning_rate": 2.7e-06,
134
+ "loss": 1.0473,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.09973753280839895,
139
+ "grad_norm": 0.7226157330393405,
140
+ "learning_rate": 2.85e-06,
141
+ "loss": 1.0665,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.10498687664041995,
146
+ "grad_norm": 0.7244742832208652,
147
+ "learning_rate": 3e-06,
148
+ "loss": 1.0604,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.11023622047244094,
153
+ "grad_norm": 0.7088251146482789,
154
+ "learning_rate": 3.1500000000000003e-06,
155
+ "loss": 1.0516,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.11548556430446194,
160
+ "grad_norm": 0.5987242362229293,
161
+ "learning_rate": 3.3e-06,
162
+ "loss": 1.084,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.12073490813648294,
167
+ "grad_norm": 0.5730637810768702,
168
+ "learning_rate": 3.45e-06,
169
+ "loss": 1.0621,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.12598425196850394,
174
+ "grad_norm": 0.5894968443138215,
175
+ "learning_rate": 3.6e-06,
176
+ "loss": 1.0797,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.13123359580052493,
181
+ "grad_norm": 0.5798124303184627,
182
+ "learning_rate": 3.75e-06,
183
+ "loss": 1.0035,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.13648293963254593,
188
+ "grad_norm": 0.643205751513686,
189
+ "learning_rate": 3.9e-06,
190
+ "loss": 1.0455,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.14173228346456693,
195
+ "grad_norm": 0.5621970774702022,
196
+ "learning_rate": 4.05e-06,
197
+ "loss": 1.0576,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.14698162729658792,
202
+ "grad_norm": 0.5506084571895594,
203
+ "learning_rate": 4.2e-06,
204
+ "loss": 1.0298,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.15223097112860892,
209
+ "grad_norm": 0.48741149421912777,
210
+ "learning_rate": 4.35e-06,
211
+ "loss": 1.0018,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.15748031496062992,
216
+ "grad_norm": 0.46403007703544275,
217
+ "learning_rate": 4.5e-06,
218
+ "loss": 0.9872,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.16272965879265092,
223
+ "grad_norm": 0.4754381818573106,
224
+ "learning_rate": 4.65e-06,
225
+ "loss": 1.0271,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.1679790026246719,
230
+ "grad_norm": 0.9362850890979981,
231
+ "learning_rate": 4.800000000000001e-06,
232
+ "loss": 1.0437,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.1732283464566929,
237
+ "grad_norm": 0.47391181595772164,
238
+ "learning_rate": 4.95e-06,
239
+ "loss": 1.0437,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1784776902887139,
244
+ "grad_norm": 0.5276920454851337,
245
+ "learning_rate": 5.1e-06,
246
+ "loss": 1.0557,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.1837270341207349,
251
+ "grad_norm": 0.4616075133913133,
252
+ "learning_rate": 5.2500000000000006e-06,
253
+ "loss": 1.0465,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.1889763779527559,
258
+ "grad_norm": 0.4555174555636226,
259
+ "learning_rate": 5.4e-06,
260
+ "loss": 1.0588,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1942257217847769,
265
+ "grad_norm": 0.5071864534648831,
266
+ "learning_rate": 5.55e-06,
267
+ "loss": 1.044,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1994750656167979,
272
+ "grad_norm": 0.4851367263882934,
273
+ "learning_rate": 5.7e-06,
274
+ "loss": 1.0464,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.2047244094488189,
279
+ "grad_norm": 0.44188022228811896,
280
+ "learning_rate": 5.85e-06,
281
+ "loss": 1.0182,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.2099737532808399,
286
+ "grad_norm": 0.43420740120454643,
287
+ "learning_rate": 6e-06,
288
+ "loss": 1.0188,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.2152230971128609,
293
+ "grad_norm": 0.4291543441241407,
294
+ "learning_rate": 5.9998719351101036e-06,
295
+ "loss": 1.0245,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.2204724409448819,
300
+ "grad_norm": 0.43326370236005163,
301
+ "learning_rate": 5.999487751374158e-06,
302
+ "loss": 1.0238,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.22572178477690288,
307
+ "grad_norm": 0.427571644972227,
308
+ "learning_rate": 5.998847481592462e-06,
309
+ "loss": 1.0311,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.23097112860892388,
314
+ "grad_norm": 0.4215063088273006,
315
+ "learning_rate": 5.997951180429069e-06,
316
+ "loss": 0.9925,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.23622047244094488,
321
+ "grad_norm": 0.4206536914503675,
322
+ "learning_rate": 5.996798924407118e-06,
323
+ "loss": 1.003,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.24146981627296588,
328
+ "grad_norm": 0.40910969064965136,
329
+ "learning_rate": 5.995390811902302e-06,
330
+ "loss": 0.9949,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.24671916010498687,
335
+ "grad_norm": 0.4165775049327623,
336
+ "learning_rate": 5.993726963134471e-06,
337
+ "loss": 0.9734,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.25196850393700787,
342
+ "grad_norm": 0.3832235501001726,
343
+ "learning_rate": 5.9918075201573645e-06,
344
+ "loss": 0.9485,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.2572178477690289,
349
+ "grad_norm": 0.37002495168808525,
350
+ "learning_rate": 5.9896326468464835e-06,
351
+ "loss": 0.9358,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.26246719160104987,
356
+ "grad_norm": 0.44836853406053057,
357
+ "learning_rate": 5.987202528885104e-06,
358
+ "loss": 0.9982,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.2677165354330709,
363
+ "grad_norm": 0.4080608606117312,
364
+ "learning_rate": 5.984517373748417e-06,
365
+ "loss": 1.0129,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.27296587926509186,
370
+ "grad_norm": 0.4001550595702573,
371
+ "learning_rate": 5.981577410685822e-06,
372
+ "loss": 0.9788,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2782152230971129,
377
+ "grad_norm": 0.41021488877460305,
378
+ "learning_rate": 5.978382890701347e-06,
379
+ "loss": 1.0262,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.28346456692913385,
384
+ "grad_norm": 0.39997016380492506,
385
+ "learning_rate": 5.9749340865322284e-06,
386
+ "loss": 1.0275,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.2887139107611549,
391
+ "grad_norm": 0.3839823787027912,
392
+ "learning_rate": 5.971231292625615e-06,
393
+ "loss": 0.9374,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.29396325459317585,
398
+ "grad_norm": 0.4125068495663659,
399
+ "learning_rate": 5.967274825113438e-06,
400
+ "loss": 0.9954,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2992125984251969,
405
+ "grad_norm": 0.3908377197765856,
406
+ "learning_rate": 5.963065021785414e-06,
407
+ "loss": 0.9671,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.30446194225721784,
412
+ "grad_norm": 0.3850488592862481,
413
+ "learning_rate": 5.958602242060207e-06,
414
+ "loss": 0.9657,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.30971128608923887,
419
+ "grad_norm": 0.3877990366088493,
420
+ "learning_rate": 5.95388686695475e-06,
421
+ "loss": 0.9678,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.31496062992125984,
426
+ "grad_norm": 0.40470471194287355,
427
+ "learning_rate": 5.948919299051706e-06,
428
+ "loss": 1.0149,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.32020997375328086,
433
+ "grad_norm": 0.42889495063392963,
434
+ "learning_rate": 5.943699962465096e-06,
435
+ "loss": 1.033,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.32545931758530183,
440
+ "grad_norm": 0.39164358737100274,
441
+ "learning_rate": 5.9382293028040985e-06,
442
+ "loss": 0.9761,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.33070866141732286,
447
+ "grad_norm": 0.3869342590567232,
448
+ "learning_rate": 5.9325077871349975e-06,
449
+ "loss": 0.9982,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.3359580052493438,
454
+ "grad_norm": 0.39264627926569035,
455
+ "learning_rate": 5.9265359039413105e-06,
456
+ "loss": 0.9667,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.34120734908136485,
461
+ "grad_norm": 0.3887717698297268,
462
+ "learning_rate": 5.920314163082079e-06,
463
+ "loss": 0.9806,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.3464566929133858,
468
+ "grad_norm": 0.40896336915084297,
469
+ "learning_rate": 5.913843095748342e-06,
470
+ "loss": 1.0135,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.35170603674540685,
475
+ "grad_norm": 0.3610209560875707,
476
+ "learning_rate": 5.907123254417783e-06,
477
+ "loss": 0.956,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.3569553805774278,
482
+ "grad_norm": 0.38154744815823505,
483
+ "learning_rate": 5.9001552128075625e-06,
484
+ "loss": 1.0045,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.36220472440944884,
489
+ "grad_norm": 0.4094826396119445,
490
+ "learning_rate": 5.892939565825335e-06,
491
+ "loss": 1.0069,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.3674540682414698,
496
+ "grad_norm": 0.39129138622932325,
497
+ "learning_rate": 5.885476929518457e-06,
498
+ "loss": 0.9525,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.37270341207349084,
503
+ "grad_norm": 0.3712890701175899,
504
+ "learning_rate": 5.8777679410213956e-06,
505
+ "loss": 0.9792,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3779527559055118,
510
+ "grad_norm": 0.4086264062600148,
511
+ "learning_rate": 5.869813258501323e-06,
512
+ "loss": 0.9926,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.38320209973753283,
517
+ "grad_norm": 0.368975878599487,
518
+ "learning_rate": 5.861613561101934e-06,
519
+ "loss": 0.9643,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.3884514435695538,
524
+ "grad_norm": 0.36792811629461203,
525
+ "learning_rate": 5.853169548885461e-06,
526
+ "loss": 0.9867,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3937007874015748,
531
+ "grad_norm": 0.3566251893981936,
532
+ "learning_rate": 5.844481942772898e-06,
533
+ "loss": 1.0069,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3989501312335958,
538
+ "grad_norm": 0.4578529359685586,
539
+ "learning_rate": 5.835551484482459e-06,
540
+ "loss": 1.0173,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.4041994750656168,
545
+ "grad_norm": 0.3935925285922137,
546
+ "learning_rate": 5.826378936466249e-06,
547
+ "loss": 0.9743,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.4094488188976378,
552
+ "grad_norm": 0.4109939217838428,
553
+ "learning_rate": 5.81696508184517e-06,
554
+ "loss": 0.9866,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.4146981627296588,
559
+ "grad_norm": 0.3839870332489822,
560
+ "learning_rate": 5.807310724342058e-06,
561
+ "loss": 0.9516,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.4199475065616798,
566
+ "grad_norm": 0.3774576797883406,
567
+ "learning_rate": 5.797416688213067e-06,
568
+ "loss": 0.9895,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.4251968503937008,
573
+ "grad_norm": 0.3817468964498129,
574
+ "learning_rate": 5.787283818177297e-06,
575
+ "loss": 0.9632,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.4304461942257218,
580
+ "grad_norm": 0.60843002346461,
581
+ "learning_rate": 5.776912979344669e-06,
582
+ "loss": 1.0166,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.4356955380577428,
587
+ "grad_norm": 0.3858713700245362,
588
+ "learning_rate": 5.766305057142073e-06,
589
+ "loss": 0.9976,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.4409448818897638,
594
+ "grad_norm": 0.3724153436541016,
595
+ "learning_rate": 5.755460957237769e-06,
596
+ "loss": 0.9645,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.4461942257217848,
601
+ "grad_norm": 0.38201105695018567,
602
+ "learning_rate": 5.744381605464064e-06,
603
+ "loss": 0.9899,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.45144356955380577,
608
+ "grad_norm": 0.38383930861007165,
609
+ "learning_rate": 5.7330679477382655e-06,
610
+ "loss": 0.9919,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.4566929133858268,
615
+ "grad_norm": 0.4078870418259581,
616
+ "learning_rate": 5.7215209499819296e-06,
617
+ "loss": 0.9797,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.46194225721784776,
622
+ "grad_norm": 0.38463767466523974,
623
+ "learning_rate": 5.709741598038387e-06,
624
+ "loss": 0.9597,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.4671916010498688,
629
+ "grad_norm": 0.36309855116472584,
630
+ "learning_rate": 5.697730897588577e-06,
631
+ "loss": 0.9737,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.47244094488188976,
636
+ "grad_norm": 0.4106701446638758,
637
+ "learning_rate": 5.685489874065187e-06,
638
+ "loss": 0.9683,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.4776902887139108,
643
+ "grad_norm": 0.37110409255145443,
644
+ "learning_rate": 5.673019572565103e-06,
645
+ "loss": 1.0418,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.48293963254593175,
650
+ "grad_norm": 0.3558357783330656,
651
+ "learning_rate": 5.660321057760186e-06,
652
+ "loss": 1.0055,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.4881889763779528,
657
+ "grad_norm": 0.40499489938404787,
658
+ "learning_rate": 5.6473954138063674e-06,
659
+ "loss": 1.0113,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.49343832020997375,
664
+ "grad_norm": 0.39428526462199764,
665
+ "learning_rate": 5.634243744251094e-06,
666
+ "loss": 0.9875,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.49868766404199477,
671
+ "grad_norm": 0.3711741011240413,
672
+ "learning_rate": 5.620867171939109e-06,
673
+ "loss": 0.9749,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.5039370078740157,
678
+ "grad_norm": 0.3961340085644134,
679
+ "learning_rate": 5.607266838916585e-06,
680
+ "loss": 0.982,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.5091863517060368,
685
+ "grad_norm": 0.3784646685814138,
686
+ "learning_rate": 5.593443906333624e-06,
687
+ "loss": 0.9957,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.5144356955380578,
692
+ "grad_norm": 0.3750460397069026,
693
+ "learning_rate": 5.579399554345118e-06,
694
+ "loss": 0.9755,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.5196850393700787,
699
+ "grad_norm": 0.3746718538274792,
700
+ "learning_rate": 5.565134982009994e-06,
701
+ "loss": 0.9736,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.5249343832020997,
706
+ "grad_norm": 0.38418890409196027,
707
+ "learning_rate": 5.550651407188843e-06,
708
+ "loss": 0.9506,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.5301837270341208,
713
+ "grad_norm": 0.422976375435725,
714
+ "learning_rate": 5.535950066439941e-06,
715
+ "loss": 1.0141,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.5354330708661418,
720
+ "grad_norm": 0.38354451243133536,
721
+ "learning_rate": 5.521032214913679e-06,
722
+ "loss": 0.9618,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.5406824146981627,
727
+ "grad_norm": 0.38257660011773076,
728
+ "learning_rate": 5.505899126245397e-06,
729
+ "loss": 0.939,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.5459317585301837,
734
+ "grad_norm": 0.3768438915225408,
735
+ "learning_rate": 5.490552092446652e-06,
736
+ "loss": 0.9675,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.5511811023622047,
741
+ "grad_norm": 0.3749655286727107,
742
+ "learning_rate": 5.474992423794907e-06,
743
+ "loss": 0.9592,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.5564304461942258,
748
+ "grad_norm": 0.38461916993489687,
749
+ "learning_rate": 5.459221448721664e-06,
750
+ "loss": 0.9623,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.5616797900262467,
755
+ "grad_norm": 0.35648642966931204,
756
+ "learning_rate": 5.443240513699045e-06,
757
+ "loss": 0.985,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.5669291338582677,
762
+ "grad_norm": 0.4051560712719681,
763
+ "learning_rate": 5.427050983124842e-06,
764
+ "loss": 0.9407,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.5721784776902887,
769
+ "grad_norm": 0.3769879713701903,
770
+ "learning_rate": 5.410654239206021e-06,
771
+ "loss": 0.968,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.5774278215223098,
776
+ "grad_norm": 0.3746822083724367,
777
+ "learning_rate": 5.394051681840719e-06,
778
+ "loss": 0.9497,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.5826771653543307,
783
+ "grad_norm": 0.3987231911136733,
784
+ "learning_rate": 5.3772447284987216e-06,
785
+ "loss": 0.961,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.5879265091863517,
790
+ "grad_norm": 0.37848222525971176,
791
+ "learning_rate": 5.36023481410045e-06,
792
+ "loss": 0.9707,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.5931758530183727,
797
+ "grad_norm": 0.3794904855253974,
798
+ "learning_rate": 5.343023390894446e-06,
799
+ "loss": 0.9714,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.5984251968503937,
804
+ "grad_norm": 0.37452267525256994,
805
+ "learning_rate": 5.325611928333389e-06,
806
+ "loss": 0.9406,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.6036745406824147,
811
+ "grad_norm": 0.39474437059829304,
812
+ "learning_rate": 5.308001912948637e-06,
813
+ "loss": 0.9626,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.6089238845144357,
818
+ "grad_norm": 0.4023921986663554,
819
+ "learning_rate": 5.290194848223309e-06,
820
+ "loss": 0.9889,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.6141732283464567,
825
+ "grad_norm": 0.39963771712171875,
826
+ "learning_rate": 5.272192254463929e-06,
827
+ "loss": 0.9639,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.6194225721784777,
832
+ "grad_norm": 0.3893586064595733,
833
+ "learning_rate": 5.2539956686706205e-06,
834
+ "loss": 0.9469,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.6246719160104987,
839
+ "grad_norm": 0.4651495625439333,
840
+ "learning_rate": 5.2356066444058875e-06,
841
+ "loss": 0.9658,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.6299212598425197,
846
+ "grad_norm": 0.39599728107932586,
847
+ "learning_rate": 5.217026751661978e-06,
848
+ "loss": 1.0137,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.6351706036745407,
853
+ "grad_norm": 0.406988761369817,
854
+ "learning_rate": 5.198257576726835e-06,
855
+ "loss": 0.9306,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.6404199475065617,
860
+ "grad_norm": 0.3611939094322339,
861
+ "learning_rate": 5.179300722048673e-06,
862
+ "loss": 0.9462,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.6456692913385826,
867
+ "grad_norm": 0.3809841775392484,
868
+ "learning_rate": 5.1601578060991645e-06,
869
+ "loss": 0.953,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.6509186351706037,
874
+ "grad_norm": 0.46022843064705843,
875
+ "learning_rate": 5.1408304632352575e-06,
876
+ "loss": 0.9422,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.6561679790026247,
881
+ "grad_norm": 0.3979704646560941,
882
+ "learning_rate": 5.1213203435596425e-06,
883
+ "loss": 0.9751,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.6614173228346457,
888
+ "grad_norm": 0.39388496260457084,
889
+ "learning_rate": 5.101629112779873e-06,
890
+ "loss": 0.9722,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.6666666666666666,
895
+ "grad_norm": 0.3899148438115094,
896
+ "learning_rate": 5.08175845206615e-06,
897
+ "loss": 0.9652,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.6719160104986877,
902
+ "grad_norm": 0.37391882787694275,
903
+ "learning_rate": 5.061710057907788e-06,
904
+ "loss": 0.9621,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.6771653543307087,
909
+ "grad_norm": 0.39500875865406576,
910
+ "learning_rate": 5.041485641968385e-06,
911
+ "loss": 0.9899,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.6824146981627297,
916
+ "grad_norm": 0.37540362490802714,
917
+ "learning_rate": 5.021086930939672e-06,
918
+ "loss": 0.9472,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.6876640419947506,
923
+ "grad_norm": 0.3940788728379769,
924
+ "learning_rate": 5.000515666394105e-06,
925
+ "loss": 0.9479,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.6929133858267716,
930
+ "grad_norm": 0.3919125365655477,
931
+ "learning_rate": 4.979773604636169e-06,
932
+ "loss": 0.9624,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.6981627296587927,
937
+ "grad_norm": 0.3804552314744538,
938
+ "learning_rate": 4.958862516552433e-06,
939
+ "loss": 0.9806,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.7034120734908137,
944
+ "grad_norm": 0.3674434286105591,
945
+ "learning_rate": 4.937784187460362e-06,
946
+ "loss": 0.9511,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.7086614173228346,
951
+ "grad_norm": 0.4109777494732396,
952
+ "learning_rate": 4.916540416955884e-06,
953
+ "loss": 0.9943,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.7139107611548556,
958
+ "grad_norm": 0.40231567788837497,
959
+ "learning_rate": 4.895133018759753e-06,
960
+ "loss": 0.9798,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.7191601049868767,
965
+ "grad_norm": 0.3721834479908975,
966
+ "learning_rate": 4.873563820562698e-06,
967
+ "loss": 0.9504,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.7244094488188977,
972
+ "grad_norm": 0.36127526200518306,
973
+ "learning_rate": 4.851834663869379e-06,
974
+ "loss": 0.9517,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.7296587926509186,
979
+ "grad_norm": 0.3513827139135777,
980
+ "learning_rate": 4.82994740384117e-06,
981
+ "loss": 0.9835,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.7349081364829396,
986
+ "grad_norm": 0.36760728272750326,
987
+ "learning_rate": 4.80790390913777e-06,
988
+ "loss": 0.9503,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.7401574803149606,
993
+ "grad_norm": 0.36275280721999276,
994
+ "learning_rate": 4.785706061757656e-06,
995
+ "loss": 0.9743,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.7454068241469817,
1000
+ "grad_norm": 0.3733380512329921,
1001
+ "learning_rate": 4.763355756877419e-06,
1002
+ "loss": 0.9384,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.7506561679790026,
1007
+ "grad_norm": 0.3801691027568987,
1008
+ "learning_rate": 4.740854902689947e-06,
1009
+ "loss": 0.9296,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.7559055118110236,
1014
+ "grad_norm": 0.39053906811778566,
1015
+ "learning_rate": 4.718205420241516e-06,
1016
+ "loss": 0.9488,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.7611548556430446,
1021
+ "grad_norm": 0.3923993707534958,
1022
+ "learning_rate": 4.695409243267776e-06,
1023
+ "loss": 0.9383,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.7664041994750657,
1028
+ "grad_norm": 0.364792552828712,
1029
+ "learning_rate": 4.672468318028657e-06,
1030
+ "loss": 0.9193,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.7716535433070866,
1035
+ "grad_norm": 0.35070825551906964,
1036
+ "learning_rate": 4.649384603142202e-06,
1037
+ "loss": 0.9164,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.7769028871391076,
1042
+ "grad_norm": 0.37099778180795795,
1043
+ "learning_rate": 4.626160069417348e-06,
1044
+ "loss": 0.9425,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.7821522309711286,
1049
+ "grad_norm": 0.36954118968922517,
1050
+ "learning_rate": 4.602796699685665e-06,
1051
+ "loss": 0.9265,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.7874015748031497,
1056
+ "grad_norm": 0.4076466706382121,
1057
+ "learning_rate": 4.579296488632067e-06,
1058
+ "loss": 1.0133,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.7926509186351706,
1063
+ "grad_norm": 0.4015334925568992,
1064
+ "learning_rate": 4.5556614426245165e-06,
1065
+ "loss": 0.9486,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.7979002624671916,
1070
+ "grad_norm": 0.39628644809730684,
1071
+ "learning_rate": 4.5318935795427206e-06,
1072
+ "loss": 0.9605,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.8031496062992126,
1077
+ "grad_norm": 0.36792154742540445,
1078
+ "learning_rate": 4.507994928605862e-06,
1079
+ "loss": 0.9287,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.8083989501312336,
1084
+ "grad_norm": 0.3887839296706913,
1085
+ "learning_rate": 4.483967530199337e-06,
1086
+ "loss": 0.951,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.8136482939632546,
1091
+ "grad_norm": 0.36716852968968616,
1092
+ "learning_rate": 4.459813435700569e-06,
1093
+ "loss": 0.9702,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.8188976377952756,
1098
+ "grad_norm": 0.3533521076976156,
1099
+ "learning_rate": 4.4355347073038595e-06,
1100
+ "loss": 0.9612,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.8241469816272966,
1105
+ "grad_norm": 0.3499649930079787,
1106
+ "learning_rate": 4.411133417844328e-06,
1107
+ "loss": 0.9599,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.8293963254593176,
1112
+ "grad_norm": 0.38582146832565867,
1113
+ "learning_rate": 4.38661165062094e-06,
1114
+ "loss": 0.9894,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.8346456692913385,
1119
+ "grad_norm": 0.39040836855795735,
1120
+ "learning_rate": 4.36197149921864e-06,
1121
+ "loss": 0.9747,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.8398950131233596,
1126
+ "grad_norm": 0.3798580758700489,
1127
+ "learning_rate": 4.3372150673296155e-06,
1128
+ "loss": 0.9654,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.8451443569553806,
1133
+ "grad_norm": 0.3764456540061034,
1134
+ "learning_rate": 4.3123444685736795e-06,
1135
+ "loss": 0.9823,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.8503937007874016,
1140
+ "grad_norm": 0.3771195417830333,
1141
+ "learning_rate": 4.287361826317827e-06,
1142
+ "loss": 0.9456,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.8556430446194225,
1147
+ "grad_norm": 0.37650137746409273,
1148
+ "learning_rate": 4.262269273494946e-06,
1149
+ "loss": 1.0022,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.8608923884514436,
1154
+ "grad_norm": 0.38148353077474145,
1155
+ "learning_rate": 4.237068952421711e-06,
1156
+ "loss": 0.964,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.8661417322834646,
1161
+ "grad_norm": 0.3982519128695332,
1162
+ "learning_rate": 4.2117630146156845e-06,
1163
+ "loss": 0.9673,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.8713910761154856,
1168
+ "grad_norm": 0.36000775624632003,
1169
+ "learning_rate": 4.186353620611627e-06,
1170
+ "loss": 0.9359,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.8766404199475065,
1175
+ "grad_norm": 0.36850454735662447,
1176
+ "learning_rate": 4.160842939777036e-06,
1177
+ "loss": 0.9422,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.8818897637795275,
1182
+ "grad_norm": 0.37804115639757085,
1183
+ "learning_rate": 4.135233150126931e-06,
1184
+ "loss": 0.9454,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.8871391076115486,
1189
+ "grad_norm": 0.3689383402086321,
1190
+ "learning_rate": 4.109526438137908e-06,
1191
+ "loss": 0.9455,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.8923884514435696,
1196
+ "grad_norm": 0.46527154775209717,
1197
+ "learning_rate": 4.08372499856146e-06,
1198
+ "loss": 0.9386,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.8976377952755905,
1203
+ "grad_norm": 0.45653306710128705,
1204
+ "learning_rate": 4.0578310342365975e-06,
1205
+ "loss": 0.9616,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.9028871391076115,
1210
+ "grad_norm": 0.3773630567359451,
1211
+ "learning_rate": 4.031846755901785e-06,
1212
+ "loss": 0.9285,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.9081364829396326,
1217
+ "grad_norm": 0.3644595191521506,
1218
+ "learning_rate": 4.005774382006182e-06,
1219
+ "loss": 0.9663,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.9133858267716536,
1224
+ "grad_norm": 0.3539767481135477,
1225
+ "learning_rate": 3.97961613852025e-06,
1226
+ "loss": 0.9564,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.9186351706036745,
1231
+ "grad_norm": 0.3819676152776953,
1232
+ "learning_rate": 3.953374258745705e-06,
1233
+ "loss": 0.9607,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.9238845144356955,
1238
+ "grad_norm": 0.38397675786726637,
1239
+ "learning_rate": 3.927050983124842e-06,
1240
+ "loss": 0.9539,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.9291338582677166,
1245
+ "grad_norm": 0.3979084367711538,
1246
+ "learning_rate": 3.900648559049258e-06,
1247
+ "loss": 0.9505,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.9343832020997376,
1252
+ "grad_norm": 0.3756154385935223,
1253
+ "learning_rate": 3.874169240667974e-06,
1254
+ "loss": 0.9519,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.9396325459317585,
1259
+ "grad_norm": 0.40551973597201274,
1260
+ "learning_rate": 3.847615288694985e-06,
1261
+ "loss": 0.9727,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.9448818897637795,
1266
+ "grad_norm": 0.4149625851710124,
1267
+ "learning_rate": 3.820988970216249e-06,
1268
+ "loss": 0.9464,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.9501312335958005,
1273
+ "grad_norm": 0.35739115830542967,
1274
+ "learning_rate": 3.7942925584961272e-06,
1275
+ "loss": 0.9427,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.9553805774278216,
1280
+ "grad_norm": 0.3759540038847051,
1281
+ "learning_rate": 3.767528332783307e-06,
1282
+ "loss": 0.9679,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.9606299212598425,
1287
+ "grad_norm": 0.3525867658299593,
1288
+ "learning_rate": 3.740698578116199e-06,
1289
+ "loss": 0.9183,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.9658792650918635,
1294
+ "grad_norm": 0.3557123352774738,
1295
+ "learning_rate": 3.7138055851278564e-06,
1296
+ "loss": 0.9383,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.9711286089238845,
1301
+ "grad_norm": 0.3623514252763418,
1302
+ "learning_rate": 3.6868516498504025e-06,
1303
+ "loss": 0.9246,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.9763779527559056,
1308
+ "grad_norm": 0.38495496418054853,
1309
+ "learning_rate": 3.6598390735190066e-06,
1310
+ "loss": 0.9612,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.9816272965879265,
1315
+ "grad_norm": 0.3648599004428126,
1316
+ "learning_rate": 3.63277016237541e-06,
1317
+ "loss": 0.9293,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.9868766404199475,
1322
+ "grad_norm": 0.38871547084803876,
1323
+ "learning_rate": 3.6056472274710305e-06,
1324
+ "loss": 0.9973,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.9921259842519685,
1329
+ "grad_norm": 0.38590844403642666,
1330
+ "learning_rate": 3.578472584469651e-06,
1331
+ "loss": 0.9457,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.9973753280839895,
1336
+ "grad_norm": 0.3872507088649178,
1337
+ "learning_rate": 3.5512485534497116e-06,
1338
+ "loss": 0.9462,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 1.0,
1343
+ "grad_norm": 0.3872507088649178,
1344
+ "learning_rate": 3.523977458706237e-06,
1345
+ "loss": 0.9693,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 1.005249343832021,
1350
+ "grad_norm": 0.6232728744646114,
1351
+ "learning_rate": 3.49666162855239e-06,
1352
+ "loss": 0.887,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 1.010498687664042,
1357
+ "grad_norm": 0.4149641950734625,
1358
+ "learning_rate": 3.469303395120693e-06,
1359
+ "loss": 0.8826,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 1.015748031496063,
1364
+ "grad_norm": 0.37273340109017755,
1365
+ "learning_rate": 3.441905094163913e-06,
1366
+ "loss": 0.8893,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 1.020997375328084,
1371
+ "grad_norm": 0.4113832689982837,
1372
+ "learning_rate": 3.414469064855647e-06,
1373
+ "loss": 0.9205,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 1.026246719160105,
1378
+ "grad_norm": 0.49485155842511663,
1379
+ "learning_rate": 3.3869976495906104e-06,
1380
+ "loss": 0.9074,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 1.031496062992126,
1385
+ "grad_norm": 0.3736781934252868,
1386
+ "learning_rate": 3.3594931937846498e-06,
1387
+ "loss": 0.8966,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 1.036745406824147,
1392
+ "grad_norm": 0.3758650059773124,
1393
+ "learning_rate": 3.3319580456745023e-06,
1394
+ "loss": 0.8759,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 1.041994750656168,
1399
+ "grad_norm": 0.4056031624712629,
1400
+ "learning_rate": 3.3043945561173092e-06,
1401
+ "loss": 0.8788,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 1.047244094488189,
1406
+ "grad_norm": 0.36344982085137467,
1407
+ "learning_rate": 3.2768050783899063e-06,
1408
+ "loss": 0.873,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 1.05249343832021,
1413
+ "grad_norm": 0.3760103676246,
1414
+ "learning_rate": 3.249191967987912e-06,
1415
+ "loss": 0.899,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 1.057742782152231,
1420
+ "grad_norm": 0.39433477834527153,
1421
+ "learning_rate": 3.221557582424622e-06,
1422
+ "loss": 0.9019,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 1.0629921259842519,
1427
+ "grad_norm": 0.3595753440791428,
1428
+ "learning_rate": 3.1939042810297328e-06,
1429
+ "loss": 0.8781,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 1.068241469816273,
1434
+ "grad_norm": 0.3743448170598354,
1435
+ "learning_rate": 3.16623442474791e-06,
1436
+ "loss": 0.8689,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 1.073490813648294,
1441
+ "grad_norm": 0.3618551186966609,
1442
+ "learning_rate": 3.138550375937219e-06,
1443
+ "loss": 0.9094,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 1.078740157480315,
1448
+ "grad_norm": 0.36577516842050983,
1449
+ "learning_rate": 3.1108544981674356e-06,
1450
+ "loss": 0.8668,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 1.083989501312336,
1455
+ "grad_norm": 0.3985134455319658,
1456
+ "learning_rate": 3.0831491560182495e-06,
1457
+ "loss": 0.9016,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 1.0892388451443569,
1462
+ "grad_norm": 0.37808489525197075,
1463
+ "learning_rate": 3.0554367148773897e-06,
1464
+ "loss": 0.895,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 1.094488188976378,
1469
+ "grad_norm": 0.4112784941005797,
1470
+ "learning_rate": 3.027719540738673e-06,
1471
+ "loss": 0.859,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 1.099737532808399,
1476
+ "grad_norm": 0.3830296759827936,
1477
+ "learning_rate": 3e-06,
1478
+ "loss": 0.8569,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 1.10498687664042,
1483
+ "grad_norm": 0.3930755503999148,
1484
+ "learning_rate": 2.972280459261328e-06,
1485
+ "loss": 0.8774,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 1.110236220472441,
1490
+ "grad_norm": 0.36738851637178116,
1491
+ "learning_rate": 2.944563285122611e-06,
1492
+ "loss": 0.9086,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 1.1154855643044619,
1497
+ "grad_norm": 0.3897160841039193,
1498
+ "learning_rate": 2.9168508439817515e-06,
1499
+ "loss": 0.889,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 1.120734908136483,
1504
+ "grad_norm": 0.39858146379374537,
1505
+ "learning_rate": 2.889145501832566e-06,
1506
+ "loss": 0.8964,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 1.125984251968504,
1511
+ "grad_norm": 0.3739395525411432,
1512
+ "learning_rate": 2.861449624062782e-06,
1513
+ "loss": 0.8884,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 1.1312335958005248,
1518
+ "grad_norm": 0.3755768464864809,
1519
+ "learning_rate": 2.83376557525209e-06,
1520
+ "loss": 0.851,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 1.136482939632546,
1525
+ "grad_norm": 0.38260315757882735,
1526
+ "learning_rate": 2.8060957189702674e-06,
1527
+ "loss": 0.9152,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 1.141732283464567,
1532
+ "grad_norm": 0.4205379839527009,
1533
+ "learning_rate": 2.7784424175753784e-06,
1534
+ "loss": 0.8683,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 1.1469816272965878,
1539
+ "grad_norm": 0.38325260941818995,
1540
+ "learning_rate": 2.7508080320120888e-06,
1541
+ "loss": 0.8943,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 1.152230971128609,
1546
+ "grad_norm": 0.3763198826603672,
1547
+ "learning_rate": 2.7231949216100943e-06,
1548
+ "loss": 0.8676,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 1.1574803149606299,
1553
+ "grad_norm": 0.3767162287387105,
1554
+ "learning_rate": 2.6956054438826918e-06,
1555
+ "loss": 0.8482,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 1.162729658792651,
1560
+ "grad_norm": 0.3486273740901837,
1561
+ "learning_rate": 2.668041954325498e-06,
1562
+ "loss": 0.8879,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 1.167979002624672,
1567
+ "grad_norm": 0.39084218665366566,
1568
+ "learning_rate": 2.640506806215351e-06,
1569
+ "loss": 0.8679,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 1.1732283464566928,
1574
+ "grad_norm": 0.3538552501730603,
1575
+ "learning_rate": 2.613002350409391e-06,
1576
+ "loss": 0.8871,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 1.178477690288714,
1581
+ "grad_norm": 0.36544200913577,
1582
+ "learning_rate": 2.585530935144354e-06,
1583
+ "loss": 0.8616,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 1.1837270341207349,
1588
+ "grad_norm": 0.3985990462573467,
1589
+ "learning_rate": 2.558094905836087e-06,
1590
+ "loss": 0.8917,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 1.188976377952756,
1595
+ "grad_norm": 0.42608518999556655,
1596
+ "learning_rate": 2.5306966048793067e-06,
1597
+ "loss": 0.8817,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 1.194225721784777,
1602
+ "grad_norm": 0.37952769789031354,
1603
+ "learning_rate": 2.5033383714476097e-06,
1604
+ "loss": 0.8985,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 1.1994750656167978,
1609
+ "grad_norm": 0.40804864076806885,
1610
+ "learning_rate": 2.4760225412937633e-06,
1611
+ "loss": 0.9073,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 1.204724409448819,
1616
+ "grad_norm": 0.4167713152946991,
1617
+ "learning_rate": 2.4487514465502885e-06,
1618
+ "loss": 0.8566,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 1.20997375328084,
1623
+ "grad_norm": 0.4022153540631621,
1624
+ "learning_rate": 2.42152741553035e-06,
1625
+ "loss": 0.8713,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 1.2152230971128608,
1630
+ "grad_norm": 0.4222065137992956,
1631
+ "learning_rate": 2.39435277252897e-06,
1632
+ "loss": 0.9035,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.220472440944882,
1637
+ "grad_norm": 0.3666365807384159,
1638
+ "learning_rate": 2.3672298376245908e-06,
1639
+ "loss": 0.8637,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.2257217847769029,
1644
+ "grad_norm": 0.3976853335036615,
1645
+ "learning_rate": 2.3401609264809953e-06,
1646
+ "loss": 0.9398,
1647
  "step": 234
1648
  },
1649
  {
1650
  "epoch": 1.2309711286089238,
1651
+ "grad_norm": 0.37956934109451046,
1652
+ "learning_rate": 2.3131483501495985e-06,
1653
+ "loss": 0.8353,
1654
  "step": 235
1655
  },
1656
  {
1657
  "epoch": 1.236220472440945,
1658
+ "grad_norm": 0.33722056538083744,
1659
+ "learning_rate": 2.2861944148721446e-06,
1660
+ "loss": 0.8786,
1661
  "step": 236
1662
  },
1663
  {
1664
  "epoch": 1.2414698162729658,
1665
+ "grad_norm": 0.49777382093647954,
1666
+ "learning_rate": 2.2593014218838e-06,
1667
+ "loss": 0.8834,
1668
  "step": 237
1669
  },
1670
  {
1671
  "epoch": 1.246719160104987,
1672
+ "grad_norm": 0.35315516410389436,
1673
+ "learning_rate": 2.232471667216693e-06,
1674
+ "loss": 0.8442,
1675
  "step": 238
1676
  },
1677
  {
1678
  "epoch": 1.2519685039370079,
1679
+ "grad_norm": 0.3816124424363711,
1680
+ "learning_rate": 2.2057074415038725e-06,
1681
+ "loss": 0.8573,
1682
  "step": 239
1683
  },
1684
  {
1685
  "epoch": 1.257217847769029,
1686
+ "grad_norm": 0.36319142999803095,
1687
+ "learning_rate": 2.1790110297837514e-06,
1688
+ "loss": 0.8481,
1689
  "step": 240
1690
  },
1691
  {
1692
  "epoch": 1.26246719160105,
1693
+ "grad_norm": 0.34672889281207053,
1694
+ "learning_rate": 2.152384711305015e-06,
1695
+ "loss": 0.8623,
1696
  "step": 241
1697
  },
1698
  {
1699
  "epoch": 1.2677165354330708,
1700
+ "grad_norm": 0.37448151544392105,
1701
+ "learning_rate": 2.1258307593320262e-06,
1702
+ "loss": 0.8751,
1703
  "step": 242
1704
  },
1705
  {
1706
  "epoch": 1.272965879265092,
1707
+ "grad_norm": 0.37082567424502005,
1708
+ "learning_rate": 2.099351440950742e-06,
1709
+ "loss": 0.8914,
1710
  "step": 243
1711
  },
1712
  {
1713
  "epoch": 1.2782152230971129,
1714
+ "grad_norm": 0.39074992783073415,
1715
+ "learning_rate": 2.072949016875158e-06,
1716
+ "loss": 0.9222,
1717
  "step": 244
1718
  },
1719
  {
1720
  "epoch": 1.2834645669291338,
1721
+ "grad_norm": 0.4150437401629804,
1722
+ "learning_rate": 2.046625741254295e-06,
1723
+ "loss": 0.9475,
1724
  "step": 245
1725
  },
1726
  {
1727
  "epoch": 1.288713910761155,
1728
+ "grad_norm": 0.4504166670407193,
1729
+ "learning_rate": 2.0203838614797505e-06,
1730
+ "loss": 0.9026,
1731
  "step": 246
1732
  },
1733
  {
1734
  "epoch": 1.2939632545931758,
1735
+ "grad_norm": 0.38345958484903814,
1736
+ "learning_rate": 1.994225617993819e-06,
1737
+ "loss": 0.9074,
1738
  "step": 247
1739
  },
1740
  {
1741
  "epoch": 1.2992125984251968,
1742
+ "grad_norm": 0.37086048031752866,
1743
+ "learning_rate": 1.9681532440982154e-06,
1744
+ "loss": 0.8755,
1745
  "step": 248
1746
  },
1747
  {
1748
  "epoch": 1.304461942257218,
1749
+ "grad_norm": 0.3775524407980251,
1750
+ "learning_rate": 1.942168965763402e-06,
1751
+ "loss": 0.8986,
1752
  "step": 249
1753
  },
1754
  {
1755
  "epoch": 1.3097112860892388,
1756
+ "grad_norm": 0.364796377340789,
1757
+ "learning_rate": 1.916275001438541e-06,
1758
+ "loss": 0.867,
1759
  "step": 250
1760
  },
1761
  {
1762
  "epoch": 1.3149606299212597,
1763
+ "grad_norm": 0.3705604843330414,
1764
+ "learning_rate": 1.8904735618620928e-06,
1765
+ "loss": 0.8875,
1766
  "step": 251
1767
  },
1768
  {
1769
  "epoch": 1.3202099737532809,
1770
+ "grad_norm": 0.3847344001283667,
1771
+ "learning_rate": 1.8647668498730693e-06,
1772
+ "loss": 0.8678,
1773
  "step": 252
1774
  },
1775
  {
1776
  "epoch": 1.3254593175853018,
1777
+ "grad_norm": 0.3507183610862785,
1778
+ "learning_rate": 1.8391570602229647e-06,
1779
+ "loss": 0.8895,
1780
  "step": 253
1781
  },
1782
  {
1783
  "epoch": 1.330708661417323,
1784
+ "grad_norm": 0.34464955572346173,
1785
+ "learning_rate": 1.8136463793883725e-06,
1786
+ "loss": 0.9112,
1787
  "step": 254
1788
  },
1789
  {
1790
  "epoch": 1.3359580052493438,
1791
+ "grad_norm": 0.3804540728076062,
1792
+ "learning_rate": 1.7882369853843155e-06,
1793
+ "loss": 0.8818,
1794
  "step": 255
1795
  },
1796
  {
1797
  "epoch": 1.341207349081365,
1798
+ "grad_norm": 0.38671544491057547,
1799
+ "learning_rate": 1.76293104757829e-06,
1800
+ "loss": 0.8712,
1801
  "step": 256
1802
  },
1803
  {
1804
  "epoch": 1.3464566929133859,
1805
+ "grad_norm": 0.35028636565033566,
1806
+ "learning_rate": 1.7377307265050559e-06,
1807
+ "loss": 0.8795,
1808
  "step": 257
1809
  },
1810
  {
1811
  "epoch": 1.3517060367454068,
1812
+ "grad_norm": 0.3596694021401425,
1813
+ "learning_rate": 1.7126381736821732e-06,
1814
+ "loss": 0.8791,
1815
  "step": 258
1816
  },
1817
  {
1818
  "epoch": 1.356955380577428,
1819
+ "grad_norm": 0.3833574983214166,
1820
+ "learning_rate": 1.6876555314263213e-06,
1821
+ "loss": 0.9108,
1822
  "step": 259
1823
  },
1824
  {
1825
  "epoch": 1.3622047244094488,
1826
+ "grad_norm": 0.3701840047085969,
1827
+ "learning_rate": 1.6627849326703855e-06,
1828
+ "loss": 0.8695,
1829
  "step": 260
1830
  },
1831
  {
1832
  "epoch": 1.3674540682414698,
1833
+ "grad_norm": 0.36098816535443995,
1834
+ "learning_rate": 1.6380285007813598e-06,
1835
+ "loss": 0.876,
1836
  "step": 261
1837
  },
1838
  {
1839
  "epoch": 1.372703412073491,
1840
+ "grad_norm": 0.3900890284585014,
1841
+ "learning_rate": 1.6133883493790609e-06,
1842
+ "loss": 0.8498,
1843
  "step": 262
1844
  },
1845
  {
1846
  "epoch": 1.3779527559055118,
1847
+ "grad_norm": 0.34906551126755136,
1848
+ "learning_rate": 1.5888665821556724e-06,
1849
+ "loss": 0.8513,
1850
  "step": 263
1851
  },
1852
  {
1853
  "epoch": 1.3832020997375327,
1854
+ "grad_norm": 0.3753732283477496,
1855
+ "learning_rate": 1.5644652926961407e-06,
1856
+ "loss": 0.8714,
1857
  "step": 264
1858
  },
1859
  {
1860
  "epoch": 1.3884514435695539,
1861
+ "grad_norm": 0.34748864593560347,
1862
+ "learning_rate": 1.5401865642994315e-06,
1863
+ "loss": 0.9124,
1864
  "step": 265
1865
  },
1866
  {
1867
  "epoch": 1.3937007874015748,
1868
+ "grad_norm": 0.36698053817770165,
1869
+ "learning_rate": 1.5160324698006642e-06,
1870
+ "loss": 0.8814,
1871
  "step": 266
1872
  },
1873
  {
1874
  "epoch": 1.3989501312335957,
1875
+ "grad_norm": 0.4000964153653425,
1876
+ "learning_rate": 1.4920050713941398e-06,
1877
+ "loss": 0.9082,
1878
  "step": 267
1879
  },
1880
  {
1881
  "epoch": 1.4041994750656168,
1882
+ "grad_norm": 0.3985391177875817,
1883
+ "learning_rate": 1.4681064204572798e-06,
1884
+ "loss": 0.8749,
1885
  "step": 268
1886
  },
1887
  {
1888
  "epoch": 1.4094488188976377,
1889
+ "grad_norm": 0.3578122677174226,
1890
+ "learning_rate": 1.4443385573754837e-06,
1891
+ "loss": 0.8608,
1892
  "step": 269
1893
  },
1894
  {
1895
  "epoch": 1.4146981627296589,
1896
+ "grad_norm": 0.3576093239254431,
1897
+ "learning_rate": 1.4207035113679322e-06,
1898
+ "loss": 0.8798,
1899
  "step": 270
1900
  },
1901
  {
1902
  "epoch": 1.4199475065616798,
1903
+ "grad_norm": 0.35299639204379674,
1904
+ "learning_rate": 1.3972033003143348e-06,
1905
+ "loss": 0.8972,
1906
  "step": 271
1907
  },
1908
  {
1909
  "epoch": 1.425196850393701,
1910
+ "grad_norm": 0.3937775289907907,
1911
+ "learning_rate": 1.3738399305826516e-06,
1912
+ "loss": 0.8736,
1913
  "step": 272
1914
  },
1915
  {
1916
  "epoch": 1.4304461942257218,
1917
+ "grad_norm": 0.3691998032129419,
1918
+ "learning_rate": 1.3506153968577983e-06,
1919
+ "loss": 0.8667,
1920
  "step": 273
1921
  },
1922
  {
1923
  "epoch": 1.4356955380577427,
1924
+ "grad_norm": 0.35764876894907843,
1925
+ "learning_rate": 1.3275316819713435e-06,
1926
+ "loss": 0.882,
1927
  "step": 274
1928
  },
1929
  {
1930
  "epoch": 1.4409448818897639,
1931
+ "grad_norm": 0.3859579688778526,
1932
+ "learning_rate": 1.3045907567322243e-06,
1933
+ "loss": 0.844,
1934
  "step": 275
1935
  },
1936
  {
1937
  "epoch": 1.4461942257217848,
1938
+ "grad_norm": 0.3736621084680505,
1939
+ "learning_rate": 1.2817945797584844e-06,
1940
+ "loss": 0.8525,
1941
  "step": 276
1942
  },
1943
  {
1944
  "epoch": 1.4514435695538057,
1945
+ "grad_norm": 0.36602372507940695,
1946
+ "learning_rate": 1.2591450973100532e-06,
1947
+ "loss": 0.8577,
1948
  "step": 277
1949
  },
1950
  {
1951
  "epoch": 1.4566929133858268,
1952
+ "grad_norm": 0.37926054124030645,
1953
+ "learning_rate": 1.236644243122581e-06,
1954
+ "loss": 0.8837,
1955
  "step": 278
1956
  },
1957
  {
1958
  "epoch": 1.4619422572178478,
1959
+ "grad_norm": 0.3680022216795608,
1960
+ "learning_rate": 1.214293938242344e-06,
1961
+ "loss": 0.8984,
1962
  "step": 279
1963
  },
1964
  {
1965
  "epoch": 1.4671916010498687,
1966
+ "grad_norm": 0.37824901927870175,
1967
+ "learning_rate": 1.1920960908622313e-06,
1968
+ "loss": 0.8745,
1969
  "step": 280
1970
  },
1971
  {
1972
  "epoch": 1.4724409448818898,
1973
+ "grad_norm": 0.3489273490529577,
1974
+ "learning_rate": 1.17005259615883e-06,
1975
+ "loss": 0.8628,
1976
  "step": 281
1977
  },
1978
  {
1979
  "epoch": 1.4776902887139107,
1980
+ "grad_norm": 0.3735770062938505,
1981
+ "learning_rate": 1.1481653361306215e-06,
1982
+ "loss": 0.8619,
1983
  "step": 282
1984
  },
1985
  {
1986
  "epoch": 1.4829396325459316,
1987
+ "grad_norm": 0.3458041443504503,
1988
+ "learning_rate": 1.1264361794373032e-06,
1989
+ "loss": 0.8761,
1990
  "step": 283
1991
  },
1992
  {
1993
  "epoch": 1.4881889763779528,
1994
+ "grad_norm": 0.35998420937846626,
1995
+ "learning_rate": 1.104866981240248e-06,
1996
+ "loss": 0.8844,
1997
  "step": 284
1998
  },
1999
  {
2000
  "epoch": 1.4934383202099737,
2001
+ "grad_norm": 0.4029178073367971,
2002
+ "learning_rate": 1.0834595830441168e-06,
2003
+ "loss": 0.8511,
2004
  "step": 285
2005
  }
2006
  ],
checkpoint-285/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c0d2528dcfd8d62d3c517248c2d231cc9ff64ec148911ec3ce58a9d39f7507d
3
  size 8376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b860c7e838727b1a9d8001f5c7a769bd0d63566ea45620719245b6beb59f1cd9
3
  size 8376