Delta-Vector commited on
Commit
867107c
·
verified ·
1 Parent(s): 441a49e

Training in progress, step 190, checkpoint

Browse files
Files changed (25) hide show
  1. checkpoint-190/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
  2. checkpoint-190/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
  3. checkpoint-190/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
  4. checkpoint-190/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
  5. checkpoint-190/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
  6. checkpoint-190/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
  7. checkpoint-190/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
  8. checkpoint-190/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
  9. checkpoint-190/model-00001-of-00014.safetensors +1 -1
  10. checkpoint-190/model-00002-of-00014.safetensors +1 -1
  11. checkpoint-190/model-00003-of-00014.safetensors +1 -1
  12. checkpoint-190/model-00004-of-00014.safetensors +1 -1
  13. checkpoint-190/model-00005-of-00014.safetensors +1 -1
  14. checkpoint-190/model-00006-of-00014.safetensors +1 -1
  15. checkpoint-190/model-00007-of-00014.safetensors +1 -1
  16. checkpoint-190/model-00008-of-00014.safetensors +1 -1
  17. checkpoint-190/model-00009-of-00014.safetensors +1 -1
  18. checkpoint-190/model-00010-of-00014.safetensors +1 -1
  19. checkpoint-190/model-00011-of-00014.safetensors +1 -1
  20. checkpoint-190/model-00012-of-00014.safetensors +1 -1
  21. checkpoint-190/model-00013-of-00014.safetensors +1 -1
  22. checkpoint-190/model-00014-of-00014.safetensors +1 -1
  23. checkpoint-190/scheduler.pt +1 -1
  24. checkpoint-190/trainer_state.json +568 -568
  25. checkpoint-190/training_args.bin +1 -1
checkpoint-190/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b492522c2621fa7d92d5ff09018a973c1df0d001023dd5e699e678724925175
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3541776382759386e52f0febd1a8b68f39637b1f8eb3461e0fe89a2a0974fa44
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eec645b36b324b5fe84fe66c847bc6746912a74b8b88f6635098a0a2be848480
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be8d8831089f5b892a3007b0a90a5b1de44307d0bbaa5bf248d2ff4f79104511
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:630a3721e91b3d394a18c9993888327e7cdddc335ee1078f3c0333f3ce9d4154
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17c8b54f4d34fbad6bb8aff69ffa500110379ebf67c0761fbe31a5df5f77c135
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03ef347f349eafadf1ae2a608a554fa11fceb5ef686196b2da393aedfe3b7f33
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc499ea3adaa850557203b81697209d997b0371ecc644f4c8777c0eb4dff8171
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5a107a64876bd77ca0e4f0729d0c5061f4f88fb0c018452e13e47940fa3d228
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4b6533c980a8af3289b020538036f2bf41a8eb3b4b4eb28db6d20141f4a0634
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8dc14bdfb207b8713ca3c5883a415b02c5217c474de2b8acccd7cd7f5ae84ee5
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f3fedf06eb0d61469718f363b311c03e2d4ceca30d821dc87cb28a9db1e55ea
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad0bbe907243dea037d499e0a8de68d0cc758c153b80b09ce13b0627621066b5
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:790dd79ef120938a1d9e3d111e973c57a2e485b09964130662916cbde34aba4b
3
  size 24702833511
checkpoint-190/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cb9a8426ac2ce4a68ce970141f26e9c0ffb3c18ee53bdf4b70ccf8645f9cfdf
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47bb110081dc718553a91dbe4a2d855ca0ad096e15cf603009ef75f6e518c7c9
3
  size 24702833511
checkpoint-190/model-00001-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:169232493df11cd7afc50eda21baa777e39c981fd4b7532a4d118c74aced6bc2
3
  size 4891730992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18655898de26c2499a9cb998d3172238b900528f8fa051e28c1501fb6d494816
3
  size 4891730992
checkpoint-190/model-00002-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:575ada9ddc7e810c27fb8b24f651a060e9460bc8325a13e6758abf6ea7603bf4
3
  size 4876059352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6895a87d6de8264c3f9aee6c5665c7608a9f68a4969405dd24797b80c73f31b6
3
  size 4876059352
checkpoint-190/model-00003-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:599e4db2d87fc3e8375087a2555203189347749fe218ca223e7569c709590c70
3
  size 4876059384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11a264f96be19b45eddfad9a94a935de63c8d99fc302c11d54c720bf92f6566d
3
  size 4876059384
checkpoint-190/model-00004-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6d3cbf63b56a6a020c03b876ba95035e075b68eeb7e9c4ea43db3460a372a36
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6a0ce88e4be7470c6e3cc62f9223fddd4feb157d9896d9a32f8631d88bb8848
3
  size 4876059416
checkpoint-190/model-00005-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:503b6ec7a60c0e3cf96ea96e26493a29c22a48d90dd569489fbf662ffde0aea6
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0fb3e67f4dcd8905765f154bd890cfe2cbe75eb7401290bf180ea998d9ee3a6
3
  size 4876059416
checkpoint-190/model-00006-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1b7c58f9141cc0273e15c98858fd64c2d8b15a8d1a7adc2c4518029a51df24f
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50435db08c6497948c82e3d506a47d4a15199fc275beaf85b55b5f0e05e57f13
3
  size 4876059416
checkpoint-190/model-00007-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29d71b6095735d91c1c0861482781e11235495280bfa315bf926c67ab86e9510
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3e79c4c319a7f7cdb2e5b90018fc8ea6a45052f907ba6c6ba7b45954eaa51f
3
  size 4876059416
checkpoint-190/model-00008-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edfe2b5a16d63446025ae5bb0a482e7c8653b31d22027a0c3b8bd5dec25b8f5f
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b38fbac984d88974d253478c02ee53d934e6fac621a94a54214407763017be4e
3
  size 4876059416
checkpoint-190/model-00009-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28f29721a3b0bba86d4c77a8079840d6de1e5129a7850fcbc5233b83662ca122
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:441da3091300b55242e64b08951ce667fc6ab8ee839cffb39cb23cfae9d42eaf
3
  size 4876059416
checkpoint-190/model-00010-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e0d7fcd56e2ee76a15fec1e09c962a7ab0729c3bb19fdaa7898aebf261fb8a1
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e66b2a986f40f3f99406d49d2d37abc3116a27020c9e980cc090c0e9c969b26
3
  size 4876059416
checkpoint-190/model-00011-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0242233c6be0bdae8cc2c3b09ca08d2c30e076f313857b41287501e2b27c1573
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:087c4681c8cbc8ef8169b2baa08b721bf7ec92dd88deea86d5c40423821b4126
3
  size 4876059416
checkpoint-190/model-00012-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbfec0c2ec4c9be22e6c1cc19f25703c0c862c89887a9249e1983e6bcb833f35
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ef256a0d3458c76c22a6fa4d279b290cf1e67d9e4ba82e9e373aafe5092993
3
  size 4876059416
checkpoint-190/model-00013-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0787260fa4443b2c10b418a926e9cfecf1b28c0644ea1d027bc4716a8667000
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:081ac51de4f8c0228616b03d106e3c9bf23584b4f74a7a40194630762ea06a97
3
  size 4876059416
checkpoint-190/model-00014-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4154cf30c46a55b5596e1268d3e94bb325a4e3407b69c79c7eb95a92d908737
3
  size 2123397800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bc918a617fbfec918d94d48c04a00906ac5fe72e6c9bba9d9d2bdb61e72d13
3
  size 2123397800
checkpoint-190/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f7ded6ee49549061f8b11e49b6f24cbfd319f9bce6c5b9fb15680661980fc08
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f572459ea633e6969f294dad697709db1ff09e4ab9bad307878e319454f0651
3
  size 1064
checkpoint-190/trainer_state.json CHANGED
@@ -10,1332 +10,1332 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.005249343832020997,
13
- "grad_norm": 1.134754623075341,
14
- "learning_rate": 1.0000000000000002e-06,
15
  "loss": 1.1087,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.010498687664041995,
20
- "grad_norm": 1.1234145683168772,
21
- "learning_rate": 2.0000000000000003e-06,
22
  "loss": 1.1356,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.015748031496062992,
27
- "grad_norm": 1.0799860590372758,
28
- "learning_rate": 3e-06,
29
- "loss": 1.1152,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.02099737532808399,
34
- "grad_norm": 0.9984297481710986,
35
- "learning_rate": 4.000000000000001e-06,
36
- "loss": 1.0953,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.026246719160104987,
41
- "grad_norm": 0.8302026280344834,
42
- "learning_rate": 5e-06,
43
- "loss": 1.0617,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.031496062992125984,
48
- "grad_norm": 0.8911823807745126,
49
- "learning_rate": 6e-06,
50
- "loss": 1.1297,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.03674540682414698,
55
- "grad_norm": 0.686211615667355,
56
- "learning_rate": 7e-06,
57
- "loss": 1.0705,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.04199475065616798,
62
- "grad_norm": 0.9091855799181295,
63
- "learning_rate": 8.000000000000001e-06,
64
- "loss": 1.065,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.047244094488188976,
69
- "grad_norm": 0.8934722980371054,
70
- "learning_rate": 9e-06,
71
- "loss": 1.0767,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.05249343832020997,
76
- "grad_norm": 0.8688110393935611,
77
- "learning_rate": 1e-05,
78
- "loss": 1.0303,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.05774278215223097,
83
- "grad_norm": 0.9920393807379069,
84
- "learning_rate": 1.1000000000000001e-05,
85
- "loss": 1.0855,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.06299212598425197,
90
- "grad_norm": 0.9220245541797021,
91
- "learning_rate": 1.2e-05,
92
- "loss": 1.0531,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.06824146981627296,
97
- "grad_norm": 0.736886642754733,
98
- "learning_rate": 1.3000000000000001e-05,
99
- "loss": 1.0456,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.07349081364829396,
104
- "grad_norm": 0.771339891024354,
105
- "learning_rate": 1.4e-05,
106
- "loss": 1.0671,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.07874015748031496,
111
- "grad_norm": 0.7161080553611359,
112
- "learning_rate": 1.5000000000000002e-05,
113
- "loss": 1.0521,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.08398950131233596,
118
- "grad_norm": 0.6788342613059561,
119
- "learning_rate": 1.6000000000000003e-05,
120
- "loss": 1.0674,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.08923884514435695,
125
- "grad_norm": 0.7102848455414168,
126
- "learning_rate": 1.7e-05,
127
- "loss": 1.0459,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.09448818897637795,
132
- "grad_norm": 0.6425246555654909,
133
- "learning_rate": 1.8e-05,
134
- "loss": 1.0093,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.09973753280839895,
139
- "grad_norm": 2.099748819540086,
140
- "learning_rate": 1.9e-05,
141
- "loss": 1.0301,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.10498687664041995,
146
- "grad_norm": 0.6691987921672391,
147
- "learning_rate": 2e-05,
148
- "loss": 1.0199,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.11023622047244094,
153
- "grad_norm": 0.5883655485426926,
154
- "learning_rate": 2.1000000000000002e-05,
155
- "loss": 1.0085,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.11548556430446194,
160
- "grad_norm": 0.5443706631485103,
161
- "learning_rate": 2.2000000000000003e-05,
162
- "loss": 1.0432,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.12073490813648294,
167
- "grad_norm": 0.593023936793411,
168
- "learning_rate": 2.3e-05,
169
- "loss": 1.0196,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.12598425196850394,
174
- "grad_norm": 0.5618656915734137,
175
- "learning_rate": 2.4e-05,
176
- "loss": 1.0386,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.13123359580052493,
181
- "grad_norm": 0.46871710098096486,
182
- "learning_rate": 2.5e-05,
183
- "loss": 0.9611,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.13648293963254593,
188
- "grad_norm": 0.5700902276763852,
189
- "learning_rate": 2.6000000000000002e-05,
190
- "loss": 1.0045,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.14173228346456693,
195
- "grad_norm": 0.603692765386866,
196
- "learning_rate": 2.7000000000000002e-05,
197
- "loss": 1.019,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.14698162729658792,
202
- "grad_norm": 0.48456720859923497,
203
- "learning_rate": 2.8e-05,
204
- "loss": 0.9892,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.15223097112860892,
209
- "grad_norm": 0.45729475661677665,
210
- "learning_rate": 2.9e-05,
211
- "loss": 0.9645,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.15748031496062992,
216
- "grad_norm": 0.5439846777665153,
217
- "learning_rate": 3.0000000000000004e-05,
218
- "loss": 0.9497,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.16272965879265092,
223
- "grad_norm": 0.4965459941185334,
224
- "learning_rate": 3.1e-05,
225
- "loss": 0.9882,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.1679790026246719,
230
- "grad_norm": 0.4656328962534996,
231
- "learning_rate": 3.2000000000000005e-05,
232
- "loss": 1.0057,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.1732283464566929,
237
- "grad_norm": 0.5241601609773927,
238
- "learning_rate": 3.3e-05,
239
- "loss": 1.0033,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1784776902887139,
244
- "grad_norm": 0.5062226992393802,
245
- "learning_rate": 3.4e-05,
246
- "loss": 1.0166,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.1837270341207349,
251
- "grad_norm": 0.43771829747985674,
252
- "learning_rate": 3.5000000000000004e-05,
253
- "loss": 1.0102,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.1889763779527559,
258
- "grad_norm": 0.48092156639697076,
259
- "learning_rate": 3.6e-05,
260
- "loss": 1.018,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1942257217847769,
265
- "grad_norm": 0.48115559949514536,
266
- "learning_rate": 3.7000000000000005e-05,
267
- "loss": 1.0079,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1994750656167979,
272
- "grad_norm": 0.4777546937622387,
273
- "learning_rate": 3.8e-05,
274
- "loss": 1.0085,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.2047244094488189,
279
- "grad_norm": 0.44755392669080185,
280
- "learning_rate": 3.9e-05,
281
- "loss": 0.9825,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.2099737532808399,
286
- "grad_norm": 0.44510881962201315,
287
- "learning_rate": 4e-05,
288
- "loss": 0.9848,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.2152230971128609,
293
- "grad_norm": 0.4746290969046573,
294
- "learning_rate": 3.999914623406736e-05,
295
- "loss": 0.9888,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.2204724409448819,
300
- "grad_norm": 0.5953130701884418,
301
- "learning_rate": 3.9996585009161056e-05,
302
- "loss": 0.9882,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.22572178477690288,
307
- "grad_norm": 0.4251472611705547,
308
- "learning_rate": 3.999231654394975e-05,
309
- "loss": 0.9958,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.23097112860892388,
314
- "grad_norm": 0.44690799367073597,
315
- "learning_rate": 3.9986341202860467e-05,
316
- "loss": 0.9543,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.23622047244094488,
321
- "grad_norm": 0.5976579403936895,
322
- "learning_rate": 3.9978659496047456e-05,
323
- "loss": 0.9762,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.24146981627296588,
328
- "grad_norm": 0.3962092871428472,
329
- "learning_rate": 3.9969272079348685e-05,
330
- "loss": 0.9605,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.24671916010498687,
335
- "grad_norm": 0.43362883575028716,
336
- "learning_rate": 3.995817975422981e-05,
337
- "loss": 0.9456,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.25196850393700787,
342
- "grad_norm": 0.4139776793240363,
343
- "learning_rate": 3.994538346771576e-05,
344
- "loss": 0.9165,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.2572178477690289,
349
- "grad_norm": 0.3940723609427906,
350
- "learning_rate": 3.9930884312309894e-05,
351
- "loss": 0.9071,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.26246719160104987,
356
- "grad_norm": 0.4016006422322008,
357
- "learning_rate": 3.991468352590069e-05,
358
- "loss": 0.9668,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.2677165354330709,
363
- "grad_norm": 0.9528446542157881,
364
- "learning_rate": 3.989678249165612e-05,
365
- "loss": 1.0431,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.27296587926509186,
370
- "grad_norm": 0.41600529189619084,
371
- "learning_rate": 3.987718273790548e-05,
372
- "loss": 0.9464,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2782152230971129,
377
- "grad_norm": 1.1382476752327089,
378
- "learning_rate": 3.9855885938008986e-05,
379
- "loss": 1.0186,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.28346456692913385,
384
- "grad_norm": 0.44849148754190465,
385
- "learning_rate": 3.983289391021486e-05,
386
- "loss": 0.9981,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.2887139107611549,
391
- "grad_norm": 0.4296819710357216,
392
- "learning_rate": 3.9808208617504106e-05,
393
- "loss": 0.9124,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.29396325459317585,
398
- "grad_norm": 1.4708100276334197,
399
- "learning_rate": 3.9781832167422926e-05,
400
- "loss": 1.0627,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2992125984251969,
405
- "grad_norm": 0.436502847615945,
406
- "learning_rate": 3.9753766811902756e-05,
407
- "loss": 0.9399,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.30446194225721784,
412
- "grad_norm": 0.41131082586189677,
413
- "learning_rate": 3.972401494706805e-05,
414
- "loss": 0.9381,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.30971128608923887,
419
- "grad_norm": 0.42792569998778285,
420
- "learning_rate": 3.969257911303167e-05,
421
- "loss": 0.9426,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.31496062992125984,
426
- "grad_norm": 1.0484985550985957,
427
- "learning_rate": 3.965946199367804e-05,
428
- "loss": 1.0745,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.32020997375328086,
433
- "grad_norm": 0.45563925287513607,
434
- "learning_rate": 3.962466641643398e-05,
435
- "loss": 1.0085,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.32545931758530183,
440
- "grad_norm": 0.4216131864169055,
441
- "learning_rate": 3.958819535202732e-05,
442
- "loss": 0.9533,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.33070866141732286,
447
- "grad_norm": 0.47284588975540814,
448
- "learning_rate": 3.9550051914233314e-05,
449
- "loss": 0.9727,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.3359580052493438,
454
- "grad_norm": 0.4112493584955737,
455
- "learning_rate": 3.951023935960874e-05,
456
- "loss": 0.9408,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.34120734908136485,
461
- "grad_norm": 0.44123500755805545,
462
- "learning_rate": 3.9468761087213864e-05,
463
- "loss": 0.9547,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.3464566929133858,
468
- "grad_norm": 0.4160767709488051,
469
- "learning_rate": 3.942562063832228e-05,
470
- "loss": 0.9862,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.35170603674540685,
475
- "grad_norm": 0.40282812591350464,
476
- "learning_rate": 3.9380821696118556e-05,
477
- "loss": 0.9301,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.3569553805774278,
482
- "grad_norm": 0.42252313457664165,
483
- "learning_rate": 3.933436808538375e-05,
484
- "loss": 0.9751,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.36220472440944884,
489
- "grad_norm": 0.4084367556454159,
490
- "learning_rate": 3.92862637721689e-05,
491
- "loss": 0.9838,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.3674540682414698,
496
- "grad_norm": 0.39446053200993564,
497
- "learning_rate": 3.923651286345638e-05,
498
- "loss": 0.9237,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.37270341207349084,
503
- "grad_norm": 0.43051114259650114,
504
- "learning_rate": 3.9185119606809305e-05,
505
- "loss": 0.9543,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3779527559055118,
510
- "grad_norm": 0.41527447901851827,
511
- "learning_rate": 3.913208839000882e-05,
512
- "loss": 0.9688,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.38320209973753283,
517
- "grad_norm": 0.4033220715509175,
518
- "learning_rate": 3.907742374067956e-05,
519
- "loss": 0.9401,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.3884514435695538,
524
- "grad_norm": 0.4039636146150166,
525
- "learning_rate": 3.9021130325903076e-05,
526
- "loss": 0.9621,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3937007874015748,
531
- "grad_norm": 0.3896809489063709,
532
- "learning_rate": 3.896321295181932e-05,
533
- "loss": 0.986,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3989501312335958,
538
- "grad_norm": 0.7547382513819603,
539
- "learning_rate": 3.89036765632164e-05,
540
- "loss": 1.0528,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.4041994750656168,
545
- "grad_norm": 0.42422582617937166,
546
- "learning_rate": 3.8842526243108326e-05,
547
- "loss": 0.9541,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.4094488188976378,
552
- "grad_norm": 0.41581388939730257,
553
- "learning_rate": 3.877976721230114e-05,
554
- "loss": 0.9711,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.4146981627296588,
559
- "grad_norm": 0.4326138308224312,
560
- "learning_rate": 3.8715404828947055e-05,
561
- "loss": 0.9261,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.4199475065616798,
566
- "grad_norm": 0.38852695749391314,
567
- "learning_rate": 3.864944458808712e-05,
568
- "loss": 0.9648,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.4251968503937008,
573
- "grad_norm": 0.3897195092049238,
574
- "learning_rate": 3.8581892121181984e-05,
575
- "loss": 0.9397,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.4304461942257218,
580
- "grad_norm": 0.43934794613481915,
581
- "learning_rate": 3.851275319563113e-05,
582
- "loss": 0.9905,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.4356955380577428,
587
- "grad_norm": 0.5323662587576004,
588
- "learning_rate": 3.844203371428049e-05,
589
- "loss": 0.9896,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.4409448818897638,
594
- "grad_norm": 0.38441956539336747,
595
- "learning_rate": 3.836973971491847e-05,
596
- "loss": 0.9385,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.4461942257217848,
601
- "grad_norm": 0.38662975914153885,
602
- "learning_rate": 3.8295877369760426e-05,
603
- "loss": 0.9586,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.45144356955380577,
608
- "grad_norm": 0.41009140101075614,
609
- "learning_rate": 3.822045298492177e-05,
610
- "loss": 0.9667,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.4566929133858268,
615
- "grad_norm": 0.4258642992742759,
616
- "learning_rate": 3.814347299987953e-05,
617
- "loss": 0.954,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.46194225721784776,
622
- "grad_norm": 0.40527142541860056,
623
- "learning_rate": 3.806494398692258e-05,
624
- "loss": 0.9351,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.4671916010498688,
629
- "grad_norm": 0.3743850574341336,
630
- "learning_rate": 3.7984872650590516e-05,
631
- "loss": 0.9498,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.47244094488188976,
636
- "grad_norm": 0.4151867667600151,
637
- "learning_rate": 3.790326582710125e-05,
638
- "loss": 0.9466,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.4776902887139108,
643
- "grad_norm": 0.4448011376311795,
644
- "learning_rate": 3.782013048376736e-05,
645
- "loss": 1.0266,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.48293963254593175,
650
- "grad_norm": 0.38192124855359877,
651
- "learning_rate": 3.773547371840124e-05,
652
- "loss": 0.978,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.4881889763779528,
657
- "grad_norm": 0.4235778210861527,
658
- "learning_rate": 3.764930275870912e-05,
659
- "loss": 0.9827,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.49343832020997375,
664
- "grad_norm": 0.4051195260626496,
665
- "learning_rate": 3.756162496167396e-05,
666
- "loss": 0.963,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.49868766404199477,
671
- "grad_norm": 0.40700055373961197,
672
- "learning_rate": 3.7472447812927395e-05,
673
- "loss": 0.9437,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.5039370078740157,
678
- "grad_norm": 0.38712614108502513,
679
- "learning_rate": 3.738177892611057e-05,
680
- "loss": 0.955,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.5091863517060368,
685
- "grad_norm": 0.4099596350735423,
686
- "learning_rate": 3.728962604222416e-05,
687
- "loss": 0.9741,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.5144356955380578,
692
- "grad_norm": 0.40040635119594403,
693
- "learning_rate": 3.719599702896745e-05,
694
- "loss": 0.9528,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.5196850393700787,
699
- "grad_norm": 0.4136053200425271,
700
- "learning_rate": 3.710089988006662e-05,
701
- "loss": 0.9466,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.5249343832020997,
706
- "grad_norm": 0.41412239719227456,
707
- "learning_rate": 3.700434271459229e-05,
708
- "loss": 0.9242,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.5301837270341208,
713
- "grad_norm": 0.4309979528684408,
714
- "learning_rate": 3.690633377626628e-05,
715
- "loss": 0.9861,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.5354330708661418,
720
- "grad_norm": 0.4064293156979199,
721
- "learning_rate": 3.680688143275786e-05,
722
- "loss": 0.931,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.5406824146981627,
727
- "grad_norm": 0.4463450853160405,
728
- "learning_rate": 3.670599417496931e-05,
729
- "loss": 0.9084,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.5459317585301837,
734
- "grad_norm": 0.4542877579158036,
735
- "learning_rate": 3.6603680616311013e-05,
736
- "loss": 0.9561,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.5511811023622047,
741
- "grad_norm": 0.4606576229715047,
742
- "learning_rate": 3.6499949491966046e-05,
743
- "loss": 0.9424,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.5564304461942258,
748
- "grad_norm": 1.6662857295077933,
749
- "learning_rate": 3.639480965814443e-05,
750
- "loss": 1.0371,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.5616797900262467,
755
- "grad_norm": 0.42684188670392853,
756
- "learning_rate": 3.628827009132697e-05,
757
- "loss": 0.9635,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.5669291338582677,
762
- "grad_norm": 1.2208350090054685,
763
- "learning_rate": 3.6180339887498953e-05,
764
- "loss": 0.9917,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.5721784776902887,
769
- "grad_norm": 0.4294502318914682,
770
- "learning_rate": 3.6071028261373474e-05,
771
- "loss": 0.9446,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.5774278215223098,
776
- "grad_norm": 0.3937562720593612,
777
- "learning_rate": 3.5960344545604796e-05,
778
- "loss": 0.9278,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.5826771653543307,
783
- "grad_norm": 1.4854854417438403,
784
- "learning_rate": 3.584829818999148e-05,
785
- "loss": 1.0161,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.5879265091863517,
790
- "grad_norm": 0.4240627994414154,
791
- "learning_rate": 3.573489876066967e-05,
792
- "loss": 0.9483,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.5931758530183727,
797
- "grad_norm": 0.3995864923040328,
798
- "learning_rate": 3.5620155939296314e-05,
799
- "loss": 0.9426,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.5984251968503937,
804
- "grad_norm": 0.4085167442197417,
805
- "learning_rate": 3.55040795222226e-05,
806
- "loss": 0.9189,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.6036745406824147,
811
- "grad_norm": 0.411605976954782,
812
- "learning_rate": 3.538667941965758e-05,
813
- "loss": 0.9406,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.6089238845144357,
818
- "grad_norm": 0.4510885035850897,
819
- "learning_rate": 3.526796565482206e-05,
820
- "loss": 0.9609,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.6141732283464567,
825
- "grad_norm": 0.39711542861711363,
826
- "learning_rate": 3.514794836309286e-05,
827
- "loss": 0.9353,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.6194225721784777,
832
- "grad_norm": 0.3860750426711258,
833
- "learning_rate": 3.502663779113747e-05,
834
- "loss": 0.9168,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.6246719160104987,
839
- "grad_norm": 0.4324143866853257,
840
- "learning_rate": 3.490404429603925e-05,
841
- "loss": 0.9412,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.6299212598425197,
846
- "grad_norm": 0.42486288700695524,
847
- "learning_rate": 3.478017834441319e-05,
848
- "loss": 0.9967,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.6351706036745407,
853
- "grad_norm": 0.42059534343716903,
854
- "learning_rate": 3.4655050511512236e-05,
855
- "loss": 0.9042,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.6404199475065617,
860
- "grad_norm": 0.375540386715667,
861
- "learning_rate": 3.452867148032449e-05,
862
- "loss": 0.9261,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.6456692913385826,
867
- "grad_norm": 0.38698966212541075,
868
- "learning_rate": 3.44010520406611e-05,
869
- "loss": 0.9252,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.6509186351706037,
874
- "grad_norm": 0.41709615104288367,
875
- "learning_rate": 3.427220308823505e-05,
876
- "loss": 0.9253,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.6561679790026247,
881
- "grad_norm": 0.4293707133542124,
882
- "learning_rate": 3.4142135623730954e-05,
883
- "loss": 0.9545,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.6614173228346457,
888
- "grad_norm": 0.40563024635145306,
889
- "learning_rate": 3.401086075186582e-05,
890
- "loss": 0.9424,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.6666666666666666,
895
- "grad_norm": 0.47226124502094396,
896
- "learning_rate": 3.3878389680440995e-05,
897
- "loss": 0.9408,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.6719160104986877,
902
- "grad_norm": 0.3921360030995963,
903
- "learning_rate": 3.374473371938526e-05,
904
- "loss": 0.9309,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.6771653543307087,
909
- "grad_norm": 0.4188603496902975,
910
- "learning_rate": 3.3609904279789235e-05,
911
- "loss": 0.9625,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.6824146981627297,
916
- "grad_norm": 0.40729320283126413,
917
- "learning_rate": 3.347391287293115e-05,
918
- "loss": 0.9222,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.6876640419947506,
923
- "grad_norm": 0.43355828675253894,
924
- "learning_rate": 3.333677110929403e-05,
925
- "loss": 0.9245,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.6929133858267716,
930
- "grad_norm": 0.40875412645403303,
931
- "learning_rate": 3.319849069757446e-05,
932
- "loss": 0.9416,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.6981627296587927,
937
- "grad_norm": 0.4184583102080097,
938
- "learning_rate": 3.305908344368289e-05,
939
- "loss": 0.9575,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.7034120734908137,
944
- "grad_norm": 0.37949729176161695,
945
- "learning_rate": 3.291856124973575e-05,
946
- "loss": 0.9283,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.7086614173228346,
951
- "grad_norm": 0.4359197990076154,
952
- "learning_rate": 3.277693611303922e-05,
953
- "loss": 0.9591,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.7139107611548556,
958
- "grad_norm": 0.4127988509227564,
959
- "learning_rate": 3.263422012506502e-05,
960
- "loss": 0.9507,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.7191601049868767,
965
- "grad_norm": 0.4119681718108907,
966
- "learning_rate": 3.249042547041799e-05,
967
- "loss": 0.9252,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.7244094488188977,
972
- "grad_norm": 0.4155554867266832,
973
- "learning_rate": 3.234556442579586e-05,
974
- "loss": 0.9263,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.7296587926509186,
979
- "grad_norm": 0.37277040517135684,
980
- "learning_rate": 3.219964935894114e-05,
981
- "loss": 0.9544,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.7349081364829396,
986
- "grad_norm": 0.41745861140292206,
987
- "learning_rate": 3.205269272758513e-05,
988
- "loss": 0.9213,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.7401574803149606,
993
- "grad_norm": 0.41788351218514774,
994
- "learning_rate": 3.190470707838438e-05,
995
- "loss": 0.9429,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.7454068241469817,
1000
- "grad_norm": 0.3994620013935183,
1001
- "learning_rate": 3.1755705045849465e-05,
1002
- "loss": 0.9065,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.7506561679790026,
1007
- "grad_norm": 0.4006844018528632,
1008
- "learning_rate": 3.160569935126632e-05,
1009
- "loss": 0.9064,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.7559055118110236,
1014
- "grad_norm": 0.44223134289541643,
1015
- "learning_rate": 3.145470280161011e-05,
1016
- "loss": 0.9247,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.7611548556430446,
1021
- "grad_norm": 0.41494829719611687,
1022
- "learning_rate": 3.130272828845184e-05,
1023
- "loss": 0.9126,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.7664041994750657,
1028
- "grad_norm": 0.38947944768031434,
1029
- "learning_rate": 3.114978878685771e-05,
1030
- "loss": 0.8928,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.7716535433070866,
1035
- "grad_norm": 0.3945361927140775,
1036
- "learning_rate": 3.0995897354281347e-05,
1037
- "loss": 0.889,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.7769028871391076,
1042
- "grad_norm": 0.39978716157020916,
1043
- "learning_rate": 3.084106712944899e-05,
1044
- "loss": 0.9227,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.7821522309711286,
1049
- "grad_norm": 0.39603745657551037,
1050
- "learning_rate": 3.068531133123777e-05,
1051
- "loss": 0.8969,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.7874015748031497,
1056
- "grad_norm": 1.3504046960889928,
1057
- "learning_rate": 3.052864325754712e-05,
1058
- "loss": 1.0631,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.7926509186351706,
1063
- "grad_norm": 0.4310383398496922,
1064
- "learning_rate": 3.0371076284163442e-05,
1065
- "loss": 0.9262,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.7979002624671916,
1070
- "grad_norm": 0.41699772424137066,
1071
- "learning_rate": 3.021262386361814e-05,
1072
- "loss": 0.9352,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.8031496062992126,
1077
- "grad_norm": 0.4056852584293386,
1078
- "learning_rate": 3.0053299524039077e-05,
1079
- "loss": 0.8957,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.8083989501312336,
1084
- "grad_norm": 0.4308645558537417,
1085
- "learning_rate": 2.9893116867995583e-05,
1086
- "loss": 0.9137,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.8136482939632546,
1091
- "grad_norm": 0.39136699559712107,
1092
- "learning_rate": 2.9732089571337126e-05,
1093
- "loss": 0.9392,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.8188976377952756,
1098
- "grad_norm": 0.39692286867805615,
1099
- "learning_rate": 2.9570231382025732e-05,
1100
- "loss": 0.9319,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.8241469816272966,
1105
- "grad_norm": 0.389760753952324,
1106
- "learning_rate": 2.9407556118962192e-05,
1107
- "loss": 0.9328,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.8293963254593176,
1112
- "grad_norm": 0.40644738344754366,
1113
- "learning_rate": 2.924407767080627e-05,
1114
- "loss": 0.9511,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.8346456692913385,
1119
- "grad_norm": 0.4235598803780184,
1120
- "learning_rate": 2.9079809994790937e-05,
1121
- "loss": 0.9443,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.8398950131233596,
1126
- "grad_norm": 0.39469735698768543,
1127
- "learning_rate": 2.891476711553077e-05,
1128
- "loss": 0.9353,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.8451443569553806,
1133
- "grad_norm": 0.4231486830962651,
1134
- "learning_rate": 2.8748963123824532e-05,
1135
- "loss": 0.9598,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.8503937007874016,
1140
- "grad_norm": 0.4016499332546737,
1141
- "learning_rate": 2.858241217545218e-05,
1142
- "loss": 0.9182,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.8556430446194225,
1147
- "grad_norm": 0.7416569697844047,
1148
- "learning_rate": 2.8415128489966308e-05,
1149
- "loss": 1.017,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.8608923884514436,
1154
- "grad_norm": 0.4049886957012087,
1155
- "learning_rate": 2.8247126349478073e-05,
1156
- "loss": 0.9377,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.8661417322834646,
1161
- "grad_norm": 0.4240641122781046,
1162
- "learning_rate": 2.80784200974379e-05,
1163
- "loss": 0.936,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.8713910761154856,
1168
- "grad_norm": 0.4010428790320475,
1169
- "learning_rate": 2.790902413741085e-05,
1170
- "loss": 0.9076,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.8766404199475065,
1175
- "grad_norm": 0.40515062001849617,
1176
- "learning_rate": 2.773895293184691e-05,
1177
- "loss": 0.9144,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.8818897637795275,
1182
- "grad_norm": 0.4171752905975984,
1183
- "learning_rate": 2.756822100084621e-05,
1184
- "loss": 0.9302,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.8871391076115486,
1189
- "grad_norm": 0.4018514009140958,
1190
- "learning_rate": 2.7396842920919384e-05,
1191
- "loss": 0.9208,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.8923884514435696,
1196
- "grad_norm": 0.39277733117068253,
1197
- "learning_rate": 2.7224833323743064e-05,
1198
- "loss": 0.9116,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.8976377952755905,
1203
- "grad_norm": 0.6692602521355003,
1204
- "learning_rate": 2.7052206894910653e-05,
1205
- "loss": 1.0122,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.9028871391076115,
1210
- "grad_norm": 0.40843018046933677,
1211
- "learning_rate": 2.6878978372678567e-05,
1212
- "loss": 0.9014,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.9081364829396326,
1217
- "grad_norm": 0.3862093081092539,
1218
- "learning_rate": 2.670516254670788e-05,
1219
- "loss": 0.9367,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.9133858267716536,
1224
- "grad_norm": 0.39106222738031376,
1225
- "learning_rate": 2.6530774256801666e-05,
1226
- "loss": 0.9253,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.9186351706036745,
1231
- "grad_norm": 0.409656789286683,
1232
- "learning_rate": 2.6355828391638036e-05,
1233
- "loss": 0.9259,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.9238845144356955,
1238
- "grad_norm": 0.41048542482358136,
1239
- "learning_rate": 2.618033988749895e-05,
1240
- "loss": 0.9151,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.9291338582677166,
1245
- "grad_norm": 0.39649048041899354,
1246
- "learning_rate": 2.6004323726995057e-05,
1247
- "loss": 0.9197,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.9343832020997376,
1252
- "grad_norm": 0.4041163682720282,
1253
- "learning_rate": 2.5827794937786497e-05,
1254
- "loss": 0.9184,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.9396325459317585,
1259
- "grad_norm": 0.40988870079100986,
1260
- "learning_rate": 2.5650768591299905e-05,
1261
- "loss": 0.9376,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.9448818897637795,
1266
- "grad_norm": 0.3918596025187431,
1267
- "learning_rate": 2.5473259801441663e-05,
1268
- "loss": 0.9102,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.9501312335958005,
1273
- "grad_norm": 0.39082215171197704,
1274
- "learning_rate": 2.5295283723307517e-05,
1275
- "loss": 0.9025,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.9553805774278216,
1280
- "grad_norm": 0.38010414440929924,
1281
- "learning_rate": 2.5116855551888715e-05,
1282
- "loss": 0.9354,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.9606299212598425,
1287
- "grad_norm": 0.4141554447250008,
1288
- "learning_rate": 2.4937990520774664e-05,
1289
- "loss": 0.8782,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.9658792650918635,
1294
- "grad_norm": 0.38201600299774646,
1295
- "learning_rate": 2.4758703900852376e-05,
1296
- "loss": 0.9008,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.9711286089238845,
1301
- "grad_norm": 0.42204019171609175,
1302
- "learning_rate": 2.4579010999002683e-05,
1303
- "loss": 0.8856,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.9763779527559056,
1308
- "grad_norm": 0.4270135581368761,
1309
- "learning_rate": 2.4398927156793376e-05,
1310
- "loss": 0.9205,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.9816272965879265,
1315
- "grad_norm": 0.4150364763728507,
1316
- "learning_rate": 2.42184677491694e-05,
1317
- "loss": 0.8947,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.9868766404199475,
1322
- "grad_norm": 0.51571681852072,
1323
- "learning_rate": 2.4037648183140205e-05,
1324
- "loss": 0.9929,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.9921259842519685,
1329
- "grad_norm": 0.38917079851953085,
1330
- "learning_rate": 2.385648389646434e-05,
1331
- "loss": 0.9121,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.9973753280839895,
1336
- "grad_norm": 0.45530540311855816,
1337
- "learning_rate": 2.367499035633141e-05,
1338
- "loss": 0.9113,
1339
  "step": 190
1340
  }
1341
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.005249343832020997,
13
+ "grad_norm": 1.1348930782232016,
14
+ "learning_rate": 1.5000000000000002e-07,
15
  "loss": 1.1087,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.010498687664041995,
20
+ "grad_norm": 1.123696373079589,
21
+ "learning_rate": 3.0000000000000004e-07,
22
  "loss": 1.1356,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.015748031496062992,
27
+ "grad_norm": 1.0989081863562118,
28
+ "learning_rate": 4.5e-07,
29
+ "loss": 1.1158,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.02099737532808399,
34
+ "grad_norm": 1.0628548113414964,
35
+ "learning_rate": 6.000000000000001e-07,
36
+ "loss": 1.0986,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.026246719160104987,
41
+ "grad_norm": 1.0629069543612368,
42
+ "learning_rate": 7.5e-07,
43
+ "loss": 1.0727,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.031496062992125984,
48
+ "grad_norm": 1.1219311917213644,
49
+ "learning_rate": 9e-07,
50
+ "loss": 1.1513,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.03674540682414698,
55
+ "grad_norm": 1.068318638334139,
56
+ "learning_rate": 1.05e-06,
57
+ "loss": 1.0978,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.04199475065616798,
62
+ "grad_norm": 1.0335025624008565,
63
+ "learning_rate": 1.2000000000000002e-06,
64
+ "loss": 1.0932,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.047244094488188976,
69
+ "grad_norm": 0.9514112971268772,
70
+ "learning_rate": 1.35e-06,
71
+ "loss": 1.1046,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.05249343832020997,
76
+ "grad_norm": 0.8944230714776324,
77
+ "learning_rate": 1.5e-06,
78
+ "loss": 1.0638,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.05774278215223097,
83
+ "grad_norm": 0.8720343077794245,
84
+ "learning_rate": 1.65e-06,
85
+ "loss": 1.1132,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.06299212598425197,
90
+ "grad_norm": 0.7519518665820406,
91
+ "learning_rate": 1.8e-06,
92
+ "loss": 1.0788,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.06824146981627296,
97
+ "grad_norm": 0.7768466543241798,
98
+ "learning_rate": 1.95e-06,
99
+ "loss": 1.0795,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.07349081364829396,
104
+ "grad_norm": 0.7109922479048013,
105
+ "learning_rate": 2.1e-06,
106
+ "loss": 1.1012,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.07874015748031496,
111
+ "grad_norm": 0.6312078880187205,
112
+ "learning_rate": 2.25e-06,
113
+ "loss": 1.0851,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.08398950131233596,
118
+ "grad_norm": 0.5514473048370377,
119
+ "learning_rate": 2.4000000000000003e-06,
120
+ "loss": 1.1041,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.08923884514435695,
125
+ "grad_norm": 0.6271281070432462,
126
+ "learning_rate": 2.55e-06,
127
+ "loss": 1.0855,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.09448818897637795,
132
+ "grad_norm": 0.7059888078645049,
133
+ "learning_rate": 2.7e-06,
134
+ "loss": 1.0473,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.09973753280839895,
139
+ "grad_norm": 0.7226157330393405,
140
+ "learning_rate": 2.85e-06,
141
+ "loss": 1.0665,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.10498687664041995,
146
+ "grad_norm": 0.7244742832208652,
147
+ "learning_rate": 3e-06,
148
+ "loss": 1.0604,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.11023622047244094,
153
+ "grad_norm": 0.7088251146482789,
154
+ "learning_rate": 3.1500000000000003e-06,
155
+ "loss": 1.0516,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.11548556430446194,
160
+ "grad_norm": 0.5987242362229293,
161
+ "learning_rate": 3.3e-06,
162
+ "loss": 1.084,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.12073490813648294,
167
+ "grad_norm": 0.5730637810768702,
168
+ "learning_rate": 3.45e-06,
169
+ "loss": 1.0621,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.12598425196850394,
174
+ "grad_norm": 0.5894968443138215,
175
+ "learning_rate": 3.6e-06,
176
+ "loss": 1.0797,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.13123359580052493,
181
+ "grad_norm": 0.5798124303184627,
182
+ "learning_rate": 3.75e-06,
183
+ "loss": 1.0035,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.13648293963254593,
188
+ "grad_norm": 0.643205751513686,
189
+ "learning_rate": 3.9e-06,
190
+ "loss": 1.0455,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.14173228346456693,
195
+ "grad_norm": 0.5621970774702022,
196
+ "learning_rate": 4.05e-06,
197
+ "loss": 1.0576,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.14698162729658792,
202
+ "grad_norm": 0.5506084571895594,
203
+ "learning_rate": 4.2e-06,
204
+ "loss": 1.0298,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.15223097112860892,
209
+ "grad_norm": 0.48741149421912777,
210
+ "learning_rate": 4.35e-06,
211
+ "loss": 1.0018,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.15748031496062992,
216
+ "grad_norm": 0.46403007703544275,
217
+ "learning_rate": 4.5e-06,
218
+ "loss": 0.9872,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.16272965879265092,
223
+ "grad_norm": 0.4754381818573106,
224
+ "learning_rate": 4.65e-06,
225
+ "loss": 1.0271,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.1679790026246719,
230
+ "grad_norm": 0.9362850890979981,
231
+ "learning_rate": 4.800000000000001e-06,
232
+ "loss": 1.0437,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.1732283464566929,
237
+ "grad_norm": 0.47391181595772164,
238
+ "learning_rate": 4.95e-06,
239
+ "loss": 1.0437,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1784776902887139,
244
+ "grad_norm": 0.5276920454851337,
245
+ "learning_rate": 5.1e-06,
246
+ "loss": 1.0557,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.1837270341207349,
251
+ "grad_norm": 0.4616075133913133,
252
+ "learning_rate": 5.2500000000000006e-06,
253
+ "loss": 1.0465,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.1889763779527559,
258
+ "grad_norm": 0.4555174555636226,
259
+ "learning_rate": 5.4e-06,
260
+ "loss": 1.0588,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1942257217847769,
265
+ "grad_norm": 0.5071864534648831,
266
+ "learning_rate": 5.55e-06,
267
+ "loss": 1.044,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1994750656167979,
272
+ "grad_norm": 0.4851367263882934,
273
+ "learning_rate": 5.7e-06,
274
+ "loss": 1.0464,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.2047244094488189,
279
+ "grad_norm": 0.44188022228811896,
280
+ "learning_rate": 5.85e-06,
281
+ "loss": 1.0182,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.2099737532808399,
286
+ "grad_norm": 0.43420740120454643,
287
+ "learning_rate": 6e-06,
288
+ "loss": 1.0188,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.2152230971128609,
293
+ "grad_norm": 0.4291543441241407,
294
+ "learning_rate": 5.9998719351101036e-06,
295
+ "loss": 1.0245,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.2204724409448819,
300
+ "grad_norm": 0.43326370236005163,
301
+ "learning_rate": 5.999487751374158e-06,
302
+ "loss": 1.0238,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.22572178477690288,
307
+ "grad_norm": 0.427571644972227,
308
+ "learning_rate": 5.998847481592462e-06,
309
+ "loss": 1.0311,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.23097112860892388,
314
+ "grad_norm": 0.4215063088273006,
315
+ "learning_rate": 5.997951180429069e-06,
316
+ "loss": 0.9925,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.23622047244094488,
321
+ "grad_norm": 0.4206536914503675,
322
+ "learning_rate": 5.996798924407118e-06,
323
+ "loss": 1.003,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.24146981627296588,
328
+ "grad_norm": 0.40910969064965136,
329
+ "learning_rate": 5.995390811902302e-06,
330
+ "loss": 0.9949,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.24671916010498687,
335
+ "grad_norm": 0.4165775049327623,
336
+ "learning_rate": 5.993726963134471e-06,
337
+ "loss": 0.9734,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.25196850393700787,
342
+ "grad_norm": 0.3832235501001726,
343
+ "learning_rate": 5.9918075201573645e-06,
344
+ "loss": 0.9485,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.2572178477690289,
349
+ "grad_norm": 0.37002495168808525,
350
+ "learning_rate": 5.9896326468464835e-06,
351
+ "loss": 0.9358,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.26246719160104987,
356
+ "grad_norm": 0.44836853406053057,
357
+ "learning_rate": 5.987202528885104e-06,
358
+ "loss": 0.9982,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.2677165354330709,
363
+ "grad_norm": 0.4080608606117312,
364
+ "learning_rate": 5.984517373748417e-06,
365
+ "loss": 1.0129,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.27296587926509186,
370
+ "grad_norm": 0.4001550595702573,
371
+ "learning_rate": 5.981577410685822e-06,
372
+ "loss": 0.9788,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2782152230971129,
377
+ "grad_norm": 0.41021488877460305,
378
+ "learning_rate": 5.978382890701347e-06,
379
+ "loss": 1.0262,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.28346456692913385,
384
+ "grad_norm": 0.39997016380492506,
385
+ "learning_rate": 5.9749340865322284e-06,
386
+ "loss": 1.0275,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.2887139107611549,
391
+ "grad_norm": 0.3839823787027912,
392
+ "learning_rate": 5.971231292625615e-06,
393
+ "loss": 0.9374,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.29396325459317585,
398
+ "grad_norm": 0.4125068495663659,
399
+ "learning_rate": 5.967274825113438e-06,
400
+ "loss": 0.9954,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2992125984251969,
405
+ "grad_norm": 0.3908377197765856,
406
+ "learning_rate": 5.963065021785414e-06,
407
+ "loss": 0.9671,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.30446194225721784,
412
+ "grad_norm": 0.3850488592862481,
413
+ "learning_rate": 5.958602242060207e-06,
414
+ "loss": 0.9657,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.30971128608923887,
419
+ "grad_norm": 0.3877990366088493,
420
+ "learning_rate": 5.95388686695475e-06,
421
+ "loss": 0.9678,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.31496062992125984,
426
+ "grad_norm": 0.40470471194287355,
427
+ "learning_rate": 5.948919299051706e-06,
428
+ "loss": 1.0149,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.32020997375328086,
433
+ "grad_norm": 0.42889495063392963,
434
+ "learning_rate": 5.943699962465096e-06,
435
+ "loss": 1.033,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.32545931758530183,
440
+ "grad_norm": 0.39164358737100274,
441
+ "learning_rate": 5.9382293028040985e-06,
442
+ "loss": 0.9761,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.33070866141732286,
447
+ "grad_norm": 0.3869342590567232,
448
+ "learning_rate": 5.9325077871349975e-06,
449
+ "loss": 0.9982,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.3359580052493438,
454
+ "grad_norm": 0.39264627926569035,
455
+ "learning_rate": 5.9265359039413105e-06,
456
+ "loss": 0.9667,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.34120734908136485,
461
+ "grad_norm": 0.3887717698297268,
462
+ "learning_rate": 5.920314163082079e-06,
463
+ "loss": 0.9806,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.3464566929133858,
468
+ "grad_norm": 0.40896336915084297,
469
+ "learning_rate": 5.913843095748342e-06,
470
+ "loss": 1.0135,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.35170603674540685,
475
+ "grad_norm": 0.3610209560875707,
476
+ "learning_rate": 5.907123254417783e-06,
477
+ "loss": 0.956,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.3569553805774278,
482
+ "grad_norm": 0.38154744815823505,
483
+ "learning_rate": 5.9001552128075625e-06,
484
+ "loss": 1.0045,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.36220472440944884,
489
+ "grad_norm": 0.4094826396119445,
490
+ "learning_rate": 5.892939565825335e-06,
491
+ "loss": 1.0069,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.3674540682414698,
496
+ "grad_norm": 0.39129138622932325,
497
+ "learning_rate": 5.885476929518457e-06,
498
+ "loss": 0.9525,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.37270341207349084,
503
+ "grad_norm": 0.3712890701175899,
504
+ "learning_rate": 5.8777679410213956e-06,
505
+ "loss": 0.9792,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3779527559055118,
510
+ "grad_norm": 0.4086264062600148,
511
+ "learning_rate": 5.869813258501323e-06,
512
+ "loss": 0.9926,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.38320209973753283,
517
+ "grad_norm": 0.368975878599487,
518
+ "learning_rate": 5.861613561101934e-06,
519
+ "loss": 0.9643,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.3884514435695538,
524
+ "grad_norm": 0.36792811629461203,
525
+ "learning_rate": 5.853169548885461e-06,
526
+ "loss": 0.9867,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3937007874015748,
531
+ "grad_norm": 0.3566251893981936,
532
+ "learning_rate": 5.844481942772898e-06,
533
+ "loss": 1.0069,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3989501312335958,
538
+ "grad_norm": 0.4578529359685586,
539
+ "learning_rate": 5.835551484482459e-06,
540
+ "loss": 1.0173,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.4041994750656168,
545
+ "grad_norm": 0.3935925285922137,
546
+ "learning_rate": 5.826378936466249e-06,
547
+ "loss": 0.9743,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.4094488188976378,
552
+ "grad_norm": 0.4109939217838428,
553
+ "learning_rate": 5.81696508184517e-06,
554
+ "loss": 0.9866,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.4146981627296588,
559
+ "grad_norm": 0.3839870332489822,
560
+ "learning_rate": 5.807310724342058e-06,
561
+ "loss": 0.9516,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.4199475065616798,
566
+ "grad_norm": 0.3774576797883406,
567
+ "learning_rate": 5.797416688213067e-06,
568
+ "loss": 0.9895,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.4251968503937008,
573
+ "grad_norm": 0.3817468964498129,
574
+ "learning_rate": 5.787283818177297e-06,
575
+ "loss": 0.9632,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.4304461942257218,
580
+ "grad_norm": 0.60843002346461,
581
+ "learning_rate": 5.776912979344669e-06,
582
+ "loss": 1.0166,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.4356955380577428,
587
+ "grad_norm": 0.3858713700245362,
588
+ "learning_rate": 5.766305057142073e-06,
589
+ "loss": 0.9976,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.4409448818897638,
594
+ "grad_norm": 0.3724153436541016,
595
+ "learning_rate": 5.755460957237769e-06,
596
+ "loss": 0.9645,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.4461942257217848,
601
+ "grad_norm": 0.38201105695018567,
602
+ "learning_rate": 5.744381605464064e-06,
603
+ "loss": 0.9899,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.45144356955380577,
608
+ "grad_norm": 0.38383930861007165,
609
+ "learning_rate": 5.7330679477382655e-06,
610
+ "loss": 0.9919,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.4566929133858268,
615
+ "grad_norm": 0.4078870418259581,
616
+ "learning_rate": 5.7215209499819296e-06,
617
+ "loss": 0.9797,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.46194225721784776,
622
+ "grad_norm": 0.38463767466523974,
623
+ "learning_rate": 5.709741598038387e-06,
624
+ "loss": 0.9597,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.4671916010498688,
629
+ "grad_norm": 0.36309855116472584,
630
+ "learning_rate": 5.697730897588577e-06,
631
+ "loss": 0.9737,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.47244094488188976,
636
+ "grad_norm": 0.4106701446638758,
637
+ "learning_rate": 5.685489874065187e-06,
638
+ "loss": 0.9683,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.4776902887139108,
643
+ "grad_norm": 0.37110409255145443,
644
+ "learning_rate": 5.673019572565103e-06,
645
+ "loss": 1.0418,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.48293963254593175,
650
+ "grad_norm": 0.3558357783330656,
651
+ "learning_rate": 5.660321057760186e-06,
652
+ "loss": 1.0055,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.4881889763779528,
657
+ "grad_norm": 0.40499489938404787,
658
+ "learning_rate": 5.6473954138063674e-06,
659
+ "loss": 1.0113,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.49343832020997375,
664
+ "grad_norm": 0.39428526462199764,
665
+ "learning_rate": 5.634243744251094e-06,
666
+ "loss": 0.9875,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.49868766404199477,
671
+ "grad_norm": 0.3711741011240413,
672
+ "learning_rate": 5.620867171939109e-06,
673
+ "loss": 0.9749,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.5039370078740157,
678
+ "grad_norm": 0.3961340085644134,
679
+ "learning_rate": 5.607266838916585e-06,
680
+ "loss": 0.982,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.5091863517060368,
685
+ "grad_norm": 0.3784646685814138,
686
+ "learning_rate": 5.593443906333624e-06,
687
+ "loss": 0.9957,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.5144356955380578,
692
+ "grad_norm": 0.3750460397069026,
693
+ "learning_rate": 5.579399554345118e-06,
694
+ "loss": 0.9755,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.5196850393700787,
699
+ "grad_norm": 0.3746718538274792,
700
+ "learning_rate": 5.565134982009994e-06,
701
+ "loss": 0.9736,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.5249343832020997,
706
+ "grad_norm": 0.38418890409196027,
707
+ "learning_rate": 5.550651407188843e-06,
708
+ "loss": 0.9506,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.5301837270341208,
713
+ "grad_norm": 0.422976375435725,
714
+ "learning_rate": 5.535950066439941e-06,
715
+ "loss": 1.0141,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.5354330708661418,
720
+ "grad_norm": 0.38354451243133536,
721
+ "learning_rate": 5.521032214913679e-06,
722
+ "loss": 0.9618,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.5406824146981627,
727
+ "grad_norm": 0.38257660011773076,
728
+ "learning_rate": 5.505899126245397e-06,
729
+ "loss": 0.939,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.5459317585301837,
734
+ "grad_norm": 0.3768438915225408,
735
+ "learning_rate": 5.490552092446652e-06,
736
+ "loss": 0.9675,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.5511811023622047,
741
+ "grad_norm": 0.3749655286727107,
742
+ "learning_rate": 5.474992423794907e-06,
743
+ "loss": 0.9592,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.5564304461942258,
748
+ "grad_norm": 0.38461916993489687,
749
+ "learning_rate": 5.459221448721664e-06,
750
+ "loss": 0.9623,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.5616797900262467,
755
+ "grad_norm": 0.35648642966931204,
756
+ "learning_rate": 5.443240513699045e-06,
757
+ "loss": 0.985,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.5669291338582677,
762
+ "grad_norm": 0.4051560712719681,
763
+ "learning_rate": 5.427050983124842e-06,
764
+ "loss": 0.9407,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.5721784776902887,
769
+ "grad_norm": 0.3769879713701903,
770
+ "learning_rate": 5.410654239206021e-06,
771
+ "loss": 0.968,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.5774278215223098,
776
+ "grad_norm": 0.3746822083724367,
777
+ "learning_rate": 5.394051681840719e-06,
778
+ "loss": 0.9497,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.5826771653543307,
783
+ "grad_norm": 0.3987231911136733,
784
+ "learning_rate": 5.3772447284987216e-06,
785
+ "loss": 0.961,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.5879265091863517,
790
+ "grad_norm": 0.37848222525971176,
791
+ "learning_rate": 5.36023481410045e-06,
792
+ "loss": 0.9707,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.5931758530183727,
797
+ "grad_norm": 0.3794904855253974,
798
+ "learning_rate": 5.343023390894446e-06,
799
+ "loss": 0.9714,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.5984251968503937,
804
+ "grad_norm": 0.37452267525256994,
805
+ "learning_rate": 5.325611928333389e-06,
806
+ "loss": 0.9406,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.6036745406824147,
811
+ "grad_norm": 0.39474437059829304,
812
+ "learning_rate": 5.308001912948637e-06,
813
+ "loss": 0.9626,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.6089238845144357,
818
+ "grad_norm": 0.4023921986663554,
819
+ "learning_rate": 5.290194848223309e-06,
820
+ "loss": 0.9889,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.6141732283464567,
825
+ "grad_norm": 0.39963771712171875,
826
+ "learning_rate": 5.272192254463929e-06,
827
+ "loss": 0.9639,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.6194225721784777,
832
+ "grad_norm": 0.3893586064595733,
833
+ "learning_rate": 5.2539956686706205e-06,
834
+ "loss": 0.9469,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.6246719160104987,
839
+ "grad_norm": 0.4651495625439333,
840
+ "learning_rate": 5.2356066444058875e-06,
841
+ "loss": 0.9658,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.6299212598425197,
846
+ "grad_norm": 0.39599728107932586,
847
+ "learning_rate": 5.217026751661978e-06,
848
+ "loss": 1.0137,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.6351706036745407,
853
+ "grad_norm": 0.406988761369817,
854
+ "learning_rate": 5.198257576726835e-06,
855
+ "loss": 0.9306,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.6404199475065617,
860
+ "grad_norm": 0.3611939094322339,
861
+ "learning_rate": 5.179300722048673e-06,
862
+ "loss": 0.9462,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.6456692913385826,
867
+ "grad_norm": 0.3809841775392484,
868
+ "learning_rate": 5.1601578060991645e-06,
869
+ "loss": 0.953,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.6509186351706037,
874
+ "grad_norm": 0.46022843064705843,
875
+ "learning_rate": 5.1408304632352575e-06,
876
+ "loss": 0.9422,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.6561679790026247,
881
+ "grad_norm": 0.3979704646560941,
882
+ "learning_rate": 5.1213203435596425e-06,
883
+ "loss": 0.9751,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.6614173228346457,
888
+ "grad_norm": 0.39388496260457084,
889
+ "learning_rate": 5.101629112779873e-06,
890
+ "loss": 0.9722,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.6666666666666666,
895
+ "grad_norm": 0.3899148438115094,
896
+ "learning_rate": 5.08175845206615e-06,
897
+ "loss": 0.9652,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.6719160104986877,
902
+ "grad_norm": 0.37391882787694275,
903
+ "learning_rate": 5.061710057907788e-06,
904
+ "loss": 0.9621,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.6771653543307087,
909
+ "grad_norm": 0.39500875865406576,
910
+ "learning_rate": 5.041485641968385e-06,
911
+ "loss": 0.9899,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.6824146981627297,
916
+ "grad_norm": 0.37540362490802714,
917
+ "learning_rate": 5.021086930939672e-06,
918
+ "loss": 0.9472,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.6876640419947506,
923
+ "grad_norm": 0.3940788728379769,
924
+ "learning_rate": 5.000515666394105e-06,
925
+ "loss": 0.9479,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.6929133858267716,
930
+ "grad_norm": 0.3919125365655477,
931
+ "learning_rate": 4.979773604636169e-06,
932
+ "loss": 0.9624,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.6981627296587927,
937
+ "grad_norm": 0.3804552314744538,
938
+ "learning_rate": 4.958862516552433e-06,
939
+ "loss": 0.9806,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.7034120734908137,
944
+ "grad_norm": 0.3674434286105591,
945
+ "learning_rate": 4.937784187460362e-06,
946
+ "loss": 0.9511,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.7086614173228346,
951
+ "grad_norm": 0.4109777494732396,
952
+ "learning_rate": 4.916540416955884e-06,
953
+ "loss": 0.9943,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.7139107611548556,
958
+ "grad_norm": 0.40231567788837497,
959
+ "learning_rate": 4.895133018759753e-06,
960
+ "loss": 0.9798,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.7191601049868767,
965
+ "grad_norm": 0.3721834479908975,
966
+ "learning_rate": 4.873563820562698e-06,
967
+ "loss": 0.9504,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.7244094488188977,
972
+ "grad_norm": 0.36127526200518306,
973
+ "learning_rate": 4.851834663869379e-06,
974
+ "loss": 0.9517,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.7296587926509186,
979
+ "grad_norm": 0.3513827139135777,
980
+ "learning_rate": 4.82994740384117e-06,
981
+ "loss": 0.9835,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.7349081364829396,
986
+ "grad_norm": 0.36760728272750326,
987
+ "learning_rate": 4.80790390913777e-06,
988
+ "loss": 0.9503,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.7401574803149606,
993
+ "grad_norm": 0.36275280721999276,
994
+ "learning_rate": 4.785706061757656e-06,
995
+ "loss": 0.9743,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.7454068241469817,
1000
+ "grad_norm": 0.3733380512329921,
1001
+ "learning_rate": 4.763355756877419e-06,
1002
+ "loss": 0.9384,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.7506561679790026,
1007
+ "grad_norm": 0.3801691027568987,
1008
+ "learning_rate": 4.740854902689947e-06,
1009
+ "loss": 0.9296,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.7559055118110236,
1014
+ "grad_norm": 0.39053906811778566,
1015
+ "learning_rate": 4.718205420241516e-06,
1016
+ "loss": 0.9488,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.7611548556430446,
1021
+ "grad_norm": 0.3923993707534958,
1022
+ "learning_rate": 4.695409243267776e-06,
1023
+ "loss": 0.9383,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.7664041994750657,
1028
+ "grad_norm": 0.364792552828712,
1029
+ "learning_rate": 4.672468318028657e-06,
1030
+ "loss": 0.9193,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.7716535433070866,
1035
+ "grad_norm": 0.35070825551906964,
1036
+ "learning_rate": 4.649384603142202e-06,
1037
+ "loss": 0.9164,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.7769028871391076,
1042
+ "grad_norm": 0.37099778180795795,
1043
+ "learning_rate": 4.626160069417348e-06,
1044
+ "loss": 0.9425,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.7821522309711286,
1049
+ "grad_norm": 0.36954118968922517,
1050
+ "learning_rate": 4.602796699685665e-06,
1051
+ "loss": 0.9265,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.7874015748031497,
1056
+ "grad_norm": 0.4076466706382121,
1057
+ "learning_rate": 4.579296488632067e-06,
1058
+ "loss": 1.0133,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.7926509186351706,
1063
+ "grad_norm": 0.4015334925568992,
1064
+ "learning_rate": 4.5556614426245165e-06,
1065
+ "loss": 0.9486,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.7979002624671916,
1070
+ "grad_norm": 0.39628644809730684,
1071
+ "learning_rate": 4.5318935795427206e-06,
1072
+ "loss": 0.9605,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.8031496062992126,
1077
+ "grad_norm": 0.36792154742540445,
1078
+ "learning_rate": 4.507994928605862e-06,
1079
+ "loss": 0.9287,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.8083989501312336,
1084
+ "grad_norm": 0.3887839296706913,
1085
+ "learning_rate": 4.483967530199337e-06,
1086
+ "loss": 0.951,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.8136482939632546,
1091
+ "grad_norm": 0.36716852968968616,
1092
+ "learning_rate": 4.459813435700569e-06,
1093
+ "loss": 0.9702,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.8188976377952756,
1098
+ "grad_norm": 0.3533521076976156,
1099
+ "learning_rate": 4.4355347073038595e-06,
1100
+ "loss": 0.9612,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.8241469816272966,
1105
+ "grad_norm": 0.3499649930079787,
1106
+ "learning_rate": 4.411133417844328e-06,
1107
+ "loss": 0.9599,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.8293963254593176,
1112
+ "grad_norm": 0.38582146832565867,
1113
+ "learning_rate": 4.38661165062094e-06,
1114
+ "loss": 0.9894,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.8346456692913385,
1119
+ "grad_norm": 0.39040836855795735,
1120
+ "learning_rate": 4.36197149921864e-06,
1121
+ "loss": 0.9747,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.8398950131233596,
1126
+ "grad_norm": 0.3798580758700489,
1127
+ "learning_rate": 4.3372150673296155e-06,
1128
+ "loss": 0.9654,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.8451443569553806,
1133
+ "grad_norm": 0.3764456540061034,
1134
+ "learning_rate": 4.3123444685736795e-06,
1135
+ "loss": 0.9823,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.8503937007874016,
1140
+ "grad_norm": 0.3771195417830333,
1141
+ "learning_rate": 4.287361826317827e-06,
1142
+ "loss": 0.9456,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.8556430446194225,
1147
+ "grad_norm": 0.37650137746409273,
1148
+ "learning_rate": 4.262269273494946e-06,
1149
+ "loss": 1.0022,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.8608923884514436,
1154
+ "grad_norm": 0.38148353077474145,
1155
+ "learning_rate": 4.237068952421711e-06,
1156
+ "loss": 0.964,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.8661417322834646,
1161
+ "grad_norm": 0.3982519128695332,
1162
+ "learning_rate": 4.2117630146156845e-06,
1163
+ "loss": 0.9673,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.8713910761154856,
1168
+ "grad_norm": 0.36000775624632003,
1169
+ "learning_rate": 4.186353620611627e-06,
1170
+ "loss": 0.9359,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.8766404199475065,
1175
+ "grad_norm": 0.36850454735662447,
1176
+ "learning_rate": 4.160842939777036e-06,
1177
+ "loss": 0.9422,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.8818897637795275,
1182
+ "grad_norm": 0.37804115639757085,
1183
+ "learning_rate": 4.135233150126931e-06,
1184
+ "loss": 0.9454,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.8871391076115486,
1189
+ "grad_norm": 0.3689383402086321,
1190
+ "learning_rate": 4.109526438137908e-06,
1191
+ "loss": 0.9455,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.8923884514435696,
1196
+ "grad_norm": 0.46527154775209717,
1197
+ "learning_rate": 4.08372499856146e-06,
1198
+ "loss": 0.9386,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.8976377952755905,
1203
+ "grad_norm": 0.45653306710128705,
1204
+ "learning_rate": 4.0578310342365975e-06,
1205
+ "loss": 0.9616,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.9028871391076115,
1210
+ "grad_norm": 0.3773630567359451,
1211
+ "learning_rate": 4.031846755901785e-06,
1212
+ "loss": 0.9285,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.9081364829396326,
1217
+ "grad_norm": 0.3644595191521506,
1218
+ "learning_rate": 4.005774382006182e-06,
1219
+ "loss": 0.9663,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.9133858267716536,
1224
+ "grad_norm": 0.3539767481135477,
1225
+ "learning_rate": 3.97961613852025e-06,
1226
+ "loss": 0.9564,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.9186351706036745,
1231
+ "grad_norm": 0.3819676152776953,
1232
+ "learning_rate": 3.953374258745705e-06,
1233
+ "loss": 0.9607,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.9238845144356955,
1238
+ "grad_norm": 0.38397675786726637,
1239
+ "learning_rate": 3.927050983124842e-06,
1240
+ "loss": 0.9539,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.9291338582677166,
1245
+ "grad_norm": 0.3979084367711538,
1246
+ "learning_rate": 3.900648559049258e-06,
1247
+ "loss": 0.9505,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.9343832020997376,
1252
+ "grad_norm": 0.3756154385935223,
1253
+ "learning_rate": 3.874169240667974e-06,
1254
+ "loss": 0.9519,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.9396325459317585,
1259
+ "grad_norm": 0.40551973597201274,
1260
+ "learning_rate": 3.847615288694985e-06,
1261
+ "loss": 0.9727,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.9448818897637795,
1266
+ "grad_norm": 0.4149625851710124,
1267
+ "learning_rate": 3.820988970216249e-06,
1268
+ "loss": 0.9464,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.9501312335958005,
1273
+ "grad_norm": 0.35739115830542967,
1274
+ "learning_rate": 3.7942925584961272e-06,
1275
+ "loss": 0.9427,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.9553805774278216,
1280
+ "grad_norm": 0.3759540038847051,
1281
+ "learning_rate": 3.767528332783307e-06,
1282
+ "loss": 0.9679,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.9606299212598425,
1287
+ "grad_norm": 0.3525867658299593,
1288
+ "learning_rate": 3.740698578116199e-06,
1289
+ "loss": 0.9183,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.9658792650918635,
1294
+ "grad_norm": 0.3557123352774738,
1295
+ "learning_rate": 3.7138055851278564e-06,
1296
+ "loss": 0.9383,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.9711286089238845,
1301
+ "grad_norm": 0.3623514252763418,
1302
+ "learning_rate": 3.6868516498504025e-06,
1303
+ "loss": 0.9246,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.9763779527559056,
1308
+ "grad_norm": 0.38495496418054853,
1309
+ "learning_rate": 3.6598390735190066e-06,
1310
+ "loss": 0.9612,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.9816272965879265,
1315
+ "grad_norm": 0.3648599004428126,
1316
+ "learning_rate": 3.63277016237541e-06,
1317
+ "loss": 0.9293,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.9868766404199475,
1322
+ "grad_norm": 0.38871547084803876,
1323
+ "learning_rate": 3.6056472274710305e-06,
1324
+ "loss": 0.9973,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.9921259842519685,
1329
+ "grad_norm": 0.38590844403642666,
1330
+ "learning_rate": 3.578472584469651e-06,
1331
+ "loss": 0.9457,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.9973753280839895,
1336
+ "grad_norm": 0.3872507088649178,
1337
+ "learning_rate": 3.5512485534497116e-06,
1338
+ "loss": 0.9462,
1339
  "step": 190
1340
  }
1341
  ],
checkpoint-190/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c0d2528dcfd8d62d3c517248c2d231cc9ff64ec148911ec3ce58a9d39f7507d
3
  size 8376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b860c7e838727b1a9d8001f5c7a769bd0d63566ea45620719245b6beb59f1cd9
3
  size 8376