Delta-Vector commited on
Commit
f3de505
·
verified ·
1 Parent(s): e588d80

Training in progress, step 95, checkpoint

Browse files
Files changed (25) hide show
  1. checkpoint-95/global_step95/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  2. checkpoint-95/global_step95/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
  3. checkpoint-95/global_step95/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
  4. checkpoint-95/global_step95/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
  5. checkpoint-95/global_step95/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
  6. checkpoint-95/global_step95/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
  7. checkpoint-95/global_step95/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
  8. checkpoint-95/global_step95/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
  9. checkpoint-95/model-00001-of-00014.safetensors +1 -1
  10. checkpoint-95/model-00002-of-00014.safetensors +1 -1
  11. checkpoint-95/model-00003-of-00014.safetensors +1 -1
  12. checkpoint-95/model-00004-of-00014.safetensors +1 -1
  13. checkpoint-95/model-00005-of-00014.safetensors +1 -1
  14. checkpoint-95/model-00006-of-00014.safetensors +1 -1
  15. checkpoint-95/model-00007-of-00014.safetensors +1 -1
  16. checkpoint-95/model-00008-of-00014.safetensors +1 -1
  17. checkpoint-95/model-00009-of-00014.safetensors +1 -1
  18. checkpoint-95/model-00010-of-00014.safetensors +1 -1
  19. checkpoint-95/model-00011-of-00014.safetensors +1 -1
  20. checkpoint-95/model-00012-of-00014.safetensors +1 -1
  21. checkpoint-95/model-00013-of-00014.safetensors +1 -1
  22. checkpoint-95/model-00014-of-00014.safetensors +1 -1
  23. checkpoint-95/scheduler.pt +1 -1
  24. checkpoint-95/trainer_state.json +283 -283
  25. checkpoint-95/training_args.bin +1 -1
checkpoint-95/global_step95/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aea7730a8aff4b6b047b5f5da7d6e92d171f8a7704acbff807de4e7a48c088b
3
+ size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:135d0497c207add1f36471bea35fff5000ba50745c92a575886123c2e0b68e74
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a92d04a4e4406bda00a5fa7020911d719d56352c107274511d3ce06bf6394775
3
  size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50bc395471c82b6295c5ddb4b9f43197747bfa064550d76a24b98f98694df9b0
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd1785f7460d40c8691ae13c252dbde3e052b95b799fa03403970c9a4516c546
3
  size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17e69775bae6cb0a560ae20f16a69cd6ef8fcd7fa8c157aa214ebe3b6fb1f5bd
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:025dddccd8e71b2e557c577d277d6cdffa55508280ff20c3e78add6006f6f1d1
3
  size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2a042638c3c0d364b5461db5bd7ad3d8dbbc27bec8c5fd3ac1b815caba02fe3
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73b092191ba7f4bbfeef47cb91fcb39abc0cbd2f9de7c78034cf7607a79f4e87
3
  size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ae291ae038fec856d4ebbd265cb9b87db7725cdc5f336c723aeaf8ef551912e
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6ac37f7a7a938b100ca4a78a1c75f560ad3288e1b3003b4a3b13af592a7e3d1
3
  size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:157b631b579caf80f5cb4143694798dd10818d353835c9745fd523e65578ce81
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3b786e6e58598e2a74baaf245cf17953e2226a744d5579f59bd2f97ad136013
3
  size 24702833511
checkpoint-95/global_step95/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4695a5fb838935a2806f787a3e8470104b47cbc5573cb88c7d77a332c57e46cf
3
  size 24702833511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ef871300ea108f7bbc6a9c90a08ca39c3d0260b3a812819c3c6526d63c1d5cc
3
  size 24702833511
checkpoint-95/model-00001-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34ba4337f0c6c65d807d3be8c2c276f1ddab5334ae41746dbb4b8d91115914b2
3
  size 4891730992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b59b78ab9bb06e6b8c2ed4cf69fa87a8ca10391fef4b38adeb5165a083a63504
3
  size 4891730992
checkpoint-95/model-00002-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:426332f314858aa0f46eb94863118745cc35707fedb26287ed50fd4c41afe724
3
  size 4876059352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6659988665885e4f868e31ed175f350f1fab7d301d16c18f91a93b75739fb8c1
3
  size 4876059352
checkpoint-95/model-00003-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63c844d8ad06cc198ed5c742c042f3a165abb5e41ebc428ec2260f4d91094d76
3
  size 4876059384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:336c9a2a9acb4c0441ba41cc7b36a1fab7ce2c85b7841eb9443e06394d39a9e5
3
  size 4876059384
checkpoint-95/model-00004-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:012066a115bf9e4d2a7185ddd2853ec0860bfba746ba3d94ca51b55038611610
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62c838f681f47e1a3343424b19f07db1f6fe31798b98cadded863b4c45b289aa
3
  size 4876059416
checkpoint-95/model-00005-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9cc4109b07b718c1e4b42e3f58958be3f8f5163ca1baac79821a6ef2a942a2e
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b71fd2c70d620236a0cf3cc712fe380edfac3c212d478d87b4b73a64c18ef1a1
3
  size 4876059416
checkpoint-95/model-00006-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49c43572b04c70397ed3a6069323d442eeed1e728522bbeb7f38c1cffc22d756
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d3d5a728b9e5add582fd21331012f9c9a65177bb875d74045190c6e1c6959a7
3
  size 4876059416
checkpoint-95/model-00007-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfc32a6a516048945773181ce8a365dd4617d8dca622dfad56a2759cb9481ae2
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a9aa742d3a6b8cf76b6b60d8a6597b5af6635835034e21a5f77f1218af8f1eb
3
  size 4876059416
checkpoint-95/model-00008-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb802a607e065d561c8c6d90c2e379b3bf57b15a2b57d66e33ffc948b862b4d4
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470fd86cea4f6d87d4b03b85067c0c7680d81d1379995c2ad6bb8ff1fed91acd
3
  size 4876059416
checkpoint-95/model-00009-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1608b21abee903fe4c78e98f26227c6578a9d59d8159ed7fbb09de279a63cd38
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a77bcd179a159e47aa37da34df41faa0cbd22ad91ee2e7e898409fa3cf6483f8
3
  size 4876059416
checkpoint-95/model-00010-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb239e8c9ed2c16a7503e704db2bb16844d151e8349e717e49941791d08b1299
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c83f2622b99a712bab1c27b04ddcdfa39de48eeba7210acd1057f23b6a3c7d64
3
  size 4876059416
checkpoint-95/model-00011-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ac8d18251e7988c1a5b43914218f4e0fb60e4abac1e38095c462d78e1bd0ed3
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ccdec93a48a0028de67e517bf47285fa9b4de1caa36e254faf36dc46cae73d6
3
  size 4876059416
checkpoint-95/model-00012-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d41de3b5a57810496f410243d9c29b81efb0c63633ed4ed10f9510c0bb09db1e
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69776e809d20dc3a8e4b12520145e3b0fee792177e905a90159a59c2e609156b
3
  size 4876059416
checkpoint-95/model-00013-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ba5a641f5cfeb99fbcff96e84bfd4c0017edaadb9d0092cf780d90d9d5c685a
3
  size 4876059416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795b311e5d100a6ee12571412656cadce82b936d0f7188609f9c5396a7229514
3
  size 4876059416
checkpoint-95/model-00014-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c3aa08d5e70149c46d8e7bd223a8aed3ef99cc01abd164bceb060f96b986ca4
3
  size 2123397800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66269b7488a5708353163cfbb88dc3f525d065b12137c7b9b9860aa1acdb8e94
3
  size 2123397800
checkpoint-95/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b456952e7d2b7b867ccbbdb88af065c23c8a966d4cede0443109b961e31b140b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8dcecbaf52006f95fb1d52efcb23120b8add71e8ceafba799bfe215035fbc32
3
  size 1064
checkpoint-95/trainer_state.json CHANGED
@@ -10,667 +10,667 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.005249343832020997,
13
- "grad_norm": 1.134754623075341,
14
- "learning_rate": 1.0000000000000002e-06,
15
  "loss": 1.1087,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.010498687664041995,
20
- "grad_norm": 1.1234145683168772,
21
- "learning_rate": 2.0000000000000003e-06,
22
  "loss": 1.1356,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.015748031496062992,
27
- "grad_norm": 1.0799860590372758,
28
- "learning_rate": 3e-06,
29
- "loss": 1.1152,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.02099737532808399,
34
- "grad_norm": 0.9984297481710986,
35
- "learning_rate": 4.000000000000001e-06,
36
- "loss": 1.0953,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.026246719160104987,
41
- "grad_norm": 0.8302026280344834,
42
- "learning_rate": 5e-06,
43
- "loss": 1.0617,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.031496062992125984,
48
- "grad_norm": 0.8911823807745126,
49
- "learning_rate": 6e-06,
50
- "loss": 1.1297,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.03674540682414698,
55
- "grad_norm": 0.686211615667355,
56
- "learning_rate": 7e-06,
57
- "loss": 1.0705,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.04199475065616798,
62
- "grad_norm": 0.9091855799181295,
63
- "learning_rate": 8.000000000000001e-06,
64
- "loss": 1.065,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.047244094488188976,
69
- "grad_norm": 0.8934722980371054,
70
- "learning_rate": 9e-06,
71
- "loss": 1.0767,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.05249343832020997,
76
- "grad_norm": 0.8688110393935611,
77
- "learning_rate": 1e-05,
78
- "loss": 1.0303,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.05774278215223097,
83
- "grad_norm": 0.9920393807379069,
84
- "learning_rate": 1.1000000000000001e-05,
85
- "loss": 1.0855,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.06299212598425197,
90
- "grad_norm": 0.9220245541797021,
91
- "learning_rate": 1.2e-05,
92
- "loss": 1.0531,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.06824146981627296,
97
- "grad_norm": 0.736886642754733,
98
- "learning_rate": 1.3000000000000001e-05,
99
- "loss": 1.0456,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.07349081364829396,
104
- "grad_norm": 0.771339891024354,
105
- "learning_rate": 1.4e-05,
106
- "loss": 1.0671,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.07874015748031496,
111
- "grad_norm": 0.7161080553611359,
112
- "learning_rate": 1.5000000000000002e-05,
113
- "loss": 1.0521,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.08398950131233596,
118
- "grad_norm": 0.6788342613059561,
119
- "learning_rate": 1.6000000000000003e-05,
120
- "loss": 1.0674,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.08923884514435695,
125
- "grad_norm": 0.7102848455414168,
126
- "learning_rate": 1.7e-05,
127
- "loss": 1.0459,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.09448818897637795,
132
- "grad_norm": 0.6425246555654909,
133
- "learning_rate": 1.8e-05,
134
- "loss": 1.0093,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.09973753280839895,
139
- "grad_norm": 2.099748819540086,
140
- "learning_rate": 1.9e-05,
141
- "loss": 1.0301,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.10498687664041995,
146
- "grad_norm": 0.6691987921672391,
147
- "learning_rate": 2e-05,
148
- "loss": 1.0199,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.11023622047244094,
153
- "grad_norm": 0.5883655485426926,
154
- "learning_rate": 2.1000000000000002e-05,
155
- "loss": 1.0085,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.11548556430446194,
160
- "grad_norm": 0.5443706631485103,
161
- "learning_rate": 2.2000000000000003e-05,
162
- "loss": 1.0432,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.12073490813648294,
167
- "grad_norm": 0.593023936793411,
168
- "learning_rate": 2.3e-05,
169
- "loss": 1.0196,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.12598425196850394,
174
- "grad_norm": 0.5618656915734137,
175
- "learning_rate": 2.4e-05,
176
- "loss": 1.0386,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.13123359580052493,
181
- "grad_norm": 0.46871710098096486,
182
- "learning_rate": 2.5e-05,
183
- "loss": 0.9611,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.13648293963254593,
188
- "grad_norm": 0.5700902276763852,
189
- "learning_rate": 2.6000000000000002e-05,
190
- "loss": 1.0045,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.14173228346456693,
195
- "grad_norm": 0.603692765386866,
196
- "learning_rate": 2.7000000000000002e-05,
197
- "loss": 1.019,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.14698162729658792,
202
- "grad_norm": 0.48456720859923497,
203
- "learning_rate": 2.8e-05,
204
- "loss": 0.9892,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.15223097112860892,
209
- "grad_norm": 0.45729475661677665,
210
- "learning_rate": 2.9e-05,
211
- "loss": 0.9645,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.15748031496062992,
216
- "grad_norm": 0.5439846777665153,
217
- "learning_rate": 3.0000000000000004e-05,
218
- "loss": 0.9497,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.16272965879265092,
223
- "grad_norm": 0.4965459941185334,
224
- "learning_rate": 3.1e-05,
225
- "loss": 0.9882,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.1679790026246719,
230
- "grad_norm": 0.4656328962534996,
231
- "learning_rate": 3.2000000000000005e-05,
232
- "loss": 1.0057,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.1732283464566929,
237
- "grad_norm": 0.5241601609773927,
238
- "learning_rate": 3.3e-05,
239
- "loss": 1.0033,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1784776902887139,
244
- "grad_norm": 0.5062226992393802,
245
- "learning_rate": 3.4e-05,
246
- "loss": 1.0166,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.1837270341207349,
251
- "grad_norm": 0.43771829747985674,
252
- "learning_rate": 3.5000000000000004e-05,
253
- "loss": 1.0102,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.1889763779527559,
258
- "grad_norm": 0.48092156639697076,
259
- "learning_rate": 3.6e-05,
260
- "loss": 1.018,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1942257217847769,
265
- "grad_norm": 0.48115559949514536,
266
- "learning_rate": 3.7000000000000005e-05,
267
- "loss": 1.0079,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1994750656167979,
272
- "grad_norm": 0.4777546937622387,
273
- "learning_rate": 3.8e-05,
274
- "loss": 1.0085,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.2047244094488189,
279
- "grad_norm": 0.44755392669080185,
280
- "learning_rate": 3.9e-05,
281
- "loss": 0.9825,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.2099737532808399,
286
- "grad_norm": 0.44510881962201315,
287
- "learning_rate": 4e-05,
288
- "loss": 0.9848,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.2152230971128609,
293
- "grad_norm": 0.4746290969046573,
294
- "learning_rate": 3.999914623406736e-05,
295
- "loss": 0.9888,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.2204724409448819,
300
- "grad_norm": 0.5953130701884418,
301
- "learning_rate": 3.9996585009161056e-05,
302
- "loss": 0.9882,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.22572178477690288,
307
- "grad_norm": 0.4251472611705547,
308
- "learning_rate": 3.999231654394975e-05,
309
- "loss": 0.9958,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.23097112860892388,
314
- "grad_norm": 0.44690799367073597,
315
- "learning_rate": 3.9986341202860467e-05,
316
- "loss": 0.9543,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.23622047244094488,
321
- "grad_norm": 0.5976579403936895,
322
- "learning_rate": 3.9978659496047456e-05,
323
- "loss": 0.9762,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.24146981627296588,
328
- "grad_norm": 0.3962092871428472,
329
- "learning_rate": 3.9969272079348685e-05,
330
- "loss": 0.9605,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.24671916010498687,
335
- "grad_norm": 0.43362883575028716,
336
- "learning_rate": 3.995817975422981e-05,
337
- "loss": 0.9456,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.25196850393700787,
342
- "grad_norm": 0.4139776793240363,
343
- "learning_rate": 3.994538346771576e-05,
344
- "loss": 0.9165,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.2572178477690289,
349
- "grad_norm": 0.3940723609427906,
350
- "learning_rate": 3.9930884312309894e-05,
351
- "loss": 0.9071,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.26246719160104987,
356
- "grad_norm": 0.4016006422322008,
357
- "learning_rate": 3.991468352590069e-05,
358
- "loss": 0.9668,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.2677165354330709,
363
- "grad_norm": 0.9528446542157881,
364
- "learning_rate": 3.989678249165612e-05,
365
- "loss": 1.0431,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.27296587926509186,
370
- "grad_norm": 0.41600529189619084,
371
- "learning_rate": 3.987718273790548e-05,
372
- "loss": 0.9464,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2782152230971129,
377
- "grad_norm": 1.1382476752327089,
378
- "learning_rate": 3.9855885938008986e-05,
379
- "loss": 1.0186,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.28346456692913385,
384
- "grad_norm": 0.44849148754190465,
385
- "learning_rate": 3.983289391021486e-05,
386
- "loss": 0.9981,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.2887139107611549,
391
- "grad_norm": 0.4296819710357216,
392
- "learning_rate": 3.9808208617504106e-05,
393
- "loss": 0.9124,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.29396325459317585,
398
- "grad_norm": 1.4708100276334197,
399
- "learning_rate": 3.9781832167422926e-05,
400
- "loss": 1.0627,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2992125984251969,
405
- "grad_norm": 0.436502847615945,
406
- "learning_rate": 3.9753766811902756e-05,
407
- "loss": 0.9399,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.30446194225721784,
412
- "grad_norm": 0.41131082586189677,
413
- "learning_rate": 3.972401494706805e-05,
414
- "loss": 0.9381,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.30971128608923887,
419
- "grad_norm": 0.42792569998778285,
420
- "learning_rate": 3.969257911303167e-05,
421
- "loss": 0.9426,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.31496062992125984,
426
- "grad_norm": 1.0484985550985957,
427
- "learning_rate": 3.965946199367804e-05,
428
- "loss": 1.0745,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.32020997375328086,
433
- "grad_norm": 0.45563925287513607,
434
- "learning_rate": 3.962466641643398e-05,
435
- "loss": 1.0085,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.32545931758530183,
440
- "grad_norm": 0.4216131864169055,
441
- "learning_rate": 3.958819535202732e-05,
442
- "loss": 0.9533,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.33070866141732286,
447
- "grad_norm": 0.47284588975540814,
448
- "learning_rate": 3.9550051914233314e-05,
449
- "loss": 0.9727,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.3359580052493438,
454
- "grad_norm": 0.4112493584955737,
455
- "learning_rate": 3.951023935960874e-05,
456
- "loss": 0.9408,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.34120734908136485,
461
- "grad_norm": 0.44123500755805545,
462
- "learning_rate": 3.9468761087213864e-05,
463
- "loss": 0.9547,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.3464566929133858,
468
- "grad_norm": 0.4160767709488051,
469
- "learning_rate": 3.942562063832228e-05,
470
- "loss": 0.9862,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.35170603674540685,
475
- "grad_norm": 0.40282812591350464,
476
- "learning_rate": 3.9380821696118556e-05,
477
- "loss": 0.9301,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.3569553805774278,
482
- "grad_norm": 0.42252313457664165,
483
- "learning_rate": 3.933436808538375e-05,
484
- "loss": 0.9751,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.36220472440944884,
489
- "grad_norm": 0.4084367556454159,
490
- "learning_rate": 3.92862637721689e-05,
491
- "loss": 0.9838,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.3674540682414698,
496
- "grad_norm": 0.39446053200993564,
497
- "learning_rate": 3.923651286345638e-05,
498
- "loss": 0.9237,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.37270341207349084,
503
- "grad_norm": 0.43051114259650114,
504
- "learning_rate": 3.9185119606809305e-05,
505
- "loss": 0.9543,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3779527559055118,
510
- "grad_norm": 0.41527447901851827,
511
- "learning_rate": 3.913208839000882e-05,
512
- "loss": 0.9688,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.38320209973753283,
517
- "grad_norm": 0.4033220715509175,
518
- "learning_rate": 3.907742374067956e-05,
519
- "loss": 0.9401,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.3884514435695538,
524
- "grad_norm": 0.4039636146150166,
525
- "learning_rate": 3.9021130325903076e-05,
526
- "loss": 0.9621,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3937007874015748,
531
- "grad_norm": 0.3896809489063709,
532
- "learning_rate": 3.896321295181932e-05,
533
- "loss": 0.986,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3989501312335958,
538
- "grad_norm": 0.7547382513819603,
539
- "learning_rate": 3.89036765632164e-05,
540
- "loss": 1.0528,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.4041994750656168,
545
- "grad_norm": 0.42422582617937166,
546
- "learning_rate": 3.8842526243108326e-05,
547
- "loss": 0.9541,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.4094488188976378,
552
- "grad_norm": 0.41581388939730257,
553
- "learning_rate": 3.877976721230114e-05,
554
- "loss": 0.9711,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.4146981627296588,
559
- "grad_norm": 0.4326138308224312,
560
- "learning_rate": 3.8715404828947055e-05,
561
- "loss": 0.9261,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.4199475065616798,
566
- "grad_norm": 0.38852695749391314,
567
- "learning_rate": 3.864944458808712e-05,
568
- "loss": 0.9648,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.4251968503937008,
573
- "grad_norm": 0.3897195092049238,
574
- "learning_rate": 3.8581892121181984e-05,
575
- "loss": 0.9397,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.4304461942257218,
580
- "grad_norm": 0.43934794613481915,
581
- "learning_rate": 3.851275319563113e-05,
582
- "loss": 0.9905,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.4356955380577428,
587
- "grad_norm": 0.5323662587576004,
588
- "learning_rate": 3.844203371428049e-05,
589
- "loss": 0.9896,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.4409448818897638,
594
- "grad_norm": 0.38441956539336747,
595
- "learning_rate": 3.836973971491847e-05,
596
- "loss": 0.9385,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.4461942257217848,
601
- "grad_norm": 0.38662975914153885,
602
- "learning_rate": 3.8295877369760426e-05,
603
- "loss": 0.9586,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.45144356955380577,
608
- "grad_norm": 0.41009140101075614,
609
- "learning_rate": 3.822045298492177e-05,
610
- "loss": 0.9667,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.4566929133858268,
615
- "grad_norm": 0.4258642992742759,
616
- "learning_rate": 3.814347299987953e-05,
617
- "loss": 0.954,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.46194225721784776,
622
- "grad_norm": 0.40527142541860056,
623
- "learning_rate": 3.806494398692258e-05,
624
- "loss": 0.9351,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.4671916010498688,
629
- "grad_norm": 0.3743850574341336,
630
- "learning_rate": 3.7984872650590516e-05,
631
- "loss": 0.9498,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.47244094488188976,
636
- "grad_norm": 0.4151867667600151,
637
- "learning_rate": 3.790326582710125e-05,
638
- "loss": 0.9466,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.4776902887139108,
643
- "grad_norm": 0.4448011376311795,
644
- "learning_rate": 3.782013048376736e-05,
645
- "loss": 1.0266,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.48293963254593175,
650
- "grad_norm": 0.38192124855359877,
651
- "learning_rate": 3.773547371840124e-05,
652
- "loss": 0.978,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.4881889763779528,
657
- "grad_norm": 0.4235778210861527,
658
- "learning_rate": 3.764930275870912e-05,
659
- "loss": 0.9827,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.49343832020997375,
664
- "grad_norm": 0.4051195260626496,
665
- "learning_rate": 3.756162496167396e-05,
666
- "loss": 0.963,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.49868766404199477,
671
- "grad_norm": 0.40700055373961197,
672
- "learning_rate": 3.7472447812927395e-05,
673
- "loss": 0.9437,
674
  "step": 95
675
  }
676
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.005249343832020997,
13
+ "grad_norm": 1.1348930782232016,
14
+ "learning_rate": 1.5000000000000002e-07,
15
  "loss": 1.1087,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.010498687664041995,
20
+ "grad_norm": 1.123696373079589,
21
+ "learning_rate": 3.0000000000000004e-07,
22
  "loss": 1.1356,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.015748031496062992,
27
+ "grad_norm": 1.0989081863562118,
28
+ "learning_rate": 4.5e-07,
29
+ "loss": 1.1158,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.02099737532808399,
34
+ "grad_norm": 1.0628548113414964,
35
+ "learning_rate": 6.000000000000001e-07,
36
+ "loss": 1.0986,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.026246719160104987,
41
+ "grad_norm": 1.0629069543612368,
42
+ "learning_rate": 7.5e-07,
43
+ "loss": 1.0727,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.031496062992125984,
48
+ "grad_norm": 1.1219311917213644,
49
+ "learning_rate": 9e-07,
50
+ "loss": 1.1513,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.03674540682414698,
55
+ "grad_norm": 1.068318638334139,
56
+ "learning_rate": 1.05e-06,
57
+ "loss": 1.0978,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.04199475065616798,
62
+ "grad_norm": 1.0335025624008565,
63
+ "learning_rate": 1.2000000000000002e-06,
64
+ "loss": 1.0932,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.047244094488188976,
69
+ "grad_norm": 0.9514112971268772,
70
+ "learning_rate": 1.35e-06,
71
+ "loss": 1.1046,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.05249343832020997,
76
+ "grad_norm": 0.8944230714776324,
77
+ "learning_rate": 1.5e-06,
78
+ "loss": 1.0638,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.05774278215223097,
83
+ "grad_norm": 0.8720343077794245,
84
+ "learning_rate": 1.65e-06,
85
+ "loss": 1.1132,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.06299212598425197,
90
+ "grad_norm": 0.7519518665820406,
91
+ "learning_rate": 1.8e-06,
92
+ "loss": 1.0788,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.06824146981627296,
97
+ "grad_norm": 0.7768466543241798,
98
+ "learning_rate": 1.95e-06,
99
+ "loss": 1.0795,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.07349081364829396,
104
+ "grad_norm": 0.7109922479048013,
105
+ "learning_rate": 2.1e-06,
106
+ "loss": 1.1012,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.07874015748031496,
111
+ "grad_norm": 0.6312078880187205,
112
+ "learning_rate": 2.25e-06,
113
+ "loss": 1.0851,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.08398950131233596,
118
+ "grad_norm": 0.5514473048370377,
119
+ "learning_rate": 2.4000000000000003e-06,
120
+ "loss": 1.1041,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.08923884514435695,
125
+ "grad_norm": 0.6271281070432462,
126
+ "learning_rate": 2.55e-06,
127
+ "loss": 1.0855,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.09448818897637795,
132
+ "grad_norm": 0.7059888078645049,
133
+ "learning_rate": 2.7e-06,
134
+ "loss": 1.0473,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.09973753280839895,
139
+ "grad_norm": 0.7226157330393405,
140
+ "learning_rate": 2.85e-06,
141
+ "loss": 1.0665,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.10498687664041995,
146
+ "grad_norm": 0.7244742832208652,
147
+ "learning_rate": 3e-06,
148
+ "loss": 1.0604,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.11023622047244094,
153
+ "grad_norm": 0.7088251146482789,
154
+ "learning_rate": 3.1500000000000003e-06,
155
+ "loss": 1.0516,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.11548556430446194,
160
+ "grad_norm": 0.5987242362229293,
161
+ "learning_rate": 3.3e-06,
162
+ "loss": 1.084,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.12073490813648294,
167
+ "grad_norm": 0.5730637810768702,
168
+ "learning_rate": 3.45e-06,
169
+ "loss": 1.0621,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.12598425196850394,
174
+ "grad_norm": 0.5894968443138215,
175
+ "learning_rate": 3.6e-06,
176
+ "loss": 1.0797,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.13123359580052493,
181
+ "grad_norm": 0.5798124303184627,
182
+ "learning_rate": 3.75e-06,
183
+ "loss": 1.0035,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.13648293963254593,
188
+ "grad_norm": 0.643205751513686,
189
+ "learning_rate": 3.9e-06,
190
+ "loss": 1.0455,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.14173228346456693,
195
+ "grad_norm": 0.5621970774702022,
196
+ "learning_rate": 4.05e-06,
197
+ "loss": 1.0576,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.14698162729658792,
202
+ "grad_norm": 0.5506084571895594,
203
+ "learning_rate": 4.2e-06,
204
+ "loss": 1.0298,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.15223097112860892,
209
+ "grad_norm": 0.48741149421912777,
210
+ "learning_rate": 4.35e-06,
211
+ "loss": 1.0018,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.15748031496062992,
216
+ "grad_norm": 0.46403007703544275,
217
+ "learning_rate": 4.5e-06,
218
+ "loss": 0.9872,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.16272965879265092,
223
+ "grad_norm": 0.4754381818573106,
224
+ "learning_rate": 4.65e-06,
225
+ "loss": 1.0271,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.1679790026246719,
230
+ "grad_norm": 0.9362850890979981,
231
+ "learning_rate": 4.800000000000001e-06,
232
+ "loss": 1.0437,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.1732283464566929,
237
+ "grad_norm": 0.47391181595772164,
238
+ "learning_rate": 4.95e-06,
239
+ "loss": 1.0437,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1784776902887139,
244
+ "grad_norm": 0.5276920454851337,
245
+ "learning_rate": 5.1e-06,
246
+ "loss": 1.0557,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.1837270341207349,
251
+ "grad_norm": 0.4616075133913133,
252
+ "learning_rate": 5.2500000000000006e-06,
253
+ "loss": 1.0465,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.1889763779527559,
258
+ "grad_norm": 0.4555174555636226,
259
+ "learning_rate": 5.4e-06,
260
+ "loss": 1.0588,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1942257217847769,
265
+ "grad_norm": 0.5071864534648831,
266
+ "learning_rate": 5.55e-06,
267
+ "loss": 1.044,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1994750656167979,
272
+ "grad_norm": 0.4851367263882934,
273
+ "learning_rate": 5.7e-06,
274
+ "loss": 1.0464,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.2047244094488189,
279
+ "grad_norm": 0.44188022228811896,
280
+ "learning_rate": 5.85e-06,
281
+ "loss": 1.0182,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.2099737532808399,
286
+ "grad_norm": 0.43420740120454643,
287
+ "learning_rate": 6e-06,
288
+ "loss": 1.0188,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.2152230971128609,
293
+ "grad_norm": 0.4291543441241407,
294
+ "learning_rate": 5.9998719351101036e-06,
295
+ "loss": 1.0245,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.2204724409448819,
300
+ "grad_norm": 0.43326370236005163,
301
+ "learning_rate": 5.999487751374158e-06,
302
+ "loss": 1.0238,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.22572178477690288,
307
+ "grad_norm": 0.427571644972227,
308
+ "learning_rate": 5.998847481592462e-06,
309
+ "loss": 1.0311,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.23097112860892388,
314
+ "grad_norm": 0.4215063088273006,
315
+ "learning_rate": 5.997951180429069e-06,
316
+ "loss": 0.9925,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.23622047244094488,
321
+ "grad_norm": 0.4206536914503675,
322
+ "learning_rate": 5.996798924407118e-06,
323
+ "loss": 1.003,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.24146981627296588,
328
+ "grad_norm": 0.40910969064965136,
329
+ "learning_rate": 5.995390811902302e-06,
330
+ "loss": 0.9949,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.24671916010498687,
335
+ "grad_norm": 0.4165775049327623,
336
+ "learning_rate": 5.993726963134471e-06,
337
+ "loss": 0.9734,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.25196850393700787,
342
+ "grad_norm": 0.3832235501001726,
343
+ "learning_rate": 5.9918075201573645e-06,
344
+ "loss": 0.9485,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.2572178477690289,
349
+ "grad_norm": 0.37002495168808525,
350
+ "learning_rate": 5.9896326468464835e-06,
351
+ "loss": 0.9358,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.26246719160104987,
356
+ "grad_norm": 0.44836853406053057,
357
+ "learning_rate": 5.987202528885104e-06,
358
+ "loss": 0.9982,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.2677165354330709,
363
+ "grad_norm": 0.4080608606117312,
364
+ "learning_rate": 5.984517373748417e-06,
365
+ "loss": 1.0129,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.27296587926509186,
370
+ "grad_norm": 0.4001550595702573,
371
+ "learning_rate": 5.981577410685822e-06,
372
+ "loss": 0.9788,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2782152230971129,
377
+ "grad_norm": 0.41021488877460305,
378
+ "learning_rate": 5.978382890701347e-06,
379
+ "loss": 1.0262,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.28346456692913385,
384
+ "grad_norm": 0.39997016380492506,
385
+ "learning_rate": 5.9749340865322284e-06,
386
+ "loss": 1.0275,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.2887139107611549,
391
+ "grad_norm": 0.3839823787027912,
392
+ "learning_rate": 5.971231292625615e-06,
393
+ "loss": 0.9374,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.29396325459317585,
398
+ "grad_norm": 0.4125068495663659,
399
+ "learning_rate": 5.967274825113438e-06,
400
+ "loss": 0.9954,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2992125984251969,
405
+ "grad_norm": 0.3908377197765856,
406
+ "learning_rate": 5.963065021785414e-06,
407
+ "loss": 0.9671,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.30446194225721784,
412
+ "grad_norm": 0.3850488592862481,
413
+ "learning_rate": 5.958602242060207e-06,
414
+ "loss": 0.9657,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.30971128608923887,
419
+ "grad_norm": 0.3877990366088493,
420
+ "learning_rate": 5.95388686695475e-06,
421
+ "loss": 0.9678,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.31496062992125984,
426
+ "grad_norm": 0.40470471194287355,
427
+ "learning_rate": 5.948919299051706e-06,
428
+ "loss": 1.0149,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.32020997375328086,
433
+ "grad_norm": 0.42889495063392963,
434
+ "learning_rate": 5.943699962465096e-06,
435
+ "loss": 1.033,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.32545931758530183,
440
+ "grad_norm": 0.39164358737100274,
441
+ "learning_rate": 5.9382293028040985e-06,
442
+ "loss": 0.9761,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.33070866141732286,
447
+ "grad_norm": 0.3869342590567232,
448
+ "learning_rate": 5.9325077871349975e-06,
449
+ "loss": 0.9982,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.3359580052493438,
454
+ "grad_norm": 0.39264627926569035,
455
+ "learning_rate": 5.9265359039413105e-06,
456
+ "loss": 0.9667,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.34120734908136485,
461
+ "grad_norm": 0.3887717698297268,
462
+ "learning_rate": 5.920314163082079e-06,
463
+ "loss": 0.9806,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.3464566929133858,
468
+ "grad_norm": 0.40896336915084297,
469
+ "learning_rate": 5.913843095748342e-06,
470
+ "loss": 1.0135,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.35170603674540685,
475
+ "grad_norm": 0.3610209560875707,
476
+ "learning_rate": 5.907123254417783e-06,
477
+ "loss": 0.956,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.3569553805774278,
482
+ "grad_norm": 0.38154744815823505,
483
+ "learning_rate": 5.9001552128075625e-06,
484
+ "loss": 1.0045,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.36220472440944884,
489
+ "grad_norm": 0.4094826396119445,
490
+ "learning_rate": 5.892939565825335e-06,
491
+ "loss": 1.0069,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.3674540682414698,
496
+ "grad_norm": 0.39129138622932325,
497
+ "learning_rate": 5.885476929518457e-06,
498
+ "loss": 0.9525,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.37270341207349084,
503
+ "grad_norm": 0.3712890701175899,
504
+ "learning_rate": 5.8777679410213956e-06,
505
+ "loss": 0.9792,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3779527559055118,
510
+ "grad_norm": 0.4086264062600148,
511
+ "learning_rate": 5.869813258501323e-06,
512
+ "loss": 0.9926,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.38320209973753283,
517
+ "grad_norm": 0.368975878599487,
518
+ "learning_rate": 5.861613561101934e-06,
519
+ "loss": 0.9643,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.3884514435695538,
524
+ "grad_norm": 0.36792811629461203,
525
+ "learning_rate": 5.853169548885461e-06,
526
+ "loss": 0.9867,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3937007874015748,
531
+ "grad_norm": 0.3566251893981936,
532
+ "learning_rate": 5.844481942772898e-06,
533
+ "loss": 1.0069,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3989501312335958,
538
+ "grad_norm": 0.4578529359685586,
539
+ "learning_rate": 5.835551484482459e-06,
540
+ "loss": 1.0173,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.4041994750656168,
545
+ "grad_norm": 0.3935925285922137,
546
+ "learning_rate": 5.826378936466249e-06,
547
+ "loss": 0.9743,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.4094488188976378,
552
+ "grad_norm": 0.4109939217838428,
553
+ "learning_rate": 5.81696508184517e-06,
554
+ "loss": 0.9866,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.4146981627296588,
559
+ "grad_norm": 0.3839870332489822,
560
+ "learning_rate": 5.807310724342058e-06,
561
+ "loss": 0.9516,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.4199475065616798,
566
+ "grad_norm": 0.3774576797883406,
567
+ "learning_rate": 5.797416688213067e-06,
568
+ "loss": 0.9895,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.4251968503937008,
573
+ "grad_norm": 0.3817468964498129,
574
+ "learning_rate": 5.787283818177297e-06,
575
+ "loss": 0.9632,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.4304461942257218,
580
+ "grad_norm": 0.60843002346461,
581
+ "learning_rate": 5.776912979344669e-06,
582
+ "loss": 1.0166,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.4356955380577428,
587
+ "grad_norm": 0.3858713700245362,
588
+ "learning_rate": 5.766305057142073e-06,
589
+ "loss": 0.9976,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.4409448818897638,
594
+ "grad_norm": 0.3724153436541016,
595
+ "learning_rate": 5.755460957237769e-06,
596
+ "loss": 0.9645,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.4461942257217848,
601
+ "grad_norm": 0.38201105695018567,
602
+ "learning_rate": 5.744381605464064e-06,
603
+ "loss": 0.9899,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.45144356955380577,
608
+ "grad_norm": 0.38383930861007165,
609
+ "learning_rate": 5.7330679477382655e-06,
610
+ "loss": 0.9919,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.4566929133858268,
615
+ "grad_norm": 0.4078870418259581,
616
+ "learning_rate": 5.7215209499819296e-06,
617
+ "loss": 0.9797,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.46194225721784776,
622
+ "grad_norm": 0.38463767466523974,
623
+ "learning_rate": 5.709741598038387e-06,
624
+ "loss": 0.9597,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.4671916010498688,
629
+ "grad_norm": 0.36309855116472584,
630
+ "learning_rate": 5.697730897588577e-06,
631
+ "loss": 0.9737,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.47244094488188976,
636
+ "grad_norm": 0.4106701446638758,
637
+ "learning_rate": 5.685489874065187e-06,
638
+ "loss": 0.9683,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.4776902887139108,
643
+ "grad_norm": 0.37110409255145443,
644
+ "learning_rate": 5.673019572565103e-06,
645
+ "loss": 1.0418,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.48293963254593175,
650
+ "grad_norm": 0.3558357783330656,
651
+ "learning_rate": 5.660321057760186e-06,
652
+ "loss": 1.0055,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.4881889763779528,
657
+ "grad_norm": 0.40499489938404787,
658
+ "learning_rate": 5.6473954138063674e-06,
659
+ "loss": 1.0113,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.49343832020997375,
664
+ "grad_norm": 0.39428526462199764,
665
+ "learning_rate": 5.634243744251094e-06,
666
+ "loss": 0.9875,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.49868766404199477,
671
+ "grad_norm": 0.3711741011240413,
672
+ "learning_rate": 5.620867171939109e-06,
673
+ "loss": 0.9749,
674
  "step": 95
675
  }
676
  ],
checkpoint-95/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c0d2528dcfd8d62d3c517248c2d231cc9ff64ec148911ec3ce58a9d39f7507d
3
  size 8376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b860c7e838727b1a9d8001f5c7a769bd0d63566ea45620719245b6beb59f1cd9
3
  size 8376