TweedleDeepLearnings committed on
Commit
a7766b9
·
verified ·
1 Parent(s): f71fbf5

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6b8ad6dc351cd6e6fb14be1e78d1bc303255df70fb405b36cd906f126d23eda
3
  size 1047100024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9284611f383403e4bddaaa1bdc9a68ea0e5a23d1df4e5a0c41e31b7eeeec410c
3
  size 1047100024
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:717d7aa626dbf9706f279c1e82ca897875a82bd64edcd7759a8d114e3a506905
3
  size 2027092538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f18b2877c25c005099ddcb6f1fd6cff53e9e6953825166edb88f31f4cea446b2
3
  size 2027092538
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05429ce80080b6006aefa7a0cdcf256e6801d1dc314fce88e03a2b2d615970a0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5579fe9e15618b81ef610fec990943b1331a8e0dc73ea65185b209958e17402
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea5caa49db915b45959ce55ffb19f89ee4df15a92a0cf61506d4b5cce9f63612
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19c9bb74c7ccc0ce8938928022bb60a0f70baa72e09d1fb547a24fe2599bec65
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,238 +1,125 @@
1
  {
2
- "best_metric": 1.378271222114563,
3
- "best_model_checkpoint": "./output/checkpoint-300",
4
- "epoch": 0.024168210746797713,
5
  "eval_steps": 150,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0008056070248932571,
13
- "grad_norm": 1.6331578493118286,
14
  "learning_rate": 8.000000000000001e-06,
15
- "loss": 1.674,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.0016112140497865142,
20
- "grad_norm": 1.5799965858459473,
21
  "learning_rate": 1.6000000000000003e-05,
22
- "loss": 1.6139,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.002416821074679771,
27
- "grad_norm": 1.4048924446105957,
28
  "learning_rate": 2.4e-05,
29
- "loss": 1.5637,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.0032224280995730285,
34
- "grad_norm": 1.3288944959640503,
35
  "learning_rate": 3.2000000000000005e-05,
36
- "loss": 1.47,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.0040280351244662855,
41
- "grad_norm": 1.2938439846038818,
42
  "learning_rate": 4e-05,
43
- "loss": 1.5004,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.004833642149359542,
48
- "grad_norm": 1.257751226425171,
49
  "learning_rate": 4.8e-05,
50
- "loss": 1.4306,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.0056392491742527995,
55
- "grad_norm": 1.3014723062515259,
56
  "learning_rate": 5.6e-05,
57
- "loss": 1.4461,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.006444856199146057,
62
- "grad_norm": 1.2876347303390503,
63
  "learning_rate": 6.400000000000001e-05,
64
- "loss": 1.3987,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.0072504632240393135,
69
- "grad_norm": 1.3185112476348877,
70
  "learning_rate": 7.2e-05,
71
- "loss": 1.4719,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.008056070248932571,
76
- "grad_norm": 1.2125263214111328,
77
  "learning_rate": 8e-05,
78
- "loss": 1.4728,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.008861677273825828,
83
- "grad_norm": 1.3386430740356445,
84
  "learning_rate": 7.999917787833465e-05,
85
- "loss": 1.4244,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.009667284298719084,
90
- "grad_norm": 1.2090747356414795,
91
  "learning_rate": 7.999671154713278e-05,
92
- "loss": 1.3962,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.010472891323612342,
97
- "grad_norm": 1.2480571269989014,
98
  "learning_rate": 7.99926011077756e-05,
99
- "loss": 1.4184,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.011278498348505599,
104
- "grad_norm": 1.1953870058059692,
105
  "learning_rate": 7.99868467292272e-05,
106
- "loss": 1.4291,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.012084105373398856,
111
- "grad_norm": 1.2968889474868774,
112
  "learning_rate": 7.997944864802752e-05,
113
- "loss": 1.4176,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.012084105373398856,
118
- "eval_loss": 1.409696340560913,
119
- "eval_runtime": 41.1558,
120
- "eval_samples_per_second": 12.149,
121
- "eval_steps_per_second": 12.149,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.012889712398292114,
126
- "grad_norm": 1.2144309282302856,
127
- "learning_rate": 7.997040716828271e-05,
128
- "loss": 1.4389,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.01369531942318537,
133
- "grad_norm": 1.2486999034881592,
134
- "learning_rate": 7.995972266165259e-05,
135
- "loss": 1.3656,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.014500926448078627,
140
- "grad_norm": 1.0841563940048218,
141
- "learning_rate": 7.994739556733538e-05,
142
- "loss": 1.3916,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.015306533472971885,
147
- "grad_norm": 1.1280230283737183,
148
- "learning_rate": 7.993342639204965e-05,
149
- "loss": 1.4169,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.016112140497865142,
154
- "grad_norm": 1.2391259670257568,
155
- "learning_rate": 7.991781571001347e-05,
156
- "loss": 1.4259,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.0169177475227584,
161
- "grad_norm": 1.287666916847229,
162
- "learning_rate": 7.990056416292084e-05,
163
- "loss": 1.387,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.017723354547651657,
168
- "grad_norm": 1.099730134010315,
169
- "learning_rate": 7.988167245991528e-05,
170
- "loss": 1.3881,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.018528961572544914,
175
- "grad_norm": 1.049587607383728,
176
- "learning_rate": 7.986114137756074e-05,
177
- "loss": 1.3841,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.01933456859743817,
182
- "grad_norm": 1.247476577758789,
183
- "learning_rate": 7.983897175980957e-05,
184
- "loss": 1.3705,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.020140175622331426,
189
- "grad_norm": 1.2096580266952515,
190
- "learning_rate": 7.981516451796794e-05,
191
- "loss": 1.3743,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.020945782647224683,
196
- "grad_norm": 1.1319410800933838,
197
- "learning_rate": 7.97897206306583e-05,
198
- "loss": 1.3672,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.02175138967211794,
203
- "grad_norm": 1.1657007932662964,
204
- "learning_rate": 7.976264114377922e-05,
205
- "loss": 1.4038,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.022556996697011198,
210
- "grad_norm": 1.070646047592163,
211
- "learning_rate": 7.973392717046233e-05,
212
- "loss": 1.3644,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.023362603721904456,
217
- "grad_norm": 1.094254732131958,
218
- "learning_rate": 7.97035798910266e-05,
219
- "loss": 1.3324,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.024168210746797713,
224
- "grad_norm": 1.0771719217300415,
225
- "learning_rate": 7.967160055292984e-05,
226
- "loss": 1.3354,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.024168210746797713,
231
- "eval_loss": 1.378271222114563,
232
- "eval_runtime": 40.7201,
233
- "eval_samples_per_second": 12.279,
234
- "eval_steps_per_second": 12.279,
235
- "step": 300
236
  }
237
  ],
238
  "logging_steps": 10,
@@ -252,8 +139,8 @@
252
  "attributes": {}
253
  }
254
  },
255
- "total_flos": 6.034100572549939e+16,
256
- "train_batch_size": 8,
257
  "trial_name": null,
258
  "trial_params": null
259
  }
 
1
  {
2
+ "best_metric": 1.3760384321212769,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.006042296072507553,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0004028197381671702,
13
+ "grad_norm": 1.6403535604476929,
14
  "learning_rate": 8.000000000000001e-06,
15
+ "loss": 1.3224,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.0008056394763343404,
20
+ "grad_norm": 1.524896502494812,
21
  "learning_rate": 1.6000000000000003e-05,
22
+ "loss": 1.4594,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.0012084592145015106,
27
+ "grad_norm": 1.6602262258529663,
28
  "learning_rate": 2.4e-05,
29
+ "loss": 1.3775,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.0016112789526686808,
34
+ "grad_norm": 1.6227095127105713,
35
  "learning_rate": 3.2000000000000005e-05,
36
+ "loss": 1.3538,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.002014098690835851,
41
+ "grad_norm": 1.4939993619918823,
42
  "learning_rate": 4e-05,
43
+ "loss": 1.3195,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.002416918429003021,
48
+ "grad_norm": 1.4631342887878418,
49
  "learning_rate": 4.8e-05,
50
+ "loss": 1.3531,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.0028197381671701913,
55
+ "grad_norm": 1.5572013854980469,
56
  "learning_rate": 5.6e-05,
57
+ "loss": 1.3038,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.0032225579053373615,
62
+ "grad_norm": 1.4803887605667114,
63
  "learning_rate": 6.400000000000001e-05,
64
+ "loss": 1.2703,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.0036253776435045317,
69
+ "grad_norm": 1.625388741493225,
70
  "learning_rate": 7.2e-05,
71
+ "loss": 1.4105,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.004028197381671702,
76
+ "grad_norm": 1.5087355375289917,
77
  "learning_rate": 8e-05,
78
+ "loss": 1.3726,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.004431017119838872,
83
+ "grad_norm": 1.4768939018249512,
84
  "learning_rate": 7.999917787833465e-05,
85
+ "loss": 1.4166,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.004833836858006042,
90
+ "grad_norm": 1.5856446027755737,
91
  "learning_rate": 7.999671154713278e-05,
92
+ "loss": 1.3702,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.0052366565961732125,
97
+ "grad_norm": 1.5647201538085938,
98
  "learning_rate": 7.99926011077756e-05,
99
+ "loss": 1.3436,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.005639476334340383,
104
+ "grad_norm": 1.5936949253082275,
105
  "learning_rate": 7.99868467292272e-05,
106
+ "loss": 1.4691,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.006042296072507553,
111
+ "grad_norm": 1.4276344776153564,
112
  "learning_rate": 7.997944864802752e-05,
113
+ "loss": 1.3246,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.006042296072507553,
118
+ "eval_loss": 1.3760384321212769,
119
+ "eval_runtime": 41.2678,
120
+ "eval_samples_per_second": 12.116,
121
+ "eval_steps_per_second": 12.116,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 1.4294793286582272e+16,
143
+ "train_batch_size": 4,
144
  "trial_name": null,
145
  "trial_params": null
146
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66abaed6a1b373e7334cdae7d507a46c867f9e3559096310c36c292f8ae401f3
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:629cd7345246299ec3001e6450fd588adaf1dec8f116c5e1eec7d37b34176eb8
3
  size 5496