nazim-ks commited on
Commit
8aabc7d
1 Parent(s): 6dc1519

Training in progress, step 420

Browse files
all_results.json CHANGED
@@ -3,12 +3,12 @@
3
  "eval_accuracy": 0.09523809523809523,
4
  "eval_f1": 0.016563146997929608,
5
  "eval_loss": NaN,
6
- "eval_runtime": 2.8277,
7
- "eval_samples_per_second": 37.133,
8
- "eval_steps_per_second": 4.951,
9
  "total_flos": 5.130291560557363e+17,
10
- "train_loss": 1.0611230804806664,
11
- "train_runtime": 556.247,
12
- "train_samples_per_second": 11.901,
13
- "train_steps_per_second": 0.755
14
  }
 
3
  "eval_accuracy": 0.09523809523809523,
4
  "eval_f1": 0.016563146997929608,
5
  "eval_loss": NaN,
6
+ "eval_runtime": 3.2578,
7
+ "eval_samples_per_second": 32.231,
8
+ "eval_steps_per_second": 2.149,
9
  "total_flos": 5.130291560557363e+17,
10
+ "train_loss": 0.0,
11
+ "train_runtime": 523.0916,
12
+ "train_samples_per_second": 12.656,
13
+ "train_steps_per_second": 0.803
14
  }
eval_results.json CHANGED
@@ -3,7 +3,7 @@
3
  "eval_accuracy": 0.09523809523809523,
4
  "eval_f1": 0.016563146997929608,
5
  "eval_loss": NaN,
6
- "eval_runtime": 2.8277,
7
- "eval_samples_per_second": 37.133,
8
- "eval_steps_per_second": 4.951
9
  }
 
3
  "eval_accuracy": 0.09523809523809523,
4
  "eval_f1": 0.016563146997929608,
5
  "eval_loss": NaN,
6
+ "eval_runtime": 3.2578,
7
+ "eval_samples_per_second": 32.231,
8
+ "eval_steps_per_second": 2.149
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:faea4fd9bce54a0a8ac21d7a2f02bfb00b0583239a931c498b5f6a552ec027ec
3
  size 343245508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5baf8e84a842228091905a149230e11e5bfef12404cdaa7929684ce205580301
3
  size 343245508
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 5.130291560557363e+17,
4
- "train_loss": 1.0611230804806664,
5
- "train_runtime": 556.247,
6
- "train_samples_per_second": 11.901,
7
- "train_steps_per_second": 0.755
8
  }
 
1
  {
2
  "epoch": 10.0,
3
  "total_flos": 5.130291560557363e+17,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 523.0916,
6
+ "train_samples_per_second": 12.656,
7
+ "train_steps_per_second": 0.803
8
  }
trainer_state.json CHANGED
@@ -3,263 +3,193 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 830,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.4939759036144578,
13
  "grad_norm": NaN,
14
- "learning_rate": 1.9012048192771087e-05,
15
  "loss": 0.0,
16
  "step": 41
17
  },
18
- {
19
- "epoch": 0.9879518072289156,
20
- "grad_norm": NaN,
21
- "learning_rate": 1.802409638554217e-05,
22
- "loss": 0.0,
23
- "step": 82
24
- },
25
  {
26
  "epoch": 1.0,
27
  "eval_accuracy": 0.09523809523809523,
28
  "eval_f1": 0.016563146997929608,
29
  "eval_loss": NaN,
30
- "eval_runtime": 2.8332,
31
- "eval_samples_per_second": 37.061,
32
- "eval_steps_per_second": 4.941,
33
- "step": 83
34
  },
35
  {
36
- "epoch": 1.4819277108433735,
37
  "grad_norm": NaN,
38
- "learning_rate": 1.7036144578313254e-05,
39
  "loss": 0.0,
40
- "step": 123
41
- },
42
- {
43
- "epoch": 1.9759036144578315,
44
- "grad_norm": NaN,
45
- "learning_rate": 1.604819277108434e-05,
46
- "loss": 0.0,
47
- "step": 164
48
  },
49
  {
50
  "epoch": 2.0,
51
  "eval_accuracy": 0.09523809523809523,
52
  "eval_f1": 0.016563146997929608,
53
  "eval_loss": NaN,
54
- "eval_runtime": 2.8629,
55
- "eval_samples_per_second": 36.676,
56
- "eval_steps_per_second": 4.89,
57
- "step": 166
58
  },
59
  {
60
- "epoch": 2.4698795180722892,
61
  "grad_norm": NaN,
62
- "learning_rate": 1.5060240963855424e-05,
63
  "loss": 0.0,
64
- "step": 205
65
- },
66
- {
67
- "epoch": 2.963855421686747,
68
- "grad_norm": NaN,
69
- "learning_rate": 1.4072289156626506e-05,
70
- "loss": 0.0,
71
- "step": 246
72
  },
73
  {
74
  "epoch": 3.0,
75
  "eval_accuracy": 0.09523809523809523,
76
  "eval_f1": 0.016563146997929608,
77
  "eval_loss": NaN,
78
- "eval_runtime": 2.8905,
79
- "eval_samples_per_second": 36.326,
80
- "eval_steps_per_second": 4.843,
81
- "step": 249
82
- },
83
- {
84
- "epoch": 3.4578313253012047,
85
- "grad_norm": NaN,
86
- "learning_rate": 1.3084337349397591e-05,
87
- "loss": 0.0,
88
- "step": 287
89
  },
90
  {
91
- "epoch": 3.9518072289156625,
92
  "grad_norm": NaN,
93
- "learning_rate": 1.2096385542168677e-05,
94
  "loss": 0.0,
95
- "step": 328
96
  },
97
  {
98
  "epoch": 4.0,
99
  "eval_accuracy": 0.09523809523809523,
100
  "eval_f1": 0.016563146997929608,
101
  "eval_loss": NaN,
102
- "eval_runtime": 2.8626,
103
- "eval_samples_per_second": 36.68,
104
- "eval_steps_per_second": 4.891,
105
- "step": 332
106
- },
107
- {
108
- "epoch": 4.445783132530121,
109
- "grad_norm": NaN,
110
- "learning_rate": 1.110843373493976e-05,
111
- "loss": 0.0,
112
- "step": 369
113
  },
114
  {
115
- "epoch": 4.9397590361445785,
116
  "grad_norm": NaN,
117
- "learning_rate": 1.0120481927710844e-05,
118
  "loss": 0.0,
119
- "step": 410
120
  },
121
  {
122
  "epoch": 5.0,
123
  "eval_accuracy": 0.09523809523809523,
124
  "eval_f1": 0.016563146997929608,
125
  "eval_loss": NaN,
126
- "eval_runtime": 2.8494,
127
- "eval_samples_per_second": 36.849,
128
- "eval_steps_per_second": 4.913,
129
- "step": 415
130
- },
131
- {
132
- "epoch": 5.433734939759036,
133
- "grad_norm": NaN,
134
- "learning_rate": 9.132530120481929e-06,
135
- "loss": 0.0,
136
- "step": 451
137
  },
138
  {
139
- "epoch": 5.927710843373494,
140
  "grad_norm": NaN,
141
- "learning_rate": 8.144578313253012e-06,
142
  "loss": 0.0,
143
- "step": 492
144
  },
145
  {
146
  "epoch": 6.0,
147
  "eval_accuracy": 0.09523809523809523,
148
  "eval_f1": 0.016563146997929608,
149
  "eval_loss": NaN,
150
- "eval_runtime": 2.8675,
151
- "eval_samples_per_second": 36.617,
152
- "eval_steps_per_second": 4.882,
153
- "step": 498
154
  },
155
  {
156
- "epoch": 6.421686746987952,
157
  "grad_norm": NaN,
158
- "learning_rate": 7.156626506024097e-06,
159
  "loss": 0.0,
160
- "step": 533
161
- },
162
- {
163
- "epoch": 6.9156626506024095,
164
- "grad_norm": NaN,
165
- "learning_rate": 6.168674698795182e-06,
166
- "loss": 0.0,
167
- "step": 574
168
  },
169
  {
170
  "epoch": 7.0,
171
  "eval_accuracy": 0.09523809523809523,
172
  "eval_f1": 0.016563146997929608,
173
  "eval_loss": NaN,
174
- "eval_runtime": 2.8441,
175
- "eval_samples_per_second": 36.919,
176
- "eval_steps_per_second": 4.922,
177
- "step": 581
178
  },
179
  {
180
- "epoch": 7.409638554216867,
181
  "grad_norm": NaN,
182
- "learning_rate": 5.180722891566266e-06,
183
  "loss": 0.0,
184
- "step": 615
185
- },
186
- {
187
- "epoch": 7.903614457831325,
188
- "grad_norm": NaN,
189
- "learning_rate": 4.19277108433735e-06,
190
- "loss": 0.0,
191
- "step": 656
192
  },
193
  {
194
  "epoch": 8.0,
195
  "eval_accuracy": 0.09523809523809523,
196
  "eval_f1": 0.016563146997929608,
197
  "eval_loss": NaN,
198
- "eval_runtime": 2.8258,
199
- "eval_samples_per_second": 37.157,
200
- "eval_steps_per_second": 4.954,
201
- "step": 664
202
- },
203
- {
204
- "epoch": 8.397590361445783,
205
- "grad_norm": NaN,
206
- "learning_rate": 3.204819277108434e-06,
207
- "loss": 0.0,
208
- "step": 697
209
  },
210
  {
211
- "epoch": 8.891566265060241,
212
  "grad_norm": NaN,
213
- "learning_rate": 2.2168674698795183e-06,
214
  "loss": 0.0,
215
- "step": 738
216
  },
217
  {
218
  "epoch": 9.0,
219
  "eval_accuracy": 0.09523809523809523,
220
  "eval_f1": 0.016563146997929608,
221
  "eval_loss": NaN,
222
- "eval_runtime": 2.8829,
223
- "eval_samples_per_second": 36.422,
224
- "eval_steps_per_second": 4.856,
225
- "step": 747
226
  },
227
  {
228
- "epoch": 9.385542168674698,
229
  "grad_norm": NaN,
230
- "learning_rate": 1.2289156626506025e-06,
231
  "loss": 0.0,
232
- "step": 779
233
- },
234
- {
235
- "epoch": 9.879518072289157,
236
- "grad_norm": NaN,
237
- "learning_rate": 2.409638554216868e-07,
238
- "loss": 0.0,
239
- "step": 820
240
  },
241
  {
242
  "epoch": 10.0,
243
  "eval_accuracy": 0.09523809523809523,
244
  "eval_f1": 0.016563146997929608,
245
  "eval_loss": NaN,
246
- "eval_runtime": 2.8484,
247
- "eval_samples_per_second": 36.863,
248
- "eval_steps_per_second": 4.915,
249
- "step": 830
250
  },
251
  {
252
  "epoch": 10.0,
253
- "step": 830,
254
  "total_flos": 5.130291560557363e+17,
255
  "train_loss": 0.0,
256
- "train_runtime": 548.693,
257
- "train_samples_per_second": 12.065,
258
- "train_steps_per_second": 1.513
259
  }
260
  ],
261
  "logging_steps": 41,
262
- "max_steps": 830,
263
  "num_input_tokens_seen": 0,
264
  "num_train_epochs": 10,
265
  "save_steps": 500,
@@ -276,7 +206,7 @@
276
  }
277
  },
278
  "total_flos": 5.130291560557363e+17,
279
- "train_batch_size": 8,
280
  "trial_name": null,
281
  "trial_params": null
282
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 420,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9761904761904762,
13
  "grad_norm": NaN,
14
+ "learning_rate": 1.804761904761905e-05,
15
  "loss": 0.0,
16
  "step": 41
17
  },
 
 
 
 
 
 
 
18
  {
19
  "epoch": 1.0,
20
  "eval_accuracy": 0.09523809523809523,
21
  "eval_f1": 0.016563146997929608,
22
  "eval_loss": NaN,
23
+ "eval_runtime": 3.2679,
24
+ "eval_samples_per_second": 32.131,
25
+ "eval_steps_per_second": 2.142,
26
+ "step": 42
27
  },
28
  {
29
+ "epoch": 1.9523809523809523,
30
  "grad_norm": NaN,
31
+ "learning_rate": 1.6095238095238096e-05,
32
  "loss": 0.0,
33
+ "step": 82
 
 
 
 
 
 
 
34
  },
35
  {
36
  "epoch": 2.0,
37
  "eval_accuracy": 0.09523809523809523,
38
  "eval_f1": 0.016563146997929608,
39
  "eval_loss": NaN,
40
+ "eval_runtime": 3.2909,
41
+ "eval_samples_per_second": 31.906,
42
+ "eval_steps_per_second": 2.127,
43
+ "step": 84
44
  },
45
  {
46
+ "epoch": 2.928571428571429,
47
  "grad_norm": NaN,
48
+ "learning_rate": 1.4142857142857145e-05,
49
  "loss": 0.0,
50
+ "step": 123
 
 
 
 
 
 
 
51
  },
52
  {
53
  "epoch": 3.0,
54
  "eval_accuracy": 0.09523809523809523,
55
  "eval_f1": 0.016563146997929608,
56
  "eval_loss": NaN,
57
+ "eval_runtime": 3.2747,
58
+ "eval_samples_per_second": 32.064,
59
+ "eval_steps_per_second": 2.138,
60
+ "step": 126
 
 
 
 
 
 
 
61
  },
62
  {
63
+ "epoch": 3.9047619047619047,
64
  "grad_norm": NaN,
65
+ "learning_rate": 1.2190476190476192e-05,
66
  "loss": 0.0,
67
+ "step": 164
68
  },
69
  {
70
  "epoch": 4.0,
71
  "eval_accuracy": 0.09523809523809523,
72
  "eval_f1": 0.016563146997929608,
73
  "eval_loss": NaN,
74
+ "eval_runtime": 3.2807,
75
+ "eval_samples_per_second": 32.006,
76
+ "eval_steps_per_second": 2.134,
77
+ "step": 168
 
 
 
 
 
 
 
78
  },
79
  {
80
+ "epoch": 4.880952380952381,
81
  "grad_norm": NaN,
82
+ "learning_rate": 1.0238095238095238e-05,
83
  "loss": 0.0,
84
+ "step": 205
85
  },
86
  {
87
  "epoch": 5.0,
88
  "eval_accuracy": 0.09523809523809523,
89
  "eval_f1": 0.016563146997929608,
90
  "eval_loss": NaN,
91
+ "eval_runtime": 3.2895,
92
+ "eval_samples_per_second": 31.919,
93
+ "eval_steps_per_second": 2.128,
94
+ "step": 210
 
 
 
 
 
 
 
95
  },
96
  {
97
+ "epoch": 5.857142857142857,
98
  "grad_norm": NaN,
99
+ "learning_rate": 8.285714285714287e-06,
100
  "loss": 0.0,
101
+ "step": 246
102
  },
103
  {
104
  "epoch": 6.0,
105
  "eval_accuracy": 0.09523809523809523,
106
  "eval_f1": 0.016563146997929608,
107
  "eval_loss": NaN,
108
+ "eval_runtime": 3.2926,
109
+ "eval_samples_per_second": 31.89,
110
+ "eval_steps_per_second": 2.126,
111
+ "step": 252
112
  },
113
  {
114
+ "epoch": 6.833333333333333,
115
  "grad_norm": NaN,
116
+ "learning_rate": 6.333333333333333e-06,
117
  "loss": 0.0,
118
+ "step": 287
 
 
 
 
 
 
 
119
  },
120
  {
121
  "epoch": 7.0,
122
  "eval_accuracy": 0.09523809523809523,
123
  "eval_f1": 0.016563146997929608,
124
  "eval_loss": NaN,
125
+ "eval_runtime": 3.3043,
126
+ "eval_samples_per_second": 31.777,
127
+ "eval_steps_per_second": 2.118,
128
+ "step": 294
129
  },
130
  {
131
+ "epoch": 7.809523809523809,
132
  "grad_norm": NaN,
133
+ "learning_rate": 4.3809523809523815e-06,
134
  "loss": 0.0,
135
+ "step": 328
 
 
 
 
 
 
 
136
  },
137
  {
138
  "epoch": 8.0,
139
  "eval_accuracy": 0.09523809523809523,
140
  "eval_f1": 0.016563146997929608,
141
  "eval_loss": NaN,
142
+ "eval_runtime": 3.2955,
143
+ "eval_samples_per_second": 31.862,
144
+ "eval_steps_per_second": 2.124,
145
+ "step": 336
 
 
 
 
 
 
 
146
  },
147
  {
148
+ "epoch": 8.785714285714286,
149
  "grad_norm": NaN,
150
+ "learning_rate": 2.428571428571429e-06,
151
  "loss": 0.0,
152
+ "step": 369
153
  },
154
  {
155
  "epoch": 9.0,
156
  "eval_accuracy": 0.09523809523809523,
157
  "eval_f1": 0.016563146997929608,
158
  "eval_loss": NaN,
159
+ "eval_runtime": 3.3393,
160
+ "eval_samples_per_second": 31.444,
161
+ "eval_steps_per_second": 2.096,
162
+ "step": 378
163
  },
164
  {
165
+ "epoch": 9.761904761904763,
166
  "grad_norm": NaN,
167
+ "learning_rate": 4.7619047619047623e-07,
168
  "loss": 0.0,
169
+ "step": 410
 
 
 
 
 
 
 
170
  },
171
  {
172
  "epoch": 10.0,
173
  "eval_accuracy": 0.09523809523809523,
174
  "eval_f1": 0.016563146997929608,
175
  "eval_loss": NaN,
176
+ "eval_runtime": 3.2936,
177
+ "eval_samples_per_second": 31.88,
178
+ "eval_steps_per_second": 2.125,
179
+ "step": 420
180
  },
181
  {
182
  "epoch": 10.0,
183
+ "step": 420,
184
  "total_flos": 5.130291560557363e+17,
185
  "train_loss": 0.0,
186
+ "train_runtime": 523.0916,
187
+ "train_samples_per_second": 12.656,
188
+ "train_steps_per_second": 0.803
189
  }
190
  ],
191
  "logging_steps": 41,
192
+ "max_steps": 420,
193
  "num_input_tokens_seen": 0,
194
  "num_train_epochs": 10,
195
  "save_steps": 500,
 
206
  }
207
  },
208
  "total_flos": 5.130291560557363e+17,
209
+ "train_batch_size": 16,
210
  "trial_name": null,
211
  "trial_params": null
212
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0700128b977dcfbe9aadb63bb0ff124cbe09812beff12427b460eda1058c382a
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b70fa78c33b8a562ff97b75b265095a179391c9993acd8b4fa6900d706aa695
3
  size 5240