chansung committed on
Commit
3df570d
·
verified ·
1 Parent(s): 01caa7c

Model save

Browse files
Files changed (4) hide show
  1. README.md +7 -7
  2. all_results.json +7 -12
  3. train_results.json +7 -7
  4. trainer_state.json +262 -101
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.5500
24
 
25
  ## Model description
26
 
@@ -48,22 +48,22 @@ The following hyperparameters were used during training:
48
  - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 192
50
  - total_eval_batch_size: 96
51
- - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
  - num_epochs: 1
55
 
56
  ### Training results
57
 
58
- | Training Loss | Epoch | Step | Validation Loss |
59
- |:-------------:|:------:|:----:|:---------------:|
60
- | 1.5519 | 0.9951 | 102 | 1.5500 |
61
 
62
 
63
  ### Framework versions
64
 
65
  - PEFT 0.13.1.dev0
66
- - Transformers 4.46.2
67
- - Pytorch 2.5.1+cu124
68
  - Datasets 3.1.0
69
  - Tokenizers 0.20.3
 
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.4864
24
 
25
  ## Model description
26
 
 
48
  - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 192
50
  - total_eval_batch_size: 96
51
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
  - num_epochs: 1
55
 
56
  ### Training results
57
 
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.982 | 1.0 | 216 | 1.4864 |
61
 
62
 
63
  ### Framework versions
64
 
65
  - PEFT 0.13.1.dev0
66
+ - Transformers 4.46.3
67
+ - Pytorch 2.3.1+cu121
68
  - Datasets 3.1.0
69
  - Tokenizers 0.20.3
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9951219512195122,
3
- "eval_loss": 1.5499577522277832,
4
- "eval_runtime": 1.7323,
5
- "eval_samples": 518,
6
- "eval_samples_per_second": 112.567,
7
- "eval_steps_per_second": 1.732,
8
- "total_flos": 4.281868708751606e+17,
9
- "train_loss": 1.6186961426454431,
10
- "train_runtime": 363.7432,
11
- "train_samples": 51241,
12
- "train_samples_per_second": 53.848,
13
- "train_steps_per_second": 0.28
14
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 9.067486658407956e+17,
4
+ "train_loss": 1.0541787544886272,
5
+ "train_runtime": 774.5406,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 53.491,
8
+ "train_steps_per_second": 0.279
 
 
 
 
 
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9951219512195122,
3
- "total_flos": 4.281868708751606e+17,
4
- "train_loss": 1.6186961426454431,
5
- "train_runtime": 363.7432,
6
- "train_samples": 51241,
7
- "train_samples_per_second": 53.848,
8
- "train_steps_per_second": 0.28
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 9.067486658407956e+17,
4
+ "train_loss": 1.0541787544886272,
5
+ "train_runtime": 774.5406,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 53.491,
8
+ "train_steps_per_second": 0.279
9
  }
trainer_state.json CHANGED
@@ -1,180 +1,341 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9951219512195122,
5
  "eval_steps": 500,
6
- "global_step": 102,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.00975609756097561,
13
- "grad_norm": 2.371129274368286,
14
- "learning_rate": 1.8181818181818182e-05,
15
- "loss": 2.0473,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.04878048780487805,
20
- "grad_norm": 2.3514864444732666,
21
- "learning_rate": 9.090909090909092e-05,
22
- "loss": 2.0124,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.0975609756097561,
27
- "grad_norm": 1.9732930660247803,
28
- "learning_rate": 0.00018181818181818183,
29
- "loss": 1.9222,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.14634146341463414,
34
- "grad_norm": 2.1772541999816895,
35
- "learning_rate": 0.00019904804439875633,
36
- "loss": 1.7928,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.1951219512195122,
41
- "grad_norm": 1.5341215133666992,
42
- "learning_rate": 0.00019521176659107142,
43
- "loss": 1.7354,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.24390243902439024,
48
- "grad_norm": 1.4509671926498413,
49
- "learning_rate": 0.000188545602565321,
50
- "loss": 1.6622,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.2926829268292683,
55
- "grad_norm": 0.9245131015777588,
56
- "learning_rate": 0.00017924768419510904,
57
- "loss": 1.5826,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.34146341463414637,
62
- "grad_norm": 0.8397857546806335,
63
- "learning_rate": 0.00016759436441447545,
64
- "loss": 1.5755,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.3902439024390244,
69
- "grad_norm": 0.7870491743087769,
70
- "learning_rate": 0.00015393200344991995,
71
- "loss": 1.5685,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.43902439024390244,
76
- "grad_norm": 0.7064230442047119,
77
- "learning_rate": 0.0001386666742941419,
78
- "loss": 1.5434,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.4878048780487805,
83
- "grad_norm": 0.8221641182899475,
84
- "learning_rate": 0.00012225209339563145,
85
- "loss": 1.5552,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.5365853658536586,
90
- "grad_norm": 0.7419559359550476,
91
- "learning_rate": 0.00010517613528842097,
92
- "loss": 1.5569,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.5853658536585366,
97
- "grad_norm": 0.7535139322280884,
98
- "learning_rate": 8.79463319744677e-05,
99
- "loss": 1.5627,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.6341463414634146,
104
- "grad_norm": 0.709441065788269,
105
- "learning_rate": 7.107478804634325e-05,
106
- "loss": 1.5634,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.6829268292682927,
111
- "grad_norm": 0.6745243668556213,
112
- "learning_rate": 5.506295990328385e-05,
113
- "loss": 1.5537,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.7317073170731707,
118
- "grad_norm": 0.6785560250282288,
119
- "learning_rate": 4.038675145307747e-05,
120
- "loss": 1.5373,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.7804878048780488,
125
- "grad_norm": 0.7758954167366028,
126
- "learning_rate": 2.7482369285662378e-05,
127
- "loss": 1.5341,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.8292682926829268,
132
- "grad_norm": 0.6606050729751587,
133
- "learning_rate": 1.6733357731279377e-05,
134
- "loss": 1.5326,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.8780487804878049,
139
- "grad_norm": 0.8155940771102905,
140
- "learning_rate": 8.45919914746337e-06,
141
- "loss": 1.5411,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.926829268292683,
146
- "grad_norm": 0.7202953100204468,
147
- "learning_rate": 2.905818257394799e-06,
148
- "loss": 1.5414,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.975609756097561,
153
- "grad_norm": 0.670049786567688,
154
- "learning_rate": 2.382727698752474e-07,
155
- "loss": 1.5519,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.9951219512195122,
160
- "eval_loss": 1.5499577522277832,
161
- "eval_runtime": 1.7555,
162
- "eval_samples_per_second": 111.078,
163
- "eval_steps_per_second": 1.709,
164
- "step": 102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  },
166
  {
167
- "epoch": 0.9951219512195122,
168
- "step": 102,
169
- "total_flos": 4.281868708751606e+17,
170
- "train_loss": 1.6186961426454431,
171
- "train_runtime": 363.7432,
172
- "train_samples_per_second": 53.848,
173
- "train_steps_per_second": 0.28
174
  }
175
  ],
176
  "logging_steps": 5,
177
- "max_steps": 102,
178
  "num_input_tokens_seen": 0,
179
  "num_train_epochs": 1,
180
  "save_steps": 100,
@@ -190,7 +351,7 @@
190
  "attributes": {}
191
  }
192
  },
193
- "total_flos": 4.281868708751606e+17,
194
  "train_batch_size": 12,
195
  "trial_name": null,
196
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 216,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.004629629629629629,
13
+ "grad_norm": 2.620875597000122,
14
+ "learning_rate": 9.090909090909091e-06,
15
+ "loss": 1.5605,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.023148148148148147,
20
+ "grad_norm": 2.674976348876953,
21
+ "learning_rate": 4.545454545454546e-05,
22
+ "loss": 1.5457,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.046296296296296294,
27
+ "grad_norm": 2.3885157108306885,
28
+ "learning_rate": 9.090909090909092e-05,
29
+ "loss": 1.5006,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.06944444444444445,
34
+ "grad_norm": 2.1752545833587646,
35
+ "learning_rate": 0.00013636363636363637,
36
+ "loss": 1.4093,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.09259259259259259,
41
+ "grad_norm": 2.1516635417938232,
42
+ "learning_rate": 0.00018181818181818183,
43
+ "loss": 1.301,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.11574074074074074,
48
+ "grad_norm": 1.6158533096313477,
49
+ "learning_rate": 0.0001998820159279591,
50
+ "loss": 1.195,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.1388888888888889,
55
+ "grad_norm": 0.7115136384963989,
56
+ "learning_rate": 0.00019916201012264254,
57
+ "loss": 1.1232,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.16203703703703703,
62
+ "grad_norm": 0.5917097926139832,
63
+ "learning_rate": 0.00019779225723955707,
64
+ "loss": 1.0867,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.18518518518518517,
69
+ "grad_norm": 0.6770131587982178,
70
+ "learning_rate": 0.00019578173241879872,
71
+ "loss": 1.0683,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.20833333333333334,
76
+ "grad_norm": 0.5598504543304443,
77
+ "learning_rate": 0.00019314360938108425,
78
+ "loss": 1.0576,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.23148148148148148,
83
+ "grad_norm": 0.5453623533248901,
84
+ "learning_rate": 0.00018989517410853955,
85
+ "loss": 1.0375,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.25462962962962965,
90
+ "grad_norm": 0.507411539554596,
91
+ "learning_rate": 0.00018605771158039253,
92
+ "loss": 1.0349,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.2777777777777778,
97
+ "grad_norm": 0.5281575918197632,
98
+ "learning_rate": 0.0001816563663057211,
99
+ "loss": 1.0306,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.30092592592592593,
104
+ "grad_norm": 0.49278953671455383,
105
+ "learning_rate": 0.00017671997756709863,
106
+ "loss": 1.0232,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.32407407407407407,
111
+ "grad_norm": 0.44363367557525635,
112
+ "learning_rate": 0.00017128089045468294,
113
+ "loss": 1.0206,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.3472222222222222,
118
+ "grad_norm": 0.4600500464439392,
119
+ "learning_rate": 0.00016537474392892528,
120
+ "loss": 1.0185,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.37037037037037035,
125
+ "grad_norm": 0.4178927540779114,
126
+ "learning_rate": 0.00015904023730059228,
127
+ "loss": 1.0105,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.39351851851851855,
132
+ "grad_norm": 0.5482760071754456,
133
+ "learning_rate": 0.000152318876658213,
134
+ "loss": 1.0164,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.4166666666666667,
139
+ "grad_norm": 0.4235095679759979,
140
+ "learning_rate": 0.00014525470290445392,
141
+ "loss": 1.0151,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.4398148148148148,
146
+ "grad_norm": 0.4932386875152588,
147
+ "learning_rate": 0.00013789400318343068,
148
+ "loss": 1.0081,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.46296296296296297,
153
+ "grad_norm": 0.4402116537094116,
154
+ "learning_rate": 0.00013028500758979506,
155
+ "loss": 1.0094,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.4861111111111111,
160
+ "grad_norm": 0.4497814476490021,
161
+ "learning_rate": 0.00012247757314687297,
162
+ "loss": 0.9996,
163
+ "step": 105
164
+ },
165
+ {
166
+ "epoch": 0.5092592592592593,
167
+ "grad_norm": 0.43658843636512756,
168
+ "learning_rate": 0.00011452285712454904,
169
+ "loss": 1.004,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.5324074074074074,
174
+ "grad_norm": 0.4577714800834656,
175
+ "learning_rate": 0.00010647298183744359,
176
+ "loss": 0.9936,
177
+ "step": 115
178
+ },
179
+ {
180
+ "epoch": 0.5555555555555556,
181
+ "grad_norm": 0.44585293531417847,
182
+ "learning_rate": 9.838069311974986e-05,
183
+ "loss": 0.9953,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.5787037037037037,
188
+ "grad_norm": 0.4536885619163513,
189
+ "learning_rate": 9.02990147145352e-05,
190
+ "loss": 0.9972,
191
+ "step": 125
192
+ },
193
+ {
194
+ "epoch": 0.6018518518518519,
195
+ "grad_norm": 0.4714517593383789,
196
+ "learning_rate": 8.228090084207774e-05,
197
+ "loss": 0.9963,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.625,
202
+ "grad_norm": 0.45539769530296326,
203
+ "learning_rate": 7.437888922374276e-05,
204
+ "loss": 1.0039,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.6481481481481481,
209
+ "grad_norm": 0.4661619961261749,
210
+ "learning_rate": 6.664475683491796e-05,
211
+ "loss": 0.996,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.6712962962962963,
216
+ "grad_norm": 0.4308771789073944,
217
+ "learning_rate": 5.9129180642644414e-05,
218
+ "loss": 0.9968,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.6944444444444444,
223
+ "grad_norm": 0.42372000217437744,
224
+ "learning_rate": 5.1881405550919493e-05,
225
+ "loss": 0.997,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.7175925925925926,
230
+ "grad_norm": 0.4466856122016907,
231
+ "learning_rate": 4.494892172941965e-05,
232
+ "loss": 0.997,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.7407407407407407,
237
+ "grad_norm": 0.47718337178230286,
238
+ "learning_rate": 3.8377153439907266e-05,
239
+ "loss": 0.9932,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.7638888888888888,
244
+ "grad_norm": 0.4494944214820862,
245
+ "learning_rate": 3.2209161399249674e-05,
246
+ "loss": 0.981,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.7870370370370371,
251
+ "grad_norm": 0.4661237597465515,
252
+ "learning_rate": 2.6485360629279987e-05,
253
+ "loss": 0.988,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.8101851851851852,
258
+ "grad_norm": 0.4337325394153595,
259
+ "learning_rate": 2.1243255642254578e-05,
260
+ "loss": 0.9888,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.8333333333333334,
265
+ "grad_norm": 0.4312609136104584,
266
+ "learning_rate": 1.65171946970729e-05,
267
+ "loss": 0.9938,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.8564814814814815,
272
+ "grad_norm": 0.41870856285095215,
273
+ "learning_rate": 1.233814473646524e-05,
274
+ "loss": 0.9948,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.8796296296296297,
279
+ "grad_norm": 0.47287535667419434,
280
+ "learning_rate": 8.733488479845997e-06,
281
+ "loss": 0.9905,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.9027777777777778,
286
+ "grad_norm": 0.42414429783821106,
287
+ "learning_rate": 5.726845001356573e-06,
288
+ "loss": 0.9834,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.9259259259259259,
293
+ "grad_norm": 0.4528570771217346,
294
+ "learning_rate": 3.3379149687388867e-06,
295
+ "loss": 0.9822,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.9490740740740741,
300
+ "grad_norm": 0.4307001233100891,
301
+ "learning_rate": 1.5823515570925763e-06,
302
+ "loss": 0.9802,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.9722222222222222,
307
+ "grad_norm": 0.42982199788093567,
308
+ "learning_rate": 4.7165788333860536e-07,
309
+ "loss": 0.9846,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.9953703703703703,
314
+ "grad_norm": 0.4325573146343231,
315
+ "learning_rate": 1.3111633436779791e-08,
316
+ "loss": 0.982,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 1.0,
321
+ "eval_loss": 1.4864426851272583,
322
+ "eval_runtime": 0.5986,
323
+ "eval_samples_per_second": 18.377,
324
+ "eval_steps_per_second": 1.671,
325
+ "step": 216
326
  },
327
  {
328
+ "epoch": 1.0,
329
+ "step": 216,
330
+ "total_flos": 9.067486658407956e+17,
331
+ "train_loss": 1.0541787544886272,
332
+ "train_runtime": 774.5406,
333
+ "train_samples_per_second": 53.491,
334
+ "train_steps_per_second": 0.279
335
  }
336
  ],
337
  "logging_steps": 5,
338
+ "max_steps": 216,
339
  "num_input_tokens_seen": 0,
340
  "num_train_epochs": 1,
341
  "save_steps": 100,
 
351
  "attributes": {}
352
  }
353
  },
354
+ "total_flos": 9.067486658407956e+17,
355
  "train_batch_size": 12,
356
  "trial_name": null,
357
  "trial_params": null