sedrickkeh committed
Commit 608c49d · verified · 1 Parent(s): 2f80188

End of training

README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_5_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # top_5_ranking_stackexchange
 
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_5_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7860
 
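With the dataset now named in the card and an eval loss of 0.7860 reported, a minimal usage sketch follows. It assumes the fine-tuned weights are published on the Hub with `transformers` and `accelerate` installed; the repo id below is a placeholder, not something stated in this commit.

```python
# Minimal sketch, not part of the commit: load the fine-tuned Llama-3.1-8B checkpoint.
# NOTE: the repo id is a hypothetical placeholder; substitute the actual model repo.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "mlfoundations-dev/top_5_ranking_stackexchange"  # placeholder / assumption

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype="auto",   # keep the checkpoint's native precision
    device_map="auto",    # requires `accelerate`
)

prompt = "Rank the following Stack Exchange answers by helpfulness:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```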
all_results.json ADDED
@@ -0,0 +1,12 @@
+{
+    "epoch": 2.9962546816479403,
+    "eval_loss": 0.7859531044960022,
+    "eval_runtime": 214.0804,
+    "eval_samples_per_second": 25.21,
+    "eval_steps_per_second": 0.397,
+    "total_flos": 1004812967608320.0,
+    "train_loss": 0.773849273522695,
+    "train_runtime": 35442.7976,
+    "train_samples_per_second": 8.678,
+    "train_steps_per_second": 0.017
+}
eval_results.json ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 2.9962546816479403,
+    "eval_loss": 0.7859531044960022,
+    "eval_runtime": 214.0804,
+    "eval_samples_per_second": 25.21,
+    "eval_steps_per_second": 0.397
+}
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 2.9962546816479403,
+    "total_flos": 1004812967608320.0,
+    "train_loss": 0.773849273522695,
+    "train_runtime": 35442.7976,
+    "train_samples_per_second": 8.678,
+    "train_steps_per_second": 0.017
+}
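A quick cross-check of these figures (a back-of-the-envelope estimate, not part of the commit): 8.678 samples/s × 35442.8 s ≈ 307,600 training samples over roughly 3 epochs, i.e. about 102,500 examples per epoch. Spread over the 600 optimizer steps recorded in trainer_state.json below, that implies an effective batch of roughly 512 sequences per step, which with the logged per-device train_batch_size of 8 would correspond to 64-way data parallelism and/or gradient accumulation.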
trainer_state.json ADDED
@@ -0,0 +1,486 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9962546816479403,
+  "eval_steps": 500,
+  "global_step": 600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.049937578027465665,
+      "grad_norm": 29.759422123901825,
+      "learning_rate": 5e-06,
+      "loss": 1.0298,
+      "step": 10
+    },
+    {
+      "epoch": 0.09987515605493133,
+      "grad_norm": 1.659620518169597,
+      "learning_rate": 5e-06,
+      "loss": 0.9418,
+      "step": 20
+    },
+    {
+      "epoch": 0.149812734082397,
+      "grad_norm": 1.6655817038532692,
+      "learning_rate": 5e-06,
+      "loss": 0.9067,
+      "step": 30
+    },
+    {
+      "epoch": 0.19975031210986266,
+      "grad_norm": 3.758532618440977,
+      "learning_rate": 5e-06,
+      "loss": 0.8837,
+      "step": 40
+    },
+    {
+      "epoch": 0.24968789013732834,
+      "grad_norm": 1.0914089528556838,
+      "learning_rate": 5e-06,
+      "loss": 0.8683,
+      "step": 50
+    },
+    {
+      "epoch": 0.299625468164794,
+      "grad_norm": 0.8217188574851603,
+      "learning_rate": 5e-06,
+      "loss": 0.8554,
+      "step": 60
+    },
+    {
+      "epoch": 0.3495630461922597,
+      "grad_norm": 2.6152137256768913,
+      "learning_rate": 5e-06,
+      "loss": 0.8501,
+      "step": 70
+    },
+    {
+      "epoch": 0.3995006242197253,
+      "grad_norm": 0.6790645381435163,
+      "learning_rate": 5e-06,
+      "loss": 0.8376,
+      "step": 80
+    },
+    {
+      "epoch": 0.449438202247191,
+      "grad_norm": 1.050547891408586,
+      "learning_rate": 5e-06,
+      "loss": 0.8389,
+      "step": 90
+    },
+    {
+      "epoch": 0.4993757802746567,
+      "grad_norm": 0.61531936130051,
+      "learning_rate": 5e-06,
+      "loss": 0.8283,
+      "step": 100
+    },
+    {
+      "epoch": 0.5493133583021224,
+      "grad_norm": 0.7158756598167714,
+      "learning_rate": 5e-06,
+      "loss": 0.8285,
+      "step": 110
+    },
+    {
+      "epoch": 0.599250936329588,
+      "grad_norm": 0.6741740850175894,
+      "learning_rate": 5e-06,
+      "loss": 0.8208,
+      "step": 120
+    },
+    {
+      "epoch": 0.6491885143570537,
+      "grad_norm": 0.727395321147856,
+      "learning_rate": 5e-06,
+      "loss": 0.8226,
+      "step": 130
+    },
+    {
+      "epoch": 0.6991260923845194,
+      "grad_norm": 0.5979029940330495,
+      "learning_rate": 5e-06,
+      "loss": 0.8218,
+      "step": 140
+    },
+    {
+      "epoch": 0.7490636704119851,
+      "grad_norm": 0.7354628133399658,
+      "learning_rate": 5e-06,
+      "loss": 0.8146,
+      "step": 150
+    },
+    {
+      "epoch": 0.7990012484394506,
+      "grad_norm": 1.0797869029852383,
+      "learning_rate": 5e-06,
+      "loss": 0.811,
+      "step": 160
+    },
+    {
+      "epoch": 0.8489388264669163,
+      "grad_norm": 0.5732338176348897,
+      "learning_rate": 5e-06,
+      "loss": 0.8117,
+      "step": 170
+    },
+    {
+      "epoch": 0.898876404494382,
+      "grad_norm": 0.516647597175339,
+      "learning_rate": 5e-06,
+      "loss": 0.8099,
+      "step": 180
+    },
+    {
+      "epoch": 0.9488139825218477,
+      "grad_norm": 0.595235164677505,
+      "learning_rate": 5e-06,
+      "loss": 0.8062,
+      "step": 190
+    },
+    {
+      "epoch": 0.9987515605493134,
+      "grad_norm": 0.5491264032653016,
+      "learning_rate": 5e-06,
+      "loss": 0.7992,
+      "step": 200
+    },
+    {
+      "epoch": 0.9987515605493134,
+      "eval_loss": 0.8015628457069397,
+      "eval_runtime": 212.5855,
+      "eval_samples_per_second": 25.387,
+      "eval_steps_per_second": 0.4,
+      "step": 200
+    },
+    {
+      "epoch": 1.048689138576779,
+      "grad_norm": 1.1815779977106664,
+      "learning_rate": 5e-06,
+      "loss": 0.8245,
+      "step": 210
+    },
+    {
+      "epoch": 1.0986267166042447,
+      "grad_norm": 0.8607004375758024,
+      "learning_rate": 5e-06,
+      "loss": 0.7537,
+      "step": 220
+    },
+    {
+      "epoch": 1.1485642946317103,
+      "grad_norm": 0.6382531480247193,
+      "learning_rate": 5e-06,
+      "loss": 0.7617,
+      "step": 230
+    },
+    {
+      "epoch": 1.198501872659176,
+      "grad_norm": 0.5848998545511357,
+      "learning_rate": 5e-06,
+      "loss": 0.7635,
+      "step": 240
+    },
+    {
+      "epoch": 1.2484394506866416,
+      "grad_norm": 0.6856479277932508,
+      "learning_rate": 5e-06,
+      "loss": 0.7623,
+      "step": 250
+    },
+    {
+      "epoch": 1.2983770287141074,
+      "grad_norm": 0.7819524787327043,
+      "learning_rate": 5e-06,
+      "loss": 0.7567,
+      "step": 260
+    },
+    {
+      "epoch": 1.348314606741573,
+      "grad_norm": 0.6831313099201878,
+      "learning_rate": 5e-06,
+      "loss": 0.7606,
+      "step": 270
+    },
+    {
+      "epoch": 1.3982521847690386,
+      "grad_norm": 0.6700856388131974,
+      "learning_rate": 5e-06,
+      "loss": 0.7553,
+      "step": 280
+    },
+    {
+      "epoch": 1.4481897627965044,
+      "grad_norm": 0.5874295240823044,
+      "learning_rate": 5e-06,
+      "loss": 0.7575,
+      "step": 290
+    },
+    {
+      "epoch": 1.4981273408239701,
+      "grad_norm": 0.6100148315517313,
+      "learning_rate": 5e-06,
+      "loss": 0.7523,
+      "step": 300
+    },
+    {
+      "epoch": 1.5480649188514357,
+      "grad_norm": 0.6291672713518774,
+      "learning_rate": 5e-06,
+      "loss": 0.759,
+      "step": 310
+    },
+    {
+      "epoch": 1.5980024968789013,
+      "grad_norm": 0.7275448418797654,
+      "learning_rate": 5e-06,
+      "loss": 0.7532,
+      "step": 320
+    },
+    {
+      "epoch": 1.647940074906367,
+      "grad_norm": 0.6798292981346045,
+      "learning_rate": 5e-06,
+      "loss": 0.7652,
+      "step": 330
+    },
+    {
+      "epoch": 1.6978776529338329,
+      "grad_norm": 0.7320780258400261,
+      "learning_rate": 5e-06,
+      "loss": 0.7554,
+      "step": 340
+    },
+    {
+      "epoch": 1.7478152309612984,
+      "grad_norm": 0.6107676047027211,
+      "learning_rate": 5e-06,
+      "loss": 0.757,
+      "step": 350
+    },
+    {
+      "epoch": 1.797752808988764,
+      "grad_norm": 0.5083613384732135,
+      "learning_rate": 5e-06,
+      "loss": 0.7576,
+      "step": 360
+    },
+    {
+      "epoch": 1.8476903870162298,
+      "grad_norm": 0.5021025632111004,
+      "learning_rate": 5e-06,
+      "loss": 0.7584,
+      "step": 370
+    },
+    {
+      "epoch": 1.8976279650436954,
+      "grad_norm": 0.6593302140861815,
+      "learning_rate": 5e-06,
+      "loss": 0.755,
+      "step": 380
+    },
+    {
+      "epoch": 1.947565543071161,
+      "grad_norm": 0.5669868340257436,
+      "learning_rate": 5e-06,
+      "loss": 0.7478,
+      "step": 390
+    },
+    {
+      "epoch": 1.9975031210986267,
+      "grad_norm": 0.5745040341281294,
+      "learning_rate": 5e-06,
+      "loss": 0.7538,
+      "step": 400
+    },
+    {
+      "epoch": 1.9975031210986267,
+      "eval_loss": 0.7874204516410828,
+      "eval_runtime": 212.6782,
+      "eval_samples_per_second": 25.376,
+      "eval_steps_per_second": 0.4,
+      "step": 400
+    },
+    {
+      "epoch": 2.0474406991260925,
+      "grad_norm": 0.6952347883899184,
+      "learning_rate": 5e-06,
+      "loss": 0.7611,
+      "step": 410
+    },
+    {
+      "epoch": 2.097378277153558,
+      "grad_norm": 0.6000479994459602,
+      "learning_rate": 5e-06,
+      "loss": 0.7037,
+      "step": 420
+    },
+    {
+      "epoch": 2.1473158551810236,
+      "grad_norm": 0.6575266096005482,
+      "learning_rate": 5e-06,
+      "loss": 0.7089,
+      "step": 430
+    },
+    {
+      "epoch": 2.1972534332084894,
+      "grad_norm": 0.7384159721059136,
+      "learning_rate": 5e-06,
+      "loss": 0.7057,
+      "step": 440
+    },
+    {
+      "epoch": 2.247191011235955,
+      "grad_norm": 0.6735840214535883,
+      "learning_rate": 5e-06,
+      "loss": 0.706,
+      "step": 450
+    },
+    {
+      "epoch": 2.2971285892634206,
+      "grad_norm": 0.7153617513297972,
+      "learning_rate": 5e-06,
+      "loss": 0.7064,
+      "step": 460
+    },
+    {
+      "epoch": 2.3470661672908864,
+      "grad_norm": 0.7396133098853745,
+      "learning_rate": 5e-06,
+      "loss": 0.7049,
+      "step": 470
+    },
+    {
+      "epoch": 2.397003745318352,
+      "grad_norm": 0.6440383221784979,
+      "learning_rate": 5e-06,
+      "loss": 0.705,
+      "step": 480
+    },
+    {
+      "epoch": 2.4469413233458175,
+      "grad_norm": 0.5481603423583875,
+      "learning_rate": 5e-06,
+      "loss": 0.709,
+      "step": 490
+    },
+    {
+      "epoch": 2.4968789013732833,
+      "grad_norm": 0.6611697985224058,
+      "learning_rate": 5e-06,
+      "loss": 0.71,
+      "step": 500
+    },
+    {
+      "epoch": 2.546816479400749,
+      "grad_norm": 0.6252639550455323,
+      "learning_rate": 5e-06,
+      "loss": 0.7128,
+      "step": 510
+    },
+    {
+      "epoch": 2.596754057428215,
+      "grad_norm": 0.578764019014536,
+      "learning_rate": 5e-06,
+      "loss": 0.7116,
+      "step": 520
+    },
+    {
+      "epoch": 2.6466916354556806,
+      "grad_norm": 0.5718219886250622,
+      "learning_rate": 5e-06,
+      "loss": 0.711,
+      "step": 530
+    },
+    {
+      "epoch": 2.696629213483146,
+      "grad_norm": 0.6480951015929783,
+      "learning_rate": 5e-06,
+      "loss": 0.706,
+      "step": 540
+    },
+    {
+      "epoch": 2.746566791510612,
+      "grad_norm": 0.568128147930456,
+      "learning_rate": 5e-06,
+      "loss": 0.7144,
+      "step": 550
+    },
+    {
+      "epoch": 2.796504369538077,
+      "grad_norm": 0.7016907742592169,
+      "learning_rate": 5e-06,
+      "loss": 0.708,
+      "step": 560
+    },
+    {
+      "epoch": 2.846441947565543,
+      "grad_norm": 0.6718047517989062,
+      "learning_rate": 5e-06,
+      "loss": 0.7147,
+      "step": 570
+    },
+    {
+      "epoch": 2.8963795255930087,
+      "grad_norm": 0.7869266984488797,
+      "learning_rate": 5e-06,
+      "loss": 0.7114,
+      "step": 580
+    },
+    {
+      "epoch": 2.9463171036204745,
+      "grad_norm": 0.6070316014377024,
+      "learning_rate": 5e-06,
+      "loss": 0.7072,
+      "step": 590
+    },
+    {
+      "epoch": 2.9962546816479403,
+      "grad_norm": 0.5662761836861052,
+      "learning_rate": 5e-06,
+      "loss": 0.7158,
+      "step": 600
+    },
+    {
+      "epoch": 2.9962546816479403,
+      "eval_loss": 0.7859531044960022,
+      "eval_runtime": 213.866,
+      "eval_samples_per_second": 25.235,
+      "eval_steps_per_second": 0.397,
+      "step": 600
+    },
+    {
+      "epoch": 2.9962546816479403,
+      "step": 600,
+      "total_flos": 1004812967608320.0,
+      "train_loss": 0.773849273522695,
+      "train_runtime": 35442.7976,
+      "train_samples_per_second": 8.678,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 600,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1004812967608320.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
training_eval_loss.png ADDED
training_loss.png ADDED
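The two added PNGs presumably visualize the curves contained in log_history above. A minimal sketch for reproducing such curves from a local copy of trainer_state.json (matplotlib assumed available; the output filename is illustrative, not one of the committed images):

```python
# Minimal sketch, not part of the commit: re-plot train/eval loss from trainer_state.json.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Regular logging entries carry "loss"; the evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")  # illustrative filename
```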