gsmyrnis commited on
Commit
2ef73b4
·
verified ·
1 Parent(s): 428a11d

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -1
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +419 -440
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: llama3-1_8b_4o_annotated_aime
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # llama3-1_8b_4o_annotated_aime
17
 
18
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
19
 
20
  ## Model description
21
 
 
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: llama3-1_8b_4o_annotated_aime
 
16
 
17
  # llama3-1_8b_4o_annotated_aime
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/4o_annotated_aime dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 25167957131264.0,
4
- "train_loss": 0.4632317288606255,
5
- "train_runtime": 439.9384,
6
- "train_samples_per_second": 23.206,
7
- "train_steps_per_second": 0.245
8
  }
 
1
  {
2
+ "epoch": 2.9626168224299065,
3
+ "total_flos": 18524510396416.0,
4
+ "train_loss": 0.47221575805119104,
5
+ "train_runtime": 641.5687,
6
+ "train_samples_per_second": 15.913,
7
+ "train_steps_per_second": 0.164
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 25167957131264.0,
4
- "train_loss": 0.4632317288606255,
5
- "train_runtime": 439.9384,
6
- "train_samples_per_second": 23.206,
7
- "train_steps_per_second": 0.245
8
  }
 
1
  {
2
+ "epoch": 2.9626168224299065,
3
+ "total_flos": 18524510396416.0,
4
+ "train_loss": 0.47221575805119104,
5
+ "train_runtime": 641.5687,
6
+ "train_samples_per_second": 15.913,
7
+ "train_steps_per_second": 0.164
8
  }
trainer_state.json CHANGED
@@ -1,781 +1,760 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 108,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.027777777777777776,
13
- "grad_norm": 5.499531247448967,
14
  "learning_rate": 9.090909090909091e-07,
15
- "loss": 0.6855,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.05555555555555555,
20
- "grad_norm": 5.557838812807806,
21
  "learning_rate": 1.8181818181818183e-06,
22
- "loss": 0.6554,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.08333333333333333,
27
- "grad_norm": 5.5626937869370305,
28
  "learning_rate": 2.7272727272727272e-06,
29
- "loss": 0.631,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.1111111111111111,
34
- "grad_norm": 5.202877857676261,
35
  "learning_rate": 3.6363636363636366e-06,
36
- "loss": 0.6684,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.1388888888888889,
41
- "grad_norm": 3.392348666163089,
42
  "learning_rate": 4.5454545454545455e-06,
43
- "loss": 0.551,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.16666666666666666,
48
- "grad_norm": 2.213517617001493,
49
  "learning_rate": 5.4545454545454545e-06,
50
- "loss": 0.5989,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.19444444444444445,
55
- "grad_norm": 2.001366109944542,
56
  "learning_rate": 6.363636363636364e-06,
57
- "loss": 0.5333,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.2222222222222222,
62
- "grad_norm": 1.9419958988527681,
63
  "learning_rate": 7.272727272727273e-06,
64
  "loss": 0.5054,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.25,
69
- "grad_norm": 1.5833591254215535,
70
  "learning_rate": 8.181818181818183e-06,
71
- "loss": 0.5322,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.2777777777777778,
76
- "grad_norm": 1.436746398759113,
77
  "learning_rate": 9.090909090909091e-06,
78
  "loss": 0.5433,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.3055555555555556,
83
- "grad_norm": 1.7345453054514894,
84
  "learning_rate": 1e-05,
85
- "loss": 0.5709,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 0.3333333333333333,
90
- "grad_norm": 1.597109242203359,
91
- "learning_rate": 9.997377845227577e-06,
92
- "loss": 0.4906,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 0.3611111111111111,
97
- "grad_norm": 1.2794918602228373,
98
- "learning_rate": 9.98951413118856e-06,
99
- "loss": 0.6254,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 0.3888888888888889,
104
- "grad_norm": 1.2457098159062354,
105
- "learning_rate": 9.97641710583307e-06,
106
- "loss": 0.5566,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 0.4166666666666667,
111
- "grad_norm": 1.0051622325160827,
112
- "learning_rate": 9.958100506132127e-06,
113
- "loss": 0.5086,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 0.4444444444444444,
118
- "grad_norm": 1.0232838508074662,
119
- "learning_rate": 9.934583543669454e-06,
120
- "loss": 0.5798,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 0.4722222222222222,
125
- "grad_norm": 1.0687266662109796,
126
- "learning_rate": 9.905890884491196e-06,
127
- "loss": 0.547,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 0.5,
132
- "grad_norm": 0.8078554986405523,
133
- "learning_rate": 9.872052623234632e-06,
134
- "loss": 0.4698,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 0.5277777777777778,
139
- "grad_norm": 0.9700266562907995,
140
- "learning_rate": 9.833104251563058e-06,
141
- "loss": 0.6414,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 0.5555555555555556,
146
- "grad_norm": 0.7994292138406759,
147
- "learning_rate": 9.789086620939936e-06,
148
- "loss": 0.4365,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 0.5833333333333334,
153
- "grad_norm": 0.8745740021635219,
154
- "learning_rate": 9.740045899781353e-06,
155
- "loss": 0.4913,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 0.6111111111111112,
160
- "grad_norm": 0.8854836350036159,
161
- "learning_rate": 9.68603352503172e-06,
162
- "loss": 0.5518,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 0.6388888888888888,
167
- "grad_norm": 0.847456579465944,
168
- "learning_rate": 9.627106148213521e-06,
169
- "loss": 0.5354,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 0.6666666666666666,
174
- "grad_norm": 0.821018000951143,
175
- "learning_rate": 9.563325576007702e-06,
176
- "loss": 0.5213,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 0.6944444444444444,
181
- "grad_norm": 0.9295894881714469,
182
- "learning_rate": 9.494758705426978e-06,
183
- "loss": 0.598,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 0.7222222222222222,
188
- "grad_norm": 0.7448130004829931,
189
- "learning_rate": 9.421477453650118e-06,
190
- "loss": 0.4658,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 0.75,
195
- "grad_norm": 0.9080917046249632,
196
- "learning_rate": 9.343558682590757e-06,
197
- "loss": 0.6228,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 0.7777777777777778,
202
- "grad_norm": 0.8163873567576216,
203
- "learning_rate": 9.261084118279846e-06,
204
- "loss": 0.5333,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 0.8055555555555556,
209
- "grad_norm": 0.7764439372547906,
210
- "learning_rate": 9.174140265146356e-06,
211
- "loss": 0.4741,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 0.8333333333333334,
216
- "grad_norm": 1.1961962564927606,
217
- "learning_rate": 9.082818315286054e-06,
218
- "loss": 0.5714,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 0.8611111111111112,
223
- "grad_norm": 0.7267955393856498,
224
- "learning_rate": 8.987214052813605e-06,
225
- "loss": 0.5379,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 0.8888888888888888,
230
- "grad_norm": 0.769564835206009,
231
- "learning_rate": 8.887427753398249e-06,
232
- "loss": 0.5221,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 0.9166666666666666,
237
- "grad_norm": 0.7557825851349546,
238
- "learning_rate": 8.783564079088478e-06,
239
- "loss": 0.5237,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 0.9444444444444444,
244
- "grad_norm": 0.8013832351115299,
245
- "learning_rate": 8.675731968536004e-06,
246
- "loss": 0.5525,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 0.9722222222222222,
251
- "grad_norm": 0.6888521920999732,
252
- "learning_rate": 8.564044522734147e-06,
253
- "loss": 0.4881,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 1.0,
258
- "grad_norm": 0.6587728956884636,
259
- "learning_rate": 8.448618886390523e-06,
260
- "loss": 0.4985,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 1.0277777777777777,
265
- "grad_norm": 0.795241641899969,
266
- "learning_rate": 8.329576125058406e-06,
267
- "loss": 0.4365,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 1.0555555555555556,
272
- "grad_norm": 2.210851457630018,
273
- "learning_rate": 8.207041098155701e-06,
274
- "loss": 0.5556,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 1.0833333333333333,
279
- "grad_norm": 0.7613786102792487,
280
- "learning_rate": 8.081142328004638e-06,
281
- "loss": 0.4846,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 1.1111111111111112,
286
- "grad_norm": 0.723429529329392,
287
- "learning_rate": 7.952011865029614e-06,
288
- "loss": 0.4603,
289
  "step": 40
290
  },
291
  {
292
- "epoch": 1.1388888888888888,
293
- "grad_norm": 0.7836107269741927,
294
- "learning_rate": 7.819785149254534e-06,
295
- "loss": 0.4483,
296
  "step": 41
297
  },
298
  {
299
- "epoch": 1.1666666666666667,
300
- "grad_norm": 0.8412482780955819,
301
- "learning_rate": 7.68460086824492e-06,
302
- "loss": 0.5091,
303
  "step": 42
304
  },
305
  {
306
- "epoch": 1.1944444444444444,
307
- "grad_norm": 0.7560956617656049,
308
- "learning_rate": 7.546600811643816e-06,
309
- "loss": 0.4768,
310
  "step": 43
311
  },
312
  {
313
- "epoch": 1.2222222222222223,
314
- "grad_norm": 0.7532433430539396,
315
- "learning_rate": 7.405929722454026e-06,
316
- "loss": 0.4879,
317
  "step": 44
318
  },
319
  {
320
- "epoch": 1.25,
321
- "grad_norm": 0.7250342590564604,
322
- "learning_rate": 7.262735145222696e-06,
323
- "loss": 0.4424,
324
  "step": 45
325
  },
326
  {
327
- "epoch": 1.2777777777777777,
328
- "grad_norm": 0.7424652764504065,
329
- "learning_rate": 7.117167271287453e-06,
330
- "loss": 0.4268,
331
  "step": 46
332
  },
333
  {
334
- "epoch": 1.3055555555555556,
335
- "grad_norm": 0.5613548384089758,
336
- "learning_rate": 6.969378781246436e-06,
337
- "loss": 0.3586,
338
  "step": 47
339
  },
340
  {
341
- "epoch": 1.3333333333333333,
342
- "grad_norm": 0.7217076640093948,
343
- "learning_rate": 6.819524684817439e-06,
344
- "loss": 0.4861,
345
  "step": 48
346
  },
347
  {
348
- "epoch": 1.3611111111111112,
349
- "grad_norm": 0.6561868334638776,
350
- "learning_rate": 6.667762158254104e-06,
351
- "loss": 0.4543,
352
  "step": 49
353
  },
354
  {
355
- "epoch": 1.3888888888888888,
356
- "grad_norm": 0.7110755871975613,
357
- "learning_rate": 6.514250379489754e-06,
358
- "loss": 0.4277,
359
  "step": 50
360
  },
361
  {
362
- "epoch": 1.4166666666666667,
363
- "grad_norm": 0.7340306300343789,
364
- "learning_rate": 6.3591503611817155e-06,
365
- "loss": 0.4245,
366
  "step": 51
367
  },
368
  {
369
- "epoch": 1.4444444444444444,
370
- "grad_norm": 0.7373696398545522,
371
- "learning_rate": 6.202624781831269e-06,
372
- "loss": 0.4865,
373
  "step": 52
374
  },
375
  {
376
- "epoch": 1.4722222222222223,
377
- "grad_norm": 0.7694890579716093,
378
- "learning_rate": 6.044837815156377e-06,
379
- "loss": 0.467,
380
  "step": 53
381
  },
382
  {
383
- "epoch": 1.5,
384
- "grad_norm": 0.7185745702090169,
385
- "learning_rate": 5.885954957896115e-06,
386
- "loss": 0.4313,
387
  "step": 54
388
  },
389
  {
390
- "epoch": 1.5277777777777777,
391
- "grad_norm": 0.6305358582918202,
392
- "learning_rate": 5.726142856227453e-06,
393
- "loss": 0.4161,
394
  "step": 55
395
  },
396
  {
397
- "epoch": 1.5555555555555556,
398
- "grad_norm": 0.7209314959601887,
399
- "learning_rate": 5.5655691309764225e-06,
400
- "loss": 0.4562,
401
  "step": 56
402
  },
403
  {
404
- "epoch": 1.5833333333333335,
405
- "grad_norm": 0.64260058204023,
406
- "learning_rate": 5.404402201807022e-06,
407
- "loss": 0.4636,
408
  "step": 57
409
  },
410
  {
411
- "epoch": 1.6111111111111112,
412
- "grad_norm": 0.612522134619282,
413
- "learning_rate": 5.242811110572243e-06,
414
- "loss": 0.3946,
415
  "step": 58
416
  },
417
  {
418
- "epoch": 1.6388888888888888,
419
- "grad_norm": 0.640209879297769,
420
- "learning_rate": 5.080965344012509e-06,
421
- "loss": 0.4234,
422
  "step": 59
423
  },
424
  {
425
- "epoch": 1.6666666666666665,
426
- "grad_norm": 0.684267323330591,
427
- "learning_rate": 4.919034655987493e-06,
428
- "loss": 0.4956,
429
  "step": 60
430
  },
431
  {
432
- "epoch": 1.6944444444444444,
433
- "grad_norm": 0.6560173146533167,
434
- "learning_rate": 4.757188889427761e-06,
435
- "loss": 0.4513,
436
  "step": 61
437
  },
438
  {
439
- "epoch": 1.7222222222222223,
440
- "grad_norm": 0.6024082771930704,
441
- "learning_rate": 4.59559779819298e-06,
442
- "loss": 0.3554,
443
  "step": 62
444
  },
445
  {
446
- "epoch": 1.75,
447
- "grad_norm": 0.5988731751099766,
448
- "learning_rate": 4.434430869023579e-06,
449
- "loss": 0.423,
450
  "step": 63
451
  },
452
  {
453
- "epoch": 1.7777777777777777,
454
- "grad_norm": 0.7714096061714903,
455
- "learning_rate": 4.27385714377255e-06,
456
- "loss": 0.3804,
457
  "step": 64
458
  },
459
  {
460
- "epoch": 1.8055555555555556,
461
- "grad_norm": 0.6870558807208743,
462
- "learning_rate": 4.1140450421038865e-06,
463
- "loss": 0.46,
464
  "step": 65
465
  },
466
  {
467
- "epoch": 1.8333333333333335,
468
- "grad_norm": 0.6342926359159472,
469
- "learning_rate": 3.955162184843625e-06,
470
- "loss": 0.5136,
471
  "step": 66
472
  },
473
  {
474
- "epoch": 1.8611111111111112,
475
- "grad_norm": 0.6155115323488202,
476
- "learning_rate": 3.7973752181687336e-06,
477
- "loss": 0.4108,
478
  "step": 67
479
  },
480
  {
481
- "epoch": 1.8888888888888888,
482
- "grad_norm": 0.7006350597692972,
483
- "learning_rate": 3.6408496388182857e-06,
484
- "loss": 0.5141,
485
  "step": 68
486
  },
487
  {
488
- "epoch": 1.9166666666666665,
489
- "grad_norm": 0.6853399554911791,
490
- "learning_rate": 3.4857496205102475e-06,
491
- "loss": 0.4591,
492
  "step": 69
493
  },
494
  {
495
- "epoch": 1.9444444444444444,
496
- "grad_norm": 0.6807578527187025,
497
- "learning_rate": 3.3322378417458985e-06,
498
- "loss": 0.4447,
499
  "step": 70
500
  },
501
  {
502
- "epoch": 1.9722222222222223,
503
- "grad_norm": 0.6002571912158542,
504
- "learning_rate": 3.180475315182563e-06,
505
- "loss": 0.3775,
506
  "step": 71
507
  },
508
  {
509
- "epoch": 2.0,
510
- "grad_norm": 0.6176679391981694,
511
- "learning_rate": 3.0306212187535653e-06,
512
- "loss": 0.362,
513
  "step": 72
514
  },
515
  {
516
- "epoch": 2.0277777777777777,
517
- "grad_norm": 0.6348434037827244,
518
- "learning_rate": 2.882832728712551e-06,
519
- "loss": 0.4395,
520
  "step": 73
521
  },
522
  {
523
- "epoch": 2.0555555555555554,
524
- "grad_norm": 0.6194546514321693,
525
- "learning_rate": 2.7372648547773063e-06,
526
- "loss": 0.385,
527
  "step": 74
528
  },
529
  {
530
- "epoch": 2.0833333333333335,
531
- "grad_norm": 0.5947575965373516,
532
- "learning_rate": 2.594070277545975e-06,
533
- "loss": 0.3634,
534
  "step": 75
535
  },
536
  {
537
- "epoch": 2.111111111111111,
538
- "grad_norm": 0.6412237789531587,
539
- "learning_rate": 2.4533991883561868e-06,
540
- "loss": 0.3352,
541
  "step": 76
542
  },
543
  {
544
- "epoch": 2.138888888888889,
545
- "grad_norm": 0.5993096167073967,
546
- "learning_rate": 2.315399131755081e-06,
547
- "loss": 0.3654,
548
  "step": 77
549
  },
550
  {
551
- "epoch": 2.1666666666666665,
552
- "grad_norm": 0.5863285405996705,
553
- "learning_rate": 2.1802148507454675e-06,
554
- "loss": 0.3648,
555
  "step": 78
556
  },
557
  {
558
- "epoch": 2.1944444444444446,
559
- "grad_norm": 0.6176169061591418,
560
- "learning_rate": 2.0479881349703885e-06,
561
- "loss": 0.3567,
562
  "step": 79
563
  },
564
  {
565
- "epoch": 2.2222222222222223,
566
- "grad_norm": 2.036643091775969,
567
- "learning_rate": 1.9188576719953635e-06,
568
- "loss": 0.5043,
569
  "step": 80
570
  },
571
  {
572
- "epoch": 2.25,
573
- "grad_norm": 0.5707024837734276,
574
- "learning_rate": 1.7929589018443016e-06,
575
- "loss": 0.3835,
576
  "step": 81
577
  },
578
  {
579
- "epoch": 2.2777777777777777,
580
- "grad_norm": 0.6703596535434576,
581
- "learning_rate": 1.6704238749415958e-06,
582
- "loss": 0.4195,
583
  "step": 82
584
  },
585
  {
586
- "epoch": 2.3055555555555554,
587
- "grad_norm": 0.6161351940524724,
588
- "learning_rate": 1.5513811136094786e-06,
589
- "loss": 0.3808,
590
  "step": 83
591
  },
592
  {
593
- "epoch": 2.3333333333333335,
594
- "grad_norm": 0.567006481121317,
595
- "learning_rate": 1.4359554772658551e-06,
596
- "loss": 0.3484,
597
  "step": 84
598
  },
599
  {
600
- "epoch": 2.361111111111111,
601
- "grad_norm": 0.5645679908744803,
602
- "learning_rate": 1.3242680314639995e-06,
603
- "loss": 0.3442,
604
  "step": 85
605
  },
606
  {
607
- "epoch": 2.388888888888889,
608
- "grad_norm": 0.6134174690452912,
609
- "learning_rate": 1.2164359209115235e-06,
610
- "loss": 0.3701,
611
  "step": 86
612
  },
613
  {
614
- "epoch": 2.4166666666666665,
615
- "grad_norm": 0.6351405467839962,
616
- "learning_rate": 1.1125722466017547e-06,
617
- "loss": 0.4531,
618
  "step": 87
619
  },
620
  {
621
- "epoch": 2.4444444444444446,
622
- "grad_norm": 0.6153449983061651,
623
- "learning_rate": 1.012785947186397e-06,
624
- "loss": 0.3512,
625
  "step": 88
626
  },
627
  {
628
- "epoch": 2.4722222222222223,
629
- "grad_norm": 0.5815107547799697,
630
- "learning_rate": 9.171816847139447e-07,
631
- "loss": 0.3224,
632
  "step": 89
633
  },
634
  {
635
- "epoch": 2.5,
636
- "grad_norm": 0.5972403522847891,
637
- "learning_rate": 8.258597348536452e-07,
638
- "loss": 0.3708,
639
  "step": 90
640
  },
641
  {
642
- "epoch": 2.5277777777777777,
643
- "grad_norm": 0.6224293908552574,
644
- "learning_rate": 7.389158817201541e-07,
645
- "loss": 0.4313,
646
  "step": 91
647
  },
648
  {
649
- "epoch": 2.5555555555555554,
650
- "grad_norm": 0.6024938044516791,
651
- "learning_rate": 6.564413174092443e-07,
652
- "loss": 0.3964,
653
  "step": 92
654
  },
655
  {
656
- "epoch": 2.5833333333333335,
657
- "grad_norm": 0.6108443482475023,
658
- "learning_rate": 5.785225463498828e-07,
659
- "loss": 0.3827,
660
  "step": 93
661
  },
662
  {
663
- "epoch": 2.611111111111111,
664
- "grad_norm": 0.6722881160180866,
665
- "learning_rate": 5.05241294573024e-07,
666
- "loss": 0.4383,
667
  "step": 94
668
  },
669
  {
670
- "epoch": 2.638888888888889,
671
- "grad_norm": 0.6149268282937064,
672
- "learning_rate": 4.3667442399229985e-07,
673
- "loss": 0.4371,
674
  "step": 95
675
  },
676
  {
677
- "epoch": 2.6666666666666665,
678
- "grad_norm": 0.6113966401547459,
679
- "learning_rate": 3.728938517864794e-07,
680
- "loss": 0.3829,
681
  "step": 96
682
  },
683
  {
684
- "epoch": 2.6944444444444446,
685
- "grad_norm": 0.6671411413349442,
686
- "learning_rate": 3.1396647496828245e-07,
687
- "loss": 0.3965,
688
  "step": 97
689
  },
690
  {
691
- "epoch": 2.7222222222222223,
692
- "grad_norm": 0.547101720665063,
693
- "learning_rate": 2.599541002186479e-07,
694
- "loss": 0.3122,
695
  "step": 98
696
  },
697
  {
698
- "epoch": 2.75,
699
- "grad_norm": 0.6044035422441768,
700
- "learning_rate": 2.109133790600648e-07,
701
- "loss": 0.4733,
702
  "step": 99
703
  },
704
  {
705
- "epoch": 2.7777777777777777,
706
- "grad_norm": 0.6069536296371835,
707
- "learning_rate": 1.6689574843694433e-07,
708
- "loss": 0.4024,
709
  "step": 100
710
  },
711
  {
712
- "epoch": 2.8055555555555554,
713
- "grad_norm": 0.5982249811836392,
714
- "learning_rate": 1.2794737676536993e-07,
715
- "loss": 0.4336,
716
  "step": 101
717
  },
718
  {
719
- "epoch": 2.8333333333333335,
720
- "grad_norm": 0.6884925904147986,
721
- "learning_rate": 9.410911550880474e-08,
722
- "loss": 0.3595,
723
  "step": 102
724
  },
725
  {
726
- "epoch": 2.861111111111111,
727
- "grad_norm": 0.7589234561889401,
728
- "learning_rate": 6.54164563305465e-08,
729
- "loss": 0.4163,
730
  "step": 103
731
  },
732
  {
733
- "epoch": 2.888888888888889,
734
- "grad_norm": 0.6469041727623386,
735
- "learning_rate": 4.189949386787462e-08,
736
- "loss": 0.4264,
737
  "step": 104
738
  },
739
  {
740
- "epoch": 2.9166666666666665,
741
- "grad_norm": 0.6755069676275332,
742
- "learning_rate": 2.358289416693027e-08,
743
- "loss": 0.4073,
744
- "step": 105
745
- },
746
- {
747
- "epoch": 2.9444444444444446,
748
- "grad_norm": 0.6272555116057572,
749
- "learning_rate": 1.0485868811441757e-08,
750
- "loss": 0.4084,
751
- "step": 106
752
- },
753
- {
754
- "epoch": 2.9722222222222223,
755
- "grad_norm": 0.6263081374423879,
756
- "learning_rate": 2.6221547724253337e-09,
757
- "loss": 0.3967,
758
- "step": 107
759
- },
760
- {
761
- "epoch": 3.0,
762
- "grad_norm": 0.5852999921009103,
763
  "learning_rate": 0.0,
764
- "loss": 0.3847,
765
- "step": 108
766
  },
767
  {
768
- "epoch": 3.0,
769
- "step": 108,
770
- "total_flos": 25167957131264.0,
771
- "train_loss": 0.4632317288606255,
772
- "train_runtime": 439.9384,
773
- "train_samples_per_second": 23.206,
774
- "train_steps_per_second": 0.245
775
  }
776
  ],
777
  "logging_steps": 1,
778
- "max_steps": 108,
779
  "num_input_tokens_seen": 0,
780
  "num_train_epochs": 3,
781
  "save_steps": 500,
@@ -791,8 +770,8 @@
791
  "attributes": {}
792
  }
793
  },
794
- "total_flos": 25167957131264.0,
795
- "train_batch_size": 3,
796
  "trial_name": null,
797
  "trial_params": null
798
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9626168224299065,
5
  "eval_steps": 500,
6
+ "global_step": 105,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.028037383177570093,
13
+ "grad_norm": 5.46907120535221,
14
  "learning_rate": 9.090909090909091e-07,
15
+ "loss": 0.6948,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.056074766355140186,
20
+ "grad_norm": 5.695701169228273,
21
  "learning_rate": 1.8181818181818183e-06,
22
+ "loss": 0.659,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.08411214953271028,
27
+ "grad_norm": 5.869868275090631,
28
  "learning_rate": 2.7272727272727272e-06,
29
+ "loss": 0.6481,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.11214953271028037,
34
+ "grad_norm": 5.197705973950515,
35
  "learning_rate": 3.6363636363636366e-06,
36
+ "loss": 0.6641,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.14018691588785046,
41
+ "grad_norm": 3.3963665075709906,
42
  "learning_rate": 4.5454545454545455e-06,
43
+ "loss": 0.5579,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.16822429906542055,
48
+ "grad_norm": 2.20924738877731,
49
  "learning_rate": 5.4545454545454545e-06,
50
+ "loss": 0.6036,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.19626168224299065,
55
+ "grad_norm": 1.984739518286341,
56
  "learning_rate": 6.363636363636364e-06,
57
+ "loss": 0.5288,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.22429906542056074,
62
+ "grad_norm": 1.9291295994072917,
63
  "learning_rate": 7.272727272727273e-06,
64
  "loss": 0.5054,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.2523364485981308,
69
+ "grad_norm": 1.6389948204701459,
70
  "learning_rate": 8.181818181818183e-06,
71
+ "loss": 0.5339,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.2803738317757009,
76
+ "grad_norm": 1.4300152832592459,
77
  "learning_rate": 9.090909090909091e-06,
78
  "loss": 0.5433,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.308411214953271,
83
+ "grad_norm": 1.711886946596263,
84
  "learning_rate": 1e-05,
85
+ "loss": 0.5537,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.3364485981308411,
90
+ "grad_norm": 1.5442931844274987,
91
+ "learning_rate": 9.997207818651273e-06,
92
+ "loss": 0.498,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.3644859813084112,
97
+ "grad_norm": 1.3068883684134325,
98
+ "learning_rate": 9.988834393115768e-06,
99
+ "loss": 0.6412,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.3925233644859813,
104
+ "grad_norm": 1.2741183296138552,
105
+ "learning_rate": 9.97488907544252e-06,
106
+ "loss": 0.5796,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.4205607476635514,
111
+ "grad_norm": 1.0164680008182152,
112
+ "learning_rate": 9.955387440773902e-06,
113
+ "loss": 0.5019,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.4485981308411215,
118
+ "grad_norm": 1.036540358883728,
119
+ "learning_rate": 9.930351269950144e-06,
120
+ "loss": 0.5982,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.4766355140186916,
125
+ "grad_norm": 1.0480857466954097,
126
+ "learning_rate": 9.899808525182935e-06,
127
+ "loss": 0.5377,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.5046728971962616,
132
+ "grad_norm": 0.8148023572178399,
133
+ "learning_rate": 9.863793318825186e-06,
134
+ "loss": 0.4776,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.5327102803738317,
139
+ "grad_norm": 1.0017091343990974,
140
+ "learning_rate": 9.822345875271884e-06,
141
+ "loss": 0.648,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.5607476635514018,
146
+ "grad_norm": 0.7723478036715665,
147
+ "learning_rate": 9.775512486034564e-06,
148
+ "loss": 0.4544,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.5887850467289719,
153
+ "grad_norm": 0.8710833273679682,
154
+ "learning_rate": 9.723345458039595e-06,
155
+ "loss": 0.4936,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.616822429906542,
160
+ "grad_norm": 0.9055028217774582,
161
+ "learning_rate": 9.665903055208013e-06,
162
+ "loss": 0.5819,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.6448598130841121,
167
+ "grad_norm": 0.8103202901623571,
168
+ "learning_rate": 9.603249433382145e-06,
169
+ "loss": 0.5506,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.6728971962616822,
174
+ "grad_norm": 0.7712121760995281,
175
+ "learning_rate": 9.535454568671705e-06,
176
+ "loss": 0.4932,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 0.7009345794392523,
181
+ "grad_norm": 0.9091205022340342,
182
+ "learning_rate": 9.462594179299408e-06,
183
+ "loss": 0.5776,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 0.7289719626168224,
188
+ "grad_norm": 0.7512412627292558,
189
+ "learning_rate": 9.384749641033358e-06,
190
+ "loss": 0.4665,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 0.7570093457943925,
195
+ "grad_norm": 0.9759396921985853,
196
+ "learning_rate": 9.302007896300697e-06,
197
+ "loss": 0.6547,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 0.7850467289719626,
202
+ "grad_norm": 0.8218161831230707,
203
+ "learning_rate": 9.214461357083986e-06,
204
+ "loss": 0.5403,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 0.8130841121495327,
209
+ "grad_norm": 0.7802775736672756,
210
+ "learning_rate": 9.122207801708802e-06,
211
+ "loss": 0.4792,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 0.8411214953271028,
216
+ "grad_norm": 1.2211854499314727,
217
+ "learning_rate": 9.025350265637816e-06,
218
+ "loss": 0.5767,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 0.8691588785046729,
223
+ "grad_norm": 0.7291315847510311,
224
+ "learning_rate": 8.923996926393306e-06,
225
+ "loss": 0.5339,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 0.897196261682243,
230
+ "grad_norm": 0.7685026874314949,
231
+ "learning_rate": 8.818260982736662e-06,
232
+ "loss": 0.5102,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 0.9252336448598131,
237
+ "grad_norm": 0.7819112577141099,
238
+ "learning_rate": 8.708260528239788e-06,
239
+ "loss": 0.5196,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 0.9532710280373832,
244
+ "grad_norm": 0.805804762659976,
245
+ "learning_rate": 8.594118419389648e-06,
246
+ "loss": 0.5486,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 0.9813084112149533,
251
+ "grad_norm": 0.6960763295513189,
252
+ "learning_rate": 8.475962138373212e-06,
253
+ "loss": 0.4885,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 1.0186915887850467,
258
+ "grad_norm": 1.261779486223386,
259
+ "learning_rate": 8.353923650696119e-06,
260
+ "loss": 0.7283,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 1.0467289719626167,
265
+ "grad_norm": 2.1904861480940703,
266
+ "learning_rate": 8.228139257794012e-06,
267
+ "loss": 0.5858,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 1.074766355140187,
272
+ "grad_norm": 0.836565359732725,
273
+ "learning_rate": 8.098749444801226e-06,
274
+ "loss": 0.4984,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 1.102803738317757,
279
+ "grad_norm": 0.7156392071456283,
280
+ "learning_rate": 7.965898723646777e-06,
281
+ "loss": 0.4133,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 1.1308411214953271,
286
+ "grad_norm": 0.7811150734148334,
287
+ "learning_rate": 7.829735471652978e-06,
288
+ "loss": 0.4737,
289
  "step": 40
290
  },
291
  {
292
+ "epoch": 1.158878504672897,
293
+ "grad_norm": 0.8005430413673945,
294
+ "learning_rate": 7.690411765816864e-06,
295
+ "loss": 0.461,
296
  "step": 41
297
  },
298
  {
299
+ "epoch": 1.1869158878504673,
300
+ "grad_norm": 0.734823009686673,
301
+ "learning_rate": 7.548083212959588e-06,
302
+ "loss": 0.4878,
303
  "step": 42
304
  },
305
  {
306
+ "epoch": 1.2149532710280373,
307
+ "grad_norm": 0.9106888643783978,
308
+ "learning_rate": 7.402908775933419e-06,
309
+ "loss": 0.5398,
310
  "step": 43
311
  },
312
  {
313
+ "epoch": 1.2429906542056075,
314
+ "grad_norm": 0.6471585113802202,
315
+ "learning_rate": 7.25505059608051e-06,
316
+ "loss": 0.4282,
317
  "step": 44
318
  },
319
  {
320
+ "epoch": 1.2710280373831775,
321
+ "grad_norm": 0.7710047485521259,
322
+ "learning_rate": 7.104673812141676e-06,
323
+ "loss": 0.4352,
324
  "step": 45
325
  },
326
  {
327
+ "epoch": 1.2990654205607477,
328
+ "grad_norm": 0.6543038711142444,
329
+ "learning_rate": 6.9519463758174745e-06,
330
+ "loss": 0.3746,
331
  "step": 46
332
  },
333
  {
334
+ "epoch": 1.3271028037383177,
335
+ "grad_norm": 0.6480235385313834,
336
+ "learning_rate": 6.797038864187564e-06,
337
+ "loss": 0.4648,
338
  "step": 47
339
  },
340
  {
341
+ "epoch": 1.355140186915888,
342
+ "grad_norm": 0.663332249258245,
343
+ "learning_rate": 6.640124289197845e-06,
344
+ "loss": 0.412,
345
  "step": 48
346
  },
347
  {
348
+ "epoch": 1.3831775700934579,
349
+ "grad_norm": 0.735147105832247,
350
+ "learning_rate": 6.481377904428171e-06,
351
+ "loss": 0.4639,
352
  "step": 49
353
  },
354
  {
355
+ "epoch": 1.411214953271028,
356
+ "grad_norm": 0.7169861474373482,
357
+ "learning_rate": 6.3209770093564315e-06,
358
+ "loss": 0.4511,
359
  "step": 50
360
  },
361
  {
362
+ "epoch": 1.439252336448598,
363
+ "grad_norm": 0.7241007166406079,
364
+ "learning_rate": 6.1591007513376425e-06,
365
+ "loss": 0.4204,
366
  "step": 51
367
  },
368
  {
369
+ "epoch": 1.4672897196261683,
370
+ "grad_norm": 0.7785664265979848,
371
+ "learning_rate": 5.995929925519181e-06,
372
+ "loss": 0.4717,
373
  "step": 52
374
  },
375
  {
376
+ "epoch": 1.4953271028037383,
377
+ "grad_norm": 0.643031905575638,
378
+ "learning_rate": 5.831646772915651e-06,
379
+ "loss": 0.451,
380
  "step": 53
381
  },
382
  {
383
+ "epoch": 1.5233644859813085,
384
+ "grad_norm": 0.6622611979477618,
385
+ "learning_rate": 5.666434776868895e-06,
386
+ "loss": 0.4598,
387
  "step": 54
388
  },
389
  {
390
+ "epoch": 1.5514018691588785,
391
+ "grad_norm": 0.6681223638603929,
392
+ "learning_rate": 5.500478458120493e-06,
393
+ "loss": 0.4471,
394
  "step": 55
395
  },
396
  {
397
+ "epoch": 1.5794392523364484,
398
+ "grad_norm": 0.6742446596507364,
399
+ "learning_rate": 5.3339631687256085e-06,
400
+ "loss": 0.4323,
401
  "step": 56
402
  },
403
  {
404
+ "epoch": 1.6074766355140186,
405
+ "grad_norm": 0.6011350150512597,
406
+ "learning_rate": 5.1670748850383734e-06,
407
+ "loss": 0.421,
408
  "step": 57
409
  },
410
  {
411
+ "epoch": 1.6355140186915889,
412
+ "grad_norm": 0.6177912691346672,
413
+ "learning_rate": 5e-06,
414
+ "loss": 0.3956,
415
  "step": 58
416
  },
417
  {
418
+ "epoch": 1.6635514018691588,
419
+ "grad_norm": 0.6868626354051661,
420
+ "learning_rate": 4.832925114961629e-06,
421
+ "loss": 0.4661,
422
  "step": 59
423
  },
424
  {
425
+ "epoch": 1.6915887850467288,
426
+ "grad_norm": 0.6421560332745099,
427
+ "learning_rate": 4.666036831274392e-06,
428
+ "loss": 0.4704,
429
  "step": 60
430
  },
431
  {
432
+ "epoch": 1.719626168224299,
433
+ "grad_norm": 0.7187797397726619,
434
+ "learning_rate": 4.499521541879508e-06,
435
+ "loss": 0.465,
436
  "step": 61
437
  },
438
  {
439
+ "epoch": 1.7476635514018692,
440
+ "grad_norm": 0.5867907128348048,
441
+ "learning_rate": 4.333565223131107e-06,
442
+ "loss": 0.3283,
443
  "step": 62
444
  },
445
  {
446
+ "epoch": 1.7757009345794392,
447
+ "grad_norm": 0.662616949214837,
448
+ "learning_rate": 4.1683532270843505e-06,
449
+ "loss": 0.4216,
450
  "step": 63
451
  },
452
  {
453
+ "epoch": 1.8037383177570092,
454
+ "grad_norm": 0.6719507669829111,
455
+ "learning_rate": 4.004070074480821e-06,
456
+ "loss": 0.4443,
457
  "step": 64
458
  },
459
  {
460
+ "epoch": 1.8317757009345794,
461
+ "grad_norm": 0.6464555389487582,
462
+ "learning_rate": 3.840899248662358e-06,
463
+ "loss": 0.4901,
464
  "step": 65
465
  },
466
  {
467
+ "epoch": 1.8598130841121496,
468
+ "grad_norm": 0.642004590459993,
469
+ "learning_rate": 3.6790229906435706e-06,
470
+ "loss": 0.4687,
471
  "step": 66
472
  },
473
  {
474
+ "epoch": 1.8878504672897196,
475
+ "grad_norm": 0.6811855386223414,
476
+ "learning_rate": 3.518622095571831e-06,
477
+ "loss": 0.5103,
478
  "step": 67
479
  },
480
  {
481
+ "epoch": 1.9158878504672896,
482
+ "grad_norm": 0.6780639408715025,
483
+ "learning_rate": 3.3598757108021546e-06,
484
+ "loss": 0.4563,
485
  "step": 68
486
  },
487
  {
488
+ "epoch": 1.9439252336448598,
489
+ "grad_norm": 0.6096518872897251,
490
+ "learning_rate": 3.202961135812437e-06,
491
+ "loss": 0.4273,
492
  "step": 69
493
  },
494
  {
495
+ "epoch": 1.97196261682243,
496
+ "grad_norm": 0.6148567020386766,
497
+ "learning_rate": 3.0480536241825263e-06,
498
+ "loss": 0.3991,
499
  "step": 70
500
  },
501
  {
502
+ "epoch": 2.0093457943925235,
503
+ "grad_norm": 1.1093908775727706,
504
+ "learning_rate": 2.8953261878583263e-06,
505
+ "loss": 0.6552,
506
  "step": 71
507
  },
508
  {
509
+ "epoch": 2.0373831775700935,
510
+ "grad_norm": 0.5907034426928125,
511
+ "learning_rate": 2.74494940391949e-06,
512
+ "loss": 0.4244,
513
  "step": 72
514
  },
515
  {
516
+ "epoch": 2.0654205607476634,
517
+ "grad_norm": 0.6237639437264948,
518
+ "learning_rate": 2.5970912240665815e-06,
519
+ "loss": 0.3433,
520
  "step": 73
521
  },
522
  {
523
+ "epoch": 2.0934579439252334,
524
+ "grad_norm": 0.6456026273029044,
525
+ "learning_rate": 2.4519167870404126e-06,
526
+ "loss": 0.3633,
527
  "step": 74
528
  },
529
  {
530
+ "epoch": 2.121495327102804,
531
+ "grad_norm": 0.6114145984670124,
532
+ "learning_rate": 2.309588234183137e-06,
533
+ "loss": 0.3607,
534
  "step": 75
535
  },
536
  {
537
+ "epoch": 2.149532710280374,
538
+ "grad_norm": 0.5622408748138118,
539
+ "learning_rate": 2.1702645283470238e-06,
540
+ "loss": 0.3255,
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 2.177570093457944,
545
+ "grad_norm": 0.6289348884143411,
546
+ "learning_rate": 2.0341012763532243e-06,
547
+ "loss": 0.3917,
548
  "step": 77
549
  },
550
  {
551
+ "epoch": 2.205607476635514,
552
+ "grad_norm": 2.11632857452312,
553
+ "learning_rate": 1.9012505551987764e-06,
554
+ "loss": 0.5284,
555
  "step": 78
556
  },
557
  {
558
+ "epoch": 2.2336448598130842,
559
+ "grad_norm": 0.6011421789250062,
560
+ "learning_rate": 1.771860742205988e-06,
561
+ "loss": 0.3717,
562
  "step": 79
563
  },
564
  {
565
+ "epoch": 2.2616822429906542,
566
+ "grad_norm": 0.605196953414431,
567
+ "learning_rate": 1.646076349303884e-06,
568
+ "loss": 0.3896,
569
  "step": 80
570
  },
571
  {
572
+ "epoch": 2.289719626168224,
573
+ "grad_norm": 0.6538497800730606,
574
+ "learning_rate": 1.5240378616267887e-06,
575
+ "loss": 0.3952,
576
  "step": 81
577
  },
578
  {
579
+ "epoch": 2.317757009345794,
580
+ "grad_norm": 0.6004986923958435,
581
+ "learning_rate": 1.4058815806103542e-06,
582
+ "loss": 0.4167,
583
  "step": 82
584
  },
585
  {
586
+ "epoch": 2.3457943925233646,
587
+ "grad_norm": 0.5871010729484297,
588
+ "learning_rate": 1.2917394717602123e-06,
589
+ "loss": 0.3395,
590
  "step": 83
591
  },
592
  {
593
+ "epoch": 2.3738317757009346,
594
+ "grad_norm": 0.6272792477288378,
595
+ "learning_rate": 1.1817390172633402e-06,
596
+ "loss": 0.3955,
597
  "step": 84
598
  },
599
  {
600
+ "epoch": 2.4018691588785046,
601
+ "grad_norm": 0.602988779712659,
602
+ "learning_rate": 1.0760030736066952e-06,
603
+ "loss": 0.3712,
604
  "step": 85
605
  },
606
  {
607
+ "epoch": 2.4299065420560746,
608
+ "grad_norm": 0.6514347127868433,
609
+ "learning_rate": 9.746497343621857e-07,
610
+ "loss": 0.439,
611
  "step": 86
612
  },
613
  {
614
+ "epoch": 2.457943925233645,
615
+ "grad_norm": 0.6171164966676075,
616
+ "learning_rate": 8.777921982911996e-07,
617
+ "loss": 0.3293,
618
  "step": 87
619
  },
620
  {
621
+ "epoch": 2.485981308411215,
622
+ "grad_norm": 0.5985061076335229,
623
+ "learning_rate": 7.85538642916015e-07,
624
+ "loss": 0.3554,
625
  "step": 88
626
  },
627
  {
628
+ "epoch": 2.514018691588785,
629
+ "grad_norm": 0.6023670543052161,
630
+ "learning_rate": 6.979921036993042e-07,
631
+ "loss": 0.3759,
632
  "step": 89
633
  },
634
  {
635
+ "epoch": 2.542056074766355,
636
+ "grad_norm": 0.6350504526148657,
637
+ "learning_rate": 6.152503589666426e-07,
638
+ "loss": 0.4048,
639
  "step": 90
640
  },
641
  {
642
+ "epoch": 2.5700934579439254,
643
+ "grad_norm": 0.6232837840764863,
644
+ "learning_rate": 5.374058207005945e-07,
645
+ "loss": 0.4454,
646
  "step": 91
647
  },
648
  {
649
+ "epoch": 2.5981308411214954,
650
+ "grad_norm": 0.6261955186268706,
651
+ "learning_rate": 4.6454543132829653e-07,
652
+ "loss": 0.3954,
653
  "step": 92
654
  },
655
  {
656
+ "epoch": 2.6261682242990654,
657
+ "grad_norm": 0.6079216763424581,
658
+ "learning_rate": 3.9675056661785563e-07,
659
+ "loss": 0.4013,
660
  "step": 93
661
  },
662
  {
663
+ "epoch": 2.6542056074766354,
664
+ "grad_norm": 0.6437379172509019,
665
+ "learning_rate": 3.340969447919873e-07,
666
+ "loss": 0.4624,
667
  "step": 94
668
  },
669
  {
670
+ "epoch": 2.6822429906542054,
671
+ "grad_norm": 0.6182485342353894,
672
+ "learning_rate": 2.7665454196040665e-07,
673
+ "loss": 0.3614,
674
  "step": 95
675
  },
676
  {
677
+ "epoch": 2.710280373831776,
678
+ "grad_norm": 0.6138213235934379,
679
+ "learning_rate": 2.2448751396543788e-07,
680
+ "loss": 0.3844,
681
  "step": 96
682
  },
683
  {
684
+ "epoch": 2.7383177570093458,
685
+ "grad_norm": 0.5790126523883554,
686
+ "learning_rate": 1.776541247281177e-07,
687
+ "loss": 0.3865,
688
  "step": 97
689
  },
690
  {
691
+ "epoch": 2.7663551401869158,
692
+ "grad_norm": 0.602742444058507,
693
+ "learning_rate": 1.3620668117481471e-07,
694
+ "loss": 0.4459,
695
  "step": 98
696
  },
697
  {
698
+ "epoch": 2.794392523364486,
699
+ "grad_norm": 0.6125089915071265,
700
+ "learning_rate": 1.0019147481706626e-07,
701
+ "loss": 0.4199,
702
  "step": 99
703
  },
704
  {
705
+ "epoch": 2.822429906542056,
706
+ "grad_norm": 0.6081244590976984,
707
+ "learning_rate": 6.964873004985717e-08,
708
+ "loss": 0.4009,
709
  "step": 100
710
  },
711
  {
712
+ "epoch": 2.850467289719626,
713
+ "grad_norm": 0.7145788800241012,
714
+ "learning_rate": 4.461255922609986e-08,
715
+ "loss": 0.3987,
716
  "step": 101
717
  },
718
  {
719
+ "epoch": 2.878504672897196,
720
+ "grad_norm": 0.6581593717216647,
721
+ "learning_rate": 2.511092455747932e-08,
722
+ "loss": 0.3866,
723
  "step": 102
724
  },
725
  {
726
+ "epoch": 2.906542056074766,
727
+ "grad_norm": 0.6290938268876005,
728
+ "learning_rate": 1.1165606884234182e-08,
729
+ "loss": 0.4225,
730
  "step": 103
731
  },
732
  {
733
+ "epoch": 2.9345794392523366,
734
+ "grad_norm": 0.6534972156634564,
735
+ "learning_rate": 2.792181348726941e-09,
736
+ "loss": 0.4279,
737
  "step": 104
738
  },
739
  {
740
+ "epoch": 2.9626168224299065,
741
+ "grad_norm": 0.58156649259114,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  "learning_rate": 0.0,
743
+ "loss": 0.3581,
744
+ "step": 105
745
  },
746
  {
747
+ "epoch": 2.9626168224299065,
748
+ "step": 105,
749
+ "total_flos": 18524510396416.0,
750
+ "train_loss": 0.47221575805119104,
751
+ "train_runtime": 641.5687,
752
+ "train_samples_per_second": 15.913,
753
+ "train_steps_per_second": 0.164
754
  }
755
  ],
756
  "logging_steps": 1,
757
+ "max_steps": 105,
758
  "num_input_tokens_seen": 0,
759
  "num_train_epochs": 3,
760
  "save_steps": 500,
 
770
  "attributes": {}
771
  }
772
  },
773
+ "total_flos": 18524510396416.0,
774
+ "train_batch_size": 1,
775
  "trial_name": null,
776
  "trial_params": null
777
  }
training_loss.png CHANGED