{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 685,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0072992700729927005,
      "grad_norm": 708.0,
      "learning_rate": 2.898550724637681e-06,
      "loss": 56.8346,
      "step": 1
    },
    {
      "epoch": 0.0364963503649635,
      "grad_norm": 604.0,
      "learning_rate": 1.4492753623188407e-05,
      "loss": 52.9742,
      "step": 5
    },
    {
      "epoch": 0.072992700729927,
      "grad_norm": 340.0,
      "learning_rate": 2.8985507246376814e-05,
      "loss": 39.0746,
      "step": 10
    },
    {
      "epoch": 0.10948905109489052,
      "grad_norm": 40.25,
      "learning_rate": 4.347826086956522e-05,
      "loss": 20.8099,
      "step": 15
    },
    {
      "epoch": 0.145985401459854,
      "grad_norm": 25.5,
      "learning_rate": 5.797101449275363e-05,
      "loss": 17.6144,
      "step": 20
    },
    {
      "epoch": 0.18248175182481752,
      "grad_norm": 7.78125,
      "learning_rate": 7.246376811594203e-05,
      "loss": 15.3803,
      "step": 25
    },
    {
      "epoch": 0.21897810218978103,
      "grad_norm": 6.40625,
      "learning_rate": 8.695652173913044e-05,
      "loss": 14.0798,
      "step": 30
    },
    {
      "epoch": 0.25547445255474455,
      "grad_norm": 13.4375,
      "learning_rate": 0.00010144927536231885,
      "loss": 13.4032,
      "step": 35
    },
    {
      "epoch": 0.291970802919708,
      "grad_norm": 41.0,
      "learning_rate": 0.00011594202898550725,
      "loss": 10.8827,
      "step": 40
    },
    {
      "epoch": 0.3284671532846715,
      "grad_norm": 13.1875,
      "learning_rate": 0.00013043478260869567,
      "loss": 4.5915,
      "step": 45
    },
    {
      "epoch": 0.36496350364963503,
      "grad_norm": 4.09375,
      "learning_rate": 0.00014492753623188405,
      "loss": 1.9,
      "step": 50
    },
    {
      "epoch": 0.40145985401459855,
      "grad_norm": 2.28125,
      "learning_rate": 0.00015942028985507247,
      "loss": 1.6474,
      "step": 55
    },
    {
      "epoch": 0.43795620437956206,
      "grad_norm": 3.5,
      "learning_rate": 0.00017391304347826088,
      "loss": 1.477,
      "step": 60
    },
    {
      "epoch": 0.4744525547445255,
      "grad_norm": 2.28125,
      "learning_rate": 0.00018840579710144927,
      "loss": 1.3309,
      "step": 65
    },
    {
      "epoch": 0.5109489051094891,
      "grad_norm": 1.6171875,
      "learning_rate": 0.00019999869950890106,
      "loss": 1.2538,
      "step": 70
    },
    {
      "epoch": 0.5474452554744526,
      "grad_norm": 5.9375,
      "learning_rate": 0.0001999531858720213,
      "loss": 1.224,
      "step": 75
    },
    {
      "epoch": 0.583941605839416,
      "grad_norm": 2.25,
      "learning_rate": 0.00019984268150178167,
      "loss": 1.1823,
      "step": 80
    },
    {
      "epoch": 0.6204379562043796,
      "grad_norm": 2.078125,
      "learning_rate": 0.00019966725824941932,
      "loss": 1.1279,
      "step": 85
    },
    {
      "epoch": 0.656934306569343,
      "grad_norm": 3.0625,
      "learning_rate": 0.00019942703017718975,
      "loss": 1.127,
      "step": 90
    },
    {
      "epoch": 0.6934306569343066,
      "grad_norm": 1.75,
      "learning_rate": 0.000199122153484202,
      "loss": 1.1284,
      "step": 95
    },
    {
      "epoch": 0.7299270072992701,
      "grad_norm": 1.5625,
      "learning_rate": 0.00019875282640485645,
      "loss": 1.0566,
      "step": 100
    },
    {
      "epoch": 0.7664233576642335,
      "grad_norm": 4.53125,
      "learning_rate": 0.0001983192890799503,
      "loss": 1.0361,
      "step": 105
    },
    {
      "epoch": 0.8029197080291971,
      "grad_norm": 2.5,
      "learning_rate": 0.0001978218234005352,
      "loss": 1.0371,
      "step": 110
    },
    {
      "epoch": 0.8394160583941606,
      "grad_norm": 1.890625,
      "learning_rate": 0.00019726075282462845,
      "loss": 1.0235,
      "step": 115
    },
    {
      "epoch": 0.8759124087591241,
      "grad_norm": 0.67578125,
      "learning_rate": 0.00019663644216689683,
      "loss": 0.996,
      "step": 120
    },
    {
      "epoch": 0.9124087591240876,
      "grad_norm": 1.2421875,
      "learning_rate": 0.00019594929736144976,
      "loss": 0.9734,
      "step": 125
    },
    {
      "epoch": 0.948905109489051,
      "grad_norm": 1.5625,
      "learning_rate": 0.00019519976519789616,
      "loss": 0.978,
      "step": 130
    },
    {
      "epoch": 0.9854014598540146,
      "grad_norm": 0.95703125,
      "learning_rate": 0.00019438833303083678,
      "loss": 0.9712,
      "step": 135
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.307734489440918,
      "eval_runtime": 0.9962,
      "eval_samples_per_second": 5.019,
      "eval_steps_per_second": 2.008,
      "step": 137
    },
    {
      "epoch": 1.0218978102189782,
      "grad_norm": 2.125,
      "learning_rate": 0.00019351552846298025,
      "loss": 0.9374,
      "step": 140
    },
    {
      "epoch": 1.0583941605839415,
      "grad_norm": 2.265625,
      "learning_rate": 0.0001925819190020898,
      "loss": 0.9173,
      "step": 145
    },
    {
      "epoch": 1.094890510948905,
      "grad_norm": 0.828125,
      "learning_rate": 0.00019158811169198313,
      "loss": 0.8916,
      "step": 150
    },
    {
      "epoch": 1.1313868613138687,
      "grad_norm": 1.0703125,
      "learning_rate": 0.0001905347527178252,
      "loss": 0.9418,
      "step": 155
    },
    {
      "epoch": 1.167883211678832,
      "grad_norm": 0.9140625,
      "learning_rate": 0.00018942252698597113,
      "loss": 0.9054,
      "step": 160
    },
    {
      "epoch": 1.2043795620437956,
      "grad_norm": 2.0625,
      "learning_rate": 0.00018825215767863214,
      "loss": 0.9039,
      "step": 165
    },
    {
      "epoch": 1.2408759124087592,
      "grad_norm": 1.5859375,
      "learning_rate": 0.00018702440578365387,
      "loss": 0.9146,
      "step": 170
    },
    {
      "epoch": 1.2773722627737225,
      "grad_norm": 1.3515625,
      "learning_rate": 0.00018574006959971333,
      "loss": 0.8896,
      "step": 175
    },
    {
      "epoch": 1.313868613138686,
      "grad_norm": 2.09375,
      "learning_rate": 0.00018439998421725554,
      "loss": 0.8947,
      "step": 180
    },
    {
      "epoch": 1.3503649635036497,
      "grad_norm": 0.80078125,
      "learning_rate": 0.00018300502097550806,
      "loss": 0.881,
      "step": 185
    },
    {
      "epoch": 1.3868613138686132,
      "grad_norm": 0.80078125,
      "learning_rate": 0.00018155608689592604,
      "loss": 0.8906,
      "step": 190
    },
    {
      "epoch": 1.4233576642335766,
      "grad_norm": 0.80859375,
      "learning_rate": 0.00018005412409243606,
      "loss": 0.8939,
      "step": 195
    },
    {
      "epoch": 1.4598540145985401,
      "grad_norm": 1.0234375,
      "learning_rate": 0.0001785001091588628,
      "loss": 0.9016,
      "step": 200
    },
    {
      "epoch": 1.4963503649635037,
      "grad_norm": 0.70703125,
      "learning_rate": 0.0001768950525339362,
      "loss": 0.8943,
      "step": 205
    },
    {
      "epoch": 1.5328467153284673,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00017523999784429238,
      "loss": 0.8614,
      "step": 210
    },
    {
      "epoch": 1.5693430656934306,
      "grad_norm": 0.7734375,
      "learning_rate": 0.00017353602122589527,
      "loss": 0.8788,
      "step": 215
    },
    {
      "epoch": 1.6058394160583942,
      "grad_norm": 0.82421875,
      "learning_rate": 0.0001717842306243205,
      "loss": 0.8833,
      "step": 220
    },
    {
      "epoch": 1.6423357664233578,
      "grad_norm": 0.84765625,
      "learning_rate": 0.00016998576507435618,
      "loss": 0.8713,
      "step": 225
    },
    {
      "epoch": 1.6788321167883211,
      "grad_norm": 1.234375,
      "learning_rate": 0.00016814179395938913,
      "loss": 0.8661,
      "step": 230
    },
    {
      "epoch": 1.7153284671532847,
      "grad_norm": 0.91015625,
      "learning_rate": 0.00016625351625105796,
      "loss": 0.8413,
      "step": 235
    },
    {
      "epoch": 1.7518248175182483,
      "grad_norm": 0.63671875,
      "learning_rate": 0.0001643221597296679,
      "loss": 0.8741,
      "step": 240
    },
    {
      "epoch": 1.7883211678832116,
      "grad_norm": 0.73046875,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.8744,
      "step": 245
    },
    {
      "epoch": 1.8248175182481752,
      "grad_norm": 0.671875,
      "learning_rate": 0.00016033526060414842,
      "loss": 0.8517,
      "step": 250
    },
    {
      "epoch": 1.8613138686131387,
      "grad_norm": 1.0234375,
      "learning_rate": 0.00015828231032857503,
      "loss": 0.8899,
      "step": 255
    },
    {
      "epoch": 1.897810218978102,
      "grad_norm": 0.66796875,
      "learning_rate": 0.00015619146421149232,
      "loss": 0.8537,
      "step": 260
    },
    {
      "epoch": 1.9343065693430657,
      "grad_norm": 0.7109375,
      "learning_rate": 0.00015406408174555976,
      "loss": 0.8329,
      "step": 265
    },
    {
      "epoch": 1.9708029197080292,
      "grad_norm": 0.71875,
      "learning_rate": 0.00015190154617979938,
      "loss": 0.8675,
      "step": 270
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.247941017150879,
      "eval_runtime": 0.9979,
      "eval_samples_per_second": 5.01,
      "eval_steps_per_second": 2.004,
      "step": 274
    },
    {
      "epoch": 2.0072992700729926,
      "grad_norm": 0.80859375,
      "learning_rate": 0.00014970526362019079,
      "loss": 0.8435,
      "step": 275
    },
    {
      "epoch": 2.0437956204379564,
      "grad_norm": 1.515625,
      "learning_rate": 0.00014747666211540459,
      "loss": 0.7774,
      "step": 280
    },
    {
      "epoch": 2.0802919708029197,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00014521719072826858,
      "loss": 0.79,
      "step": 285
    },
    {
      "epoch": 2.116788321167883,
      "grad_norm": 0.498046875,
      "learning_rate": 0.00014292831859356997,
      "loss": 0.7929,
      "step": 290
    },
    {
      "epoch": 2.153284671532847,
      "grad_norm": 1.59375,
      "learning_rate": 0.00014061153396280674,
      "loss": 0.8032,
      "step": 295
    },
    {
      "epoch": 2.18978102189781,
      "grad_norm": 0.83203125,
      "learning_rate": 0.000138268343236509,
      "loss": 0.7932,
      "step": 300
    },
    {
      "epoch": 2.2262773722627736,
      "grad_norm": 0.734375,
      "learning_rate": 0.00013590026998475986,
      "loss": 0.7657,
      "step": 305
    },
    {
      "epoch": 2.2627737226277373,
      "grad_norm": 0.609375,
      "learning_rate": 0.0001335088539565523,
      "loss": 0.783,
      "step": 310
    },
    {
      "epoch": 2.2992700729927007,
      "grad_norm": 0.71484375,
      "learning_rate": 0.00013109565007862596,
      "loss": 0.7755,
      "step": 315
    },
    {
      "epoch": 2.335766423357664,
      "grad_norm": 0.609375,
      "learning_rate": 0.0001286622274444361,
      "loss": 0.7723,
      "step": 320
    },
    {
      "epoch": 2.372262773722628,
      "grad_norm": 1.3359375,
      "learning_rate": 0.00012621016829391022,
      "loss": 0.7739,
      "step": 325
    },
    {
      "epoch": 2.408759124087591,
      "grad_norm": 1.1328125,
      "learning_rate": 0.00012374106698465732,
      "loss": 0.7821,
      "step": 330
    },
    {
      "epoch": 2.445255474452555,
      "grad_norm": 0.91015625,
      "learning_rate": 0.00012125652895529766,
      "loss": 0.7852,
      "step": 335
    },
    {
      "epoch": 2.4817518248175183,
      "grad_norm": 0.74609375,
      "learning_rate": 0.00011875816968158815,
      "loss": 0.7792,
      "step": 340
    },
    {
      "epoch": 2.5182481751824817,
      "grad_norm": 0.625,
      "learning_rate": 0.00011624761362602061,
      "loss": 0.7799,
      "step": 345
    },
    {
      "epoch": 2.554744525547445,
      "grad_norm": 0.81640625,
      "learning_rate": 0.00011372649318157749,
      "loss": 0.7914,
      "step": 350
    },
    {
      "epoch": 2.591240875912409,
      "grad_norm": 0.80078125,
      "learning_rate": 0.00011119644761033078,
      "loss": 0.7847,
      "step": 355
    },
    {
      "epoch": 2.627737226277372,
      "grad_norm": 0.984375,
      "learning_rate": 0.0001086591219775746,
      "loss": 0.8049,
      "step": 360
    },
    {
      "epoch": 2.664233576642336,
      "grad_norm": 0.81640625,
      "learning_rate": 0.00010611616608218429,
      "loss": 0.7865,
      "step": 365
    },
    {
      "epoch": 2.7007299270072993,
      "grad_norm": 0.51953125,
      "learning_rate": 0.00010356923338389806,
      "loss": 0.7908,
      "step": 370
    },
    {
      "epoch": 2.7372262773722627,
      "grad_norm": 0.53125,
      "learning_rate": 0.00010101997992821797,
      "loss": 0.7925,
      "step": 375
    },
    {
      "epoch": 2.7737226277372264,
      "grad_norm": 0.49609375,
      "learning_rate": 9.847006326962974e-05,
      "loss": 0.799,
      "step": 380
    },
    {
      "epoch": 2.81021897810219,
      "grad_norm": 0.51171875,
      "learning_rate": 9.592114139384145e-05,
      "loss": 0.7832,
      "step": 385
    },
    {
      "epoch": 2.846715328467153,
      "grad_norm": 0.7109375,
      "learning_rate": 9.337487163974164e-05,
      "loss": 0.7796,
      "step": 390
    },
    {
      "epoch": 2.883211678832117,
      "grad_norm": 0.6328125,
      "learning_rate": 9.083290962177828e-05,
      "loss": 0.7839,
      "step": 395
    },
    {
      "epoch": 2.9197080291970803,
      "grad_norm": 0.59765625,
      "learning_rate": 8.829690815345886e-05,
      "loss": 0.7781,
      "step": 400
    },
    {
      "epoch": 2.9562043795620436,
      "grad_norm": 0.58203125,
      "learning_rate": 8.57685161726715e-05,
      "loss": 0.7457,
      "step": 405
    },
    {
      "epoch": 2.9927007299270074,
      "grad_norm": 0.6171875,
      "learning_rate": 8.324937766952638e-05,
      "loss": 0.7623,
      "step": 410
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.275648355484009,
      "eval_runtime": 0.9945,
      "eval_samples_per_second": 5.028,
      "eval_steps_per_second": 2.011,
      "step": 411
    },
    {
      "epoch": 3.0291970802919708,
      "grad_norm": 0.8359375,
      "learning_rate": 8.074113061741397e-05,
      "loss": 0.7329,
      "step": 415
    },
    {
      "epoch": 3.065693430656934,
      "grad_norm": 0.50390625,
      "learning_rate": 7.824540590797568e-05,
      "loss": 0.7052,
      "step": 420
    },
    {
      "epoch": 3.102189781021898,
      "grad_norm": 0.5703125,
      "learning_rate": 7.576382629067877e-05,
      "loss": 0.7015,
      "step": 425
    },
    {
      "epoch": 3.1386861313868613,
      "grad_norm": 0.6015625,
      "learning_rate": 7.329800531768584e-05,
      "loss": 0.696,
      "step": 430
    },
    {
      "epoch": 3.1751824817518246,
      "grad_norm": 0.55078125,
      "learning_rate": 7.084954629470417e-05,
      "loss": 0.7154,
      "step": 435
    },
    {
      "epoch": 3.2116788321167884,
      "grad_norm": 0.59765625,
      "learning_rate": 6.842004123849752e-05,
      "loss": 0.7113,
      "step": 440
    },
    {
      "epoch": 3.2481751824817517,
      "grad_norm": 0.5625,
      "learning_rate": 6.601106984173835e-05,
      "loss": 0.7139,
      "step": 445
    },
    {
      "epoch": 3.2846715328467155,
      "grad_norm": 0.59765625,
      "learning_rate": 6.362419844587287e-05,
      "loss": 0.6967,
      "step": 450
    },
    {
      "epoch": 3.321167883211679,
      "grad_norm": 0.52734375,
      "learning_rate": 6.126097902266772e-05,
      "loss": 0.7073,
      "step": 455
    },
    {
      "epoch": 3.3576642335766422,
      "grad_norm": 0.5625,
      "learning_rate": 5.8922948165099524e-05,
      "loss": 0.6857,
      "step": 460
    },
    {
      "epoch": 3.394160583941606,
      "grad_norm": 0.55859375,
      "learning_rate": 5.6611626088244194e-05,
      "loss": 0.7199,
      "step": 465
    },
    {
      "epoch": 3.4306569343065694,
      "grad_norm": 0.58203125,
      "learning_rate": 5.432851564081534e-05,
      "loss": 0.7075,
      "step": 470
    },
    {
      "epoch": 3.4671532846715327,
      "grad_norm": 0.52734375,
      "learning_rate": 5.207510132799436e-05,
      "loss": 0.7006,
      "step": 475
    },
    {
      "epoch": 3.5036496350364965,
      "grad_norm": 0.53515625,
      "learning_rate": 4.9852848346187566e-05,
      "loss": 0.7151,
      "step": 480
    },
    {
      "epoch": 3.54014598540146,
      "grad_norm": 0.546875,
      "learning_rate": 4.7663201630338816e-05,
      "loss": 0.7129,
      "step": 485
    },
    {
      "epoch": 3.576642335766423,
      "grad_norm": 0.5859375,
      "learning_rate": 4.550758491441526e-05,
      "loss": 0.7139,
      "step": 490
    },
    {
      "epoch": 3.613138686131387,
      "grad_norm": 0.51953125,
      "learning_rate": 4.3387399805679255e-05,
      "loss": 0.7162,
      "step": 495
    },
    {
      "epoch": 3.6496350364963503,
      "grad_norm": 0.55859375,
      "learning_rate": 4.1304024873346705e-05,
      "loss": 0.7132,
      "step": 500
    },
    {
      "epoch": 3.686131386861314,
      "grad_norm": 0.57421875,
      "learning_rate": 3.9258814752225284e-05,
      "loss": 0.7007,
      "step": 505
    },
    {
      "epoch": 3.7226277372262775,
      "grad_norm": 0.546875,
      "learning_rate": 3.725309926191479e-05,
      "loss": 0.7037,
      "step": 510
    },
    {
      "epoch": 3.759124087591241,
      "grad_norm": 0.73828125,
      "learning_rate": 3.528818254214329e-05,
      "loss": 0.7255,
      "step": 515
    },
    {
      "epoch": 3.795620437956204,
      "grad_norm": 0.52734375,
      "learning_rate": 3.336534220479961e-05,
      "loss": 0.6966,
      "step": 520
    },
    {
      "epoch": 3.832116788321168,
      "grad_norm": 0.5078125,
      "learning_rate": 3.1485828503215585e-05,
      "loss": 0.7143,
      "step": 525
    },
    {
      "epoch": 3.8686131386861313,
      "grad_norm": 0.6328125,
      "learning_rate": 2.9650863519236418e-05,
      "loss": 0.7005,
      "step": 530
    },
    {
      "epoch": 3.905109489051095,
      "grad_norm": 0.5703125,
      "learning_rate": 2.7861640368608844e-05,
      "loss": 0.7005,
      "step": 535
    },
    {
      "epoch": 3.9416058394160585,
      "grad_norm": 0.53125,
      "learning_rate": 2.6119322425203197e-05,
      "loss": 0.7139,
      "step": 540
    },
    {
      "epoch": 3.978102189781022,
      "grad_norm": 0.51953125,
      "learning_rate": 2.4425042564574184e-05,
      "loss": 0.709,
      "step": 545
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.341665267944336,
      "eval_runtime": 0.9977,
      "eval_samples_per_second": 5.012,
      "eval_steps_per_second": 2.005,
      "step": 548
    },
    {
      "epoch": 4.014598540145985,
      "grad_norm": 0.53515625,
      "learning_rate": 2.277990242735185e-05,
      "loss": 0.6801,
      "step": 550
    },
    {
      "epoch": 4.0510948905109485,
      "grad_norm": 0.52734375,
      "learning_rate": 2.118497170294195e-05,
      "loss": 0.6495,
      "step": 555
    },
    {
      "epoch": 4.087591240875913,
      "grad_norm": 0.5625,
      "learning_rate": 1.9641287434001355e-05,
      "loss": 0.672,
      "step": 560
    },
    {
      "epoch": 4.124087591240876,
      "grad_norm": 0.55078125,
      "learning_rate": 1.8149853342140645e-05,
      "loss": 0.6611,
      "step": 565
    },
    {
      "epoch": 4.160583941605839,
      "grad_norm": 0.59375,
      "learning_rate": 1.671163917529285e-05,
      "loss": 0.662,
      "step": 570
    },
    {
      "epoch": 4.197080291970803,
      "grad_norm": 0.51171875,
      "learning_rate": 1.5327580077171587e-05,
      "loss": 0.6635,
      "step": 575
    },
    {
      "epoch": 4.233576642335766,
      "grad_norm": 0.54296875,
      "learning_rate": 1.3998575979229944e-05,
      "loss": 0.6624,
      "step": 580
    },
    {
      "epoch": 4.2700729927007295,
      "grad_norm": 0.50390625,
      "learning_rate": 1.272549101551438e-05,
      "loss": 0.6523,
      "step": 585
    },
    {
      "epoch": 4.306569343065694,
      "grad_norm": 0.51171875,
      "learning_rate": 1.1509152960794666e-05,
      "loss": 0.6607,
      "step": 590
    },
    {
      "epoch": 4.343065693430657,
      "grad_norm": 0.546875,
      "learning_rate": 1.035035269233493e-05,
      "loss": 0.6626,
      "step": 595
    },
    {
      "epoch": 4.37956204379562,
      "grad_norm": 0.54296875,
      "learning_rate": 9.249843675656212e-06,
      "loss": 0.678,
      "step": 600
    },
    {
      "epoch": 4.416058394160584,
      "grad_norm": 0.5234375,
      "learning_rate": 8.208341474624071e-06,
      "loss": 0.6783,
      "step": 605
    },
    {
      "epoch": 4.452554744525547,
      "grad_norm": 0.53515625,
      "learning_rate": 7.226523286180776e-06,
      "loss": 0.6699,
      "step": 610
    },
    {
      "epoch": 4.489051094890511,
      "grad_norm": 0.5703125,
      "learning_rate": 6.3050275000238414e-06,
      "loss": 0.6607,
      "step": 615
    },
    {
      "epoch": 4.525547445255475,
      "grad_norm": 0.5234375,
      "learning_rate": 5.4444532835175144e-06,
      "loss": 0.6702,
      "step": 620
    },
    {
      "epoch": 4.562043795620438,
      "grad_norm": 0.5234375,
      "learning_rate": 4.6453601921072395e-06,
      "loss": 0.6793,
      "step": 625
    },
    {
      "epoch": 4.598540145985401,
      "grad_norm": 0.5234375,
      "learning_rate": 3.908267805490051e-06,
      "loss": 0.6622,
      "step": 630
    },
    {
      "epoch": 4.635036496350365,
      "grad_norm": 0.54296875,
      "learning_rate": 3.233655389777801e-06,
      "loss": 0.677,
      "step": 635
    },
    {
      "epoch": 4.671532846715328,
      "grad_norm": 0.5234375,
      "learning_rate": 2.62196158587269e-06,
      "loss": 0.6588,
      "step": 640
    },
    {
      "epoch": 4.708029197080292,
      "grad_norm": 0.5234375,
      "learning_rate": 2.073584124257899e-06,
      "loss": 0.6621,
      "step": 645
    },
    {
      "epoch": 4.744525547445256,
      "grad_norm": 0.53515625,
      "learning_rate": 1.5888795663883904e-06,
      "loss": 0.6655,
      "step": 650
    },
    {
      "epoch": 4.781021897810219,
      "grad_norm": 0.515625,
      "learning_rate": 1.1681630728506699e-06,
      "loss": 0.6653,
      "step": 655
    },
    {
      "epoch": 4.817518248175182,
      "grad_norm": 0.52734375,
      "learning_rate": 8.117081984415298e-07,
      "loss": 0.6734,
      "step": 660
    },
    {
      "epoch": 4.854014598540146,
      "grad_norm": 0.5390625,
      "learning_rate": 5.19746714299596e-07,
      "loss": 0.6541,
      "step": 665
    },
    {
      "epoch": 4.89051094890511,
      "grad_norm": 0.5390625,
      "learning_rate": 2.9246845720496407e-07,
      "loss": 0.6722,
      "step": 670
    },
    {
      "epoch": 4.927007299270073,
      "grad_norm": 0.55859375,
      "learning_rate": 1.300212061451367e-07,
      "loss": 0.6472,
      "step": 675
    },
    {
      "epoch": 4.963503649635037,
      "grad_norm": 0.51953125,
      "learning_rate": 3.251058622737446e-08,
      "loss": 0.667,
      "step": 680
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.52734375,
      "learning_rate": 0.0,
      "loss": 0.6601,
      "step": 685
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.3811252117156982,
      "eval_runtime": 0.9953,
      "eval_samples_per_second": 5.024,
      "eval_steps_per_second": 2.01,
      "step": 685
    },
    {
      "epoch": 5.0,
      "step": 685,
      "total_flos": 1.0472781231601746e+18,
      "train_loss": 2.151051264783762,
      "train_runtime": 5341.9856,
      "train_samples_per_second": 2.052,
      "train_steps_per_second": 0.128
    }
  ],
  "logging_steps": 5,
  "max_steps": 685,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0472781231601746e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}