{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 14.962593516209477,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04987531172069826,
      "grad_norm": 39.54836654663086,
      "learning_rate": 1.9940000000000002e-05,
      "loss": 7.4096,
      "step": 10
    },
    {
      "epoch": 0.09975062344139651,
      "grad_norm": 47.2942008972168,
      "learning_rate": 1.9873333333333335e-05,
      "loss": 5.883,
      "step": 20
    },
    {
      "epoch": 0.14962593516209477,
      "grad_norm": 44.06085968017578,
      "learning_rate": 1.9806666666666668e-05,
      "loss": 3.9265,
      "step": 30
    },
    {
      "epoch": 0.19950124688279303,
      "grad_norm": 19.404735565185547,
      "learning_rate": 1.974e-05,
      "loss": 1.9858,
      "step": 40
    },
    {
      "epoch": 0.24937655860349128,
      "grad_norm": 1.9771770238876343,
      "learning_rate": 1.9673333333333337e-05,
      "loss": 1.0943,
      "step": 50
    },
    {
      "epoch": 0.29925187032418954,
      "grad_norm": 0.6882551312446594,
      "learning_rate": 1.9606666666666666e-05,
      "loss": 0.9006,
      "step": 60
    },
    {
      "epoch": 0.3491271820448878,
      "grad_norm": 0.7775446772575378,
      "learning_rate": 1.9540000000000003e-05,
      "loss": 0.8786,
      "step": 70
    },
    {
      "epoch": 0.39900249376558605,
      "grad_norm": 0.720007598400116,
      "learning_rate": 1.9473333333333335e-05,
      "loss": 0.858,
      "step": 80
    },
    {
      "epoch": 0.4488778054862843,
      "grad_norm": 0.6773384809494019,
      "learning_rate": 1.940666666666667e-05,
      "loss": 0.811,
      "step": 90
    },
    {
      "epoch": 0.49875311720698257,
      "grad_norm": 0.6907929182052612,
      "learning_rate": 1.934e-05,
      "loss": 0.7896,
      "step": 100
    },
    {
      "epoch": 0.5486284289276808,
      "grad_norm": 0.7552163600921631,
      "learning_rate": 1.9273333333333334e-05,
      "loss": 0.7589,
      "step": 110
    },
    {
      "epoch": 0.5985037406483791,
      "grad_norm": 0.7480026483535767,
      "learning_rate": 1.920666666666667e-05,
      "loss": 0.7347,
      "step": 120
    },
    {
      "epoch": 0.6483790523690773,
      "grad_norm": 0.9136413931846619,
      "learning_rate": 1.914e-05,
      "loss": 0.7218,
      "step": 130
    },
    {
      "epoch": 0.6982543640897756,
      "grad_norm": 0.7177766561508179,
      "learning_rate": 1.9073333333333336e-05,
      "loss": 0.6983,
      "step": 140
    },
    {
      "epoch": 0.7481296758104738,
      "grad_norm": 0.7291643023490906,
      "learning_rate": 1.900666666666667e-05,
      "loss": 0.6811,
      "step": 150
    },
    {
      "epoch": 0.7980049875311721,
      "grad_norm": 0.8402264714241028,
      "learning_rate": 1.894e-05,
      "loss": 0.6518,
      "step": 160
    },
    {
      "epoch": 0.8478802992518704,
      "grad_norm": 0.7445259690284729,
      "learning_rate": 1.8873333333333334e-05,
      "loss": 0.6621,
      "step": 170
    },
    {
      "epoch": 0.8977556109725686,
      "grad_norm": 0.7637977004051208,
      "learning_rate": 1.8806666666666667e-05,
      "loss": 0.623,
      "step": 180
    },
    {
      "epoch": 0.9476309226932669,
      "grad_norm": 0.7605411410331726,
      "learning_rate": 1.8740000000000004e-05,
      "loss": 0.642,
      "step": 190
    },
    {
      "epoch": 0.9975062344139651,
      "grad_norm": 0.7474659085273743,
      "learning_rate": 1.8673333333333333e-05,
      "loss": 0.6081,
      "step": 200
    },
    {
      "epoch": 0.9975062344139651,
      "eval_loss": 0.6107227802276611,
      "eval_runtime": 57.9983,
      "eval_samples_per_second": 17.294,
      "eval_steps_per_second": 3.466,
      "step": 200
    },
    {
      "epoch": 1.0473815461346634,
      "grad_norm": 0.7955527901649475,
      "learning_rate": 1.860666666666667e-05,
      "loss": 0.6313,
      "step": 210
    },
    {
      "epoch": 1.0972568578553616,
      "grad_norm": 0.8842599391937256,
      "learning_rate": 1.8540000000000002e-05,
      "loss": 0.5837,
      "step": 220
    },
    {
      "epoch": 1.14713216957606,
      "grad_norm": 0.9143279790878296,
      "learning_rate": 1.8473333333333335e-05,
      "loss": 0.5668,
      "step": 230
    },
    {
      "epoch": 1.1970074812967582,
      "grad_norm": 0.8860597610473633,
      "learning_rate": 1.8406666666666668e-05,
      "loss": 0.565,
      "step": 240
    },
    {
      "epoch": 1.2468827930174564,
      "grad_norm": 0.8665011525154114,
      "learning_rate": 1.834e-05,
      "loss": 0.5611,
      "step": 250
    },
    {
      "epoch": 1.2967581047381547,
      "grad_norm": 0.8916261196136475,
      "learning_rate": 1.8273333333333333e-05,
      "loss": 0.5461,
      "step": 260
    },
    {
      "epoch": 1.346633416458853,
      "grad_norm": 0.8792281150817871,
      "learning_rate": 1.820666666666667e-05,
      "loss": 0.5569,
      "step": 270
    },
    {
      "epoch": 1.3965087281795512,
      "grad_norm": 0.8166815638542175,
      "learning_rate": 1.8140000000000003e-05,
      "loss": 0.5435,
      "step": 280
    },
    {
      "epoch": 1.4463840399002494,
      "grad_norm": 0.9711934328079224,
      "learning_rate": 1.8073333333333335e-05,
      "loss": 0.5205,
      "step": 290
    },
    {
      "epoch": 1.4962593516209477,
      "grad_norm": 0.8851374983787537,
      "learning_rate": 1.8006666666666668e-05,
      "loss": 0.5269,
      "step": 300
    },
    {
      "epoch": 1.546134663341646,
      "grad_norm": 0.940987765789032,
      "learning_rate": 1.794e-05,
      "loss": 0.5177,
      "step": 310
    },
    {
      "epoch": 1.5960099750623442,
      "grad_norm": 1.1342016458511353,
      "learning_rate": 1.7873333333333337e-05,
      "loss": 0.4981,
      "step": 320
    },
    {
      "epoch": 1.6458852867830425,
      "grad_norm": 1.0847837924957275,
      "learning_rate": 1.7806666666666667e-05,
      "loss": 0.5056,
      "step": 330
    },
    {
      "epoch": 1.6957605985037407,
      "grad_norm": 0.979576587677002,
      "learning_rate": 1.7740000000000003e-05,
      "loss": 0.5048,
      "step": 340
    },
    {
      "epoch": 1.745635910224439,
      "grad_norm": 0.9637285470962524,
      "learning_rate": 1.7673333333333336e-05,
      "loss": 0.5076,
      "step": 350
    },
    {
      "epoch": 1.7955112219451372,
      "grad_norm": 0.9592918157577515,
      "learning_rate": 1.760666666666667e-05,
      "loss": 0.512,
      "step": 360
    },
    {
      "epoch": 1.8453865336658355,
      "grad_norm": 0.9521375894546509,
      "learning_rate": 1.754e-05,
      "loss": 0.4993,
      "step": 370
    },
    {
      "epoch": 1.8952618453865338,
      "grad_norm": 1.1952577829360962,
      "learning_rate": 1.7473333333333334e-05,
      "loss": 0.4933,
      "step": 380
    },
    {
      "epoch": 1.945137157107232,
      "grad_norm": 0.8078787326812744,
      "learning_rate": 1.7406666666666667e-05,
      "loss": 0.4894,
      "step": 390
    },
    {
      "epoch": 1.9950124688279303,
      "grad_norm": 1.0912864208221436,
      "learning_rate": 1.734e-05,
      "loss": 0.4865,
      "step": 400
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.49822476506233215,
      "eval_runtime": 57.9997,
      "eval_samples_per_second": 17.293,
      "eval_steps_per_second": 3.466,
      "step": 401
    },
    {
      "epoch": 2.0448877805486285,
      "grad_norm": 0.9705982804298401,
      "learning_rate": 1.7273333333333336e-05,
      "loss": 0.4976,
      "step": 410
    },
    {
      "epoch": 2.0947630922693268,
      "grad_norm": 1.0686686038970947,
      "learning_rate": 1.7206666666666666e-05,
      "loss": 0.4851,
      "step": 420
    },
    {
      "epoch": 2.144638403990025,
      "grad_norm": 1.0422980785369873,
      "learning_rate": 1.7140000000000002e-05,
      "loss": 0.4577,
      "step": 430
    },
    {
      "epoch": 2.1945137157107233,
      "grad_norm": 1.0678831338882446,
      "learning_rate": 1.7073333333333335e-05,
      "loss": 0.5092,
      "step": 440
    },
    {
      "epoch": 2.2443890274314215,
      "grad_norm": 1.1621687412261963,
      "learning_rate": 1.7006666666666668e-05,
      "loss": 0.4897,
      "step": 450
    },
    {
      "epoch": 2.29426433915212,
      "grad_norm": 0.9376991391181946,
      "learning_rate": 1.694e-05,
      "loss": 0.4788,
      "step": 460
    },
    {
      "epoch": 2.344139650872818,
      "grad_norm": 1.0832690000534058,
      "learning_rate": 1.6873333333333333e-05,
      "loss": 0.4864,
      "step": 470
    },
    {
      "epoch": 2.3940149625935163,
      "grad_norm": 1.1426657438278198,
      "learning_rate": 1.680666666666667e-05,
      "loss": 0.4659,
      "step": 480
    },
    {
      "epoch": 2.4438902743142146,
      "grad_norm": 1.016119122505188,
      "learning_rate": 1.6740000000000002e-05,
      "loss": 0.4636,
      "step": 490
    },
    {
      "epoch": 2.493765586034913,
      "grad_norm": 1.1347243785858154,
      "learning_rate": 1.6673333333333335e-05,
      "loss": 0.4663,
      "step": 500
    },
    {
      "epoch": 2.543640897755611,
      "grad_norm": 1.087796926498413,
      "learning_rate": 1.6606666666666668e-05,
      "loss": 0.4729,
      "step": 510
    },
    {
      "epoch": 2.5935162094763093,
      "grad_norm": 1.1080952882766724,
      "learning_rate": 1.654e-05,
      "loss": 0.4803,
      "step": 520
    },
    {
      "epoch": 2.6433915211970076,
      "grad_norm": 0.9999821186065674,
      "learning_rate": 1.6473333333333334e-05,
      "loss": 0.4872,
      "step": 530
    },
    {
      "epoch": 2.693266832917706,
      "grad_norm": 1.2594393491744995,
      "learning_rate": 1.640666666666667e-05,
      "loss": 0.4708,
      "step": 540
    },
    {
      "epoch": 2.743142144638404,
      "grad_norm": 1.012904167175293,
      "learning_rate": 1.634e-05,
      "loss": 0.46,
      "step": 550
    },
    {
      "epoch": 2.7930174563591024,
      "grad_norm": 1.052252173423767,
      "learning_rate": 1.6273333333333336e-05,
      "loss": 0.4664,
      "step": 560
    },
    {
      "epoch": 2.8428927680798006,
      "grad_norm": 1.2155933380126953,
      "learning_rate": 1.620666666666667e-05,
      "loss": 0.4518,
      "step": 570
    },
    {
      "epoch": 2.892768079800499,
      "grad_norm": 1.064907193183899,
      "learning_rate": 1.614e-05,
      "loss": 0.4933,
      "step": 580
    },
    {
      "epoch": 2.942643391521197,
      "grad_norm": 1.0846999883651733,
      "learning_rate": 1.6073333333333334e-05,
      "loss": 0.4724,
      "step": 590
    },
    {
      "epoch": 2.9925187032418954,
      "grad_norm": 1.2878086566925049,
      "learning_rate": 1.6006666666666667e-05,
      "loss": 0.4895,
      "step": 600
    },
    {
      "epoch": 2.997506234413965,
      "eval_loss": 0.47463858127593994,
      "eval_runtime": 57.9983,
      "eval_samples_per_second": 17.294,
      "eval_steps_per_second": 3.466,
      "step": 601
    },
    {
      "epoch": 3.0423940149625937,
      "grad_norm": 1.141946792602539,
      "learning_rate": 1.5940000000000003e-05,
      "loss": 0.4808,
      "step": 610
    },
    {
      "epoch": 3.092269326683292,
      "grad_norm": 1.1112643480300903,
      "learning_rate": 1.5873333333333333e-05,
      "loss": 0.4525,
      "step": 620
    },
    {
      "epoch": 3.14214463840399,
      "grad_norm": 1.0057584047317505,
      "learning_rate": 1.580666666666667e-05,
      "loss": 0.4699,
      "step": 630
    },
    {
      "epoch": 3.1920199501246884,
      "grad_norm": 1.3252357244491577,
      "learning_rate": 1.5740000000000002e-05,
      "loss": 0.4631,
      "step": 640
    },
    {
      "epoch": 3.2418952618453867,
      "grad_norm": 1.1567648649215698,
      "learning_rate": 1.5673333333333335e-05,
      "loss": 0.4559,
      "step": 650
    },
    {
      "epoch": 3.291770573566085,
      "grad_norm": 1.0659315586090088,
      "learning_rate": 1.5606666666666667e-05,
      "loss": 0.4575,
      "step": 660
    },
    {
      "epoch": 3.341645885286783,
      "grad_norm": 1.1061469316482544,
      "learning_rate": 1.554e-05,
      "loss": 0.4592,
      "step": 670
    },
    {
      "epoch": 3.3915211970074814,
      "grad_norm": 1.342760443687439,
      "learning_rate": 1.5473333333333333e-05,
      "loss": 0.4459,
      "step": 680
    },
    {
      "epoch": 3.4413965087281797,
      "grad_norm": 1.080640196800232,
      "learning_rate": 1.5406666666666666e-05,
      "loss": 0.4801,
      "step": 690
    },
    {
      "epoch": 3.491271820448878,
      "grad_norm": 1.1660523414611816,
      "learning_rate": 1.5340000000000002e-05,
      "loss": 0.4536,
      "step": 700
    },
    {
      "epoch": 3.541147132169576,
      "grad_norm": 1.0911084413528442,
      "learning_rate": 1.5273333333333335e-05,
      "loss": 0.4536,
      "step": 710
    },
    {
      "epoch": 3.5910224438902745,
      "grad_norm": 1.1449761390686035,
      "learning_rate": 1.5206666666666668e-05,
      "loss": 0.4663,
      "step": 720
    },
    {
      "epoch": 3.6408977556109727,
      "grad_norm": 1.0596632957458496,
      "learning_rate": 1.514e-05,
      "loss": 0.4633,
      "step": 730
    },
    {
      "epoch": 3.690773067331671,
      "grad_norm": 1.1071261167526245,
      "learning_rate": 1.5073333333333335e-05,
      "loss": 0.4579,
      "step": 740
    },
    {
      "epoch": 3.7406483790523692,
      "grad_norm": 1.3376258611679077,
      "learning_rate": 1.5006666666666666e-05,
      "loss": 0.4526,
      "step": 750
    },
    {
      "epoch": 3.7905236907730675,
      "grad_norm": 1.2122935056686401,
      "learning_rate": 1.4940000000000001e-05,
      "loss": 0.4602,
      "step": 760
    },
    {
      "epoch": 3.8403990024937658,
      "grad_norm": 1.1372263431549072,
      "learning_rate": 1.4873333333333335e-05,
      "loss": 0.4733,
      "step": 770
    },
    {
      "epoch": 3.890274314214464,
      "grad_norm": 1.1192553043365479,
      "learning_rate": 1.4806666666666667e-05,
      "loss": 0.466,
      "step": 780
    },
    {
      "epoch": 3.9401496259351623,
      "grad_norm": 1.1695116758346558,
      "learning_rate": 1.4740000000000001e-05,
      "loss": 0.4485,
      "step": 790
    },
    {
      "epoch": 3.9900249376558605,
      "grad_norm": 1.2374085187911987,
      "learning_rate": 1.4673333333333336e-05,
      "loss": 0.4428,
      "step": 800
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.4639199376106262,
      "eval_runtime": 58.0007,
      "eval_samples_per_second": 17.293,
      "eval_steps_per_second": 3.465,
      "step": 802
    },
    {
      "epoch": 4.039900249376559,
      "grad_norm": 1.2248201370239258,
      "learning_rate": 1.4606666666666667e-05,
      "loss": 0.4765,
      "step": 810
    },
    {
      "epoch": 4.089775561097257,
      "grad_norm": 1.0709633827209473,
      "learning_rate": 1.4540000000000001e-05,
      "loss": 0.458,
      "step": 820
    },
    {
      "epoch": 4.139650872817955,
      "grad_norm": 1.245808482170105,
      "learning_rate": 1.4473333333333334e-05,
      "loss": 0.4413,
      "step": 830
    },
    {
      "epoch": 4.1895261845386536,
      "grad_norm": 1.0952578783035278,
      "learning_rate": 1.4406666666666669e-05,
      "loss": 0.4466,
      "step": 840
    },
    {
      "epoch": 4.239401496259352,
      "grad_norm": 1.3013060092926025,
      "learning_rate": 1.434e-05,
      "loss": 0.4579,
      "step": 850
    },
    {
      "epoch": 4.28927680798005,
      "grad_norm": 1.40675687789917,
      "learning_rate": 1.4273333333333334e-05,
      "loss": 0.4547,
      "step": 860
    },
    {
      "epoch": 4.339152119700748,
      "grad_norm": 1.0993843078613281,
      "learning_rate": 1.4206666666666669e-05,
      "loss": 0.4326,
      "step": 870
    },
    {
      "epoch": 4.389027431421447,
      "grad_norm": 1.279954195022583,
      "learning_rate": 1.414e-05,
      "loss": 0.4528,
      "step": 880
    },
    {
      "epoch": 4.438902743142145,
      "grad_norm": 1.1767985820770264,
      "learning_rate": 1.4073333333333335e-05,
      "loss": 0.4678,
      "step": 890
    },
    {
      "epoch": 4.488778054862843,
      "grad_norm": 1.2366374731063843,
      "learning_rate": 1.400666666666667e-05,
      "loss": 0.4383,
      "step": 900
    },
    {
      "epoch": 4.538653366583541,
      "grad_norm": 1.3890389204025269,
      "learning_rate": 1.394e-05,
      "loss": 0.4586,
      "step": 910
    },
    {
      "epoch": 4.58852867830424,
      "grad_norm": 1.288946509361267,
      "learning_rate": 1.3873333333333335e-05,
      "loss": 0.4329,
      "step": 920
    },
    {
      "epoch": 4.638403990024938,
      "grad_norm": 1.5377354621887207,
      "learning_rate": 1.3806666666666668e-05,
      "loss": 0.4524,
      "step": 930
    },
    {
      "epoch": 4.688279301745636,
      "grad_norm": 1.1805988550186157,
      "learning_rate": 1.3740000000000002e-05,
      "loss": 0.4421,
      "step": 940
    },
    {
      "epoch": 4.738154613466334,
      "grad_norm": 1.2433724403381348,
      "learning_rate": 1.3673333333333333e-05,
      "loss": 0.4463,
      "step": 950
    },
    {
      "epoch": 4.788029925187033,
      "grad_norm": 1.5534576177597046,
      "learning_rate": 1.3606666666666668e-05,
      "loss": 0.4529,
      "step": 960
    },
    {
      "epoch": 4.837905236907731,
      "grad_norm": 1.1943343877792358,
      "learning_rate": 1.3540000000000003e-05,
      "loss": 0.4342,
      "step": 970
    },
    {
      "epoch": 4.887780548628429,
      "grad_norm": 1.4089887142181396,
      "learning_rate": 1.3473333333333334e-05,
      "loss": 0.4474,
      "step": 980
    },
    {
      "epoch": 4.937655860349127,
      "grad_norm": 1.2032736539840698,
      "learning_rate": 1.3406666666666668e-05,
      "loss": 0.4367,
      "step": 990
    },
    {
      "epoch": 4.987531172069826,
      "grad_norm": 1.2193266153335571,
      "learning_rate": 1.3340000000000001e-05,
      "loss": 0.4617,
      "step": 1000
    },
    {
      "epoch": 4.997506234413965,
      "eval_loss": 0.457111656665802,
      "eval_runtime": 58.0103,
      "eval_samples_per_second": 17.29,
      "eval_steps_per_second": 3.465,
      "step": 1002
    },
    {
      "epoch": 5.037406483790524,
      "grad_norm": 1.2309563159942627,
      "learning_rate": 1.3273333333333334e-05,
      "loss": 0.4802,
      "step": 1010
    },
    {
      "epoch": 5.087281795511222,
      "grad_norm": 1.3006610870361328,
      "learning_rate": 1.3206666666666668e-05,
      "loss": 0.4304,
      "step": 1020
    },
    {
      "epoch": 5.13715710723192,
      "grad_norm": 1.1411958932876587,
      "learning_rate": 1.3140000000000001e-05,
      "loss": 0.4379,
      "step": 1030
    },
    {
      "epoch": 5.187032418952619,
      "grad_norm": 1.489011287689209,
      "learning_rate": 1.3073333333333334e-05,
      "loss": 0.4377,
      "step": 1040
    },
    {
      "epoch": 5.236907730673317,
      "grad_norm": 1.127456784248352,
      "learning_rate": 1.3006666666666667e-05,
      "loss": 0.4401,
      "step": 1050
    },
    {
      "epoch": 5.286783042394015,
      "grad_norm": 1.1423261165618896,
      "learning_rate": 1.2940000000000001e-05,
      "loss": 0.4195,
      "step": 1060
    },
    {
      "epoch": 5.3366583541147135,
      "grad_norm": 1.2664945125579834,
      "learning_rate": 1.2873333333333336e-05,
      "loss": 0.4421,
      "step": 1070
    },
    {
      "epoch": 5.386533665835412,
      "grad_norm": 1.3833181858062744,
      "learning_rate": 1.2806666666666667e-05,
      "loss": 0.4493,
      "step": 1080
    },
    {
      "epoch": 5.43640897755611,
      "grad_norm": 1.2505003213882446,
      "learning_rate": 1.2740000000000002e-05,
      "loss": 0.4322,
      "step": 1090
    },
    {
      "epoch": 5.486284289276808,
      "grad_norm": 1.1786164045333862,
      "learning_rate": 1.2673333333333335e-05,
      "loss": 0.4474,
      "step": 1100
    },
    {
      "epoch": 5.5361596009975065,
      "grad_norm": 1.3222135305404663,
      "learning_rate": 1.2606666666666667e-05,
      "loss": 0.4434,
      "step": 1110
    },
    {
      "epoch": 5.586034912718205,
      "grad_norm": 1.1747450828552246,
      "learning_rate": 1.254e-05,
      "loss": 0.4577,
      "step": 1120
    },
    {
      "epoch": 5.635910224438903,
      "grad_norm": 1.299611210823059,
      "learning_rate": 1.2473333333333335e-05,
      "loss": 0.4306,
      "step": 1130
    },
    {
      "epoch": 5.685785536159601,
      "grad_norm": 1.353705883026123,
      "learning_rate": 1.2406666666666668e-05,
      "loss": 0.4438,
      "step": 1140
    },
    {
      "epoch": 5.7356608478802995,
      "grad_norm": 1.195237398147583,
      "learning_rate": 1.234e-05,
      "loss": 0.4203,
      "step": 1150
    },
    {
      "epoch": 5.785536159600998,
      "grad_norm": 1.4759230613708496,
      "learning_rate": 1.2273333333333335e-05,
      "loss": 0.4399,
      "step": 1160
    },
    {
      "epoch": 5.835411471321696,
      "grad_norm": 1.1856341361999512,
      "learning_rate": 1.2206666666666666e-05,
      "loss": 0.4566,
      "step": 1170
    },
    {
      "epoch": 5.885286783042394,
      "grad_norm": 1.369195818901062,
      "learning_rate": 1.214e-05,
      "loss": 0.4568,
      "step": 1180
    },
    {
      "epoch": 5.9351620947630925,
      "grad_norm": 1.2702898979187012,
      "learning_rate": 1.2073333333333335e-05,
      "loss": 0.4372,
      "step": 1190
    },
    {
      "epoch": 5.985037406483791,
      "grad_norm": 1.5332574844360352,
      "learning_rate": 1.2006666666666668e-05,
      "loss": 0.454,
      "step": 1200
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.45269879698753357,
      "eval_runtime": 57.8329,
      "eval_samples_per_second": 17.343,
      "eval_steps_per_second": 3.476,
      "step": 1203
    },
    {
      "epoch": 6.034912718204489,
      "grad_norm": 1.502089500427246,
      "learning_rate": 1.1940000000000001e-05,
      "loss": 0.4509,
      "step": 1210
    },
    {
      "epoch": 6.084788029925187,
      "grad_norm": 1.1866357326507568,
      "learning_rate": 1.1873333333333334e-05,
      "loss": 0.4315,
      "step": 1220
    },
    {
      "epoch": 6.134663341645886,
      "grad_norm": 1.2971949577331543,
      "learning_rate": 1.1806666666666668e-05,
      "loss": 0.4378,
      "step": 1230
    },
    {
      "epoch": 6.184538653366584,
      "grad_norm": 1.3105353116989136,
      "learning_rate": 1.1740000000000001e-05,
      "loss": 0.4462,
      "step": 1240
    },
    {
      "epoch": 6.234413965087282,
      "grad_norm": 1.6913713216781616,
      "learning_rate": 1.1673333333333334e-05,
      "loss": 0.4273,
      "step": 1250
    },
    {
      "epoch": 6.28428927680798,
      "grad_norm": 1.2743034362792969,
      "learning_rate": 1.1606666666666668e-05,
      "loss": 0.4308,
      "step": 1260
    },
    {
      "epoch": 6.334164588528679,
      "grad_norm": 1.2764904499053955,
      "learning_rate": 1.154e-05,
      "loss": 0.4416,
      "step": 1270
    },
    {
      "epoch": 6.384039900249377,
      "grad_norm": 1.2375209331512451,
      "learning_rate": 1.1473333333333334e-05,
      "loss": 0.4279,
      "step": 1280
    },
    {
      "epoch": 6.433915211970075,
      "grad_norm": 1.2813770771026611,
      "learning_rate": 1.1406666666666669e-05,
      "loss": 0.4318,
      "step": 1290
    },
    {
      "epoch": 6.483790523690773,
      "grad_norm": 1.413662314414978,
      "learning_rate": 1.134e-05,
      "loss": 0.4299,
      "step": 1300
    },
    {
      "epoch": 6.533665835411472,
      "grad_norm": 1.4377459287643433,
      "learning_rate": 1.1273333333333334e-05,
      "loss": 0.4538,
      "step": 1310
    },
    {
      "epoch": 6.58354114713217,
      "grad_norm": 1.280629277229309,
      "learning_rate": 1.1206666666666667e-05,
      "loss": 0.4305,
      "step": 1320
    },
    {
      "epoch": 6.633416458852868,
      "grad_norm": 1.4439893960952759,
      "learning_rate": 1.1140000000000002e-05,
      "loss": 0.4382,
      "step": 1330
    },
    {
      "epoch": 6.683291770573566,
      "grad_norm": 1.290280818939209,
      "learning_rate": 1.1073333333333333e-05,
      "loss": 0.4395,
      "step": 1340
    },
    {
      "epoch": 6.733167082294265,
      "grad_norm": 1.7553201913833618,
      "learning_rate": 1.1006666666666667e-05,
      "loss": 0.4633,
      "step": 1350
    },
    {
      "epoch": 6.783042394014963,
      "grad_norm": 1.3317337036132812,
      "learning_rate": 1.0940000000000002e-05,
      "loss": 0.4396,
      "step": 1360
    },
    {
      "epoch": 6.832917705735661,
      "grad_norm": 1.4611096382141113,
      "learning_rate": 1.0873333333333333e-05,
      "loss": 0.4306,
      "step": 1370
    },
    {
      "epoch": 6.882793017456359,
      "grad_norm": 1.461767554283142,
      "learning_rate": 1.0806666666666668e-05,
      "loss": 0.4363,
      "step": 1380
    },
    {
      "epoch": 6.932668329177058,
      "grad_norm": 1.2594436407089233,
      "learning_rate": 1.0740000000000002e-05,
      "loss": 0.4311,
      "step": 1390
    },
    {
      "epoch": 6.982543640897756,
      "grad_norm": 1.339422345161438,
      "learning_rate": 1.0673333333333333e-05,
      "loss": 0.4359,
      "step": 1400
    },
    {
      "epoch": 6.997506234413965,
      "eval_loss": 0.4493270516395569,
      "eval_runtime": 57.825,
      "eval_samples_per_second": 17.345,
      "eval_steps_per_second": 3.476,
      "step": 1403
    },
    {
      "epoch": 7.032418952618454,
      "grad_norm": 1.5040308237075806,
      "learning_rate": 1.0606666666666668e-05,
      "loss": 0.4513,
      "step": 1410
    },
    {
      "epoch": 7.082294264339152,
      "grad_norm": 1.3062398433685303,
      "learning_rate": 1.054e-05,
      "loss": 0.4286,
      "step": 1420
    },
    {
      "epoch": 7.132169576059851,
      "grad_norm": 1.2743724584579468,
      "learning_rate": 1.0473333333333334e-05,
      "loss": 0.4308,
      "step": 1430
    },
    {
      "epoch": 7.182044887780549,
      "grad_norm": 1.335893154144287,
      "learning_rate": 1.0406666666666666e-05,
      "loss": 0.4467,
      "step": 1440
    },
    {
      "epoch": 7.231920199501247,
      "grad_norm": 1.5432289838790894,
      "learning_rate": 1.0340000000000001e-05,
      "loss": 0.4249,
      "step": 1450
    },
    {
      "epoch": 7.2817955112219455,
      "grad_norm": 1.3191583156585693,
      "learning_rate": 1.0273333333333335e-05,
      "loss": 0.4308,
      "step": 1460
    },
    {
      "epoch": 7.331670822942644,
      "grad_norm": 1.4070515632629395,
      "learning_rate": 1.0206666666666667e-05,
      "loss": 0.4346,
      "step": 1470
    },
    {
      "epoch": 7.381546134663342,
      "grad_norm": 1.2089998722076416,
      "learning_rate": 1.0140000000000001e-05,
      "loss": 0.4261,
      "step": 1480
    },
    {
      "epoch": 7.43142144638404,
      "grad_norm": 1.349908709526062,
      "learning_rate": 1.0073333333333336e-05,
      "loss": 0.4506,
      "step": 1490
    },
    {
      "epoch": 7.4812967581047385,
      "grad_norm": 1.3772706985473633,
      "learning_rate": 1.0006666666666667e-05,
      "loss": 0.4194,
      "step": 1500
    },
    {
      "epoch": 7.531172069825437,
      "grad_norm": 1.265548586845398,
      "learning_rate": 9.940000000000001e-06,
      "loss": 0.4109,
      "step": 1510
    },
    {
      "epoch": 7.581047381546135,
      "grad_norm": 1.4737778902053833,
      "learning_rate": 9.873333333333334e-06,
      "loss": 0.4335,
      "step": 1520
    },
    {
      "epoch": 7.630922693266833,
      "grad_norm": 1.2900265455245972,
      "learning_rate": 9.806666666666667e-06,
      "loss": 0.4489,
      "step": 1530
    },
    {
      "epoch": 7.6807980049875315,
      "grad_norm": 1.3885109424591064,
      "learning_rate": 9.74e-06,
      "loss": 0.4246,
      "step": 1540
    },
    {
      "epoch": 7.73067331670823,
      "grad_norm": 1.4786447286605835,
      "learning_rate": 9.673333333333334e-06,
      "loss": 0.4307,
      "step": 1550
    },
    {
      "epoch": 7.780548628428928,
      "grad_norm": 1.4943320751190186,
      "learning_rate": 9.606666666666667e-06,
      "loss": 0.4272,
      "step": 1560
    },
    {
      "epoch": 7.830423940149626,
      "grad_norm": 1.2281179428100586,
      "learning_rate": 9.54e-06,
      "loss": 0.4361,
      "step": 1570
    },
    {
      "epoch": 7.8802992518703245,
      "grad_norm": 1.4099763631820679,
      "learning_rate": 9.473333333333335e-06,
      "loss": 0.4328,
      "step": 1580
    },
    {
      "epoch": 7.930174563591023,
      "grad_norm": 1.2165679931640625,
      "learning_rate": 9.406666666666668e-06,
      "loss": 0.4325,
      "step": 1590
    },
    {
      "epoch": 7.980049875311721,
      "grad_norm": 1.5678939819335938,
      "learning_rate": 9.340000000000002e-06,
      "loss": 0.4431,
      "step": 1600
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.4465142786502838,
      "eval_runtime": 57.9689,
      "eval_samples_per_second": 17.302,
      "eval_steps_per_second": 3.467,
      "step": 1604
    },
    {
      "epoch": 8.029925187032418,
      "grad_norm": 1.3903255462646484,
      "learning_rate": 9.273333333333335e-06,
      "loss": 0.4489,
      "step": 1610
    },
    {
      "epoch": 8.079800498753118,
      "grad_norm": 1.4039775133132935,
      "learning_rate": 9.206666666666668e-06,
      "loss": 0.4282,
      "step": 1620
    },
    {
      "epoch": 8.129675810473815,
      "grad_norm": 1.238571286201477,
      "learning_rate": 9.14e-06,
      "loss": 0.4352,
      "step": 1630
    },
    {
      "epoch": 8.179551122194514,
      "grad_norm": 1.418980360031128,
      "learning_rate": 9.073333333333333e-06,
      "loss": 0.4232,
      "step": 1640
    },
    {
      "epoch": 8.229426433915211,
      "grad_norm": 1.362851619720459,
      "learning_rate": 9.006666666666666e-06,
      "loss": 0.4251,
      "step": 1650
    },
    {
      "epoch": 8.27930174563591,
      "grad_norm": 1.3849656581878662,
      "learning_rate": 8.94e-06,
      "loss": 0.4367,
      "step": 1660
    },
    {
      "epoch": 8.329177057356608,
      "grad_norm": 1.4366765022277832,
      "learning_rate": 8.873333333333334e-06,
      "loss": 0.4348,
      "step": 1670
    },
    {
      "epoch": 8.379052369077307,
      "grad_norm": 1.3090085983276367,
      "learning_rate": 8.806666666666668e-06,
      "loss": 0.4349,
      "step": 1680
    },
    {
      "epoch": 8.428927680798004,
      "grad_norm": 1.3340786695480347,
      "learning_rate": 8.740000000000001e-06,
      "loss": 0.4292,
      "step": 1690
    },
    {
      "epoch": 8.478802992518704,
      "grad_norm": 1.224572777748108,
      "learning_rate": 8.673333333333334e-06,
      "loss": 0.4225,
      "step": 1700
    },
    {
      "epoch": 8.528678304239401,
      "grad_norm": 1.3066871166229248,
      "learning_rate": 8.606666666666668e-06,
      "loss": 0.4295,
      "step": 1710
    },
    {
      "epoch": 8.5785536159601,
      "grad_norm": 1.3138604164123535,
      "learning_rate": 8.540000000000001e-06,
      "loss": 0.4249,
      "step": 1720
    },
    {
      "epoch": 8.628428927680797,
      "grad_norm": 1.3178144693374634,
      "learning_rate": 8.473333333333334e-06,
      "loss": 0.4295,
      "step": 1730
    },
    {
      "epoch": 8.678304239401497,
      "grad_norm": 1.5216317176818848,
      "learning_rate": 8.406666666666667e-06,
      "loss": 0.4249,
      "step": 1740
    },
    {
      "epoch": 8.728179551122194,
      "grad_norm": 1.3315293788909912,
      "learning_rate": 8.34e-06,
      "loss": 0.4156,
      "step": 1750
    },
    {
      "epoch": 8.778054862842893,
      "grad_norm": 1.613593578338623,
      "learning_rate": 8.273333333333334e-06,
      "loss": 0.418,
      "step": 1760
    },
    {
      "epoch": 8.82793017456359,
      "grad_norm": 1.2980856895446777,
      "learning_rate": 8.206666666666667e-06,
      "loss": 0.4234,
      "step": 1770
    },
    {
      "epoch": 8.87780548628429,
      "grad_norm": 1.4451056718826294,
      "learning_rate": 8.14e-06,
      "loss": 0.4427,
      "step": 1780
    },
    {
      "epoch": 8.927680798004987,
      "grad_norm": 1.3692457675933838,
      "learning_rate": 8.073333333333335e-06,
      "loss": 0.4394,
      "step": 1790
    },
    {
      "epoch": 8.977556109725686,
      "grad_norm": 1.4997522830963135,
      "learning_rate": 8.006666666666667e-06,
      "loss": 0.4299,
      "step": 1800
    },
    {
      "epoch": 8.997506234413965,
      "eval_loss": 0.4446782171726227,
      "eval_runtime": 57.956,
      "eval_samples_per_second": 17.306,
      "eval_steps_per_second": 3.468,
      "step": 1804
    },
    {
      "epoch": 9.027431421446384,
      "grad_norm": 1.4114794731140137,
      "learning_rate": 7.94e-06,
      "loss": 0.4409,
      "step": 1810
    },
    {
      "epoch": 9.077306733167083,
      "grad_norm": 1.2984321117401123,
      "learning_rate": 7.873333333333335e-06,
      "loss": 0.4091,
      "step": 1820
    },
    {
      "epoch": 9.12718204488778,
      "grad_norm": 1.3625205755233765,
      "learning_rate": 7.806666666666668e-06,
      "loss": 0.4224,
      "step": 1830
    },
    {
      "epoch": 9.17705735660848,
      "grad_norm": 1.4072176218032837,
      "learning_rate": 7.74e-06,
      "loss": 0.4416,
      "step": 1840
    },
    {
      "epoch": 9.226932668329177,
      "grad_norm": 1.2547061443328857,
      "learning_rate": 7.673333333333333e-06,
      "loss": 0.4228,
      "step": 1850
    },
    {
      "epoch": 9.276807980049876,
      "grad_norm": 1.5764844417572021,
      "learning_rate": 7.606666666666668e-06,
      "loss": 0.4346,
      "step": 1860
    },
    {
      "epoch": 9.326683291770573,
      "grad_norm": 1.4493110179901123,
      "learning_rate": 7.540000000000001e-06,
      "loss": 0.438,
      "step": 1870
    },
    {
      "epoch": 9.376558603491272,
      "grad_norm": 1.3886995315551758,
      "learning_rate": 7.4733333333333335e-06,
      "loss": 0.4282,
      "step": 1880
    },
    {
      "epoch": 9.42643391521197,
      "grad_norm": 1.5260852575302124,
      "learning_rate": 7.406666666666667e-06,
      "loss": 0.4267,
      "step": 1890
    },
    {
      "epoch": 9.476309226932669,
      "grad_norm": 1.7208826541900635,
      "learning_rate": 7.340000000000001e-06,
      "loss": 0.4195,
      "step": 1900
    },
    {
      "epoch": 9.526184538653366,
      "grad_norm": 1.3822261095046997,
      "learning_rate": 7.2733333333333346e-06,
      "loss": 0.4254,
      "step": 1910
    },
    {
      "epoch": 9.576059850374065,
      "grad_norm": 1.6141988039016724,
      "learning_rate": 7.206666666666667e-06,
      "loss": 0.4165,
      "step": 1920
    },
    {
      "epoch": 9.625935162094763,
      "grad_norm": 1.4055383205413818,
      "learning_rate": 7.14e-06,
      "loss": 0.4149,
      "step": 1930
    },
    {
      "epoch": 9.675810473815462,
      "grad_norm": 1.3142189979553223,
      "learning_rate": 7.073333333333334e-06,
      "loss": 0.4107,
      "step": 1940
    },
    {
      "epoch": 9.72568578553616,
      "grad_norm": 1.2457393407821655,
      "learning_rate": 7.006666666666667e-06,
      "loss": 0.4296,
      "step": 1950
    },
    {
      "epoch": 9.775561097256858,
      "grad_norm": 1.4622180461883545,
      "learning_rate": 6.9400000000000005e-06,
      "loss": 0.4422,
      "step": 1960
    },
    {
      "epoch": 9.825436408977556,
      "grad_norm": 1.4130418300628662,
      "learning_rate": 6.873333333333334e-06,
      "loss": 0.4317,
      "step": 1970
    },
    {
      "epoch": 9.875311720698255,
      "grad_norm": 1.311515212059021,
      "learning_rate": 6.806666666666667e-06,
      "loss": 0.4184,
      "step": 1980
    },
    {
      "epoch": 9.925187032418952,
      "grad_norm": 1.4601870775222778,
      "learning_rate": 6.740000000000001e-06,
      "loss": 0.4136,
      "step": 1990
    },
    {
      "epoch": 9.975062344139651,
      "grad_norm": 1.3348491191864014,
      "learning_rate": 6.6733333333333335e-06,
      "loss": 0.4409,
      "step": 2000
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.44308173656463623,
      "eval_runtime": 57.9177,
      "eval_samples_per_second": 17.318,
      "eval_steps_per_second": 3.47,
      "step": 2005
    },
    {
      "epoch": 10.024937655860349,
      "grad_norm": 1.3334033489227295,
      "learning_rate": 6.606666666666666e-06,
      "loss": 0.4456,
      "step": 2010
    },
    {
      "epoch": 10.074812967581048,
      "grad_norm": 1.5118451118469238,
      "learning_rate": 6.540000000000001e-06,
      "loss": 0.4385,
      "step": 2020
    },
    {
      "epoch": 10.124688279301745,
      "grad_norm": 1.3644325733184814,
      "learning_rate": 6.473333333333334e-06,
      "loss": 0.4208,
      "step": 2030
    },
    {
      "epoch": 10.174563591022444,
      "grad_norm": 1.426771640777588,
      "learning_rate": 6.4066666666666674e-06,
      "loss": 0.4201,
      "step": 2040
    },
    {
      "epoch": 10.224438902743142,
      "grad_norm": 1.3023799657821655,
      "learning_rate": 6.34e-06,
      "loss": 0.4068,
      "step": 2050
    },
    {
      "epoch": 10.27431421446384,
      "grad_norm": 1.3506182432174683,
      "learning_rate": 6.273333333333333e-06,
      "loss": 0.4264,
      "step": 2060
    },
    {
      "epoch": 10.324189526184538,
      "grad_norm": 1.4448788166046143,
      "learning_rate": 6.206666666666668e-06,
      "loss": 0.4249,
      "step": 2070
    },
    {
      "epoch": 10.374064837905237,
      "grad_norm": 1.3727625608444214,
      "learning_rate": 6.1400000000000005e-06,
      "loss": 0.4351,
      "step": 2080
    },
    {
      "epoch": 10.423940149625935,
      "grad_norm": 1.512022614479065,
      "learning_rate": 6.073333333333333e-06,
      "loss": 0.4332,
      "step": 2090
    },
    {
      "epoch": 10.473815461346634,
      "grad_norm": 1.305282473564148,
      "learning_rate": 6.006666666666667e-06,
      "loss": 0.4248,
      "step": 2100
    },
    {
      "epoch": 10.523690773067331,
      "grad_norm": 1.4088374376296997,
      "learning_rate": 5.94e-06,
      "loss": 0.4323,
      "step": 2110
    },
    {
      "epoch": 10.57356608478803,
      "grad_norm": 1.4796631336212158,
      "learning_rate": 5.873333333333334e-06,
      "loss": 0.4254,
      "step": 2120
    },
    {
      "epoch": 10.623441396508728,
      "grad_norm": 1.556593418121338,
      "learning_rate": 5.806666666666667e-06,
      "loss": 0.4117,
      "step": 2130
    },
    {
      "epoch": 10.673316708229427,
      "grad_norm": 1.3857340812683105,
      "learning_rate": 5.74e-06,
      "loss": 0.4241,
      "step": 2140
    },
    {
      "epoch": 10.723192019950124,
      "grad_norm": 1.6449425220489502,
      "learning_rate": 5.673333333333334e-06,
      "loss": 0.4144,
      "step": 2150
    },
    {
      "epoch": 10.773067331670823,
      "grad_norm": 1.8624916076660156,
      "learning_rate": 5.606666666666667e-06,
      "loss": 0.4239,
      "step": 2160
    },
    {
      "epoch": 10.82294264339152,
      "grad_norm": 1.3718584775924683,
      "learning_rate": 5.540000000000001e-06,
      "loss": 0.4081,
      "step": 2170
    },
    {
      "epoch": 10.87281795511222,
      "grad_norm": 1.4391957521438599,
      "learning_rate": 5.473333333333334e-06,
      "loss": 0.4195,
      "step": 2180
    },
    {
      "epoch": 10.922693266832917,
      "grad_norm": 1.6265774965286255,
      "learning_rate": 5.406666666666667e-06,
      "loss": 0.4446,
      "step": 2190
    },
    {
      "epoch": 10.972568578553616,
      "grad_norm": 1.5026485919952393,
      "learning_rate": 5.3400000000000005e-06,
      "loss": 0.4229,
      "step": 2200
    },
    {
      "epoch": 10.997506234413965,
      "eval_loss": 0.44174298644065857,
      "eval_runtime": 57.9098,
      "eval_samples_per_second": 17.32,
      "eval_steps_per_second": 3.471,
      "step": 2205
    },
    {
      "epoch": 11.022443890274314,
      "grad_norm": 1.452600121498108,
      "learning_rate": 5.273333333333333e-06,
      "loss": 0.4326,
      "step": 2210
    },
    {
      "epoch": 11.072319201995013,
      "grad_norm": 1.3822201490402222,
      "learning_rate": 5.206666666666668e-06,
      "loss": 0.4264,
      "step": 2220
    },
    {
      "epoch": 11.12219451371571,
      "grad_norm": 1.468363642692566,
      "learning_rate": 5.140000000000001e-06,
      "loss": 0.4078,
      "step": 2230
    },
    {
      "epoch": 11.17206982543641,
      "grad_norm": 1.495955467224121,
      "learning_rate": 5.073333333333334e-06,
      "loss": 0.4173,
      "step": 2240
    },
    {
      "epoch": 11.221945137157107,
      "grad_norm": 1.441292405128479,
      "learning_rate": 5.006666666666667e-06,
      "loss": 0.415,
      "step": 2250
    },
    {
      "epoch": 11.271820448877806,
      "grad_norm": 1.421720266342163,
      "learning_rate": 4.94e-06,
      "loss": 0.4126,
      "step": 2260
    },
    {
      "epoch": 11.321695760598503,
      "grad_norm": 1.2590454816818237,
      "learning_rate": 4.873333333333334e-06,
      "loss": 0.4193,
      "step": 2270
    },
    {
      "epoch": 11.371571072319203,
      "grad_norm": 1.48396897315979,
      "learning_rate": 4.8066666666666675e-06,
      "loss": 0.4266,
      "step": 2280
    },
    {
      "epoch": 11.4214463840399,
      "grad_norm": 1.3837693929672241,
      "learning_rate": 4.74e-06,
      "loss": 0.4199,
      "step": 2290
    },
    {
      "epoch": 11.471321695760599,
      "grad_norm": 1.5965029001235962,
      "learning_rate": 4.673333333333333e-06,
      "loss": 0.4148,
      "step": 2300
    },
    {
      "epoch": 11.521197007481296,
      "grad_norm": 1.4824038743972778,
      "learning_rate": 4.606666666666667e-06,
      "loss": 0.4369,
      "step": 2310
    },
    {
      "epoch": 11.571072319201996,
      "grad_norm": 1.4421814680099487,
      "learning_rate": 4.540000000000001e-06,
      "loss": 0.4182,
      "step": 2320
    },
    {
      "epoch": 11.620947630922693,
      "grad_norm": 1.4961748123168945,
      "learning_rate": 4.473333333333334e-06,
      "loss": 0.4117,
      "step": 2330
    },
    {
      "epoch": 11.670822942643392,
      "grad_norm": 1.63477623462677,
      "learning_rate": 4.406666666666667e-06,
      "loss": 0.4237,
      "step": 2340
    },
    {
      "epoch": 11.72069825436409,
      "grad_norm": 1.6593093872070312,
      "learning_rate": 4.34e-06,
      "loss": 0.4279,
      "step": 2350
    },
    {
      "epoch": 11.770573566084789,
      "grad_norm": 1.5250682830810547,
      "learning_rate": 4.273333333333334e-06,
      "loss": 0.4264,
      "step": 2360
    },
    {
      "epoch": 11.820448877805486,
      "grad_norm": 1.5515984296798706,
      "learning_rate": 4.206666666666667e-06,
      "loss": 0.4341,
      "step": 2370
    },
    {
      "epoch": 11.870324189526185,
      "grad_norm": 1.6062873601913452,
      "learning_rate": 4.14e-06,
      "loss": 0.4247,
      "step": 2380
    },
    {
      "epoch": 11.920199501246882,
      "grad_norm": 1.760644793510437,
      "learning_rate": 4.073333333333334e-06,
      "loss": 0.4417,
      "step": 2390
    },
    {
      "epoch": 11.970074812967582,
      "grad_norm": 1.2911533117294312,
      "learning_rate": 4.006666666666667e-06,
      "loss": 0.4159,
      "step": 2400
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.44075635075569153,
      "eval_runtime": 57.9093,
      "eval_samples_per_second": 17.32,
      "eval_steps_per_second": 3.471,
      "step": 2406
    },
    {
      "epoch": 12.019950124688279,
      "grad_norm": 1.5172271728515625,
      "learning_rate": 3.94e-06,
      "loss": 0.437,
      "step": 2410
    },
    {
      "epoch": 12.069825436408978,
      "grad_norm": 1.4542344808578491,
      "learning_rate": 3.873333333333333e-06,
      "loss": 0.4314,
      "step": 2420
    },
    {
      "epoch": 12.119700748129675,
      "grad_norm": 1.4818074703216553,
      "learning_rate": 3.806666666666667e-06,
      "loss": 0.4184,
      "step": 2430
    },
    {
      "epoch": 12.169576059850375,
      "grad_norm": 1.4975398778915405,
      "learning_rate": 3.74e-06,
      "loss": 0.4087,
      "step": 2440
    },
    {
      "epoch": 12.219451371571072,
      "grad_norm": 1.3974961042404175,
      "learning_rate": 3.673333333333334e-06,
      "loss": 0.4135,
      "step": 2450
    },
    {
      "epoch": 12.269326683291771,
      "grad_norm": 1.6019673347473145,
      "learning_rate": 3.606666666666667e-06,
      "loss": 0.4238,
      "step": 2460
    },
    {
      "epoch": 12.319201995012468,
      "grad_norm": 1.7037951946258545,
      "learning_rate": 3.54e-06,
      "loss": 0.4218,
      "step": 2470
    },
    {
      "epoch": 12.369077306733168,
      "grad_norm": 1.5602372884750366,
      "learning_rate": 3.4733333333333337e-06,
      "loss": 0.4192,
      "step": 2480
    },
    {
      "epoch": 12.418952618453865,
      "grad_norm": 1.391041874885559,
      "learning_rate": 3.406666666666667e-06,
      "loss": 0.414,
      "step": 2490
    },
    {
      "epoch": 12.468827930174564,
      "grad_norm": 1.5659008026123047,
      "learning_rate": 3.3400000000000006e-06,
      "loss": 0.4251,
      "step": 2500
    },
    {
      "epoch": 12.518703241895262,
      "grad_norm": 1.4149389266967773,
      "learning_rate": 3.2733333333333335e-06,
      "loss": 0.4299,
      "step": 2510
    },
    {
      "epoch": 12.56857855361596,
      "grad_norm": 1.5269657373428345,
      "learning_rate": 3.2066666666666667e-06,
      "loss": 0.4355,
      "step": 2520
    },
    {
      "epoch": 12.618453865336658,
      "grad_norm": 1.6707123517990112,
      "learning_rate": 3.1400000000000004e-06,
      "loss": 0.4282,
      "step": 2530
    },
    {
      "epoch": 12.668329177057357,
      "grad_norm": 1.3647090196609497,
      "learning_rate": 3.0733333333333337e-06,
      "loss": 0.3994,
      "step": 2540
    },
    {
      "epoch": 12.718204488778055,
      "grad_norm": 1.4243894815444946,
      "learning_rate": 3.0066666666666674e-06,
      "loss": 0.4181,
      "step": 2550
    },
    {
      "epoch": 12.768079800498754,
      "grad_norm": 1.3579431772232056,
      "learning_rate": 2.9400000000000002e-06,
      "loss": 0.4251,
      "step": 2560
    },
    {
      "epoch": 12.817955112219451,
      "grad_norm": 1.6759490966796875,
      "learning_rate": 2.8733333333333335e-06,
      "loss": 0.4143,
      "step": 2570
    },
    {
      "epoch": 12.86783042394015,
      "grad_norm": 1.5581449270248413,
      "learning_rate": 2.806666666666667e-06,
      "loss": 0.4083,
      "step": 2580
    },
    {
      "epoch": 12.917705735660848,
      "grad_norm": 1.607188105583191,
      "learning_rate": 2.7400000000000004e-06,
      "loss": 0.4236,
      "step": 2590
    },
    {
      "epoch": 12.967581047381547,
      "grad_norm": 1.3592510223388672,
      "learning_rate": 2.6733333333333333e-06,
      "loss": 0.4108,
      "step": 2600
    },
    {
      "epoch": 12.997506234413965,
      "eval_loss": 0.44020339846611023,
      "eval_runtime": 57.8997,
      "eval_samples_per_second": 17.323,
      "eval_steps_per_second": 3.472,
      "step": 2606
    },
    {
      "epoch": 13.017456359102244,
      "grad_norm": 1.461461067199707,
      "learning_rate": 2.606666666666667e-06,
      "loss": 0.4439,
      "step": 2610
    },
    {
      "epoch": 13.067331670822943,
      "grad_norm": 1.40589439868927,
      "learning_rate": 2.5400000000000002e-06,
      "loss": 0.4057,
      "step": 2620
    },
    {
      "epoch": 13.11720698254364,
      "grad_norm": 1.665216088294983,
      "learning_rate": 2.4733333333333335e-06,
      "loss": 0.4053,
      "step": 2630
    },
    {
      "epoch": 13.16708229426434,
      "grad_norm": 1.4636025428771973,
      "learning_rate": 2.4066666666666668e-06,
      "loss": 0.4199,
      "step": 2640
    },
    {
      "epoch": 13.216957605985037,
      "grad_norm": 1.4638257026672363,
      "learning_rate": 2.3400000000000005e-06,
      "loss": 0.4062,
      "step": 2650
    },
    {
      "epoch": 13.266832917705736,
      "grad_norm": 1.4565849304199219,
      "learning_rate": 2.2733333333333333e-06,
      "loss": 0.4354,
      "step": 2660
    },
    {
      "epoch": 13.316708229426434,
      "grad_norm": 1.4819647073745728,
      "learning_rate": 2.206666666666667e-06,
      "loss": 0.4088,
      "step": 2670
    },
    {
      "epoch": 13.366583541147133,
      "grad_norm": 1.5023897886276245,
      "learning_rate": 2.1400000000000003e-06,
      "loss": 0.4313,
      "step": 2680
    },
    {
      "epoch": 13.41645885286783,
      "grad_norm": 1.4261906147003174,
      "learning_rate": 2.0733333333333335e-06,
      "loss": 0.4179,
      "step": 2690
    },
    {
      "epoch": 13.46633416458853,
      "grad_norm": 1.6513773202896118,
      "learning_rate": 2.006666666666667e-06,
      "loss": 0.4341,
      "step": 2700
    },
    {
      "epoch": 13.516209476309227,
      "grad_norm": 1.6153149604797363,
      "learning_rate": 1.94e-06,
      "loss": 0.4231,
      "step": 2710
    },
    {
      "epoch": 13.566084788029926,
      "grad_norm": 1.6202324628829956,
      "learning_rate": 1.8733333333333333e-06,
      "loss": 0.4173,
      "step": 2720
    },
    {
      "epoch": 13.615960099750623,
      "grad_norm": 1.2559431791305542,
      "learning_rate": 1.8066666666666668e-06,
      "loss": 0.4238,
      "step": 2730
    },
    {
      "epoch": 13.665835411471322,
      "grad_norm": 1.3010227680206299,
      "learning_rate": 1.74e-06,
      "loss": 0.4287,
      "step": 2740
    },
    {
      "epoch": 13.71571072319202,
      "grad_norm": 1.432409644126892,
      "learning_rate": 1.6733333333333335e-06,
      "loss": 0.4143,
      "step": 2750
    },
    {
      "epoch": 13.765586034912719,
      "grad_norm": 1.4686477184295654,
      "learning_rate": 1.606666666666667e-06,
      "loss": 0.4192,
      "step": 2760
    },
    {
      "epoch": 13.815461346633416,
      "grad_norm": 1.526962161064148,
      "learning_rate": 1.54e-06,
      "loss": 0.4199,
      "step": 2770
    },
    {
      "epoch": 13.865336658354115,
      "grad_norm": 1.5790767669677734,
      "learning_rate": 1.4733333333333336e-06,
      "loss": 0.4089,
      "step": 2780
    },
    {
      "epoch": 13.915211970074813,
      "grad_norm": 1.4289913177490234,
      "learning_rate": 1.4066666666666668e-06,
      "loss": 0.4195,
      "step": 2790
    },
    {
      "epoch": 13.965087281795512,
      "grad_norm": 1.613978624343872,
      "learning_rate": 1.34e-06,
      "loss": 0.4279,
      "step": 2800
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.439765065908432,
      "eval_runtime": 57.9105,
      "eval_samples_per_second": 17.32,
      "eval_steps_per_second": 3.471,
      "step": 2807
    },
    {
      "epoch": 14.01496259351621,
      "grad_norm": 1.4722168445587158,
      "learning_rate": 1.2733333333333334e-06,
      "loss": 0.445,
      "step": 2810
    },
    {
      "epoch": 14.064837905236908,
      "grad_norm": 1.6725043058395386,
      "learning_rate": 1.2066666666666668e-06,
      "loss": 0.4313,
      "step": 2820
    },
    {
      "epoch": 14.114713216957606,
      "grad_norm": 1.4227081537246704,
      "learning_rate": 1.14e-06,
      "loss": 0.4181,
      "step": 2830
    },
    {
      "epoch": 14.164588528678305,
      "grad_norm": 1.5610967874526978,
      "learning_rate": 1.0733333333333334e-06,
      "loss": 0.4229,
      "step": 2840
    },
    {
      "epoch": 14.214463840399002,
      "grad_norm": 1.520051121711731,
      "learning_rate": 1.0066666666666668e-06,
      "loss": 0.4301,
      "step": 2850
    },
    {
      "epoch": 14.264339152119701,
      "grad_norm": 1.4826945066452026,
      "learning_rate": 9.400000000000001e-07,
      "loss": 0.443,
      "step": 2860
    },
    {
      "epoch": 14.314214463840399,
      "grad_norm": 1.6496617794036865,
      "learning_rate": 8.733333333333334e-07,
      "loss": 0.4109,
      "step": 2870
    },
    {
      "epoch": 14.364089775561098,
      "grad_norm": 1.472607135772705,
      "learning_rate": 8.066666666666667e-07,
      "loss": 0.4075,
      "step": 2880
    },
    {
      "epoch": 14.413965087281795,
      "grad_norm": 1.4754953384399414,
      "learning_rate": 7.4e-07,
      "loss": 0.4257,
      "step": 2890
    },
    {
      "epoch": 14.463840399002494,
      "grad_norm": 1.6782726049423218,
      "learning_rate": 6.733333333333334e-07,
      "loss": 0.4099,
      "step": 2900
    },
    {
      "epoch": 14.513715710723192,
      "grad_norm": 1.4635963439941406,
      "learning_rate": 6.066666666666668e-07,
      "loss": 0.3977,
      "step": 2910
    },
    {
      "epoch": 14.563591022443891,
      "grad_norm": 1.5924144983291626,
      "learning_rate": 5.4e-07,
      "loss": 0.4181,
      "step": 2920
    },
    {
      "epoch": 14.613466334164588,
      "grad_norm": 1.346475601196289,
      "learning_rate": 4.7333333333333334e-07,
      "loss": 0.4132,
      "step": 2930
    },
    {
      "epoch": 14.663341645885287,
      "grad_norm": 1.4001176357269287,
      "learning_rate": 4.0666666666666666e-07,
      "loss": 0.4007,
      "step": 2940
    },
    {
      "epoch": 14.713216957605985,
      "grad_norm": 1.6243798732757568,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.4332,
      "step": 2950
    },
    {
      "epoch": 14.763092269326684,
      "grad_norm": 1.5413304567337036,
      "learning_rate": 2.7333333333333335e-07,
      "loss": 0.4083,
      "step": 2960
    },
    {
      "epoch": 14.812967581047381,
      "grad_norm": 1.4332594871520996,
      "learning_rate": 2.066666666666667e-07,
      "loss": 0.4206,
      "step": 2970
    },
    {
      "epoch": 14.86284289276808,
      "grad_norm": 1.4844541549682617,
      "learning_rate": 1.4e-07,
      "loss": 0.4285,
      "step": 2980
    },
    {
      "epoch": 14.912718204488778,
      "grad_norm": 1.471591830253601,
      "learning_rate": 7.333333333333334e-08,
      "loss": 0.4118,
      "step": 2990
    },
    {
      "epoch": 14.962593516209477,
      "grad_norm": 1.3903006315231323,
      "learning_rate": 6.666666666666667e-09,
      "loss": 0.4145,
      "step": 3000
    }
  ],
  "logging_steps": 10,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.48818315264e+16,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}