{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1134,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.026455026455026454, "grad_norm": 7.5692021410659995, "learning_rate": 5e-06, "loss": 1.0155, "step": 10 },
    { "epoch": 0.05291005291005291, "grad_norm": 2.6174349537450885, "learning_rate": 5e-06, "loss": 0.9187, "step": 20 },
    { "epoch": 0.07936507936507936, "grad_norm": 2.793450551829053, "learning_rate": 5e-06, "loss": 0.8889, "step": 30 },
    { "epoch": 0.10582010582010581, "grad_norm": 1.0752828617491306, "learning_rate": 5e-06, "loss": 0.8728, "step": 40 },
    { "epoch": 0.13227513227513227, "grad_norm": 0.7890229586623537, "learning_rate": 5e-06, "loss": 0.8553, "step": 50 },
    { "epoch": 0.15873015873015872, "grad_norm": 0.8515898940332954, "learning_rate": 5e-06, "loss": 0.8472, "step": 60 },
    { "epoch": 0.18518518518518517, "grad_norm": 0.6739899043810001, "learning_rate": 5e-06, "loss": 0.8386, "step": 70 },
    { "epoch": 0.21164021164021163, "grad_norm": 0.7472386351456437, "learning_rate": 5e-06, "loss": 0.8303, "step": 80 },
    { "epoch": 0.23809523809523808, "grad_norm": 0.8201164599585603, "learning_rate": 5e-06, "loss": 0.8296, "step": 90 },
    { "epoch": 0.26455026455026454, "grad_norm": 0.5974248203060147, "learning_rate": 5e-06, "loss": 0.8208, "step": 100 },
    { "epoch": 0.291005291005291, "grad_norm": 0.7418126551089665, "learning_rate": 5e-06, "loss": 0.817, "step": 110 },
    { "epoch": 0.31746031746031744, "grad_norm": 0.8971788447098652, "learning_rate": 5e-06, "loss": 0.816, "step": 120 },
    { "epoch": 0.3439153439153439, "grad_norm": 0.6659038218094219, "learning_rate": 5e-06, "loss": 0.8095, "step": 130 },
    { "epoch": 0.37037037037037035, "grad_norm": 0.8687534608310304, "learning_rate": 5e-06, "loss": 0.809, "step": 140 },
    { "epoch": 0.3968253968253968, "grad_norm": 0.6997183066642144, "learning_rate": 5e-06, "loss": 0.8116, "step": 150 },
    { "epoch": 0.42328042328042326, "grad_norm": 0.6784743741082218, "learning_rate": 5e-06, "loss": 0.8043, "step": 160 },
    { "epoch": 0.4497354497354497, "grad_norm": 0.8279542209712195, "learning_rate": 5e-06, "loss": 0.8081, "step": 170 },
    { "epoch": 0.47619047619047616, "grad_norm": 0.6448467413029932, "learning_rate": 5e-06, "loss": 0.806, "step": 180 },
    { "epoch": 0.5026455026455027, "grad_norm": 0.5414700848521968, "learning_rate": 5e-06, "loss": 0.8021, "step": 190 },
    { "epoch": 0.5291005291005291, "grad_norm": 0.6833581882793117, "learning_rate": 5e-06, "loss": 0.7978, "step": 200 },
    { "epoch": 0.5555555555555556, "grad_norm": 0.9216374614283346, "learning_rate": 5e-06, "loss": 0.7997, "step": 210 },
    { "epoch": 0.582010582010582, "grad_norm": 0.6017660770757399, "learning_rate": 5e-06, "loss": 0.7942, "step": 220 },
    { "epoch": 0.6084656084656085, "grad_norm": 0.6412167679844537, "learning_rate": 5e-06, "loss": 0.791, "step": 230 },
    { "epoch": 0.6349206349206349, "grad_norm": 0.6772848668775633, "learning_rate": 5e-06, "loss": 0.7948, "step": 240 },
    { "epoch": 0.6613756613756614, "grad_norm": 0.600355639535383, "learning_rate": 5e-06, "loss": 0.7914, "step": 250 },
    { "epoch": 0.6878306878306878, "grad_norm": 0.5606573561606166, "learning_rate": 5e-06, "loss": 0.7941, "step": 260 },
    { "epoch": 0.7142857142857143, "grad_norm": 0.5274683257315145, "learning_rate": 5e-06, "loss": 0.7934, "step": 270 },
    { "epoch": 0.7407407407407407, "grad_norm": 0.5268307259184533, "learning_rate": 5e-06, "loss": 0.7938, "step": 280 },
    { "epoch": 0.7671957671957672, "grad_norm": 0.7728039906376213, "learning_rate": 5e-06, "loss": 0.7917, "step": 290 },
    { "epoch": 0.7936507936507936, "grad_norm": 0.5113816020074538, "learning_rate": 5e-06, "loss": 0.7882, "step": 300 },
    { "epoch": 0.8201058201058201, "grad_norm": 0.5190008909833674, "learning_rate": 5e-06, "loss": 0.7849, "step": 310 },
    { "epoch": 0.8465608465608465, "grad_norm": 0.5836368027639713, "learning_rate": 5e-06, "loss": 0.7884, "step": 320 },
    { "epoch": 0.873015873015873, "grad_norm": 0.7393083910981807, "learning_rate": 5e-06, "loss": 0.7832, "step": 330 },
    { "epoch": 0.8994708994708994, "grad_norm": 0.6790621837196994, "learning_rate": 5e-06, "loss": 0.7889, "step": 340 },
    { "epoch": 0.9259259259259259, "grad_norm": 0.7554140140474603, "learning_rate": 5e-06, "loss": 0.7915, "step": 350 },
    { "epoch": 0.9523809523809523, "grad_norm": 0.5810605450609724, "learning_rate": 5e-06, "loss": 0.7863, "step": 360 },
    { "epoch": 0.9788359788359788, "grad_norm": 0.6061224991845839, "learning_rate": 5e-06, "loss": 0.7836, "step": 370 },
    { "epoch": 1.0, "eval_loss": 0.7853822112083435, "eval_runtime": 36.7907, "eval_samples_per_second": 276.537, "eval_steps_per_second": 1.087, "step": 378 },
    { "epoch": 1.0052910052910053, "grad_norm": 0.8837267831248354, "learning_rate": 5e-06, "loss": 0.7782, "step": 380 },
    { "epoch": 1.0317460317460316, "grad_norm": 0.8327957648034358, "learning_rate": 5e-06, "loss": 0.752, "step": 390 },
    { "epoch": 1.0582010582010581, "grad_norm": 0.7661996686677544, "learning_rate": 5e-06, "loss": 0.7504, "step": 400 },
    { "epoch": 1.0846560846560847, "grad_norm": 0.574247744750553, "learning_rate": 5e-06, "loss": 0.753, "step": 410 },
    { "epoch": 1.1111111111111112, "grad_norm": 0.6270570009050412, "learning_rate": 5e-06, "loss": 0.7526, "step": 420 },
    { "epoch": 1.1375661375661377, "grad_norm": 0.7564791753598593, "learning_rate": 5e-06, "loss": 0.7499, "step": 430 },
    { "epoch": 1.164021164021164, "grad_norm": 0.6019767178166228, "learning_rate": 5e-06, "loss": 0.7534, "step": 440 },
    { "epoch": 1.1904761904761905, "grad_norm": 0.7045276656881013, "learning_rate": 5e-06, "loss": 0.7516, "step": 450 },
    { "epoch": 1.216931216931217, "grad_norm": 0.7151521174595956, "learning_rate": 5e-06, "loss": 0.7525, "step": 460 },
    { "epoch": 1.2433862433862433, "grad_norm": 0.8589885709685776, "learning_rate": 5e-06, "loss": 0.7506, "step": 470 },
    { "epoch": 1.2698412698412698, "grad_norm": 0.8448779205925924, "learning_rate": 5e-06, "loss": 0.7529, "step": 480 },
    { "epoch": 1.2962962962962963, "grad_norm": 0.698173785850267, "learning_rate": 5e-06, "loss": 0.7523, "step": 490 },
    { "epoch": 1.3227513227513228, "grad_norm": 0.5810999760014718, "learning_rate": 5e-06, "loss": 0.7493, "step": 500 },
    { "epoch": 1.3492063492063493, "grad_norm": 0.6992760350263512, "learning_rate": 5e-06, "loss": 0.7468, "step": 510 },
    { "epoch": 1.3756613756613756, "grad_norm": 0.6741207559975058, "learning_rate": 5e-06, "loss": 0.7498, "step": 520 },
    { "epoch": 1.402116402116402, "grad_norm": 0.5507423746197616, "learning_rate": 5e-06, "loss": 0.7523, "step": 530 },
    { "epoch": 1.4285714285714286, "grad_norm": 0.6210883025743585, "learning_rate": 5e-06, "loss": 0.7491, "step": 540 },
    { "epoch": 1.455026455026455, "grad_norm": 0.5352886813564971, "learning_rate": 5e-06, "loss": 0.7501, "step": 550 },
    { "epoch": 1.4814814814814814, "grad_norm": 0.6312193974549839, "learning_rate": 5e-06, "loss": 0.7523, "step": 560 },
    { "epoch": 1.507936507936508, "grad_norm": 0.5240777963700096, "learning_rate": 5e-06, "loss": 0.7506, "step": 570 },
    { "epoch": 1.5343915343915344, "grad_norm": 0.6713658440893504, "learning_rate": 5e-06, "loss": 0.7498, "step": 580 },
    { "epoch": 1.560846560846561, "grad_norm": 0.5581636551389646, "learning_rate": 5e-06, "loss": 0.749, "step": 590 },
    { "epoch": 1.5873015873015874, "grad_norm": 0.5510394311493787, "learning_rate": 5e-06, "loss": 0.7507, "step": 600 },
    { "epoch": 1.6137566137566137, "grad_norm": 0.7490035349717284, "learning_rate": 5e-06, "loss": 0.751, "step": 610 },
    { "epoch": 1.6402116402116402, "grad_norm": 0.7019577853473584, "learning_rate": 5e-06, "loss": 0.7452, "step": 620 },
    { "epoch": 1.6666666666666665, "grad_norm": 0.5188517209828246, "learning_rate": 5e-06, "loss": 0.747, "step": 630 },
    { "epoch": 1.693121693121693, "grad_norm": 0.5852910158185847, "learning_rate": 5e-06, "loss": 0.7478, "step": 640 },
    { "epoch": 1.7195767195767195, "grad_norm": 0.5163509954560109, "learning_rate": 5e-06, "loss": 0.7493, "step": 650 },
    { "epoch": 1.746031746031746, "grad_norm": 0.5259629378155002, "learning_rate": 5e-06, "loss": 0.7488, "step": 660 },
    { "epoch": 1.7724867724867726, "grad_norm": 0.5847699777528805, "learning_rate": 5e-06, "loss": 0.7484, "step": 670 },
    { "epoch": 1.798941798941799, "grad_norm": 0.6909699114073832, "learning_rate": 5e-06, "loss": 0.7425, "step": 680 },
    { "epoch": 1.8253968253968254, "grad_norm": 0.5026266608814028, "learning_rate": 5e-06, "loss": 0.7481, "step": 690 },
    { "epoch": 1.8518518518518519, "grad_norm": 0.4905863322036327, "learning_rate": 5e-06, "loss": 0.7427, "step": 700 },
    { "epoch": 1.8783068783068781, "grad_norm": 0.4940567635795736, "learning_rate": 5e-06, "loss": 0.7468, "step": 710 },
    { "epoch": 1.9047619047619047, "grad_norm": 0.5002276189718439, "learning_rate": 5e-06, "loss": 0.7411, "step": 720 },
    { "epoch": 1.9312169312169312, "grad_norm": 0.5990930220626486, "learning_rate": 5e-06, "loss": 0.7445, "step": 730 },
    { "epoch": 1.9576719576719577, "grad_norm": 0.575362618033918, "learning_rate": 5e-06, "loss": 0.746, "step": 740 },
    { "epoch": 1.9841269841269842, "grad_norm": 0.5971817140113224, "learning_rate": 5e-06, "loss": 0.7439, "step": 750 },
    { "epoch": 2.0, "eval_loss": 0.7724016904830933, "eval_runtime": 36.4294, "eval_samples_per_second": 279.28, "eval_steps_per_second": 1.098, "step": 756 },
    { "epoch": 2.0105820105820107, "grad_norm": 0.7946153532333404, "learning_rate": 5e-06, "loss": 0.7315, "step": 760 },
    { "epoch": 2.037037037037037, "grad_norm": 0.6230352829055962, "learning_rate": 5e-06, "loss": 0.7084, "step": 770 },
    { "epoch": 2.0634920634920633, "grad_norm": 0.6511688564166924, "learning_rate": 5e-06, "loss": 0.7103, "step": 780 },
    { "epoch": 2.0899470899470898, "grad_norm": 0.6162972908978676, "learning_rate": 5e-06, "loss": 0.7106, "step": 790 },
    { "epoch": 2.1164021164021163, "grad_norm": 0.5483322240242068, "learning_rate": 5e-06, "loss": 0.7103, "step": 800 },
    { "epoch": 2.142857142857143, "grad_norm": 0.5167695346041687, "learning_rate": 5e-06, "loss": 0.7118, "step": 810 },
    { "epoch": 2.1693121693121693, "grad_norm": 0.5731147019555838, "learning_rate": 5e-06, "loss": 0.7117, "step": 820 },
    { "epoch": 2.195767195767196, "grad_norm": 0.5992340825814363, "learning_rate": 5e-06, "loss": 0.7115, "step": 830 },
    { "epoch": 2.2222222222222223, "grad_norm": 0.8204374260544979, "learning_rate": 5e-06, "loss": 0.7116, "step": 840 },
    { "epoch": 2.248677248677249, "grad_norm": 0.6452436861380945, "learning_rate": 5e-06, "loss": 0.7132, "step": 850 },
    { "epoch": 2.2751322751322753, "grad_norm": 0.7905676999430032, "learning_rate": 5e-06, "loss": 0.7156, "step": 860 },
    { "epoch": 2.3015873015873014, "grad_norm": 0.6453047673638057, "learning_rate": 5e-06, "loss": 0.7134, "step": 870 },
    { "epoch": 2.328042328042328, "grad_norm": 0.6161389768819914, "learning_rate": 5e-06, "loss": 0.7182, "step": 880 },
    { "epoch": 2.3544973544973544, "grad_norm": 0.5925752010088033, "learning_rate": 5e-06, "loss": 0.7152, "step": 890 },
    { "epoch": 2.380952380952381, "grad_norm": 0.5465043223339282, "learning_rate": 5e-06, "loss": 0.7164, "step": 900 },
    { "epoch": 2.4074074074074074, "grad_norm": 0.5991686328257099, "learning_rate": 5e-06, "loss": 0.7169, "step": 910 },
    { "epoch": 2.433862433862434, "grad_norm": 0.5529948466057861, "learning_rate": 5e-06, "loss": 0.7134, "step": 920 },
    { "epoch": 2.4603174603174605, "grad_norm": 0.5700704291829093, "learning_rate": 5e-06, "loss": 0.714, "step": 930 },
    { "epoch": 2.4867724867724865, "grad_norm": 0.7107367519714186, "learning_rate": 5e-06, "loss": 0.7119, "step": 940 },
    { "epoch": 2.5132275132275135, "grad_norm": 0.5145994918143386, "learning_rate": 5e-06, "loss": 0.7164, "step": 950 },
    { "epoch": 2.5396825396825395, "grad_norm": 0.8526143229072503, "learning_rate": 5e-06, "loss": 0.7176, "step": 960 },
    { "epoch": 2.566137566137566, "grad_norm": 0.6460058195221838, "learning_rate": 5e-06, "loss": 0.718, "step": 970 },
    { "epoch": 2.5925925925925926, "grad_norm": 0.5722175187803165, "learning_rate": 5e-06, "loss": 0.7158, "step": 980 },
    { "epoch": 2.619047619047619, "grad_norm": 0.6199806409264405, "learning_rate": 5e-06, "loss": 0.7152, "step": 990 },
    { "epoch": 2.6455026455026456, "grad_norm": 0.6295374166579548, "learning_rate": 5e-06, "loss": 0.7167, "step": 1000 },
    { "epoch": 2.671957671957672, "grad_norm": 0.5562186015106506, "learning_rate": 5e-06, "loss": 0.7143, "step": 1010 },
    { "epoch": 2.6984126984126986, "grad_norm": 0.5802340156160444, "learning_rate": 5e-06, "loss": 0.7201, "step": 1020 },
    { "epoch": 2.7248677248677247, "grad_norm": 0.5674240362878424, "learning_rate": 5e-06, "loss": 0.7151, "step": 1030 },
    { "epoch": 2.751322751322751, "grad_norm": 0.5579240804629998, "learning_rate": 5e-06, "loss": 0.7113, "step": 1040 },
    { "epoch": 2.7777777777777777, "grad_norm": 0.5511568456212458, "learning_rate": 5e-06, "loss": 0.7153, "step": 1050 },
    { "epoch": 2.804232804232804, "grad_norm": 0.53261750815606, "learning_rate": 5e-06, "loss": 0.7147, "step": 1060 },
    { "epoch": 2.8306878306878307, "grad_norm": 0.601473528074606, "learning_rate": 5e-06, "loss": 0.7165, "step": 1070 },
    { "epoch": 2.857142857142857, "grad_norm": 0.4922870741419976, "learning_rate": 5e-06, "loss": 0.7147, "step": 1080 },
    { "epoch": 2.8835978835978837, "grad_norm": 0.5647388229398058, "learning_rate": 5e-06, "loss": 0.7146, "step": 1090 },
    { "epoch": 2.91005291005291, "grad_norm": 0.6422541209526703, "learning_rate": 5e-06, "loss": 0.7175, "step": 1100 },
    { "epoch": 2.9365079365079367, "grad_norm": 0.6977559447017284, "learning_rate": 5e-06, "loss": 0.7162, "step": 1110 },
    { "epoch": 2.962962962962963, "grad_norm": 0.7941392765996913, "learning_rate": 5e-06, "loss": 0.714, "step": 1120 },
    { "epoch": 2.9894179894179893, "grad_norm": 0.6766160989486909, "learning_rate": 5e-06, "loss": 0.7146, "step": 1130 },
    { "epoch": 3.0, "eval_loss": 0.7703806757926941, "eval_runtime": 35.7626, "eval_samples_per_second": 284.487, "eval_steps_per_second": 1.118, "step": 1134 },
    { "epoch": 3.0, "step": 1134, "total_flos": 1899492236328960.0, "train_loss": 0.7598387925923397, "train_runtime": 7118.4294, "train_samples_per_second": 81.465, "train_steps_per_second": 0.159 }
  ],
"logging_steps": 10, |
|
"max_steps": 1134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1899492236328960.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|