{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6081081081081081, "eval_steps": 30, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033783783783783786, "grad_norm": 1.4937225174709319, "learning_rate": 2.5e-06, "loss": 2.1185, "step": 1 }, { "epoch": 0.0033783783783783786, "eval_loss": 2.0952866077423096, "eval_runtime": 187.5733, "eval_samples_per_second": 0.267, "eval_steps_per_second": 0.069, "step": 1 }, { "epoch": 0.006756756756756757, "grad_norm": 1.442277502174201, "learning_rate": 5e-06, "loss": 2.1679, "step": 2 }, { "epoch": 0.010135135135135136, "grad_norm": 1.4361264107689526, "learning_rate": 7.5e-06, "loss": 2.1475, "step": 3 }, { "epoch": 0.013513513513513514, "grad_norm": 1.5925067561400204, "learning_rate": 1e-05, "loss": 2.1111, "step": 4 }, { "epoch": 0.016891891891891893, "grad_norm": 1.5729343116389016, "learning_rate": 1.25e-05, "loss": 2.1772, "step": 5 }, { "epoch": 0.02027027027027027, "grad_norm": 0.77386625695677, "learning_rate": 1.5e-05, "loss": 2.1445, "step": 6 }, { "epoch": 0.02364864864864865, "grad_norm": 0.9687092422157498, "learning_rate": 1.75e-05, "loss": 2.0929, "step": 7 }, { "epoch": 0.02702702702702703, "grad_norm": 0.5745308246234537, "learning_rate": 2e-05, "loss": 2.1085, "step": 8 }, { "epoch": 0.030405405405405407, "grad_norm": 0.7880273258575979, "learning_rate": 2.25e-05, "loss": 2.1717, "step": 9 }, { "epoch": 0.033783783783783786, "grad_norm": 0.5987395071382063, "learning_rate": 2.5e-05, "loss": 2.074, "step": 10 }, { "epoch": 0.037162162162162164, "grad_norm": 0.8312307017045393, "learning_rate": 2.7500000000000004e-05, "loss": 2.2338, "step": 11 }, { "epoch": 0.04054054054054054, "grad_norm": 0.7733440491356615, "learning_rate": 3e-05, "loss": 2.2148, "step": 12 }, { "epoch": 0.04391891891891892, "grad_norm": 0.7510878886656168, "learning_rate": 3.2500000000000004e-05, "loss": 2.243, "step": 13 }, { "epoch": 0.0472972972972973, "grad_norm": 0.6210806018904155, "learning_rate": 3.5e-05, "loss": 1.9863, "step": 14 }, { "epoch": 0.05067567567567568, "grad_norm": 1.018373738736693, "learning_rate": 3.7500000000000003e-05, "loss": 2.0294, "step": 15 }, { "epoch": 0.05405405405405406, "grad_norm": 0.771759792529477, "learning_rate": 4e-05, "loss": 2.0353, "step": 16 }, { "epoch": 0.057432432432432436, "grad_norm": 0.5625682320002908, "learning_rate": 4.25e-05, "loss": 2.0775, "step": 17 }, { "epoch": 0.060810810810810814, "grad_norm": 0.5157565411105806, "learning_rate": 4.5e-05, "loss": 2.1791, "step": 18 }, { "epoch": 0.06418918918918919, "grad_norm": 0.8173537204599863, "learning_rate": 4.75e-05, "loss": 2.0549, "step": 19 }, { "epoch": 0.06756756756756757, "grad_norm": 2.5851976259061664, "learning_rate": 5e-05, "loss": 2.0757, "step": 20 }, { "epoch": 0.07094594594594594, "grad_norm": 0.5130578194248145, "learning_rate": 4.999854243002125e-05, "loss": 2.2505, "step": 21 }, { "epoch": 0.07432432432432433, "grad_norm": 0.5786770218696188, "learning_rate": 4.999416990893036e-05, "loss": 2.2533, "step": 22 }, { "epoch": 0.0777027027027027, "grad_norm": 0.8219446198677083, "learning_rate": 4.998688300323891e-05, "loss": 2.1092, "step": 23 }, { "epoch": 0.08108108108108109, "grad_norm": 0.7123342305131811, "learning_rate": 4.997668265705137e-05, "loss": 2.3369, "step": 24 }, { "epoch": 0.08445945945945946, "grad_norm": 0.44584577341815756, "learning_rate": 4.9963570191942696e-05, "loss": 2.0125, "step": 25 }, { "epoch": 0.08783783783783784, "grad_norm": 0.5308641065159894, "learning_rate": 4.994754730678713e-05, "loss": 2.0653, "step": 26 }, { "epoch": 0.09121621621621621, "grad_norm": 0.4833891468070717, "learning_rate": 4.992861607753817e-05, "loss": 2.0177, "step": 27 }, { "epoch": 0.0945945945945946, "grad_norm": 0.4487741255853019, "learning_rate": 4.9906778956959454e-05, "loss": 1.9773, "step": 28 }, { "epoch": 0.09797297297297297, "grad_norm": 0.7070413556606907, "learning_rate": 4.988203877430713e-05, "loss": 2.0163, "step": 29 }, { "epoch": 0.10135135135135136, "grad_norm": 0.6433406570980086, "learning_rate": 4.985439873496321e-05, "loss": 2.055, "step": 30 }, { "epoch": 0.10135135135135136, "eval_loss": 2.022883653640747, "eval_runtime": 194.2976, "eval_samples_per_second": 0.257, "eval_steps_per_second": 0.067, "step": 30 }, { "epoch": 0.10472972972972973, "grad_norm": 1.9573381693992367, "learning_rate": 4.982386242002024e-05, "loss": 2.3371, "step": 31 }, { "epoch": 0.10810810810810811, "grad_norm": 1.6578265269127788, "learning_rate": 4.979043378581744e-05, "loss": 2.1288, "step": 32 }, { "epoch": 0.11148648648648649, "grad_norm": 0.5544836623908992, "learning_rate": 4.975411716342802e-05, "loss": 2.1887, "step": 33 }, { "epoch": 0.11486486486486487, "grad_norm": 0.5455040506142, "learning_rate": 4.971491725809807e-05, "loss": 2.1214, "step": 34 }, { "epoch": 0.11824324324324324, "grad_norm": 0.5065209751020103, "learning_rate": 4.967283914863693e-05, "loss": 2.1692, "step": 35 }, { "epoch": 0.12162162162162163, "grad_norm": 0.6453232517165945, "learning_rate": 4.96278882867592e-05, "loss": 2.0598, "step": 36 }, { "epoch": 0.125, "grad_norm": 1.1731862208697716, "learning_rate": 4.9580070496378364e-05, "loss": 2.2156, "step": 37 }, { "epoch": 0.12837837837837837, "grad_norm": 0.7667101172051888, "learning_rate": 4.952939197285227e-05, "loss": 2.1392, "step": 38 }, { "epoch": 0.13175675675675674, "grad_norm": 0.41781856442196297, "learning_rate": 4.947585928218041e-05, "loss": 2.1534, "step": 39 }, { "epoch": 0.13513513513513514, "grad_norm": 0.5945736341225079, "learning_rate": 4.9419479360153286e-05, "loss": 1.9795, "step": 40 }, { "epoch": 0.13851351351351351, "grad_norm": 0.6467707595603454, "learning_rate": 4.936025951145368e-05, "loss": 2.0017, "step": 41 }, { "epoch": 0.14189189189189189, "grad_norm": 0.5109600024808127, "learning_rate": 4.929820740871039e-05, "loss": 2.2144, "step": 42 }, { "epoch": 0.14527027027027026, "grad_norm": 1.0548277894819795, "learning_rate": 4.9233331091504034e-05, "loss": 2.0657, "step": 43 }, { "epoch": 0.14864864864864866, "grad_norm": 0.6580975841596574, "learning_rate": 4.916563896532549e-05, "loss": 2.1431, "step": 44 }, { "epoch": 0.15202702702702703, "grad_norm": 0.3988619886160755, "learning_rate": 4.9095139800486824e-05, "loss": 2.0123, "step": 45 }, { "epoch": 0.1554054054054054, "grad_norm": 0.5209858406284475, "learning_rate": 4.9021842730985036e-05, "loss": 2.2487, "step": 46 }, { "epoch": 0.15878378378378377, "grad_norm": 0.5433634333987769, "learning_rate": 4.894575725331862e-05, "loss": 2.1736, "step": 47 }, { "epoch": 0.16216216216216217, "grad_norm": 0.5109855249570185, "learning_rate": 4.886689322525719e-05, "loss": 2.0823, "step": 48 }, { "epoch": 0.16554054054054054, "grad_norm": 0.6194335580813743, "learning_rate": 4.878526086456426e-05, "loss": 2.1036, "step": 49 }, { "epoch": 0.16891891891891891, "grad_norm": 1.0780413583147488, "learning_rate": 4.8700870747673466e-05, "loss": 2.0302, "step": 50 }, { "epoch": 0.17229729729729729, "grad_norm": 0.580340495769622, "learning_rate": 4.8613733808318204e-05, "loss": 2.1776, "step": 51 }, { "epoch": 0.17567567567567569, "grad_norm": 0.8129429382764808, "learning_rate": 4.85238613361151e-05, "loss": 2.0965, "step": 52 }, { "epoch": 0.17905405405405406, "grad_norm": 0.7779782864994534, "learning_rate": 4.8431264975101245e-05, "loss": 2.1582, "step": 53 }, { "epoch": 0.18243243243243243, "grad_norm": 0.8528745581545691, "learning_rate": 4.8335956722225616e-05, "loss": 2.1511, "step": 54 }, { "epoch": 0.1858108108108108, "grad_norm": 0.49188324413548323, "learning_rate": 4.823794892579471e-05, "loss": 2.1583, "step": 55 }, { "epoch": 0.1891891891891892, "grad_norm": 0.724195478643155, "learning_rate": 4.8137254283872696e-05, "loss": 1.9438, "step": 56 }, { "epoch": 0.19256756756756757, "grad_norm": 0.8754624301347438, "learning_rate": 4.803388584263618e-05, "loss": 2.1349, "step": 57 }, { "epoch": 0.19594594594594594, "grad_norm": 0.4870044791836658, "learning_rate": 4.7927856994684e-05, "loss": 2.0239, "step": 58 }, { "epoch": 0.19932432432432431, "grad_norm": 0.5041896039366629, "learning_rate": 4.781918147730199e-05, "loss": 2.0841, "step": 59 }, { "epoch": 0.20270270270270271, "grad_norm": 0.4874068239264915, "learning_rate": 4.7707873370683163e-05, "loss": 2.1407, "step": 60 }, { "epoch": 0.20270270270270271, "eval_loss": 2.015751838684082, "eval_runtime": 196.268, "eval_samples_per_second": 0.255, "eval_steps_per_second": 0.066, "step": 60 }, { "epoch": 0.20608108108108109, "grad_norm": 0.5424018707478311, "learning_rate": 4.75939470961035e-05, "loss": 2.186, "step": 61 }, { "epoch": 0.20945945945945946, "grad_norm": 0.5115180976703219, "learning_rate": 4.747741741405344e-05, "loss": 2.2014, "step": 62 }, { "epoch": 0.21283783783783783, "grad_norm": 0.5058558197015601, "learning_rate": 4.735829942232555e-05, "loss": 2.0927, "step": 63 }, { "epoch": 0.21621621621621623, "grad_norm": 0.8021043767946636, "learning_rate": 4.7236608554058375e-05, "loss": 2.1884, "step": 64 }, { "epoch": 0.2195945945945946, "grad_norm": 0.5293842363703689, "learning_rate": 4.711236057573691e-05, "loss": 2.0714, "step": 65 }, { "epoch": 0.22297297297297297, "grad_norm": 0.48722486629288786, "learning_rate": 4.6985571585149876e-05, "loss": 2.0562, "step": 66 }, { "epoch": 0.22635135135135134, "grad_norm": 0.4279248526312935, "learning_rate": 4.685625800930406e-05, "loss": 2.0847, "step": 67 }, { "epoch": 0.22972972972972974, "grad_norm": 0.5130161800768928, "learning_rate": 4.6724436602296e-05, "loss": 2.0617, "step": 68 }, { "epoch": 0.23310810810810811, "grad_norm": 0.7460272297512813, "learning_rate": 4.659012444314128e-05, "loss": 2.1029, "step": 69 }, { "epoch": 0.23648648648648649, "grad_norm": 0.4943699960691202, "learning_rate": 4.645333893356176e-05, "loss": 1.9948, "step": 70 }, { "epoch": 0.23986486486486486, "grad_norm": 0.6350670407835639, "learning_rate": 4.6314097795731e-05, "loss": 2.0935, "step": 71 }, { "epoch": 0.24324324324324326, "grad_norm": 0.542940777570525, "learning_rate": 4.6172419069978065e-05, "loss": 2.0267, "step": 72 }, { "epoch": 0.24662162162162163, "grad_norm": 0.4567422527911003, "learning_rate": 4.602832111245029e-05, "loss": 1.9971, "step": 73 }, { "epoch": 0.25, "grad_norm": 0.4699317313725709, "learning_rate": 4.5881822592734946e-05, "loss": 2.1758, "step": 74 }, { "epoch": 0.2533783783783784, "grad_norm": 0.6913326755097541, "learning_rate": 4.573294249144041e-05, "loss": 2.1574, "step": 75 }, { "epoch": 0.25675675675675674, "grad_norm": 0.4362574963180428, "learning_rate": 4.5581700097737015e-05, "loss": 2.0498, "step": 76 }, { "epoch": 0.26013513513513514, "grad_norm": 0.6932798791839507, "learning_rate": 4.542811500685785e-05, "loss": 2.0777, "step": 77 }, { "epoch": 0.2635135135135135, "grad_norm": 0.4469936094893796, "learning_rate": 4.527220711756007e-05, "loss": 2.1046, "step": 78 }, { "epoch": 0.2668918918918919, "grad_norm": 0.5112163825070385, "learning_rate": 4.511399662954667e-05, "loss": 2.0955, "step": 79 }, { "epoch": 0.2702702702702703, "grad_norm": 0.4272672573543929, "learning_rate": 4.4953504040849445e-05, "loss": 2.1927, "step": 80 }, { "epoch": 0.27364864864864863, "grad_norm": 0.4203647309881028, "learning_rate": 4.479075014517321e-05, "loss": 2.0421, "step": 81 }, { "epoch": 0.27702702702702703, "grad_norm": 0.5896256230009808, "learning_rate": 4.462575602920171e-05, "loss": 2.109, "step": 82 }, { "epoch": 0.28040540540540543, "grad_norm": 0.7482696034746834, "learning_rate": 4.445854306986563e-05, "loss": 2.0851, "step": 83 }, { "epoch": 0.28378378378378377, "grad_norm": 0.508289422303338, "learning_rate": 4.428913293157293e-05, "loss": 2.1759, "step": 84 }, { "epoch": 0.28716216216216217, "grad_norm": 0.4749189097462539, "learning_rate": 4.411754756340198e-05, "loss": 2.0721, "step": 85 }, { "epoch": 0.2905405405405405, "grad_norm": 0.493594282927666, "learning_rate": 4.3943809196257794e-05, "loss": 2.0351, "step": 86 }, { "epoch": 0.2939189189189189, "grad_norm": 0.4836545743981663, "learning_rate": 4.376794033999177e-05, "loss": 2.0896, "step": 87 }, { "epoch": 0.2972972972972973, "grad_norm": 0.47767898533003106, "learning_rate": 4.358996378048524e-05, "loss": 2.083, "step": 88 }, { "epoch": 0.30067567567567566, "grad_norm": 0.4390064059005752, "learning_rate": 4.340990257669732e-05, "loss": 2.0627, "step": 89 }, { "epoch": 0.30405405405405406, "grad_norm": 0.4601711355647231, "learning_rate": 4.3227780057677345e-05, "loss": 2.0997, "step": 90 }, { "epoch": 0.30405405405405406, "eval_loss": 2.011120080947876, "eval_runtime": 190.8365, "eval_samples_per_second": 0.262, "eval_steps_per_second": 0.068, "step": 90 }, { "epoch": 0.30743243243243246, "grad_norm": 0.5654017461216198, "learning_rate": 4.304361981954231e-05, "loss": 2.2149, "step": 91 }, { "epoch": 0.3108108108108108, "grad_norm": 0.5064873219371222, "learning_rate": 4.285744572241972e-05, "loss": 2.1093, "step": 92 }, { "epoch": 0.3141891891891892, "grad_norm": 0.46470652155232134, "learning_rate": 4.266928188735621e-05, "loss": 2.1098, "step": 93 }, { "epoch": 0.31756756756756754, "grad_norm": 0.4413493288478021, "learning_rate": 4.247915269319241e-05, "loss": 2.1431, "step": 94 }, { "epoch": 0.32094594594594594, "grad_norm": 1.0020586721524896, "learning_rate": 4.2287082773404386e-05, "loss": 2.0877, "step": 95 }, { "epoch": 0.32432432432432434, "grad_norm": 0.4848089444845991, "learning_rate": 4.209309701291201e-05, "loss": 2.1175, "step": 96 }, { "epoch": 0.3277027027027027, "grad_norm": 0.5555159336979827, "learning_rate": 4.189722054485492e-05, "loss": 2.1563, "step": 97 }, { "epoch": 0.3310810810810811, "grad_norm": 0.3802895433591588, "learning_rate": 4.169947874733619e-05, "loss": 2.0732, "step": 98 }, { "epoch": 0.3344594594594595, "grad_norm": 0.7320601930371831, "learning_rate": 4.149989724013425e-05, "loss": 2.1452, "step": 99 }, { "epoch": 0.33783783783783783, "grad_norm": 0.8602357197050848, "learning_rate": 4.1298501881383624e-05, "loss": 2.1766, "step": 100 }, { "epoch": 0.34121621621621623, "grad_norm": 0.5831812051877154, "learning_rate": 4.109531876422463e-05, "loss": 2.1593, "step": 101 }, { "epoch": 0.34459459459459457, "grad_norm": 0.7665489432929005, "learning_rate": 4.089037421342277e-05, "loss": 2.0295, "step": 102 }, { "epoch": 0.34797297297297297, "grad_norm": 0.45337471468141477, "learning_rate": 4.0683694781958e-05, "loss": 2.1087, "step": 103 }, { "epoch": 0.35135135135135137, "grad_norm": 0.4591955107711257, "learning_rate": 4.047530724758451e-05, "loss": 2.0354, "step": 104 }, { "epoch": 0.3547297297297297, "grad_norm": 0.7505711753697653, "learning_rate": 4.026523860936132e-05, "loss": 2.0258, "step": 105 }, { "epoch": 0.3581081081081081, "grad_norm": 0.589397231315979, "learning_rate": 4.005351608415426e-05, "loss": 2.0051, "step": 106 }, { "epoch": 0.3614864864864865, "grad_norm": 0.6872824203092515, "learning_rate": 3.9840167103109675e-05, "loss": 1.9355, "step": 107 }, { "epoch": 0.36486486486486486, "grad_norm": 0.45802674220232215, "learning_rate": 3.9625219308100455e-05, "loss": 2.0836, "step": 108 }, { "epoch": 0.36824324324324326, "grad_norm": 0.5524787455898191, "learning_rate": 3.940870054814462e-05, "loss": 2.0474, "step": 109 }, { "epoch": 0.3716216216216216, "grad_norm": 0.38416177227514725, "learning_rate": 3.919063887579726e-05, "loss": 2.0985, "step": 110 }, { "epoch": 0.375, "grad_norm": 0.45960539321926364, "learning_rate": 3.897106254351587e-05, "loss": 2.0129, "step": 111 }, { "epoch": 0.3783783783783784, "grad_norm": 0.37861955022931376, "learning_rate": 3.875e-05, "loss": 2.1427, "step": 112 }, { "epoch": 0.38175675675675674, "grad_norm": 0.581523763120522, "learning_rate": 3.852747988650539e-05, "loss": 2.0318, "step": 113 }, { "epoch": 0.38513513513513514, "grad_norm": 0.3504600135974039, "learning_rate": 3.83035310331331e-05, "loss": 2.033, "step": 114 }, { "epoch": 0.3885135135135135, "grad_norm": 0.4242352094167052, "learning_rate": 3.807818245509429e-05, "loss": 2.034, "step": 115 }, { "epoch": 0.3918918918918919, "grad_norm": 0.4250673616692747, "learning_rate": 3.785146334895093e-05, "loss": 2.1702, "step": 116 }, { "epoch": 0.3952702702702703, "grad_norm": 3.2751318573643617, "learning_rate": 3.762340308883302e-05, "loss": 2.1026, "step": 117 }, { "epoch": 0.39864864864864863, "grad_norm": 0.48584128738743915, "learning_rate": 3.739403122263288e-05, "loss": 2.22, "step": 118 }, { "epoch": 0.40202702702702703, "grad_norm": 0.4187766943285619, "learning_rate": 3.716337746817685e-05, "loss": 2.1541, "step": 119 }, { "epoch": 0.40540540540540543, "grad_norm": 0.5051183151449187, "learning_rate": 3.6931471709374946e-05, "loss": 2.1113, "step": 120 }, { "epoch": 0.40540540540540543, "eval_loss": 2.009887218475342, "eval_runtime": 193.8883, "eval_samples_per_second": 0.258, "eval_steps_per_second": 0.067, "step": 120 }, { "epoch": 0.40878378378378377, "grad_norm": 0.40423483975901836, "learning_rate": 3.669834399234913e-05, "loss": 2.0209, "step": 121 }, { "epoch": 0.41216216216216217, "grad_norm": 0.3825955755250103, "learning_rate": 3.646402452154043e-05, "loss": 2.074, "step": 122 }, { "epoch": 0.4155405405405405, "grad_norm": 1.4013949742645142, "learning_rate": 3.622854365579561e-05, "loss": 2.1807, "step": 123 }, { "epoch": 0.4189189189189189, "grad_norm": 0.43240866134882766, "learning_rate": 3.5991931904433824e-05, "loss": 1.9988, "step": 124 }, { "epoch": 0.4222972972972973, "grad_norm": 0.4777714379483413, "learning_rate": 3.575421992329377e-05, "loss": 2.0801, "step": 125 }, { "epoch": 0.42567567567567566, "grad_norm": 0.49357523405169307, "learning_rate": 3.551543851076188e-05, "loss": 2.0035, "step": 126 }, { "epoch": 0.42905405405405406, "grad_norm": 0.3447821776586979, "learning_rate": 3.5275618603782006e-05, "loss": 2.0435, "step": 127 }, { "epoch": 0.43243243243243246, "grad_norm": 0.3718901049854852, "learning_rate": 3.503479127384719e-05, "loss": 1.8828, "step": 128 }, { "epoch": 0.4358108108108108, "grad_norm": 0.3888324394883177, "learning_rate": 3.479298772297398e-05, "loss": 2.1094, "step": 129 }, { "epoch": 0.4391891891891892, "grad_norm": 0.5267690929558577, "learning_rate": 3.4550239279659854e-05, "loss": 2.1593, "step": 130 }, { "epoch": 0.44256756756756754, "grad_norm": 0.4636315581971976, "learning_rate": 3.4306577394824207e-05, "loss": 2.1605, "step": 131 }, { "epoch": 0.44594594594594594, "grad_norm": 0.3968487601273563, "learning_rate": 3.406203363773356e-05, "loss": 2.0023, "step": 132 }, { "epoch": 0.44932432432432434, "grad_norm": 0.392460435962094, "learning_rate": 3.381663969191137e-05, "loss": 2.1517, "step": 133 }, { "epoch": 0.4527027027027027, "grad_norm": 0.4031589419917055, "learning_rate": 3.3570427351033046e-05, "loss": 2.0701, "step": 134 }, { "epoch": 0.4560810810810811, "grad_norm": 0.4253231691696294, "learning_rate": 3.332342851480672e-05, "loss": 2.0944, "step": 135 }, { "epoch": 0.4594594594594595, "grad_norm": 1.089550042036897, "learning_rate": 3.307567518484025e-05, "loss": 2.1879, "step": 136 }, { "epoch": 0.46283783783783783, "grad_norm": 0.5433174961880026, "learning_rate": 3.282719946049505e-05, "loss": 2.1142, "step": 137 }, { "epoch": 0.46621621621621623, "grad_norm": 0.47522024279968983, "learning_rate": 3.257803353472724e-05, "loss": 2.0765, "step": 138 }, { "epoch": 0.46959459459459457, "grad_norm": 0.6397031743657627, "learning_rate": 3.232820968991664e-05, "loss": 2.0565, "step": 139 }, { "epoch": 0.47297297297297297, "grad_norm": 0.8231847588094275, "learning_rate": 3.207776029368427e-05, "loss": 2.1382, "step": 140 }, { "epoch": 0.47635135135135137, "grad_norm": 0.4277815234715947, "learning_rate": 3.1826717794698635e-05, "loss": 2.1454, "step": 141 }, { "epoch": 0.4797297297297297, "grad_norm": 0.395176536808308, "learning_rate": 3.157511471847176e-05, "loss": 2.0293, "step": 142 }, { "epoch": 0.4831081081081081, "grad_norm": 0.42124877734068217, "learning_rate": 3.1322983663145e-05, "loss": 2.0417, "step": 143 }, { "epoch": 0.4864864864864865, "grad_norm": 0.6103856586396839, "learning_rate": 3.107035729526566e-05, "loss": 2.0703, "step": 144 }, { "epoch": 0.48986486486486486, "grad_norm": 0.40802626443551787, "learning_rate": 3.081726834555458e-05, "loss": 1.9202, "step": 145 }, { "epoch": 0.49324324324324326, "grad_norm": 0.4038507240265364, "learning_rate": 3.0563749604665556e-05, "loss": 2.1105, "step": 146 }, { "epoch": 0.4966216216216216, "grad_norm": 0.39745914058435033, "learning_rate": 3.0309833918936865e-05, "loss": 2.0854, "step": 147 }, { "epoch": 0.5, "grad_norm": 0.39094920659893395, "learning_rate": 3.0055554186135688e-05, "loss": 2.1638, "step": 148 }, { "epoch": 0.5033783783783784, "grad_norm": 0.4684851166837971, "learning_rate": 2.980094335119577e-05, "loss": 2.0368, "step": 149 }, { "epoch": 0.5067567567567568, "grad_norm": 0.8309495298034889, "learning_rate": 2.9546034401949064e-05, "loss": 2.1695, "step": 150 }, { "epoch": 0.5067567567567568, "eval_loss": 2.0075740814208984, "eval_runtime": 193.1213, "eval_samples_per_second": 0.259, "eval_steps_per_second": 0.067, "step": 150 }, { "epoch": 0.5101351351351351, "grad_norm": 0.5583192813687627, "learning_rate": 2.9290860364851702e-05, "loss": 2.0582, "step": 151 }, { "epoch": 0.5135135135135135, "grad_norm": 0.42473795943255627, "learning_rate": 2.90354543007051e-05, "loss": 2.0871, "step": 152 }, { "epoch": 0.5168918918918919, "grad_norm": 0.4247509513157766, "learning_rate": 2.877984930037251e-05, "loss": 2.0964, "step": 153 }, { "epoch": 0.5202702702702703, "grad_norm": 0.8778087000903096, "learning_rate": 2.8524078480491684e-05, "loss": 2.0506, "step": 154 }, { "epoch": 0.5236486486486487, "grad_norm": 0.501109409670388, "learning_rate": 2.826817497918428e-05, "loss": 2.1435, "step": 155 }, { "epoch": 0.527027027027027, "grad_norm": 0.7730432872928119, "learning_rate": 2.8012171951762378e-05, "loss": 2.0909, "step": 156 }, { "epoch": 0.5304054054054054, "grad_norm": 0.4007388361090252, "learning_rate": 2.7756102566432845e-05, "loss": 2.1613, "step": 157 }, { "epoch": 0.5337837837837838, "grad_norm": 0.4121213099878934, "learning_rate": 2.7500000000000004e-05, "loss": 2.2213, "step": 158 }, { "epoch": 0.5371621621621622, "grad_norm": 0.4237936256648311, "learning_rate": 2.7243897433567157e-05, "loss": 2.1795, "step": 159 }, { "epoch": 0.5405405405405406, "grad_norm": 0.41107097005560067, "learning_rate": 2.6987828048237624e-05, "loss": 2.0895, "step": 160 }, { "epoch": 0.543918918918919, "grad_norm": 0.4463337134292773, "learning_rate": 2.6731825020815725e-05, "loss": 2.046, "step": 161 }, { "epoch": 0.5472972972972973, "grad_norm": 0.5230967905508315, "learning_rate": 2.6475921519508325e-05, "loss": 2.0374, "step": 162 }, { "epoch": 0.5506756756756757, "grad_norm": 0.414839716539812, "learning_rate": 2.62201506996275e-05, "loss": 2.2485, "step": 163 }, { "epoch": 0.5540540540540541, "grad_norm": 0.38087462228816815, "learning_rate": 2.5964545699294906e-05, "loss": 2.0365, "step": 164 }, { "epoch": 0.5574324324324325, "grad_norm": 0.5305482547516058, "learning_rate": 2.570913963514831e-05, "loss": 2.0701, "step": 165 }, { "epoch": 0.5608108108108109, "grad_norm": 0.39988875104823635, "learning_rate": 2.5453965598050944e-05, "loss": 2.1985, "step": 166 }, { "epoch": 0.5641891891891891, "grad_norm": 0.6054380331009478, "learning_rate": 2.5199056648804233e-05, "loss": 2.1136, "step": 167 }, { "epoch": 0.5675675675675675, "grad_norm": 0.39275163056643564, "learning_rate": 2.4944445813864314e-05, "loss": 2.0735, "step": 168 }, { "epoch": 0.5709459459459459, "grad_norm": 0.7078296605563206, "learning_rate": 2.469016608106315e-05, "loss": 2.1578, "step": 169 }, { "epoch": 0.5743243243243243, "grad_norm": 0.439519908152304, "learning_rate": 2.443625039533446e-05, "loss": 2.0469, "step": 170 }, { "epoch": 0.5777027027027027, "grad_norm": 0.47171431683346926, "learning_rate": 2.4182731654445427e-05, "loss": 2.0609, "step": 171 }, { "epoch": 0.581081081081081, "grad_norm": 0.43725040423496425, "learning_rate": 2.3929642704734347e-05, "loss": 2.1686, "step": 172 }, { "epoch": 0.5844594594594594, "grad_norm": 0.5156240873689197, "learning_rate": 2.3677016336855002e-05, "loss": 2.111, "step": 173 }, { "epoch": 0.5878378378378378, "grad_norm": 0.478909789809296, "learning_rate": 2.3424885281528248e-05, "loss": 2.0993, "step": 174 }, { "epoch": 0.5912162162162162, "grad_norm": 0.8822684350527039, "learning_rate": 2.3173282205301367e-05, "loss": 2.091, "step": 175 }, { "epoch": 0.5945945945945946, "grad_norm": 0.5005747298379083, "learning_rate": 2.2922239706315745e-05, "loss": 2.0998, "step": 176 }, { "epoch": 0.597972972972973, "grad_norm": 0.38147794073978136, "learning_rate": 2.2671790310083364e-05, "loss": 2.0433, "step": 177 }, { "epoch": 0.6013513513513513, "grad_norm": 0.449696856682795, "learning_rate": 2.2421966465272765e-05, "loss": 1.9547, "step": 178 }, { "epoch": 0.6047297297297297, "grad_norm": 0.4046647546809551, "learning_rate": 2.217280053950495e-05, "loss": 2.0109, "step": 179 }, { "epoch": 0.6081081081081081, "grad_norm": 0.44319480569143777, "learning_rate": 2.1924324815159757e-05, "loss": 2.0782, "step": 180 }, { "epoch": 0.6081081081081081, "eval_loss": 2.006296157836914, "eval_runtime": 194.8007, "eval_samples_per_second": 0.257, "eval_steps_per_second": 0.067, "step": 180 } ], "logging_steps": 1, "max_steps": 296, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 308899416637440.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }