{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.40540540540540543, "eval_steps": 30, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033783783783783786, "grad_norm": 1.4937225174709319, "learning_rate": 2.5e-06, "loss": 2.1185, "step": 1 }, { "epoch": 0.0033783783783783786, "eval_loss": 2.0952866077423096, "eval_runtime": 187.5733, "eval_samples_per_second": 0.267, "eval_steps_per_second": 0.069, "step": 1 }, { "epoch": 0.006756756756756757, "grad_norm": 1.442277502174201, "learning_rate": 5e-06, "loss": 2.1679, "step": 2 }, { "epoch": 0.010135135135135136, "grad_norm": 1.4361264107689526, "learning_rate": 7.5e-06, "loss": 2.1475, "step": 3 }, { "epoch": 0.013513513513513514, "grad_norm": 1.5925067561400204, "learning_rate": 1e-05, "loss": 2.1111, "step": 4 }, { "epoch": 0.016891891891891893, "grad_norm": 1.5729343116389016, "learning_rate": 1.25e-05, "loss": 2.1772, "step": 5 }, { "epoch": 0.02027027027027027, "grad_norm": 0.77386625695677, "learning_rate": 1.5e-05, "loss": 2.1445, "step": 6 }, { "epoch": 0.02364864864864865, "grad_norm": 0.9687092422157498, "learning_rate": 1.75e-05, "loss": 2.0929, "step": 7 }, { "epoch": 0.02702702702702703, "grad_norm": 0.5745308246234537, "learning_rate": 2e-05, "loss": 2.1085, "step": 8 }, { "epoch": 0.030405405405405407, "grad_norm": 0.7880273258575979, "learning_rate": 2.25e-05, "loss": 2.1717, "step": 9 }, { "epoch": 0.033783783783783786, "grad_norm": 0.5987395071382063, "learning_rate": 2.5e-05, "loss": 2.074, "step": 10 }, { "epoch": 0.037162162162162164, "grad_norm": 0.8312307017045393, "learning_rate": 2.7500000000000004e-05, "loss": 2.2338, "step": 11 }, { "epoch": 0.04054054054054054, "grad_norm": 0.7733440491356615, "learning_rate": 3e-05, "loss": 2.2148, "step": 12 }, { "epoch": 0.04391891891891892, "grad_norm": 0.7510878886656168, "learning_rate": 3.2500000000000004e-05, "loss": 2.243, "step": 13 }, { "epoch": 0.0472972972972973, "grad_norm": 0.6210806018904155, "learning_rate": 3.5e-05, "loss": 1.9863, "step": 14 }, { "epoch": 0.05067567567567568, "grad_norm": 1.018373738736693, "learning_rate": 3.7500000000000003e-05, "loss": 2.0294, "step": 15 }, { "epoch": 0.05405405405405406, "grad_norm": 0.771759792529477, "learning_rate": 4e-05, "loss": 2.0353, "step": 16 }, { "epoch": 0.057432432432432436, "grad_norm": 0.5625682320002908, "learning_rate": 4.25e-05, "loss": 2.0775, "step": 17 }, { "epoch": 0.060810810810810814, "grad_norm": 0.5157565411105806, "learning_rate": 4.5e-05, "loss": 2.1791, "step": 18 }, { "epoch": 0.06418918918918919, "grad_norm": 0.8173537204599863, "learning_rate": 4.75e-05, "loss": 2.0549, "step": 19 }, { "epoch": 0.06756756756756757, "grad_norm": 2.5851976259061664, "learning_rate": 5e-05, "loss": 2.0757, "step": 20 }, { "epoch": 0.07094594594594594, "grad_norm": 0.5130578194248145, "learning_rate": 4.999854243002125e-05, "loss": 2.2505, "step": 21 }, { "epoch": 0.07432432432432433, "grad_norm": 0.5786770218696188, "learning_rate": 4.999416990893036e-05, "loss": 2.2533, "step": 22 }, { "epoch": 0.0777027027027027, "grad_norm": 0.8219446198677083, "learning_rate": 4.998688300323891e-05, "loss": 2.1092, "step": 23 }, { "epoch": 0.08108108108108109, "grad_norm": 0.7123342305131811, "learning_rate": 4.997668265705137e-05, "loss": 2.3369, "step": 24 }, { "epoch": 0.08445945945945946, "grad_norm": 0.44584577341815756, "learning_rate": 4.9963570191942696e-05, "loss": 2.0125, "step": 25 }, { "epoch": 0.08783783783783784, "grad_norm": 0.5308641065159894, "learning_rate": 4.994754730678713e-05, "loss": 2.0653, "step": 26 }, { "epoch": 0.09121621621621621, "grad_norm": 0.4833891468070717, "learning_rate": 4.992861607753817e-05, "loss": 2.0177, "step": 27 }, { "epoch": 0.0945945945945946, "grad_norm": 0.4487741255853019, "learning_rate": 4.9906778956959454e-05, "loss": 1.9773, "step": 28 }, { "epoch": 0.09797297297297297, "grad_norm": 0.7070413556606907, "learning_rate": 4.988203877430713e-05, "loss": 2.0163, "step": 29 }, { "epoch": 0.10135135135135136, "grad_norm": 0.6433406570980086, "learning_rate": 4.985439873496321e-05, "loss": 2.055, "step": 30 }, { "epoch": 0.10135135135135136, "eval_loss": 2.022883653640747, "eval_runtime": 194.2976, "eval_samples_per_second": 0.257, "eval_steps_per_second": 0.067, "step": 30 }, { "epoch": 0.10472972972972973, "grad_norm": 1.9573381693992367, "learning_rate": 4.982386242002024e-05, "loss": 2.3371, "step": 31 }, { "epoch": 0.10810810810810811, "grad_norm": 1.6578265269127788, "learning_rate": 4.979043378581744e-05, "loss": 2.1288, "step": 32 }, { "epoch": 0.11148648648648649, "grad_norm": 0.5544836623908992, "learning_rate": 4.975411716342802e-05, "loss": 2.1887, "step": 33 }, { "epoch": 0.11486486486486487, "grad_norm": 0.5455040506142, "learning_rate": 4.971491725809807e-05, "loss": 2.1214, "step": 34 }, { "epoch": 0.11824324324324324, "grad_norm": 0.5065209751020103, "learning_rate": 4.967283914863693e-05, "loss": 2.1692, "step": 35 }, { "epoch": 0.12162162162162163, "grad_norm": 0.6453232517165945, "learning_rate": 4.96278882867592e-05, "loss": 2.0598, "step": 36 }, { "epoch": 0.125, "grad_norm": 1.1731862208697716, "learning_rate": 4.9580070496378364e-05, "loss": 2.2156, "step": 37 }, { "epoch": 0.12837837837837837, "grad_norm": 0.7667101172051888, "learning_rate": 4.952939197285227e-05, "loss": 2.1392, "step": 38 }, { "epoch": 0.13175675675675674, "grad_norm": 0.41781856442196297, "learning_rate": 4.947585928218041e-05, "loss": 2.1534, "step": 39 }, { "epoch": 0.13513513513513514, "grad_norm": 0.5945736341225079, "learning_rate": 4.9419479360153286e-05, "loss": 1.9795, "step": 40 }, { "epoch": 0.13851351351351351, "grad_norm": 0.6467707595603454, "learning_rate": 4.936025951145368e-05, "loss": 2.0017, "step": 41 }, { "epoch": 0.14189189189189189, "grad_norm": 0.5109600024808127, "learning_rate": 4.929820740871039e-05, "loss": 2.2144, "step": 42 }, { "epoch": 0.14527027027027026, "grad_norm": 1.0548277894819795, "learning_rate": 4.9233331091504034e-05, "loss": 2.0657, "step": 43 }, { "epoch": 0.14864864864864866, "grad_norm": 0.6580975841596574, "learning_rate": 4.916563896532549e-05, "loss": 2.1431, "step": 44 }, { "epoch": 0.15202702702702703, "grad_norm": 0.3988619886160755, "learning_rate": 4.9095139800486824e-05, "loss": 2.0123, "step": 45 }, { "epoch": 0.1554054054054054, "grad_norm": 0.5209858406284475, "learning_rate": 4.9021842730985036e-05, "loss": 2.2487, "step": 46 }, { "epoch": 0.15878378378378377, "grad_norm": 0.5433634333987769, "learning_rate": 4.894575725331862e-05, "loss": 2.1736, "step": 47 }, { "epoch": 0.16216216216216217, "grad_norm": 0.5109855249570185, "learning_rate": 4.886689322525719e-05, "loss": 2.0823, "step": 48 }, { "epoch": 0.16554054054054054, "grad_norm": 0.6194335580813743, "learning_rate": 4.878526086456426e-05, "loss": 2.1036, "step": 49 }, { "epoch": 0.16891891891891891, "grad_norm": 1.0780413583147488, "learning_rate": 4.8700870747673466e-05, "loss": 2.0302, "step": 50 }, { "epoch": 0.17229729729729729, "grad_norm": 0.580340495769622, "learning_rate": 4.8613733808318204e-05, "loss": 2.1776, "step": 51 }, { "epoch": 0.17567567567567569, "grad_norm": 0.8129429382764808, "learning_rate": 4.85238613361151e-05, "loss": 2.0965, "step": 52 }, { "epoch": 0.17905405405405406, "grad_norm": 0.7779782864994534, "learning_rate": 4.8431264975101245e-05, "loss": 2.1582, "step": 53 }, { "epoch": 0.18243243243243243, "grad_norm": 0.8528745581545691, "learning_rate": 4.8335956722225616e-05, "loss": 2.1511, "step": 54 }, { "epoch": 0.1858108108108108, "grad_norm": 0.49188324413548323, "learning_rate": 4.823794892579471e-05, "loss": 2.1583, "step": 55 }, { "epoch": 0.1891891891891892, "grad_norm": 0.724195478643155, "learning_rate": 4.8137254283872696e-05, "loss": 1.9438, "step": 56 }, { "epoch": 0.19256756756756757, "grad_norm": 0.8754624301347438, "learning_rate": 4.803388584263618e-05, "loss": 2.1349, "step": 57 }, { "epoch": 0.19594594594594594, "grad_norm": 0.4870044791836658, "learning_rate": 4.7927856994684e-05, "loss": 2.0239, "step": 58 }, { "epoch": 0.19932432432432431, "grad_norm": 0.5041896039366629, "learning_rate": 4.781918147730199e-05, "loss": 2.0841, "step": 59 }, { "epoch": 0.20270270270270271, "grad_norm": 0.4874068239264915, "learning_rate": 4.7707873370683163e-05, "loss": 2.1407, "step": 60 }, { "epoch": 0.20270270270270271, "eval_loss": 2.015751838684082, "eval_runtime": 196.268, "eval_samples_per_second": 0.255, "eval_steps_per_second": 0.066, "step": 60 }, { "epoch": 0.20608108108108109, "grad_norm": 0.5424018707478311, "learning_rate": 4.75939470961035e-05, "loss": 2.186, "step": 61 }, { "epoch": 0.20945945945945946, "grad_norm": 0.5115180976703219, "learning_rate": 4.747741741405344e-05, "loss": 2.2014, "step": 62 }, { "epoch": 0.21283783783783783, "grad_norm": 0.5058558197015601, "learning_rate": 4.735829942232555e-05, "loss": 2.0927, "step": 63 }, { "epoch": 0.21621621621621623, "grad_norm": 0.8021043767946636, "learning_rate": 4.7236608554058375e-05, "loss": 2.1884, "step": 64 }, { "epoch": 0.2195945945945946, "grad_norm": 0.5293842363703689, "learning_rate": 4.711236057573691e-05, "loss": 2.0714, "step": 65 }, { "epoch": 0.22297297297297297, "grad_norm": 0.48722486629288786, "learning_rate": 4.6985571585149876e-05, "loss": 2.0562, "step": 66 }, { "epoch": 0.22635135135135134, "grad_norm": 0.4279248526312935, "learning_rate": 4.685625800930406e-05, "loss": 2.0847, "step": 67 }, { "epoch": 0.22972972972972974, "grad_norm": 0.5130161800768928, "learning_rate": 4.6724436602296e-05, "loss": 2.0617, "step": 68 }, { "epoch": 0.23310810810810811, "grad_norm": 0.7460272297512813, "learning_rate": 4.659012444314128e-05, "loss": 2.1029, "step": 69 }, { "epoch": 0.23648648648648649, "grad_norm": 0.4943699960691202, "learning_rate": 4.645333893356176e-05, "loss": 1.9948, "step": 70 }, { "epoch": 0.23986486486486486, "grad_norm": 0.6350670407835639, "learning_rate": 4.6314097795731e-05, "loss": 2.0935, "step": 71 }, { "epoch": 0.24324324324324326, "grad_norm": 0.542940777570525, "learning_rate": 4.6172419069978065e-05, "loss": 2.0267, "step": 72 }, { "epoch": 0.24662162162162163, "grad_norm": 0.4567422527911003, "learning_rate": 4.602832111245029e-05, "loss": 1.9971, "step": 73 }, { "epoch": 0.25, "grad_norm": 0.4699317313725709, "learning_rate": 4.5881822592734946e-05, "loss": 2.1758, "step": 74 }, { "epoch": 0.2533783783783784, "grad_norm": 0.6913326755097541, "learning_rate": 4.573294249144041e-05, "loss": 2.1574, "step": 75 }, { "epoch": 0.25675675675675674, "grad_norm": 0.4362574963180428, "learning_rate": 4.5581700097737015e-05, "loss": 2.0498, "step": 76 }, { "epoch": 0.26013513513513514, "grad_norm": 0.6932798791839507, "learning_rate": 4.542811500685785e-05, "loss": 2.0777, "step": 77 }, { "epoch": 0.2635135135135135, "grad_norm": 0.4469936094893796, "learning_rate": 4.527220711756007e-05, "loss": 2.1046, "step": 78 }, { "epoch": 0.2668918918918919, "grad_norm": 0.5112163825070385, "learning_rate": 4.511399662954667e-05, "loss": 2.0955, "step": 79 }, { "epoch": 0.2702702702702703, "grad_norm": 0.4272672573543929, "learning_rate": 4.4953504040849445e-05, "loss": 2.1927, "step": 80 }, { "epoch": 0.27364864864864863, "grad_norm": 0.4203647309881028, "learning_rate": 4.479075014517321e-05, "loss": 2.0421, "step": 81 }, { "epoch": 0.27702702702702703, "grad_norm": 0.5896256230009808, "learning_rate": 4.462575602920171e-05, "loss": 2.109, "step": 82 }, { "epoch": 0.28040540540540543, "grad_norm": 0.7482696034746834, "learning_rate": 4.445854306986563e-05, "loss": 2.0851, "step": 83 }, { "epoch": 0.28378378378378377, "grad_norm": 0.508289422303338, "learning_rate": 4.428913293157293e-05, "loss": 2.1759, "step": 84 }, { "epoch": 0.28716216216216217, "grad_norm": 0.4749189097462539, "learning_rate": 4.411754756340198e-05, "loss": 2.0721, "step": 85 }, { "epoch": 0.2905405405405405, "grad_norm": 0.493594282927666, "learning_rate": 4.3943809196257794e-05, "loss": 2.0351, "step": 86 }, { "epoch": 0.2939189189189189, "grad_norm": 0.4836545743981663, "learning_rate": 4.376794033999177e-05, "loss": 2.0896, "step": 87 }, { "epoch": 0.2972972972972973, "grad_norm": 0.47767898533003106, "learning_rate": 4.358996378048524e-05, "loss": 2.083, "step": 88 }, { "epoch": 0.30067567567567566, "grad_norm": 0.4390064059005752, "learning_rate": 4.340990257669732e-05, "loss": 2.0627, "step": 89 }, { "epoch": 0.30405405405405406, "grad_norm": 0.4601711355647231, "learning_rate": 4.3227780057677345e-05, "loss": 2.0997, "step": 90 }, { "epoch": 0.30405405405405406, "eval_loss": 2.011120080947876, "eval_runtime": 190.8365, "eval_samples_per_second": 0.262, "eval_steps_per_second": 0.068, "step": 90 }, { "epoch": 0.30743243243243246, "grad_norm": 0.5654017461216198, "learning_rate": 4.304361981954231e-05, "loss": 2.2149, "step": 91 }, { "epoch": 0.3108108108108108, "grad_norm": 0.5064873219371222, "learning_rate": 4.285744572241972e-05, "loss": 2.1093, "step": 92 }, { "epoch": 0.3141891891891892, "grad_norm": 0.46470652155232134, "learning_rate": 4.266928188735621e-05, "loss": 2.1098, "step": 93 }, { "epoch": 0.31756756756756754, "grad_norm": 0.4413493288478021, "learning_rate": 4.247915269319241e-05, "loss": 2.1431, "step": 94 }, { "epoch": 0.32094594594594594, "grad_norm": 1.0020586721524896, "learning_rate": 4.2287082773404386e-05, "loss": 2.0877, "step": 95 }, { "epoch": 0.32432432432432434, "grad_norm": 0.4848089444845991, "learning_rate": 4.209309701291201e-05, "loss": 2.1175, "step": 96 }, { "epoch": 0.3277027027027027, "grad_norm": 0.5555159336979827, "learning_rate": 4.189722054485492e-05, "loss": 2.1563, "step": 97 }, { "epoch": 0.3310810810810811, "grad_norm": 0.3802895433591588, "learning_rate": 4.169947874733619e-05, "loss": 2.0732, "step": 98 }, { "epoch": 0.3344594594594595, "grad_norm": 0.7320601930371831, "learning_rate": 4.149989724013425e-05, "loss": 2.1452, "step": 99 }, { "epoch": 0.33783783783783783, "grad_norm": 0.8602357197050848, "learning_rate": 4.1298501881383624e-05, "loss": 2.1766, "step": 100 }, { "epoch": 0.34121621621621623, "grad_norm": 0.5831812051877154, "learning_rate": 4.109531876422463e-05, "loss": 2.1593, "step": 101 }, { "epoch": 0.34459459459459457, "grad_norm": 0.7665489432929005, "learning_rate": 4.089037421342277e-05, "loss": 2.0295, "step": 102 }, { "epoch": 0.34797297297297297, "grad_norm": 0.45337471468141477, "learning_rate": 4.0683694781958e-05, "loss": 2.1087, "step": 103 }, { "epoch": 0.35135135135135137, "grad_norm": 0.4591955107711257, "learning_rate": 4.047530724758451e-05, "loss": 2.0354, "step": 104 }, { "epoch": 0.3547297297297297, "grad_norm": 0.7505711753697653, "learning_rate": 4.026523860936132e-05, "loss": 2.0258, "step": 105 }, { "epoch": 0.3581081081081081, "grad_norm": 0.589397231315979, "learning_rate": 4.005351608415426e-05, "loss": 2.0051, "step": 106 }, { "epoch": 0.3614864864864865, "grad_norm": 0.6872824203092515, "learning_rate": 3.9840167103109675e-05, "loss": 1.9355, "step": 107 }, { "epoch": 0.36486486486486486, "grad_norm": 0.45802674220232215, "learning_rate": 3.9625219308100455e-05, "loss": 2.0836, "step": 108 }, { "epoch": 0.36824324324324326, "grad_norm": 0.5524787455898191, "learning_rate": 3.940870054814462e-05, "loss": 2.0474, "step": 109 }, { "epoch": 0.3716216216216216, "grad_norm": 0.38416177227514725, "learning_rate": 3.919063887579726e-05, "loss": 2.0985, "step": 110 }, { "epoch": 0.375, "grad_norm": 0.45960539321926364, "learning_rate": 3.897106254351587e-05, "loss": 2.0129, "step": 111 }, { "epoch": 0.3783783783783784, "grad_norm": 0.37861955022931376, "learning_rate": 3.875e-05, "loss": 2.1427, "step": 112 }, { "epoch": 0.38175675675675674, "grad_norm": 0.581523763120522, "learning_rate": 3.852747988650539e-05, "loss": 2.0318, "step": 113 }, { "epoch": 0.38513513513513514, "grad_norm": 0.3504600135974039, "learning_rate": 3.83035310331331e-05, "loss": 2.033, "step": 114 }, { "epoch": 0.3885135135135135, "grad_norm": 0.4242352094167052, "learning_rate": 3.807818245509429e-05, "loss": 2.034, "step": 115 }, { "epoch": 0.3918918918918919, "grad_norm": 0.4250673616692747, "learning_rate": 3.785146334895093e-05, "loss": 2.1702, "step": 116 }, { "epoch": 0.3952702702702703, "grad_norm": 3.2751318573643617, "learning_rate": 3.762340308883302e-05, "loss": 2.1026, "step": 117 }, { "epoch": 0.39864864864864863, "grad_norm": 0.48584128738743915, "learning_rate": 3.739403122263288e-05, "loss": 2.22, "step": 118 }, { "epoch": 0.40202702702702703, "grad_norm": 0.4187766943285619, "learning_rate": 3.716337746817685e-05, "loss": 2.1541, "step": 119 }, { "epoch": 0.40540540540540543, "grad_norm": 0.5051183151449187, "learning_rate": 3.6931471709374946e-05, "loss": 2.1113, "step": 120 }, { "epoch": 0.40540540540540543, "eval_loss": 2.009887218475342, "eval_runtime": 193.8883, "eval_samples_per_second": 0.258, "eval_steps_per_second": 0.067, "step": 120 } ], "logging_steps": 1, "max_steps": 296, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 205932944424960.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }