{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.686635944700461, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04608294930875576, "grad_norm": 5.774722576141357, "learning_rate": 4.999970619519374e-05, "loss": 0.9942, "num_input_tokens_seen": 6072, "step": 5 }, { "epoch": 0.09216589861751152, "grad_norm": 4.252433776855469, "learning_rate": 4.9998824787680656e-05, "loss": 0.6317, "num_input_tokens_seen": 11928, "step": 10 }, { "epoch": 0.1382488479262673, "grad_norm": 1.8097654581069946, "learning_rate": 4.999735579817769e-05, "loss": 0.305, "num_input_tokens_seen": 18040, "step": 15 }, { "epoch": 0.18433179723502305, "grad_norm": 2.511850118637085, "learning_rate": 4.9995299261212536e-05, "loss": 0.1734, "num_input_tokens_seen": 24016, "step": 20 }, { "epoch": 0.2304147465437788, "grad_norm": 4.347301006317139, "learning_rate": 4.999265522512283e-05, "loss": 0.2654, "num_input_tokens_seen": 29848, "step": 25 }, { "epoch": 0.2764976958525346, "grad_norm": 1.3125089406967163, "learning_rate": 4.998942375205502e-05, "loss": 0.1326, "num_input_tokens_seen": 35440, "step": 30 }, { "epoch": 0.3225806451612903, "grad_norm": 2.6110188961029053, "learning_rate": 4.998560491796287e-05, "loss": 0.0996, "num_input_tokens_seen": 41360, "step": 35 }, { "epoch": 0.3686635944700461, "grad_norm": 2.0633397102355957, "learning_rate": 4.998119881260576e-05, "loss": 0.1085, "num_input_tokens_seen": 47016, "step": 40 }, { "epoch": 0.4147465437788018, "grad_norm": 1.6892552375793457, "learning_rate": 4.997620553954645e-05, "loss": 0.1384, "num_input_tokens_seen": 53056, "step": 45 }, { "epoch": 0.4608294930875576, "grad_norm": 2.2047786712646484, "learning_rate": 4.997062521614876e-05, "loss": 0.1176, "num_input_tokens_seen": 58952, "step": 50 }, { "epoch": 0.5069124423963134, "grad_norm": 2.8315374851226807, "learning_rate": 4.996445797357477e-05, "loss": 0.0888, "num_input_tokens_seen": 64760, "step": 55 }, { "epoch": 0.5529953917050692, "grad_norm": 2.423229932785034, "learning_rate": 4.995770395678171e-05, "loss": 0.0812, "num_input_tokens_seen": 70672, "step": 60 }, { "epoch": 0.5990783410138248, "grad_norm": 2.2459795475006104, "learning_rate": 4.9950363324518584e-05, "loss": 0.1233, "num_input_tokens_seen": 76608, "step": 65 }, { "epoch": 0.6451612903225806, "grad_norm": 2.267782211303711, "learning_rate": 4.9942436249322444e-05, "loss": 0.1446, "num_input_tokens_seen": 82336, "step": 70 }, { "epoch": 0.6912442396313364, "grad_norm": 2.2236413955688477, "learning_rate": 4.993392291751431e-05, "loss": 0.0736, "num_input_tokens_seen": 88120, "step": 75 }, { "epoch": 0.7373271889400922, "grad_norm": 1.833316683769226, "learning_rate": 4.99248235291948e-05, "loss": 0.1129, "num_input_tokens_seen": 93808, "step": 80 }, { "epoch": 0.783410138248848, "grad_norm": 2.8768177032470703, "learning_rate": 4.991513829823945e-05, "loss": 0.1426, "num_input_tokens_seen": 99928, "step": 85 }, { "epoch": 0.8294930875576036, "grad_norm": 2.096641778945923, "learning_rate": 4.990486745229364e-05, "loss": 0.1401, "num_input_tokens_seen": 106288, "step": 90 }, { "epoch": 0.8755760368663594, "grad_norm": 2.371711492538452, "learning_rate": 4.9894011232767294e-05, "loss": 0.1127, "num_input_tokens_seen": 112136, "step": 95 }, { "epoch": 0.9216589861751152, "grad_norm": 0.6439377665519714, "learning_rate": 4.9882569894829144e-05, "loss": 0.0524, "num_input_tokens_seen": 117936, "step": 100 }, { "epoch": 0.967741935483871, "grad_norm": 2.412799835205078, "learning_rate": 4.987054370740083e-05, "loss": 0.0728, "num_input_tokens_seen": 124152, "step": 105 }, { "epoch": 1.0138248847926268, "grad_norm": 2.985877513885498, "learning_rate": 4.9857932953150465e-05, "loss": 0.0449, "num_input_tokens_seen": 129872, "step": 110 }, { "epoch": 1.0599078341013826, "grad_norm": 3.5899670124053955, "learning_rate": 4.984473792848607e-05, "loss": 0.0506, "num_input_tokens_seen": 136080, "step": 115 }, { "epoch": 1.1059907834101383, "grad_norm": 2.940323829650879, "learning_rate": 4.983095894354858e-05, "loss": 0.0431, "num_input_tokens_seen": 141712, "step": 120 }, { "epoch": 1.1520737327188941, "grad_norm": 2.948409080505371, "learning_rate": 4.981659632220455e-05, "loss": 0.0318, "num_input_tokens_seen": 147544, "step": 125 }, { "epoch": 1.1981566820276497, "grad_norm": 1.2344759702682495, "learning_rate": 4.9801650402038555e-05, "loss": 0.0593, "num_input_tokens_seen": 153344, "step": 130 }, { "epoch": 1.2442396313364055, "grad_norm": 3.271662473678589, "learning_rate": 4.9786121534345265e-05, "loss": 0.0617, "num_input_tokens_seen": 159160, "step": 135 }, { "epoch": 1.2903225806451613, "grad_norm": 3.5447471141815186, "learning_rate": 4.977001008412113e-05, "loss": 0.0456, "num_input_tokens_seen": 164760, "step": 140 }, { "epoch": 1.336405529953917, "grad_norm": 4.001632213592529, "learning_rate": 4.9753316430055894e-05, "loss": 0.068, "num_input_tokens_seen": 170624, "step": 145 }, { "epoch": 1.3824884792626728, "grad_norm": 1.6239632368087769, "learning_rate": 4.973604096452361e-05, "loss": 0.0383, "num_input_tokens_seen": 176480, "step": 150 }, { "epoch": 1.4285714285714286, "grad_norm": 0.19192902743816376, "learning_rate": 4.9718184093573475e-05, "loss": 0.0351, "num_input_tokens_seen": 182056, "step": 155 }, { "epoch": 1.4746543778801844, "grad_norm": 3.1995279788970947, "learning_rate": 4.969974623692023e-05, "loss": 0.1139, "num_input_tokens_seen": 188032, "step": 160 }, { "epoch": 1.52073732718894, "grad_norm": 2.926187038421631, "learning_rate": 4.9680727827934354e-05, "loss": 0.0738, "num_input_tokens_seen": 194112, "step": 165 }, { "epoch": 1.5668202764976957, "grad_norm": 1.8316713571548462, "learning_rate": 4.966112931363185e-05, "loss": 0.0567, "num_input_tokens_seen": 200368, "step": 170 }, { "epoch": 1.6129032258064515, "grad_norm": 2.283552885055542, "learning_rate": 4.964095115466373e-05, "loss": 0.0793, "num_input_tokens_seen": 206520, "step": 175 }, { "epoch": 1.6589861751152073, "grad_norm": 2.575171709060669, "learning_rate": 4.962019382530521e-05, "loss": 0.0564, "num_input_tokens_seen": 212328, "step": 180 }, { "epoch": 1.705069124423963, "grad_norm": 3.201814651489258, "learning_rate": 4.959885781344452e-05, "loss": 0.0677, "num_input_tokens_seen": 218456, "step": 185 }, { "epoch": 1.7511520737327189, "grad_norm": 2.607283592224121, "learning_rate": 4.9576943620571507e-05, "loss": 0.0474, "num_input_tokens_seen": 224240, "step": 190 }, { "epoch": 1.7972350230414746, "grad_norm": 1.2559226751327515, "learning_rate": 4.9554451761765766e-05, "loss": 0.053, "num_input_tokens_seen": 230456, "step": 195 }, { "epoch": 1.8433179723502304, "grad_norm": 1.7003673315048218, "learning_rate": 4.953138276568462e-05, "loss": 0.0477, "num_input_tokens_seen": 236448, "step": 200 }, { "epoch": 1.8894009216589862, "grad_norm": 2.2908496856689453, "learning_rate": 4.950773717455061e-05, "loss": 0.0581, "num_input_tokens_seen": 242536, "step": 205 }, { "epoch": 1.935483870967742, "grad_norm": 3.7818005084991455, "learning_rate": 4.948351554413879e-05, "loss": 0.0583, "num_input_tokens_seen": 248352, "step": 210 }, { "epoch": 1.9815668202764978, "grad_norm": 1.7777669429779053, "learning_rate": 4.945871844376369e-05, "loss": 0.0245, "num_input_tokens_seen": 253928, "step": 215 }, { "epoch": 2.0276497695852536, "grad_norm": 1.5329607725143433, "learning_rate": 4.94333464562659e-05, "loss": 0.0354, "num_input_tokens_seen": 259904, "step": 220 }, { "epoch": 2.0737327188940093, "grad_norm": 2.1847078800201416, "learning_rate": 4.940740017799833e-05, "loss": 0.0352, "num_input_tokens_seen": 265656, "step": 225 }, { "epoch": 2.119815668202765, "grad_norm": 1.6991156339645386, "learning_rate": 4.938088021881233e-05, "loss": 0.0416, "num_input_tokens_seen": 271576, "step": 230 }, { "epoch": 2.165898617511521, "grad_norm": 3.675462484359741, "learning_rate": 4.935378720204319e-05, "loss": 0.047, "num_input_tokens_seen": 277464, "step": 235 }, { "epoch": 2.2119815668202767, "grad_norm": 1.8100879192352295, "learning_rate": 4.9326121764495596e-05, "loss": 0.0454, "num_input_tokens_seen": 283592, "step": 240 }, { "epoch": 2.258064516129032, "grad_norm": 2.6867613792419434, "learning_rate": 4.929788455642864e-05, "loss": 0.0647, "num_input_tokens_seen": 289544, "step": 245 }, { "epoch": 2.3041474654377883, "grad_norm": 0.8391556143760681, "learning_rate": 4.9269076241540505e-05, "loss": 0.0283, "num_input_tokens_seen": 295480, "step": 250 }, { "epoch": 2.3502304147465436, "grad_norm": 3.0261058807373047, "learning_rate": 4.92396974969529e-05, "loss": 0.0301, "num_input_tokens_seen": 301680, "step": 255 }, { "epoch": 2.3963133640552994, "grad_norm": 1.320726990699768, "learning_rate": 4.920974901319515e-05, "loss": 0.0416, "num_input_tokens_seen": 307768, "step": 260 }, { "epoch": 2.442396313364055, "grad_norm": 3.625941753387451, "learning_rate": 4.917923149418792e-05, "loss": 0.0504, "num_input_tokens_seen": 313376, "step": 265 }, { "epoch": 2.488479262672811, "grad_norm": 1.6434024572372437, "learning_rate": 4.914814565722671e-05, "loss": 0.0499, "num_input_tokens_seen": 319592, "step": 270 }, { "epoch": 2.5345622119815667, "grad_norm": 2.392733573913574, "learning_rate": 4.911649223296499e-05, "loss": 0.03, "num_input_tokens_seen": 325392, "step": 275 }, { "epoch": 2.5806451612903225, "grad_norm": 3.644463539123535, "learning_rate": 4.9084271965397014e-05, "loss": 0.0416, "num_input_tokens_seen": 331264, "step": 280 }, { "epoch": 2.6267281105990783, "grad_norm": 1.160701870918274, "learning_rate": 4.905148561184033e-05, "loss": 0.0639, "num_input_tokens_seen": 337584, "step": 285 }, { "epoch": 2.672811059907834, "grad_norm": 2.189955711364746, "learning_rate": 4.901813394291801e-05, "loss": 0.048, "num_input_tokens_seen": 343632, "step": 290 }, { "epoch": 2.71889400921659, "grad_norm": 2.4015591144561768, "learning_rate": 4.898421774254051e-05, "loss": 0.0465, "num_input_tokens_seen": 349392, "step": 295 }, { "epoch": 2.7649769585253456, "grad_norm": 1.5051063299179077, "learning_rate": 4.894973780788722e-05, "loss": 0.0512, "num_input_tokens_seen": 355104, "step": 300 }, { "epoch": 2.8110599078341014, "grad_norm": 2.82021427154541, "learning_rate": 4.891469494938781e-05, "loss": 0.0412, "num_input_tokens_seen": 360824, "step": 305 }, { "epoch": 2.857142857142857, "grad_norm": 1.578285574913025, "learning_rate": 4.887908999070308e-05, "loss": 0.0292, "num_input_tokens_seen": 366792, "step": 310 }, { "epoch": 2.903225806451613, "grad_norm": 2.392061710357666, "learning_rate": 4.884292376870567e-05, "loss": 0.0264, "num_input_tokens_seen": 372536, "step": 315 }, { "epoch": 2.9493087557603688, "grad_norm": 0.46661177277565, "learning_rate": 4.880619713346039e-05, "loss": 0.0402, "num_input_tokens_seen": 378472, "step": 320 }, { "epoch": 2.9953917050691246, "grad_norm": 2.8422093391418457, "learning_rate": 4.876891094820417e-05, "loss": 0.0214, "num_input_tokens_seen": 383984, "step": 325 }, { "epoch": 3.0414746543778803, "grad_norm": 1.581181526184082, "learning_rate": 4.873106608932585e-05, "loss": 0.0536, "num_input_tokens_seen": 389896, "step": 330 }, { "epoch": 3.087557603686636, "grad_norm": 2.1210877895355225, "learning_rate": 4.869266344634556e-05, "loss": 0.0295, "num_input_tokens_seen": 395848, "step": 335 }, { "epoch": 3.133640552995392, "grad_norm": 0.6473267078399658, "learning_rate": 4.8653703921893766e-05, "loss": 0.0209, "num_input_tokens_seen": 401784, "step": 340 }, { "epoch": 3.1797235023041477, "grad_norm": 3.651721239089966, "learning_rate": 4.8614188431690125e-05, "loss": 0.0348, "num_input_tokens_seen": 407656, "step": 345 }, { "epoch": 3.225806451612903, "grad_norm": 1.2039521932601929, "learning_rate": 4.85741179045219e-05, "loss": 0.0167, "num_input_tokens_seen": 413344, "step": 350 }, { "epoch": 3.271889400921659, "grad_norm": 2.5414485931396484, "learning_rate": 4.853349328222219e-05, "loss": 0.0567, "num_input_tokens_seen": 419112, "step": 355 }, { "epoch": 3.3179723502304146, "grad_norm": 0.12772409617900848, "learning_rate": 4.849231551964771e-05, "loss": 0.0483, "num_input_tokens_seen": 425104, "step": 360 }, { "epoch": 3.3640552995391704, "grad_norm": 1.686139702796936, "learning_rate": 4.845058558465645e-05, "loss": 0.0379, "num_input_tokens_seen": 430920, "step": 365 }, { "epoch": 3.410138248847926, "grad_norm": 1.3768738508224487, "learning_rate": 4.840830445808483e-05, "loss": 0.0447, "num_input_tokens_seen": 436864, "step": 370 }, { "epoch": 3.456221198156682, "grad_norm": 1.8329992294311523, "learning_rate": 4.836547313372471e-05, "loss": 0.0417, "num_input_tokens_seen": 442880, "step": 375 }, { "epoch": 3.5023041474654377, "grad_norm": 2.0811145305633545, "learning_rate": 4.832209261830002e-05, "loss": 0.0368, "num_input_tokens_seen": 448536, "step": 380 }, { "epoch": 3.5483870967741935, "grad_norm": 0.14438925683498383, "learning_rate": 4.827816393144305e-05, "loss": 0.0269, "num_input_tokens_seen": 454664, "step": 385 }, { "epoch": 3.5944700460829493, "grad_norm": 2.131544828414917, "learning_rate": 4.823368810567056e-05, "loss": 0.0387, "num_input_tokens_seen": 460584, "step": 390 }, { "epoch": 3.640552995391705, "grad_norm": 2.2278664112091064, "learning_rate": 4.818866618635947e-05, "loss": 0.0365, "num_input_tokens_seen": 466352, "step": 395 }, { "epoch": 3.686635944700461, "grad_norm": 0.2778138220310211, "learning_rate": 4.814309923172227e-05, "loss": 0.0394, "num_input_tokens_seen": 472288, "step": 400 } ], "logging_steps": 5, "max_steps": 3240, "num_input_tokens_seen": 472288, "num_train_epochs": 30, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5598346769498112.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }