{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998806967310904, "eval_steps": 500, "global_step": 10475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047721307563827246, "grad_norm": 3.318664073944092, "learning_rate": 9.541984732824428e-06, "loss": 2.2411, "step": 100 }, { "epoch": 0.09544261512765449, "grad_norm": 3.0969126224517822, "learning_rate": 1.9083969465648855e-05, "loss": 1.7476, "step": 200 }, { "epoch": 0.14316392269148176, "grad_norm": 3.136957883834839, "learning_rate": 2.862595419847328e-05, "loss": 1.6566, "step": 300 }, { "epoch": 0.19088523025530899, "grad_norm": 2.480194091796875, "learning_rate": 3.816793893129771e-05, "loss": 1.6178, "step": 400 }, { "epoch": 0.23860653781913624, "grad_norm": 3.3880791664123535, "learning_rate": 4.7709923664122144e-05, "loss": 1.5727, "step": 500 }, { "epoch": 0.2863278453829635, "grad_norm": 2.4437544345855713, "learning_rate": 5.725190839694656e-05, "loss": 1.5195, "step": 600 }, { "epoch": 0.3340491529467907, "grad_norm": 2.8805792331695557, "learning_rate": 6.6793893129771e-05, "loss": 1.4936, "step": 700 }, { "epoch": 0.38177046051061797, "grad_norm": 2.8311476707458496, "learning_rate": 7.633587786259542e-05, "loss": 1.4911, "step": 800 }, { "epoch": 0.4294917680744452, "grad_norm": 2.5717568397521973, "learning_rate": 8.587786259541986e-05, "loss": 1.4788, "step": 900 }, { "epoch": 0.4772130756382725, "grad_norm": 2.799070119857788, "learning_rate": 9.541984732824429e-05, "loss": 1.4754, "step": 1000 }, { "epoch": 0.5249343832020997, "grad_norm": 2.4839601516723633, "learning_rate": 9.999249261723811e-05, "loss": 1.4498, "step": 1100 }, { "epoch": 0.572655690765927, "grad_norm": 2.281714916229248, "learning_rate": 9.993586618853648e-05, "loss": 1.4283, "step": 1200 }, { "epoch": 0.6203769983297542, "grad_norm": 2.145409345626831, "learning_rate": 9.982378674980903e-05, "loss": 1.3999, "step": 1300 }, { "epoch": 0.6680983058935814, "grad_norm": 2.1347646713256836, "learning_rate": 9.965637876354568e-05, "loss": 1.4032, "step": 1400 }, { "epoch": 0.7158196134574087, "grad_norm": 2.2230374813079834, "learning_rate": 9.943382813373584e-05, "loss": 1.3784, "step": 1500 }, { "epoch": 0.7635409210212359, "grad_norm": 2.433398723602295, "learning_rate": 9.915638199942502e-05, "loss": 1.3563, "step": 1600 }, { "epoch": 0.8112622285850632, "grad_norm": 2.3350024223327637, "learning_rate": 9.882434846027066e-05, "loss": 1.328, "step": 1700 }, { "epoch": 0.8589835361488904, "grad_norm": 2.373020648956299, "learning_rate": 9.843809623440229e-05, "loss": 1.3613, "step": 1800 }, { "epoch": 0.9067048437127178, "grad_norm": 2.0773284435272217, "learning_rate": 9.79980542489656e-05, "loss": 1.2976, "step": 1900 }, { "epoch": 0.954426151276545, "grad_norm": 1.9312775135040283, "learning_rate": 9.750471116380557e-05, "loss": 1.2865, "step": 2000 }, { "epoch": 1.0021474588403723, "grad_norm": 2.0190281867980957, "learning_rate": 9.69586148288172e-05, "loss": 1.2819, "step": 2100 }, { "epoch": 1.0498687664041995, "grad_norm": 2.113044500350952, "learning_rate": 9.636037167556662e-05, "loss": 1.0358, "step": 2200 }, { "epoch": 1.0975900739680267, "grad_norm": 1.9696762561798096, "learning_rate": 9.571064604385821e-05, "loss": 1.0359, "step": 2300 }, { "epoch": 1.145311381531854, "grad_norm": 1.7783578634262085, "learning_rate": 9.501015944399546e-05, "loss": 1.058, "step": 2400 }, { "epoch": 1.1930326890956813, "grad_norm": 1.7561606168746948, "learning_rate": 9.425968975555491e-05, "loss": 1.0732, "step": 2500 }, { "epoch": 1.2407539966595085, "grad_norm": 2.150614023208618, "learning_rate": 9.34600703635629e-05, "loss": 1.0523, "step": 2600 }, { "epoch": 1.2884753042233357, "grad_norm": 2.3448917865753174, "learning_rate": 9.26121892330342e-05, "loss": 1.0379, "step": 2700 }, { "epoch": 1.3361966117871629, "grad_norm": 1.8908520936965942, "learning_rate": 9.17169879229008e-05, "loss": 1.0588, "step": 2800 }, { "epoch": 1.3839179193509903, "grad_norm": 2.531501054763794, "learning_rate": 9.077546054042489e-05, "loss": 1.0532, "step": 2900 }, { "epoch": 1.4316392269148175, "grad_norm": 2.044644594192505, "learning_rate": 8.978865263725835e-05, "loss": 1.0267, "step": 3000 }, { "epoch": 1.4793605344786447, "grad_norm": 1.8352954387664795, "learning_rate": 8.875766004837363e-05, "loss": 1.0177, "step": 3100 }, { "epoch": 1.527081842042472, "grad_norm": 1.6844772100448608, "learning_rate": 8.768362767515585e-05, "loss": 1.009, "step": 3200 }, { "epoch": 1.574803149606299, "grad_norm": 1.881711483001709, "learning_rate": 8.656774821400754e-05, "loss": 1.0102, "step": 3300 }, { "epoch": 1.6225244571701265, "grad_norm": 1.7800958156585693, "learning_rate": 8.541126083187762e-05, "loss": 1.0031, "step": 3400 }, { "epoch": 1.6702457647339537, "grad_norm": 1.7756962776184082, "learning_rate": 8.421544979018563e-05, "loss": 0.9875, "step": 3500 }, { "epoch": 1.717967072297781, "grad_norm": 1.802489161491394, "learning_rate": 8.298164301866912e-05, "loss": 1.0077, "step": 3600 }, { "epoch": 1.7656883798616083, "grad_norm": 1.8551727533340454, "learning_rate": 8.171121064073828e-05, "loss": 0.9712, "step": 3700 }, { "epoch": 1.8134096874254353, "grad_norm": 1.98467218875885, "learning_rate": 8.040556345197484e-05, "loss": 0.9986, "step": 3800 }, { "epoch": 1.8611309949892627, "grad_norm": 1.8102763891220093, "learning_rate": 7.906615135346544e-05, "loss": 0.9679, "step": 3900 }, { "epoch": 1.90885230255309, "grad_norm": 1.8749668598175049, "learning_rate": 7.769446174170874e-05, "loss": 0.9476, "step": 4000 }, { "epoch": 1.956573610116917, "grad_norm": 1.697935938835144, "learning_rate": 7.629201785688463e-05, "loss": 0.9633, "step": 4100 }, { "epoch": 2.0042949176807445, "grad_norm": 2.464707136154175, "learning_rate": 7.486037709131927e-05, "loss": 0.9102, "step": 4200 }, { "epoch": 2.0520162252445715, "grad_norm": 1.7188732624053955, "learning_rate": 7.340112926002515e-05, "loss": 0.589, "step": 4300 }, { "epoch": 2.099737532808399, "grad_norm": 1.7961351871490479, "learning_rate": 7.191589483523586e-05, "loss": 0.592, "step": 4400 }, { "epoch": 2.1474588403722263, "grad_norm": 1.8405373096466064, "learning_rate": 7.040632314689668e-05, "loss": 0.5934, "step": 4500 }, { "epoch": 2.1951801479360533, "grad_norm": 1.8192975521087646, "learning_rate": 6.887409055110908e-05, "loss": 0.5962, "step": 4600 }, { "epoch": 2.2429014554998807, "grad_norm": 1.6917438507080078, "learning_rate": 6.732089856856316e-05, "loss": 0.5967, "step": 4700 }, { "epoch": 2.290622763063708, "grad_norm": 1.843735694885254, "learning_rate": 6.574847199502502e-05, "loss": 0.6002, "step": 4800 }, { "epoch": 2.338344070627535, "grad_norm": 1.81013023853302, "learning_rate": 6.415855698597774e-05, "loss": 0.5883, "step": 4900 }, { "epoch": 2.3860653781913626, "grad_norm": 2.259490966796875, "learning_rate": 6.255291911754258e-05, "loss": 0.5942, "step": 5000 }, { "epoch": 2.4337866857551895, "grad_norm": 1.8322285413742065, "learning_rate": 6.093334142583391e-05, "loss": 0.5917, "step": 5100 }, { "epoch": 2.481507993319017, "grad_norm": 1.5893152952194214, "learning_rate": 5.930162242692507e-05, "loss": 0.5982, "step": 5200 }, { "epoch": 2.529229300882844, "grad_norm": 1.5037435293197632, "learning_rate": 5.765957411962395e-05, "loss": 0.571, "step": 5300 }, { "epoch": 2.5769506084466713, "grad_norm": 1.9529515504837036, "learning_rate": 5.6009019973276314e-05, "loss": 0.5835, "step": 5400 }, { "epoch": 2.6246719160104988, "grad_norm": 1.7182705402374268, "learning_rate": 5.435179290283109e-05, "loss": 0.586, "step": 5500 }, { "epoch": 2.6723932235743257, "grad_norm": 1.9804043769836426, "learning_rate": 5.268973323341661e-05, "loss": 0.5781, "step": 5600 }, { "epoch": 2.720114531138153, "grad_norm": 1.6894468069076538, "learning_rate": 5.1024686656687935e-05, "loss": 0.566, "step": 5700 }, { "epoch": 2.7678358387019806, "grad_norm": 2.1793899536132812, "learning_rate": 4.9358502181214666e-05, "loss": 0.5673, "step": 5800 }, { "epoch": 2.8155571462658076, "grad_norm": 1.4578018188476562, "learning_rate": 4.7693030079185405e-05, "loss": 0.5528, "step": 5900 }, { "epoch": 2.863278453829635, "grad_norm": 1.5378628969192505, "learning_rate": 4.6030119831709054e-05, "loss": 0.5696, "step": 6000 }, { "epoch": 2.9109997613934624, "grad_norm": 1.6980961561203003, "learning_rate": 4.4371618074994464e-05, "loss": 0.546, "step": 6100 }, { "epoch": 2.9587210689572894, "grad_norm": 2.221790313720703, "learning_rate": 4.271936654968948e-05, "loss": 0.5645, "step": 6200 }, { "epoch": 3.006442376521117, "grad_norm": 1.6896121501922607, "learning_rate": 4.1075200055656186e-05, "loss": 0.5217, "step": 6300 }, { "epoch": 3.0541636840849438, "grad_norm": 1.338323950767517, "learning_rate": 3.9440944414454066e-05, "loss": 0.2704, "step": 6400 }, { "epoch": 3.101884991648771, "grad_norm": 1.9295827150344849, "learning_rate": 3.7818414441793224e-05, "loss": 0.2679, "step": 6500 }, { "epoch": 3.1496062992125986, "grad_norm": 1.5463491678237915, "learning_rate": 3.6209411932209506e-05, "loss": 0.2713, "step": 6600 }, { "epoch": 3.1973276067764256, "grad_norm": 1.3632663488388062, "learning_rate": 3.461572365819943e-05, "loss": 0.2716, "step": 6700 }, { "epoch": 3.245048914340253, "grad_norm": 1.2171025276184082, "learning_rate": 3.303911938603683e-05, "loss": 0.2723, "step": 6800 }, { "epoch": 3.29277022190408, "grad_norm": 1.9307883977890015, "learning_rate": 3.148134991047477e-05, "loss": 0.2736, "step": 6900 }, { "epoch": 3.3404915294679074, "grad_norm": 1.261749267578125, "learning_rate": 2.9944145110514825e-05, "loss": 0.2814, "step": 7000 }, { "epoch": 3.388212837031735, "grad_norm": 1.3747665882110596, "learning_rate": 2.842921202840322e-05, "loss": 0.2729, "step": 7100 }, { "epoch": 3.435934144595562, "grad_norm": 1.3830654621124268, "learning_rate": 2.6938232973986567e-05, "loss": 0.2788, "step": 7200 }, { "epoch": 3.483655452159389, "grad_norm": 1.478647232055664, "learning_rate": 2.547286365653292e-05, "loss": 0.2692, "step": 7300 }, { "epoch": 3.5313767597232166, "grad_norm": 1.1786105632781982, "learning_rate": 2.403473134609185e-05, "loss": 0.2729, "step": 7400 }, { "epoch": 3.5790980672870436, "grad_norm": 1.1656215190887451, "learning_rate": 2.262543306643643e-05, "loss": 0.2796, "step": 7500 }, { "epoch": 3.626819374850871, "grad_norm": 1.186291217803955, "learning_rate": 2.124653382159273e-05, "loss": 0.2666, "step": 7600 }, { "epoch": 3.674540682414698, "grad_norm": 1.295457363128662, "learning_rate": 1.989956485792702e-05, "loss": 0.2719, "step": 7700 }, { "epoch": 3.7222619899785254, "grad_norm": 1.141144871711731, "learning_rate": 1.8586021963720364e-05, "loss": 0.2699, "step": 7800 }, { "epoch": 3.7699832975423524, "grad_norm": 1.6188665628433228, "learning_rate": 1.7307363808118555e-05, "loss": 0.2569, "step": 7900 }, { "epoch": 3.81770460510618, "grad_norm": 1.63030207157135, "learning_rate": 1.6065010321302785e-05, "loss": 0.2601, "step": 8000 }, { "epoch": 3.8654259126700072, "grad_norm": 1.5045113563537598, "learning_rate": 1.486034111767885e-05, "loss": 0.2628, "step": 8100 }, { "epoch": 3.913147220233834, "grad_norm": 1.5543336868286133, "learning_rate": 1.3694693963836646e-05, "loss": 0.2626, "step": 8200 }, { "epoch": 3.9608685277976616, "grad_norm": 1.1248536109924316, "learning_rate": 1.2569363292981106e-05, "loss": 0.257, "step": 8300 }, { "epoch": 4.008589835361489, "grad_norm": 0.8004603981971741, "learning_rate": 1.1485598767483852e-05, "loss": 0.2457, "step": 8400 }, { "epoch": 4.056311142925316, "grad_norm": 0.8413916826248169, "learning_rate": 1.0444603891152616e-05, "loss": 0.1514, "step": 8500 }, { "epoch": 4.104032450489143, "grad_norm": 0.7297254204750061, "learning_rate": 9.447534672758624e-06, "loss": 0.1534, "step": 8600 }, { "epoch": 4.151753758052971, "grad_norm": 0.8776145577430725, "learning_rate": 8.495498342306763e-06, "loss": 0.1507, "step": 8700 }, { "epoch": 4.199475065616798, "grad_norm": 0.992046058177948, "learning_rate": 7.58955212147372e-06, "loss": 0.1566, "step": 8800 }, { "epoch": 4.247196373180625, "grad_norm": 0.7668212056159973, "learning_rate": 6.730702049579529e-06, "loss": 0.1536, "step": 8900 }, { "epoch": 4.294917680744453, "grad_norm": 0.7402627468109131, "learning_rate": 5.919901866396482e-06, "loss": 0.1528, "step": 9000 }, { "epoch": 4.34263898830828, "grad_norm": 0.9620897173881531, "learning_rate": 5.158051953035753e-06, "loss": 0.155, "step": 9100 }, { "epoch": 4.390360295872107, "grad_norm": 0.9639643430709839, "learning_rate": 4.445998332088108e-06, "loss": 0.1555, "step": 9200 }, { "epoch": 4.4380816034359345, "grad_norm": 0.9396972060203552, "learning_rate": 3.784531728128771e-06, "loss": 0.1549, "step": 9300 }, { "epoch": 4.4858029109997615, "grad_norm": 0.7707456350326538, "learning_rate": 3.1743866896300324e-06, "loss": 0.1505, "step": 9400 }, { "epoch": 4.5335242185635884, "grad_norm": 0.6720114350318909, "learning_rate": 2.6162407732564676e-06, "loss": 0.1518, "step": 9500 }, { "epoch": 4.581245526127416, "grad_norm": 0.8427059650421143, "learning_rate": 2.110713791448671e-06, "loss": 0.153, "step": 9600 }, { "epoch": 4.628966833691243, "grad_norm": 0.4860561192035675, "learning_rate": 1.6583671241311606e-06, "loss": 0.1533, "step": 9700 }, { "epoch": 4.67668814125507, "grad_norm": 0.717401921749115, "learning_rate": 1.259703095308551e-06, "loss": 0.1522, "step": 9800 }, { "epoch": 4.724409448818898, "grad_norm": 0.8769087791442871, "learning_rate": 9.151644152424976e-07, "loss": 0.1493, "step": 9900 }, { "epoch": 4.772130756382725, "grad_norm": 0.795388400554657, "learning_rate": 6.251336888288273e-07, "loss": 0.1543, "step": 10000 }, { "epoch": 4.819852063946552, "grad_norm": 0.8506287336349487, "learning_rate": 3.8993299072062063e-07, "loss": 0.1515, "step": 10100 }, { "epoch": 4.867573371510379, "grad_norm": 0.8654916882514954, "learning_rate": 2.0982350766932112e-07, "loss": 0.1487, "step": 10200 }, { "epoch": 4.915294679074207, "grad_norm": 1.4502595663070679, "learning_rate": 8.500524848086277e-08, "loss": 0.1537, "step": 10300 }, { "epoch": 4.963015986638034, "grad_norm": 0.5670742988586426, "learning_rate": 1.5616821908959188e-08, "loss": 0.1492, "step": 10400 } ], "logging_steps": 100, "max_steps": 10475, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.362264123362263e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }