{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997049277072882, "eval_steps": 500, "global_step": 847, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011802891708468575, "grad_norm": 2.2601828575134277, "learning_rate": 5.294117647058824e-06, "loss": 3.4593, "step": 10 }, { "epoch": 0.02360578341693715, "grad_norm": 2.753499746322632, "learning_rate": 1e-05, "loss": 2.9512, "step": 20 }, { "epoch": 0.03540867512540572, "grad_norm": 1.42862069606781, "learning_rate": 1.588235294117647e-05, "loss": 2.2602, "step": 30 }, { "epoch": 0.0472115668338743, "grad_norm": 2.2324745655059814, "learning_rate": 2.1764705882352943e-05, "loss": 1.9542, "step": 40 }, { "epoch": 0.05901445854234287, "grad_norm": 0.9986198544502258, "learning_rate": 2.7647058823529416e-05, "loss": 1.7491, "step": 50 }, { "epoch": 0.07081735025081144, "grad_norm": 1.7015349864959717, "learning_rate": 3.352941176470588e-05, "loss": 1.7127, "step": 60 }, { "epoch": 0.08262024195928003, "grad_norm": 1.410741925239563, "learning_rate": 3.9411764705882356e-05, "loss": 1.5587, "step": 70 }, { "epoch": 0.0944231336677486, "grad_norm": 1.4795621633529663, "learning_rate": 4.5294117647058826e-05, "loss": 1.6199, "step": 80 }, { "epoch": 0.10622602537621717, "grad_norm": 1.2762919664382935, "learning_rate": 4.999915012051437e-05, "loss": 1.5481, "step": 90 }, { "epoch": 0.11802891708468574, "grad_norm": 1.1632708311080933, "learning_rate": 4.996941040535653e-05, "loss": 1.4648, "step": 100 }, { "epoch": 0.1298318087931543, "grad_norm": 1.950888752937317, "learning_rate": 4.989723448187131e-05, "loss": 1.5273, "step": 110 }, { "epoch": 0.14163470050162288, "grad_norm": 1.0861647129058838, "learning_rate": 4.978274501505061e-05, "loss": 1.5472, "step": 120 }, { "epoch": 0.15343759221009148, "grad_norm": 1.347829818725586, "learning_rate": 4.962613658293158e-05, "loss": 1.5428, "step": 130 }, { "epoch": 0.16524048391856005, "grad_norm": 1.014880657196045, "learning_rate": 4.942767534590581e-05, "loss": 1.3644, "step": 140 }, { "epoch": 0.17704337562702863, "grad_norm": 1.7476609945297241, "learning_rate": 4.918769859437232e-05, "loss": 1.4653, "step": 150 }, { "epoch": 0.1888462673354972, "grad_norm": 1.0597649812698364, "learning_rate": 4.890661417550319e-05, "loss": 1.5003, "step": 160 }, { "epoch": 0.20064915904396577, "grad_norm": 1.098489761352539, "learning_rate": 4.8584899800095864e-05, "loss": 1.462, "step": 170 }, { "epoch": 0.21245205075243434, "grad_norm": 0.8845277428627014, "learning_rate": 4.822310223069039e-05, "loss": 1.3805, "step": 180 }, { "epoch": 0.2242549424609029, "grad_norm": 1.181378960609436, "learning_rate": 4.782183635233124e-05, "loss": 1.4678, "step": 190 }, { "epoch": 0.23605783416937148, "grad_norm": 1.3490713834762573, "learning_rate": 4.738178412755306e-05, "loss": 1.378, "step": 200 }, { "epoch": 0.24786072587784008, "grad_norm": 1.1515178680419922, "learning_rate": 4.690369343736636e-05, "loss": 1.4216, "step": 210 }, { "epoch": 0.2596636175863086, "grad_norm": 1.3116216659545898, "learning_rate": 4.6388376810212905e-05, "loss": 1.3915, "step": 220 }, { "epoch": 0.2714665092947772, "grad_norm": 1.6775559186935425, "learning_rate": 4.583671004105096e-05, "loss": 1.4104, "step": 230 }, { "epoch": 0.28326940100324577, "grad_norm": 1.4933921098709106, "learning_rate": 4.524963070291744e-05, "loss": 1.411, "step": 240 }, { "epoch": 0.29507229271171437, "grad_norm": 1.057346224784851, "learning_rate": 
4.4628136553496375e-05, "loss": 1.3628, "step": 250 }, { "epoch": 0.30687518442018297, "grad_norm": 1.389323353767395, "learning_rate": 4.397328383940196e-05, "loss": 1.331, "step": 260 }, { "epoch": 0.3186780761286515, "grad_norm": 1.2937726974487305, "learning_rate": 4.328618550105802e-05, "loss": 1.338, "step": 270 }, { "epoch": 0.3304809678371201, "grad_norm": 1.0137064456939697, "learning_rate": 4.256800928122475e-05, "loss": 1.4131, "step": 280 }, { "epoch": 0.34228385954558865, "grad_norm": 1.2065379619598389, "learning_rate": 4.181997574038741e-05, "loss": 1.3584, "step": 290 }, { "epoch": 0.35408675125405725, "grad_norm": 1.0482007265090942, "learning_rate": 4.104335618237972e-05, "loss": 1.3541, "step": 300 }, { "epoch": 0.3658896429625258, "grad_norm": 1.1176925897598267, "learning_rate": 4.0239470493767704e-05, "loss": 1.359, "step": 310 }, { "epoch": 0.3776925346709944, "grad_norm": 0.9922409653663635, "learning_rate": 3.940968490066559e-05, "loss": 1.261, "step": 320 }, { "epoch": 0.389495426379463, "grad_norm": 1.4820419549942017, "learning_rate": 3.855540964679658e-05, "loss": 1.2903, "step": 330 }, { "epoch": 0.40129831808793154, "grad_norm": 1.443935751914978, "learning_rate": 3.767809659674433e-05, "loss": 1.3593, "step": 340 }, { "epoch": 0.41310120979640014, "grad_norm": 2.1749682426452637, "learning_rate": 3.677923676846864e-05, "loss": 1.3608, "step": 350 }, { "epoch": 0.4249041015048687, "grad_norm": 1.4614121913909912, "learning_rate": 3.586035779927896e-05, "loss": 1.2742, "step": 360 }, { "epoch": 0.4367069932133373, "grad_norm": 1.1197657585144043, "learning_rate": 3.492302134957218e-05, "loss": 1.3217, "step": 370 }, { "epoch": 0.4485098849218058, "grad_norm": 1.1603928804397583, "learning_rate": 3.396882044874736e-05, "loss": 1.2824, "step": 380 }, { "epoch": 0.4603127766302744, "grad_norm": 1.6083821058273315, "learning_rate": 3.2999376787807864e-05, "loss": 1.344, "step": 390 }, { "epoch": 0.47211566833874297, "grad_norm": 1.56455397605896, "learning_rate": 3.201633796325233e-05, "loss": 1.3372, "step": 400 }, { "epoch": 0.48391856004721157, "grad_norm": 1.4750654697418213, "learning_rate": 3.1021374676938584e-05, "loss": 1.33, "step": 410 }, { "epoch": 0.49572145175568016, "grad_norm": 1.2510316371917725, "learning_rate": 3.0016177896679255e-05, "loss": 1.2919, "step": 420 }, { "epoch": 0.5075243434641488, "grad_norm": 1.2658268213272095, "learning_rate": 2.9002455982394944e-05, "loss": 1.2649, "step": 430 }, { "epoch": 0.5193272351726173, "grad_norm": 1.905948519706726, "learning_rate": 2.798193178270889e-05, "loss": 1.3047, "step": 440 }, { "epoch": 0.5311301268810859, "grad_norm": 1.2382662296295166, "learning_rate": 2.695633970691786e-05, "loss": 1.2862, "step": 450 }, { "epoch": 0.5429330185895545, "grad_norm": 1.2122917175292969, "learning_rate": 2.592742277731513e-05, "loss": 1.2843, "step": 460 }, { "epoch": 0.554735910298023, "grad_norm": 1.3638920783996582, "learning_rate": 2.489692966687566e-05, "loss": 1.2795, "step": 470 }, { "epoch": 0.5665388020064915, "grad_norm": 1.2353355884552002, "learning_rate": 2.386661172733762e-05, "loss": 1.1897, "step": 480 }, { "epoch": 0.5783416937149601, "grad_norm": 1.440238118171692, "learning_rate": 2.2838220012731365e-05, "loss": 1.3352, "step": 490 }, { "epoch": 0.5901445854234287, "grad_norm": 1.302875280380249, "learning_rate": 2.1813502303414306e-05, "loss": 1.2552, "step": 500 }, { "epoch": 0.6019474771318973, "grad_norm": 1.3485292196273804, "learning_rate": 2.0794200135669584e-05, "loss": 1.2573, 
"step": 510 }, { "epoch": 0.6137503688403659, "grad_norm": 2.1018223762512207, "learning_rate": 1.9782045841916625e-05, "loss": 1.2564, "step": 520 }, { "epoch": 0.6255532605488344, "grad_norm": 1.6336476802825928, "learning_rate": 1.877875960656394e-05, "loss": 1.1512, "step": 530 }, { "epoch": 0.637356152257303, "grad_norm": 1.4360566139221191, "learning_rate": 1.7786046542507843e-05, "loss": 1.2434, "step": 540 }, { "epoch": 0.6491590439657716, "grad_norm": 1.1216990947723389, "learning_rate": 1.680559379324558e-05, "loss": 1.325, "step": 550 }, { "epoch": 0.6609619356742402, "grad_norm": 1.6999801397323608, "learning_rate": 1.583906766552799e-05, "loss": 1.2197, "step": 560 }, { "epoch": 0.6727648273827088, "grad_norm": 1.4907481670379639, "learning_rate": 1.4888110797424782e-05, "loss": 1.2821, "step": 570 }, { "epoch": 0.6845677190911773, "grad_norm": 1.2150344848632812, "learning_rate": 1.3954339366615334e-05, "loss": 1.239, "step": 580 }, { "epoch": 0.6963706107996459, "grad_norm": 1.6709622144699097, "learning_rate": 1.303934034364983e-05, "loss": 1.2403, "step": 590 }, { "epoch": 0.7081735025081145, "grad_norm": 1.5160703659057617, "learning_rate": 1.21446687948485e-05, "loss": 1.2466, "step": 600 }, { "epoch": 0.7199763942165831, "grad_norm": 1.2667752504348755, "learning_rate": 1.1271845239423196e-05, "loss": 1.1662, "step": 610 }, { "epoch": 0.7317792859250516, "grad_norm": 1.685145616531372, "learning_rate": 1.0422353065312573e-05, "loss": 1.3161, "step": 620 }, { "epoch": 0.7435821776335202, "grad_norm": 1.5131856203079224, "learning_rate": 9.59763600812305e-06, "loss": 1.2608, "step": 630 }, { "epoch": 0.7553850693419888, "grad_norm": 1.2261701822280884, "learning_rate": 8.79909569745987e-06, "loss": 1.1507, "step": 640 }, { "epoch": 0.7671879610504574, "grad_norm": 1.2804995775222778, "learning_rate": 8.028089274818624e-06, "loss": 1.3008, "step": 650 }, { "epoch": 0.778990852758926, "grad_norm": 1.3678828477859497, "learning_rate": 7.285927087085423e-06, "loss": 1.272, "step": 660 }, { "epoch": 0.7907937444673945, "grad_norm": 1.3345593214035034, "learning_rate": 6.5738704595659065e-06, "loss": 1.1615, "step": 670 }, { "epoch": 0.8025966361758631, "grad_norm": 1.2585678100585938, "learning_rate": 5.893129552327781e-06, "loss": 1.1878, "step": 680 }, { "epoch": 0.8143995278843317, "grad_norm": 1.3462913036346436, "learning_rate": 5.244861303500026e-06, "loss": 1.2436, "step": 690 }, { "epoch": 0.8262024195928003, "grad_norm": 1.1118088960647583, "learning_rate": 4.630167463024393e-06, "loss": 1.0838, "step": 700 }, { "epoch": 0.8380053113012688, "grad_norm": 1.7299799919128418, "learning_rate": 4.050092720200638e-06, "loss": 1.1495, "step": 710 }, { "epoch": 0.8498082030097374, "grad_norm": 1.3773056268692017, "learning_rate": 3.5056229282080077e-06, "loss": 1.234, "step": 720 }, { "epoch": 0.861611094718206, "grad_norm": 1.2820888757705688, "learning_rate": 2.997683428620296e-06, "loss": 1.1803, "step": 730 }, { "epoch": 0.8734139864266746, "grad_norm": 1.3301385641098022, "learning_rate": 2.527137478762037e-06, "loss": 1.2197, "step": 740 }, { "epoch": 0.8852168781351432, "grad_norm": 1.7628834247589111, "learning_rate": 2.094784784578707e-06, "loss": 1.2354, "step": 750 }, { "epoch": 0.8970197698436116, "grad_norm": 1.2032676935195923, "learning_rate": 1.7013601415141383e-06, "loss": 1.1835, "step": 760 }, { "epoch": 0.9088226615520802, "grad_norm": 1.5983058214187622, "learning_rate": 1.3475321857052386e-06, "loss": 1.1651, "step": 770 }, { "epoch": 0.9206255532605488, 
"grad_norm": 1.0227899551391602, "learning_rate": 1.03390225761624e-06, "loss": 1.1662, "step": 780 }, { "epoch": 0.9324284449690174, "grad_norm": 1.352665901184082, "learning_rate": 7.610033800438344e-07, "loss": 1.1798, "step": 790 }, { "epoch": 0.9442313366774859, "grad_norm": 1.6476454734802246, "learning_rate": 5.292993522301005e-07, "loss": 1.2053, "step": 800 }, { "epoch": 0.9560342283859545, "grad_norm": 1.2775633335113525, "learning_rate": 3.3918396162275214e-07, "loss": 1.2049, "step": 810 }, { "epoch": 0.9678371200944231, "grad_norm": 1.4991925954818726, "learning_rate": 1.9098031462242705e-07, "loss": 1.2097, "step": 820 }, { "epoch": 0.9796400118028917, "grad_norm": 1.3501712083816528, "learning_rate": 8.494028745434368e-08, "loss": 1.2085, "step": 830 }, { "epoch": 0.9914429035113603, "grad_norm": 1.319488763809204, "learning_rate": 2.124409809766692e-08, "loss": 1.1854, "step": 840 }, { "epoch": 0.9997049277072882, "step": 847, "total_flos": 1.2532647345436754e+18, "train_loss": 1.3799798170537847, "train_runtime": 10524.3823, "train_samples_per_second": 2.576, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 847, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2532647345436754e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }