{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.969476634098311, "eval_steps": 300, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016157943901638515, "grad_norm": 76.5, "learning_rate": 9.999684417150033e-07, "loss": 94.6294, "step": 10 }, { "epoch": 0.03231588780327703, "grad_norm": 73.25, "learning_rate": 9.999368834300069e-07, "loss": 91.5485, "step": 20 }, { "epoch": 0.04847383170491555, "grad_norm": 68.25, "learning_rate": 9.999053251450102e-07, "loss": 90.8359, "step": 30 }, { "epoch": 0.06463177560655406, "grad_norm": 64.9375, "learning_rate": 9.998737668600136e-07, "loss": 90.9917, "step": 40 }, { "epoch": 0.08078971950819258, "grad_norm": 63.53125, "learning_rate": 9.998422085750172e-07, "loss": 91.0531, "step": 50 }, { "epoch": 0.0969476634098311, "grad_norm": 68.125, "learning_rate": 9.998106502900205e-07, "loss": 89.3149, "step": 60 }, { "epoch": 0.11310560731146961, "grad_norm": 64.375, "learning_rate": 9.99779092005024e-07, "loss": 89.6989, "step": 70 }, { "epoch": 0.12926355121310812, "grad_norm": 64.1875, "learning_rate": 9.997475337200274e-07, "loss": 88.5876, "step": 80 }, { "epoch": 0.14542149511474664, "grad_norm": 66.1875, "learning_rate": 9.99715975435031e-07, "loss": 89.8584, "step": 90 }, { "epoch": 0.16157943901638516, "grad_norm": 66.9375, "learning_rate": 9.996844171500344e-07, "loss": 89.2329, "step": 100 }, { "epoch": 0.17773738291802368, "grad_norm": 61.0625, "learning_rate": 9.996528588650377e-07, "loss": 89.2294, "step": 110 }, { "epoch": 0.1938953268196622, "grad_norm": 62.21875, "learning_rate": 9.996213005800413e-07, "loss": 88.9436, "step": 120 }, { "epoch": 0.21005327072130073, "grad_norm": 67.5, "learning_rate": 9.995897422950446e-07, "loss": 88.9848, "step": 130 }, { "epoch": 0.22621121462293922, "grad_norm": 65.0, "learning_rate": 9.99558184010048e-07, "loss": 89.7367, "step": 140 }, { "epoch": 0.24236915852457774, "grad_norm": 63.9375, "learning_rate": 9.995266257250516e-07, "loss": 88.9787, "step": 150 }, { "epoch": 0.25852710242621624, "grad_norm": 64.625, "learning_rate": 9.99495067440055e-07, "loss": 89.5066, "step": 160 }, { "epoch": 0.2746850463278548, "grad_norm": 69.5, "learning_rate": 9.994635091550585e-07, "loss": 88.551, "step": 170 }, { "epoch": 0.2908429902294933, "grad_norm": 67.125, "learning_rate": 9.994319508700618e-07, "loss": 90.1979, "step": 180 }, { "epoch": 0.30700093413113183, "grad_norm": 64.3125, "learning_rate": 9.994003925850654e-07, "loss": 88.6641, "step": 190 }, { "epoch": 0.3231588780327703, "grad_norm": 64.1875, "learning_rate": 9.993688343000688e-07, "loss": 87.9892, "step": 200 }, { "epoch": 0.3393168219344088, "grad_norm": 67.9375, "learning_rate": 9.993372760150721e-07, "loss": 88.7533, "step": 210 }, { "epoch": 0.35547476583604737, "grad_norm": 64.75, "learning_rate": 9.993057177300757e-07, "loss": 87.3739, "step": 220 }, { "epoch": 0.37163270973768586, "grad_norm": 68.75, "learning_rate": 9.99274159445079e-07, "loss": 88.7186, "step": 230 }, { "epoch": 0.3877906536393244, "grad_norm": 66.25, "learning_rate": 9.992426011600826e-07, "loss": 87.2317, "step": 240 }, { "epoch": 0.4039485975409629, "grad_norm": 68.1875, "learning_rate": 9.99211042875086e-07, "loss": 88.5885, "step": 250 }, { "epoch": 0.42010654144260146, "grad_norm": 69.125, "learning_rate": 9.991794845900893e-07, "loss": 89.1104, "step": 260 }, { "epoch": 0.43626448534423995, "grad_norm": 66.9375, "learning_rate": 9.991479263050929e-07, "loss": 89.6045, "step": 270 }, { "epoch": 0.45242242924587844, "grad_norm": 67.3125, "learning_rate": 9.991163680200962e-07, "loss": 89.1191, "step": 280 }, { "epoch": 0.468580373147517, "grad_norm": 64.875, "learning_rate": 9.990848097350996e-07, "loss": 89.2609, "step": 290 }, { "epoch": 0.4847383170491555, "grad_norm": 71.75, "learning_rate": 9.990532514501032e-07, "loss": 88.8598, "step": 300 }, { "epoch": 0.4847383170491555, "eval_loss": 1.375859022140503, "eval_runtime": 23.8472, "eval_samples_per_second": 1362.845, "eval_steps_per_second": 42.605, "step": 300 }, { "epoch": 0.500896260950794, "grad_norm": 62.3125, "learning_rate": 9.990216931651065e-07, "loss": 87.5903, "step": 310 }, { "epoch": 0.5170542048524325, "grad_norm": 61.0625, "learning_rate": 9.9899013488011e-07, "loss": 89.8225, "step": 320 }, { "epoch": 0.5332121487540711, "grad_norm": 65.6875, "learning_rate": 9.989585765951135e-07, "loss": 87.4679, "step": 330 }, { "epoch": 0.5493700926557096, "grad_norm": 66.625, "learning_rate": 9.98927018310117e-07, "loss": 88.2141, "step": 340 }, { "epoch": 0.5655280365573481, "grad_norm": 64.25, "learning_rate": 9.988954600251204e-07, "loss": 88.7629, "step": 350 }, { "epoch": 0.5816859804589866, "grad_norm": 67.6875, "learning_rate": 9.988639017401237e-07, "loss": 87.7045, "step": 360 }, { "epoch": 0.5978439243606251, "grad_norm": 68.3125, "learning_rate": 9.988323434551273e-07, "loss": 89.4493, "step": 370 }, { "epoch": 0.6140018682622637, "grad_norm": 63.6875, "learning_rate": 9.988007851701307e-07, "loss": 87.7596, "step": 380 }, { "epoch": 0.6301598121639022, "grad_norm": 64.875, "learning_rate": 9.98769226885134e-07, "loss": 88.0606, "step": 390 }, { "epoch": 0.6463177560655406, "grad_norm": 64.1875, "learning_rate": 9.987376686001376e-07, "loss": 87.7883, "step": 400 }, { "epoch": 0.6624756999671791, "grad_norm": 62.09375, "learning_rate": 9.98706110315141e-07, "loss": 88.77, "step": 410 }, { "epoch": 0.6786336438688176, "grad_norm": 65.6875, "learning_rate": 9.986745520301443e-07, "loss": 88.4047, "step": 420 }, { "epoch": 0.6947915877704562, "grad_norm": 65.3125, "learning_rate": 9.986429937451479e-07, "loss": 86.9815, "step": 430 }, { "epoch": 0.7109495316720947, "grad_norm": 68.0625, "learning_rate": 9.986114354601514e-07, "loss": 88.4078, "step": 440 }, { "epoch": 0.7271074755737332, "grad_norm": 64.4375, "learning_rate": 9.985798771751548e-07, "loss": 87.8946, "step": 450 }, { "epoch": 0.7432654194753717, "grad_norm": 65.3125, "learning_rate": 9.985483188901581e-07, "loss": 88.9161, "step": 460 }, { "epoch": 0.7594233633770103, "grad_norm": 64.875, "learning_rate": 9.985167606051617e-07, "loss": 87.7899, "step": 470 }, { "epoch": 0.7755813072786488, "grad_norm": 68.75, "learning_rate": 9.98485202320165e-07, "loss": 87.9098, "step": 480 }, { "epoch": 0.7917392511802873, "grad_norm": 64.625, "learning_rate": 9.984536440351684e-07, "loss": 88.2259, "step": 490 }, { "epoch": 0.8078971950819258, "grad_norm": 62.96875, "learning_rate": 9.98422085750172e-07, "loss": 87.4765, "step": 500 }, { "epoch": 0.8240551389835643, "grad_norm": 62.75, "learning_rate": 9.983905274651753e-07, "loss": 88.4725, "step": 510 }, { "epoch": 0.8402130828852029, "grad_norm": 66.3125, "learning_rate": 9.98358969180179e-07, "loss": 88.8554, "step": 520 }, { "epoch": 0.8563710267868414, "grad_norm": 64.75, "learning_rate": 9.983274108951823e-07, "loss": 88.1039, "step": 530 }, { "epoch": 0.8725289706884799, "grad_norm": 70.3125, "learning_rate": 9.982958526101856e-07, "loss": 87.927, "step": 540 }, { "epoch": 0.8886869145901184, "grad_norm": 61.03125, "learning_rate": 9.982642943251892e-07, "loss": 88.5062, "step": 550 }, { "epoch": 0.9048448584917569, "grad_norm": 69.1875, "learning_rate": 9.982327360401925e-07, "loss": 89.0445, "step": 560 }, { "epoch": 0.9210028023933955, "grad_norm": 64.625, "learning_rate": 9.982011777551961e-07, "loss": 87.5429, "step": 570 }, { "epoch": 0.937160746295034, "grad_norm": 64.4375, "learning_rate": 9.981696194701995e-07, "loss": 87.194, "step": 580 }, { "epoch": 0.9533186901966725, "grad_norm": 65.0, "learning_rate": 9.98138061185203e-07, "loss": 87.1017, "step": 590 }, { "epoch": 0.969476634098311, "grad_norm": 78.375, "learning_rate": 9.981065029002064e-07, "loss": 87.2723, "step": 600 }, { "epoch": 0.969476634098311, "eval_loss": 1.3628411293029785, "eval_runtime": 22.8208, "eval_samples_per_second": 1424.139, "eval_steps_per_second": 44.521, "step": 600 } ], "logging_steps": 10, "max_steps": 618, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.656669946647675e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }