{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02732408413085504, "eval_steps": 25, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002185926730468403, "grad_norm": NaN, "learning_rate": 1e-05, "loss": 0.0, "step": 1 }, { "epoch": 0.0002185926730468403, "eval_loss": NaN, "eval_runtime": 1683.6589, "eval_samples_per_second": 9.153, "eval_steps_per_second": 4.576, "step": 1 }, { "epoch": 0.0004371853460936806, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 0.0, "step": 2 }, { "epoch": 0.000655778019140521, "grad_norm": NaN, "learning_rate": 3e-05, "loss": 0.0, "step": 3 }, { "epoch": 0.0008743706921873612, "grad_norm": NaN, "learning_rate": 4e-05, "loss": 0.0, "step": 4 }, { "epoch": 0.0010929633652342016, "grad_norm": NaN, "learning_rate": 5e-05, "loss": 0.0, "step": 5 }, { "epoch": 0.001311556038281042, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 0.0, "step": 6 }, { "epoch": 0.0015301487113278823, "grad_norm": NaN, "learning_rate": 7e-05, "loss": 0.0, "step": 7 }, { "epoch": 0.0017487413843747224, "grad_norm": NaN, "learning_rate": 8e-05, "loss": 0.0, "step": 8 }, { "epoch": 0.001967334057421563, "grad_norm": NaN, "learning_rate": 9e-05, "loss": 0.0, "step": 9 }, { "epoch": 0.002185926730468403, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 0.0, "step": 10 }, { "epoch": 0.0024045194035152433, "grad_norm": NaN, "learning_rate": 0.00011000000000000002, "loss": 0.0, "step": 11 }, { "epoch": 0.002623112076562084, "grad_norm": NaN, "learning_rate": 0.00012, "loss": 0.0, "step": 12 }, { "epoch": 0.002841704749608924, "grad_norm": NaN, "learning_rate": 0.00013000000000000002, "loss": 0.0, "step": 13 }, { "epoch": 0.0030602974226557646, "grad_norm": NaN, "learning_rate": 0.00014, "loss": 0.0, "step": 14 }, { "epoch": 0.0032788900957026047, "grad_norm": NaN, "learning_rate": 0.00015000000000000001, "loss": 0.0, "step": 15 }, { "epoch": 0.003497482768749445, "grad_norm": NaN, "learning_rate": 0.00016, "loss": 0.0, "step": 16 }, { "epoch": 0.0037160754417962854, "grad_norm": NaN, "learning_rate": 0.00017, "loss": 0.0, "step": 17 }, { "epoch": 0.003934668114843126, "grad_norm": NaN, "learning_rate": 0.00018, "loss": 0.0, "step": 18 }, { "epoch": 0.004153260787889966, "grad_norm": NaN, "learning_rate": 0.00019, "loss": 0.0, "step": 19 }, { "epoch": 0.004371853460936806, "grad_norm": NaN, "learning_rate": 0.0002, "loss": 0.0, "step": 20 }, { "epoch": 0.004590446133983647, "grad_norm": NaN, "learning_rate": 0.00019995524322835034, "loss": 0.0, "step": 21 }, { "epoch": 0.0048090388070304866, "grad_norm": NaN, "learning_rate": 0.0001998210129767735, "loss": 0.0, "step": 22 }, { "epoch": 0.005027631480077327, "grad_norm": NaN, "learning_rate": 0.00019959742939952392, "loss": 0.0, "step": 23 }, { "epoch": 0.005246224153124168, "grad_norm": NaN, "learning_rate": 0.00019928469263418374, "loss": 0.0, "step": 24 }, { "epoch": 0.005464816826171007, "grad_norm": NaN, "learning_rate": 0.00019888308262251285, "loss": 0.0, "step": 25 }, { "epoch": 0.005464816826171007, "eval_loss": NaN, "eval_runtime": 2498.0917, "eval_samples_per_second": 6.169, "eval_steps_per_second": 3.084, "step": 25 }, { "epoch": 0.005683409499217848, "grad_norm": NaN, "learning_rate": 0.00019839295885986296, "loss": 0.0, "step": 26 }, { "epoch": 0.005902002172264689, "grad_norm": NaN, "learning_rate": 0.00019781476007338058, "loss": 0.0, "step": 27 }, { "epoch": 0.006120594845311529, "grad_norm": NaN, "learning_rate": 0.00019714900382928675, "loss": 0.0, "step": 28 }, { "epoch": 0.006339187518358369, "grad_norm": NaN, "learning_rate": 0.00019639628606958533, "loss": 0.0, "step": 29 }, { "epoch": 0.006557780191405209, "grad_norm": NaN, "learning_rate": 0.0001955572805786141, "loss": 0.0, "step": 30 }, { "epoch": 0.00677637286445205, "grad_norm": NaN, "learning_rate": 0.00019463273837991643, "loss": 0.0, "step": 31 }, { "epoch": 0.00699496553749889, "grad_norm": NaN, "learning_rate": 0.00019362348706397373, "loss": 0.0, "step": 32 }, { "epoch": 0.00721355821054573, "grad_norm": NaN, "learning_rate": 0.00019253043004739968, "loss": 0.0, "step": 33 }, { "epoch": 0.007432150883592571, "grad_norm": NaN, "learning_rate": 0.0001913545457642601, "loss": 0.0, "step": 34 }, { "epoch": 0.0076507435566394106, "grad_norm": NaN, "learning_rate": 0.0001900968867902419, "loss": 0.0, "step": 35 }, { "epoch": 0.007869336229686252, "grad_norm": NaN, "learning_rate": 0.00018875857890045543, "loss": 0.0, "step": 36 }, { "epoch": 0.00808792890273309, "grad_norm": NaN, "learning_rate": 0.00018734082006171299, "loss": 0.0, "step": 37 }, { "epoch": 0.008306521575779931, "grad_norm": NaN, "learning_rate": 0.00018584487936018661, "loss": 0.0, "step": 38 }, { "epoch": 0.008525114248826772, "grad_norm": NaN, "learning_rate": 0.0001842720958654039, "loss": 0.0, "step": 39 }, { "epoch": 0.008743706921873613, "grad_norm": NaN, "learning_rate": 0.0001826238774315995, "loss": 0.0, "step": 40 }, { "epoch": 0.008962299594920453, "grad_norm": NaN, "learning_rate": 0.00018090169943749476, "loss": 0.0, "step": 41 }, { "epoch": 0.009180892267967294, "grad_norm": NaN, "learning_rate": 0.00017910710346563416, "loss": 0.0, "step": 42 }, { "epoch": 0.009399484941014133, "grad_norm": NaN, "learning_rate": 0.00017724169592245995, "loss": 0.0, "step": 43 }, { "epoch": 0.009618077614060973, "grad_norm": NaN, "learning_rate": 0.00017530714660036112, "loss": 0.0, "step": 44 }, { "epoch": 0.009836670287107814, "grad_norm": NaN, "learning_rate": 0.00017330518718298264, "loss": 0.0, "step": 45 }, { "epoch": 0.010055262960154654, "grad_norm": NaN, "learning_rate": 0.0001712376096951345, "loss": 0.0, "step": 46 }, { "epoch": 0.010273855633201495, "grad_norm": NaN, "learning_rate": 0.00016910626489868649, "loss": 0.0, "step": 47 }, { "epoch": 0.010492448306248335, "grad_norm": NaN, "learning_rate": 0.00016691306063588583, "loss": 0.0, "step": 48 }, { "epoch": 0.010711040979295176, "grad_norm": NaN, "learning_rate": 0.00016465996012157995, "loss": 0.0, "step": 49 }, { "epoch": 0.010929633652342015, "grad_norm": NaN, "learning_rate": 0.00016234898018587337, "loss": 0.0, "step": 50 }, { "epoch": 0.010929633652342015, "eval_loss": NaN, "eval_runtime": 3709.6115, "eval_samples_per_second": 4.154, "eval_steps_per_second": 2.077, "step": 50 }, { "epoch": 0.011148226325388855, "grad_norm": NaN, "learning_rate": 0.00015998218946879138, "loss": 0.0, "step": 51 }, { "epoch": 0.011366818998435696, "grad_norm": NaN, "learning_rate": 0.00015756170656856737, "loss": 0.0, "step": 52 }, { "epoch": 0.011585411671482537, "grad_norm": NaN, "learning_rate": 0.00015508969814521025, "loss": 0.0, "step": 53 }, { "epoch": 0.011804004344529377, "grad_norm": NaN, "learning_rate": 0.00015256837698105047, "loss": 0.0, "step": 54 }, { "epoch": 0.012022597017576218, "grad_norm": NaN, "learning_rate": 0.00015000000000000001, "loss": 0.0, "step": 55 }, { "epoch": 0.012241189690623058, "grad_norm": NaN, "learning_rate": 0.00014738686624729986, "loss": 0.0, "step": 56 }, { "epoch": 0.012459782363669897, "grad_norm": NaN, "learning_rate": 0.00014473131483156327, "loss": 0.0, "step": 57 }, { "epoch": 0.012678375036716738, "grad_norm": NaN, "learning_rate": 0.00014203572283095657, "loss": 0.0, "step": 58 }, { "epoch": 0.012896967709763578, "grad_norm": NaN, "learning_rate": 0.00013930250316539238, "loss": 0.0, "step": 59 }, { "epoch": 0.013115560382810419, "grad_norm": NaN, "learning_rate": 0.00013653410243663952, "loss": 0.0, "step": 60 }, { "epoch": 0.01333415305585726, "grad_norm": NaN, "learning_rate": 0.00013373299873828303, "loss": 0.0, "step": 61 }, { "epoch": 0.0135527457289041, "grad_norm": NaN, "learning_rate": 0.00013090169943749476, "loss": 0.0, "step": 62 }, { "epoch": 0.013771338401950939, "grad_norm": NaN, "learning_rate": 0.00012804273893060028, "loss": 0.0, "step": 63 }, { "epoch": 0.01398993107499778, "grad_norm": NaN, "learning_rate": 0.00012515867637445086, "loss": 0.0, "step": 64 }, { "epoch": 0.01420852374804462, "grad_norm": NaN, "learning_rate": 0.00012225209339563145, "loss": 0.0, "step": 65 }, { "epoch": 0.01442711642109146, "grad_norm": NaN, "learning_rate": 0.00011932559177955533, "loss": 0.0, "step": 66 }, { "epoch": 0.014645709094138301, "grad_norm": NaN, "learning_rate": 0.00011638179114151377, "loss": 0.0, "step": 67 }, { "epoch": 0.014864301767185142, "grad_norm": NaN, "learning_rate": 0.00011342332658176555, "loss": 0.0, "step": 68 }, { "epoch": 0.015082894440231982, "grad_norm": NaN, "learning_rate": 0.00011045284632676536, "loss": 0.0, "step": 69 }, { "epoch": 0.015301487113278821, "grad_norm": NaN, "learning_rate": 0.00010747300935864243, "loss": 0.0, "step": 70 }, { "epoch": 0.015520079786325662, "grad_norm": NaN, "learning_rate": 0.00010448648303505151, "loss": 0.0, "step": 71 }, { "epoch": 0.015738672459372504, "grad_norm": NaN, "learning_rate": 0.00010149594070152638, "loss": 0.0, "step": 72 }, { "epoch": 0.01595726513241934, "grad_norm": NaN, "learning_rate": 9.850405929847366e-05, "loss": 0.0, "step": 73 }, { "epoch": 0.01617585780546618, "grad_norm": NaN, "learning_rate": 9.551351696494854e-05, "loss": 0.0, "step": 74 }, { "epoch": 0.016394450478513022, "grad_norm": NaN, "learning_rate": 9.252699064135758e-05, "loss": 0.0, "step": 75 }, { "epoch": 0.016394450478513022, "eval_loss": NaN, "eval_runtime": 3699.4005, "eval_samples_per_second": 4.166, "eval_steps_per_second": 2.083, "step": 75 }, { "epoch": 0.016613043151559863, "grad_norm": NaN, "learning_rate": 8.954715367323468e-05, "loss": 0.0, "step": 76 }, { "epoch": 0.016831635824606703, "grad_norm": NaN, "learning_rate": 8.657667341823448e-05, "loss": 0.0, "step": 77 }, { "epoch": 0.017050228497653544, "grad_norm": NaN, "learning_rate": 8.361820885848624e-05, "loss": 0.0, "step": 78 }, { "epoch": 0.017268821170700385, "grad_norm": NaN, "learning_rate": 8.067440822044469e-05, "loss": 0.0, "step": 79 }, { "epoch": 0.017487413843747225, "grad_norm": NaN, "learning_rate": 7.774790660436858e-05, "loss": 0.0, "step": 80 }, { "epoch": 0.017706006516794066, "grad_norm": NaN, "learning_rate": 7.484132362554915e-05, "loss": 0.0, "step": 81 }, { "epoch": 0.017924599189840906, "grad_norm": NaN, "learning_rate": 7.195726106939974e-05, "loss": 0.0, "step": 82 }, { "epoch": 0.018143191862887747, "grad_norm": NaN, "learning_rate": 6.909830056250527e-05, "loss": 0.0, "step": 83 }, { "epoch": 0.018361784535934587, "grad_norm": NaN, "learning_rate": 6.626700126171702e-05, "loss": 0.0, "step": 84 }, { "epoch": 0.018580377208981428, "grad_norm": NaN, "learning_rate": 6.34658975633605e-05, "loss": 0.0, "step": 85 }, { "epoch": 0.018798969882028265, "grad_norm": NaN, "learning_rate": 6.069749683460765e-05, "loss": 0.0, "step": 86 }, { "epoch": 0.019017562555075106, "grad_norm": NaN, "learning_rate": 5.796427716904347e-05, "loss": 0.0, "step": 87 }, { "epoch": 0.019236155228121946, "grad_norm": NaN, "learning_rate": 5.526868516843673e-05, "loss": 0.0, "step": 88 }, { "epoch": 0.019454747901168787, "grad_norm": NaN, "learning_rate": 5.261313375270014e-05, "loss": 0.0, "step": 89 }, { "epoch": 0.019673340574215627, "grad_norm": NaN, "learning_rate": 5.000000000000002e-05, "loss": 0.0, "step": 90 }, { "epoch": 0.019891933247262468, "grad_norm": NaN, "learning_rate": 4.743162301894952e-05, "loss": 0.0, "step": 91 }, { "epoch": 0.02011052592030931, "grad_norm": NaN, "learning_rate": 4.491030185478976e-05, "loss": 0.0, "step": 92 }, { "epoch": 0.02032911859335615, "grad_norm": NaN, "learning_rate": 4.2438293431432665e-05, "loss": 0.0, "step": 93 }, { "epoch": 0.02054771126640299, "grad_norm": NaN, "learning_rate": 4.001781053120863e-05, "loss": 0.0, "step": 94 }, { "epoch": 0.02076630393944983, "grad_norm": NaN, "learning_rate": 3.7651019814126654e-05, "loss": 0.0, "step": 95 }, { "epoch": 0.02098489661249667, "grad_norm": NaN, "learning_rate": 3.534003987842005e-05, "loss": 0.0, "step": 96 }, { "epoch": 0.02120348928554351, "grad_norm": NaN, "learning_rate": 3.308693936411421e-05, "loss": 0.0, "step": 97 }, { "epoch": 0.021422081958590352, "grad_norm": NaN, "learning_rate": 3.089373510131354e-05, "loss": 0.0, "step": 98 }, { "epoch": 0.02164067463163719, "grad_norm": NaN, "learning_rate": 2.876239030486554e-05, "loss": 0.0, "step": 99 }, { "epoch": 0.02185926730468403, "grad_norm": NaN, "learning_rate": 2.669481281701739e-05, "loss": 0.0, "step": 100 }, { "epoch": 0.02185926730468403, "eval_loss": NaN, "eval_runtime": 3094.1875, "eval_samples_per_second": 4.98, "eval_steps_per_second": 2.49, "step": 100 }, { "epoch": 0.02207785997773087, "grad_norm": NaN, "learning_rate": 2.4692853399638917e-05, "loss": 0.0, "step": 101 }, { "epoch": 0.02229645265077771, "grad_norm": NaN, "learning_rate": 2.275830407754006e-05, "loss": 0.0, "step": 102 }, { "epoch": 0.02251504532382455, "grad_norm": NaN, "learning_rate": 2.0892896534365904e-05, "loss": 0.0, "step": 103 }, { "epoch": 0.022733637996871392, "grad_norm": NaN, "learning_rate": 1.9098300562505266e-05, "loss": 0.0, "step": 104 }, { "epoch": 0.022952230669918233, "grad_norm": NaN, "learning_rate": 1.7376122568400532e-05, "loss": 0.0, "step": 105 }, { "epoch": 0.023170823342965073, "grad_norm": NaN, "learning_rate": 1.5727904134596083e-05, "loss": 0.0, "step": 106 }, { "epoch": 0.023389416016011914, "grad_norm": NaN, "learning_rate": 1.415512063981339e-05, "loss": 0.0, "step": 107 }, { "epoch": 0.023608008689058754, "grad_norm": NaN, "learning_rate": 1.2659179938287035e-05, "loss": 0.0, "step": 108 }, { "epoch": 0.023826601362105595, "grad_norm": NaN, "learning_rate": 1.124142109954459e-05, "loss": 0.0, "step": 109 }, { "epoch": 0.024045194035152435, "grad_norm": NaN, "learning_rate": 9.903113209758096e-06, "loss": 0.0, "step": 110 }, { "epoch": 0.024263786708199276, "grad_norm": NaN, "learning_rate": 8.645454235739903e-06, "loss": 0.0, "step": 111 }, { "epoch": 0.024482379381246117, "grad_norm": NaN, "learning_rate": 7.46956995260033e-06, "loss": 0.0, "step": 112 }, { "epoch": 0.024700972054292954, "grad_norm": NaN, "learning_rate": 6.37651293602628e-06, "loss": 0.0, "step": 113 }, { "epoch": 0.024919564727339794, "grad_norm": NaN, "learning_rate": 5.367261620083575e-06, "loss": 0.0, "step": 114 }, { "epoch": 0.025138157400386635, "grad_norm": NaN, "learning_rate": 4.442719421385922e-06, "loss": 0.0, "step": 115 }, { "epoch": 0.025356750073433475, "grad_norm": NaN, "learning_rate": 3.6037139304146762e-06, "loss": 0.0, "step": 116 }, { "epoch": 0.025575342746480316, "grad_norm": NaN, "learning_rate": 2.8509961707132494e-06, "loss": 0.0, "step": 117 }, { "epoch": 0.025793935419527157, "grad_norm": NaN, "learning_rate": 2.1852399266194314e-06, "loss": 0.0, "step": 118 }, { "epoch": 0.026012528092573997, "grad_norm": NaN, "learning_rate": 1.6070411401370334e-06, "loss": 0.0, "step": 119 }, { "epoch": 0.026231120765620838, "grad_norm": NaN, "learning_rate": 1.1169173774871478e-06, "loss": 0.0, "step": 120 }, { "epoch": 0.02644971343866768, "grad_norm": NaN, "learning_rate": 7.153073658162646e-07, "loss": 0.0, "step": 121 }, { "epoch": 0.02666830611171452, "grad_norm": NaN, "learning_rate": 4.025706004760932e-07, "loss": 0.0, "step": 122 }, { "epoch": 0.02688689878476136, "grad_norm": NaN, "learning_rate": 1.7898702322648453e-07, "loss": 0.0, "step": 123 }, { "epoch": 0.0271054914578082, "grad_norm": NaN, "learning_rate": 4.475677164966774e-08, "loss": 0.0, "step": 124 }, { "epoch": 0.02732408413085504, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.0, "step": 125 }, { "epoch": 0.02732408413085504, "eval_loss": NaN, "eval_runtime": 3732.9812, "eval_samples_per_second": 4.128, "eval_steps_per_second": 2.064, "step": 125 } ], "logging_steps": 1, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 11, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.46010452164608e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }