{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.504065040650406, "eval_steps": 500, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 0.6131871408580806, "learning_rate": 4.9980094094149945e-05, "loss": 1.047, "step": 25 }, { "epoch": 0.2, "grad_norm": 0.515709399418576, "learning_rate": 4.992040807620678e-05, "loss": 0.9666, "step": 50 }, { "epoch": 0.3, "grad_norm": 0.38345995255280807, "learning_rate": 4.982103699451082e-05, "loss": 0.8994, "step": 75 }, { "epoch": 0.41, "grad_norm": 0.4106925564227259, "learning_rate": 4.968213909477376e-05, "loss": 0.8928, "step": 100 }, { "epoch": 0.51, "grad_norm": 0.46245157217806615, "learning_rate": 4.950393556807682e-05, "loss": 0.891, "step": 125 }, { "epoch": 0.61, "grad_norm": 0.46965485639834587, "learning_rate": 4.928671019862995e-05, "loss": 0.8741, "step": 150 }, { "epoch": 0.71, "grad_norm": 0.5208840886647297, "learning_rate": 4.903080891185335e-05, "loss": 0.8649, "step": 175 }, { "epoch": 0.81, "grad_norm": 0.502935818668578, "learning_rate": 4.873663922350073e-05, "loss": 0.8713, "step": 200 }, { "epoch": 0.91, "grad_norm": 0.5664576246853419, "learning_rate": 4.840466959070174e-05, "loss": 0.8289, "step": 225 }, { "epoch": 1.02, "grad_norm": 0.5222205590215429, "learning_rate": 4.8035428665956806e-05, "loss": 0.8426, "step": 250 }, { "epoch": 1.12, "grad_norm": 0.5979794941397252, "learning_rate": 4.762950445527264e-05, "loss": 0.8426, "step": 275 }, { "epoch": 1.22, "grad_norm": 0.6574658282987255, "learning_rate": 4.7187543381778864e-05, "loss": 0.8445, "step": 300 }, { "epoch": 1.32, "grad_norm": 0.6208868160129577, "learning_rate": 4.671024925631694e-05, "loss": 0.8405, "step": 325 }, { "epoch": 1.42, "grad_norm": 0.6261313580655511, "learning_rate": 4.619838215664082e-05, "loss": 0.8177, "step": 350 }, { "epoch": 1.52, "grad_norm": 0.6202678860432359, "learning_rate": 4.5652757217013995e-05, "loss": 0.8192, "step": 375 }, { "epoch": 1.63, "grad_norm": 0.7078761621579428, "learning_rate": 4.507424333013069e-05, "loss": 0.8215, "step": 400 }, { "epoch": 1.73, "grad_norm": 0.7653619566673501, "learning_rate": 4.4463761763428125e-05, "loss": 0.8308, "step": 425 }, { "epoch": 1.83, "grad_norm": 0.6453208813641874, "learning_rate": 4.38222846919935e-05, "loss": 0.8272, "step": 450 }, { "epoch": 1.93, "grad_norm": 0.6377834929128101, "learning_rate": 4.315083365040192e-05, "loss": 0.8248, "step": 475 }, { "epoch": 2.03, "grad_norm": 0.770141823944302, "learning_rate": 4.245047790595075e-05, "loss": 0.8284, "step": 500 }, { "epoch": 2.13, "grad_norm": 0.721253950175735, "learning_rate": 4.172233275588082e-05, "loss": 0.8229, "step": 525 }, { "epoch": 2.24, "grad_norm": 0.6231476906453041, "learning_rate": 4.0967557751296336e-05, "loss": 0.8089, "step": 550 }, { "epoch": 2.34, "grad_norm": 0.8268572656351973, "learning_rate": 4.0187354850611636e-05, "loss": 0.8028, "step": 575 }, { "epoch": 2.44, "grad_norm": 0.7854716225601517, "learning_rate": 3.938296650546552e-05, "loss": 0.8142, "step": 600 }, { "epoch": 2.54, "grad_norm": 0.6745304491885583, "learning_rate": 3.8555673682151215e-05, "loss": 0.8098, "step": 625 }, { "epoch": 2.64, "grad_norm": 0.8428250770310277, "learning_rate": 3.7706793821712826e-05, "loss": 0.8063, "step": 650 }, { "epoch": 2.74, "grad_norm": 0.7050077227945795, "learning_rate": 3.683767874195674e-05, "loss": 0.8053, "step": 675 }, { "epoch": 2.85, "grad_norm": 0.7324375980300449, "learning_rate": 3.5949712484719014e-05, "loss": 0.8003, "step": 700 }, { "epoch": 2.95, "grad_norm": 0.7448120924614113, "learning_rate": 3.5044309111816796e-05, "loss": 0.7983, "step": 725 }, { "epoch": 3.05, "grad_norm": 0.7846948483796882, "learning_rate": 3.4122910453193885e-05, "loss": 0.8005, "step": 750 }, { "epoch": 3.15, "grad_norm": 0.7611752532437104, "learning_rate": 3.318698381084619e-05, "loss": 0.8002, "step": 775 }, { "epoch": 3.25, "grad_norm": 0.7881990924188006, "learning_rate": 3.223801962218372e-05, "loss": 0.7976, "step": 800 }, { "epoch": 3.35, "grad_norm": 0.7610818779512661, "learning_rate": 3.127752908655004e-05, "loss": 0.7965, "step": 825 }, { "epoch": 3.46, "grad_norm": 0.7180364283278721, "learning_rate": 3.0307041758678932e-05, "loss": 0.7876, "step": 850 }, { "epoch": 3.56, "grad_norm": 0.9309526041694146, "learning_rate": 2.932810311292058e-05, "loss": 0.7957, "step": 875 }, { "epoch": 3.66, "grad_norm": 0.7828785008783581, "learning_rate": 2.834227208211621e-05, "loss": 0.793, "step": 900 }, { "epoch": 3.76, "grad_norm": 0.7581881527652482, "learning_rate": 2.7351118575040496e-05, "loss": 0.7808, "step": 925 }, { "epoch": 3.86, "grad_norm": 0.8188308134420089, "learning_rate": 2.635622097636501e-05, "loss": 0.8139, "step": 950 }, { "epoch": 3.96, "grad_norm": 0.8386150171289235, "learning_rate": 2.535916363312414e-05, "loss": 0.7902, "step": 975 }, { "epoch": 4.07, "grad_norm": 0.39693696134084294, "learning_rate": 2.4361534331686003e-05, "loss": 0.7851, "step": 1000 }, { "epoch": 4.17, "grad_norm": 0.8435295350294197, "learning_rate": 2.3364921769246423e-05, "loss": 0.786, "step": 1025 }, { "epoch": 4.27, "grad_norm": 0.878180623936396, "learning_rate": 2.2370913023872355e-05, "loss": 0.7824, "step": 1050 }, { "epoch": 4.37, "grad_norm": 0.8477688393540482, "learning_rate": 2.138109102712376e-05, "loss": 0.7917, "step": 1075 }, { "epoch": 4.47, "grad_norm": 1.0116402877637123, "learning_rate": 2.0397032043278687e-05, "loss": 0.7952, "step": 1100 }, { "epoch": 4.57, "grad_norm": 0.7542753798456995, "learning_rate": 1.9420303159175796e-05, "loss": 0.7794, "step": 1125 }, { "epoch": 4.67, "grad_norm": 0.8847241202927656, "learning_rate": 1.8452459788671738e-05, "loss": 0.7771, "step": 1150 }, { "epoch": 4.78, "grad_norm": 0.9824412667192655, "learning_rate": 1.7495043195687368e-05, "loss": 0.7803, "step": 1175 }, { "epoch": 4.88, "grad_norm": 0.8602734894405721, "learning_rate": 1.6549578039787436e-05, "loss": 0.7917, "step": 1200 }, { "epoch": 4.98, "grad_norm": 0.8666836068688513, "learning_rate": 1.561756994820216e-05, "loss": 0.788, "step": 1225 }, { "epoch": 5.08, "grad_norm": 0.9258527307819997, "learning_rate": 1.470050311815736e-05, "loss": 0.7705, "step": 1250 }, { "epoch": 5.18, "grad_norm": 0.8495623021085815, "learning_rate": 1.379983795333119e-05, "loss": 0.7681, "step": 1275 }, { "epoch": 5.28, "grad_norm": 0.8867986411348584, "learning_rate": 1.2917008738201537e-05, "loss": 0.7777, "step": 1300 }, { "epoch": 5.39, "grad_norm": 0.8823017587324554, "learning_rate": 1.2053421353987437e-05, "loss": 0.771, "step": 1325 }, { "epoch": 5.49, "grad_norm": 0.9090926331206884, "learning_rate": 1.1210451039821965e-05, "loss": 0.7955, "step": 1350 }, { "epoch": 5.59, "grad_norm": 0.9340530184318201, "learning_rate": 1.0389440202721778e-05, "loss": 0.7816, "step": 1375 }, { "epoch": 5.69, "grad_norm": 0.9458040444506768, "learning_rate": 9.591696279840906e-06, "loss": 0.7824, "step": 1400 }, { "epoch": 5.79, "grad_norm": 0.9319742846682855, "learning_rate": 8.818489656413043e-06, "loss": 0.7725, "step": 1425 }, { "epoch": 5.89, "grad_norm": 0.9287287760402508, "learning_rate": 8.071051642698074e-06, "loss": 0.7852, "step": 1450 }, { "epoch": 6.0, "grad_norm": 0.9386074844778828, "learning_rate": 7.350572513154377e-06, "loss": 0.7832, "step": 1475 }, { "epoch": 6.1, "grad_norm": 0.9144394155026029, "learning_rate": 6.658199610959537e-06, "loss": 0.7887, "step": 1500 }, { "epoch": 6.2, "grad_norm": 0.8674048478962856, "learning_rate": 5.995035520897882e-06, "loss": 0.7833, "step": 1525 }, { "epoch": 6.3, "grad_norm": 0.9595278814551851, "learning_rate": 5.362136313524607e-06, "loss": 0.7627, "step": 1550 }, { "epoch": 6.4, "grad_norm": 0.9242690658583879, "learning_rate": 4.760509863402468e-06, "loss": 0.7705, "step": 1575 }, { "epoch": 6.5, "grad_norm": 0.9233544033053771, "learning_rate": 4.19111424408932e-06, "loss": 0.7709, "step": 1600 } ], "logging_steps": 25, "max_steps": 1968, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 400, "total_flos": 488382231740416.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }