|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.504065040650406, |
|
"eval_steps": 500, |
|
"global_step": 1600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6131871408580806, |
|
"learning_rate": 4.9980094094149945e-05, |
|
"loss": 1.047, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.515709399418576, |
|
"learning_rate": 4.992040807620678e-05, |
|
"loss": 0.9666, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.38345995255280807, |
|
"learning_rate": 4.982103699451082e-05, |
|
"loss": 0.8994, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.4106925564227259, |
|
"learning_rate": 4.968213909477376e-05, |
|
"loss": 0.8928, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.46245157217806615, |
|
"learning_rate": 4.950393556807682e-05, |
|
"loss": 0.891, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.46965485639834587, |
|
"learning_rate": 4.928671019862995e-05, |
|
"loss": 0.8741, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5208840886647297, |
|
"learning_rate": 4.903080891185335e-05, |
|
"loss": 0.8649, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.502935818668578, |
|
"learning_rate": 4.873663922350073e-05, |
|
"loss": 0.8713, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5664576246853419, |
|
"learning_rate": 4.840466959070174e-05, |
|
"loss": 0.8289, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.5222205590215429, |
|
"learning_rate": 4.8035428665956806e-05, |
|
"loss": 0.8426, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.5979794941397252, |
|
"learning_rate": 4.762950445527264e-05, |
|
"loss": 0.8426, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.6574658282987255, |
|
"learning_rate": 4.7187543381778864e-05, |
|
"loss": 0.8445, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.6208868160129577, |
|
"learning_rate": 4.671024925631694e-05, |
|
"loss": 0.8405, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.6261313580655511, |
|
"learning_rate": 4.619838215664082e-05, |
|
"loss": 0.8177, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.6202678860432359, |
|
"learning_rate": 4.5652757217013995e-05, |
|
"loss": 0.8192, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.7078761621579428, |
|
"learning_rate": 4.507424333013069e-05, |
|
"loss": 0.8215, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.7653619566673501, |
|
"learning_rate": 4.4463761763428125e-05, |
|
"loss": 0.8308, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.6453208813641874, |
|
"learning_rate": 4.38222846919935e-05, |
|
"loss": 0.8272, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.6377834929128101, |
|
"learning_rate": 4.315083365040192e-05, |
|
"loss": 0.8248, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.770141823944302, |
|
"learning_rate": 4.245047790595075e-05, |
|
"loss": 0.8284, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.721253950175735, |
|
"learning_rate": 4.172233275588082e-05, |
|
"loss": 0.8229, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.6231476906453041, |
|
"learning_rate": 4.0967557751296336e-05, |
|
"loss": 0.8089, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.8268572656351973, |
|
"learning_rate": 4.0187354850611636e-05, |
|
"loss": 0.8028, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.7854716225601517, |
|
"learning_rate": 3.938296650546552e-05, |
|
"loss": 0.8142, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.6745304491885583, |
|
"learning_rate": 3.8555673682151215e-05, |
|
"loss": 0.8098, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.8428250770310277, |
|
"learning_rate": 3.7706793821712826e-05, |
|
"loss": 0.8063, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.7050077227945795, |
|
"learning_rate": 3.683767874195674e-05, |
|
"loss": 0.8053, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.7324375980300449, |
|
"learning_rate": 3.5949712484719014e-05, |
|
"loss": 0.8003, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.7448120924614113, |
|
"learning_rate": 3.5044309111816796e-05, |
|
"loss": 0.7983, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.7846948483796882, |
|
"learning_rate": 3.4122910453193885e-05, |
|
"loss": 0.8005, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.7611752532437104, |
|
"learning_rate": 3.318698381084619e-05, |
|
"loss": 0.8002, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.7881990924188006, |
|
"learning_rate": 3.223801962218372e-05, |
|
"loss": 0.7976, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.7610818779512661, |
|
"learning_rate": 3.127752908655004e-05, |
|
"loss": 0.7965, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.7180364283278721, |
|
"learning_rate": 3.0307041758678932e-05, |
|
"loss": 0.7876, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.9309526041694146, |
|
"learning_rate": 2.932810311292058e-05, |
|
"loss": 0.7957, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.7828785008783581, |
|
"learning_rate": 2.834227208211621e-05, |
|
"loss": 0.793, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7581881527652482, |
|
"learning_rate": 2.7351118575040496e-05, |
|
"loss": 0.7808, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.8188308134420089, |
|
"learning_rate": 2.635622097636501e-05, |
|
"loss": 0.8139, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.8386150171289235, |
|
"learning_rate": 2.535916363312414e-05, |
|
"loss": 0.7902, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.39693696134084294, |
|
"learning_rate": 2.4361534331686003e-05, |
|
"loss": 0.7851, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.8435295350294197, |
|
"learning_rate": 2.3364921769246423e-05, |
|
"loss": 0.786, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.878180623936396, |
|
"learning_rate": 2.2370913023872355e-05, |
|
"loss": 0.7824, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.8477688393540482, |
|
"learning_rate": 2.138109102712376e-05, |
|
"loss": 0.7917, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 1.0116402877637123, |
|
"learning_rate": 2.0397032043278687e-05, |
|
"loss": 0.7952, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.7542753798456995, |
|
"learning_rate": 1.9420303159175796e-05, |
|
"loss": 0.7794, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.8847241202927656, |
|
"learning_rate": 1.8452459788671738e-05, |
|
"loss": 0.7771, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.9824412667192655, |
|
"learning_rate": 1.7495043195687368e-05, |
|
"loss": 0.7803, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.8602734894405721, |
|
"learning_rate": 1.6549578039787436e-05, |
|
"loss": 0.7917, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.8666836068688513, |
|
"learning_rate": 1.561756994820216e-05, |
|
"loss": 0.788, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 0.9258527307819997, |
|
"learning_rate": 1.470050311815736e-05, |
|
"loss": 0.7705, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 0.8495623021085815, |
|
"learning_rate": 1.379983795333119e-05, |
|
"loss": 0.7681, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 0.8867986411348584, |
|
"learning_rate": 1.2917008738201537e-05, |
|
"loss": 0.7777, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.39, |
|
"grad_norm": 0.8823017587324554, |
|
"learning_rate": 1.2053421353987437e-05, |
|
"loss": 0.771, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 0.9090926331206884, |
|
"learning_rate": 1.1210451039821965e-05, |
|
"loss": 0.7955, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 0.9340530184318201, |
|
"learning_rate": 1.0389440202721778e-05, |
|
"loss": 0.7816, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"grad_norm": 0.9458040444506768, |
|
"learning_rate": 9.591696279840906e-06, |
|
"loss": 0.7824, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"grad_norm": 0.9319742846682855, |
|
"learning_rate": 8.818489656413043e-06, |
|
"loss": 0.7725, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"grad_norm": 0.9287287760402508, |
|
"learning_rate": 8.071051642698074e-06, |
|
"loss": 0.7852, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.9386074844778828, |
|
"learning_rate": 7.350572513154377e-06, |
|
"loss": 0.7832, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 0.9144394155026029, |
|
"learning_rate": 6.658199610959537e-06, |
|
"loss": 0.7887, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 0.8674048478962856, |
|
"learning_rate": 5.995035520897882e-06, |
|
"loss": 0.7833, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 0.9595278814551851, |
|
"learning_rate": 5.362136313524607e-06, |
|
"loss": 0.7627, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.9242690658583879, |
|
"learning_rate": 4.760509863402468e-06, |
|
"loss": 0.7705, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.9233544033053771, |
|
"learning_rate": 4.19111424408932e-06, |
|
"loss": 0.7709, |
|
"step": 1600 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1968, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 400, |
|
"total_flos": 488382231740416.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|