{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.504065040650406,
"eval_steps": 500,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1,
"grad_norm": 0.6131871408580806,
"learning_rate": 4.9980094094149945e-05,
"loss": 1.047,
"step": 25
},
{
"epoch": 0.2,
"grad_norm": 0.515709399418576,
"learning_rate": 4.992040807620678e-05,
"loss": 0.9666,
"step": 50
},
{
"epoch": 0.3,
"grad_norm": 0.38345995255280807,
"learning_rate": 4.982103699451082e-05,
"loss": 0.8994,
"step": 75
},
{
"epoch": 0.41,
"grad_norm": 0.4106925564227259,
"learning_rate": 4.968213909477376e-05,
"loss": 0.8928,
"step": 100
},
{
"epoch": 0.51,
"grad_norm": 0.46245157217806615,
"learning_rate": 4.950393556807682e-05,
"loss": 0.891,
"step": 125
},
{
"epoch": 0.61,
"grad_norm": 0.46965485639834587,
"learning_rate": 4.928671019862995e-05,
"loss": 0.8741,
"step": 150
},
{
"epoch": 0.71,
"grad_norm": 0.5208840886647297,
"learning_rate": 4.903080891185335e-05,
"loss": 0.8649,
"step": 175
},
{
"epoch": 0.81,
"grad_norm": 0.502935818668578,
"learning_rate": 4.873663922350073e-05,
"loss": 0.8713,
"step": 200
},
{
"epoch": 0.91,
"grad_norm": 0.5664576246853419,
"learning_rate": 4.840466959070174e-05,
"loss": 0.8289,
"step": 225
},
{
"epoch": 1.02,
"grad_norm": 0.5222205590215429,
"learning_rate": 4.8035428665956806e-05,
"loss": 0.8426,
"step": 250
},
{
"epoch": 1.12,
"grad_norm": 0.5979794941397252,
"learning_rate": 4.762950445527264e-05,
"loss": 0.8426,
"step": 275
},
{
"epoch": 1.22,
"grad_norm": 0.6574658282987255,
"learning_rate": 4.7187543381778864e-05,
"loss": 0.8445,
"step": 300
},
{
"epoch": 1.32,
"grad_norm": 0.6208868160129577,
"learning_rate": 4.671024925631694e-05,
"loss": 0.8405,
"step": 325
},
{
"epoch": 1.42,
"grad_norm": 0.6261313580655511,
"learning_rate": 4.619838215664082e-05,
"loss": 0.8177,
"step": 350
},
{
"epoch": 1.52,
"grad_norm": 0.6202678860432359,
"learning_rate": 4.5652757217013995e-05,
"loss": 0.8192,
"step": 375
},
{
"epoch": 1.63,
"grad_norm": 0.7078761621579428,
"learning_rate": 4.507424333013069e-05,
"loss": 0.8215,
"step": 400
},
{
"epoch": 1.73,
"grad_norm": 0.7653619566673501,
"learning_rate": 4.4463761763428125e-05,
"loss": 0.8308,
"step": 425
},
{
"epoch": 1.83,
"grad_norm": 0.6453208813641874,
"learning_rate": 4.38222846919935e-05,
"loss": 0.8272,
"step": 450
},
{
"epoch": 1.93,
"grad_norm": 0.6377834929128101,
"learning_rate": 4.315083365040192e-05,
"loss": 0.8248,
"step": 475
},
{
"epoch": 2.03,
"grad_norm": 0.770141823944302,
"learning_rate": 4.245047790595075e-05,
"loss": 0.8284,
"step": 500
},
{
"epoch": 2.13,
"grad_norm": 0.721253950175735,
"learning_rate": 4.172233275588082e-05,
"loss": 0.8229,
"step": 525
},
{
"epoch": 2.24,
"grad_norm": 0.6231476906453041,
"learning_rate": 4.0967557751296336e-05,
"loss": 0.8089,
"step": 550
},
{
"epoch": 2.34,
"grad_norm": 0.8268572656351973,
"learning_rate": 4.0187354850611636e-05,
"loss": 0.8028,
"step": 575
},
{
"epoch": 2.44,
"grad_norm": 0.7854716225601517,
"learning_rate": 3.938296650546552e-05,
"loss": 0.8142,
"step": 600
},
{
"epoch": 2.54,
"grad_norm": 0.6745304491885583,
"learning_rate": 3.8555673682151215e-05,
"loss": 0.8098,
"step": 625
},
{
"epoch": 2.64,
"grad_norm": 0.8428250770310277,
"learning_rate": 3.7706793821712826e-05,
"loss": 0.8063,
"step": 650
},
{
"epoch": 2.74,
"grad_norm": 0.7050077227945795,
"learning_rate": 3.683767874195674e-05,
"loss": 0.8053,
"step": 675
},
{
"epoch": 2.85,
"grad_norm": 0.7324375980300449,
"learning_rate": 3.5949712484719014e-05,
"loss": 0.8003,
"step": 700
},
{
"epoch": 2.95,
"grad_norm": 0.7448120924614113,
"learning_rate": 3.5044309111816796e-05,
"loss": 0.7983,
"step": 725
},
{
"epoch": 3.05,
"grad_norm": 0.7846948483796882,
"learning_rate": 3.4122910453193885e-05,
"loss": 0.8005,
"step": 750
},
{
"epoch": 3.15,
"grad_norm": 0.7611752532437104,
"learning_rate": 3.318698381084619e-05,
"loss": 0.8002,
"step": 775
},
{
"epoch": 3.25,
"grad_norm": 0.7881990924188006,
"learning_rate": 3.223801962218372e-05,
"loss": 0.7976,
"step": 800
},
{
"epoch": 3.35,
"grad_norm": 0.7610818779512661,
"learning_rate": 3.127752908655004e-05,
"loss": 0.7965,
"step": 825
},
{
"epoch": 3.46,
"grad_norm": 0.7180364283278721,
"learning_rate": 3.0307041758678932e-05,
"loss": 0.7876,
"step": 850
},
{
"epoch": 3.56,
"grad_norm": 0.9309526041694146,
"learning_rate": 2.932810311292058e-05,
"loss": 0.7957,
"step": 875
},
{
"epoch": 3.66,
"grad_norm": 0.7828785008783581,
"learning_rate": 2.834227208211621e-05,
"loss": 0.793,
"step": 900
},
{
"epoch": 3.76,
"grad_norm": 0.7581881527652482,
"learning_rate": 2.7351118575040496e-05,
"loss": 0.7808,
"step": 925
},
{
"epoch": 3.86,
"grad_norm": 0.8188308134420089,
"learning_rate": 2.635622097636501e-05,
"loss": 0.8139,
"step": 950
},
{
"epoch": 3.96,
"grad_norm": 0.8386150171289235,
"learning_rate": 2.535916363312414e-05,
"loss": 0.7902,
"step": 975
},
{
"epoch": 4.07,
"grad_norm": 0.39693696134084294,
"learning_rate": 2.4361534331686003e-05,
"loss": 0.7851,
"step": 1000
},
{
"epoch": 4.17,
"grad_norm": 0.8435295350294197,
"learning_rate": 2.3364921769246423e-05,
"loss": 0.786,
"step": 1025
},
{
"epoch": 4.27,
"grad_norm": 0.878180623936396,
"learning_rate": 2.2370913023872355e-05,
"loss": 0.7824,
"step": 1050
},
{
"epoch": 4.37,
"grad_norm": 0.8477688393540482,
"learning_rate": 2.138109102712376e-05,
"loss": 0.7917,
"step": 1075
},
{
"epoch": 4.47,
"grad_norm": 1.0116402877637123,
"learning_rate": 2.0397032043278687e-05,
"loss": 0.7952,
"step": 1100
},
{
"epoch": 4.57,
"grad_norm": 0.7542753798456995,
"learning_rate": 1.9420303159175796e-05,
"loss": 0.7794,
"step": 1125
},
{
"epoch": 4.67,
"grad_norm": 0.8847241202927656,
"learning_rate": 1.8452459788671738e-05,
"loss": 0.7771,
"step": 1150
},
{
"epoch": 4.78,
"grad_norm": 0.9824412667192655,
"learning_rate": 1.7495043195687368e-05,
"loss": 0.7803,
"step": 1175
},
{
"epoch": 4.88,
"grad_norm": 0.8602734894405721,
"learning_rate": 1.6549578039787436e-05,
"loss": 0.7917,
"step": 1200
},
{
"epoch": 4.98,
"grad_norm": 0.8666836068688513,
"learning_rate": 1.561756994820216e-05,
"loss": 0.788,
"step": 1225
},
{
"epoch": 5.08,
"grad_norm": 0.9258527307819997,
"learning_rate": 1.470050311815736e-05,
"loss": 0.7705,
"step": 1250
},
{
"epoch": 5.18,
"grad_norm": 0.8495623021085815,
"learning_rate": 1.379983795333119e-05,
"loss": 0.7681,
"step": 1275
},
{
"epoch": 5.28,
"grad_norm": 0.8867986411348584,
"learning_rate": 1.2917008738201537e-05,
"loss": 0.7777,
"step": 1300
},
{
"epoch": 5.39,
"grad_norm": 0.8823017587324554,
"learning_rate": 1.2053421353987437e-05,
"loss": 0.771,
"step": 1325
},
{
"epoch": 5.49,
"grad_norm": 0.9090926331206884,
"learning_rate": 1.1210451039821965e-05,
"loss": 0.7955,
"step": 1350
},
{
"epoch": 5.59,
"grad_norm": 0.9340530184318201,
"learning_rate": 1.0389440202721778e-05,
"loss": 0.7816,
"step": 1375
},
{
"epoch": 5.69,
"grad_norm": 0.9458040444506768,
"learning_rate": 9.591696279840906e-06,
"loss": 0.7824,
"step": 1400
},
{
"epoch": 5.79,
"grad_norm": 0.9319742846682855,
"learning_rate": 8.818489656413043e-06,
"loss": 0.7725,
"step": 1425
},
{
"epoch": 5.89,
"grad_norm": 0.9287287760402508,
"learning_rate": 8.071051642698074e-06,
"loss": 0.7852,
"step": 1450
},
{
"epoch": 6.0,
"grad_norm": 0.9386074844778828,
"learning_rate": 7.350572513154377e-06,
"loss": 0.7832,
"step": 1475
},
{
"epoch": 6.1,
"grad_norm": 0.9144394155026029,
"learning_rate": 6.658199610959537e-06,
"loss": 0.7887,
"step": 1500
},
{
"epoch": 6.2,
"grad_norm": 0.8674048478962856,
"learning_rate": 5.995035520897882e-06,
"loss": 0.7833,
"step": 1525
},
{
"epoch": 6.3,
"grad_norm": 0.9595278814551851,
"learning_rate": 5.362136313524607e-06,
"loss": 0.7627,
"step": 1550
},
{
"epoch": 6.4,
"grad_norm": 0.9242690658583879,
"learning_rate": 4.760509863402468e-06,
"loss": 0.7705,
"step": 1575
},
{
"epoch": 6.5,
"grad_norm": 0.9233544033053771,
"learning_rate": 4.19111424408932e-06,
"loss": 0.7709,
"step": 1600
}
],
"logging_steps": 25,
"max_steps": 1968,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 400,
"total_flos": 488382231740416.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}