{
"best_metric": 0.5393198132514954,
"best_model_checkpoint": "/media/user/Expansion/flan-t5-small-ner/checkpoint-99955",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 99955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02501125506477915,
"grad_norm": 27.66358757019043,
"learning_rate": 4.974988744935221e-05,
"loss": 4.6267,
"num_input_tokens_seen": 1673336,
"step": 500
},
{
"epoch": 0.0500225101295583,
"grad_norm": 9.223219871520996,
"learning_rate": 4.949977489870442e-05,
"loss": 2.1909,
"num_input_tokens_seen": 3361736,
"step": 1000
},
{
"epoch": 0.07503376519433745,
"grad_norm": 15.481683731079102,
"learning_rate": 4.924966234805663e-05,
"loss": 1.8355,
"num_input_tokens_seen": 5013800,
"step": 1500
},
{
"epoch": 0.1000450202591166,
"grad_norm": 14.396512985229492,
"learning_rate": 4.8999549797408836e-05,
"loss": 1.5648,
"num_input_tokens_seen": 6667312,
"step": 2000
},
{
"epoch": 0.12505627532389577,
"grad_norm": 7.068989276885986,
"learning_rate": 4.8749437246761046e-05,
"loss": 1.4252,
"num_input_tokens_seen": 8347016,
"step": 2500
},
{
"epoch": 0.1500675303886749,
"grad_norm": 11.330971717834473,
"learning_rate": 4.849932469611325e-05,
"loss": 1.3972,
"num_input_tokens_seen": 10008296,
"step": 3000
},
{
"epoch": 0.17507878545345407,
"grad_norm": 9.403321266174316,
"learning_rate": 4.824921214546546e-05,
"loss": 1.3001,
"num_input_tokens_seen": 11658808,
"step": 3500
},
{
"epoch": 0.2000900405182332,
"grad_norm": 8.147115707397461,
"learning_rate": 4.799909959481767e-05,
"loss": 1.2625,
"num_input_tokens_seen": 13331648,
"step": 4000
},
{
"epoch": 0.22510129558301237,
"grad_norm": 13.405184745788574,
"learning_rate": 4.774898704416988e-05,
"loss": 1.1839,
"num_input_tokens_seen": 14982440,
"step": 4500
},
{
"epoch": 0.25011255064779153,
"grad_norm": 20.70949363708496,
"learning_rate": 4.749887449352209e-05,
"loss": 1.1598,
"num_input_tokens_seen": 16633632,
"step": 5000
},
{
"epoch": 0.27512380571257067,
"grad_norm": 16.94267463684082,
"learning_rate": 4.72487619428743e-05,
"loss": 1.1473,
"num_input_tokens_seen": 18311672,
"step": 5500
},
{
"epoch": 0.3001350607773498,
"grad_norm": 8.609989166259766,
"learning_rate": 4.69986493922265e-05,
"loss": 1.1098,
"num_input_tokens_seen": 19980456,
"step": 6000
},
{
"epoch": 0.32514631584212894,
"grad_norm": 9.003643989562988,
"learning_rate": 4.674853684157871e-05,
"loss": 1.0973,
"num_input_tokens_seen": 21646328,
"step": 6500
},
{
"epoch": 0.35015757090690813,
"grad_norm": 18.364194869995117,
"learning_rate": 4.649842429093092e-05,
"loss": 1.0987,
"num_input_tokens_seen": 23277400,
"step": 7000
},
{
"epoch": 0.37516882597168727,
"grad_norm": 13.544733047485352,
"learning_rate": 4.624831174028313e-05,
"loss": 1.0642,
"num_input_tokens_seen": 24915304,
"step": 7500
},
{
"epoch": 0.4001800810364664,
"grad_norm": 14.257452011108398,
"learning_rate": 4.5998199189635336e-05,
"loss": 1.0414,
"num_input_tokens_seen": 26590576,
"step": 8000
},
{
"epoch": 0.42519133610124554,
"grad_norm": 10.29515266418457,
"learning_rate": 4.5748086638987546e-05,
"loss": 1.0634,
"num_input_tokens_seen": 28236280,
"step": 8500
},
{
"epoch": 0.45020259116602473,
"grad_norm": 13.840631484985352,
"learning_rate": 4.5497974088339756e-05,
"loss": 0.9817,
"num_input_tokens_seen": 29891480,
"step": 9000
},
{
"epoch": 0.47521384623080387,
"grad_norm": 12.118327140808105,
"learning_rate": 4.5247861537691966e-05,
"loss": 1.0122,
"num_input_tokens_seen": 31551000,
"step": 9500
},
{
"epoch": 0.5002251012955831,
"grad_norm": 8.115203857421875,
"learning_rate": 4.499774898704417e-05,
"loss": 0.9802,
"num_input_tokens_seen": 33221384,
"step": 10000
},
{
"epoch": 0.5252363563603621,
"grad_norm": 8.905954360961914,
"learning_rate": 4.474763643639638e-05,
"loss": 0.9796,
"num_input_tokens_seen": 34891392,
"step": 10500
},
{
"epoch": 0.5502476114251413,
"grad_norm": 10.70656681060791,
"learning_rate": 4.449752388574859e-05,
"loss": 1.0031,
"num_input_tokens_seen": 36518768,
"step": 11000
},
{
"epoch": 0.5752588664899204,
"grad_norm": 12.424896240234375,
"learning_rate": 4.42474113351008e-05,
"loss": 0.9591,
"num_input_tokens_seen": 38147456,
"step": 11500
},
{
"epoch": 0.6002701215546996,
"grad_norm": 10.77695083618164,
"learning_rate": 4.399729878445301e-05,
"loss": 0.9338,
"num_input_tokens_seen": 39823976,
"step": 12000
},
{
"epoch": 0.6252813766194788,
"grad_norm": 12.77743911743164,
"learning_rate": 4.374718623380521e-05,
"loss": 0.9112,
"num_input_tokens_seen": 41493480,
"step": 12500
},
{
"epoch": 0.6502926316842579,
"grad_norm": 16.060897827148438,
"learning_rate": 4.349707368315742e-05,
"loss": 0.915,
"num_input_tokens_seen": 43130832,
"step": 13000
},
{
"epoch": 0.6753038867490371,
"grad_norm": 17.562183380126953,
"learning_rate": 4.324696113250963e-05,
"loss": 0.9096,
"num_input_tokens_seen": 44779392,
"step": 13500
},
{
"epoch": 0.7003151418138163,
"grad_norm": 12.406323432922363,
"learning_rate": 4.2996848581861835e-05,
"loss": 0.9499,
"num_input_tokens_seen": 46433856,
"step": 14000
},
{
"epoch": 0.7253263968785953,
"grad_norm": 15.567843437194824,
"learning_rate": 4.2746736031214045e-05,
"loss": 0.923,
"num_input_tokens_seen": 48102016,
"step": 14500
},
{
"epoch": 0.7503376519433745,
"grad_norm": 9.45335578918457,
"learning_rate": 4.2496623480566255e-05,
"loss": 0.9285,
"num_input_tokens_seen": 49796432,
"step": 15000
},
{
"epoch": 0.7753489070081536,
"grad_norm": 7.158623695373535,
"learning_rate": 4.2246510929918465e-05,
"loss": 0.9023,
"num_input_tokens_seen": 51432848,
"step": 15500
},
{
"epoch": 0.8003601620729328,
"grad_norm": 9.542813301086426,
"learning_rate": 4.1996398379270675e-05,
"loss": 0.9237,
"num_input_tokens_seen": 53083496,
"step": 16000
},
{
"epoch": 0.825371417137712,
"grad_norm": 10.027923583984375,
"learning_rate": 4.1746285828622885e-05,
"loss": 0.8813,
"num_input_tokens_seen": 54755032,
"step": 16500
},
{
"epoch": 0.8503826722024911,
"grad_norm": 18.8748722076416,
"learning_rate": 4.1496173277975095e-05,
"loss": 0.9036,
"num_input_tokens_seen": 56411184,
"step": 17000
},
{
"epoch": 0.8753939272672703,
"grad_norm": 12.792276382446289,
"learning_rate": 4.12460607273273e-05,
"loss": 0.8589,
"num_input_tokens_seen": 58070520,
"step": 17500
},
{
"epoch": 0.9004051823320495,
"grad_norm": 6.8420491218566895,
"learning_rate": 4.09959481766795e-05,
"loss": 0.8855,
"num_input_tokens_seen": 59745800,
"step": 18000
},
{
"epoch": 0.9254164373968286,
"grad_norm": 9.066823959350586,
"learning_rate": 4.074583562603171e-05,
"loss": 0.8773,
"num_input_tokens_seen": 61457288,
"step": 18500
},
{
"epoch": 0.9504276924616077,
"grad_norm": 7.002307415008545,
"learning_rate": 4.049572307538392e-05,
"loss": 0.8747,
"num_input_tokens_seen": 63139928,
"step": 19000
},
{
"epoch": 0.9754389475263868,
"grad_norm": 14.685755729675293,
"learning_rate": 4.024561052473613e-05,
"loss": 0.8398,
"num_input_tokens_seen": 64811920,
"step": 19500
},
{
"epoch": 1.0,
"eval_loss": 0.6227446794509888,
"eval_runtime": 96.3481,
"eval_samples_per_second": 414.964,
"eval_steps_per_second": 51.874,
"num_input_tokens_seen": 66451084,
"step": 19991
},
{
"epoch": 1.0004502025911661,
"grad_norm": 13.560747146606445,
"learning_rate": 3.999549797408834e-05,
"loss": 0.852,
"num_input_tokens_seen": 66482076,
"step": 20000
},
{
"epoch": 1.025461457655945,
"grad_norm": 4.446373462677002,
"learning_rate": 3.974538542344055e-05,
"loss": 0.7973,
"num_input_tokens_seen": 68132180,
"step": 20500
},
{
"epoch": 1.0504727127207243,
"grad_norm": 3.456674098968506,
"learning_rate": 3.949527287279276e-05,
"loss": 0.8215,
"num_input_tokens_seen": 69804380,
"step": 21000
},
{
"epoch": 1.0754839677855035,
"grad_norm": 8.283075332641602,
"learning_rate": 3.924516032214497e-05,
"loss": 0.8081,
"num_input_tokens_seen": 71452668,
"step": 21500
},
{
"epoch": 1.1004952228502827,
"grad_norm": 9.358149528503418,
"learning_rate": 3.8995047771497175e-05,
"loss": 0.7991,
"num_input_tokens_seen": 73104948,
"step": 22000
},
{
"epoch": 1.1255064779150619,
"grad_norm": 9.011244773864746,
"learning_rate": 3.8744935220849385e-05,
"loss": 0.7839,
"num_input_tokens_seen": 74751164,
"step": 22500
},
{
"epoch": 1.1505177329798408,
"grad_norm": 5.775268077850342,
"learning_rate": 3.849482267020159e-05,
"loss": 0.7515,
"num_input_tokens_seen": 76431460,
"step": 23000
},
{
"epoch": 1.17552898804462,
"grad_norm": 13.273436546325684,
"learning_rate": 3.82447101195538e-05,
"loss": 0.7821,
"num_input_tokens_seen": 78092124,
"step": 23500
},
{
"epoch": 1.2005402431093992,
"grad_norm": 10.351176261901855,
"learning_rate": 3.799459756890601e-05,
"loss": 0.772,
"num_input_tokens_seen": 79736012,
"step": 24000
},
{
"epoch": 1.2255514981741784,
"grad_norm": 14.834792137145996,
"learning_rate": 3.774448501825822e-05,
"loss": 0.78,
"num_input_tokens_seen": 81414220,
"step": 24500
},
{
"epoch": 1.2505627532389576,
"grad_norm": 14.160717964172363,
"learning_rate": 3.749437246761043e-05,
"loss": 0.7767,
"num_input_tokens_seen": 83081932,
"step": 25000
},
{
"epoch": 1.2755740083037366,
"grad_norm": 8.410615921020508,
"learning_rate": 3.724425991696264e-05,
"loss": 0.7665,
"num_input_tokens_seen": 84745948,
"step": 25500
},
{
"epoch": 1.3005852633685158,
"grad_norm": 7.881125450134277,
"learning_rate": 3.699414736631484e-05,
"loss": 0.7626,
"num_input_tokens_seen": 86421180,
"step": 26000
},
{
"epoch": 1.325596518433295,
"grad_norm": 21.633901596069336,
"learning_rate": 3.674403481566705e-05,
"loss": 0.7645,
"num_input_tokens_seen": 88075204,
"step": 26500
},
{
"epoch": 1.3506077734980741,
"grad_norm": 14.725602149963379,
"learning_rate": 3.649392226501926e-05,
"loss": 0.751,
"num_input_tokens_seen": 89740116,
"step": 27000
},
{
"epoch": 1.3756190285628533,
"grad_norm": 6.119060039520264,
"learning_rate": 3.6243809714371465e-05,
"loss": 0.756,
"num_input_tokens_seen": 91410556,
"step": 27500
},
{
"epoch": 1.4006302836276325,
"grad_norm": 6.520070552825928,
"learning_rate": 3.5993697163723675e-05,
"loss": 0.7526,
"num_input_tokens_seen": 93116396,
"step": 28000
},
{
"epoch": 1.4256415386924115,
"grad_norm": 7.963521480560303,
"learning_rate": 3.5743584613075885e-05,
"loss": 0.7645,
"num_input_tokens_seen": 94761716,
"step": 28500
},
{
"epoch": 1.4506527937571907,
"grad_norm": 11.38167953491211,
"learning_rate": 3.5493472062428095e-05,
"loss": 0.7624,
"num_input_tokens_seen": 96449700,
"step": 29000
},
{
"epoch": 1.4756640488219699,
"grad_norm": 15.715912818908691,
"learning_rate": 3.5243359511780305e-05,
"loss": 0.7509,
"num_input_tokens_seen": 98102252,
"step": 29500
},
{
"epoch": 1.500675303886749,
"grad_norm": 7.735713005065918,
"learning_rate": 3.499324696113251e-05,
"loss": 0.7738,
"num_input_tokens_seen": 99780396,
"step": 30000
},
{
"epoch": 1.525686558951528,
"grad_norm": 8.079352378845215,
"learning_rate": 3.474313441048472e-05,
"loss": 0.7522,
"num_input_tokens_seen": 101479956,
"step": 30500
},
{
"epoch": 1.5506978140163072,
"grad_norm": 8.290655136108398,
"learning_rate": 3.449302185983693e-05,
"loss": 0.7381,
"num_input_tokens_seen": 103149500,
"step": 31000
},
{
"epoch": 1.5757090690810864,
"grad_norm": 8.904264450073242,
"learning_rate": 3.424290930918914e-05,
"loss": 0.7467,
"num_input_tokens_seen": 104812996,
"step": 31500
},
{
"epoch": 1.6007203241458656,
"grad_norm": 7.439008712768555,
"learning_rate": 3.399279675854135e-05,
"loss": 0.7507,
"num_input_tokens_seen": 106479036,
"step": 32000
},
{
"epoch": 1.6257315792106448,
"grad_norm": 7.584664344787598,
"learning_rate": 3.374268420789355e-05,
"loss": 0.7168,
"num_input_tokens_seen": 108141364,
"step": 32500
},
{
"epoch": 1.650742834275424,
"grad_norm": 8.953302383422852,
"learning_rate": 3.349257165724576e-05,
"loss": 0.7469,
"num_input_tokens_seen": 109799916,
"step": 33000
},
{
"epoch": 1.6757540893402032,
"grad_norm": 10.678362846374512,
"learning_rate": 3.324245910659797e-05,
"loss": 0.7468,
"num_input_tokens_seen": 111436748,
"step": 33500
},
{
"epoch": 1.7007653444049824,
"grad_norm": 11.628217697143555,
"learning_rate": 3.2992346555950175e-05,
"loss": 0.7358,
"num_input_tokens_seen": 113068476,
"step": 34000
},
{
"epoch": 1.7257765994697614,
"grad_norm": 12.741203308105469,
"learning_rate": 3.2742234005302385e-05,
"loss": 0.7402,
"num_input_tokens_seen": 114748860,
"step": 34500
},
{
"epoch": 1.7507878545345406,
"grad_norm": 9.066828727722168,
"learning_rate": 3.2492121454654595e-05,
"loss": 0.7728,
"num_input_tokens_seen": 116441684,
"step": 35000
},
{
"epoch": 1.7757991095993197,
"grad_norm": 7.780086517333984,
"learning_rate": 3.2242008904006805e-05,
"loss": 0.7424,
"num_input_tokens_seen": 118093652,
"step": 35500
},
{
"epoch": 1.8008103646640987,
"grad_norm": 5.290003299713135,
"learning_rate": 3.1991896353359015e-05,
"loss": 0.7121,
"num_input_tokens_seen": 119756772,
"step": 36000
},
{
"epoch": 1.825821619728878,
"grad_norm": 13.356730461120605,
"learning_rate": 3.1741783802711225e-05,
"loss": 0.789,
"num_input_tokens_seen": 121419852,
"step": 36500
},
{
"epoch": 1.850832874793657,
"grad_norm": 4.2140727043151855,
"learning_rate": 3.149167125206343e-05,
"loss": 0.7501,
"num_input_tokens_seen": 123080420,
"step": 37000
},
{
"epoch": 1.8758441298584363,
"grad_norm": 15.408193588256836,
"learning_rate": 3.124155870141564e-05,
"loss": 0.7576,
"num_input_tokens_seen": 124733724,
"step": 37500
},
{
"epoch": 1.9008553849232155,
"grad_norm": 8.88025951385498,
"learning_rate": 3.099144615076784e-05,
"loss": 0.7315,
"num_input_tokens_seen": 126386636,
"step": 38000
},
{
"epoch": 1.9258666399879947,
"grad_norm": 15.850674629211426,
"learning_rate": 3.074133360012005e-05,
"loss": 0.7289,
"num_input_tokens_seen": 128054932,
"step": 38500
},
{
"epoch": 1.9508778950527739,
"grad_norm": 10.460667610168457,
"learning_rate": 3.049122104947226e-05,
"loss": 0.7375,
"num_input_tokens_seen": 129731780,
"step": 39000
},
{
"epoch": 1.975889150117553,
"grad_norm": 4.816532135009766,
"learning_rate": 3.024110849882447e-05,
"loss": 0.7203,
"num_input_tokens_seen": 131377564,
"step": 39500
},
{
"epoch": 2.0,
"eval_loss": 0.5678554773330688,
"eval_runtime": 97.2769,
"eval_samples_per_second": 411.002,
"eval_steps_per_second": 51.379,
"num_input_tokens_seen": 132976438,
"step": 39982
},
{
"epoch": 2.0009004051823323,
"grad_norm": 8.531465530395508,
"learning_rate": 2.999099594817668e-05,
"loss": 0.7337,
"num_input_tokens_seen": 133038726,
"step": 40000
},
{
"epoch": 2.025911660247111,
"grad_norm": 17.74102783203125,
"learning_rate": 2.974088339752889e-05,
"loss": 0.6798,
"num_input_tokens_seen": 134681590,
"step": 40500
},
{
"epoch": 2.05092291531189,
"grad_norm": 16.203670501708984,
"learning_rate": 2.9490770846881098e-05,
"loss": 0.692,
"num_input_tokens_seen": 136354910,
"step": 41000
},
{
"epoch": 2.0759341703766694,
"grad_norm": 11.238871574401855,
"learning_rate": 2.9240658296233308e-05,
"loss": 0.653,
"num_input_tokens_seen": 138014246,
"step": 41500
},
{
"epoch": 2.1009454254414486,
"grad_norm": 8.781373023986816,
"learning_rate": 2.899054574558551e-05,
"loss": 0.6742,
"num_input_tokens_seen": 139676526,
"step": 42000
},
{
"epoch": 2.1259566805062278,
"grad_norm": 7.73007869720459,
"learning_rate": 2.874043319493772e-05,
"loss": 0.6739,
"num_input_tokens_seen": 141326846,
"step": 42500
},
{
"epoch": 2.150967935571007,
"grad_norm": 6.6758904457092285,
"learning_rate": 2.849032064428993e-05,
"loss": 0.6767,
"num_input_tokens_seen": 142999126,
"step": 43000
},
{
"epoch": 2.175979190635786,
"grad_norm": 9.964508056640625,
"learning_rate": 2.824020809364214e-05,
"loss": 0.6649,
"num_input_tokens_seen": 144643454,
"step": 43500
},
{
"epoch": 2.2009904457005653,
"grad_norm": 7.9148664474487305,
"learning_rate": 2.7990095542994348e-05,
"loss": 0.678,
"num_input_tokens_seen": 146327686,
"step": 44000
},
{
"epoch": 2.2260017007653445,
"grad_norm": 5.838576316833496,
"learning_rate": 2.7739982992346558e-05,
"loss": 0.6629,
"num_input_tokens_seen": 147996750,
"step": 44500
},
{
"epoch": 2.2510129558301237,
"grad_norm": 9.018148422241211,
"learning_rate": 2.7489870441698768e-05,
"loss": 0.6673,
"num_input_tokens_seen": 149658382,
"step": 45000
},
{
"epoch": 2.276024210894903,
"grad_norm": 5.56981897354126,
"learning_rate": 2.7239757891050978e-05,
"loss": 0.658,
"num_input_tokens_seen": 151279470,
"step": 45500
},
{
"epoch": 2.3010354659596817,
"grad_norm": 3.9373059272766113,
"learning_rate": 2.698964534040318e-05,
"loss": 0.6747,
"num_input_tokens_seen": 152950878,
"step": 46000
},
{
"epoch": 2.326046721024461,
"grad_norm": 7.596631050109863,
"learning_rate": 2.6739532789755388e-05,
"loss": 0.6824,
"num_input_tokens_seen": 154603110,
"step": 46500
},
{
"epoch": 2.35105797608924,
"grad_norm": 7.714618682861328,
"learning_rate": 2.6489420239107598e-05,
"loss": 0.6662,
"num_input_tokens_seen": 156262254,
"step": 47000
},
{
"epoch": 2.3760692311540192,
"grad_norm": 11.400321006774902,
"learning_rate": 2.6239307688459808e-05,
"loss": 0.6478,
"num_input_tokens_seen": 157940526,
"step": 47500
},
{
"epoch": 2.4010804862187984,
"grad_norm": 5.944780349731445,
"learning_rate": 2.5989195137812018e-05,
"loss": 0.6701,
"num_input_tokens_seen": 159597926,
"step": 48000
},
{
"epoch": 2.4260917412835776,
"grad_norm": 7.971735954284668,
"learning_rate": 2.5739082587164225e-05,
"loss": 0.6815,
"num_input_tokens_seen": 161249054,
"step": 48500
},
{
"epoch": 2.451102996348357,
"grad_norm": 8.019645690917969,
"learning_rate": 2.5488970036516435e-05,
"loss": 0.6823,
"num_input_tokens_seen": 162937710,
"step": 49000
},
{
"epoch": 2.476114251413136,
"grad_norm": 14.52238655090332,
"learning_rate": 2.5238857485868645e-05,
"loss": 0.6662,
"num_input_tokens_seen": 164579550,
"step": 49500
},
{
"epoch": 2.501125506477915,
"grad_norm": 8.065009117126465,
"learning_rate": 2.498874493522085e-05,
"loss": 0.6855,
"num_input_tokens_seen": 166259486,
"step": 50000
},
{
"epoch": 2.526136761542694,
"grad_norm": 3.0121171474456787,
"learning_rate": 2.473863238457306e-05,
"loss": 0.6597,
"num_input_tokens_seen": 167925014,
"step": 50500
},
{
"epoch": 2.551148016607473,
"grad_norm": 9.93840217590332,
"learning_rate": 2.4488519833925268e-05,
"loss": 0.6672,
"num_input_tokens_seen": 169584230,
"step": 51000
},
{
"epoch": 2.5761592716722523,
"grad_norm": 7.8001627922058105,
"learning_rate": 2.4238407283277475e-05,
"loss": 0.6419,
"num_input_tokens_seen": 171205846,
"step": 51500
},
{
"epoch": 2.6011705267370315,
"grad_norm": 5.621837139129639,
"learning_rate": 2.3988294732629685e-05,
"loss": 0.6679,
"num_input_tokens_seen": 172867766,
"step": 52000
},
{
"epoch": 2.6261817818018107,
"grad_norm": 18.287431716918945,
"learning_rate": 2.3738182181981895e-05,
"loss": 0.6601,
"num_input_tokens_seen": 174508502,
"step": 52500
},
{
"epoch": 2.65119303686659,
"grad_norm": 7.687650203704834,
"learning_rate": 2.34880696313341e-05,
"loss": 0.6722,
"num_input_tokens_seen": 176179174,
"step": 53000
},
{
"epoch": 2.676204291931369,
"grad_norm": 9.807682037353516,
"learning_rate": 2.3237957080686308e-05,
"loss": 0.666,
"num_input_tokens_seen": 177874198,
"step": 53500
},
{
"epoch": 2.7012155469961483,
"grad_norm": 9.2701416015625,
"learning_rate": 2.2987844530038518e-05,
"loss": 0.6811,
"num_input_tokens_seen": 179531678,
"step": 54000
},
{
"epoch": 2.7262268020609275,
"grad_norm": 8.37064266204834,
"learning_rate": 2.2737731979390728e-05,
"loss": 0.6505,
"num_input_tokens_seen": 181197542,
"step": 54500
},
{
"epoch": 2.7512380571257067,
"grad_norm": 5.556591033935547,
"learning_rate": 2.2487619428742935e-05,
"loss": 0.6711,
"num_input_tokens_seen": 182849270,
"step": 55000
},
{
"epoch": 2.776249312190486,
"grad_norm": 7.93866491317749,
"learning_rate": 2.2237506878095145e-05,
"loss": 0.6664,
"num_input_tokens_seen": 184520526,
"step": 55500
},
{
"epoch": 2.801260567255265,
"grad_norm": 6.768641471862793,
"learning_rate": 2.198739432744735e-05,
"loss": 0.6699,
"num_input_tokens_seen": 186239974,
"step": 56000
},
{
"epoch": 2.8262718223200443,
"grad_norm": 5.911066055297852,
"learning_rate": 2.173728177679956e-05,
"loss": 0.6649,
"num_input_tokens_seen": 187875982,
"step": 56500
},
{
"epoch": 2.851283077384823,
"grad_norm": 9.964897155761719,
"learning_rate": 2.1487169226151768e-05,
"loss": 0.6874,
"num_input_tokens_seen": 189505118,
"step": 57000
},
{
"epoch": 2.876294332449602,
"grad_norm": 8.109452247619629,
"learning_rate": 2.1237056675503978e-05,
"loss": 0.6762,
"num_input_tokens_seen": 191184886,
"step": 57500
},
{
"epoch": 2.9013055875143814,
"grad_norm": 8.556594848632812,
"learning_rate": 2.0986944124856188e-05,
"loss": 0.6491,
"num_input_tokens_seen": 192859070,
"step": 58000
},
{
"epoch": 2.9263168425791606,
"grad_norm": 5.430099010467529,
"learning_rate": 2.0736831574208394e-05,
"loss": 0.661,
"num_input_tokens_seen": 194533102,
"step": 58500
},
{
"epoch": 2.9513280976439398,
"grad_norm": 9.806259155273438,
"learning_rate": 2.04867190235606e-05,
"loss": 0.645,
"num_input_tokens_seen": 196171870,
"step": 59000
},
{
"epoch": 2.976339352708719,
"grad_norm": 8.950848579406738,
"learning_rate": 2.023660647291281e-05,
"loss": 0.6479,
"num_input_tokens_seen": 197877830,
"step": 59500
},
{
"epoch": 3.0,
"eval_loss": 0.560497522354126,
"eval_runtime": 98.7193,
"eval_samples_per_second": 404.997,
"eval_steps_per_second": 50.628,
"num_input_tokens_seen": 199402582,
"step": 59973
},
{
"epoch": 3.001350607773498,
"grad_norm": 6.855441093444824,
"learning_rate": 1.998649392226502e-05,
"loss": 0.6187,
"num_input_tokens_seen": 199490934,
"step": 60000
},
{
"epoch": 3.0263618628382774,
"grad_norm": 8.57907772064209,
"learning_rate": 1.973638137161723e-05,
"loss": 0.652,
"num_input_tokens_seen": 201164870,
"step": 60500
},
{
"epoch": 3.0513731179030565,
"grad_norm": 15.578742027282715,
"learning_rate": 1.9486268820969438e-05,
"loss": 0.6127,
"num_input_tokens_seen": 202824222,
"step": 61000
},
{
"epoch": 3.0763843729678357,
"grad_norm": 9.083669662475586,
"learning_rate": 1.9236156270321644e-05,
"loss": 0.6146,
"num_input_tokens_seen": 204495334,
"step": 61500
},
{
"epoch": 3.1013956280326145,
"grad_norm": 10.12027359008789,
"learning_rate": 1.8986043719673854e-05,
"loss": 0.6341,
"num_input_tokens_seen": 206136214,
"step": 62000
},
{
"epoch": 3.1264068830973937,
"grad_norm": 10.482580184936523,
"learning_rate": 1.8735931169026064e-05,
"loss": 0.603,
"num_input_tokens_seen": 207809294,
"step": 62500
},
{
"epoch": 3.151418138162173,
"grad_norm": 7.722796440124512,
"learning_rate": 1.848581861837827e-05,
"loss": 0.6184,
"num_input_tokens_seen": 209485534,
"step": 63000
},
{
"epoch": 3.176429393226952,
"grad_norm": 7.449066162109375,
"learning_rate": 1.8235706067730478e-05,
"loss": 0.621,
"num_input_tokens_seen": 211143158,
"step": 63500
},
{
"epoch": 3.2014406482917313,
"grad_norm": 8.766199111938477,
"learning_rate": 1.7985593517082688e-05,
"loss": 0.6165,
"num_input_tokens_seen": 212777414,
"step": 64000
},
{
"epoch": 3.2264519033565104,
"grad_norm": 4.193557262420654,
"learning_rate": 1.7735480966434898e-05,
"loss": 0.6188,
"num_input_tokens_seen": 214420798,
"step": 64500
},
{
"epoch": 3.2514631584212896,
"grad_norm": 6.699706554412842,
"learning_rate": 1.7485368415787104e-05,
"loss": 0.6095,
"num_input_tokens_seen": 216073590,
"step": 65000
},
{
"epoch": 3.276474413486069,
"grad_norm": 8.79476547241211,
"learning_rate": 1.7235255865139314e-05,
"loss": 0.6208,
"num_input_tokens_seen": 217746214,
"step": 65500
},
{
"epoch": 3.301485668550848,
"grad_norm": 6.685282230377197,
"learning_rate": 1.698514331449152e-05,
"loss": 0.6058,
"num_input_tokens_seen": 219433446,
"step": 66000
},
{
"epoch": 3.326496923615627,
"grad_norm": 10.743680953979492,
"learning_rate": 1.673503076384373e-05,
"loss": 0.6318,
"num_input_tokens_seen": 221089694,
"step": 66500
},
{
"epoch": 3.3515081786804064,
"grad_norm": 8.36410903930664,
"learning_rate": 1.6484918213195938e-05,
"loss": 0.6236,
"num_input_tokens_seen": 222760502,
"step": 67000
},
{
"epoch": 3.376519433745185,
"grad_norm": 7.1238274574279785,
"learning_rate": 1.6234805662548148e-05,
"loss": 0.6103,
"num_input_tokens_seen": 224417582,
"step": 67500
},
{
"epoch": 3.4015306888099643,
"grad_norm": 7.042121887207031,
"learning_rate": 1.5984693111900358e-05,
"loss": 0.6157,
"num_input_tokens_seen": 226068982,
"step": 68000
},
{
"epoch": 3.4265419438747435,
"grad_norm": 9.31881332397461,
"learning_rate": 1.5734580561252564e-05,
"loss": 0.6263,
"num_input_tokens_seen": 227701038,
"step": 68500
},
{
"epoch": 3.4515531989395227,
"grad_norm": 7.049442768096924,
"learning_rate": 1.548446801060477e-05,
"loss": 0.6237,
"num_input_tokens_seen": 229359710,
"step": 69000
},
{
"epoch": 3.476564454004302,
"grad_norm": 7.746445178985596,
"learning_rate": 1.5234355459956981e-05,
"loss": 0.6376,
"num_input_tokens_seen": 231028950,
"step": 69500
},
{
"epoch": 3.501575709069081,
"grad_norm": 4.588512420654297,
"learning_rate": 1.4984242909309191e-05,
"loss": 0.6189,
"num_input_tokens_seen": 232663446,
"step": 70000
},
{
"epoch": 3.5265869641338603,
"grad_norm": 9.873016357421875,
"learning_rate": 1.47341303586614e-05,
"loss": 0.5935,
"num_input_tokens_seen": 234333558,
"step": 70500
},
{
"epoch": 3.5515982191986395,
"grad_norm": 8.153191566467285,
"learning_rate": 1.4484017808013606e-05,
"loss": 0.6403,
"num_input_tokens_seen": 236006758,
"step": 71000
},
{
"epoch": 3.5766094742634187,
"grad_norm": 5.909561634063721,
"learning_rate": 1.4233905257365814e-05,
"loss": 0.6152,
"num_input_tokens_seen": 237655630,
"step": 71500
},
{
"epoch": 3.6016207293281974,
"grad_norm": 9.481532096862793,
"learning_rate": 1.3983792706718024e-05,
"loss": 0.5916,
"num_input_tokens_seen": 239300238,
"step": 72000
},
{
"epoch": 3.6266319843929766,
"grad_norm": 4.988440990447998,
"learning_rate": 1.3733680156070232e-05,
"loss": 0.6275,
"num_input_tokens_seen": 240971214,
"step": 72500
},
{
"epoch": 3.651643239457756,
"grad_norm": 6.159299850463867,
"learning_rate": 1.3483567605422439e-05,
"loss": 0.6101,
"num_input_tokens_seen": 242634286,
"step": 73000
},
{
"epoch": 3.676654494522535,
"grad_norm": 4.264859199523926,
"learning_rate": 1.3233455054774649e-05,
"loss": 0.6045,
"num_input_tokens_seen": 244293870,
"step": 73500
},
{
"epoch": 3.701665749587314,
"grad_norm": 5.82095193862915,
"learning_rate": 1.2983342504126857e-05,
"loss": 0.624,
"num_input_tokens_seen": 245956374,
"step": 74000
},
{
"epoch": 3.7266770046520934,
"grad_norm": 10.4242525100708,
"learning_rate": 1.2733229953479067e-05,
"loss": 0.6231,
"num_input_tokens_seen": 247566166,
"step": 74500
},
{
"epoch": 3.7516882597168726,
"grad_norm": 6.536423206329346,
"learning_rate": 1.2483117402831276e-05,
"loss": 0.6159,
"num_input_tokens_seen": 249233118,
"step": 75000
},
{
"epoch": 3.776699514781652,
"grad_norm": 10.467476844787598,
"learning_rate": 1.2233004852183482e-05,
"loss": 0.6252,
"num_input_tokens_seen": 250919822,
"step": 75500
},
{
"epoch": 3.801710769846431,
"grad_norm": 13.297423362731934,
"learning_rate": 1.1982892301535692e-05,
"loss": 0.6133,
"num_input_tokens_seen": 252600838,
"step": 76000
},
{
"epoch": 3.82672202491121,
"grad_norm": 6.729821681976318,
"learning_rate": 1.1732779750887899e-05,
"loss": 0.6201,
"num_input_tokens_seen": 254292558,
"step": 76500
},
{
"epoch": 3.8517332799759894,
"grad_norm": 5.975412845611572,
"learning_rate": 1.1482667200240109e-05,
"loss": 0.5976,
"num_input_tokens_seen": 255961510,
"step": 77000
},
{
"epoch": 3.8767445350407685,
"grad_norm": 16.30948257446289,
"learning_rate": 1.1232554649592317e-05,
"loss": 0.6023,
"num_input_tokens_seen": 257630246,
"step": 77500
},
{
"epoch": 3.9017557901055477,
"grad_norm": 7.327265739440918,
"learning_rate": 1.0982442098944526e-05,
"loss": 0.6145,
"num_input_tokens_seen": 259305118,
"step": 78000
},
{
"epoch": 3.9267670451703265,
"grad_norm": 12.45727825164795,
"learning_rate": 1.0732329548296734e-05,
"loss": 0.6311,
"num_input_tokens_seen": 260978934,
"step": 78500
},
{
"epoch": 3.9517783002351057,
"grad_norm": 10.317325592041016,
"learning_rate": 1.0482216997648942e-05,
"loss": 0.6346,
"num_input_tokens_seen": 262670814,
"step": 79000
},
{
"epoch": 3.976789555299885,
"grad_norm": 7.8411149978637695,
"learning_rate": 1.023210444700115e-05,
"loss": 0.6023,
"num_input_tokens_seen": 264314614,
"step": 79500
},
{
"epoch": 4.0,
"eval_loss": 0.54269939661026,
"eval_runtime": 96.9182,
"eval_samples_per_second": 412.523,
"eval_steps_per_second": 51.569,
"num_input_tokens_seen": 265875340,
"step": 79964
},
{
"epoch": 4.0018008103646645,
"grad_norm": 6.620047569274902,
"learning_rate": 9.98199189635336e-06,
"loss": 0.6268,
"num_input_tokens_seen": 266010940,
"step": 80000
},
{
"epoch": 4.026812065429443,
"grad_norm": 10.007366180419922,
"learning_rate": 9.731879345705567e-06,
"loss": 0.5924,
"num_input_tokens_seen": 267660364,
"step": 80500
},
{
"epoch": 4.051823320494222,
"grad_norm": 6.680395603179932,
"learning_rate": 9.481766795057777e-06,
"loss": 0.5786,
"num_input_tokens_seen": 269338492,
"step": 81000
},
{
"epoch": 4.076834575559001,
"grad_norm": 4.809377670288086,
"learning_rate": 9.231654244409984e-06,
"loss": 0.5942,
"num_input_tokens_seen": 271024236,
"step": 81500
},
{
"epoch": 4.10184583062378,
"grad_norm": 8.463695526123047,
"learning_rate": 8.981541693762194e-06,
"loss": 0.5796,
"num_input_tokens_seen": 272672620,
"step": 82000
},
{
"epoch": 4.12685708568856,
"grad_norm": 10.12741470336914,
"learning_rate": 8.731429143114402e-06,
"loss": 0.5879,
"num_input_tokens_seen": 274353676,
"step": 82500
},
{
"epoch": 4.151868340753339,
"grad_norm": 15.428593635559082,
"learning_rate": 8.48131659246661e-06,
"loss": 0.5977,
"num_input_tokens_seen": 275998164,
"step": 83000
},
{
"epoch": 4.176879595818118,
"grad_norm": 10.350814819335938,
"learning_rate": 8.231204041818819e-06,
"loss": 0.566,
"num_input_tokens_seen": 277685356,
"step": 83500
},
{
"epoch": 4.201890850882897,
"grad_norm": 11.962939262390137,
"learning_rate": 7.981091491171027e-06,
"loss": 0.5671,
"num_input_tokens_seen": 279358548,
"step": 84000
},
{
"epoch": 4.226902105947676,
"grad_norm": 10.32712459564209,
"learning_rate": 7.730978940523236e-06,
"loss": 0.5785,
"num_input_tokens_seen": 280991044,
"step": 84500
},
{
"epoch": 4.2519133610124555,
"grad_norm": 5.896986484527588,
"learning_rate": 7.480866389875445e-06,
"loss": 0.6051,
"num_input_tokens_seen": 282646764,
"step": 85000
},
{
"epoch": 4.276924616077235,
"grad_norm": 7.187685966491699,
"learning_rate": 7.230753839227652e-06,
"loss": 0.5943,
"num_input_tokens_seen": 284342508,
"step": 85500
},
{
"epoch": 4.301935871142014,
"grad_norm": 6.680044174194336,
"learning_rate": 6.980641288579861e-06,
"loss": 0.5765,
"num_input_tokens_seen": 286036340,
"step": 86000
},
{
"epoch": 4.326947126206793,
"grad_norm": 4.963362693786621,
"learning_rate": 6.73052873793207e-06,
"loss": 0.6137,
"num_input_tokens_seen": 287681564,
"step": 86500
},
{
"epoch": 4.351958381271572,
"grad_norm": 12.112903594970703,
"learning_rate": 6.480416187284279e-06,
"loss": 0.5983,
"num_input_tokens_seen": 289353828,
"step": 87000
},
{
"epoch": 4.3769696363363515,
"grad_norm": 5.938944339752197,
"learning_rate": 6.230303636636486e-06,
"loss": 0.6017,
"num_input_tokens_seen": 291011668,
"step": 87500
},
{
"epoch": 4.401980891401131,
"grad_norm": 4.485511302947998,
"learning_rate": 5.980191085988695e-06,
"loss": 0.5898,
"num_input_tokens_seen": 292675092,
"step": 88000
},
{
"epoch": 4.42699214646591,
"grad_norm": 9.15986442565918,
"learning_rate": 5.730078535340903e-06,
"loss": 0.5744,
"num_input_tokens_seen": 294338212,
"step": 88500
},
{
"epoch": 4.452003401530689,
"grad_norm": 3.6591997146606445,
"learning_rate": 5.479965984693112e-06,
"loss": 0.5948,
"num_input_tokens_seen": 296004820,
"step": 89000
},
{
"epoch": 4.477014656595468,
"grad_norm": 9.19853401184082,
"learning_rate": 5.2298534340453205e-06,
"loss": 0.5838,
"num_input_tokens_seen": 297661964,
"step": 89500
},
{
"epoch": 4.5020259116602475,
"grad_norm": 13.491796493530273,
"learning_rate": 4.979740883397529e-06,
"loss": 0.5726,
"num_input_tokens_seen": 299312356,
"step": 90000
},
{
"epoch": 4.527037166725027,
"grad_norm": 6.374147415161133,
"learning_rate": 4.729628332749737e-06,
"loss": 0.5728,
"num_input_tokens_seen": 300978468,
"step": 90500
},
{
"epoch": 4.552048421789806,
"grad_norm": 7.507421970367432,
"learning_rate": 4.479515782101945e-06,
"loss": 0.5903,
"num_input_tokens_seen": 302639252,
"step": 91000
},
{
"epoch": 4.577059676854585,
"grad_norm": 12.31728744506836,
"learning_rate": 4.229403231454155e-06,
"loss": 0.5916,
"num_input_tokens_seen": 304289124,
"step": 91500
},
{
"epoch": 4.602070931919363,
"grad_norm": 11.238248825073242,
"learning_rate": 3.979290680806363e-06,
"loss": 0.5617,
"num_input_tokens_seen": 305968436,
"step": 92000
},
{
"epoch": 4.6270821869841425,
"grad_norm": 6.74647331237793,
"learning_rate": 3.7291781301585712e-06,
"loss": 0.6249,
"num_input_tokens_seen": 307616156,
"step": 92500
},
{
"epoch": 4.652093442048922,
"grad_norm": 7.845546722412109,
"learning_rate": 3.4790655795107795e-06,
"loss": 0.6015,
"num_input_tokens_seen": 309294188,
"step": 93000
},
{
"epoch": 4.677104697113701,
"grad_norm": 5.631568431854248,
"learning_rate": 3.2289530288629883e-06,
"loss": 0.5747,
"num_input_tokens_seen": 310930388,
"step": 93500
},
{
"epoch": 4.70211595217848,
"grad_norm": 4.305506229400635,
"learning_rate": 2.978840478215197e-06,
"loss": 0.5957,
"num_input_tokens_seen": 312600876,
"step": 94000
},
{
"epoch": 4.727127207243259,
"grad_norm": 12.092133522033691,
"learning_rate": 2.7287279275674053e-06,
"loss": 0.5952,
"num_input_tokens_seen": 314275796,
"step": 94500
},
{
"epoch": 4.7521384623080385,
"grad_norm": 7.043518543243408,
"learning_rate": 2.478615376919614e-06,
"loss": 0.6013,
"num_input_tokens_seen": 315945468,
"step": 95000
},
{
"epoch": 4.777149717372818,
"grad_norm": 6.208098888397217,
"learning_rate": 2.2285028262718224e-06,
"loss": 0.591,
"num_input_tokens_seen": 317595388,
"step": 95500
},
{
"epoch": 4.802160972437597,
"grad_norm": 3.588547706604004,
"learning_rate": 1.978390275624031e-06,
"loss": 0.5846,
"num_input_tokens_seen": 319229212,
"step": 96000
},
{
"epoch": 4.827172227502376,
"grad_norm": 10.502739906311035,
"learning_rate": 1.7282777249762395e-06,
"loss": 0.5904,
"num_input_tokens_seen": 320908604,
"step": 96500
},
{
"epoch": 4.852183482567155,
"grad_norm": 8.170723915100098,
"learning_rate": 1.4781651743284478e-06,
"loss": 0.5925,
"num_input_tokens_seen": 322558268,
"step": 97000
},
{
"epoch": 4.8771947376319345,
"grad_norm": 10.083109855651855,
"learning_rate": 1.2280526236806563e-06,
"loss": 0.5977,
"num_input_tokens_seen": 324205708,
"step": 97500
},
{
"epoch": 4.902205992696714,
"grad_norm": 6.591386795043945,
"learning_rate": 9.779400730328649e-07,
"loss": 0.5633,
"num_input_tokens_seen": 325850036,
"step": 98000
},
{
"epoch": 4.927217247761493,
"grad_norm": 7.133991241455078,
"learning_rate": 7.278275223850733e-07,
"loss": 0.5786,
"num_input_tokens_seen": 327509276,
"step": 98500
},
{
"epoch": 4.952228502826272,
"grad_norm": 5.090227127075195,
"learning_rate": 4.777149717372818e-07,
"loss": 0.5886,
"num_input_tokens_seen": 329175052,
"step": 99000
},
{
"epoch": 4.977239757891051,
"grad_norm": 7.157599925994873,
"learning_rate": 2.276024210894903e-07,
"loss": 0.5879,
"num_input_tokens_seen": 330819060,
"step": 99500
},
{
"epoch": 5.0,
"eval_loss": 0.5393198132514954,
"eval_runtime": 97.8527,
"eval_samples_per_second": 408.584,
"eval_steps_per_second": 51.077,
"num_input_tokens_seen": 332318598,
"step": 99955
},
{
"epoch": 5.0,
"num_input_tokens_seen": 332318598,
"step": 99955,
"total_flos": 1.2062750373789696e+17,
"train_loss": 0.7601320325591829,
"train_runtime": 7988.2275,
"train_samples_per_second": 100.099,
"train_steps_per_second": 12.513,
"train_tokens_per_second": 41593.134
}
],
"logging_steps": 500,
"max_steps": 99955,
"num_input_tokens_seen": 332318598,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2062750373789696e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}