{
"best_metric": 5.3026299476623535,
"best_model_checkpoint": "./results/models/checkpoint-121302",
"epoch": 18.0,
"eval_steps": 500,
"global_step": 121302,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07419498441905327,
"grad_norm": 0.6640625,
"learning_rate": 0.000998516100311619,
"loss": 5.6889,
"step": 500
},
{
"epoch": 0.14838996883810654,
"grad_norm": 0.6484375,
"learning_rate": 0.0009970322006232378,
"loss": 5.5782,
"step": 1000
},
{
"epoch": 0.22258495325715982,
"grad_norm": 0.75,
"learning_rate": 0.0009955483009348569,
"loss": 5.5562,
"step": 1500
},
{
"epoch": 0.2967799376762131,
"grad_norm": 0.80078125,
"learning_rate": 0.0009940644012464757,
"loss": 5.5537,
"step": 2000
},
{
"epoch": 0.37097492209526634,
"grad_norm": 0.765625,
"learning_rate": 0.0009925805015580946,
"loss": 5.5436,
"step": 2500
},
{
"epoch": 0.44516990651431965,
"grad_norm": 2.109375,
"learning_rate": 0.0009910966018697137,
"loss": 5.5482,
"step": 3000
},
{
"epoch": 0.5193648909333729,
"grad_norm": 0.92578125,
"learning_rate": 0.0009896127021813326,
"loss": 5.5504,
"step": 3500
},
{
"epoch": 0.5935598753524262,
"grad_norm": 1.3359375,
"learning_rate": 0.0009881288024929514,
"loss": 5.5373,
"step": 4000
},
{
"epoch": 0.6677548597714794,
"grad_norm": 1.0078125,
"learning_rate": 0.0009866449028045703,
"loss": 5.5279,
"step": 4500
},
{
"epoch": 0.7419498441905327,
"grad_norm": 5.78125,
"learning_rate": 0.0009851610031161894,
"loss": 5.5215,
"step": 5000
},
{
"epoch": 0.816144828609586,
"grad_norm": 4.65625,
"learning_rate": 0.0009836771034278083,
"loss": 5.5206,
"step": 5500
},
{
"epoch": 0.8903398130286393,
"grad_norm": 10.125,
"learning_rate": 0.0009821932037394272,
"loss": 5.5186,
"step": 6000
},
{
"epoch": 0.9645347974476925,
"grad_norm": 12.0,
"learning_rate": 0.0009807093040510462,
"loss": 5.5166,
"step": 6500
},
{
"epoch": 1.0,
"eval_loss": 5.499637603759766,
"eval_runtime": 2.498,
"eval_samples_per_second": 400.319,
"eval_steps_per_second": 3.203,
"step": 6739
},
{
"epoch": 1.0387297818667458,
"grad_norm": 1.2734375,
"learning_rate": 0.0009792254043626651,
"loss": 5.5143,
"step": 7000
},
{
"epoch": 1.112924766285799,
"grad_norm": 0.76953125,
"learning_rate": 0.000977741504674284,
"loss": 5.5052,
"step": 7500
},
{
"epoch": 1.1871197507048523,
"grad_norm": 1.203125,
"learning_rate": 0.000976257604985903,
"loss": 5.5022,
"step": 8000
},
{
"epoch": 1.2613147351239056,
"grad_norm": 2.671875,
"learning_rate": 0.000974773705297522,
"loss": 5.4968,
"step": 8500
},
{
"epoch": 1.3355097195429588,
"grad_norm": 0.9375,
"learning_rate": 0.0009732898056091408,
"loss": 5.4938,
"step": 9000
},
{
"epoch": 1.4097047039620123,
"grad_norm": 7.21875,
"learning_rate": 0.0009718059059207598,
"loss": 5.4899,
"step": 9500
},
{
"epoch": 1.4838996883810656,
"grad_norm": 0.80078125,
"learning_rate": 0.0009703220062323788,
"loss": 5.4853,
"step": 10000
},
{
"epoch": 1.5580946728001188,
"grad_norm": 0.7890625,
"learning_rate": 0.0009688381065439977,
"loss": 5.4836,
"step": 10500
},
{
"epoch": 1.632289657219172,
"grad_norm": 1.3046875,
"learning_rate": 0.0009673542068556166,
"loss": 5.4761,
"step": 11000
},
{
"epoch": 1.7064846416382253,
"grad_norm": 1.09375,
"learning_rate": 0.0009658703071672355,
"loss": 5.4742,
"step": 11500
},
{
"epoch": 1.7806796260572786,
"grad_norm": 1.2109375,
"learning_rate": 0.0009643864074788544,
"loss": 5.4737,
"step": 12000
},
{
"epoch": 1.8548746104763318,
"grad_norm": 1.4453125,
"learning_rate": 0.0009629025077904734,
"loss": 5.4767,
"step": 12500
},
{
"epoch": 1.929069594895385,
"grad_norm": 4.09375,
"learning_rate": 0.0009614186081020924,
"loss": 5.4729,
"step": 13000
},
{
"epoch": 2.0,
"eval_loss": 5.460068702697754,
"eval_runtime": 2.5452,
"eval_samples_per_second": 392.895,
"eval_steps_per_second": 3.143,
"step": 13478
},
{
"epoch": 2.0032645793144384,
"grad_norm": 0.765625,
"learning_rate": 0.0009599347084137112,
"loss": 5.4755,
"step": 13500
},
{
"epoch": 2.0774595637334916,
"grad_norm": 0.89453125,
"learning_rate": 0.0009584508087253302,
"loss": 5.4638,
"step": 14000
},
{
"epoch": 2.151654548152545,
"grad_norm": 0.78125,
"learning_rate": 0.0009569669090369492,
"loss": 5.4658,
"step": 14500
},
{
"epoch": 2.225849532571598,
"grad_norm": 0.8125,
"learning_rate": 0.0009554830093485681,
"loss": 5.46,
"step": 15000
},
{
"epoch": 2.3000445169906514,
"grad_norm": 0.7265625,
"learning_rate": 0.000953999109660187,
"loss": 5.4562,
"step": 15500
},
{
"epoch": 2.3742395014097046,
"grad_norm": 1.7109375,
"learning_rate": 0.0009525152099718059,
"loss": 5.4593,
"step": 16000
},
{
"epoch": 2.448434485828758,
"grad_norm": 1.0390625,
"learning_rate": 0.0009510313102834248,
"loss": 5.4628,
"step": 16500
},
{
"epoch": 2.522629470247811,
"grad_norm": 2.4375,
"learning_rate": 0.0009495474105950438,
"loss": 5.4565,
"step": 17000
},
{
"epoch": 2.5968244546668644,
"grad_norm": 1.5625,
"learning_rate": 0.0009480635109066628,
"loss": 5.4576,
"step": 17500
},
{
"epoch": 2.6710194390859177,
"grad_norm": 0.9765625,
"learning_rate": 0.0009465796112182816,
"loss": 5.4535,
"step": 18000
},
{
"epoch": 2.745214423504971,
"grad_norm": 3.4375,
"learning_rate": 0.0009450957115299006,
"loss": 5.458,
"step": 18500
},
{
"epoch": 2.8194094079240246,
"grad_norm": 1.4765625,
"learning_rate": 0.0009436118118415196,
"loss": 5.4522,
"step": 19000
},
{
"epoch": 2.893604392343078,
"grad_norm": 0.93359375,
"learning_rate": 0.0009421279121531385,
"loss": 5.4521,
"step": 19500
},
{
"epoch": 2.967799376762131,
"grad_norm": 2.03125,
"learning_rate": 0.0009406440124647574,
"loss": 5.4537,
"step": 20000
},
{
"epoch": 3.0,
"eval_loss": 5.447722434997559,
"eval_runtime": 2.4611,
"eval_samples_per_second": 406.315,
"eval_steps_per_second": 3.251,
"step": 20217
},
{
"epoch": 3.041994361181184,
"grad_norm": 1.3359375,
"learning_rate": 0.0009391601127763763,
"loss": 5.4561,
"step": 20500
},
{
"epoch": 3.1161893456002376,
"grad_norm": 1.2109375,
"learning_rate": 0.0009376762130879952,
"loss": 5.4494,
"step": 21000
},
{
"epoch": 3.190384330019291,
"grad_norm": 3.453125,
"learning_rate": 0.0009361923133996142,
"loss": 5.4476,
"step": 21500
},
{
"epoch": 3.264579314438344,
"grad_norm": 0.7890625,
"learning_rate": 0.0009347084137112332,
"loss": 5.4441,
"step": 22000
},
{
"epoch": 3.3387742988573974,
"grad_norm": 0.8828125,
"learning_rate": 0.000933224514022852,
"loss": 5.4429,
"step": 22500
},
{
"epoch": 3.4129692832764507,
"grad_norm": 1.40625,
"learning_rate": 0.000931740614334471,
"loss": 5.4421,
"step": 23000
},
{
"epoch": 3.487164267695504,
"grad_norm": 0.97265625,
"learning_rate": 0.00093025671464609,
"loss": 5.4414,
"step": 23500
},
{
"epoch": 3.561359252114557,
"grad_norm": 1.5390625,
"learning_rate": 0.000928772814957709,
"loss": 5.4417,
"step": 24000
},
{
"epoch": 3.6355542365336104,
"grad_norm": 1.3125,
"learning_rate": 0.0009272889152693278,
"loss": 5.44,
"step": 24500
},
{
"epoch": 3.7097492209526637,
"grad_norm": 0.9765625,
"learning_rate": 0.0009258050155809467,
"loss": 5.4383,
"step": 25000
},
{
"epoch": 3.783944205371717,
"grad_norm": 0.83203125,
"learning_rate": 0.0009243211158925657,
"loss": 5.4373,
"step": 25500
},
{
"epoch": 3.85813918979077,
"grad_norm": 0.9140625,
"learning_rate": 0.0009228372162041846,
"loss": 5.4365,
"step": 26000
},
{
"epoch": 3.9323341742098235,
"grad_norm": 0.8671875,
"learning_rate": 0.0009213533165158036,
"loss": 5.4409,
"step": 26500
},
{
"epoch": 4.0,
"eval_loss": 5.42551326751709,
"eval_runtime": 2.856,
"eval_samples_per_second": 350.134,
"eval_steps_per_second": 2.801,
"step": 26956
},
{
"epoch": 4.006529158628877,
"grad_norm": 1.8359375,
"learning_rate": 0.0009198694168274225,
"loss": 5.4357,
"step": 27000
},
{
"epoch": 4.08072414304793,
"grad_norm": 0.74609375,
"learning_rate": 0.0009183855171390414,
"loss": 5.4394,
"step": 27500
},
{
"epoch": 4.154919127466983,
"grad_norm": 0.875,
"learning_rate": 0.0009169016174506604,
"loss": 5.4285,
"step": 28000
},
{
"epoch": 4.2291141118860365,
"grad_norm": 1.1953125,
"learning_rate": 0.0009154177177622794,
"loss": 5.4331,
"step": 28500
},
{
"epoch": 4.30330909630509,
"grad_norm": 0.94140625,
"learning_rate": 0.0009139338180738981,
"loss": 5.4298,
"step": 29000
},
{
"epoch": 4.377504080724143,
"grad_norm": 1.078125,
"learning_rate": 0.0009124499183855171,
"loss": 5.4261,
"step": 29500
},
{
"epoch": 4.451699065143196,
"grad_norm": 0.7421875,
"learning_rate": 0.0009109660186971361,
"loss": 5.4228,
"step": 30000
},
{
"epoch": 4.5258940495622495,
"grad_norm": 0.8671875,
"learning_rate": 0.000909482119008755,
"loss": 5.4248,
"step": 30500
},
{
"epoch": 4.600089033981303,
"grad_norm": 1.0703125,
"learning_rate": 0.000907998219320374,
"loss": 5.4221,
"step": 31000
},
{
"epoch": 4.674284018400356,
"grad_norm": 1.2734375,
"learning_rate": 0.0009065143196319929,
"loss": 5.4209,
"step": 31500
},
{
"epoch": 4.748479002819409,
"grad_norm": 2.125,
"learning_rate": 0.0009050304199436118,
"loss": 5.4231,
"step": 32000
},
{
"epoch": 4.8226739872384625,
"grad_norm": 1.1015625,
"learning_rate": 0.0009035465202552308,
"loss": 5.4172,
"step": 32500
},
{
"epoch": 4.896868971657516,
"grad_norm": 0.8515625,
"learning_rate": 0.0009020626205668498,
"loss": 5.4214,
"step": 33000
},
{
"epoch": 4.971063956076569,
"grad_norm": 0.875,
"learning_rate": 0.0009005787208784685,
"loss": 5.4197,
"step": 33500
},
{
"epoch": 5.0,
"eval_loss": 5.406287670135498,
"eval_runtime": 2.2248,
"eval_samples_per_second": 449.473,
"eval_steps_per_second": 3.596,
"step": 33695
},
{
"epoch": 5.045258940495622,
"grad_norm": 0.70703125,
"learning_rate": 0.0008990948211900875,
"loss": 5.4097,
"step": 34000
},
{
"epoch": 5.1194539249146755,
"grad_norm": 0.921875,
"learning_rate": 0.0008976109215017065,
"loss": 5.4088,
"step": 34500
},
{
"epoch": 5.193648909333729,
"grad_norm": 0.8671875,
"learning_rate": 0.0008961270218133254,
"loss": 5.4096,
"step": 35000
},
{
"epoch": 5.267843893752782,
"grad_norm": 1.0703125,
"learning_rate": 0.0008946431221249444,
"loss": 5.4136,
"step": 35500
},
{
"epoch": 5.342038878171835,
"grad_norm": 1.4375,
"learning_rate": 0.0008931592224365633,
"loss": 5.4113,
"step": 36000
},
{
"epoch": 5.416233862590889,
"grad_norm": 1.6796875,
"learning_rate": 0.0008916753227481822,
"loss": 5.4121,
"step": 36500
},
{
"epoch": 5.490428847009942,
"grad_norm": 32.75,
"learning_rate": 0.0008901914230598012,
"loss": 5.4103,
"step": 37000
},
{
"epoch": 5.564623831428995,
"grad_norm": 1.3984375,
"learning_rate": 0.0008887075233714202,
"loss": 5.4097,
"step": 37500
},
{
"epoch": 5.638818815848048,
"grad_norm": 1.078125,
"learning_rate": 0.0008872236236830389,
"loss": 5.4042,
"step": 38000
},
{
"epoch": 5.713013800267102,
"grad_norm": 0.8515625,
"learning_rate": 0.0008857397239946579,
"loss": 5.4061,
"step": 38500
},
{
"epoch": 5.787208784686155,
"grad_norm": 0.7109375,
"learning_rate": 0.0008842558243062769,
"loss": 5.4004,
"step": 39000
},
{
"epoch": 5.861403769105208,
"grad_norm": 0.78515625,
"learning_rate": 0.0008827719246178958,
"loss": 5.3987,
"step": 39500
},
{
"epoch": 5.935598753524262,
"grad_norm": 0.75390625,
"learning_rate": 0.0008812880249295148,
"loss": 5.3989,
"step": 40000
},
{
"epoch": 6.0,
"eval_loss": 5.391963481903076,
"eval_runtime": 2.6622,
"eval_samples_per_second": 375.626,
"eval_steps_per_second": 3.005,
"step": 40434
},
{
"epoch": 6.009793737943315,
"grad_norm": 1.421875,
"learning_rate": 0.0008798041252411337,
"loss": 5.3997,
"step": 40500
},
{
"epoch": 6.083988722362368,
"grad_norm": 1.03125,
"learning_rate": 0.0008783202255527527,
"loss": 5.3956,
"step": 41000
},
{
"epoch": 6.158183706781422,
"grad_norm": 1.2890625,
"learning_rate": 0.0008768363258643716,
"loss": 5.3922,
"step": 41500
},
{
"epoch": 6.232378691200475,
"grad_norm": 3.625,
"learning_rate": 0.0008753524261759906,
"loss": 5.3889,
"step": 42000
},
{
"epoch": 6.3065736756195285,
"grad_norm": 0.7890625,
"learning_rate": 0.0008738685264876095,
"loss": 5.3919,
"step": 42500
},
{
"epoch": 6.380768660038582,
"grad_norm": 1.3515625,
"learning_rate": 0.0008723846267992283,
"loss": 5.3939,
"step": 43000
},
{
"epoch": 6.454963644457635,
"grad_norm": 0.875,
"learning_rate": 0.0008709007271108473,
"loss": 5.3955,
"step": 43500
},
{
"epoch": 6.529158628876688,
"grad_norm": 2.46875,
"learning_rate": 0.0008694168274224663,
"loss": 5.3926,
"step": 44000
},
{
"epoch": 6.6033536132957416,
"grad_norm": 1.625,
"learning_rate": 0.0008679329277340852,
"loss": 5.3846,
"step": 44500
},
{
"epoch": 6.677548597714795,
"grad_norm": 0.984375,
"learning_rate": 0.0008664490280457041,
"loss": 5.3893,
"step": 45000
},
{
"epoch": 6.751743582133848,
"grad_norm": 0.90234375,
"learning_rate": 0.0008649651283573231,
"loss": 5.3905,
"step": 45500
},
{
"epoch": 6.825938566552901,
"grad_norm": 0.69140625,
"learning_rate": 0.000863481228668942,
"loss": 5.3902,
"step": 46000
},
{
"epoch": 6.900133550971955,
"grad_norm": 1.453125,
"learning_rate": 0.000861997328980561,
"loss": 5.3856,
"step": 46500
},
{
"epoch": 6.974328535391008,
"grad_norm": 1.015625,
"learning_rate": 0.0008605134292921799,
"loss": 5.3859,
"step": 47000
},
{
"epoch": 7.0,
"eval_loss": 5.379256725311279,
"eval_runtime": 2.4552,
"eval_samples_per_second": 407.295,
"eval_steps_per_second": 3.258,
"step": 47173
},
{
"epoch": 7.048523519810061,
"grad_norm": 1.2265625,
"learning_rate": 0.0008590295296037987,
"loss": 5.3794,
"step": 47500
},
{
"epoch": 7.122718504229114,
"grad_norm": 1.890625,
"learning_rate": 0.0008575456299154177,
"loss": 5.3824,
"step": 48000
},
{
"epoch": 7.196913488648168,
"grad_norm": 0.8515625,
"learning_rate": 0.0008560617302270367,
"loss": 5.3814,
"step": 48500
},
{
"epoch": 7.271108473067221,
"grad_norm": 1.3203125,
"learning_rate": 0.0008545778305386556,
"loss": 5.3778,
"step": 49000
},
{
"epoch": 7.345303457486274,
"grad_norm": 0.84765625,
"learning_rate": 0.0008530939308502745,
"loss": 5.3796,
"step": 49500
},
{
"epoch": 7.419498441905327,
"grad_norm": 2.953125,
"learning_rate": 0.0008516100311618935,
"loss": 5.3802,
"step": 50000
},
{
"epoch": 7.493693426324381,
"grad_norm": 0.875,
"learning_rate": 0.0008501261314735124,
"loss": 5.3821,
"step": 50500
},
{
"epoch": 7.567888410743434,
"grad_norm": 1.03125,
"learning_rate": 0.0008486422317851314,
"loss": 5.3796,
"step": 51000
},
{
"epoch": 7.642083395162487,
"grad_norm": 0.86328125,
"learning_rate": 0.0008471583320967503,
"loss": 5.3832,
"step": 51500
},
{
"epoch": 7.71627837958154,
"grad_norm": 0.91796875,
"learning_rate": 0.0008456744324083691,
"loss": 5.3821,
"step": 52000
},
{
"epoch": 7.790473364000594,
"grad_norm": 1.1015625,
"learning_rate": 0.0008441905327199881,
"loss": 5.3779,
"step": 52500
},
{
"epoch": 7.864668348419647,
"grad_norm": 1.203125,
"learning_rate": 0.0008427066330316071,
"loss": 5.3801,
"step": 53000
},
{
"epoch": 7.9388633328387,
"grad_norm": 1.0859375,
"learning_rate": 0.000841222733343226,
"loss": 5.3784,
"step": 53500
},
{
"epoch": 8.0,
"eval_loss": 5.376255512237549,
"eval_runtime": 2.4385,
"eval_samples_per_second": 410.092,
"eval_steps_per_second": 3.281,
"step": 53912
},
{
"epoch": 8.013058317257753,
"grad_norm": 1.2421875,
"learning_rate": 0.000839738833654845,
"loss": 5.3737,
"step": 54000
},
{
"epoch": 8.087253301676807,
"grad_norm": 1.53125,
"learning_rate": 0.0008382549339664639,
"loss": 5.3727,
"step": 54500
},
{
"epoch": 8.16144828609586,
"grad_norm": 0.80078125,
"learning_rate": 0.0008367710342780828,
"loss": 5.3665,
"step": 55000
},
{
"epoch": 8.235643270514913,
"grad_norm": 1.078125,
"learning_rate": 0.0008352871345897018,
"loss": 5.3711,
"step": 55500
},
{
"epoch": 8.309838254933966,
"grad_norm": 0.78515625,
"learning_rate": 0.0008338032349013207,
"loss": 5.3637,
"step": 56000
},
{
"epoch": 8.38403323935302,
"grad_norm": 0.84375,
"learning_rate": 0.0008323193352129396,
"loss": 5.3708,
"step": 56500
},
{
"epoch": 8.458228223772073,
"grad_norm": 1.5546875,
"learning_rate": 0.0008308354355245585,
"loss": 5.3706,
"step": 57000
},
{
"epoch": 8.532423208191126,
"grad_norm": 0.9375,
"learning_rate": 0.0008293515358361775,
"loss": 5.371,
"step": 57500
},
{
"epoch": 8.60661819261018,
"grad_norm": 0.80859375,
"learning_rate": 0.0008278676361477965,
"loss": 5.3719,
"step": 58000
},
{
"epoch": 8.680813177029233,
"grad_norm": 1.21875,
"learning_rate": 0.0008263837364594153,
"loss": 5.3708,
"step": 58500
},
{
"epoch": 8.755008161448286,
"grad_norm": 0.81640625,
"learning_rate": 0.0008248998367710343,
"loss": 5.367,
"step": 59000
},
{
"epoch": 8.82920314586734,
"grad_norm": 0.84375,
"learning_rate": 0.0008234159370826533,
"loss": 5.3645,
"step": 59500
},
{
"epoch": 8.903398130286392,
"grad_norm": 1.015625,
"learning_rate": 0.0008219320373942722,
"loss": 5.3625,
"step": 60000
},
{
"epoch": 8.977593114705446,
"grad_norm": 1.1796875,
"learning_rate": 0.0008204481377058911,
"loss": 5.364,
"step": 60500
},
{
"epoch": 9.0,
"eval_loss": 5.361752986907959,
"eval_runtime": 2.4227,
"eval_samples_per_second": 412.763,
"eval_steps_per_second": 3.302,
"step": 60651
},
{
"epoch": 9.051788099124499,
"grad_norm": 0.85546875,
"learning_rate": 0.00081896423801751,
"loss": 5.3608,
"step": 61000
},
{
"epoch": 9.125983083543552,
"grad_norm": 0.98828125,
"learning_rate": 0.0008174803383291289,
"loss": 5.3559,
"step": 61500
},
{
"epoch": 9.200178067962606,
"grad_norm": 0.80078125,
"learning_rate": 0.0008159964386407479,
"loss": 5.3547,
"step": 62000
},
{
"epoch": 9.274373052381659,
"grad_norm": 1.28125,
"learning_rate": 0.0008145125389523669,
"loss": 5.3533,
"step": 62500
},
{
"epoch": 9.348568036800712,
"grad_norm": 0.859375,
"learning_rate": 0.0008130286392639857,
"loss": 5.3545,
"step": 63000
},
{
"epoch": 9.422763021219765,
"grad_norm": 9.5625,
"learning_rate": 0.0008115447395756047,
"loss": 5.3574,
"step": 63500
},
{
"epoch": 9.496958005638819,
"grad_norm": 1.40625,
"learning_rate": 0.0008100608398872237,
"loss": 5.3565,
"step": 64000
},
{
"epoch": 9.571152990057872,
"grad_norm": 2.140625,
"learning_rate": 0.0008085769401988426,
"loss": 5.3529,
"step": 64500
},
{
"epoch": 9.645347974476925,
"grad_norm": 1.421875,
"learning_rate": 0.0008070930405104615,
"loss": 5.3565,
"step": 65000
},
{
"epoch": 9.719542958895978,
"grad_norm": 1.34375,
"learning_rate": 0.0008056091408220804,
"loss": 5.3566,
"step": 65500
},
{
"epoch": 9.793737943315032,
"grad_norm": 1.15625,
"learning_rate": 0.0008041252411336993,
"loss": 5.3579,
"step": 66000
},
{
"epoch": 9.867932927734085,
"grad_norm": 2.46875,
"learning_rate": 0.0008026413414453183,
"loss": 5.3545,
"step": 66500
},
{
"epoch": 9.942127912153138,
"grad_norm": 1.953125,
"learning_rate": 0.0008011574417569373,
"loss": 5.3532,
"step": 67000
},
{
"epoch": 10.0,
"eval_loss": 5.357708930969238,
"eval_runtime": 2.5182,
"eval_samples_per_second": 397.102,
"eval_steps_per_second": 3.177,
"step": 67390
},
{
"epoch": 10.016322896572191,
"grad_norm": 3.09375,
"learning_rate": 0.0007996735420685562,
"loss": 5.3531,
"step": 67500
},
{
"epoch": 10.090517880991245,
"grad_norm": 5.59375,
"learning_rate": 0.0007981896423801751,
"loss": 5.3441,
"step": 68000
},
{
"epoch": 10.164712865410298,
"grad_norm": 3.4375,
"learning_rate": 0.0007967057426917941,
"loss": 5.3464,
"step": 68500
},
{
"epoch": 10.238907849829351,
"grad_norm": 1.0703125,
"learning_rate": 0.000795221843003413,
"loss": 5.3482,
"step": 69000
},
{
"epoch": 10.313102834248404,
"grad_norm": 3.046875,
"learning_rate": 0.0007937379433150319,
"loss": 5.3481,
"step": 69500
},
{
"epoch": 10.387297818667458,
"grad_norm": 0.74609375,
"learning_rate": 0.0007922540436266508,
"loss": 5.35,
"step": 70000
},
{
"epoch": 10.46149280308651,
"grad_norm": 0.75390625,
"learning_rate": 0.0007907701439382697,
"loss": 5.3446,
"step": 70500
},
{
"epoch": 10.535687787505564,
"grad_norm": 11.5,
"learning_rate": 0.0007892862442498887,
"loss": 5.3415,
"step": 71000
},
{
"epoch": 10.609882771924617,
"grad_norm": 0.78515625,
"learning_rate": 0.0007878023445615077,
"loss": 5.3424,
"step": 71500
},
{
"epoch": 10.68407775634367,
"grad_norm": 0.92578125,
"learning_rate": 0.0007863184448731267,
"loss": 5.3428,
"step": 72000
},
{
"epoch": 10.758272740762724,
"grad_norm": 1.234375,
"learning_rate": 0.0007848345451847455,
"loss": 5.3442,
"step": 72500
},
{
"epoch": 10.832467725181777,
"grad_norm": 4.21875,
"learning_rate": 0.0007833506454963645,
"loss": 5.3416,
"step": 73000
},
{
"epoch": 10.90666270960083,
"grad_norm": 2.890625,
"learning_rate": 0.0007818667458079835,
"loss": 5.343,
"step": 73500
},
{
"epoch": 10.980857694019884,
"grad_norm": 0.93359375,
"learning_rate": 0.0007803828461196023,
"loss": 5.3442,
"step": 74000
},
{
"epoch": 11.0,
"eval_loss": 5.3484954833984375,
"eval_runtime": 2.2047,
"eval_samples_per_second": 453.582,
"eval_steps_per_second": 3.629,
"step": 74129
},
{
"epoch": 11.055052678438937,
"grad_norm": 14.625,
"learning_rate": 0.0007788989464312212,
"loss": 5.3356,
"step": 74500
},
{
"epoch": 11.12924766285799,
"grad_norm": 1.25,
"learning_rate": 0.0007774150467428402,
"loss": 5.3352,
"step": 75000
},
{
"epoch": 11.203442647277043,
"grad_norm": 0.80859375,
"learning_rate": 0.0007759311470544591,
"loss": 5.3376,
"step": 75500
},
{
"epoch": 11.277637631696097,
"grad_norm": 1.0625,
"learning_rate": 0.0007744472473660781,
"loss": 5.3388,
"step": 76000
},
{
"epoch": 11.35183261611515,
"grad_norm": 0.94140625,
"learning_rate": 0.0007729633476776971,
"loss": 5.3359,
"step": 76500
},
{
"epoch": 11.426027600534203,
"grad_norm": 1.1875,
"learning_rate": 0.0007714794479893159,
"loss": 5.3373,
"step": 77000
},
{
"epoch": 11.500222584953256,
"grad_norm": 0.9140625,
"learning_rate": 0.0007699955483009349,
"loss": 5.3385,
"step": 77500
},
{
"epoch": 11.57441756937231,
"grad_norm": 0.99609375,
"learning_rate": 0.0007685116486125539,
"loss": 5.3334,
"step": 78000
},
{
"epoch": 11.648612553791363,
"grad_norm": 1.671875,
"learning_rate": 0.0007670277489241727,
"loss": 5.3325,
"step": 78500
},
{
"epoch": 11.722807538210416,
"grad_norm": 3.890625,
"learning_rate": 0.0007655438492357916,
"loss": 5.3376,
"step": 79000
},
{
"epoch": 11.79700252262947,
"grad_norm": 1.9765625,
"learning_rate": 0.0007640599495474106,
"loss": 5.3335,
"step": 79500
},
{
"epoch": 11.871197507048523,
"grad_norm": 4.5625,
"learning_rate": 0.0007625760498590295,
"loss": 5.3407,
"step": 80000
},
{
"epoch": 11.945392491467576,
"grad_norm": 2.046875,
"learning_rate": 0.0007610921501706485,
"loss": 5.3363,
"step": 80500
},
{
"epoch": 12.0,
"eval_loss": 5.337391376495361,
"eval_runtime": 2.4996,
"eval_samples_per_second": 400.069,
"eval_steps_per_second": 3.201,
"step": 80868
},
{
"epoch": 12.01958747588663,
"grad_norm": 1.59375,
"learning_rate": 0.0007596082504822675,
"loss": 5.3345,
"step": 81000
},
{
"epoch": 12.093782460305682,
"grad_norm": 1.0625,
"learning_rate": 0.0007581243507938863,
"loss": 5.3255,
"step": 81500
},
{
"epoch": 12.167977444724736,
"grad_norm": 1.0390625,
"learning_rate": 0.0007566404511055053,
"loss": 5.3327,
"step": 82000
},
{
"epoch": 12.242172429143789,
"grad_norm": 1.6953125,
"learning_rate": 0.0007551565514171243,
"loss": 5.3312,
"step": 82500
},
{
"epoch": 12.316367413562844,
"grad_norm": 0.71875,
"learning_rate": 0.0007536726517287431,
"loss": 5.3318,
"step": 83000
},
{
"epoch": 12.390562397981896,
"grad_norm": 0.88671875,
"learning_rate": 0.000752188752040362,
"loss": 5.332,
"step": 83500
},
{
"epoch": 12.46475738240095,
"grad_norm": 0.91796875,
"learning_rate": 0.000750704852351981,
"loss": 5.3283,
"step": 84000
},
{
"epoch": 12.538952366820002,
"grad_norm": 0.86328125,
"learning_rate": 0.0007492209526635999,
"loss": 5.3296,
"step": 84500
},
{
"epoch": 12.613147351239057,
"grad_norm": 1.6796875,
"learning_rate": 0.0007477370529752189,
"loss": 5.3338,
"step": 85000
},
{
"epoch": 12.68734233565811,
"grad_norm": 2.6875,
"learning_rate": 0.0007462531532868379,
"loss": 5.3344,
"step": 85500
},
{
"epoch": 12.761537320077164,
"grad_norm": 0.8515625,
"learning_rate": 0.0007447692535984567,
"loss": 5.328,
"step": 86000
},
{
"epoch": 12.835732304496217,
"grad_norm": 2.921875,
"learning_rate": 0.0007432853539100757,
"loss": 5.3291,
"step": 86500
},
{
"epoch": 12.90992728891527,
"grad_norm": 6.96875,
"learning_rate": 0.0007418014542216947,
"loss": 5.3239,
"step": 87000
},
{
"epoch": 12.984122273334323,
"grad_norm": 1.3671875,
"learning_rate": 0.0007403175545333135,
"loss": 5.3225,
"step": 87500
},
{
"epoch": 13.0,
"eval_loss": 5.332010746002197,
"eval_runtime": 2.4611,
"eval_samples_per_second": 406.325,
"eval_steps_per_second": 3.251,
"step": 87607
},
{
"epoch": 13.058317257753377,
"grad_norm": 1.21875,
"learning_rate": 0.0007388336548449324,
"loss": 5.3172,
"step": 88000
},
{
"epoch": 13.13251224217243,
"grad_norm": 0.94921875,
"learning_rate": 0.0007373497551565514,
"loss": 5.3177,
"step": 88500
},
{
"epoch": 13.206707226591483,
"grad_norm": 0.9140625,
"learning_rate": 0.0007358658554681704,
"loss": 5.3229,
"step": 89000
},
{
"epoch": 13.280902211010536,
"grad_norm": 5.46875,
"learning_rate": 0.0007343819557797893,
"loss": 5.3208,
"step": 89500
},
{
"epoch": 13.35509719542959,
"grad_norm": 1.703125,
"learning_rate": 0.0007328980560914083,
"loss": 5.3127,
"step": 90000
},
{
"epoch": 13.429292179848643,
"grad_norm": 0.921875,
"learning_rate": 0.0007314141564030272,
"loss": 5.3193,
"step": 90500
},
{
"epoch": 13.503487164267696,
"grad_norm": 0.9765625,
"learning_rate": 0.0007299302567146461,
"loss": 5.3168,
"step": 91000
},
{
"epoch": 13.57768214868675,
"grad_norm": 0.87109375,
"learning_rate": 0.000728446357026265,
"loss": 5.3133,
"step": 91500
},
{
"epoch": 13.651877133105803,
"grad_norm": 0.84375,
"learning_rate": 0.000726962457337884,
"loss": 5.3168,
"step": 92000
},
{
"epoch": 13.726072117524856,
"grad_norm": 0.86328125,
"learning_rate": 0.0007254785576495029,
"loss": 5.3168,
"step": 92500
},
{
"epoch": 13.80026710194391,
"grad_norm": 1.125,
"learning_rate": 0.0007239946579611218,
"loss": 5.3133,
"step": 93000
},
{
"epoch": 13.874462086362962,
"grad_norm": 0.98828125,
"learning_rate": 0.0007225107582727408,
"loss": 5.3155,
"step": 93500
},
{
"epoch": 13.948657070782016,
"grad_norm": 1.0,
"learning_rate": 0.0007210268585843597,
"loss": 5.3147,
"step": 94000
},
{
"epoch": 14.0,
"eval_loss": 5.325437545776367,
"eval_runtime": 2.4718,
"eval_samples_per_second": 404.568,
"eval_steps_per_second": 3.237,
"step": 94346
},
{
"epoch": 14.022852055201069,
"grad_norm": 1.3515625,
"learning_rate": 0.0007195429588959787,
"loss": 5.3109,
"step": 94500
},
{
"epoch": 14.097047039620122,
"grad_norm": 1.03125,
"learning_rate": 0.0007180590592075977,
"loss": 5.3044,
"step": 95000
},
{
"epoch": 14.171242024039175,
"grad_norm": 0.9921875,
"learning_rate": 0.0007165751595192165,
"loss": 5.3132,
"step": 95500
},
{
"epoch": 14.245437008458229,
"grad_norm": 1.3671875,
"learning_rate": 0.0007150912598308354,
"loss": 5.3108,
"step": 96000
},
{
"epoch": 14.319631992877282,
"grad_norm": 1.7734375,
"learning_rate": 0.0007136073601424544,
"loss": 5.3107,
"step": 96500
},
{
"epoch": 14.393826977296335,
"grad_norm": 3.71875,
"learning_rate": 0.0007121234604540733,
"loss": 5.3075,
"step": 97000
},
{
"epoch": 14.468021961715388,
"grad_norm": 0.88671875,
"learning_rate": 0.0007106395607656922,
"loss": 5.3084,
"step": 97500
},
{
"epoch": 14.542216946134442,
"grad_norm": 0.8984375,
"learning_rate": 0.0007091556610773112,
"loss": 5.3067,
"step": 98000
},
{
"epoch": 14.616411930553495,
"grad_norm": 1.6015625,
"learning_rate": 0.0007076717613889301,
"loss": 5.3097,
"step": 98500
},
{
"epoch": 14.690606914972548,
"grad_norm": 1.875,
"learning_rate": 0.0007061878617005491,
"loss": 5.3094,
"step": 99000
},
{
"epoch": 14.764801899391601,
"grad_norm": 1.0078125,
"learning_rate": 0.000704703962012168,
"loss": 5.31,
"step": 99500
},
{
"epoch": 14.838996883810655,
"grad_norm": 0.9296875,
"learning_rate": 0.0007032200623237869,
"loss": 5.3075,
"step": 100000
},
{
"epoch": 14.913191868229708,
"grad_norm": 1.078125,
"learning_rate": 0.0007017361626354058,
"loss": 5.3088,
"step": 100500
},
{
"epoch": 14.987386852648761,
"grad_norm": 0.828125,
"learning_rate": 0.0007002522629470248,
"loss": 5.3108,
"step": 101000
},
{
"epoch": 15.0,
"eval_loss": 5.322704315185547,
"eval_runtime": 2.0798,
"eval_samples_per_second": 480.818,
"eval_steps_per_second": 3.847,
"step": 101085
},
{
"epoch": 15.061581837067815,
"grad_norm": 0.90625,
"learning_rate": 0.0006987683632586437,
"loss": 5.306,
"step": 101500
},
{
"epoch": 15.135776821486868,
"grad_norm": 0.81640625,
"learning_rate": 0.0006972844635702626,
"loss": 5.3054,
"step": 102000
},
{
"epoch": 15.209971805905921,
"grad_norm": 1.5,
"learning_rate": 0.0006958005638818816,
"loss": 5.3026,
"step": 102500
},
{
"epoch": 15.284166790324974,
"grad_norm": 0.77734375,
"learning_rate": 0.0006943166641935005,
"loss": 5.3028,
"step": 103000
},
{
"epoch": 15.358361774744028,
"grad_norm": 0.9140625,
"learning_rate": 0.0006928327645051195,
"loss": 5.3042,
"step": 103500
},
{
"epoch": 15.43255675916308,
"grad_norm": 0.75,
"learning_rate": 0.0006913488648167385,
"loss": 5.3034,
"step": 104000
},
{
"epoch": 15.506751743582134,
"grad_norm": 1.21875,
"learning_rate": 0.0006898649651283574,
"loss": 5.309,
"step": 104500
},
{
"epoch": 15.580946728001187,
"grad_norm": 0.81640625,
"learning_rate": 0.0006883810654399762,
"loss": 5.3062,
"step": 105000
},
{
"epoch": 15.65514171242024,
"grad_norm": 1.15625,
"learning_rate": 0.0006868971657515952,
"loss": 5.3026,
"step": 105500
},
{
"epoch": 15.729336696839294,
"grad_norm": 0.984375,
"learning_rate": 0.0006854132660632142,
"loss": 5.3036,
"step": 106000
},
{
"epoch": 15.803531681258347,
"grad_norm": 0.9921875,
"learning_rate": 0.000683929366374833,
"loss": 5.3002,
"step": 106500
},
{
"epoch": 15.8777266656774,
"grad_norm": 1.1171875,
"learning_rate": 0.000682445466686452,
"loss": 5.302,
"step": 107000
},
{
"epoch": 15.951921650096454,
"grad_norm": 0.90234375,
"learning_rate": 0.000680961566998071,
"loss": 5.3014,
"step": 107500
},
{
"epoch": 16.0,
"eval_loss": 5.31771183013916,
"eval_runtime": 2.1156,
"eval_samples_per_second": 472.68,
"eval_steps_per_second": 3.781,
"step": 107824
},
{
"epoch": 16.026116634515507,
"grad_norm": 1.03125,
"learning_rate": 0.0006794776673096899,
"loss": 5.2981,
"step": 108000
},
{
"epoch": 16.10031161893456,
"grad_norm": 4.15625,
"learning_rate": 0.0006779937676213089,
"loss": 5.2982,
"step": 108500
},
{
"epoch": 16.174506603353613,
"grad_norm": 0.8359375,
"learning_rate": 0.0006765098679329278,
"loss": 5.2946,
"step": 109000
},
{
"epoch": 16.248701587772665,
"grad_norm": 1.015625,
"learning_rate": 0.0006750259682445466,
"loss": 5.2957,
"step": 109500
},
{
"epoch": 16.32289657219172,
"grad_norm": 1.0859375,
"learning_rate": 0.0006735420685561656,
"loss": 5.2964,
"step": 110000
},
{
"epoch": 16.397091556610775,
"grad_norm": 1.9453125,
"learning_rate": 0.0006720581688677846,
"loss": 5.2931,
"step": 110500
},
{
"epoch": 16.471286541029826,
"grad_norm": 1.0546875,
"learning_rate": 0.0006705742691794034,
"loss": 5.2916,
"step": 111000
},
{
"epoch": 16.545481525448878,
"grad_norm": 0.79296875,
"learning_rate": 0.0006690903694910224,
"loss": 5.2989,
"step": 111500
},
{
"epoch": 16.619676509867933,
"grad_norm": 1.234375,
"learning_rate": 0.0006676064698026414,
"loss": 5.2992,
"step": 112000
},
{
"epoch": 16.693871494286988,
"grad_norm": 7.375,
"learning_rate": 0.0006661225701142603,
"loss": 5.2971,
"step": 112500
},
{
"epoch": 16.76806647870604,
"grad_norm": 2.796875,
"learning_rate": 0.0006646386704258793,
"loss": 5.2931,
"step": 113000
},
{
"epoch": 16.842261463125094,
"grad_norm": 0.94921875,
"learning_rate": 0.0006631547707374982,
"loss": 5.2875,
"step": 113500
},
{
"epoch": 16.916456447544146,
"grad_norm": 0.74609375,
"learning_rate": 0.000661670871049117,
"loss": 5.2904,
"step": 114000
},
{
"epoch": 16.9906514319632,
"grad_norm": 0.8515625,
"learning_rate": 0.000660186971360736,
"loss": 5.2886,
"step": 114500
},
{
"epoch": 17.0,
"eval_loss": 5.307115077972412,
"eval_runtime": 2.5,
"eval_samples_per_second": 399.995,
"eval_steps_per_second": 3.2,
"step": 114563
},
{
"epoch": 17.064846416382252,
"grad_norm": 1.125,
"learning_rate": 0.000658703071672355,
"loss": 5.2867,
"step": 115000
},
{
"epoch": 17.139041400801307,
"grad_norm": 0.85546875,
"learning_rate": 0.0006572191719839738,
"loss": 5.2857,
"step": 115500
},
{
"epoch": 17.21323638522036,
"grad_norm": 0.8515625,
"learning_rate": 0.0006557352722955928,
"loss": 5.2865,
"step": 116000
},
{
"epoch": 17.287431369639414,
"grad_norm": 0.95703125,
"learning_rate": 0.0006542513726072118,
"loss": 5.2885,
"step": 116500
},
{
"epoch": 17.361626354058465,
"grad_norm": 1.1015625,
"learning_rate": 0.0006527674729188307,
"loss": 5.287,
"step": 117000
},
{
"epoch": 17.43582133847752,
"grad_norm": 1.3828125,
"learning_rate": 0.0006512835732304497,
"loss": 5.2837,
"step": 117500
},
{
"epoch": 17.510016322896572,
"grad_norm": 1.8046875,
"learning_rate": 0.0006497996735420686,
"loss": 5.2862,
"step": 118000
},
{
"epoch": 17.584211307315627,
"grad_norm": 0.8984375,
"learning_rate": 0.0006483157738536874,
"loss": 5.2868,
"step": 118500
},
{
"epoch": 17.65840629173468,
"grad_norm": 1.8046875,
"learning_rate": 0.0006468318741653064,
"loss": 5.28,
"step": 119000
},
{
"epoch": 17.732601276153733,
"grad_norm": 1.0390625,
"learning_rate": 0.0006453479744769254,
"loss": 5.2887,
"step": 119500
},
{
"epoch": 17.806796260572785,
"grad_norm": 2.015625,
"learning_rate": 0.0006438640747885443,
"loss": 5.2789,
"step": 120000
},
{
"epoch": 17.88099124499184,
"grad_norm": 1.34375,
"learning_rate": 0.0006423801751001632,
"loss": 5.2849,
"step": 120500
},
{
"epoch": 17.95518622941089,
"grad_norm": 0.85546875,
"learning_rate": 0.0006408962754117822,
"loss": 5.2823,
"step": 121000
},
{
"epoch": 18.0,
"eval_loss": 5.3026299476623535,
"eval_runtime": 2.0622,
"eval_samples_per_second": 484.913,
"eval_steps_per_second": 3.879,
"step": 121302
}
],
"logging_steps": 500,
"max_steps": 336950,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.515022957932667e+17,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}