code_3 / trainer_state.json (commit 6d1add3, "First model version", uploaded by ciyazzzk)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9748743718592965,
"eval_steps": 200,
"global_step": 222,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01340033500837521,
"grad_norm": 8.789632797241211,
"learning_rate": 8.695652173913044e-07,
"loss": 0.6247,
"step": 1
},
{
"epoch": 0.02680067001675042,
"grad_norm": 9.524643898010254,
"learning_rate": 1.7391304347826088e-06,
"loss": 0.6273,
"step": 2
},
{
"epoch": 0.04020100502512563,
"grad_norm": 8.73757266998291,
"learning_rate": 2.6086956521739132e-06,
"loss": 0.6193,
"step": 3
},
{
"epoch": 0.05360134003350084,
"grad_norm": 6.146267890930176,
"learning_rate": 3.4782608695652175e-06,
"loss": 0.5812,
"step": 4
},
{
"epoch": 0.06700167504187604,
"grad_norm": 7.021780967712402,
"learning_rate": 4.347826086956522e-06,
"loss": 0.5455,
"step": 5
},
{
"epoch": 0.08040201005025126,
"grad_norm": 7.42340612411499,
"learning_rate": 5.2173913043478265e-06,
"loss": 0.5145,
"step": 6
},
{
"epoch": 0.09380234505862646,
"grad_norm": 4.656023025512695,
"learning_rate": 6.086956521739132e-06,
"loss": 0.4875,
"step": 7
},
{
"epoch": 0.10720268006700168,
"grad_norm": 3.4204678535461426,
"learning_rate": 6.956521739130435e-06,
"loss": 0.4797,
"step": 8
},
{
"epoch": 0.12060301507537688,
"grad_norm": 3.4276418685913086,
"learning_rate": 7.82608695652174e-06,
"loss": 0.4667,
"step": 9
},
{
"epoch": 0.13400335008375208,
"grad_norm": 2.8007936477661133,
"learning_rate": 8.695652173913044e-06,
"loss": 0.453,
"step": 10
},
{
"epoch": 0.1474036850921273,
"grad_norm": 2.540774345397949,
"learning_rate": 9.565217391304349e-06,
"loss": 0.4417,
"step": 11
},
{
"epoch": 0.16080402010050251,
"grad_norm": 2.691410779953003,
"learning_rate": 1.0434782608695653e-05,
"loss": 0.45,
"step": 12
},
{
"epoch": 0.17420435510887772,
"grad_norm": 2.2960121631622314,
"learning_rate": 1.1304347826086957e-05,
"loss": 0.4369,
"step": 13
},
{
"epoch": 0.18760469011725292,
"grad_norm": 2.296201229095459,
"learning_rate": 1.2173913043478263e-05,
"loss": 0.4267,
"step": 14
},
{
"epoch": 0.20100502512562815,
"grad_norm": 2.5100786685943604,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.4362,
"step": 15
},
{
"epoch": 0.21440536013400335,
"grad_norm": 2.1405630111694336,
"learning_rate": 1.391304347826087e-05,
"loss": 0.4166,
"step": 16
},
{
"epoch": 0.22780569514237856,
"grad_norm": 2.600360870361328,
"learning_rate": 1.4782608695652174e-05,
"loss": 0.4289,
"step": 17
},
{
"epoch": 0.24120603015075376,
"grad_norm": 2.8101706504821777,
"learning_rate": 1.565217391304348e-05,
"loss": 0.417,
"step": 18
},
{
"epoch": 0.254606365159129,
"grad_norm": 2.1962270736694336,
"learning_rate": 1.6521739130434785e-05,
"loss": 0.4123,
"step": 19
},
{
"epoch": 0.26800670016750416,
"grad_norm": 2.412221670150757,
"learning_rate": 1.739130434782609e-05,
"loss": 0.4144,
"step": 20
},
{
"epoch": 0.2814070351758794,
"grad_norm": 2.1126928329467773,
"learning_rate": 1.8260869565217393e-05,
"loss": 0.4112,
"step": 21
},
{
"epoch": 0.2948073701842546,
"grad_norm": 2.676149368286133,
"learning_rate": 1.9130434782608697e-05,
"loss": 0.4036,
"step": 22
},
{
"epoch": 0.3082077051926298,
"grad_norm": 2.2323713302612305,
"learning_rate": 2e-05,
"loss": 0.4132,
"step": 23
},
{
"epoch": 0.32160804020100503,
"grad_norm": 2.510111093521118,
"learning_rate": 1.9998753895176576e-05,
"loss": 0.4086,
"step": 24
},
{
"epoch": 0.33500837520938026,
"grad_norm": 1.944287896156311,
"learning_rate": 1.999501589126174e-05,
"loss": 0.3983,
"step": 25
},
{
"epoch": 0.34840871021775544,
"grad_norm": 2.390577554702759,
"learning_rate": 1.9988786919844437e-05,
"loss": 0.3858,
"step": 26
},
{
"epoch": 0.36180904522613067,
"grad_norm": 2.6762897968292236,
"learning_rate": 1.9980068533314937e-05,
"loss": 0.4029,
"step": 27
},
{
"epoch": 0.37520938023450584,
"grad_norm": 2.2880735397338867,
"learning_rate": 1.9968862904477936e-05,
"loss": 0.4051,
"step": 28
},
{
"epoch": 0.38860971524288107,
"grad_norm": 1.8628953695297241,
"learning_rate": 1.995517282601106e-05,
"loss": 0.3984,
"step": 29
},
{
"epoch": 0.4020100502512563,
"grad_norm": 2.0865204334259033,
"learning_rate": 1.993900170976888e-05,
"loss": 0.3999,
"step": 30
},
{
"epoch": 0.4154103852596315,
"grad_norm": 1.8910424709320068,
"learning_rate": 1.992035358593258e-05,
"loss": 0.3881,
"step": 31
},
{
"epoch": 0.4288107202680067,
"grad_norm": 1.797484040260315,
"learning_rate": 1.9899233102005573e-05,
"loss": 0.3908,
"step": 32
},
{
"epoch": 0.44221105527638194,
"grad_norm": 1.8772716522216797,
"learning_rate": 1.987564552165524e-05,
"loss": 0.3914,
"step": 33
},
{
"epoch": 0.4556113902847571,
"grad_norm": 2.0772998332977295,
"learning_rate": 1.984959672340111e-05,
"loss": 0.4001,
"step": 34
},
{
"epoch": 0.46901172529313234,
"grad_norm": 1.7790579795837402,
"learning_rate": 1.9821093199149806e-05,
"loss": 0.3782,
"step": 35
},
{
"epoch": 0.4824120603015075,
"grad_norm": 2.1140856742858887,
"learning_rate": 1.9790142052577148e-05,
"loss": 0.3877,
"step": 36
},
{
"epoch": 0.49581239530988275,
"grad_norm": 1.9867079257965088,
"learning_rate": 1.9756750997357738e-05,
"loss": 0.3942,
"step": 37
},
{
"epoch": 0.509212730318258,
"grad_norm": 1.9369703531265259,
"learning_rate": 1.972092835524257e-05,
"loss": 0.3953,
"step": 38
},
{
"epoch": 0.5226130653266332,
"grad_norm": 1.8987735509872437,
"learning_rate": 1.9682683053985073e-05,
"loss": 0.3892,
"step": 39
},
{
"epoch": 0.5360134003350083,
"grad_norm": 2.057279348373413,
"learning_rate": 1.9642024625116117e-05,
"loss": 0.3953,
"step": 40
},
{
"epoch": 0.5494137353433836,
"grad_norm": 1.8630274534225464,
"learning_rate": 1.959896320156857e-05,
"loss": 0.3938,
"step": 41
},
{
"epoch": 0.5628140703517588,
"grad_norm": 1.9051440954208374,
"learning_rate": 1.955350951515195e-05,
"loss": 0.3865,
"step": 42
},
{
"epoch": 0.576214405360134,
"grad_norm": 1.9074325561523438,
"learning_rate": 1.950567489387783e-05,
"loss": 0.3845,
"step": 43
},
{
"epoch": 0.5896147403685092,
"grad_norm": 1.8396834135055542,
"learning_rate": 1.945547125913667e-05,
"loss": 0.3881,
"step": 44
},
{
"epoch": 0.6030150753768844,
"grad_norm": 1.882157325744629,
"learning_rate": 1.9402911122726756e-05,
"loss": 0.3877,
"step": 45
},
{
"epoch": 0.6164154103852596,
"grad_norm": 1.9075357913970947,
"learning_rate": 1.9348007583735985e-05,
"loss": 0.3879,
"step": 46
},
{
"epoch": 0.6298157453936348,
"grad_norm": 1.7783665657043457,
"learning_rate": 1.9290774325277305e-05,
"loss": 0.3778,
"step": 47
},
{
"epoch": 0.6432160804020101,
"grad_norm": 1.8143409490585327,
"learning_rate": 1.923122561107861e-05,
"loss": 0.3787,
"step": 48
},
{
"epoch": 0.6566164154103853,
"grad_norm": 1.833724856376648,
"learning_rate": 1.916937628192789e-05,
"loss": 0.3779,
"step": 49
},
{
"epoch": 0.6700167504187605,
"grad_norm": 2.056809186935425,
"learning_rate": 1.9105241751974624e-05,
"loss": 0.3861,
"step": 50
},
{
"epoch": 0.6834170854271356,
"grad_norm": 1.8272616863250732,
"learning_rate": 1.903883800488824e-05,
"loss": 0.3783,
"step": 51
},
{
"epoch": 0.6968174204355109,
"grad_norm": 2.2034404277801514,
"learning_rate": 1.8970181589874637e-05,
"loss": 0.3807,
"step": 52
},
{
"epoch": 0.7102177554438861,
"grad_norm": 1.6915210485458374,
"learning_rate": 1.8899289617551803e-05,
"loss": 0.3732,
"step": 53
},
{
"epoch": 0.7236180904522613,
"grad_norm": 1.9049310684204102,
"learning_rate": 1.882617975568547e-05,
"loss": 0.382,
"step": 54
},
{
"epoch": 0.7370184254606366,
"grad_norm": 1.736574649810791,
"learning_rate": 1.875087022478594e-05,
"loss": 0.3735,
"step": 55
},
{
"epoch": 0.7504187604690117,
"grad_norm": 1.87579345703125,
"learning_rate": 1.867337979356715e-05,
"loss": 0.3785,
"step": 56
},
{
"epoch": 0.7638190954773869,
"grad_norm": 1.8350921869277954,
"learning_rate": 1.8593727774269122e-05,
"loss": 0.3698,
"step": 57
},
{
"epoch": 0.7772194304857621,
"grad_norm": 1.7165268659591675,
"learning_rate": 1.851193401784495e-05,
"loss": 0.3772,
"step": 58
},
{
"epoch": 0.7906197654941374,
"grad_norm": 1.7701345682144165,
"learning_rate": 1.842801890901351e-05,
"loss": 0.3851,
"step": 59
},
{
"epoch": 0.8040201005025126,
"grad_norm": 1.6949541568756104,
"learning_rate": 1.834200336117918e-05,
"loss": 0.3853,
"step": 60
},
{
"epoch": 0.8174204355108877,
"grad_norm": 1.6625339984893799,
"learning_rate": 1.8253908811219764e-05,
"loss": 0.3699,
"step": 61
},
{
"epoch": 0.830820770519263,
"grad_norm": 1.782524585723877,
"learning_rate": 1.8163757214143993e-05,
"loss": 0.3718,
"step": 62
},
{
"epoch": 0.8442211055276382,
"grad_norm": 1.8429073095321655,
"learning_rate": 1.8071571037619856e-05,
"loss": 0.3737,
"step": 63
},
{
"epoch": 0.8576214405360134,
"grad_norm": 1.6888030767440796,
"learning_rate": 1.7977373256375194e-05,
"loss": 0.3796,
"step": 64
},
{
"epoch": 0.8710217755443886,
"grad_norm": 1.7543060779571533,
"learning_rate": 1.7881187346471924e-05,
"loss": 0.3811,
"step": 65
},
{
"epoch": 0.8844221105527639,
"grad_norm": 1.5395281314849854,
"learning_rate": 1.77830372794553e-05,
"loss": 0.3813,
"step": 66
},
{
"epoch": 0.897822445561139,
"grad_norm": 1.7886697053909302,
"learning_rate": 1.7682947516379706e-05,
"loss": 0.3775,
"step": 67
},
{
"epoch": 0.9112227805695142,
"grad_norm": 1.8174864053726196,
"learning_rate": 1.7580943001712457e-05,
"loss": 0.379,
"step": 68
},
{
"epoch": 0.9246231155778895,
"grad_norm": 1.7818920612335205,
"learning_rate": 1.7477049157117093e-05,
"loss": 0.3736,
"step": 69
},
{
"epoch": 0.9380234505862647,
"grad_norm": 1.729671835899353,
"learning_rate": 1.737129187511779e-05,
"loss": 0.3652,
"step": 70
},
{
"epoch": 0.9514237855946399,
"grad_norm": 1.6677137613296509,
"learning_rate": 1.7263697512646397e-05,
"loss": 0.3762,
"step": 71
},
{
"epoch": 0.964824120603015,
"grad_norm": 1.597731113433838,
"learning_rate": 1.7154292884473712e-05,
"loss": 0.374,
"step": 72
},
{
"epoch": 0.9782244556113903,
"grad_norm": 1.757295846939087,
"learning_rate": 1.7043105256526723e-05,
"loss": 0.3792,
"step": 73
},
{
"epoch": 0.9916247906197655,
"grad_norm": 1.7294024229049683,
"learning_rate": 1.693016233909332e-05,
"loss": 0.3765,
"step": 74
},
{
"epoch": 1.0050251256281406,
"grad_norm": 1.6741400957107544,
"learning_rate": 1.681549227991634e-05,
"loss": 0.3694,
"step": 75
},
{
"epoch": 1.018425460636516,
"grad_norm": 1.5644919872283936,
"learning_rate": 1.6699123657178553e-05,
"loss": 0.3538,
"step": 76
},
{
"epoch": 1.031825795644891,
"grad_norm": 1.651426911354065,
"learning_rate": 1.658108547238038e-05,
"loss": 0.3468,
"step": 77
},
{
"epoch": 1.0452261306532664,
"grad_norm": 1.64875328540802,
"learning_rate": 1.64614071431121e-05,
"loss": 0.3612,
"step": 78
},
{
"epoch": 1.0586264656616415,
"grad_norm": 1.6128103733062744,
"learning_rate": 1.634011849572239e-05,
"loss": 0.3604,
"step": 79
},
{
"epoch": 1.0720268006700167,
"grad_norm": 1.6095118522644043,
"learning_rate": 1.6217249757884954e-05,
"loss": 0.3586,
"step": 80
},
{
"epoch": 1.085427135678392,
"grad_norm": 1.8337584733963013,
"learning_rate": 1.609283155106517e-05,
"loss": 0.3484,
"step": 81
},
{
"epoch": 1.0988274706867671,
"grad_norm": 1.6831597089767456,
"learning_rate": 1.596689488288856e-05,
"loss": 0.3576,
"step": 82
},
{
"epoch": 1.1122278056951425,
"grad_norm": 1.809220314025879,
"learning_rate": 1.5839471139413065e-05,
"loss": 0.3497,
"step": 83
},
{
"epoch": 1.1256281407035176,
"grad_norm": 1.7941969633102417,
"learning_rate": 1.571059207730695e-05,
"loss": 0.3643,
"step": 84
},
{
"epoch": 1.1390284757118927,
"grad_norm": 2.1410865783691406,
"learning_rate": 1.55802898159344e-05,
"loss": 0.3524,
"step": 85
},
{
"epoch": 1.152428810720268,
"grad_norm": 1.7003251314163208,
"learning_rate": 1.5448596829350706e-05,
"loss": 0.3514,
"step": 86
},
{
"epoch": 1.1658291457286432,
"grad_norm": 1.7707059383392334,
"learning_rate": 1.5315545938209016e-05,
"loss": 0.36,
"step": 87
},
{
"epoch": 1.1792294807370185,
"grad_norm": 1.5506631135940552,
"learning_rate": 1.5181170301580776e-05,
"loss": 0.3543,
"step": 88
},
{
"epoch": 1.1926298157453936,
"grad_norm": 1.9891937971115112,
"learning_rate": 1.5045503408691776e-05,
"loss": 0.358,
"step": 89
},
{
"epoch": 1.2060301507537687,
"grad_norm": 1.6654824018478394,
"learning_rate": 1.4908579070575936e-05,
"loss": 0.3586,
"step": 90
},
{
"epoch": 1.219430485762144,
"grad_norm": 1.7871456146240234,
"learning_rate": 1.4770431411648898e-05,
"loss": 0.3481,
"step": 91
},
{
"epoch": 1.2328308207705192,
"grad_norm": 1.7653170824050903,
"learning_rate": 1.4631094861203478e-05,
"loss": 0.3579,
"step": 92
},
{
"epoch": 1.2462311557788945,
"grad_norm": 1.670673131942749,
"learning_rate": 1.4490604144829204e-05,
"loss": 0.3542,
"step": 93
},
{
"epoch": 1.2596314907872697,
"grad_norm": 1.6217372417449951,
"learning_rate": 1.4348994275757933e-05,
"loss": 0.3573,
"step": 94
},
{
"epoch": 1.2730318257956448,
"grad_norm": 1.6846883296966553,
"learning_rate": 1.4206300546137844e-05,
"loss": 0.3618,
"step": 95
},
{
"epoch": 1.2864321608040201,
"grad_norm": 1.7402368783950806,
"learning_rate": 1.4062558518237893e-05,
"loss": 0.3568,
"step": 96
},
{
"epoch": 1.2998324958123952,
"grad_norm": 1.5939064025878906,
"learning_rate": 1.3917804015584932e-05,
"loss": 0.3522,
"step": 97
},
{
"epoch": 1.3132328308207706,
"grad_norm": 1.7298028469085693,
"learning_rate": 1.3772073114035762e-05,
"loss": 0.3489,
"step": 98
},
{
"epoch": 1.3266331658291457,
"grad_norm": 1.6836535930633545,
"learning_rate": 1.3625402132786247e-05,
"loss": 0.3607,
"step": 99
},
{
"epoch": 1.3400335008375208,
"grad_norm": 1.7039278745651245,
"learning_rate": 1.3477827625319826e-05,
"loss": 0.3577,
"step": 100
},
{
"epoch": 1.3534338358458962,
"grad_norm": 1.7519850730895996,
"learning_rate": 1.3329386370297615e-05,
"loss": 0.3587,
"step": 101
},
{
"epoch": 1.3668341708542713,
"grad_norm": 1.6788212060928345,
"learning_rate": 1.3180115362392383e-05,
"loss": 0.3542,
"step": 102
},
{
"epoch": 1.3802345058626466,
"grad_norm": 1.7427568435668945,
"learning_rate": 1.3030051803068729e-05,
"loss": 0.3441,
"step": 103
},
{
"epoch": 1.3936348408710217,
"grad_norm": 1.7681330442428589,
"learning_rate": 1.2879233091311667e-05,
"loss": 0.359,
"step": 104
},
{
"epoch": 1.4070351758793969,
"grad_norm": 1.7159209251403809,
"learning_rate": 1.2727696814306034e-05,
"loss": 0.3474,
"step": 105
},
{
"epoch": 1.4204355108877722,
"grad_norm": 1.589086890220642,
"learning_rate": 1.2575480738068971e-05,
"loss": 0.3492,
"step": 106
},
{
"epoch": 1.4338358458961473,
"grad_norm": 1.6435205936431885,
"learning_rate": 1.2422622798037833e-05,
"loss": 0.3541,
"step": 107
},
{
"epoch": 1.4472361809045227,
"grad_norm": 1.6578630208969116,
"learning_rate": 1.2269161089615902e-05,
"loss": 0.3538,
"step": 108
},
{
"epoch": 1.4606365159128978,
"grad_norm": 1.617587924003601,
"learning_rate": 1.2115133858678192e-05,
"loss": 0.3535,
"step": 109
},
{
"epoch": 1.474036850921273,
"grad_norm": 1.5076704025268555,
"learning_rate": 1.1960579492039783e-05,
"loss": 0.3404,
"step": 110
},
{
"epoch": 1.4874371859296482,
"grad_norm": 1.6691168546676636,
"learning_rate": 1.1805536507889021e-05,
"loss": 0.3576,
"step": 111
},
{
"epoch": 1.5008375209380236,
"grad_norm": 1.7041524648666382,
"learning_rate": 1.1650043546187994e-05,
"loss": 0.3538,
"step": 112
},
{
"epoch": 1.5142378559463987,
"grad_norm": 1.6463046073913574,
"learning_rate": 1.1494139359042612e-05,
"loss": 0.3554,
"step": 113
},
{
"epoch": 1.5276381909547738,
"grad_norm": 1.5577502250671387,
"learning_rate": 1.1337862801044792e-05,
"loss": 0.3637,
"step": 114
},
{
"epoch": 1.541038525963149,
"grad_norm": 1.7103004455566406,
"learning_rate": 1.1181252819589081e-05,
"loss": 0.3448,
"step": 115
},
{
"epoch": 1.5544388609715243,
"grad_norm": 1.7436065673828125,
"learning_rate": 1.1024348445166133e-05,
"loss": 0.3407,
"step": 116
},
{
"epoch": 1.5678391959798996,
"grad_norm": 1.5942566394805908,
"learning_rate": 1.086718878163551e-05,
"loss": 0.3282,
"step": 117
},
{
"epoch": 1.5812395309882747,
"grad_norm": 1.629876732826233,
"learning_rate": 1.070981299648016e-05,
"loss": 0.3391,
"step": 118
},
{
"epoch": 1.5946398659966499,
"grad_norm": 1.6159772872924805,
"learning_rate": 1.0552260311045082e-05,
"loss": 0.3433,
"step": 119
},
{
"epoch": 1.608040201005025,
"grad_norm": 1.6075868606567383,
"learning_rate": 1.0394569990762528e-05,
"loss": 0.3495,
"step": 120
},
{
"epoch": 1.6214405360134003,
"grad_norm": 1.539378046989441,
"learning_rate": 1.0236781335366239e-05,
"loss": 0.353,
"step": 121
},
{
"epoch": 1.6348408710217757,
"grad_norm": 1.4946749210357666,
"learning_rate": 1.0078933669097135e-05,
"loss": 0.3482,
"step": 122
},
{
"epoch": 1.6482412060301508,
"grad_norm": 1.602623462677002,
"learning_rate": 9.92106633090287e-06,
"loss": 0.3455,
"step": 123
},
{
"epoch": 1.661641541038526,
"grad_norm": 1.5885684490203857,
"learning_rate": 9.763218664633763e-06,
"loss": 0.3401,
"step": 124
},
{
"epoch": 1.675041876046901,
"grad_norm": 1.535250186920166,
"learning_rate": 9.605430009237474e-06,
"loss": 0.3453,
"step": 125
},
{
"epoch": 1.6884422110552764,
"grad_norm": 1.5518174171447754,
"learning_rate": 9.44773968895492e-06,
"loss": 0.3464,
"step": 126
},
{
"epoch": 1.7018425460636517,
"grad_norm": 1.5169888734817505,
"learning_rate": 9.290187003519841e-06,
"loss": 0.3371,
"step": 127
},
{
"epoch": 1.7152428810720268,
"grad_norm": 1.8163830041885376,
"learning_rate": 9.132811218364494e-06,
"loss": 0.3377,
"step": 128
},
{
"epoch": 1.728643216080402,
"grad_norm": 1.699095606803894,
"learning_rate": 8.975651554833869e-06,
"loss": 0.3385,
"step": 129
},
{
"epoch": 1.742043551088777,
"grad_norm": 1.5736947059631348,
"learning_rate": 8.81874718041092e-06,
"loss": 0.3409,
"step": 130
},
{
"epoch": 1.7554438860971524,
"grad_norm": 1.5745042562484741,
"learning_rate": 8.662137198955211e-06,
"loss": 0.3468,
"step": 131
},
{
"epoch": 1.7688442211055277,
"grad_norm": 1.5846389532089233,
"learning_rate": 8.50586064095739e-06,
"loss": 0.353,
"step": 132
},
{
"epoch": 1.7822445561139029,
"grad_norm": 1.8534716367721558,
"learning_rate": 8.349956453812009e-06,
"loss": 0.3477,
"step": 133
},
{
"epoch": 1.795644891122278,
"grad_norm": 1.6759575605392456,
"learning_rate": 8.194463492110982e-06,
"loss": 0.3434,
"step": 134
},
{
"epoch": 1.809045226130653,
"grad_norm": 1.6878952980041504,
"learning_rate": 8.03942050796022e-06,
"loss": 0.3446,
"step": 135
},
{
"epoch": 1.8224455611390284,
"grad_norm": 1.7165716886520386,
"learning_rate": 7.884866141321811e-06,
"loss": 0.334,
"step": 136
},
{
"epoch": 1.8358458961474038,
"grad_norm": 1.56163489818573,
"learning_rate": 7.730838910384098e-06,
"loss": 0.351,
"step": 137
},
{
"epoch": 1.849246231155779,
"grad_norm": 1.5513216257095337,
"learning_rate": 7.57737720196217e-06,
"loss": 0.3394,
"step": 138
},
{
"epoch": 1.862646566164154,
"grad_norm": 1.7233706712722778,
"learning_rate": 7.424519261931036e-06,
"loss": 0.3335,
"step": 139
},
{
"epoch": 1.8760469011725294,
"grad_norm": 1.634465217590332,
"learning_rate": 7.27230318569397e-06,
"loss": 0.347,
"step": 140
},
{
"epoch": 1.8894472361809045,
"grad_norm": 1.5293488502502441,
"learning_rate": 7.1207669086883366e-06,
"loss": 0.3422,
"step": 141
},
{
"epoch": 1.9028475711892798,
"grad_norm": 1.5170069932937622,
"learning_rate": 6.969948196931272e-06,
"loss": 0.3369,
"step": 142
},
{
"epoch": 1.916247906197655,
"grad_norm": 1.6170374155044556,
"learning_rate": 6.819884637607619e-06,
"loss": 0.3397,
"step": 143
},
{
"epoch": 1.92964824120603,
"grad_norm": 1.4933662414550781,
"learning_rate": 6.670613629702391e-06,
"loss": 0.3493,
"step": 144
},
{
"epoch": 1.9430485762144054,
"grad_norm": 1.5872507095336914,
"learning_rate": 6.522172374680177e-06,
"loss": 0.3498,
"step": 145
},
{
"epoch": 1.9564489112227805,
"grad_norm": 1.5772573947906494,
"learning_rate": 6.374597867213756e-06,
"loss": 0.3508,
"step": 146
},
{
"epoch": 1.9698492462311559,
"grad_norm": 1.5273511409759521,
"learning_rate": 6.2279268859642396e-06,
"loss": 0.3477,
"step": 147
},
{
"epoch": 1.983249581239531,
"grad_norm": 1.5936107635498047,
"learning_rate": 6.082195984415069e-06,
"loss": 0.3397,
"step": 148
},
{
"epoch": 1.996649916247906,
"grad_norm": 1.5770097970962524,
"learning_rate": 5.937441481762112e-06,
"loss": 0.3518,
"step": 149
},
{
"epoch": 2.0100502512562812,
"grad_norm": 1.4779218435287476,
"learning_rate": 5.793699453862161e-06,
"loss": 0.3364,
"step": 150
},
{
"epoch": 2.023450586264657,
"grad_norm": 1.4762550592422485,
"learning_rate": 5.651005724242072e-06,
"loss": 0.3265,
"step": 151
},
{
"epoch": 2.036850921273032,
"grad_norm": 1.5051695108413696,
"learning_rate": 5.509395855170798e-06,
"loss": 0.3246,
"step": 152
},
{
"epoch": 2.050251256281407,
"grad_norm": 1.727282166481018,
"learning_rate": 5.368905138796523e-06,
"loss": 0.3227,
"step": 153
},
{
"epoch": 2.063651591289782,
"grad_norm": 1.493577003479004,
"learning_rate": 5.2295685883511086e-06,
"loss": 0.3289,
"step": 154
},
{
"epoch": 2.0770519262981573,
"grad_norm": 1.5364409685134888,
"learning_rate": 5.091420929424065e-06,
"loss": 0.3233,
"step": 155
},
{
"epoch": 2.090452261306533,
"grad_norm": 1.592559814453125,
"learning_rate": 4.954496591308227e-06,
"loss": 0.3308,
"step": 156
},
{
"epoch": 2.103852596314908,
"grad_norm": 1.4885226488113403,
"learning_rate": 4.818829698419225e-06,
"loss": 0.3267,
"step": 157
},
{
"epoch": 2.117252931323283,
"grad_norm": 1.584492564201355,
"learning_rate": 4.684454061790987e-06,
"loss": 0.3344,
"step": 158
},
{
"epoch": 2.130653266331658,
"grad_norm": 1.5516785383224487,
"learning_rate": 4.551403170649299e-06,
"loss": 0.325,
"step": 159
},
{
"epoch": 2.1440536013400333,
"grad_norm": 1.5723347663879395,
"learning_rate": 4.4197101840656e-06,
"loss": 0.3343,
"step": 160
},
{
"epoch": 2.157453936348409,
"grad_norm": 1.56240713596344,
"learning_rate": 4.289407922693053e-06,
"loss": 0.319,
"step": 161
},
{
"epoch": 2.170854271356784,
"grad_norm": 1.4635605812072754,
"learning_rate": 4.1605288605869365e-06,
"loss": 0.3254,
"step": 162
},
{
"epoch": 2.184254606365159,
"grad_norm": 1.489740252494812,
"learning_rate": 4.033105117111441e-06,
"loss": 0.3291,
"step": 163
},
{
"epoch": 2.1976549413735342,
"grad_norm": 1.5082714557647705,
"learning_rate": 3.907168448934836e-06,
"loss": 0.3235,
"step": 164
},
{
"epoch": 2.2110552763819094,
"grad_norm": 1.571510672569275,
"learning_rate": 3.7827502421150497e-06,
"loss": 0.3308,
"step": 165
},
{
"epoch": 2.224455611390285,
"grad_norm": 1.526476263999939,
"learning_rate": 3.6598815042776135e-06,
"loss": 0.3263,
"step": 166
},
{
"epoch": 2.23785594639866,
"grad_norm": 1.5408774614334106,
"learning_rate": 3.5385928568879012e-06,
"loss": 0.3169,
"step": 167
},
{
"epoch": 2.251256281407035,
"grad_norm": 1.4838318824768066,
"learning_rate": 3.4189145276196244e-06,
"loss": 0.3287,
"step": 168
},
{
"epoch": 2.2646566164154103,
"grad_norm": 1.4531508684158325,
"learning_rate": 3.300876342821451e-06,
"loss": 0.3165,
"step": 169
},
{
"epoch": 2.2780569514237854,
"grad_norm": 1.4947620630264282,
"learning_rate": 3.1845077200836638e-06,
"loss": 0.3219,
"step": 170
},
{
"epoch": 2.291457286432161,
"grad_norm": 1.5694292783737183,
"learning_rate": 3.0698376609066828e-06,
"loss": 0.3183,
"step": 171
},
{
"epoch": 2.304857621440536,
"grad_norm": 1.5011779069900513,
"learning_rate": 2.9568947434732777e-06,
"loss": 0.3289,
"step": 172
},
{
"epoch": 2.318257956448911,
"grad_norm": 1.5003248453140259,
"learning_rate": 2.8457071155262885e-06,
"loss": 0.3275,
"step": 173
},
{
"epoch": 2.3316582914572863,
"grad_norm": 1.4831820726394653,
"learning_rate": 2.7363024873536093e-06,
"loss": 0.3302,
"step": 174
},
{
"epoch": 2.3450586264656614,
"grad_norm": 1.4793733358383179,
"learning_rate": 2.628708124882212e-06,
"loss": 0.318,
"step": 175
},
{
"epoch": 2.358458961474037,
"grad_norm": 1.4326947927474976,
"learning_rate": 2.52295084288291e-06,
"loss": 0.3256,
"step": 176
},
{
"epoch": 2.371859296482412,
"grad_norm": 1.52131986618042,
"learning_rate": 2.419056998287547e-06,
"loss": 0.3293,
"step": 177
},
{
"epoch": 2.3852596314907872,
"grad_norm": 1.5197522640228271,
"learning_rate": 2.3170524836202936e-06,
"loss": 0.3195,
"step": 178
},
{
"epoch": 2.3986599664991624,
"grad_norm": 1.454834222793579,
"learning_rate": 2.216962720544703e-06,
"loss": 0.3236,
"step": 179
},
{
"epoch": 2.4120603015075375,
"grad_norm": 1.5067466497421265,
"learning_rate": 2.118812653528077e-06,
"loss": 0.3225,
"step": 180
},
{
"epoch": 2.425460636515913,
"grad_norm": 1.4823540449142456,
"learning_rate": 2.022626743624807e-06,
"loss": 0.3316,
"step": 181
},
{
"epoch": 2.438860971524288,
"grad_norm": 1.4712008237838745,
"learning_rate": 1.928428962380148e-06,
"loss": 0.3193,
"step": 182
},
{
"epoch": 2.4522613065326633,
"grad_norm": 1.5106829404830933,
"learning_rate": 1.8362427858560094e-06,
"loss": 0.3202,
"step": 183
},
{
"epoch": 2.4656616415410384,
"grad_norm": 1.4007306098937988,
"learning_rate": 1.74609118878024e-06,
"loss": 0.3228,
"step": 184
},
{
"epoch": 2.4790619765494135,
"grad_norm": 1.4667021036148071,
"learning_rate": 1.6579966388208257e-06,
"loss": 0.3135,
"step": 185
},
{
"epoch": 2.492462311557789,
"grad_norm": 1.4928709268569946,
"learning_rate": 1.5719810909864941e-06,
"loss": 0.3283,
"step": 186
},
{
"epoch": 2.505862646566164,
"grad_norm": 1.5440924167633057,
"learning_rate": 1.4880659821550547e-06,
"loss": 0.3304,
"step": 187
},
{
"epoch": 2.5192629815745393,
"grad_norm": 1.4452801942825317,
"learning_rate": 1.4062722257308803e-06,
"loss": 0.3269,
"step": 188
},
{
"epoch": 2.5326633165829144,
"grad_norm": 1.492119312286377,
"learning_rate": 1.3266202064328548e-06,
"loss": 0.3236,
"step": 189
},
{
"epoch": 2.5460636515912896,
"grad_norm": 1.4364550113677979,
"learning_rate": 1.249129775214064e-06,
"loss": 0.328,
"step": 190
},
{
"epoch": 2.559463986599665,
"grad_norm": 1.45607328414917,
"learning_rate": 1.1738202443145307e-06,
"loss": 0.3267,
"step": 191
},
{
"epoch": 2.5728643216080402,
"grad_norm": 1.5155068635940552,
"learning_rate": 1.100710382448198e-06,
"loss": 0.3242,
"step": 192
},
{
"epoch": 2.5862646566164154,
"grad_norm": 1.454077124595642,
"learning_rate": 1.029818410125365e-06,
"loss": 0.3276,
"step": 193
},
{
"epoch": 2.5996649916247905,
"grad_norm": 1.468336582183838,
"learning_rate": 9.611619951117657e-07,
"loss": 0.3259,
"step": 194
},
{
"epoch": 2.6130653266331656,
"grad_norm": 1.480252981185913,
"learning_rate": 8.94758248025378e-07,
"loss": 0.3209,
"step": 195
},
{
"epoch": 2.626465661641541,
"grad_norm": 1.5078731775283813,
"learning_rate": 8.306237180721121e-07,
"loss": 0.3227,
"step": 196
},
{
"epoch": 2.6398659966499163,
"grad_norm": 1.4447978734970093,
"learning_rate": 7.687743889213939e-07,
"loss": 0.321,
"step": 197
},
{
"epoch": 2.6532663316582914,
"grad_norm": 1.4051856994628906,
"learning_rate": 7.092256747226944e-07,
"loss": 0.3241,
"step": 198
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.4788079261779785,
"learning_rate": 6.519924162640168e-07,
"loss": 0.3116,
"step": 199
},
{
"epoch": 2.6800670016750416,
"grad_norm": 1.4269615411758423,
"learning_rate": 5.970888772732453e-07,
"loss": 0.3269,
"step": 200
},
{
"epoch": 2.6800670016750416,
"eval_loss": 0.308817058801651,
"eval_runtime": 36.2259,
"eval_samples_per_second": 20.013,
"eval_steps_per_second": 5.024,
"step": 200
},
{
"epoch": 2.693467336683417,
"grad_norm": 1.4678142070770264,
"learning_rate": 5.445287408633304e-07,
"loss": 0.329,
"step": 201
},
{
"epoch": 2.7068676716917923,
"grad_norm": 1.43202805519104,
"learning_rate": 4.943251061221721e-07,
"loss": 0.3108,
"step": 202
},
{
"epoch": 2.7202680067001674,
"grad_norm": 1.4377905130386353,
"learning_rate": 4.464904848480522e-07,
"loss": 0.3222,
"step": 203
},
{
"epoch": 2.7336683417085426,
"grad_norm": 1.4890724420547485,
"learning_rate": 4.0103679843142895e-07,
"loss": 0.3232,
"step": 204
},
{
"epoch": 2.7470686767169177,
"grad_norm": 1.4820594787597656,
"learning_rate": 3.5797537488388326e-07,
"loss": 0.3206,
"step": 205
},
{
"epoch": 2.7604690117252932,
"grad_norm": 1.4427391290664673,
"learning_rate": 3.1731694601492834e-07,
"loss": 0.3237,
"step": 206
},
{
"epoch": 2.7738693467336684,
"grad_norm": 1.4541987180709839,
"learning_rate": 2.790716447574304e-07,
"loss": 0.3205,
"step": 207
},
{
"epoch": 2.7872696817420435,
"grad_norm": 1.379800796508789,
"learning_rate": 2.4324900264226405e-07,
"loss": 0.3193,
"step": 208
},
{
"epoch": 2.8006700167504186,
"grad_norm": 1.3961976766586304,
"learning_rate": 2.098579474228546e-07,
"loss": 0.3125,
"step": 209
},
{
"epoch": 2.8140703517587937,
"grad_norm": 1.44660484790802,
"learning_rate": 1.7890680085019597e-07,
"loss": 0.3225,
"step": 210
},
{
"epoch": 2.8274706867671693,
"grad_norm": 1.3615787029266357,
"learning_rate": 1.504032765988961e-07,
"loss": 0.317,
"step": 211
},
{
"epoch": 2.8408710217755444,
"grad_norm": 1.464227557182312,
"learning_rate": 1.2435447834476254e-07,
"loss": 0.3436,
"step": 212
},
{
"epoch": 2.8542713567839195,
"grad_norm": 1.4321690797805786,
"learning_rate": 1.0076689799442874e-07,
"loss": 0.3206,
"step": 213
},
{
"epoch": 2.8676716917922946,
"grad_norm": 1.3721367120742798,
"learning_rate": 7.964641406742135e-08,
"loss": 0.3242,
"step": 214
},
{
"epoch": 2.8810720268006698,
"grad_norm": 1.4638272523880005,
"learning_rate": 6.099829023112236e-08,
"loss": 0.316,
"step": 215
},
{
"epoch": 2.8944723618090453,
"grad_norm": 1.5022002458572388,
"learning_rate": 4.482717398894165e-08,
"loss": 0.3188,
"step": 216
},
{
"epoch": 2.9078726968174204,
"grad_norm": 1.4071238040924072,
"learning_rate": 3.1137095522068006e-08,
"loss": 0.3229,
"step": 217
},
{
"epoch": 2.9212730318257956,
"grad_norm": 1.4545257091522217,
"learning_rate": 1.993146668506585e-08,
"loss": 0.3103,
"step": 218
},
{
"epoch": 2.934673366834171,
"grad_norm": 1.4042024612426758,
"learning_rate": 1.1213080155564327e-08,
"loss": 0.3234,
"step": 219
},
{
"epoch": 2.948073701842546,
"grad_norm": 1.4457824230194092,
"learning_rate": 4.984108738261828e-09,
"loss": 0.3187,
"step": 220
},
{
"epoch": 2.9614740368509214,
"grad_norm": 1.4061236381530762,
"learning_rate": 1.246104823426908e-09,
"loss": 0.3134,
"step": 221
},
{
"epoch": 2.9748743718592965,
"grad_norm": 1.4718122482299805,
"learning_rate": 0.0,
"loss": 0.3219,
"step": 222
},
{
"epoch": 2.9748743718592965,
"step": 222,
"total_flos": 9.93148193773663e+18,
"train_loss": 0.36201348014779994,
"train_runtime": 28749.1045,
"train_samples_per_second": 3.987,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1.0,
"max_steps": 222,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.93148193773663e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
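
The JSON above is the TrainerState that Hugging Face Transformers saves as trainer_state.json: each entry in "log_history" records epoch, step, loss, grad_norm, and learning_rate; evaluation entries add eval_loss plus runtime/throughput fields; and the final entry summarizes the whole run (train_loss, train_runtime, total_flos). Below is a minimal sketch of how the log can be inspected, assuming the file is saved locally as trainer_state.json and that matplotlib is installed (both are assumptions about the reader's environment, not part of this repository):

import json
import matplotlib.pyplot as plt

# Load the trainer state written by transformers.Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

# Top panel: training loss with eval-loss points; bottom panel: LR schedule.
fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses, label="train loss")
ax_loss.scatter([e["step"] for e in eval_logs],
                [e["eval_loss"] for e in eval_logs],
                color="red", label="eval loss")
ax_loss.legend()
ax_lr.plot(steps, lrs, label="learning rate")
ax_lr.set_xlabel("global step")
ax_lr.legend()
plt.tight_layout()
plt.show()

The same "log_history" list of dicts can also be passed directly to pandas.DataFrame for tabular inspection, if pandas is available.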