{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49747474747474746,
"eval_steps": 98,
"global_step": 197,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025252525252525255,
"grad_norm": 17.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.9033,
"step": 1
},
{
"epoch": 0.005050505050505051,
"grad_norm": 19.5,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.1028,
"step": 2
},
{
"epoch": 0.007575757575757576,
"grad_norm": 17.125,
"learning_rate": 3e-06,
"loss": 2.1133,
"step": 3
},
{
"epoch": 0.010101010101010102,
"grad_norm": 15.8125,
"learning_rate": 4.000000000000001e-06,
"loss": 2.0188,
"step": 4
},
{
"epoch": 0.012626262626262626,
"grad_norm": 14.375,
"learning_rate": 5e-06,
"loss": 1.9426,
"step": 5
},
{
"epoch": 0.015151515151515152,
"grad_norm": 11.0,
"learning_rate": 6e-06,
"loss": 1.7974,
"step": 6
},
{
"epoch": 0.017676767676767676,
"grad_norm": 9.1875,
"learning_rate": 7e-06,
"loss": 1.7561,
"step": 7
},
{
"epoch": 0.020202020202020204,
"grad_norm": 8.4375,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7865,
"step": 8
},
{
"epoch": 0.022727272727272728,
"grad_norm": 9.5,
"learning_rate": 9e-06,
"loss": 1.7445,
"step": 9
},
{
"epoch": 0.025252525252525252,
"grad_norm": 8.6875,
"learning_rate": 1e-05,
"loss": 1.6558,
"step": 10
},
{
"epoch": 0.027777777777777776,
"grad_norm": 6.625,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.2742,
"step": 11
},
{
"epoch": 0.030303030303030304,
"grad_norm": 6.28125,
"learning_rate": 1.2e-05,
"loss": 1.2157,
"step": 12
},
{
"epoch": 0.03282828282828283,
"grad_norm": 6.0,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.24,
"step": 13
},
{
"epoch": 0.03535353535353535,
"grad_norm": 5.15625,
"learning_rate": 1.4e-05,
"loss": 1.1103,
"step": 14
},
{
"epoch": 0.03787878787878788,
"grad_norm": 4.0,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.9098,
"step": 15
},
{
"epoch": 0.04040404040404041,
"grad_norm": 3.90625,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.005,
"step": 16
},
{
"epoch": 0.04292929292929293,
"grad_norm": 3.203125,
"learning_rate": 1.7e-05,
"loss": 0.7924,
"step": 17
},
{
"epoch": 0.045454545454545456,
"grad_norm": 3.078125,
"learning_rate": 1.8e-05,
"loss": 0.8178,
"step": 18
},
{
"epoch": 0.047979797979797977,
"grad_norm": 2.46875,
"learning_rate": 1.9e-05,
"loss": 0.7241,
"step": 19
},
{
"epoch": 0.050505050505050504,
"grad_norm": 2.90625,
"learning_rate": 2e-05,
"loss": 0.9651,
"step": 20
},
{
"epoch": 0.05303030303030303,
"grad_norm": 2.53125,
"learning_rate": 1.99468085106383e-05,
"loss": 0.7474,
"step": 21
},
{
"epoch": 0.05555555555555555,
"grad_norm": 2.296875,
"learning_rate": 1.9893617021276595e-05,
"loss": 0.7503,
"step": 22
},
{
"epoch": 0.05808080808080808,
"grad_norm": 2.421875,
"learning_rate": 1.9840425531914894e-05,
"loss": 0.7411,
"step": 23
},
{
"epoch": 0.06060606060606061,
"grad_norm": 2.265625,
"learning_rate": 1.9787234042553193e-05,
"loss": 0.7489,
"step": 24
},
{
"epoch": 0.06313131313131314,
"grad_norm": 2.25,
"learning_rate": 1.973404255319149e-05,
"loss": 0.7545,
"step": 25
},
{
"epoch": 0.06565656565656566,
"grad_norm": 2.25,
"learning_rate": 1.968085106382979e-05,
"loss": 0.6775,
"step": 26
},
{
"epoch": 0.06818181818181818,
"grad_norm": 2.25,
"learning_rate": 1.962765957446809e-05,
"loss": 0.7888,
"step": 27
},
{
"epoch": 0.0707070707070707,
"grad_norm": 2.21875,
"learning_rate": 1.9574468085106384e-05,
"loss": 0.8352,
"step": 28
},
{
"epoch": 0.07323232323232323,
"grad_norm": 2.125,
"learning_rate": 1.9521276595744682e-05,
"loss": 0.6698,
"step": 29
},
{
"epoch": 0.07575757575757576,
"grad_norm": 2.5625,
"learning_rate": 1.946808510638298e-05,
"loss": 0.8309,
"step": 30
},
{
"epoch": 0.07828282828282829,
"grad_norm": 2.234375,
"learning_rate": 1.9414893617021276e-05,
"loss": 0.6307,
"step": 31
},
{
"epoch": 0.08080808080808081,
"grad_norm": 2.421875,
"learning_rate": 1.9361702127659575e-05,
"loss": 0.6557,
"step": 32
},
{
"epoch": 0.08333333333333333,
"grad_norm": 2.328125,
"learning_rate": 1.9308510638297873e-05,
"loss": 0.6325,
"step": 33
},
{
"epoch": 0.08585858585858586,
"grad_norm": 2.609375,
"learning_rate": 1.9255319148936172e-05,
"loss": 0.7254,
"step": 34
},
{
"epoch": 0.08838383838383838,
"grad_norm": 1.9453125,
"learning_rate": 1.920212765957447e-05,
"loss": 0.6736,
"step": 35
},
{
"epoch": 0.09090909090909091,
"grad_norm": 1.7265625,
"learning_rate": 1.914893617021277e-05,
"loss": 0.74,
"step": 36
},
{
"epoch": 0.09343434343434344,
"grad_norm": 1.6484375,
"learning_rate": 1.9095744680851064e-05,
"loss": 0.6725,
"step": 37
},
{
"epoch": 0.09595959595959595,
"grad_norm": 1.6640625,
"learning_rate": 1.9042553191489363e-05,
"loss": 0.631,
"step": 38
},
{
"epoch": 0.09848484848484848,
"grad_norm": 1.5546875,
"learning_rate": 1.898936170212766e-05,
"loss": 0.6208,
"step": 39
},
{
"epoch": 0.10101010101010101,
"grad_norm": 1.4765625,
"learning_rate": 1.893617021276596e-05,
"loss": 0.5535,
"step": 40
},
{
"epoch": 0.10353535353535354,
"grad_norm": 1.5859375,
"learning_rate": 1.888297872340426e-05,
"loss": 0.6425,
"step": 41
},
{
"epoch": 0.10606060606060606,
"grad_norm": 1.5,
"learning_rate": 1.8829787234042557e-05,
"loss": 0.5682,
"step": 42
},
{
"epoch": 0.10858585858585859,
"grad_norm": 1.5390625,
"learning_rate": 1.8776595744680852e-05,
"loss": 0.6053,
"step": 43
},
{
"epoch": 0.1111111111111111,
"grad_norm": 1.546875,
"learning_rate": 1.872340425531915e-05,
"loss": 0.5894,
"step": 44
},
{
"epoch": 0.11363636363636363,
"grad_norm": 1.6640625,
"learning_rate": 1.8670212765957446e-05,
"loss": 0.523,
"step": 45
},
{
"epoch": 0.11616161616161616,
"grad_norm": 1.5,
"learning_rate": 1.8617021276595745e-05,
"loss": 0.5638,
"step": 46
},
{
"epoch": 0.11868686868686869,
"grad_norm": 1.3515625,
"learning_rate": 1.8563829787234043e-05,
"loss": 0.533,
"step": 47
},
{
"epoch": 0.12121212121212122,
"grad_norm": 1.4453125,
"learning_rate": 1.8510638297872342e-05,
"loss": 0.5829,
"step": 48
},
{
"epoch": 0.12373737373737374,
"grad_norm": 1.484375,
"learning_rate": 1.845744680851064e-05,
"loss": 0.6252,
"step": 49
},
{
"epoch": 0.12626262626262627,
"grad_norm": 1.609375,
"learning_rate": 1.840425531914894e-05,
"loss": 0.641,
"step": 50
},
{
"epoch": 0.12878787878787878,
"grad_norm": 1.2734375,
"learning_rate": 1.8351063829787234e-05,
"loss": 0.4742,
"step": 51
},
{
"epoch": 0.13131313131313133,
"grad_norm": 1.4765625,
"learning_rate": 1.8297872340425533e-05,
"loss": 0.5409,
"step": 52
},
{
"epoch": 0.13383838383838384,
"grad_norm": 1.640625,
"learning_rate": 1.824468085106383e-05,
"loss": 0.6769,
"step": 53
},
{
"epoch": 0.13636363636363635,
"grad_norm": 1.4375,
"learning_rate": 1.8191489361702127e-05,
"loss": 0.5134,
"step": 54
},
{
"epoch": 0.1388888888888889,
"grad_norm": 1.5625,
"learning_rate": 1.8138297872340425e-05,
"loss": 0.6641,
"step": 55
},
{
"epoch": 0.1414141414141414,
"grad_norm": 1.546875,
"learning_rate": 1.8085106382978724e-05,
"loss": 0.6039,
"step": 56
},
{
"epoch": 0.14393939393939395,
"grad_norm": 1.40625,
"learning_rate": 1.8031914893617023e-05,
"loss": 0.5456,
"step": 57
},
{
"epoch": 0.14646464646464646,
"grad_norm": 1.515625,
"learning_rate": 1.797872340425532e-05,
"loss": 0.6185,
"step": 58
},
{
"epoch": 0.14898989898989898,
"grad_norm": 1.5703125,
"learning_rate": 1.792553191489362e-05,
"loss": 0.6234,
"step": 59
},
{
"epoch": 0.15151515151515152,
"grad_norm": 1.625,
"learning_rate": 1.7872340425531915e-05,
"loss": 0.6293,
"step": 60
},
{
"epoch": 0.15404040404040403,
"grad_norm": 1.4609375,
"learning_rate": 1.7819148936170214e-05,
"loss": 0.5785,
"step": 61
},
{
"epoch": 0.15656565656565657,
"grad_norm": 1.5,
"learning_rate": 1.7765957446808512e-05,
"loss": 0.5021,
"step": 62
},
{
"epoch": 0.1590909090909091,
"grad_norm": 1.5,
"learning_rate": 1.7712765957446807e-05,
"loss": 0.6317,
"step": 63
},
{
"epoch": 0.16161616161616163,
"grad_norm": 1.5234375,
"learning_rate": 1.765957446808511e-05,
"loss": 0.56,
"step": 64
},
{
"epoch": 0.16414141414141414,
"grad_norm": 1.4921875,
"learning_rate": 1.7606382978723408e-05,
"loss": 0.5755,
"step": 65
},
{
"epoch": 0.16666666666666666,
"grad_norm": 1.6171875,
"learning_rate": 1.7553191489361703e-05,
"loss": 0.6197,
"step": 66
},
{
"epoch": 0.1691919191919192,
"grad_norm": 1.453125,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.5798,
"step": 67
},
{
"epoch": 0.1717171717171717,
"grad_norm": 1.3984375,
"learning_rate": 1.74468085106383e-05,
"loss": 0.5906,
"step": 68
},
{
"epoch": 0.17424242424242425,
"grad_norm": 1.6796875,
"learning_rate": 1.7393617021276596e-05,
"loss": 0.7807,
"step": 69
},
{
"epoch": 0.17676767676767677,
"grad_norm": 1.609375,
"learning_rate": 1.7340425531914894e-05,
"loss": 0.626,
"step": 70
},
{
"epoch": 0.17929292929292928,
"grad_norm": 1.4453125,
"learning_rate": 1.7287234042553193e-05,
"loss": 0.6278,
"step": 71
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.5546875,
"learning_rate": 1.723404255319149e-05,
"loss": 0.5956,
"step": 72
},
{
"epoch": 0.18434343434343434,
"grad_norm": 1.40625,
"learning_rate": 1.718085106382979e-05,
"loss": 0.5488,
"step": 73
},
{
"epoch": 0.18686868686868688,
"grad_norm": 1.53125,
"learning_rate": 1.712765957446809e-05,
"loss": 0.5977,
"step": 74
},
{
"epoch": 0.1893939393939394,
"grad_norm": 1.40625,
"learning_rate": 1.7074468085106384e-05,
"loss": 0.5672,
"step": 75
},
{
"epoch": 0.1919191919191919,
"grad_norm": 1.6875,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.5855,
"step": 76
},
{
"epoch": 0.19444444444444445,
"grad_norm": 1.4453125,
"learning_rate": 1.696808510638298e-05,
"loss": 0.5899,
"step": 77
},
{
"epoch": 0.19696969696969696,
"grad_norm": 1.3125,
"learning_rate": 1.6914893617021276e-05,
"loss": 0.5164,
"step": 78
},
{
"epoch": 0.1994949494949495,
"grad_norm": 1.5078125,
"learning_rate": 1.6861702127659575e-05,
"loss": 0.7067,
"step": 79
},
{
"epoch": 0.20202020202020202,
"grad_norm": 1.828125,
"learning_rate": 1.6808510638297873e-05,
"loss": 0.7673,
"step": 80
},
{
"epoch": 0.20454545454545456,
"grad_norm": 1.5078125,
"learning_rate": 1.6755319148936172e-05,
"loss": 0.5957,
"step": 81
},
{
"epoch": 0.20707070707070707,
"grad_norm": 1.59375,
"learning_rate": 1.670212765957447e-05,
"loss": 0.593,
"step": 82
},
{
"epoch": 0.20959595959595959,
"grad_norm": 1.546875,
"learning_rate": 1.664893617021277e-05,
"loss": 0.6161,
"step": 83
},
{
"epoch": 0.21212121212121213,
"grad_norm": 1.4609375,
"learning_rate": 1.6595744680851064e-05,
"loss": 0.5788,
"step": 84
},
{
"epoch": 0.21464646464646464,
"grad_norm": 1.4453125,
"learning_rate": 1.6542553191489363e-05,
"loss": 0.5945,
"step": 85
},
{
"epoch": 0.21717171717171718,
"grad_norm": 1.4375,
"learning_rate": 1.648936170212766e-05,
"loss": 0.5936,
"step": 86
},
{
"epoch": 0.2196969696969697,
"grad_norm": 1.4609375,
"learning_rate": 1.6436170212765957e-05,
"loss": 0.5393,
"step": 87
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.453125,
"learning_rate": 1.6382978723404255e-05,
"loss": 0.5319,
"step": 88
},
{
"epoch": 0.22474747474747475,
"grad_norm": 1.734375,
"learning_rate": 1.6329787234042554e-05,
"loss": 0.7014,
"step": 89
},
{
"epoch": 0.22727272727272727,
"grad_norm": 1.546875,
"learning_rate": 1.6276595744680853e-05,
"loss": 0.6423,
"step": 90
},
{
"epoch": 0.2297979797979798,
"grad_norm": 1.546875,
"learning_rate": 1.622340425531915e-05,
"loss": 0.7358,
"step": 91
},
{
"epoch": 0.23232323232323232,
"grad_norm": 1.359375,
"learning_rate": 1.6170212765957446e-05,
"loss": 0.4773,
"step": 92
},
{
"epoch": 0.23484848484848486,
"grad_norm": 1.65625,
"learning_rate": 1.6117021276595745e-05,
"loss": 0.5955,
"step": 93
},
{
"epoch": 0.23737373737373738,
"grad_norm": 1.53125,
"learning_rate": 1.6063829787234044e-05,
"loss": 0.6716,
"step": 94
},
{
"epoch": 0.2398989898989899,
"grad_norm": 1.6328125,
"learning_rate": 1.6010638297872342e-05,
"loss": 0.6673,
"step": 95
},
{
"epoch": 0.24242424242424243,
"grad_norm": 1.90625,
"learning_rate": 1.595744680851064e-05,
"loss": 0.7431,
"step": 96
},
{
"epoch": 0.24494949494949494,
"grad_norm": 1.4140625,
"learning_rate": 1.590425531914894e-05,
"loss": 0.5161,
"step": 97
},
{
"epoch": 0.2474747474747475,
"grad_norm": 1.3984375,
"learning_rate": 1.5851063829787235e-05,
"loss": 0.5427,
"step": 98
},
{
"epoch": 0.2474747474747475,
"eval_loss": 0.4802989363670349,
"eval_model_preparation_time": 0.0001,
"eval_runtime": 4.4682,
"eval_samples_per_second": 23.723,
"eval_steps_per_second": 3.133,
"step": 98
},
{
"epoch": 0.25,
"grad_norm": 1.515625,
"learning_rate": 1.5797872340425533e-05,
"loss": 0.5947,
"step": 99
},
{
"epoch": 0.25252525252525254,
"grad_norm": 1.4140625,
"learning_rate": 1.5744680851063832e-05,
"loss": 0.6192,
"step": 100
},
{
"epoch": 0.255050505050505,
"grad_norm": 1.46875,
"learning_rate": 1.5691489361702127e-05,
"loss": 0.568,
"step": 101
},
{
"epoch": 0.25757575757575757,
"grad_norm": 1.65625,
"learning_rate": 1.5638297872340426e-05,
"loss": 0.6521,
"step": 102
},
{
"epoch": 0.2601010101010101,
"grad_norm": 1.4453125,
"learning_rate": 1.5585106382978724e-05,
"loss": 0.65,
"step": 103
},
{
"epoch": 0.26262626262626265,
"grad_norm": 1.390625,
"learning_rate": 1.5531914893617023e-05,
"loss": 0.4976,
"step": 104
},
{
"epoch": 0.26515151515151514,
"grad_norm": 1.40625,
"learning_rate": 1.547872340425532e-05,
"loss": 0.5354,
"step": 105
},
{
"epoch": 0.2676767676767677,
"grad_norm": 1.3203125,
"learning_rate": 1.542553191489362e-05,
"loss": 0.5232,
"step": 106
},
{
"epoch": 0.2702020202020202,
"grad_norm": 1.2734375,
"learning_rate": 1.5372340425531915e-05,
"loss": 0.4453,
"step": 107
},
{
"epoch": 0.2727272727272727,
"grad_norm": 1.5078125,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.6146,
"step": 108
},
{
"epoch": 0.27525252525252525,
"grad_norm": 1.2578125,
"learning_rate": 1.5265957446808512e-05,
"loss": 0.4385,
"step": 109
},
{
"epoch": 0.2777777777777778,
"grad_norm": 1.5078125,
"learning_rate": 1.521276595744681e-05,
"loss": 0.5709,
"step": 110
},
{
"epoch": 0.2803030303030303,
"grad_norm": 1.3984375,
"learning_rate": 1.5159574468085108e-05,
"loss": 0.5547,
"step": 111
},
{
"epoch": 0.2828282828282828,
"grad_norm": 1.5,
"learning_rate": 1.5106382978723407e-05,
"loss": 0.5413,
"step": 112
},
{
"epoch": 0.28535353535353536,
"grad_norm": 1.3984375,
"learning_rate": 1.5053191489361702e-05,
"loss": 0.5048,
"step": 113
},
{
"epoch": 0.2878787878787879,
"grad_norm": 1.5390625,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.6009,
"step": 114
},
{
"epoch": 0.2904040404040404,
"grad_norm": 1.28125,
"learning_rate": 1.49468085106383e-05,
"loss": 0.472,
"step": 115
},
{
"epoch": 0.29292929292929293,
"grad_norm": 1.4921875,
"learning_rate": 1.4893617021276596e-05,
"loss": 0.6086,
"step": 116
},
{
"epoch": 0.29545454545454547,
"grad_norm": 1.5859375,
"learning_rate": 1.4840425531914894e-05,
"loss": 0.5837,
"step": 117
},
{
"epoch": 0.29797979797979796,
"grad_norm": 1.4921875,
"learning_rate": 1.4787234042553193e-05,
"loss": 0.5546,
"step": 118
},
{
"epoch": 0.3005050505050505,
"grad_norm": 1.4453125,
"learning_rate": 1.473404255319149e-05,
"loss": 0.4623,
"step": 119
},
{
"epoch": 0.30303030303030304,
"grad_norm": 1.40625,
"learning_rate": 1.4680851063829789e-05,
"loss": 0.492,
"step": 120
},
{
"epoch": 0.3055555555555556,
"grad_norm": 1.5546875,
"learning_rate": 1.4627659574468087e-05,
"loss": 0.5731,
"step": 121
},
{
"epoch": 0.30808080808080807,
"grad_norm": 1.5390625,
"learning_rate": 1.4574468085106384e-05,
"loss": 0.5626,
"step": 122
},
{
"epoch": 0.3106060606060606,
"grad_norm": 1.3046875,
"learning_rate": 1.4521276595744683e-05,
"loss": 0.475,
"step": 123
},
{
"epoch": 0.31313131313131315,
"grad_norm": 1.3828125,
"learning_rate": 1.4468085106382981e-05,
"loss": 0.552,
"step": 124
},
{
"epoch": 0.31565656565656564,
"grad_norm": 1.5234375,
"learning_rate": 1.4414893617021276e-05,
"loss": 0.525,
"step": 125
},
{
"epoch": 0.3181818181818182,
"grad_norm": 1.5390625,
"learning_rate": 1.4361702127659575e-05,
"loss": 0.5764,
"step": 126
},
{
"epoch": 0.3207070707070707,
"grad_norm": 1.5703125,
"learning_rate": 1.4308510638297874e-05,
"loss": 0.5816,
"step": 127
},
{
"epoch": 0.32323232323232326,
"grad_norm": 1.453125,
"learning_rate": 1.425531914893617e-05,
"loss": 0.5908,
"step": 128
},
{
"epoch": 0.32575757575757575,
"grad_norm": 1.4140625,
"learning_rate": 1.420212765957447e-05,
"loss": 0.5943,
"step": 129
},
{
"epoch": 0.3282828282828283,
"grad_norm": 1.2890625,
"learning_rate": 1.4148936170212768e-05,
"loss": 0.4884,
"step": 130
},
{
"epoch": 0.33080808080808083,
"grad_norm": 1.6875,
"learning_rate": 1.4095744680851065e-05,
"loss": 0.5799,
"step": 131
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.4140625,
"learning_rate": 1.4042553191489363e-05,
"loss": 0.5285,
"step": 132
},
{
"epoch": 0.33585858585858586,
"grad_norm": 1.4765625,
"learning_rate": 1.3989361702127662e-05,
"loss": 0.4816,
"step": 133
},
{
"epoch": 0.3383838383838384,
"grad_norm": 1.5,
"learning_rate": 1.3936170212765959e-05,
"loss": 0.6294,
"step": 134
},
{
"epoch": 0.3409090909090909,
"grad_norm": 1.3046875,
"learning_rate": 1.3882978723404257e-05,
"loss": 0.4614,
"step": 135
},
{
"epoch": 0.3434343434343434,
"grad_norm": 1.4765625,
"learning_rate": 1.3829787234042556e-05,
"loss": 0.5877,
"step": 136
},
{
"epoch": 0.34595959595959597,
"grad_norm": 1.2265625,
"learning_rate": 1.3776595744680851e-05,
"loss": 0.4327,
"step": 137
},
{
"epoch": 0.3484848484848485,
"grad_norm": 1.4765625,
"learning_rate": 1.372340425531915e-05,
"loss": 0.5889,
"step": 138
},
{
"epoch": 0.351010101010101,
"grad_norm": 1.5,
"learning_rate": 1.3670212765957447e-05,
"loss": 0.5389,
"step": 139
},
{
"epoch": 0.35353535353535354,
"grad_norm": 1.21875,
"learning_rate": 1.3617021276595745e-05,
"loss": 0.4373,
"step": 140
},
{
"epoch": 0.3560606060606061,
"grad_norm": 1.5703125,
"learning_rate": 1.3563829787234044e-05,
"loss": 0.5761,
"step": 141
},
{
"epoch": 0.35858585858585856,
"grad_norm": 1.625,
"learning_rate": 1.351063829787234e-05,
"loss": 0.7717,
"step": 142
},
{
"epoch": 0.3611111111111111,
"grad_norm": 1.2578125,
"learning_rate": 1.345744680851064e-05,
"loss": 0.4974,
"step": 143
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.5078125,
"learning_rate": 1.3404255319148938e-05,
"loss": 0.5615,
"step": 144
},
{
"epoch": 0.3661616161616162,
"grad_norm": 1.34375,
"learning_rate": 1.3351063829787235e-05,
"loss": 0.5081,
"step": 145
},
{
"epoch": 0.3686868686868687,
"grad_norm": 1.421875,
"learning_rate": 1.3297872340425533e-05,
"loss": 0.486,
"step": 146
},
{
"epoch": 0.3712121212121212,
"grad_norm": 1.28125,
"learning_rate": 1.3244680851063832e-05,
"loss": 0.4847,
"step": 147
},
{
"epoch": 0.37373737373737376,
"grad_norm": 1.53125,
"learning_rate": 1.3191489361702127e-05,
"loss": 0.5955,
"step": 148
},
{
"epoch": 0.37626262626262624,
"grad_norm": 1.3515625,
"learning_rate": 1.3138297872340426e-05,
"loss": 0.46,
"step": 149
},
{
"epoch": 0.3787878787878788,
"grad_norm": 1.4375,
"learning_rate": 1.3085106382978724e-05,
"loss": 0.5995,
"step": 150
},
{
"epoch": 0.3813131313131313,
"grad_norm": 1.3515625,
"learning_rate": 1.3031914893617021e-05,
"loss": 0.468,
"step": 151
},
{
"epoch": 0.3838383838383838,
"grad_norm": 1.984375,
"learning_rate": 1.297872340425532e-05,
"loss": 0.5832,
"step": 152
},
{
"epoch": 0.38636363636363635,
"grad_norm": 1.40625,
"learning_rate": 1.2925531914893619e-05,
"loss": 0.6264,
"step": 153
},
{
"epoch": 0.3888888888888889,
"grad_norm": 1.515625,
"learning_rate": 1.2872340425531915e-05,
"loss": 0.5987,
"step": 154
},
{
"epoch": 0.39141414141414144,
"grad_norm": 1.390625,
"learning_rate": 1.2819148936170214e-05,
"loss": 0.5271,
"step": 155
},
{
"epoch": 0.3939393939393939,
"grad_norm": 1.28125,
"learning_rate": 1.2765957446808513e-05,
"loss": 0.4722,
"step": 156
},
{
"epoch": 0.39646464646464646,
"grad_norm": 1.484375,
"learning_rate": 1.2712765957446808e-05,
"loss": 0.6128,
"step": 157
},
{
"epoch": 0.398989898989899,
"grad_norm": 1.515625,
"learning_rate": 1.2659574468085108e-05,
"loss": 0.7296,
"step": 158
},
{
"epoch": 0.4015151515151515,
"grad_norm": 1.5703125,
"learning_rate": 1.2606382978723407e-05,
"loss": 0.658,
"step": 159
},
{
"epoch": 0.40404040404040403,
"grad_norm": 1.3828125,
"learning_rate": 1.2553191489361702e-05,
"loss": 0.5477,
"step": 160
},
{
"epoch": 0.4065656565656566,
"grad_norm": 1.453125,
"learning_rate": 1.25e-05,
"loss": 0.5191,
"step": 161
},
{
"epoch": 0.4090909090909091,
"grad_norm": 1.5859375,
"learning_rate": 1.24468085106383e-05,
"loss": 0.6534,
"step": 162
},
{
"epoch": 0.4116161616161616,
"grad_norm": 1.4140625,
"learning_rate": 1.2393617021276596e-05,
"loss": 0.5652,
"step": 163
},
{
"epoch": 0.41414141414141414,
"grad_norm": 1.4609375,
"learning_rate": 1.2340425531914895e-05,
"loss": 0.7201,
"step": 164
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.484375,
"learning_rate": 1.2287234042553193e-05,
"loss": 0.6026,
"step": 165
},
{
"epoch": 0.41919191919191917,
"grad_norm": 1.375,
"learning_rate": 1.223404255319149e-05,
"loss": 0.5286,
"step": 166
},
{
"epoch": 0.4217171717171717,
"grad_norm": 1.3359375,
"learning_rate": 1.2180851063829789e-05,
"loss": 0.5045,
"step": 167
},
{
"epoch": 0.42424242424242425,
"grad_norm": 1.3671875,
"learning_rate": 1.2127659574468087e-05,
"loss": 0.5062,
"step": 168
},
{
"epoch": 0.42676767676767674,
"grad_norm": 1.5546875,
"learning_rate": 1.2074468085106383e-05,
"loss": 0.5936,
"step": 169
},
{
"epoch": 0.4292929292929293,
"grad_norm": 1.3828125,
"learning_rate": 1.2021276595744681e-05,
"loss": 0.4753,
"step": 170
},
{
"epoch": 0.4318181818181818,
"grad_norm": 1.5859375,
"learning_rate": 1.196808510638298e-05,
"loss": 0.5516,
"step": 171
},
{
"epoch": 0.43434343434343436,
"grad_norm": 1.4375,
"learning_rate": 1.1914893617021277e-05,
"loss": 0.571,
"step": 172
},
{
"epoch": 0.43686868686868685,
"grad_norm": 1.546875,
"learning_rate": 1.1861702127659575e-05,
"loss": 0.5234,
"step": 173
},
{
"epoch": 0.4393939393939394,
"grad_norm": 1.46875,
"learning_rate": 1.1808510638297874e-05,
"loss": 0.5271,
"step": 174
},
{
"epoch": 0.44191919191919193,
"grad_norm": 1.890625,
"learning_rate": 1.175531914893617e-05,
"loss": 0.6933,
"step": 175
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.5,
"learning_rate": 1.170212765957447e-05,
"loss": 0.5311,
"step": 176
},
{
"epoch": 0.44696969696969696,
"grad_norm": 1.4765625,
"learning_rate": 1.1648936170212768e-05,
"loss": 0.4916,
"step": 177
},
{
"epoch": 0.4494949494949495,
"grad_norm": 1.3671875,
"learning_rate": 1.1595744680851065e-05,
"loss": 0.6005,
"step": 178
},
{
"epoch": 0.45202020202020204,
"grad_norm": 1.34375,
"learning_rate": 1.1542553191489364e-05,
"loss": 0.4982,
"step": 179
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.359375,
"learning_rate": 1.1489361702127662e-05,
"loss": 0.5284,
"step": 180
},
{
"epoch": 0.45707070707070707,
"grad_norm": 1.296875,
"learning_rate": 1.1436170212765957e-05,
"loss": 0.4811,
"step": 181
},
{
"epoch": 0.4595959595959596,
"grad_norm": 1.3671875,
"learning_rate": 1.1382978723404256e-05,
"loss": 0.4331,
"step": 182
},
{
"epoch": 0.4621212121212121,
"grad_norm": 1.609375,
"learning_rate": 1.1329787234042555e-05,
"loss": 0.5808,
"step": 183
},
{
"epoch": 0.46464646464646464,
"grad_norm": 1.71875,
"learning_rate": 1.1276595744680851e-05,
"loss": 0.7746,
"step": 184
},
{
"epoch": 0.4671717171717172,
"grad_norm": 1.4609375,
"learning_rate": 1.122340425531915e-05,
"loss": 0.5737,
"step": 185
},
{
"epoch": 0.4696969696969697,
"grad_norm": 1.5078125,
"learning_rate": 1.1170212765957447e-05,
"loss": 0.5221,
"step": 186
},
{
"epoch": 0.4722222222222222,
"grad_norm": 1.5703125,
"learning_rate": 1.1117021276595746e-05,
"loss": 0.6547,
"step": 187
},
{
"epoch": 0.47474747474747475,
"grad_norm": 1.375,
"learning_rate": 1.1063829787234044e-05,
"loss": 0.5264,
"step": 188
},
{
"epoch": 0.4772727272727273,
"grad_norm": 1.5625,
"learning_rate": 1.1010638297872341e-05,
"loss": 0.5393,
"step": 189
},
{
"epoch": 0.4797979797979798,
"grad_norm": 1.4609375,
"learning_rate": 1.095744680851064e-05,
"loss": 0.5669,
"step": 190
},
{
"epoch": 0.4823232323232323,
"grad_norm": 1.4453125,
"learning_rate": 1.0904255319148938e-05,
"loss": 0.5213,
"step": 191
},
{
"epoch": 0.48484848484848486,
"grad_norm": 1.5703125,
"learning_rate": 1.0851063829787233e-05,
"loss": 0.5955,
"step": 192
},
{
"epoch": 0.48737373737373735,
"grad_norm": 1.4765625,
"learning_rate": 1.0797872340425532e-05,
"loss": 0.5273,
"step": 193
},
{
"epoch": 0.4898989898989899,
"grad_norm": 1.3046875,
"learning_rate": 1.074468085106383e-05,
"loss": 0.4816,
"step": 194
},
{
"epoch": 0.49242424242424243,
"grad_norm": 2.0,
"learning_rate": 1.0691489361702128e-05,
"loss": 0.6121,
"step": 195
},
{
"epoch": 0.494949494949495,
"grad_norm": 1.4609375,
"learning_rate": 1.0638297872340426e-05,
"loss": 0.6325,
"step": 196
},
{
"epoch": 0.494949494949495,
"eval_loss": 0.45528531074523926,
"eval_model_preparation_time": 0.0001,
"eval_runtime": 4.4693,
"eval_samples_per_second": 23.717,
"eval_steps_per_second": 3.132,
"step": 196
},
{
"epoch": 0.49747474747474746,
"grad_norm": 1.296875,
"learning_rate": 1.0585106382978725e-05,
"loss": 0.4857,
"step": 197
}
],
"logging_steps": 1,
"max_steps": 396,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 197,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.091750925875282e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}