{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 266,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007518796992481203,
"grad_norm": 0.08289683091006875,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.2087,
"step": 1
},
{
"epoch": 0.015037593984962405,
"grad_norm": 0.03030546873337256,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.1045,
"step": 2
},
{
"epoch": 0.022556390977443608,
"grad_norm": 0.04340875250649354,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.1291,
"step": 3
},
{
"epoch": 0.03007518796992481,
"grad_norm": 0.04223285184390201,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.1263,
"step": 4
},
{
"epoch": 0.03759398496240601,
"grad_norm": 0.04894801143939966,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.1409,
"step": 5
},
{
"epoch": 0.045112781954887216,
"grad_norm": 0.22261274174154347,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.2362,
"step": 6
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.043978295203653116,
"learning_rate": 2.5925925925925925e-05,
"loss": 0.1195,
"step": 7
},
{
"epoch": 0.06015037593984962,
"grad_norm": 0.04381964595602848,
"learning_rate": 2.962962962962963e-05,
"loss": 0.1215,
"step": 8
},
{
"epoch": 0.06766917293233082,
"grad_norm": 0.03290036083527209,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0881,
"step": 9
},
{
"epoch": 0.07518796992481203,
"grad_norm": 0.03303553719000837,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.0923,
"step": 10
},
{
"epoch": 0.08270676691729323,
"grad_norm": 0.12832751130733108,
"learning_rate": 4.074074074074074e-05,
"loss": 0.218,
"step": 11
},
{
"epoch": 0.09022556390977443,
"grad_norm": 0.08479076437214379,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.149,
"step": 12
},
{
"epoch": 0.09774436090225563,
"grad_norm": 0.13534422076541278,
"learning_rate": 4.814814814814815e-05,
"loss": 0.1835,
"step": 13
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.10341781138952844,
"learning_rate": 5.185185185185185e-05,
"loss": 0.1573,
"step": 14
},
{
"epoch": 0.11278195488721804,
"grad_norm": 0.12256701286625035,
"learning_rate": 5.555555555555556e-05,
"loss": 0.1701,
"step": 15
},
{
"epoch": 0.12030075187969924,
"grad_norm": 0.05708144315846648,
"learning_rate": 5.925925925925926e-05,
"loss": 0.0938,
"step": 16
},
{
"epoch": 0.12781954887218044,
"grad_norm": 0.0813311914428683,
"learning_rate": 6.296296296296296e-05,
"loss": 0.1221,
"step": 17
},
{
"epoch": 0.13533834586466165,
"grad_norm": 0.07670248585638807,
"learning_rate": 6.666666666666667e-05,
"loss": 0.1068,
"step": 18
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.12777013083805186,
"learning_rate": 7.037037037037038e-05,
"loss": 0.1183,
"step": 19
},
{
"epoch": 0.15037593984962405,
"grad_norm": 0.07203864112993859,
"learning_rate": 7.407407407407407e-05,
"loss": 0.0941,
"step": 20
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.06622495246697525,
"learning_rate": 7.777777777777778e-05,
"loss": 0.0851,
"step": 21
},
{
"epoch": 0.16541353383458646,
"grad_norm": 0.05064660711733651,
"learning_rate": 8.148148148148148e-05,
"loss": 0.0672,
"step": 22
},
{
"epoch": 0.17293233082706766,
"grad_norm": 0.05569880144395339,
"learning_rate": 8.518518518518518e-05,
"loss": 0.0692,
"step": 23
},
{
"epoch": 0.18045112781954886,
"grad_norm": 0.06341922542018791,
"learning_rate": 8.888888888888889e-05,
"loss": 0.0719,
"step": 24
},
{
"epoch": 0.18796992481203006,
"grad_norm": 0.09483517480751269,
"learning_rate": 9.25925925925926e-05,
"loss": 0.1039,
"step": 25
},
{
"epoch": 0.19548872180451127,
"grad_norm": 0.06345422292566975,
"learning_rate": 9.62962962962963e-05,
"loss": 0.0642,
"step": 26
},
{
"epoch": 0.20300751879699247,
"grad_norm": 0.06565559978972503,
"learning_rate": 0.0001,
"loss": 0.0806,
"step": 27
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.07234940226716612,
"learning_rate": 9.999568045802217e-05,
"loss": 0.0699,
"step": 28
},
{
"epoch": 0.21804511278195488,
"grad_norm": 0.09174614011055109,
"learning_rate": 9.998272257842641e-05,
"loss": 0.0797,
"step": 29
},
{
"epoch": 0.22556390977443608,
"grad_norm": 0.0799372037045221,
"learning_rate": 9.996112860009688e-05,
"loss": 0.0599,
"step": 30
},
{
"epoch": 0.23308270676691728,
"grad_norm": 0.07650243821697233,
"learning_rate": 9.993090225407743e-05,
"loss": 0.0673,
"step": 31
},
{
"epoch": 0.24060150375939848,
"grad_norm": 0.07437978624039222,
"learning_rate": 9.989204876292688e-05,
"loss": 0.063,
"step": 32
},
{
"epoch": 0.24812030075187969,
"grad_norm": 0.05826090837310029,
"learning_rate": 9.984457483981669e-05,
"loss": 0.0563,
"step": 33
},
{
"epoch": 0.2556390977443609,
"grad_norm": 0.046830358894256296,
"learning_rate": 9.978848868737098e-05,
"loss": 0.0449,
"step": 34
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.059942032653184,
"learning_rate": 9.972379999624936e-05,
"loss": 0.0492,
"step": 35
},
{
"epoch": 0.2706766917293233,
"grad_norm": 0.04559622889503948,
"learning_rate": 9.96505199434725e-05,
"loss": 0.0384,
"step": 36
},
{
"epoch": 0.2781954887218045,
"grad_norm": 0.08582556953299057,
"learning_rate": 9.956866119049095e-05,
"loss": 0.052,
"step": 37
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.05879365562753825,
"learning_rate": 9.947823788099753e-05,
"loss": 0.0499,
"step": 38
},
{
"epoch": 0.2932330827067669,
"grad_norm": 0.07725729979493687,
"learning_rate": 9.937926563848346e-05,
"loss": 0.0382,
"step": 39
},
{
"epoch": 0.3007518796992481,
"grad_norm": 0.06791365316815774,
"learning_rate": 9.927176156353899e-05,
"loss": 0.0424,
"step": 40
},
{
"epoch": 0.3082706766917293,
"grad_norm": 0.06835456363607172,
"learning_rate": 9.91557442308987e-05,
"loss": 0.0477,
"step": 41
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.06785706541381617,
"learning_rate": 9.903123368623216e-05,
"loss": 0.0423,
"step": 42
},
{
"epoch": 0.3233082706766917,
"grad_norm": 0.037822284484082716,
"learning_rate": 9.889825144268029e-05,
"loss": 0.0373,
"step": 43
},
{
"epoch": 0.3308270676691729,
"grad_norm": 0.09335172889811039,
"learning_rate": 9.875682047713846e-05,
"loss": 0.0532,
"step": 44
},
{
"epoch": 0.3383458646616541,
"grad_norm": 0.03552601591664148,
"learning_rate": 9.860696522628639e-05,
"loss": 0.0302,
"step": 45
},
{
"epoch": 0.3458646616541353,
"grad_norm": 0.06792399841238587,
"learning_rate": 9.844871158236591e-05,
"loss": 0.043,
"step": 46
},
{
"epoch": 0.3533834586466165,
"grad_norm": 0.07394708716985816,
"learning_rate": 9.828208688870735e-05,
"loss": 0.0414,
"step": 47
},
{
"epoch": 0.3609022556390977,
"grad_norm": 0.07644206071621325,
"learning_rate": 9.810711993500507e-05,
"loss": 0.0442,
"step": 48
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.04448780324279346,
"learning_rate": 9.792384095234313e-05,
"loss": 0.0397,
"step": 49
},
{
"epoch": 0.37593984962406013,
"grad_norm": 0.040299146373067786,
"learning_rate": 9.773228160797188e-05,
"loss": 0.0294,
"step": 50
},
{
"epoch": 0.38345864661654133,
"grad_norm": 0.04600091352431098,
"learning_rate": 9.753247499983649e-05,
"loss": 0.0388,
"step": 51
},
{
"epoch": 0.39097744360902253,
"grad_norm": 0.05174024689025062,
"learning_rate": 9.732445565085824e-05,
"loss": 0.0464,
"step": 52
},
{
"epoch": 0.39849624060150374,
"grad_norm": 0.06048290755695799,
"learning_rate": 9.71082595029695e-05,
"loss": 0.0441,
"step": 53
},
{
"epoch": 0.40601503759398494,
"grad_norm": 0.06909111905381797,
"learning_rate": 9.688392391090373e-05,
"loss": 0.0403,
"step": 54
},
{
"epoch": 0.41353383458646614,
"grad_norm": 0.10580098842980783,
"learning_rate": 9.665148763574123e-05,
"loss": 0.0414,
"step": 55
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.06004492721880413,
"learning_rate": 9.64109908382119e-05,
"loss": 0.0348,
"step": 56
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.05616302785838828,
"learning_rate": 9.616247507175623e-05,
"loss": 0.0353,
"step": 57
},
{
"epoch": 0.43609022556390975,
"grad_norm": 0.04963402332052172,
"learning_rate": 9.590598327534564e-05,
"loss": 0.0354,
"step": 58
},
{
"epoch": 0.44360902255639095,
"grad_norm": 0.09520890937208057,
"learning_rate": 9.564155976606339e-05,
"loss": 0.0436,
"step": 59
},
{
"epoch": 0.45112781954887216,
"grad_norm": 0.07317691578763187,
"learning_rate": 9.536925023144742e-05,
"loss": 0.0448,
"step": 60
},
{
"epoch": 0.45864661654135336,
"grad_norm": 0.0653903652099525,
"learning_rate": 9.508910172159635e-05,
"loss": 0.0456,
"step": 61
},
{
"epoch": 0.46616541353383456,
"grad_norm": 0.08533000644485912,
"learning_rate": 9.480116264104011e-05,
"loss": 0.0417,
"step": 62
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.07477194348090598,
"learning_rate": 9.450548274037653e-05,
"loss": 0.0427,
"step": 63
},
{
"epoch": 0.48120300751879697,
"grad_norm": 0.040320894825821886,
"learning_rate": 9.420211310767533e-05,
"loss": 0.0317,
"step": 64
},
{
"epoch": 0.48872180451127817,
"grad_norm": 0.04204333897095501,
"learning_rate": 9.389110615965102e-05,
"loss": 0.0308,
"step": 65
},
{
"epoch": 0.49624060150375937,
"grad_norm": 0.06435209558835227,
"learning_rate": 9.35725156326063e-05,
"loss": 0.0404,
"step": 66
},
{
"epoch": 0.5037593984962406,
"grad_norm": 0.05292300086818655,
"learning_rate": 9.324639657314742e-05,
"loss": 0.0383,
"step": 67
},
{
"epoch": 0.5112781954887218,
"grad_norm": 0.0533359959006372,
"learning_rate": 9.291280532867302e-05,
"loss": 0.0419,
"step": 68
},
{
"epoch": 0.518796992481203,
"grad_norm": 0.0421677134855151,
"learning_rate": 9.257179953763845e-05,
"loss": 0.0301,
"step": 69
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.047396091527240565,
"learning_rate": 9.222343811959693e-05,
"loss": 0.0355,
"step": 70
},
{
"epoch": 0.5338345864661654,
"grad_norm": 0.05055865206409256,
"learning_rate": 9.186778126501916e-05,
"loss": 0.0379,
"step": 71
},
{
"epoch": 0.5413533834586466,
"grad_norm": 0.03922328494549794,
"learning_rate": 9.150489042489367e-05,
"loss": 0.03,
"step": 72
},
{
"epoch": 0.5488721804511278,
"grad_norm": 0.08580904921861318,
"learning_rate": 9.113482830010918e-05,
"loss": 0.038,
"step": 73
},
{
"epoch": 0.556390977443609,
"grad_norm": 0.04615991149700515,
"learning_rate": 9.075765883062093e-05,
"loss": 0.0321,
"step": 74
},
{
"epoch": 0.5639097744360902,
"grad_norm": 0.21688152384611062,
"learning_rate": 9.037344718440322e-05,
"loss": 0.0369,
"step": 75
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.06709856743156827,
"learning_rate": 8.99822597461894e-05,
"loss": 0.0429,
"step": 76
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.07300506123989278,
"learning_rate": 8.958416410600187e-05,
"loss": 0.0351,
"step": 77
},
{
"epoch": 0.5864661654135338,
"grad_norm": 0.08415403445437179,
"learning_rate": 8.917922904747384e-05,
"loss": 0.0425,
"step": 78
},
{
"epoch": 0.5939849624060151,
"grad_norm": 0.043734956942212244,
"learning_rate": 8.876752453596462e-05,
"loss": 0.0322,
"step": 79
},
{
"epoch": 0.6015037593984962,
"grad_norm": 0.11340147288766998,
"learning_rate": 8.834912170647101e-05,
"loss": 0.0446,
"step": 80
},
{
"epoch": 0.6090225563909775,
"grad_norm": 0.061288991507609664,
"learning_rate": 8.792409285133642e-05,
"loss": 0.0424,
"step": 81
},
{
"epoch": 0.6165413533834586,
"grad_norm": 0.043805649893633086,
"learning_rate": 8.749251140776016e-05,
"loss": 0.0342,
"step": 82
},
{
"epoch": 0.6240601503759399,
"grad_norm": 0.05953059965877648,
"learning_rate": 8.705445194510868e-05,
"loss": 0.0321,
"step": 83
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.07945205955271631,
"learning_rate": 8.66099901520315e-05,
"loss": 0.0371,
"step": 84
},
{
"epoch": 0.6390977443609023,
"grad_norm": 0.04453806753518928,
"learning_rate": 8.615920282338355e-05,
"loss": 0.0349,
"step": 85
},
{
"epoch": 0.6466165413533834,
"grad_norm": 0.05196927124976879,
"learning_rate": 8.570216784695637e-05,
"loss": 0.0287,
"step": 86
},
{
"epoch": 0.6541353383458647,
"grad_norm": 0.08901603801098872,
"learning_rate": 8.52389641900206e-05,
"loss": 0.0379,
"step": 87
},
{
"epoch": 0.6616541353383458,
"grad_norm": 0.04173009472070016,
"learning_rate": 8.476967188568188e-05,
"loss": 0.0264,
"step": 88
},
{
"epoch": 0.6691729323308271,
"grad_norm": 0.06191267416598679,
"learning_rate": 8.429437201905254e-05,
"loss": 0.028,
"step": 89
},
{
"epoch": 0.6766917293233082,
"grad_norm": 0.05938205491417802,
"learning_rate": 8.381314671324159e-05,
"loss": 0.0353,
"step": 90
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.06594155945203996,
"learning_rate": 8.332607911516545e-05,
"loss": 0.0423,
"step": 91
},
{
"epoch": 0.6917293233082706,
"grad_norm": 0.03727901580427709,
"learning_rate": 8.283325338118153e-05,
"loss": 0.0288,
"step": 92
},
{
"epoch": 0.6992481203007519,
"grad_norm": 0.039506792129091334,
"learning_rate": 8.233475466254765e-05,
"loss": 0.0319,
"step": 93
},
{
"epoch": 0.706766917293233,
"grad_norm": 0.10114676138905467,
"learning_rate": 8.183066909070947e-05,
"loss": 0.0413,
"step": 94
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.0519720254987392,
"learning_rate": 8.132108376241849e-05,
"loss": 0.0319,
"step": 95
},
{
"epoch": 0.7218045112781954,
"grad_norm": 0.06828535688055823,
"learning_rate": 8.08060867246834e-05,
"loss": 0.0415,
"step": 96
},
{
"epoch": 0.7293233082706767,
"grad_norm": 0.04423778552147402,
"learning_rate": 8.028576695955711e-05,
"loss": 0.0307,
"step": 97
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.04301708267503238,
"learning_rate": 7.97602143687623e-05,
"loss": 0.0292,
"step": 98
},
{
"epoch": 0.7443609022556391,
"grad_norm": 0.07557692217243188,
"learning_rate": 7.922951975815811e-05,
"loss": 0.0304,
"step": 99
},
{
"epoch": 0.7518796992481203,
"grad_norm": 0.061041885279450855,
"learning_rate": 7.869377482205042e-05,
"loss": 0.0318,
"step": 100
},
{
"epoch": 0.7593984962406015,
"grad_norm": 0.040342152719196084,
"learning_rate": 7.815307212734888e-05,
"loss": 0.027,
"step": 101
},
{
"epoch": 0.7669172932330827,
"grad_norm": 0.07790755826343725,
"learning_rate": 7.760750509757298e-05,
"loss": 0.0339,
"step": 102
},
{
"epoch": 0.7744360902255639,
"grad_norm": 0.05210408795431101,
"learning_rate": 7.705716799671019e-05,
"loss": 0.0228,
"step": 103
},
{
"epoch": 0.7819548872180451,
"grad_norm": 0.08000736959421384,
"learning_rate": 7.650215591292888e-05,
"loss": 0.0357,
"step": 104
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.05843028390975531,
"learning_rate": 7.594256474214882e-05,
"loss": 0.0285,
"step": 105
},
{
"epoch": 0.7969924812030075,
"grad_norm": 0.13537509841914472,
"learning_rate": 7.537849117147212e-05,
"loss": 0.0359,
"step": 106
},
{
"epoch": 0.8045112781954887,
"grad_norm": 0.08230566866298178,
"learning_rate": 7.481003266247744e-05,
"loss": 0.0367,
"step": 107
},
{
"epoch": 0.8120300751879699,
"grad_norm": 0.09678557492723187,
"learning_rate": 7.423728743438048e-05,
"loss": 0.0358,
"step": 108
},
{
"epoch": 0.8195488721804511,
"grad_norm": 0.049541914871144996,
"learning_rate": 7.366035444706347e-05,
"loss": 0.0329,
"step": 109
},
{
"epoch": 0.8270676691729323,
"grad_norm": 0.08823757922929092,
"learning_rate": 7.307933338397667e-05,
"loss": 0.0364,
"step": 110
},
{
"epoch": 0.8345864661654135,
"grad_norm": 0.044744299992948704,
"learning_rate": 7.249432463491498e-05,
"loss": 0.0328,
"step": 111
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.03814585189064516,
"learning_rate": 7.190542927867234e-05,
"loss": 0.0242,
"step": 112
},
{
"epoch": 0.849624060150376,
"grad_norm": 0.03553642928460275,
"learning_rate": 7.131274906557725e-05,
"loss": 0.0277,
"step": 113
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.044176381361140944,
"learning_rate": 7.071638639991207e-05,
"loss": 0.0282,
"step": 114
},
{
"epoch": 0.8646616541353384,
"grad_norm": 0.04113727259330019,
"learning_rate": 7.011644432221958e-05,
"loss": 0.0311,
"step": 115
},
{
"epoch": 0.8721804511278195,
"grad_norm": 0.060773829286428965,
"learning_rate": 6.95130264914993e-05,
"loss": 0.0414,
"step": 116
},
{
"epoch": 0.8796992481203008,
"grad_norm": 0.05757846085257315,
"learning_rate": 6.890623716729724e-05,
"loss": 0.0279,
"step": 117
},
{
"epoch": 0.8872180451127819,
"grad_norm": 0.08428255259620104,
"learning_rate": 6.82961811916917e-05,
"loss": 0.0298,
"step": 118
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.04529601746123181,
"learning_rate": 6.768296397117848e-05,
"loss": 0.0263,
"step": 119
},
{
"epoch": 0.9022556390977443,
"grad_norm": 0.0559976345746786,
"learning_rate": 6.706669145845863e-05,
"loss": 0.0331,
"step": 120
},
{
"epoch": 0.9097744360902256,
"grad_norm": 0.046985300077111235,
"learning_rate": 6.644747013413168e-05,
"loss": 0.0323,
"step": 121
},
{
"epoch": 0.9172932330827067,
"grad_norm": 0.06973194335422163,
"learning_rate": 6.582540698829781e-05,
"loss": 0.0356,
"step": 122
},
{
"epoch": 0.924812030075188,
"grad_norm": 0.0550307651636393,
"learning_rate": 6.520060950207185e-05,
"loss": 0.0374,
"step": 123
},
{
"epoch": 0.9323308270676691,
"grad_norm": 0.04136098377224926,
"learning_rate": 6.457318562901256e-05,
"loss": 0.0281,
"step": 124
},
{
"epoch": 0.9398496240601504,
"grad_norm": 0.04471839673788357,
"learning_rate": 6.394324377647028e-05,
"loss": 0.0344,
"step": 125
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.04057335071418551,
"learning_rate": 6.331089278685599e-05,
"loss": 0.0289,
"step": 126
},
{
"epoch": 0.9548872180451128,
"grad_norm": 0.036632585834280834,
"learning_rate": 6.26762419188355e-05,
"loss": 0.0254,
"step": 127
},
{
"epoch": 0.9624060150375939,
"grad_norm": 0.05253467833143005,
"learning_rate": 6.203940082845144e-05,
"loss": 0.0423,
"step": 128
},
{
"epoch": 0.9699248120300752,
"grad_norm": 0.05828434847478486,
"learning_rate": 6.140047955017671e-05,
"loss": 0.0331,
"step": 129
},
{
"epoch": 0.9774436090225563,
"grad_norm": 0.052528332979290625,
"learning_rate": 6.075958847790262e-05,
"loss": 0.0344,
"step": 130
},
{
"epoch": 0.9849624060150376,
"grad_norm": 0.039125799054480936,
"learning_rate": 6.011683834586473e-05,
"loss": 0.0264,
"step": 131
},
{
"epoch": 0.9924812030075187,
"grad_norm": 0.03707157930189228,
"learning_rate": 5.947234020951015e-05,
"loss": 0.0237,
"step": 132
},
{
"epoch": 1.0,
"grad_norm": 0.054189982183542575,
"learning_rate": 5.882620542630901e-05,
"loss": 0.0317,
"step": 133
},
{
"epoch": 1.0075187969924813,
"grad_norm": 0.04357846265860899,
"learning_rate": 5.8178545636514145e-05,
"loss": 0.0268,
"step": 134
},
{
"epoch": 1.0150375939849625,
"grad_norm": 0.056012933476124856,
"learning_rate": 5.752947274387147e-05,
"loss": 0.0223,
"step": 135
},
{
"epoch": 1.0225563909774436,
"grad_norm": 0.049689439936320044,
"learning_rate": 5.687909889628529e-05,
"loss": 0.0304,
"step": 136
},
{
"epoch": 1.0300751879699248,
"grad_norm": 0.04830994322048754,
"learning_rate": 5.622753646644102e-05,
"loss": 0.0278,
"step": 137
},
{
"epoch": 1.037593984962406,
"grad_norm": 0.04418639970975713,
"learning_rate": 5.557489803238933e-05,
"loss": 0.0259,
"step": 138
},
{
"epoch": 1.045112781954887,
"grad_norm": 0.042738363591787835,
"learning_rate": 5.492129635809473e-05,
"loss": 0.0198,
"step": 139
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.03885713180148723,
"learning_rate": 5.426684437395196e-05,
"loss": 0.0191,
"step": 140
},
{
"epoch": 1.0601503759398496,
"grad_norm": 0.04951650926676435,
"learning_rate": 5.361165515727374e-05,
"loss": 0.0214,
"step": 141
},
{
"epoch": 1.0676691729323309,
"grad_norm": 0.059968470212708236,
"learning_rate": 5.295584191275308e-05,
"loss": 0.0243,
"step": 142
},
{
"epoch": 1.0751879699248121,
"grad_norm": 0.0676386940224187,
"learning_rate": 5.229951795290353e-05,
"loss": 0.029,
"step": 143
},
{
"epoch": 1.0827067669172932,
"grad_norm": 0.04250436122379926,
"learning_rate": 5.164279667848094e-05,
"loss": 0.0204,
"step": 144
},
{
"epoch": 1.0902255639097744,
"grad_norm": 0.04124846102938738,
"learning_rate": 5.0985791558889785e-05,
"loss": 0.0209,
"step": 145
},
{
"epoch": 1.0977443609022557,
"grad_norm": 0.05914558229310168,
"learning_rate": 5.032861611257783e-05,
"loss": 0.0285,
"step": 146
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.0465029543723527,
"learning_rate": 4.967138388742218e-05,
"loss": 0.0204,
"step": 147
},
{
"epoch": 1.112781954887218,
"grad_norm": 0.06469458945659604,
"learning_rate": 4.901420844111021e-05,
"loss": 0.0314,
"step": 148
},
{
"epoch": 1.1203007518796992,
"grad_norm": 0.06440915952496404,
"learning_rate": 4.835720332151907e-05,
"loss": 0.0281,
"step": 149
},
{
"epoch": 1.1278195488721805,
"grad_norm": 0.0571757163158284,
"learning_rate": 4.770048204709648e-05,
"loss": 0.0248,
"step": 150
},
{
"epoch": 1.1353383458646618,
"grad_norm": 0.05910301690921271,
"learning_rate": 4.7044158087246926e-05,
"loss": 0.0311,
"step": 151
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.04613839631194596,
"learning_rate": 4.6388344842726264e-05,
"loss": 0.0218,
"step": 152
},
{
"epoch": 1.150375939849624,
"grad_norm": 0.05741866552084954,
"learning_rate": 4.5733155626048036e-05,
"loss": 0.0271,
"step": 153
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.04682544810113655,
"learning_rate": 4.507870364190527e-05,
"loss": 0.0264,
"step": 154
},
{
"epoch": 1.1654135338345863,
"grad_norm": 0.06282838577083374,
"learning_rate": 4.4425101967610674e-05,
"loss": 0.024,
"step": 155
},
{
"epoch": 1.1729323308270676,
"grad_norm": 0.05388737782363021,
"learning_rate": 4.377246353355899e-05,
"loss": 0.0271,
"step": 156
},
{
"epoch": 1.1804511278195489,
"grad_norm": 0.05086578069156835,
"learning_rate": 4.312090110371473e-05,
"loss": 0.0278,
"step": 157
},
{
"epoch": 1.1879699248120301,
"grad_norm": 0.05863572980738164,
"learning_rate": 4.247052725612852e-05,
"loss": 0.0292,
"step": 158
},
{
"epoch": 1.1954887218045114,
"grad_norm": 0.04227523648124146,
"learning_rate": 4.1821454363485866e-05,
"loss": 0.0234,
"step": 159
},
{
"epoch": 1.2030075187969924,
"grad_norm": 0.04268704545270105,
"learning_rate": 4.1173794573690996e-05,
"loss": 0.0206,
"step": 160
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.04778787432486908,
"learning_rate": 4.052765979048986e-05,
"loss": 0.0227,
"step": 161
},
{
"epoch": 1.218045112781955,
"grad_norm": 0.0459311125342993,
"learning_rate": 3.988316165413528e-05,
"loss": 0.0205,
"step": 162
},
{
"epoch": 1.225563909774436,
"grad_norm": 0.05603215690118315,
"learning_rate": 3.924041152209739e-05,
"loss": 0.029,
"step": 163
},
{
"epoch": 1.2330827067669172,
"grad_norm": 0.060179119443112154,
"learning_rate": 3.859952044982329e-05,
"loss": 0.0271,
"step": 164
},
{
"epoch": 1.2406015037593985,
"grad_norm": 0.04740279415347567,
"learning_rate": 3.7960599171548574e-05,
"loss": 0.0213,
"step": 165
},
{
"epoch": 1.2481203007518797,
"grad_norm": 0.052482110362426594,
"learning_rate": 3.732375808116451e-05,
"loss": 0.0258,
"step": 166
},
{
"epoch": 1.255639097744361,
"grad_norm": 0.04835120393099329,
"learning_rate": 3.668910721314402e-05,
"loss": 0.0229,
"step": 167
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.08311507045185516,
"learning_rate": 3.605675622352973e-05,
"loss": 0.0265,
"step": 168
},
{
"epoch": 1.2706766917293233,
"grad_norm": 0.053563077833150494,
"learning_rate": 3.542681437098745e-05,
"loss": 0.0256,
"step": 169
},
{
"epoch": 1.2781954887218046,
"grad_norm": 0.05567682482783888,
"learning_rate": 3.479939049792817e-05,
"loss": 0.0213,
"step": 170
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.054588031712222006,
"learning_rate": 3.417459301170219e-05,
"loss": 0.0266,
"step": 171
},
{
"epoch": 1.2932330827067668,
"grad_norm": 0.07694344232267265,
"learning_rate": 3.355252986586832e-05,
"loss": 0.0193,
"step": 172
},
{
"epoch": 1.300751879699248,
"grad_norm": 0.05943952613035603,
"learning_rate": 3.293330854154136e-05,
"loss": 0.0258,
"step": 173
},
{
"epoch": 1.3082706766917294,
"grad_norm": 0.038766556860819104,
"learning_rate": 3.2317036028821523e-05,
"loss": 0.0159,
"step": 174
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.05092188135687549,
"learning_rate": 3.1703818808308324e-05,
"loss": 0.0215,
"step": 175
},
{
"epoch": 1.3233082706766917,
"grad_norm": 0.04779789780883562,
"learning_rate": 3.109376283270277e-05,
"loss": 0.0268,
"step": 176
},
{
"epoch": 1.330827067669173,
"grad_norm": 0.04433720319245774,
"learning_rate": 3.0486973508500727e-05,
"loss": 0.0238,
"step": 177
},
{
"epoch": 1.3383458646616542,
"grad_norm": 0.049878475563895956,
"learning_rate": 2.988355567778043e-05,
"loss": 0.0259,
"step": 178
},
{
"epoch": 1.3458646616541352,
"grad_norm": 0.05962755604807658,
"learning_rate": 2.9283613600087933e-05,
"loss": 0.025,
"step": 179
},
{
"epoch": 1.3533834586466165,
"grad_norm": 0.04955718527923681,
"learning_rate": 2.8687250934422772e-05,
"loss": 0.0194,
"step": 180
},
{
"epoch": 1.3609022556390977,
"grad_norm": 0.03676456890831394,
"learning_rate": 2.8094570721327662e-05,
"loss": 0.0189,
"step": 181
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.04868946152583533,
"learning_rate": 2.750567536508504e-05,
"loss": 0.0243,
"step": 182
},
{
"epoch": 1.3759398496240602,
"grad_norm": 0.0555305400721802,
"learning_rate": 2.6920666616023327e-05,
"loss": 0.0257,
"step": 183
},
{
"epoch": 1.3834586466165413,
"grad_norm": 0.04963192556183434,
"learning_rate": 2.6339645552936536e-05,
"loss": 0.0275,
"step": 184
},
{
"epoch": 1.3909774436090225,
"grad_norm": 0.05542091349920839,
"learning_rate": 2.5762712565619528e-05,
"loss": 0.023,
"step": 185
},
{
"epoch": 1.3984962406015038,
"grad_norm": 0.0426183120843919,
"learning_rate": 2.5189967337522573e-05,
"loss": 0.0206,
"step": 186
},
{
"epoch": 1.4060150375939848,
"grad_norm": 0.05205246245376388,
"learning_rate": 2.46215088285279e-05,
"loss": 0.0229,
"step": 187
},
{
"epoch": 1.413533834586466,
"grad_norm": 0.04337666332691105,
"learning_rate": 2.4057435257851175e-05,
"loss": 0.019,
"step": 188
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.05985729489503263,
"learning_rate": 2.349784408707112e-05,
"loss": 0.0274,
"step": 189
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.062032022184375604,
"learning_rate": 2.2942832003289823e-05,
"loss": 0.0271,
"step": 190
},
{
"epoch": 1.4360902255639099,
"grad_norm": 0.05773389436675615,
"learning_rate": 2.2392494902427025e-05,
"loss": 0.0263,
"step": 191
},
{
"epoch": 1.443609022556391,
"grad_norm": 0.048522536078850126,
"learning_rate": 2.1846927872651137e-05,
"loss": 0.0242,
"step": 192
},
{
"epoch": 1.4511278195488722,
"grad_norm": 0.05010560342148772,
"learning_rate": 2.1306225177949585e-05,
"loss": 0.024,
"step": 193
},
{
"epoch": 1.4586466165413534,
"grad_norm": 0.058011679310299026,
"learning_rate": 2.07704802418419e-05,
"loss": 0.0301,
"step": 194
},
{
"epoch": 1.4661654135338344,
"grad_norm": 0.052695628737558814,
"learning_rate": 2.0239785631237705e-05,
"loss": 0.0262,
"step": 195
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.0397195089948912,
"learning_rate": 1.9714233040442915e-05,
"loss": 0.0179,
"step": 196
},
{
"epoch": 1.481203007518797,
"grad_norm": 0.05532938780742867,
"learning_rate": 1.9193913275316626e-05,
"loss": 0.0234,
"step": 197
},
{
"epoch": 1.4887218045112782,
"grad_norm": 0.07349266479809795,
"learning_rate": 1.8678916237581522e-05,
"loss": 0.0236,
"step": 198
},
{
"epoch": 1.4962406015037595,
"grad_norm": 0.03995824607041351,
"learning_rate": 1.816933090929055e-05,
"loss": 0.0176,
"step": 199
},
{
"epoch": 1.5037593984962405,
"grad_norm": 0.07166373724308431,
"learning_rate": 1.7665245337452368e-05,
"loss": 0.0258,
"step": 200
},
{
"epoch": 1.5037593984962405,
"eval_loss": 0.029665347188711166,
"eval_runtime": 6.5066,
"eval_samples_per_second": 0.922,
"eval_steps_per_second": 0.307,
"step": 200
},
{
"epoch": 1.5112781954887218,
"grad_norm": 0.048692577901512116,
"learning_rate": 1.716674661881848e-05,
"loss": 0.0224,
"step": 201
},
{
"epoch": 1.518796992481203,
"grad_norm": 0.04675059057360818,
"learning_rate": 1.667392088483456e-05,
"loss": 0.0223,
"step": 202
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.05459458244813264,
"learning_rate": 1.6186853286758397e-05,
"loss": 0.0242,
"step": 203
},
{
"epoch": 1.5338345864661656,
"grad_norm": 0.051543551392068274,
"learning_rate": 1.570562798094747e-05,
"loss": 0.025,
"step": 204
},
{
"epoch": 1.5413533834586466,
"grad_norm": 0.14671926401344376,
"learning_rate": 1.5230328114318127e-05,
"loss": 0.0241,
"step": 205
},
{
"epoch": 1.5488721804511278,
"grad_norm": 0.058979726559234814,
"learning_rate": 1.4761035809979395e-05,
"loss": 0.0253,
"step": 206
},
{
"epoch": 1.556390977443609,
"grad_norm": 0.06494643885270886,
"learning_rate": 1.4297832153043656e-05,
"loss": 0.0236,
"step": 207
},
{
"epoch": 1.5639097744360901,
"grad_norm": 0.06627104647345526,
"learning_rate": 1.3840797176616466e-05,
"loss": 0.0278,
"step": 208
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.06190650675134399,
"learning_rate": 1.3390009847968504e-05,
"loss": 0.0255,
"step": 209
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.06250699899282167,
"learning_rate": 1.2945548054891321e-05,
"loss": 0.0254,
"step": 210
},
{
"epoch": 1.5864661654135337,
"grad_norm": 0.06214391708977836,
"learning_rate": 1.2507488592239847e-05,
"loss": 0.0233,
"step": 211
},
{
"epoch": 1.5939849624060152,
"grad_norm": 0.054608347620115995,
"learning_rate": 1.2075907148663579e-05,
"loss": 0.024,
"step": 212
},
{
"epoch": 1.6015037593984962,
"grad_norm": 0.05333683650123989,
"learning_rate": 1.1650878293528994e-05,
"loss": 0.0261,
"step": 213
},
{
"epoch": 1.6090225563909775,
"grad_norm": 0.047407562918454,
"learning_rate": 1.1232475464035385e-05,
"loss": 0.0192,
"step": 214
},
{
"epoch": 1.6165413533834587,
"grad_norm": 0.06549580580637923,
"learning_rate": 1.0820770952526155e-05,
"loss": 0.0192,
"step": 215
},
{
"epoch": 1.6240601503759398,
"grad_norm": 0.0582730317262946,
"learning_rate": 1.0415835893998116e-05,
"loss": 0.0267,
"step": 216
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.06724858724013988,
"learning_rate": 1.0017740253810609e-05,
"loss": 0.0244,
"step": 217
},
{
"epoch": 1.6390977443609023,
"grad_norm": 0.07353126997097047,
"learning_rate": 9.62655281559679e-06,
"loss": 0.0265,
"step": 218
},
{
"epoch": 1.6466165413533833,
"grad_norm": 0.057567868642984674,
"learning_rate": 9.242341169379076e-06,
"loss": 0.0239,
"step": 219
},
{
"epoch": 1.6541353383458648,
"grad_norm": 0.06325334373179048,
"learning_rate": 8.865171699890834e-06,
"loss": 0.023,
"step": 220
},
{
"epoch": 1.6616541353383458,
"grad_norm": 0.057849806459398294,
"learning_rate": 8.49510957510633e-06,
"loss": 0.0286,
"step": 221
},
{
"epoch": 1.669172932330827,
"grad_norm": 0.06257054012996921,
"learning_rate": 8.132218734980852e-06,
"loss": 0.0205,
"step": 222
},
{
"epoch": 1.6766917293233083,
"grad_norm": 0.053291552200528655,
"learning_rate": 7.776561880403072e-06,
"loss": 0.0222,
"step": 223
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.055884993872003165,
"learning_rate": 7.4282004623615396e-06,
"loss": 0.0257,
"step": 224
},
{
"epoch": 1.6917293233082706,
"grad_norm": 0.04781226703104293,
"learning_rate": 7.0871946713269856e-06,
"loss": 0.021,
"step": 225
},
{
"epoch": 1.699248120300752,
"grad_norm": 0.04617454207758738,
"learning_rate": 6.753603426852589e-06,
"loss": 0.0206,
"step": 226
},
{
"epoch": 1.706766917293233,
"grad_norm": 0.05934488856386534,
"learning_rate": 6.427484367393699e-06,
"loss": 0.0221,
"step": 227
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.0563063349000768,
"learning_rate": 6.108893840348995e-06,
"loss": 0.0217,
"step": 228
},
{
"epoch": 1.7218045112781954,
"grad_norm": 0.058919681414065804,
"learning_rate": 5.797886892324694e-06,
"loss": 0.0241,
"step": 229
},
{
"epoch": 1.7293233082706767,
"grad_norm": 0.04652279001651371,
"learning_rate": 5.494517259623477e-06,
"loss": 0.023,
"step": 230
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.05206753304811755,
"learning_rate": 5.198837358959901e-06,
"loss": 0.0247,
"step": 231
},
{
"epoch": 1.744360902255639,
"grad_norm": 0.05759411719610633,
"learning_rate": 4.910898278403669e-06,
"loss": 0.0275,
"step": 232
},
{
"epoch": 1.7518796992481203,
"grad_norm": 0.05493938568305548,
"learning_rate": 4.630749768552589e-06,
"loss": 0.0236,
"step": 233
},
{
"epoch": 1.7593984962406015,
"grad_norm": 0.045214515268897214,
"learning_rate": 4.358440233936617e-06,
"loss": 0.0196,
"step": 234
},
{
"epoch": 1.7669172932330826,
"grad_norm": 0.08670874372319154,
"learning_rate": 4.094016724654359e-06,
"loss": 0.0292,
"step": 235
},
{
"epoch": 1.774436090225564,
"grad_norm": 0.049117351787292686,
"learning_rate": 3.837524928243774e-06,
"loss": 0.0224,
"step": 236
},
{
"epoch": 1.781954887218045,
"grad_norm": 0.058397389390063136,
"learning_rate": 3.589009161788104e-06,
"loss": 0.0278,
"step": 237
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.05422155962388968,
"learning_rate": 3.3485123642587658e-06,
"loss": 0.0243,
"step": 238
},
{
"epoch": 1.7969924812030076,
"grad_norm": 0.07090059571835504,
"learning_rate": 3.116076089096265e-06,
"loss": 0.027,
"step": 239
},
{
"epoch": 1.8045112781954886,
"grad_norm": 0.05963059250846481,
"learning_rate": 2.8917404970305097e-06,
"loss": 0.0288,
"step": 240
},
{
"epoch": 1.8120300751879699,
"grad_norm": 0.06946365704174999,
"learning_rate": 2.675544349141779e-06,
"loss": 0.0259,
"step": 241
},
{
"epoch": 1.8195488721804511,
"grad_norm": 0.06143740644726876,
"learning_rate": 2.4675250001635232e-06,
"loss": 0.0247,
"step": 242
},
{
"epoch": 1.8270676691729322,
"grad_norm": 0.04728168437977354,
"learning_rate": 2.2677183920281343e-06,
"loss": 0.0193,
"step": 243
},
{
"epoch": 1.8345864661654137,
"grad_norm": 0.07042127314230426,
"learning_rate": 2.076159047656889e-06,
"loss": 0.0227,
"step": 244
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.05266415047166696,
"learning_rate": 1.892880064994934e-06,
"loss": 0.0256,
"step": 245
},
{
"epoch": 1.849624060150376,
"grad_norm": 0.05204878417509025,
"learning_rate": 1.7179131112926627e-06,
"loss": 0.024,
"step": 246
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.04727065912696429,
"learning_rate": 1.551288417634106e-06,
"loss": 0.0159,
"step": 247
},
{
"epoch": 1.8646616541353382,
"grad_norm": 0.049637487718030344,
"learning_rate": 1.3930347737136196e-06,
"loss": 0.0209,
"step": 248
},
{
"epoch": 1.8721804511278195,
"grad_norm": 0.0505669836884092,
"learning_rate": 1.2431795228615372e-06,
"loss": 0.0206,
"step": 249
},
{
"epoch": 1.8796992481203008,
"grad_norm": 0.07557073448805833,
"learning_rate": 1.101748557319715e-06,
"loss": 0.0315,
"step": 250
},
{
"epoch": 1.8872180451127818,
"grad_norm": 0.04855407299966349,
"learning_rate": 9.687663137678604e-07,
"loss": 0.0193,
"step": 251
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.05981871688003821,
"learning_rate": 8.442557691013043e-07,
"loss": 0.0245,
"step": 252
},
{
"epoch": 1.9022556390977443,
"grad_norm": 0.055297053623164526,
"learning_rate": 7.282384364610206e-07,
"loss": 0.0242,
"step": 253
},
{
"epoch": 1.9097744360902256,
"grad_norm": 0.05097924138111233,
"learning_rate": 6.207343615165561e-07,
"loss": 0.0207,
"step": 254
},
{
"epoch": 1.9172932330827068,
"grad_norm": 0.05870296620626846,
"learning_rate": 5.217621190024779e-07,
"loss": 0.0259,
"step": 255
},
{
"epoch": 1.9248120300751879,
"grad_norm": 0.05289043509456049,
"learning_rate": 4.3133880950905205e-07,
"loss": 0.0217,
"step": 256
},
{
"epoch": 1.9323308270676691,
"grad_norm": 0.05040687502136238,
"learning_rate": 3.494800565275125e-07,
"loss": 0.0226,
"step": 257
},
{
"epoch": 1.9398496240601504,
"grad_norm": 0.05483598628420617,
"learning_rate": 2.762000037506485e-07,
"loss": 0.0226,
"step": 258
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.052171052589092846,
"learning_rate": 2.115113126290258e-07,
"loss": 0.0224,
"step": 259
},
{
"epoch": 1.954887218045113,
"grad_norm": 0.060066029686361856,
"learning_rate": 1.554251601833201e-07,
"loss": 0.0242,
"step": 260
},
{
"epoch": 1.962406015037594,
"grad_norm": 0.04560282840465627,
"learning_rate": 1.0795123707312283e-07,
"loss": 0.0199,
"step": 261
},
{
"epoch": 1.9699248120300752,
"grad_norm": 0.055943707431487216,
"learning_rate": 6.909774592258056e-08,
"loss": 0.0218,
"step": 262
},
{
"epoch": 1.9774436090225564,
"grad_norm": 0.057987573660367824,
"learning_rate": 3.8871399903134265e-08,
"loss": 0.0242,
"step": 263
},
{
"epoch": 1.9849624060150375,
"grad_norm": 0.05438020219150765,
"learning_rate": 1.7277421573608232e-08,
"loss": 0.0278,
"step": 264
},
{
"epoch": 1.9924812030075187,
"grad_norm": 0.05442881774912085,
"learning_rate": 4.319541977831909e-09,
"loss": 0.0193,
"step": 265
},
{
"epoch": 2.0,
"grad_norm": 0.059490023866208885,
"learning_rate": 0.0,
"loss": 0.0224,
"step": 266
},
{
"epoch": 2.0,
"step": 266,
"total_flos": 673614818967552.0,
"train_loss": 0.039493271835932604,
"train_runtime": 2026.6163,
"train_samples_per_second": 0.522,
"train_steps_per_second": 0.131
}
],
"logging_steps": 1,
"max_steps": 266,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 673614818967552.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}