{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 692,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002890173410404624,
"grad_norm": 3.413911899913347,
"learning_rate": 9.999948473953725e-06,
"loss": 0.0577,
"step": 1
},
{
"epoch": 0.005780346820809248,
"grad_norm": 2.897400778584227,
"learning_rate": 9.999793896876868e-06,
"loss": 0.0552,
"step": 2
},
{
"epoch": 0.008670520231213872,
"grad_norm": 4.246403961679219,
"learning_rate": 9.99953627195533e-06,
"loss": 0.0695,
"step": 3
},
{
"epoch": 0.011560693641618497,
"grad_norm": 3.544275760472012,
"learning_rate": 9.999175604498867e-06,
"loss": 0.054,
"step": 4
},
{
"epoch": 0.014450867052023121,
"grad_norm": 6.6935720376972565,
"learning_rate": 9.998711901940989e-06,
"loss": 0.0875,
"step": 5
},
{
"epoch": 0.017341040462427744,
"grad_norm": 7.418656211317936,
"learning_rate": 9.998145173838796e-06,
"loss": 0.1044,
"step": 6
},
{
"epoch": 0.02023121387283237,
"grad_norm": 5.845467274560381,
"learning_rate": 9.997475431872795e-06,
"loss": 0.0894,
"step": 7
},
{
"epoch": 0.023121387283236993,
"grad_norm": 8.046971007820595,
"learning_rate": 9.996702689846645e-06,
"loss": 0.1113,
"step": 8
},
{
"epoch": 0.02601156069364162,
"grad_norm": 5.557087876304546,
"learning_rate": 9.995826963686883e-06,
"loss": 0.0899,
"step": 9
},
{
"epoch": 0.028901734104046242,
"grad_norm": 7.013405088920962,
"learning_rate": 9.994848271442595e-06,
"loss": 0.0947,
"step": 10
},
{
"epoch": 0.031791907514450865,
"grad_norm": 5.148671016900013,
"learning_rate": 9.993766633285033e-06,
"loss": 0.081,
"step": 11
},
{
"epoch": 0.03468208092485549,
"grad_norm": 4.719125802513109,
"learning_rate": 9.992582071507217e-06,
"loss": 0.0795,
"step": 12
},
{
"epoch": 0.03757225433526012,
"grad_norm": 4.926080178684607,
"learning_rate": 9.991294610523456e-06,
"loss": 0.1108,
"step": 13
},
{
"epoch": 0.04046242774566474,
"grad_norm": 4.8632538049944705,
"learning_rate": 9.989904276868865e-06,
"loss": 0.118,
"step": 14
},
{
"epoch": 0.04335260115606936,
"grad_norm": 4.49166064777025,
"learning_rate": 9.988411099198797e-06,
"loss": 0.1029,
"step": 15
},
{
"epoch": 0.046242774566473986,
"grad_norm": 3.7145419864840066,
"learning_rate": 9.986815108288273e-06,
"loss": 0.0744,
"step": 16
},
{
"epoch": 0.049132947976878616,
"grad_norm": 4.099867309174663,
"learning_rate": 9.98511633703133e-06,
"loss": 0.0914,
"step": 17
},
{
"epoch": 0.05202312138728324,
"grad_norm": 3.437121075753801,
"learning_rate": 9.98331482044036e-06,
"loss": 0.0803,
"step": 18
},
{
"epoch": 0.05491329479768786,
"grad_norm": 3.9662109797189267,
"learning_rate": 9.981410595645369e-06,
"loss": 0.0788,
"step": 19
},
{
"epoch": 0.057803468208092484,
"grad_norm": 4.8667640721708105,
"learning_rate": 9.979403701893226e-06,
"loss": 0.1069,
"step": 20
},
{
"epoch": 0.06069364161849711,
"grad_norm": 5.834375526681122,
"learning_rate": 9.977294180546857e-06,
"loss": 0.1179,
"step": 21
},
{
"epoch": 0.06358381502890173,
"grad_norm": 4.402795466175506,
"learning_rate": 9.975082075084375e-06,
"loss": 0.1312,
"step": 22
},
{
"epoch": 0.06647398843930635,
"grad_norm": 4.425118866581384,
"learning_rate": 9.9727674310982e-06,
"loss": 0.1073,
"step": 23
},
{
"epoch": 0.06936416184971098,
"grad_norm": 4.074778629199514,
"learning_rate": 9.970350296294114e-06,
"loss": 0.1029,
"step": 24
},
{
"epoch": 0.07225433526011561,
"grad_norm": 4.72700192719594,
"learning_rate": 9.967830720490277e-06,
"loss": 0.1005,
"step": 25
},
{
"epoch": 0.07514450867052024,
"grad_norm": 4.663929794890595,
"learning_rate": 9.9652087556162e-06,
"loss": 0.111,
"step": 26
},
{
"epoch": 0.07803468208092486,
"grad_norm": 4.922655803987667,
"learning_rate": 9.962484455711679e-06,
"loss": 0.1043,
"step": 27
},
{
"epoch": 0.08092485549132948,
"grad_norm": 3.881077548927933,
"learning_rate": 9.959657876925671e-06,
"loss": 0.0856,
"step": 28
},
{
"epoch": 0.0838150289017341,
"grad_norm": 4.677532571658142,
"learning_rate": 9.956729077515151e-06,
"loss": 0.1028,
"step": 29
},
{
"epoch": 0.08670520231213873,
"grad_norm": 3.6049503639471796,
"learning_rate": 9.9536981178439e-06,
"loss": 0.0743,
"step": 30
},
{
"epoch": 0.08959537572254335,
"grad_norm": 4.217598474174594,
"learning_rate": 9.950565060381264e-06,
"loss": 0.0983,
"step": 31
},
{
"epoch": 0.09248554913294797,
"grad_norm": 5.093429475733392,
"learning_rate": 9.94732996970087e-06,
"loss": 0.1133,
"step": 32
},
{
"epoch": 0.0953757225433526,
"grad_norm": 5.129318191872631,
"learning_rate": 9.94399291247929e-06,
"loss": 0.0888,
"step": 33
},
{
"epoch": 0.09826589595375723,
"grad_norm": 3.6982076421332923,
"learning_rate": 9.940553957494669e-06,
"loss": 0.0831,
"step": 34
},
{
"epoch": 0.10115606936416185,
"grad_norm": 4.087472683580998,
"learning_rate": 9.937013175625313e-06,
"loss": 0.0972,
"step": 35
},
{
"epoch": 0.10404624277456648,
"grad_norm": 4.077053797813939,
"learning_rate": 9.93337063984821e-06,
"loss": 0.0939,
"step": 36
},
{
"epoch": 0.1069364161849711,
"grad_norm": 4.459254832264914,
"learning_rate": 9.929626425237555e-06,
"loss": 0.1073,
"step": 37
},
{
"epoch": 0.10982658959537572,
"grad_norm": 3.653561777734679,
"learning_rate": 9.925780608963173e-06,
"loss": 0.0787,
"step": 38
},
{
"epoch": 0.11271676300578035,
"grad_norm": 4.207379444042341,
"learning_rate": 9.92183327028895e-06,
"loss": 0.0881,
"step": 39
},
{
"epoch": 0.11560693641618497,
"grad_norm": 5.28282743437778,
"learning_rate": 9.917784490571188e-06,
"loss": 0.0981,
"step": 40
},
{
"epoch": 0.11849710982658959,
"grad_norm": 3.958835693054809,
"learning_rate": 9.913634353256926e-06,
"loss": 0.0747,
"step": 41
},
{
"epoch": 0.12138728323699421,
"grad_norm": 3.815694424138344,
"learning_rate": 9.909382943882238e-06,
"loss": 0.0962,
"step": 42
},
{
"epoch": 0.12427745664739884,
"grad_norm": 4.616222762096817,
"learning_rate": 9.905030350070446e-06,
"loss": 0.09,
"step": 43
},
{
"epoch": 0.12716763005780346,
"grad_norm": 4.8258864571124835,
"learning_rate": 9.900576661530334e-06,
"loss": 0.1068,
"step": 44
},
{
"epoch": 0.13005780346820808,
"grad_norm": 4.98128890149854,
"learning_rate": 9.896021970054282e-06,
"loss": 0.0908,
"step": 45
},
{
"epoch": 0.1329479768786127,
"grad_norm": 3.2157418735511456,
"learning_rate": 9.89136636951639e-06,
"loss": 0.0771,
"step": 46
},
{
"epoch": 0.13583815028901733,
"grad_norm": 5.174195271609547,
"learning_rate": 9.886609955870536e-06,
"loss": 0.1144,
"step": 47
},
{
"epoch": 0.13872832369942195,
"grad_norm": 3.799076912387004,
"learning_rate": 9.881752827148391e-06,
"loss": 0.105,
"step": 48
},
{
"epoch": 0.1416184971098266,
"grad_norm": 4.478882016025104,
"learning_rate": 9.876795083457414e-06,
"loss": 0.1084,
"step": 49
},
{
"epoch": 0.14450867052023122,
"grad_norm": 4.130383884149315,
"learning_rate": 9.871736826978776e-06,
"loss": 0.1028,
"step": 50
},
{
"epoch": 0.14739884393063585,
"grad_norm": 4.120395848476724,
"learning_rate": 9.866578161965259e-06,
"loss": 0.086,
"step": 51
},
{
"epoch": 0.15028901734104047,
"grad_norm": 4.4606365855568795,
"learning_rate": 9.861319194739109e-06,
"loss": 0.1048,
"step": 52
},
{
"epoch": 0.1531791907514451,
"grad_norm": 4.235827775616701,
"learning_rate": 9.855960033689843e-06,
"loss": 0.1011,
"step": 53
},
{
"epoch": 0.15606936416184972,
"grad_norm": 3.863096874826039,
"learning_rate": 9.85050078927201e-06,
"loss": 0.0787,
"step": 54
},
{
"epoch": 0.15895953757225434,
"grad_norm": 4.930472249007545,
"learning_rate": 9.844941574002927e-06,
"loss": 0.113,
"step": 55
},
{
"epoch": 0.16184971098265896,
"grad_norm": 4.9133176516275,
"learning_rate": 9.83928250246034e-06,
"loss": 0.1126,
"step": 56
},
{
"epoch": 0.16473988439306358,
"grad_norm": 4.334161886190985,
"learning_rate": 9.83352369128009e-06,
"loss": 0.1163,
"step": 57
},
{
"epoch": 0.1676300578034682,
"grad_norm": 3.952135073411162,
"learning_rate": 9.82766525915368e-06,
"loss": 0.0997,
"step": 58
},
{
"epoch": 0.17052023121387283,
"grad_norm": 5.407354621789884,
"learning_rate": 9.821707326825849e-06,
"loss": 0.1464,
"step": 59
},
{
"epoch": 0.17341040462427745,
"grad_norm": 4.710011204025672,
"learning_rate": 9.815650017092078e-06,
"loss": 0.0966,
"step": 60
},
{
"epoch": 0.17630057803468208,
"grad_norm": 3.6046217871002897,
"learning_rate": 9.809493454796051e-06,
"loss": 0.0808,
"step": 61
},
{
"epoch": 0.1791907514450867,
"grad_norm": 4.333327639577169,
"learning_rate": 9.803237766827098e-06,
"loss": 0.1058,
"step": 62
},
{
"epoch": 0.18208092485549132,
"grad_norm": 3.7757843201095094,
"learning_rate": 9.796883082117565e-06,
"loss": 0.092,
"step": 63
},
{
"epoch": 0.18497109826589594,
"grad_norm": 3.292155410682368,
"learning_rate": 9.790429531640163e-06,
"loss": 0.0833,
"step": 64
},
{
"epoch": 0.18786127167630057,
"grad_norm": 3.1069406572396354,
"learning_rate": 9.783877248405266e-06,
"loss": 0.0836,
"step": 65
},
{
"epoch": 0.1907514450867052,
"grad_norm": 3.4381561878895925,
"learning_rate": 9.77722636745818e-06,
"loss": 0.0927,
"step": 66
},
{
"epoch": 0.1936416184971098,
"grad_norm": 4.516150027751648,
"learning_rate": 9.770477025876338e-06,
"loss": 0.1158,
"step": 67
},
{
"epoch": 0.19653179190751446,
"grad_norm": 4.593877711768214,
"learning_rate": 9.763629362766495e-06,
"loss": 0.1085,
"step": 68
},
{
"epoch": 0.1994219653179191,
"grad_norm": 3.9274365769789865,
"learning_rate": 9.75668351926186e-06,
"loss": 0.0954,
"step": 69
},
{
"epoch": 0.2023121387283237,
"grad_norm": 3.8806786830392466,
"learning_rate": 9.749639638519167e-06,
"loss": 0.0929,
"step": 70
},
{
"epoch": 0.20520231213872833,
"grad_norm": 4.019652719431334,
"learning_rate": 9.742497865715752e-06,
"loss": 0.1106,
"step": 71
},
{
"epoch": 0.20809248554913296,
"grad_norm": 4.432334879926278,
"learning_rate": 9.735258348046538e-06,
"loss": 0.1175,
"step": 72
},
{
"epoch": 0.21098265895953758,
"grad_norm": 3.4711831909967295,
"learning_rate": 9.727921234721013e-06,
"loss": 0.0852,
"step": 73
},
{
"epoch": 0.2138728323699422,
"grad_norm": 3.2064291726730256,
"learning_rate": 9.720486676960157e-06,
"loss": 0.0728,
"step": 74
},
{
"epoch": 0.21676300578034682,
"grad_norm": 3.808224427132749,
"learning_rate": 9.712954827993314e-06,
"loss": 0.1021,
"step": 75
},
{
"epoch": 0.21965317919075145,
"grad_norm": 3.607343192971514,
"learning_rate": 9.705325843055045e-06,
"loss": 0.0859,
"step": 76
},
{
"epoch": 0.22254335260115607,
"grad_norm": 5.194995083585211,
"learning_rate": 9.69759987938192e-06,
"loss": 0.1043,
"step": 77
},
{
"epoch": 0.2254335260115607,
"grad_norm": 4.054952478603542,
"learning_rate": 9.689777096209287e-06,
"loss": 0.103,
"step": 78
},
{
"epoch": 0.22832369942196531,
"grad_norm": 3.640845125492553,
"learning_rate": 9.681857654767978e-06,
"loss": 0.0797,
"step": 79
},
{
"epoch": 0.23121387283236994,
"grad_norm": 3.8972080333317765,
"learning_rate": 9.673841718281e-06,
"loss": 0.0896,
"step": 80
},
{
"epoch": 0.23410404624277456,
"grad_norm": 4.07281569833404,
"learning_rate": 9.665729451960152e-06,
"loss": 0.1053,
"step": 81
},
{
"epoch": 0.23699421965317918,
"grad_norm": 4.030872577083022,
"learning_rate": 9.657521023002644e-06,
"loss": 0.1148,
"step": 82
},
{
"epoch": 0.2398843930635838,
"grad_norm": 4.704073079780987,
"learning_rate": 9.64921660058763e-06,
"loss": 0.0992,
"step": 83
},
{
"epoch": 0.24277456647398843,
"grad_norm": 5.250684980442873,
"learning_rate": 9.64081635587273e-06,
"loss": 0.1066,
"step": 84
},
{
"epoch": 0.24566473988439305,
"grad_norm": 4.1597493954101035,
"learning_rate": 9.632320461990505e-06,
"loss": 0.1029,
"step": 85
},
{
"epoch": 0.24855491329479767,
"grad_norm": 4.76611183508526,
"learning_rate": 9.623729094044882e-06,
"loss": 0.0927,
"step": 86
},
{
"epoch": 0.2514450867052023,
"grad_norm": 4.328180556711655,
"learning_rate": 9.615042429107554e-06,
"loss": 0.105,
"step": 87
},
{
"epoch": 0.2543352601156069,
"grad_norm": 3.7978576663551658,
"learning_rate": 9.606260646214314e-06,
"loss": 0.0974,
"step": 88
},
{
"epoch": 0.25722543352601157,
"grad_norm": 3.9006554035006027,
"learning_rate": 9.597383926361388e-06,
"loss": 0.0806,
"step": 89
},
{
"epoch": 0.26011560693641617,
"grad_norm": 3.841382258509325,
"learning_rate": 9.588412452501686e-06,
"loss": 0.0972,
"step": 90
},
{
"epoch": 0.2630057803468208,
"grad_norm": 4.180653104918261,
"learning_rate": 9.579346409541037e-06,
"loss": 0.1034,
"step": 91
},
{
"epoch": 0.2658959537572254,
"grad_norm": 4.7454554804737885,
"learning_rate": 9.570185984334383e-06,
"loss": 0.1285,
"step": 92
},
{
"epoch": 0.26878612716763006,
"grad_norm": 4.10415672097511,
"learning_rate": 9.56093136568192e-06,
"loss": 0.0958,
"step": 93
},
{
"epoch": 0.27167630057803466,
"grad_norm": 3.5059005548760984,
"learning_rate": 9.551582744325213e-06,
"loss": 0.0866,
"step": 94
},
{
"epoch": 0.2745664739884393,
"grad_norm": 3.6654919235269645,
"learning_rate": 9.542140312943257e-06,
"loss": 0.091,
"step": 95
},
{
"epoch": 0.2774566473988439,
"grad_norm": 3.637246259933994,
"learning_rate": 9.532604266148521e-06,
"loss": 0.0885,
"step": 96
},
{
"epoch": 0.28034682080924855,
"grad_norm": 3.0790903920072865,
"learning_rate": 9.522974800482914e-06,
"loss": 0.08,
"step": 97
},
{
"epoch": 0.2832369942196532,
"grad_norm": 3.478040295573873,
"learning_rate": 9.513252114413756e-06,
"loss": 0.0874,
"step": 98
},
{
"epoch": 0.2861271676300578,
"grad_norm": 4.810373026536352,
"learning_rate": 9.503436408329677e-06,
"loss": 0.1175,
"step": 99
},
{
"epoch": 0.28901734104046245,
"grad_norm": 5.0600900263791315,
"learning_rate": 9.493527884536487e-06,
"loss": 0.1118,
"step": 100
},
{
"epoch": 0.29190751445086704,
"grad_norm": 5.024451679855302,
"learning_rate": 9.483526747253004e-06,
"loss": 0.1313,
"step": 101
},
{
"epoch": 0.2947976878612717,
"grad_norm": 4.266039616349381,
"learning_rate": 9.473433202606859e-06,
"loss": 0.0923,
"step": 102
},
{
"epoch": 0.2976878612716763,
"grad_norm": 4.948633226230769,
"learning_rate": 9.46324745863023e-06,
"loss": 0.1035,
"step": 103
},
{
"epoch": 0.30057803468208094,
"grad_norm": 4.444955459378445,
"learning_rate": 9.452969725255558e-06,
"loss": 0.1014,
"step": 104
},
{
"epoch": 0.30346820809248554,
"grad_norm": 4.110235690060319,
"learning_rate": 9.442600214311236e-06,
"loss": 0.1158,
"step": 105
},
{
"epoch": 0.3063583815028902,
"grad_norm": 4.086004629721548,
"learning_rate": 9.432139139517222e-06,
"loss": 0.1039,
"step": 106
},
{
"epoch": 0.3092485549132948,
"grad_norm": 3.7533923262128157,
"learning_rate": 9.421586716480645e-06,
"loss": 0.0902,
"step": 107
},
{
"epoch": 0.31213872832369943,
"grad_norm": 3.7885314784566892,
"learning_rate": 9.410943162691359e-06,
"loss": 0.0904,
"step": 108
},
{
"epoch": 0.315028901734104,
"grad_norm": 4.407963100332848,
"learning_rate": 9.400208697517463e-06,
"loss": 0.1011,
"step": 109
},
{
"epoch": 0.3179190751445087,
"grad_norm": 3.3317597063787194,
"learning_rate": 9.389383542200779e-06,
"loss": 0.0793,
"step": 110
},
{
"epoch": 0.3208092485549133,
"grad_norm": 3.6021995713337085,
"learning_rate": 9.378467919852285e-06,
"loss": 0.1021,
"step": 111
},
{
"epoch": 0.3236994219653179,
"grad_norm": 3.060883940108233,
"learning_rate": 9.367462055447528e-06,
"loss": 0.0782,
"step": 112
},
{
"epoch": 0.3265895953757225,
"grad_norm": 3.6322429756958443,
"learning_rate": 9.356366175821977e-06,
"loss": 0.1152,
"step": 113
},
{
"epoch": 0.32947976878612717,
"grad_norm": 3.5862125438438186,
"learning_rate": 9.34518050966636e-06,
"loss": 0.0958,
"step": 114
},
{
"epoch": 0.33236994219653176,
"grad_norm": 3.5665497104723944,
"learning_rate": 9.333905287521933e-06,
"loss": 0.0847,
"step": 115
},
{
"epoch": 0.3352601156069364,
"grad_norm": 4.102595062306423,
"learning_rate": 9.322540741775745e-06,
"loss": 0.1247,
"step": 116
},
{
"epoch": 0.33815028901734107,
"grad_norm": 3.6212682400747638,
"learning_rate": 9.311087106655838e-06,
"loss": 0.1005,
"step": 117
},
{
"epoch": 0.34104046242774566,
"grad_norm": 3.5539806452557445,
"learning_rate": 9.299544618226428e-06,
"loss": 0.0961,
"step": 118
},
{
"epoch": 0.3439306358381503,
"grad_norm": 4.240074088752261,
"learning_rate": 9.287913514383031e-06,
"loss": 0.0975,
"step": 119
},
{
"epoch": 0.3468208092485549,
"grad_norm": 3.065068340901538,
"learning_rate": 9.276194034847565e-06,
"loss": 0.092,
"step": 120
},
{
"epoch": 0.34971098265895956,
"grad_norm": 4.031446686004421,
"learning_rate": 9.26438642116341e-06,
"loss": 0.0945,
"step": 121
},
{
"epoch": 0.35260115606936415,
"grad_norm": 3.489698753275862,
"learning_rate": 9.252490916690422e-06,
"loss": 0.0848,
"step": 122
},
{
"epoch": 0.3554913294797688,
"grad_norm": 3.608823390219841,
"learning_rate": 9.240507766599928e-06,
"loss": 0.0801,
"step": 123
},
{
"epoch": 0.3583815028901734,
"grad_norm": 4.639617732442707,
"learning_rate": 9.228437217869668e-06,
"loss": 0.122,
"step": 124
},
{
"epoch": 0.36127167630057805,
"grad_norm": 3.661308322897671,
"learning_rate": 9.2162795192787e-06,
"loss": 0.0935,
"step": 125
},
{
"epoch": 0.36416184971098264,
"grad_norm": 4.1280140168078665,
"learning_rate": 9.204034921402282e-06,
"loss": 0.1184,
"step": 126
},
{
"epoch": 0.3670520231213873,
"grad_norm": 4.901427060033713,
"learning_rate": 9.191703676606702e-06,
"loss": 0.091,
"step": 127
},
{
"epoch": 0.3699421965317919,
"grad_norm": 3.796410122108623,
"learning_rate": 9.179286039044072e-06,
"loss": 0.1042,
"step": 128
},
{
"epoch": 0.37283236994219654,
"grad_norm": 4.575336333649151,
"learning_rate": 9.166782264647105e-06,
"loss": 0.1111,
"step": 129
},
{
"epoch": 0.37572254335260113,
"grad_norm": 4.177294896339385,
"learning_rate": 9.15419261112382e-06,
"loss": 0.1028,
"step": 130
},
{
"epoch": 0.3786127167630058,
"grad_norm": 3.772689277647484,
"learning_rate": 9.141517337952243e-06,
"loss": 0.0874,
"step": 131
},
{
"epoch": 0.3815028901734104,
"grad_norm": 3.9602527742947404,
"learning_rate": 9.128756706375065e-06,
"loss": 0.0966,
"step": 132
},
{
"epoch": 0.38439306358381503,
"grad_norm": 3.859417212423571,
"learning_rate": 9.115910979394238e-06,
"loss": 0.0921,
"step": 133
},
{
"epoch": 0.3872832369942196,
"grad_norm": 5.107869845200311,
"learning_rate": 9.102980421765575e-06,
"loss": 0.105,
"step": 134
},
{
"epoch": 0.3901734104046243,
"grad_norm": 4.5458122953341125,
"learning_rate": 9.089965299993278e-06,
"loss": 0.1022,
"step": 135
},
{
"epoch": 0.3930635838150289,
"grad_norm": 3.8501993337102327,
"learning_rate": 9.076865882324453e-06,
"loss": 0.0763,
"step": 136
},
{
"epoch": 0.3959537572254335,
"grad_norm": 4.28944054979154,
"learning_rate": 9.063682438743582e-06,
"loss": 0.0964,
"step": 137
},
{
"epoch": 0.3988439306358382,
"grad_norm": 4.587040074279113,
"learning_rate": 9.050415240966953e-06,
"loss": 0.1169,
"step": 138
},
{
"epoch": 0.40173410404624277,
"grad_norm": 4.826886123603603,
"learning_rate": 9.037064562437068e-06,
"loss": 0.1162,
"step": 139
},
{
"epoch": 0.4046242774566474,
"grad_norm": 4.451491462388894,
"learning_rate": 9.023630678316994e-06,
"loss": 0.0869,
"step": 140
},
{
"epoch": 0.407514450867052,
"grad_norm": 4.142756867333979,
"learning_rate": 9.01011386548471e-06,
"loss": 0.1083,
"step": 141
},
{
"epoch": 0.41040462427745666,
"grad_norm": 4.058664610851421,
"learning_rate": 8.996514402527383e-06,
"loss": 0.093,
"step": 142
},
{
"epoch": 0.41329479768786126,
"grad_norm": 3.9517689050770874,
"learning_rate": 8.982832569735635e-06,
"loss": 0.1049,
"step": 143
},
{
"epoch": 0.4161849710982659,
"grad_norm": 4.3831397108335075,
"learning_rate": 8.969068649097766e-06,
"loss": 0.0973,
"step": 144
},
{
"epoch": 0.4190751445086705,
"grad_norm": 3.3204794647179052,
"learning_rate": 8.955222924293943e-06,
"loss": 0.0893,
"step": 145
},
{
"epoch": 0.42196531791907516,
"grad_norm": 3.9958981086720726,
"learning_rate": 8.941295680690347e-06,
"loss": 0.0937,
"step": 146
},
{
"epoch": 0.42485549132947975,
"grad_norm": 3.560039411695779,
"learning_rate": 8.9272872053333e-06,
"loss": 0.0922,
"step": 147
},
{
"epoch": 0.4277456647398844,
"grad_norm": 3.866129097726797,
"learning_rate": 8.913197786943335e-06,
"loss": 0.0871,
"step": 148
},
{
"epoch": 0.430635838150289,
"grad_norm": 3.9338404603221293,
"learning_rate": 8.89902771590927e-06,
"loss": 0.1027,
"step": 149
},
{
"epoch": 0.43352601156069365,
"grad_norm": 4.362521573474016,
"learning_rate": 8.884777284282193e-06,
"loss": 0.0956,
"step": 150
},
{
"epoch": 0.43641618497109824,
"grad_norm": 3.945559735382797,
"learning_rate": 8.870446785769468e-06,
"loss": 0.0991,
"step": 151
},
{
"epoch": 0.4393063583815029,
"grad_norm": 3.180223515345157,
"learning_rate": 8.856036515728666e-06,
"loss": 0.0909,
"step": 152
},
{
"epoch": 0.4421965317919075,
"grad_norm": 3.7371844752520365,
"learning_rate": 8.84154677116148e-06,
"loss": 0.1071,
"step": 153
},
{
"epoch": 0.44508670520231214,
"grad_norm": 4.754557509579758,
"learning_rate": 8.826977850707612e-06,
"loss": 0.1067,
"step": 154
},
{
"epoch": 0.4479768786127168,
"grad_norm": 4.678190152571308,
"learning_rate": 8.812330054638611e-06,
"loss": 0.1177,
"step": 155
},
{
"epoch": 0.4508670520231214,
"grad_norm": 3.2121865098638027,
"learning_rate": 8.797603684851685e-06,
"loss": 0.0874,
"step": 156
},
{
"epoch": 0.45375722543352603,
"grad_norm": 3.17679106481082,
"learning_rate": 8.782799044863475e-06,
"loss": 0.0864,
"step": 157
},
{
"epoch": 0.45664739884393063,
"grad_norm": 3.2522042838865213,
"learning_rate": 8.767916439803808e-06,
"loss": 0.0916,
"step": 158
},
{
"epoch": 0.4595375722543353,
"grad_norm": 3.216689798967245,
"learning_rate": 8.752956176409404e-06,
"loss": 0.0893,
"step": 159
},
{
"epoch": 0.4624277456647399,
"grad_norm": 3.8865315971560404,
"learning_rate": 8.737918563017553e-06,
"loss": 0.0963,
"step": 160
},
{
"epoch": 0.4653179190751445,
"grad_norm": 3.563126634285472,
"learning_rate": 8.722803909559758e-06,
"loss": 0.1016,
"step": 161
},
{
"epoch": 0.4682080924855491,
"grad_norm": 6.544416998936724,
"learning_rate": 8.707612527555356e-06,
"loss": 0.1252,
"step": 162
},
{
"epoch": 0.47109826589595377,
"grad_norm": 3.9078238788443995,
"learning_rate": 8.692344730105084e-06,
"loss": 0.1006,
"step": 163
},
{
"epoch": 0.47398843930635837,
"grad_norm": 3.465202393031836,
"learning_rate": 8.677000831884639e-06,
"loss": 0.0974,
"step": 164
},
{
"epoch": 0.476878612716763,
"grad_norm": 3.4753816819973915,
"learning_rate": 8.661581149138185e-06,
"loss": 0.0859,
"step": 165
},
{
"epoch": 0.4797687861271676,
"grad_norm": 4.370748869146705,
"learning_rate": 8.646085999671838e-06,
"loss": 0.1195,
"step": 166
},
{
"epoch": 0.48265895953757226,
"grad_norm": 4.025212498814417,
"learning_rate": 8.630515702847109e-06,
"loss": 0.0968,
"step": 167
},
{
"epoch": 0.48554913294797686,
"grad_norm": 4.497291267391845,
"learning_rate": 8.614870579574338e-06,
"loss": 0.1209,
"step": 168
},
{
"epoch": 0.4884393063583815,
"grad_norm": 3.4760700610759536,
"learning_rate": 8.599150952306058e-06,
"loss": 0.0977,
"step": 169
},
{
"epoch": 0.4913294797687861,
"grad_norm": 3.4995292892009386,
"learning_rate": 8.58335714503037e-06,
"loss": 0.0972,
"step": 170
},
{
"epoch": 0.49421965317919075,
"grad_norm": 4.073017918693432,
"learning_rate": 8.567489483264247e-06,
"loss": 0.1049,
"step": 171
},
{
"epoch": 0.49710982658959535,
"grad_norm": 3.8679218357162233,
"learning_rate": 8.551548294046843e-06,
"loss": 0.1094,
"step": 172
},
{
"epoch": 0.5,
"grad_norm": 3.7071156064360493,
"learning_rate": 8.535533905932739e-06,
"loss": 0.0883,
"step": 173
},
{
"epoch": 0.5028901734104047,
"grad_norm": 4.851312118928478,
"learning_rate": 8.519446648985173e-06,
"loss": 0.1139,
"step": 174
},
{
"epoch": 0.5057803468208093,
"grad_norm": 5.218623175111435,
"learning_rate": 8.503286854769247e-06,
"loss": 0.0963,
"step": 175
},
{
"epoch": 0.5086705202312138,
"grad_norm": 4.199494517228212,
"learning_rate": 8.487054856345081e-06,
"loss": 0.1028,
"step": 176
},
{
"epoch": 0.5115606936416185,
"grad_norm": 3.7907330310051166,
"learning_rate": 8.470750988260956e-06,
"loss": 0.0986,
"step": 177
},
{
"epoch": 0.5144508670520231,
"grad_norm": 3.4270783123645487,
"learning_rate": 8.454375586546418e-06,
"loss": 0.0905,
"step": 178
},
{
"epoch": 0.5173410404624278,
"grad_norm": 4.290100611914985,
"learning_rate": 8.437928988705346e-06,
"loss": 0.1138,
"step": 179
},
{
"epoch": 0.5202312138728323,
"grad_norm": 4.172228203124501,
"learning_rate": 8.42141153370901e-06,
"loss": 0.1122,
"step": 180
},
{
"epoch": 0.523121387283237,
"grad_norm": 3.1661691393240514,
"learning_rate": 8.404823561989063e-06,
"loss": 0.0659,
"step": 181
},
{
"epoch": 0.5260115606936416,
"grad_norm": 3.606488236503898,
"learning_rate": 8.388165415430551e-06,
"loss": 0.0967,
"step": 182
},
{
"epoch": 0.5289017341040463,
"grad_norm": 4.0351274821452225,
"learning_rate": 8.371437437364844e-06,
"loss": 0.1112,
"step": 183
},
{
"epoch": 0.5317919075144508,
"grad_norm": 3.578234343255972,
"learning_rate": 8.35463997256257e-06,
"loss": 0.0879,
"step": 184
},
{
"epoch": 0.5346820809248555,
"grad_norm": 2.881061742935368,
"learning_rate": 8.337773367226509e-06,
"loss": 0.0777,
"step": 185
},
{
"epoch": 0.5375722543352601,
"grad_norm": 3.522910118232901,
"learning_rate": 8.320837968984456e-06,
"loss": 0.0919,
"step": 186
},
{
"epoch": 0.5404624277456648,
"grad_norm": 3.7797177948481506,
"learning_rate": 8.303834126882056e-06,
"loss": 0.0948,
"step": 187
},
{
"epoch": 0.5433526011560693,
"grad_norm": 3.42150444228012,
"learning_rate": 8.28676219137561e-06,
"loss": 0.0842,
"step": 188
},
{
"epoch": 0.546242774566474,
"grad_norm": 3.536980226076555,
"learning_rate": 8.269622514324856e-06,
"loss": 0.0799,
"step": 189
},
{
"epoch": 0.5491329479768786,
"grad_norm": 4.0285830647679886,
"learning_rate": 8.25241544898571e-06,
"loss": 0.1136,
"step": 190
},
{
"epoch": 0.5520231213872833,
"grad_norm": 3.6610676809643143,
"learning_rate": 8.23514135000299e-06,
"loss": 0.0897,
"step": 191
},
{
"epoch": 0.5549132947976878,
"grad_norm": 3.848269323679635,
"learning_rate": 8.217800573403105e-06,
"loss": 0.0968,
"step": 192
},
{
"epoch": 0.5578034682080925,
"grad_norm": 3.167566563939419,
"learning_rate": 8.20039347658672e-06,
"loss": 0.0829,
"step": 193
},
{
"epoch": 0.5606936416184971,
"grad_norm": 3.837712170703205,
"learning_rate": 8.18292041832138e-06,
"loss": 0.0955,
"step": 194
},
{
"epoch": 0.5635838150289018,
"grad_norm": 4.098712711661327,
"learning_rate": 8.165381758734134e-06,
"loss": 0.1147,
"step": 195
},
{
"epoch": 0.5664739884393064,
"grad_norm": 3.6241971235179102,
"learning_rate": 8.147777859304095e-06,
"loss": 0.1017,
"step": 196
},
{
"epoch": 0.569364161849711,
"grad_norm": 3.830171016987476,
"learning_rate": 8.130109082854998e-06,
"loss": 0.0945,
"step": 197
},
{
"epoch": 0.5722543352601156,
"grad_norm": 3.578104803720523,
"learning_rate": 8.112375793547718e-06,
"loss": 0.0893,
"step": 198
},
{
"epoch": 0.5751445086705202,
"grad_norm": 4.371585616818501,
"learning_rate": 8.09457835687277e-06,
"loss": 0.0933,
"step": 199
},
{
"epoch": 0.5780346820809249,
"grad_norm": 3.909778241746307,
"learning_rate": 8.076717139642775e-06,
"loss": 0.11,
"step": 200
},
{
"epoch": 0.5780346820809249,
"eval_loss": 0.09941592067480087,
"eval_runtime": 0.9324,
"eval_samples_per_second": 30.029,
"eval_steps_per_second": 7.507,
"step": 200
},
{
"epoch": 0.5809248554913294,
"grad_norm": 3.935272505639913,
"learning_rate": 8.058792509984893e-06,
"loss": 0.1024,
"step": 201
},
{
"epoch": 0.5838150289017341,
"grad_norm": 4.654720715350939,
"learning_rate": 8.040804837333243e-06,
"loss": 0.1019,
"step": 202
},
{
"epoch": 0.5867052023121387,
"grad_norm": 4.277556402582181,
"learning_rate": 8.022754492421284e-06,
"loss": 0.1083,
"step": 203
},
{
"epoch": 0.5895953757225434,
"grad_norm": 3.9171978917704458,
"learning_rate": 8.004641847274182e-06,
"loss": 0.1078,
"step": 204
},
{
"epoch": 0.5924855491329479,
"grad_norm": 3.5396394939537963,
"learning_rate": 7.986467275201135e-06,
"loss": 0.0841,
"step": 205
},
{
"epoch": 0.5953757225433526,
"grad_norm": 3.3936510531339703,
"learning_rate": 7.968231150787674e-06,
"loss": 0.092,
"step": 206
},
{
"epoch": 0.5982658959537572,
"grad_norm": 3.5159415187918794,
"learning_rate": 7.949933849887963e-06,
"loss": 0.0827,
"step": 207
},
{
"epoch": 0.6011560693641619,
"grad_norm": 3.6681764621931303,
"learning_rate": 7.931575749617027e-06,
"loss": 0.0665,
"step": 208
},
{
"epoch": 0.6040462427745664,
"grad_norm": 3.6083828563308042,
"learning_rate": 7.913157228342994e-06,
"loss": 0.106,
"step": 209
},
{
"epoch": 0.6069364161849711,
"grad_norm": 4.2477637082450315,
"learning_rate": 7.894678665679298e-06,
"loss": 0.1117,
"step": 210
},
{
"epoch": 0.6098265895953757,
"grad_norm": 4.6492747986553695,
"learning_rate": 7.876140442476847e-06,
"loss": 0.1062,
"step": 211
},
{
"epoch": 0.6127167630057804,
"grad_norm": 3.647527727939632,
"learning_rate": 7.857542940816183e-06,
"loss": 0.0853,
"step": 212
},
{
"epoch": 0.615606936416185,
"grad_norm": 3.93578958304064,
"learning_rate": 7.838886543999596e-06,
"loss": 0.0936,
"step": 213
},
{
"epoch": 0.6184971098265896,
"grad_norm": 4.072364355852493,
"learning_rate": 7.820171636543233e-06,
"loss": 0.0806,
"step": 214
},
{
"epoch": 0.6213872832369942,
"grad_norm": 4.441885738921091,
"learning_rate": 7.80139860416917e-06,
"loss": 0.1067,
"step": 215
},
{
"epoch": 0.6242774566473989,
"grad_norm": 4.310946247204338,
"learning_rate": 7.782567833797458e-06,
"loss": 0.1195,
"step": 216
},
{
"epoch": 0.6271676300578035,
"grad_norm": 3.7113356442945116,
"learning_rate": 7.763679713538158e-06,
"loss": 0.0826,
"step": 217
},
{
"epoch": 0.630057803468208,
"grad_norm": 4.057810111847526,
"learning_rate": 7.744734632683332e-06,
"loss": 0.0739,
"step": 218
},
{
"epoch": 0.6329479768786127,
"grad_norm": 3.9634738814885324,
"learning_rate": 7.725732981699028e-06,
"loss": 0.1089,
"step": 219
},
{
"epoch": 0.6358381502890174,
"grad_norm": 4.185189494202386,
"learning_rate": 7.70667515221722e-06,
"loss": 0.1013,
"step": 220
},
{
"epoch": 0.638728323699422,
"grad_norm": 4.1930722895881205,
"learning_rate": 7.687561537027754e-06,
"loss": 0.0989,
"step": 221
},
{
"epoch": 0.6416184971098265,
"grad_norm": 3.4917142430832544,
"learning_rate": 7.668392530070238e-06,
"loss": 0.0901,
"step": 222
},
{
"epoch": 0.6445086705202312,
"grad_norm": 4.746355802069406,
"learning_rate": 7.649168526425924e-06,
"loss": 0.1189,
"step": 223
},
{
"epoch": 0.6473988439306358,
"grad_norm": 3.8007941418147455,
"learning_rate": 7.629889922309576e-06,
"loss": 0.1021,
"step": 224
},
{
"epoch": 0.6502890173410405,
"grad_norm": 3.7915979937662794,
"learning_rate": 7.610557115061292e-06,
"loss": 0.0942,
"step": 225
},
{
"epoch": 0.653179190751445,
"grad_norm": 3.4975651641815757,
"learning_rate": 7.5911705031383235e-06,
"loss": 0.0758,
"step": 226
},
{
"epoch": 0.6560693641618497,
"grad_norm": 3.8976041216491972,
"learning_rate": 7.571730486106849e-06,
"loss": 0.0966,
"step": 227
},
{
"epoch": 0.6589595375722543,
"grad_norm": 3.5215116107725346,
"learning_rate": 7.55223746463376e-06,
"loss": 0.0868,
"step": 228
},
{
"epoch": 0.661849710982659,
"grad_norm": 3.217072576489954,
"learning_rate": 7.532691840478388e-06,
"loss": 0.0848,
"step": 229
},
{
"epoch": 0.6647398843930635,
"grad_norm": 3.4340150681487556,
"learning_rate": 7.513094016484225e-06,
"loss": 0.0977,
"step": 230
},
{
"epoch": 0.6676300578034682,
"grad_norm": 3.69714853322805,
"learning_rate": 7.493444396570625e-06,
"loss": 0.0865,
"step": 231
},
{
"epoch": 0.6705202312138728,
"grad_norm": 4.042244071122905,
"learning_rate": 7.473743385724478e-06,
"loss": 0.1144,
"step": 232
},
{
"epoch": 0.6734104046242775,
"grad_norm": 3.518643461499892,
"learning_rate": 7.453991389991864e-06,
"loss": 0.0772,
"step": 233
},
{
"epoch": 0.6763005780346821,
"grad_norm": 3.615509903388122,
"learning_rate": 7.434188816469681e-06,
"loss": 0.0926,
"step": 234
},
{
"epoch": 0.6791907514450867,
"grad_norm": 3.6364313649144626,
"learning_rate": 7.414336073297255e-06,
"loss": 0.0861,
"step": 235
},
{
"epoch": 0.6820809248554913,
"grad_norm": 3.1761480310047863,
"learning_rate": 7.394433569647935e-06,
"loss": 0.0858,
"step": 236
},
{
"epoch": 0.684971098265896,
"grad_norm": 3.284548753029961,
"learning_rate": 7.374481715720647e-06,
"loss": 0.086,
"step": 237
},
{
"epoch": 0.6878612716763006,
"grad_norm": 3.3199374348167265,
"learning_rate": 7.354480922731454e-06,
"loss": 0.0788,
"step": 238
},
{
"epoch": 0.6907514450867052,
"grad_norm": 3.8232040800857936,
"learning_rate": 7.334431602905068e-06,
"loss": 0.0829,
"step": 239
},
{
"epoch": 0.6936416184971098,
"grad_norm": 3.989063308502132,
"learning_rate": 7.3143341694663604e-06,
"loss": 0.0951,
"step": 240
},
{
"epoch": 0.6965317919075145,
"grad_norm": 3.598886651406255,
"learning_rate": 7.294189036631847e-06,
"loss": 0.0975,
"step": 241
},
{
"epoch": 0.6994219653179191,
"grad_norm": 3.790070882387216,
"learning_rate": 7.273996619601146e-06,
"loss": 0.0916,
"step": 242
},
{
"epoch": 0.7023121387283237,
"grad_norm": 3.301945711159591,
"learning_rate": 7.253757334548424e-06,
"loss": 0.0873,
"step": 243
},
{
"epoch": 0.7052023121387283,
"grad_norm": 3.368321053717709,
"learning_rate": 7.233471598613815e-06,
"loss": 0.0881,
"step": 244
},
{
"epoch": 0.708092485549133,
"grad_norm": 4.69106656832974,
"learning_rate": 7.213139829894826e-06,
"loss": 0.0953,
"step": 245
},
{
"epoch": 0.7109826589595376,
"grad_norm": 3.7249528749087997,
"learning_rate": 7.192762447437722e-06,
"loss": 0.0791,
"step": 246
},
{
"epoch": 0.7138728323699421,
"grad_norm": 4.7512868447958425,
"learning_rate": 7.17233987122888e-06,
"loss": 0.1132,
"step": 247
},
{
"epoch": 0.7167630057803468,
"grad_norm": 3.9162600055471595,
"learning_rate": 7.151872522186147e-06,
"loss": 0.0914,
"step": 248
},
{
"epoch": 0.7196531791907514,
"grad_norm": 3.015385891196794,
"learning_rate": 7.131360822150147e-06,
"loss": 0.064,
"step": 249
},
{
"epoch": 0.7225433526011561,
"grad_norm": 3.525616281685469,
"learning_rate": 7.110805193875607e-06,
"loss": 0.0847,
"step": 250
},
{
"epoch": 0.7254335260115607,
"grad_norm": 3.7840741766879997,
"learning_rate": 7.090206061022628e-06,
"loss": 0.0892,
"step": 251
},
{
"epoch": 0.7283236994219653,
"grad_norm": 3.005699776026508,
"learning_rate": 7.0695638481479565e-06,
"loss": 0.0813,
"step": 252
},
{
"epoch": 0.7312138728323699,
"grad_norm": 4.468302540719233,
"learning_rate": 7.048878980696241e-06,
"loss": 0.0923,
"step": 253
},
{
"epoch": 0.7341040462427746,
"grad_norm": 3.970063283349485,
"learning_rate": 7.028151884991254e-06,
"loss": 0.0887,
"step": 254
},
{
"epoch": 0.7369942196531792,
"grad_norm": 3.748663847925155,
"learning_rate": 7.007382988227116e-06,
"loss": 0.0769,
"step": 255
},
{
"epoch": 0.7398843930635838,
"grad_norm": 3.2014963138639123,
"learning_rate": 6.986572718459479e-06,
"loss": 0.0746,
"step": 256
},
{
"epoch": 0.7427745664739884,
"grad_norm": 3.8984665123074347,
"learning_rate": 6.965721504596712e-06,
"loss": 0.0839,
"step": 257
},
{
"epoch": 0.7456647398843931,
"grad_norm": 4.449372024694936,
"learning_rate": 6.94482977639106e-06,
"loss": 0.1147,
"step": 258
},
{
"epoch": 0.7485549132947977,
"grad_norm": 3.5860135752034252,
"learning_rate": 6.923897964429784e-06,
"loss": 0.0987,
"step": 259
},
{
"epoch": 0.7514450867052023,
"grad_norm": 3.9680857005528023,
"learning_rate": 6.902926500126292e-06,
"loss": 0.0952,
"step": 260
},
{
"epoch": 0.7543352601156069,
"grad_norm": 3.5008251567033857,
"learning_rate": 6.881915815711235e-06,
"loss": 0.0935,
"step": 261
},
{
"epoch": 0.7572254335260116,
"grad_norm": 3.5150218688906354,
"learning_rate": 6.8608663442236156e-06,
"loss": 0.0913,
"step": 262
},
{
"epoch": 0.7601156069364162,
"grad_norm": 3.4487002898741763,
"learning_rate": 6.839778519501848e-06,
"loss": 0.0827,
"step": 263
},
{
"epoch": 0.7630057803468208,
"grad_norm": 4.109991878752816,
"learning_rate": 6.818652776174828e-06,
"loss": 0.0861,
"step": 264
},
{
"epoch": 0.7658959537572254,
"grad_norm": 3.6854040146116307,
"learning_rate": 6.797489549652965e-06,
"loss": 0.0848,
"step": 265
},
{
"epoch": 0.7687861271676301,
"grad_norm": 4.48017404399493,
"learning_rate": 6.776289276119214e-06,
"loss": 0.1077,
"step": 266
},
{
"epoch": 0.7716763005780347,
"grad_norm": 3.5935256781587985,
"learning_rate": 6.7550523925200876e-06,
"loss": 0.0836,
"step": 267
},
{
"epoch": 0.7745664739884393,
"grad_norm": 3.949250387337809,
"learning_rate": 6.733779336556643e-06,
"loss": 0.082,
"step": 268
},
{
"epoch": 0.7774566473988439,
"grad_norm": 3.394592216776318,
"learning_rate": 6.712470546675467e-06,
"loss": 0.0714,
"step": 269
},
{
"epoch": 0.7803468208092486,
"grad_norm": 4.50351141769006,
"learning_rate": 6.691126462059636e-06,
"loss": 0.098,
"step": 270
},
{
"epoch": 0.7832369942196532,
"grad_norm": 3.891769583944111,
"learning_rate": 6.669747522619668e-06,
"loss": 0.097,
"step": 271
},
{
"epoch": 0.7861271676300579,
"grad_norm": 3.5621684716248687,
"learning_rate": 6.648334168984452e-06,
"loss": 0.0808,
"step": 272
},
{
"epoch": 0.7890173410404624,
"grad_norm": 3.4588584589807474,
"learning_rate": 6.626886842492168e-06,
"loss": 0.0892,
"step": 273
},
{
"epoch": 0.791907514450867,
"grad_norm": 3.7827135135353647,
"learning_rate": 6.60540598518119e-06,
"loss": 0.0883,
"step": 274
},
{
"epoch": 0.7947976878612717,
"grad_norm": 3.417524992238082,
"learning_rate": 6.583892039780979e-06,
"loss": 0.0845,
"step": 275
},
{
"epoch": 0.7976878612716763,
"grad_norm": 3.956403018811007,
"learning_rate": 6.562345449702952e-06,
"loss": 0.0866,
"step": 276
},
{
"epoch": 0.8005780346820809,
"grad_norm": 3.467743823738221,
"learning_rate": 6.540766659031348e-06,
"loss": 0.085,
"step": 277
},
{
"epoch": 0.8034682080924855,
"grad_norm": 2.9058940217436264,
"learning_rate": 6.519156112514074e-06,
"loss": 0.0622,
"step": 278
},
{
"epoch": 0.8063583815028902,
"grad_norm": 3.5477458403494913,
"learning_rate": 6.497514255553538e-06,
"loss": 0.0852,
"step": 279
},
{
"epoch": 0.8092485549132948,
"grad_norm": 3.3180443483421684,
"learning_rate": 6.4758415341974705e-06,
"loss": 0.0813,
"step": 280
},
{
"epoch": 0.8121387283236994,
"grad_norm": 3.6388404590362704,
"learning_rate": 6.454138395129727e-06,
"loss": 0.0771,
"step": 281
},
{
"epoch": 0.815028901734104,
"grad_norm": 4.510025043476393,
"learning_rate": 6.432405285661087e-06,
"loss": 0.1043,
"step": 282
},
{
"epoch": 0.8179190751445087,
"grad_norm": 3.7069972674864777,
"learning_rate": 6.410642653720033e-06,
"loss": 0.0955,
"step": 283
},
{
"epoch": 0.8208092485549133,
"grad_norm": 3.762263730027283,
"learning_rate": 6.388850947843517e-06,
"loss": 0.1028,
"step": 284
},
{
"epoch": 0.8236994219653179,
"grad_norm": 3.8877343897591334,
"learning_rate": 6.367030617167717e-06,
"loss": 0.0934,
"step": 285
},
{
"epoch": 0.8265895953757225,
"grad_norm": 4.226473140728079,
"learning_rate": 6.345182111418781e-06,
"loss": 0.0918,
"step": 286
},
{
"epoch": 0.8294797687861272,
"grad_norm": 3.621018489335686,
"learning_rate": 6.323305880903555e-06,
"loss": 0.0836,
"step": 287
},
{
"epoch": 0.8323699421965318,
"grad_norm": 3.8331628689604775,
"learning_rate": 6.301402376500306e-06,
"loss": 0.0924,
"step": 288
},
{
"epoch": 0.8352601156069365,
"grad_norm": 3.940039042927964,
"learning_rate": 6.279472049649426e-06,
"loss": 0.1116,
"step": 289
},
{
"epoch": 0.838150289017341,
"grad_norm": 4.441094589237161,
"learning_rate": 6.257515352344131e-06,
"loss": 0.1003,
"step": 290
},
{
"epoch": 0.8410404624277457,
"grad_norm": 3.6897875138985903,
"learning_rate": 6.2355327371211404e-06,
"loss": 0.0877,
"step": 291
},
{
"epoch": 0.8439306358381503,
"grad_norm": 2.9510789132394253,
"learning_rate": 6.213524657051354e-06,
"loss": 0.0762,
"step": 292
},
{
"epoch": 0.846820809248555,
"grad_norm": 4.173200678288769,
"learning_rate": 6.191491565730512e-06,
"loss": 0.0981,
"step": 293
},
{
"epoch": 0.8497109826589595,
"grad_norm": 3.591972918555686,
"learning_rate": 6.16943391726985e-06,
"loss": 0.0768,
"step": 294
},
{
"epoch": 0.8526011560693642,
"grad_norm": 3.636329925569424,
"learning_rate": 6.147352166286731e-06,
"loss": 0.0865,
"step": 295
},
{
"epoch": 0.8554913294797688,
"grad_norm": 3.643462204481586,
"learning_rate": 6.125246767895287e-06,
"loss": 0.0889,
"step": 296
},
{
"epoch": 0.8583815028901735,
"grad_norm": 3.563652514109132,
"learning_rate": 6.103118177697027e-06,
"loss": 0.0793,
"step": 297
},
{
"epoch": 0.861271676300578,
"grad_norm": 3.781227516799238,
"learning_rate": 6.0809668517714615e-06,
"loss": 0.0891,
"step": 298
},
{
"epoch": 0.8641618497109826,
"grad_norm": 3.512843215775068,
"learning_rate": 6.0587932466666825e-06,
"loss": 0.0691,
"step": 299
},
{
"epoch": 0.8670520231213873,
"grad_norm": 3.505371533644849,
"learning_rate": 6.036597819389972e-06,
"loss": 0.0796,
"step": 300
},
{
"epoch": 0.869942196531792,
"grad_norm": 3.654392241450492,
"learning_rate": 6.014381027398379e-06,
"loss": 0.0825,
"step": 301
},
{
"epoch": 0.8728323699421965,
"grad_norm": 3.5879948178102854,
"learning_rate": 5.992143328589282e-06,
"loss": 0.0855,
"step": 302
},
{
"epoch": 0.8757225433526011,
"grad_norm": 4.276864949222476,
"learning_rate": 5.96988518129096e-06,
"loss": 0.0989,
"step": 303
},
{
"epoch": 0.8786127167630058,
"grad_norm": 3.680706587097863,
"learning_rate": 5.947607044253142e-06,
"loss": 0.0877,
"step": 304
},
{
"epoch": 0.8815028901734104,
"grad_norm": 3.1181713444360697,
"learning_rate": 5.92530937663756e-06,
"loss": 0.0782,
"step": 305
},
{
"epoch": 0.884393063583815,
"grad_norm": 3.6130639841107275,
"learning_rate": 5.902992638008475e-06,
"loss": 0.0773,
"step": 306
},
{
"epoch": 0.8872832369942196,
"grad_norm": 2.9053534072517544,
"learning_rate": 5.880657288323207e-06,
"loss": 0.0674,
"step": 307
},
{
"epoch": 0.8901734104046243,
"grad_norm": 4.138796949543857,
"learning_rate": 5.858303787922663e-06,
"loss": 0.0937,
"step": 308
},
{
"epoch": 0.8930635838150289,
"grad_norm": 4.554927322190945,
"learning_rate": 5.835932597521839e-06,
"loss": 0.0887,
"step": 309
},
{
"epoch": 0.8959537572254336,
"grad_norm": 4.130255984143273,
"learning_rate": 5.8135441782003354e-06,
"loss": 0.0915,
"step": 310
},
{
"epoch": 0.8988439306358381,
"grad_norm": 3.703954373509824,
"learning_rate": 5.791138991392843e-06,
"loss": 0.0754,
"step": 311
},
{
"epoch": 0.9017341040462428,
"grad_norm": 4.873227420557636,
"learning_rate": 5.768717498879635e-06,
"loss": 0.1212,
"step": 312
},
{
"epoch": 0.9046242774566474,
"grad_norm": 3.2713591661227936,
"learning_rate": 5.746280162777061e-06,
"loss": 0.0843,
"step": 313
},
{
"epoch": 0.9075144508670521,
"grad_norm": 3.9663264457968803,
"learning_rate": 5.723827445528003e-06,
"loss": 0.0763,
"step": 314
},
{
"epoch": 0.9104046242774566,
"grad_norm": 4.199805628532218,
"learning_rate": 5.701359809892367e-06,
"loss": 0.1101,
"step": 315
},
{
"epoch": 0.9132947976878613,
"grad_norm": 3.7370869573089895,
"learning_rate": 5.67887771893752e-06,
"loss": 0.0752,
"step": 316
},
{
"epoch": 0.9161849710982659,
"grad_norm": 3.182039257660417,
"learning_rate": 5.656381636028769e-06,
"loss": 0.0812,
"step": 317
},
{
"epoch": 0.9190751445086706,
"grad_norm": 3.3708995105429858,
"learning_rate": 5.633872024819796e-06,
"loss": 0.0726,
"step": 318
},
{
"epoch": 0.9219653179190751,
"grad_norm": 3.384533244992893,
"learning_rate": 5.6113493492431105e-06,
"loss": 0.0628,
"step": 319
},
{
"epoch": 0.9248554913294798,
"grad_norm": 3.3156425908108105,
"learning_rate": 5.588814073500481e-06,
"loss": 0.0733,
"step": 320
},
{
"epoch": 0.9277456647398844,
"grad_norm": 3.638157508219399,
"learning_rate": 5.56626666205337e-06,
"loss": 0.0764,
"step": 321
},
{
"epoch": 0.930635838150289,
"grad_norm": 3.3598544316213013,
"learning_rate": 5.543707579613367e-06,
"loss": 0.084,
"step": 322
},
{
"epoch": 0.9335260115606936,
"grad_norm": 3.959366527529141,
"learning_rate": 5.5211372911326e-06,
"loss": 0.0854,
"step": 323
},
{
"epoch": 0.9364161849710982,
"grad_norm": 3.81567718746474,
"learning_rate": 5.498556261794161e-06,
"loss": 0.0898,
"step": 324
},
{
"epoch": 0.9393063583815029,
"grad_norm": 3.32242644124862,
"learning_rate": 5.475964957002516e-06,
"loss": 0.0858,
"step": 325
},
{
"epoch": 0.9421965317919075,
"grad_norm": 4.384037573727284,
"learning_rate": 5.45336384237391e-06,
"loss": 0.0929,
"step": 326
},
{
"epoch": 0.9450867052023122,
"grad_norm": 3.0790304811954656,
"learning_rate": 5.430753383726776e-06,
"loss": 0.0773,
"step": 327
},
{
"epoch": 0.9479768786127167,
"grad_norm": 3.4124598879380987,
"learning_rate": 5.4081340470721286e-06,
"loss": 0.0797,
"step": 328
},
{
"epoch": 0.9508670520231214,
"grad_norm": 4.1220306324446785,
"learning_rate": 5.385506298603962e-06,
"loss": 0.0907,
"step": 329
},
{
"epoch": 0.953757225433526,
"grad_norm": 3.3556166825004814,
"learning_rate": 5.362870604689643e-06,
"loss": 0.0771,
"step": 330
},
{
"epoch": 0.9566473988439307,
"grad_norm": 3.2419310606822345,
"learning_rate": 5.340227431860295e-06,
"loss": 0.0684,
"step": 331
},
{
"epoch": 0.9595375722543352,
"grad_norm": 3.655597904932805,
"learning_rate": 5.31757724680119e-06,
"loss": 0.0846,
"step": 332
},
{
"epoch": 0.9624277456647399,
"grad_norm": 4.960497588181285,
"learning_rate": 5.294920516342117e-06,
"loss": 0.0808,
"step": 333
},
{
"epoch": 0.9653179190751445,
"grad_norm": 3.185250487634402,
"learning_rate": 5.272257707447776e-06,
"loss": 0.0813,
"step": 334
},
{
"epoch": 0.9682080924855492,
"grad_norm": 4.274523790048614,
"learning_rate": 5.24958928720814e-06,
"loss": 0.0979,
"step": 335
},
{
"epoch": 0.9710982658959537,
"grad_norm": 3.4623164735925385,
"learning_rate": 5.22691572282884e-06,
"loss": 0.0831,
"step": 336
},
{
"epoch": 0.9739884393063584,
"grad_norm": 4.255813187391108,
"learning_rate": 5.2042374816215265e-06,
"loss": 0.0855,
"step": 337
},
{
"epoch": 0.976878612716763,
"grad_norm": 3.6733563161934204,
"learning_rate": 5.18155503099424e-06,
"loss": 0.0924,
"step": 338
},
{
"epoch": 0.9797687861271677,
"grad_norm": 3.767726660475069,
"learning_rate": 5.1588688384417816e-06,
"loss": 0.0845,
"step": 339
},
{
"epoch": 0.9826589595375722,
"grad_norm": 3.3855145820664743,
"learning_rate": 5.136179371536076e-06,
"loss": 0.0793,
"step": 340
},
{
"epoch": 0.9855491329479769,
"grad_norm": 3.408221038715105,
"learning_rate": 5.113487097916531e-06,
"loss": 0.0632,
"step": 341
},
{
"epoch": 0.9884393063583815,
"grad_norm": 3.3541043863328843,
"learning_rate": 5.090792485280401e-06,
"loss": 0.0717,
"step": 342
},
{
"epoch": 0.9913294797687862,
"grad_norm": 3.8506033223008624,
"learning_rate": 5.068096001373152e-06,
"loss": 0.0862,
"step": 343
},
{
"epoch": 0.9942196531791907,
"grad_norm": 3.478313323623953,
"learning_rate": 5.045398113978816e-06,
"loss": 0.0682,
"step": 344
},
{
"epoch": 0.9971098265895953,
"grad_norm": 3.5967316347838914,
"learning_rate": 5.022699290910351e-06,
"loss": 0.0864,
"step": 345
},
{
"epoch": 1.0,
"grad_norm": 4.094478069223065,
"learning_rate": 5e-06,
"loss": 0.0859,
"step": 346
},
{
"epoch": 1.0028901734104045,
"grad_norm": 3.191994252444766,
"learning_rate": 4.9773007090896505e-06,
"loss": 0.059,
"step": 347
},
{
"epoch": 1.0057803468208093,
"grad_norm": 2.592704679368971,
"learning_rate": 4.9546018860211845e-06,
"loss": 0.046,
"step": 348
},
{
"epoch": 1.0086705202312138,
"grad_norm": 2.4513026804496008,
"learning_rate": 4.931903998626851e-06,
"loss": 0.0386,
"step": 349
},
{
"epoch": 1.0115606936416186,
"grad_norm": 2.3175335938744572,
"learning_rate": 4.9092075147196005e-06,
"loss": 0.0282,
"step": 350
},
{
"epoch": 1.0144508670520231,
"grad_norm": 2.573974359707393,
"learning_rate": 4.886512902083471e-06,
"loss": 0.0388,
"step": 351
},
{
"epoch": 1.0173410404624277,
"grad_norm": 2.9332611203985923,
"learning_rate": 4.863820628463925e-06,
"loss": 0.0418,
"step": 352
},
{
"epoch": 1.0202312138728324,
"grad_norm": 1.973596205721184,
"learning_rate": 4.8411311615582176e-06,
"loss": 0.0295,
"step": 353
},
{
"epoch": 1.023121387283237,
"grad_norm": 1.8137644492843443,
"learning_rate": 4.818444969005762e-06,
"loss": 0.0259,
"step": 354
},
{
"epoch": 1.0260115606936415,
"grad_norm": 2.654278918599863,
"learning_rate": 4.795762518378476e-06,
"loss": 0.0442,
"step": 355
},
{
"epoch": 1.0289017341040463,
"grad_norm": 2.0967426909755353,
"learning_rate": 4.773084277171161e-06,
"loss": 0.0251,
"step": 356
},
{
"epoch": 1.0317919075144508,
"grad_norm": 2.442874432065364,
"learning_rate": 4.750410712791862e-06,
"loss": 0.0371,
"step": 357
},
{
"epoch": 1.0346820809248556,
"grad_norm": 2.5120312556057245,
"learning_rate": 4.727742292552225e-06,
"loss": 0.0377,
"step": 358
},
{
"epoch": 1.0375722543352601,
"grad_norm": 3.340524695416432,
"learning_rate": 4.705079483657885e-06,
"loss": 0.0443,
"step": 359
},
{
"epoch": 1.0404624277456647,
"grad_norm": 2.575030053289034,
"learning_rate": 4.682422753198812e-06,
"loss": 0.0299,
"step": 360
},
{
"epoch": 1.0433526011560694,
"grad_norm": 2.685175007956344,
"learning_rate": 4.659772568139706e-06,
"loss": 0.0398,
"step": 361
},
{
"epoch": 1.046242774566474,
"grad_norm": 2.6972818012368687,
"learning_rate": 4.637129395310359e-06,
"loss": 0.0399,
"step": 362
},
{
"epoch": 1.0491329479768785,
"grad_norm": 3.005217593332792,
"learning_rate": 4.614493701396041e-06,
"loss": 0.0407,
"step": 363
},
{
"epoch": 1.0520231213872833,
"grad_norm": 2.5154458128124024,
"learning_rate": 4.591865952927873e-06,
"loss": 0.0342,
"step": 364
},
{
"epoch": 1.0549132947976878,
"grad_norm": 4.439366071837881,
"learning_rate": 4.569246616273225e-06,
"loss": 0.0442,
"step": 365
},
{
"epoch": 1.0578034682080926,
"grad_norm": 3.0953883001786724,
"learning_rate": 4.546636157626091e-06,
"loss": 0.0421,
"step": 366
},
{
"epoch": 1.060693641618497,
"grad_norm": 2.2860089112442847,
"learning_rate": 4.524035042997485e-06,
"loss": 0.0273,
"step": 367
},
{
"epoch": 1.0635838150289016,
"grad_norm": 1.9635223901429677,
"learning_rate": 4.501443738205841e-06,
"loss": 0.0253,
"step": 368
},
{
"epoch": 1.0664739884393064,
"grad_norm": 3.600763598734996,
"learning_rate": 4.478862708867401e-06,
"loss": 0.0457,
"step": 369
},
{
"epoch": 1.069364161849711,
"grad_norm": 2.933454423722625,
"learning_rate": 4.456292420386635e-06,
"loss": 0.0353,
"step": 370
},
{
"epoch": 1.0722543352601157,
"grad_norm": 3.2964567748192946,
"learning_rate": 4.43373333794663e-06,
"loss": 0.0438,
"step": 371
},
{
"epoch": 1.0751445086705202,
"grad_norm": 2.4663446624125,
"learning_rate": 4.41118592649952e-06,
"loss": 0.0255,
"step": 372
},
{
"epoch": 1.0780346820809248,
"grad_norm": 2.9695281540539007,
"learning_rate": 4.388650650756891e-06,
"loss": 0.034,
"step": 373
},
{
"epoch": 1.0809248554913296,
"grad_norm": 3.213673243729433,
"learning_rate": 4.366127975180204e-06,
"loss": 0.0373,
"step": 374
},
{
"epoch": 1.083815028901734,
"grad_norm": 2.9297645443450344,
"learning_rate": 4.3436183639712326e-06,
"loss": 0.0345,
"step": 375
},
{
"epoch": 1.0867052023121386,
"grad_norm": 3.4897778865158835,
"learning_rate": 4.321122281062481e-06,
"loss": 0.0498,
"step": 376
},
{
"epoch": 1.0895953757225434,
"grad_norm": 3.83716948783189,
"learning_rate": 4.298640190107634e-06,
"loss": 0.0492,
"step": 377
},
{
"epoch": 1.092485549132948,
"grad_norm": 3.2428592157194007,
"learning_rate": 4.276172554471998e-06,
"loss": 0.0369,
"step": 378
},
{
"epoch": 1.0953757225433527,
"grad_norm": 2.724934147808616,
"learning_rate": 4.25371983722294e-06,
"loss": 0.0257,
"step": 379
},
{
"epoch": 1.0982658959537572,
"grad_norm": 2.7411102239022243,
"learning_rate": 4.231282501120366e-06,
"loss": 0.0355,
"step": 380
},
{
"epoch": 1.1011560693641618,
"grad_norm": 3.306500748639738,
"learning_rate": 4.20886100860716e-06,
"loss": 0.0311,
"step": 381
},
{
"epoch": 1.1040462427745665,
"grad_norm": 2.2266738254387515,
"learning_rate": 4.1864558217996645e-06,
"loss": 0.0353,
"step": 382
},
{
"epoch": 1.106936416184971,
"grad_norm": 3.1531866052818893,
"learning_rate": 4.164067402478162e-06,
"loss": 0.0424,
"step": 383
},
{
"epoch": 1.1098265895953756,
"grad_norm": 3.651012462730916,
"learning_rate": 4.14169621207734e-06,
"loss": 0.0429,
"step": 384
},
{
"epoch": 1.1127167630057804,
"grad_norm": 3.580488177101031,
"learning_rate": 4.119342711676794e-06,
"loss": 0.0459,
"step": 385
},
{
"epoch": 1.115606936416185,
"grad_norm": 2.964620530840838,
"learning_rate": 4.0970073619915264e-06,
"loss": 0.0437,
"step": 386
},
{
"epoch": 1.1184971098265897,
"grad_norm": 3.433449491668461,
"learning_rate": 4.074690623362439e-06,
"loss": 0.0457,
"step": 387
},
{
"epoch": 1.1213872832369942,
"grad_norm": 3.56646677299915,
"learning_rate": 4.05239295574686e-06,
"loss": 0.0552,
"step": 388
},
{
"epoch": 1.1242774566473988,
"grad_norm": 4.135460144949851,
"learning_rate": 4.030114818709044e-06,
"loss": 0.0541,
"step": 389
},
{
"epoch": 1.1271676300578035,
"grad_norm": 3.4522750039792927,
"learning_rate": 4.00785667141072e-06,
"loss": 0.0436,
"step": 390
},
{
"epoch": 1.130057803468208,
"grad_norm": 2.36010831219563,
"learning_rate": 3.985618972601622e-06,
"loss": 0.0307,
"step": 391
},
{
"epoch": 1.1329479768786128,
"grad_norm": 2.67796155921764,
"learning_rate": 3.963402180610028e-06,
"loss": 0.0331,
"step": 392
},
{
"epoch": 1.1358381502890174,
"grad_norm": 3.3768235978909726,
"learning_rate": 3.941206753333319e-06,
"loss": 0.0362,
"step": 393
},
{
"epoch": 1.138728323699422,
"grad_norm": 2.476496213479462,
"learning_rate": 3.919033148228542e-06,
"loss": 0.0279,
"step": 394
},
{
"epoch": 1.1416184971098267,
"grad_norm": 2.995513684870669,
"learning_rate": 3.896881822302973e-06,
"loss": 0.0431,
"step": 395
},
{
"epoch": 1.1445086705202312,
"grad_norm": 2.5680489305606264,
"learning_rate": 3.874753232104714e-06,
"loss": 0.0317,
"step": 396
},
{
"epoch": 1.147398843930636,
"grad_norm": 2.5370547981396423,
"learning_rate": 3.852647833713271e-06,
"loss": 0.0281,
"step": 397
},
{
"epoch": 1.1502890173410405,
"grad_norm": 2.9610590499557903,
"learning_rate": 3.830566082730151e-06,
"loss": 0.0332,
"step": 398
},
{
"epoch": 1.153179190751445,
"grad_norm": 3.249858884007628,
"learning_rate": 3.8085084342694894e-06,
"loss": 0.0421,
"step": 399
},
{
"epoch": 1.1560693641618498,
"grad_norm": 2.2116083552742563,
"learning_rate": 3.7864753429486475e-06,
"loss": 0.033,
"step": 400
},
{
"epoch": 1.1560693641618498,
"eval_loss": 0.09519536048173904,
"eval_runtime": 0.9252,
"eval_samples_per_second": 30.263,
"eval_steps_per_second": 7.566,
"step": 400
},
{
"epoch": 1.1589595375722543,
"grad_norm": 2.3890215101503744,
"learning_rate": 3.764467262878861e-06,
"loss": 0.0248,
"step": 401
},
{
"epoch": 1.1618497109826589,
"grad_norm": 2.5670031649481944,
"learning_rate": 3.7424846476558716e-06,
"loss": 0.0351,
"step": 402
},
{
"epoch": 1.1647398843930636,
"grad_norm": 2.260051259828975,
"learning_rate": 3.7205279503505744e-06,
"loss": 0.03,
"step": 403
},
{
"epoch": 1.1676300578034682,
"grad_norm": 2.892119519137359,
"learning_rate": 3.6985976234996957e-06,
"loss": 0.036,
"step": 404
},
{
"epoch": 1.1705202312138727,
"grad_norm": 2.5690145184191255,
"learning_rate": 3.676694119096446e-06,
"loss": 0.0276,
"step": 405
},
{
"epoch": 1.1734104046242775,
"grad_norm": 3.8499230666390303,
"learning_rate": 3.6548178885812203e-06,
"loss": 0.0648,
"step": 406
},
{
"epoch": 1.176300578034682,
"grad_norm": 2.700191616579353,
"learning_rate": 3.6329693828322843e-06,
"loss": 0.0395,
"step": 407
},
{
"epoch": 1.1791907514450868,
"grad_norm": 2.684567031546241,
"learning_rate": 3.611149052156483e-06,
"loss": 0.0314,
"step": 408
},
{
"epoch": 1.1820809248554913,
"grad_norm": 2.665135353367264,
"learning_rate": 3.5893573462799685e-06,
"loss": 0.0315,
"step": 409
},
{
"epoch": 1.1849710982658959,
"grad_norm": 3.352352872583342,
"learning_rate": 3.5675947143389144e-06,
"loss": 0.0381,
"step": 410
},
{
"epoch": 1.1878612716763006,
"grad_norm": 2.669792807397523,
"learning_rate": 3.545861604870274e-06,
"loss": 0.032,
"step": 411
},
{
"epoch": 1.1907514450867052,
"grad_norm": 4.631333627989559,
"learning_rate": 3.524158465802531e-06,
"loss": 0.0596,
"step": 412
},
{
"epoch": 1.19364161849711,
"grad_norm": 2.6548804106561814,
"learning_rate": 3.502485744446462e-06,
"loss": 0.0394,
"step": 413
},
{
"epoch": 1.1965317919075145,
"grad_norm": 3.0215989136881425,
"learning_rate": 3.4808438874859274e-06,
"loss": 0.0346,
"step": 414
},
{
"epoch": 1.199421965317919,
"grad_norm": 3.2761815094391795,
"learning_rate": 3.459233340968654e-06,
"loss": 0.0443,
"step": 415
},
{
"epoch": 1.2023121387283238,
"grad_norm": 2.640717210980499,
"learning_rate": 3.437654550297049e-06,
"loss": 0.0339,
"step": 416
},
{
"epoch": 1.2052023121387283,
"grad_norm": 2.9354149855485194,
"learning_rate": 3.4161079602190227e-06,
"loss": 0.032,
"step": 417
},
{
"epoch": 1.208092485549133,
"grad_norm": 3.254143923157989,
"learning_rate": 3.3945940148188117e-06,
"loss": 0.0388,
"step": 418
},
{
"epoch": 1.2109826589595376,
"grad_norm": 2.331283759083418,
"learning_rate": 3.3731131575078337e-06,
"loss": 0.0255,
"step": 419
},
{
"epoch": 1.2138728323699421,
"grad_norm": 2.5839607337424977,
"learning_rate": 3.3516658310155493e-06,
"loss": 0.0289,
"step": 420
},
{
"epoch": 1.216763005780347,
"grad_norm": 2.051445554385587,
"learning_rate": 3.3302524773803326e-06,
"loss": 0.0234,
"step": 421
},
{
"epoch": 1.2196531791907514,
"grad_norm": 3.1864371805455827,
"learning_rate": 3.3088735379403648e-06,
"loss": 0.0465,
"step": 422
},
{
"epoch": 1.222543352601156,
"grad_norm": 2.8343323344432156,
"learning_rate": 3.2875294533245355e-06,
"loss": 0.033,
"step": 423
},
{
"epoch": 1.2254335260115607,
"grad_norm": 3.101107885328744,
"learning_rate": 3.266220663443358e-06,
"loss": 0.0381,
"step": 424
},
{
"epoch": 1.2283236994219653,
"grad_norm": 3.605841286653843,
"learning_rate": 3.2449476074799137e-06,
"loss": 0.0471,
"step": 425
},
{
"epoch": 1.2312138728323698,
"grad_norm": 2.732444910454241,
"learning_rate": 3.223710723880786e-06,
"loss": 0.0361,
"step": 426
},
{
"epoch": 1.2341040462427746,
"grad_norm": 2.1721496892014516,
"learning_rate": 3.202510450347036e-06,
"loss": 0.0251,
"step": 427
},
{
"epoch": 1.2369942196531791,
"grad_norm": 3.0081441814863386,
"learning_rate": 3.1813472238251742e-06,
"loss": 0.0324,
"step": 428
},
{
"epoch": 1.239884393063584,
"grad_norm": 2.192339362377574,
"learning_rate": 3.160221480498153e-06,
"loss": 0.0265,
"step": 429
},
{
"epoch": 1.2427745664739884,
"grad_norm": 3.01343269094507,
"learning_rate": 3.139133655776386e-06,
"loss": 0.0414,
"step": 430
},
{
"epoch": 1.245664739884393,
"grad_norm": 3.325885420350197,
"learning_rate": 3.1180841842887667e-06,
"loss": 0.0414,
"step": 431
},
{
"epoch": 1.2485549132947977,
"grad_norm": 3.0528391154930774,
"learning_rate": 3.0970734998737095e-06,
"loss": 0.0356,
"step": 432
},
{
"epoch": 1.2514450867052023,
"grad_norm": 2.850794422724131,
"learning_rate": 3.0761020355702166e-06,
"loss": 0.0413,
"step": 433
},
{
"epoch": 1.254335260115607,
"grad_norm": 2.770056911108068,
"learning_rate": 3.055170223608941e-06,
"loss": 0.0351,
"step": 434
},
{
"epoch": 1.2572254335260116,
"grad_norm": 2.5955998288718716,
"learning_rate": 3.0342784954032893e-06,
"loss": 0.0276,
"step": 435
},
{
"epoch": 1.260115606936416,
"grad_norm": 3.701530230528004,
"learning_rate": 3.013427281540523e-06,
"loss": 0.0417,
"step": 436
},
{
"epoch": 1.2630057803468209,
"grad_norm": 3.0698772434133543,
"learning_rate": 2.992617011772885e-06,
"loss": 0.0345,
"step": 437
},
{
"epoch": 1.2658959537572254,
"grad_norm": 3.9760034819905274,
"learning_rate": 2.9718481150087475e-06,
"loss": 0.0561,
"step": 438
},
{
"epoch": 1.2687861271676302,
"grad_norm": 3.133018364068792,
"learning_rate": 2.9511210193037614e-06,
"loss": 0.0376,
"step": 439
},
{
"epoch": 1.2716763005780347,
"grad_norm": 2.9993982887005575,
"learning_rate": 2.9304361518520447e-06,
"loss": 0.0302,
"step": 440
},
{
"epoch": 1.2745664739884393,
"grad_norm": 2.8666702412666525,
"learning_rate": 2.9097939389773734e-06,
"loss": 0.0293,
"step": 441
},
{
"epoch": 1.2774566473988438,
"grad_norm": 2.497499578695883,
"learning_rate": 2.8891948061243925e-06,
"loss": 0.0374,
"step": 442
},
{
"epoch": 1.2803468208092486,
"grad_norm": 3.324055869265504,
"learning_rate": 2.8686391778498536e-06,
"loss": 0.0376,
"step": 443
},
{
"epoch": 1.2832369942196533,
"grad_norm": 4.425783556605509,
"learning_rate": 2.8481274778138567e-06,
"loss": 0.0487,
"step": 444
},
{
"epoch": 1.2861271676300579,
"grad_norm": 3.10467455625404,
"learning_rate": 2.827660128771119e-06,
"loss": 0.0369,
"step": 445
},
{
"epoch": 1.2890173410404624,
"grad_norm": 2.8080755687640417,
"learning_rate": 2.80723755256228e-06,
"loss": 0.034,
"step": 446
},
{
"epoch": 1.291907514450867,
"grad_norm": 3.2116525331760095,
"learning_rate": 2.786860170105174e-06,
"loss": 0.0392,
"step": 447
},
{
"epoch": 1.2947976878612717,
"grad_norm": 2.397691984436016,
"learning_rate": 2.766528401386187e-06,
"loss": 0.0309,
"step": 448
},
{
"epoch": 1.2976878612716762,
"grad_norm": 3.2789036120305117,
"learning_rate": 2.7462426654515797e-06,
"loss": 0.0405,
"step": 449
},
{
"epoch": 1.300578034682081,
"grad_norm": 2.69063461395359,
"learning_rate": 2.726003380398854e-06,
"loss": 0.0408,
"step": 450
},
{
"epoch": 1.3034682080924855,
"grad_norm": 3.1072958464943072,
"learning_rate": 2.705810963368154e-06,
"loss": 0.0387,
"step": 451
},
{
"epoch": 1.30635838150289,
"grad_norm": 3.5267578595885736,
"learning_rate": 2.685665830533642e-06,
"loss": 0.0383,
"step": 452
},
{
"epoch": 1.3092485549132948,
"grad_norm": 2.903217987827139,
"learning_rate": 2.665568397094934e-06,
"loss": 0.0297,
"step": 453
},
{
"epoch": 1.3121387283236994,
"grad_norm": 2.532011759421094,
"learning_rate": 2.6455190772685463e-06,
"loss": 0.0303,
"step": 454
},
{
"epoch": 1.3150289017341041,
"grad_norm": 3.4412261236059747,
"learning_rate": 2.6255182842793514e-06,
"loss": 0.0386,
"step": 455
},
{
"epoch": 1.3179190751445087,
"grad_norm": 2.26659906351238,
"learning_rate": 2.6055664303520655e-06,
"loss": 0.0265,
"step": 456
},
{
"epoch": 1.3208092485549132,
"grad_norm": 2.5513912795757583,
"learning_rate": 2.5856639267027463e-06,
"loss": 0.0314,
"step": 457
},
{
"epoch": 1.323699421965318,
"grad_norm": 2.4879494991424096,
"learning_rate": 2.5658111835303206e-06,
"loss": 0.0325,
"step": 458
},
{
"epoch": 1.3265895953757225,
"grad_norm": 2.8723586176408302,
"learning_rate": 2.5460086100081366e-06,
"loss": 0.0344,
"step": 459
},
{
"epoch": 1.3294797687861273,
"grad_norm": 4.178177986784516,
"learning_rate": 2.526256614275524e-06,
"loss": 0.0427,
"step": 460
},
{
"epoch": 1.3323699421965318,
"grad_norm": 3.2580659867039077,
"learning_rate": 2.506555603429377e-06,
"loss": 0.0408,
"step": 461
},
{
"epoch": 1.3352601156069364,
"grad_norm": 1.9694897719040088,
"learning_rate": 2.486905983515778e-06,
"loss": 0.0211,
"step": 462
},
{
"epoch": 1.3381502890173411,
"grad_norm": 2.653638767366116,
"learning_rate": 2.4673081595216136e-06,
"loss": 0.0345,
"step": 463
},
{
"epoch": 1.3410404624277457,
"grad_norm": 2.4307989679026547,
"learning_rate": 2.44776253536624e-06,
"loss": 0.0271,
"step": 464
},
{
"epoch": 1.3439306358381504,
"grad_norm": 3.199779022608156,
"learning_rate": 2.428269513893153e-06,
"loss": 0.0427,
"step": 465
},
{
"epoch": 1.346820809248555,
"grad_norm": 2.9276329342731935,
"learning_rate": 2.408829496861679e-06,
"loss": 0.0328,
"step": 466
},
{
"epoch": 1.3497109826589595,
"grad_norm": 3.0746209540433793,
"learning_rate": 2.389442884938709e-06,
"loss": 0.037,
"step": 467
},
{
"epoch": 1.352601156069364,
"grad_norm": 2.509196565821076,
"learning_rate": 2.370110077690425e-06,
"loss": 0.0313,
"step": 468
},
{
"epoch": 1.3554913294797688,
"grad_norm": 2.560118707356469,
"learning_rate": 2.3508314735740763e-06,
"loss": 0.0302,
"step": 469
},
{
"epoch": 1.3583815028901733,
"grad_norm": 2.0668214382886405,
"learning_rate": 2.331607469929765e-06,
"loss": 0.0243,
"step": 470
},
{
"epoch": 1.361271676300578,
"grad_norm": 2.479561489138274,
"learning_rate": 2.312438462972246e-06,
"loss": 0.0303,
"step": 471
},
{
"epoch": 1.3641618497109826,
"grad_norm": 2.726381817824829,
"learning_rate": 2.2933248477827814e-06,
"loss": 0.037,
"step": 472
},
{
"epoch": 1.3670520231213872,
"grad_norm": 3.33995350193323,
"learning_rate": 2.274267018300974e-06,
"loss": 0.0389,
"step": 473
},
{
"epoch": 1.369942196531792,
"grad_norm": 2.832996915893155,
"learning_rate": 2.2552653673166676e-06,
"loss": 0.0294,
"step": 474
},
{
"epoch": 1.3728323699421965,
"grad_norm": 3.400276604343586,
"learning_rate": 2.2363202864618432e-06,
"loss": 0.0302,
"step": 475
},
{
"epoch": 1.3757225433526012,
"grad_norm": 3.747613052965299,
"learning_rate": 2.2174321662025427e-06,
"loss": 0.045,
"step": 476
},
{
"epoch": 1.3786127167630058,
"grad_norm": 2.352541139134834,
"learning_rate": 2.1986013958308327e-06,
"loss": 0.0298,
"step": 477
},
{
"epoch": 1.3815028901734103,
"grad_norm": 2.6690961854018447,
"learning_rate": 2.179828363456768e-06,
"loss": 0.0281,
"step": 478
},
{
"epoch": 1.384393063583815,
"grad_norm": 2.564512639491241,
"learning_rate": 2.1611134560004045e-06,
"loss": 0.0242,
"step": 479
},
{
"epoch": 1.3872832369942196,
"grad_norm": 3.3760774282609063,
"learning_rate": 2.1424570591838184e-06,
"loss": 0.037,
"step": 480
},
{
"epoch": 1.3901734104046244,
"grad_norm": 2.943402455691774,
"learning_rate": 2.123859557523153e-06,
"loss": 0.035,
"step": 481
},
{
"epoch": 1.393063583815029,
"grad_norm": 2.6158536911307824,
"learning_rate": 2.1053213343207045e-06,
"loss": 0.0335,
"step": 482
},
{
"epoch": 1.3959537572254335,
"grad_norm": 2.472107173937846,
"learning_rate": 2.0868427716570078e-06,
"loss": 0.0291,
"step": 483
},
{
"epoch": 1.3988439306358382,
"grad_norm": 2.574302293095002,
"learning_rate": 2.068424250382974e-06,
"loss": 0.0339,
"step": 484
},
{
"epoch": 1.4017341040462428,
"grad_norm": 2.6025674697204755,
"learning_rate": 2.0500661501120378e-06,
"loss": 0.0309,
"step": 485
},
{
"epoch": 1.4046242774566475,
"grad_norm": 3.2228489798100224,
"learning_rate": 2.031768849212326e-06,
"loss": 0.0274,
"step": 486
},
{
"epoch": 1.407514450867052,
"grad_norm": 2.5345873926010776,
"learning_rate": 2.013532724798867e-06,
"loss": 0.0275,
"step": 487
},
{
"epoch": 1.4104046242774566,
"grad_norm": 2.9597299639439254,
"learning_rate": 1.995358152725818e-06,
"loss": 0.0359,
"step": 488
},
{
"epoch": 1.4132947976878611,
"grad_norm": 3.1633383402105415,
"learning_rate": 1.977245507578716e-06,
"loss": 0.0414,
"step": 489
},
{
"epoch": 1.416184971098266,
"grad_norm": 2.190197891627958,
"learning_rate": 1.959195162666759e-06,
"loss": 0.0224,
"step": 490
},
{
"epoch": 1.4190751445086704,
"grad_norm": 2.5140382935279506,
"learning_rate": 1.9412074900151094e-06,
"loss": 0.0343,
"step": 491
},
{
"epoch": 1.4219653179190752,
"grad_norm": 3.1439956653511385,
"learning_rate": 1.9232828603572255e-06,
"loss": 0.034,
"step": 492
},
{
"epoch": 1.4248554913294798,
"grad_norm": 2.5477953734275802,
"learning_rate": 1.9054216431272293e-06,
"loss": 0.0229,
"step": 493
},
{
"epoch": 1.4277456647398843,
"grad_norm": 2.9742747539659162,
"learning_rate": 1.8876242064522833e-06,
"loss": 0.0345,
"step": 494
},
{
"epoch": 1.430635838150289,
"grad_norm": 2.5865848063902637,
"learning_rate": 1.869890917145003e-06,
"loss": 0.0321,
"step": 495
},
{
"epoch": 1.4335260115606936,
"grad_norm": 3.124734907713117,
"learning_rate": 1.8522221406959063e-06,
"loss": 0.0426,
"step": 496
},
{
"epoch": 1.4364161849710984,
"grad_norm": 2.4623442770984463,
"learning_rate": 1.8346182412658665e-06,
"loss": 0.0368,
"step": 497
},
{
"epoch": 1.439306358381503,
"grad_norm": 4.111782764544109,
"learning_rate": 1.8170795816786202e-06,
"loss": 0.0525,
"step": 498
},
{
"epoch": 1.4421965317919074,
"grad_norm": 3.239712158794509,
"learning_rate": 1.7996065234132836e-06,
"loss": 0.0263,
"step": 499
},
{
"epoch": 1.4450867052023122,
"grad_norm": 2.334313474090217,
"learning_rate": 1.7821994265968962e-06,
"loss": 0.0271,
"step": 500
},
{
"epoch": 1.4479768786127167,
"grad_norm": 2.437665058243356,
"learning_rate": 1.7648586499970123e-06,
"loss": 0.0233,
"step": 501
},
{
"epoch": 1.4508670520231215,
"grad_norm": 3.3205593098358976,
"learning_rate": 1.747584551014291e-06,
"loss": 0.0455,
"step": 502
},
{
"epoch": 1.453757225433526,
"grad_norm": 3.517378382534404,
"learning_rate": 1.7303774856751443e-06,
"loss": 0.0377,
"step": 503
},
{
"epoch": 1.4566473988439306,
"grad_norm": 2.737219012324894,
"learning_rate": 1.7132378086243907e-06,
"loss": 0.0317,
"step": 504
},
{
"epoch": 1.4595375722543353,
"grad_norm": 2.3603945703999885,
"learning_rate": 1.6961658731179452e-06,
"loss": 0.0227,
"step": 505
},
{
"epoch": 1.4624277456647399,
"grad_norm": 3.12571466433377,
"learning_rate": 1.679162031015546e-06,
"loss": 0.0243,
"step": 506
},
{
"epoch": 1.4653179190751446,
"grad_norm": 2.9355074878435428,
"learning_rate": 1.662226632773492e-06,
"loss": 0.0304,
"step": 507
},
{
"epoch": 1.4682080924855492,
"grad_norm": 3.2151717972495195,
"learning_rate": 1.64536002743743e-06,
"loss": 0.044,
"step": 508
},
{
"epoch": 1.4710982658959537,
"grad_norm": 2.5493514288687327,
"learning_rate": 1.628562562635157e-06,
"loss": 0.0277,
"step": 509
},
{
"epoch": 1.4739884393063583,
"grad_norm": 2.925418140628061,
"learning_rate": 1.6118345845694489e-06,
"loss": 0.0326,
"step": 510
},
{
"epoch": 1.476878612716763,
"grad_norm": 4.364624930763201,
"learning_rate": 1.5951764380109374e-06,
"loss": 0.0388,
"step": 511
},
{
"epoch": 1.4797687861271676,
"grad_norm": 2.422471626390923,
"learning_rate": 1.5785884662909917e-06,
"loss": 0.0256,
"step": 512
},
{
"epoch": 1.4826589595375723,
"grad_norm": 2.8693832519867573,
"learning_rate": 1.5620710112946536e-06,
"loss": 0.0344,
"step": 513
},
{
"epoch": 1.4855491329479769,
"grad_norm": 2.4815608656291386,
"learning_rate": 1.5456244134535836e-06,
"loss": 0.0245,
"step": 514
},
{
"epoch": 1.4884393063583814,
"grad_norm": 1.4984532129341321,
"learning_rate": 1.5292490117390457e-06,
"loss": 0.017,
"step": 515
},
{
"epoch": 1.4913294797687862,
"grad_norm": 2.805421146569415,
"learning_rate": 1.5129451436549203e-06,
"loss": 0.0379,
"step": 516
},
{
"epoch": 1.4942196531791907,
"grad_norm": 2.212196592484195,
"learning_rate": 1.4967131452307537e-06,
"loss": 0.0284,
"step": 517
},
{
"epoch": 1.4971098265895955,
"grad_norm": 3.1407969224113783,
"learning_rate": 1.4805533510148268e-06,
"loss": 0.0357,
"step": 518
},
{
"epoch": 1.5,
"grad_norm": 3.2595411846223183,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0356,
"step": 519
},
{
"epoch": 1.5028901734104045,
"grad_norm": 3.28488525309383,
"learning_rate": 1.4484517059531588e-06,
"loss": 0.0355,
"step": 520
},
{
"epoch": 1.5057803468208093,
"grad_norm": 3.0922561562434017,
"learning_rate": 1.4325105167357545e-06,
"loss": 0.0339,
"step": 521
},
{
"epoch": 1.5086705202312138,
"grad_norm": 2.3544575695219043,
"learning_rate": 1.416642854969632e-06,
"loss": 0.0226,
"step": 522
},
{
"epoch": 1.5115606936416186,
"grad_norm": 3.346063294192223,
"learning_rate": 1.4008490476939423e-06,
"loss": 0.0385,
"step": 523
},
{
"epoch": 1.5144508670520231,
"grad_norm": 3.1038077146303085,
"learning_rate": 1.3851294204256638e-06,
"loss": 0.0335,
"step": 524
},
{
"epoch": 1.5173410404624277,
"grad_norm": 2.8284054263519147,
"learning_rate": 1.3694842971528927e-06,
"loss": 0.0268,
"step": 525
},
{
"epoch": 1.5202312138728322,
"grad_norm": 3.85505423838039,
"learning_rate": 1.3539140003281647e-06,
"loss": 0.0297,
"step": 526
},
{
"epoch": 1.523121387283237,
"grad_norm": 2.610739320006818,
"learning_rate": 1.3384188508618157e-06,
"loss": 0.0303,
"step": 527
},
{
"epoch": 1.5260115606936417,
"grad_norm": 3.1615204643844095,
"learning_rate": 1.3229991681153632e-06,
"loss": 0.0384,
"step": 528
},
{
"epoch": 1.5289017341040463,
"grad_norm": 2.5500870773853053,
"learning_rate": 1.3076552698949175e-06,
"loss": 0.022,
"step": 529
},
{
"epoch": 1.5317919075144508,
"grad_norm": 3.148763012910272,
"learning_rate": 1.2923874724446472e-06,
"loss": 0.0348,
"step": 530
},
{
"epoch": 1.5346820809248554,
"grad_norm": 3.3175768415192475,
"learning_rate": 1.277196090440243e-06,
"loss": 0.0337,
"step": 531
},
{
"epoch": 1.5375722543352601,
"grad_norm": 3.644399114002089,
"learning_rate": 1.262081436982448e-06,
"loss": 0.038,
"step": 532
},
{
"epoch": 1.5404624277456649,
"grad_norm": 2.585122401857608,
"learning_rate": 1.2470438235905975e-06,
"loss": 0.0288,
"step": 533
},
{
"epoch": 1.5433526011560694,
"grad_norm": 3.1018957028076097,
"learning_rate": 1.2320835601961928e-06,
"loss": 0.0358,
"step": 534
},
{
"epoch": 1.546242774566474,
"grad_norm": 2.238711021090234,
"learning_rate": 1.217200955136527e-06,
"loss": 0.0247,
"step": 535
},
{
"epoch": 1.5491329479768785,
"grad_norm": 2.6791734885813394,
"learning_rate": 1.2023963151483165e-06,
"loss": 0.0296,
"step": 536
},
{
"epoch": 1.5520231213872833,
"grad_norm": 2.841826452346151,
"learning_rate": 1.1876699453613883e-06,
"loss": 0.0404,
"step": 537
},
{
"epoch": 1.5549132947976878,
"grad_norm": 2.4678639583105433,
"learning_rate": 1.1730221492923882e-06,
"loss": 0.0273,
"step": 538
},
{
"epoch": 1.5578034682080926,
"grad_norm": 2.4973929272338644,
"learning_rate": 1.1584532288385209e-06,
"loss": 0.0256,
"step": 539
},
{
"epoch": 1.560693641618497,
"grad_norm": 2.341033650504072,
"learning_rate": 1.1439634842713371e-06,
"loss": 0.0227,
"step": 540
},
{
"epoch": 1.5635838150289016,
"grad_norm": 3.065988190194671,
"learning_rate": 1.1295532142305332e-06,
"loss": 0.0313,
"step": 541
},
{
"epoch": 1.5664739884393064,
"grad_norm": 2.1835285043969828,
"learning_rate": 1.115222715717807e-06,
"loss": 0.0208,
"step": 542
},
{
"epoch": 1.569364161849711,
"grad_norm": 3.500157776002619,
"learning_rate": 1.1009722840907316e-06,
"loss": 0.0319,
"step": 543
},
{
"epoch": 1.5722543352601157,
"grad_norm": 3.2214524688064574,
"learning_rate": 1.0868022130566652e-06,
"loss": 0.0416,
"step": 544
},
{
"epoch": 1.5751445086705202,
"grad_norm": 3.04668472610985,
"learning_rate": 1.0727127946667032e-06,
"loss": 0.0306,
"step": 545
},
{
"epoch": 1.5780346820809248,
"grad_norm": 2.679583691046011,
"learning_rate": 1.0587043193096535e-06,
"loss": 0.0308,
"step": 546
},
{
"epoch": 1.5809248554913293,
"grad_norm": 2.4842548396047226,
"learning_rate": 1.0447770757060571e-06,
"loss": 0.0328,
"step": 547
},
{
"epoch": 1.583815028901734,
"grad_norm": 3.2742820682763085,
"learning_rate": 1.030931350902235e-06,
"loss": 0.0352,
"step": 548
},
{
"epoch": 1.5867052023121389,
"grad_norm": 2.4207210641270485,
"learning_rate": 1.017167430264368e-06,
"loss": 0.0279,
"step": 549
},
{
"epoch": 1.5895953757225434,
"grad_norm": 2.152127329332973,
"learning_rate": 1.0034855974726194e-06,
"loss": 0.0201,
"step": 550
},
{
"epoch": 1.592485549132948,
"grad_norm": 3.0074798195082653,
"learning_rate": 9.89886134515291e-07,
"loss": 0.0338,
"step": 551
},
{
"epoch": 1.5953757225433525,
"grad_norm": 3.184876769094888,
"learning_rate": 9.763693216830055e-07,
"loss": 0.038,
"step": 552
},
{
"epoch": 1.5982658959537572,
"grad_norm": 2.869846326145178,
"learning_rate": 9.629354375629341e-07,
"loss": 0.0276,
"step": 553
},
{
"epoch": 1.601156069364162,
"grad_norm": 2.585461539082119,
"learning_rate": 9.495847590330486e-07,
"loss": 0.0324,
"step": 554
},
{
"epoch": 1.6040462427745665,
"grad_norm": 2.742836913299859,
"learning_rate": 9.363175612564202e-07,
"loss": 0.0302,
"step": 555
},
{
"epoch": 1.606936416184971,
"grad_norm": 3.2924360152641494,
"learning_rate": 9.231341176755487e-07,
"loss": 0.0297,
"step": 556
},
{
"epoch": 1.6098265895953756,
"grad_norm": 2.686788179114119,
"learning_rate": 9.10034700006725e-07,
"loss": 0.0321,
"step": 557
},
{
"epoch": 1.6127167630057804,
"grad_norm": 3.0218421958689063,
"learning_rate": 8.970195782344266e-07,
"loss": 0.0332,
"step": 558
},
{
"epoch": 1.6156069364161851,
"grad_norm": 2.3212479985232077,
"learning_rate": 8.840890206057634e-07,
"loss": 0.0261,
"step": 559
},
{
"epoch": 1.6184971098265897,
"grad_norm": 2.8736201021277377,
"learning_rate": 8.712432936249365e-07,
"loss": 0.03,
"step": 560
},
{
"epoch": 1.6213872832369942,
"grad_norm": 2.3595839968196866,
"learning_rate": 8.584826620477566e-07,
"loss": 0.0408,
"step": 561
},
{
"epoch": 1.6242774566473988,
"grad_norm": 3.0347610178615674,
"learning_rate": 8.458073888761826e-07,
"loss": 0.0362,
"step": 562
},
{
"epoch": 1.6271676300578035,
"grad_norm": 2.38603129609543,
"learning_rate": 8.332177353528964e-07,
"loss": 0.0251,
"step": 563
},
{
"epoch": 1.630057803468208,
"grad_norm": 2.6055073766794687,
"learning_rate": 8.207139609559284e-07,
"loss": 0.0253,
"step": 564
},
{
"epoch": 1.6329479768786128,
"grad_norm": 2.583083431789858,
"learning_rate": 8.082963233932995e-07,
"loss": 0.0259,
"step": 565
},
{
"epoch": 1.6358381502890174,
"grad_norm": 2.0515263932259518,
"learning_rate": 7.959650785977179e-07,
"loss": 0.0217,
"step": 566
},
{
"epoch": 1.638728323699422,
"grad_norm": 2.746779673644074,
"learning_rate": 7.837204807213017e-07,
"loss": 0.0314,
"step": 567
},
{
"epoch": 1.6416184971098264,
"grad_norm": 2.3296657216334102,
"learning_rate": 7.71562782130334e-07,
"loss": 0.0273,
"step": 568
},
{
"epoch": 1.6445086705202312,
"grad_norm": 2.3206553449940284,
"learning_rate": 7.594922334000738e-07,
"loss": 0.0202,
"step": 569
},
{
"epoch": 1.647398843930636,
"grad_norm": 3.801639203355854,
"learning_rate": 7.475090833095799e-07,
"loss": 0.0451,
"step": 570
},
{
"epoch": 1.6502890173410405,
"grad_norm": 2.3365922810094872,
"learning_rate": 7.356135788365915e-07,
"loss": 0.0216,
"step": 571
},
{
"epoch": 1.653179190751445,
"grad_norm": 2.7038418344399258,
"learning_rate": 7.238059651524354e-07,
"loss": 0.0238,
"step": 572
},
{
"epoch": 1.6560693641618496,
"grad_norm": 2.9959245625608184,
"learning_rate": 7.120864856169696e-07,
"loss": 0.0305,
"step": 573
},
{
"epoch": 1.6589595375722543,
"grad_norm": 2.3080167609962046,
"learning_rate": 7.004553817735732e-07,
"loss": 0.0217,
"step": 574
},
{
"epoch": 1.661849710982659,
"grad_norm": 2.4821282413657033,
"learning_rate": 6.88912893344163e-07,
"loss": 0.0267,
"step": 575
},
{
"epoch": 1.6647398843930636,
"grad_norm": 2.653018792418636,
"learning_rate": 6.774592582242567e-07,
"loss": 0.0298,
"step": 576
},
{
"epoch": 1.6676300578034682,
"grad_norm": 2.642689505666662,
"learning_rate": 6.660947124780686e-07,
"loss": 0.0254,
"step": 577
},
{
"epoch": 1.6705202312138727,
"grad_norm": 3.169845791000904,
"learning_rate": 6.548194903336408e-07,
"loss": 0.031,
"step": 578
},
{
"epoch": 1.6734104046242775,
"grad_norm": 2.9465397732730363,
"learning_rate": 6.436338241780227e-07,
"loss": 0.0338,
"step": 579
},
{
"epoch": 1.6763005780346822,
"grad_norm": 2.2311471874361373,
"learning_rate": 6.325379445524732e-07,
"loss": 0.0229,
"step": 580
},
{
"epoch": 1.6791907514450868,
"grad_norm": 3.081369160616298,
"learning_rate": 6.215320801477154e-07,
"loss": 0.0363,
"step": 581
},
{
"epoch": 1.6820809248554913,
"grad_norm": 2.421411126342786,
"learning_rate": 6.106164577992224e-07,
"loss": 0.0271,
"step": 582
},
{
"epoch": 1.6849710982658959,
"grad_norm": 2.9947960888538767,
"learning_rate": 5.99791302482538e-07,
"loss": 0.0337,
"step": 583
},
{
"epoch": 1.6878612716763006,
"grad_norm": 2.227042908760082,
"learning_rate": 5.890568373086425e-07,
"loss": 0.0211,
"step": 584
},
{
"epoch": 1.6907514450867052,
"grad_norm": 3.712493485936692,
"learning_rate": 5.784132835193562e-07,
"loss": 0.0334,
"step": 585
},
{
"epoch": 1.69364161849711,
"grad_norm": 2.725787651475256,
"learning_rate": 5.678608604827784e-07,
"loss": 0.0297,
"step": 586
},
{
"epoch": 1.6965317919075145,
"grad_norm": 2.1287017206524106,
"learning_rate": 5.573997856887642e-07,
"loss": 0.0227,
"step": 587
},
{
"epoch": 1.699421965317919,
"grad_norm": 2.0962651279994864,
"learning_rate": 5.470302747444428e-07,
"loss": 0.0222,
"step": 588
},
{
"epoch": 1.7023121387283235,
"grad_norm": 2.2102726315322005,
"learning_rate": 5.367525413697716e-07,
"loss": 0.0198,
"step": 589
},
{
"epoch": 1.7052023121387283,
"grad_norm": 3.081800122703875,
"learning_rate": 5.265667973931416e-07,
"loss": 0.0293,
"step": 590
},
{
"epoch": 1.708092485549133,
"grad_norm": 2.5073112696737234,
"learning_rate": 5.164732527469968e-07,
"loss": 0.0255,
"step": 591
},
{
"epoch": 1.7109826589595376,
"grad_norm": 2.9631917652655515,
"learning_rate": 5.064721154635155e-07,
"loss": 0.0238,
"step": 592
},
{
"epoch": 1.7138728323699421,
"grad_norm": 2.9449876977836835,
"learning_rate": 4.965635916703248e-07,
"loss": 0.0266,
"step": 593
},
{
"epoch": 1.7167630057803467,
"grad_norm": 2.8389071960037877,
"learning_rate": 4.86747885586244e-07,
"loss": 0.0289,
"step": 594
},
{
"epoch": 1.7196531791907514,
"grad_norm": 3.7075338546107095,
"learning_rate": 4.770251995170871e-07,
"loss": 0.0354,
"step": 595
},
{
"epoch": 1.7225433526011562,
"grad_norm": 2.8275729208383127,
"learning_rate": 4.673957338514812e-07,
"loss": 0.0284,
"step": 596
},
{
"epoch": 1.7254335260115607,
"grad_norm": 2.394113671682943,
"learning_rate": 4.5785968705674255e-07,
"loss": 0.026,
"step": 597
},
{
"epoch": 1.7283236994219653,
"grad_norm": 2.6354299148272378,
"learning_rate": 4.48417255674789e-07,
"loss": 0.0285,
"step": 598
},
{
"epoch": 1.7312138728323698,
"grad_norm": 2.487760980340726,
"learning_rate": 4.3906863431808e-07,
"loss": 0.0258,
"step": 599
},
{
"epoch": 1.7341040462427746,
"grad_norm": 2.7589247543697226,
"learning_rate": 4.298140156656178e-07,
"loss": 0.0286,
"step": 600
},
{
"epoch": 1.7341040462427746,
"eval_loss": 0.08402061462402344,
"eval_runtime": 0.9305,
"eval_samples_per_second": 30.092,
"eval_steps_per_second": 7.523,
"step": 600
},
{
"epoch": 1.7369942196531793,
"grad_norm": 3.2204676092038107,
"learning_rate": 4.2065359045896427e-07,
"loss": 0.0302,
"step": 601
},
{
"epoch": 1.739884393063584,
"grad_norm": 4.899074590727239,
"learning_rate": 4.115875474983161e-07,
"loss": 0.0279,
"step": 602
},
{
"epoch": 1.7427745664739884,
"grad_norm": 3.1374643269567724,
"learning_rate": 4.0261607363861365e-07,
"loss": 0.0344,
"step": 603
},
{
"epoch": 1.745664739884393,
"grad_norm": 2.749091436221168,
"learning_rate": 3.937393537856871e-07,
"loss": 0.0305,
"step": 604
},
{
"epoch": 1.7485549132947977,
"grad_norm": 2.741639181393763,
"learning_rate": 3.84957570892448e-07,
"loss": 0.0308,
"step": 605
},
{
"epoch": 1.7514450867052023,
"grad_norm": 3.8598686607362223,
"learning_rate": 3.762709059551184e-07,
"loss": 0.0467,
"step": 606
},
{
"epoch": 1.754335260115607,
"grad_norm": 2.3025890101533317,
"learning_rate": 3.6767953800949554e-07,
"loss": 0.0226,
"step": 607
},
{
"epoch": 1.7572254335260116,
"grad_norm": 1.902682668166002,
"learning_rate": 3.5918364412727004e-07,
"loss": 0.0181,
"step": 608
},
{
"epoch": 1.760115606936416,
"grad_norm": 3.3068575007443783,
"learning_rate": 3.5078339941237107e-07,
"loss": 0.0309,
"step": 609
},
{
"epoch": 1.7630057803468207,
"grad_norm": 2.476320454660619,
"learning_rate": 3.4247897699735575e-07,
"loss": 0.0339,
"step": 610
},
{
"epoch": 1.7658959537572254,
"grad_norm": 2.8791371798867584,
"learning_rate": 3.3427054803984784e-07,
"loss": 0.0279,
"step": 611
},
{
"epoch": 1.7687861271676302,
"grad_norm": 2.0036596494241343,
"learning_rate": 3.2615828171900234e-07,
"loss": 0.0206,
"step": 612
},
{
"epoch": 1.7716763005780347,
"grad_norm": 2.103998944647541,
"learning_rate": 3.181423452320209e-07,
"loss": 0.0194,
"step": 613
},
{
"epoch": 1.7745664739884393,
"grad_norm": 2.995779745958891,
"learning_rate": 3.102229037907134e-07,
"loss": 0.0281,
"step": 614
},
{
"epoch": 1.7774566473988438,
"grad_norm": 3.388218734539243,
"learning_rate": 3.024001206180799e-07,
"loss": 0.031,
"step": 615
},
{
"epoch": 1.7803468208092486,
"grad_norm": 2.8090278682040197,
"learning_rate": 2.946741569449563e-07,
"loss": 0.0262,
"step": 616
},
{
"epoch": 1.7832369942196533,
"grad_norm": 2.4275238476413845,
"learning_rate": 2.8704517200668746e-07,
"loss": 0.0246,
"step": 617
},
{
"epoch": 1.7861271676300579,
"grad_norm": 2.8947907296916604,
"learning_rate": 2.7951332303984335e-07,
"loss": 0.0273,
"step": 618
},
{
"epoch": 1.7890173410404624,
"grad_norm": 2.842966900220881,
"learning_rate": 2.7207876527898746e-07,
"loss": 0.0345,
"step": 619
},
{
"epoch": 1.791907514450867,
"grad_norm": 2.7282199212357816,
"learning_rate": 2.6474165195346346e-07,
"loss": 0.0285,
"step": 620
},
{
"epoch": 1.7947976878612717,
"grad_norm": 2.9452491096470603,
"learning_rate": 2.575021342842493e-07,
"loss": 0.0303,
"step": 621
},
{
"epoch": 1.7976878612716765,
"grad_norm": 2.595963079044866,
"learning_rate": 2.5036036148083367e-07,
"loss": 0.029,
"step": 622
},
{
"epoch": 1.800578034682081,
"grad_norm": 3.1034369862005313,
"learning_rate": 2.4331648073814107e-07,
"loss": 0.0362,
"step": 623
},
{
"epoch": 1.8034682080924855,
"grad_norm": 2.0293576730904404,
"learning_rate": 2.363706372335045e-07,
"loss": 0.0214,
"step": 624
},
{
"epoch": 1.80635838150289,
"grad_norm": 3.0290797901523447,
"learning_rate": 2.2952297412366432e-07,
"loss": 0.0285,
"step": 625
},
{
"epoch": 1.8092485549132948,
"grad_norm": 2.4301291771331375,
"learning_rate": 2.2277363254182228e-07,
"loss": 0.0306,
"step": 626
},
{
"epoch": 1.8121387283236994,
"grad_norm": 2.0209004900593452,
"learning_rate": 2.161227515947334e-07,
"loss": 0.0219,
"step": 627
},
{
"epoch": 1.8150289017341041,
"grad_norm": 2.272922638009751,
"learning_rate": 2.0957046835983764e-07,
"loss": 0.0261,
"step": 628
},
{
"epoch": 1.8179190751445087,
"grad_norm": 2.267867365649924,
"learning_rate": 2.0311691788243548e-07,
"loss": 0.0226,
"step": 629
},
{
"epoch": 1.8208092485549132,
"grad_norm": 2.6833144758951826,
"learning_rate": 1.9676223317290245e-07,
"loss": 0.0292,
"step": 630
},
{
"epoch": 1.8236994219653178,
"grad_norm": 2.7577394624568763,
"learning_rate": 1.905065452039495e-07,
"loss": 0.0328,
"step": 631
},
{
"epoch": 1.8265895953757225,
"grad_norm": 2.507658068123955,
"learning_rate": 1.8434998290792373e-07,
"loss": 0.0273,
"step": 632
},
{
"epoch": 1.8294797687861273,
"grad_norm": 2.0490162702647226,
"learning_rate": 1.7829267317415188e-07,
"loss": 0.0242,
"step": 633
},
{
"epoch": 1.8323699421965318,
"grad_norm": 2.7696464782973256,
"learning_rate": 1.7233474084632107e-07,
"loss": 0.0282,
"step": 634
},
{
"epoch": 1.8352601156069364,
"grad_norm": 3.3311577875403917,
"learning_rate": 1.6647630871991116e-07,
"loss": 0.0263,
"step": 635
},
{
"epoch": 1.838150289017341,
"grad_norm": 2.4750883996948674,
"learning_rate": 1.6071749753965914e-07,
"loss": 0.0274,
"step": 636
},
{
"epoch": 1.8410404624277457,
"grad_norm": 3.173715006325098,
"learning_rate": 1.5505842599707442e-07,
"loss": 0.0343,
"step": 637
},
{
"epoch": 1.8439306358381504,
"grad_norm": 3.489741224468283,
"learning_rate": 1.4949921072798967e-07,
"loss": 0.035,
"step": 638
},
{
"epoch": 1.846820809248555,
"grad_norm": 2.882500598594475,
"learning_rate": 1.440399663101577e-07,
"loss": 0.0277,
"step": 639
},
{
"epoch": 1.8497109826589595,
"grad_norm": 3.233374202089709,
"learning_rate": 1.386808052608918e-07,
"loss": 0.0391,
"step": 640
},
{
"epoch": 1.852601156069364,
"grad_norm": 2.7034183642322347,
"learning_rate": 1.334218380347424e-07,
"loss": 0.0292,
"step": 641
},
{
"epoch": 1.8554913294797688,
"grad_norm": 2.197194297058419,
"learning_rate": 1.282631730212258e-07,
"loss": 0.0249,
"step": 642
},
{
"epoch": 1.8583815028901736,
"grad_norm": 2.9036298407234313,
"learning_rate": 1.2320491654258803e-07,
"loss": 0.0298,
"step": 643
},
{
"epoch": 1.861271676300578,
"grad_norm": 2.8094195877802393,
"learning_rate": 1.1824717285160992e-07,
"loss": 0.0298,
"step": 644
},
{
"epoch": 1.8641618497109826,
"grad_norm": 2.717689585519079,
"learning_rate": 1.1339004412946553e-07,
"loss": 0.0301,
"step": 645
},
{
"epoch": 1.8670520231213872,
"grad_norm": 3.2884617474113127,
"learning_rate": 1.0863363048360942e-07,
"loss": 0.042,
"step": 646
},
{
"epoch": 1.869942196531792,
"grad_norm": 2.142509289269094,
"learning_rate": 1.0397802994571826e-07,
"loss": 0.0212,
"step": 647
},
{
"epoch": 1.8728323699421965,
"grad_norm": 3.6837900459823167,
"learning_rate": 9.942333846966745e-08,
"loss": 0.0443,
"step": 648
},
{
"epoch": 1.8757225433526012,
"grad_norm": 3.331072208277014,
"learning_rate": 9.496964992955382e-08,
"loss": 0.03,
"step": 649
},
{
"epoch": 1.8786127167630058,
"grad_norm": 2.231335330345806,
"learning_rate": 9.061705611776273e-08,
"loss": 0.0285,
"step": 650
},
{
"epoch": 1.8815028901734103,
"grad_norm": 2.6317094385765016,
"learning_rate": 8.636564674307402e-08,
"loss": 0.0334,
"step": 651
},
{
"epoch": 1.8843930635838149,
"grad_norm": 3.3131881744849396,
"learning_rate": 8.221550942881406e-08,
"loss": 0.0342,
"step": 652
},
{
"epoch": 1.8872832369942196,
"grad_norm": 3.150237302456506,
"learning_rate": 7.816672971105055e-08,
"loss": 0.0276,
"step": 653
},
{
"epoch": 1.8901734104046244,
"grad_norm": 1.9693826852564784,
"learning_rate": 7.421939103682662e-08,
"loss": 0.0209,
"step": 654
},
{
"epoch": 1.893063583815029,
"grad_norm": 2.337524129982516,
"learning_rate": 7.037357476244566e-08,
"loss": 0.0247,
"step": 655
},
{
"epoch": 1.8959537572254335,
"grad_norm": 2.8829233540951984,
"learning_rate": 6.662936015178978e-08,
"loss": 0.0313,
"step": 656
},
{
"epoch": 1.898843930635838,
"grad_norm": 3.3070223112681676,
"learning_rate": 6.298682437468895e-08,
"loss": 0.0351,
"step": 657
},
{
"epoch": 1.9017341040462428,
"grad_norm": 1.9799003747289656,
"learning_rate": 5.9446042505330594e-08,
"loss": 0.0235,
"step": 658
},
{
"epoch": 1.9046242774566475,
"grad_norm": 2.8571160793919206,
"learning_rate": 5.600708752071082e-08,
"loss": 0.0359,
"step": 659
},
{
"epoch": 1.907514450867052,
"grad_norm": 2.7689616678406055,
"learning_rate": 5.267003029913065e-08,
"loss": 0.0236,
"step": 660
},
{
"epoch": 1.9104046242774566,
"grad_norm": 2.762628793592965,
"learning_rate": 4.943493961873658e-08,
"loss": 0.023,
"step": 661
},
{
"epoch": 1.9132947976878611,
"grad_norm": 2.930260556434489,
"learning_rate": 4.630188215610065e-08,
"loss": 0.0235,
"step": 662
},
{
"epoch": 1.916184971098266,
"grad_norm": 2.358664696088892,
"learning_rate": 4.327092248484932e-08,
"loss": 0.0272,
"step": 663
},
{
"epoch": 1.9190751445086707,
"grad_norm": 3.09076782112662,
"learning_rate": 4.03421230743295e-08,
"loss": 0.0343,
"step": 664
},
{
"epoch": 1.9219653179190752,
"grad_norm": 1.7621073890549188,
"learning_rate": 3.751554428832238e-08,
"loss": 0.0177,
"step": 665
},
{
"epoch": 1.9248554913294798,
"grad_norm": 2.1762751213497933,
"learning_rate": 3.4791244383799994e-08,
"loss": 0.0185,
"step": 666
},
{
"epoch": 1.9277456647398843,
"grad_norm": 2.5431146597843997,
"learning_rate": 3.216927950972393e-08,
"loss": 0.0322,
"step": 667
},
{
"epoch": 1.930635838150289,
"grad_norm": 2.417193867852855,
"learning_rate": 2.964970370588738e-08,
"loss": 0.025,
"step": 668
},
{
"epoch": 1.9335260115606936,
"grad_norm": 2.2455101858667565,
"learning_rate": 2.7232568901801592e-08,
"loss": 0.0277,
"step": 669
},
{
"epoch": 1.9364161849710984,
"grad_norm": 2.4713814937235385,
"learning_rate": 2.4917924915626725e-08,
"loss": 0.0252,
"step": 670
},
{
"epoch": 1.939306358381503,
"grad_norm": 2.7741728341467775,
"learning_rate": 2.2705819453144316e-08,
"loss": 0.0233,
"step": 671
},
{
"epoch": 1.9421965317919074,
"grad_norm": 3.2682376836372056,
"learning_rate": 2.0596298106774214e-08,
"loss": 0.0342,
"step": 672
},
{
"epoch": 1.9450867052023122,
"grad_norm": 2.582049310045216,
"learning_rate": 1.8589404354632523e-08,
"loss": 0.03,
"step": 673
},
{
"epoch": 1.9479768786127167,
"grad_norm": 2.429361450682195,
"learning_rate": 1.6685179559641217e-08,
"loss": 0.0233,
"step": 674
},
{
"epoch": 1.9508670520231215,
"grad_norm": 3.132745641356759,
"learning_rate": 1.4883662968669387e-08,
"loss": 0.0407,
"step": 675
},
{
"epoch": 1.953757225433526,
"grad_norm": 2.3239626051059386,
"learning_rate": 1.3184891711727766e-08,
"loss": 0.0247,
"step": 676
},
{
"epoch": 1.9566473988439306,
"grad_norm": 2.477156881588747,
"learning_rate": 1.1588900801203229e-08,
"loss": 0.0228,
"step": 677
},
{
"epoch": 1.9595375722543351,
"grad_norm": 2.139721681837424,
"learning_rate": 1.0095723131136603e-08,
"loss": 0.0252,
"step": 678
},
{
"epoch": 1.9624277456647399,
"grad_norm": 3.426665247902062,
"learning_rate": 8.705389476543758e-09,
"loss": 0.0267,
"step": 679
},
{
"epoch": 1.9653179190751446,
"grad_norm": 2.8650595430397288,
"learning_rate": 7.417928492784443e-09,
"loss": 0.0323,
"step": 680
},
{
"epoch": 1.9682080924855492,
"grad_norm": 2.7291850369640613,
"learning_rate": 6.233366714967215e-09,
"loss": 0.026,
"step": 681
},
{
"epoch": 1.9710982658959537,
"grad_norm": 3.359249209071856,
"learning_rate": 5.151728557406532e-09,
"loss": 0.0357,
"step": 682
},
{
"epoch": 1.9739884393063583,
"grad_norm": 2.6983693308255314,
"learning_rate": 4.173036313117607e-09,
"loss": 0.0268,
"step": 683
},
{
"epoch": 1.976878612716763,
"grad_norm": 2.2142359024895644,
"learning_rate": 3.2973101533567698e-09,
"loss": 0.0277,
"step": 684
},
{
"epoch": 1.9797687861271678,
"grad_norm": 2.3357628732709474,
"learning_rate": 2.5245681272068057e-09,
"loss": 0.0229,
"step": 685
},
{
"epoch": 1.9826589595375723,
"grad_norm": 2.1678730353523386,
"learning_rate": 1.8548261612050255e-09,
"loss": 0.0236,
"step": 686
},
{
"epoch": 1.9855491329479769,
"grad_norm": 2.90668259735446,
"learning_rate": 1.2880980590124214e-09,
"loss": 0.0325,
"step": 687
},
{
"epoch": 1.9884393063583814,
"grad_norm": 1.775137098330101,
"learning_rate": 8.243955011333349e-10,
"loss": 0.0188,
"step": 688
},
{
"epoch": 1.9913294797687862,
"grad_norm": 3.1118509748289944,
"learning_rate": 4.637280446712078e-10,
"loss": 0.034,
"step": 689
},
{
"epoch": 1.9942196531791907,
"grad_norm": 2.7054381917248325,
"learning_rate": 2.0610312313318336e-10,
"loss": 0.0344,
"step": 690
},
{
"epoch": 1.9971098265895955,
"grad_norm": 2.7307481010776367,
"learning_rate": 5.152604627634006e-11,
"loss": 0.0265,
"step": 691
},
{
"epoch": 2.0,
"grad_norm": 1.7811358836387599,
"learning_rate": 0.0,
"loss": 0.0164,
"step": 692
},
{
"epoch": 2.0,
"step": 692,
"total_flos": 1412134244352.0,
"train_loss": 0.0628656179461133,
"train_runtime": 422.1909,
"train_samples_per_second": 13.103,
"train_steps_per_second": 1.639
}
],
"logging_steps": 1,
"max_steps": 692,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1412134244352.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}