Henbane-r4-s615 / trainer_state.json
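The JSON below is the Trainer checkpoint state: run-level fields (epoch, global_step, eval_steps) followed by log_history, a list of per-logging-step records with epoch, grad_norm, learning_rate, loss, and step. A minimal sketch of reading it, assuming the file has been downloaded locally as "trainer_state.json" and using matplotlib for plotting (neither the path nor the plotting choice comes from this file):

# Sketch only: load the trainer state and plot loss and learning rate over steps.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training-log entries; in this file each of them carries both
# "loss" and "learning_rate" (assumed to hold for the full log_history).
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
loss = [e["loss"] for e in logs]
lr = [e["learning_rate"] for e in logs]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(steps, loss)   # training loss per logged step
ax1.set_ylabel("loss")
ax2.plot(steps, lr)     # learning-rate schedule (warmup then decay)
ax2.set_ylabel("learning rate")
ax2.set_xlabel("global step")
plt.show()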
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4885193378594859,
"eval_steps": 500,
"global_step": 615,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024410710199099855,
"grad_norm": 8.533896446228027,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1655,
"step": 1
},
{
"epoch": 0.004882142039819971,
"grad_norm": 8.988560676574707,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2516,
"step": 2
},
{
"epoch": 0.0073232130597299565,
"grad_norm": 7.550627708435059,
"learning_rate": 6e-06,
"loss": 1.1895,
"step": 3
},
{
"epoch": 0.009764284079639942,
"grad_norm": 3.6377415657043457,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0982,
"step": 4
},
{
"epoch": 0.012205355099549928,
"grad_norm": 3.964740753173828,
"learning_rate": 1e-05,
"loss": 1.0622,
"step": 5
},
{
"epoch": 0.014646426119459913,
"grad_norm": 4.8016157150268555,
"learning_rate": 9.999962669988608e-06,
"loss": 1.0653,
"step": 6
},
{
"epoch": 0.017087497139369898,
"grad_norm": 2.9538488388061523,
"learning_rate": 9.999850680511844e-06,
"loss": 1.026,
"step": 7
},
{
"epoch": 0.019528568159279884,
"grad_norm": 2.869965076446533,
"learning_rate": 9.999664033241933e-06,
"loss": 1.0349,
"step": 8
},
{
"epoch": 0.02196963917918987,
"grad_norm": 1.8026058673858643,
"learning_rate": 9.999402730965894e-06,
"loss": 1.0421,
"step": 9
},
{
"epoch": 0.024410710199099857,
"grad_norm": 1.075210452079773,
"learning_rate": 9.999066777585496e-06,
"loss": 1.0008,
"step": 10
},
{
"epoch": 0.02685178121900984,
"grad_norm": 1.4493818283081055,
"learning_rate": 9.998656178117193e-06,
"loss": 0.9347,
"step": 11
},
{
"epoch": 0.029292852238919826,
"grad_norm": 1.2218502759933472,
"learning_rate": 9.99817093869206e-06,
"loss": 0.9537,
"step": 12
},
{
"epoch": 0.03173392325882981,
"grad_norm": 1.0389800071716309,
"learning_rate": 9.997611066555694e-06,
"loss": 0.9458,
"step": 13
},
{
"epoch": 0.034174994278739795,
"grad_norm": 0.959168016910553,
"learning_rate": 9.99697657006811e-06,
"loss": 0.9622,
"step": 14
},
{
"epoch": 0.03661606529864978,
"grad_norm": 1.0173426866531372,
"learning_rate": 9.99626745870361e-06,
"loss": 0.9594,
"step": 15
},
{
"epoch": 0.03905713631855977,
"grad_norm": 0.9893942475318909,
"learning_rate": 9.995483743050649e-06,
"loss": 0.9128,
"step": 16
},
{
"epoch": 0.041498207338469754,
"grad_norm": 0.9174278974533081,
"learning_rate": 9.99462543481167e-06,
"loss": 0.9108,
"step": 17
},
{
"epoch": 0.04393927835837974,
"grad_norm": 0.8355342745780945,
"learning_rate": 9.993692546802943e-06,
"loss": 0.9341,
"step": 18
},
{
"epoch": 0.04638034937828973,
"grad_norm": 0.9482454061508179,
"learning_rate": 9.992685092954347e-06,
"loss": 0.8488,
"step": 19
},
{
"epoch": 0.048821420398199714,
"grad_norm": 0.8152992129325867,
"learning_rate": 9.991603088309195e-06,
"loss": 0.9388,
"step": 20
},
{
"epoch": 0.05126249141810969,
"grad_norm": 0.7824520468711853,
"learning_rate": 9.990446549023977e-06,
"loss": 0.917,
"step": 21
},
{
"epoch": 0.05370356243801968,
"grad_norm": 0.8396065831184387,
"learning_rate": 9.989215492368152e-06,
"loss": 0.9043,
"step": 22
},
{
"epoch": 0.056144633457929666,
"grad_norm": 0.7503563761711121,
"learning_rate": 9.98790993672386e-06,
"loss": 0.9368,
"step": 23
},
{
"epoch": 0.05858570447783965,
"grad_norm": 0.846466600894928,
"learning_rate": 9.98652990158566e-06,
"loss": 0.8641,
"step": 24
},
{
"epoch": 0.06102677549774964,
"grad_norm": 0.8216990232467651,
"learning_rate": 9.985075407560247e-06,
"loss": 0.8744,
"step": 25
},
{
"epoch": 0.06346784651765962,
"grad_norm": 0.7758781313896179,
"learning_rate": 9.983546476366133e-06,
"loss": 0.8722,
"step": 26
},
{
"epoch": 0.06590891753756961,
"grad_norm": 0.8065202236175537,
"learning_rate": 9.981943130833323e-06,
"loss": 0.8582,
"step": 27
},
{
"epoch": 0.06834998855747959,
"grad_norm": 0.79361891746521,
"learning_rate": 9.980265394902982e-06,
"loss": 0.8549,
"step": 28
},
{
"epoch": 0.07079105957738958,
"grad_norm": 0.7769683003425598,
"learning_rate": 9.978513293627068e-06,
"loss": 0.8801,
"step": 29
},
{
"epoch": 0.07323213059729956,
"grad_norm": 0.7662413120269775,
"learning_rate": 9.976686853167967e-06,
"loss": 0.849,
"step": 30
},
{
"epoch": 0.07567320161720956,
"grad_norm": 0.7053027153015137,
"learning_rate": 9.974786100798098e-06,
"loss": 0.8925,
"step": 31
},
{
"epoch": 0.07811427263711954,
"grad_norm": 0.7407605051994324,
"learning_rate": 9.9728110648995e-06,
"loss": 0.8623,
"step": 32
},
{
"epoch": 0.08055534365702952,
"grad_norm": 0.7798149585723877,
"learning_rate": 9.970761774963421e-06,
"loss": 0.8711,
"step": 33
},
{
"epoch": 0.08299641467693951,
"grad_norm": 0.7310554385185242,
"learning_rate": 9.968638261589866e-06,
"loss": 0.9071,
"step": 34
},
{
"epoch": 0.08543748569684949,
"grad_norm": 0.8006892204284668,
"learning_rate": 9.966440556487149e-06,
"loss": 0.9026,
"step": 35
},
{
"epoch": 0.08787855671675948,
"grad_norm": 0.7774298787117004,
"learning_rate": 9.96416869247141e-06,
"loss": 0.8512,
"step": 36
},
{
"epoch": 0.09031962773666946,
"grad_norm": 0.7737051844596863,
"learning_rate": 9.961822703466131e-06,
"loss": 0.8629,
"step": 37
},
{
"epoch": 0.09276069875657945,
"grad_norm": 0.8388147950172424,
"learning_rate": 9.959402624501636e-06,
"loss": 0.803,
"step": 38
},
{
"epoch": 0.09520176977648943,
"grad_norm": 0.7394818067550659,
"learning_rate": 9.956908491714552e-06,
"loss": 0.8768,
"step": 39
},
{
"epoch": 0.09764284079639943,
"grad_norm": 0.8373251557350159,
"learning_rate": 9.95434034234728e-06,
"loss": 0.8604,
"step": 40
},
{
"epoch": 0.1000839118163094,
"grad_norm": 0.7941448092460632,
"learning_rate": 9.951698214747441e-06,
"loss": 0.8397,
"step": 41
},
{
"epoch": 0.10252498283621939,
"grad_norm": 0.7676767706871033,
"learning_rate": 9.948982148367294e-06,
"loss": 0.8434,
"step": 42
},
{
"epoch": 0.10496605385612938,
"grad_norm": 0.7958892583847046,
"learning_rate": 9.946192183763155e-06,
"loss": 0.8503,
"step": 43
},
{
"epoch": 0.10740712487603936,
"grad_norm": 0.793487012386322,
"learning_rate": 9.943328362594788e-06,
"loss": 0.8566,
"step": 44
},
{
"epoch": 0.10984819589594935,
"grad_norm": 0.716295599937439,
"learning_rate": 9.940390727624785e-06,
"loss": 0.8128,
"step": 45
},
{
"epoch": 0.11228926691585933,
"grad_norm": 0.7760279178619385,
"learning_rate": 9.937379322717923e-06,
"loss": 0.8409,
"step": 46
},
{
"epoch": 0.11473033793576932,
"grad_norm": 0.8229836821556091,
"learning_rate": 9.934294192840518e-06,
"loss": 0.8429,
"step": 47
},
{
"epoch": 0.1171714089556793,
"grad_norm": 0.6973395347595215,
"learning_rate": 9.931135384059737e-06,
"loss": 0.8542,
"step": 48
},
{
"epoch": 0.11961247997558928,
"grad_norm": 0.7911590933799744,
"learning_rate": 9.927902943542932e-06,
"loss": 0.8554,
"step": 49
},
{
"epoch": 0.12205355099549928,
"grad_norm": 0.6992570757865906,
"learning_rate": 9.924596919556917e-06,
"loss": 0.8706,
"step": 50
},
{
"epoch": 0.12449462201540926,
"grad_norm": 0.7577567100524902,
"learning_rate": 9.921217361467259e-06,
"loss": 0.856,
"step": 51
},
{
"epoch": 0.12693569303531924,
"grad_norm": 0.8022581934928894,
"learning_rate": 9.917764319737533e-06,
"loss": 0.8276,
"step": 52
},
{
"epoch": 0.12937676405522924,
"grad_norm": 0.720230758190155,
"learning_rate": 9.914237845928574e-06,
"loss": 0.8613,
"step": 53
},
{
"epoch": 0.13181783507513922,
"grad_norm": 0.7254828214645386,
"learning_rate": 9.910637992697707e-06,
"loss": 0.8617,
"step": 54
},
{
"epoch": 0.1342589060950492,
"grad_norm": 0.7254623174667358,
"learning_rate": 9.906964813797955e-06,
"loss": 0.8543,
"step": 55
},
{
"epoch": 0.13669997711495918,
"grad_norm": 0.7306321859359741,
"learning_rate": 9.903218364077242e-06,
"loss": 0.8332,
"step": 56
},
{
"epoch": 0.13914104813486916,
"grad_norm": 0.7202122211456299,
"learning_rate": 9.899398699477573e-06,
"loss": 0.8663,
"step": 57
},
{
"epoch": 0.14158211915477917,
"grad_norm": 0.7067145109176636,
"learning_rate": 9.895505877034198e-06,
"loss": 0.8165,
"step": 58
},
{
"epoch": 0.14402319017468915,
"grad_norm": 0.7376930713653564,
"learning_rate": 9.891539954874758e-06,
"loss": 0.8267,
"step": 59
},
{
"epoch": 0.14646426119459913,
"grad_norm": 0.7250686883926392,
"learning_rate": 9.887500992218421e-06,
"loss": 0.8239,
"step": 60
},
{
"epoch": 0.1489053322145091,
"grad_norm": 0.7254573106765747,
"learning_rate": 9.883389049374998e-06,
"loss": 0.8452,
"step": 61
},
{
"epoch": 0.1513464032344191,
"grad_norm": 0.7461521029472351,
"learning_rate": 9.879204187744036e-06,
"loss": 0.803,
"step": 62
},
{
"epoch": 0.1537874742543291,
"grad_norm": 0.7778986096382141,
"learning_rate": 9.874946469813907e-06,
"loss": 0.8287,
"step": 63
},
{
"epoch": 0.15622854527423907,
"grad_norm": 0.7395936846733093,
"learning_rate": 9.870615959160876e-06,
"loss": 0.8781,
"step": 64
},
{
"epoch": 0.15866961629414905,
"grad_norm": 0.7308329343795776,
"learning_rate": 9.866212720448149e-06,
"loss": 0.807,
"step": 65
},
{
"epoch": 0.16111068731405903,
"grad_norm": 0.7851212620735168,
"learning_rate": 9.861736819424904e-06,
"loss": 0.821,
"step": 66
},
{
"epoch": 0.16355175833396904,
"grad_norm": 0.7638505697250366,
"learning_rate": 9.857188322925317e-06,
"loss": 0.8273,
"step": 67
},
{
"epoch": 0.16599282935387902,
"grad_norm": 0.7750548720359802,
"learning_rate": 9.852567298867557e-06,
"loss": 0.8523,
"step": 68
},
{
"epoch": 0.168433900373789,
"grad_norm": 0.7466771602630615,
"learning_rate": 9.84787381625278e-06,
"loss": 0.8415,
"step": 69
},
{
"epoch": 0.17087497139369898,
"grad_norm": 0.6956301331520081,
"learning_rate": 9.843107945164086e-06,
"loss": 0.8206,
"step": 70
},
{
"epoch": 0.17331604241360898,
"grad_norm": 0.7392652630805969,
"learning_rate": 9.838269756765483e-06,
"loss": 0.8098,
"step": 71
},
{
"epoch": 0.17575711343351896,
"grad_norm": 0.7311574220657349,
"learning_rate": 9.833359323300827e-06,
"loss": 0.8116,
"step": 72
},
{
"epoch": 0.17819818445342894,
"grad_norm": 0.6983757615089417,
"learning_rate": 9.82837671809273e-06,
"loss": 0.8436,
"step": 73
},
{
"epoch": 0.18063925547333892,
"grad_norm": 0.7569893598556519,
"learning_rate": 9.823322015541474e-06,
"loss": 0.8058,
"step": 74
},
{
"epoch": 0.1830803264932489,
"grad_norm": 0.7439902424812317,
"learning_rate": 9.818195291123903e-06,
"loss": 0.8424,
"step": 75
},
{
"epoch": 0.1855213975131589,
"grad_norm": 0.7790477275848389,
"learning_rate": 9.81299662139229e-06,
"loss": 0.8483,
"step": 76
},
{
"epoch": 0.1879624685330689,
"grad_norm": 0.7717331051826477,
"learning_rate": 9.807726083973192e-06,
"loss": 0.8214,
"step": 77
},
{
"epoch": 0.19040353955297887,
"grad_norm": 0.7872374057769775,
"learning_rate": 9.8023837575663e-06,
"loss": 0.7938,
"step": 78
},
{
"epoch": 0.19284461057288885,
"grad_norm": 0.8018149137496948,
"learning_rate": 9.796969721943257e-06,
"loss": 0.802,
"step": 79
},
{
"epoch": 0.19528568159279885,
"grad_norm": 0.709600031375885,
"learning_rate": 9.791484057946465e-06,
"loss": 0.7944,
"step": 80
},
{
"epoch": 0.19772675261270883,
"grad_norm": 0.8216169476509094,
"learning_rate": 9.785926847487885e-06,
"loss": 0.8181,
"step": 81
},
{
"epoch": 0.2001678236326188,
"grad_norm": 0.7138919830322266,
"learning_rate": 9.780298173547811e-06,
"loss": 0.8043,
"step": 82
},
{
"epoch": 0.2026088946525288,
"grad_norm": 0.7637642621994019,
"learning_rate": 9.774598120173625e-06,
"loss": 0.8034,
"step": 83
},
{
"epoch": 0.20504996567243877,
"grad_norm": 0.7272418141365051,
"learning_rate": 9.76882677247855e-06,
"loss": 0.8271,
"step": 84
},
{
"epoch": 0.20749103669234878,
"grad_norm": 0.7340764999389648,
"learning_rate": 9.762984216640378e-06,
"loss": 0.8508,
"step": 85
},
{
"epoch": 0.20993210771225876,
"grad_norm": 0.7231638431549072,
"learning_rate": 9.75707053990018e-06,
"loss": 0.823,
"step": 86
},
{
"epoch": 0.21237317873216874,
"grad_norm": 0.7670260071754456,
"learning_rate": 9.751085830561e-06,
"loss": 0.8595,
"step": 87
},
{
"epoch": 0.21481424975207872,
"grad_norm": 0.7142215371131897,
"learning_rate": 9.74503017798655e-06,
"loss": 0.8325,
"step": 88
},
{
"epoch": 0.2172553207719887,
"grad_norm": 0.7884289026260376,
"learning_rate": 9.738903672599858e-06,
"loss": 0.7751,
"step": 89
},
{
"epoch": 0.2196963917918987,
"grad_norm": 0.7771654725074768,
"learning_rate": 9.732706405881931e-06,
"loss": 0.7827,
"step": 90
},
{
"epoch": 0.22213746281180868,
"grad_norm": 0.7293388247489929,
"learning_rate": 9.726438470370385e-06,
"loss": 0.8724,
"step": 91
},
{
"epoch": 0.22457853383171866,
"grad_norm": 0.7578020095825195,
"learning_rate": 9.720099959658062e-06,
"loss": 0.8277,
"step": 92
},
{
"epoch": 0.22701960485162864,
"grad_norm": 0.7896732091903687,
"learning_rate": 9.713690968391634e-06,
"loss": 0.7769,
"step": 93
},
{
"epoch": 0.22946067587153865,
"grad_norm": 0.6877868175506592,
"learning_rate": 9.707211592270183e-06,
"loss": 0.7938,
"step": 94
},
{
"epoch": 0.23190174689144863,
"grad_norm": 0.8047687411308289,
"learning_rate": 9.700661928043787e-06,
"loss": 0.7735,
"step": 95
},
{
"epoch": 0.2343428179113586,
"grad_norm": 0.7561459541320801,
"learning_rate": 9.69404207351206e-06,
"loss": 0.8079,
"step": 96
},
{
"epoch": 0.2367838889312686,
"grad_norm": 0.7163955569267273,
"learning_rate": 9.687352127522703e-06,
"loss": 0.8042,
"step": 97
},
{
"epoch": 0.23922495995117857,
"grad_norm": 0.7289466857910156,
"learning_rate": 9.680592189970015e-06,
"loss": 0.8449,
"step": 98
},
{
"epoch": 0.24166603097108857,
"grad_norm": 0.6951574087142944,
"learning_rate": 9.673762361793418e-06,
"loss": 0.7988,
"step": 99
},
{
"epoch": 0.24410710199099855,
"grad_norm": 0.7552266716957092,
"learning_rate": 9.666862744975938e-06,
"loss": 0.8323,
"step": 100
},
{
"epoch": 0.24654817301090853,
"grad_norm": 0.7086972594261169,
"learning_rate": 9.659893442542683e-06,
"loss": 0.8567,
"step": 101
},
{
"epoch": 0.2489892440308185,
"grad_norm": 0.7231544852256775,
"learning_rate": 9.652854558559309e-06,
"loss": 0.8265,
"step": 102
},
{
"epoch": 0.2514303150507285,
"grad_norm": 0.7094722986221313,
"learning_rate": 9.645746198130462e-06,
"loss": 0.7803,
"step": 103
},
{
"epoch": 0.25387138607063847,
"grad_norm": 0.6969436407089233,
"learning_rate": 9.638568467398215e-06,
"loss": 0.804,
"step": 104
},
{
"epoch": 0.25631245709054845,
"grad_norm": 0.7204388380050659,
"learning_rate": 9.631321473540476e-06,
"loss": 0.787,
"step": 105
},
{
"epoch": 0.2587535281104585,
"grad_norm": 0.6980841159820557,
"learning_rate": 9.62400532476939e-06,
"loss": 0.8294,
"step": 106
},
{
"epoch": 0.26119459913036847,
"grad_norm": 0.6793758273124695,
"learning_rate": 9.61662013032972e-06,
"loss": 0.7739,
"step": 107
},
{
"epoch": 0.26363567015027844,
"grad_norm": 0.7096854448318481,
"learning_rate": 9.60916600049723e-06,
"loss": 0.8035,
"step": 108
},
{
"epoch": 0.2660767411701884,
"grad_norm": 0.6875160932540894,
"learning_rate": 9.601643046577014e-06,
"loss": 0.8567,
"step": 109
},
{
"epoch": 0.2685178121900984,
"grad_norm": 0.7122709155082703,
"learning_rate": 9.59405138090186e-06,
"loss": 0.8153,
"step": 110
},
{
"epoch": 0.2709588832100084,
"grad_norm": 0.695655882358551,
"learning_rate": 9.586391116830549e-06,
"loss": 0.7813,
"step": 111
},
{
"epoch": 0.27339995422991836,
"grad_norm": 0.674659788608551,
"learning_rate": 9.578662368746183e-06,
"loss": 0.8802,
"step": 112
},
{
"epoch": 0.27584102524982834,
"grad_norm": 0.7121911644935608,
"learning_rate": 9.570865252054462e-06,
"loss": 0.8017,
"step": 113
},
{
"epoch": 0.2782820962697383,
"grad_norm": 0.7068195939064026,
"learning_rate": 9.562999883181968e-06,
"loss": 0.7817,
"step": 114
},
{
"epoch": 0.28072316728964836,
"grad_norm": 0.6847429275512695,
"learning_rate": 9.555066379574423e-06,
"loss": 0.801,
"step": 115
},
{
"epoch": 0.28316423830955834,
"grad_norm": 0.743248462677002,
"learning_rate": 9.547064859694943e-06,
"loss": 0.7978,
"step": 116
},
{
"epoch": 0.2856053093294683,
"grad_norm": 0.7640885710716248,
"learning_rate": 9.538995443022256e-06,
"loss": 0.7913,
"step": 117
},
{
"epoch": 0.2880463803493783,
"grad_norm": 0.7139798402786255,
"learning_rate": 9.530858250048933e-06,
"loss": 0.7994,
"step": 118
},
{
"epoch": 0.2904874513692883,
"grad_norm": 0.7640753388404846,
"learning_rate": 9.52265340227957e-06,
"loss": 0.7946,
"step": 119
},
{
"epoch": 0.29292852238919825,
"grad_norm": 0.7454321980476379,
"learning_rate": 9.514381022228997e-06,
"loss": 0.809,
"step": 120
},
{
"epoch": 0.29536959340910823,
"grad_norm": 0.6853974461555481,
"learning_rate": 9.506041233420427e-06,
"loss": 0.8013,
"step": 121
},
{
"epoch": 0.2978106644290182,
"grad_norm": 0.723430335521698,
"learning_rate": 9.497634160383627e-06,
"loss": 0.7923,
"step": 122
},
{
"epoch": 0.3002517354489282,
"grad_norm": 0.7062557935714722,
"learning_rate": 9.489159928653047e-06,
"loss": 0.7702,
"step": 123
},
{
"epoch": 0.3026928064688382,
"grad_norm": 0.6789696216583252,
"learning_rate": 9.480618664765956e-06,
"loss": 0.7748,
"step": 124
},
{
"epoch": 0.3051338774887482,
"grad_norm": 0.7581243515014648,
"learning_rate": 9.472010496260545e-06,
"loss": 0.771,
"step": 125
},
{
"epoch": 0.3075749485086582,
"grad_norm": 0.7822269201278687,
"learning_rate": 9.463335551674024e-06,
"loss": 0.8,
"step": 126
},
{
"epoch": 0.31001601952856817,
"grad_norm": 0.7157217264175415,
"learning_rate": 9.454593960540709e-06,
"loss": 0.7883,
"step": 127
},
{
"epoch": 0.31245709054847814,
"grad_norm": 0.7614567875862122,
"learning_rate": 9.445785853390074e-06,
"loss": 0.7929,
"step": 128
},
{
"epoch": 0.3148981615683881,
"grad_norm": 0.7470414042472839,
"learning_rate": 9.436911361744817e-06,
"loss": 0.7826,
"step": 129
},
{
"epoch": 0.3173392325882981,
"grad_norm": 0.7033482193946838,
"learning_rate": 9.427970618118888e-06,
"loss": 0.8359,
"step": 130
},
{
"epoch": 0.3197803036082081,
"grad_norm": 0.7030816674232483,
"learning_rate": 9.418963756015511e-06,
"loss": 0.7966,
"step": 131
},
{
"epoch": 0.32222137462811806,
"grad_norm": 0.7050835490226746,
"learning_rate": 9.409890909925191e-06,
"loss": 0.7852,
"step": 132
},
{
"epoch": 0.3246624456480281,
"grad_norm": 0.7047673463821411,
"learning_rate": 9.400752215323712e-06,
"loss": 0.8134,
"step": 133
},
{
"epoch": 0.3271035166679381,
"grad_norm": 0.6739450693130493,
"learning_rate": 9.391547808670097e-06,
"loss": 0.8186,
"step": 134
},
{
"epoch": 0.32954458768784806,
"grad_norm": 0.7166461944580078,
"learning_rate": 9.38227782740459e-06,
"loss": 0.8118,
"step": 135
},
{
"epoch": 0.33198565870775804,
"grad_norm": 0.6905531287193298,
"learning_rate": 9.372942409946597e-06,
"loss": 0.8092,
"step": 136
},
{
"epoch": 0.334426729727668,
"grad_norm": 0.7552813291549683,
"learning_rate": 9.36354169569261e-06,
"loss": 0.7405,
"step": 137
},
{
"epoch": 0.336867800747578,
"grad_norm": 0.6745990514755249,
"learning_rate": 9.35407582501414e-06,
"loss": 0.8397,
"step": 138
},
{
"epoch": 0.339308871767488,
"grad_norm": 0.7749987840652466,
"learning_rate": 9.344544939255608e-06,
"loss": 0.7979,
"step": 139
},
{
"epoch": 0.34174994278739795,
"grad_norm": 0.7859154939651489,
"learning_rate": 9.334949180732245e-06,
"loss": 0.8217,
"step": 140
},
{
"epoch": 0.34419101380730793,
"grad_norm": 0.7111227512359619,
"learning_rate": 9.325288692727963e-06,
"loss": 0.7692,
"step": 141
},
{
"epoch": 0.34663208482721797,
"grad_norm": 0.824995219707489,
"learning_rate": 9.315563619493209e-06,
"loss": 0.7989,
"step": 142
},
{
"epoch": 0.34907315584712795,
"grad_norm": 0.7707095742225647,
"learning_rate": 9.305774106242825e-06,
"loss": 0.8115,
"step": 143
},
{
"epoch": 0.3515142268670379,
"grad_norm": 0.7036089301109314,
"learning_rate": 9.295920299153863e-06,
"loss": 0.8119,
"step": 144
},
{
"epoch": 0.3539552978869479,
"grad_norm": 0.7585278153419495,
"learning_rate": 9.286002345363418e-06,
"loss": 0.7853,
"step": 145
},
{
"epoch": 0.3563963689068579,
"grad_norm": 0.7351112961769104,
"learning_rate": 9.276020392966423e-06,
"loss": 0.7974,
"step": 146
},
{
"epoch": 0.35883743992676786,
"grad_norm": 0.7286148071289062,
"learning_rate": 9.265974591013434e-06,
"loss": 0.8044,
"step": 147
},
{
"epoch": 0.36127851094667784,
"grad_norm": 0.6930050253868103,
"learning_rate": 9.25586508950841e-06,
"loss": 0.8117,
"step": 148
},
{
"epoch": 0.3637195819665878,
"grad_norm": 0.8765610456466675,
"learning_rate": 9.24569203940648e-06,
"loss": 0.7551,
"step": 149
},
{
"epoch": 0.3661606529864978,
"grad_norm": 0.7214458584785461,
"learning_rate": 9.235455592611667e-06,
"loss": 0.7984,
"step": 150
},
{
"epoch": 0.36860172400640784,
"grad_norm": 0.7065439820289612,
"learning_rate": 9.225155901974645e-06,
"loss": 0.8106,
"step": 151
},
{
"epoch": 0.3710427950263178,
"grad_norm": 0.7775700092315674,
"learning_rate": 9.214793121290442e-06,
"loss": 0.8211,
"step": 152
},
{
"epoch": 0.3734838660462278,
"grad_norm": 0.7118616700172424,
"learning_rate": 9.204367405296144e-06,
"loss": 0.82,
"step": 153
},
{
"epoch": 0.3759249370661378,
"grad_norm": 0.7476733326911926,
"learning_rate": 9.193878909668591e-06,
"loss": 0.7584,
"step": 154
},
{
"epoch": 0.37836600808604776,
"grad_norm": 0.7488994002342224,
"learning_rate": 9.183327791022048e-06,
"loss": 0.7552,
"step": 155
},
{
"epoch": 0.38080707910595774,
"grad_norm": 0.7086935043334961,
"learning_rate": 9.172714206905866e-06,
"loss": 0.7993,
"step": 156
},
{
"epoch": 0.3832481501258677,
"grad_norm": 0.7513390183448792,
"learning_rate": 9.162038315802132e-06,
"loss": 0.7684,
"step": 157
},
{
"epoch": 0.3856892211457777,
"grad_norm": 0.6983102560043335,
"learning_rate": 9.1513002771233e-06,
"loss": 0.7904,
"step": 158
},
{
"epoch": 0.3881302921656877,
"grad_norm": 0.6591006517410278,
"learning_rate": 9.140500251209813e-06,
"loss": 0.7357,
"step": 159
},
{
"epoch": 0.3905713631855977,
"grad_norm": 0.7491998672485352,
"learning_rate": 9.129638399327707e-06,
"loss": 0.7964,
"step": 160
},
{
"epoch": 0.3930124342055077,
"grad_norm": 0.7312127947807312,
"learning_rate": 9.118714883666204e-06,
"loss": 0.7706,
"step": 161
},
{
"epoch": 0.39545350522541767,
"grad_norm": 0.7120770215988159,
"learning_rate": 9.107729867335287e-06,
"loss": 0.8367,
"step": 162
},
{
"epoch": 0.39789457624532765,
"grad_norm": 0.735023021697998,
"learning_rate": 9.096683514363275e-06,
"loss": 0.7832,
"step": 163
},
{
"epoch": 0.4003356472652376,
"grad_norm": 0.7334295511245728,
"learning_rate": 9.085575989694358e-06,
"loss": 0.7977,
"step": 164
},
{
"epoch": 0.4027767182851476,
"grad_norm": 0.7482827305793762,
"learning_rate": 9.074407459186144e-06,
"loss": 0.868,
"step": 165
},
{
"epoch": 0.4052177893050576,
"grad_norm": 0.7395485043525696,
"learning_rate": 9.063178089607183e-06,
"loss": 0.7676,
"step": 166
},
{
"epoch": 0.40765886032496756,
"grad_norm": 0.6970906257629395,
"learning_rate": 9.051888048634471e-06,
"loss": 0.762,
"step": 167
},
{
"epoch": 0.41009993134487754,
"grad_norm": 0.7200821042060852,
"learning_rate": 9.040537504850954e-06,
"loss": 0.8067,
"step": 168
},
{
"epoch": 0.4125410023647875,
"grad_norm": 0.7742771506309509,
"learning_rate": 9.029126627743003e-06,
"loss": 0.7767,
"step": 169
},
{
"epoch": 0.41498207338469756,
"grad_norm": 0.7340243458747864,
"learning_rate": 9.017655587697885e-06,
"loss": 0.7816,
"step": 170
},
{
"epoch": 0.41742314440460754,
"grad_norm": 0.7570080161094666,
"learning_rate": 9.006124556001223e-06,
"loss": 0.8374,
"step": 171
},
{
"epoch": 0.4198642154245175,
"grad_norm": 0.7807502150535583,
"learning_rate": 8.994533704834435e-06,
"loss": 0.7749,
"step": 172
},
{
"epoch": 0.4223052864444275,
"grad_norm": 0.7137355208396912,
"learning_rate": 8.982883207272164e-06,
"loss": 0.7397,
"step": 173
},
{
"epoch": 0.4247463574643375,
"grad_norm": 0.7511448860168457,
"learning_rate": 8.971173237279693e-06,
"loss": 0.8006,
"step": 174
},
{
"epoch": 0.42718742848424746,
"grad_norm": 0.7791663408279419,
"learning_rate": 8.959403969710346e-06,
"loss": 0.7684,
"step": 175
},
{
"epoch": 0.42962849950415744,
"grad_norm": 0.7711341381072998,
"learning_rate": 8.947575580302879e-06,
"loss": 0.7905,
"step": 176
},
{
"epoch": 0.4320695705240674,
"grad_norm": 0.7793801426887512,
"learning_rate": 8.935688245678859e-06,
"loss": 0.8121,
"step": 177
},
{
"epoch": 0.4345106415439774,
"grad_norm": 0.7082055807113647,
"learning_rate": 8.92374214334002e-06,
"loss": 0.7657,
"step": 178
},
{
"epoch": 0.43695171256388743,
"grad_norm": 0.735462486743927,
"learning_rate": 8.911737451665616e-06,
"loss": 0.7833,
"step": 179
},
{
"epoch": 0.4393927835837974,
"grad_norm": 0.7432037591934204,
"learning_rate": 8.899674349909759e-06,
"loss": 0.7645,
"step": 180
},
{
"epoch": 0.4418338546037074,
"grad_norm": 0.7552315592765808,
"learning_rate": 8.887553018198738e-06,
"loss": 0.8018,
"step": 181
},
{
"epoch": 0.44427492562361737,
"grad_norm": 0.677143931388855,
"learning_rate": 8.875373637528336e-06,
"loss": 0.8029,
"step": 182
},
{
"epoch": 0.44671599664352735,
"grad_norm": 0.7790682911872864,
"learning_rate": 8.863136389761115e-06,
"loss": 0.792,
"step": 183
},
{
"epoch": 0.4491570676634373,
"grad_norm": 0.735373854637146,
"learning_rate": 8.85084145762372e-06,
"loss": 0.78,
"step": 184
},
{
"epoch": 0.4515981386833473,
"grad_norm": 0.7221420407295227,
"learning_rate": 8.838489024704131e-06,
"loss": 0.807,
"step": 185
},
{
"epoch": 0.4540392097032573,
"grad_norm": 0.7021591067314148,
"learning_rate": 8.826079275448934e-06,
"loss": 0.7828,
"step": 186
},
{
"epoch": 0.45648028072316726,
"grad_norm": 0.7104141712188721,
"learning_rate": 8.81361239516056e-06,
"loss": 0.8051,
"step": 187
},
{
"epoch": 0.4589213517430773,
"grad_norm": 0.749536395072937,
"learning_rate": 8.801088569994523e-06,
"loss": 0.7811,
"step": 188
},
{
"epoch": 0.4613624227629873,
"grad_norm": 0.7570759654045105,
"learning_rate": 8.788507986956639e-06,
"loss": 0.8015,
"step": 189
},
{
"epoch": 0.46380349378289726,
"grad_norm": 0.6997769474983215,
"learning_rate": 8.775870833900226e-06,
"loss": 0.7816,
"step": 190
},
{
"epoch": 0.46624456480280724,
"grad_norm": 0.6764109134674072,
"learning_rate": 8.763177299523318e-06,
"loss": 0.7577,
"step": 191
},
{
"epoch": 0.4686856358227172,
"grad_norm": 0.7811216115951538,
"learning_rate": 8.750427573365825e-06,
"loss": 0.7324,
"step": 192
},
{
"epoch": 0.4711267068426272,
"grad_norm": 0.7098534107208252,
"learning_rate": 8.737621845806715e-06,
"loss": 0.7321,
"step": 193
},
{
"epoch": 0.4735677778625372,
"grad_norm": 0.7705920934677124,
"learning_rate": 8.724760308061172e-06,
"loss": 0.7501,
"step": 194
},
{
"epoch": 0.47600884888244716,
"grad_norm": 0.7170778512954712,
"learning_rate": 8.711843152177735e-06,
"loss": 0.767,
"step": 195
},
{
"epoch": 0.47844991990235713,
"grad_norm": 0.7175964713096619,
"learning_rate": 8.698870571035436e-06,
"loss": 0.7592,
"step": 196
},
{
"epoch": 0.48089099092226717,
"grad_norm": 0.7901434898376465,
"learning_rate": 8.685842758340912e-06,
"loss": 0.7921,
"step": 197
},
{
"epoch": 0.48333206194217715,
"grad_norm": 0.7608402371406555,
"learning_rate": 8.672759908625528e-06,
"loss": 0.8617,
"step": 198
},
{
"epoch": 0.48577313296208713,
"grad_norm": 0.7593024373054504,
"learning_rate": 8.65962221724245e-06,
"loss": 0.7674,
"step": 199
},
{
"epoch": 0.4882142039819971,
"grad_norm": 0.7110275626182556,
"learning_rate": 8.646429880363746e-06,
"loss": 0.7521,
"step": 200
},
{
"epoch": 0.4906552750019071,
"grad_norm": 0.7535459399223328,
"learning_rate": 8.633183094977453e-06,
"loss": 0.7296,
"step": 201
},
{
"epoch": 0.49309634602181707,
"grad_norm": 0.7531000971794128,
"learning_rate": 8.61988205888463e-06,
"loss": 0.7863,
"step": 202
},
{
"epoch": 0.49553741704172705,
"grad_norm": 0.7889319658279419,
"learning_rate": 8.60652697069641e-06,
"loss": 0.7784,
"step": 203
},
{
"epoch": 0.497978488061637,
"grad_norm": 0.6903645396232605,
"learning_rate": 8.593118029831025e-06,
"loss": 0.7954,
"step": 204
},
{
"epoch": 0.500419559081547,
"grad_norm": 0.7375295758247375,
"learning_rate": 8.579655436510847e-06,
"loss": 0.7764,
"step": 205
},
{
"epoch": 0.502860630101457,
"grad_norm": 0.7218457460403442,
"learning_rate": 8.566139391759378e-06,
"loss": 0.7852,
"step": 206
},
{
"epoch": 0.505301701121367,
"grad_norm": 0.7074956297874451,
"learning_rate": 8.552570097398262e-06,
"loss": 0.7824,
"step": 207
},
{
"epoch": 0.5077427721412769,
"grad_norm": 0.6844367384910583,
"learning_rate": 8.53894775604426e-06,
"loss": 0.8005,
"step": 208
},
{
"epoch": 0.5101838431611869,
"grad_norm": 0.7443989515304565,
"learning_rate": 8.525272571106242e-06,
"loss": 0.7761,
"step": 209
},
{
"epoch": 0.5126249141810969,
"grad_norm": 0.7639645338058472,
"learning_rate": 8.511544746782124e-06,
"loss": 0.8032,
"step": 210
},
{
"epoch": 0.515065985201007,
"grad_norm": 0.699748158454895,
"learning_rate": 8.497764488055848e-06,
"loss": 0.7801,
"step": 211
},
{
"epoch": 0.517507056220917,
"grad_norm": 0.7058794498443604,
"learning_rate": 8.483932000694295e-06,
"loss": 0.7693,
"step": 212
},
{
"epoch": 0.519948127240827,
"grad_norm": 0.7830145359039307,
"learning_rate": 8.470047491244232e-06,
"loss": 0.7684,
"step": 213
},
{
"epoch": 0.5223891982607369,
"grad_norm": 0.6766949892044067,
"learning_rate": 8.456111167029219e-06,
"loss": 0.8214,
"step": 214
},
{
"epoch": 0.5248302692806469,
"grad_norm": 0.7066507339477539,
"learning_rate": 8.442123236146509e-06,
"loss": 0.7639,
"step": 215
},
{
"epoch": 0.5272713403005569,
"grad_norm": 0.7286085486412048,
"learning_rate": 8.42808390746395e-06,
"loss": 0.7723,
"step": 216
},
{
"epoch": 0.5297124113204669,
"grad_norm": 0.7587203979492188,
"learning_rate": 8.413993390616865e-06,
"loss": 0.8034,
"step": 217
},
{
"epoch": 0.5321534823403768,
"grad_norm": 0.6527595520019531,
"learning_rate": 8.399851896004914e-06,
"loss": 0.7587,
"step": 218
},
{
"epoch": 0.5345945533602868,
"grad_norm": 0.8271955251693726,
"learning_rate": 8.385659634788959e-06,
"loss": 0.7846,
"step": 219
},
{
"epoch": 0.5370356243801968,
"grad_norm": 0.7351842522621155,
"learning_rate": 8.371416818887907e-06,
"loss": 0.8002,
"step": 220
},
{
"epoch": 0.5394766954001068,
"grad_norm": 0.7915340065956116,
"learning_rate": 8.357123660975553e-06,
"loss": 0.7511,
"step": 221
},
{
"epoch": 0.5419177664200168,
"grad_norm": 0.6955085396766663,
"learning_rate": 8.342780374477396e-06,
"loss": 0.7754,
"step": 222
},
{
"epoch": 0.5443588374399267,
"grad_norm": 0.7368732690811157,
"learning_rate": 8.328387173567453e-06,
"loss": 0.7775,
"step": 223
},
{
"epoch": 0.5467999084598367,
"grad_norm": 0.6908881068229675,
"learning_rate": 8.313944273165068e-06,
"loss": 0.7571,
"step": 224
},
{
"epoch": 0.5492409794797467,
"grad_norm": 0.700554370880127,
"learning_rate": 8.299451888931696e-06,
"loss": 0.7714,
"step": 225
},
{
"epoch": 0.5516820504996567,
"grad_norm": 0.7163404822349548,
"learning_rate": 8.284910237267681e-06,
"loss": 0.7767,
"step": 226
},
{
"epoch": 0.5541231215195667,
"grad_norm": 0.7443628311157227,
"learning_rate": 8.270319535309035e-06,
"loss": 0.7709,
"step": 227
},
{
"epoch": 0.5565641925394766,
"grad_norm": 0.691213071346283,
"learning_rate": 8.255680000924184e-06,
"loss": 0.7997,
"step": 228
},
{
"epoch": 0.5590052635593867,
"grad_norm": 0.7387362718582153,
"learning_rate": 8.240991852710724e-06,
"loss": 0.7502,
"step": 229
},
{
"epoch": 0.5614463345792967,
"grad_norm": 0.7051777243614197,
"learning_rate": 8.22625530999215e-06,
"loss": 0.7811,
"step": 230
},
{
"epoch": 0.5638874055992067,
"grad_norm": 0.6610181331634521,
"learning_rate": 8.211470592814586e-06,
"loss": 0.7884,
"step": 231
},
{
"epoch": 0.5663284766191167,
"grad_norm": 0.6575087904930115,
"learning_rate": 8.196637921943496e-06,
"loss": 0.7797,
"step": 232
},
{
"epoch": 0.5687695476390267,
"grad_norm": 0.7363637685775757,
"learning_rate": 8.181757518860387e-06,
"loss": 0.7369,
"step": 233
},
{
"epoch": 0.5712106186589366,
"grad_norm": 0.6955734491348267,
"learning_rate": 8.166829605759507e-06,
"loss": 0.7841,
"step": 234
},
{
"epoch": 0.5736516896788466,
"grad_norm": 0.7019768357276917,
"learning_rate": 8.151854405544526e-06,
"loss": 0.7702,
"step": 235
},
{
"epoch": 0.5760927606987566,
"grad_norm": 0.7041372656822205,
"learning_rate": 8.136832141825197e-06,
"loss": 0.7755,
"step": 236
},
{
"epoch": 0.5785338317186666,
"grad_norm": 0.7138186693191528,
"learning_rate": 8.12176303891403e-06,
"loss": 0.7815,
"step": 237
},
{
"epoch": 0.5809749027385765,
"grad_norm": 0.6682398319244385,
"learning_rate": 8.106647321822943e-06,
"loss": 0.7573,
"step": 238
},
{
"epoch": 0.5834159737584865,
"grad_norm": 0.6600127816200256,
"learning_rate": 8.091485216259886e-06,
"loss": 0.7644,
"step": 239
},
{
"epoch": 0.5858570447783965,
"grad_norm": 0.7157433032989502,
"learning_rate": 8.076276948625495e-06,
"loss": 0.7661,
"step": 240
},
{
"epoch": 0.5882981157983065,
"grad_norm": 0.7180731892585754,
"learning_rate": 8.061022746009687e-06,
"loss": 0.756,
"step": 241
},
{
"epoch": 0.5907391868182165,
"grad_norm": 0.6941264271736145,
"learning_rate": 8.04572283618829e-06,
"loss": 0.7501,
"step": 242
},
{
"epoch": 0.5931802578381264,
"grad_norm": 0.7047881484031677,
"learning_rate": 8.030377447619622e-06,
"loss": 0.7564,
"step": 243
},
{
"epoch": 0.5956213288580364,
"grad_norm": 0.6860742568969727,
"learning_rate": 8.014986809441093e-06,
"loss": 0.8048,
"step": 244
},
{
"epoch": 0.5980623998779464,
"grad_norm": 0.6961259245872498,
"learning_rate": 7.999551151465793e-06,
"loss": 0.8085,
"step": 245
},
{
"epoch": 0.6005034708978564,
"grad_norm": 0.6859815716743469,
"learning_rate": 7.984070704179026e-06,
"loss": 0.7911,
"step": 246
},
{
"epoch": 0.6029445419177665,
"grad_norm": 0.739782452583313,
"learning_rate": 7.968545698734908e-06,
"loss": 0.7981,
"step": 247
},
{
"epoch": 0.6053856129376765,
"grad_norm": 0.7173625230789185,
"learning_rate": 7.952976366952888e-06,
"loss": 0.7738,
"step": 248
},
{
"epoch": 0.6078266839575864,
"grad_norm": 0.7094762921333313,
"learning_rate": 7.9373629413143e-06,
"loss": 0.7802,
"step": 249
},
{
"epoch": 0.6102677549774964,
"grad_norm": 0.6974766254425049,
"learning_rate": 7.921705654958886e-06,
"loss": 0.7956,
"step": 250
},
{
"epoch": 0.6127088259974064,
"grad_norm": 0.7235715389251709,
"learning_rate": 7.906004741681321e-06,
"loss": 0.7581,
"step": 251
},
{
"epoch": 0.6151498970173164,
"grad_norm": 0.68167644739151,
"learning_rate": 7.890260435927709e-06,
"loss": 0.7746,
"step": 252
},
{
"epoch": 0.6175909680372264,
"grad_norm": 0.6965702176094055,
"learning_rate": 7.874472972792097e-06,
"loss": 0.7638,
"step": 253
},
{
"epoch": 0.6200320390571363,
"grad_norm": 0.7000617384910583,
"learning_rate": 7.858642588012957e-06,
"loss": 0.7252,
"step": 254
},
{
"epoch": 0.6224731100770463,
"grad_norm": 0.6918544173240662,
"learning_rate": 7.842769517969665e-06,
"loss": 0.7867,
"step": 255
},
{
"epoch": 0.6249141810969563,
"grad_norm": 0.7168439626693726,
"learning_rate": 7.826853999678978e-06,
"loss": 0.7349,
"step": 256
},
{
"epoch": 0.6273552521168663,
"grad_norm": 0.6823673844337463,
"learning_rate": 7.810896270791484e-06,
"loss": 0.7749,
"step": 257
},
{
"epoch": 0.6297963231367762,
"grad_norm": 0.7399064898490906,
"learning_rate": 7.794896569588066e-06,
"loss": 0.7886,
"step": 258
},
{
"epoch": 0.6322373941566862,
"grad_norm": 0.6988884806632996,
"learning_rate": 7.778855134976334e-06,
"loss": 0.7329,
"step": 259
},
{
"epoch": 0.6346784651765962,
"grad_norm": 0.7028211951255798,
"learning_rate": 7.762772206487066e-06,
"loss": 0.8214,
"step": 260
},
{
"epoch": 0.6371195361965062,
"grad_norm": 0.7346593737602234,
"learning_rate": 7.74664802427062e-06,
"loss": 0.7626,
"step": 261
},
{
"epoch": 0.6395606072164162,
"grad_norm": 0.7249420881271362,
"learning_rate": 7.73048282909336e-06,
"loss": 0.7617,
"step": 262
},
{
"epoch": 0.6420016782363261,
"grad_norm": 0.7126630544662476,
"learning_rate": 7.714276862334051e-06,
"loss": 0.7599,
"step": 263
},
{
"epoch": 0.6444427492562361,
"grad_norm": 0.7718750238418579,
"learning_rate": 7.698030365980265e-06,
"loss": 0.8228,
"step": 264
},
{
"epoch": 0.6468838202761461,
"grad_norm": 0.7375472187995911,
"learning_rate": 7.681743582624761e-06,
"loss": 0.7458,
"step": 265
},
{
"epoch": 0.6493248912960562,
"grad_norm": 0.7044516205787659,
"learning_rate": 7.66541675546186e-06,
"loss": 0.779,
"step": 266
},
{
"epoch": 0.6517659623159662,
"grad_norm": 0.7249746322631836,
"learning_rate": 7.64905012828382e-06,
"loss": 0.7983,
"step": 267
},
{
"epoch": 0.6542070333358762,
"grad_norm": 0.7117093801498413,
"learning_rate": 7.632643945477195e-06,
"loss": 0.7436,
"step": 268
},
{
"epoch": 0.6566481043557861,
"grad_norm": 0.7090557217597961,
"learning_rate": 7.616198452019176e-06,
"loss": 0.7563,
"step": 269
},
{
"epoch": 0.6590891753756961,
"grad_norm": 0.720168948173523,
"learning_rate": 7.59971389347395e-06,
"loss": 0.7487,
"step": 270
},
{
"epoch": 0.6615302463956061,
"grad_norm": 0.6775338053703308,
"learning_rate": 7.583190515989022e-06,
"loss": 0.7708,
"step": 271
},
{
"epoch": 0.6639713174155161,
"grad_norm": 0.711544394493103,
"learning_rate": 7.566628566291537e-06,
"loss": 0.7732,
"step": 272
},
{
"epoch": 0.666412388435426,
"grad_norm": 0.7197690606117249,
"learning_rate": 7.550028291684603e-06,
"loss": 0.7681,
"step": 273
},
{
"epoch": 0.668853459455336,
"grad_norm": 0.7002537250518799,
"learning_rate": 7.5333899400435986e-06,
"loss": 0.7414,
"step": 274
},
{
"epoch": 0.671294530475246,
"grad_norm": 0.7534189820289612,
"learning_rate": 7.516713759812465e-06,
"loss": 0.7756,
"step": 275
},
{
"epoch": 0.673735601495156,
"grad_norm": 0.6993451714515686,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7494,
"step": 276
},
{
"epoch": 0.676176672515066,
"grad_norm": 0.7160854339599609,
"learning_rate": 7.483248910176144e-06,
"loss": 0.7727,
"step": 277
},
{
"epoch": 0.678617743534976,
"grad_norm": 0.7340339422225952,
"learning_rate": 7.466460740468246e-06,
"loss": 0.7641,
"step": 278
},
{
"epoch": 0.6810588145548859,
"grad_norm": 0.6893506050109863,
"learning_rate": 7.44963574155733e-06,
"loss": 0.7859,
"step": 279
},
{
"epoch": 0.6834998855747959,
"grad_norm": 0.7197214365005493,
"learning_rate": 7.432774164674359e-06,
"loss": 0.7645,
"step": 280
},
{
"epoch": 0.6859409565947059,
"grad_norm": 0.722647488117218,
"learning_rate": 7.4158762615964744e-06,
"loss": 0.7614,
"step": 281
},
{
"epoch": 0.6883820276146159,
"grad_norm": 0.7112955451011658,
"learning_rate": 7.398942284643242e-06,
"loss": 0.7565,
"step": 282
},
{
"epoch": 0.6908230986345258,
"grad_norm": 0.7392273545265198,
"learning_rate": 7.381972486672886e-06,
"loss": 0.7474,
"step": 283
},
{
"epoch": 0.6932641696544359,
"grad_norm": 0.691473126411438,
"learning_rate": 7.3649671210785024e-06,
"loss": 0.7392,
"step": 284
},
{
"epoch": 0.6957052406743459,
"grad_norm": 0.764784038066864,
"learning_rate": 7.34792644178429e-06,
"loss": 0.7341,
"step": 285
},
{
"epoch": 0.6981463116942559,
"grad_norm": 0.7281628251075745,
"learning_rate": 7.330850703241751e-06,
"loss": 0.7804,
"step": 286
},
{
"epoch": 0.7005873827141659,
"grad_norm": 0.843350887298584,
"learning_rate": 7.313740160425887e-06,
"loss": 0.7085,
"step": 287
},
{
"epoch": 0.7030284537340759,
"grad_norm": 0.7312772870063782,
"learning_rate": 7.296595068831406e-06,
"loss": 0.7638,
"step": 288
},
{
"epoch": 0.7054695247539858,
"grad_norm": 0.7479636073112488,
"learning_rate": 7.279415684468893e-06,
"loss": 0.7208,
"step": 289
},
{
"epoch": 0.7079105957738958,
"grad_norm": 0.6945940256118774,
"learning_rate": 7.262202263860989e-06,
"loss": 0.7133,
"step": 290
},
{
"epoch": 0.7103516667938058,
"grad_norm": 0.697964608669281,
"learning_rate": 7.244955064038574e-06,
"loss": 0.7478,
"step": 291
},
{
"epoch": 0.7127927378137158,
"grad_norm": 0.7676611542701721,
"learning_rate": 7.227674342536914e-06,
"loss": 0.7778,
"step": 292
},
{
"epoch": 0.7152338088336258,
"grad_norm": 0.6927245259284973,
"learning_rate": 7.210360357391818e-06,
"loss": 0.7041,
"step": 293
},
{
"epoch": 0.7176748798535357,
"grad_norm": 0.7169522643089294,
"learning_rate": 7.1930133671357915e-06,
"loss": 0.7493,
"step": 294
},
{
"epoch": 0.7201159508734457,
"grad_norm": 0.6862355470657349,
"learning_rate": 7.175633630794176e-06,
"loss": 0.7547,
"step": 295
},
{
"epoch": 0.7225570218933557,
"grad_norm": 0.7003543376922607,
"learning_rate": 7.1582214078812715e-06,
"loss": 0.7677,
"step": 296
},
{
"epoch": 0.7249980929132657,
"grad_norm": 0.6878992915153503,
"learning_rate": 7.140776958396468e-06,
"loss": 0.7663,
"step": 297
},
{
"epoch": 0.7274391639331756,
"grad_norm": 0.6947048306465149,
"learning_rate": 7.123300542820367e-06,
"loss": 0.7514,
"step": 298
},
{
"epoch": 0.7298802349530856,
"grad_norm": 0.6599522829055786,
"learning_rate": 7.1057924221108856e-06,
"loss": 0.7363,
"step": 299
},
{
"epoch": 0.7323213059729956,
"grad_norm": 0.6951528787612915,
"learning_rate": 7.08825285769936e-06,
"loss": 0.7247,
"step": 300
},
{
"epoch": 0.7347623769929056,
"grad_norm": 0.7057417035102844,
"learning_rate": 7.0706821114866475e-06,
"loss": 0.7829,
"step": 301
},
{
"epoch": 0.7372034480128157,
"grad_norm": 0.7661730647087097,
"learning_rate": 7.053080445839211e-06,
"loss": 0.7233,
"step": 302
},
{
"epoch": 0.7396445190327257,
"grad_norm": 0.6780954003334045,
"learning_rate": 7.035448123585201e-06,
"loss": 0.7549,
"step": 303
},
{
"epoch": 0.7420855900526356,
"grad_norm": 0.7154073715209961,
"learning_rate": 7.017785408010533e-06,
"loss": 0.7593,
"step": 304
},
{
"epoch": 0.7445266610725456,
"grad_norm": 0.72113436460495,
"learning_rate": 7.0000925628549595e-06,
"loss": 0.8079,
"step": 305
},
{
"epoch": 0.7469677320924556,
"grad_norm": 0.6903125643730164,
"learning_rate": 6.982369852308124e-06,
"loss": 0.7777,
"step": 306
},
{
"epoch": 0.7494088031123656,
"grad_norm": 0.7365685701370239,
"learning_rate": 6.964617541005617e-06,
"loss": 0.7827,
"step": 307
},
{
"epoch": 0.7518498741322756,
"grad_norm": 0.7428478002548218,
"learning_rate": 6.946835894025037e-06,
"loss": 0.7319,
"step": 308
},
{
"epoch": 0.7542909451521855,
"grad_norm": 0.7224217653274536,
"learning_rate": 6.929025176882016e-06,
"loss": 0.7758,
"step": 309
},
{
"epoch": 0.7567320161720955,
"grad_norm": 0.7415998578071594,
"learning_rate": 6.911185655526263e-06,
"loss": 0.7832,
"step": 310
},
{
"epoch": 0.7591730871920055,
"grad_norm": 0.7322662472724915,
"learning_rate": 6.893317596337592e-06,
"loss": 0.7323,
"step": 311
},
{
"epoch": 0.7616141582119155,
"grad_norm": 0.6983169913291931,
"learning_rate": 6.875421266121946e-06,
"loss": 0.7576,
"step": 312
},
{
"epoch": 0.7640552292318255,
"grad_norm": 0.7458372116088867,
"learning_rate": 6.857496932107407e-06,
"loss": 0.7549,
"step": 313
},
{
"epoch": 0.7664963002517354,
"grad_norm": 0.7724815011024475,
"learning_rate": 6.839544861940214e-06,
"loss": 0.7625,
"step": 314
},
{
"epoch": 0.7689373712716454,
"grad_norm": 0.7590445280075073,
"learning_rate": 6.821565323680759e-06,
"loss": 0.7422,
"step": 315
},
{
"epoch": 0.7713784422915554,
"grad_norm": 0.7100796699523926,
"learning_rate": 6.80355858579959e-06,
"loss": 0.748,
"step": 316
},
{
"epoch": 0.7738195133114654,
"grad_norm": 0.6766054034233093,
"learning_rate": 6.7855249171734e-06,
"loss": 0.7487,
"step": 317
},
{
"epoch": 0.7762605843313753,
"grad_norm": 0.7497526407241821,
"learning_rate": 6.76746458708101e-06,
"loss": 0.7584,
"step": 318
},
{
"epoch": 0.7787016553512853,
"grad_norm": 0.6761816740036011,
"learning_rate": 6.74937786519935e-06,
"loss": 0.7332,
"step": 319
},
{
"epoch": 0.7811427263711954,
"grad_norm": 0.6793827414512634,
"learning_rate": 6.731265021599437e-06,
"loss": 0.7387,
"step": 320
},
{
"epoch": 0.7835837973911054,
"grad_norm": 0.7181971669197083,
"learning_rate": 6.7131263267423305e-06,
"loss": 0.7588,
"step": 321
},
{
"epoch": 0.7860248684110154,
"grad_norm": 0.6722172498703003,
"learning_rate": 6.6949620514751075e-06,
"loss": 0.7264,
"step": 322
},
{
"epoch": 0.7884659394309254,
"grad_norm": 0.6741105914115906,
"learning_rate": 6.676772467026809e-06,
"loss": 0.7806,
"step": 323
},
{
"epoch": 0.7909070104508353,
"grad_norm": 0.6798011660575867,
"learning_rate": 6.65855784500439e-06,
"loss": 0.7281,
"step": 324
},
{
"epoch": 0.7933480814707453,
"grad_norm": 0.6723977327346802,
"learning_rate": 6.640318457388672e-06,
"loss": 0.7358,
"step": 325
},
{
"epoch": 0.7957891524906553,
"grad_norm": 0.6920611262321472,
"learning_rate": 6.622054576530275e-06,
"loss": 0.7754,
"step": 326
},
{
"epoch": 0.7982302235105653,
"grad_norm": 0.7018612623214722,
"learning_rate": 6.603766475145546e-06,
"loss": 0.7714,
"step": 327
},
{
"epoch": 0.8006712945304753,
"grad_norm": 0.7847645282745361,
"learning_rate": 6.585454426312506e-06,
"loss": 0.804,
"step": 328
},
{
"epoch": 0.8031123655503852,
"grad_norm": 0.6560506820678711,
"learning_rate": 6.5671187034667465e-06,
"loss": 0.7768,
"step": 329
},
{
"epoch": 0.8055534365702952,
"grad_norm": 0.720274031162262,
"learning_rate": 6.548759580397363e-06,
"loss": 0.7619,
"step": 330
},
{
"epoch": 0.8079945075902052,
"grad_norm": 0.6853426694869995,
"learning_rate": 6.53037733124287e-06,
"loss": 0.7147,
"step": 331
},
{
"epoch": 0.8104355786101152,
"grad_norm": 0.6448130011558533,
"learning_rate": 6.511972230487091e-06,
"loss": 0.7958,
"step": 332
},
{
"epoch": 0.8128766496300251,
"grad_norm": 0.6405046582221985,
"learning_rate": 6.4935445529550775e-06,
"loss": 0.7659,
"step": 333
},
{
"epoch": 0.8153177206499351,
"grad_norm": 0.7028204798698425,
"learning_rate": 6.475094573808994e-06,
"loss": 0.7609,
"step": 334
},
{
"epoch": 0.8177587916698451,
"grad_norm": 0.7086704969406128,
"learning_rate": 6.456622568544012e-06,
"loss": 0.7719,
"step": 335
},
{
"epoch": 0.8201998626897551,
"grad_norm": 0.6891460418701172,
"learning_rate": 6.438128812984199e-06,
"loss": 0.7667,
"step": 336
},
{
"epoch": 0.8226409337096651,
"grad_norm": 0.7402656674385071,
"learning_rate": 6.419613583278395e-06,
"loss": 0.7833,
"step": 337
},
{
"epoch": 0.825082004729575,
"grad_norm": 0.7249888777732849,
"learning_rate": 6.401077155896098e-06,
"loss": 0.7031,
"step": 338
},
{
"epoch": 0.8275230757494851,
"grad_norm": 0.6789460182189941,
"learning_rate": 6.3825198076233255e-06,
"loss": 0.7739,
"step": 339
},
{
"epoch": 0.8299641467693951,
"grad_norm": 0.6832931041717529,
"learning_rate": 6.363941815558484e-06,
"loss": 0.7242,
"step": 340
},
{
"epoch": 0.8324052177893051,
"grad_norm": 0.64850252866745,
"learning_rate": 6.345343457108238e-06,
"loss": 0.7378,
"step": 341
},
{
"epoch": 0.8348462888092151,
"grad_norm": 0.7206950783729553,
"learning_rate": 6.32672500998336e-06,
"loss": 0.7718,
"step": 342
},
{
"epoch": 0.837287359829125,
"grad_norm": 0.690377950668335,
"learning_rate": 6.308086752194586e-06,
"loss": 0.7784,
"step": 343
},
{
"epoch": 0.839728430849035,
"grad_norm": 0.6793109774589539,
"learning_rate": 6.289428962048467e-06,
"loss": 0.7936,
"step": 344
},
{
"epoch": 0.842169501868945,
"grad_norm": 0.6914140582084656,
"learning_rate": 6.270751918143213e-06,
"loss": 0.7652,
"step": 345
},
{
"epoch": 0.844610572888855,
"grad_norm": 0.6733123064041138,
"learning_rate": 6.252055899364525e-06,
"loss": 0.8477,
"step": 346
},
{
"epoch": 0.847051643908765,
"grad_norm": 0.6884806156158447,
"learning_rate": 6.2333411848814415e-06,
"loss": 0.7544,
"step": 347
},
{
"epoch": 0.849492714928675,
"grad_norm": 0.6831750273704529,
"learning_rate": 6.214608054142167e-06,
"loss": 0.7333,
"step": 348
},
{
"epoch": 0.8519337859485849,
"grad_norm": 0.6560673713684082,
"learning_rate": 6.195856786869893e-06,
"loss": 0.7252,
"step": 349
},
{
"epoch": 0.8543748569684949,
"grad_norm": 0.6822741627693176,
"learning_rate": 6.177087663058626e-06,
"loss": 0.7181,
"step": 350
},
{
"epoch": 0.8568159279884049,
"grad_norm": 0.8162235021591187,
"learning_rate": 6.158300962969012e-06,
"loss": 0.7359,
"step": 351
},
{
"epoch": 0.8592569990083149,
"grad_norm": 0.7049196362495422,
"learning_rate": 6.13949696712414e-06,
"loss": 0.7592,
"step": 352
},
{
"epoch": 0.8616980700282248,
"grad_norm": 0.6802664399147034,
"learning_rate": 6.120675956305363e-06,
"loss": 0.7476,
"step": 353
},
{
"epoch": 0.8641391410481348,
"grad_norm": 0.7405847311019897,
"learning_rate": 6.101838211548099e-06,
"loss": 0.7368,
"step": 354
},
{
"epoch": 0.8665802120680448,
"grad_norm": 0.6568160057067871,
"learning_rate": 6.0829840141376385e-06,
"loss": 0.7519,
"step": 355
},
{
"epoch": 0.8690212830879548,
"grad_norm": 0.6947789192199707,
"learning_rate": 6.064113645604945e-06,
"loss": 0.7217,
"step": 356
},
{
"epoch": 0.8714623541078649,
"grad_norm": 0.6590073704719543,
"learning_rate": 6.045227387722445e-06,
"loss": 0.7516,
"step": 357
},
{
"epoch": 0.8739034251277749,
"grad_norm": 0.7120059132575989,
"learning_rate": 6.026325522499829e-06,
"loss": 0.7481,
"step": 358
},
{
"epoch": 0.8763444961476848,
"grad_norm": 0.6988116502761841,
"learning_rate": 6.007408332179836e-06,
"loss": 0.7995,
"step": 359
},
{
"epoch": 0.8787855671675948,
"grad_norm": 0.7005130648612976,
"learning_rate": 5.988476099234033e-06,
"loss": 0.7427,
"step": 360
},
{
"epoch": 0.8812266381875048,
"grad_norm": 0.675496518611908,
"learning_rate": 5.969529106358612e-06,
"loss": 0.7603,
"step": 361
},
{
"epoch": 0.8836677092074148,
"grad_norm": 0.7157605290412903,
"learning_rate": 5.95056763647016e-06,
"loss": 0.7788,
"step": 362
},
{
"epoch": 0.8861087802273248,
"grad_norm": 0.6584900617599487,
"learning_rate": 5.931591972701427e-06,
"loss": 0.7415,
"step": 363
},
{
"epoch": 0.8885498512472347,
"grad_norm": 0.6833528876304626,
"learning_rate": 5.9126023983971114e-06,
"loss": 0.7339,
"step": 364
},
{
"epoch": 0.8909909222671447,
"grad_norm": 0.6946988701820374,
"learning_rate": 5.893599197109625e-06,
"loss": 0.8115,
"step": 365
},
{
"epoch": 0.8934319932870547,
"grad_norm": 0.7004518508911133,
"learning_rate": 5.874582652594855e-06,
"loss": 0.75,
"step": 366
},
{
"epoch": 0.8958730643069647,
"grad_norm": 0.6791942119598389,
"learning_rate": 5.855553048807932e-06,
"loss": 0.7288,
"step": 367
},
{
"epoch": 0.8983141353268747,
"grad_norm": 0.6619113683700562,
"learning_rate": 5.836510669898984e-06,
"loss": 0.7408,
"step": 368
},
{
"epoch": 0.9007552063467846,
"grad_norm": 0.6960625052452087,
"learning_rate": 5.817455800208901e-06,
"loss": 0.7937,
"step": 369
},
{
"epoch": 0.9031962773666946,
"grad_norm": 0.6567366123199463,
"learning_rate": 5.798388724265085e-06,
"loss": 0.7555,
"step": 370
},
{
"epoch": 0.9056373483866046,
"grad_norm": 0.6739965677261353,
"learning_rate": 5.7793097267772e-06,
"loss": 0.7193,
"step": 371
},
{
"epoch": 0.9080784194065146,
"grad_norm": 0.6742777228355408,
"learning_rate": 5.760219092632924e-06,
"loss": 0.7308,
"step": 372
},
{
"epoch": 0.9105194904264245,
"grad_norm": 0.6787696480751038,
"learning_rate": 5.741117106893693e-06,
"loss": 0.7387,
"step": 373
},
{
"epoch": 0.9129605614463345,
"grad_norm": 0.6927146911621094,
"learning_rate": 5.722004054790442e-06,
"loss": 0.7435,
"step": 374
},
{
"epoch": 0.9154016324662446,
"grad_norm": 0.6972612142562866,
"learning_rate": 5.7028802217193565e-06,
"loss": 0.7605,
"step": 375
},
{
"epoch": 0.9178427034861546,
"grad_norm": 0.7299436926841736,
"learning_rate": 5.683745893237598e-06,
"loss": 0.7745,
"step": 376
},
{
"epoch": 0.9202837745060646,
"grad_norm": 0.6617533564567566,
"learning_rate": 5.664601355059044e-06,
"loss": 0.7718,
"step": 377
},
{
"epoch": 0.9227248455259746,
"grad_norm": 0.722768247127533,
"learning_rate": 5.645446893050029e-06,
"loss": 0.783,
"step": 378
},
{
"epoch": 0.9251659165458845,
"grad_norm": 0.686353862285614,
"learning_rate": 5.626282793225066e-06,
"loss": 0.7411,
"step": 379
},
{
"epoch": 0.9276069875657945,
"grad_norm": 0.7211350798606873,
"learning_rate": 5.607109341742579e-06,
"loss": 0.7729,
"step": 380
},
{
"epoch": 0.9300480585857045,
"grad_norm": 0.6945457458496094,
"learning_rate": 5.587926824900637e-06,
"loss": 0.73,
"step": 381
},
{
"epoch": 0.9324891296056145,
"grad_norm": 0.641502320766449,
"learning_rate": 5.568735529132665e-06,
"loss": 0.7537,
"step": 382
},
{
"epoch": 0.9349302006255245,
"grad_norm": 0.7009978294372559,
"learning_rate": 5.5495357410031805e-06,
"loss": 0.7407,
"step": 383
},
{
"epoch": 0.9373712716454344,
"grad_norm": 0.6966471672058105,
"learning_rate": 5.530327747203507e-06,
"loss": 0.7287,
"step": 384
},
{
"epoch": 0.9398123426653444,
"grad_norm": 0.7361583709716797,
"learning_rate": 5.511111834547496e-06,
"loss": 0.7132,
"step": 385
},
{
"epoch": 0.9422534136852544,
"grad_norm": 0.8365876078605652,
"learning_rate": 5.491888289967241e-06,
"loss": 0.7517,
"step": 386
},
{
"epoch": 0.9446944847051644,
"grad_norm": 0.7376920580863953,
"learning_rate": 5.472657400508801e-06,
"loss": 0.7354,
"step": 387
},
{
"epoch": 0.9471355557250744,
"grad_norm": 0.6934507489204407,
"learning_rate": 5.4534194533279e-06,
"loss": 0.7418,
"step": 388
},
{
"epoch": 0.9495766267449843,
"grad_norm": 0.6778532862663269,
"learning_rate": 5.434174735685658e-06,
"loss": 0.7768,
"step": 389
},
{
"epoch": 0.9520176977648943,
"grad_norm": 0.7122485041618347,
"learning_rate": 5.414923534944283e-06,
"loss": 0.7674,
"step": 390
},
{
"epoch": 0.9544587687848043,
"grad_norm": 0.6961596608161926,
"learning_rate": 5.395666138562794e-06,
"loss": 0.7709,
"step": 391
},
{
"epoch": 0.9568998398047143,
"grad_norm": 0.7608603239059448,
"learning_rate": 5.376402834092721e-06,
"loss": 0.7787,
"step": 392
},
{
"epoch": 0.9593409108246242,
"grad_norm": 0.6927947402000427,
"learning_rate": 5.357133909173815e-06,
"loss": 0.7363,
"step": 393
},
{
"epoch": 0.9617819818445343,
"grad_norm": 0.7136049270629883,
"learning_rate": 5.337859651529747e-06,
"loss": 0.742,
"step": 394
},
{
"epoch": 0.9642230528644443,
"grad_norm": 0.7149622440338135,
"learning_rate": 5.318580348963826e-06,
"loss": 0.7501,
"step": 395
},
{
"epoch": 0.9666641238843543,
"grad_norm": 0.6613947749137878,
"learning_rate": 5.2992962893546804e-06,
"loss": 0.7045,
"step": 396
},
{
"epoch": 0.9691051949042643,
"grad_norm": 0.7548585534095764,
"learning_rate": 5.280007760651977e-06,
"loss": 0.7447,
"step": 397
},
{
"epoch": 0.9715462659241743,
"grad_norm": 0.6713358759880066,
"learning_rate": 5.260715050872119e-06,
"loss": 0.7356,
"step": 398
},
{
"epoch": 0.9739873369440842,
"grad_norm": 0.7130277156829834,
"learning_rate": 5.241418448093931e-06,
"loss": 0.7523,
"step": 399
},
{
"epoch": 0.9764284079639942,
"grad_norm": 0.7643216252326965,
"learning_rate": 5.222118240454376e-06,
"loss": 0.7581,
"step": 400
},
{
"epoch": 0.9788694789839042,
"grad_norm": 0.6513252258300781,
"learning_rate": 5.202814716144245e-06,
"loss": 0.7635,
"step": 401
},
{
"epoch": 0.9813105500038142,
"grad_norm": 0.7522091269493103,
"learning_rate": 5.1835081634038455e-06,
"loss": 0.7765,
"step": 402
},
{
"epoch": 0.9837516210237242,
"grad_norm": 0.7359428405761719,
"learning_rate": 5.164198870518714e-06,
"loss": 0.7626,
"step": 403
},
{
"epoch": 0.9861926920436341,
"grad_norm": 0.7049588561058044,
"learning_rate": 5.144887125815301e-06,
"loss": 0.6856,
"step": 404
},
{
"epoch": 0.9886337630635441,
"grad_norm": 0.70652836561203,
"learning_rate": 5.125573217656664e-06,
"loss": 0.7479,
"step": 405
},
{
"epoch": 0.9910748340834541,
"grad_norm": 0.6710291504859924,
"learning_rate": 5.1062574344381686e-06,
"loss": 0.7419,
"step": 406
},
{
"epoch": 0.9935159051033641,
"grad_norm": 0.6925918459892273,
"learning_rate": 5.086940064583179e-06,
"loss": 0.7222,
"step": 407
},
{
"epoch": 0.995956976123274,
"grad_norm": 0.738444447517395,
"learning_rate": 5.067621396538747e-06,
"loss": 0.738,
"step": 408
},
{
"epoch": 0.998398047143184,
"grad_norm": 0.6995366811752319,
"learning_rate": 5.048301718771317e-06,
"loss": 0.7986,
"step": 409
},
{
"epoch": 1.000839118163094,
"grad_norm": 0.7283437848091125,
"learning_rate": 5.028981319762399e-06,
"loss": 0.7439,
"step": 410
},
{
"epoch": 1.003280189183004,
"grad_norm": 0.6933985948562622,
"learning_rate": 5.009660488004283e-06,
"loss": 0.7776,
"step": 411
},
{
"epoch": 1.005721260202914,
"grad_norm": 1.7147387266159058,
"learning_rate": 4.990339511995718e-06,
"loss": 0.7962,
"step": 412
},
{
"epoch": 1.008162331222824,
"grad_norm": 0.7424637675285339,
"learning_rate": 4.971018680237602e-06,
"loss": 0.7573,
"step": 413
},
{
"epoch": 1.010603402242734,
"grad_norm": 0.714701235294342,
"learning_rate": 4.951698281228686e-06,
"loss": 0.7582,
"step": 414
},
{
"epoch": 1.0003051338774887,
"grad_norm": 0.7080721855163574,
"learning_rate": 4.932378603461253e-06,
"loss": 0.7603,
"step": 415
},
{
"epoch": 1.0027462048973987,
"grad_norm": 1.1822799444198608,
"learning_rate": 4.913059935416822e-06,
"loss": 0.6213,
"step": 416
},
{
"epoch": 1.0051872759173086,
"grad_norm": 1.084796667098999,
"learning_rate": 4.893742565561832e-06,
"loss": 0.6391,
"step": 417
},
{
"epoch": 1.0076283469372187,
"grad_norm": 0.7524453401565552,
"learning_rate": 4.8744267823433374e-06,
"loss": 0.6246,
"step": 418
},
{
"epoch": 1.0100694179571288,
"grad_norm": 0.8332728147506714,
"learning_rate": 4.855112874184701e-06,
"loss": 0.615,
"step": 419
},
{
"epoch": 1.0125104889770387,
"grad_norm": 1.0015031099319458,
"learning_rate": 4.835801129481287e-06,
"loss": 0.6168,
"step": 420
},
{
"epoch": 1.0149515599969487,
"grad_norm": 0.9933431148529053,
"learning_rate": 4.816491836596157e-06,
"loss": 0.6083,
"step": 421
},
{
"epoch": 1.0173926310168586,
"grad_norm": 0.8857572674751282,
"learning_rate": 4.797185283855756e-06,
"loss": 0.6478,
"step": 422
},
{
"epoch": 1.0198337020367687,
"grad_norm": 1.0425084829330444,
"learning_rate": 4.777881759545625e-06,
"loss": 0.6198,
"step": 423
},
{
"epoch": 1.0222747730566786,
"grad_norm": 0.8083875775337219,
"learning_rate": 4.75858155190607e-06,
"loss": 0.6153,
"step": 424
},
{
"epoch": 1.0247158440765887,
"grad_norm": 0.8430187106132507,
"learning_rate": 4.7392849491278825e-06,
"loss": 0.5814,
"step": 425
},
{
"epoch": 1.0271569150964985,
"grad_norm": 0.8698053956031799,
"learning_rate": 4.719992239348024e-06,
"loss": 0.619,
"step": 426
},
{
"epoch": 1.0295979861164086,
"grad_norm": 0.8039147257804871,
"learning_rate": 4.700703710645322e-06,
"loss": 0.6716,
"step": 427
},
{
"epoch": 1.0320390571363185,
"grad_norm": 0.7722117900848389,
"learning_rate": 4.681419651036177e-06,
"loss": 0.6423,
"step": 428
},
{
"epoch": 1.0344801281562286,
"grad_norm": 0.7633837461471558,
"learning_rate": 4.662140348470253e-06,
"loss": 0.6122,
"step": 429
},
{
"epoch": 1.0369211991761385,
"grad_norm": 0.845450758934021,
"learning_rate": 4.642866090826187e-06,
"loss": 0.617,
"step": 430
},
{
"epoch": 1.0393622701960485,
"grad_norm": 0.7823710441589355,
"learning_rate": 4.6235971659072806e-06,
"loss": 0.6102,
"step": 431
},
{
"epoch": 1.0418033412159584,
"grad_norm": 0.6919700503349304,
"learning_rate": 4.604333861437207e-06,
"loss": 0.5872,
"step": 432
},
{
"epoch": 1.0442444122358685,
"grad_norm": 0.8099319338798523,
"learning_rate": 4.585076465055719e-06,
"loss": 0.6435,
"step": 433
},
{
"epoch": 1.0466854832557784,
"grad_norm": 0.8152591586112976,
"learning_rate": 4.565825264314344e-06,
"loss": 0.6321,
"step": 434
},
{
"epoch": 1.0491265542756885,
"grad_norm": 0.7012869119644165,
"learning_rate": 4.5465805466721e-06,
"loss": 0.6248,
"step": 435
},
{
"epoch": 1.0515676252955983,
"grad_norm": 0.8160309195518494,
"learning_rate": 4.5273425994912e-06,
"loss": 0.6366,
"step": 436
},
{
"epoch": 1.0540086963155084,
"grad_norm": 0.7725083827972412,
"learning_rate": 4.5081117100327594e-06,
"loss": 0.6205,
"step": 437
},
{
"epoch": 1.0564497673354185,
"grad_norm": 0.7493377923965454,
"learning_rate": 4.488888165452506e-06,
"loss": 0.643,
"step": 438
},
{
"epoch": 1.0588908383553284,
"grad_norm": 0.7511170506477356,
"learning_rate": 4.469672252796495e-06,
"loss": 0.6245,
"step": 439
},
{
"epoch": 1.0613319093752385,
"grad_norm": 0.7679057717323303,
"learning_rate": 4.450464258996822e-06,
"loss": 0.6131,
"step": 440
},
{
"epoch": 1.0637729803951483,
"grad_norm": 0.7596908807754517,
"learning_rate": 4.4312644708673375e-06,
"loss": 0.6655,
"step": 441
},
{
"epoch": 1.0662140514150584,
"grad_norm": 0.709562361240387,
"learning_rate": 4.412073175099365e-06,
"loss": 0.637,
"step": 442
},
{
"epoch": 1.0686551224349683,
"grad_norm": 0.7175518274307251,
"learning_rate": 4.392890658257421e-06,
"loss": 0.6462,
"step": 443
},
{
"epoch": 1.0710961934548784,
"grad_norm": 0.7275789976119995,
"learning_rate": 4.373717206774935e-06,
"loss": 0.6773,
"step": 444
},
{
"epoch": 1.0735372644747883,
"grad_norm": 0.7145282626152039,
"learning_rate": 4.354553106949972e-06,
"loss": 0.5822,
"step": 445
},
{
"epoch": 1.0759783354946983,
"grad_norm": 0.7076422572135925,
"learning_rate": 4.335398644940958e-06,
"loss": 0.624,
"step": 446
},
{
"epoch": 1.0784194065146082,
"grad_norm": 0.7219144105911255,
"learning_rate": 4.316254106762404e-06,
"loss": 0.6099,
"step": 447
},
{
"epoch": 1.0808604775345183,
"grad_norm": 0.7198976874351501,
"learning_rate": 4.297119778280645e-06,
"loss": 0.6083,
"step": 448
},
{
"epoch": 1.0833015485544282,
"grad_norm": 0.7468734979629517,
"learning_rate": 4.277995945209558e-06,
"loss": 0.6438,
"step": 449
},
{
"epoch": 1.0857426195743383,
"grad_norm": 0.7466146945953369,
"learning_rate": 4.258882893106308e-06,
"loss": 0.609,
"step": 450
},
{
"epoch": 1.0881836905942481,
"grad_norm": 0.7118586897850037,
"learning_rate": 4.239780907367078e-06,
"loss": 0.6045,
"step": 451
},
{
"epoch": 1.0906247616141582,
"grad_norm": 0.7523221969604492,
"learning_rate": 4.220690273222802e-06,
"loss": 0.5949,
"step": 452
},
{
"epoch": 1.093065832634068,
"grad_norm": 0.7216614484786987,
"learning_rate": 4.201611275734916e-06,
"loss": 0.6117,
"step": 453
},
{
"epoch": 1.0955069036539782,
"grad_norm": 0.7935013771057129,
"learning_rate": 4.182544199791102e-06,
"loss": 0.6055,
"step": 454
},
{
"epoch": 1.0979479746738883,
"grad_norm": 0.7399945259094238,
"learning_rate": 4.163489330101017e-06,
"loss": 0.6147,
"step": 455
},
{
"epoch": 1.1003890456937981,
"grad_norm": 0.7488657236099243,
"learning_rate": 4.14444695119207e-06,
"loss": 0.6189,
"step": 456
},
{
"epoch": 1.1028301167137082,
"grad_norm": 0.7612137794494629,
"learning_rate": 4.125417347405147e-06,
"loss": 0.6291,
"step": 457
},
{
"epoch": 1.105271187733618,
"grad_norm": 0.7072088718414307,
"learning_rate": 4.106400802890377e-06,
"loss": 0.6332,
"step": 458
},
{
"epoch": 1.1077122587535282,
"grad_norm": 0.6833801865577698,
"learning_rate": 4.08739760160289e-06,
"loss": 0.5767,
"step": 459
},
{
"epoch": 1.110153329773438,
"grad_norm": 0.7195488214492798,
"learning_rate": 4.068408027298576e-06,
"loss": 0.6515,
"step": 460
},
{
"epoch": 1.1125944007933481,
"grad_norm": 0.7658545970916748,
"learning_rate": 4.049432363529842e-06,
"loss": 0.6528,
"step": 461
},
{
"epoch": 1.115035471813258,
"grad_norm": 0.713361382484436,
"learning_rate": 4.030470893641387e-06,
"loss": 0.6457,
"step": 462
},
{
"epoch": 1.117476542833168,
"grad_norm": 0.7208877205848694,
"learning_rate": 4.011523900765968e-06,
"loss": 0.657,
"step": 463
},
{
"epoch": 1.119917613853078,
"grad_norm": 0.7104807496070862,
"learning_rate": 3.992591667820166e-06,
"loss": 0.615,
"step": 464
},
{
"epoch": 1.122358684872988,
"grad_norm": 0.7092770338058472,
"learning_rate": 3.973674477500172e-06,
"loss": 0.6052,
"step": 465
},
{
"epoch": 1.124799755892898,
"grad_norm": 0.695006251335144,
"learning_rate": 3.954772612277557e-06,
"loss": 0.6211,
"step": 466
},
{
"epoch": 1.127240826912808,
"grad_norm": 0.6810059547424316,
"learning_rate": 3.935886354395057e-06,
"loss": 0.5862,
"step": 467
},
{
"epoch": 1.129681897932718,
"grad_norm": 0.6945995688438416,
"learning_rate": 3.917015985862364e-06,
"loss": 0.6112,
"step": 468
},
{
"epoch": 1.132122968952628,
"grad_norm": 0.7196330428123474,
"learning_rate": 3.8981617884519015e-06,
"loss": 0.6147,
"step": 469
},
{
"epoch": 1.1345640399725379,
"grad_norm": 0.6800129413604736,
"learning_rate": 3.8793240436946385e-06,
"loss": 0.6016,
"step": 470
},
{
"epoch": 1.137005110992448,
"grad_norm": 0.6729468107223511,
"learning_rate": 3.860503032875861e-06,
"loss": 0.6244,
"step": 471
},
{
"epoch": 1.139446182012358,
"grad_norm": 0.7448645234107971,
"learning_rate": 3.841699037030989e-06,
"loss": 0.5908,
"step": 472
},
{
"epoch": 1.141887253032268,
"grad_norm": 0.7207785248756409,
"learning_rate": 3.822912336941375e-06,
"loss": 0.6245,
"step": 473
},
{
"epoch": 1.1443283240521778,
"grad_norm": 0.6913513541221619,
"learning_rate": 3.80414321313011e-06,
"loss": 0.612,
"step": 474
},
{
"epoch": 1.1467693950720879,
"grad_norm": 0.7224765419960022,
"learning_rate": 3.7853919458578327e-06,
"loss": 0.627,
"step": 475
},
{
"epoch": 1.149210466091998,
"grad_norm": 0.7210829257965088,
"learning_rate": 3.7666588151185584e-06,
"loss": 0.6174,
"step": 476
},
{
"epoch": 1.1516515371119078,
"grad_norm": 0.7608765363693237,
"learning_rate": 3.7479441006354755e-06,
"loss": 0.6023,
"step": 477
},
{
"epoch": 1.154092608131818,
"grad_norm": 0.7259137034416199,
"learning_rate": 3.729248081856788e-06,
"loss": 0.611,
"step": 478
},
{
"epoch": 1.1565336791517278,
"grad_norm": 0.700013279914856,
"learning_rate": 3.7105710379515335e-06,
"loss": 0.6162,
"step": 479
},
{
"epoch": 1.1589747501716379,
"grad_norm": 0.7397728562355042,
"learning_rate": 3.6919132478054153e-06,
"loss": 0.567,
"step": 480
},
{
"epoch": 1.1614158211915477,
"grad_norm": 0.6973784565925598,
"learning_rate": 3.673274990016642e-06,
"loss": 0.6249,
"step": 481
},
{
"epoch": 1.1638568922114578,
"grad_norm": 0.7272012829780579,
"learning_rate": 3.6546565428917623e-06,
"loss": 0.6025,
"step": 482
},
{
"epoch": 1.1662979632313677,
"grad_norm": 0.7183772921562195,
"learning_rate": 3.6360581844415165e-06,
"loss": 0.6128,
"step": 483
},
{
"epoch": 1.1687390342512778,
"grad_norm": 0.7049492001533508,
"learning_rate": 3.6174801923766762e-06,
"loss": 0.6525,
"step": 484
},
{
"epoch": 1.1711801052711877,
"grad_norm": 0.7470197081565857,
"learning_rate": 3.5989228441039024e-06,
"loss": 0.6067,
"step": 485
},
{
"epoch": 1.1736211762910977,
"grad_norm": 0.66621994972229,
"learning_rate": 3.5803864167216055e-06,
"loss": 0.6597,
"step": 486
},
{
"epoch": 1.1760622473110076,
"grad_norm": 0.7643696665763855,
"learning_rate": 3.561871187015803e-06,
"loss": 0.6192,
"step": 487
},
{
"epoch": 1.1785033183309177,
"grad_norm": 0.7268946766853333,
"learning_rate": 3.543377431455991e-06,
"loss": 0.6335,
"step": 488
},
{
"epoch": 1.1809443893508278,
"grad_norm": 0.743349552154541,
"learning_rate": 3.5249054261910067e-06,
"loss": 0.6024,
"step": 489
},
{
"epoch": 1.1833854603707377,
"grad_norm": 0.6776405572891235,
"learning_rate": 3.506455447044923e-06,
"loss": 0.6601,
"step": 490
},
{
"epoch": 1.1858265313906475,
"grad_norm": 0.7699673771858215,
"learning_rate": 3.4880277695129095e-06,
"loss": 0.6093,
"step": 491
},
{
"epoch": 1.1882676024105576,
"grad_norm": 0.6653928160667419,
"learning_rate": 3.4696226687571317e-06,
"loss": 0.655,
"step": 492
},
{
"epoch": 1.1907086734304677,
"grad_norm": 0.6945586204528809,
"learning_rate": 3.4512404196026384e-06,
"loss": 0.6305,
"step": 493
},
{
"epoch": 1.1931497444503776,
"grad_norm": 0.7231103777885437,
"learning_rate": 3.432881296533257e-06,
"loss": 0.6186,
"step": 494
},
{
"epoch": 1.1955908154702877,
"grad_norm": 0.718945324420929,
"learning_rate": 3.4145455736874957e-06,
"loss": 0.6155,
"step": 495
},
{
"epoch": 1.1980318864901975,
"grad_norm": 0.738957941532135,
"learning_rate": 3.396233524854453e-06,
"loss": 0.6022,
"step": 496
},
{
"epoch": 1.2004729575101076,
"grad_norm": 0.6797804236412048,
"learning_rate": 3.377945423469727e-06,
"loss": 0.6448,
"step": 497
},
{
"epoch": 1.2029140285300175,
"grad_norm": 0.7095910906791687,
"learning_rate": 3.359681542611328e-06,
"loss": 0.6229,
"step": 498
},
{
"epoch": 1.2053550995499276,
"grad_norm": 0.6906200647354126,
"learning_rate": 3.3414421549956115e-06,
"loss": 0.6149,
"step": 499
},
{
"epoch": 1.2077961705698375,
"grad_norm": 0.7154446840286255,
"learning_rate": 3.323227532973193e-06,
"loss": 0.6198,
"step": 500
},
{
"epoch": 1.2102372415897475,
"grad_norm": 0.7295240759849548,
"learning_rate": 3.305037948524894e-06,
"loss": 0.6541,
"step": 501
},
{
"epoch": 1.2126783126096574,
"grad_norm": 0.7125757932662964,
"learning_rate": 3.2868736732576695e-06,
"loss": 0.6396,
"step": 502
},
{
"epoch": 1.2151193836295675,
"grad_norm": 0.6835936307907104,
"learning_rate": 3.268734978400564e-06,
"loss": 0.6576,
"step": 503
},
{
"epoch": 1.2175604546494774,
"grad_norm": 0.7498407363891602,
"learning_rate": 3.250622134800651e-06,
"loss": 0.6013,
"step": 504
},
{
"epoch": 1.2200015256693875,
"grad_norm": 0.7102557420730591,
"learning_rate": 3.2325354129189923e-06,
"loss": 0.6175,
"step": 505
},
{
"epoch": 1.2224425966892973,
"grad_norm": 0.7087395787239075,
"learning_rate": 3.214475082826602e-06,
"loss": 0.6391,
"step": 506
},
{
"epoch": 1.2248836677092074,
"grad_norm": 0.6935733556747437,
"learning_rate": 3.1964414142004123e-06,
"loss": 0.6234,
"step": 507
},
{
"epoch": 1.2273247387291173,
"grad_norm": 0.7102475166320801,
"learning_rate": 3.1784346763192437e-06,
"loss": 0.6339,
"step": 508
},
{
"epoch": 1.2297658097490274,
"grad_norm": 0.7003853917121887,
"learning_rate": 3.160455138059788e-06,
"loss": 0.6077,
"step": 509
},
{
"epoch": 1.2322068807689375,
"grad_norm": 0.6690055727958679,
"learning_rate": 3.142503067892594e-06,
"loss": 0.6097,
"step": 510
},
{
"epoch": 1.2346479517888473,
"grad_norm": 0.7464238405227661,
"learning_rate": 3.1245787338780555e-06,
"loss": 0.617,
"step": 511
},
{
"epoch": 1.2370890228087574,
"grad_norm": 0.7298991084098816,
"learning_rate": 3.1066824036624086e-06,
"loss": 0.6492,
"step": 512
},
{
"epoch": 1.2395300938286673,
"grad_norm": 0.686462938785553,
"learning_rate": 3.0888143444737395e-06,
"loss": 0.5875,
"step": 513
},
{
"epoch": 1.2419711648485774,
"grad_norm": 0.7086668610572815,
"learning_rate": 3.070974823117986e-06,
"loss": 0.6059,
"step": 514
},
{
"epoch": 1.2444122358684873,
"grad_norm": 0.7413360476493835,
"learning_rate": 3.053164105974964e-06,
"loss": 0.5894,
"step": 515
},
{
"epoch": 1.2468533068883974,
"grad_norm": 0.6833621859550476,
"learning_rate": 3.0353824589943835e-06,
"loss": 0.6254,
"step": 516
},
{
"epoch": 1.2492943779083072,
"grad_norm": 0.6703265309333801,
"learning_rate": 3.017630147691878e-06,
"loss": 0.6072,
"step": 517
},
{
"epoch": 1.2517354489282173,
"grad_norm": 0.7052757143974304,
"learning_rate": 2.999907437145042e-06,
"loss": 0.614,
"step": 518
},
{
"epoch": 1.2541765199481272,
"grad_norm": 0.7140656113624573,
"learning_rate": 2.9822145919894676e-06,
"loss": 0.598,
"step": 519
},
{
"epoch": 1.2566175909680373,
"grad_norm": 0.722187340259552,
"learning_rate": 2.964551876414801e-06,
"loss": 0.6277,
"step": 520
},
{
"epoch": 1.2590586619879471,
"grad_norm": 0.7111802101135254,
"learning_rate": 2.946919554160792e-06,
"loss": 0.6221,
"step": 521
},
{
"epoch": 1.2614997330078572,
"grad_norm": 0.695512056350708,
"learning_rate": 2.929317888513353e-06,
"loss": 0.6257,
"step": 522
},
{
"epoch": 1.2639408040277673,
"grad_norm": 0.7214482426643372,
"learning_rate": 2.9117471423006418e-06,
"loss": 0.6436,
"step": 523
},
{
"epoch": 1.2663818750476772,
"grad_norm": 0.7167050242424011,
"learning_rate": 2.8942075778891153e-06,
"loss": 0.5728,
"step": 524
},
{
"epoch": 1.268822946067587,
"grad_norm": 0.737602710723877,
"learning_rate": 2.8766994571796336e-06,
"loss": 0.6247,
"step": 525
},
{
"epoch": 1.2712640170874971,
"grad_norm": 0.7210221290588379,
"learning_rate": 2.859223041603534e-06,
"loss": 0.6216,
"step": 526
},
{
"epoch": 1.2737050881074072,
"grad_norm": 0.6660017371177673,
"learning_rate": 2.84177859211873e-06,
"loss": 0.6212,
"step": 527
},
{
"epoch": 1.276146159127317,
"grad_norm": 0.7368438839912415,
"learning_rate": 2.8243663692058255e-06,
"loss": 0.6509,
"step": 528
},
{
"epoch": 1.278587230147227,
"grad_norm": 0.6850879788398743,
"learning_rate": 2.806986632864208e-06,
"loss": 0.6128,
"step": 529
},
{
"epoch": 1.281028301167137,
"grad_norm": 0.6914191842079163,
"learning_rate": 2.7896396426081844e-06,
"loss": 0.6038,
"step": 530
},
{
"epoch": 1.2834693721870472,
"grad_norm": 0.7017717361450195,
"learning_rate": 2.772325657463088e-06,
"loss": 0.6523,
"step": 531
},
{
"epoch": 1.285910443206957,
"grad_norm": 0.7203471660614014,
"learning_rate": 2.7550449359614272e-06,
"loss": 0.6852,
"step": 532
},
{
"epoch": 1.2883515142268671,
"grad_norm": 0.7346776127815247,
"learning_rate": 2.7377977361390118e-06,
"loss": 0.5981,
"step": 533
},
{
"epoch": 1.290792585246777,
"grad_norm": 0.6989855170249939,
"learning_rate": 2.7205843155311098e-06,
"loss": 0.6554,
"step": 534
},
{
"epoch": 1.293233656266687,
"grad_norm": 0.7189421653747559,
"learning_rate": 2.703404931168594e-06,
"loss": 0.6285,
"step": 535
},
{
"epoch": 1.295674727286597,
"grad_norm": 0.7148353457450867,
"learning_rate": 2.6862598395741136e-06,
"loss": 0.6629,
"step": 536
},
{
"epoch": 1.298115798306507,
"grad_norm": 0.7388694882392883,
"learning_rate": 2.66914929675825e-06,
"loss": 0.6393,
"step": 537
},
{
"epoch": 1.300556869326417,
"grad_norm": 0.7198134660720825,
"learning_rate": 2.652073558215711e-06,
"loss": 0.6458,
"step": 538
},
{
"epoch": 1.302997940346327,
"grad_norm": 0.7222850322723389,
"learning_rate": 2.6350328789215e-06,
"loss": 0.5939,
"step": 539
},
{
"epoch": 1.3054390113662369,
"grad_norm": 0.7000096440315247,
"learning_rate": 2.618027513327116e-06,
"loss": 0.6409,
"step": 540
},
{
"epoch": 1.307880082386147,
"grad_norm": 0.6793365478515625,
"learning_rate": 2.6010577153567597e-06,
"loss": 0.6155,
"step": 541
},
{
"epoch": 1.3103211534060568,
"grad_norm": 0.7198613882064819,
"learning_rate": 2.584123738403527e-06,
"loss": 0.6099,
"step": 542
},
{
"epoch": 1.312762224425967,
"grad_norm": 0.6797595620155334,
"learning_rate": 2.567225835325642e-06,
"loss": 0.6216,
"step": 543
},
{
"epoch": 1.315203295445877,
"grad_norm": 0.6998633742332458,
"learning_rate": 2.550364258442671e-06,
"loss": 0.6268,
"step": 544
},
{
"epoch": 1.3176443664657869,
"grad_norm": 0.706811785697937,
"learning_rate": 2.533539259531757e-06,
"loss": 0.5907,
"step": 545
},
{
"epoch": 1.3200854374856967,
"grad_norm": 0.7156489491462708,
"learning_rate": 2.5167510898238566e-06,
"loss": 0.62,
"step": 546
},
{
"epoch": 1.3225265085056068,
"grad_norm": 0.7096937894821167,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.6087,
"step": 547
},
{
"epoch": 1.324967579525517,
"grad_norm": 0.6790308952331543,
"learning_rate": 2.483286240187538e-06,
"loss": 0.5822,
"step": 548
},
{
"epoch": 1.3274086505454268,
"grad_norm": 0.7136918306350708,
"learning_rate": 2.466610059956401e-06,
"loss": 0.6387,
"step": 549
},
{
"epoch": 1.3298497215653369,
"grad_norm": 0.7166918516159058,
"learning_rate": 2.4499717083153975e-06,
"loss": 0.6196,
"step": 550
},
{
"epoch": 1.3322907925852467,
"grad_norm": 0.6799836158752441,
"learning_rate": 2.433371433708465e-06,
"loss": 0.602,
"step": 551
},
{
"epoch": 1.3347318636051568,
"grad_norm": 0.6526500582695007,
"learning_rate": 2.4168094840109784e-06,
"loss": 0.598,
"step": 552
},
{
"epoch": 1.3371729346250667,
"grad_norm": 0.6871894001960754,
"learning_rate": 2.4002861065260506e-06,
"loss": 0.5954,
"step": 553
},
{
"epoch": 1.3396140056449768,
"grad_norm": 0.6922629475593567,
"learning_rate": 2.383801547980826e-06,
"loss": 0.6007,
"step": 554
},
{
"epoch": 1.3420550766648867,
"grad_norm": 0.7360095977783203,
"learning_rate": 2.3673560545228082e-06,
"loss": 0.6713,
"step": 555
},
{
"epoch": 1.3444961476847967,
"grad_norm": 0.6689037680625916,
"learning_rate": 2.3509498717161803e-06,
"loss": 0.6255,
"step": 556
},
{
"epoch": 1.3469372187047066,
"grad_norm": 0.7025970816612244,
"learning_rate": 2.3345832445381415e-06,
"loss": 0.6239,
"step": 557
},
{
"epoch": 1.3493782897246167,
"grad_norm": 0.7011806964874268,
"learning_rate": 2.31825641737524e-06,
"loss": 0.6243,
"step": 558
},
{
"epoch": 1.3518193607445266,
"grad_norm": 0.7212774753570557,
"learning_rate": 2.3019696340197358e-06,
"loss": 0.612,
"step": 559
},
{
"epoch": 1.3542604317644367,
"grad_norm": 0.6677849292755127,
"learning_rate": 2.2857231376659517e-06,
"loss": 0.6362,
"step": 560
},
{
"epoch": 1.3567015027843468,
"grad_norm": 0.7369222640991211,
"learning_rate": 2.2695171709066427e-06,
"loss": 0.5964,
"step": 561
},
{
"epoch": 1.3591425738042566,
"grad_norm": 0.7177897691726685,
"learning_rate": 2.2533519757293803e-06,
"loss": 0.6261,
"step": 562
},
{
"epoch": 1.3615836448241665,
"grad_norm": 0.7659596800804138,
"learning_rate": 2.237227793512935e-06,
"loss": 0.623,
"step": 563
},
{
"epoch": 1.3640247158440766,
"grad_norm": 0.7141837477684021,
"learning_rate": 2.221144865023666e-06,
"loss": 0.6207,
"step": 564
},
{
"epoch": 1.3664657868639867,
"grad_norm": 0.7026992440223694,
"learning_rate": 2.2051034304119344e-06,
"loss": 0.6172,
"step": 565
},
{
"epoch": 1.3689068578838965,
"grad_norm": 0.7084405422210693,
"learning_rate": 2.1891037292085177e-06,
"loss": 0.5976,
"step": 566
},
{
"epoch": 1.3713479289038064,
"grad_norm": 0.7147724032402039,
"learning_rate": 2.1731460003210255e-06,
"loss": 0.6032,
"step": 567
},
{
"epoch": 1.3737889999237165,
"grad_norm": 0.6965445876121521,
"learning_rate": 2.157230482030336e-06,
"loss": 0.6451,
"step": 568
},
{
"epoch": 1.3762300709436266,
"grad_norm": 0.7260875105857849,
"learning_rate": 2.141357411987044e-06,
"loss": 0.5887,
"step": 569
},
{
"epoch": 1.3786711419635365,
"grad_norm": 0.7008638978004456,
"learning_rate": 2.1255270272079044e-06,
"loss": 0.6476,
"step": 570
},
{
"epoch": 1.3811122129834466,
"grad_norm": 0.6777788996696472,
"learning_rate": 2.1097395640722916e-06,
"loss": 0.6201,
"step": 571
},
{
"epoch": 1.3835532840033564,
"grad_norm": 0.7306210994720459,
"learning_rate": 2.0939952583186806e-06,
"loss": 0.6373,
"step": 572
},
{
"epoch": 1.3859943550232665,
"grad_norm": 0.7415671944618225,
"learning_rate": 2.0782943450411148e-06,
"loss": 0.6038,
"step": 573
},
{
"epoch": 1.3884354260431764,
"grad_norm": 0.7014935612678528,
"learning_rate": 2.062637058685701e-06,
"loss": 0.6012,
"step": 574
},
{
"epoch": 1.3908764970630865,
"grad_norm": 0.664974570274353,
"learning_rate": 2.0470236330471125e-06,
"loss": 0.5804,
"step": 575
},
{
"epoch": 1.3933175680829963,
"grad_norm": 0.7382723689079285,
"learning_rate": 2.0314543012650934e-06,
"loss": 0.6291,
"step": 576
},
{
"epoch": 1.3957586391029064,
"grad_norm": 0.7389819622039795,
"learning_rate": 2.015929295820974e-06,
"loss": 0.6151,
"step": 577
},
{
"epoch": 1.3981997101228165,
"grad_norm": 0.7102298736572266,
"learning_rate": 2.000448848534209e-06,
"loss": 0.6381,
"step": 578
},
{
"epoch": 1.4006407811427264,
"grad_norm": 0.7227942943572998,
"learning_rate": 1.9850131905589065e-06,
"loss": 0.6524,
"step": 579
},
{
"epoch": 1.4030818521626363,
"grad_norm": 0.6704846620559692,
"learning_rate": 1.9696225523803803e-06,
"loss": 0.5928,
"step": 580
},
{
"epoch": 1.4055229231825463,
"grad_norm": 0.6964511871337891,
"learning_rate": 1.9542771638117124e-06,
"loss": 0.6072,
"step": 581
},
{
"epoch": 1.4079639942024564,
"grad_norm": 0.6633496284484863,
"learning_rate": 1.9389772539903123e-06,
"loss": 0.5949,
"step": 582
},
{
"epoch": 1.4104050652223663,
"grad_norm": 0.6861550211906433,
"learning_rate": 1.923723051374505e-06,
"loss": 0.607,
"step": 583
},
{
"epoch": 1.4128461362422762,
"grad_norm": 0.705496609210968,
"learning_rate": 1.908514783740114e-06,
"loss": 0.6053,
"step": 584
},
{
"epoch": 1.4152872072621863,
"grad_norm": 0.6763675808906555,
"learning_rate": 1.89335267817706e-06,
"loss": 0.6074,
"step": 585
},
{
"epoch": 1.4177282782820964,
"grad_norm": 0.7484830617904663,
"learning_rate": 1.8782369610859707e-06,
"loss": 0.5975,
"step": 586
},
{
"epoch": 1.4201693493020062,
"grad_norm": 0.6787152290344238,
"learning_rate": 1.8631678581748059e-06,
"loss": 0.6134,
"step": 587
},
{
"epoch": 1.4226104203219163,
"grad_norm": 0.6600430607795715,
"learning_rate": 1.848145594455477e-06,
"loss": 0.6017,
"step": 588
},
{
"epoch": 1.4250514913418262,
"grad_norm": 0.6843500137329102,
"learning_rate": 1.8331703942404932e-06,
"loss": 0.6027,
"step": 589
},
{
"epoch": 1.4274925623617363,
"grad_norm": 0.725928544998169,
"learning_rate": 1.8182424811396131e-06,
"loss": 0.5891,
"step": 590
},
{
"epoch": 1.4299336333816461,
"grad_norm": 0.7073947787284851,
"learning_rate": 1.8033620780565058e-06,
"loss": 0.6,
"step": 591
},
{
"epoch": 1.4323747044015562,
"grad_norm": 0.6989341378211975,
"learning_rate": 1.7885294071854159e-06,
"loss": 0.6025,
"step": 592
},
{
"epoch": 1.434815775421466,
"grad_norm": 0.7197926640510559,
"learning_rate": 1.7737446900078503e-06,
"loss": 0.5944,
"step": 593
},
{
"epoch": 1.4372568464413762,
"grad_norm": 0.6586629152297974,
"learning_rate": 1.7590081472892779e-06,
"loss": 0.6074,
"step": 594
},
{
"epoch": 1.439697917461286,
"grad_norm": 0.6680869460105896,
"learning_rate": 1.7443199990758168e-06,
"loss": 0.5892,
"step": 595
},
{
"epoch": 1.4421389884811961,
"grad_norm": 0.6742851734161377,
"learning_rate": 1.7296804646909654e-06,
"loss": 0.6125,
"step": 596
},
{
"epoch": 1.444580059501106,
"grad_norm": 0.6669274568557739,
"learning_rate": 1.71508976273232e-06,
"loss": 0.6625,
"step": 597
},
{
"epoch": 1.447021130521016,
"grad_norm": 0.6753084659576416,
"learning_rate": 1.7005481110683064e-06,
"loss": 0.6016,
"step": 598
},
{
"epoch": 1.4494622015409262,
"grad_norm": 0.6702268719673157,
"learning_rate": 1.686055726834932e-06,
"loss": 0.5682,
"step": 599
},
{
"epoch": 1.451903272560836,
"grad_norm": 0.6861889958381653,
"learning_rate": 1.6716128264325477e-06,
"loss": 0.6066,
"step": 600
},
{
"epoch": 1.454344343580746,
"grad_norm": 0.7023141980171204,
"learning_rate": 1.6572196255226063e-06,
"loss": 0.6164,
"step": 601
},
{
"epoch": 1.456785414600656,
"grad_norm": 0.678805410861969,
"learning_rate": 1.6428763390244462e-06,
"loss": 0.6037,
"step": 602
},
{
"epoch": 1.4592264856205661,
"grad_norm": 0.6716406941413879,
"learning_rate": 1.6285831811120938e-06,
"loss": 0.5865,
"step": 603
},
{
"epoch": 1.461667556640476,
"grad_norm": 0.6960821151733398,
"learning_rate": 1.614340365211044e-06,
"loss": 0.626,
"step": 604
},
{
"epoch": 1.464108627660386,
"grad_norm": 0.6855660080909729,
"learning_rate": 1.6001481039950872e-06,
"loss": 0.6583,
"step": 605
},
{
"epoch": 1.466549698680296,
"grad_norm": 0.686470091342926,
"learning_rate": 1.5860066093831366e-06,
"loss": 0.5914,
"step": 606
},
{
"epoch": 1.468990769700206,
"grad_norm": 0.6726788878440857,
"learning_rate": 1.5719160925360517e-06,
"loss": 0.6499,
"step": 607
},
{
"epoch": 1.471431840720116,
"grad_norm": 0.6723580956459045,
"learning_rate": 1.557876763853493e-06,
"loss": 0.6381,
"step": 608
},
{
"epoch": 1.473872911740026,
"grad_norm": 0.7011963129043579,
"learning_rate": 1.5438888329707824e-06,
"loss": 0.5925,
"step": 609
},
{
"epoch": 1.4763139827599359,
"grad_norm": 0.708925724029541,
"learning_rate": 1.5299525087557682e-06,
"loss": 0.5849,
"step": 610
},
{
"epoch": 1.478755053779846,
"grad_norm": 0.6862295866012573,
"learning_rate": 1.5160679993057048e-06,
"loss": 0.5964,
"step": 611
},
{
"epoch": 1.4811961247997558,
"grad_norm": 0.6844595670700073,
"learning_rate": 1.502235511944154e-06,
"loss": 0.5876,
"step": 612
},
{
"epoch": 1.483637195819666,
"grad_norm": 0.6619264483451843,
"learning_rate": 1.488455253217877e-06,
"loss": 0.6242,
"step": 613
},
{
"epoch": 1.4860782668395758,
"grad_norm": 0.6791555881500244,
"learning_rate": 1.4747274288937597e-06,
"loss": 0.624,
"step": 614
},
{
"epoch": 1.4885193378594859,
"grad_norm": 0.6787840723991394,
"learning_rate": 1.461052243955739e-06,
"loss": 0.6298,
"step": 615
}
],
"logging_steps": 1,
"max_steps": 818,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 205,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3668760344669979e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}