{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.079734219269103,
"eval_steps": 500,
"global_step": 650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016611295681063123,
"grad_norm": 283.7961630862445,
"learning_rate": 2.7027027027027028e-08,
"loss": 1.0013,
"step": 1
},
{
"epoch": 0.0033222591362126247,
"grad_norm": 403.0161196007378,
"learning_rate": 5.4054054054054056e-08,
"loss": 1.3486,
"step": 2
},
{
"epoch": 0.0049833887043189366,
"grad_norm": 225.65685288846345,
"learning_rate": 8.108108108108108e-08,
"loss": 0.9601,
"step": 3
},
{
"epoch": 0.006644518272425249,
"grad_norm": 309.07994612900893,
"learning_rate": 1.0810810810810811e-07,
"loss": 0.814,
"step": 4
},
{
"epoch": 0.008305647840531562,
"grad_norm": 380.5732961356113,
"learning_rate": 1.3513513513513515e-07,
"loss": 0.9665,
"step": 5
},
{
"epoch": 0.009966777408637873,
"grad_norm": 350.2306529649741,
"learning_rate": 1.6216216216216215e-07,
"loss": 0.9394,
"step": 6
},
{
"epoch": 0.011627906976744186,
"grad_norm": 303.16533146413684,
"learning_rate": 1.891891891891892e-07,
"loss": 1.1157,
"step": 7
},
{
"epoch": 0.013289036544850499,
"grad_norm": 330.3322177024234,
"learning_rate": 2.1621621621621622e-07,
"loss": 0.8766,
"step": 8
},
{
"epoch": 0.014950166112956811,
"grad_norm": 232.1781635901737,
"learning_rate": 2.4324324324324326e-07,
"loss": 0.7901,
"step": 9
},
{
"epoch": 0.016611295681063124,
"grad_norm": 303.49257454610745,
"learning_rate": 2.702702702702703e-07,
"loss": 0.875,
"step": 10
},
{
"epoch": 0.018272425249169437,
"grad_norm": 191.62808346871842,
"learning_rate": 2.972972972972973e-07,
"loss": 0.6306,
"step": 11
},
{
"epoch": 0.019933554817275746,
"grad_norm": 184.28128658857395,
"learning_rate": 3.243243243243243e-07,
"loss": 0.6915,
"step": 12
},
{
"epoch": 0.02159468438538206,
"grad_norm": 466.93407676031165,
"learning_rate": 3.5135135135135134e-07,
"loss": 0.6333,
"step": 13
},
{
"epoch": 0.023255813953488372,
"grad_norm": 146.99367069527102,
"learning_rate": 3.783783783783784e-07,
"loss": 0.4615,
"step": 14
},
{
"epoch": 0.024916943521594685,
"grad_norm": 146.6363236745373,
"learning_rate": 4.054054054054054e-07,
"loss": 0.6328,
"step": 15
},
{
"epoch": 0.026578073089700997,
"grad_norm": 196.3416905532426,
"learning_rate": 4.3243243243243244e-07,
"loss": 0.6118,
"step": 16
},
{
"epoch": 0.02823920265780731,
"grad_norm": 302.44153365360745,
"learning_rate": 4.594594594594595e-07,
"loss": 0.5397,
"step": 17
},
{
"epoch": 0.029900332225913623,
"grad_norm": 8753.401359110785,
"learning_rate": 4.864864864864865e-07,
"loss": 0.6609,
"step": 18
},
{
"epoch": 0.03156146179401993,
"grad_norm": 139.31291333841494,
"learning_rate": 5.135135135135134e-07,
"loss": 0.4247,
"step": 19
},
{
"epoch": 0.03322259136212625,
"grad_norm": 117.2303002306334,
"learning_rate": 5.405405405405406e-07,
"loss": 0.5025,
"step": 20
},
{
"epoch": 0.03488372093023256,
"grad_norm": 159.0527667166082,
"learning_rate": 5.675675675675675e-07,
"loss": 0.4789,
"step": 21
},
{
"epoch": 0.036544850498338874,
"grad_norm": 211.30553229145636,
"learning_rate": 5.945945945945947e-07,
"loss": 0.6005,
"step": 22
},
{
"epoch": 0.03820598006644518,
"grad_norm": 74.75774440900648,
"learning_rate": 6.216216216216216e-07,
"loss": 0.4322,
"step": 23
},
{
"epoch": 0.03986710963455149,
"grad_norm": 152.6630766770394,
"learning_rate": 6.486486486486486e-07,
"loss": 0.6084,
"step": 24
},
{
"epoch": 0.04152823920265781,
"grad_norm": 68.59356856504525,
"learning_rate": 6.756756756756756e-07,
"loss": 0.3271,
"step": 25
},
{
"epoch": 0.04318936877076412,
"grad_norm": 48.213900231519695,
"learning_rate": 7.027027027027027e-07,
"loss": 0.3455,
"step": 26
},
{
"epoch": 0.044850498338870434,
"grad_norm": 50.0637038462766,
"learning_rate": 7.297297297297297e-07,
"loss": 0.3409,
"step": 27
},
{
"epoch": 0.046511627906976744,
"grad_norm": 64.20573798423982,
"learning_rate": 7.567567567567568e-07,
"loss": 0.448,
"step": 28
},
{
"epoch": 0.04817275747508306,
"grad_norm": 132.42160531061333,
"learning_rate": 7.837837837837838e-07,
"loss": 0.3915,
"step": 29
},
{
"epoch": 0.04983388704318937,
"grad_norm": 101.32898633286817,
"learning_rate": 8.108108108108108e-07,
"loss": 0.3159,
"step": 30
},
{
"epoch": 0.05149501661129568,
"grad_norm": 67.44357173798667,
"learning_rate": 8.378378378378377e-07,
"loss": 0.2786,
"step": 31
},
{
"epoch": 0.053156146179401995,
"grad_norm": 94.87155550615857,
"learning_rate": 8.648648648648649e-07,
"loss": 0.4167,
"step": 32
},
{
"epoch": 0.054817275747508304,
"grad_norm": 120.07807866412833,
"learning_rate": 8.918918918918918e-07,
"loss": 0.3996,
"step": 33
},
{
"epoch": 0.05647840531561462,
"grad_norm": 53.547698067900235,
"learning_rate": 9.18918918918919e-07,
"loss": 0.3981,
"step": 34
},
{
"epoch": 0.05813953488372093,
"grad_norm": 89.39620348618399,
"learning_rate": 9.459459459459459e-07,
"loss": 0.3523,
"step": 35
},
{
"epoch": 0.059800664451827246,
"grad_norm": 125.63871272969014,
"learning_rate": 9.72972972972973e-07,
"loss": 0.343,
"step": 36
},
{
"epoch": 0.061461794019933555,
"grad_norm": 85.85489825737757,
"learning_rate": 1e-06,
"loss": 0.3063,
"step": 37
},
{
"epoch": 0.06312292358803986,
"grad_norm": 56.63108841849816,
"learning_rate": 9.999981882520454e-07,
"loss": 0.4455,
"step": 38
},
{
"epoch": 0.06478405315614617,
"grad_norm": 40.40412484077134,
"learning_rate": 9.999927530213112e-07,
"loss": 0.3411,
"step": 39
},
{
"epoch": 0.0664451827242525,
"grad_norm": 42.56638423746503,
"learning_rate": 9.999836943471866e-07,
"loss": 0.3422,
"step": 40
},
{
"epoch": 0.0681063122923588,
"grad_norm": 149.52998978810905,
"learning_rate": 9.999710122953198e-07,
"loss": 0.3539,
"step": 41
},
{
"epoch": 0.06976744186046512,
"grad_norm": 99.06075800406914,
"learning_rate": 9.999547069576173e-07,
"loss": 0.3705,
"step": 42
},
{
"epoch": 0.07142857142857142,
"grad_norm": 28.75050949230183,
"learning_rate": 9.99934778452244e-07,
"loss": 0.2556,
"step": 43
},
{
"epoch": 0.07308970099667775,
"grad_norm": 102.07599788982593,
"learning_rate": 9.999112269236213e-07,
"loss": 0.3375,
"step": 44
},
{
"epoch": 0.07475083056478406,
"grad_norm": 94.98798632226429,
"learning_rate": 9.99884052542427e-07,
"loss": 0.325,
"step": 45
},
{
"epoch": 0.07641196013289037,
"grad_norm": 45.49599727736315,
"learning_rate": 9.99853255505594e-07,
"loss": 0.3344,
"step": 46
},
{
"epoch": 0.07807308970099668,
"grad_norm": 437.1776518783744,
"learning_rate": 9.99818836036308e-07,
"loss": 0.3195,
"step": 47
},
{
"epoch": 0.07973421926910298,
"grad_norm": 40.386876297214194,
"learning_rate": 9.997807943840063e-07,
"loss": 0.2935,
"step": 48
},
{
"epoch": 0.08139534883720931,
"grad_norm": 35.81499917192016,
"learning_rate": 9.997391308243767e-07,
"loss": 0.3221,
"step": 49
},
{
"epoch": 0.08305647840531562,
"grad_norm": 135.69410164163278,
"learning_rate": 9.996938456593547e-07,
"loss": 0.3641,
"step": 50
},
{
"epoch": 0.08471760797342193,
"grad_norm": 37.49737579141678,
"learning_rate": 9.996449392171216e-07,
"loss": 0.3116,
"step": 51
},
{
"epoch": 0.08637873754152824,
"grad_norm": 53.54810233248991,
"learning_rate": 9.995924118521016e-07,
"loss": 0.2374,
"step": 52
},
{
"epoch": 0.08803986710963455,
"grad_norm": 29.764129425896126,
"learning_rate": 9.995362639449604e-07,
"loss": 0.3214,
"step": 53
},
{
"epoch": 0.08970099667774087,
"grad_norm": 86.36715802553283,
"learning_rate": 9.994764959026014e-07,
"loss": 0.2724,
"step": 54
},
{
"epoch": 0.09136212624584718,
"grad_norm": 59.60483629138851,
"learning_rate": 9.99413108158163e-07,
"loss": 0.2682,
"step": 55
},
{
"epoch": 0.09302325581395349,
"grad_norm": 1121.1809969504397,
"learning_rate": 9.99346101171016e-07,
"loss": 0.3457,
"step": 56
},
{
"epoch": 0.0946843853820598,
"grad_norm": 36.4371245053598,
"learning_rate": 9.99275475426759e-07,
"loss": 0.3518,
"step": 57
},
{
"epoch": 0.09634551495016612,
"grad_norm": 36.679857127723125,
"learning_rate": 9.992012314372164e-07,
"loss": 0.1912,
"step": 58
},
{
"epoch": 0.09800664451827243,
"grad_norm": 168.93933955821868,
"learning_rate": 9.991233697404337e-07,
"loss": 0.2478,
"step": 59
},
{
"epoch": 0.09966777408637874,
"grad_norm": 24.14622539640721,
"learning_rate": 9.990418909006743e-07,
"loss": 0.2118,
"step": 60
},
{
"epoch": 0.10132890365448505,
"grad_norm": 43.18341622294434,
"learning_rate": 9.989567955084143e-07,
"loss": 0.3924,
"step": 61
},
{
"epoch": 0.10299003322259136,
"grad_norm": 29.085536311011936,
"learning_rate": 9.988680841803396e-07,
"loss": 0.2878,
"step": 62
},
{
"epoch": 0.10465116279069768,
"grad_norm": 24.284878773871643,
"learning_rate": 9.987757575593402e-07,
"loss": 0.1948,
"step": 63
},
{
"epoch": 0.10631229235880399,
"grad_norm": 27.805278117108944,
"learning_rate": 9.986798163145066e-07,
"loss": 0.2563,
"step": 64
},
{
"epoch": 0.1079734219269103,
"grad_norm": 22.56960935878,
"learning_rate": 9.985802611411243e-07,
"loss": 0.2298,
"step": 65
},
{
"epoch": 0.10963455149501661,
"grad_norm": 37.482492845739586,
"learning_rate": 9.984770927606686e-07,
"loss": 0.2785,
"step": 66
},
{
"epoch": 0.11129568106312292,
"grad_norm": 23.848350011819495,
"learning_rate": 9.983703119207998e-07,
"loss": 0.2113,
"step": 67
},
{
"epoch": 0.11295681063122924,
"grad_norm": 30.649497045880594,
"learning_rate": 9.98259919395358e-07,
"loss": 0.255,
"step": 68
},
{
"epoch": 0.11461794019933555,
"grad_norm": 23.006604632466246,
"learning_rate": 9.98145915984357e-07,
"loss": 0.224,
"step": 69
},
{
"epoch": 0.11627906976744186,
"grad_norm": 21.882953775970815,
"learning_rate": 9.98028302513978e-07,
"loss": 0.2616,
"step": 70
},
{
"epoch": 0.11794019933554817,
"grad_norm": 25.05081248063707,
"learning_rate": 9.97907079836566e-07,
"loss": 0.2128,
"step": 71
},
{
"epoch": 0.11960132890365449,
"grad_norm": 31.598368685329934,
"learning_rate": 9.977822488306195e-07,
"loss": 0.3792,
"step": 72
},
{
"epoch": 0.1212624584717608,
"grad_norm": 37.48759519077218,
"learning_rate": 9.976538104007886e-07,
"loss": 0.2736,
"step": 73
},
{
"epoch": 0.12292358803986711,
"grad_norm": 19.638414416569248,
"learning_rate": 9.975217654778651e-07,
"loss": 0.1277,
"step": 74
},
{
"epoch": 0.12458471760797342,
"grad_norm": 29.04555059332869,
"learning_rate": 9.97386115018778e-07,
"loss": 0.3141,
"step": 75
},
{
"epoch": 0.12624584717607973,
"grad_norm": 21.430456032128323,
"learning_rate": 9.972468600065845e-07,
"loss": 0.2253,
"step": 76
},
{
"epoch": 0.12790697674418605,
"grad_norm": 41.85901544277914,
"learning_rate": 9.971040014504648e-07,
"loss": 0.3621,
"step": 77
},
{
"epoch": 0.12956810631229235,
"grad_norm": 14.223035223233115,
"learning_rate": 9.969575403857135e-07,
"loss": 0.1284,
"step": 78
},
{
"epoch": 0.13122923588039867,
"grad_norm": 22.262799198485006,
"learning_rate": 9.968074778737332e-07,
"loss": 0.2524,
"step": 79
},
{
"epoch": 0.132890365448505,
"grad_norm": 50.697373968189815,
"learning_rate": 9.966538150020252e-07,
"loss": 0.2189,
"step": 80
},
{
"epoch": 0.1345514950166113,
"grad_norm": 21.86462698311471,
"learning_rate": 9.964965528841833e-07,
"loss": 0.2334,
"step": 81
},
{
"epoch": 0.1362126245847176,
"grad_norm": 30.03903059222607,
"learning_rate": 9.963356926598848e-07,
"loss": 0.2619,
"step": 82
},
{
"epoch": 0.1378737541528239,
"grad_norm": 44.39420516166669,
"learning_rate": 9.961712354948822e-07,
"loss": 0.3148,
"step": 83
},
{
"epoch": 0.13953488372093023,
"grad_norm": 45.40184063467476,
"learning_rate": 9.960031825809955e-07,
"loss": 0.2719,
"step": 84
},
{
"epoch": 0.14119601328903655,
"grad_norm": 41.30803088566475,
"learning_rate": 9.95831535136103e-07,
"loss": 0.2746,
"step": 85
},
{
"epoch": 0.14285714285714285,
"grad_norm": 21.10123734223929,
"learning_rate": 9.956562944041316e-07,
"loss": 0.2082,
"step": 86
},
{
"epoch": 0.14451827242524917,
"grad_norm": 38.37926428810434,
"learning_rate": 9.954774616550499e-07,
"loss": 0.221,
"step": 87
},
{
"epoch": 0.1461794019933555,
"grad_norm": 21.615149648940918,
"learning_rate": 9.952950381848576e-07,
"loss": 0.1952,
"step": 88
},
{
"epoch": 0.1478405315614618,
"grad_norm": 21.749889867769415,
"learning_rate": 9.951090253155757e-07,
"loss": 0.2139,
"step": 89
},
{
"epoch": 0.14950166112956811,
"grad_norm": 16.24745315976571,
"learning_rate": 9.949194243952382e-07,
"loss": 0.1852,
"step": 90
},
{
"epoch": 0.1511627906976744,
"grad_norm": 13.76793966078715,
"learning_rate": 9.94726236797881e-07,
"loss": 0.179,
"step": 91
},
{
"epoch": 0.15282392026578073,
"grad_norm": 16.40584213825403,
"learning_rate": 9.945294639235336e-07,
"loss": 0.2484,
"step": 92
},
{
"epoch": 0.15448504983388706,
"grad_norm": 19.45237506640772,
"learning_rate": 9.943291071982072e-07,
"loss": 0.2379,
"step": 93
},
{
"epoch": 0.15614617940199335,
"grad_norm": 11.996023853804303,
"learning_rate": 9.941251680738852e-07,
"loss": 0.1372,
"step": 94
},
{
"epoch": 0.15780730897009967,
"grad_norm": 16.66955200884909,
"learning_rate": 9.939176480285128e-07,
"loss": 0.1833,
"step": 95
},
{
"epoch": 0.15946843853820597,
"grad_norm": 20.352150945455673,
"learning_rate": 9.93706548565986e-07,
"loss": 0.1878,
"step": 96
},
{
"epoch": 0.1611295681063123,
"grad_norm": 30.86575710552868,
"learning_rate": 9.934918712161414e-07,
"loss": 0.2089,
"step": 97
},
{
"epoch": 0.16279069767441862,
"grad_norm": 30.108048559073602,
"learning_rate": 9.932736175347433e-07,
"loss": 0.2334,
"step": 98
},
{
"epoch": 0.1644518272425249,
"grad_norm": 16.137380650744188,
"learning_rate": 9.930517891034748e-07,
"loss": 0.1935,
"step": 99
},
{
"epoch": 0.16611295681063123,
"grad_norm": 17.736914720083263,
"learning_rate": 9.928263875299245e-07,
"loss": 0.1772,
"step": 100
},
{
"epoch": 0.16777408637873753,
"grad_norm": 11.972509757727426,
"learning_rate": 9.92597414447576e-07,
"loss": 0.1374,
"step": 101
},
{
"epoch": 0.16943521594684385,
"grad_norm": 20.433799792275494,
"learning_rate": 9.923648715157952e-07,
"loss": 0.2198,
"step": 102
},
{
"epoch": 0.17109634551495018,
"grad_norm": 19.404673471029323,
"learning_rate": 9.921287604198185e-07,
"loss": 0.152,
"step": 103
},
{
"epoch": 0.17275747508305647,
"grad_norm": 18.73899376004668,
"learning_rate": 9.918890828707416e-07,
"loss": 0.2282,
"step": 104
},
{
"epoch": 0.1744186046511628,
"grad_norm": 22.306238872760357,
"learning_rate": 9.916458406055055e-07,
"loss": 0.1895,
"step": 105
},
{
"epoch": 0.1760797342192691,
"grad_norm": 19.021294987112594,
"learning_rate": 9.91399035386885e-07,
"loss": 0.2403,
"step": 106
},
{
"epoch": 0.1777408637873754,
"grad_norm": 86.53948554137872,
"learning_rate": 9.911486690034753e-07,
"loss": 0.1723,
"step": 107
},
{
"epoch": 0.17940199335548174,
"grad_norm": 18.7063649829218,
"learning_rate": 9.908947432696798e-07,
"loss": 0.2134,
"step": 108
},
{
"epoch": 0.18106312292358803,
"grad_norm": 19.41271168055036,
"learning_rate": 9.906372600256962e-07,
"loss": 0.225,
"step": 109
},
{
"epoch": 0.18272425249169436,
"grad_norm": 20.719404876758283,
"learning_rate": 9.903762211375032e-07,
"loss": 0.2158,
"step": 110
},
{
"epoch": 0.18438538205980065,
"grad_norm": 21.610832850601305,
"learning_rate": 9.901116284968478e-07,
"loss": 0.2267,
"step": 111
},
{
"epoch": 0.18604651162790697,
"grad_norm": 22.19654692004818,
"learning_rate": 9.898434840212305e-07,
"loss": 0.2376,
"step": 112
},
{
"epoch": 0.1877076411960133,
"grad_norm": 14.838187787942,
"learning_rate": 9.89571789653892e-07,
"loss": 0.1743,
"step": 113
},
{
"epoch": 0.1893687707641196,
"grad_norm": 17.480053388106516,
"learning_rate": 9.892965473637992e-07,
"loss": 0.239,
"step": 114
},
{
"epoch": 0.19102990033222592,
"grad_norm": 18.66511685282266,
"learning_rate": 9.890177591456311e-07,
"loss": 0.2502,
"step": 115
},
{
"epoch": 0.19269102990033224,
"grad_norm": 17.526724768165533,
"learning_rate": 9.887354270197634e-07,
"loss": 0.2557,
"step": 116
},
{
"epoch": 0.19435215946843853,
"grad_norm": 13.92139359787026,
"learning_rate": 9.884495530322548e-07,
"loss": 0.2024,
"step": 117
},
{
"epoch": 0.19601328903654486,
"grad_norm": 11.321726941564314,
"learning_rate": 9.881601392548314e-07,
"loss": 0.1411,
"step": 118
},
{
"epoch": 0.19767441860465115,
"grad_norm": 13.326684770702352,
"learning_rate": 9.878671877848728e-07,
"loss": 0.1813,
"step": 119
},
{
"epoch": 0.19933554817275748,
"grad_norm": 19.563367999650673,
"learning_rate": 9.875707007453957e-07,
"loss": 0.2395,
"step": 120
},
{
"epoch": 0.2009966777408638,
"grad_norm": 19.412272467279955,
"learning_rate": 9.872706802850395e-07,
"loss": 0.1867,
"step": 121
},
{
"epoch": 0.2026578073089701,
"grad_norm": 38.85770250068695,
"learning_rate": 9.869671285780498e-07,
"loss": 0.213,
"step": 122
},
{
"epoch": 0.20431893687707642,
"grad_norm": 12.921432457309935,
"learning_rate": 9.866600478242635e-07,
"loss": 0.1208,
"step": 123
},
{
"epoch": 0.2059800664451827,
"grad_norm": 23.926163027310174,
"learning_rate": 9.863494402490922e-07,
"loss": 0.2012,
"step": 124
},
{
"epoch": 0.20764119601328904,
"grad_norm": 15.480133857657162,
"learning_rate": 9.860353081035065e-07,
"loss": 0.1231,
"step": 125
},
{
"epoch": 0.20930232558139536,
"grad_norm": 22.15508037126096,
"learning_rate": 9.857176536640195e-07,
"loss": 0.2013,
"step": 126
},
{
"epoch": 0.21096345514950166,
"grad_norm": 18.2661529166959,
"learning_rate": 9.853964792326704e-07,
"loss": 0.2317,
"step": 127
},
{
"epoch": 0.21262458471760798,
"grad_norm": 16.111552087003037,
"learning_rate": 9.850717871370073e-07,
"loss": 0.1145,
"step": 128
},
{
"epoch": 0.21428571428571427,
"grad_norm": 17.14067866125475,
"learning_rate": 9.847435797300718e-07,
"loss": 0.2102,
"step": 129
},
{
"epoch": 0.2159468438538206,
"grad_norm": 15.60291480405653,
"learning_rate": 9.844118593903797e-07,
"loss": 0.1035,
"step": 130
},
{
"epoch": 0.21760797342192692,
"grad_norm": 15.174859703173738,
"learning_rate": 9.840766285219059e-07,
"loss": 0.1183,
"step": 131
},
{
"epoch": 0.21926910299003322,
"grad_norm": 25.54546397866724,
"learning_rate": 9.837378895540655e-07,
"loss": 0.2647,
"step": 132
},
{
"epoch": 0.22093023255813954,
"grad_norm": 10.70010328252097,
"learning_rate": 9.833956449416976e-07,
"loss": 0.1388,
"step": 133
},
{
"epoch": 0.22259136212624583,
"grad_norm": 13.281620643715373,
"learning_rate": 9.830498971650454e-07,
"loss": 0.1973,
"step": 134
},
{
"epoch": 0.22425249169435216,
"grad_norm": 15.696504177542776,
"learning_rate": 9.827006487297406e-07,
"loss": 0.2341,
"step": 135
},
{
"epoch": 0.22591362126245848,
"grad_norm": 11.52200533124044,
"learning_rate": 9.823479021667838e-07,
"loss": 0.1317,
"step": 136
},
{
"epoch": 0.22757475083056478,
"grad_norm": 27.726328978283576,
"learning_rate": 9.819916600325262e-07,
"loss": 0.354,
"step": 137
},
{
"epoch": 0.2292358803986711,
"grad_norm": 17.901230984984156,
"learning_rate": 9.816319249086519e-07,
"loss": 0.2298,
"step": 138
},
{
"epoch": 0.23089700996677742,
"grad_norm": 14.384698945178693,
"learning_rate": 9.812686994021582e-07,
"loss": 0.1523,
"step": 139
},
{
"epoch": 0.23255813953488372,
"grad_norm": 11.621939766839967,
"learning_rate": 9.809019861453373e-07,
"loss": 0.2313,
"step": 140
},
{
"epoch": 0.23421926910299004,
"grad_norm": 12.969245765067623,
"learning_rate": 9.805317877957576e-07,
"loss": 0.2519,
"step": 141
},
{
"epoch": 0.23588039867109634,
"grad_norm": 8.925285164861304,
"learning_rate": 9.80158107036243e-07,
"loss": 0.1042,
"step": 142
},
{
"epoch": 0.23754152823920266,
"grad_norm": 13.756628040019768,
"learning_rate": 9.797809465748553e-07,
"loss": 0.1994,
"step": 143
},
{
"epoch": 0.23920265780730898,
"grad_norm": 15.970803014352063,
"learning_rate": 9.794003091448728e-07,
"loss": 0.22,
"step": 144
},
{
"epoch": 0.24086378737541528,
"grad_norm": 11.738958531546247,
"learning_rate": 9.790161975047724e-07,
"loss": 0.1279,
"step": 145
},
{
"epoch": 0.2425249169435216,
"grad_norm": 13.897850888275721,
"learning_rate": 9.786286144382077e-07,
"loss": 0.1566,
"step": 146
},
{
"epoch": 0.2441860465116279,
"grad_norm": 17.313264421559992,
"learning_rate": 9.7823756275399e-07,
"loss": 0.225,
"step": 147
},
{
"epoch": 0.24584717607973422,
"grad_norm": 23.335482929522485,
"learning_rate": 9.77843045286068e-07,
"loss": 0.2193,
"step": 148
},
{
"epoch": 0.24750830564784054,
"grad_norm": 13.474282765831358,
"learning_rate": 9.774450648935062e-07,
"loss": 0.1841,
"step": 149
},
{
"epoch": 0.24916943521594684,
"grad_norm": 9.992948490443583,
"learning_rate": 9.77043624460465e-07,
"loss": 0.1319,
"step": 150
},
{
"epoch": 0.25083056478405313,
"grad_norm": 12.069688114769054,
"learning_rate": 9.766387268961807e-07,
"loss": 0.2002,
"step": 151
},
{
"epoch": 0.25249169435215946,
"grad_norm": 19.552574894536637,
"learning_rate": 9.762303751349421e-07,
"loss": 0.3202,
"step": 152
},
{
"epoch": 0.2541528239202658,
"grad_norm": 15.59457607811674,
"learning_rate": 9.758185721360713e-07,
"loss": 0.134,
"step": 153
},
{
"epoch": 0.2558139534883721,
"grad_norm": 14.504175942466917,
"learning_rate": 9.754033208839009e-07,
"loss": 0.1177,
"step": 154
},
{
"epoch": 0.2574750830564784,
"grad_norm": 14.826282805181464,
"learning_rate": 9.749846243877538e-07,
"loss": 0.1866,
"step": 155
},
{
"epoch": 0.2591362126245847,
"grad_norm": 10.496856702175336,
"learning_rate": 9.745624856819197e-07,
"loss": 0.1535,
"step": 156
},
{
"epoch": 0.260797342192691,
"grad_norm": 17.600209366012557,
"learning_rate": 9.741369078256344e-07,
"loss": 0.1506,
"step": 157
},
{
"epoch": 0.26245847176079734,
"grad_norm": 16.897725025587278,
"learning_rate": 9.737078939030574e-07,
"loss": 0.1118,
"step": 158
},
{
"epoch": 0.26411960132890366,
"grad_norm": 14.824856178621472,
"learning_rate": 9.73275447023249e-07,
"loss": 0.1801,
"step": 159
},
{
"epoch": 0.26578073089701,
"grad_norm": 10.323291152162106,
"learning_rate": 9.728395703201482e-07,
"loss": 0.1151,
"step": 160
},
{
"epoch": 0.26744186046511625,
"grad_norm": 25.65889033205719,
"learning_rate": 9.724002669525494e-07,
"loss": 0.2601,
"step": 161
},
{
"epoch": 0.2691029900332226,
"grad_norm": 15.331455485719754,
"learning_rate": 9.719575401040814e-07,
"loss": 0.2295,
"step": 162
},
{
"epoch": 0.2707641196013289,
"grad_norm": 15.620975269391694,
"learning_rate": 9.715113929831816e-07,
"loss": 0.1661,
"step": 163
},
{
"epoch": 0.2724252491694352,
"grad_norm": 21.27647184161525,
"learning_rate": 9.710618288230743e-07,
"loss": 0.1653,
"step": 164
},
{
"epoch": 0.27408637873754155,
"grad_norm": 9.574800737047406,
"learning_rate": 9.706088508817475e-07,
"loss": 0.1149,
"step": 165
},
{
"epoch": 0.2757475083056478,
"grad_norm": 10.020677235555565,
"learning_rate": 9.701524624419288e-07,
"loss": 0.114,
"step": 166
},
{
"epoch": 0.27740863787375414,
"grad_norm": 26.463216054020872,
"learning_rate": 9.696926668110612e-07,
"loss": 0.2905,
"step": 167
},
{
"epoch": 0.27906976744186046,
"grad_norm": 9.8357213518326,
"learning_rate": 9.692294673212803e-07,
"loss": 0.0852,
"step": 168
},
{
"epoch": 0.2807308970099668,
"grad_norm": 11.574558850099985,
"learning_rate": 9.687628673293887e-07,
"loss": 0.2001,
"step": 169
},
{
"epoch": 0.2823920265780731,
"grad_norm": 20.35819109924581,
"learning_rate": 9.682928702168325e-07,
"loss": 0.2113,
"step": 170
},
{
"epoch": 0.2840531561461794,
"grad_norm": 10.618810483797844,
"learning_rate": 9.678194793896772e-07,
"loss": 0.157,
"step": 171
},
{
"epoch": 0.2857142857142857,
"grad_norm": 12.604859535960964,
"learning_rate": 9.673426982785825e-07,
"loss": 0.1428,
"step": 172
},
{
"epoch": 0.287375415282392,
"grad_norm": 14.237387799784342,
"learning_rate": 9.668625303387768e-07,
"loss": 0.1614,
"step": 173
},
{
"epoch": 0.28903654485049834,
"grad_norm": 10.68448826627167,
"learning_rate": 9.663789790500332e-07,
"loss": 0.1228,
"step": 174
},
{
"epoch": 0.29069767441860467,
"grad_norm": 14.01788767612421,
"learning_rate": 9.658920479166444e-07,
"loss": 0.1634,
"step": 175
},
{
"epoch": 0.292358803986711,
"grad_norm": 12.150319124625579,
"learning_rate": 9.65401740467396e-07,
"loss": 0.1816,
"step": 176
},
{
"epoch": 0.29401993355481726,
"grad_norm": 13.26166970387516,
"learning_rate": 9.649080602555419e-07,
"loss": 0.2212,
"step": 177
},
{
"epoch": 0.2956810631229236,
"grad_norm": 10.456475792269867,
"learning_rate": 9.644110108587791e-07,
"loss": 0.162,
"step": 178
},
{
"epoch": 0.2973421926910299,
"grad_norm": 12.01103083397816,
"learning_rate": 9.6391059587922e-07,
"loss": 0.1953,
"step": 179
},
{
"epoch": 0.29900332225913623,
"grad_norm": 10.932107597417101,
"learning_rate": 9.634068189433682e-07,
"loss": 0.1792,
"step": 180
},
{
"epoch": 0.30066445182724255,
"grad_norm": 9.912596130837807,
"learning_rate": 9.628996837020907e-07,
"loss": 0.171,
"step": 181
},
{
"epoch": 0.3023255813953488,
"grad_norm": 9.446769528764404,
"learning_rate": 9.623891938305928e-07,
"loss": 0.1131,
"step": 182
},
{
"epoch": 0.30398671096345514,
"grad_norm": 11.672641463794086,
"learning_rate": 9.618753530283901e-07,
"loss": 0.1384,
"step": 183
},
{
"epoch": 0.30564784053156147,
"grad_norm": 10.856472785858744,
"learning_rate": 9.613581650192831e-07,
"loss": 0.1635,
"step": 184
},
{
"epoch": 0.3073089700996678,
"grad_norm": 15.534434398535327,
"learning_rate": 9.608376335513285e-07,
"loss": 0.2019,
"step": 185
},
{
"epoch": 0.3089700996677741,
"grad_norm": 9.808233191877529,
"learning_rate": 9.60313762396814e-07,
"loss": 0.0811,
"step": 186
},
{
"epoch": 0.3106312292358804,
"grad_norm": 11.12760822568997,
"learning_rate": 9.597865553522297e-07,
"loss": 0.1293,
"step": 187
},
{
"epoch": 0.3122923588039867,
"grad_norm": 16.610274735181868,
"learning_rate": 9.592560162382403e-07,
"loss": 0.1754,
"step": 188
},
{
"epoch": 0.313953488372093,
"grad_norm": 31.037129088244438,
"learning_rate": 9.587221488996586e-07,
"loss": 0.3788,
"step": 189
},
{
"epoch": 0.31561461794019935,
"grad_norm": 19.704030174639467,
"learning_rate": 9.58184957205417e-07,
"loss": 0.1908,
"step": 190
},
{
"epoch": 0.31727574750830567,
"grad_norm": 11.615505292621943,
"learning_rate": 9.576444450485391e-07,
"loss": 0.1098,
"step": 191
},
{
"epoch": 0.31893687707641194,
"grad_norm": 10.666388155171473,
"learning_rate": 9.571006163461123e-07,
"loss": 0.131,
"step": 192
},
{
"epoch": 0.32059800664451826,
"grad_norm": 17.15366066076218,
"learning_rate": 9.565534750392585e-07,
"loss": 0.2124,
"step": 193
},
{
"epoch": 0.3222591362126246,
"grad_norm": 12.172651785352677,
"learning_rate": 9.560030250931064e-07,
"loss": 0.1371,
"step": 194
},
{
"epoch": 0.3239202657807309,
"grad_norm": 22.03825149725322,
"learning_rate": 9.554492704967624e-07,
"loss": 0.2334,
"step": 195
},
{
"epoch": 0.32558139534883723,
"grad_norm": 13.245192413042442,
"learning_rate": 9.548922152632811e-07,
"loss": 0.1631,
"step": 196
},
{
"epoch": 0.3272425249169435,
"grad_norm": 10.802335247206143,
"learning_rate": 9.543318634296375e-07,
"loss": 0.1568,
"step": 197
},
{
"epoch": 0.3289036544850498,
"grad_norm": 12.17592621693887,
"learning_rate": 9.53768219056697e-07,
"loss": 0.1141,
"step": 198
},
{
"epoch": 0.33056478405315615,
"grad_norm": 11.253478382590625,
"learning_rate": 9.532012862291853e-07,
"loss": 0.1163,
"step": 199
},
{
"epoch": 0.33222591362126247,
"grad_norm": 21.815045663911672,
"learning_rate": 9.526310690556605e-07,
"loss": 0.1867,
"step": 200
},
{
"epoch": 0.3338870431893688,
"grad_norm": 18.859610289112627,
"learning_rate": 9.520575716684811e-07,
"loss": 0.2125,
"step": 201
},
{
"epoch": 0.33554817275747506,
"grad_norm": 11.205977833725816,
"learning_rate": 9.514807982237785e-07,
"loss": 0.1618,
"step": 202
},
{
"epoch": 0.3372093023255814,
"grad_norm": 11.388470403679207,
"learning_rate": 9.50900752901425e-07,
"loss": 0.1184,
"step": 203
},
{
"epoch": 0.3388704318936877,
"grad_norm": 10.075648460940602,
"learning_rate": 9.503174399050043e-07,
"loss": 0.1441,
"step": 204
},
{
"epoch": 0.34053156146179403,
"grad_norm": 12.322759834617335,
"learning_rate": 9.497308634617807e-07,
"loss": 0.1244,
"step": 205
},
{
"epoch": 0.34219269102990035,
"grad_norm": 14.94788430766923,
"learning_rate": 9.491410278226692e-07,
"loss": 0.2405,
"step": 206
},
{
"epoch": 0.3438538205980066,
"grad_norm": 12.443492266603144,
"learning_rate": 9.485479372622037e-07,
"loss": 0.149,
"step": 207
},
{
"epoch": 0.34551495016611294,
"grad_norm": 11.067400945275928,
"learning_rate": 9.479515960785068e-07,
"loss": 0.1404,
"step": 208
},
{
"epoch": 0.34717607973421927,
"grad_norm": 11.525444371950433,
"learning_rate": 9.473520085932579e-07,
"loss": 0.1384,
"step": 209
},
{
"epoch": 0.3488372093023256,
"grad_norm": 10.971783104668468,
"learning_rate": 9.467491791516626e-07,
"loss": 0.1349,
"step": 210
},
{
"epoch": 0.3504983388704319,
"grad_norm": 12.635492557571897,
"learning_rate": 9.461431121224214e-07,
"loss": 0.1997,
"step": 211
},
{
"epoch": 0.3521594684385382,
"grad_norm": 19.325558806797325,
"learning_rate": 9.455338118976966e-07,
"loss": 0.1585,
"step": 212
},
{
"epoch": 0.3538205980066445,
"grad_norm": 8.353857534598658,
"learning_rate": 9.449212828930822e-07,
"loss": 0.1202,
"step": 213
},
{
"epoch": 0.3554817275747508,
"grad_norm": 10.543973821466691,
"learning_rate": 9.443055295475707e-07,
"loss": 0.1858,
"step": 214
},
{
"epoch": 0.35714285714285715,
"grad_norm": 9.95076123718523,
"learning_rate": 9.436865563235217e-07,
"loss": 0.179,
"step": 215
},
{
"epoch": 0.3588039867109635,
"grad_norm": 13.713185473400454,
"learning_rate": 9.430643677066291e-07,
"loss": 0.1925,
"step": 216
},
{
"epoch": 0.36046511627906974,
"grad_norm": 7.513881407573629,
"learning_rate": 9.424389682058886e-07,
"loss": 0.1222,
"step": 217
},
{
"epoch": 0.36212624584717606,
"grad_norm": 11.36739084106459,
"learning_rate": 9.418103623535653e-07,
"loss": 0.1867,
"step": 218
},
{
"epoch": 0.3637873754152824,
"grad_norm": 12.043749514925038,
"learning_rate": 9.41178554705161e-07,
"loss": 0.1931,
"step": 219
},
{
"epoch": 0.3654485049833887,
"grad_norm": 6.233797219912333,
"learning_rate": 9.405435498393799e-07,
"loss": 0.0966,
"step": 220
},
{
"epoch": 0.36710963455149503,
"grad_norm": 9.594573864750178,
"learning_rate": 9.399053523580976e-07,
"loss": 0.1386,
"step": 221
},
{
"epoch": 0.3687707641196013,
"grad_norm": 11.240178079664732,
"learning_rate": 9.392639668863258e-07,
"loss": 0.1203,
"step": 222
},
{
"epoch": 0.3704318936877076,
"grad_norm": 14.503931970774168,
"learning_rate": 9.3861939807218e-07,
"loss": 0.1463,
"step": 223
},
{
"epoch": 0.37209302325581395,
"grad_norm": 10.257728339652989,
"learning_rate": 9.379716505868447e-07,
"loss": 0.1593,
"step": 224
},
{
"epoch": 0.37375415282392027,
"grad_norm": 12.14682043932465,
"learning_rate": 9.373207291245411e-07,
"loss": 0.1257,
"step": 225
},
{
"epoch": 0.3754152823920266,
"grad_norm": 9.99859322996326,
"learning_rate": 9.366666384024913e-07,
"loss": 0.1696,
"step": 226
},
{
"epoch": 0.3770764119601329,
"grad_norm": 10.04625893529298,
"learning_rate": 9.360093831608856e-07,
"loss": 0.1625,
"step": 227
},
{
"epoch": 0.3787375415282392,
"grad_norm": 16.19965207561594,
"learning_rate": 9.353489681628475e-07,
"loss": 0.1471,
"step": 228
},
{
"epoch": 0.3803986710963455,
"grad_norm": 13.04710404485369,
"learning_rate": 9.346853981943988e-07,
"loss": 0.1499,
"step": 229
},
{
"epoch": 0.38205980066445183,
"grad_norm": 11.06095587308583,
"learning_rate": 9.340186780644259e-07,
"loss": 0.0893,
"step": 230
},
{
"epoch": 0.38372093023255816,
"grad_norm": 11.353482545061441,
"learning_rate": 9.333488126046438e-07,
"loss": 0.1214,
"step": 231
},
{
"epoch": 0.3853820598006645,
"grad_norm": 17.31613403125687,
"learning_rate": 9.326758066695624e-07,
"loss": 0.1278,
"step": 232
},
{
"epoch": 0.38704318936877075,
"grad_norm": 19.69366300321997,
"learning_rate": 9.319996651364499e-07,
"loss": 0.1722,
"step": 233
},
{
"epoch": 0.38870431893687707,
"grad_norm": 13.58019477734271,
"learning_rate": 9.313203929052986e-07,
"loss": 0.1316,
"step": 234
},
{
"epoch": 0.3903654485049834,
"grad_norm": 22.806246251767572,
"learning_rate": 9.306379948987888e-07,
"loss": 0.2574,
"step": 235
},
{
"epoch": 0.3920265780730897,
"grad_norm": 16.293534036256254,
"learning_rate": 9.299524760622533e-07,
"loss": 0.1146,
"step": 236
},
{
"epoch": 0.39368770764119604,
"grad_norm": 9.215117865221517,
"learning_rate": 9.292638413636414e-07,
"loss": 0.0652,
"step": 237
},
{
"epoch": 0.3953488372093023,
"grad_norm": 19.243051206888683,
"learning_rate": 9.285720957934831e-07,
"loss": 0.2231,
"step": 238
},
{
"epoch": 0.39700996677740863,
"grad_norm": 11.51135738383444,
"learning_rate": 9.278772443648531e-07,
"loss": 0.1822,
"step": 239
},
{
"epoch": 0.39867109634551495,
"grad_norm": 12.885715889145898,
"learning_rate": 9.271792921133337e-07,
"loss": 0.1281,
"step": 240
},
{
"epoch": 0.4003322259136213,
"grad_norm": 12.475318988797966,
"learning_rate": 9.264782440969793e-07,
"loss": 0.1822,
"step": 241
},
{
"epoch": 0.4019933554817276,
"grad_norm": 9.33400580821295,
"learning_rate": 9.257741053962794e-07,
"loss": 0.1347,
"step": 242
},
{
"epoch": 0.40365448504983387,
"grad_norm": 12.058606856026875,
"learning_rate": 9.25066881114121e-07,
"loss": 0.1706,
"step": 243
},
{
"epoch": 0.4053156146179402,
"grad_norm": 10.824161359526528,
"learning_rate": 9.243565763757529e-07,
"loss": 0.1761,
"step": 244
},
{
"epoch": 0.4069767441860465,
"grad_norm": 14.581271946544478,
"learning_rate": 9.236431963287477e-07,
"loss": 0.2583,
"step": 245
},
{
"epoch": 0.40863787375415284,
"grad_norm": 6.398803338643679,
"learning_rate": 9.229267461429647e-07,
"loss": 0.1036,
"step": 246
},
{
"epoch": 0.41029900332225916,
"grad_norm": 8.214273717131517,
"learning_rate": 9.222072310105126e-07,
"loss": 0.151,
"step": 247
},
{
"epoch": 0.4119601328903654,
"grad_norm": 6.531248071881361,
"learning_rate": 9.214846561457117e-07,
"loss": 0.1343,
"step": 248
},
{
"epoch": 0.41362126245847175,
"grad_norm": 7.946424836117755,
"learning_rate": 9.207590267850562e-07,
"loss": 0.1339,
"step": 249
},
{
"epoch": 0.4152823920265781,
"grad_norm": 14.683229666391957,
"learning_rate": 9.200303481871758e-07,
"loss": 0.2346,
"step": 250
},
{
"epoch": 0.4169435215946844,
"grad_norm": 6.7523422282754595,
"learning_rate": 9.192986256327989e-07,
"loss": 0.1082,
"step": 251
},
{
"epoch": 0.4186046511627907,
"grad_norm": 12.807771031205872,
"learning_rate": 9.185638644247122e-07,
"loss": 0.172,
"step": 252
},
{
"epoch": 0.420265780730897,
"grad_norm": 12.441325438265876,
"learning_rate": 9.178260698877247e-07,
"loss": 0.1524,
"step": 253
},
{
"epoch": 0.4219269102990033,
"grad_norm": 11.924760374595467,
"learning_rate": 9.170852473686272e-07,
"loss": 0.145,
"step": 254
},
{
"epoch": 0.42358803986710963,
"grad_norm": 15.295324115541575,
"learning_rate": 9.163414022361542e-07,
"loss": 0.2366,
"step": 255
},
{
"epoch": 0.42524916943521596,
"grad_norm": 11.45834430193356,
"learning_rate": 9.155945398809457e-07,
"loss": 0.1714,
"step": 256
},
{
"epoch": 0.4269102990033223,
"grad_norm": 11.860210597995017,
"learning_rate": 9.148446657155069e-07,
"loss": 0.1581,
"step": 257
},
{
"epoch": 0.42857142857142855,
"grad_norm": 14.653302275353555,
"learning_rate": 9.140917851741696e-07,
"loss": 0.1782,
"step": 258
},
{
"epoch": 0.43023255813953487,
"grad_norm": 11.340233977827461,
"learning_rate": 9.13335903713053e-07,
"loss": 0.135,
"step": 259
},
{
"epoch": 0.4318936877076412,
"grad_norm": 12.33215015955027,
"learning_rate": 9.125770268100241e-07,
"loss": 0.1755,
"step": 260
},
{
"epoch": 0.4335548172757475,
"grad_norm": 10.82579948342303,
"learning_rate": 9.118151599646573e-07,
"loss": 0.1775,
"step": 261
},
{
"epoch": 0.43521594684385384,
"grad_norm": 8.893649702916877,
"learning_rate": 9.110503086981955e-07,
"loss": 0.134,
"step": 262
},
{
"epoch": 0.4368770764119601,
"grad_norm": 11.851382147068934,
"learning_rate": 9.102824785535096e-07,
"loss": 0.248,
"step": 263
},
{
"epoch": 0.43853820598006643,
"grad_norm": 8.085625713042226,
"learning_rate": 9.095116750950583e-07,
"loss": 0.1053,
"step": 264
},
{
"epoch": 0.44019933554817275,
"grad_norm": 8.825530836149932,
"learning_rate": 9.087379039088481e-07,
"loss": 0.1699,
"step": 265
},
{
"epoch": 0.4418604651162791,
"grad_norm": 8.37105680491955,
"learning_rate": 9.079611706023925e-07,
"loss": 0.1496,
"step": 266
},
{
"epoch": 0.4435215946843854,
"grad_norm": 8.257776032135112,
"learning_rate": 9.071814808046709e-07,
"loss": 0.1492,
"step": 267
},
{
"epoch": 0.44518272425249167,
"grad_norm": 9.848656586818922,
"learning_rate": 9.063988401660895e-07,
"loss": 0.1167,
"step": 268
},
{
"epoch": 0.446843853820598,
"grad_norm": 13.15071986231609,
"learning_rate": 9.056132543584385e-07,
"loss": 0.2396,
"step": 269
},
{
"epoch": 0.4485049833887043,
"grad_norm": 7.4908331870411065,
"learning_rate": 9.048247290748516e-07,
"loss": 0.1152,
"step": 270
},
{
"epoch": 0.45016611295681064,
"grad_norm": 8.35585031303107,
"learning_rate": 9.040332700297651e-07,
"loss": 0.0845,
"step": 271
},
{
"epoch": 0.45182724252491696,
"grad_norm": 18.299405153632886,
"learning_rate": 9.032388829588764e-07,
"loss": 0.1516,
"step": 272
},
{
"epoch": 0.45348837209302323,
"grad_norm": 10.140601449856803,
"learning_rate": 9.02441573619102e-07,
"loss": 0.1274,
"step": 273
},
{
"epoch": 0.45514950166112955,
"grad_norm": 12.172095204640925,
"learning_rate": 9.01641347788536e-07,
"loss": 0.182,
"step": 274
},
{
"epoch": 0.4568106312292359,
"grad_norm": 9.563752668175168,
"learning_rate": 9.008382112664088e-07,
"loss": 0.0945,
"step": 275
},
{
"epoch": 0.4584717607973422,
"grad_norm": 10.287796564887433,
"learning_rate": 9.000321698730439e-07,
"loss": 0.0976,
"step": 276
},
{
"epoch": 0.4601328903654485,
"grad_norm": 13.189377107182274,
"learning_rate": 8.992232294498169e-07,
"loss": 0.1124,
"step": 277
},
{
"epoch": 0.46179401993355484,
"grad_norm": 25.060289240642998,
"learning_rate": 8.984113958591124e-07,
"loss": 0.1806,
"step": 278
},
{
"epoch": 0.4634551495016611,
"grad_norm": 14.670517988936247,
"learning_rate": 8.975966749842816e-07,
"loss": 0.1432,
"step": 279
},
{
"epoch": 0.46511627906976744,
"grad_norm": 19.822372119676242,
"learning_rate": 8.967790727296001e-07,
"loss": 0.2261,
"step": 280
},
{
"epoch": 0.46677740863787376,
"grad_norm": 5.631680487351567,
"learning_rate": 8.959585950202248e-07,
"loss": 0.0537,
"step": 281
},
{
"epoch": 0.4684385382059801,
"grad_norm": 21.281978024222216,
"learning_rate": 8.95135247802151e-07,
"loss": 0.2133,
"step": 282
},
{
"epoch": 0.4700996677740864,
"grad_norm": 16.409996226272277,
"learning_rate": 8.943090370421691e-07,
"loss": 0.1548,
"step": 283
},
{
"epoch": 0.4717607973421927,
"grad_norm": 10.142872282405547,
"learning_rate": 8.934799687278219e-07,
"loss": 0.1067,
"step": 284
},
{
"epoch": 0.473421926910299,
"grad_norm": 22.26876142574316,
"learning_rate": 8.926480488673605e-07,
"loss": 0.1667,
"step": 285
},
{
"epoch": 0.4750830564784053,
"grad_norm": 9.876108544387835,
"learning_rate": 8.918132834897015e-07,
"loss": 0.1081,
"step": 286
},
{
"epoch": 0.47674418604651164,
"grad_norm": 13.966378148098578,
"learning_rate": 8.909756786443827e-07,
"loss": 0.1993,
"step": 287
},
{
"epoch": 0.47840531561461797,
"grad_norm": 14.484542043786044,
"learning_rate": 8.901352404015194e-07,
"loss": 0.1349,
"step": 288
},
{
"epoch": 0.48006644518272423,
"grad_norm": 7.865469366519635,
"learning_rate": 8.89291974851761e-07,
"loss": 0.0748,
"step": 289
},
{
"epoch": 0.48172757475083056,
"grad_norm": 12.981698322123203,
"learning_rate": 8.884458881062457e-07,
"loss": 0.1387,
"step": 290
},
{
"epoch": 0.4833887043189369,
"grad_norm": 14.42068589553622,
"learning_rate": 8.875969862965574e-07,
"loss": 0.1887,
"step": 291
},
{
"epoch": 0.4850498338870432,
"grad_norm": 19.32168616528694,
"learning_rate": 8.867452755746805e-07,
"loss": 0.1184,
"step": 292
},
{
"epoch": 0.4867109634551495,
"grad_norm": 14.687720171684697,
"learning_rate": 8.858907621129559e-07,
"loss": 0.1596,
"step": 293
},
{
"epoch": 0.4883720930232558,
"grad_norm": 10.32826671997461,
"learning_rate": 8.850334521040352e-07,
"loss": 0.1432,
"step": 294
},
{
"epoch": 0.4900332225913621,
"grad_norm": 9.149170000100604,
"learning_rate": 8.841733517608374e-07,
"loss": 0.1725,
"step": 295
},
{
"epoch": 0.49169435215946844,
"grad_norm": 7.682038024526175,
"learning_rate": 8.833104673165024e-07,
"loss": 0.1473,
"step": 296
},
{
"epoch": 0.49335548172757476,
"grad_norm": 7.36387604217277,
"learning_rate": 8.824448050243469e-07,
"loss": 0.1065,
"step": 297
},
{
"epoch": 0.4950166112956811,
"grad_norm": 10.351711544120024,
"learning_rate": 8.815763711578183e-07,
"loss": 0.1717,
"step": 298
},
{
"epoch": 0.49667774086378735,
"grad_norm": 10.17527289922022,
"learning_rate": 8.8070517201045e-07,
"loss": 0.1498,
"step": 299
},
{
"epoch": 0.4983388704318937,
"grad_norm": 17.205640014020535,
"learning_rate": 8.798312138958146e-07,
"loss": 0.1562,
"step": 300
},
{
"epoch": 0.5,
"grad_norm": 13.741848852332474,
"learning_rate": 8.789545031474799e-07,
"loss": 0.1875,
"step": 301
},
{
"epoch": 0.5016611295681063,
"grad_norm": 7.794673805015374,
"learning_rate": 8.780750461189612e-07,
"loss": 0.1141,
"step": 302
},
{
"epoch": 0.5033222591362126,
"grad_norm": 11.818808426001134,
"learning_rate": 8.771928491836764e-07,
"loss": 0.1633,
"step": 303
},
{
"epoch": 0.5049833887043189,
"grad_norm": 7.761679819092504,
"learning_rate": 8.763079187348999e-07,
"loss": 0.1248,
"step": 304
},
{
"epoch": 0.5066445182724253,
"grad_norm": 8.882760685506408,
"learning_rate": 8.754202611857149e-07,
"loss": 0.1513,
"step": 305
},
{
"epoch": 0.5083056478405316,
"grad_norm": 7.40135368569042,
"learning_rate": 8.745298829689686e-07,
"loss": 0.0891,
"step": 306
},
{
"epoch": 0.5099667774086378,
"grad_norm": 10.925689760300747,
"learning_rate": 8.736367905372246e-07,
"loss": 0.1939,
"step": 307
},
{
"epoch": 0.5116279069767442,
"grad_norm": 11.424412238662494,
"learning_rate": 8.727409903627165e-07,
"loss": 0.1181,
"step": 308
},
{
"epoch": 0.5132890365448505,
"grad_norm": 11.026582956807562,
"learning_rate": 8.71842488937301e-07,
"loss": 0.1892,
"step": 309
},
{
"epoch": 0.5149501661129569,
"grad_norm": 14.452286426871511,
"learning_rate": 8.709412927724103e-07,
"loss": 0.1648,
"step": 310
},
{
"epoch": 0.5166112956810631,
"grad_norm": 8.879574876643009,
"learning_rate": 8.700374083990057e-07,
"loss": 0.1412,
"step": 311
},
{
"epoch": 0.5182724252491694,
"grad_norm": 14.483071618844798,
"learning_rate": 8.691308423675299e-07,
"loss": 0.2708,
"step": 312
},
{
"epoch": 0.5199335548172758,
"grad_norm": 10.234610377564792,
"learning_rate": 8.682216012478596e-07,
"loss": 0.1516,
"step": 313
},
{
"epoch": 0.521594684385382,
"grad_norm": 17.446807434470152,
"learning_rate": 8.673096916292576e-07,
"loss": 0.1629,
"step": 314
},
{
"epoch": 0.5232558139534884,
"grad_norm": 9.65924900517999,
"learning_rate": 8.663951201203254e-07,
"loss": 0.1413,
"step": 315
},
{
"epoch": 0.5249169435215947,
"grad_norm": 9.522606938597889,
"learning_rate": 8.654778933489556e-07,
"loss": 0.1678,
"step": 316
},
{
"epoch": 0.526578073089701,
"grad_norm": 10.287315185571302,
"learning_rate": 8.645580179622828e-07,
"loss": 0.1753,
"step": 317
},
{
"epoch": 0.5282392026578073,
"grad_norm": 9.999270763772957,
"learning_rate": 8.636355006266365e-07,
"loss": 0.1578,
"step": 318
},
{
"epoch": 0.5299003322259136,
"grad_norm": 9.32239950078714,
"learning_rate": 8.627103480274921e-07,
"loss": 0.1659,
"step": 319
},
{
"epoch": 0.53156146179402,
"grad_norm": 6.923158524420186,
"learning_rate": 8.617825668694232e-07,
"loss": 0.1233,
"step": 320
},
{
"epoch": 0.5332225913621262,
"grad_norm": 10.823618378270748,
"learning_rate": 8.60852163876052e-07,
"loss": 0.1538,
"step": 321
},
{
"epoch": 0.5348837209302325,
"grad_norm": 11.047906207579194,
"learning_rate": 8.599191457900016e-07,
"loss": 0.1547,
"step": 322
},
{
"epoch": 0.5365448504983389,
"grad_norm": 13.05735973202964,
"learning_rate": 8.589835193728463e-07,
"loss": 0.1444,
"step": 323
},
{
"epoch": 0.5382059800664452,
"grad_norm": 9.104429009432321,
"learning_rate": 8.580452914050631e-07,
"loss": 0.1255,
"step": 324
},
{
"epoch": 0.5398671096345515,
"grad_norm": 11.64252163137442,
"learning_rate": 8.571044686859825e-07,
"loss": 0.1912,
"step": 325
},
{
"epoch": 0.5415282392026578,
"grad_norm": 14.944866982774602,
"learning_rate": 8.561610580337391e-07,
"loss": 0.1768,
"step": 326
},
{
"epoch": 0.5431893687707641,
"grad_norm": 9.488326230185102,
"learning_rate": 8.55215066285222e-07,
"loss": 0.1118,
"step": 327
},
{
"epoch": 0.5448504983388704,
"grad_norm": 9.85897825770774,
"learning_rate": 8.542665002960257e-07,
"loss": 0.1025,
"step": 328
},
{
"epoch": 0.5465116279069767,
"grad_norm": 8.684946844670787,
"learning_rate": 8.533153669404001e-07,
"loss": 0.1264,
"step": 329
},
{
"epoch": 0.5481727574750831,
"grad_norm": 10.062901574236582,
"learning_rate": 8.523616731112011e-07,
"loss": 0.1723,
"step": 330
},
{
"epoch": 0.5498338870431894,
"grad_norm": 12.11357006155644,
"learning_rate": 8.514054257198398e-07,
"loss": 0.1531,
"step": 331
},
{
"epoch": 0.5514950166112956,
"grad_norm": 11.839908028521032,
"learning_rate": 8.504466316962336e-07,
"loss": 0.1442,
"step": 332
},
{
"epoch": 0.553156146179402,
"grad_norm": 11.844766776417758,
"learning_rate": 8.494852979887544e-07,
"loss": 0.1071,
"step": 333
},
{
"epoch": 0.5548172757475083,
"grad_norm": 10.271319518938574,
"learning_rate": 8.4852143156418e-07,
"loss": 0.149,
"step": 334
},
{
"epoch": 0.5564784053156147,
"grad_norm": 11.779914075239326,
"learning_rate": 8.475550394076426e-07,
"loss": 0.1389,
"step": 335
},
{
"epoch": 0.5581395348837209,
"grad_norm": 10.435527692770954,
"learning_rate": 8.465861285225781e-07,
"loss": 0.149,
"step": 336
},
{
"epoch": 0.5598006644518272,
"grad_norm": 9.38848130124771,
"learning_rate": 8.456147059306757e-07,
"loss": 0.0886,
"step": 337
},
{
"epoch": 0.5614617940199336,
"grad_norm": 10.191781455117614,
"learning_rate": 8.446407786718273e-07,
"loss": 0.1092,
"step": 338
},
{
"epoch": 0.5631229235880398,
"grad_norm": 10.76683247123338,
"learning_rate": 8.436643538040753e-07,
"loss": 0.1363,
"step": 339
},
{
"epoch": 0.5647840531561462,
"grad_norm": 10.294295935493142,
"learning_rate": 8.426854384035631e-07,
"loss": 0.0882,
"step": 340
},
{
"epoch": 0.5664451827242525,
"grad_norm": 16.910697465451545,
"learning_rate": 8.417040395644825e-07,
"loss": 0.1969,
"step": 341
},
{
"epoch": 0.5681063122923588,
"grad_norm": 15.166734734046708,
"learning_rate": 8.40720164399023e-07,
"loss": 0.1724,
"step": 342
},
{
"epoch": 0.5697674418604651,
"grad_norm": 11.95776400356682,
"learning_rate": 8.397338200373194e-07,
"loss": 0.1101,
"step": 343
},
{
"epoch": 0.5714285714285714,
"grad_norm": 14.389186956170414,
"learning_rate": 8.387450136274017e-07,
"loss": 0.1589,
"step": 344
},
{
"epoch": 0.5730897009966778,
"grad_norm": 14.678898341131756,
"learning_rate": 8.377537523351417e-07,
"loss": 0.1563,
"step": 345
},
{
"epoch": 0.574750830564784,
"grad_norm": 10.867669749295963,
"learning_rate": 8.367600433442018e-07,
"loss": 0.1036,
"step": 346
},
{
"epoch": 0.5764119601328903,
"grad_norm": 20.043379096999764,
"learning_rate": 8.35763893855983e-07,
"loss": 0.2066,
"step": 347
},
{
"epoch": 0.5780730897009967,
"grad_norm": 13.690312486819662,
"learning_rate": 8.347653110895725e-07,
"loss": 0.156,
"step": 348
},
{
"epoch": 0.579734219269103,
"grad_norm": 10.391292183316825,
"learning_rate": 8.337643022816914e-07,
"loss": 0.1022,
"step": 349
},
{
"epoch": 0.5813953488372093,
"grad_norm": 9.001405280579482,
"learning_rate": 8.327608746866423e-07,
"loss": 0.101,
"step": 350
},
{
"epoch": 0.5830564784053156,
"grad_norm": 8.24123642090196,
"learning_rate": 8.31755035576257e-07,
"loss": 0.0964,
"step": 351
},
{
"epoch": 0.584717607973422,
"grad_norm": 9.492222500148317,
"learning_rate": 8.307467922398432e-07,
"loss": 0.1317,
"step": 352
},
{
"epoch": 0.5863787375415282,
"grad_norm": 15.062860420034843,
"learning_rate": 8.297361519841318e-07,
"loss": 0.2075,
"step": 353
},
{
"epoch": 0.5880398671096345,
"grad_norm": 14.016489579298407,
"learning_rate": 8.28723122133225e-07,
"loss": 0.2038,
"step": 354
},
{
"epoch": 0.5897009966777409,
"grad_norm": 10.966278256892918,
"learning_rate": 8.277077100285412e-07,
"loss": 0.1182,
"step": 355
},
{
"epoch": 0.5913621262458472,
"grad_norm": 10.72462410494972,
"learning_rate": 8.266899230287642e-07,
"loss": 0.1052,
"step": 356
},
{
"epoch": 0.5930232558139535,
"grad_norm": 10.814041170004836,
"learning_rate": 8.256697685097877e-07,
"loss": 0.0989,
"step": 357
},
{
"epoch": 0.5946843853820598,
"grad_norm": 15.163523649534604,
"learning_rate": 8.246472538646634e-07,
"loss": 0.1417,
"step": 358
},
{
"epoch": 0.5963455149501661,
"grad_norm": 11.111867453534417,
"learning_rate": 8.236223865035465e-07,
"loss": 0.1706,
"step": 359
},
{
"epoch": 0.5980066445182725,
"grad_norm": 10.54296261815159,
"learning_rate": 8.225951738536423e-07,
"loss": 0.1287,
"step": 360
},
{
"epoch": 0.5996677740863787,
"grad_norm": 8.36851786714232,
"learning_rate": 8.215656233591524e-07,
"loss": 0.1091,
"step": 361
},
{
"epoch": 0.6013289036544851,
"grad_norm": 14.223204073643483,
"learning_rate": 8.205337424812208e-07,
"loss": 0.1974,
"step": 362
},
{
"epoch": 0.6029900332225914,
"grad_norm": 9.381500918095396,
"learning_rate": 8.194995386978803e-07,
"loss": 0.1167,
"step": 363
},
{
"epoch": 0.6046511627906976,
"grad_norm": 13.697438543636968,
"learning_rate": 8.184630195039965e-07,
"loss": 0.1341,
"step": 364
},
{
"epoch": 0.606312292358804,
"grad_norm": 11.51516985079088,
"learning_rate": 8.17424192411216e-07,
"loss": 0.1389,
"step": 365
},
{
"epoch": 0.6079734219269103,
"grad_norm": 17.961936532222403,
"learning_rate": 8.163830649479101e-07,
"loss": 0.2059,
"step": 366
},
{
"epoch": 0.6096345514950167,
"grad_norm": 11.255484209831073,
"learning_rate": 8.15339644659121e-07,
"loss": 0.1636,
"step": 367
},
{
"epoch": 0.6112956810631229,
"grad_norm": 14.400184086382511,
"learning_rate": 8.14293939106507e-07,
"loss": 0.2286,
"step": 368
},
{
"epoch": 0.6129568106312292,
"grad_norm": 11.730011163552305,
"learning_rate": 8.132459558682878e-07,
"loss": 0.1594,
"step": 369
},
{
"epoch": 0.6146179401993356,
"grad_norm": 10.322545416211497,
"learning_rate": 8.121957025391891e-07,
"loss": 0.1497,
"step": 370
},
{
"epoch": 0.6162790697674418,
"grad_norm": 10.109902353400317,
"learning_rate": 8.111431867303884e-07,
"loss": 0.1422,
"step": 371
},
{
"epoch": 0.6179401993355482,
"grad_norm": 8.097191708903397,
"learning_rate": 8.10088416069459e-07,
"loss": 0.0915,
"step": 372
},
{
"epoch": 0.6196013289036545,
"grad_norm": 15.757926168628456,
"learning_rate": 8.090313982003155e-07,
"loss": 0.2464,
"step": 373
},
{
"epoch": 0.6212624584717608,
"grad_norm": 12.285328658201744,
"learning_rate": 8.079721407831574e-07,
"loss": 0.1759,
"step": 374
},
{
"epoch": 0.6229235880398671,
"grad_norm": 9.06127827692409,
"learning_rate": 8.06910651494415e-07,
"loss": 0.1211,
"step": 375
},
{
"epoch": 0.6245847176079734,
"grad_norm": 6.734374794139948,
"learning_rate": 8.058469380266921e-07,
"loss": 0.11,
"step": 376
},
{
"epoch": 0.6262458471760798,
"grad_norm": 9.929774756393966,
"learning_rate": 8.047810080887116e-07,
"loss": 0.146,
"step": 377
},
{
"epoch": 0.627906976744186,
"grad_norm": 16.089957357745977,
"learning_rate": 8.037128694052588e-07,
"loss": 0.2195,
"step": 378
},
{
"epoch": 0.6295681063122923,
"grad_norm": 15.96907916727788,
"learning_rate": 8.026425297171266e-07,
"loss": 0.1866,
"step": 379
},
{
"epoch": 0.6312292358803987,
"grad_norm": 10.24243255900198,
"learning_rate": 8.015699967810576e-07,
"loss": 0.1659,
"step": 380
},
{
"epoch": 0.632890365448505,
"grad_norm": 8.298998658122965,
"learning_rate": 8.004952783696891e-07,
"loss": 0.1212,
"step": 381
},
{
"epoch": 0.6345514950166113,
"grad_norm": 6.94281041135004,
"learning_rate": 7.994183822714968e-07,
"loss": 0.0888,
"step": 382
},
{
"epoch": 0.6362126245847176,
"grad_norm": 10.305910457713361,
"learning_rate": 7.983393162907379e-07,
"loss": 0.1903,
"step": 383
},
{
"epoch": 0.6378737541528239,
"grad_norm": 7.464775960157331,
"learning_rate": 7.972580882473946e-07,
"loss": 0.097,
"step": 384
},
{
"epoch": 0.6395348837209303,
"grad_norm": 7.4746993771307055,
"learning_rate": 7.961747059771179e-07,
"loss": 0.1109,
"step": 385
},
{
"epoch": 0.6411960132890365,
"grad_norm": 6.615341400751429,
"learning_rate": 7.950891773311701e-07,
"loss": 0.0779,
"step": 386
},
{
"epoch": 0.6428571428571429,
"grad_norm": 17.60157058625115,
"learning_rate": 7.940015101763684e-07,
"loss": 0.2216,
"step": 387
},
{
"epoch": 0.6445182724252492,
"grad_norm": 11.013583175188954,
"learning_rate": 7.92911712395028e-07,
"loss": 0.172,
"step": 388
},
{
"epoch": 0.6461794019933554,
"grad_norm": 8.0676059944362,
"learning_rate": 7.918197918849042e-07,
"loss": 0.1122,
"step": 389
},
{
"epoch": 0.6478405315614618,
"grad_norm": 10.8514146503401,
"learning_rate": 7.907257565591362e-07,
"loss": 0.082,
"step": 390
},
{
"epoch": 0.6495016611295681,
"grad_norm": 12.550088112103355,
"learning_rate": 7.896296143461889e-07,
"loss": 0.1142,
"step": 391
},
{
"epoch": 0.6511627906976745,
"grad_norm": 12.146722322785989,
"learning_rate": 7.885313731897962e-07,
"loss": 0.1843,
"step": 392
},
{
"epoch": 0.6528239202657807,
"grad_norm": 11.011969414306924,
"learning_rate": 7.874310410489027e-07,
"loss": 0.1209,
"step": 393
},
{
"epoch": 0.654485049833887,
"grad_norm": 6.810537387951736,
"learning_rate": 7.863286258976061e-07,
"loss": 0.0608,
"step": 394
},
{
"epoch": 0.6561461794019934,
"grad_norm": 10.89167605967294,
"learning_rate": 7.852241357251002e-07,
"loss": 0.1189,
"step": 395
},
{
"epoch": 0.6578073089700996,
"grad_norm": 14.859678250628704,
"learning_rate": 7.841175785356165e-07,
"loss": 0.1324,
"step": 396
},
{
"epoch": 0.659468438538206,
"grad_norm": 14.58129989899506,
"learning_rate": 7.830089623483656e-07,
"loss": 0.1417,
"step": 397
},
{
"epoch": 0.6611295681063123,
"grad_norm": 16.34306018101847,
"learning_rate": 7.818982951974798e-07,
"loss": 0.1263,
"step": 398
},
{
"epoch": 0.6627906976744186,
"grad_norm": 9.974280701968183,
"learning_rate": 7.807855851319554e-07,
"loss": 0.1354,
"step": 399
},
{
"epoch": 0.6644518272425249,
"grad_norm": 14.425725799753549,
"learning_rate": 7.796708402155925e-07,
"loss": 0.1874,
"step": 400
},
{
"epoch": 0.6661129568106312,
"grad_norm": 10.094566765127704,
"learning_rate": 7.785540685269388e-07,
"loss": 0.147,
"step": 401
},
{
"epoch": 0.6677740863787376,
"grad_norm": 15.97109522405515,
"learning_rate": 7.774352781592295e-07,
"loss": 0.1826,
"step": 402
},
{
"epoch": 0.6694352159468439,
"grad_norm": 13.187995357816002,
"learning_rate": 7.763144772203291e-07,
"loss": 0.1317,
"step": 403
},
{
"epoch": 0.6710963455149501,
"grad_norm": 15.41977600347847,
"learning_rate": 7.751916738326732e-07,
"loss": 0.1712,
"step": 404
},
{
"epoch": 0.6727574750830565,
"grad_norm": 6.680838078645473,
"learning_rate": 7.740668761332084e-07,
"loss": 0.0829,
"step": 405
},
{
"epoch": 0.6744186046511628,
"grad_norm": 16.868662883815464,
"learning_rate": 7.729400922733345e-07,
"loss": 0.1237,
"step": 406
},
{
"epoch": 0.6760797342192691,
"grad_norm": 10.871555946818436,
"learning_rate": 7.71811330418845e-07,
"loss": 0.1275,
"step": 407
},
{
"epoch": 0.6777408637873754,
"grad_norm": 14.978834275251021,
"learning_rate": 7.706805987498677e-07,
"loss": 0.1144,
"step": 408
},
{
"epoch": 0.6794019933554817,
"grad_norm": 9.534606751566304,
"learning_rate": 7.69547905460806e-07,
"loss": 0.1177,
"step": 409
},
{
"epoch": 0.6810631229235881,
"grad_norm": 9.950511975602048,
"learning_rate": 7.684132587602786e-07,
"loss": 0.1758,
"step": 410
},
{
"epoch": 0.6827242524916943,
"grad_norm": 10.042396922065393,
"learning_rate": 7.67276666871061e-07,
"loss": 0.1446,
"step": 411
},
{
"epoch": 0.6843853820598007,
"grad_norm": 10.048910173755388,
"learning_rate": 7.661381380300253e-07,
"loss": 0.163,
"step": 412
},
{
"epoch": 0.686046511627907,
"grad_norm": 9.33358937868672,
"learning_rate": 7.649976804880809e-07,
"loss": 0.1048,
"step": 413
},
{
"epoch": 0.6877076411960132,
"grad_norm": 10.583010872327236,
"learning_rate": 7.63855302510114e-07,
"loss": 0.1344,
"step": 414
},
{
"epoch": 0.6893687707641196,
"grad_norm": 12.165380086835881,
"learning_rate": 7.627110123749285e-07,
"loss": 0.1494,
"step": 415
},
{
"epoch": 0.6910299003322259,
"grad_norm": 10.99633419748417,
"learning_rate": 7.615648183751857e-07,
"loss": 0.1329,
"step": 416
},
{
"epoch": 0.6926910299003323,
"grad_norm": 11.667588050351162,
"learning_rate": 7.60416728817344e-07,
"loss": 0.153,
"step": 417
},
{
"epoch": 0.6943521594684385,
"grad_norm": 8.985742528680719,
"learning_rate": 7.592667520215994e-07,
"loss": 0.1267,
"step": 418
},
{
"epoch": 0.6960132890365448,
"grad_norm": 9.25393580859672,
"learning_rate": 7.581148963218241e-07,
"loss": 0.1382,
"step": 419
},
{
"epoch": 0.6976744186046512,
"grad_norm": 8.89447816694309,
"learning_rate": 7.569611700655068e-07,
"loss": 0.1189,
"step": 420
},
{
"epoch": 0.6993355481727574,
"grad_norm": 11.740985525599102,
"learning_rate": 7.558055816136924e-07,
"loss": 0.1677,
"step": 421
},
{
"epoch": 0.7009966777408638,
"grad_norm": 7.905842682651146,
"learning_rate": 7.546481393409209e-07,
"loss": 0.098,
"step": 422
},
{
"epoch": 0.7026578073089701,
"grad_norm": 15.390816785264915,
"learning_rate": 7.53488851635167e-07,
"loss": 0.1973,
"step": 423
},
{
"epoch": 0.7043189368770764,
"grad_norm": 10.964069409142427,
"learning_rate": 7.523277268977792e-07,
"loss": 0.1268,
"step": 424
},
{
"epoch": 0.7059800664451827,
"grad_norm": 11.34599369651872,
"learning_rate": 7.51164773543419e-07,
"loss": 0.1469,
"step": 425
},
{
"epoch": 0.707641196013289,
"grad_norm": 9.205102192725148,
"learning_rate": 7.5e-07,
"loss": 0.1199,
"step": 426
},
{
"epoch": 0.7093023255813954,
"grad_norm": 8.670727684942934,
"learning_rate": 7.488334147086263e-07,
"loss": 0.1012,
"step": 427
},
{
"epoch": 0.7109634551495017,
"grad_norm": 10.11416734093539,
"learning_rate": 7.476650261235318e-07,
"loss": 0.1354,
"step": 428
},
{
"epoch": 0.7126245847176079,
"grad_norm": 6.003475116109574,
"learning_rate": 7.464948427120197e-07,
"loss": 0.0826,
"step": 429
},
{
"epoch": 0.7142857142857143,
"grad_norm": 11.188990918400444,
"learning_rate": 7.453228729543988e-07,
"loss": 0.1512,
"step": 430
},
{
"epoch": 0.7159468438538206,
"grad_norm": 19.722212570905874,
"learning_rate": 7.441491253439249e-07,
"loss": 0.0985,
"step": 431
},
{
"epoch": 0.717607973421927,
"grad_norm": 9.220172905747987,
"learning_rate": 7.429736083867371e-07,
"loss": 0.1254,
"step": 432
},
{
"epoch": 0.7192691029900332,
"grad_norm": 10.857365036669531,
"learning_rate": 7.417963306017972e-07,
"loss": 0.1556,
"step": 433
},
{
"epoch": 0.7209302325581395,
"grad_norm": 9.12825370131944,
"learning_rate": 7.406173005208277e-07,
"loss": 0.109,
"step": 434
},
{
"epoch": 0.7225913621262459,
"grad_norm": 11.058921726765327,
"learning_rate": 7.394365266882501e-07,
"loss": 0.1443,
"step": 435
},
{
"epoch": 0.7242524916943521,
"grad_norm": 17.6871451943868,
"learning_rate": 7.382540176611223e-07,
"loss": 0.2528,
"step": 436
},
{
"epoch": 0.7259136212624585,
"grad_norm": 9.523326447398212,
"learning_rate": 7.370697820090778e-07,
"loss": 0.0873,
"step": 437
},
{
"epoch": 0.7275747508305648,
"grad_norm": 11.905512725983018,
"learning_rate": 7.358838283142628e-07,
"loss": 0.1576,
"step": 438
},
{
"epoch": 0.729235880398671,
"grad_norm": 13.862231389544819,
"learning_rate": 7.346961651712739e-07,
"loss": 0.2174,
"step": 439
},
{
"epoch": 0.7308970099667774,
"grad_norm": 20.52530465219098,
"learning_rate": 7.335068011870962e-07,
"loss": 0.2746,
"step": 440
},
{
"epoch": 0.7325581395348837,
"grad_norm": 10.26969871122055,
"learning_rate": 7.323157449810405e-07,
"loss": 0.1119,
"step": 441
},
{
"epoch": 0.7342192691029901,
"grad_norm": 11.415857653085869,
"learning_rate": 7.311230051846819e-07,
"loss": 0.138,
"step": 442
},
{
"epoch": 0.7358803986710963,
"grad_norm": 11.650217063993916,
"learning_rate": 7.299285904417955e-07,
"loss": 0.1596,
"step": 443
},
{
"epoch": 0.7375415282392026,
"grad_norm": 9.256723737467732,
"learning_rate": 7.287325094082954e-07,
"loss": 0.1267,
"step": 444
},
{
"epoch": 0.739202657807309,
"grad_norm": 7.694604457260306,
"learning_rate": 7.275347707521709e-07,
"loss": 0.1038,
"step": 445
},
{
"epoch": 0.7408637873754153,
"grad_norm": 7.191980021296017,
"learning_rate": 7.263353831534244e-07,
"loss": 0.109,
"step": 446
},
{
"epoch": 0.7425249169435216,
"grad_norm": 10.691175165821143,
"learning_rate": 7.25134355304008e-07,
"loss": 0.1907,
"step": 447
},
{
"epoch": 0.7441860465116279,
"grad_norm": 6.947238283602248,
"learning_rate": 7.239316959077607e-07,
"loss": 0.0847,
"step": 448
},
{
"epoch": 0.7458471760797342,
"grad_norm": 13.005333968288324,
"learning_rate": 7.227274136803452e-07,
"loss": 0.2188,
"step": 449
},
{
"epoch": 0.7475083056478405,
"grad_norm": 7.289755042527808,
"learning_rate": 7.215215173491849e-07,
"loss": 0.1152,
"step": 450
},
{
"epoch": 0.7491694352159468,
"grad_norm": 8.425252309131805,
"learning_rate": 7.203140156534009e-07,
"loss": 0.1461,
"step": 451
},
{
"epoch": 0.7508305647840532,
"grad_norm": 5.562081357930086,
"learning_rate": 7.191049173437479e-07,
"loss": 0.0852,
"step": 452
},
{
"epoch": 0.7524916943521595,
"grad_norm": 12.352790950896518,
"learning_rate": 7.178942311825516e-07,
"loss": 0.155,
"step": 453
},
{
"epoch": 0.7541528239202658,
"grad_norm": 9.171891575083187,
"learning_rate": 7.166819659436445e-07,
"loss": 0.1495,
"step": 454
},
{
"epoch": 0.7558139534883721,
"grad_norm": 9.41209858612358,
"learning_rate": 7.15468130412303e-07,
"loss": 0.1169,
"step": 455
},
{
"epoch": 0.7574750830564784,
"grad_norm": 10.230916740772512,
"learning_rate": 7.142527333851833e-07,
"loss": 0.2093,
"step": 456
},
{
"epoch": 0.7591362126245847,
"grad_norm": 7.965188922429285,
"learning_rate": 7.130357836702577e-07,
"loss": 0.114,
"step": 457
},
{
"epoch": 0.760797342192691,
"grad_norm": 6.820864692461988,
"learning_rate": 7.118172900867508e-07,
"loss": 0.1279,
"step": 458
},
{
"epoch": 0.7624584717607974,
"grad_norm": 9.896346871574128,
"learning_rate": 7.105972614650756e-07,
"loss": 0.1915,
"step": 459
},
{
"epoch": 0.7641196013289037,
"grad_norm": 13.402974322971838,
"learning_rate": 7.093757066467696e-07,
"loss": 0.1564,
"step": 460
},
{
"epoch": 0.7657807308970099,
"grad_norm": 8.969620661819997,
"learning_rate": 7.081526344844305e-07,
"loss": 0.1348,
"step": 461
},
{
"epoch": 0.7674418604651163,
"grad_norm": 7.901573755091921,
"learning_rate": 7.069280538416524e-07,
"loss": 0.117,
"step": 462
},
{
"epoch": 0.7691029900332226,
"grad_norm": 8.676435505228378,
"learning_rate": 7.05701973592961e-07,
"loss": 0.1312,
"step": 463
},
{
"epoch": 0.770764119601329,
"grad_norm": 8.312745149516928,
"learning_rate": 7.044744026237499e-07,
"loss": 0.1163,
"step": 464
},
{
"epoch": 0.7724252491694352,
"grad_norm": 8.29093146988122,
"learning_rate": 7.03245349830216e-07,
"loss": 0.1253,
"step": 465
},
{
"epoch": 0.7740863787375415,
"grad_norm": 10.991996181440621,
"learning_rate": 7.020148241192945e-07,
"loss": 0.1426,
"step": 466
},
{
"epoch": 0.7757475083056479,
"grad_norm": 9.665978609673429,
"learning_rate": 7.007828344085958e-07,
"loss": 0.116,
"step": 467
},
{
"epoch": 0.7774086378737541,
"grad_norm": 12.673357083226641,
"learning_rate": 6.995493896263385e-07,
"loss": 0.1128,
"step": 468
},
{
"epoch": 0.7790697674418605,
"grad_norm": 16.485475962652067,
"learning_rate": 6.983144987112875e-07,
"loss": 0.2125,
"step": 469
},
{
"epoch": 0.7807308970099668,
"grad_norm": 9.751397196349588,
"learning_rate": 6.970781706126864e-07,
"loss": 0.1438,
"step": 470
},
{
"epoch": 0.782392026578073,
"grad_norm": 15.35224234314337,
"learning_rate": 6.958404142901956e-07,
"loss": 0.1653,
"step": 471
},
{
"epoch": 0.7840531561461794,
"grad_norm": 12.390405950401401,
"learning_rate": 6.946012387138247e-07,
"loss": 0.1534,
"step": 472
},
{
"epoch": 0.7857142857142857,
"grad_norm": 8.927360461817429,
"learning_rate": 6.933606528638689e-07,
"loss": 0.1109,
"step": 473
},
{
"epoch": 0.7873754152823921,
"grad_norm": 13.299787573189752,
"learning_rate": 6.921186657308439e-07,
"loss": 0.179,
"step": 474
},
{
"epoch": 0.7890365448504983,
"grad_norm": 8.06526614276188,
"learning_rate": 6.9087528631542e-07,
"loss": 0.1337,
"step": 475
},
{
"epoch": 0.7906976744186046,
"grad_norm": 8.042198670013263,
"learning_rate": 6.89630523628358e-07,
"loss": 0.1081,
"step": 476
},
{
"epoch": 0.792358803986711,
"grad_norm": 9.692237559854625,
"learning_rate": 6.883843866904426e-07,
"loss": 0.1177,
"step": 477
},
{
"epoch": 0.7940199335548173,
"grad_norm": 9.344382237225672,
"learning_rate": 6.87136884532418e-07,
"loss": 0.1255,
"step": 478
},
{
"epoch": 0.7956810631229236,
"grad_norm": 8.203203552366837,
"learning_rate": 6.858880261949224e-07,
"loss": 0.1308,
"step": 479
},
{
"epoch": 0.7973421926910299,
"grad_norm": 7.069561317398428,
"learning_rate": 6.84637820728422e-07,
"loss": 0.1177,
"step": 480
},
{
"epoch": 0.7990033222591362,
"grad_norm": 12.651184018556993,
"learning_rate": 6.833862771931452e-07,
"loss": 0.1717,
"step": 481
},
{
"epoch": 0.8006644518272426,
"grad_norm": 7.53322640696838,
"learning_rate": 6.82133404659018e-07,
"loss": 0.132,
"step": 482
},
{
"epoch": 0.8023255813953488,
"grad_norm": 8.74434892318003,
"learning_rate": 6.808792122055973e-07,
"loss": 0.144,
"step": 483
},
{
"epoch": 0.8039867109634552,
"grad_norm": 10.42589485523522,
"learning_rate": 6.796237089220057e-07,
"loss": 0.1394,
"step": 484
},
{
"epoch": 0.8056478405315615,
"grad_norm": 11.906093609013176,
"learning_rate": 6.783669039068652e-07,
"loss": 0.1599,
"step": 485
},
{
"epoch": 0.8073089700996677,
"grad_norm": 11.119016103511091,
"learning_rate": 6.771088062682312e-07,
"loss": 0.1454,
"step": 486
},
{
"epoch": 0.8089700996677741,
"grad_norm": 6.861287582124284,
"learning_rate": 6.758494251235274e-07,
"loss": 0.0874,
"step": 487
},
{
"epoch": 0.8106312292358804,
"grad_norm": 22.336812147687517,
"learning_rate": 6.745887695994783e-07,
"loss": 0.2066,
"step": 488
},
{
"epoch": 0.8122923588039868,
"grad_norm": 10.480117099044136,
"learning_rate": 6.733268488320442e-07,
"loss": 0.1989,
"step": 489
},
{
"epoch": 0.813953488372093,
"grad_norm": 7.915417258802982,
"learning_rate": 6.720636719663549e-07,
"loss": 0.0994,
"step": 490
},
{
"epoch": 0.8156146179401993,
"grad_norm": 6.0203241786289015,
"learning_rate": 6.707992481566426e-07,
"loss": 0.0882,
"step": 491
},
{
"epoch": 0.8172757475083057,
"grad_norm": 10.519130887832892,
"learning_rate": 6.695335865661763e-07,
"loss": 0.1457,
"step": 492
},
{
"epoch": 0.8189368770764119,
"grad_norm": 10.88189785815634,
"learning_rate": 6.682666963671953e-07,
"loss": 0.1381,
"step": 493
},
{
"epoch": 0.8205980066445183,
"grad_norm": 8.410449806885296,
"learning_rate": 6.669985867408421e-07,
"loss": 0.1285,
"step": 494
},
{
"epoch": 0.8222591362126246,
"grad_norm": 10.63153721731556,
"learning_rate": 6.657292668770973e-07,
"loss": 0.1344,
"step": 495
},
{
"epoch": 0.8239202657807309,
"grad_norm": 7.858722492263171,
"learning_rate": 6.644587459747113e-07,
"loss": 0.1392,
"step": 496
},
{
"epoch": 0.8255813953488372,
"grad_norm": 12.495414541553474,
"learning_rate": 6.631870332411387e-07,
"loss": 0.1249,
"step": 497
},
{
"epoch": 0.8272425249169435,
"grad_norm": 8.69182573599952,
"learning_rate": 6.619141378924714e-07,
"loss": 0.1069,
"step": 498
},
{
"epoch": 0.8289036544850499,
"grad_norm": 11.19610726093013,
"learning_rate": 6.606400691533715e-07,
"loss": 0.1561,
"step": 499
},
{
"epoch": 0.8305647840531561,
"grad_norm": 11.60589510557093,
"learning_rate": 6.593648362570045e-07,
"loss": 0.1657,
"step": 500
},
{
"epoch": 0.8322259136212624,
"grad_norm": 8.967078348213883,
"learning_rate": 6.580884484449733e-07,
"loss": 0.1476,
"step": 501
},
{
"epoch": 0.8338870431893688,
"grad_norm": 8.046078745336114,
"learning_rate": 6.568109149672496e-07,
"loss": 0.1536,
"step": 502
},
{
"epoch": 0.8355481727574751,
"grad_norm": 10.11126212003755,
"learning_rate": 6.555322450821081e-07,
"loss": 0.1772,
"step": 503
},
{
"epoch": 0.8372093023255814,
"grad_norm": 22.00811087279718,
"learning_rate": 6.542524480560588e-07,
"loss": 0.196,
"step": 504
},
{
"epoch": 0.8388704318936877,
"grad_norm": 9.729933636134202,
"learning_rate": 6.529715331637804e-07,
"loss": 0.1218,
"step": 505
},
{
"epoch": 0.840531561461794,
"grad_norm": 20.546621479120287,
"learning_rate": 6.516895096880529e-07,
"loss": 0.1806,
"step": 506
},
{
"epoch": 0.8421926910299004,
"grad_norm": 8.302645527868407,
"learning_rate": 6.504063869196897e-07,
"loss": 0.1164,
"step": 507
},
{
"epoch": 0.8438538205980066,
"grad_norm": 18.63430523045363,
"learning_rate": 6.491221741574711e-07,
"loss": 0.2653,
"step": 508
},
{
"epoch": 0.845514950166113,
"grad_norm": 5.795031890334442,
"learning_rate": 6.478368807080763e-07,
"loss": 0.0734,
"step": 509
},
{
"epoch": 0.8471760797342193,
"grad_norm": 7.3352460848157754,
"learning_rate": 6.465505158860165e-07,
"loss": 0.1188,
"step": 510
},
{
"epoch": 0.8488372093023255,
"grad_norm": 8.230664802958891,
"learning_rate": 6.452630890135672e-07,
"loss": 0.1376,
"step": 511
},
{
"epoch": 0.8504983388704319,
"grad_norm": 10.861378955756326,
"learning_rate": 6.439746094207004e-07,
"loss": 0.1895,
"step": 512
},
{
"epoch": 0.8521594684385382,
"grad_norm": 7.801899185147919,
"learning_rate": 6.426850864450168e-07,
"loss": 0.0992,
"step": 513
},
{
"epoch": 0.8538205980066446,
"grad_norm": 8.21231391824538,
"learning_rate": 6.413945294316794e-07,
"loss": 0.1277,
"step": 514
},
{
"epoch": 0.8554817275747508,
"grad_norm": 8.182824956563156,
"learning_rate": 6.401029477333437e-07,
"loss": 0.0903,
"step": 515
},
{
"epoch": 0.8571428571428571,
"grad_norm": 8.908858082334962,
"learning_rate": 6.388103507100922e-07,
"loss": 0.1044,
"step": 516
},
{
"epoch": 0.8588039867109635,
"grad_norm": 8.858069259669536,
"learning_rate": 6.375167477293648e-07,
"loss": 0.143,
"step": 517
},
{
"epoch": 0.8604651162790697,
"grad_norm": 7.449561570396506,
"learning_rate": 6.362221481658917e-07,
"loss": 0.1143,
"step": 518
},
{
"epoch": 0.8621262458471761,
"grad_norm": 10.420657379070615,
"learning_rate": 6.349265614016254e-07,
"loss": 0.0923,
"step": 519
},
{
"epoch": 0.8637873754152824,
"grad_norm": 6.692843180236105,
"learning_rate": 6.336299968256724e-07,
"loss": 0.0929,
"step": 520
},
{
"epoch": 0.8654485049833887,
"grad_norm": 13.432306584512084,
"learning_rate": 6.323324638342257e-07,
"loss": 0.1248,
"step": 521
},
{
"epoch": 0.867109634551495,
"grad_norm": 13.80099877387905,
"learning_rate": 6.310339718304965e-07,
"loss": 0.1533,
"step": 522
},
{
"epoch": 0.8687707641196013,
"grad_norm": 11.959387915414487,
"learning_rate": 6.297345302246452e-07,
"loss": 0.1385,
"step": 523
},
{
"epoch": 0.8704318936877077,
"grad_norm": 15.092125999647005,
"learning_rate": 6.28434148433715e-07,
"loss": 0.2109,
"step": 524
},
{
"epoch": 0.872093023255814,
"grad_norm": 10.776480947820303,
"learning_rate": 6.271328358815618e-07,
"loss": 0.171,
"step": 525
},
{
"epoch": 0.8737541528239202,
"grad_norm": 8.238618005353674,
"learning_rate": 6.258306019987871e-07,
"loss": 0.1164,
"step": 526
},
{
"epoch": 0.8754152823920266,
"grad_norm": 26.508653168688756,
"learning_rate": 6.245274562226693e-07,
"loss": 0.2546,
"step": 527
},
{
"epoch": 0.8770764119601329,
"grad_norm": 18.913079012630988,
"learning_rate": 6.232234079970949e-07,
"loss": 0.1723,
"step": 528
},
{
"epoch": 0.8787375415282392,
"grad_norm": 15.484934247028333,
"learning_rate": 6.219184667724911e-07,
"loss": 0.1934,
"step": 529
},
{
"epoch": 0.8803986710963455,
"grad_norm": 11.50624636520952,
"learning_rate": 6.20612642005756e-07,
"loss": 0.153,
"step": 530
},
{
"epoch": 0.8820598006644518,
"grad_norm": 9.765436764418478,
"learning_rate": 6.193059431601909e-07,
"loss": 0.1117,
"step": 531
},
{
"epoch": 0.8837209302325582,
"grad_norm": 10.442516383917948,
"learning_rate": 6.179983797054321e-07,
"loss": 0.1138,
"step": 532
},
{
"epoch": 0.8853820598006644,
"grad_norm": 9.29144855908764,
"learning_rate": 6.166899611173808e-07,
"loss": 0.1424,
"step": 533
},
{
"epoch": 0.8870431893687708,
"grad_norm": 8.897928846693906,
"learning_rate": 6.15380696878136e-07,
"loss": 0.1231,
"step": 534
},
{
"epoch": 0.8887043189368771,
"grad_norm": 6.957319586157739,
"learning_rate": 6.14070596475925e-07,
"loss": 0.1312,
"step": 535
},
{
"epoch": 0.8903654485049833,
"grad_norm": 10.179421163075975,
"learning_rate": 6.127596694050345e-07,
"loss": 0.1678,
"step": 536
},
{
"epoch": 0.8920265780730897,
"grad_norm": 7.020040317670267,
"learning_rate": 6.114479251657425e-07,
"loss": 0.0954,
"step": 537
},
{
"epoch": 0.893687707641196,
"grad_norm": 8.473993860366981,
"learning_rate": 6.101353732642485e-07,
"loss": 0.1449,
"step": 538
},
{
"epoch": 0.8953488372093024,
"grad_norm": 8.652250840430034,
"learning_rate": 6.088220232126055e-07,
"loss": 0.1063,
"step": 539
},
{
"epoch": 0.8970099667774086,
"grad_norm": 9.534760389427388,
"learning_rate": 6.075078845286509e-07,
"loss": 0.1728,
"step": 540
},
{
"epoch": 0.8986710963455149,
"grad_norm": 5.776374425659757,
"learning_rate": 6.061929667359365e-07,
"loss": 0.0742,
"step": 541
},
{
"epoch": 0.9003322259136213,
"grad_norm": 11.015148721611304,
"learning_rate": 6.04877279363661e-07,
"loss": 0.1788,
"step": 542
},
{
"epoch": 0.9019933554817275,
"grad_norm": 9.473756170184656,
"learning_rate": 6.035608319466e-07,
"loss": 0.1579,
"step": 543
},
{
"epoch": 0.9036544850498339,
"grad_norm": 11.179037463866795,
"learning_rate": 6.02243634025037e-07,
"loss": 0.1533,
"step": 544
},
{
"epoch": 0.9053156146179402,
"grad_norm": 7.8758415023645405,
"learning_rate": 6.00925695144695e-07,
"loss": 0.1146,
"step": 545
},
{
"epoch": 0.9069767441860465,
"grad_norm": 15.665873822759634,
"learning_rate": 5.99607024856666e-07,
"loss": 0.1047,
"step": 546
},
{
"epoch": 0.9086378737541528,
"grad_norm": 9.200391667118774,
"learning_rate": 5.982876327173427e-07,
"loss": 0.1272,
"step": 547
},
{
"epoch": 0.9102990033222591,
"grad_norm": 8.77578098706721,
"learning_rate": 5.969675282883493e-07,
"loss": 0.1516,
"step": 548
},
{
"epoch": 0.9119601328903655,
"grad_norm": 9.320250348637398,
"learning_rate": 5.956467211364717e-07,
"loss": 0.1387,
"step": 549
},
{
"epoch": 0.9136212624584718,
"grad_norm": 9.952052907950817,
"learning_rate": 5.943252208335884e-07,
"loss": 0.1403,
"step": 550
},
{
"epoch": 0.915282392026578,
"grad_norm": 5.2002513589385275,
"learning_rate": 5.930030369566017e-07,
"loss": 0.0565,
"step": 551
},
{
"epoch": 0.9169435215946844,
"grad_norm": 15.488056016488622,
"learning_rate": 5.916801790873669e-07,
"loss": 0.1978,
"step": 552
},
{
"epoch": 0.9186046511627907,
"grad_norm": 12.637431729375692,
"learning_rate": 5.903566568126245e-07,
"loss": 0.1326,
"step": 553
},
{
"epoch": 0.920265780730897,
"grad_norm": 9.074527532983787,
"learning_rate": 5.890324797239294e-07,
"loss": 0.1423,
"step": 554
},
{
"epoch": 0.9219269102990033,
"grad_norm": 9.091722076710525,
"learning_rate": 5.877076574175819e-07,
"loss": 0.1073,
"step": 555
},
{
"epoch": 0.9235880398671097,
"grad_norm": 8.454862363294326,
"learning_rate": 5.86382199494559e-07,
"loss": 0.0991,
"step": 556
},
{
"epoch": 0.925249169435216,
"grad_norm": 7.602236450189799,
"learning_rate": 5.850561155604429e-07,
"loss": 0.1149,
"step": 557
},
{
"epoch": 0.9269102990033222,
"grad_norm": 10.529531993339845,
"learning_rate": 5.837294152253533e-07,
"loss": 0.1796,
"step": 558
},
{
"epoch": 0.9285714285714286,
"grad_norm": 8.976880430709578,
"learning_rate": 5.824021081038767e-07,
"loss": 0.1138,
"step": 559
},
{
"epoch": 0.9302325581395349,
"grad_norm": 14.33254707392304,
"learning_rate": 5.810742038149966e-07,
"loss": 0.1308,
"step": 560
},
{
"epoch": 0.9318936877076412,
"grad_norm": 7.258067172250153,
"learning_rate": 5.79745711982025e-07,
"loss": 0.09,
"step": 561
},
{
"epoch": 0.9335548172757475,
"grad_norm": 8.172499836134483,
"learning_rate": 5.78416642232531e-07,
"loss": 0.1044,
"step": 562
},
{
"epoch": 0.9352159468438538,
"grad_norm": 6.594794551088574,
"learning_rate": 5.770870041982722e-07,
"loss": 0.1254,
"step": 563
},
{
"epoch": 0.9368770764119602,
"grad_norm": 5.9566914133383255,
"learning_rate": 5.757568075151249e-07,
"loss": 0.0921,
"step": 564
},
{
"epoch": 0.9385382059800664,
"grad_norm": 8.378099614968635,
"learning_rate": 5.744260618230133e-07,
"loss": 0.1151,
"step": 565
},
{
"epoch": 0.9401993355481728,
"grad_norm": 7.594866350018622,
"learning_rate": 5.730947767658404e-07,
"loss": 0.0926,
"step": 566
},
{
"epoch": 0.9418604651162791,
"grad_norm": 5.975799246097571,
"learning_rate": 5.717629619914185e-07,
"loss": 0.0634,
"step": 567
},
{
"epoch": 0.9435215946843853,
"grad_norm": 7.98709361058564,
"learning_rate": 5.704306271513981e-07,
"loss": 0.0739,
"step": 568
},
{
"epoch": 0.9451827242524917,
"grad_norm": 5.107038268623648,
"learning_rate": 5.69097781901199e-07,
"loss": 0.0742,
"step": 569
},
{
"epoch": 0.946843853820598,
"grad_norm": 7.691173623087892,
"learning_rate": 5.677644358999398e-07,
"loss": 0.1137,
"step": 570
},
{
"epoch": 0.9485049833887044,
"grad_norm": 15.079146650209767,
"learning_rate": 5.664305988103678e-07,
"loss": 0.1334,
"step": 571
},
{
"epoch": 0.9501661129568106,
"grad_norm": 10.468995649492486,
"learning_rate": 5.6509628029879e-07,
"loss": 0.0933,
"step": 572
},
{
"epoch": 0.9518272425249169,
"grad_norm": 12.307929483854858,
"learning_rate": 5.637614900350014e-07,
"loss": 0.1288,
"step": 573
},
{
"epoch": 0.9534883720930233,
"grad_norm": 11.854140397945446,
"learning_rate": 5.624262376922162e-07,
"loss": 0.1043,
"step": 574
},
{
"epoch": 0.9551495016611296,
"grad_norm": 10.655229778108161,
"learning_rate": 5.610905329469973e-07,
"loss": 0.0992,
"step": 575
},
{
"epoch": 0.9568106312292359,
"grad_norm": 12.54430451998764,
"learning_rate": 5.597543854791856e-07,
"loss": 0.187,
"step": 576
},
{
"epoch": 0.9584717607973422,
"grad_norm": 10.536342105186373,
"learning_rate": 5.584178049718314e-07,
"loss": 0.1524,
"step": 577
},
{
"epoch": 0.9601328903654485,
"grad_norm": 13.313735426136686,
"learning_rate": 5.570808011111226e-07,
"loss": 0.1978,
"step": 578
},
{
"epoch": 0.9617940199335548,
"grad_norm": 7.220442413364977,
"learning_rate": 5.557433835863151e-07,
"loss": 0.0943,
"step": 579
},
{
"epoch": 0.9634551495016611,
"grad_norm": 23.093365160299122,
"learning_rate": 5.544055620896629e-07,
"loss": 0.1533,
"step": 580
},
{
"epoch": 0.9651162790697675,
"grad_norm": 11.087199919198286,
"learning_rate": 5.530673463163471e-07,
"loss": 0.1455,
"step": 581
},
{
"epoch": 0.9667774086378738,
"grad_norm": 10.58290498744938,
"learning_rate": 5.517287459644069e-07,
"loss": 0.1665,
"step": 582
},
{
"epoch": 0.96843853820598,
"grad_norm": 7.202020662641881,
"learning_rate": 5.50389770734668e-07,
"loss": 0.0956,
"step": 583
},
{
"epoch": 0.9700996677740864,
"grad_norm": 12.46472530295776,
"learning_rate": 5.490504303306727e-07,
"loss": 0.1617,
"step": 584
},
{
"epoch": 0.9717607973421927,
"grad_norm": 16.76493326267004,
"learning_rate": 5.477107344586101e-07,
"loss": 0.1507,
"step": 585
},
{
"epoch": 0.973421926910299,
"grad_norm": 9.237732865929152,
"learning_rate": 5.463706928272453e-07,
"loss": 0.1412,
"step": 586
},
{
"epoch": 0.9750830564784053,
"grad_norm": 12.383497236276556,
"learning_rate": 5.450303151478489e-07,
"loss": 0.1493,
"step": 587
},
{
"epoch": 0.9767441860465116,
"grad_norm": 11.151722325366409,
"learning_rate": 5.43689611134127e-07,
"loss": 0.1412,
"step": 588
},
{
"epoch": 0.978405315614618,
"grad_norm": 7.21013726279382,
"learning_rate": 5.423485905021507e-07,
"loss": 0.1246,
"step": 589
},
{
"epoch": 0.9800664451827242,
"grad_norm": 8.579739495792525,
"learning_rate": 5.410072629702856e-07,
"loss": 0.1234,
"step": 590
},
{
"epoch": 0.9817275747508306,
"grad_norm": 7.602281673588693,
"learning_rate": 5.396656382591213e-07,
"loss": 0.1116,
"step": 591
},
{
"epoch": 0.9833887043189369,
"grad_norm": 14.100145982497999,
"learning_rate": 5.38323726091401e-07,
"loss": 0.1388,
"step": 592
},
{
"epoch": 0.9850498338870431,
"grad_norm": 7.492618519081229,
"learning_rate": 5.369815361919511e-07,
"loss": 0.0761,
"step": 593
},
{
"epoch": 0.9867109634551495,
"grad_norm": 6.371586229880432,
"learning_rate": 5.356390782876111e-07,
"loss": 0.1078,
"step": 594
},
{
"epoch": 0.9883720930232558,
"grad_norm": 9.371673513904842,
"learning_rate": 5.342963621071623e-07,
"loss": 0.1745,
"step": 595
},
{
"epoch": 0.9900332225913622,
"grad_norm": 5.206481672409249,
"learning_rate": 5.329533973812581e-07,
"loss": 0.0683,
"step": 596
},
{
"epoch": 0.9916943521594684,
"grad_norm": 14.826520082325914,
"learning_rate": 5.316101938423524e-07,
"loss": 0.1577,
"step": 597
},
{
"epoch": 0.9933554817275747,
"grad_norm": 10.961715205667316,
"learning_rate": 5.302667612246308e-07,
"loss": 0.1665,
"step": 598
},
{
"epoch": 0.9950166112956811,
"grad_norm": 14.397029548851924,
"learning_rate": 5.28923109263938e-07,
"loss": 0.1731,
"step": 599
},
{
"epoch": 0.9966777408637874,
"grad_norm": 8.375566282055818,
"learning_rate": 5.275792476977091e-07,
"loss": 0.1293,
"step": 600
},
{
"epoch": 0.9983388704318937,
"grad_norm": 11.268520674866318,
"learning_rate": 5.262351862648978e-07,
"loss": 0.1419,
"step": 601
},
{
"epoch": 1.0,
"grad_norm": 6.784664762230538,
"learning_rate": 5.248909347059061e-07,
"loss": 0.075,
"step": 602
},
{
"epoch": 1.0016611295681064,
"grad_norm": 4.877956718125895,
"learning_rate": 5.235465027625146e-07,
"loss": 0.0621,
"step": 603
},
{
"epoch": 1.0033222591362125,
"grad_norm": 4.203005899244808,
"learning_rate": 5.2220190017781e-07,
"loss": 0.0457,
"step": 604
},
{
"epoch": 1.004983388704319,
"grad_norm": 3.8123959854058103,
"learning_rate": 5.208571366961165e-07,
"loss": 0.0378,
"step": 605
},
{
"epoch": 1.0066445182724253,
"grad_norm": 5.315954696600741,
"learning_rate": 5.195122220629239e-07,
"loss": 0.0723,
"step": 606
},
{
"epoch": 1.0083056478405317,
"grad_norm": 3.7805372757538094,
"learning_rate": 5.181671660248178e-07,
"loss": 0.0298,
"step": 607
},
{
"epoch": 1.0099667774086378,
"grad_norm": 4.0974547093630225,
"learning_rate": 5.16821978329408e-07,
"loss": 0.0396,
"step": 608
},
{
"epoch": 1.0116279069767442,
"grad_norm": 4.948296436070784,
"learning_rate": 5.154766687252591e-07,
"loss": 0.0263,
"step": 609
},
{
"epoch": 1.0132890365448506,
"grad_norm": 9.792467737742347,
"learning_rate": 5.141312469618183e-07,
"loss": 0.0942,
"step": 610
},
{
"epoch": 1.0149501661129567,
"grad_norm": 7.834675461013419,
"learning_rate": 5.127857227893465e-07,
"loss": 0.0447,
"step": 611
},
{
"epoch": 1.0166112956810631,
"grad_norm": 7.985753475492194,
"learning_rate": 5.114401059588464e-07,
"loss": 0.0646,
"step": 612
},
{
"epoch": 1.0182724252491695,
"grad_norm": 10.331188978857707,
"learning_rate": 5.100944062219917e-07,
"loss": 0.0382,
"step": 613
},
{
"epoch": 1.0199335548172757,
"grad_norm": 3.83376887835134,
"learning_rate": 5.08748633331058e-07,
"loss": 0.0345,
"step": 614
},
{
"epoch": 1.021594684385382,
"grad_norm": 3.8639360019028355,
"learning_rate": 5.074027970388499e-07,
"loss": 0.0243,
"step": 615
},
{
"epoch": 1.0232558139534884,
"grad_norm": 10.400286609821283,
"learning_rate": 5.060569070986324e-07,
"loss": 0.0734,
"step": 616
},
{
"epoch": 1.0249169435215948,
"grad_norm": 5.442828090266754,
"learning_rate": 5.047109732640586e-07,
"loss": 0.0294,
"step": 617
},
{
"epoch": 1.026578073089701,
"grad_norm": 7.870712026341651,
"learning_rate": 5.033650052891001e-07,
"loss": 0.0301,
"step": 618
},
{
"epoch": 1.0282392026578073,
"grad_norm": 13.585823168789453,
"learning_rate": 5.020190129279759e-07,
"loss": 0.0988,
"step": 619
},
{
"epoch": 1.0299003322259137,
"grad_norm": 8.796258559095701,
"learning_rate": 5.006730059350815e-07,
"loss": 0.0468,
"step": 620
},
{
"epoch": 1.0315614617940199,
"grad_norm": 10.44576092302698,
"learning_rate": 4.993269940649184e-07,
"loss": 0.0714,
"step": 621
},
{
"epoch": 1.0332225913621262,
"grad_norm": 12.352450302810853,
"learning_rate": 4.979809870720242e-07,
"loss": 0.0478,
"step": 622
},
{
"epoch": 1.0348837209302326,
"grad_norm": 17.75211377150514,
"learning_rate": 4.966349947108999e-07,
"loss": 0.1147,
"step": 623
},
{
"epoch": 1.0365448504983388,
"grad_norm": 6.156505538276604,
"learning_rate": 4.952890267359412e-07,
"loss": 0.0478,
"step": 624
},
{
"epoch": 1.0382059800664452,
"grad_norm": 9.218532704370334,
"learning_rate": 4.939430929013677e-07,
"loss": 0.027,
"step": 625
},
{
"epoch": 1.0398671096345515,
"grad_norm": 7.120527183593547,
"learning_rate": 4.925972029611501e-07,
"loss": 0.0544,
"step": 626
},
{
"epoch": 1.041528239202658,
"grad_norm": 9.198383686246443,
"learning_rate": 4.912513666689421e-07,
"loss": 0.0323,
"step": 627
},
{
"epoch": 1.043189368770764,
"grad_norm": 12.663439974096635,
"learning_rate": 4.899055937780083e-07,
"loss": 0.0445,
"step": 628
},
{
"epoch": 1.0448504983388704,
"grad_norm": 24.085803314752685,
"learning_rate": 4.885598940411536e-07,
"loss": 0.0655,
"step": 629
},
{
"epoch": 1.0465116279069768,
"grad_norm": 6.902292684354512,
"learning_rate": 4.872142772106535e-07,
"loss": 0.0326,
"step": 630
},
{
"epoch": 1.048172757475083,
"grad_norm": 6.121287974432574,
"learning_rate": 4.858687530381817e-07,
"loss": 0.0369,
"step": 631
},
{
"epoch": 1.0498338870431894,
"grad_norm": 8.86764220454998,
"learning_rate": 4.845233312747411e-07,
"loss": 0.0607,
"step": 632
},
{
"epoch": 1.0514950166112957,
"grad_norm": 9.943461616121928,
"learning_rate": 4.831780216705919e-07,
"loss": 0.0529,
"step": 633
},
{
"epoch": 1.053156146179402,
"grad_norm": 3.7501260677915313,
"learning_rate": 4.818328339751823e-07,
"loss": 0.0177,
"step": 634
},
{
"epoch": 1.0548172757475083,
"grad_norm": 3.303220265814859,
"learning_rate": 4.804877779370762e-07,
"loss": 0.0139,
"step": 635
},
{
"epoch": 1.0564784053156147,
"grad_norm": 7.432956410407962,
"learning_rate": 4.791428633038835e-07,
"loss": 0.0463,
"step": 636
},
{
"epoch": 1.058139534883721,
"grad_norm": 8.712378765440459,
"learning_rate": 4.777980998221901e-07,
"loss": 0.0424,
"step": 637
},
{
"epoch": 1.0598006644518272,
"grad_norm": 11.111774138122533,
"learning_rate": 4.764534972374855e-07,
"loss": 0.0522,
"step": 638
},
{
"epoch": 1.0614617940199336,
"grad_norm": 4.983601758483826,
"learning_rate": 4.751090652940938e-07,
"loss": 0.0182,
"step": 639
},
{
"epoch": 1.06312292358804,
"grad_norm": 9.89938771331774,
"learning_rate": 4.7376481373510217e-07,
"loss": 0.0418,
"step": 640
},
{
"epoch": 1.064784053156146,
"grad_norm": 3.1865048946418626,
"learning_rate": 4.7242075230229083e-07,
"loss": 0.0155,
"step": 641
},
{
"epoch": 1.0664451827242525,
"grad_norm": 10.924670279472195,
"learning_rate": 4.71076890736062e-07,
"loss": 0.0586,
"step": 642
},
{
"epoch": 1.0681063122923589,
"grad_norm": 4.291225390159846,
"learning_rate": 4.6973323877536925e-07,
"loss": 0.0206,
"step": 643
},
{
"epoch": 1.069767441860465,
"grad_norm": 7.729879396540719,
"learning_rate": 4.6838980615764756e-07,
"loss": 0.0442,
"step": 644
},
{
"epoch": 1.0714285714285714,
"grad_norm": 10.4730457424228,
"learning_rate": 4.6704660261874195e-07,
"loss": 0.0297,
"step": 645
},
{
"epoch": 1.0730897009966778,
"grad_norm": 2.9531809499425465,
"learning_rate": 4.657036378928376e-07,
"loss": 0.0126,
"step": 646
},
{
"epoch": 1.0747508305647842,
"grad_norm": 6.896419676230295,
"learning_rate": 4.643609217123888e-07,
"loss": 0.024,
"step": 647
},
{
"epoch": 1.0764119601328903,
"grad_norm": 2.4703995052788392,
"learning_rate": 4.630184638080488e-07,
"loss": 0.0102,
"step": 648
},
{
"epoch": 1.0780730897009967,
"grad_norm": 10.728604189023233,
"learning_rate": 4.616762739085992e-07,
"loss": 0.0538,
"step": 649
},
{
"epoch": 1.079734219269103,
"grad_norm": 12.945045897775847,
"learning_rate": 4.603343617408787e-07,
"loss": 0.0504,
"step": 650
}
],
"logging_steps": 1,
"max_steps": 1204,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 689596271034368.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}