{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 31.372549019607842,
"eval_steps": 500,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0784313725490196,
"grad_norm": 7.629533767700195,
"learning_rate": 1e-05,
"loss": 4.321,
"step": 10
},
{
"epoch": 0.1568627450980392,
"grad_norm": 14.44616985321045,
"learning_rate": 2e-05,
"loss": 4.2862,
"step": 20
},
{
"epoch": 0.23529411764705882,
"grad_norm": 19.094482421875,
"learning_rate": 3e-05,
"loss": 4.3249,
"step": 30
},
{
"epoch": 0.3137254901960784,
"grad_norm": 11.822478294372559,
"learning_rate": 4e-05,
"loss": 4.249,
"step": 40
},
{
"epoch": 0.39215686274509803,
"grad_norm": 16.548583984375,
"learning_rate": 5e-05,
"loss": 4.5841,
"step": 50
},
{
"epoch": 0.47058823529411764,
"grad_norm": 7.463245391845703,
"learning_rate": 6e-05,
"loss": 4.3256,
"step": 60
},
{
"epoch": 0.5490196078431373,
"grad_norm": 9.122880935668945,
"learning_rate": 7e-05,
"loss": 4.391,
"step": 70
},
{
"epoch": 0.6274509803921569,
"grad_norm": 18.88115119934082,
"learning_rate": 8e-05,
"loss": 4.1049,
"step": 80
},
{
"epoch": 0.7058823529411765,
"grad_norm": 8.104399681091309,
"learning_rate": 9e-05,
"loss": 4.1016,
"step": 90
},
{
"epoch": 0.7843137254901961,
"grad_norm": 8.780770301818848,
"learning_rate": 0.0001,
"loss": 4.2899,
"step": 100
},
{
"epoch": 0.8627450980392157,
"grad_norm": 8.454495429992676,
"learning_rate": 9.996324205109356e-05,
"loss": 4.1025,
"step": 110
},
{
"epoch": 0.9411764705882353,
"grad_norm": 9.97666072845459,
"learning_rate": 9.992648410218711e-05,
"loss": 4.1319,
"step": 120
},
{
"epoch": 1.0196078431372548,
"grad_norm": 7.962587833404541,
"learning_rate": 9.988972615328065e-05,
"loss": 3.9871,
"step": 130
},
{
"epoch": 1.0980392156862746,
"grad_norm": 8.972702026367188,
"learning_rate": 9.98529682043742e-05,
"loss": 3.8708,
"step": 140
},
{
"epoch": 1.1764705882352942,
"grad_norm": 11.219219207763672,
"learning_rate": 9.981621025546775e-05,
"loss": 3.838,
"step": 150
},
{
"epoch": 1.2549019607843137,
"grad_norm": 12.35175609588623,
"learning_rate": 9.97794523065613e-05,
"loss": 3.8339,
"step": 160
},
{
"epoch": 1.3333333333333333,
"grad_norm": 8.549275398254395,
"learning_rate": 9.974269435765484e-05,
"loss": 3.7836,
"step": 170
},
{
"epoch": 1.4117647058823528,
"grad_norm": 14.15541934967041,
"learning_rate": 9.97059364087484e-05,
"loss": 3.6838,
"step": 180
},
{
"epoch": 1.4901960784313726,
"grad_norm": 16.026613235473633,
"learning_rate": 9.966917845984195e-05,
"loss": 3.6949,
"step": 190
},
{
"epoch": 1.5686274509803921,
"grad_norm": 10.782464981079102,
"learning_rate": 9.963242051093549e-05,
"loss": 3.636,
"step": 200
},
{
"epoch": 1.6470588235294117,
"grad_norm": 10.165632247924805,
"learning_rate": 9.959566256202904e-05,
"loss": 3.4669,
"step": 210
},
{
"epoch": 1.7254901960784315,
"grad_norm": 13.020809173583984,
"learning_rate": 9.95589046131226e-05,
"loss": 3.5055,
"step": 220
},
{
"epoch": 1.803921568627451,
"grad_norm": 8.814240455627441,
"learning_rate": 9.952214666421615e-05,
"loss": 3.6938,
"step": 230
},
{
"epoch": 1.8823529411764706,
"grad_norm": 18.580322265625,
"learning_rate": 9.94853887153097e-05,
"loss": 3.7736,
"step": 240
},
{
"epoch": 1.9607843137254903,
"grad_norm": 22.766721725463867,
"learning_rate": 9.944863076640323e-05,
"loss": 3.4302,
"step": 250
},
{
"epoch": 2.0392156862745097,
"grad_norm": 11.897246360778809,
"learning_rate": 9.941187281749679e-05,
"loss": 3.4821,
"step": 260
},
{
"epoch": 2.1176470588235294,
"grad_norm": 12.075397491455078,
"learning_rate": 9.937511486859034e-05,
"loss": 3.1794,
"step": 270
},
{
"epoch": 2.196078431372549,
"grad_norm": 12.672398567199707,
"learning_rate": 9.933835691968389e-05,
"loss": 3.0642,
"step": 280
},
{
"epoch": 2.2745098039215685,
"grad_norm": 8.902212142944336,
"learning_rate": 9.930159897077743e-05,
"loss": 3.4314,
"step": 290
},
{
"epoch": 2.3529411764705883,
"grad_norm": 10.514148712158203,
"learning_rate": 9.926484102187098e-05,
"loss": 3.298,
"step": 300
},
{
"epoch": 2.431372549019608,
"grad_norm": 12.432036399841309,
"learning_rate": 9.922808307296453e-05,
"loss": 3.2431,
"step": 310
},
{
"epoch": 2.5098039215686274,
"grad_norm": 11.593744277954102,
"learning_rate": 9.91913251240581e-05,
"loss": 3.048,
"step": 320
},
{
"epoch": 2.588235294117647,
"grad_norm": 11.159161567687988,
"learning_rate": 9.915456717515163e-05,
"loss": 3.1779,
"step": 330
},
{
"epoch": 2.6666666666666665,
"grad_norm": 8.838184356689453,
"learning_rate": 9.911780922624518e-05,
"loss": 3.4323,
"step": 340
},
{
"epoch": 2.7450980392156863,
"grad_norm": 10.507558822631836,
"learning_rate": 9.908105127733873e-05,
"loss": 3.291,
"step": 350
},
{
"epoch": 2.8235294117647056,
"grad_norm": 9.107258796691895,
"learning_rate": 9.904429332843229e-05,
"loss": 3.3132,
"step": 360
},
{
"epoch": 2.9019607843137254,
"grad_norm": 9.832110404968262,
"learning_rate": 9.900753537952582e-05,
"loss": 3.2163,
"step": 370
},
{
"epoch": 2.980392156862745,
"grad_norm": 8.222602844238281,
"learning_rate": 9.897077743061938e-05,
"loss": 3.478,
"step": 380
},
{
"epoch": 3.0588235294117645,
"grad_norm": 8.719328880310059,
"learning_rate": 9.893401948171293e-05,
"loss": 3.0145,
"step": 390
},
{
"epoch": 3.1372549019607843,
"grad_norm": 11.51774787902832,
"learning_rate": 9.889726153280648e-05,
"loss": 3.0454,
"step": 400
},
{
"epoch": 3.215686274509804,
"grad_norm": 10.710402488708496,
"learning_rate": 9.886050358390002e-05,
"loss": 2.7929,
"step": 410
},
{
"epoch": 3.2941176470588234,
"grad_norm": 13.097260475158691,
"learning_rate": 9.882374563499357e-05,
"loss": 2.9186,
"step": 420
},
{
"epoch": 3.372549019607843,
"grad_norm": 15.856924057006836,
"learning_rate": 9.878698768608712e-05,
"loss": 2.9282,
"step": 430
},
{
"epoch": 3.450980392156863,
"grad_norm": 11.797476768493652,
"learning_rate": 9.875022973718067e-05,
"loss": 2.891,
"step": 440
},
{
"epoch": 3.5294117647058822,
"grad_norm": 9.920392990112305,
"learning_rate": 9.871347178827422e-05,
"loss": 2.8574,
"step": 450
},
{
"epoch": 3.607843137254902,
"grad_norm": 8.427005767822266,
"learning_rate": 9.867671383936776e-05,
"loss": 2.9527,
"step": 460
},
{
"epoch": 3.686274509803922,
"grad_norm": 12.288470268249512,
"learning_rate": 9.863995589046132e-05,
"loss": 2.7286,
"step": 470
},
{
"epoch": 3.764705882352941,
"grad_norm": 8.801328659057617,
"learning_rate": 9.860319794155488e-05,
"loss": 2.9276,
"step": 480
},
{
"epoch": 3.843137254901961,
"grad_norm": 8.852784156799316,
"learning_rate": 9.856643999264841e-05,
"loss": 2.9347,
"step": 490
},
{
"epoch": 3.9215686274509802,
"grad_norm": 19.973102569580078,
"learning_rate": 9.852968204374196e-05,
"loss": 3.0643,
"step": 500
},
{
"epoch": 4.0,
"grad_norm": 10.647859573364258,
"learning_rate": 9.849292409483552e-05,
"loss": 3.0444,
"step": 510
},
{
"epoch": 4.078431372549019,
"grad_norm": 17.03227424621582,
"learning_rate": 9.845616614592907e-05,
"loss": 2.6609,
"step": 520
},
{
"epoch": 4.1568627450980395,
"grad_norm": 11.255087852478027,
"learning_rate": 9.84194081970226e-05,
"loss": 2.5831,
"step": 530
},
{
"epoch": 4.235294117647059,
"grad_norm": 7.963006019592285,
"learning_rate": 9.838265024811616e-05,
"loss": 2.6449,
"step": 540
},
{
"epoch": 4.313725490196078,
"grad_norm": 8.0176420211792,
"learning_rate": 9.834589229920971e-05,
"loss": 2.6803,
"step": 550
},
{
"epoch": 4.392156862745098,
"grad_norm": 12.322718620300293,
"learning_rate": 9.830913435030326e-05,
"loss": 2.6856,
"step": 560
},
{
"epoch": 4.470588235294118,
"grad_norm": 13.681117057800293,
"learning_rate": 9.827237640139681e-05,
"loss": 2.6411,
"step": 570
},
{
"epoch": 4.549019607843137,
"grad_norm": 9.620941162109375,
"learning_rate": 9.823561845249035e-05,
"loss": 2.6757,
"step": 580
},
{
"epoch": 4.627450980392156,
"grad_norm": 8.198792457580566,
"learning_rate": 9.81988605035839e-05,
"loss": 2.5249,
"step": 590
},
{
"epoch": 4.705882352941177,
"grad_norm": 11.216117858886719,
"learning_rate": 9.816210255467745e-05,
"loss": 2.5542,
"step": 600
},
{
"epoch": 4.784313725490196,
"grad_norm": 13.534390449523926,
"learning_rate": 9.8125344605771e-05,
"loss": 2.5632,
"step": 610
},
{
"epoch": 4.862745098039216,
"grad_norm": 8.238523483276367,
"learning_rate": 9.808858665686455e-05,
"loss": 2.7125,
"step": 620
},
{
"epoch": 4.9411764705882355,
"grad_norm": 7.748435974121094,
"learning_rate": 9.80518287079581e-05,
"loss": 2.5738,
"step": 630
},
{
"epoch": 5.019607843137255,
"grad_norm": 8.043440818786621,
"learning_rate": 9.801507075905166e-05,
"loss": 2.3469,
"step": 640
},
{
"epoch": 5.098039215686274,
"grad_norm": 14.42237377166748,
"learning_rate": 9.797831281014521e-05,
"loss": 2.2214,
"step": 650
},
{
"epoch": 5.176470588235294,
"grad_norm": 7.59706974029541,
"learning_rate": 9.794155486123875e-05,
"loss": 2.279,
"step": 660
},
{
"epoch": 5.254901960784314,
"grad_norm": 9.453704833984375,
"learning_rate": 9.79047969123323e-05,
"loss": 2.4156,
"step": 670
},
{
"epoch": 5.333333333333333,
"grad_norm": 8.243208885192871,
"learning_rate": 9.786803896342585e-05,
"loss": 2.2599,
"step": 680
},
{
"epoch": 5.411764705882353,
"grad_norm": 7.8015971183776855,
"learning_rate": 9.78312810145194e-05,
"loss": 2.1746,
"step": 690
},
{
"epoch": 5.490196078431373,
"grad_norm": 12.369669914245605,
"learning_rate": 9.779452306561294e-05,
"loss": 2.2099,
"step": 700
},
{
"epoch": 5.568627450980392,
"grad_norm": 7.153007507324219,
"learning_rate": 9.775776511670649e-05,
"loss": 2.4386,
"step": 710
},
{
"epoch": 5.647058823529412,
"grad_norm": 11.485706329345703,
"learning_rate": 9.772100716780004e-05,
"loss": 2.2379,
"step": 720
},
{
"epoch": 5.7254901960784315,
"grad_norm": 8.15987777709961,
"learning_rate": 9.768424921889359e-05,
"loss": 2.4642,
"step": 730
},
{
"epoch": 5.803921568627451,
"grad_norm": 8.703965187072754,
"learning_rate": 9.764749126998713e-05,
"loss": 2.1796,
"step": 740
},
{
"epoch": 5.882352941176471,
"grad_norm": 8.410348892211914,
"learning_rate": 9.761073332108068e-05,
"loss": 2.3248,
"step": 750
},
{
"epoch": 5.96078431372549,
"grad_norm": 7.395628929138184,
"learning_rate": 9.757397537217425e-05,
"loss": 2.3482,
"step": 760
},
{
"epoch": 6.03921568627451,
"grad_norm": 7.840580940246582,
"learning_rate": 9.75372174232678e-05,
"loss": 2.3581,
"step": 770
},
{
"epoch": 6.117647058823529,
"grad_norm": 7.0679779052734375,
"learning_rate": 9.750045947436133e-05,
"loss": 2.1624,
"step": 780
},
{
"epoch": 6.196078431372549,
"grad_norm": 8.131471633911133,
"learning_rate": 9.746370152545489e-05,
"loss": 1.9443,
"step": 790
},
{
"epoch": 6.2745098039215685,
"grad_norm": 7.165848255157471,
"learning_rate": 9.742694357654844e-05,
"loss": 1.8785,
"step": 800
},
{
"epoch": 6.352941176470588,
"grad_norm": 7.661879062652588,
"learning_rate": 9.739018562764199e-05,
"loss": 2.0509,
"step": 810
},
{
"epoch": 6.431372549019608,
"grad_norm": 10.607108116149902,
"learning_rate": 9.735342767873553e-05,
"loss": 1.994,
"step": 820
},
{
"epoch": 6.509803921568627,
"grad_norm": 7.981103420257568,
"learning_rate": 9.731666972982908e-05,
"loss": 1.8272,
"step": 830
},
{
"epoch": 6.588235294117647,
"grad_norm": 8.540278434753418,
"learning_rate": 9.727991178092263e-05,
"loss": 2.0337,
"step": 840
},
{
"epoch": 6.666666666666667,
"grad_norm": 6.572484493255615,
"learning_rate": 9.724315383201618e-05,
"loss": 1.8927,
"step": 850
},
{
"epoch": 6.745098039215686,
"grad_norm": 6.641257286071777,
"learning_rate": 9.720639588310972e-05,
"loss": 2.1526,
"step": 860
},
{
"epoch": 6.823529411764706,
"grad_norm": 8.230134010314941,
"learning_rate": 9.716963793420327e-05,
"loss": 2.1622,
"step": 870
},
{
"epoch": 6.901960784313726,
"grad_norm": 8.205769538879395,
"learning_rate": 9.713287998529682e-05,
"loss": 2.3578,
"step": 880
},
{
"epoch": 6.980392156862745,
"grad_norm": 8.910941123962402,
"learning_rate": 9.709612203639037e-05,
"loss": 2.1729,
"step": 890
},
{
"epoch": 7.0588235294117645,
"grad_norm": 6.27609920501709,
"learning_rate": 9.705936408748392e-05,
"loss": 1.8849,
"step": 900
},
{
"epoch": 7.137254901960785,
"grad_norm": 8.897392272949219,
"learning_rate": 9.702260613857747e-05,
"loss": 1.7128,
"step": 910
},
{
"epoch": 7.215686274509804,
"grad_norm": 8.366013526916504,
"learning_rate": 9.698584818967103e-05,
"loss": 1.7241,
"step": 920
},
{
"epoch": 7.294117647058823,
"grad_norm": 6.907725811004639,
"learning_rate": 9.694909024076458e-05,
"loss": 1.704,
"step": 930
},
{
"epoch": 7.372549019607844,
"grad_norm": 7.610422611236572,
"learning_rate": 9.691233229185812e-05,
"loss": 1.7402,
"step": 940
},
{
"epoch": 7.450980392156863,
"grad_norm": 6.888455390930176,
"learning_rate": 9.687557434295167e-05,
"loss": 1.9821,
"step": 950
},
{
"epoch": 7.529411764705882,
"grad_norm": 10.415329933166504,
"learning_rate": 9.683881639404522e-05,
"loss": 1.7391,
"step": 960
},
{
"epoch": 7.607843137254902,
"grad_norm": 7.084939956665039,
"learning_rate": 9.680205844513877e-05,
"loss": 1.8511,
"step": 970
},
{
"epoch": 7.686274509803922,
"grad_norm": 6.506997108459473,
"learning_rate": 9.676530049623231e-05,
"loss": 1.7976,
"step": 980
},
{
"epoch": 7.764705882352941,
"grad_norm": 6.158507823944092,
"learning_rate": 9.672854254732586e-05,
"loss": 1.6912,
"step": 990
},
{
"epoch": 7.8431372549019605,
"grad_norm": 7.078491687774658,
"learning_rate": 9.669178459841941e-05,
"loss": 1.9965,
"step": 1000
},
{
"epoch": 7.921568627450981,
"grad_norm": 8.607373237609863,
"learning_rate": 9.665502664951296e-05,
"loss": 1.7438,
"step": 1010
},
{
"epoch": 8.0,
"grad_norm": 6.7270073890686035,
"learning_rate": 9.661826870060651e-05,
"loss": 1.9924,
"step": 1020
},
{
"epoch": 8.07843137254902,
"grad_norm": 7.503419876098633,
"learning_rate": 9.658151075170005e-05,
"loss": 1.3828,
"step": 1030
},
{
"epoch": 8.156862745098039,
"grad_norm": 4.802187442779541,
"learning_rate": 9.65447528027936e-05,
"loss": 1.5968,
"step": 1040
},
{
"epoch": 8.235294117647058,
"grad_norm": 6.051253318786621,
"learning_rate": 9.650799485388717e-05,
"loss": 1.5977,
"step": 1050
},
{
"epoch": 8.313725490196079,
"grad_norm": 8.783585548400879,
"learning_rate": 9.64712369049807e-05,
"loss": 1.5205,
"step": 1060
},
{
"epoch": 8.392156862745098,
"grad_norm": 7.984306812286377,
"learning_rate": 9.643447895607426e-05,
"loss": 1.503,
"step": 1070
},
{
"epoch": 8.470588235294118,
"grad_norm": 6.121412754058838,
"learning_rate": 9.639772100716781e-05,
"loss": 1.3881,
"step": 1080
},
{
"epoch": 8.549019607843137,
"grad_norm": 11.138391494750977,
"learning_rate": 9.636096305826136e-05,
"loss": 1.4414,
"step": 1090
},
{
"epoch": 8.627450980392156,
"grad_norm": 7.716047763824463,
"learning_rate": 9.632420510935491e-05,
"loss": 1.6433,
"step": 1100
},
{
"epoch": 8.705882352941176,
"grad_norm": 7.658680438995361,
"learning_rate": 9.628744716044845e-05,
"loss": 1.6958,
"step": 1110
},
{
"epoch": 8.784313725490197,
"grad_norm": 7.3405961990356445,
"learning_rate": 9.6250689211542e-05,
"loss": 1.7082,
"step": 1120
},
{
"epoch": 8.862745098039216,
"grad_norm": 7.0780792236328125,
"learning_rate": 9.621393126263555e-05,
"loss": 1.5016,
"step": 1130
},
{
"epoch": 8.941176470588236,
"grad_norm": 6.372091293334961,
"learning_rate": 9.61771733137291e-05,
"loss": 1.7571,
"step": 1140
},
{
"epoch": 9.019607843137255,
"grad_norm": 6.285316467285156,
"learning_rate": 9.614041536482264e-05,
"loss": 1.493,
"step": 1150
},
{
"epoch": 9.098039215686274,
"grad_norm": 5.926991939544678,
"learning_rate": 9.610365741591619e-05,
"loss": 1.3031,
"step": 1160
},
{
"epoch": 9.176470588235293,
"grad_norm": 6.716485977172852,
"learning_rate": 9.606689946700974e-05,
"loss": 1.5205,
"step": 1170
},
{
"epoch": 9.254901960784313,
"grad_norm": 9.255678176879883,
"learning_rate": 9.60301415181033e-05,
"loss": 1.3186,
"step": 1180
},
{
"epoch": 9.333333333333334,
"grad_norm": 6.3390116691589355,
"learning_rate": 9.599338356919685e-05,
"loss": 1.2825,
"step": 1190
},
{
"epoch": 9.411764705882353,
"grad_norm": 6.447065830230713,
"learning_rate": 9.59566256202904e-05,
"loss": 1.3772,
"step": 1200
},
{
"epoch": 9.490196078431373,
"grad_norm": 5.734104633331299,
"learning_rate": 9.591986767138395e-05,
"loss": 1.2352,
"step": 1210
},
{
"epoch": 9.568627450980392,
"grad_norm": 6.8585968017578125,
"learning_rate": 9.58831097224775e-05,
"loss": 1.29,
"step": 1220
},
{
"epoch": 9.647058823529411,
"grad_norm": 6.205005645751953,
"learning_rate": 9.584635177357104e-05,
"loss": 1.4425,
"step": 1230
},
{
"epoch": 9.72549019607843,
"grad_norm": 7.428943157196045,
"learning_rate": 9.580959382466459e-05,
"loss": 1.2955,
"step": 1240
},
{
"epoch": 9.803921568627452,
"grad_norm": 6.727294921875,
"learning_rate": 9.577283587575814e-05,
"loss": 1.3502,
"step": 1250
},
{
"epoch": 9.882352941176471,
"grad_norm": 6.761404991149902,
"learning_rate": 9.573607792685169e-05,
"loss": 1.5468,
"step": 1260
},
{
"epoch": 9.96078431372549,
"grad_norm": 6.411635875701904,
"learning_rate": 9.569931997794523e-05,
"loss": 1.3705,
"step": 1270
},
{
"epoch": 10.03921568627451,
"grad_norm": 6.24171257019043,
"learning_rate": 9.566256202903878e-05,
"loss": 1.3356,
"step": 1280
},
{
"epoch": 10.117647058823529,
"grad_norm": 11.419039726257324,
"learning_rate": 9.562580408013233e-05,
"loss": 1.0625,
"step": 1290
},
{
"epoch": 10.196078431372548,
"grad_norm": 5.620776653289795,
"learning_rate": 9.558904613122588e-05,
"loss": 1.0987,
"step": 1300
},
{
"epoch": 10.27450980392157,
"grad_norm": 5.87687349319458,
"learning_rate": 9.555228818231942e-05,
"loss": 1.1131,
"step": 1310
},
{
"epoch": 10.352941176470589,
"grad_norm": 5.40950345993042,
"learning_rate": 9.551553023341297e-05,
"loss": 1.2988,
"step": 1320
},
{
"epoch": 10.431372549019608,
"grad_norm": 7.283133029937744,
"learning_rate": 9.547877228450652e-05,
"loss": 1.1024,
"step": 1330
},
{
"epoch": 10.509803921568627,
"grad_norm": 6.536468029022217,
"learning_rate": 9.544201433560009e-05,
"loss": 1.1333,
"step": 1340
},
{
"epoch": 10.588235294117647,
"grad_norm": 5.083970069885254,
"learning_rate": 9.540525638669363e-05,
"loss": 1.198,
"step": 1350
},
{
"epoch": 10.666666666666666,
"grad_norm": 6.129286289215088,
"learning_rate": 9.536849843778718e-05,
"loss": 1.1499,
"step": 1360
},
{
"epoch": 10.745098039215687,
"grad_norm": 6.173052787780762,
"learning_rate": 9.533174048888073e-05,
"loss": 1.0153,
"step": 1370
},
{
"epoch": 10.823529411764707,
"grad_norm": 9.189874649047852,
"learning_rate": 9.529498253997428e-05,
"loss": 1.3891,
"step": 1380
},
{
"epoch": 10.901960784313726,
"grad_norm": 4.8431172370910645,
"learning_rate": 9.525822459106782e-05,
"loss": 1.3164,
"step": 1390
},
{
"epoch": 10.980392156862745,
"grad_norm": 7.321885108947754,
"learning_rate": 9.522146664216137e-05,
"loss": 1.2641,
"step": 1400
},
{
"epoch": 11.058823529411764,
"grad_norm": 6.731063365936279,
"learning_rate": 9.518470869325492e-05,
"loss": 1.0551,
"step": 1410
},
{
"epoch": 11.137254901960784,
"grad_norm": 5.393633842468262,
"learning_rate": 9.514795074434847e-05,
"loss": 0.933,
"step": 1420
},
{
"epoch": 11.215686274509803,
"grad_norm": 6.351990699768066,
"learning_rate": 9.511119279544201e-05,
"loss": 1.1135,
"step": 1430
},
{
"epoch": 11.294117647058824,
"grad_norm": 6.7596893310546875,
"learning_rate": 9.507443484653556e-05,
"loss": 0.8813,
"step": 1440
},
{
"epoch": 11.372549019607844,
"grad_norm": 8.091069221496582,
"learning_rate": 9.503767689762911e-05,
"loss": 1.0027,
"step": 1450
},
{
"epoch": 11.450980392156863,
"grad_norm": 6.078036308288574,
"learning_rate": 9.500091894872266e-05,
"loss": 0.8464,
"step": 1460
},
{
"epoch": 11.529411764705882,
"grad_norm": 5.587483882904053,
"learning_rate": 9.496416099981622e-05,
"loss": 1.0221,
"step": 1470
},
{
"epoch": 11.607843137254902,
"grad_norm": 6.806708812713623,
"learning_rate": 9.492740305090977e-05,
"loss": 1.0172,
"step": 1480
},
{
"epoch": 11.686274509803921,
"grad_norm": 5.886943340301514,
"learning_rate": 9.489064510200332e-05,
"loss": 1.0526,
"step": 1490
},
{
"epoch": 11.764705882352942,
"grad_norm": 5.225791931152344,
"learning_rate": 9.485388715309687e-05,
"loss": 1.0879,
"step": 1500
},
{
"epoch": 11.843137254901961,
"grad_norm": 5.893291473388672,
"learning_rate": 9.481712920419041e-05,
"loss": 1.0346,
"step": 1510
},
{
"epoch": 11.92156862745098,
"grad_norm": 5.409924507141113,
"learning_rate": 9.478037125528396e-05,
"loss": 1.0874,
"step": 1520
},
{
"epoch": 12.0,
"grad_norm": 5.1957688331604,
"learning_rate": 9.474361330637751e-05,
"loss": 1.1563,
"step": 1530
},
{
"epoch": 12.07843137254902,
"grad_norm": 4.920179843902588,
"learning_rate": 9.470685535747106e-05,
"loss": 0.8673,
"step": 1540
},
{
"epoch": 12.156862745098039,
"grad_norm": 5.249741554260254,
"learning_rate": 9.467009740856461e-05,
"loss": 0.8907,
"step": 1550
},
{
"epoch": 12.235294117647058,
"grad_norm": 5.800076961517334,
"learning_rate": 9.463333945965815e-05,
"loss": 0.8124,
"step": 1560
},
{
"epoch": 12.313725490196079,
"grad_norm": 5.083131313323975,
"learning_rate": 9.45965815107517e-05,
"loss": 0.925,
"step": 1570
},
{
"epoch": 12.392156862745098,
"grad_norm": 10.037300109863281,
"learning_rate": 9.455982356184525e-05,
"loss": 0.8389,
"step": 1580
},
{
"epoch": 12.470588235294118,
"grad_norm": 6.173994541168213,
"learning_rate": 9.45230656129388e-05,
"loss": 0.9503,
"step": 1590
},
{
"epoch": 12.549019607843137,
"grad_norm": 4.115769386291504,
"learning_rate": 9.448630766403234e-05,
"loss": 0.8627,
"step": 1600
},
{
"epoch": 12.627450980392156,
"grad_norm": 5.032641410827637,
"learning_rate": 9.44495497151259e-05,
"loss": 0.9014,
"step": 1610
},
{
"epoch": 12.705882352941176,
"grad_norm": 5.60946798324585,
"learning_rate": 9.441279176621946e-05,
"loss": 0.8115,
"step": 1620
},
{
"epoch": 12.784313725490197,
"grad_norm": 5.839189529418945,
"learning_rate": 9.4376033817313e-05,
"loss": 0.9689,
"step": 1630
},
{
"epoch": 12.862745098039216,
"grad_norm": 5.109472751617432,
"learning_rate": 9.433927586840655e-05,
"loss": 0.8387,
"step": 1640
},
{
"epoch": 12.941176470588236,
"grad_norm": 5.745982646942139,
"learning_rate": 9.43025179195001e-05,
"loss": 0.853,
"step": 1650
},
{
"epoch": 13.019607843137255,
"grad_norm": 4.644872665405273,
"learning_rate": 9.426575997059365e-05,
"loss": 0.7598,
"step": 1660
},
{
"epoch": 13.098039215686274,
"grad_norm": 5.00312614440918,
"learning_rate": 9.42290020216872e-05,
"loss": 0.6555,
"step": 1670
},
{
"epoch": 13.176470588235293,
"grad_norm": 3.4448249340057373,
"learning_rate": 9.419224407278074e-05,
"loss": 0.7488,
"step": 1680
},
{
"epoch": 13.254901960784313,
"grad_norm": 5.198800563812256,
"learning_rate": 9.415548612387429e-05,
"loss": 0.7083,
"step": 1690
},
{
"epoch": 13.333333333333334,
"grad_norm": 6.767204761505127,
"learning_rate": 9.411872817496784e-05,
"loss": 0.7393,
"step": 1700
},
{
"epoch": 13.411764705882353,
"grad_norm": 4.484736919403076,
"learning_rate": 9.40819702260614e-05,
"loss": 0.7486,
"step": 1710
},
{
"epoch": 13.490196078431373,
"grad_norm": 4.29071569442749,
"learning_rate": 9.404521227715493e-05,
"loss": 0.64,
"step": 1720
},
{
"epoch": 13.568627450980392,
"grad_norm": 5.528765678405762,
"learning_rate": 9.400845432824848e-05,
"loss": 0.8711,
"step": 1730
},
{
"epoch": 13.647058823529411,
"grad_norm": 6.199097156524658,
"learning_rate": 9.397169637934203e-05,
"loss": 0.792,
"step": 1740
},
{
"epoch": 13.72549019607843,
"grad_norm": 6.095465183258057,
"learning_rate": 9.393493843043559e-05,
"loss": 0.6913,
"step": 1750
},
{
"epoch": 13.803921568627452,
"grad_norm": 5.053860664367676,
"learning_rate": 9.389818048152912e-05,
"loss": 0.8404,
"step": 1760
},
{
"epoch": 13.882352941176471,
"grad_norm": 5.084766864776611,
"learning_rate": 9.386142253262269e-05,
"loss": 0.8854,
"step": 1770
},
{
"epoch": 13.96078431372549,
"grad_norm": 7.908563613891602,
"learning_rate": 9.382466458371624e-05,
"loss": 0.7045,
"step": 1780
},
{
"epoch": 14.03921568627451,
"grad_norm": 3.9915947914123535,
"learning_rate": 9.378790663480979e-05,
"loss": 0.6747,
"step": 1790
},
{
"epoch": 14.117647058823529,
"grad_norm": 4.792238235473633,
"learning_rate": 9.375114868590333e-05,
"loss": 0.5822,
"step": 1800
},
{
"epoch": 14.196078431372548,
"grad_norm": 3.964909553527832,
"learning_rate": 9.371439073699688e-05,
"loss": 0.6341,
"step": 1810
},
{
"epoch": 14.27450980392157,
"grad_norm": 5.188769817352295,
"learning_rate": 9.367763278809043e-05,
"loss": 0.7031,
"step": 1820
},
{
"epoch": 14.352941176470589,
"grad_norm": 8.642464637756348,
"learning_rate": 9.364087483918398e-05,
"loss": 0.6679,
"step": 1830
},
{
"epoch": 14.431372549019608,
"grad_norm": 4.989500999450684,
"learning_rate": 9.360411689027752e-05,
"loss": 0.7224,
"step": 1840
},
{
"epoch": 14.509803921568627,
"grad_norm": 5.256617069244385,
"learning_rate": 9.356735894137107e-05,
"loss": 0.6798,
"step": 1850
},
{
"epoch": 14.588235294117647,
"grad_norm": 6.925418376922607,
"learning_rate": 9.353060099246462e-05,
"loss": 0.6554,
"step": 1860
},
{
"epoch": 14.666666666666666,
"grad_norm": 5.8858513832092285,
"learning_rate": 9.349384304355817e-05,
"loss": 0.6254,
"step": 1870
},
{
"epoch": 14.745098039215687,
"grad_norm": 5.78135871887207,
"learning_rate": 9.345708509465173e-05,
"loss": 0.727,
"step": 1880
},
{
"epoch": 14.823529411764707,
"grad_norm": 6.644104480743408,
"learning_rate": 9.342032714574526e-05,
"loss": 0.6857,
"step": 1890
},
{
"epoch": 14.901960784313726,
"grad_norm": 4.504312992095947,
"learning_rate": 9.338356919683882e-05,
"loss": 0.643,
"step": 1900
},
{
"epoch": 14.980392156862745,
"grad_norm": 4.649731636047363,
"learning_rate": 9.334681124793238e-05,
"loss": 0.6495,
"step": 1910
},
{
"epoch": 15.058823529411764,
"grad_norm": 4.2058610916137695,
"learning_rate": 9.331005329902592e-05,
"loss": 0.5624,
"step": 1920
},
{
"epoch": 15.137254901960784,
"grad_norm": 6.13231897354126,
"learning_rate": 9.327329535011947e-05,
"loss": 0.5113,
"step": 1930
},
{
"epoch": 15.215686274509803,
"grad_norm": 5.431331634521484,
"learning_rate": 9.323653740121302e-05,
"loss": 0.5455,
"step": 1940
},
{
"epoch": 15.294117647058824,
"grad_norm": 7.001118183135986,
"learning_rate": 9.319977945230657e-05,
"loss": 0.5578,
"step": 1950
},
{
"epoch": 15.372549019607844,
"grad_norm": 5.866312503814697,
"learning_rate": 9.316302150340011e-05,
"loss": 0.5321,
"step": 1960
},
{
"epoch": 15.450980392156863,
"grad_norm": 5.004396438598633,
"learning_rate": 9.312626355449366e-05,
"loss": 0.6243,
"step": 1970
},
{
"epoch": 15.529411764705882,
"grad_norm": 8.219724655151367,
"learning_rate": 9.308950560558721e-05,
"loss": 0.5888,
"step": 1980
},
{
"epoch": 15.607843137254902,
"grad_norm": 5.223458766937256,
"learning_rate": 9.305274765668076e-05,
"loss": 0.5239,
"step": 1990
},
{
"epoch": 15.686274509803921,
"grad_norm": 3.830970048904419,
"learning_rate": 9.301598970777432e-05,
"loss": 0.6341,
"step": 2000
},
{
"epoch": 15.764705882352942,
"grad_norm": 3.6862759590148926,
"learning_rate": 9.297923175886785e-05,
"loss": 0.5209,
"step": 2010
},
{
"epoch": 15.843137254901961,
"grad_norm": 4.332780361175537,
"learning_rate": 9.29424738099614e-05,
"loss": 0.6024,
"step": 2020
},
{
"epoch": 15.92156862745098,
"grad_norm": 3.963103771209717,
"learning_rate": 9.290571586105496e-05,
"loss": 0.7055,
"step": 2030
},
{
"epoch": 16.0,
"grad_norm": 4.53104305267334,
"learning_rate": 9.286895791214851e-05,
"loss": 0.5927,
"step": 2040
},
{
"epoch": 16.07843137254902,
"grad_norm": 4.432463645935059,
"learning_rate": 9.283219996324205e-05,
"loss": 0.3956,
"step": 2050
},
{
"epoch": 16.15686274509804,
"grad_norm": 2.843749523162842,
"learning_rate": 9.279544201433561e-05,
"loss": 0.4537,
"step": 2060
},
{
"epoch": 16.235294117647058,
"grad_norm": 3.6052606105804443,
"learning_rate": 9.275868406542916e-05,
"loss": 0.5233,
"step": 2070
},
{
"epoch": 16.313725490196077,
"grad_norm": 8.1451416015625,
"learning_rate": 9.272192611652271e-05,
"loss": 0.4672,
"step": 2080
},
{
"epoch": 16.392156862745097,
"grad_norm": 4.1013407707214355,
"learning_rate": 9.268516816761625e-05,
"loss": 0.4495,
"step": 2090
},
{
"epoch": 16.470588235294116,
"grad_norm": 5.130369186401367,
"learning_rate": 9.26484102187098e-05,
"loss": 0.6151,
"step": 2100
},
{
"epoch": 16.54901960784314,
"grad_norm": 3.6761317253112793,
"learning_rate": 9.261165226980335e-05,
"loss": 0.4775,
"step": 2110
},
{
"epoch": 16.627450980392158,
"grad_norm": 3.270599126815796,
"learning_rate": 9.25748943208969e-05,
"loss": 0.4543,
"step": 2120
},
{
"epoch": 16.705882352941178,
"grad_norm": 6.377615451812744,
"learning_rate": 9.253813637199044e-05,
"loss": 0.5052,
"step": 2130
},
{
"epoch": 16.784313725490197,
"grad_norm": 4.0827741622924805,
"learning_rate": 9.2501378423084e-05,
"loss": 0.4645,
"step": 2140
},
{
"epoch": 16.862745098039216,
"grad_norm": 4.460141181945801,
"learning_rate": 9.246462047417755e-05,
"loss": 0.5117,
"step": 2150
},
{
"epoch": 16.941176470588236,
"grad_norm": 4.786052227020264,
"learning_rate": 9.24278625252711e-05,
"loss": 0.561,
"step": 2160
},
{
"epoch": 17.019607843137255,
"grad_norm": 4.18758487701416,
"learning_rate": 9.239110457636463e-05,
"loss": 0.4759,
"step": 2170
},
{
"epoch": 17.098039215686274,
"grad_norm": 4.064152240753174,
"learning_rate": 9.235434662745819e-05,
"loss": 0.4141,
"step": 2180
},
{
"epoch": 17.176470588235293,
"grad_norm": 3.276078939437866,
"learning_rate": 9.231758867855174e-05,
"loss": 0.3797,
"step": 2190
},
{
"epoch": 17.254901960784313,
"grad_norm": 3.8203907012939453,
"learning_rate": 9.22808307296453e-05,
"loss": 0.4376,
"step": 2200
},
{
"epoch": 17.333333333333332,
"grad_norm": 3.6585357189178467,
"learning_rate": 9.224407278073884e-05,
"loss": 0.5028,
"step": 2210
},
{
"epoch": 17.41176470588235,
"grad_norm": 3.880546808242798,
"learning_rate": 9.220731483183239e-05,
"loss": 0.3938,
"step": 2220
},
{
"epoch": 17.49019607843137,
"grad_norm": 5.758749008178711,
"learning_rate": 9.217055688292594e-05,
"loss": 0.3834,
"step": 2230
},
{
"epoch": 17.568627450980394,
"grad_norm": 3.563232183456421,
"learning_rate": 9.21337989340195e-05,
"loss": 0.4363,
"step": 2240
},
{
"epoch": 17.647058823529413,
"grad_norm": 4.751742839813232,
"learning_rate": 9.209704098511303e-05,
"loss": 0.5009,
"step": 2250
},
{
"epoch": 17.725490196078432,
"grad_norm": 3.618528127670288,
"learning_rate": 9.206028303620658e-05,
"loss": 0.4387,
"step": 2260
},
{
"epoch": 17.80392156862745,
"grad_norm": 3.945882558822632,
"learning_rate": 9.202352508730013e-05,
"loss": 0.5031,
"step": 2270
},
{
"epoch": 17.88235294117647,
"grad_norm": 4.49643087387085,
"learning_rate": 9.198676713839369e-05,
"loss": 0.4466,
"step": 2280
},
{
"epoch": 17.96078431372549,
"grad_norm": 4.1370673179626465,
"learning_rate": 9.195000918948722e-05,
"loss": 0.4352,
"step": 2290
},
{
"epoch": 18.03921568627451,
"grad_norm": 3.39939546585083,
"learning_rate": 9.191325124058077e-05,
"loss": 0.4518,
"step": 2300
},
{
"epoch": 18.11764705882353,
"grad_norm": 3.816342353820801,
"learning_rate": 9.187649329167433e-05,
"loss": 0.3852,
"step": 2310
},
{
"epoch": 18.19607843137255,
"grad_norm": 2.6915409564971924,
"learning_rate": 9.183973534276788e-05,
"loss": 0.3584,
"step": 2320
},
{
"epoch": 18.274509803921568,
"grad_norm": 4.04006814956665,
"learning_rate": 9.180297739386143e-05,
"loss": 0.3567,
"step": 2330
},
{
"epoch": 18.352941176470587,
"grad_norm": 3.8536486625671387,
"learning_rate": 9.176621944495497e-05,
"loss": 0.3251,
"step": 2340
},
{
"epoch": 18.431372549019606,
"grad_norm": 3.982511281967163,
"learning_rate": 9.172946149604853e-05,
"loss": 0.4031,
"step": 2350
},
{
"epoch": 18.509803921568626,
"grad_norm": 2.8618922233581543,
"learning_rate": 9.169270354714208e-05,
"loss": 0.3933,
"step": 2360
},
{
"epoch": 18.58823529411765,
"grad_norm": 6.304449558258057,
"learning_rate": 9.165594559823562e-05,
"loss": 0.4121,
"step": 2370
},
{
"epoch": 18.666666666666668,
"grad_norm": 5.2535905838012695,
"learning_rate": 9.161918764932917e-05,
"loss": 0.3762,
"step": 2380
},
{
"epoch": 18.745098039215687,
"grad_norm": 4.310611724853516,
"learning_rate": 9.158242970042272e-05,
"loss": 0.3436,
"step": 2390
},
{
"epoch": 18.823529411764707,
"grad_norm": 3.7275941371917725,
"learning_rate": 9.154567175151627e-05,
"loss": 0.3255,
"step": 2400
},
{
"epoch": 18.901960784313726,
"grad_norm": 3.8429691791534424,
"learning_rate": 9.150891380260981e-05,
"loss": 0.4637,
"step": 2410
},
{
"epoch": 18.980392156862745,
"grad_norm": 5.648855686187744,
"learning_rate": 9.147215585370336e-05,
"loss": 0.5983,
"step": 2420
},
{
"epoch": 19.058823529411764,
"grad_norm": 4.382913589477539,
"learning_rate": 9.143539790479692e-05,
"loss": 0.34,
"step": 2430
},
{
"epoch": 19.137254901960784,
"grad_norm": 3.4657950401306152,
"learning_rate": 9.139863995589047e-05,
"loss": 0.2972,
"step": 2440
},
{
"epoch": 19.215686274509803,
"grad_norm": 2.9859068393707275,
"learning_rate": 9.136188200698402e-05,
"loss": 0.3526,
"step": 2450
},
{
"epoch": 19.294117647058822,
"grad_norm": 3.8208978176116943,
"learning_rate": 9.132512405807756e-05,
"loss": 0.3774,
"step": 2460
},
{
"epoch": 19.372549019607842,
"grad_norm": 3.611250877380371,
"learning_rate": 9.128836610917111e-05,
"loss": 0.3775,
"step": 2470
},
{
"epoch": 19.45098039215686,
"grad_norm": 2.989877700805664,
"learning_rate": 9.125160816026466e-05,
"loss": 0.3333,
"step": 2480
},
{
"epoch": 19.529411764705884,
"grad_norm": 3.469022750854492,
"learning_rate": 9.121485021135821e-05,
"loss": 0.3057,
"step": 2490
},
{
"epoch": 19.607843137254903,
"grad_norm": 2.706902027130127,
"learning_rate": 9.117809226245176e-05,
"loss": 0.431,
"step": 2500
},
{
"epoch": 19.686274509803923,
"grad_norm": 3.100156307220459,
"learning_rate": 9.114133431354531e-05,
"loss": 0.3619,
"step": 2510
},
{
"epoch": 19.764705882352942,
"grad_norm": 4.185247898101807,
"learning_rate": 9.110457636463886e-05,
"loss": 0.3541,
"step": 2520
},
{
"epoch": 19.84313725490196,
"grad_norm": 4.356285572052002,
"learning_rate": 9.106781841573242e-05,
"loss": 0.3376,
"step": 2530
},
{
"epoch": 19.92156862745098,
"grad_norm": 3.447700262069702,
"learning_rate": 9.103106046682595e-05,
"loss": 0.3219,
"step": 2540
},
{
"epoch": 20.0,
"grad_norm": 4.159237384796143,
"learning_rate": 9.09943025179195e-05,
"loss": 0.4266,
"step": 2550
},
{
"epoch": 20.07843137254902,
"grad_norm": 3.4393558502197266,
"learning_rate": 9.095754456901306e-05,
"loss": 0.3077,
"step": 2560
},
{
"epoch": 20.15686274509804,
"grad_norm": 3.7608890533447266,
"learning_rate": 9.092078662010661e-05,
"loss": 0.3565,
"step": 2570
},
{
"epoch": 20.235294117647058,
"grad_norm": 2.8301854133605957,
"learning_rate": 9.088402867120015e-05,
"loss": 0.313,
"step": 2580
},
{
"epoch": 20.313725490196077,
"grad_norm": 3.782179594039917,
"learning_rate": 9.08472707222937e-05,
"loss": 0.3104,
"step": 2590
},
{
"epoch": 20.392156862745097,
"grad_norm": 2.997694253921509,
"learning_rate": 9.081051277338725e-05,
"loss": 0.2572,
"step": 2600
},
{
"epoch": 20.470588235294116,
"grad_norm": 4.569226264953613,
"learning_rate": 9.07737548244808e-05,
"loss": 0.2745,
"step": 2610
},
{
"epoch": 20.54901960784314,
"grad_norm": 3.0622193813323975,
"learning_rate": 9.073699687557434e-05,
"loss": 0.3277,
"step": 2620
},
{
"epoch": 20.627450980392158,
"grad_norm": 7.896496295928955,
"learning_rate": 9.070023892666789e-05,
"loss": 0.3422,
"step": 2630
},
{
"epoch": 20.705882352941178,
"grad_norm": 3.433051109313965,
"learning_rate": 9.066348097776145e-05,
"loss": 0.3259,
"step": 2640
},
{
"epoch": 20.784313725490197,
"grad_norm": 4.0141215324401855,
"learning_rate": 9.0626723028855e-05,
"loss": 0.3942,
"step": 2650
},
{
"epoch": 20.862745098039216,
"grad_norm": 3.386195421218872,
"learning_rate": 9.058996507994854e-05,
"loss": 0.2881,
"step": 2660
},
{
"epoch": 20.941176470588236,
"grad_norm": 2.885312080383301,
"learning_rate": 9.05532071310421e-05,
"loss": 0.2974,
"step": 2670
},
{
"epoch": 21.019607843137255,
"grad_norm": 7.563695907592773,
"learning_rate": 9.051644918213565e-05,
"loss": 0.2997,
"step": 2680
},
{
"epoch": 21.098039215686274,
"grad_norm": 2.875091075897217,
"learning_rate": 9.04796912332292e-05,
"loss": 0.2349,
"step": 2690
},
{
"epoch": 21.176470588235293,
"grad_norm": 8.387333869934082,
"learning_rate": 9.044293328432273e-05,
"loss": 0.2667,
"step": 2700
},
{
"epoch": 21.254901960784313,
"grad_norm": 3.013108968734741,
"learning_rate": 9.040617533541629e-05,
"loss": 0.2615,
"step": 2710
},
{
"epoch": 21.333333333333332,
"grad_norm": 3.5930674076080322,
"learning_rate": 9.036941738650984e-05,
"loss": 0.3212,
"step": 2720
},
{
"epoch": 21.41176470588235,
"grad_norm": 3.1566312313079834,
"learning_rate": 9.033265943760339e-05,
"loss": 0.2714,
"step": 2730
},
{
"epoch": 21.49019607843137,
"grad_norm": 4.430455207824707,
"learning_rate": 9.029590148869693e-05,
"loss": 0.3139,
"step": 2740
},
{
"epoch": 21.568627450980394,
"grad_norm": 2.473768949508667,
"learning_rate": 9.025914353979048e-05,
"loss": 0.2804,
"step": 2750
},
{
"epoch": 21.647058823529413,
"grad_norm": 3.700646162033081,
"learning_rate": 9.022238559088403e-05,
"loss": 0.322,
"step": 2760
},
{
"epoch": 21.725490196078432,
"grad_norm": 2.997344970703125,
"learning_rate": 9.018562764197758e-05,
"loss": 0.2736,
"step": 2770
},
{
"epoch": 21.80392156862745,
"grad_norm": 3.385653495788574,
"learning_rate": 9.014886969307113e-05,
"loss": 0.2946,
"step": 2780
},
{
"epoch": 21.88235294117647,
"grad_norm": 3.961817741394043,
"learning_rate": 9.011211174416468e-05,
"loss": 0.295,
"step": 2790
},
{
"epoch": 21.96078431372549,
"grad_norm": 6.599482536315918,
"learning_rate": 9.007535379525823e-05,
"loss": 0.2975,
"step": 2800
},
{
"epoch": 22.03921568627451,
"grad_norm": 2.4624006748199463,
"learning_rate": 9.003859584635179e-05,
"loss": 0.2299,
"step": 2810
},
{
"epoch": 22.11764705882353,
"grad_norm": 2.7834863662719727,
"learning_rate": 9.000183789744532e-05,
"loss": 0.2237,
"step": 2820
},
{
"epoch": 22.19607843137255,
"grad_norm": 2.49113392829895,
"learning_rate": 8.996507994853887e-05,
"loss": 0.2631,
"step": 2830
},
{
"epoch": 22.274509803921568,
"grad_norm": 4.437926292419434,
"learning_rate": 8.992832199963243e-05,
"loss": 0.227,
"step": 2840
},
{
"epoch": 22.352941176470587,
"grad_norm": 6.664700508117676,
"learning_rate": 8.989156405072598e-05,
"loss": 0.3085,
"step": 2850
},
{
"epoch": 22.431372549019606,
"grad_norm": 3.945110321044922,
"learning_rate": 8.985480610181952e-05,
"loss": 0.2354,
"step": 2860
},
{
"epoch": 22.509803921568626,
"grad_norm": 2.9697070121765137,
"learning_rate": 8.981804815291307e-05,
"loss": 0.2374,
"step": 2870
},
{
"epoch": 22.58823529411765,
"grad_norm": 2.9439990520477295,
"learning_rate": 8.978129020400662e-05,
"loss": 0.2507,
"step": 2880
},
{
"epoch": 22.666666666666668,
"grad_norm": 2.947354555130005,
"learning_rate": 8.974453225510017e-05,
"loss": 0.2647,
"step": 2890
},
{
"epoch": 22.745098039215687,
"grad_norm": 9.103282928466797,
"learning_rate": 8.970777430619372e-05,
"loss": 0.3106,
"step": 2900
},
{
"epoch": 22.823529411764707,
"grad_norm": 2.5283734798431396,
"learning_rate": 8.967101635728726e-05,
"loss": 0.2715,
"step": 2910
},
{
"epoch": 22.901960784313726,
"grad_norm": 3.052879810333252,
"learning_rate": 8.963425840838081e-05,
"loss": 0.2977,
"step": 2920
},
{
"epoch": 22.980392156862745,
"grad_norm": 3.37917423248291,
"learning_rate": 8.959750045947437e-05,
"loss": 0.2878,
"step": 2930
},
{
"epoch": 23.058823529411764,
"grad_norm": 3.028381109237671,
"learning_rate": 8.956074251056791e-05,
"loss": 0.2169,
"step": 2940
},
{
"epoch": 23.137254901960784,
"grad_norm": 2.4643447399139404,
"learning_rate": 8.952398456166146e-05,
"loss": 0.2073,
"step": 2950
},
{
"epoch": 23.215686274509803,
"grad_norm": 2.397473096847534,
"learning_rate": 8.948722661275502e-05,
"loss": 0.2108,
"step": 2960
},
{
"epoch": 23.294117647058822,
"grad_norm": 6.173182010650635,
"learning_rate": 8.945046866384857e-05,
"loss": 0.2428,
"step": 2970
},
{
"epoch": 23.372549019607842,
"grad_norm": 3.343395948410034,
"learning_rate": 8.941371071494212e-05,
"loss": 0.275,
"step": 2980
},
{
"epoch": 23.45098039215686,
"grad_norm": 2.379011631011963,
"learning_rate": 8.937695276603566e-05,
"loss": 0.2336,
"step": 2990
},
{
"epoch": 23.529411764705884,
"grad_norm": 7.347818374633789,
"learning_rate": 8.934019481712921e-05,
"loss": 0.2575,
"step": 3000
},
{
"epoch": 23.607843137254903,
"grad_norm": 7.274477005004883,
"learning_rate": 8.930343686822276e-05,
"loss": 0.2616,
"step": 3010
},
{
"epoch": 23.686274509803923,
"grad_norm": 4.475617408752441,
"learning_rate": 8.926667891931631e-05,
"loss": 0.2336,
"step": 3020
},
{
"epoch": 23.764705882352942,
"grad_norm": 3.119966506958008,
"learning_rate": 8.922992097040985e-05,
"loss": 0.2705,
"step": 3030
},
{
"epoch": 23.84313725490196,
"grad_norm": 2.663884401321411,
"learning_rate": 8.91931630215034e-05,
"loss": 0.2863,
"step": 3040
},
{
"epoch": 23.92156862745098,
"grad_norm": 6.014930725097656,
"learning_rate": 8.915640507259695e-05,
"loss": 0.2437,
"step": 3050
},
{
"epoch": 24.0,
"grad_norm": 2.6029369831085205,
"learning_rate": 8.91196471236905e-05,
"loss": 0.2844,
"step": 3060
},
{
"epoch": 24.07843137254902,
"grad_norm": 2.7642641067504883,
"learning_rate": 8.908288917478405e-05,
"loss": 0.2014,
"step": 3070
},
{
"epoch": 24.15686274509804,
"grad_norm": 8.297749519348145,
"learning_rate": 8.90461312258776e-05,
"loss": 0.2657,
"step": 3080
},
{
"epoch": 24.235294117647058,
"grad_norm": 3.9905178546905518,
"learning_rate": 8.900937327697116e-05,
"loss": 0.2114,
"step": 3090
},
{
"epoch": 24.313725490196077,
"grad_norm": 2.5204246044158936,
"learning_rate": 8.897261532806471e-05,
"loss": 0.1942,
"step": 3100
},
{
"epoch": 24.392156862745097,
"grad_norm": 1.52804434299469,
"learning_rate": 8.893585737915824e-05,
"loss": 0.2155,
"step": 3110
},
{
"epoch": 24.470588235294116,
"grad_norm": 3.6253719329833984,
"learning_rate": 8.88990994302518e-05,
"loss": 0.2201,
"step": 3120
},
{
"epoch": 24.54901960784314,
"grad_norm": 7.122885227203369,
"learning_rate": 8.886234148134535e-05,
"loss": 0.2297,
"step": 3130
},
{
"epoch": 24.627450980392158,
"grad_norm": 2.7436540126800537,
"learning_rate": 8.88255835324389e-05,
"loss": 0.2201,
"step": 3140
},
{
"epoch": 24.705882352941178,
"grad_norm": 5.490830898284912,
"learning_rate": 8.878882558353244e-05,
"loss": 0.2075,
"step": 3150
},
{
"epoch": 24.784313725490197,
"grad_norm": 2.3640992641448975,
"learning_rate": 8.875206763462599e-05,
"loss": 0.2283,
"step": 3160
},
{
"epoch": 24.862745098039216,
"grad_norm": 3.1421284675598145,
"learning_rate": 8.871530968571954e-05,
"loss": 0.2722,
"step": 3170
},
{
"epoch": 24.941176470588236,
"grad_norm": 2.805938720703125,
"learning_rate": 8.867855173681309e-05,
"loss": 0.2361,
"step": 3180
},
{
"epoch": 25.019607843137255,
"grad_norm": 7.49440336227417,
"learning_rate": 8.864179378790663e-05,
"loss": 0.2847,
"step": 3190
},
{
"epoch": 25.098039215686274,
"grad_norm": 2.238097667694092,
"learning_rate": 8.860503583900018e-05,
"loss": 0.2008,
"step": 3200
},
{
"epoch": 25.176470588235293,
"grad_norm": 2.652937650680542,
"learning_rate": 8.856827789009373e-05,
"loss": 0.1826,
"step": 3210
},
{
"epoch": 25.254901960784313,
"grad_norm": 2.6613991260528564,
"learning_rate": 8.85315199411873e-05,
"loss": 0.1676,
"step": 3220
},
{
"epoch": 25.333333333333332,
"grad_norm": 2.0592703819274902,
"learning_rate": 8.849476199228083e-05,
"loss": 0.2177,
"step": 3230
},
{
"epoch": 25.41176470588235,
"grad_norm": 3.4880011081695557,
"learning_rate": 8.845800404337439e-05,
"loss": 0.1935,
"step": 3240
},
{
"epoch": 25.49019607843137,
"grad_norm": 2.736335039138794,
"learning_rate": 8.842124609446794e-05,
"loss": 0.1994,
"step": 3250
},
{
"epoch": 25.568627450980394,
"grad_norm": 5.7319135665893555,
"learning_rate": 8.838448814556149e-05,
"loss": 0.2072,
"step": 3260
},
{
"epoch": 25.647058823529413,
"grad_norm": 10.623271942138672,
"learning_rate": 8.834773019665503e-05,
"loss": 0.2004,
"step": 3270
},
{
"epoch": 25.725490196078432,
"grad_norm": 2.4742937088012695,
"learning_rate": 8.831097224774858e-05,
"loss": 0.2465,
"step": 3280
},
{
"epoch": 25.80392156862745,
"grad_norm": 2.440775156021118,
"learning_rate": 8.827421429884213e-05,
"loss": 0.2073,
"step": 3290
},
{
"epoch": 25.88235294117647,
"grad_norm": 4.591070175170898,
"learning_rate": 8.823745634993568e-05,
"loss": 0.225,
"step": 3300
},
{
"epoch": 25.96078431372549,
"grad_norm": 2.302111864089966,
"learning_rate": 8.820069840102923e-05,
"loss": 0.2245,
"step": 3310
},
{
"epoch": 26.03921568627451,
"grad_norm": 2.730738401412964,
"learning_rate": 8.816394045212277e-05,
"loss": 0.2307,
"step": 3320
},
{
"epoch": 26.11764705882353,
"grad_norm": 1.9027403593063354,
"learning_rate": 8.812718250321632e-05,
"loss": 0.1931,
"step": 3330
},
{
"epoch": 26.19607843137255,
"grad_norm": 2.853452444076538,
"learning_rate": 8.809042455430987e-05,
"loss": 0.1913,
"step": 3340
},
{
"epoch": 26.274509803921568,
"grad_norm": 2.136833667755127,
"learning_rate": 8.805366660540342e-05,
"loss": 0.1896,
"step": 3350
},
{
"epoch": 26.352941176470587,
"grad_norm": 3.3222334384918213,
"learning_rate": 8.801690865649697e-05,
"loss": 0.1924,
"step": 3360
},
{
"epoch": 26.431372549019606,
"grad_norm": 3.190403938293457,
"learning_rate": 8.798015070759053e-05,
"loss": 0.1569,
"step": 3370
},
{
"epoch": 26.509803921568626,
"grad_norm": 3.4979772567749023,
"learning_rate": 8.794339275868408e-05,
"loss": 0.1813,
"step": 3380
},
{
"epoch": 26.58823529411765,
"grad_norm": 3.0356762409210205,
"learning_rate": 8.790663480977762e-05,
"loss": 0.2141,
"step": 3390
},
{
"epoch": 26.666666666666668,
"grad_norm": 2.5389366149902344,
"learning_rate": 8.786987686087117e-05,
"loss": 0.1986,
"step": 3400
},
{
"epoch": 26.745098039215687,
"grad_norm": 2.2066240310668945,
"learning_rate": 8.783311891196472e-05,
"loss": 0.2011,
"step": 3410
},
{
"epoch": 26.823529411764707,
"grad_norm": 2.6409451961517334,
"learning_rate": 8.779636096305827e-05,
"loss": 0.233,
"step": 3420
},
{
"epoch": 26.901960784313726,
"grad_norm": 2.3644559383392334,
"learning_rate": 8.775960301415182e-05,
"loss": 0.1877,
"step": 3430
},
{
"epoch": 26.980392156862745,
"grad_norm": 3.346972942352295,
"learning_rate": 8.772284506524536e-05,
"loss": 0.2201,
"step": 3440
},
{
"epoch": 27.058823529411764,
"grad_norm": 3.081000328063965,
"learning_rate": 8.768608711633891e-05,
"loss": 0.1929,
"step": 3450
},
{
"epoch": 27.137254901960784,
"grad_norm": 1.6890923976898193,
"learning_rate": 8.764932916743246e-05,
"loss": 0.1539,
"step": 3460
},
{
"epoch": 27.215686274509803,
"grad_norm": 4.7821221351623535,
"learning_rate": 8.761257121852601e-05,
"loss": 0.1717,
"step": 3470
},
{
"epoch": 27.294117647058822,
"grad_norm": 2.38714861869812,
"learning_rate": 8.757581326961955e-05,
"loss": 0.2534,
"step": 3480
},
{
"epoch": 27.372549019607842,
"grad_norm": 2.4988088607788086,
"learning_rate": 8.75390553207131e-05,
"loss": 0.2302,
"step": 3490
},
{
"epoch": 27.45098039215686,
"grad_norm": 2.1674258708953857,
"learning_rate": 8.750229737180665e-05,
"loss": 0.1788,
"step": 3500
},
{
"epoch": 27.529411764705884,
"grad_norm": 3.270306348800659,
"learning_rate": 8.746553942290022e-05,
"loss": 0.1815,
"step": 3510
},
{
"epoch": 27.607843137254903,
"grad_norm": 2.9274301528930664,
"learning_rate": 8.742878147399376e-05,
"loss": 0.1871,
"step": 3520
},
{
"epoch": 27.686274509803923,
"grad_norm": 2.2478270530700684,
"learning_rate": 8.739202352508731e-05,
"loss": 0.1861,
"step": 3530
},
{
"epoch": 27.764705882352942,
"grad_norm": 3.159546136856079,
"learning_rate": 8.735526557618086e-05,
"loss": 0.2163,
"step": 3540
},
{
"epoch": 27.84313725490196,
"grad_norm": 4.743581771850586,
"learning_rate": 8.731850762727441e-05,
"loss": 0.1892,
"step": 3550
},
{
"epoch": 27.92156862745098,
"grad_norm": 4.11615514755249,
"learning_rate": 8.728174967836795e-05,
"loss": 0.2062,
"step": 3560
},
{
"epoch": 28.0,
"grad_norm": 3.3777382373809814,
"learning_rate": 8.72449917294615e-05,
"loss": 0.2048,
"step": 3570
},
{
"epoch": 28.07843137254902,
"grad_norm": 12.32438850402832,
"learning_rate": 8.720823378055505e-05,
"loss": 0.1704,
"step": 3580
},
{
"epoch": 28.15686274509804,
"grad_norm": 2.3112239837646484,
"learning_rate": 8.71714758316486e-05,
"loss": 0.1787,
"step": 3590
},
{
"epoch": 28.235294117647058,
"grad_norm": 2.7134642601013184,
"learning_rate": 8.713471788274214e-05,
"loss": 0.1595,
"step": 3600
},
{
"epoch": 28.313725490196077,
"grad_norm": 1.900732398033142,
"learning_rate": 8.709795993383569e-05,
"loss": 0.1614,
"step": 3610
},
{
"epoch": 28.392156862745097,
"grad_norm": 2.7066845893859863,
"learning_rate": 8.706120198492924e-05,
"loss": 0.1571,
"step": 3620
},
{
"epoch": 28.470588235294116,
"grad_norm": 2.3240630626678467,
"learning_rate": 8.70244440360228e-05,
"loss": 0.1988,
"step": 3630
},
{
"epoch": 28.54901960784314,
"grad_norm": 5.567899227142334,
"learning_rate": 8.698768608711633e-05,
"loss": 0.1768,
"step": 3640
},
{
"epoch": 28.627450980392158,
"grad_norm": 2.9699206352233887,
"learning_rate": 8.69509281382099e-05,
"loss": 0.1649,
"step": 3650
},
{
"epoch": 28.705882352941178,
"grad_norm": 2.124846935272217,
"learning_rate": 8.691417018930345e-05,
"loss": 0.2035,
"step": 3660
},
{
"epoch": 28.784313725490197,
"grad_norm": 2.9401068687438965,
"learning_rate": 8.6877412240397e-05,
"loss": 0.1728,
"step": 3670
},
{
"epoch": 28.862745098039216,
"grad_norm": 2.0019986629486084,
"learning_rate": 8.684065429149054e-05,
"loss": 0.176,
"step": 3680
},
{
"epoch": 28.941176470588236,
"grad_norm": 6.4335222244262695,
"learning_rate": 8.680389634258409e-05,
"loss": 0.1593,
"step": 3690
},
{
"epoch": 29.019607843137255,
"grad_norm": 1.7808016538619995,
"learning_rate": 8.676713839367764e-05,
"loss": 0.1949,
"step": 3700
},
{
"epoch": 29.098039215686274,
"grad_norm": 1.9336371421813965,
"learning_rate": 8.673038044477119e-05,
"loss": 0.1508,
"step": 3710
},
{
"epoch": 29.176470588235293,
"grad_norm": 1.271824598312378,
"learning_rate": 8.669362249586473e-05,
"loss": 0.1448,
"step": 3720
},
{
"epoch": 29.254901960784313,
"grad_norm": 2.432981252670288,
"learning_rate": 8.665686454695828e-05,
"loss": 0.1454,
"step": 3730
},
{
"epoch": 29.333333333333332,
"grad_norm": 1.8998444080352783,
"learning_rate": 8.662010659805183e-05,
"loss": 0.1893,
"step": 3740
},
{
"epoch": 29.41176470588235,
"grad_norm": 2.9416303634643555,
"learning_rate": 8.658334864914538e-05,
"loss": 0.1999,
"step": 3750
},
{
"epoch": 29.49019607843137,
"grad_norm": 2.7823660373687744,
"learning_rate": 8.654659070023893e-05,
"loss": 0.1543,
"step": 3760
},
{
"epoch": 29.568627450980394,
"grad_norm": 1.9040496349334717,
"learning_rate": 8.650983275133247e-05,
"loss": 0.1588,
"step": 3770
},
{
"epoch": 29.647058823529413,
"grad_norm": 2.2860541343688965,
"learning_rate": 8.647307480242602e-05,
"loss": 0.1376,
"step": 3780
},
{
"epoch": 29.725490196078432,
"grad_norm": 2.458211898803711,
"learning_rate": 8.643631685351957e-05,
"loss": 0.1502,
"step": 3790
},
{
"epoch": 29.80392156862745,
"grad_norm": 4.291934967041016,
"learning_rate": 8.639955890461313e-05,
"loss": 0.1664,
"step": 3800
},
{
"epoch": 29.88235294117647,
"grad_norm": 8.039182662963867,
"learning_rate": 8.636280095570668e-05,
"loss": 0.1673,
"step": 3810
},
{
"epoch": 29.96078431372549,
"grad_norm": 1.9125957489013672,
"learning_rate": 8.632604300680023e-05,
"loss": 0.2028,
"step": 3820
},
{
"epoch": 30.03921568627451,
"grad_norm": 1.9361660480499268,
"learning_rate": 8.628928505789378e-05,
"loss": 0.1725,
"step": 3830
},
{
"epoch": 30.11764705882353,
"grad_norm": 2.263054132461548,
"learning_rate": 8.625252710898732e-05,
"loss": 0.1409,
"step": 3840
},
{
"epoch": 30.19607843137255,
"grad_norm": 1.9042737483978271,
"learning_rate": 8.621576916008087e-05,
"loss": 0.1336,
"step": 3850
},
{
"epoch": 30.274509803921568,
"grad_norm": 1.9479308128356934,
"learning_rate": 8.617901121117442e-05,
"loss": 0.1503,
"step": 3860
},
{
"epoch": 30.352941176470587,
"grad_norm": 2.608462333679199,
"learning_rate": 8.614225326226797e-05,
"loss": 0.1654,
"step": 3870
},
{
"epoch": 30.431372549019606,
"grad_norm": 2.01275372505188,
"learning_rate": 8.610549531336152e-05,
"loss": 0.1427,
"step": 3880
},
{
"epoch": 30.509803921568626,
"grad_norm": 8.765314102172852,
"learning_rate": 8.606873736445506e-05,
"loss": 0.1878,
"step": 3890
},
{
"epoch": 30.58823529411765,
"grad_norm": 2.0812177658081055,
"learning_rate": 8.603197941554861e-05,
"loss": 0.1411,
"step": 3900
},
{
"epoch": 30.666666666666668,
"grad_norm": 2.890509843826294,
"learning_rate": 8.599522146664216e-05,
"loss": 0.1884,
"step": 3910
},
{
"epoch": 30.745098039215687,
"grad_norm": 2.644294023513794,
"learning_rate": 8.595846351773572e-05,
"loss": 0.1868,
"step": 3920
},
{
"epoch": 30.823529411764707,
"grad_norm": 1.564231276512146,
"learning_rate": 8.592170556882925e-05,
"loss": 0.1733,
"step": 3930
},
{
"epoch": 30.901960784313726,
"grad_norm": 2.647084951400757,
"learning_rate": 8.588494761992282e-05,
"loss": 0.2895,
"step": 3940
},
{
"epoch": 30.980392156862745,
"grad_norm": 1.3914289474487305,
"learning_rate": 8.584818967101637e-05,
"loss": 0.1602,
"step": 3950
},
{
"epoch": 31.058823529411764,
"grad_norm": 26.367515563964844,
"learning_rate": 8.581143172210992e-05,
"loss": 0.4469,
"step": 3960
},
{
"epoch": 31.137254901960784,
"grad_norm": 1.853583574295044,
"learning_rate": 8.577467377320346e-05,
"loss": 0.1277,
"step": 3970
},
{
"epoch": 31.215686274509803,
"grad_norm": 2.2600510120391846,
"learning_rate": 8.573791582429701e-05,
"loss": 0.1729,
"step": 3980
},
{
"epoch": 31.294117647058822,
"grad_norm": 1.9825807809829712,
"learning_rate": 8.570115787539056e-05,
"loss": 0.1597,
"step": 3990
},
{
"epoch": 31.372549019607842,
"grad_norm": 1.8277662992477417,
"learning_rate": 8.566439992648411e-05,
"loss": 0.1419,
"step": 4000
}
],
"logging_steps": 10,
"max_steps": 27305,
"num_input_tokens_seen": 0,
"num_train_epochs": 215,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 522593501184000.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}