{
"best_metric": 0.2451845407485962,
"best_model_checkpoint": "./output/checkpoint-1950",
"epoch": 4.20353982300885,
"eval_steps": 150,
"global_step": 2850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014749262536873156,
"grad_norm": 5.5810041427612305,
"learning_rate": 4.125e-06,
"loss": 0.6821,
"step": 10
},
{
"epoch": 0.029498525073746312,
"grad_norm": 5.33672571182251,
"learning_rate": 8.25e-06,
"loss": 0.6731,
"step": 20
},
{
"epoch": 0.04424778761061947,
"grad_norm": 3.1765787601470947,
"learning_rate": 1.2375e-05,
"loss": 0.6863,
"step": 30
},
{
"epoch": 0.058997050147492625,
"grad_norm": 7.523784637451172,
"learning_rate": 1.65e-05,
"loss": 0.5096,
"step": 40
},
{
"epoch": 0.07374631268436578,
"grad_norm": 8.258329391479492,
"learning_rate": 2.0625e-05,
"loss": 0.552,
"step": 50
},
{
"epoch": 0.08849557522123894,
"grad_norm": 4.990320205688477,
"learning_rate": 2.475e-05,
"loss": 0.5211,
"step": 60
},
{
"epoch": 0.10324483775811209,
"grad_norm": 7.771764278411865,
"learning_rate": 2.8874999999999997e-05,
"loss": 0.4423,
"step": 70
},
{
"epoch": 0.11799410029498525,
"grad_norm": 4.934092044830322,
"learning_rate": 3.3e-05,
"loss": 0.5025,
"step": 80
},
{
"epoch": 0.13274336283185842,
"grad_norm": 5.5473551750183105,
"learning_rate": 3.7125e-05,
"loss": 0.5095,
"step": 90
},
{
"epoch": 0.14749262536873156,
"grad_norm": 5.446054458618164,
"learning_rate": 4.125e-05,
"loss": 0.439,
"step": 100
},
{
"epoch": 0.16224188790560473,
"grad_norm": 5.696810245513916,
"learning_rate": 4.12495760935163e-05,
"loss": 0.4491,
"step": 110
},
{
"epoch": 0.17699115044247787,
"grad_norm": 4.427632808685303,
"learning_rate": 4.1248304391490334e-05,
"loss": 0.3156,
"step": 120
},
{
"epoch": 0.19174041297935104,
"grad_norm": 3.307356357574463,
"learning_rate": 4.1246184946196796e-05,
"loss": 0.3194,
"step": 130
},
{
"epoch": 0.20648967551622419,
"grad_norm": 1.1623802185058594,
"learning_rate": 4.124321784475777e-05,
"loss": 0.3677,
"step": 140
},
{
"epoch": 0.22123893805309736,
"grad_norm": 4.190874099731445,
"learning_rate": 4.123940320913919e-05,
"loss": 0.42,
"step": 150
},
{
"epoch": 0.22123893805309736,
"eval_loss": 0.36307621002197266,
"eval_runtime": 43.6353,
"eval_samples_per_second": 6.921,
"eval_steps_per_second": 6.921,
"step": 150
},
{
"epoch": 0.2359882005899705,
"grad_norm": 1.6925841569900513,
"learning_rate": 4.123474119614577e-05,
"loss": 0.4331,
"step": 160
},
{
"epoch": 0.25073746312684364,
"grad_norm": 2.9325969219207764,
"learning_rate": 4.1229231997414614e-05,
"loss": 0.4183,
"step": 170
},
{
"epoch": 0.26548672566371684,
"grad_norm": 2.1130642890930176,
"learning_rate": 4.1222875839407306e-05,
"loss": 0.3555,
"step": 180
},
{
"epoch": 0.28023598820059,
"grad_norm": 3.514279365539551,
"learning_rate": 4.121567298340059e-05,
"loss": 0.4134,
"step": 190
},
{
"epoch": 0.2949852507374631,
"grad_norm": 3.123249053955078,
"learning_rate": 4.120762372547569e-05,
"loss": 0.5256,
"step": 200
},
{
"epoch": 0.30973451327433627,
"grad_norm": 2.3070755004882812,
"learning_rate": 4.119872839650605e-05,
"loss": 0.3679,
"step": 210
},
{
"epoch": 0.32448377581120946,
"grad_norm": 2.9008662700653076,
"learning_rate": 4.118898736214381e-05,
"loss": 0.3923,
"step": 220
},
{
"epoch": 0.3392330383480826,
"grad_norm": 2.0514321327209473,
"learning_rate": 4.117840102280475e-05,
"loss": 0.3725,
"step": 230
},
{
"epoch": 0.35398230088495575,
"grad_norm": 3.542119264602661,
"learning_rate": 4.116696981365181e-05,
"loss": 0.4415,
"step": 240
},
{
"epoch": 0.3687315634218289,
"grad_norm": 1.6755036115646362,
"learning_rate": 4.115469420457721e-05,
"loss": 0.2841,
"step": 250
},
{
"epoch": 0.3834808259587021,
"grad_norm": 2.9505436420440674,
"learning_rate": 4.1141574700183186e-05,
"loss": 0.3778,
"step": 260
},
{
"epoch": 0.39823008849557523,
"grad_norm": 2.982377052307129,
"learning_rate": 4.1127611839761155e-05,
"loss": 0.3422,
"step": 270
},
{
"epoch": 0.41297935103244837,
"grad_norm": 1.809727668762207,
"learning_rate": 4.111280619726964e-05,
"loss": 0.3351,
"step": 280
},
{
"epoch": 0.4277286135693215,
"grad_norm": 2.7590482234954834,
"learning_rate": 4.109715838131059e-05,
"loss": 0.3718,
"step": 290
},
{
"epoch": 0.4424778761061947,
"grad_norm": 1.9859437942504883,
"learning_rate": 4.108066903510445e-05,
"loss": 0.3772,
"step": 300
},
{
"epoch": 0.4424778761061947,
"eval_loss": 0.3334226906299591,
"eval_runtime": 43.4221,
"eval_samples_per_second": 6.955,
"eval_steps_per_second": 6.955,
"step": 300
},
{
"epoch": 0.45722713864306785,
"grad_norm": 2.9821550846099854,
"learning_rate": 4.106333883646366e-05,
"loss": 0.3968,
"step": 310
},
{
"epoch": 0.471976401179941,
"grad_norm": 3.6904613971710205,
"learning_rate": 4.104516849776479e-05,
"loss": 0.3127,
"step": 320
},
{
"epoch": 0.48672566371681414,
"grad_norm": 1.0732004642486572,
"learning_rate": 4.1026158765919306e-05,
"loss": 0.4087,
"step": 330
},
{
"epoch": 0.5014749262536873,
"grad_norm": 1.7305632829666138,
"learning_rate": 4.100631042234283e-05,
"loss": 0.4596,
"step": 340
},
{
"epoch": 0.5162241887905604,
"grad_norm": 2.5850343704223633,
"learning_rate": 4.098562428292304e-05,
"loss": 0.3444,
"step": 350
},
{
"epoch": 0.5309734513274337,
"grad_norm": 2.5205276012420654,
"learning_rate": 4.096410119798607e-05,
"loss": 0.4583,
"step": 360
},
{
"epoch": 0.5457227138643068,
"grad_norm": 2.062127113342285,
"learning_rate": 4.094174205226167e-05,
"loss": 0.4003,
"step": 370
},
{
"epoch": 0.56047197640118,
"grad_norm": 3.1239662170410156,
"learning_rate": 4.0918547764846736e-05,
"loss": 0.3674,
"step": 380
},
{
"epoch": 0.5752212389380531,
"grad_norm": 2.7132530212402344,
"learning_rate": 4.089451928916758e-05,
"loss": 0.3639,
"step": 390
},
{
"epoch": 0.5899705014749262,
"grad_norm": 3.0152759552001953,
"learning_rate": 4.0869657612940723e-05,
"loss": 0.2698,
"step": 400
},
{
"epoch": 0.6047197640117994,
"grad_norm": 2.3073341846466064,
"learning_rate": 4.08439637581323e-05,
"loss": 0.439,
"step": 410
},
{
"epoch": 0.6194690265486725,
"grad_norm": 1.7864807844161987,
"learning_rate": 4.081743878091604e-05,
"loss": 0.2919,
"step": 420
},
{
"epoch": 0.6342182890855457,
"grad_norm": 2.861272096633911,
"learning_rate": 4.079008377162988e-05,
"loss": 0.4066,
"step": 430
},
{
"epoch": 0.6489675516224189,
"grad_norm": 1.9505175352096558,
"learning_rate": 4.0761899854731085e-05,
"loss": 0.4823,
"step": 440
},
{
"epoch": 0.6637168141592921,
"grad_norm": 1.8906564712524414,
"learning_rate": 4.073288818875011e-05,
"loss": 0.3265,
"step": 450
},
{
"epoch": 0.6637168141592921,
"eval_loss": 0.30214831233024597,
"eval_runtime": 43.7717,
"eval_samples_per_second": 6.899,
"eval_steps_per_second": 6.899,
"step": 450
},
{
"epoch": 0.6784660766961652,
"grad_norm": 2.3650407791137695,
"learning_rate": 4.070304996624291e-05,
"loss": 0.5034,
"step": 460
},
{
"epoch": 0.6932153392330384,
"grad_norm": 1.9324402809143066,
"learning_rate": 4.067238641374194e-05,
"loss": 0.349,
"step": 470
},
{
"epoch": 0.7079646017699115,
"grad_norm": 2.0116679668426514,
"learning_rate": 4.0640898791705745e-05,
"loss": 0.5409,
"step": 480
},
{
"epoch": 0.7227138643067846,
"grad_norm": 1.6090418100357056,
"learning_rate": 4.060858839446713e-05,
"loss": 0.3821,
"step": 490
},
{
"epoch": 0.7374631268436578,
"grad_norm": 2.2912044525146484,
"learning_rate": 4.057545655017998e-05,
"loss": 0.2578,
"step": 500
},
{
"epoch": 0.7522123893805309,
"grad_norm": 1.67880117893219,
"learning_rate": 4.054150462076465e-05,
"loss": 0.3137,
"step": 510
},
{
"epoch": 0.7669616519174042,
"grad_norm": 1.5085221529006958,
"learning_rate": 4.0506734001851976e-05,
"loss": 0.2617,
"step": 520
},
{
"epoch": 0.7817109144542773,
"grad_norm": 1.5776855945587158,
"learning_rate": 4.0471146122725904e-05,
"loss": 0.3693,
"step": 530
},
{
"epoch": 0.7964601769911505,
"grad_norm": 2.036801815032959,
"learning_rate": 4.043474244626477e-05,
"loss": 0.2956,
"step": 540
},
{
"epoch": 0.8112094395280236,
"grad_norm": 2.230562686920166,
"learning_rate": 4.0397524468881125e-05,
"loss": 0.3842,
"step": 550
},
{
"epoch": 0.8259587020648967,
"grad_norm": 1.4315614700317383,
"learning_rate": 4.0359493720460244e-05,
"loss": 0.3418,
"step": 560
},
{
"epoch": 0.8407079646017699,
"grad_norm": 3.4652326107025146,
"learning_rate": 4.032065176429724e-05,
"loss": 0.3102,
"step": 570
},
{
"epoch": 0.855457227138643,
"grad_norm": 1.9565349817276,
"learning_rate": 4.0281000197032795e-05,
"loss": 0.186,
"step": 580
},
{
"epoch": 0.8702064896755162,
"grad_norm": 2.6647768020629883,
"learning_rate": 4.0240540648587546e-05,
"loss": 0.4584,
"step": 590
},
{
"epoch": 0.8849557522123894,
"grad_norm": 1.2911219596862793,
"learning_rate": 4.019927478209504e-05,
"loss": 0.2314,
"step": 600
},
{
"epoch": 0.8849557522123894,
"eval_loss": 0.29636457562446594,
"eval_runtime": 43.2686,
"eval_samples_per_second": 6.98,
"eval_steps_per_second": 6.98,
"step": 600
},
{
"epoch": 0.8997050147492626,
"grad_norm": 1.422271728515625,
"learning_rate": 4.015720429383344e-05,
"loss": 0.226,
"step": 610
},
{
"epoch": 0.9144542772861357,
"grad_norm": 2.965221881866455,
"learning_rate": 4.0114330913155726e-05,
"loss": 0.4821,
"step": 620
},
{
"epoch": 0.9292035398230089,
"grad_norm": 4.0180158615112305,
"learning_rate": 4.007065640241867e-05,
"loss": 0.2919,
"step": 630
},
{
"epoch": 0.943952802359882,
"grad_norm": 4.945089817047119,
"learning_rate": 4.002618255691033e-05,
"loss": 0.3667,
"step": 640
},
{
"epoch": 0.9587020648967551,
"grad_norm": 1.7440930604934692,
"learning_rate": 3.9980911204776306e-05,
"loss": 0.3945,
"step": 650
},
{
"epoch": 0.9734513274336283,
"grad_norm": 1.846238374710083,
"learning_rate": 3.993484420694458e-05,
"loss": 0.3624,
"step": 660
},
{
"epoch": 0.9882005899705014,
"grad_norm": 2.105642795562744,
"learning_rate": 3.988798345704899e-05,
"loss": 0.3291,
"step": 670
},
{
"epoch": 1.0029498525073746,
"grad_norm": 0.9437566995620728,
"learning_rate": 3.984033088135143e-05,
"loss": 0.2252,
"step": 680
},
{
"epoch": 1.0176991150442478,
"grad_norm": 2.24537992477417,
"learning_rate": 3.979188843866263e-05,
"loss": 0.2462,
"step": 690
},
{
"epoch": 1.0324483775811208,
"grad_norm": 1.486444354057312,
"learning_rate": 3.97426581202617e-05,
"loss": 0.2597,
"step": 700
},
{
"epoch": 1.047197640117994,
"grad_norm": 3.2096400260925293,
"learning_rate": 3.969264194981418e-05,
"loss": 0.2238,
"step": 710
},
{
"epoch": 1.0619469026548674,
"grad_norm": 2.7134766578674316,
"learning_rate": 3.9641841983288953e-05,
"loss": 0.2689,
"step": 720
},
{
"epoch": 1.0766961651917404,
"grad_norm": 0.9915286898612976,
"learning_rate": 3.959026030887367e-05,
"loss": 0.2326,
"step": 730
},
{
"epoch": 1.0914454277286136,
"grad_norm": 2.3304569721221924,
"learning_rate": 3.953789904688893e-05,
"loss": 0.2508,
"step": 740
},
{
"epoch": 1.1061946902654867,
"grad_norm": 0.8963961005210876,
"learning_rate": 3.948476034970113e-05,
"loss": 0.165,
"step": 750
},
{
"epoch": 1.1061946902654867,
"eval_loss": 0.29341065883636475,
"eval_runtime": 43.6734,
"eval_samples_per_second": 6.915,
"eval_steps_per_second": 6.915,
"step": 750
},
{
"epoch": 1.12094395280236,
"grad_norm": 1.2763745784759521,
"learning_rate": 3.943084640163398e-05,
"loss": 0.2356,
"step": 760
},
{
"epoch": 1.135693215339233,
"grad_norm": 1.333479642868042,
"learning_rate": 3.937615941887873e-05,
"loss": 0.2668,
"step": 770
},
{
"epoch": 1.1504424778761062,
"grad_norm": 2.042940378189087,
"learning_rate": 3.932070164940304e-05,
"loss": 0.2435,
"step": 780
},
{
"epoch": 1.1651917404129795,
"grad_norm": 2.6705069541931152,
"learning_rate": 3.926447537285859e-05,
"loss": 0.1938,
"step": 790
},
{
"epoch": 1.1799410029498525,
"grad_norm": 2.413623571395874,
"learning_rate": 3.920748290048739e-05,
"loss": 0.1981,
"step": 800
},
{
"epoch": 1.1946902654867257,
"grad_norm": 1.9013822078704834,
"learning_rate": 3.914972657502677e-05,
"loss": 0.3461,
"step": 810
},
{
"epoch": 1.2094395280235988,
"grad_norm": 2.037879228591919,
"learning_rate": 3.9091208770613036e-05,
"loss": 0.2506,
"step": 820
},
{
"epoch": 1.224188790560472,
"grad_norm": 1.8921838998794556,
"learning_rate": 3.9031931892683937e-05,
"loss": 0.2074,
"step": 830
},
{
"epoch": 1.238938053097345,
"grad_norm": 1.2936807870864868,
"learning_rate": 3.897189837787975e-05,
"loss": 0.2762,
"step": 840
},
{
"epoch": 1.2536873156342183,
"grad_norm": 1.600098967552185,
"learning_rate": 3.891111069394313e-05,
"loss": 0.2381,
"step": 850
},
{
"epoch": 1.2684365781710913,
"grad_norm": 1.6837131977081299,
"learning_rate": 3.884957133961768e-05,
"loss": 0.1811,
"step": 860
},
{
"epoch": 1.2831858407079646,
"grad_norm": 4.942287921905518,
"learning_rate": 3.878728284454522e-05,
"loss": 0.2511,
"step": 870
},
{
"epoch": 1.2979351032448379,
"grad_norm": 1.305611491203308,
"learning_rate": 3.872424776916183e-05,
"loss": 0.2289,
"step": 880
},
{
"epoch": 1.3126843657817109,
"grad_norm": 2.5036911964416504,
"learning_rate": 3.866046870459253e-05,
"loss": 0.4063,
"step": 890
},
{
"epoch": 1.3274336283185841,
"grad_norm": 2.1621756553649902,
"learning_rate": 3.8595948272544905e-05,
"loss": 0.2515,
"step": 900
},
{
"epoch": 1.3274336283185841,
"eval_loss": 0.278595894575119,
"eval_runtime": 43.7764,
"eval_samples_per_second": 6.899,
"eval_steps_per_second": 6.899,
"step": 900
},
{
"epoch": 1.3421828908554572,
"grad_norm": 0.7632271647453308,
"learning_rate": 3.8530689125201184e-05,
"loss": 0.1884,
"step": 910
},
{
"epoch": 1.3569321533923304,
"grad_norm": 1.29710054397583,
"learning_rate": 3.8464693945109305e-05,
"loss": 0.225,
"step": 920
},
{
"epoch": 1.3716814159292037,
"grad_norm": 2.357658863067627,
"learning_rate": 3.839796544507265e-05,
"loss": 0.3185,
"step": 930
},
{
"epoch": 1.3864306784660767,
"grad_norm": 1.8373112678527832,
"learning_rate": 3.833050636803849e-05,
"loss": 0.29,
"step": 940
},
{
"epoch": 1.4011799410029497,
"grad_norm": 1.9975624084472656,
"learning_rate": 3.826231948698527e-05,
"loss": 0.3203,
"step": 950
},
{
"epoch": 1.415929203539823,
"grad_norm": 3.1040427684783936,
"learning_rate": 3.819340760480859e-05,
"loss": 0.2454,
"step": 960
},
{
"epoch": 1.4306784660766962,
"grad_norm": 1.599753737449646,
"learning_rate": 3.812377355420602e-05,
"loss": 0.2825,
"step": 970
},
{
"epoch": 1.4454277286135693,
"grad_norm": 1.1874561309814453,
"learning_rate": 3.805342019756065e-05,
"loss": 0.1932,
"step": 980
},
{
"epoch": 1.4601769911504425,
"grad_norm": 1.8277095556259155,
"learning_rate": 3.7982350426823406e-05,
"loss": 0.2014,
"step": 990
},
{
"epoch": 1.4749262536873156,
"grad_norm": 0.8104329109191895,
"learning_rate": 3.791056716339421e-05,
"loss": 0.2486,
"step": 1000
},
{
"epoch": 1.4896755162241888,
"grad_norm": 2.973177194595337,
"learning_rate": 3.783807335800187e-05,
"loss": 0.2373,
"step": 1010
},
{
"epoch": 1.504424778761062,
"grad_norm": 1.6856945753097534,
"learning_rate": 3.776487199058277e-05,
"loss": 0.2203,
"step": 1020
},
{
"epoch": 1.519174041297935,
"grad_norm": 2.4095230102539062,
"learning_rate": 3.769096607015843e-05,
"loss": 0.2813,
"step": 1030
},
{
"epoch": 1.5339233038348081,
"grad_norm": 2.658792495727539,
"learning_rate": 3.761635863471175e-05,
"loss": 0.2552,
"step": 1040
},
{
"epoch": 1.5486725663716814,
"grad_norm": 2.120602607727051,
"learning_rate": 3.754105275106222e-05,
"loss": 0.3001,
"step": 1050
},
{
"epoch": 1.5486725663716814,
"eval_loss": 0.2679256796836853,
"eval_runtime": 43.6015,
"eval_samples_per_second": 6.926,
"eval_steps_per_second": 6.926,
"step": 1050
},
{
"epoch": 1.5634218289085546,
"grad_norm": 2.71559476852417,
"learning_rate": 3.746505151473972e-05,
"loss": 0.3008,
"step": 1060
},
{
"epoch": 1.5781710914454279,
"grad_norm": 2.8023104667663574,
"learning_rate": 3.738835804985743e-05,
"loss": 0.215,
"step": 1070
},
{
"epoch": 1.592920353982301,
"grad_norm": 1.7893186807632446,
"learning_rate": 3.731097550898329e-05,
"loss": 0.1894,
"step": 1080
},
{
"epoch": 1.607669616519174,
"grad_norm": 1.8876107931137085,
"learning_rate": 3.723290707301047e-05,
"loss": 0.1824,
"step": 1090
},
{
"epoch": 1.6224188790560472,
"grad_norm": 1.5161499977111816,
"learning_rate": 3.7154155951026605e-05,
"loss": 0.192,
"step": 1100
},
{
"epoch": 1.6371681415929205,
"grad_norm": 2.110395908355713,
"learning_rate": 3.707472538018187e-05,
"loss": 0.2047,
"step": 1110
},
{
"epoch": 1.6519174041297935,
"grad_norm": 2.1229217052459717,
"learning_rate": 3.6994618625555925e-05,
"loss": 0.1545,
"step": 1120
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.7134130001068115,
"learning_rate": 3.691383898002368e-05,
"loss": 0.2392,
"step": 1130
},
{
"epoch": 1.6814159292035398,
"grad_norm": 1.5704232454299927,
"learning_rate": 3.683238976412e-05,
"loss": 0.1984,
"step": 1140
},
{
"epoch": 1.696165191740413,
"grad_norm": 1.9206628799438477,
"learning_rate": 3.675027432590312e-05,
"loss": 0.2669,
"step": 1150
},
{
"epoch": 1.7109144542772863,
"grad_norm": 2.9219446182250977,
"learning_rate": 3.666749604081707e-05,
"loss": 0.1978,
"step": 1160
},
{
"epoch": 1.7256637168141593,
"grad_norm": 0.8861755728721619,
"learning_rate": 3.6584058311552954e-05,
"loss": 0.1588,
"step": 1170
},
{
"epoch": 1.7404129793510323,
"grad_norm": 3.052067279815674,
"learning_rate": 3.6499964567909e-05,
"loss": 0.1948,
"step": 1180
},
{
"epoch": 1.7551622418879056,
"grad_norm": 2.0168213844299316,
"learning_rate": 3.641521826664964e-05,
"loss": 0.2793,
"step": 1190
},
{
"epoch": 1.7699115044247788,
"grad_norm": 3.0683932304382324,
"learning_rate": 3.63298228913634e-05,
"loss": 0.2384,
"step": 1200
},
{
"epoch": 1.7699115044247788,
"eval_loss": 0.25286030769348145,
"eval_runtime": 43.7965,
"eval_samples_per_second": 6.896,
"eval_steps_per_second": 6.896,
"step": 1200
},
{
"epoch": 1.7846607669616519,
"grad_norm": 2.649764060974121,
"learning_rate": 3.624378195231967e-05,
"loss": 0.3089,
"step": 1210
},
{
"epoch": 1.799410029498525,
"grad_norm": 1.9995458126068115,
"learning_rate": 3.615709898632448e-05,
"loss": 0.2291,
"step": 1220
},
{
"epoch": 1.8141592920353982,
"grad_norm": 2.075753927230835,
"learning_rate": 3.606977755657502e-05,
"loss": 0.2188,
"step": 1230
},
{
"epoch": 1.8289085545722714,
"grad_norm": 3.202075958251953,
"learning_rate": 3.5981821252513274e-05,
"loss": 0.3073,
"step": 1240
},
{
"epoch": 1.8436578171091447,
"grad_norm": 2.0560741424560547,
"learning_rate": 3.5893233689678384e-05,
"loss": 0.2288,
"step": 1250
},
{
"epoch": 1.8584070796460177,
"grad_norm": 2.9390640258789062,
"learning_rate": 3.5804018509558095e-05,
"loss": 0.3001,
"step": 1260
},
{
"epoch": 1.8731563421828907,
"grad_norm": 0.6466512084007263,
"learning_rate": 3.571417937943903e-05,
"loss": 0.1617,
"step": 1270
},
{
"epoch": 1.887905604719764,
"grad_norm": 1.9550355672836304,
"learning_rate": 3.562371999225594e-05,
"loss": 0.2687,
"step": 1280
},
{
"epoch": 1.9026548672566372,
"grad_norm": 2.3754589557647705,
"learning_rate": 3.553264406643995e-05,
"loss": 0.181,
"step": 1290
},
{
"epoch": 1.9174041297935103,
"grad_norm": 1.620802640914917,
"learning_rate": 3.544095534576563e-05,
"loss": 0.2422,
"step": 1300
},
{
"epoch": 1.9321533923303835,
"grad_norm": 1.5539398193359375,
"learning_rate": 3.534865759919718e-05,
"loss": 0.1669,
"step": 1310
},
{
"epoch": 1.9469026548672566,
"grad_norm": 2.4959328174591064,
"learning_rate": 3.525575462073344e-05,
"loss": 0.2058,
"step": 1320
},
{
"epoch": 1.9616519174041298,
"grad_norm": 2.10261607170105,
"learning_rate": 3.516225022925199e-05,
"loss": 0.2412,
"step": 1330
},
{
"epoch": 1.976401179941003,
"grad_norm": 2.3935513496398926,
"learning_rate": 3.5068148268352135e-05,
"loss": 0.221,
"step": 1340
},
{
"epoch": 1.991150442477876,
"grad_norm": 1.9170893430709839,
"learning_rate": 3.497345260619691e-05,
"loss": 0.1804,
"step": 1350
},
{
"epoch": 1.991150442477876,
"eval_loss": 0.24877212941646576,
"eval_runtime": 43.1775,
"eval_samples_per_second": 6.994,
"eval_steps_per_second": 6.994,
"step": 1350
},
{
"epoch": 2.005899705014749,
"grad_norm": 1.3050181865692139,
"learning_rate": 3.487816713535409e-05,
"loss": 0.1889,
"step": 1360
},
{
"epoch": 2.0206489675516224,
"grad_norm": 6.0348219871521,
"learning_rate": 3.478229577263617e-05,
"loss": 0.1382,
"step": 1370
},
{
"epoch": 2.0353982300884956,
"grad_norm": 2.732297897338867,
"learning_rate": 3.4685842458939365e-05,
"loss": 0.1052,
"step": 1380
},
{
"epoch": 2.050147492625369,
"grad_norm": 2.3690683841705322,
"learning_rate": 3.458881115908164e-05,
"loss": 0.1049,
"step": 1390
},
{
"epoch": 2.0648967551622417,
"grad_norm": 1.0060967206954956,
"learning_rate": 3.449120586163966e-05,
"loss": 0.1413,
"step": 1400
},
{
"epoch": 2.079646017699115,
"grad_norm": 2.224308967590332,
"learning_rate": 3.439303057878493e-05,
"loss": 0.1162,
"step": 1410
},
{
"epoch": 2.094395280235988,
"grad_norm": 1.902258038520813,
"learning_rate": 3.429428934611879e-05,
"loss": 0.1231,
"step": 1420
},
{
"epoch": 2.1091445427728615,
"grad_norm": 3.415437936782837,
"learning_rate": 3.419498622250657e-05,
"loss": 0.1914,
"step": 1430
},
{
"epoch": 2.1238938053097347,
"grad_norm": 2.006056070327759,
"learning_rate": 3.409512528991075e-05,
"loss": 0.1494,
"step": 1440
},
{
"epoch": 2.1386430678466075,
"grad_norm": 2.260693311691284,
"learning_rate": 3.399471065322314e-05,
"loss": 0.1251,
"step": 1450
},
{
"epoch": 2.1533923303834808,
"grad_norm": 2.281259298324585,
"learning_rate": 3.3893746440096144e-05,
"loss": 0.1238,
"step": 1460
},
{
"epoch": 2.168141592920354,
"grad_norm": 1.1317875385284424,
"learning_rate": 3.3792236800773114e-05,
"loss": 0.1144,
"step": 1470
},
{
"epoch": 2.1828908554572273,
"grad_norm": 2.4440741539001465,
"learning_rate": 3.369018590791776e-05,
"loss": 0.1123,
"step": 1480
},
{
"epoch": 2.1976401179941005,
"grad_norm": 1.8043389320373535,
"learning_rate": 3.358759795644255e-05,
"loss": 0.1347,
"step": 1490
},
{
"epoch": 2.2123893805309733,
"grad_norm": 3.1604647636413574,
"learning_rate": 3.3484477163336383e-05,
"loss": 0.1287,
"step": 1500
},
{
"epoch": 2.2123893805309733,
"eval_loss": 0.25868239998817444,
"eval_runtime": 43.8975,
"eval_samples_per_second": 6.88,
"eval_steps_per_second": 6.88,
"step": 1500
},
{
"epoch": 2.2271386430678466,
"grad_norm": 2.021027088165283,
"learning_rate": 3.338082776749115e-05,
"loss": 0.1872,
"step": 1510
},
{
"epoch": 2.24188790560472,
"grad_norm": 1.5057893991470337,
"learning_rate": 3.327665402952756e-05,
"loss": 0.1177,
"step": 1520
},
{
"epoch": 2.256637168141593,
"grad_norm": 1.7474180459976196,
"learning_rate": 3.317196023161996e-05,
"loss": 0.1443,
"step": 1530
},
{
"epoch": 2.271386430678466,
"grad_norm": 1.8646279573440552,
"learning_rate": 3.306675067732031e-05,
"loss": 0.1386,
"step": 1540
},
{
"epoch": 2.286135693215339,
"grad_norm": 0.44584158062934875,
"learning_rate": 3.296102969138133e-05,
"loss": 0.1612,
"step": 1550
},
{
"epoch": 2.3008849557522124,
"grad_norm": 0.7199454307556152,
"learning_rate": 3.285480161957865e-05,
"loss": 0.1217,
"step": 1560
},
{
"epoch": 2.3156342182890857,
"grad_norm": 1.7795122861862183,
"learning_rate": 3.274807082853226e-05,
"loss": 0.1666,
"step": 1570
},
{
"epoch": 2.330383480825959,
"grad_norm": 1.9658058881759644,
"learning_rate": 3.264084170552692e-05,
"loss": 0.1036,
"step": 1580
},
{
"epoch": 2.3451327433628317,
"grad_norm": 0.996110200881958,
"learning_rate": 3.25331186583319e-05,
"loss": 0.1381,
"step": 1590
},
{
"epoch": 2.359882005899705,
"grad_norm": 5.947177410125732,
"learning_rate": 3.242490611501975e-05,
"loss": 0.1579,
"step": 1600
},
{
"epoch": 2.3746312684365782,
"grad_norm": 3.3651394844055176,
"learning_rate": 3.231620852378428e-05,
"loss": 0.165,
"step": 1610
},
{
"epoch": 2.3893805309734515,
"grad_norm": 1.959834098815918,
"learning_rate": 3.220703035275773e-05,
"loss": 0.1379,
"step": 1620
},
{
"epoch": 2.4041297935103243,
"grad_norm": 2.7289717197418213,
"learning_rate": 3.209737608982709e-05,
"loss": 0.196,
"step": 1630
},
{
"epoch": 2.4188790560471976,
"grad_norm": 4.184900283813477,
"learning_rate": 3.1987250242449616e-05,
"loss": 0.1701,
"step": 1640
},
{
"epoch": 2.433628318584071,
"grad_norm": 2.6607248783111572,
"learning_rate": 3.1876657337467564e-05,
"loss": 0.1057,
"step": 1650
},
{
"epoch": 2.433628318584071,
"eval_loss": 0.2500734031200409,
"eval_runtime": 44.0,
"eval_samples_per_second": 6.864,
"eval_steps_per_second": 6.864,
"step": 1650
},
{
"epoch": 2.448377581120944,
"grad_norm": 3.242337465286255,
"learning_rate": 3.176560192092211e-05,
"loss": 0.1478,
"step": 1660
},
{
"epoch": 2.4631268436578173,
"grad_norm": 2.5350615978240967,
"learning_rate": 3.165408855786642e-05,
"loss": 0.1988,
"step": 1670
},
{
"epoch": 2.47787610619469,
"grad_norm": 2.062612295150757,
"learning_rate": 3.154212183217812e-05,
"loss": 0.1228,
"step": 1680
},
{
"epoch": 2.4926253687315634,
"grad_norm": 2.87460994720459,
"learning_rate": 3.142970634637072e-05,
"loss": 0.1057,
"step": 1690
},
{
"epoch": 2.5073746312684366,
"grad_norm": 1.2329528331756592,
"learning_rate": 3.131684672140458e-05,
"loss": 0.1142,
"step": 1700
},
{
"epoch": 2.52212389380531,
"grad_norm": 2.1753523349761963,
"learning_rate": 3.120354759649682e-05,
"loss": 0.1065,
"step": 1710
},
{
"epoch": 2.5368731563421827,
"grad_norm": 2.7612011432647705,
"learning_rate": 3.1089813628930695e-05,
"loss": 0.189,
"step": 1720
},
{
"epoch": 2.551622418879056,
"grad_norm": 1.8067593574523926,
"learning_rate": 3.097564949386416e-05,
"loss": 0.1085,
"step": 1730
},
{
"epoch": 2.566371681415929,
"grad_norm": 2.544847011566162,
"learning_rate": 3.086105988413766e-05,
"loss": 0.2134,
"step": 1740
},
{
"epoch": 2.5811209439528024,
"grad_norm": 0.837935209274292,
"learning_rate": 3.074604951008122e-05,
"loss": 0.0964,
"step": 1750
},
{
"epoch": 2.5958702064896757,
"grad_norm": 1.2727034091949463,
"learning_rate": 3.063062309932086e-05,
"loss": 0.1179,
"step": 1760
},
{
"epoch": 2.6106194690265485,
"grad_norm": 2.784679651260376,
"learning_rate": 3.0514785396584238e-05,
"loss": 0.1062,
"step": 1770
},
{
"epoch": 2.6253687315634218,
"grad_norm": 1.9828710556030273,
"learning_rate": 3.0398541163505598e-05,
"loss": 0.1325,
"step": 1780
},
{
"epoch": 2.640117994100295,
"grad_norm": 1.7931956052780151,
"learning_rate": 3.028189517843007e-05,
"loss": 0.1326,
"step": 1790
},
{
"epoch": 2.6548672566371683,
"grad_norm": 2.979793071746826,
"learning_rate": 3.0164852236217233e-05,
"loss": 0.1903,
"step": 1800
},
{
"epoch": 2.6548672566371683,
"eval_loss": 0.2538958191871643,
"eval_runtime": 43.8985,
"eval_samples_per_second": 6.879,
"eval_steps_per_second": 6.879,
"step": 1800
},
{
"epoch": 2.669616519174041,
"grad_norm": 1.796676516532898,
"learning_rate": 3.0047417148044e-05,
"loss": 0.125,
"step": 1810
},
{
"epoch": 2.6843657817109143,
"grad_norm": 3.8405046463012695,
"learning_rate": 2.99295947412069e-05,
"loss": 0.119,
"step": 1820
},
{
"epoch": 2.6991150442477876,
"grad_norm": 2.2203421592712402,
"learning_rate": 2.9811389858923593e-05,
"loss": 0.1104,
"step": 1830
},
{
"epoch": 2.713864306784661,
"grad_norm": 1.727965235710144,
"learning_rate": 2.9692807360133822e-05,
"loss": 0.1523,
"step": 1840
},
{
"epoch": 2.728613569321534,
"grad_norm": 2.0604889392852783,
"learning_rate": 2.9573852119299634e-05,
"loss": 0.1517,
"step": 1850
},
{
"epoch": 2.7433628318584073,
"grad_norm": 2.538522243499756,
"learning_rate": 2.9454529026205092e-05,
"loss": 0.1528,
"step": 1860
},
{
"epoch": 2.75811209439528,
"grad_norm": 1.8873978853225708,
"learning_rate": 2.9334842985755173e-05,
"loss": 0.1234,
"step": 1870
},
{
"epoch": 2.7728613569321534,
"grad_norm": 1.9439120292663574,
"learning_rate": 2.921479891777423e-05,
"loss": 0.1629,
"step": 1880
},
{
"epoch": 2.7876106194690267,
"grad_norm": 1.4135169982910156,
"learning_rate": 2.9094401756803725e-05,
"loss": 0.1266,
"step": 1890
},
{
"epoch": 2.8023598820058995,
"grad_norm": 3.5280425548553467,
"learning_rate": 2.8973656451899372e-05,
"loss": 0.1209,
"step": 1900
},
{
"epoch": 2.8171091445427727,
"grad_norm": 3.0947368144989014,
"learning_rate": 2.8852567966427735e-05,
"loss": 0.1104,
"step": 1910
},
{
"epoch": 2.831858407079646,
"grad_norm": 2.14790940284729,
"learning_rate": 2.8731141277862174e-05,
"loss": 0.1489,
"step": 1920
},
{
"epoch": 2.8466076696165192,
"grad_norm": 2.191667079925537,
"learning_rate": 2.8609381377578267e-05,
"loss": 0.0923,
"step": 1930
},
{
"epoch": 2.8613569321533925,
"grad_norm": 1.2369720935821533,
"learning_rate": 2.848729327064861e-05,
"loss": 0.0907,
"step": 1940
},
{
"epoch": 2.8761061946902657,
"grad_norm": 3.589946746826172,
"learning_rate": 2.8364881975637094e-05,
"loss": 0.1721,
"step": 1950
},
{
"epoch": 2.8761061946902657,
"eval_loss": 0.2451845407485962,
"eval_runtime": 43.8481,
"eval_samples_per_second": 6.887,
"eval_steps_per_second": 6.887,
"step": 1950
},
{
"epoch": 2.8908554572271385,
"grad_norm": 2.7611799240112305,
"learning_rate": 2.82421525243926e-05,
"loss": 0.1254,
"step": 1960
},
{
"epoch": 2.905604719764012,
"grad_norm": 1.5316367149353027,
"learning_rate": 2.8119109961842176e-05,
"loss": 0.1056,
"step": 1970
},
{
"epoch": 2.920353982300885,
"grad_norm": 1.325637698173523,
"learning_rate": 2.799575934578365e-05,
"loss": 0.1512,
"step": 1980
},
{
"epoch": 2.935103244837758,
"grad_norm": 2.2634294033050537,
"learning_rate": 2.7872105746677694e-05,
"loss": 0.1763,
"step": 1990
},
{
"epoch": 2.949852507374631,
"grad_norm": 1.410277009010315,
"learning_rate": 2.774815424743947e-05,
"loss": 0.2031,
"step": 2000
},
{
"epoch": 2.9646017699115044,
"grad_norm": 3.4232828617095947,
"learning_rate": 2.762390994322962e-05,
"loss": 0.1413,
"step": 2010
},
{
"epoch": 2.9793510324483776,
"grad_norm": 1.2643831968307495,
"learning_rate": 2.749937794124486e-05,
"loss": 0.1268,
"step": 2020
},
{
"epoch": 2.994100294985251,
"grad_norm": 3.378948926925659,
"learning_rate": 2.7374563360508036e-05,
"loss": 0.1299,
"step": 2030
},
{
"epoch": 3.0088495575221237,
"grad_norm": 2.396994113922119,
"learning_rate": 2.7249471331657693e-05,
"loss": 0.0974,
"step": 2040
},
{
"epoch": 3.023598820058997,
"grad_norm": 1.482927918434143,
"learning_rate": 2.712410699673718e-05,
"loss": 0.098,
"step": 2050
},
{
"epoch": 3.03834808259587,
"grad_norm": 1.5194566249847412,
"learning_rate": 2.699847550898329e-05,
"loss": 0.0639,
"step": 2060
},
{
"epoch": 3.0530973451327434,
"grad_norm": 1.4694433212280273,
"learning_rate": 2.6872582032614426e-05,
"loss": 0.1063,
"step": 2070
},
{
"epoch": 3.0678466076696167,
"grad_norm": 0.5857545137405396,
"learning_rate": 2.6746431742618305e-05,
"loss": 0.0709,
"step": 2080
},
{
"epoch": 3.0825958702064895,
"grad_norm": 2.3842179775238037,
"learning_rate": 2.6620029824539257e-05,
"loss": 0.1083,
"step": 2090
},
{
"epoch": 3.0973451327433628,
"grad_norm": 1.8553909063339233,
"learning_rate": 2.6493381474265044e-05,
"loss": 0.0786,
"step": 2100
},
{
"epoch": 3.0973451327433628,
"eval_loss": 0.2644558846950531,
"eval_runtime": 43.8036,
"eval_samples_per_second": 6.894,
"eval_steps_per_second": 6.894,
"step": 2100
},
{
"epoch": 3.112094395280236,
"grad_norm": 0.7660773992538452,
"learning_rate": 2.636649189781331e-05,
"loss": 0.0781,
"step": 2110
},
{
"epoch": 3.1268436578171093,
"grad_norm": 2.0916807651519775,
"learning_rate": 2.6239366311117528e-05,
"loss": 0.129,
"step": 2120
},
{
"epoch": 3.1415929203539825,
"grad_norm": 1.9852261543273926,
"learning_rate": 2.6112009939812672e-05,
"loss": 0.0631,
"step": 2130
},
{
"epoch": 3.1563421828908553,
"grad_norm": 1.6749731302261353,
"learning_rate": 2.5984428019020343e-05,
"loss": 0.0932,
"step": 2140
},
{
"epoch": 3.1710914454277286,
"grad_norm": 2.2517542839050293,
"learning_rate": 2.5856625793133585e-05,
"loss": 0.0752,
"step": 2150
},
{
"epoch": 3.185840707964602,
"grad_norm": 3.476151466369629,
"learning_rate": 2.5728608515601357e-05,
"loss": 0.0657,
"step": 2160
},
{
"epoch": 3.200589970501475,
"grad_norm": 8.088579177856445,
"learning_rate": 2.560038144871252e-05,
"loss": 0.1153,
"step": 2170
},
{
"epoch": 3.215339233038348,
"grad_norm": 2.3388564586639404,
"learning_rate": 2.547194986337956e-05,
"loss": 0.0684,
"step": 2180
},
{
"epoch": 3.230088495575221,
"grad_norm": 2.223625421524048,
"learning_rate": 2.5343319038921927e-05,
"loss": 0.0712,
"step": 2190
},
{
"epoch": 3.2448377581120944,
"grad_norm": 1.7080127000808716,
"learning_rate": 2.521449426284898e-05,
"loss": 0.0701,
"step": 2200
},
{
"epoch": 3.2595870206489677,
"grad_norm": 1.1411865949630737,
"learning_rate": 2.5085480830642722e-05,
"loss": 0.0773,
"step": 2210
},
{
"epoch": 3.274336283185841,
"grad_norm": 3.657041072845459,
"learning_rate": 2.4956284045540015e-05,
"loss": 0.0779,
"step": 2220
},
{
"epoch": 3.2890855457227137,
"grad_norm": 1.847219705581665,
"learning_rate": 2.4826909218314684e-05,
"loss": 0.0656,
"step": 2230
},
{
"epoch": 3.303834808259587,
"grad_norm": 1.5679715871810913,
"learning_rate": 2.4697361667059132e-05,
"loss": 0.0741,
"step": 2240
},
{
"epoch": 3.3185840707964602,
"grad_norm": 2.4843060970306396,
"learning_rate": 2.4567646716965808e-05,
"loss": 0.077,
"step": 2250
},
{
"epoch": 3.3185840707964602,
"eval_loss": 0.2629007399082184,
"eval_runtime": 43.9182,
"eval_samples_per_second": 6.876,
"eval_steps_per_second": 6.876,
"step": 2250
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.421711802482605,
"learning_rate": 2.443776970010823e-05,
"loss": 0.0612,
"step": 2260
},
{
"epoch": 3.3480825958702063,
"grad_norm": 1.3979884386062622,
"learning_rate": 2.430773595522188e-05,
"loss": 0.0962,
"step": 2270
},
{
"epoch": 3.3628318584070795,
"grad_norm": 2.23537540435791,
"learning_rate": 2.4177550827484704e-05,
"loss": 0.1082,
"step": 2280
},
{
"epoch": 3.377581120943953,
"grad_norm": 0.7765734195709229,
"learning_rate": 2.4047219668297402e-05,
"loss": 0.1427,
"step": 2290
},
{
"epoch": 3.392330383480826,
"grad_norm": 1.6800991296768188,
"learning_rate": 2.3916747835063446e-05,
"loss": 0.0701,
"step": 2300
},
{
"epoch": 3.4070796460176993,
"grad_norm": 3.2461161613464355,
"learning_rate": 2.3786140690968887e-05,
"loss": 0.0997,
"step": 2310
},
{
"epoch": 3.421828908554572,
"grad_norm": 6.6169257164001465,
"learning_rate": 2.3655403604761872e-05,
"loss": 0.0774,
"step": 2320
},
{
"epoch": 3.4365781710914454,
"grad_norm": 0.5882118940353394,
"learning_rate": 2.3524541950531934e-05,
"loss": 0.0857,
"step": 2330
},
{
"epoch": 3.4513274336283186,
"grad_norm": 1.4368208646774292,
"learning_rate": 2.3393561107489144e-05,
"loss": 0.1152,
"step": 2340
},
{
"epoch": 3.466076696165192,
"grad_norm": 1.8174480199813843,
"learning_rate": 2.3262466459742938e-05,
"loss": 0.0788,
"step": 2350
},
{
"epoch": 3.4808259587020647,
"grad_norm": 4.503509044647217,
"learning_rate": 2.313126339608082e-05,
"loss": 0.0961,
"step": 2360
},
{
"epoch": 3.495575221238938,
"grad_norm": 1.568178415298462,
"learning_rate": 2.2999957309746853e-05,
"loss": 0.0868,
"step": 2370
},
{
"epoch": 3.510324483775811,
"grad_norm": 2.2628488540649414,
"learning_rate": 2.286855359821995e-05,
"loss": 0.0766,
"step": 2380
},
{
"epoch": 3.5250737463126844,
"grad_norm": 2.249336004257202,
"learning_rate": 2.273705766299202e-05,
"loss": 0.1004,
"step": 2390
},
{
"epoch": 3.5398230088495577,
"grad_norm": 2.4996466636657715,
"learning_rate": 2.2605474909345937e-05,
"loss": 0.0864,
"step": 2400
},
{
"epoch": 3.5398230088495577,
"eval_loss": 0.2607187330722809,
"eval_runtime": 43.5839,
"eval_samples_per_second": 6.929,
"eval_steps_per_second": 6.929,
"step": 2400
},
{
"epoch": 3.554572271386431,
"grad_norm": 2.188922643661499,
"learning_rate": 2.2473810746133318e-05,
"loss": 0.1021,
"step": 2410
},
{
"epoch": 3.5693215339233038,
"grad_norm": 1.4103885889053345,
"learning_rate": 2.234207058555222e-05,
"loss": 0.1025,
"step": 2420
},
{
"epoch": 3.584070796460177,
"grad_norm": 3.289332389831543,
"learning_rate": 2.221025984292466e-05,
"loss": 0.0835,
"step": 2430
},
{
"epoch": 3.5988200589970503,
"grad_norm": 1.7993652820587158,
"learning_rate": 2.2078383936473987e-05,
"loss": 0.0776,
"step": 2440
},
{
"epoch": 3.613569321533923,
"grad_norm": 2.0649101734161377,
"learning_rate": 2.1946448287102206e-05,
"loss": 0.0973,
"step": 2450
},
{
"epoch": 3.6283185840707963,
"grad_norm": 1.656886339187622,
"learning_rate": 2.18144583181671e-05,
"loss": 0.0863,
"step": 2460
},
{
"epoch": 3.6430678466076696,
"grad_norm": 2.353787899017334,
"learning_rate": 2.168241945525932e-05,
"loss": 0.0962,
"step": 2470
},
{
"epoch": 3.657817109144543,
"grad_norm": 2.442674398422241,
"learning_rate": 2.1550337125979373e-05,
"loss": 0.0828,
"step": 2480
},
{
"epoch": 3.672566371681416,
"grad_norm": 1.9212013483047485,
"learning_rate": 2.1418216759714467e-05,
"loss": 0.0729,
"step": 2490
},
{
"epoch": 3.6873156342182893,
"grad_norm": 3.9996650218963623,
"learning_rate": 2.1286063787415392e-05,
"loss": 0.1083,
"step": 2500
},
{
"epoch": 3.702064896755162,
"grad_norm": 1.5336638689041138,
"learning_rate": 2.115388364137322e-05,
"loss": 0.0758,
"step": 2510
},
{
"epoch": 3.7168141592920354,
"grad_norm": 0.8267903923988342,
"learning_rate": 2.1021681754996045e-05,
"loss": 0.0739,
"step": 2520
},
{
"epoch": 3.7315634218289087,
"grad_norm": 0.9670748114585876,
"learning_rate": 2.0889463562585625e-05,
"loss": 0.085,
"step": 2530
},
{
"epoch": 3.7463126843657815,
"grad_norm": 1.723310947418213,
"learning_rate": 2.075723449911398e-05,
"loss": 0.066,
"step": 2540
},
{
"epoch": 3.7610619469026547,
"grad_norm": 2.4782888889312744,
"learning_rate": 2.0625e-05,
"loss": 0.0905,
"step": 2550
},
{
"epoch": 3.7610619469026547,
"eval_loss": 0.2644849419593811,
"eval_runtime": 43.9466,
"eval_samples_per_second": 6.872,
"eval_steps_per_second": 6.872,
"step": 2550
},
{
"epoch": 3.775811209439528,
"grad_norm": 2.117515802383423,
"learning_rate": 2.049276550088602e-05,
"loss": 0.0745,
"step": 2560
},
{
"epoch": 3.7905604719764012,
"grad_norm": 2.7794859409332275,
"learning_rate": 2.0360536437414375e-05,
"loss": 0.0939,
"step": 2570
},
{
"epoch": 3.8053097345132745,
"grad_norm": 1.0744003057479858,
"learning_rate": 2.0228318245003955e-05,
"loss": 0.0675,
"step": 2580
},
{
"epoch": 3.8200589970501477,
"grad_norm": 1.0996911525726318,
"learning_rate": 2.0096116358626783e-05,
"loss": 0.0662,
"step": 2590
},
{
"epoch": 3.8348082595870205,
"grad_norm": 0.7364205718040466,
"learning_rate": 1.996393621258461e-05,
"loss": 0.0553,
"step": 2600
},
{
"epoch": 3.849557522123894,
"grad_norm": 1.5476138591766357,
"learning_rate": 1.9831783240285533e-05,
"loss": 0.0942,
"step": 2610
},
{
"epoch": 3.864306784660767,
"grad_norm": 1.6006284952163696,
"learning_rate": 1.9699662874020637e-05,
"loss": 0.0969,
"step": 2620
},
{
"epoch": 3.87905604719764,
"grad_norm": 0.5195714831352234,
"learning_rate": 1.9567580544740682e-05,
"loss": 0.0875,
"step": 2630
},
{
"epoch": 3.893805309734513,
"grad_norm": 0.9594073295593262,
"learning_rate": 1.9435541681832906e-05,
"loss": 0.0713,
"step": 2640
},
{
"epoch": 3.9085545722713864,
"grad_norm": 0.8206641674041748,
"learning_rate": 1.9303551712897798e-05,
"loss": 0.1369,
"step": 2650
},
{
"epoch": 3.9233038348082596,
"grad_norm": 2.239541530609131,
"learning_rate": 1.9171616063526012e-05,
"loss": 0.0837,
"step": 2660
},
{
"epoch": 3.938053097345133,
"grad_norm": 1.1113237142562866,
"learning_rate": 1.9039740157075348e-05,
"loss": 0.0671,
"step": 2670
},
{
"epoch": 3.952802359882006,
"grad_norm": 2.035332441329956,
"learning_rate": 1.8907929414447785e-05,
"loss": 0.0955,
"step": 2680
},
{
"epoch": 3.967551622418879,
"grad_norm": 0.7038524150848389,
"learning_rate": 1.8776189253866686e-05,
"loss": 0.0708,
"step": 2690
},
{
"epoch": 3.982300884955752,
"grad_norm": 3.023179054260254,
"learning_rate": 1.8644525090654063e-05,
"loss": 0.0863,
"step": 2700
},
{
"epoch": 3.982300884955752,
"eval_loss": 0.27085980772972107,
"eval_runtime": 43.594,
"eval_samples_per_second": 6.928,
"eval_steps_per_second": 6.928,
"step": 2700
},
{
"epoch": 3.9970501474926254,
"grad_norm": 0.3948822021484375,
"learning_rate": 1.851294233700798e-05,
"loss": 0.058,
"step": 2710
},
{
"epoch": 4.011799410029498,
"grad_norm": 2.1759111881256104,
"learning_rate": 1.8381446401780052e-05,
"loss": 0.0544,
"step": 2720
},
{
"epoch": 4.0265486725663715,
"grad_norm": 1.0792875289916992,
"learning_rate": 1.825004269025315e-05,
"loss": 0.0438,
"step": 2730
},
{
"epoch": 4.041297935103245,
"grad_norm": 0.6035469770431519,
"learning_rate": 1.811873660391918e-05,
"loss": 0.068,
"step": 2740
},
{
"epoch": 4.056047197640118,
"grad_norm": 2.127488613128662,
"learning_rate": 1.7987533540257062e-05,
"loss": 0.0842,
"step": 2750
},
{
"epoch": 4.070796460176991,
"grad_norm": 0.9804584980010986,
"learning_rate": 1.7856438892510862e-05,
"loss": 0.0512,
"step": 2760
},
{
"epoch": 4.0855457227138645,
"grad_norm": 0.3105282187461853,
"learning_rate": 1.772545804946807e-05,
"loss": 0.0632,
"step": 2770
},
{
"epoch": 4.100294985250738,
"grad_norm": 0.9263339638710022,
"learning_rate": 1.759459639523813e-05,
"loss": 0.0495,
"step": 2780
},
{
"epoch": 4.115044247787611,
"grad_norm": 0.42594772577285767,
"learning_rate": 1.7463859309031106e-05,
"loss": 0.0467,
"step": 2790
},
{
"epoch": 4.129793510324483,
"grad_norm": 1.5321959257125854,
"learning_rate": 1.7333252164936557e-05,
"loss": 0.0622,
"step": 2800
},
{
"epoch": 4.144542772861357,
"grad_norm": 0.47825008630752563,
"learning_rate": 1.7202780331702608e-05,
"loss": 0.0667,
"step": 2810
},
{
"epoch": 4.15929203539823,
"grad_norm": 2.095520496368408,
"learning_rate": 1.70724491725153e-05,
"loss": 0.063,
"step": 2820
},
{
"epoch": 4.174041297935103,
"grad_norm": 0.4359021782875061,
"learning_rate": 1.694226404477812e-05,
"loss": 0.0429,
"step": 2830
},
{
"epoch": 4.188790560471976,
"grad_norm": 2.7264065742492676,
"learning_rate": 1.681223029989177e-05,
"loss": 0.0501,
"step": 2840
},
{
"epoch": 4.20353982300885,
"grad_norm": 0.8709071278572083,
"learning_rate": 1.66823532830342e-05,
"loss": 0.0464,
"step": 2850
},
{
"epoch": 4.20353982300885,
"eval_loss": 0.29095226526260376,
"eval_runtime": 43.6544,
"eval_samples_per_second": 6.918,
"eval_steps_per_second": 6.918,
"step": 2850
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.313588061300654e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}