astrollama-3-8b-chat_summary / trainer_state.json
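The JSON below is the Hugging Face Trainer state saved during supervised fine-tuning of this model: 3 epochs and 1182 optimizer steps, with each log_history entry recording the epoch fraction, gradient norm, learning rate, and training loss at that step. As a minimal sketch of how such a file can be inspected, assuming it has been downloaded locally as trainer_state.json (the path and the printed summary are illustrative, not part of the original log):

import json

# A minimal sketch (assumed usage): load the trainer state shown below and
# summarise the training-loss and learning-rate schedule it records.
with open("trainer_state.json") as f:          # assumed local file name
    state = json.load(f)

history = state["log_history"]                 # one dict per logged step
train_logs = [h for h in history if "loss" in h]  # keep entries with a training loss

steps = [h["step"] for h in train_logs]
losses = [h["loss"] for h in train_logs]
lrs = [h["learning_rate"] for h in train_logs]

print(f"logged steps: {len(steps)}")
print(f"first/last training loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"peak learning rate: {max(lrs):.2e}")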
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 15.0,
"global_step": 1182,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025380710659898475,
"grad_norm": 152.0770602205149,
"learning_rate": 2.7777777777777776e-07,
"loss": 1.4538,
"step": 1
},
{
"epoch": 0.005076142131979695,
"grad_norm": 154.43415342456026,
"learning_rate": 5.555555555555555e-07,
"loss": 1.6443,
"step": 2
},
{
"epoch": 0.007614213197969543,
"grad_norm": 83.042923862024,
"learning_rate": 8.333333333333333e-07,
"loss": 1.569,
"step": 3
},
{
"epoch": 0.01015228426395939,
"grad_norm": 168.9469822153038,
"learning_rate": 1.111111111111111e-06,
"loss": 1.5581,
"step": 4
},
{
"epoch": 0.012690355329949238,
"grad_norm": 99.16296768200209,
"learning_rate": 1.3888888888888892e-06,
"loss": 1.5565,
"step": 5
},
{
"epoch": 0.015228426395939087,
"grad_norm": 77.46041883601814,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.834,
"step": 6
},
{
"epoch": 0.017766497461928935,
"grad_norm": 65.11628947265203,
"learning_rate": 1.944444444444445e-06,
"loss": 1.6277,
"step": 7
},
{
"epoch": 0.02030456852791878,
"grad_norm": 28.341949090239343,
"learning_rate": 2.222222222222222e-06,
"loss": 1.5202,
"step": 8
},
{
"epoch": 0.02284263959390863,
"grad_norm": 45.05260802676402,
"learning_rate": 2.5e-06,
"loss": 1.4123,
"step": 9
},
{
"epoch": 0.025380710659898477,
"grad_norm": 28.73735471868825,
"learning_rate": 2.7777777777777783e-06,
"loss": 1.5291,
"step": 10
},
{
"epoch": 0.027918781725888325,
"grad_norm": 68.32933011519918,
"learning_rate": 3.055555555555556e-06,
"loss": 1.2742,
"step": 11
},
{
"epoch": 0.030456852791878174,
"grad_norm": 26.127348649606496,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.357,
"step": 12
},
{
"epoch": 0.03299492385786802,
"grad_norm": 20.661721515738268,
"learning_rate": 3.6111111111111115e-06,
"loss": 1.2094,
"step": 13
},
{
"epoch": 0.03553299492385787,
"grad_norm": 28.64253126706993,
"learning_rate": 3.88888888888889e-06,
"loss": 1.2889,
"step": 14
},
{
"epoch": 0.03807106598984772,
"grad_norm": 11.29417859738243,
"learning_rate": 4.166666666666667e-06,
"loss": 1.1999,
"step": 15
},
{
"epoch": 0.04060913705583756,
"grad_norm": 10.496811646115283,
"learning_rate": 4.444444444444444e-06,
"loss": 1.3311,
"step": 16
},
{
"epoch": 0.04314720812182741,
"grad_norm": 3.222307468960679,
"learning_rate": 4.722222222222222e-06,
"loss": 1.2378,
"step": 17
},
{
"epoch": 0.04568527918781726,
"grad_norm": 5.95588179653401,
"learning_rate": 5e-06,
"loss": 1.2917,
"step": 18
},
{
"epoch": 0.048223350253807105,
"grad_norm": 4.595212148917514,
"learning_rate": 5.2777777777777785e-06,
"loss": 1.2182,
"step": 19
},
{
"epoch": 0.050761421319796954,
"grad_norm": 3.4919855269541316,
"learning_rate": 5.555555555555557e-06,
"loss": 1.1647,
"step": 20
},
{
"epoch": 0.0532994923857868,
"grad_norm": 2.8992797662103706,
"learning_rate": 5.833333333333334e-06,
"loss": 1.2627,
"step": 21
},
{
"epoch": 0.05583756345177665,
"grad_norm": 6.2754366481134,
"learning_rate": 6.111111111111112e-06,
"loss": 1.2512,
"step": 22
},
{
"epoch": 0.0583756345177665,
"grad_norm": 4.8408966557202735,
"learning_rate": 6.3888888888888885e-06,
"loss": 1.109,
"step": 23
},
{
"epoch": 0.06091370558375635,
"grad_norm": 4.794271905930829,
"learning_rate": 6.666666666666667e-06,
"loss": 1.1957,
"step": 24
},
{
"epoch": 0.06345177664974619,
"grad_norm": 3.8422011976344037,
"learning_rate": 6.944444444444445e-06,
"loss": 1.1104,
"step": 25
},
{
"epoch": 0.06598984771573604,
"grad_norm": 2.9555249451534094,
"learning_rate": 7.222222222222223e-06,
"loss": 1.1429,
"step": 26
},
{
"epoch": 0.06852791878172589,
"grad_norm": 3.420765898636476,
"learning_rate": 7.500000000000001e-06,
"loss": 1.0736,
"step": 27
},
{
"epoch": 0.07106598984771574,
"grad_norm": 3.4590250149649053,
"learning_rate": 7.77777777777778e-06,
"loss": 1.1766,
"step": 28
},
{
"epoch": 0.07360406091370558,
"grad_norm": 6.343830946481616,
"learning_rate": 8.055555555555557e-06,
"loss": 1.2733,
"step": 29
},
{
"epoch": 0.07614213197969544,
"grad_norm": 6.9382893747130305,
"learning_rate": 8.333333333333334e-06,
"loss": 1.1572,
"step": 30
},
{
"epoch": 0.07868020304568528,
"grad_norm": 4.116628130544569,
"learning_rate": 8.611111111111112e-06,
"loss": 1.0768,
"step": 31
},
{
"epoch": 0.08121827411167512,
"grad_norm": 3.137655679928552,
"learning_rate": 8.888888888888888e-06,
"loss": 1.2086,
"step": 32
},
{
"epoch": 0.08375634517766498,
"grad_norm": 2.7173186967601377,
"learning_rate": 9.166666666666666e-06,
"loss": 1.0033,
"step": 33
},
{
"epoch": 0.08629441624365482,
"grad_norm": 4.422086595586005,
"learning_rate": 9.444444444444445e-06,
"loss": 1.2143,
"step": 34
},
{
"epoch": 0.08883248730964467,
"grad_norm": 4.287936560092926,
"learning_rate": 9.722222222222223e-06,
"loss": 1.1019,
"step": 35
},
{
"epoch": 0.09137055837563451,
"grad_norm": 4.71773823753788,
"learning_rate": 1e-05,
"loss": 1.0297,
"step": 36
},
{
"epoch": 0.09390862944162437,
"grad_norm": 3.0619027718013845,
"learning_rate": 9.999981212445786e-06,
"loss": 1.2065,
"step": 37
},
{
"epoch": 0.09644670050761421,
"grad_norm": 5.557917883258693,
"learning_rate": 9.999924849924331e-06,
"loss": 1.1469,
"step": 38
},
{
"epoch": 0.09898477157360407,
"grad_norm": 3.3448139583001146,
"learning_rate": 9.999830912859204e-06,
"loss": 1.076,
"step": 39
},
{
"epoch": 0.10152284263959391,
"grad_norm": 3.4576008769883573,
"learning_rate": 9.99969940195634e-06,
"loss": 1.1861,
"step": 40
},
{
"epoch": 0.10406091370558376,
"grad_norm": 2.859503835631421,
"learning_rate": 9.99953031820405e-06,
"loss": 1.1576,
"step": 41
},
{
"epoch": 0.1065989847715736,
"grad_norm": 3.01601354320767,
"learning_rate": 9.999323662872998e-06,
"loss": 1.0347,
"step": 42
},
{
"epoch": 0.10913705583756345,
"grad_norm": 3.5906060809534663,
"learning_rate": 9.999079437516205e-06,
"loss": 1.1774,
"step": 43
},
{
"epoch": 0.1116751269035533,
"grad_norm": 3.813820556604242,
"learning_rate": 9.998797643969031e-06,
"loss": 1.086,
"step": 44
},
{
"epoch": 0.11421319796954314,
"grad_norm": 3.2135626955254954,
"learning_rate": 9.998478284349163e-06,
"loss": 1.0939,
"step": 45
},
{
"epoch": 0.116751269035533,
"grad_norm": 3.038494005068585,
"learning_rate": 9.998121361056588e-06,
"loss": 1.1049,
"step": 46
},
{
"epoch": 0.11928934010152284,
"grad_norm": 4.836679741133791,
"learning_rate": 9.997726876773599e-06,
"loss": 0.9771,
"step": 47
},
{
"epoch": 0.1218274111675127,
"grad_norm": 3.285363024124718,
"learning_rate": 9.99729483446475e-06,
"loss": 1.0107,
"step": 48
},
{
"epoch": 0.12436548223350254,
"grad_norm": 4.762499743457535,
"learning_rate": 9.996825237376852e-06,
"loss": 1.1301,
"step": 49
},
{
"epoch": 0.12690355329949238,
"grad_norm": 4.073333426027345,
"learning_rate": 9.996318089038935e-06,
"loss": 1.0094,
"step": 50
},
{
"epoch": 0.12944162436548223,
"grad_norm": 3.5281106754331906,
"learning_rate": 9.99577339326223e-06,
"loss": 1.107,
"step": 51
},
{
"epoch": 0.1319796954314721,
"grad_norm": 2.388804020934236,
"learning_rate": 9.995191154140136e-06,
"loss": 1.0684,
"step": 52
},
{
"epoch": 0.13451776649746192,
"grad_norm": 3.896150155122405,
"learning_rate": 9.994571376048195e-06,
"loss": 1.0195,
"step": 53
},
{
"epoch": 0.13705583756345177,
"grad_norm": 3.159381315379948,
"learning_rate": 9.993914063644053e-06,
"loss": 1.1289,
"step": 54
},
{
"epoch": 0.13959390862944163,
"grad_norm": 4.078159681191905,
"learning_rate": 9.993219221867426e-06,
"loss": 1.1211,
"step": 55
},
{
"epoch": 0.14213197969543148,
"grad_norm": 5.601995817902489,
"learning_rate": 9.992486855940064e-06,
"loss": 1.0514,
"step": 56
},
{
"epoch": 0.1446700507614213,
"grad_norm": 2.1043015689108846,
"learning_rate": 9.991716971365713e-06,
"loss": 1.0456,
"step": 57
},
{
"epoch": 0.14720812182741116,
"grad_norm": 2.5356517486241827,
"learning_rate": 9.990909573930075e-06,
"loss": 1.192,
"step": 58
},
{
"epoch": 0.14974619289340102,
"grad_norm": 2.7360362373563953,
"learning_rate": 9.990064669700756e-06,
"loss": 0.9797,
"step": 59
},
{
"epoch": 0.15228426395939088,
"grad_norm": 2.208834445890474,
"learning_rate": 9.989182265027232e-06,
"loss": 1.1441,
"step": 60
},
{
"epoch": 0.1548223350253807,
"grad_norm": 3.2583621910518734,
"learning_rate": 9.988262366540792e-06,
"loss": 0.9885,
"step": 61
},
{
"epoch": 0.15736040609137056,
"grad_norm": 3.9359528549878404,
"learning_rate": 9.987304981154493e-06,
"loss": 1.032,
"step": 62
},
{
"epoch": 0.1598984771573604,
"grad_norm": 4.3721493801621465,
"learning_rate": 9.986310116063108e-06,
"loss": 1.0886,
"step": 63
},
{
"epoch": 0.16243654822335024,
"grad_norm": 3.2720085785918624,
"learning_rate": 9.985277778743069e-06,
"loss": 1.0736,
"step": 64
},
{
"epoch": 0.1649746192893401,
"grad_norm": 4.284093394240824,
"learning_rate": 9.984207976952412e-06,
"loss": 1.109,
"step": 65
},
{
"epoch": 0.16751269035532995,
"grad_norm": 2.744141216549521,
"learning_rate": 9.98310071873072e-06,
"loss": 1.0275,
"step": 66
},
{
"epoch": 0.1700507614213198,
"grad_norm": 2.8863340000098474,
"learning_rate": 9.981956012399068e-06,
"loss": 1.0759,
"step": 67
},
{
"epoch": 0.17258883248730963,
"grad_norm": 4.033064317161934,
"learning_rate": 9.980773866559946e-06,
"loss": 1.07,
"step": 68
},
{
"epoch": 0.1751269035532995,
"grad_norm": 3.7971135347720204,
"learning_rate": 9.979554290097201e-06,
"loss": 0.9832,
"step": 69
},
{
"epoch": 0.17766497461928935,
"grad_norm": 2.9283571123362697,
"learning_rate": 9.978297292175984e-06,
"loss": 1.0622,
"step": 70
},
{
"epoch": 0.1802030456852792,
"grad_norm": 2.9862875820274453,
"learning_rate": 9.977002882242657e-06,
"loss": 1.1675,
"step": 71
},
{
"epoch": 0.18274111675126903,
"grad_norm": 2.458693353037259,
"learning_rate": 9.975671070024741e-06,
"loss": 0.9963,
"step": 72
},
{
"epoch": 0.18527918781725888,
"grad_norm": 2.7140438707634162,
"learning_rate": 9.97430186553083e-06,
"loss": 0.9998,
"step": 73
},
{
"epoch": 0.18781725888324874,
"grad_norm": 2.3819474228984596,
"learning_rate": 9.972895279050532e-06,
"loss": 1.1602,
"step": 74
},
{
"epoch": 0.19035532994923857,
"grad_norm": 2.869485463686646,
"learning_rate": 9.971451321154368e-06,
"loss": 1.1476,
"step": 75
},
{
"epoch": 0.19289340101522842,
"grad_norm": 3.5672507886894915,
"learning_rate": 9.969970002693718e-06,
"loss": 1.189,
"step": 76
},
{
"epoch": 0.19543147208121828,
"grad_norm": 4.6249979596983275,
"learning_rate": 9.968451334800718e-06,
"loss": 1.0321,
"step": 77
},
{
"epoch": 0.19796954314720813,
"grad_norm": 2.590141938018056,
"learning_rate": 9.966895328888195e-06,
"loss": 1.1738,
"step": 78
},
{
"epoch": 0.20050761421319796,
"grad_norm": 2.4382892427081226,
"learning_rate": 9.965301996649563e-06,
"loss": 1.0667,
"step": 79
},
{
"epoch": 0.20304568527918782,
"grad_norm": 2.774255541254283,
"learning_rate": 9.96367135005875e-06,
"loss": 0.9998,
"step": 80
},
{
"epoch": 0.20558375634517767,
"grad_norm": 2.4921808717304685,
"learning_rate": 9.962003401370101e-06,
"loss": 1.0051,
"step": 81
},
{
"epoch": 0.20812182741116753,
"grad_norm": 2.8238480847860785,
"learning_rate": 9.960298163118284e-06,
"loss": 0.9916,
"step": 82
},
{
"epoch": 0.21065989847715735,
"grad_norm": 6.206800727550207,
"learning_rate": 9.958555648118207e-06,
"loss": 1.0763,
"step": 83
},
{
"epoch": 0.2131979695431472,
"grad_norm": 2.443578917202038,
"learning_rate": 9.956775869464901e-06,
"loss": 0.9825,
"step": 84
},
{
"epoch": 0.21573604060913706,
"grad_norm": 2.847758348759497,
"learning_rate": 9.954958840533447e-06,
"loss": 1.0456,
"step": 85
},
{
"epoch": 0.2182741116751269,
"grad_norm": 3.170138811884141,
"learning_rate": 9.953104574978854e-06,
"loss": 1.1186,
"step": 86
},
{
"epoch": 0.22081218274111675,
"grad_norm": 2.6010611046656877,
"learning_rate": 9.951213086735967e-06,
"loss": 1.0233,
"step": 87
},
{
"epoch": 0.2233502538071066,
"grad_norm": 6.679956372744609,
"learning_rate": 9.949284390019362e-06,
"loss": 1.066,
"step": 88
},
{
"epoch": 0.22588832487309646,
"grad_norm": 2.6554799986019133,
"learning_rate": 9.94731849932324e-06,
"loss": 0.9623,
"step": 89
},
{
"epoch": 0.22842639593908629,
"grad_norm": 2.3433414348493273,
"learning_rate": 9.945315429421307e-06,
"loss": 1.0869,
"step": 90
},
{
"epoch": 0.23096446700507614,
"grad_norm": 2.8599286738704683,
"learning_rate": 9.943275195366679e-06,
"loss": 1.11,
"step": 91
},
{
"epoch": 0.233502538071066,
"grad_norm": 4.958755531266782,
"learning_rate": 9.941197812491761e-06,
"loss": 1.2542,
"step": 92
},
{
"epoch": 0.23604060913705585,
"grad_norm": 2.2479047075233427,
"learning_rate": 9.939083296408127e-06,
"loss": 1.0836,
"step": 93
},
{
"epoch": 0.23857868020304568,
"grad_norm": 1.8107915831749222,
"learning_rate": 9.936931663006414e-06,
"loss": 1.0384,
"step": 94
},
{
"epoch": 0.24111675126903553,
"grad_norm": 2.457682015648462,
"learning_rate": 9.934742928456191e-06,
"loss": 1.0374,
"step": 95
},
{
"epoch": 0.2436548223350254,
"grad_norm": 2.972864834775759,
"learning_rate": 9.932517109205849e-06,
"loss": 1.1032,
"step": 96
},
{
"epoch": 0.24619289340101522,
"grad_norm": 2.3725301348700087,
"learning_rate": 9.930254221982464e-06,
"loss": 1.0623,
"step": 97
},
{
"epoch": 0.24873096446700507,
"grad_norm": 2.7944396518166355,
"learning_rate": 9.927954283791687e-06,
"loss": 1.0831,
"step": 98
},
{
"epoch": 0.2512690355329949,
"grad_norm": 2.300115990070667,
"learning_rate": 9.9256173119176e-06,
"loss": 1.0785,
"step": 99
},
{
"epoch": 0.25380710659898476,
"grad_norm": 1.8208692795246306,
"learning_rate": 9.923243323922598e-06,
"loss": 0.9879,
"step": 100
},
{
"epoch": 0.2563451776649746,
"grad_norm": 4.4476962273727665,
"learning_rate": 9.920832337647252e-06,
"loss": 1.0007,
"step": 101
},
{
"epoch": 0.25888324873096447,
"grad_norm": 3.3940854617337113,
"learning_rate": 9.918384371210178e-06,
"loss": 1.1891,
"step": 102
},
{
"epoch": 0.2614213197969543,
"grad_norm": 2.555201632199259,
"learning_rate": 9.915899443007894e-06,
"loss": 1.021,
"step": 103
},
{
"epoch": 0.2639593908629442,
"grad_norm": 2.3220370794195113,
"learning_rate": 9.91337757171469e-06,
"loss": 1.1011,
"step": 104
},
{
"epoch": 0.26649746192893403,
"grad_norm": 2.9669260657059655,
"learning_rate": 9.910818776282487e-06,
"loss": 1.0555,
"step": 105
},
{
"epoch": 0.26903553299492383,
"grad_norm": 2.870314472945511,
"learning_rate": 9.908223075940684e-06,
"loss": 1.1542,
"step": 106
},
{
"epoch": 0.2715736040609137,
"grad_norm": 2.7991064606918137,
"learning_rate": 9.905590490196027e-06,
"loss": 1.0811,
"step": 107
},
{
"epoch": 0.27411167512690354,
"grad_norm": 2.493650659366754,
"learning_rate": 9.902921038832456e-06,
"loss": 1.0137,
"step": 108
},
{
"epoch": 0.2766497461928934,
"grad_norm": 6.9035091138911655,
"learning_rate": 9.900214741910955e-06,
"loss": 1.1594,
"step": 109
},
{
"epoch": 0.27918781725888325,
"grad_norm": 2.453399893747965,
"learning_rate": 9.897471619769402e-06,
"loss": 0.9992,
"step": 110
},
{
"epoch": 0.2817258883248731,
"grad_norm": 5.298689006767713,
"learning_rate": 9.89469169302242e-06,
"loss": 1.0269,
"step": 111
},
{
"epoch": 0.28426395939086296,
"grad_norm": 3.312118607283353,
"learning_rate": 9.891874982561222e-06,
"loss": 1.0999,
"step": 112
},
{
"epoch": 0.2868020304568528,
"grad_norm": 2.309806298343435,
"learning_rate": 9.889021509553448e-06,
"loss": 0.9932,
"step": 113
},
{
"epoch": 0.2893401015228426,
"grad_norm": 2.782632711667371,
"learning_rate": 9.886131295443003e-06,
"loss": 1.0546,
"step": 114
},
{
"epoch": 0.2918781725888325,
"grad_norm": 3.0241898018425375,
"learning_rate": 9.883204361949916e-06,
"loss": 1.063,
"step": 115
},
{
"epoch": 0.29441624365482233,
"grad_norm": 4.881309577432392,
"learning_rate": 9.880240731070152e-06,
"loss": 0.9851,
"step": 116
},
{
"epoch": 0.2969543147208122,
"grad_norm": 4.393148043595713,
"learning_rate": 9.877240425075465e-06,
"loss": 1.1928,
"step": 117
},
{
"epoch": 0.29949238578680204,
"grad_norm": 2.539173060449269,
"learning_rate": 9.874203466513215e-06,
"loss": 1.0185,
"step": 118
},
{
"epoch": 0.3020304568527919,
"grad_norm": 3.088531691349216,
"learning_rate": 9.871129878206213e-06,
"loss": 1.0862,
"step": 119
},
{
"epoch": 0.30456852791878175,
"grad_norm": 2.646620091120325,
"learning_rate": 9.868019683252543e-06,
"loss": 0.9884,
"step": 120
},
{
"epoch": 0.30710659898477155,
"grad_norm": 2.435727031764384,
"learning_rate": 9.864872905025386e-06,
"loss": 1.0276,
"step": 121
},
{
"epoch": 0.3096446700507614,
"grad_norm": 2.1078027560231183,
"learning_rate": 9.861689567172849e-06,
"loss": 1.0837,
"step": 122
},
{
"epoch": 0.31218274111675126,
"grad_norm": 2.3556477849722293,
"learning_rate": 9.858469693617787e-06,
"loss": 0.9124,
"step": 123
},
{
"epoch": 0.3147208121827411,
"grad_norm": 2.574273891936511,
"learning_rate": 9.855213308557618e-06,
"loss": 1.1362,
"step": 124
},
{
"epoch": 0.31725888324873097,
"grad_norm": 2.954661863407828,
"learning_rate": 9.851920436464146e-06,
"loss": 1.1748,
"step": 125
},
{
"epoch": 0.3197969543147208,
"grad_norm": 2.493459031496384,
"learning_rate": 9.848591102083375e-06,
"loss": 1.1093,
"step": 126
},
{
"epoch": 0.3223350253807107,
"grad_norm": 2.1194534532314098,
"learning_rate": 9.845225330435329e-06,
"loss": 1.0326,
"step": 127
},
{
"epoch": 0.3248730964467005,
"grad_norm": 3.4687791362456477,
"learning_rate": 9.84182314681385e-06,
"loss": 1.0464,
"step": 128
},
{
"epoch": 0.32741116751269034,
"grad_norm": 6.5552134087262255,
"learning_rate": 9.838384576786427e-06,
"loss": 1.1425,
"step": 129
},
{
"epoch": 0.3299492385786802,
"grad_norm": 5.804701269137941,
"learning_rate": 9.834909646193983e-06,
"loss": 1.0323,
"step": 130
},
{
"epoch": 0.33248730964467005,
"grad_norm": 2.6380372208920697,
"learning_rate": 9.831398381150698e-06,
"loss": 1.0946,
"step": 131
},
{
"epoch": 0.3350253807106599,
"grad_norm": 2.292877023775666,
"learning_rate": 9.82785080804381e-06,
"loss": 0.883,
"step": 132
},
{
"epoch": 0.33756345177664976,
"grad_norm": 4.2977395996010435,
"learning_rate": 9.824266953533402e-06,
"loss": 0.9968,
"step": 133
},
{
"epoch": 0.3401015228426396,
"grad_norm": 2.4822533121250254,
"learning_rate": 9.82064684455222e-06,
"loss": 0.9894,
"step": 134
},
{
"epoch": 0.3426395939086294,
"grad_norm": 2.314919940633327,
"learning_rate": 9.816990508305463e-06,
"loss": 0.9332,
"step": 135
},
{
"epoch": 0.34517766497461927,
"grad_norm": 3.944138805330047,
"learning_rate": 9.813297972270575e-06,
"loss": 1.0342,
"step": 136
},
{
"epoch": 0.3477157360406091,
"grad_norm": 4.083763205392443,
"learning_rate": 9.809569264197046e-06,
"loss": 1.0035,
"step": 137
},
{
"epoch": 0.350253807106599,
"grad_norm": 2.2341064648864335,
"learning_rate": 9.805804412106197e-06,
"loss": 1.0156,
"step": 138
},
{
"epoch": 0.35279187817258884,
"grad_norm": 4.73655672514693,
"learning_rate": 9.802003444290975e-06,
"loss": 1.0575,
"step": 139
},
{
"epoch": 0.3553299492385787,
"grad_norm": 3.6474051487216794,
"learning_rate": 9.798166389315734e-06,
"loss": 1.0331,
"step": 140
},
{
"epoch": 0.35786802030456855,
"grad_norm": 2.0950970123191235,
"learning_rate": 9.794293276016024e-06,
"loss": 0.8774,
"step": 141
},
{
"epoch": 0.3604060913705584,
"grad_norm": 4.020417271141957,
"learning_rate": 9.79038413349838e-06,
"loss": 1.0517,
"step": 142
},
{
"epoch": 0.3629441624365482,
"grad_norm": 2.343919859731207,
"learning_rate": 9.786438991140086e-06,
"loss": 1.0388,
"step": 143
},
{
"epoch": 0.36548223350253806,
"grad_norm": 3.5918593063524202,
"learning_rate": 9.782457878588977e-06,
"loss": 1.007,
"step": 144
},
{
"epoch": 0.3680203045685279,
"grad_norm": 2.989308945487366,
"learning_rate": 9.7784408257632e-06,
"loss": 1.1402,
"step": 145
},
{
"epoch": 0.37055837563451777,
"grad_norm": 3.2999750398100898,
"learning_rate": 9.774387862850993e-06,
"loss": 1.0778,
"step": 146
},
{
"epoch": 0.3730964467005076,
"grad_norm": 2.39286494783793,
"learning_rate": 9.77029902031046e-06,
"loss": 1.0588,
"step": 147
},
{
"epoch": 0.3756345177664975,
"grad_norm": 7.549790221744935,
"learning_rate": 9.766174328869344e-06,
"loss": 1.0188,
"step": 148
},
{
"epoch": 0.37817258883248733,
"grad_norm": 2.026349609367698,
"learning_rate": 9.762013819524788e-06,
"loss": 0.9754,
"step": 149
},
{
"epoch": 0.38071065989847713,
"grad_norm": 1.9809815038815315,
"learning_rate": 9.75781752354311e-06,
"loss": 1.1272,
"step": 150
},
{
"epoch": 0.383248730964467,
"grad_norm": 2.332081936160816,
"learning_rate": 9.753585472459564e-06,
"loss": 1.0715,
"step": 151
},
{
"epoch": 0.38578680203045684,
"grad_norm": 2.5164597293319217,
"learning_rate": 9.749317698078109e-06,
"loss": 1.1051,
"step": 152
},
{
"epoch": 0.3883248730964467,
"grad_norm": 4.1363517122069435,
"learning_rate": 9.745014232471161e-06,
"loss": 0.9861,
"step": 153
},
{
"epoch": 0.39086294416243655,
"grad_norm": 4.619566694659651,
"learning_rate": 9.740675107979357e-06,
"loss": 1.0185,
"step": 154
},
{
"epoch": 0.3934010152284264,
"grad_norm": 3.055238311436169,
"learning_rate": 9.736300357211309e-06,
"loss": 1.0023,
"step": 155
},
{
"epoch": 0.39593908629441626,
"grad_norm": 3.7295880831016373,
"learning_rate": 9.731890013043367e-06,
"loss": 1.0002,
"step": 156
},
{
"epoch": 0.39847715736040606,
"grad_norm": 3.3508031862079637,
"learning_rate": 9.727444108619365e-06,
"loss": 1.1507,
"step": 157
},
{
"epoch": 0.4010152284263959,
"grad_norm": 2.723651526969142,
"learning_rate": 9.722962677350367e-06,
"loss": 1.1364,
"step": 158
},
{
"epoch": 0.4035532994923858,
"grad_norm": 3.774699978960873,
"learning_rate": 9.718445752914427e-06,
"loss": 1.1025,
"step": 159
},
{
"epoch": 0.40609137055837563,
"grad_norm": 2.7659454853714927,
"learning_rate": 9.713893369256334e-06,
"loss": 1.1713,
"step": 160
},
{
"epoch": 0.4086294416243655,
"grad_norm": 2.538756260743843,
"learning_rate": 9.709305560587344e-06,
"loss": 1.0911,
"step": 161
},
{
"epoch": 0.41116751269035534,
"grad_norm": 4.320782683762776,
"learning_rate": 9.704682361384941e-06,
"loss": 1.1632,
"step": 162
},
{
"epoch": 0.4137055837563452,
"grad_norm": 4.80159996293837,
"learning_rate": 9.700023806392569e-06,
"loss": 1.0217,
"step": 163
},
{
"epoch": 0.41624365482233505,
"grad_norm": 2.7673753672207595,
"learning_rate": 9.695329930619368e-06,
"loss": 1.1201,
"step": 164
},
{
"epoch": 0.41878172588832485,
"grad_norm": 3.104153414839049,
"learning_rate": 9.690600769339916e-06,
"loss": 1.078,
"step": 165
},
{
"epoch": 0.4213197969543147,
"grad_norm": 7.423796156142243,
"learning_rate": 9.685836358093964e-06,
"loss": 1.1045,
"step": 166
},
{
"epoch": 0.42385786802030456,
"grad_norm": 2.9797528305128083,
"learning_rate": 9.681036732686165e-06,
"loss": 1.1647,
"step": 167
},
{
"epoch": 0.4263959390862944,
"grad_norm": 2.851862749531238,
"learning_rate": 9.676201929185809e-06,
"loss": 0.9319,
"step": 168
},
{
"epoch": 0.4289340101522843,
"grad_norm": 2.8454139750098557,
"learning_rate": 9.671331983926548e-06,
"loss": 1.0268,
"step": 169
},
{
"epoch": 0.43147208121827413,
"grad_norm": 1.9807490940657426,
"learning_rate": 9.666426933506126e-06,
"loss": 1.1493,
"step": 170
},
{
"epoch": 0.434010152284264,
"grad_norm": 4.858906862885905,
"learning_rate": 9.661486814786104e-06,
"loss": 1.024,
"step": 171
},
{
"epoch": 0.4365482233502538,
"grad_norm": 2.659416757635679,
"learning_rate": 9.65651166489158e-06,
"loss": 0.971,
"step": 172
},
{
"epoch": 0.43908629441624364,
"grad_norm": 2.1524399400114764,
"learning_rate": 9.651501521210916e-06,
"loss": 1.0744,
"step": 173
},
{
"epoch": 0.4416243654822335,
"grad_norm": 5.9861847351599415,
"learning_rate": 9.646456421395447e-06,
"loss": 0.9895,
"step": 174
},
{
"epoch": 0.44416243654822335,
"grad_norm": 5.818455135086296,
"learning_rate": 9.64137640335921e-06,
"loss": 1.102,
"step": 175
},
{
"epoch": 0.4467005076142132,
"grad_norm": 3.150237217459513,
"learning_rate": 9.636261505278653e-06,
"loss": 1.0035,
"step": 176
},
{
"epoch": 0.44923857868020306,
"grad_norm": 2.971416590549205,
"learning_rate": 9.631111765592339e-06,
"loss": 1.0352,
"step": 177
},
{
"epoch": 0.4517766497461929,
"grad_norm": 2.636645325503754,
"learning_rate": 9.625927223000679e-06,
"loss": 0.9163,
"step": 178
},
{
"epoch": 0.4543147208121827,
"grad_norm": 3.6938262542992093,
"learning_rate": 9.620707916465622e-06,
"loss": 0.9884,
"step": 179
},
{
"epoch": 0.45685279187817257,
"grad_norm": 2.0628042177329644,
"learning_rate": 9.615453885210368e-06,
"loss": 1.0689,
"step": 180
},
{
"epoch": 0.4593908629441624,
"grad_norm": 2.490168476448274,
"learning_rate": 9.610165168719079e-06,
"loss": 1.0768,
"step": 181
},
{
"epoch": 0.4619289340101523,
"grad_norm": 2.931811841708527,
"learning_rate": 9.604841806736572e-06,
"loss": 1.0419,
"step": 182
},
{
"epoch": 0.46446700507614214,
"grad_norm": 2.678452333514363,
"learning_rate": 9.599483839268027e-06,
"loss": 1.0907,
"step": 183
},
{
"epoch": 0.467005076142132,
"grad_norm": 2.535226923041242,
"learning_rate": 9.594091306578687e-06,
"loss": 0.9734,
"step": 184
},
{
"epoch": 0.46954314720812185,
"grad_norm": 2.800305033909715,
"learning_rate": 9.58866424919355e-06,
"loss": 0.9831,
"step": 185
},
{
"epoch": 0.4720812182741117,
"grad_norm": 2.1777888495427495,
"learning_rate": 9.583202707897075e-06,
"loss": 0.9666,
"step": 186
},
{
"epoch": 0.4746192893401015,
"grad_norm": 2.2069652409303493,
"learning_rate": 9.577706723732858e-06,
"loss": 1.0929,
"step": 187
},
{
"epoch": 0.47715736040609136,
"grad_norm": 2.9321078227457984,
"learning_rate": 9.572176338003341e-06,
"loss": 1.078,
"step": 188
},
{
"epoch": 0.4796954314720812,
"grad_norm": 2.440269713026083,
"learning_rate": 9.566611592269495e-06,
"loss": 0.9856,
"step": 189
},
{
"epoch": 0.48223350253807107,
"grad_norm": 2.703608155740871,
"learning_rate": 9.5610125283505e-06,
"loss": 1.1808,
"step": 190
},
{
"epoch": 0.4847715736040609,
"grad_norm": 2.205546763469659,
"learning_rate": 9.555379188323448e-06,
"loss": 1.0306,
"step": 191
},
{
"epoch": 0.4873096446700508,
"grad_norm": 2.6215335484791007,
"learning_rate": 9.549711614523007e-06,
"loss": 1.0581,
"step": 192
},
{
"epoch": 0.48984771573604063,
"grad_norm": 3.4288525903852793,
"learning_rate": 9.54400984954112e-06,
"loss": 1.0535,
"step": 193
},
{
"epoch": 0.49238578680203043,
"grad_norm": 2.2449382072438415,
"learning_rate": 9.538273936226675e-06,
"loss": 1.078,
"step": 194
},
{
"epoch": 0.4949238578680203,
"grad_norm": 2.10839131093687,
"learning_rate": 9.532503917685179e-06,
"loss": 1.0901,
"step": 195
},
{
"epoch": 0.49746192893401014,
"grad_norm": 2.6226023542694668,
"learning_rate": 9.526699837278455e-06,
"loss": 1.0362,
"step": 196
},
{
"epoch": 0.5,
"grad_norm": 2.233493733259919,
"learning_rate": 9.520861738624288e-06,
"loss": 1.1131,
"step": 197
},
{
"epoch": 0.5025380710659898,
"grad_norm": 3.472957860340417,
"learning_rate": 9.514989665596114e-06,
"loss": 1.0128,
"step": 198
},
{
"epoch": 0.5050761421319797,
"grad_norm": 3.2276635592510554,
"learning_rate": 9.509083662322697e-06,
"loss": 0.9769,
"step": 199
},
{
"epoch": 0.5076142131979695,
"grad_norm": 4.6768656571054175,
"learning_rate": 9.503143773187773e-06,
"loss": 0.9644,
"step": 200
},
{
"epoch": 0.5101522842639594,
"grad_norm": 2.1315150716253464,
"learning_rate": 9.497170042829737e-06,
"loss": 1.0787,
"step": 201
},
{
"epoch": 0.5126903553299492,
"grad_norm": 3.4880915500896466,
"learning_rate": 9.491162516141308e-06,
"loss": 1.0436,
"step": 202
},
{
"epoch": 0.5152284263959391,
"grad_norm": 5.554999243181942,
"learning_rate": 9.485121238269175e-06,
"loss": 1.1596,
"step": 203
},
{
"epoch": 0.5177664974619289,
"grad_norm": 3.0133431645363062,
"learning_rate": 9.479046254613673e-06,
"loss": 1.0238,
"step": 204
},
{
"epoch": 0.5203045685279187,
"grad_norm": 2.6459316524578638,
"learning_rate": 9.472937610828437e-06,
"loss": 1.1887,
"step": 205
},
{
"epoch": 0.5228426395939086,
"grad_norm": 2.616874115600168,
"learning_rate": 9.466795352820055e-06,
"loss": 1.0816,
"step": 206
},
{
"epoch": 0.5253807106598984,
"grad_norm": 3.745002415078362,
"learning_rate": 9.460619526747732e-06,
"loss": 1.0252,
"step": 207
},
{
"epoch": 0.5279187817258884,
"grad_norm": 5.920555908768616,
"learning_rate": 9.454410179022932e-06,
"loss": 0.9879,
"step": 208
},
{
"epoch": 0.5304568527918782,
"grad_norm": 5.201646897805685,
"learning_rate": 9.448167356309041e-06,
"loss": 1.2475,
"step": 209
},
{
"epoch": 0.5329949238578681,
"grad_norm": 2.418499819521257,
"learning_rate": 9.441891105521005e-06,
"loss": 0.8801,
"step": 210
},
{
"epoch": 0.5355329949238579,
"grad_norm": 3.1655419423032822,
"learning_rate": 9.435581473824985e-06,
"loss": 1.1705,
"step": 211
},
{
"epoch": 0.5380710659898477,
"grad_norm": 2.3183177705846383,
"learning_rate": 9.429238508638001e-06,
"loss": 1.1676,
"step": 212
},
{
"epoch": 0.5406091370558376,
"grad_norm": 2.9767162225106922,
"learning_rate": 9.422862257627573e-06,
"loss": 1.0033,
"step": 213
},
{
"epoch": 0.5431472081218274,
"grad_norm": 3.438724989341635,
"learning_rate": 9.416452768711367e-06,
"loss": 0.9988,
"step": 214
},
{
"epoch": 0.5456852791878173,
"grad_norm": 2.8923643224417424,
"learning_rate": 9.41001009005683e-06,
"loss": 1.0348,
"step": 215
},
{
"epoch": 0.5482233502538071,
"grad_norm": 3.9138770051171816,
"learning_rate": 9.40353427008083e-06,
"loss": 1.1145,
"step": 216
},
{
"epoch": 0.550761421319797,
"grad_norm": 4.634738293590086,
"learning_rate": 9.397025357449298e-06,
"loss": 1.0498,
"step": 217
},
{
"epoch": 0.5532994923857868,
"grad_norm": 2.197412410654519,
"learning_rate": 9.39048340107685e-06,
"loss": 0.9445,
"step": 218
},
{
"epoch": 0.5558375634517766,
"grad_norm": 2.9245714608885307,
"learning_rate": 9.383908450126436e-06,
"loss": 1.1004,
"step": 219
},
{
"epoch": 0.5583756345177665,
"grad_norm": 4.248925201224667,
"learning_rate": 9.377300554008947e-06,
"loss": 1.2622,
"step": 220
},
{
"epoch": 0.5609137055837563,
"grad_norm": 2.321625730656605,
"learning_rate": 9.370659762382873e-06,
"loss": 1.044,
"step": 221
},
{
"epoch": 0.5634517766497462,
"grad_norm": 2.345083062827609,
"learning_rate": 9.3639861251539e-06,
"loss": 1.0076,
"step": 222
},
{
"epoch": 0.565989847715736,
"grad_norm": 5.040868975580229,
"learning_rate": 9.357279692474563e-06,
"loss": 1.0115,
"step": 223
},
{
"epoch": 0.5685279187817259,
"grad_norm": 4.7793447493473105,
"learning_rate": 9.350540514743844e-06,
"loss": 1.0627,
"step": 224
},
{
"epoch": 0.5710659898477157,
"grad_norm": 4.709717368951021,
"learning_rate": 9.343768642606813e-06,
"loss": 0.9392,
"step": 225
},
{
"epoch": 0.5736040609137056,
"grad_norm": 2.1830206359475723,
"learning_rate": 9.336964126954235e-06,
"loss": 1.0748,
"step": 226
},
{
"epoch": 0.5761421319796954,
"grad_norm": 2.1944035396382704,
"learning_rate": 9.330127018922195e-06,
"loss": 0.9683,
"step": 227
},
{
"epoch": 0.5786802030456852,
"grad_norm": 2.3010930914999967,
"learning_rate": 9.323257369891702e-06,
"loss": 1.1036,
"step": 228
},
{
"epoch": 0.5812182741116751,
"grad_norm": 2.719578205767353,
"learning_rate": 9.316355231488324e-06,
"loss": 1.0688,
"step": 229
},
{
"epoch": 0.583756345177665,
"grad_norm": 3.681549142591368,
"learning_rate": 9.309420655581777e-06,
"loss": 1.0006,
"step": 230
},
{
"epoch": 0.5862944162436549,
"grad_norm": 2.279346619615211,
"learning_rate": 9.302453694285549e-06,
"loss": 1.1434,
"step": 231
},
{
"epoch": 0.5888324873096447,
"grad_norm": 2.6059263472191754,
"learning_rate": 9.29545439995651e-06,
"loss": 1.035,
"step": 232
},
{
"epoch": 0.5913705583756346,
"grad_norm": 2.4004634697913225,
"learning_rate": 9.288422825194502e-06,
"loss": 0.8866,
"step": 233
},
{
"epoch": 0.5939086294416244,
"grad_norm": 2.028307692805792,
"learning_rate": 9.281359022841966e-06,
"loss": 0.9655,
"step": 234
},
{
"epoch": 0.5964467005076142,
"grad_norm": 3.94892590556114,
"learning_rate": 9.274263045983529e-06,
"loss": 1.0461,
"step": 235
},
{
"epoch": 0.5989847715736041,
"grad_norm": 2.5639365559174188,
"learning_rate": 9.267134947945611e-06,
"loss": 1.1191,
"step": 236
},
{
"epoch": 0.6015228426395939,
"grad_norm": 2.5771861483758975,
"learning_rate": 9.259974782296023e-06,
"loss": 1.1159,
"step": 237
},
{
"epoch": 0.6040609137055838,
"grad_norm": 4.259283248370748,
"learning_rate": 9.252782602843565e-06,
"loss": 1.0164,
"step": 238
},
{
"epoch": 0.6065989847715736,
"grad_norm": 1.9739014796831083,
"learning_rate": 9.245558463637623e-06,
"loss": 0.9955,
"step": 239
},
{
"epoch": 0.6091370558375635,
"grad_norm": 2.160593531593977,
"learning_rate": 9.238302418967757e-06,
"loss": 1.0866,
"step": 240
},
{
"epoch": 0.6116751269035533,
"grad_norm": 2.648999822170712,
"learning_rate": 9.231014523363303e-06,
"loss": 1.0848,
"step": 241
},
{
"epoch": 0.6142131979695431,
"grad_norm": 3.7793807875511805,
"learning_rate": 9.223694831592953e-06,
"loss": 1.0154,
"step": 242
},
{
"epoch": 0.616751269035533,
"grad_norm": 3.092399515241672,
"learning_rate": 9.216343398664349e-06,
"loss": 1.0185,
"step": 243
},
{
"epoch": 0.6192893401015228,
"grad_norm": 3.202644866882686,
"learning_rate": 9.208960279823672e-06,
"loss": 1.1874,
"step": 244
},
{
"epoch": 0.6218274111675127,
"grad_norm": 2.0162635261279416,
"learning_rate": 9.201545530555214e-06,
"loss": 1.115,
"step": 245
},
{
"epoch": 0.6243654822335025,
"grad_norm": 5.598301005765664,
"learning_rate": 9.194099206580981e-06,
"loss": 1.2179,
"step": 246
},
{
"epoch": 0.6269035532994924,
"grad_norm": 2.22095674856811,
"learning_rate": 9.18662136386026e-06,
"loss": 0.9959,
"step": 247
},
{
"epoch": 0.6294416243654822,
"grad_norm": 3.581389026739919,
"learning_rate": 9.1791120585892e-06,
"loss": 1.1269,
"step": 248
},
{
"epoch": 0.631979695431472,
"grad_norm": 5.445257086874675,
"learning_rate": 9.171571347200392e-06,
"loss": 1.2296,
"step": 249
},
{
"epoch": 0.6345177664974619,
"grad_norm": 2.24392675323956,
"learning_rate": 9.163999286362445e-06,
"loss": 1.0254,
"step": 250
},
{
"epoch": 0.6370558375634517,
"grad_norm": 4.167042508379098,
"learning_rate": 9.156395932979563e-06,
"loss": 1.1748,
"step": 251
},
{
"epoch": 0.6395939086294417,
"grad_norm": 2.5891911677793664,
"learning_rate": 9.14876134419111e-06,
"loss": 1.1466,
"step": 252
},
{
"epoch": 0.6421319796954315,
"grad_norm": 2.994985140303232,
"learning_rate": 9.141095577371185e-06,
"loss": 0.9818,
"step": 253
},
{
"epoch": 0.6446700507614214,
"grad_norm": 2.3912668038646854,
"learning_rate": 9.133398690128194e-06,
"loss": 1.0164,
"step": 254
},
{
"epoch": 0.6472081218274112,
"grad_norm": 2.1809817266204106,
"learning_rate": 9.125670740304412e-06,
"loss": 1.0514,
"step": 255
},
{
"epoch": 0.649746192893401,
"grad_norm": 3.490749098234336,
"learning_rate": 9.117911785975548e-06,
"loss": 1.0449,
"step": 256
},
{
"epoch": 0.6522842639593909,
"grad_norm": 4.203566340194239,
"learning_rate": 9.110121885450311e-06,
"loss": 1.1813,
"step": 257
},
{
"epoch": 0.6548223350253807,
"grad_norm": 3.196719647899695,
"learning_rate": 9.102301097269974e-06,
"loss": 1.0271,
"step": 258
},
{
"epoch": 0.6573604060913706,
"grad_norm": 3.076073531440758,
"learning_rate": 9.094449480207933e-06,
"loss": 1.036,
"step": 259
},
{
"epoch": 0.6598984771573604,
"grad_norm": 5.182177678783312,
"learning_rate": 9.086567093269253e-06,
"loss": 1.1141,
"step": 260
},
{
"epoch": 0.6624365482233503,
"grad_norm": 2.357962719234063,
"learning_rate": 9.078653995690248e-06,
"loss": 1.0862,
"step": 261
},
{
"epoch": 0.6649746192893401,
"grad_norm": 2.0784516657027297,
"learning_rate": 9.070710246938017e-06,
"loss": 0.9983,
"step": 262
},
{
"epoch": 0.6675126903553299,
"grad_norm": 2.5130820726794227,
"learning_rate": 9.062735906710004e-06,
"loss": 1.0241,
"step": 263
},
{
"epoch": 0.6700507614213198,
"grad_norm": 2.229905635017876,
"learning_rate": 9.05473103493355e-06,
"loss": 1.012,
"step": 264
},
{
"epoch": 0.6725888324873096,
"grad_norm": 4.327860759002616,
"learning_rate": 9.046695691765436e-06,
"loss": 1.062,
"step": 265
},
{
"epoch": 0.6751269035532995,
"grad_norm": 3.72671924192454,
"learning_rate": 9.038629937591445e-06,
"loss": 1.1112,
"step": 266
},
{
"epoch": 0.6776649746192893,
"grad_norm": 3.125309450876512,
"learning_rate": 9.03053383302589e-06,
"loss": 1.0586,
"step": 267
},
{
"epoch": 0.6802030456852792,
"grad_norm": 2.5688387996257704,
"learning_rate": 9.022407438911177e-06,
"loss": 1.1105,
"step": 268
},
{
"epoch": 0.682741116751269,
"grad_norm": 3.1366648310116125,
"learning_rate": 9.01425081631733e-06,
"loss": 0.9882,
"step": 269
},
{
"epoch": 0.6852791878172588,
"grad_norm": 4.202684164859739,
"learning_rate": 9.006064026541549e-06,
"loss": 1.016,
"step": 270
},
{
"epoch": 0.6878172588832487,
"grad_norm": 2.7148085939237383,
"learning_rate": 8.997847131107731e-06,
"loss": 1.0835,
"step": 271
},
{
"epoch": 0.6903553299492385,
"grad_norm": 2.9688800803660285,
"learning_rate": 8.989600191766028e-06,
"loss": 1.0059,
"step": 272
},
{
"epoch": 0.6928934010152284,
"grad_norm": 3.2613550555880595,
"learning_rate": 8.981323270492367e-06,
"loss": 1.0845,
"step": 273
},
{
"epoch": 0.6954314720812182,
"grad_norm": 6.7618332493465765,
"learning_rate": 8.973016429487989e-06,
"loss": 1.1204,
"step": 274
},
{
"epoch": 0.6979695431472082,
"grad_norm": 3.994336467537423,
"learning_rate": 8.964679731178984e-06,
"loss": 0.9889,
"step": 275
},
{
"epoch": 0.700507614213198,
"grad_norm": 2.9602083850724146,
"learning_rate": 8.956313238215824e-06,
"loss": 1.01,
"step": 276
},
{
"epoch": 0.7030456852791879,
"grad_norm": 2.36727770135421,
"learning_rate": 8.947917013472885e-06,
"loss": 1.14,
"step": 277
},
{
"epoch": 0.7055837563451777,
"grad_norm": 2.875102362311937,
"learning_rate": 8.939491120047974e-06,
"loss": 1.0227,
"step": 278
},
{
"epoch": 0.7081218274111675,
"grad_norm": 2.3483599719919086,
"learning_rate": 8.931035621261865e-06,
"loss": 1.0908,
"step": 279
},
{
"epoch": 0.7106598984771574,
"grad_norm": 3.5100771689299584,
"learning_rate": 8.922550580657816e-06,
"loss": 0.9979,
"step": 280
},
{
"epoch": 0.7131979695431472,
"grad_norm": 2.403015013570231,
"learning_rate": 8.914036062001089e-06,
"loss": 1.0621,
"step": 281
},
{
"epoch": 0.7157360406091371,
"grad_norm": 3.7415825640690956,
"learning_rate": 8.905492129278478e-06,
"loss": 1.0554,
"step": 282
},
{
"epoch": 0.7182741116751269,
"grad_norm": 3.4701854151842237,
"learning_rate": 8.896918846697822e-06,
"loss": 0.9954,
"step": 283
},
{
"epoch": 0.7208121827411168,
"grad_norm": 2.984814537688729,
"learning_rate": 8.888316278687526e-06,
"loss": 0.9768,
"step": 284
},
{
"epoch": 0.7233502538071066,
"grad_norm": 3.943831792187661,
"learning_rate": 8.879684489896073e-06,
"loss": 0.9808,
"step": 285
},
{
"epoch": 0.7258883248730964,
"grad_norm": 3.0862729355784553,
"learning_rate": 8.871023545191547e-06,
"loss": 0.998,
"step": 286
},
{
"epoch": 0.7284263959390863,
"grad_norm": 3.4902562190166386,
"learning_rate": 8.862333509661129e-06,
"loss": 1.063,
"step": 287
},
{
"epoch": 0.7309644670050761,
"grad_norm": 3.519136766999423,
"learning_rate": 8.85361444861063e-06,
"loss": 1.1137,
"step": 288
},
{
"epoch": 0.733502538071066,
"grad_norm": 3.439803607907018,
"learning_rate": 8.844866427563983e-06,
"loss": 1.029,
"step": 289
},
{
"epoch": 0.7360406091370558,
"grad_norm": 4.0784940602279836,
"learning_rate": 8.836089512262753e-06,
"loss": 0.997,
"step": 290
},
{
"epoch": 0.7385786802030457,
"grad_norm": 3.4814302353846798,
"learning_rate": 8.82728376866565e-06,
"loss": 1.2135,
"step": 291
},
{
"epoch": 0.7411167512690355,
"grad_norm": 3.6326143786283467,
"learning_rate": 8.818449262948028e-06,
"loss": 1.0662,
"step": 292
},
{
"epoch": 0.7436548223350253,
"grad_norm": 2.7030639665664666,
"learning_rate": 8.80958606150139e-06,
"loss": 1.1053,
"step": 293
},
{
"epoch": 0.7461928934010152,
"grad_norm": 6.2321761427881155,
"learning_rate": 8.800694230932885e-06,
"loss": 1.0579,
"step": 294
},
{
"epoch": 0.748730964467005,
"grad_norm": 4.294180075723966,
"learning_rate": 8.791773838064812e-06,
"loss": 1.0612,
"step": 295
},
{
"epoch": 0.751269035532995,
"grad_norm": 3.8269300062055196,
"learning_rate": 8.78282494993412e-06,
"loss": 0.9201,
"step": 296
},
{
"epoch": 0.7538071065989848,
"grad_norm": 2.3404396349951706,
"learning_rate": 8.773847633791897e-06,
"loss": 1.017,
"step": 297
},
{
"epoch": 0.7563451776649747,
"grad_norm": 3.8128462739169495,
"learning_rate": 8.764841957102866e-06,
"loss": 1.0252,
"step": 298
},
{
"epoch": 0.7588832487309645,
"grad_norm": 2.287311243612682,
"learning_rate": 8.755807987544884e-06,
"loss": 1.1611,
"step": 299
},
{
"epoch": 0.7614213197969543,
"grad_norm": 3.7690746307525744,
"learning_rate": 8.74674579300843e-06,
"loss": 0.9457,
"step": 300
},
{
"epoch": 0.7639593908629442,
"grad_norm": 2.048134697904296,
"learning_rate": 8.737655441596088e-06,
"loss": 0.9724,
"step": 301
},
{
"epoch": 0.766497461928934,
"grad_norm": 2.566215053585969,
"learning_rate": 8.72853700162205e-06,
"loss": 0.9773,
"step": 302
},
{
"epoch": 0.7690355329949239,
"grad_norm": 2.151885805636772,
"learning_rate": 8.71939054161159e-06,
"loss": 1.0774,
"step": 303
},
{
"epoch": 0.7715736040609137,
"grad_norm": 3.313132981406172,
"learning_rate": 8.710216130300551e-06,
"loss": 1.1324,
"step": 304
},
{
"epoch": 0.7741116751269036,
"grad_norm": 2.056935287783317,
"learning_rate": 8.701013836634833e-06,
"loss": 1.0126,
"step": 305
},
{
"epoch": 0.7766497461928934,
"grad_norm": 5.346543217053909,
"learning_rate": 8.691783729769874e-06,
"loss": 0.9938,
"step": 306
},
{
"epoch": 0.7791878172588832,
"grad_norm": 2.9607570187514707,
"learning_rate": 8.682525879070126e-06,
"loss": 1.1109,
"step": 307
},
{
"epoch": 0.7817258883248731,
"grad_norm": 2.861744851184498,
"learning_rate": 8.673240354108539e-06,
"loss": 0.9496,
"step": 308
},
{
"epoch": 0.7842639593908629,
"grad_norm": 2.58606470902963,
"learning_rate": 8.663927224666034e-06,
"loss": 0.9539,
"step": 309
},
{
"epoch": 0.7868020304568528,
"grad_norm": 2.109484594420891,
"learning_rate": 8.654586560730981e-06,
"loss": 1.125,
"step": 310
},
{
"epoch": 0.7893401015228426,
"grad_norm": 4.024325632708077,
"learning_rate": 8.645218432498673e-06,
"loss": 1.0123,
"step": 311
},
{
"epoch": 0.7918781725888325,
"grad_norm": 2.597662979848648,
"learning_rate": 8.635822910370793e-06,
"loss": 1.1692,
"step": 312
},
{
"epoch": 0.7944162436548223,
"grad_norm": 2.2166134294153075,
"learning_rate": 8.626400064954897e-06,
"loss": 0.9594,
"step": 313
},
{
"epoch": 0.7969543147208121,
"grad_norm": 2.0221730706708785,
"learning_rate": 8.616949967063871e-06,
"loss": 0.8921,
"step": 314
},
{
"epoch": 0.799492385786802,
"grad_norm": 2.8323976488528055,
"learning_rate": 8.607472687715408e-06,
"loss": 1.0321,
"step": 315
},
{
"epoch": 0.8020304568527918,
"grad_norm": 2.84440132443287,
"learning_rate": 8.597968298131464e-06,
"loss": 1.0595,
"step": 316
},
{
"epoch": 0.8045685279187818,
"grad_norm": 2.320484455637419,
"learning_rate": 8.588436869737737e-06,
"loss": 1.0101,
"step": 317
},
{
"epoch": 0.8071065989847716,
"grad_norm": 2.4042485235745965,
"learning_rate": 8.578878474163115e-06,
"loss": 1.025,
"step": 318
},
{
"epoch": 0.8096446700507615,
"grad_norm": 2.2100438666281783,
"learning_rate": 8.56929318323915e-06,
"loss": 0.8702,
"step": 319
},
{
"epoch": 0.8121827411167513,
"grad_norm": 2.600141765954614,
"learning_rate": 8.559681068999509e-06,
"loss": 1.042,
"step": 320
},
{
"epoch": 0.8147208121827412,
"grad_norm": 3.312136758797296,
"learning_rate": 8.550042203679441e-06,
"loss": 1.0783,
"step": 321
},
{
"epoch": 0.817258883248731,
"grad_norm": 2.355781755993528,
"learning_rate": 8.540376659715226e-06,
"loss": 1.121,
"step": 322
},
{
"epoch": 0.8197969543147208,
"grad_norm": 2.033914941407007,
"learning_rate": 8.530684509743639e-06,
"loss": 0.9748,
"step": 323
},
{
"epoch": 0.8223350253807107,
"grad_norm": 4.43870060560367,
"learning_rate": 8.520965826601394e-06,
"loss": 1.1324,
"step": 324
},
{
"epoch": 0.8248730964467005,
"grad_norm": 2.2923724345059813,
"learning_rate": 8.511220683324608e-06,
"loss": 0.918,
"step": 325
},
{
"epoch": 0.8274111675126904,
"grad_norm": 2.2068567850563228,
"learning_rate": 8.501449153148243e-06,
"loss": 0.9338,
"step": 326
},
{
"epoch": 0.8299492385786802,
"grad_norm": 2.589213829583448,
"learning_rate": 8.491651309505562e-06,
"loss": 1.112,
"step": 327
},
{
"epoch": 0.8324873096446701,
"grad_norm": 2.1974385541047656,
"learning_rate": 8.48182722602757e-06,
"loss": 0.8901,
"step": 328
},
{
"epoch": 0.8350253807106599,
"grad_norm": 7.061053266476324,
"learning_rate": 8.47197697654247e-06,
"loss": 1.0895,
"step": 329
},
{
"epoch": 0.8375634517766497,
"grad_norm": 3.0310872002885847,
"learning_rate": 8.462100635075097e-06,
"loss": 0.9703,
"step": 330
},
{
"epoch": 0.8401015228426396,
"grad_norm": 2.298254053197544,
"learning_rate": 8.452198275846372e-06,
"loss": 1.0766,
"step": 331
},
{
"epoch": 0.8426395939086294,
"grad_norm": 2.942514438530551,
"learning_rate": 8.442269973272743e-06,
"loss": 1.1541,
"step": 332
},
{
"epoch": 0.8451776649746193,
"grad_norm": 2.0603391728533573,
"learning_rate": 8.432315801965616e-06,
"loss": 1.0356,
"step": 333
},
{
"epoch": 0.8477157360406091,
"grad_norm": 2.34696517553944,
"learning_rate": 8.422335836730804e-06,
"loss": 0.9852,
"step": 334
},
{
"epoch": 0.850253807106599,
"grad_norm": 4.403994489595826,
"learning_rate": 8.412330152567965e-06,
"loss": 1.1587,
"step": 335
},
{
"epoch": 0.8527918781725888,
"grad_norm": 3.7915353107441376,
"learning_rate": 8.40229882467003e-06,
"loss": 1.0281,
"step": 336
},
{
"epoch": 0.8553299492385786,
"grad_norm": 2.1893832327328115,
"learning_rate": 8.392241928422644e-06,
"loss": 1.1753,
"step": 337
},
{
"epoch": 0.8578680203045685,
"grad_norm": 3.6665615611814615,
"learning_rate": 8.382159539403605e-06,
"loss": 0.9433,
"step": 338
},
{
"epoch": 0.8604060913705583,
"grad_norm": 5.388123821372951,
"learning_rate": 8.372051733382283e-06,
"loss": 0.9838,
"step": 339
},
{
"epoch": 0.8629441624365483,
"grad_norm": 2.2459723217939045,
"learning_rate": 8.361918586319058e-06,
"loss": 0.9915,
"step": 340
},
{
"epoch": 0.8654822335025381,
"grad_norm": 4.969255505433864,
"learning_rate": 8.351760174364752e-06,
"loss": 1.0536,
"step": 341
},
{
"epoch": 0.868020304568528,
"grad_norm": 2.1409299322309177,
"learning_rate": 8.341576573860049e-06,
"loss": 1.0697,
"step": 342
},
{
"epoch": 0.8705583756345178,
"grad_norm": 2.28979733193756,
"learning_rate": 8.331367861334928e-06,
"loss": 0.9796,
"step": 343
},
{
"epoch": 0.8730964467005076,
"grad_norm": 2.215159852607918,
"learning_rate": 8.321134113508089e-06,
"loss": 0.9487,
"step": 344
},
{
"epoch": 0.8756345177664975,
"grad_norm": 7.098608089968907,
"learning_rate": 8.310875407286364e-06,
"loss": 0.9679,
"step": 345
},
{
"epoch": 0.8781725888324873,
"grad_norm": 2.494977976840313,
"learning_rate": 8.300591819764155e-06,
"loss": 1.0262,
"step": 346
},
{
"epoch": 0.8807106598984772,
"grad_norm": 4.378236657118699,
"learning_rate": 8.290283428222842e-06,
"loss": 0.9634,
"step": 347
},
{
"epoch": 0.883248730964467,
"grad_norm": 2.5357774690099433,
"learning_rate": 8.279950310130218e-06,
"loss": 1.1934,
"step": 348
},
{
"epoch": 0.8857868020304569,
"grad_norm": 2.4838144435457172,
"learning_rate": 8.269592543139883e-06,
"loss": 1.1536,
"step": 349
},
{
"epoch": 0.8883248730964467,
"grad_norm": 4.7088721852127815,
"learning_rate": 8.259210205090683e-06,
"loss": 1.0596,
"step": 350
},
{
"epoch": 0.8908629441624365,
"grad_norm": 2.3904986847786875,
"learning_rate": 8.248803374006113e-06,
"loss": 0.9723,
"step": 351
},
{
"epoch": 0.8934010152284264,
"grad_norm": 3.0593901416640823,
"learning_rate": 8.238372128093736e-06,
"loss": 1.0769,
"step": 352
},
{
"epoch": 0.8959390862944162,
"grad_norm": 5.24826281940527,
"learning_rate": 8.227916545744588e-06,
"loss": 1.0171,
"step": 353
},
{
"epoch": 0.8984771573604061,
"grad_norm": 2.9561545208571394,
"learning_rate": 8.2174367055326e-06,
"loss": 1.0627,
"step": 354
},
{
"epoch": 0.9010152284263959,
"grad_norm": 4.302396818897092,
"learning_rate": 8.206932686213998e-06,
"loss": 1.1551,
"step": 355
},
{
"epoch": 0.9035532994923858,
"grad_norm": 4.862418884348732,
"learning_rate": 8.196404566726712e-06,
"loss": 0.9596,
"step": 356
},
{
"epoch": 0.9060913705583756,
"grad_norm": 3.2420027025969103,
"learning_rate": 8.185852426189794e-06,
"loss": 1.0267,
"step": 357
},
{
"epoch": 0.9086294416243654,
"grad_norm": 3.376880772597684,
"learning_rate": 8.175276343902802e-06,
"loss": 0.947,
"step": 358
},
{
"epoch": 0.9111675126903553,
"grad_norm": 4.603667399163658,
"learning_rate": 8.16467639934523e-06,
"loss": 0.9864,
"step": 359
},
{
"epoch": 0.9137055837563451,
"grad_norm": 2.9576724755101678,
"learning_rate": 8.154052672175888e-06,
"loss": 1.0051,
"step": 360
},
{
"epoch": 0.916243654822335,
"grad_norm": 4.320905650850623,
"learning_rate": 8.143405242232317e-06,
"loss": 0.998,
"step": 361
},
{
"epoch": 0.9187817258883249,
"grad_norm": 5.449431302042425,
"learning_rate": 8.132734189530182e-06,
"loss": 0.8851,
"step": 362
},
{
"epoch": 0.9213197969543148,
"grad_norm": 2.1253363817718136,
"learning_rate": 8.122039594262679e-06,
"loss": 0.8947,
"step": 363
},
{
"epoch": 0.9238578680203046,
"grad_norm": 3.187675631861296,
"learning_rate": 8.111321536799921e-06,
"loss": 1.0377,
"step": 364
},
{
"epoch": 0.9263959390862944,
"grad_norm": 2.022120835826271,
"learning_rate": 8.100580097688342e-06,
"loss": 1.0793,
"step": 365
},
{
"epoch": 0.9289340101522843,
"grad_norm": 2.645049531416873,
"learning_rate": 8.08981535765009e-06,
"loss": 0.9978,
"step": 366
},
{
"epoch": 0.9314720812182741,
"grad_norm": 4.212018855531683,
"learning_rate": 8.07902739758242e-06,
"loss": 1.0244,
"step": 367
},
{
"epoch": 0.934010152284264,
"grad_norm": 5.9126393432243916,
"learning_rate": 8.068216298557088e-06,
"loss": 0.9787,
"step": 368
},
{
"epoch": 0.9365482233502538,
"grad_norm": 2.2116674048323857,
"learning_rate": 8.057382141819734e-06,
"loss": 0.9862,
"step": 369
},
{
"epoch": 0.9390862944162437,
"grad_norm": 4.548662027137182,
"learning_rate": 8.046525008789283e-06,
"loss": 0.9965,
"step": 370
},
{
"epoch": 0.9416243654822335,
"grad_norm": 2.7198427380964105,
"learning_rate": 8.035644981057327e-06,
"loss": 1.174,
"step": 371
},
{
"epoch": 0.9441624365482234,
"grad_norm": 1.8298070803821809,
"learning_rate": 8.024742140387506e-06,
"loss": 0.8985,
"step": 372
},
{
"epoch": 0.9467005076142132,
"grad_norm": 3.1445038648413948,
"learning_rate": 8.013816568714905e-06,
"loss": 0.9881,
"step": 373
},
{
"epoch": 0.949238578680203,
"grad_norm": 2.9123896714012103,
"learning_rate": 8.002868348145436e-06,
"loss": 1.2471,
"step": 374
},
{
"epoch": 0.9517766497461929,
"grad_norm": 3.9489207704815428,
"learning_rate": 7.99189756095521e-06,
"loss": 1.0102,
"step": 375
},
{
"epoch": 0.9543147208121827,
"grad_norm": 2.317711292143962,
"learning_rate": 7.980904289589932e-06,
"loss": 1.153,
"step": 376
},
{
"epoch": 0.9568527918781726,
"grad_norm": 2.285220370350821,
"learning_rate": 7.969888616664275e-06,
"loss": 1.1004,
"step": 377
},
{
"epoch": 0.9593908629441624,
"grad_norm": 4.639698727940778,
"learning_rate": 7.95885062496126e-06,
"loss": 1.215,
"step": 378
},
{
"epoch": 0.9619289340101523,
"grad_norm": 2.187854846843315,
"learning_rate": 7.947790397431631e-06,
"loss": 1.0363,
"step": 379
},
{
"epoch": 0.9644670050761421,
"grad_norm": 2.087236809552686,
"learning_rate": 7.936708017193242e-06,
"loss": 1.093,
"step": 380
},
{
"epoch": 0.9670050761421319,
"grad_norm": 2.47349126593795,
"learning_rate": 7.92560356753042e-06,
"loss": 0.994,
"step": 381
},
{
"epoch": 0.9695431472081218,
"grad_norm": 5.206032659383499,
"learning_rate": 7.914477131893344e-06,
"loss": 1.1066,
"step": 382
},
{
"epoch": 0.9720812182741116,
"grad_norm": 4.546492297905296,
"learning_rate": 7.903328793897418e-06,
"loss": 1.0431,
"step": 383
},
{
"epoch": 0.9746192893401016,
"grad_norm": 3.288068386496405,
"learning_rate": 7.892158637322647e-06,
"loss": 1.147,
"step": 384
},
{
"epoch": 0.9771573604060914,
"grad_norm": 5.711480442784147,
"learning_rate": 7.880966746112995e-06,
"loss": 1.0171,
"step": 385
},
{
"epoch": 0.9796954314720813,
"grad_norm": 3.988727515802901,
"learning_rate": 7.869753204375772e-06,
"loss": 0.8908,
"step": 386
},
{
"epoch": 0.9822335025380711,
"grad_norm": 3.060563447708997,
"learning_rate": 7.858518096380984e-06,
"loss": 0.9856,
"step": 387
},
{
"epoch": 0.9847715736040609,
"grad_norm": 2.4682459956195637,
"learning_rate": 7.847261506560716e-06,
"loss": 1.0148,
"step": 388
},
{
"epoch": 0.9873096446700508,
"grad_norm": 5.223626547133224,
"learning_rate": 7.835983519508477e-06,
"loss": 1.0348,
"step": 389
},
{
"epoch": 0.9898477157360406,
"grad_norm": 4.123284868828342,
"learning_rate": 7.824684219978591e-06,
"loss": 1.1459,
"step": 390
},
{
"epoch": 0.9923857868020305,
"grad_norm": 4.014923037363868,
"learning_rate": 7.813363692885535e-06,
"loss": 0.9656,
"step": 391
},
{
"epoch": 0.9949238578680203,
"grad_norm": 3.8267198551637507,
"learning_rate": 7.802022023303319e-06,
"loss": 1.0511,
"step": 392
},
{
"epoch": 0.9974619289340102,
"grad_norm": 3.328421605030423,
"learning_rate": 7.790659296464833e-06,
"loss": 1.1031,
"step": 393
},
{
"epoch": 1.0,
"grad_norm": 3.7683208975790343,
"learning_rate": 7.779275597761215e-06,
"loss": 1.0182,
"step": 394
},
{
"epoch": 1.00253807106599,
"grad_norm": 3.0484171116238525,
"learning_rate": 7.76787101274121e-06,
"loss": 0.7197,
"step": 395
},
{
"epoch": 1.0050761421319796,
"grad_norm": 2.659217933312505,
"learning_rate": 7.756445627110523e-06,
"loss": 0.8507,
"step": 396
},
{
"epoch": 1.0076142131979695,
"grad_norm": 2.0241729265622515,
"learning_rate": 7.74499952673117e-06,
"loss": 0.711,
"step": 397
},
{
"epoch": 1.0101522842639594,
"grad_norm": 2.7605289419713666,
"learning_rate": 7.733532797620849e-06,
"loss": 0.9235,
"step": 398
},
{
"epoch": 1.0126903553299493,
"grad_norm": 4.332032153367687,
"learning_rate": 7.722045525952272e-06,
"loss": 0.8808,
"step": 399
},
{
"epoch": 1.015228426395939,
"grad_norm": 2.8080304014227746,
"learning_rate": 7.71053779805254e-06,
"loss": 0.7194,
"step": 400
},
{
"epoch": 1.017766497461929,
"grad_norm": 3.161578778385084,
"learning_rate": 7.699009700402476e-06,
"loss": 0.7822,
"step": 401
},
{
"epoch": 1.0203045685279188,
"grad_norm": 2.988252242343333,
"learning_rate": 7.68746131963598e-06,
"loss": 0.7483,
"step": 402
},
{
"epoch": 1.0228426395939085,
"grad_norm": 2.888286673140523,
"learning_rate": 7.675892742539392e-06,
"loss": 0.8125,
"step": 403
},
{
"epoch": 1.0253807106598984,
"grad_norm": 2.2435180847004577,
"learning_rate": 7.664304056050813e-06,
"loss": 0.6922,
"step": 404
},
{
"epoch": 1.0279187817258884,
"grad_norm": 3.198583049737055,
"learning_rate": 7.652695347259476e-06,
"loss": 0.8004,
"step": 405
},
{
"epoch": 1.0304568527918783,
"grad_norm": 3.2992117950438185,
"learning_rate": 7.641066703405076e-06,
"loss": 0.8566,
"step": 406
},
{
"epoch": 1.032994923857868,
"grad_norm": 4.463047095431445,
"learning_rate": 7.629418211877129e-06,
"loss": 0.6083,
"step": 407
},
{
"epoch": 1.0355329949238579,
"grad_norm": 3.0485877960149477,
"learning_rate": 7.6177499602143e-06,
"loss": 0.6715,
"step": 408
},
{
"epoch": 1.0380710659898478,
"grad_norm": 2.7853627353229062,
"learning_rate": 7.6060620361037495e-06,
"loss": 0.6847,
"step": 409
},
{
"epoch": 1.0406091370558375,
"grad_norm": 2.519657419339155,
"learning_rate": 7.594354527380485e-06,
"loss": 0.846,
"step": 410
},
{
"epoch": 1.0431472081218274,
"grad_norm": 2.8052149935489137,
"learning_rate": 7.582627522026686e-06,
"loss": 0.7273,
"step": 411
},
{
"epoch": 1.0456852791878173,
"grad_norm": 6.754274818674537,
"learning_rate": 7.5708811081710535e-06,
"loss": 0.6285,
"step": 412
},
{
"epoch": 1.0482233502538072,
"grad_norm": 3.6424686022316433,
"learning_rate": 7.55911537408814e-06,
"loss": 0.6845,
"step": 413
},
{
"epoch": 1.0507614213197969,
"grad_norm": 2.7139182825484487,
"learning_rate": 7.547330408197695e-06,
"loss": 0.6806,
"step": 414
},
{
"epoch": 1.0532994923857868,
"grad_norm": 8.510738024789926,
"learning_rate": 7.535526299063991e-06,
"loss": 0.7512,
"step": 415
},
{
"epoch": 1.0558375634517767,
"grad_norm": 4.04200237479796,
"learning_rate": 7.523703135395166e-06,
"loss": 0.7041,
"step": 416
},
{
"epoch": 1.0583756345177664,
"grad_norm": 2.260811025516907,
"learning_rate": 7.511861006042549e-06,
"loss": 0.6766,
"step": 417
},
{
"epoch": 1.0609137055837563,
"grad_norm": 4.490561125470434,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7141,
"step": 418
},
{
"epoch": 1.0634517766497462,
"grad_norm": 2.194578251618841,
"learning_rate": 7.488120206403238e-06,
"loss": 0.659,
"step": 419
},
{
"epoch": 1.0659898477157361,
"grad_norm": 2.6947953262722506,
"learning_rate": 7.476221714529167e-06,
"loss": 0.7283,
"step": 420
},
{
"epoch": 1.0685279187817258,
"grad_norm": 2.671286895370636,
"learning_rate": 7.4643046137952135e-06,
"loss": 0.7255,
"step": 421
},
{
"epoch": 1.0710659898477157,
"grad_norm": 4.777736320375133,
"learning_rate": 7.452368993758646e-06,
"loss": 0.6922,
"step": 422
},
{
"epoch": 1.0736040609137056,
"grad_norm": 2.839467923831252,
"learning_rate": 7.440414944115909e-06,
"loss": 0.708,
"step": 423
},
{
"epoch": 1.0761421319796955,
"grad_norm": 4.561346498567163,
"learning_rate": 7.428442554701945e-06,
"loss": 0.7903,
"step": 424
},
{
"epoch": 1.0786802030456852,
"grad_norm": 2.1485750563974624,
"learning_rate": 7.416451915489521e-06,
"loss": 0.7441,
"step": 425
},
{
"epoch": 1.0812182741116751,
"grad_norm": 3.8074935327749015,
"learning_rate": 7.404443116588548e-06,
"loss": 0.7575,
"step": 426
},
{
"epoch": 1.083756345177665,
"grad_norm": 2.4065976714180293,
"learning_rate": 7.392416248245412e-06,
"loss": 0.7296,
"step": 427
},
{
"epoch": 1.0862944162436547,
"grad_norm": 4.492198367779481,
"learning_rate": 7.38037140084229e-06,
"loss": 0.7928,
"step": 428
},
{
"epoch": 1.0888324873096447,
"grad_norm": 3.2332302381426707,
"learning_rate": 7.368308664896471e-06,
"loss": 0.7579,
"step": 429
},
{
"epoch": 1.0913705583756346,
"grad_norm": 6.182125255230219,
"learning_rate": 7.356228131059675e-06,
"loss": 0.6711,
"step": 430
},
{
"epoch": 1.0939086294416245,
"grad_norm": 2.73756664858534,
"learning_rate": 7.344129890117377e-06,
"loss": 0.7056,
"step": 431
},
{
"epoch": 1.0964467005076142,
"grad_norm": 2.1816031611580367,
"learning_rate": 7.332014032988123e-06,
"loss": 0.8818,
"step": 432
},
{
"epoch": 1.098984771573604,
"grad_norm": 2.1402473991159288,
"learning_rate": 7.319880650722838e-06,
"loss": 0.7043,
"step": 433
},
{
"epoch": 1.101522842639594,
"grad_norm": 4.3141992706587535,
"learning_rate": 7.307729834504155e-06,
"loss": 0.672,
"step": 434
},
{
"epoch": 1.1040609137055837,
"grad_norm": 2.5718559492040494,
"learning_rate": 7.29556167564572e-06,
"loss": 0.7288,
"step": 435
},
{
"epoch": 1.1065989847715736,
"grad_norm": 2.8145101788480473,
"learning_rate": 7.283376265591514e-06,
"loss": 0.7154,
"step": 436
},
{
"epoch": 1.1091370558375635,
"grad_norm": 3.3920536101786327,
"learning_rate": 7.271173695915154e-06,
"loss": 0.6787,
"step": 437
},
{
"epoch": 1.1116751269035534,
"grad_norm": 3.5945284430541182,
"learning_rate": 7.2589540583192165e-06,
"loss": 0.9233,
"step": 438
},
{
"epoch": 1.114213197969543,
"grad_norm": 4.886400955382691,
"learning_rate": 7.2467174446345435e-06,
"loss": 0.6844,
"step": 439
},
{
"epoch": 1.116751269035533,
"grad_norm": 4.600149769508583,
"learning_rate": 7.234463946819553e-06,
"loss": 0.7041,
"step": 440
},
{
"epoch": 1.119289340101523,
"grad_norm": 4.014196576676073,
"learning_rate": 7.222193656959546e-06,
"loss": 0.6791,
"step": 441
},
{
"epoch": 1.1218274111675126,
"grad_norm": 2.5402480546901773,
"learning_rate": 7.209906667266018e-06,
"loss": 0.9044,
"step": 442
},
{
"epoch": 1.1243654822335025,
"grad_norm": 3.1380754969776885,
"learning_rate": 7.19760307007596e-06,
"loss": 0.743,
"step": 443
},
{
"epoch": 1.1269035532994924,
"grad_norm": 4.388883522170381,
"learning_rate": 7.185282957851175e-06,
"loss": 0.7518,
"step": 444
},
{
"epoch": 1.1294416243654823,
"grad_norm": 2.3881650297375043,
"learning_rate": 7.172946423177574e-06,
"loss": 0.7223,
"step": 445
},
{
"epoch": 1.131979695431472,
"grad_norm": 5.0576028219200575,
"learning_rate": 7.160593558764477e-06,
"loss": 0.7502,
"step": 446
},
{
"epoch": 1.134517766497462,
"grad_norm": 4.182627482293584,
"learning_rate": 7.148224457443933e-06,
"loss": 0.7233,
"step": 447
},
{
"epoch": 1.1370558375634519,
"grad_norm": 3.323927436904356,
"learning_rate": 7.135839212170008e-06,
"loss": 0.7562,
"step": 448
},
{
"epoch": 1.1395939086294415,
"grad_norm": 5.674572111954262,
"learning_rate": 7.123437916018084e-06,
"loss": 0.6563,
"step": 449
},
{
"epoch": 1.1421319796954315,
"grad_norm": 5.193844514646635,
"learning_rate": 7.111020662184174e-06,
"loss": 0.6701,
"step": 450
},
{
"epoch": 1.1446700507614214,
"grad_norm": 2.745047805120342,
"learning_rate": 7.098587543984208e-06,
"loss": 0.6504,
"step": 451
},
{
"epoch": 1.1472081218274113,
"grad_norm": 5.2424783860025475,
"learning_rate": 7.086138654853339e-06,
"loss": 0.7568,
"step": 452
},
{
"epoch": 1.149746192893401,
"grad_norm": 2.525028908338044,
"learning_rate": 7.073674088345239e-06,
"loss": 0.7756,
"step": 453
},
{
"epoch": 1.1522842639593909,
"grad_norm": 3.94906824829304,
"learning_rate": 7.061193938131397e-06,
"loss": 0.7004,
"step": 454
},
{
"epoch": 1.1548223350253808,
"grad_norm": 3.9522980746979632,
"learning_rate": 7.048698298000411e-06,
"loss": 0.8008,
"step": 455
},
{
"epoch": 1.1573604060913705,
"grad_norm": 3.3433822313720345,
"learning_rate": 7.036187261857289e-06,
"loss": 0.7565,
"step": 456
},
{
"epoch": 1.1598984771573604,
"grad_norm": 2.786840629093061,
"learning_rate": 7.023660923722737e-06,
"loss": 0.7161,
"step": 457
},
{
"epoch": 1.1624365482233503,
"grad_norm": 3.054413975453938,
"learning_rate": 7.011119377732459e-06,
"loss": 0.7952,
"step": 458
},
{
"epoch": 1.1649746192893402,
"grad_norm": 3.8923977302485726,
"learning_rate": 6.998562718136445e-06,
"loss": 0.7485,
"step": 459
},
{
"epoch": 1.16751269035533,
"grad_norm": 2.84562510967333,
"learning_rate": 6.985991039298263e-06,
"loss": 0.7196,
"step": 460
},
{
"epoch": 1.1700507614213198,
"grad_norm": 2.4183432989182294,
"learning_rate": 6.973404435694353e-06,
"loss": 0.7646,
"step": 461
},
{
"epoch": 1.1725888324873097,
"grad_norm": 2.7200567540717877,
"learning_rate": 6.960803001913315e-06,
"loss": 0.7,
"step": 462
},
{
"epoch": 1.1751269035532994,
"grad_norm": 3.63959643248371,
"learning_rate": 6.948186832655195e-06,
"loss": 0.7266,
"step": 463
},
{
"epoch": 1.1776649746192893,
"grad_norm": 3.2936479977582365,
"learning_rate": 6.93555602273078e-06,
"loss": 0.7935,
"step": 464
},
{
"epoch": 1.1802030456852792,
"grad_norm": 5.592071642497226,
"learning_rate": 6.922910667060881e-06,
"loss": 0.7863,
"step": 465
},
{
"epoch": 1.1827411167512691,
"grad_norm": 4.04842721540907,
"learning_rate": 6.910250860675618e-06,
"loss": 0.7015,
"step": 466
},
{
"epoch": 1.1852791878172588,
"grad_norm": 3.4995823546724885,
"learning_rate": 6.897576698713713e-06,
"loss": 0.713,
"step": 467
},
{
"epoch": 1.1878172588832487,
"grad_norm": 2.0124542793260116,
"learning_rate": 6.884888276421766e-06,
"loss": 0.7089,
"step": 468
},
{
"epoch": 1.1903553299492386,
"grad_norm": 3.0245832112482685,
"learning_rate": 6.872185689153548e-06,
"loss": 0.7502,
"step": 469
},
{
"epoch": 1.1928934010152283,
"grad_norm": 3.1968776773224103,
"learning_rate": 6.859469032369275e-06,
"loss": 0.6792,
"step": 470
},
{
"epoch": 1.1954314720812182,
"grad_norm": 2.599351071193472,
"learning_rate": 6.846738401634899e-06,
"loss": 0.7182,
"step": 471
},
{
"epoch": 1.1979695431472082,
"grad_norm": 3.8378544182683254,
"learning_rate": 6.833993892621388e-06,
"loss": 0.6645,
"step": 472
},
{
"epoch": 1.200507614213198,
"grad_norm": 3.6586818009882616,
"learning_rate": 6.821235601104001e-06,
"loss": 0.7123,
"step": 473
},
{
"epoch": 1.2030456852791878,
"grad_norm": 2.495122167223983,
"learning_rate": 6.8084636229615786e-06,
"loss": 0.822,
"step": 474
},
{
"epoch": 1.2055837563451777,
"grad_norm": 2.311020743189909,
"learning_rate": 6.795678054175811e-06,
"loss": 0.7897,
"step": 475
},
{
"epoch": 1.2081218274111676,
"grad_norm": 3.3626364340836856,
"learning_rate": 6.782878990830527e-06,
"loss": 0.6936,
"step": 476
},
{
"epoch": 1.2106598984771573,
"grad_norm": 5.169275561140567,
"learning_rate": 6.770066529110964e-06,
"loss": 0.6622,
"step": 477
},
{
"epoch": 1.2131979695431472,
"grad_norm": 3.078776628999445,
"learning_rate": 6.757240765303047e-06,
"loss": 0.7513,
"step": 478
},
{
"epoch": 1.215736040609137,
"grad_norm": 5.617176822012553,
"learning_rate": 6.744401795792673e-06,
"loss": 0.7513,
"step": 479
},
{
"epoch": 1.218274111675127,
"grad_norm": 7.732644354996138,
"learning_rate": 6.731549717064975e-06,
"loss": 0.8617,
"step": 480
},
{
"epoch": 1.2208121827411167,
"grad_norm": 2.4117427336853696,
"learning_rate": 6.718684625703603e-06,
"loss": 0.7432,
"step": 481
},
{
"epoch": 1.2233502538071066,
"grad_norm": 1.8400980254586687,
"learning_rate": 6.705806618389998e-06,
"loss": 0.8022,
"step": 482
},
{
"epoch": 1.2258883248730965,
"grad_norm": 3.6097931824757032,
"learning_rate": 6.6929157919026645e-06,
"loss": 0.9118,
"step": 483
},
{
"epoch": 1.2284263959390862,
"grad_norm": 4.540403777710814,
"learning_rate": 6.6800122431164425e-06,
"loss": 0.8484,
"step": 484
},
{
"epoch": 1.2309644670050761,
"grad_norm": 5.750951180784241,
"learning_rate": 6.6670960690017814e-06,
"loss": 0.6695,
"step": 485
},
{
"epoch": 1.233502538071066,
"grad_norm": 3.3909591035556366,
"learning_rate": 6.654167366624009e-06,
"loss": 0.7365,
"step": 486
},
{
"epoch": 1.236040609137056,
"grad_norm": 4.588355673915564,
"learning_rate": 6.641226233142605e-06,
"loss": 0.7533,
"step": 487
},
{
"epoch": 1.2385786802030456,
"grad_norm": 6.6404536496522795,
"learning_rate": 6.628272765810468e-06,
"loss": 0.6466,
"step": 488
},
{
"epoch": 1.2411167512690355,
"grad_norm": 3.434312988744615,
"learning_rate": 6.615307061973185e-06,
"loss": 0.6203,
"step": 489
},
{
"epoch": 1.2436548223350254,
"grad_norm": 3.0930958381981815,
"learning_rate": 6.602329219068302e-06,
"loss": 0.7669,
"step": 490
},
{
"epoch": 1.2461928934010151,
"grad_norm": 2.8869650975031416,
"learning_rate": 6.5893393346245906e-06,
"loss": 0.7633,
"step": 491
},
{
"epoch": 1.248730964467005,
"grad_norm": 3.170504311720801,
"learning_rate": 6.576337506261314e-06,
"loss": 0.6953,
"step": 492
},
{
"epoch": 1.251269035532995,
"grad_norm": 2.780063933956294,
"learning_rate": 6.563323831687493e-06,
"loss": 0.709,
"step": 493
},
{
"epoch": 1.2538071065989849,
"grad_norm": 2.555837538242703,
"learning_rate": 6.550298408701175e-06,
"loss": 0.7809,
"step": 494
},
{
"epoch": 1.2563451776649746,
"grad_norm": 4.0633641977824855,
"learning_rate": 6.537261335188696e-06,
"loss": 0.6886,
"step": 495
},
{
"epoch": 1.2588832487309645,
"grad_norm": 4.623003200579342,
"learning_rate": 6.524212709123947e-06,
"loss": 0.7008,
"step": 496
},
{
"epoch": 1.2614213197969544,
"grad_norm": 3.5939963505107606,
"learning_rate": 6.511152628567635e-06,
"loss": 0.6717,
"step": 497
},
{
"epoch": 1.263959390862944,
"grad_norm": 2.34961246443107,
"learning_rate": 6.498081191666549e-06,
"loss": 0.6651,
"step": 498
},
{
"epoch": 1.266497461928934,
"grad_norm": 4.8848252156015395,
"learning_rate": 6.48499849665282e-06,
"loss": 0.7782,
"step": 499
},
{
"epoch": 1.2690355329949239,
"grad_norm": 2.747548072805645,
"learning_rate": 6.471904641843187e-06,
"loss": 0.7487,
"step": 500
},
{
"epoch": 1.2715736040609138,
"grad_norm": 2.8146283218783896,
"learning_rate": 6.458799725638249e-06,
"loss": 0.7939,
"step": 501
},
{
"epoch": 1.2741116751269035,
"grad_norm": 2.6115782469222437,
"learning_rate": 6.4456838465217384e-06,
"loss": 0.7964,
"step": 502
},
{
"epoch": 1.2766497461928934,
"grad_norm": 4.382058548644622,
"learning_rate": 6.432557103059771e-06,
"loss": 0.6758,
"step": 503
},
{
"epoch": 1.2791878172588833,
"grad_norm": 4.2761528963141044,
"learning_rate": 6.419419593900109e-06,
"loss": 0.6878,
"step": 504
},
{
"epoch": 1.281725888324873,
"grad_norm": 2.5161012535059433,
"learning_rate": 6.4062714177714166e-06,
"loss": 0.8262,
"step": 505
},
{
"epoch": 1.284263959390863,
"grad_norm": 3.4501823268959675,
"learning_rate": 6.393112673482522e-06,
"loss": 0.7008,
"step": 506
},
{
"epoch": 1.2868020304568528,
"grad_norm": 3.945300277429004,
"learning_rate": 6.379943459921677e-06,
"loss": 0.6499,
"step": 507
},
{
"epoch": 1.2893401015228427,
"grad_norm": 4.06451496648234,
"learning_rate": 6.3667638760558055e-06,
"loss": 0.7884,
"step": 508
},
{
"epoch": 1.2918781725888324,
"grad_norm": 2.7617149802354874,
"learning_rate": 6.353574020929767e-06,
"loss": 0.735,
"step": 509
},
{
"epoch": 1.2944162436548223,
"grad_norm": 3.167960813091416,
"learning_rate": 6.340373993665607e-06,
"loss": 0.9423,
"step": 510
},
{
"epoch": 1.2969543147208122,
"grad_norm": 2.8725417311483246,
"learning_rate": 6.327163893461819e-06,
"loss": 0.6771,
"step": 511
},
{
"epoch": 1.299492385786802,
"grad_norm": 3.9287674174064353,
"learning_rate": 6.31394381959259e-06,
"loss": 0.862,
"step": 512
},
{
"epoch": 1.3020304568527918,
"grad_norm": 3.0972388669258266,
"learning_rate": 6.300713871407062e-06,
"loss": 0.6995,
"step": 513
},
{
"epoch": 1.3045685279187818,
"grad_norm": 2.431302901450318,
"learning_rate": 6.287474148328584e-06,
"loss": 0.6403,
"step": 514
},
{
"epoch": 1.3071065989847717,
"grad_norm": 4.801341289977943,
"learning_rate": 6.274224749853961e-06,
"loss": 0.7267,
"step": 515
},
{
"epoch": 1.3096446700507614,
"grad_norm": 3.1112707577132324,
"learning_rate": 6.2609657755527135e-06,
"loss": 0.7341,
"step": 516
},
{
"epoch": 1.3121827411167513,
"grad_norm": 4.050638104301098,
"learning_rate": 6.247697325066314e-06,
"loss": 0.9268,
"step": 517
},
{
"epoch": 1.3147208121827412,
"grad_norm": 4.358249942985889,
"learning_rate": 6.2344194981074616e-06,
"loss": 0.8526,
"step": 518
},
{
"epoch": 1.3172588832487309,
"grad_norm": 2.606425333009026,
"learning_rate": 6.22113239445931e-06,
"loss": 0.7707,
"step": 519
},
{
"epoch": 1.3197969543147208,
"grad_norm": 2.16185838449471,
"learning_rate": 6.2078361139747334e-06,
"loss": 0.756,
"step": 520
},
{
"epoch": 1.3223350253807107,
"grad_norm": 11.438171951626867,
"learning_rate": 6.194530756575567e-06,
"loss": 0.8001,
"step": 521
},
{
"epoch": 1.3248730964467006,
"grad_norm": 4.268134675954473,
"learning_rate": 6.1812164222518626e-06,
"loss": 0.5958,
"step": 522
},
{
"epoch": 1.3274111675126903,
"grad_norm": 4.015099725456588,
"learning_rate": 6.167893211061128e-06,
"loss": 0.8645,
"step": 523
},
{
"epoch": 1.3299492385786802,
"grad_norm": 7.084231421312322,
"learning_rate": 6.154561223127587e-06,
"loss": 0.7082,
"step": 524
},
{
"epoch": 1.33248730964467,
"grad_norm": 3.8215913212518773,
"learning_rate": 6.141220558641416e-06,
"loss": 0.6995,
"step": 525
},
{
"epoch": 1.3350253807106598,
"grad_norm": 3.038519186584797,
"learning_rate": 6.127871317857996e-06,
"loss": 0.6656,
"step": 526
},
{
"epoch": 1.3375634517766497,
"grad_norm": 3.8116749862373958,
"learning_rate": 6.114513601097165e-06,
"loss": 0.8154,
"step": 527
},
{
"epoch": 1.3401015228426396,
"grad_norm": 4.817233367550986,
"learning_rate": 6.101147508742456e-06,
"loss": 0.7358,
"step": 528
},
{
"epoch": 1.3426395939086295,
"grad_norm": 3.646219613432323,
"learning_rate": 6.0877731412403365e-06,
"loss": 0.7699,
"step": 529
},
{
"epoch": 1.3451776649746192,
"grad_norm": 6.120745299164355,
"learning_rate": 6.0743905990994714e-06,
"loss": 0.8005,
"step": 530
},
{
"epoch": 1.3477157360406091,
"grad_norm": 3.0596765875812904,
"learning_rate": 6.060999982889955e-06,
"loss": 0.744,
"step": 531
},
{
"epoch": 1.350253807106599,
"grad_norm": 2.792466503447904,
"learning_rate": 6.04760139324256e-06,
"loss": 0.7307,
"step": 532
},
{
"epoch": 1.3527918781725887,
"grad_norm": 5.179116357662163,
"learning_rate": 6.0341949308479755e-06,
"loss": 0.6799,
"step": 533
},
{
"epoch": 1.3553299492385786,
"grad_norm": 3.822692771814533,
"learning_rate": 6.020780696456059e-06,
"loss": 0.7327,
"step": 534
},
{
"epoch": 1.3578680203045685,
"grad_norm": 2.3276793880324753,
"learning_rate": 6.0073587908750715e-06,
"loss": 0.7131,
"step": 535
},
{
"epoch": 1.3604060913705585,
"grad_norm": 5.9622605348086175,
"learning_rate": 5.9939293149709265e-06,
"loss": 0.8849,
"step": 536
},
{
"epoch": 1.3629441624365481,
"grad_norm": 4.449818782204746,
"learning_rate": 5.9804923696664255e-06,
"loss": 0.7274,
"step": 537
},
{
"epoch": 1.365482233502538,
"grad_norm": 10.86624898759871,
"learning_rate": 5.967048055940503e-06,
"loss": 0.7212,
"step": 538
},
{
"epoch": 1.368020304568528,
"grad_norm": 3.145159036375936,
"learning_rate": 5.953596474827469e-06,
"loss": 0.7319,
"step": 539
},
{
"epoch": 1.3705583756345177,
"grad_norm": 2.3980927107396197,
"learning_rate": 5.940137727416247e-06,
"loss": 0.6897,
"step": 540
},
{
"epoch": 1.3730964467005076,
"grad_norm": 3.418541376783986,
"learning_rate": 5.9266719148496155e-06,
"loss": 0.7733,
"step": 541
},
{
"epoch": 1.3756345177664975,
"grad_norm": 2.8126841157843057,
"learning_rate": 5.9131991383234485e-06,
"loss": 0.6699,
"step": 542
},
{
"epoch": 1.3781725888324874,
"grad_norm": 3.0318559748788534,
"learning_rate": 5.8997194990859545e-06,
"loss": 0.6653,
"step": 543
},
{
"epoch": 1.380710659898477,
"grad_norm": 3.5210511267819067,
"learning_rate": 5.886233098436914e-06,
"loss": 0.7593,
"step": 544
},
{
"epoch": 1.383248730964467,
"grad_norm": 4.2948950638851455,
"learning_rate": 5.872740037726919e-06,
"loss": 0.699,
"step": 545
},
{
"epoch": 1.385786802030457,
"grad_norm": 2.6028118744729007,
"learning_rate": 5.859240418356614e-06,
"loss": 0.7695,
"step": 546
},
{
"epoch": 1.3883248730964466,
"grad_norm": 4.782785847123683,
"learning_rate": 5.845734341775933e-06,
"loss": 0.8879,
"step": 547
},
{
"epoch": 1.3908629441624365,
"grad_norm": 2.6172139666414727,
"learning_rate": 5.832221909483334e-06,
"loss": 0.7758,
"step": 548
},
{
"epoch": 1.3934010152284264,
"grad_norm": 4.763054993480605,
"learning_rate": 5.818703223025036e-06,
"loss": 0.6957,
"step": 549
},
{
"epoch": 1.3959390862944163,
"grad_norm": 3.1414376067249923,
"learning_rate": 5.805178383994264e-06,
"loss": 0.7213,
"step": 550
},
{
"epoch": 1.398477157360406,
"grad_norm": 3.130650396595069,
"learning_rate": 5.791647494030475e-06,
"loss": 0.6065,
"step": 551
},
{
"epoch": 1.401015228426396,
"grad_norm": 3.261563931648429,
"learning_rate": 5.778110654818602e-06,
"loss": 0.7958,
"step": 552
},
{
"epoch": 1.4035532994923858,
"grad_norm": 3.8311803433345433,
"learning_rate": 5.764567968088282e-06,
"loss": 0.5946,
"step": 553
},
{
"epoch": 1.4060913705583755,
"grad_norm": 2.743364529127526,
"learning_rate": 5.751019535613103e-06,
"loss": 0.7435,
"step": 554
},
{
"epoch": 1.4086294416243654,
"grad_norm": 4.489529309087514,
"learning_rate": 5.737465459209825e-06,
"loss": 0.78,
"step": 555
},
{
"epoch": 1.4111675126903553,
"grad_norm": 3.7301506721368947,
"learning_rate": 5.723905840737632e-06,
"loss": 0.6712,
"step": 556
},
{
"epoch": 1.4137055837563453,
"grad_norm": 4.575944751601415,
"learning_rate": 5.710340782097347e-06,
"loss": 0.7624,
"step": 557
},
{
"epoch": 1.4162436548223352,
"grad_norm": 3.1435876497511863,
"learning_rate": 5.696770385230679e-06,
"loss": 0.8174,
"step": 558
},
{
"epoch": 1.4187817258883249,
"grad_norm": 2.431171340737915,
"learning_rate": 5.683194752119457e-06,
"loss": 0.7116,
"step": 559
},
{
"epoch": 1.4213197969543148,
"grad_norm": 4.370602698661378,
"learning_rate": 5.6696139847848554e-06,
"loss": 0.777,
"step": 560
},
{
"epoch": 1.4238578680203045,
"grad_norm": 4.718481956713903,
"learning_rate": 5.656028185286638e-06,
"loss": 0.6899,
"step": 561
},
{
"epoch": 1.4263959390862944,
"grad_norm": 2.821013688228599,
"learning_rate": 5.6424374557223815e-06,
"loss": 0.8273,
"step": 562
},
{
"epoch": 1.4289340101522843,
"grad_norm": 2.883810214582895,
"learning_rate": 5.628841898226715e-06,
"loss": 0.73,
"step": 563
},
{
"epoch": 1.4314720812182742,
"grad_norm": 2.2292511015864727,
"learning_rate": 5.615241614970546e-06,
"loss": 0.7264,
"step": 564
},
{
"epoch": 1.434010152284264,
"grad_norm": 2.820445001892862,
"learning_rate": 5.601636708160297e-06,
"loss": 0.7463,
"step": 565
},
{
"epoch": 1.4365482233502538,
"grad_norm": 5.707035941059044,
"learning_rate": 5.588027280037139e-06,
"loss": 0.8703,
"step": 566
},
{
"epoch": 1.4390862944162437,
"grad_norm": 6.244765987803611,
"learning_rate": 5.5744134328762225e-06,
"loss": 0.8395,
"step": 567
},
{
"epoch": 1.4416243654822334,
"grad_norm": 5.279336772760707,
"learning_rate": 5.560795268985899e-06,
"loss": 0.5548,
"step": 568
},
{
"epoch": 1.4441624365482233,
"grad_norm": 2.4337670623677936,
"learning_rate": 5.547172890706969e-06,
"loss": 0.81,
"step": 569
},
{
"epoch": 1.4467005076142132,
"grad_norm": 2.559001907523824,
"learning_rate": 5.533546400411899e-06,
"loss": 0.6723,
"step": 570
},
{
"epoch": 1.4492385786802031,
"grad_norm": 2.095739075458118,
"learning_rate": 5.519915900504059e-06,
"loss": 0.7547,
"step": 571
},
{
"epoch": 1.451776649746193,
"grad_norm": 2.7618676186626105,
"learning_rate": 5.506281493416954e-06,
"loss": 0.7759,
"step": 572
},
{
"epoch": 1.4543147208121827,
"grad_norm": 2.4429284260023025,
"learning_rate": 5.492643281613444e-06,
"loss": 0.7779,
"step": 573
},
{
"epoch": 1.4568527918781726,
"grad_norm": 2.248761204996824,
"learning_rate": 5.4790013675849906e-06,
"loss": 0.7139,
"step": 574
},
{
"epoch": 1.4593908629441623,
"grad_norm": 2.04849421734439,
"learning_rate": 5.465355853850873e-06,
"loss": 0.7967,
"step": 575
},
{
"epoch": 1.4619289340101522,
"grad_norm": 2.3149667017145474,
"learning_rate": 5.4517068429574215e-06,
"loss": 0.7546,
"step": 576
},
{
"epoch": 1.4644670050761421,
"grad_norm": 2.7279348376505372,
"learning_rate": 5.438054437477249e-06,
"loss": 0.7709,
"step": 577
},
{
"epoch": 1.467005076142132,
"grad_norm": 4.10133980464364,
"learning_rate": 5.424398740008481e-06,
"loss": 0.7447,
"step": 578
},
{
"epoch": 1.469543147208122,
"grad_norm": 3.0670495095077,
"learning_rate": 5.4107398531739765e-06,
"loss": 0.6942,
"step": 579
},
{
"epoch": 1.4720812182741116,
"grad_norm": 2.628033659732392,
"learning_rate": 5.397077879620569e-06,
"loss": 0.7497,
"step": 580
},
{
"epoch": 1.4746192893401016,
"grad_norm": 2.415953001245352,
"learning_rate": 5.383412922018285e-06,
"loss": 0.6987,
"step": 581
},
{
"epoch": 1.4771573604060912,
"grad_norm": 3.0726342117302208,
"learning_rate": 5.3697450830595775e-06,
"loss": 0.7094,
"step": 582
},
{
"epoch": 1.4796954314720812,
"grad_norm": 8.276976756150773,
"learning_rate": 5.356074465458553e-06,
"loss": 0.649,
"step": 583
},
{
"epoch": 1.482233502538071,
"grad_norm": 9.68509977234341,
"learning_rate": 5.3424011719502e-06,
"loss": 0.726,
"step": 584
},
{
"epoch": 1.484771573604061,
"grad_norm": 9.749912315585314,
"learning_rate": 5.3287253052896125e-06,
"loss": 0.6428,
"step": 585
},
{
"epoch": 1.487309644670051,
"grad_norm": 6.96335200544029,
"learning_rate": 5.3150469682512275e-06,
"loss": 0.7821,
"step": 586
},
{
"epoch": 1.4898477157360406,
"grad_norm": 7.183939152524903,
"learning_rate": 5.301366263628045e-06,
"loss": 0.6567,
"step": 587
},
{
"epoch": 1.4923857868020305,
"grad_norm": 9.732377228662223,
"learning_rate": 5.287683294230855e-06,
"loss": 0.8965,
"step": 588
},
{
"epoch": 1.4949238578680202,
"grad_norm": 7.097575405762116,
"learning_rate": 5.273998162887472e-06,
"loss": 0.6165,
"step": 589
},
{
"epoch": 1.49746192893401,
"grad_norm": 10.336609092325649,
"learning_rate": 5.260310972441951e-06,
"loss": 0.6666,
"step": 590
},
{
"epoch": 1.5,
"grad_norm": 10.173431901856254,
"learning_rate": 5.246621825753827e-06,
"loss": 0.7356,
"step": 591
},
{
"epoch": 1.50253807106599,
"grad_norm": 7.252928905504994,
"learning_rate": 5.232930825697337e-06,
"loss": 0.6349,
"step": 592
},
{
"epoch": 1.5050761421319798,
"grad_norm": 9.740020421735906,
"learning_rate": 5.2192380751606365e-06,
"loss": 0.7093,
"step": 593
},
{
"epoch": 1.5076142131979695,
"grad_norm": 6.881744795837598,
"learning_rate": 5.20554367704505e-06,
"loss": 0.7547,
"step": 594
},
{
"epoch": 1.5101522842639594,
"grad_norm": 7.403862646753793,
"learning_rate": 5.191847734264272e-06,
"loss": 0.7827,
"step": 595
},
{
"epoch": 1.512690355329949,
"grad_norm": 7.857449479117419,
"learning_rate": 5.178150349743611e-06,
"loss": 0.7318,
"step": 596
},
{
"epoch": 1.515228426395939,
"grad_norm": 7.8138561408079985,
"learning_rate": 5.1644516264192075e-06,
"loss": 0.7887,
"step": 597
},
{
"epoch": 1.517766497461929,
"grad_norm": 11.529490066787323,
"learning_rate": 5.150751667237266e-06,
"loss": 0.7767,
"step": 598
},
{
"epoch": 1.5203045685279188,
"grad_norm": 8.291871868386467,
"learning_rate": 5.137050575153276e-06,
"loss": 0.7236,
"step": 599
},
{
"epoch": 1.5228426395939088,
"grad_norm": 13.386290031605322,
"learning_rate": 5.123348453131242e-06,
"loss": 0.7494,
"step": 600
},
{
"epoch": 1.5253807106598984,
"grad_norm": 10.501239515721437,
"learning_rate": 5.1096454041429064e-06,
"loss": 0.7183,
"step": 601
},
{
"epoch": 1.5279187817258884,
"grad_norm": 8.400955073843619,
"learning_rate": 5.095941531166982e-06,
"loss": 0.7093,
"step": 602
},
{
"epoch": 1.530456852791878,
"grad_norm": 8.114490333939267,
"learning_rate": 5.08223693718837e-06,
"loss": 0.7364,
"step": 603
},
{
"epoch": 1.532994923857868,
"grad_norm": 6.9837214192909505,
"learning_rate": 5.068531725197393e-06,
"loss": 0.7058,
"step": 604
},
{
"epoch": 1.5355329949238579,
"grad_norm": 10.522759207505343,
"learning_rate": 5.054825998189014e-06,
"loss": 0.6368,
"step": 605
},
{
"epoch": 1.5380710659898478,
"grad_norm": 6.270463565331074,
"learning_rate": 5.041119859162068e-06,
"loss": 0.6091,
"step": 606
},
{
"epoch": 1.5406091370558377,
"grad_norm": 6.726382391943539,
"learning_rate": 5.027413411118491e-06,
"loss": 0.6602,
"step": 607
},
{
"epoch": 1.5431472081218274,
"grad_norm": 6.317567266105698,
"learning_rate": 5.0137067570625345e-06,
"loss": 0.6634,
"step": 608
},
{
"epoch": 1.5456852791878173,
"grad_norm": 6.934687119093674,
"learning_rate": 5e-06,
"loss": 0.7209,
"step": 609
},
{
"epoch": 1.548223350253807,
"grad_norm": 5.596856448268176,
"learning_rate": 4.986293242937467e-06,
"loss": 0.7366,
"step": 610
},
{
"epoch": 1.5507614213197969,
"grad_norm": 6.021457219503407,
"learning_rate": 4.97258658888151e-06,
"loss": 0.7069,
"step": 611
},
{
"epoch": 1.5532994923857868,
"grad_norm": 4.6400991890100185,
"learning_rate": 4.958880140837934e-06,
"loss": 0.7575,
"step": 612
},
{
"epoch": 1.5558375634517767,
"grad_norm": 6.820325061206459,
"learning_rate": 4.945174001810989e-06,
"loss": 0.7669,
"step": 613
},
{
"epoch": 1.5583756345177666,
"grad_norm": 4.5981508300331395,
"learning_rate": 4.9314682748026095e-06,
"loss": 0.7572,
"step": 614
},
{
"epoch": 1.5609137055837563,
"grad_norm": 3.681334801007928,
"learning_rate": 4.917763062811631e-06,
"loss": 0.7417,
"step": 615
},
{
"epoch": 1.5634517766497462,
"grad_norm": 8.250248361704466,
"learning_rate": 4.904058468833019e-06,
"loss": 0.8454,
"step": 616
},
{
"epoch": 1.565989847715736,
"grad_norm": 9.8095466260306,
"learning_rate": 4.8903545958570935e-06,
"loss": 0.7939,
"step": 617
},
{
"epoch": 1.5685279187817258,
"grad_norm": 8.299972020814456,
"learning_rate": 4.876651546868759e-06,
"loss": 0.7407,
"step": 618
},
{
"epoch": 1.5710659898477157,
"grad_norm": 7.39524807040033,
"learning_rate": 4.862949424846726e-06,
"loss": 0.7403,
"step": 619
},
{
"epoch": 1.5736040609137056,
"grad_norm": 7.02310739787436,
"learning_rate": 4.849248332762735e-06,
"loss": 0.8179,
"step": 620
},
{
"epoch": 1.5761421319796955,
"grad_norm": 7.940559854591136,
"learning_rate": 4.835548373580793e-06,
"loss": 0.6895,
"step": 621
},
{
"epoch": 1.5786802030456852,
"grad_norm": 4.8572557004474675,
"learning_rate": 4.8218496502563906e-06,
"loss": 0.6896,
"step": 622
},
{
"epoch": 1.5812182741116751,
"grad_norm": 6.306241421914032,
"learning_rate": 4.808152265735729e-06,
"loss": 0.7573,
"step": 623
},
{
"epoch": 1.5837563451776648,
"grad_norm": 6.9262999066348,
"learning_rate": 4.794456322954953e-06,
"loss": 0.7389,
"step": 624
},
{
"epoch": 1.5862944162436547,
"grad_norm": 6.812308267341822,
"learning_rate": 4.780761924839365e-06,
"loss": 0.6992,
"step": 625
},
{
"epoch": 1.5888324873096447,
"grad_norm": 9.421565668117234,
"learning_rate": 4.767069174302667e-06,
"loss": 0.7858,
"step": 626
},
{
"epoch": 1.5913705583756346,
"grad_norm": 8.64577548883685,
"learning_rate": 4.753378174246174e-06,
"loss": 0.8545,
"step": 627
},
{
"epoch": 1.5939086294416245,
"grad_norm": 11.328643712467004,
"learning_rate": 4.739689027558052e-06,
"loss": 0.7147,
"step": 628
},
{
"epoch": 1.5964467005076142,
"grad_norm": 7.4543053008075235,
"learning_rate": 4.726001837112529e-06,
"loss": 0.7868,
"step": 629
},
{
"epoch": 1.598984771573604,
"grad_norm": 7.259409943232553,
"learning_rate": 4.7123167057691446e-06,
"loss": 0.7483,
"step": 630
},
{
"epoch": 1.6015228426395938,
"grad_norm": 5.847738992367483,
"learning_rate": 4.6986337363719565e-06,
"loss": 0.655,
"step": 631
},
{
"epoch": 1.6040609137055837,
"grad_norm": 7.737551812189505,
"learning_rate": 4.684953031748773e-06,
"loss": 0.6705,
"step": 632
},
{
"epoch": 1.6065989847715736,
"grad_norm": 10.181314318265251,
"learning_rate": 4.671274694710388e-06,
"loss": 0.771,
"step": 633
},
{
"epoch": 1.6091370558375635,
"grad_norm": 7.794905487902131,
"learning_rate": 4.657598828049801e-06,
"loss": 0.6708,
"step": 634
},
{
"epoch": 1.6116751269035534,
"grad_norm": 8.145058809261956,
"learning_rate": 4.643925534541448e-06,
"loss": 0.7587,
"step": 635
},
{
"epoch": 1.614213197969543,
"grad_norm": 8.922528793798906,
"learning_rate": 4.630254916940424e-06,
"loss": 0.7323,
"step": 636
},
{
"epoch": 1.616751269035533,
"grad_norm": 7.00326933899544,
"learning_rate": 4.616587077981716e-06,
"loss": 0.8153,
"step": 637
},
{
"epoch": 1.6192893401015227,
"grad_norm": 7.999681110308186,
"learning_rate": 4.602922120379432e-06,
"loss": 0.7417,
"step": 638
},
{
"epoch": 1.6218274111675126,
"grad_norm": 8.438392955386595,
"learning_rate": 4.589260146826025e-06,
"loss": 0.7151,
"step": 639
},
{
"epoch": 1.6243654822335025,
"grad_norm": 7.155521862678608,
"learning_rate": 4.575601259991523e-06,
"loss": 0.6388,
"step": 640
},
{
"epoch": 1.6269035532994924,
"grad_norm": 9.059179058520689,
"learning_rate": 4.561945562522753e-06,
"loss": 0.7679,
"step": 641
},
{
"epoch": 1.6294416243654823,
"grad_norm": 10.854310437275425,
"learning_rate": 4.548293157042581e-06,
"loss": 0.6177,
"step": 642
},
{
"epoch": 1.631979695431472,
"grad_norm": 5.8672300531571855,
"learning_rate": 4.534644146149128e-06,
"loss": 0.761,
"step": 643
},
{
"epoch": 1.634517766497462,
"grad_norm": 9.975674381554894,
"learning_rate": 4.52099863241501e-06,
"loss": 0.5939,
"step": 644
},
{
"epoch": 1.6370558375634516,
"grad_norm": 7.111751840405073,
"learning_rate": 4.507356718386557e-06,
"loss": 0.6766,
"step": 645
},
{
"epoch": 1.6395939086294415,
"grad_norm": 6.247398349216973,
"learning_rate": 4.493718506583048e-06,
"loss": 0.7125,
"step": 646
},
{
"epoch": 1.6421319796954315,
"grad_norm": 7.549756956695894,
"learning_rate": 4.4800840994959426e-06,
"loss": 0.6904,
"step": 647
},
{
"epoch": 1.6446700507614214,
"grad_norm": 6.500884950383986,
"learning_rate": 4.466453599588103e-06,
"loss": 0.7022,
"step": 648
},
{
"epoch": 1.6472081218274113,
"grad_norm": 5.2335504629722704,
"learning_rate": 4.452827109293033e-06,
"loss": 0.7422,
"step": 649
},
{
"epoch": 1.649746192893401,
"grad_norm": 6.146630683172185,
"learning_rate": 4.439204731014102e-06,
"loss": 0.7558,
"step": 650
},
{
"epoch": 1.6522842639593909,
"grad_norm": 10.375183815595266,
"learning_rate": 4.42558656712378e-06,
"loss": 0.7455,
"step": 651
},
{
"epoch": 1.6548223350253806,
"grad_norm": 5.9665414440217255,
"learning_rate": 4.411972719962862e-06,
"loss": 0.6903,
"step": 652
},
{
"epoch": 1.6573604060913705,
"grad_norm": 11.099866755003685,
"learning_rate": 4.398363291839705e-06,
"loss": 0.6755,
"step": 653
},
{
"epoch": 1.6598984771573604,
"grad_norm": 4.488907541300615,
"learning_rate": 4.384758385029457e-06,
"loss": 0.7513,
"step": 654
},
{
"epoch": 1.6624365482233503,
"grad_norm": 5.519100934281681,
"learning_rate": 4.371158101773287e-06,
"loss": 0.7022,
"step": 655
},
{
"epoch": 1.6649746192893402,
"grad_norm": 3.780623949899037,
"learning_rate": 4.3575625442776185e-06,
"loss": 0.6907,
"step": 656
},
{
"epoch": 1.66751269035533,
"grad_norm": 3.8981771965418073,
"learning_rate": 4.3439718147133625e-06,
"loss": 0.6957,
"step": 657
},
{
"epoch": 1.6700507614213198,
"grad_norm": 2.434198707923521,
"learning_rate": 4.330386015215145e-06,
"loss": 0.9086,
"step": 658
},
{
"epoch": 1.6725888324873095,
"grad_norm": 3.6107308436430294,
"learning_rate": 4.316805247880546e-06,
"loss": 0.7743,
"step": 659
},
{
"epoch": 1.6751269035532994,
"grad_norm": 5.461014114880707,
"learning_rate": 4.3032296147693225e-06,
"loss": 0.6636,
"step": 660
},
{
"epoch": 1.6776649746192893,
"grad_norm": 2.9489874414341166,
"learning_rate": 4.289659217902655e-06,
"loss": 0.7954,
"step": 661
},
{
"epoch": 1.6802030456852792,
"grad_norm": 3.2634794475561617,
"learning_rate": 4.2760941592623686e-06,
"loss": 0.6887,
"step": 662
},
{
"epoch": 1.6827411167512691,
"grad_norm": 1.908474618570056,
"learning_rate": 4.262534540790176e-06,
"loss": 0.7072,
"step": 663
},
{
"epoch": 1.6852791878172588,
"grad_norm": 4.2509717118594645,
"learning_rate": 4.248980464386899e-06,
"loss": 0.6831,
"step": 664
},
{
"epoch": 1.6878172588832487,
"grad_norm": 1.9684828164673867,
"learning_rate": 4.23543203191172e-06,
"loss": 0.7576,
"step": 665
},
{
"epoch": 1.6903553299492384,
"grad_norm": 2.701415470742069,
"learning_rate": 4.2218893451814005e-06,
"loss": 0.687,
"step": 666
},
{
"epoch": 1.6928934010152283,
"grad_norm": 2.4493930106706627,
"learning_rate": 4.208352505969526e-06,
"loss": 0.6285,
"step": 667
},
{
"epoch": 1.6954314720812182,
"grad_norm": 2.7555789348597193,
"learning_rate": 4.194821616005738e-06,
"loss": 0.6422,
"step": 668
},
{
"epoch": 1.6979695431472082,
"grad_norm": 2.4972926405616866,
"learning_rate": 4.181296776974963e-06,
"loss": 0.7912,
"step": 669
},
{
"epoch": 1.700507614213198,
"grad_norm": 4.468489099102334,
"learning_rate": 4.167778090516667e-06,
"loss": 0.7716,
"step": 670
},
{
"epoch": 1.703045685279188,
"grad_norm": 6.010959172263999,
"learning_rate": 4.154265658224067e-06,
"loss": 0.7673,
"step": 671
},
{
"epoch": 1.7055837563451777,
"grad_norm": 2.2355125785055208,
"learning_rate": 4.140759581643386e-06,
"loss": 0.72,
"step": 672
},
{
"epoch": 1.7081218274111674,
"grad_norm": 2.3727185967108757,
"learning_rate": 4.127259962273082e-06,
"loss": 0.7083,
"step": 673
},
{
"epoch": 1.7106598984771573,
"grad_norm": 2.686125518968802,
"learning_rate": 4.113766901563087e-06,
"loss": 0.7431,
"step": 674
},
{
"epoch": 1.7131979695431472,
"grad_norm": 2.7573692052287897,
"learning_rate": 4.100280500914046e-06,
"loss": 0.7454,
"step": 675
},
{
"epoch": 1.715736040609137,
"grad_norm": 2.732285095831061,
"learning_rate": 4.086800861676552e-06,
"loss": 0.721,
"step": 676
},
{
"epoch": 1.718274111675127,
"grad_norm": 3.015824065840091,
"learning_rate": 4.073328085150386e-06,
"loss": 0.6937,
"step": 677
},
{
"epoch": 1.720812182741117,
"grad_norm": 6.951787707897627,
"learning_rate": 4.059862272583755e-06,
"loss": 0.7863,
"step": 678
},
{
"epoch": 1.7233502538071066,
"grad_norm": 4.845757849106375,
"learning_rate": 4.046403525172533e-06,
"loss": 0.8036,
"step": 679
},
{
"epoch": 1.7258883248730963,
"grad_norm": 3.0283091863817155,
"learning_rate": 4.0329519440595e-06,
"loss": 0.7565,
"step": 680
},
{
"epoch": 1.7284263959390862,
"grad_norm": 2.9032977085985063,
"learning_rate": 4.019507630333577e-06,
"loss": 0.7419,
"step": 681
},
{
"epoch": 1.7309644670050761,
"grad_norm": 2.732682655925896,
"learning_rate": 4.006070685029075e-06,
"loss": 0.7498,
"step": 682
},
{
"epoch": 1.733502538071066,
"grad_norm": 3.1823751482501184,
"learning_rate": 3.992641209124929e-06,
"loss": 0.6556,
"step": 683
},
{
"epoch": 1.736040609137056,
"grad_norm": 2.053257796239711,
"learning_rate": 3.979219303543942e-06,
"loss": 0.701,
"step": 684
},
{
"epoch": 1.7385786802030458,
"grad_norm": 3.1992206206683047,
"learning_rate": 3.965805069152025e-06,
"loss": 0.8046,
"step": 685
},
{
"epoch": 1.7411167512690355,
"grad_norm": 2.5494366065199525,
"learning_rate": 3.952398606757441e-06,
"loss": 0.6908,
"step": 686
},
{
"epoch": 1.7436548223350252,
"grad_norm": 2.7057236158660842,
"learning_rate": 3.939000017110046e-06,
"loss": 0.7024,
"step": 687
},
{
"epoch": 1.7461928934010151,
"grad_norm": 4.085040235331109,
"learning_rate": 3.92560940090053e-06,
"loss": 0.8566,
"step": 688
},
{
"epoch": 1.748730964467005,
"grad_norm": 2.133417834739682,
"learning_rate": 3.912226858759666e-06,
"loss": 0.7274,
"step": 689
},
{
"epoch": 1.751269035532995,
"grad_norm": 3.895538290272276,
"learning_rate": 3.898852491257547e-06,
"loss": 0.7114,
"step": 690
},
{
"epoch": 1.7538071065989849,
"grad_norm": 2.6652563108062592,
"learning_rate": 3.885486398902836e-06,
"loss": 0.6814,
"step": 691
},
{
"epoch": 1.7563451776649748,
"grad_norm": 2.6709125412075205,
"learning_rate": 3.872128682142005e-06,
"loss": 0.585,
"step": 692
},
{
"epoch": 1.7588832487309645,
"grad_norm": 7.758065846396269,
"learning_rate": 3.858779441358588e-06,
"loss": 0.6351,
"step": 693
},
{
"epoch": 1.7614213197969542,
"grad_norm": 2.339854294027953,
"learning_rate": 3.845438776872416e-06,
"loss": 0.7193,
"step": 694
},
{
"epoch": 1.763959390862944,
"grad_norm": 3.7441081697726886,
"learning_rate": 3.832106788938874e-06,
"loss": 0.8051,
"step": 695
},
{
"epoch": 1.766497461928934,
"grad_norm": 1.898947089645352,
"learning_rate": 3.818783577748138e-06,
"loss": 0.751,
"step": 696
},
{
"epoch": 1.7690355329949239,
"grad_norm": 4.8494211071824695,
"learning_rate": 3.8054692434244323e-06,
"loss": 0.7666,
"step": 697
},
{
"epoch": 1.7715736040609138,
"grad_norm": 2.7092795808497767,
"learning_rate": 3.7921638860252674e-06,
"loss": 0.8202,
"step": 698
},
{
"epoch": 1.7741116751269037,
"grad_norm": 2.976217075499209,
"learning_rate": 3.7788676055406913e-06,
"loss": 0.6678,
"step": 699
},
{
"epoch": 1.7766497461928934,
"grad_norm": 2.680319225126857,
"learning_rate": 3.76558050189254e-06,
"loss": 0.6886,
"step": 700
},
{
"epoch": 1.779187817258883,
"grad_norm": 2.296642253813403,
"learning_rate": 3.7523026749336868e-06,
"loss": 0.7568,
"step": 701
},
{
"epoch": 1.781725888324873,
"grad_norm": 2.938434483894651,
"learning_rate": 3.7390342244472886e-06,
"loss": 0.7419,
"step": 702
},
{
"epoch": 1.784263959390863,
"grad_norm": 2.4053334656704357,
"learning_rate": 3.7257752501460397e-06,
"loss": 0.6514,
"step": 703
},
{
"epoch": 1.7868020304568528,
"grad_norm": 2.378949854019335,
"learning_rate": 3.7125258516714175e-06,
"loss": 0.6336,
"step": 704
},
{
"epoch": 1.7893401015228427,
"grad_norm": 2.593432990200205,
"learning_rate": 3.6992861285929395e-06,
"loss": 0.9736,
"step": 705
},
{
"epoch": 1.7918781725888326,
"grad_norm": 2.5151437698388355,
"learning_rate": 3.6860561804074123e-06,
"loss": 0.7368,
"step": 706
},
{
"epoch": 1.7944162436548223,
"grad_norm": 2.8036107393674663,
"learning_rate": 3.6728361065381833e-06,
"loss": 0.6944,
"step": 707
},
{
"epoch": 1.796954314720812,
"grad_norm": 2.8715557691268003,
"learning_rate": 3.659626006334395e-06,
"loss": 0.7879,
"step": 708
},
{
"epoch": 1.799492385786802,
"grad_norm": 4.003179403948973,
"learning_rate": 3.646425979070233e-06,
"loss": 0.7558,
"step": 709
},
{
"epoch": 1.8020304568527918,
"grad_norm": 8.130933032348212,
"learning_rate": 3.633236123944195e-06,
"loss": 0.8283,
"step": 710
},
{
"epoch": 1.8045685279187818,
"grad_norm": 2.8583002801807575,
"learning_rate": 3.620056540078323e-06,
"loss": 0.7255,
"step": 711
},
{
"epoch": 1.8071065989847717,
"grad_norm": 5.09693351683838,
"learning_rate": 3.606887326517479e-06,
"loss": 0.8537,
"step": 712
},
{
"epoch": 1.8096446700507616,
"grad_norm": 3.014904465977428,
"learning_rate": 3.593728582228585e-06,
"loss": 0.6737,
"step": 713
},
{
"epoch": 1.8121827411167513,
"grad_norm": 3.6365545634430667,
"learning_rate": 3.5805804060998926e-06,
"loss": 0.679,
"step": 714
},
{
"epoch": 1.8147208121827412,
"grad_norm": 2.018230067478512,
"learning_rate": 3.567442896940231e-06,
"loss": 0.7556,
"step": 715
},
{
"epoch": 1.8172588832487309,
"grad_norm": 2.582862863818379,
"learning_rate": 3.554316153478263e-06,
"loss": 0.6736,
"step": 716
},
{
"epoch": 1.8197969543147208,
"grad_norm": 5.148339265724092,
"learning_rate": 3.5412002743617525e-06,
"loss": 0.7093,
"step": 717
},
{
"epoch": 1.8223350253807107,
"grad_norm": 3.5984283826570045,
"learning_rate": 3.5280953581568155e-06,
"loss": 0.7092,
"step": 718
},
{
"epoch": 1.8248730964467006,
"grad_norm": 3.0704447744931933,
"learning_rate": 3.5150015033471817e-06,
"loss": 0.8085,
"step": 719
},
{
"epoch": 1.8274111675126905,
"grad_norm": 10.00445719642568,
"learning_rate": 3.501918808333453e-06,
"loss": 0.671,
"step": 720
},
{
"epoch": 1.8299492385786802,
"grad_norm": 2.2671818495794374,
"learning_rate": 3.4888473714323675e-06,
"loss": 0.8039,
"step": 721
},
{
"epoch": 1.83248730964467,
"grad_norm": 6.221189957212778,
"learning_rate": 3.4757872908760554e-06,
"loss": 0.7488,
"step": 722
},
{
"epoch": 1.8350253807106598,
"grad_norm": 3.973211223724199,
"learning_rate": 3.4627386648113046e-06,
"loss": 0.7566,
"step": 723
},
{
"epoch": 1.8375634517766497,
"grad_norm": 4.306273661655382,
"learning_rate": 3.449701591298826e-06,
"loss": 0.7067,
"step": 724
},
{
"epoch": 1.8401015228426396,
"grad_norm": 2.576999153874239,
"learning_rate": 3.436676168312508e-06,
"loss": 0.8613,
"step": 725
},
{
"epoch": 1.8426395939086295,
"grad_norm": 2.343907257705208,
"learning_rate": 3.4236624937386874e-06,
"loss": 0.8303,
"step": 726
},
{
"epoch": 1.8451776649746194,
"grad_norm": 2.0952349982740253,
"learning_rate": 3.4106606653754102e-06,
"loss": 0.7578,
"step": 727
},
{
"epoch": 1.8477157360406091,
"grad_norm": 2.166342472964004,
"learning_rate": 3.397670780931699e-06,
"loss": 0.8091,
"step": 728
},
{
"epoch": 1.850253807106599,
"grad_norm": 3.099485050888837,
"learning_rate": 3.384692938026816e-06,
"loss": 0.7944,
"step": 729
},
{
"epoch": 1.8527918781725887,
"grad_norm": 2.3170390474934397,
"learning_rate": 3.3717272341895335e-06,
"loss": 0.754,
"step": 730
},
{
"epoch": 1.8553299492385786,
"grad_norm": 8.508529834209089,
"learning_rate": 3.358773766857397e-06,
"loss": 0.8398,
"step": 731
},
{
"epoch": 1.8578680203045685,
"grad_norm": 2.7296031884976815,
"learning_rate": 3.3458326333759927e-06,
"loss": 0.5618,
"step": 732
},
{
"epoch": 1.8604060913705585,
"grad_norm": 5.436648352881961,
"learning_rate": 3.3329039309982202e-06,
"loss": 0.715,
"step": 733
},
{
"epoch": 1.8629441624365484,
"grad_norm": 6.288919164300693,
"learning_rate": 3.319987756883559e-06,
"loss": 0.7989,
"step": 734
},
{
"epoch": 1.865482233502538,
"grad_norm": 3.4817572272054367,
"learning_rate": 3.307084208097337e-06,
"loss": 0.678,
"step": 735
},
{
"epoch": 1.868020304568528,
"grad_norm": 4.519379736523107,
"learning_rate": 3.2941933816100024e-06,
"loss": 0.6747,
"step": 736
},
{
"epoch": 1.8705583756345177,
"grad_norm": 3.178920019176328,
"learning_rate": 3.281315374296397e-06,
"loss": 0.7039,
"step": 737
},
{
"epoch": 1.8730964467005076,
"grad_norm": 3.2505180835432097,
"learning_rate": 3.268450282935026e-06,
"loss": 0.6003,
"step": 738
},
{
"epoch": 1.8756345177664975,
"grad_norm": 2.9109069621999155,
"learning_rate": 3.2555982042073287e-06,
"loss": 0.8364,
"step": 739
},
{
"epoch": 1.8781725888324874,
"grad_norm": 2.9599340344644,
"learning_rate": 3.2427592346969538e-06,
"loss": 0.7817,
"step": 740
},
{
"epoch": 1.8807106598984773,
"grad_norm": 2.4356343730468284,
"learning_rate": 3.2299334708890384e-06,
"loss": 0.767,
"step": 741
},
{
"epoch": 1.883248730964467,
"grad_norm": 2.5412975036915046,
"learning_rate": 3.217121009169474e-06,
"loss": 0.746,
"step": 742
},
{
"epoch": 1.885786802030457,
"grad_norm": 3.3029683648181214,
"learning_rate": 3.2043219458241896e-06,
"loss": 0.6829,
"step": 743
},
{
"epoch": 1.8883248730964466,
"grad_norm": 3.380145523857882,
"learning_rate": 3.1915363770384223e-06,
"loss": 0.6645,
"step": 744
},
{
"epoch": 1.8908629441624365,
"grad_norm": 3.688747382996952,
"learning_rate": 3.1787643988959993e-06,
"loss": 0.7798,
"step": 745
},
{
"epoch": 1.8934010152284264,
"grad_norm": 2.887848658016839,
"learning_rate": 3.1660061073786132e-06,
"loss": 0.664,
"step": 746
},
{
"epoch": 1.8959390862944163,
"grad_norm": 5.719598275135355,
"learning_rate": 3.1532615983651027e-06,
"loss": 0.6847,
"step": 747
},
{
"epoch": 1.8984771573604062,
"grad_norm": 2.2084171353583755,
"learning_rate": 3.1405309676307283e-06,
"loss": 0.6702,
"step": 748
},
{
"epoch": 1.901015228426396,
"grad_norm": 2.499900700135605,
"learning_rate": 3.127814310846453e-06,
"loss": 0.7367,
"step": 749
},
{
"epoch": 1.9035532994923858,
"grad_norm": 2.097745954293279,
"learning_rate": 3.1151117235782346e-06,
"loss": 0.7415,
"step": 750
},
{
"epoch": 1.9060913705583755,
"grad_norm": 4.193177406705227,
"learning_rate": 3.102423301286288e-06,
"loss": 0.7195,
"step": 751
},
{
"epoch": 1.9086294416243654,
"grad_norm": 4.1333003833317665,
"learning_rate": 3.089749139324383e-06,
"loss": 0.7775,
"step": 752
},
{
"epoch": 1.9111675126903553,
"grad_norm": 2.732619968521544,
"learning_rate": 3.0770893329391207e-06,
"loss": 0.7035,
"step": 753
},
{
"epoch": 1.9137055837563453,
"grad_norm": 2.3159844015784654,
"learning_rate": 3.06444397726922e-06,
"loss": 0.7481,
"step": 754
},
{
"epoch": 1.9162436548223352,
"grad_norm": 2.3696071469298134,
"learning_rate": 3.051813167344807e-06,
"loss": 0.6548,
"step": 755
},
{
"epoch": 1.9187817258883249,
"grad_norm": 3.067435702712297,
"learning_rate": 3.0391969980866874e-06,
"loss": 0.6184,
"step": 756
},
{
"epoch": 1.9213197969543148,
"grad_norm": 2.327610059779149,
"learning_rate": 3.026595564305649e-06,
"loss": 0.577,
"step": 757
},
{
"epoch": 1.9238578680203045,
"grad_norm": 2.0617755977820207,
"learning_rate": 3.0140089607017386e-06,
"loss": 0.7574,
"step": 758
},
{
"epoch": 1.9263959390862944,
"grad_norm": 8.037980133671185,
"learning_rate": 3.001437281863558e-06,
"loss": 0.6502,
"step": 759
},
{
"epoch": 1.9289340101522843,
"grad_norm": 2.753352962706752,
"learning_rate": 2.988880622267544e-06,
"loss": 0.8281,
"step": 760
},
{
"epoch": 1.9314720812182742,
"grad_norm": 2.4771490810702947,
"learning_rate": 2.976339076277265e-06,
"loss": 0.7763,
"step": 761
},
{
"epoch": 1.934010152284264,
"grad_norm": 3.702930187531345,
"learning_rate": 2.963812738142713e-06,
"loss": 0.7555,
"step": 762
},
{
"epoch": 1.9365482233502538,
"grad_norm": 3.811516534289345,
"learning_rate": 2.9513017019995892e-06,
"loss": 0.7605,
"step": 763
},
{
"epoch": 1.9390862944162437,
"grad_norm": 2.870096463585816,
"learning_rate": 2.938806061868603e-06,
"loss": 0.7558,
"step": 764
},
{
"epoch": 1.9416243654822334,
"grad_norm": 3.341968457840719,
"learning_rate": 2.9263259116547606e-06,
"loss": 0.7995,
"step": 765
},
{
"epoch": 1.9441624365482233,
"grad_norm": 2.3250437330227034,
"learning_rate": 2.9138613451466625e-06,
"loss": 0.7095,
"step": 766
},
{
"epoch": 1.9467005076142132,
"grad_norm": 4.089011850638278,
"learning_rate": 2.901412456015794e-06,
"loss": 0.7799,
"step": 767
},
{
"epoch": 1.9492385786802031,
"grad_norm": 5.39145678330308,
"learning_rate": 2.8889793378158284e-06,
"loss": 0.7432,
"step": 768
},
{
"epoch": 1.951776649746193,
"grad_norm": 2.5757496001839058,
"learning_rate": 2.8765620839819173e-06,
"loss": 0.7705,
"step": 769
},
{
"epoch": 1.9543147208121827,
"grad_norm": 2.1532580815403324,
"learning_rate": 2.864160787829994e-06,
"loss": 0.6409,
"step": 770
},
{
"epoch": 1.9568527918781726,
"grad_norm": 2.40066418251008,
"learning_rate": 2.8517755425560665e-06,
"loss": 0.6311,
"step": 771
},
{
"epoch": 1.9593908629441623,
"grad_norm": 2.3586685801917646,
"learning_rate": 2.8394064412355237e-06,
"loss": 0.7186,
"step": 772
},
{
"epoch": 1.9619289340101522,
"grad_norm": 4.490275405727021,
"learning_rate": 2.8270535768224306e-06,
"loss": 0.6813,
"step": 773
},
{
"epoch": 1.9644670050761421,
"grad_norm": 5.464180958902756,
"learning_rate": 2.814717042148827e-06,
"loss": 0.8316,
"step": 774
},
{
"epoch": 1.967005076142132,
"grad_norm": 3.0045365799520285,
"learning_rate": 2.802396929924042e-06,
"loss": 0.6874,
"step": 775
},
{
"epoch": 1.969543147208122,
"grad_norm": 2.099521271054753,
"learning_rate": 2.790093332733983e-06,
"loss": 0.6727,
"step": 776
},
{
"epoch": 1.9720812182741116,
"grad_norm": 2.7219964531890986,
"learning_rate": 2.7778063430404544e-06,
"loss": 0.6503,
"step": 777
},
{
"epoch": 1.9746192893401016,
"grad_norm": 5.930105066525641,
"learning_rate": 2.765536053180447e-06,
"loss": 0.6837,
"step": 778
},
{
"epoch": 1.9771573604060912,
"grad_norm": 2.353332363904023,
"learning_rate": 2.7532825553654565e-06,
"loss": 0.6135,
"step": 779
},
{
"epoch": 1.9796954314720812,
"grad_norm": 1.8537629771923843,
"learning_rate": 2.7410459416807856e-06,
"loss": 0.7775,
"step": 780
},
{
"epoch": 1.982233502538071,
"grad_norm": 2.0438002285632533,
"learning_rate": 2.7288263040848483e-06,
"loss": 0.6534,
"step": 781
},
{
"epoch": 1.984771573604061,
"grad_norm": 2.0013275878832713,
"learning_rate": 2.7166237344084883e-06,
"loss": 0.9252,
"step": 782
},
{
"epoch": 1.987309644670051,
"grad_norm": 2.262360303216243,
"learning_rate": 2.7044383243542804e-06,
"loss": 0.7165,
"step": 783
},
{
"epoch": 1.9898477157360406,
"grad_norm": 2.8760412225109246,
"learning_rate": 2.692270165495846e-06,
"loss": 0.7468,
"step": 784
},
{
"epoch": 1.9923857868020305,
"grad_norm": 2.9100081502109614,
"learning_rate": 2.6801193492771633e-06,
"loss": 0.7086,
"step": 785
},
{
"epoch": 1.9949238578680202,
"grad_norm": 4.9781125187661255,
"learning_rate": 2.6679859670118785e-06,
"loss": 0.7527,
"step": 786
},
{
"epoch": 1.99746192893401,
"grad_norm": 2.927258479320422,
"learning_rate": 2.6558701098826233e-06,
"loss": 0.9075,
"step": 787
},
{
"epoch": 2.0,
"grad_norm": 2.313238030479698,
"learning_rate": 2.643771868940327e-06,
"loss": 0.5394,
"step": 788
},
{
"epoch": 2.00253807106599,
"grad_norm": 3.116665312104337,
"learning_rate": 2.6316913351035313e-06,
"loss": 0.5184,
"step": 789
},
{
"epoch": 2.00507614213198,
"grad_norm": 2.5385445062740546,
"learning_rate": 2.6196285991577107e-06,
"loss": 0.495,
"step": 790
},
{
"epoch": 2.0076142131979697,
"grad_norm": 2.9811678625183924,
"learning_rate": 2.607583751754589e-06,
"loss": 0.4928,
"step": 791
},
{
"epoch": 2.010152284263959,
"grad_norm": 2.337923779288377,
"learning_rate": 2.5955568834114523e-06,
"loss": 0.6939,
"step": 792
},
{
"epoch": 2.012690355329949,
"grad_norm": 6.989523174369998,
"learning_rate": 2.58354808451048e-06,
"loss": 0.7023,
"step": 793
},
{
"epoch": 2.015228426395939,
"grad_norm": 2.9832031040895943,
"learning_rate": 2.571557445298055e-06,
"loss": 0.51,
"step": 794
},
{
"epoch": 2.017766497461929,
"grad_norm": 3.6733364494132603,
"learning_rate": 2.5595850558840908e-06,
"loss": 0.4982,
"step": 795
},
{
"epoch": 2.020304568527919,
"grad_norm": 4.175046814149058,
"learning_rate": 2.5476310062413544e-06,
"loss": 0.5748,
"step": 796
},
{
"epoch": 2.0228426395939088,
"grad_norm": 2.7364939660482466,
"learning_rate": 2.5356953862047894e-06,
"loss": 0.5673,
"step": 797
},
{
"epoch": 2.0253807106598987,
"grad_norm": 2.4199170445654925,
"learning_rate": 2.523778285470835e-06,
"loss": 0.5405,
"step": 798
},
{
"epoch": 2.027918781725888,
"grad_norm": 2.7021906853383055,
"learning_rate": 2.5118797935967643e-06,
"loss": 0.4834,
"step": 799
},
{
"epoch": 2.030456852791878,
"grad_norm": 3.547205567316223,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.5416,
"step": 800
},
{
"epoch": 2.032994923857868,
"grad_norm": 3.0646162357404747,
"learning_rate": 2.4881389939574524e-06,
"loss": 0.6144,
"step": 801
},
{
"epoch": 2.035532994923858,
"grad_norm": 2.8896201736006466,
"learning_rate": 2.4762968646048357e-06,
"loss": 0.5462,
"step": 802
},
{
"epoch": 2.0380710659898478,
"grad_norm": 4.352165532233693,
"learning_rate": 2.464473700936008e-06,
"loss": 0.6707,
"step": 803
},
{
"epoch": 2.0406091370558377,
"grad_norm": 3.074843740914918,
"learning_rate": 2.452669591802307e-06,
"loss": 0.5552,
"step": 804
},
{
"epoch": 2.0431472081218276,
"grad_norm": 3.2593449644153405,
"learning_rate": 2.4408846259118613e-06,
"loss": 0.578,
"step": 805
},
{
"epoch": 2.045685279187817,
"grad_norm": 2.775527110324298,
"learning_rate": 2.429118891828949e-06,
"loss": 0.6023,
"step": 806
},
{
"epoch": 2.048223350253807,
"grad_norm": 3.3719031747934833,
"learning_rate": 2.4173724779733153e-06,
"loss": 0.5919,
"step": 807
},
{
"epoch": 2.050761421319797,
"grad_norm": 5.862091426284267,
"learning_rate": 2.4056454726195166e-06,
"loss": 0.618,
"step": 808
},
{
"epoch": 2.053299492385787,
"grad_norm": 2.8984948188584245,
"learning_rate": 2.3939379638962513e-06,
"loss": 0.4007,
"step": 809
},
{
"epoch": 2.0558375634517767,
"grad_norm": 2.393540340028062,
"learning_rate": 2.3822500397857016e-06,
"loss": 0.5127,
"step": 810
},
{
"epoch": 2.0583756345177666,
"grad_norm": 4.320212096479848,
"learning_rate": 2.370581788122871e-06,
"loss": 0.5212,
"step": 811
},
{
"epoch": 2.0609137055837565,
"grad_norm": 3.5308347197793486,
"learning_rate": 2.3589332965949234e-06,
"loss": 0.5261,
"step": 812
},
{
"epoch": 2.063451776649746,
"grad_norm": 3.1936209119502728,
"learning_rate": 2.3473046527405273e-06,
"loss": 0.5722,
"step": 813
},
{
"epoch": 2.065989847715736,
"grad_norm": 2.4389080171771442,
"learning_rate": 2.33569594394919e-06,
"loss": 0.6258,
"step": 814
},
{
"epoch": 2.068527918781726,
"grad_norm": 4.494023471618723,
"learning_rate": 2.3241072574606105e-06,
"loss": 0.427,
"step": 815
},
{
"epoch": 2.0710659898477157,
"grad_norm": 11.249234559465428,
"learning_rate": 2.3125386803640188e-06,
"loss": 0.52,
"step": 816
},
{
"epoch": 2.0736040609137056,
"grad_norm": 2.3608749757852965,
"learning_rate": 2.300990299597525e-06,
"loss": 0.5496,
"step": 817
},
{
"epoch": 2.0761421319796955,
"grad_norm": 3.0310274663393924,
"learning_rate": 2.28946220194746e-06,
"loss": 0.5861,
"step": 818
},
{
"epoch": 2.0786802030456855,
"grad_norm": 13.351538653185711,
"learning_rate": 2.2779544740477276e-06,
"loss": 0.7389,
"step": 819
},
{
"epoch": 2.081218274111675,
"grad_norm": 3.9122053899709437,
"learning_rate": 2.266467202379154e-06,
"loss": 0.4761,
"step": 820
},
{
"epoch": 2.083756345177665,
"grad_norm": 2.208921812478111,
"learning_rate": 2.2550004732688312e-06,
"loss": 0.4685,
"step": 821
},
{
"epoch": 2.0862944162436547,
"grad_norm": 2.591217022415893,
"learning_rate": 2.243554372889479e-06,
"loss": 0.6088,
"step": 822
},
{
"epoch": 2.0888324873096447,
"grad_norm": 2.8280555926678885,
"learning_rate": 2.232128987258791e-06,
"loss": 0.5576,
"step": 823
},
{
"epoch": 2.0913705583756346,
"grad_norm": 6.603890432169106,
"learning_rate": 2.220724402238786e-06,
"loss": 0.5969,
"step": 824
},
{
"epoch": 2.0939086294416245,
"grad_norm": 3.3341288390032746,
"learning_rate": 2.2093407035351695e-06,
"loss": 0.4989,
"step": 825
},
{
"epoch": 2.0964467005076144,
"grad_norm": 3.124499128405431,
"learning_rate": 2.197977976696683e-06,
"loss": 0.5676,
"step": 826
},
{
"epoch": 2.098984771573604,
"grad_norm": 3.4057044744978944,
"learning_rate": 2.186636307114466e-06,
"loss": 0.6385,
"step": 827
},
{
"epoch": 2.1015228426395938,
"grad_norm": 2.5789067088419677,
"learning_rate": 2.1753157800214107e-06,
"loss": 0.5224,
"step": 828
},
{
"epoch": 2.1040609137055837,
"grad_norm": 4.824453982320905,
"learning_rate": 2.1640164804915235e-06,
"loss": 0.5814,
"step": 829
},
{
"epoch": 2.1065989847715736,
"grad_norm": 2.3147558116459397,
"learning_rate": 2.1527384934392865e-06,
"loss": 0.4466,
"step": 830
},
{
"epoch": 2.1091370558375635,
"grad_norm": 3.3719742417701193,
"learning_rate": 2.141481903619016e-06,
"loss": 0.5856,
"step": 831
},
{
"epoch": 2.1116751269035534,
"grad_norm": 6.540941962641488,
"learning_rate": 2.130246795624229e-06,
"loss": 0.6698,
"step": 832
},
{
"epoch": 2.1142131979695433,
"grad_norm": 4.684636553430604,
"learning_rate": 2.1190332538870055e-06,
"loss": 0.5059,
"step": 833
},
{
"epoch": 2.116751269035533,
"grad_norm": 2.5793765377182094,
"learning_rate": 2.1078413626773547e-06,
"loss": 0.4879,
"step": 834
},
{
"epoch": 2.1192893401015227,
"grad_norm": 3.9524644895450747,
"learning_rate": 2.096671206102582e-06,
"loss": 0.6381,
"step": 835
},
{
"epoch": 2.1218274111675126,
"grad_norm": 2.9411715631403506,
"learning_rate": 2.0855228681066585e-06,
"loss": 0.6114,
"step": 836
},
{
"epoch": 2.1243654822335025,
"grad_norm": 2.1866657400102953,
"learning_rate": 2.074396432469582e-06,
"loss": 0.5324,
"step": 837
},
{
"epoch": 2.1269035532994924,
"grad_norm": 3.2222804642260585,
"learning_rate": 2.063291982806759e-06,
"loss": 0.5614,
"step": 838
},
{
"epoch": 2.1294416243654823,
"grad_norm": 3.2229524875080227,
"learning_rate": 2.0522096025683696e-06,
"loss": 0.5169,
"step": 839
},
{
"epoch": 2.1319796954314723,
"grad_norm": 2.429958118668916,
"learning_rate": 2.0411493750387423e-06,
"loss": 0.5079,
"step": 840
},
{
"epoch": 2.1345177664974617,
"grad_norm": 2.8612184869702912,
"learning_rate": 2.0301113833357267e-06,
"loss": 0.5712,
"step": 841
},
{
"epoch": 2.1370558375634516,
"grad_norm": 5.2526677276961555,
"learning_rate": 2.0190957104100696e-06,
"loss": 0.7002,
"step": 842
},
{
"epoch": 2.1395939086294415,
"grad_norm": 2.6846288863362995,
"learning_rate": 2.0081024390447894e-06,
"loss": 0.6155,
"step": 843
},
{
"epoch": 2.1421319796954315,
"grad_norm": 4.160029352630203,
"learning_rate": 1.9971316518545652e-06,
"loss": 0.6612,
"step": 844
},
{
"epoch": 2.1446700507614214,
"grad_norm": 2.457195652872765,
"learning_rate": 1.9861834312850955e-06,
"loss": 0.5828,
"step": 845
},
{
"epoch": 2.1472081218274113,
"grad_norm": 4.014662686653015,
"learning_rate": 1.9752578596124955e-06,
"loss": 0.6141,
"step": 846
},
{
"epoch": 2.149746192893401,
"grad_norm": 4.450005915541498,
"learning_rate": 1.964355018942675e-06,
"loss": 0.5587,
"step": 847
},
{
"epoch": 2.152284263959391,
"grad_norm": 2.8253377333126726,
"learning_rate": 1.953474991210717e-06,
"loss": 0.6734,
"step": 848
},
{
"epoch": 2.1548223350253806,
"grad_norm": 2.935991080321399,
"learning_rate": 1.942617858180267e-06,
"loss": 0.5352,
"step": 849
},
{
"epoch": 2.1573604060913705,
"grad_norm": 3.358221489855652,
"learning_rate": 1.9317837014429135e-06,
"loss": 0.5821,
"step": 850
},
{
"epoch": 2.1598984771573604,
"grad_norm": 2.9236610806200924,
"learning_rate": 1.9209726024175807e-06,
"loss": 0.5073,
"step": 851
},
{
"epoch": 2.1624365482233503,
"grad_norm": 2.4928791939237724,
"learning_rate": 1.9101846423499113e-06,
"loss": 0.5512,
"step": 852
},
{
"epoch": 2.16497461928934,
"grad_norm": 4.34545853472978,
"learning_rate": 1.8994199023116617e-06,
"loss": 0.5374,
"step": 853
},
{
"epoch": 2.16751269035533,
"grad_norm": 2.269126405099248,
"learning_rate": 1.8886784632000827e-06,
"loss": 0.5079,
"step": 854
},
{
"epoch": 2.1700507614213196,
"grad_norm": 3.8382124318789774,
"learning_rate": 1.8779604057373234e-06,
"loss": 0.4831,
"step": 855
},
{
"epoch": 2.1725888324873095,
"grad_norm": 3.0041853611664973,
"learning_rate": 1.8672658104698178e-06,
"loss": 0.5322,
"step": 856
},
{
"epoch": 2.1751269035532994,
"grad_norm": 2.418167929638173,
"learning_rate": 1.856594757767684e-06,
"loss": 0.5232,
"step": 857
},
{
"epoch": 2.1776649746192893,
"grad_norm": 2.730297885338874,
"learning_rate": 1.8459473278241125e-06,
"loss": 0.597,
"step": 858
},
{
"epoch": 2.1802030456852792,
"grad_norm": 5.087457491162711,
"learning_rate": 1.8353236006547697e-06,
"loss": 0.636,
"step": 859
},
{
"epoch": 2.182741116751269,
"grad_norm": 3.51557324596564,
"learning_rate": 1.8247236560971986e-06,
"loss": 0.607,
"step": 860
},
{
"epoch": 2.185279187817259,
"grad_norm": 6.790823174251955,
"learning_rate": 1.8141475738102088e-06,
"loss": 0.5617,
"step": 861
},
{
"epoch": 2.187817258883249,
"grad_norm": 3.273590099732762,
"learning_rate": 1.803595433273289e-06,
"loss": 0.5327,
"step": 862
},
{
"epoch": 2.1903553299492384,
"grad_norm": 2.858230950098354,
"learning_rate": 1.7930673137860044e-06,
"loss": 0.5177,
"step": 863
},
{
"epoch": 2.1928934010152283,
"grad_norm": 4.2865689479573135,
"learning_rate": 1.7825632944674016e-06,
"loss": 0.5136,
"step": 864
},
{
"epoch": 2.1954314720812182,
"grad_norm": 3.2188012772359826,
"learning_rate": 1.7720834542554133e-06,
"loss": 0.5637,
"step": 865
},
{
"epoch": 2.197969543147208,
"grad_norm": 8.719196870847238,
"learning_rate": 1.7616278719062664e-06,
"loss": 0.6844,
"step": 866
},
{
"epoch": 2.200507614213198,
"grad_norm": 4.04579650838903,
"learning_rate": 1.751196625993888e-06,
"loss": 0.5141,
"step": 867
},
{
"epoch": 2.203045685279188,
"grad_norm": 11.660414848243596,
"learning_rate": 1.7407897949093184e-06,
"loss": 0.5526,
"step": 868
},
{
"epoch": 2.2055837563451774,
"grad_norm": 2.9105610061958296,
"learning_rate": 1.730407456860118e-06,
"loss": 0.4903,
"step": 869
},
{
"epoch": 2.2081218274111674,
"grad_norm": 2.8496411326189874,
"learning_rate": 1.7200496898697832e-06,
"loss": 0.489,
"step": 870
},
{
"epoch": 2.2106598984771573,
"grad_norm": 6.120183294524192,
"learning_rate": 1.7097165717771574e-06,
"loss": 0.584,
"step": 871
},
{
"epoch": 2.213197969543147,
"grad_norm": 5.938159912914938,
"learning_rate": 1.6994081802358464e-06,
"loss": 0.5253,
"step": 872
},
{
"epoch": 2.215736040609137,
"grad_norm": 14.924835199243644,
"learning_rate": 1.6891245927136368e-06,
"loss": 0.5136,
"step": 873
},
{
"epoch": 2.218274111675127,
"grad_norm": 2.9503220087744557,
"learning_rate": 1.6788658864919121e-06,
"loss": 0.5102,
"step": 874
},
{
"epoch": 2.220812182741117,
"grad_norm": 6.73128614516956,
"learning_rate": 1.6686321386650711e-06,
"loss": 0.4889,
"step": 875
},
{
"epoch": 2.223350253807107,
"grad_norm": 3.1684587237311983,
"learning_rate": 1.6584234261399535e-06,
"loss": 0.5409,
"step": 876
},
{
"epoch": 2.2258883248730963,
"grad_norm": 2.4359330853449133,
"learning_rate": 1.648239825635251e-06,
"loss": 0.4786,
"step": 877
},
{
"epoch": 2.228426395939086,
"grad_norm": 6.6099362518324805,
"learning_rate": 1.6380814136809442e-06,
"loss": 0.5253,
"step": 878
},
{
"epoch": 2.230964467005076,
"grad_norm": 2.6729424419883707,
"learning_rate": 1.6279482666177194e-06,
"loss": 0.6108,
"step": 879
},
{
"epoch": 2.233502538071066,
"grad_norm": 3.0490641589625476,
"learning_rate": 1.6178404605963965e-06,
"loss": 0.5215,
"step": 880
},
{
"epoch": 2.236040609137056,
"grad_norm": 2.8438807078533794,
"learning_rate": 1.6077580715773566e-06,
"loss": 0.5095,
"step": 881
},
{
"epoch": 2.238578680203046,
"grad_norm": 2.922067738649112,
"learning_rate": 1.5977011753299726e-06,
"loss": 0.5084,
"step": 882
},
{
"epoch": 2.2411167512690353,
"grad_norm": 7.290805478826787,
"learning_rate": 1.5876698474320368e-06,
"loss": 0.531,
"step": 883
},
{
"epoch": 2.2436548223350252,
"grad_norm": 2.6342457019012144,
"learning_rate": 1.5776641632691969e-06,
"loss": 0.5642,
"step": 884
},
{
"epoch": 2.246192893401015,
"grad_norm": 2.2984543518164045,
"learning_rate": 1.5676841980343854e-06,
"loss": 0.5731,
"step": 885
},
{
"epoch": 2.248730964467005,
"grad_norm": 4.565973268522213,
"learning_rate": 1.5577300267272583e-06,
"loss": 0.6255,
"step": 886
},
{
"epoch": 2.251269035532995,
"grad_norm": 2.3144475105877476,
"learning_rate": 1.5478017241536286e-06,
"loss": 0.5928,
"step": 887
},
{
"epoch": 2.253807106598985,
"grad_norm": 8.823232328391466,
"learning_rate": 1.5378993649249053e-06,
"loss": 0.5253,
"step": 888
},
{
"epoch": 2.2563451776649748,
"grad_norm": 5.6654920156900115,
"learning_rate": 1.5280230234575323e-06,
"loss": 0.4534,
"step": 889
},
{
"epoch": 2.2588832487309647,
"grad_norm": 3.331386361906362,
"learning_rate": 1.518172773972431e-06,
"loss": 0.5013,
"step": 890
},
{
"epoch": 2.261421319796954,
"grad_norm": 3.7280229939433505,
"learning_rate": 1.5083486904944388e-06,
"loss": 0.5985,
"step": 891
},
{
"epoch": 2.263959390862944,
"grad_norm": 4.310349475560709,
"learning_rate": 1.498550846851759e-06,
"loss": 0.7753,
"step": 892
},
{
"epoch": 2.266497461928934,
"grad_norm": 2.6757480326038623,
"learning_rate": 1.4887793166753944e-06,
"loss": 0.6183,
"step": 893
},
{
"epoch": 2.269035532994924,
"grad_norm": 3.0512411111957016,
"learning_rate": 1.4790341733986085e-06,
"loss": 0.5618,
"step": 894
},
{
"epoch": 2.271573604060914,
"grad_norm": 2.524907186256771,
"learning_rate": 1.4693154902563644e-06,
"loss": 0.6436,
"step": 895
},
{
"epoch": 2.2741116751269037,
"grad_norm": 3.2128246126669207,
"learning_rate": 1.4596233402847747e-06,
"loss": 0.4731,
"step": 896
},
{
"epoch": 2.276649746192893,
"grad_norm": 2.608459261288969,
"learning_rate": 1.4499577963205601e-06,
"loss": 0.5055,
"step": 897
},
{
"epoch": 2.279187817258883,
"grad_norm": 6.430159511139313,
"learning_rate": 1.4403189310004917e-06,
"loss": 0.4387,
"step": 898
},
{
"epoch": 2.281725888324873,
"grad_norm": 2.1689415980620272,
"learning_rate": 1.4307068167608506e-06,
"loss": 0.4445,
"step": 899
},
{
"epoch": 2.284263959390863,
"grad_norm": 2.9547978217502817,
"learning_rate": 1.4211215258368866e-06,
"loss": 0.5166,
"step": 900
},
{
"epoch": 2.286802030456853,
"grad_norm": 2.7536924318067464,
"learning_rate": 1.4115631302622645e-06,
"loss": 0.4962,
"step": 901
},
{
"epoch": 2.2893401015228427,
"grad_norm": 2.4788684838036312,
"learning_rate": 1.4020317018685364e-06,
"loss": 0.5943,
"step": 902
},
{
"epoch": 2.2918781725888326,
"grad_norm": 2.4029703503544524,
"learning_rate": 1.3925273122845933e-06,
"loss": 0.6502,
"step": 903
},
{
"epoch": 2.2944162436548226,
"grad_norm": 2.5929113598327262,
"learning_rate": 1.3830500329361295e-06,
"loss": 0.4675,
"step": 904
},
{
"epoch": 2.296954314720812,
"grad_norm": 5.059656988070416,
"learning_rate": 1.3735999350451047e-06,
"loss": 0.5883,
"step": 905
},
{
"epoch": 2.299492385786802,
"grad_norm": 2.146040859938093,
"learning_rate": 1.3641770896292083e-06,
"loss": 0.6194,
"step": 906
},
{
"epoch": 2.302030456852792,
"grad_norm": 2.9986918388588215,
"learning_rate": 1.3547815675013292e-06,
"loss": 0.6045,
"step": 907
},
{
"epoch": 2.3045685279187818,
"grad_norm": 9.578347963285362,
"learning_rate": 1.34541343926902e-06,
"loss": 0.5556,
"step": 908
},
{
"epoch": 2.3071065989847717,
"grad_norm": 2.3226554091713787,
"learning_rate": 1.3360727753339665e-06,
"loss": 0.5353,
"step": 909
},
{
"epoch": 2.3096446700507616,
"grad_norm": 3.0191445124573124,
"learning_rate": 1.3267596458914617e-06,
"loss": 0.478,
"step": 910
},
{
"epoch": 2.312182741116751,
"grad_norm": 2.3138830524059117,
"learning_rate": 1.3174741209298747e-06,
"loss": 0.4937,
"step": 911
},
{
"epoch": 2.314720812182741,
"grad_norm": 2.6600407713277585,
"learning_rate": 1.3082162702301276e-06,
"loss": 0.6677,
"step": 912
},
{
"epoch": 2.317258883248731,
"grad_norm": 2.2466655935165254,
"learning_rate": 1.2989861633651685e-06,
"loss": 0.5972,
"step": 913
},
{
"epoch": 2.3197969543147208,
"grad_norm": 2.8945889792174593,
"learning_rate": 1.2897838696994507e-06,
"loss": 0.6878,
"step": 914
},
{
"epoch": 2.3223350253807107,
"grad_norm": 5.824487867348564,
"learning_rate": 1.2806094583884115e-06,
"loss": 0.5068,
"step": 915
},
{
"epoch": 2.3248730964467006,
"grad_norm": 3.253456602349523,
"learning_rate": 1.2714629983779514e-06,
"loss": 0.4579,
"step": 916
},
{
"epoch": 2.3274111675126905,
"grad_norm": 2.0950832497861978,
"learning_rate": 1.262344558403913e-06,
"loss": 0.5605,
"step": 917
},
{
"epoch": 2.3299492385786804,
"grad_norm": 4.266155555762764,
"learning_rate": 1.2532542069915722e-06,
"loss": 0.5819,
"step": 918
},
{
"epoch": 2.33248730964467,
"grad_norm": 3.9318894536968374,
"learning_rate": 1.2441920124551166e-06,
"loss": 0.5636,
"step": 919
},
{
"epoch": 2.33502538071066,
"grad_norm": 3.4644680747535324,
"learning_rate": 1.2351580428971348e-06,
"loss": 0.453,
"step": 920
},
{
"epoch": 2.3375634517766497,
"grad_norm": 7.165287296686917,
"learning_rate": 1.2261523662081044e-06,
"loss": 0.4424,
"step": 921
},
{
"epoch": 2.3401015228426396,
"grad_norm": 3.2849863508234773,
"learning_rate": 1.2171750500658802e-06,
"loss": 0.47,
"step": 922
},
{
"epoch": 2.3426395939086295,
"grad_norm": 3.6210630428685007,
"learning_rate": 1.2082261619351888e-06,
"loss": 0.5103,
"step": 923
},
{
"epoch": 2.3451776649746194,
"grad_norm": 3.1975974186492033,
"learning_rate": 1.1993057690671174e-06,
"loss": 0.5004,
"step": 924
},
{
"epoch": 2.347715736040609,
"grad_norm": 3.7031056763500936,
"learning_rate": 1.1904139384986123e-06,
"loss": 0.6611,
"step": 925
},
{
"epoch": 2.350253807106599,
"grad_norm": 2.4956480369490106,
"learning_rate": 1.181550737051973e-06,
"loss": 0.4588,
"step": 926
},
{
"epoch": 2.3527918781725887,
"grad_norm": 4.633911698168668,
"learning_rate": 1.172716231334351e-06,
"loss": 0.5715,
"step": 927
},
{
"epoch": 2.3553299492385786,
"grad_norm": 3.540866963095373,
"learning_rate": 1.1639104877372475e-06,
"loss": 0.5164,
"step": 928
},
{
"epoch": 2.3578680203045685,
"grad_norm": 2.915116721864807,
"learning_rate": 1.1551335724360174e-06,
"loss": 0.6323,
"step": 929
},
{
"epoch": 2.3604060913705585,
"grad_norm": 2.8606673835042424,
"learning_rate": 1.1463855513893695e-06,
"loss": 0.6057,
"step": 930
},
{
"epoch": 2.3629441624365484,
"grad_norm": 2.337064691161368,
"learning_rate": 1.1376664903388711e-06,
"loss": 0.5347,
"step": 931
},
{
"epoch": 2.3654822335025383,
"grad_norm": 8.581970877336293,
"learning_rate": 1.128976454808457e-06,
"loss": 0.6829,
"step": 932
},
{
"epoch": 2.3680203045685277,
"grad_norm": 3.2613702450095396,
"learning_rate": 1.1203155101039293e-06,
"loss": 0.4777,
"step": 933
},
{
"epoch": 2.3705583756345177,
"grad_norm": 6.487751635876781,
"learning_rate": 1.111683721312477e-06,
"loss": 0.5744,
"step": 934
},
{
"epoch": 2.3730964467005076,
"grad_norm": 4.368595166619369,
"learning_rate": 1.10308115330218e-06,
"loss": 0.5109,
"step": 935
},
{
"epoch": 2.3756345177664975,
"grad_norm": 3.4530686993130306,
"learning_rate": 1.0945078707215224e-06,
"loss": 0.4937,
"step": 936
},
{
"epoch": 2.3781725888324874,
"grad_norm": 2.9525714062800215,
"learning_rate": 1.0859639379989113e-06,
"loss": 0.5571,
"step": 937
},
{
"epoch": 2.3807106598984773,
"grad_norm": 3.128139822413409,
"learning_rate": 1.0774494193421842e-06,
"loss": 0.5093,
"step": 938
},
{
"epoch": 2.3832487309644668,
"grad_norm": 2.6341548262769483,
"learning_rate": 1.0689643787381359e-06,
"loss": 0.4875,
"step": 939
},
{
"epoch": 2.3857868020304567,
"grad_norm": 2.866892196074003,
"learning_rate": 1.060508879952028e-06,
"loss": 0.5049,
"step": 940
},
{
"epoch": 2.3883248730964466,
"grad_norm": 2.4087256253802756,
"learning_rate": 1.0520829865271177e-06,
"loss": 0.5916,
"step": 941
},
{
"epoch": 2.3908629441624365,
"grad_norm": 4.157497736307402,
"learning_rate": 1.0436867617841768e-06,
"loss": 0.4625,
"step": 942
},
{
"epoch": 2.3934010152284264,
"grad_norm": 3.1654196498309446,
"learning_rate": 1.0353202688210169e-06,
"loss": 0.6251,
"step": 943
},
{
"epoch": 2.3959390862944163,
"grad_norm": 5.217619527759743,
"learning_rate": 1.0269835705120134e-06,
"loss": 0.5803,
"step": 944
},
{
"epoch": 2.3984771573604062,
"grad_norm": 2.691020661991825,
"learning_rate": 1.018676729507636e-06,
"loss": 0.6559,
"step": 945
},
{
"epoch": 2.401015228426396,
"grad_norm": 2.311428676461019,
"learning_rate": 1.0103998082339738e-06,
"loss": 0.4166,
"step": 946
},
{
"epoch": 2.4035532994923856,
"grad_norm": 2.6077485330856405,
"learning_rate": 1.0021528688922705e-06,
"loss": 0.5724,
"step": 947
},
{
"epoch": 2.4060913705583755,
"grad_norm": 4.218759486510066,
"learning_rate": 9.939359734584552e-07,
"loss": 0.5034,
"step": 948
},
{
"epoch": 2.4086294416243654,
"grad_norm": 3.824090997578029,
"learning_rate": 9.857491836826704e-07,
"loss": 0.7422,
"step": 949
},
{
"epoch": 2.4111675126903553,
"grad_norm": 3.0368671180326103,
"learning_rate": 9.775925610888243e-07,
"loss": 0.5311,
"step": 950
},
{
"epoch": 2.4137055837563453,
"grad_norm": 2.205826825038038,
"learning_rate": 9.694661669741102e-07,
"loss": 0.6092,
"step": 951
},
{
"epoch": 2.416243654822335,
"grad_norm": 2.9931354115283257,
"learning_rate": 9.613700624085564e-07,
"loss": 0.5511,
"step": 952
},
{
"epoch": 2.4187817258883246,
"grad_norm": 2.604908851382699,
"learning_rate": 9.533043082345644e-07,
"loss": 0.6022,
"step": 953
},
{
"epoch": 2.4213197969543145,
"grad_norm": 2.7757352275956007,
"learning_rate": 9.452689650664515e-07,
"loss": 0.5459,
"step": 954
},
{
"epoch": 2.4238578680203045,
"grad_norm": 2.9681544482964095,
"learning_rate": 9.372640932899962e-07,
"loss": 0.5155,
"step": 955
},
{
"epoch": 2.4263959390862944,
"grad_norm": 2.821374797475668,
"learning_rate": 9.292897530619843e-07,
"loss": 0.5359,
"step": 956
},
{
"epoch": 2.4289340101522843,
"grad_norm": 6.320399959191023,
"learning_rate": 9.213460043097533e-07,
"loss": 0.551,
"step": 957
},
{
"epoch": 2.431472081218274,
"grad_norm": 2.7724376062249263,
"learning_rate": 9.134329067307485e-07,
"loss": 0.6175,
"step": 958
},
{
"epoch": 2.434010152284264,
"grad_norm": 2.6764330813108947,
"learning_rate": 9.0555051979207e-07,
"loss": 0.473,
"step": 959
},
{
"epoch": 2.436548223350254,
"grad_norm": 2.730873477751639,
"learning_rate": 8.976989027300265e-07,
"loss": 0.5239,
"step": 960
},
{
"epoch": 2.4390862944162435,
"grad_norm": 5.344549869935821,
"learning_rate": 8.898781145496898e-07,
"loss": 0.521,
"step": 961
},
{
"epoch": 2.4416243654822334,
"grad_norm": 3.1454862125612038,
"learning_rate": 8.820882140244541e-07,
"loss": 0.5779,
"step": 962
},
{
"epoch": 2.4441624365482233,
"grad_norm": 3.458054376324759,
"learning_rate": 8.743292596955894e-07,
"loss": 0.6853,
"step": 963
},
{
"epoch": 2.446700507614213,
"grad_norm": 2.481931965861367,
"learning_rate": 8.666013098718068e-07,
"loss": 0.6854,
"step": 964
},
{
"epoch": 2.449238578680203,
"grad_norm": 2.532848059098252,
"learning_rate": 8.589044226288157e-07,
"loss": 0.6088,
"step": 965
},
{
"epoch": 2.451776649746193,
"grad_norm": 2.8979752935507515,
"learning_rate": 8.512386558088919e-07,
"loss": 0.5605,
"step": 966
},
{
"epoch": 2.4543147208121825,
"grad_norm": 2.413856110480652,
"learning_rate": 8.436040670204382e-07,
"loss": 0.6307,
"step": 967
},
{
"epoch": 2.4568527918781724,
"grad_norm": 2.6122764787309567,
"learning_rate": 8.360007136375553e-07,
"loss": 0.4842,
"step": 968
},
{
"epoch": 2.4593908629441623,
"grad_norm": 4.235121776718993,
"learning_rate": 8.284286527996094e-07,
"loss": 0.5184,
"step": 969
},
{
"epoch": 2.4619289340101522,
"grad_norm": 3.45109626188492,
"learning_rate": 8.208879414108006e-07,
"loss": 0.6053,
"step": 970
},
{
"epoch": 2.464467005076142,
"grad_norm": 2.6257857510140252,
"learning_rate": 8.1337863613974e-07,
"loss": 0.5008,
"step": 971
},
{
"epoch": 2.467005076142132,
"grad_norm": 2.3264527378512696,
"learning_rate": 8.059007934190194e-07,
"loss": 0.5054,
"step": 972
},
{
"epoch": 2.469543147208122,
"grad_norm": 4.073372597574256,
"learning_rate": 7.984544694447871e-07,
"loss": 0.5902,
"step": 973
},
{
"epoch": 2.472081218274112,
"grad_norm": 2.5118544268541974,
"learning_rate": 7.910397201763309e-07,
"loss": 0.619,
"step": 974
},
{
"epoch": 2.4746192893401013,
"grad_norm": 2.669293871398301,
"learning_rate": 7.836566013356523e-07,
"loss": 0.5093,
"step": 975
},
{
"epoch": 2.4771573604060912,
"grad_norm": 2.840349366691089,
"learning_rate": 7.763051684070477e-07,
"loss": 0.5888,
"step": 976
},
{
"epoch": 2.479695431472081,
"grad_norm": 2.9032925993956504,
"learning_rate": 7.689854766366972e-07,
"loss": 0.427,
"step": 977
},
{
"epoch": 2.482233502538071,
"grad_norm": 3.3473807928759816,
"learning_rate": 7.61697581032243e-07,
"loss": 0.5261,
"step": 978
},
{
"epoch": 2.484771573604061,
"grad_norm": 2.3723924084628027,
"learning_rate": 7.544415363623792e-07,
"loss": 0.6448,
"step": 979
},
{
"epoch": 2.487309644670051,
"grad_norm": 2.736374696720529,
"learning_rate": 7.472173971564361e-07,
"loss": 0.4803,
"step": 980
},
{
"epoch": 2.489847715736041,
"grad_norm": 3.637118886170699,
"learning_rate": 7.400252177039785e-07,
"loss": 0.5371,
"step": 981
},
{
"epoch": 2.4923857868020303,
"grad_norm": 3.0756471804049856,
"learning_rate": 7.328650520543906e-07,
"loss": 0.507,
"step": 982
},
{
"epoch": 2.49492385786802,
"grad_norm": 4.1622186651564235,
"learning_rate": 7.257369540164727e-07,
"loss": 0.5822,
"step": 983
},
{
"epoch": 2.49746192893401,
"grad_norm": 2.9186699671335603,
"learning_rate": 7.186409771580355e-07,
"loss": 0.4696,
"step": 984
},
{
"epoch": 2.5,
"grad_norm": 2.3325874106320756,
"learning_rate": 7.115771748054995e-07,
"loss": 0.4986,
"step": 985
},
{
"epoch": 2.50253807106599,
"grad_norm": 2.731412272019914,
"learning_rate": 7.045456000434925e-07,
"loss": 0.4617,
"step": 986
},
{
"epoch": 2.50507614213198,
"grad_norm": 6.200544247742595,
"learning_rate": 6.97546305714451e-07,
"loss": 0.5896,
"step": 987
},
{
"epoch": 2.5076142131979697,
"grad_norm": 2.423999317326736,
"learning_rate": 6.905793444182257e-07,
"loss": 0.4819,
"step": 988
},
{
"epoch": 2.5101522842639596,
"grad_norm": 3.3721509568099406,
"learning_rate": 6.83644768511677e-07,
"loss": 0.6536,
"step": 989
},
{
"epoch": 2.512690355329949,
"grad_norm": 3.0729278287610757,
"learning_rate": 6.76742630108298e-07,
"loss": 0.4877,
"step": 990
},
{
"epoch": 2.515228426395939,
"grad_norm": 2.7655715334433157,
"learning_rate": 6.698729810778065e-07,
"loss": 0.5287,
"step": 991
},
{
"epoch": 2.517766497461929,
"grad_norm": 5.420361267055273,
"learning_rate": 6.630358730457648e-07,
"loss": 0.5142,
"step": 992
},
{
"epoch": 2.520304568527919,
"grad_norm": 2.2241447721743004,
"learning_rate": 6.562313573931867e-07,
"loss": 0.5397,
"step": 993
},
{
"epoch": 2.5228426395939088,
"grad_norm": 2.286123538701192,
"learning_rate": 6.494594852561559e-07,
"loss": 0.4858,
"step": 994
},
{
"epoch": 2.525380710659898,
"grad_norm": 2.507215871577876,
"learning_rate": 6.42720307525439e-07,
"loss": 0.6809,
"step": 995
},
{
"epoch": 2.527918781725888,
"grad_norm": 2.483754095566079,
"learning_rate": 6.360138748461015e-07,
"loss": 0.5784,
"step": 996
},
{
"epoch": 2.530456852791878,
"grad_norm": 3.22268003866779,
"learning_rate": 6.293402376171298e-07,
"loss": 0.6292,
"step": 997
},
{
"epoch": 2.532994923857868,
"grad_norm": 3.8860117411826525,
"learning_rate": 6.22699445991054e-07,
"loss": 0.4521,
"step": 998
},
{
"epoch": 2.535532994923858,
"grad_norm": 4.434057239061569,
"learning_rate": 6.160915498735664e-07,
"loss": 0.5582,
"step": 999
},
{
"epoch": 2.5380710659898478,
"grad_norm": 7.565664182710548,
"learning_rate": 6.0951659892315e-07,
"loss": 0.511,
"step": 1000
},
{
"epoch": 2.5406091370558377,
"grad_norm": 2.1821928037343463,
"learning_rate": 6.029746425507032e-07,
"loss": 0.6412,
"step": 1001
},
{
"epoch": 2.5431472081218276,
"grad_norm": 4.229105673009159,
"learning_rate": 5.964657299191712e-07,
"loss": 0.5368,
"step": 1002
},
{
"epoch": 2.5456852791878175,
"grad_norm": 2.2429582646631254,
"learning_rate": 5.899899099431716e-07,
"loss": 0.4948,
"step": 1003
},
{
"epoch": 2.548223350253807,
"grad_norm": 2.184367924546276,
"learning_rate": 5.835472312886342e-07,
"loss": 0.5617,
"step": 1004
},
{
"epoch": 2.550761421319797,
"grad_norm": 2.5036433489360266,
"learning_rate": 5.771377423724272e-07,
"loss": 0.5757,
"step": 1005
},
{
"epoch": 2.553299492385787,
"grad_norm": 2.599186092121524,
"learning_rate": 5.707614913619991e-07,
"loss": 0.6208,
"step": 1006
},
{
"epoch": 2.5558375634517767,
"grad_norm": 3.383363675548512,
"learning_rate": 5.644185261750151e-07,
"loss": 0.6874,
"step": 1007
},
{
"epoch": 2.5583756345177666,
"grad_norm": 3.0495382632934103,
"learning_rate": 5.581088944789953e-07,
"loss": 0.4623,
"step": 1008
},
{
"epoch": 2.560913705583756,
"grad_norm": 2.825397606512342,
"learning_rate": 5.518326436909599e-07,
"loss": 0.6549,
"step": 1009
},
{
"epoch": 2.563451776649746,
"grad_norm": 3.0335585318562512,
"learning_rate": 5.455898209770682e-07,
"loss": 0.6025,
"step": 1010
},
{
"epoch": 2.565989847715736,
"grad_norm": 4.290032799679505,
"learning_rate": 5.393804732522695e-07,
"loss": 0.4766,
"step": 1011
},
{
"epoch": 2.568527918781726,
"grad_norm": 4.641552208588503,
"learning_rate": 5.332046471799468e-07,
"loss": 0.5483,
"step": 1012
},
{
"epoch": 2.5710659898477157,
"grad_norm": 4.309109302142682,
"learning_rate": 5.270623891715659e-07,
"loss": 0.7154,
"step": 1013
},
{
"epoch": 2.5736040609137056,
"grad_norm": 3.083159554412698,
"learning_rate": 5.20953745386329e-07,
"loss": 0.4923,
"step": 1014
},
{
"epoch": 2.5761421319796955,
"grad_norm": 3.160768286522952,
"learning_rate": 5.148787617308271e-07,
"loss": 0.5154,
"step": 1015
},
{
"epoch": 2.5786802030456855,
"grad_norm": 2.512698193443565,
"learning_rate": 5.088374838586924e-07,
"loss": 0.5437,
"step": 1016
},
{
"epoch": 2.5812182741116754,
"grad_norm": 2.7151353977857227,
"learning_rate": 5.028299571702622e-07,
"loss": 0.4243,
"step": 1017
},
{
"epoch": 2.583756345177665,
"grad_norm": 2.6177818738112246,
"learning_rate": 4.968562268122285e-07,
"loss": 0.528,
"step": 1018
},
{
"epoch": 2.5862944162436547,
"grad_norm": 2.6170555417311636,
"learning_rate": 4.909163376773046e-07,
"loss": 0.5059,
"step": 1019
},
{
"epoch": 2.5888324873096447,
"grad_norm": 2.9877171855142555,
"learning_rate": 4.850103344038853e-07,
"loss": 0.597,
"step": 1020
},
{
"epoch": 2.5913705583756346,
"grad_norm": 3.51833599501728,
"learning_rate": 4.791382613757139e-07,
"loss": 0.5726,
"step": 1021
},
{
"epoch": 2.5939086294416245,
"grad_norm": 3.138702316023805,
"learning_rate": 4.7330016272154665e-07,
"loss": 0.4273,
"step": 1022
},
{
"epoch": 2.596446700507614,
"grad_norm": 3.1926684107783037,
"learning_rate": 4.6749608231482113e-07,
"loss": 0.3683,
"step": 1023
},
{
"epoch": 2.598984771573604,
"grad_norm": 2.7769255659459984,
"learning_rate": 4.6172606377332785e-07,
"loss": 0.5339,
"step": 1024
},
{
"epoch": 2.6015228426395938,
"grad_norm": 2.8899240615730157,
"learning_rate": 4.5599015045888096e-07,
"loss": 0.4951,
"step": 1025
},
{
"epoch": 2.6040609137055837,
"grad_norm": 2.292725750412371,
"learning_rate": 4.502883854769935e-07,
"loss": 0.571,
"step": 1026
},
{
"epoch": 2.6065989847715736,
"grad_norm": 3.5657269907131965,
"learning_rate": 4.446208116765532e-07,
"loss": 0.5531,
"step": 1027
},
{
"epoch": 2.6091370558375635,
"grad_norm": 3.164676305535399,
"learning_rate": 4.389874716495013e-07,
"loss": 0.5276,
"step": 1028
},
{
"epoch": 2.6116751269035534,
"grad_norm": 2.432956704979966,
"learning_rate": 4.333884077305062e-07,
"loss": 0.4574,
"step": 1029
},
{
"epoch": 2.6142131979695433,
"grad_norm": 2.957712204580237,
"learning_rate": 4.2782366199665917e-07,
"loss": 0.5386,
"step": 1030
},
{
"epoch": 2.6167512690355332,
"grad_norm": 4.230756909710852,
"learning_rate": 4.222932762671428e-07,
"loss": 0.5932,
"step": 1031
},
{
"epoch": 2.6192893401015227,
"grad_norm": 5.653532698485646,
"learning_rate": 4.167972921029262e-07,
"loss": 0.4788,
"step": 1032
},
{
"epoch": 2.6218274111675126,
"grad_norm": 2.6961243222320808,
"learning_rate": 4.113357508064492e-07,
"loss": 0.6741,
"step": 1033
},
{
"epoch": 2.6243654822335025,
"grad_norm": 2.7983676395323775,
"learning_rate": 4.059086934213141e-07,
"loss": 0.4202,
"step": 1034
},
{
"epoch": 2.6269035532994924,
"grad_norm": 3.9618667349834094,
"learning_rate": 4.005161607319746e-07,
"loss": 0.6864,
"step": 1035
},
{
"epoch": 2.6294416243654823,
"grad_norm": 3.5271934230244923,
"learning_rate": 3.9515819326343017e-07,
"loss": 0.4295,
"step": 1036
},
{
"epoch": 2.631979695431472,
"grad_norm": 6.627886770611666,
"learning_rate": 3.898348312809225e-07,
"loss": 0.6542,
"step": 1037
},
{
"epoch": 2.6345177664974617,
"grad_norm": 2.989736226872522,
"learning_rate": 3.8454611478963235e-07,
"loss": 0.5271,
"step": 1038
},
{
"epoch": 2.6370558375634516,
"grad_norm": 2.693964153812789,
"learning_rate": 3.792920835343794e-07,
"loss": 0.5553,
"step": 1039
},
{
"epoch": 2.6395939086294415,
"grad_norm": 4.193853141678041,
"learning_rate": 3.7407277699932187e-07,
"loss": 0.4709,
"step": 1040
},
{
"epoch": 2.6421319796954315,
"grad_norm": 3.640169779501388,
"learning_rate": 3.688882344076622e-07,
"loss": 0.536,
"step": 1041
},
{
"epoch": 2.6446700507614214,
"grad_norm": 2.2708939549023004,
"learning_rate": 3.637384947213496e-07,
"loss": 0.4799,
"step": 1042
},
{
"epoch": 2.6472081218274113,
"grad_norm": 2.8336682475235793,
"learning_rate": 3.5862359664079026e-07,
"loss": 0.5862,
"step": 1043
},
{
"epoch": 2.649746192893401,
"grad_norm": 3.0122729813814657,
"learning_rate": 3.535435786045538e-07,
"loss": 0.5927,
"step": 1044
},
{
"epoch": 2.652284263959391,
"grad_norm": 3.9233115175621607,
"learning_rate": 3.484984787890855e-07,
"loss": 0.4916,
"step": 1045
},
{
"epoch": 2.6548223350253806,
"grad_norm": 2.1508006487042395,
"learning_rate": 3.434883351084212e-07,
"loss": 0.5636,
"step": 1046
},
{
"epoch": 2.6573604060913705,
"grad_norm": 2.3147748243714057,
"learning_rate": 3.385131852138979e-07,
"loss": 0.5902,
"step": 1047
},
{
"epoch": 2.6598984771573604,
"grad_norm": 3.4618558128201937,
"learning_rate": 3.335730664938758e-07,
"loss": 0.5067,
"step": 1048
},
{
"epoch": 2.6624365482233503,
"grad_norm": 2.5681555150355653,
"learning_rate": 3.286680160734534e-07,
"loss": 0.5044,
"step": 1049
},
{
"epoch": 2.66497461928934,
"grad_norm": 2.5460990508281274,
"learning_rate": 3.237980708141919e-07,
"loss": 0.6435,
"step": 1050
},
{
"epoch": 2.6675126903553297,
"grad_norm": 2.847346337059293,
"learning_rate": 3.1896326731383596e-07,
"loss": 0.4842,
"step": 1051
},
{
"epoch": 2.6700507614213196,
"grad_norm": 2.9472866736988235,
"learning_rate": 3.1416364190603734e-07,
"loss": 0.5303,
"step": 1052
},
{
"epoch": 2.6725888324873095,
"grad_norm": 2.7050299480086752,
"learning_rate": 3.0939923066008517e-07,
"loss": 0.5928,
"step": 1053
},
{
"epoch": 2.6751269035532994,
"grad_norm": 2.4390180219689483,
"learning_rate": 3.046700693806337e-07,
"loss": 0.5757,
"step": 1054
},
{
"epoch": 2.6776649746192893,
"grad_norm": 7.749480947114099,
"learning_rate": 2.99976193607433e-07,
"loss": 0.5894,
"step": 1055
},
{
"epoch": 2.6802030456852792,
"grad_norm": 3.7341604385626073,
"learning_rate": 2.9531763861505967e-07,
"loss": 0.6333,
"step": 1056
},
{
"epoch": 2.682741116751269,
"grad_norm": 3.2249839563418874,
"learning_rate": 2.9069443941265764e-07,
"loss": 0.5734,
"step": 1057
},
{
"epoch": 2.685279187817259,
"grad_norm": 3.6993587505420256,
"learning_rate": 2.8610663074366773e-07,
"loss": 0.5485,
"step": 1058
},
{
"epoch": 2.687817258883249,
"grad_norm": 2.8224525408738375,
"learning_rate": 2.8155424708557365e-07,
"loss": 0.5519,
"step": 1059
},
{
"epoch": 2.6903553299492384,
"grad_norm": 2.7593290213685466,
"learning_rate": 2.770373226496342e-07,
"loss": 0.4702,
"step": 1060
},
{
"epoch": 2.6928934010152283,
"grad_norm": 2.1562630932218627,
"learning_rate": 2.725558913806364e-07,
"loss": 0.5119,
"step": 1061
},
{
"epoch": 2.6954314720812182,
"grad_norm": 2.7385572688548043,
"learning_rate": 2.681099869566328e-07,
"loss": 0.4392,
"step": 1062
},
{
"epoch": 2.697969543147208,
"grad_norm": 3.819660887407419,
"learning_rate": 2.6369964278869174e-07,
"loss": 0.607,
"step": 1063
},
{
"epoch": 2.700507614213198,
"grad_norm": 2.3352928692496877,
"learning_rate": 2.5932489202064535e-07,
"loss": 0.5592,
"step": 1064
},
{
"epoch": 2.703045685279188,
"grad_norm": 2.564229225426386,
"learning_rate": 2.5498576752884087e-07,
"loss": 0.6144,
"step": 1065
},
{
"epoch": 2.7055837563451774,
"grad_norm": 3.773405104057773,
"learning_rate": 2.506823019218918e-07,
"loss": 0.5583,
"step": 1066
},
{
"epoch": 2.7081218274111674,
"grad_norm": 3.3735100724103084,
"learning_rate": 2.464145275404367e-07,
"loss": 0.6441,
"step": 1067
},
{
"epoch": 2.7106598984771573,
"grad_norm": 9.223286176926033,
"learning_rate": 2.4218247645689306e-07,
"loss": 0.5076,
"step": 1068
},
{
"epoch": 2.713197969543147,
"grad_norm": 2.4484463385198474,
"learning_rate": 2.3798618047521372e-07,
"loss": 0.6121,
"step": 1069
},
{
"epoch": 2.715736040609137,
"grad_norm": 5.020879232183135,
"learning_rate": 2.338256711306569e-07,
"loss": 0.5001,
"step": 1070
},
{
"epoch": 2.718274111675127,
"grad_norm": 4.845875804435141,
"learning_rate": 2.2970097968953996e-07,
"loss": 0.4871,
"step": 1071
},
{
"epoch": 2.720812182741117,
"grad_norm": 2.721208919370908,
"learning_rate": 2.2561213714900775e-07,
"loss": 0.5706,
"step": 1072
},
{
"epoch": 2.723350253807107,
"grad_norm": 2.5900068662341096,
"learning_rate": 2.2155917423680063e-07,
"loss": 0.5944,
"step": 1073
},
{
"epoch": 2.7258883248730963,
"grad_norm": 2.7454739994015225,
"learning_rate": 2.175421214110235e-07,
"loss": 0.5512,
"step": 1074
},
{
"epoch": 2.728426395939086,
"grad_norm": 2.1388171014946877,
"learning_rate": 2.1356100885991605e-07,
"loss": 0.5571,
"step": 1075
},
{
"epoch": 2.730964467005076,
"grad_norm": 3.171163810394876,
"learning_rate": 2.0961586650162348e-07,
"loss": 0.4753,
"step": 1076
},
{
"epoch": 2.733502538071066,
"grad_norm": 2.0827428970199593,
"learning_rate": 2.0570672398397716e-07,
"loss": 0.6027,
"step": 1077
},
{
"epoch": 2.736040609137056,
"grad_norm": 2.665595206273622,
"learning_rate": 2.0183361068426778e-07,
"loss": 0.4947,
"step": 1078
},
{
"epoch": 2.738578680203046,
"grad_norm": 2.693865043507274,
"learning_rate": 1.9799655570902576e-07,
"loss": 0.5106,
"step": 1079
},
{
"epoch": 2.7411167512690353,
"grad_norm": 2.3800902578530474,
"learning_rate": 1.941955878938029e-07,
"loss": 0.7141,
"step": 1080
},
{
"epoch": 2.7436548223350252,
"grad_norm": 2.777013918634036,
"learning_rate": 1.9043073580295445e-07,
"loss": 0.5651,
"step": 1081
},
{
"epoch": 2.746192893401015,
"grad_norm": 2.9611036549232117,
"learning_rate": 1.867020277294257e-07,
"loss": 0.5357,
"step": 1082
},
{
"epoch": 2.748730964467005,
"grad_norm": 5.735049614775947,
"learning_rate": 1.830094916945385e-07,
"loss": 0.5207,
"step": 1083
},
{
"epoch": 2.751269035532995,
"grad_norm": 2.401365336192109,
"learning_rate": 1.7935315544778064e-07,
"loss": 0.4534,
"step": 1084
},
{
"epoch": 2.753807106598985,
"grad_norm": 3.4850018370045133,
"learning_rate": 1.757330464665996e-07,
"loss": 0.4799,
"step": 1085
},
{
"epoch": 2.7563451776649748,
"grad_norm": 6.477937909442904,
"learning_rate": 1.721491919561913e-07,
"loss": 0.6103,
"step": 1086
},
{
"epoch": 2.7588832487309647,
"grad_norm": 2.272537968806992,
"learning_rate": 1.686016188493017e-07,
"loss": 0.4658,
"step": 1087
},
{
"epoch": 2.761421319796954,
"grad_norm": 3.5140457515001513,
"learning_rate": 1.650903538060189e-07,
"loss": 0.4613,
"step": 1088
},
{
"epoch": 2.763959390862944,
"grad_norm": 2.9061043312498094,
"learning_rate": 1.6161542321357526e-07,
"loss": 0.7196,
"step": 1089
},
{
"epoch": 2.766497461928934,
"grad_norm": 2.0962438282475198,
"learning_rate": 1.581768531861505e-07,
"loss": 0.4196,
"step": 1090
},
{
"epoch": 2.769035532994924,
"grad_norm": 2.3922675748643774,
"learning_rate": 1.5477466956467345e-07,
"loss": 0.4667,
"step": 1091
},
{
"epoch": 2.771573604060914,
"grad_norm": 3.0462687101587367,
"learning_rate": 1.514088979166256e-07,
"loss": 0.5318,
"step": 1092
},
{
"epoch": 2.7741116751269037,
"grad_norm": 5.670334178126804,
"learning_rate": 1.480795635358556e-07,
"loss": 0.58,
"step": 1093
},
{
"epoch": 2.776649746192893,
"grad_norm": 2.6536444394772767,
"learning_rate": 1.4478669144238345e-07,
"loss": 0.5004,
"step": 1094
},
{
"epoch": 2.779187817258883,
"grad_norm": 2.628505699299408,
"learning_rate": 1.4153030638221377e-07,
"loss": 0.5458,
"step": 1095
},
{
"epoch": 2.781725888324873,
"grad_norm": 7.357134641271442,
"learning_rate": 1.3831043282715007e-07,
"loss": 0.5301,
"step": 1096
},
{
"epoch": 2.784263959390863,
"grad_norm": 5.709242282448729,
"learning_rate": 1.3512709497461417e-07,
"loss": 0.5423,
"step": 1097
},
{
"epoch": 2.786802030456853,
"grad_norm": 2.560755061605383,
"learning_rate": 1.3198031674745814e-07,
"loss": 0.4878,
"step": 1098
},
{
"epoch": 2.7893401015228427,
"grad_norm": 7.521968288073685,
"learning_rate": 1.2887012179378822e-07,
"loss": 0.706,
"step": 1099
},
{
"epoch": 2.7918781725888326,
"grad_norm": 5.23634159896722,
"learning_rate": 1.2579653348678666e-07,
"loss": 0.5018,
"step": 1100
},
{
"epoch": 2.7944162436548226,
"grad_norm": 2.779118492134684,
"learning_rate": 1.2275957492453695e-07,
"loss": 0.4316,
"step": 1101
},
{
"epoch": 2.796954314720812,
"grad_norm": 6.466855188925993,
"learning_rate": 1.1975926892984768e-07,
"loss": 0.5546,
"step": 1102
},
{
"epoch": 2.799492385786802,
"grad_norm": 2.395408752537727,
"learning_rate": 1.1679563805008453e-07,
"loss": 0.5074,
"step": 1103
},
{
"epoch": 2.802030456852792,
"grad_norm": 3.3570845960467066,
"learning_rate": 1.138687045569975e-07,
"loss": 0.5307,
"step": 1104
},
{
"epoch": 2.8045685279187818,
"grad_norm": 3.59995031210504,
"learning_rate": 1.1097849044655496e-07,
"loss": 0.4549,
"step": 1105
},
{
"epoch": 2.8071065989847717,
"grad_norm": 4.307551357849026,
"learning_rate": 1.0812501743877824e-07,
"loss": 0.5132,
"step": 1106
},
{
"epoch": 2.8096446700507616,
"grad_norm": 3.4899798506842132,
"learning_rate": 1.053083069775801e-07,
"loss": 0.4659,
"step": 1107
},
{
"epoch": 2.812182741116751,
"grad_norm": 3.347922930728175,
"learning_rate": 1.0252838023059985e-07,
"loss": 0.579,
"step": 1108
},
{
"epoch": 2.814720812182741,
"grad_norm": 2.748810524512811,
"learning_rate": 9.978525808904738e-08,
"loss": 0.4253,
"step": 1109
},
{
"epoch": 2.817258883248731,
"grad_norm": 2.80515401589405,
"learning_rate": 9.70789611675449e-08,
"loss": 0.6182,
"step": 1110
},
{
"epoch": 2.8197969543147208,
"grad_norm": 3.2009134732352704,
"learning_rate": 9.440950980397268e-08,
"loss": 0.4551,
"step": 1111
},
{
"epoch": 2.8223350253807107,
"grad_norm": 4.606365951182387,
"learning_rate": 9.177692405931637e-08,
"loss": 0.6177,
"step": 1112
},
{
"epoch": 2.8248730964467006,
"grad_norm": 3.267084225461773,
"learning_rate": 8.918122371751381e-08,
"loss": 0.4555,
"step": 1113
},
{
"epoch": 2.8274111675126905,
"grad_norm": 2.537425408509751,
"learning_rate": 8.662242828530953e-08,
"loss": 0.5387,
"step": 1114
},
{
"epoch": 2.8299492385786804,
"grad_norm": 7.031905715818571,
"learning_rate": 8.410055699210718e-08,
"loss": 0.452,
"step": 1115
},
{
"epoch": 2.8324873096446703,
"grad_norm": 2.89545139077652,
"learning_rate": 8.161562878982399e-08,
"loss": 0.4785,
"step": 1116
},
{
"epoch": 2.83502538071066,
"grad_norm": 3.5675178609176323,
"learning_rate": 7.916766235274931e-08,
"loss": 0.5354,
"step": 1117
},
{
"epoch": 2.8375634517766497,
"grad_norm": 2.287121692564442,
"learning_rate": 7.675667607740356e-08,
"loss": 0.5081,
"step": 1118
},
{
"epoch": 2.8401015228426396,
"grad_norm": 4.284144020649474,
"learning_rate": 7.438268808240167e-08,
"loss": 0.5649,
"step": 1119
},
{
"epoch": 2.8426395939086295,
"grad_norm": 2.6441521888851427,
"learning_rate": 7.204571620831436e-08,
"loss": 0.5411,
"step": 1120
},
{
"epoch": 2.8451776649746194,
"grad_norm": 2.544897882073251,
"learning_rate": 6.974577801753591e-08,
"loss": 0.4644,
"step": 1121
},
{
"epoch": 2.847715736040609,
"grad_norm": 3.543692852658327,
"learning_rate": 6.74828907941516e-08,
"loss": 0.6353,
"step": 1122
},
{
"epoch": 2.850253807106599,
"grad_norm": 6.663293623437551,
"learning_rate": 6.52570715438089e-08,
"loss": 0.5767,
"step": 1123
},
{
"epoch": 2.8527918781725887,
"grad_norm": 4.196734730879769,
"learning_rate": 6.306833699358694e-08,
"loss": 0.4889,
"step": 1124
},
{
"epoch": 2.8553299492385786,
"grad_norm": 2.8129447744269807,
"learning_rate": 6.09167035918734e-08,
"loss": 0.4871,
"step": 1125
},
{
"epoch": 2.8578680203045685,
"grad_norm": 2.4298809321794606,
"learning_rate": 5.880218750823952e-08,
"loss": 0.5245,
"step": 1126
},
{
"epoch": 2.8604060913705585,
"grad_norm": 2.514214664332499,
"learning_rate": 5.672480463332075e-08,
"loss": 0.5704,
"step": 1127
},
{
"epoch": 2.8629441624365484,
"grad_norm": 2.523678806401503,
"learning_rate": 5.468457057869358e-08,
"loss": 0.5769,
"step": 1128
},
{
"epoch": 2.8654822335025383,
"grad_norm": 6.062005054394934,
"learning_rate": 5.268150067676114e-08,
"loss": 0.4635,
"step": 1129
},
{
"epoch": 2.868020304568528,
"grad_norm": 5.487134260248954,
"learning_rate": 5.071560998063774e-08,
"loss": 0.4576,
"step": 1130
},
{
"epoch": 2.8705583756345177,
"grad_norm": 3.969605616924183,
"learning_rate": 4.8786913264033955e-08,
"loss": 0.5012,
"step": 1131
},
{
"epoch": 2.8730964467005076,
"grad_norm": 6.889570976912757,
"learning_rate": 4.6895425021147856e-08,
"loss": 0.4497,
"step": 1132
},
{
"epoch": 2.8756345177664975,
"grad_norm": 2.4515069445669457,
"learning_rate": 4.5041159466554516e-08,
"loss": 0.538,
"step": 1133
},
{
"epoch": 2.8781725888324874,
"grad_norm": 5.727328009407996,
"learning_rate": 4.322413053509944e-08,
"loss": 0.4938,
"step": 1134
},
{
"epoch": 2.8807106598984773,
"grad_norm": 5.63850483058095,
"learning_rate": 4.14443518817953e-08,
"loss": 0.6336,
"step": 1135
},
{
"epoch": 2.8832487309644668,
"grad_norm": 4.543104033206608,
"learning_rate": 3.970183688171592e-08,
"loss": 0.6856,
"step": 1136
},
{
"epoch": 2.8857868020304567,
"grad_norm": 9.09297907029027,
"learning_rate": 3.799659862990024e-08,
"loss": 0.7819,
"step": 1137
},
{
"epoch": 2.8883248730964466,
"grad_norm": 3.27038419301367,
"learning_rate": 3.632864994125129e-08,
"loss": 0.5517,
"step": 1138
},
{
"epoch": 2.8908629441624365,
"grad_norm": 3.974886726560242,
"learning_rate": 3.469800335043849e-08,
"loss": 0.5948,
"step": 1139
},
{
"epoch": 2.8934010152284264,
"grad_norm": 6.7615687008901935,
"learning_rate": 3.31046711118066e-08,
"loss": 0.534,
"step": 1140
},
{
"epoch": 2.8959390862944163,
"grad_norm": 3.6418150113945207,
"learning_rate": 3.1548665199282457e-08,
"loss": 0.6924,
"step": 1141
},
{
"epoch": 2.8984771573604062,
"grad_norm": 2.3784081766815444,
"learning_rate": 3.002999730628342e-08,
"loss": 0.5198,
"step": 1142
},
{
"epoch": 2.901015228426396,
"grad_norm": 2.191671552701455,
"learning_rate": 2.8548678845632394e-08,
"loss": 0.5889,
"step": 1143
},
{
"epoch": 2.903553299492386,
"grad_norm": 3.1322620752135983,
"learning_rate": 2.710472094946959e-08,
"loss": 0.4742,
"step": 1144
},
{
"epoch": 2.9060913705583755,
"grad_norm": 3.5217814037906874,
"learning_rate": 2.5698134469169246e-08,
"loss": 0.4336,
"step": 1145
},
{
"epoch": 2.9086294416243654,
"grad_norm": 2.7931780637489267,
"learning_rate": 2.4328929975260262e-08,
"loss": 0.4718,
"step": 1146
},
{
"epoch": 2.9111675126903553,
"grad_norm": 2.960565978633607,
"learning_rate": 2.2997117757344035e-08,
"loss": 0.4998,
"step": 1147
},
{
"epoch": 2.9137055837563453,
"grad_norm": 2.579574803704386,
"learning_rate": 2.1702707824017287e-08,
"loss": 0.5634,
"step": 1148
},
{
"epoch": 2.916243654822335,
"grad_norm": 2.932826440727978,
"learning_rate": 2.0445709902798817e-08,
"loss": 0.4722,
"step": 1149
},
{
"epoch": 2.9187817258883246,
"grad_norm": 2.479489445919581,
"learning_rate": 1.9226133440056194e-08,
"loss": 0.5862,
"step": 1150
},
{
"epoch": 2.9213197969543145,
"grad_norm": 2.401202836041865,
"learning_rate": 1.8043987600932512e-08,
"loss": 0.6049,
"step": 1151
},
{
"epoch": 2.9238578680203045,
"grad_norm": 2.922087096147672,
"learning_rate": 1.6899281269279756e-08,
"loss": 0.5152,
"step": 1152
},
{
"epoch": 2.9263959390862944,
"grad_norm": 2.7792051289106614,
"learning_rate": 1.5792023047589978e-08,
"loss": 0.5535,
"step": 1153
},
{
"epoch": 2.9289340101522843,
"grad_norm": 3.2918821693850773,
"learning_rate": 1.4722221256933677e-08,
"loss": 0.5192,
"step": 1154
},
{
"epoch": 2.931472081218274,
"grad_norm": 3.0985981138740954,
"learning_rate": 1.36898839368943e-08,
"loss": 0.6679,
"step": 1155
},
{
"epoch": 2.934010152284264,
"grad_norm": 2.4402483348583006,
"learning_rate": 1.2695018845508278e-08,
"loss": 0.5892,
"step": 1156
},
{
"epoch": 2.936548223350254,
"grad_norm": 5.683858792689427,
"learning_rate": 1.173763345920953e-08,
"loss": 0.59,
"step": 1157
},
{
"epoch": 2.939086294416244,
"grad_norm": 2.7239649446352794,
"learning_rate": 1.0817734972768946e-08,
"loss": 0.5954,
"step": 1158
},
{
"epoch": 2.9416243654822334,
"grad_norm": 2.4708097002353853,
"learning_rate": 9.935330299244427e-09,
"loss": 0.5463,
"step": 1159
},
{
"epoch": 2.9441624365482233,
"grad_norm": 3.2101057585416926,
"learning_rate": 9.090426069925939e-09,
"loss": 0.5796,
"step": 1160
},
{
"epoch": 2.946700507614213,
"grad_norm": 5.884986825608284,
"learning_rate": 8.283028634287205e-09,
"loss": 0.5288,
"step": 1161
},
{
"epoch": 2.949238578680203,
"grad_norm": 4.916264982936341,
"learning_rate": 7.513144059937417e-09,
"loss": 0.5468,
"step": 1162
},
{
"epoch": 2.951776649746193,
"grad_norm": 2.8617024796040553,
"learning_rate": 6.780778132575716e-09,
"loss": 0.5463,
"step": 1163
},
{
"epoch": 2.9543147208121825,
"grad_norm": 2.9735229315289256,
"learning_rate": 6.085936355947897e-09,
"loss": 0.5341,
"step": 1164
},
{
"epoch": 2.9568527918781724,
"grad_norm": 2.9722625059513015,
"learning_rate": 5.428623951805323e-09,
"loss": 0.5023,
"step": 1165
},
{
"epoch": 2.9593908629441623,
"grad_norm": 2.271860754785485,
"learning_rate": 4.808845859864408e-09,
"loss": 0.5569,
"step": 1166
},
{
"epoch": 2.9619289340101522,
"grad_norm": 2.5277247252299997,
"learning_rate": 4.226606737771643e-09,
"loss": 0.5931,
"step": 1167
},
{
"epoch": 2.964467005076142,
"grad_norm": 2.6218878236770746,
"learning_rate": 3.6819109610658486e-09,
"loss": 0.4648,
"step": 1168
},
{
"epoch": 2.967005076142132,
"grad_norm": 4.7305366536735285,
"learning_rate": 3.1747626231481977e-09,
"loss": 0.6022,
"step": 1169
},
{
"epoch": 2.969543147208122,
"grad_norm": 3.491277684694074,
"learning_rate": 2.7051655352494654e-09,
"loss": 0.5136,
"step": 1170
},
{
"epoch": 2.972081218274112,
"grad_norm": 2.4808907499527,
"learning_rate": 2.273123226401719e-09,
"loss": 0.6368,
"step": 1171
},
{
"epoch": 2.974619289340102,
"grad_norm": 3.8438275108188438,
"learning_rate": 1.8786389434122254e-09,
"loss": 0.5318,
"step": 1172
},
{
"epoch": 2.9771573604060912,
"grad_norm": 5.372296775384867,
"learning_rate": 1.5217156508390286e-09,
"loss": 0.5529,
"step": 1173
},
{
"epoch": 2.979695431472081,
"grad_norm": 4.5433956784200955,
"learning_rate": 1.2023560309687431e-09,
"loss": 0.5474,
"step": 1174
},
{
"epoch": 2.982233502538071,
"grad_norm": 3.3272677410816933,
"learning_rate": 9.205624837949068e-10,
"loss": 0.6755,
"step": 1175
},
{
"epoch": 2.984771573604061,
"grad_norm": 5.764132998482088,
"learning_rate": 6.763371270035457e-10,
"loss": 0.5627,
"step": 1176
},
{
"epoch": 2.987309644670051,
"grad_norm": 2.6228604553150583,
"learning_rate": 4.696817959520816e-10,
"loss": 0.4996,
"step": 1177
},
{
"epoch": 2.9898477157360404,
"grad_norm": 3.5774398995312136,
"learning_rate": 3.005980436604494e-10,
"loss": 0.7654,
"step": 1178
},
{
"epoch": 2.9923857868020303,
"grad_norm": 2.9590286177610383,
"learning_rate": 1.6908714079721944e-10,
"loss": 0.6136,
"step": 1179
},
{
"epoch": 2.99492385786802,
"grad_norm": 2.67848875719845,
"learning_rate": 7.515007566849531e-11,
"loss": 0.6381,
"step": 1180
},
{
"epoch": 2.99746192893401,
"grad_norm": 5.3474868651106044,
"learning_rate": 1.8787554214583227e-11,
"loss": 0.5582,
"step": 1181
},
{
"epoch": 3.0,
"grad_norm": 2.276042560935072,
"learning_rate": 0.0,
"loss": 0.468,
"step": 1182
},
{
"epoch": 3.0,
"step": 1182,
"total_flos": 6.976297471026659e+18,
"train_loss": 0.7863800849406247,
"train_runtime": 8818.5969,
"train_samples_per_second": 8.569,
"train_steps_per_second": 0.134
}
],
"logging_steps": 1.0,
"max_steps": 1182,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.976297471026659e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}