nc-latent-tod-step-2-final / trainer_state.json
Brendan's picture
Upload folder using huggingface_hub
2b8d874 verified
raw
history blame
103 kB
{
"best_metric": 46.9,
"best_model_checkpoint": "/data/users/bking2/tod_zero/outputs/runs/finetune/starcoder_3b/mar_27_bqag30yb_step_2/online_e2e_from_hist_simple_rg/1lye830n/checkpoint-25600",
"epoch": 0.7857388400530374,
"eval_steps": 3200,
"global_step": 32000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 2.389784812927246,
"learning_rate": 2.5e-06,
"loss": 1.1731,
"step": 50
},
{
"epoch": 0.0,
"grad_norm": 1.2176052331924438,
"learning_rate": 5e-06,
"loss": 0.2782,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 1.1055022478103638,
"learning_rate": 4.9999696912850374e-06,
"loss": 0.2302,
"step": 150
},
{
"epoch": 0.0,
"grad_norm": 1.1802890300750732,
"learning_rate": 4.999878765875043e-06,
"loss": 0.2198,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 1.2586899995803833,
"learning_rate": 4.999727225974682e-06,
"loss": 0.2096,
"step": 250
},
{
"epoch": 0.01,
"grad_norm": 1.2082302570343018,
"learning_rate": 4.999515075258341e-06,
"loss": 0.1973,
"step": 300
},
{
"epoch": 0.01,
"grad_norm": 1.153389811515808,
"learning_rate": 4.999242318870029e-06,
"loss": 0.1949,
"step": 350
},
{
"epoch": 0.01,
"grad_norm": 1.0973172187805176,
"learning_rate": 4.998908963423264e-06,
"loss": 0.1886,
"step": 400
},
{
"epoch": 0.01,
"grad_norm": 1.0133867263793945,
"learning_rate": 4.998515017000907e-06,
"loss": 0.1833,
"step": 450
},
{
"epoch": 0.01,
"grad_norm": 0.9143375158309937,
"learning_rate": 4.998060489154965e-06,
"loss": 0.1886,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 0.9145483374595642,
"learning_rate": 4.997545390906362e-06,
"loss": 0.1906,
"step": 550
},
{
"epoch": 0.01,
"grad_norm": 1.067878007888794,
"learning_rate": 4.996969734744671e-06,
"loss": 0.1864,
"step": 600
},
{
"epoch": 0.02,
"grad_norm": 1.201335072517395,
"learning_rate": 4.99633353462781e-06,
"loss": 0.1846,
"step": 650
},
{
"epoch": 0.02,
"grad_norm": 1.024361252784729,
"learning_rate": 4.995636805981707e-06,
"loss": 0.1839,
"step": 700
},
{
"epoch": 0.02,
"grad_norm": 0.9717717170715332,
"learning_rate": 4.99487956569992e-06,
"loss": 0.1818,
"step": 750
},
{
"epoch": 0.02,
"grad_norm": 1.015265703201294,
"learning_rate": 4.994061832143235e-06,
"loss": 0.1907,
"step": 800
},
{
"epoch": 0.02,
"grad_norm": 1.0905097723007202,
"learning_rate": 4.993183625139212e-06,
"loss": 0.1844,
"step": 850
},
{
"epoch": 0.02,
"grad_norm": 1.065933346748352,
"learning_rate": 4.992244965981714e-06,
"loss": 0.181,
"step": 900
},
{
"epoch": 0.02,
"grad_norm": 1.090954065322876,
"learning_rate": 4.991245877430382e-06,
"loss": 0.1809,
"step": 950
},
{
"epoch": 0.02,
"grad_norm": 1.0927408933639526,
"learning_rate": 4.990186383710089e-06,
"loss": 0.176,
"step": 1000
},
{
"epoch": 0.03,
"grad_norm": 1.0833898782730103,
"learning_rate": 4.9890665105103484e-06,
"loss": 0.1778,
"step": 1050
},
{
"epoch": 0.03,
"grad_norm": 0.952042281627655,
"learning_rate": 4.987886284984695e-06,
"loss": 0.1796,
"step": 1100
},
{
"epoch": 0.03,
"grad_norm": 0.9069716930389404,
"learning_rate": 4.986645735750025e-06,
"loss": 0.1788,
"step": 1150
},
{
"epoch": 0.03,
"grad_norm": 0.9301267862319946,
"learning_rate": 4.985344892885899e-06,
"loss": 0.1698,
"step": 1200
},
{
"epoch": 0.03,
"grad_norm": 1.242105484008789,
"learning_rate": 4.98398378793382e-06,
"loss": 0.1748,
"step": 1250
},
{
"epoch": 0.03,
"grad_norm": 1.1251235008239746,
"learning_rate": 4.982562453896458e-06,
"loss": 0.1742,
"step": 1300
},
{
"epoch": 0.03,
"grad_norm": 0.9552658796310425,
"learning_rate": 4.9810809252368615e-06,
"loss": 0.1702,
"step": 1350
},
{
"epoch": 0.03,
"grad_norm": 0.8865312933921814,
"learning_rate": 4.979539237877615e-06,
"loss": 0.1727,
"step": 1400
},
{
"epoch": 0.04,
"grad_norm": 0.9874857068061829,
"learning_rate": 4.977937429199968e-06,
"loss": 0.1701,
"step": 1450
},
{
"epoch": 0.04,
"grad_norm": 1.0164177417755127,
"learning_rate": 4.976275538042932e-06,
"loss": 0.1778,
"step": 1500
},
{
"epoch": 0.04,
"grad_norm": 0.9868525862693787,
"learning_rate": 4.974553604702332e-06,
"loss": 0.1701,
"step": 1550
},
{
"epoch": 0.04,
"grad_norm": 1.0506185293197632,
"learning_rate": 4.972771670929841e-06,
"loss": 0.1701,
"step": 1600
},
{
"epoch": 0.04,
"grad_norm": 1.0885121822357178,
"learning_rate": 4.970929779931955e-06,
"loss": 0.1706,
"step": 1650
},
{
"epoch": 0.04,
"grad_norm": 0.9688003659248352,
"learning_rate": 4.969027976368954e-06,
"loss": 0.1692,
"step": 1700
},
{
"epoch": 0.04,
"grad_norm": 0.9492125511169434,
"learning_rate": 4.967066306353816e-06,
"loss": 0.1748,
"step": 1750
},
{
"epoch": 0.04,
"grad_norm": 0.9759877920150757,
"learning_rate": 4.9650448174510986e-06,
"loss": 0.1666,
"step": 1800
},
{
"epoch": 0.05,
"grad_norm": 0.9228382110595703,
"learning_rate": 4.9629635586757865e-06,
"loss": 0.1661,
"step": 1850
},
{
"epoch": 0.05,
"grad_norm": 1.0862488746643066,
"learning_rate": 4.960822580492103e-06,
"loss": 0.1707,
"step": 1900
},
{
"epoch": 0.05,
"grad_norm": 0.8746851086616516,
"learning_rate": 4.958621934812286e-06,
"loss": 0.165,
"step": 1950
},
{
"epoch": 0.05,
"grad_norm": 0.9456456303596497,
"learning_rate": 4.95636167499533e-06,
"loss": 0.17,
"step": 2000
},
{
"epoch": 0.05,
"grad_norm": 0.9976305365562439,
"learning_rate": 4.9540418558456915e-06,
"loss": 0.1628,
"step": 2050
},
{
"epoch": 0.05,
"grad_norm": 0.9470576643943787,
"learning_rate": 4.951662533611959e-06,
"loss": 0.173,
"step": 2100
},
{
"epoch": 0.05,
"grad_norm": 0.9603348970413208,
"learning_rate": 4.9492237659854946e-06,
"loss": 0.1695,
"step": 2150
},
{
"epoch": 0.05,
"grad_norm": 0.9751543998718262,
"learning_rate": 4.9467256120990255e-06,
"loss": 0.1668,
"step": 2200
},
{
"epoch": 0.06,
"grad_norm": 1.0704251527786255,
"learning_rate": 4.9441681325252215e-06,
"loss": 0.1719,
"step": 2250
},
{
"epoch": 0.06,
"grad_norm": 0.9936717748641968,
"learning_rate": 4.941551389275217e-06,
"loss": 0.1702,
"step": 2300
},
{
"epoch": 0.06,
"grad_norm": 0.9016590118408203,
"learning_rate": 4.938875445797112e-06,
"loss": 0.1604,
"step": 2350
},
{
"epoch": 0.06,
"grad_norm": 0.926601231098175,
"learning_rate": 4.936140366974434e-06,
"loss": 0.1654,
"step": 2400
},
{
"epoch": 0.06,
"grad_norm": 1.0179176330566406,
"learning_rate": 4.933346219124562e-06,
"loss": 0.163,
"step": 2450
},
{
"epoch": 0.06,
"grad_norm": 0.8542476892471313,
"learning_rate": 4.93049306999712e-06,
"loss": 0.1593,
"step": 2500
},
{
"epoch": 0.06,
"grad_norm": 1.3352445363998413,
"learning_rate": 4.927580988772336e-06,
"loss": 0.1615,
"step": 2550
},
{
"epoch": 0.06,
"grad_norm": 1.137984275817871,
"learning_rate": 4.9246100460593606e-06,
"loss": 0.1688,
"step": 2600
},
{
"epoch": 0.07,
"grad_norm": 0.9850485920906067,
"learning_rate": 4.92158031389456e-06,
"loss": 0.1663,
"step": 2650
},
{
"epoch": 0.07,
"grad_norm": 0.8925889134407043,
"learning_rate": 4.918491865739763e-06,
"loss": 0.1598,
"step": 2700
},
{
"epoch": 0.07,
"grad_norm": 0.9840365052223206,
"learning_rate": 4.915344776480487e-06,
"loss": 0.1683,
"step": 2750
},
{
"epoch": 0.07,
"grad_norm": 0.8868953585624695,
"learning_rate": 4.912139122424118e-06,
"loss": 0.16,
"step": 2800
},
{
"epoch": 0.07,
"grad_norm": 0.887232780456543,
"learning_rate": 4.908874981298058e-06,
"loss": 0.1599,
"step": 2850
},
{
"epoch": 0.07,
"grad_norm": 0.8554769158363342,
"learning_rate": 4.9055524322478456e-06,
"loss": 0.1627,
"step": 2900
},
{
"epoch": 0.07,
"grad_norm": 0.8842067122459412,
"learning_rate": 4.902171555835236e-06,
"loss": 0.1582,
"step": 2950
},
{
"epoch": 0.07,
"grad_norm": 0.7667160630226135,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.1616,
"step": 3000
},
{
"epoch": 0.07,
"grad_norm": 0.8488633632659912,
"learning_rate": 4.895235150239159e-06,
"loss": 0.1638,
"step": 3050
},
{
"epoch": 0.08,
"grad_norm": 1.0130559206008911,
"learning_rate": 4.891679789242524e-06,
"loss": 0.1697,
"step": 3100
},
{
"epoch": 0.08,
"grad_norm": 0.9109562635421753,
"learning_rate": 4.8880664372530765e-06,
"loss": 0.1646,
"step": 3150
},
{
"epoch": 0.08,
"grad_norm": 0.957970380783081,
"learning_rate": 4.884395181883661e-06,
"loss": 0.1632,
"step": 3200
},
{
"epoch": 0.08,
"grad_norm": 0.9091508984565735,
"learning_rate": 4.880666112151104e-06,
"loss": 0.1561,
"step": 3250
},
{
"epoch": 0.08,
"grad_norm": 0.9394000172615051,
"learning_rate": 4.876879318474056e-06,
"loss": 0.1645,
"step": 3300
},
{
"epoch": 0.08,
"grad_norm": 0.9433434009552002,
"learning_rate": 4.873034892670795e-06,
"loss": 0.1639,
"step": 3350
},
{
"epoch": 0.08,
"grad_norm": 0.8821176886558533,
"learning_rate": 4.869132927957007e-06,
"loss": 0.1606,
"step": 3400
},
{
"epoch": 0.08,
"grad_norm": 0.8866338133811951,
"learning_rate": 4.8651735189435205e-06,
"loss": 0.1608,
"step": 3450
},
{
"epoch": 0.09,
"grad_norm": 0.990011990070343,
"learning_rate": 4.861156761634014e-06,
"loss": 0.1551,
"step": 3500
},
{
"epoch": 0.09,
"grad_norm": 0.9839064478874207,
"learning_rate": 4.857082753422691e-06,
"loss": 0.1536,
"step": 3550
},
{
"epoch": 0.09,
"grad_norm": 1.0600333213806152,
"learning_rate": 4.852951593091914e-06,
"loss": 0.1585,
"step": 3600
},
{
"epoch": 0.09,
"grad_norm": 1.0477063655853271,
"learning_rate": 4.848763380809811e-06,
"loss": 0.1609,
"step": 3650
},
{
"epoch": 0.09,
"grad_norm": 0.8695195913314819,
"learning_rate": 4.844518218127849e-06,
"loss": 0.156,
"step": 3700
},
{
"epoch": 0.09,
"grad_norm": 0.9280376434326172,
"learning_rate": 4.840216207978368e-06,
"loss": 0.163,
"step": 3750
},
{
"epoch": 0.09,
"grad_norm": 1.030386209487915,
"learning_rate": 4.835857454672087e-06,
"loss": 0.159,
"step": 3800
},
{
"epoch": 0.09,
"grad_norm": 0.9505769610404968,
"learning_rate": 4.831442063895575e-06,
"loss": 0.1631,
"step": 3850
},
{
"epoch": 0.1,
"grad_norm": 1.039372444152832,
"learning_rate": 4.8269701427086905e-06,
"loss": 0.1551,
"step": 3900
},
{
"epoch": 0.1,
"grad_norm": 0.9196427464485168,
"learning_rate": 4.822441799541979e-06,
"loss": 0.1647,
"step": 3950
},
{
"epoch": 0.1,
"grad_norm": 0.8985508680343628,
"learning_rate": 4.8178571441940515e-06,
"loss": 0.1595,
"step": 4000
},
{
"epoch": 0.1,
"grad_norm": 0.8000168800354004,
"learning_rate": 4.813216287828917e-06,
"loss": 0.1605,
"step": 4050
},
{
"epoch": 0.1,
"grad_norm": 0.9402801990509033,
"learning_rate": 4.808519342973289e-06,
"loss": 0.1626,
"step": 4100
},
{
"epoch": 0.1,
"grad_norm": 0.8145874738693237,
"learning_rate": 4.80376642351386e-06,
"loss": 0.1576,
"step": 4150
},
{
"epoch": 0.1,
"grad_norm": 0.8779155611991882,
"learning_rate": 4.798957644694533e-06,
"loss": 0.163,
"step": 4200
},
{
"epoch": 0.1,
"grad_norm": 0.7702597975730896,
"learning_rate": 4.794093123113635e-06,
"loss": 0.1639,
"step": 4250
},
{
"epoch": 0.11,
"grad_norm": 0.8008710741996765,
"learning_rate": 4.789172976721082e-06,
"loss": 0.1616,
"step": 4300
},
{
"epoch": 0.11,
"grad_norm": 0.8573291301727295,
"learning_rate": 4.7841973248155275e-06,
"loss": 0.1588,
"step": 4350
},
{
"epoch": 0.11,
"grad_norm": 0.9034799933433533,
"learning_rate": 4.779166288041463e-06,
"loss": 0.1591,
"step": 4400
},
{
"epoch": 0.11,
"grad_norm": 0.8662201166152954,
"learning_rate": 4.7740799883862966e-06,
"loss": 0.1588,
"step": 4450
},
{
"epoch": 0.11,
"grad_norm": 0.8145610690116882,
"learning_rate": 4.7689385491773934e-06,
"loss": 0.1565,
"step": 4500
},
{
"epoch": 0.11,
"grad_norm": 0.9362510442733765,
"learning_rate": 4.7637420950790855e-06,
"loss": 0.1615,
"step": 4550
},
{
"epoch": 0.11,
"grad_norm": 0.865591824054718,
"learning_rate": 4.75849075208965e-06,
"loss": 0.1611,
"step": 4600
},
{
"epoch": 0.11,
"grad_norm": 0.9170017838478088,
"learning_rate": 4.7531846475382526e-06,
"loss": 0.16,
"step": 4650
},
{
"epoch": 0.12,
"grad_norm": 0.8011581897735596,
"learning_rate": 4.7478239100818626e-06,
"loss": 0.1539,
"step": 4700
},
{
"epoch": 0.12,
"grad_norm": 0.851195216178894,
"learning_rate": 4.742408669702131e-06,
"loss": 0.1552,
"step": 4750
},
{
"epoch": 0.12,
"grad_norm": 0.7801069021224976,
"learning_rate": 4.736939057702239e-06,
"loss": 0.1624,
"step": 4800
},
{
"epoch": 0.12,
"grad_norm": 0.7927130460739136,
"learning_rate": 4.731415206703714e-06,
"loss": 0.1557,
"step": 4850
},
{
"epoch": 0.12,
"grad_norm": 0.8437280058860779,
"learning_rate": 4.725837250643218e-06,
"loss": 0.1532,
"step": 4900
},
{
"epoch": 0.12,
"grad_norm": 0.826627790927887,
"learning_rate": 4.720205324769296e-06,
"loss": 0.1555,
"step": 4950
},
{
"epoch": 0.12,
"grad_norm": 0.8762204647064209,
"learning_rate": 4.714519565639095e-06,
"loss": 0.1551,
"step": 5000
},
{
"epoch": 0.12,
"grad_norm": 0.8985639214515686,
"learning_rate": 4.708780111115058e-06,
"loss": 0.1558,
"step": 5050
},
{
"epoch": 0.13,
"grad_norm": 0.916937530040741,
"learning_rate": 4.702987100361578e-06,
"loss": 0.1563,
"step": 5100
},
{
"epoch": 0.13,
"grad_norm": 0.9731091260910034,
"learning_rate": 4.697140673841624e-06,
"loss": 0.1571,
"step": 5150
},
{
"epoch": 0.13,
"grad_norm": 0.8918693661689758,
"learning_rate": 4.6912409733133365e-06,
"loss": 0.1545,
"step": 5200
},
{
"epoch": 0.13,
"grad_norm": 1.0232261419296265,
"learning_rate": 4.685288141826589e-06,
"loss": 0.1585,
"step": 5250
},
{
"epoch": 0.13,
"grad_norm": 0.7756772041320801,
"learning_rate": 4.679282323719519e-06,
"loss": 0.1566,
"step": 5300
},
{
"epoch": 0.13,
"grad_norm": 0.9180383086204529,
"learning_rate": 4.67322366461503e-06,
"loss": 0.1528,
"step": 5350
},
{
"epoch": 0.13,
"grad_norm": 0.8784381747245789,
"learning_rate": 4.66711231141726e-06,
"loss": 0.1539,
"step": 5400
},
{
"epoch": 0.13,
"grad_norm": 0.8606375455856323,
"learning_rate": 4.660948412308018e-06,
"loss": 0.1545,
"step": 5450
},
{
"epoch": 0.14,
"grad_norm": 0.8549814224243164,
"learning_rate": 4.654732116743193e-06,
"loss": 0.1574,
"step": 5500
},
{
"epoch": 0.14,
"grad_norm": 0.9514050483703613,
"learning_rate": 4.64846357544913e-06,
"loss": 0.1572,
"step": 5550
},
{
"epoch": 0.14,
"grad_norm": 0.7986012101173401,
"learning_rate": 4.642142940418973e-06,
"loss": 0.1532,
"step": 5600
},
{
"epoch": 0.14,
"grad_norm": 1.0129200220108032,
"learning_rate": 4.635770364908984e-06,
"loss": 0.1584,
"step": 5650
},
{
"epoch": 0.14,
"grad_norm": 0.7995598912239075,
"learning_rate": 4.629346003434822e-06,
"loss": 0.1555,
"step": 5700
},
{
"epoch": 0.14,
"grad_norm": 0.8810447454452515,
"learning_rate": 4.622870011767798e-06,
"loss": 0.1523,
"step": 5750
},
{
"epoch": 0.14,
"grad_norm": 0.9137409329414368,
"learning_rate": 4.616342546931103e-06,
"loss": 0.1526,
"step": 5800
},
{
"epoch": 0.14,
"grad_norm": 0.8597812056541443,
"learning_rate": 4.609763767195991e-06,
"loss": 0.1592,
"step": 5850
},
{
"epoch": 0.14,
"grad_norm": 0.6283226013183594,
"learning_rate": 4.603133832077953e-06,
"loss": 0.1557,
"step": 5900
},
{
"epoch": 0.15,
"grad_norm": 0.8470057249069214,
"learning_rate": 4.596452902332839e-06,
"loss": 0.1558,
"step": 5950
},
{
"epoch": 0.15,
"grad_norm": 0.9492647647857666,
"learning_rate": 4.589721139952964e-06,
"loss": 0.1523,
"step": 6000
},
{
"epoch": 0.15,
"grad_norm": 0.8647842407226562,
"learning_rate": 4.582938708163183e-06,
"loss": 0.1593,
"step": 6050
},
{
"epoch": 0.15,
"grad_norm": 0.8241806626319885,
"learning_rate": 4.576105771416928e-06,
"loss": 0.1513,
"step": 6100
},
{
"epoch": 0.15,
"grad_norm": 0.7603068947792053,
"learning_rate": 4.569222495392227e-06,
"loss": 0.1589,
"step": 6150
},
{
"epoch": 0.15,
"grad_norm": 0.7924914360046387,
"learning_rate": 4.562289046987679e-06,
"loss": 0.149,
"step": 6200
},
{
"epoch": 0.15,
"grad_norm": 0.8768645524978638,
"learning_rate": 4.555305594318414e-06,
"loss": 0.156,
"step": 6250
},
{
"epoch": 0.15,
"grad_norm": 0.6861469745635986,
"learning_rate": 4.548272306712013e-06,
"loss": 0.1496,
"step": 6300
},
{
"epoch": 0.16,
"grad_norm": 0.8879762291908264,
"learning_rate": 4.541189354704403e-06,
"loss": 0.1512,
"step": 6350
},
{
"epoch": 0.16,
"grad_norm": 0.8703266978263855,
"learning_rate": 4.534056910035724e-06,
"loss": 0.1464,
"step": 6400
},
{
"epoch": 0.16,
"grad_norm": 0.8554527163505554,
"learning_rate": 4.5268751456461605e-06,
"loss": 0.1486,
"step": 6450
},
{
"epoch": 0.16,
"grad_norm": 1.0276597738265991,
"learning_rate": 4.5196442356717526e-06,
"loss": 0.152,
"step": 6500
},
{
"epoch": 0.16,
"grad_norm": 0.8549512624740601,
"learning_rate": 4.512364355440172e-06,
"loss": 0.15,
"step": 6550
},
{
"epoch": 0.16,
"grad_norm": 0.8074272871017456,
"learning_rate": 4.505035681466472e-06,
"loss": 0.1507,
"step": 6600
},
{
"epoch": 0.16,
"grad_norm": 0.8379577994346619,
"learning_rate": 4.497658391448803e-06,
"loss": 0.1516,
"step": 6650
},
{
"epoch": 0.16,
"grad_norm": 0.8211039304733276,
"learning_rate": 4.49023266426411e-06,
"loss": 0.1534,
"step": 6700
},
{
"epoch": 0.17,
"grad_norm": 0.8751665949821472,
"learning_rate": 4.482758679963792e-06,
"loss": 0.1502,
"step": 6750
},
{
"epoch": 0.17,
"grad_norm": 0.8499336242675781,
"learning_rate": 4.475236619769336e-06,
"loss": 0.1521,
"step": 6800
},
{
"epoch": 0.17,
"grad_norm": 0.8008933663368225,
"learning_rate": 4.4676666660679265e-06,
"loss": 0.1514,
"step": 6850
},
{
"epoch": 0.17,
"grad_norm": 0.8242402672767639,
"learning_rate": 4.460049002408018e-06,
"loss": 0.1551,
"step": 6900
},
{
"epoch": 0.17,
"grad_norm": 0.7586780786514282,
"learning_rate": 4.452383813494887e-06,
"loss": 0.1496,
"step": 6950
},
{
"epoch": 0.17,
"grad_norm": 0.792927622795105,
"learning_rate": 4.444671285186155e-06,
"loss": 0.1525,
"step": 7000
},
{
"epoch": 0.17,
"grad_norm": 0.9768459796905518,
"learning_rate": 4.4369116044872786e-06,
"loss": 0.1547,
"step": 7050
},
{
"epoch": 0.17,
"grad_norm": 0.9388589262962341,
"learning_rate": 4.42910495954702e-06,
"loss": 0.1453,
"step": 7100
},
{
"epoch": 0.18,
"grad_norm": 0.7469547390937805,
"learning_rate": 4.421251539652879e-06,
"loss": 0.1476,
"step": 7150
},
{
"epoch": 0.18,
"grad_norm": 0.7823628187179565,
"learning_rate": 4.413351535226507e-06,
"loss": 0.1483,
"step": 7200
},
{
"epoch": 0.18,
"grad_norm": 0.8263921141624451,
"learning_rate": 4.4054051378190915e-06,
"loss": 0.1484,
"step": 7250
},
{
"epoch": 0.18,
"grad_norm": 0.8380225896835327,
"learning_rate": 4.397412540106707e-06,
"loss": 0.1504,
"step": 7300
},
{
"epoch": 0.18,
"grad_norm": 0.8395451903343201,
"learning_rate": 4.3893739358856465e-06,
"loss": 0.1529,
"step": 7350
},
{
"epoch": 0.18,
"grad_norm": 0.8702168464660645,
"learning_rate": 4.38128952006772e-06,
"loss": 0.1526,
"step": 7400
},
{
"epoch": 0.18,
"grad_norm": 0.9074561595916748,
"learning_rate": 4.373159488675533e-06,
"loss": 0.1429,
"step": 7450
},
{
"epoch": 0.18,
"grad_norm": 0.6499825119972229,
"learning_rate": 4.364984038837727e-06,
"loss": 0.1477,
"step": 7500
},
{
"epoch": 0.19,
"grad_norm": 0.9074704647064209,
"learning_rate": 4.356763368784207e-06,
"loss": 0.148,
"step": 7550
},
{
"epoch": 0.19,
"grad_norm": 0.8064557909965515,
"learning_rate": 4.348497677841328e-06,
"loss": 0.1488,
"step": 7600
},
{
"epoch": 0.19,
"grad_norm": 0.884131133556366,
"learning_rate": 4.340187166427067e-06,
"loss": 0.1482,
"step": 7650
},
{
"epoch": 0.19,
"grad_norm": 0.8771503567695618,
"learning_rate": 4.331832036046162e-06,
"loss": 0.1554,
"step": 7700
},
{
"epoch": 0.19,
"grad_norm": 0.800499677658081,
"learning_rate": 4.323432489285223e-06,
"loss": 0.1476,
"step": 7750
},
{
"epoch": 0.19,
"grad_norm": 0.8588670492172241,
"learning_rate": 4.3149887298078275e-06,
"loss": 0.1491,
"step": 7800
},
{
"epoch": 0.19,
"grad_norm": 0.8966203927993774,
"learning_rate": 4.306500962349573e-06,
"loss": 0.1524,
"step": 7850
},
{
"epoch": 0.19,
"grad_norm": 0.7843323945999146,
"learning_rate": 4.2979693927131205e-06,
"loss": 0.1491,
"step": 7900
},
{
"epoch": 0.2,
"grad_norm": 0.857551634311676,
"learning_rate": 4.289394227763199e-06,
"loss": 0.1492,
"step": 7950
},
{
"epoch": 0.2,
"grad_norm": 0.8421270251274109,
"learning_rate": 4.2807756754215926e-06,
"loss": 0.1546,
"step": 8000
},
{
"epoch": 0.2,
"grad_norm": 0.8497046232223511,
"learning_rate": 4.272113944662099e-06,
"loss": 0.1498,
"step": 8050
},
{
"epoch": 0.2,
"grad_norm": 0.817263662815094,
"learning_rate": 4.263409245505461e-06,
"loss": 0.1453,
"step": 8100
},
{
"epoch": 0.2,
"grad_norm": 0.8704351186752319,
"learning_rate": 4.254661789014274e-06,
"loss": 0.1535,
"step": 8150
},
{
"epoch": 0.2,
"grad_norm": 0.7390695214271545,
"learning_rate": 4.2458717872878715e-06,
"loss": 0.1423,
"step": 8200
},
{
"epoch": 0.2,
"grad_norm": 0.869274914264679,
"learning_rate": 4.237039453457179e-06,
"loss": 0.1464,
"step": 8250
},
{
"epoch": 0.2,
"grad_norm": 0.8410429954528809,
"learning_rate": 4.228165001679547e-06,
"loss": 0.1449,
"step": 8300
},
{
"epoch": 0.21,
"grad_norm": 0.8956751227378845,
"learning_rate": 4.219248647133559e-06,
"loss": 0.1511,
"step": 8350
},
{
"epoch": 0.21,
"grad_norm": 0.9139490723609924,
"learning_rate": 4.210290606013813e-06,
"loss": 0.1478,
"step": 8400
},
{
"epoch": 0.21,
"grad_norm": 0.8980191946029663,
"learning_rate": 4.2012910955256825e-06,
"loss": 0.1502,
"step": 8450
},
{
"epoch": 0.21,
"grad_norm": 0.8101931810379028,
"learning_rate": 4.192250333880045e-06,
"loss": 0.1511,
"step": 8500
},
{
"epoch": 0.21,
"grad_norm": 0.8072584867477417,
"learning_rate": 4.183168540287995e-06,
"loss": 0.1396,
"step": 8550
},
{
"epoch": 0.21,
"grad_norm": 0.8878731727600098,
"learning_rate": 4.174045934955527e-06,
"loss": 0.1487,
"step": 8600
},
{
"epoch": 0.21,
"grad_norm": 0.761406421661377,
"learning_rate": 4.164882739078197e-06,
"loss": 0.143,
"step": 8650
},
{
"epoch": 0.21,
"grad_norm": 0.8410043120384216,
"learning_rate": 4.155679174835758e-06,
"loss": 0.1534,
"step": 8700
},
{
"epoch": 0.21,
"grad_norm": 0.8776586651802063,
"learning_rate": 4.146435465386776e-06,
"loss": 0.1466,
"step": 8750
},
{
"epoch": 0.22,
"grad_norm": 1.0027079582214355,
"learning_rate": 4.137151834863213e-06,
"loss": 0.1521,
"step": 8800
},
{
"epoch": 0.22,
"grad_norm": 0.8652539253234863,
"learning_rate": 4.1278285083649985e-06,
"loss": 0.1483,
"step": 8850
},
{
"epoch": 0.22,
"grad_norm": 0.9300382733345032,
"learning_rate": 4.11846571195457e-06,
"loss": 0.1428,
"step": 8900
},
{
"epoch": 0.22,
"grad_norm": 0.7292830348014832,
"learning_rate": 4.1090636726513875e-06,
"loss": 0.1465,
"step": 8950
},
{
"epoch": 0.22,
"grad_norm": 0.8459763526916504,
"learning_rate": 4.0996226184264355e-06,
"loss": 0.1486,
"step": 9000
},
{
"epoch": 0.22,
"grad_norm": 0.8546915650367737,
"learning_rate": 4.090142778196692e-06,
"loss": 0.1461,
"step": 9050
},
{
"epoch": 0.22,
"grad_norm": 0.8114922046661377,
"learning_rate": 4.080624381819577e-06,
"loss": 0.1476,
"step": 9100
},
{
"epoch": 0.22,
"grad_norm": 0.820453941822052,
"learning_rate": 4.071067660087379e-06,
"loss": 0.1391,
"step": 9150
},
{
"epoch": 0.23,
"grad_norm": 0.7920337915420532,
"learning_rate": 4.061472844721664e-06,
"loss": 0.1435,
"step": 9200
},
{
"epoch": 0.23,
"grad_norm": 0.8869473338127136,
"learning_rate": 4.05184016836765e-06,
"loss": 0.1498,
"step": 9250
},
{
"epoch": 0.23,
"grad_norm": 0.7248154282569885,
"learning_rate": 4.042169864588571e-06,
"loss": 0.1465,
"step": 9300
},
{
"epoch": 0.23,
"grad_norm": 0.7333469390869141,
"learning_rate": 4.032462167860012e-06,
"loss": 0.1451,
"step": 9350
},
{
"epoch": 0.23,
"grad_norm": 0.8676118850708008,
"learning_rate": 4.022717313564223e-06,
"loss": 0.1469,
"step": 9400
},
{
"epoch": 0.23,
"grad_norm": 0.7327333688735962,
"learning_rate": 4.012935537984414e-06,
"loss": 0.1448,
"step": 9450
},
{
"epoch": 0.23,
"grad_norm": 0.9415409564971924,
"learning_rate": 4.0031170782990214e-06,
"loss": 0.147,
"step": 9500
},
{
"epoch": 0.23,
"grad_norm": 0.9164848923683167,
"learning_rate": 3.993262172575962e-06,
"loss": 0.1429,
"step": 9550
},
{
"epoch": 0.24,
"grad_norm": 0.8720986843109131,
"learning_rate": 3.983371059766862e-06,
"loss": 0.1454,
"step": 9600
},
{
"epoch": 0.24,
"grad_norm": 0.7514834403991699,
"learning_rate": 3.973443979701252e-06,
"loss": 0.1464,
"step": 9650
},
{
"epoch": 0.24,
"grad_norm": 0.7921479344367981,
"learning_rate": 3.963481173080768e-06,
"loss": 0.1449,
"step": 9700
},
{
"epoch": 0.24,
"grad_norm": 0.8018845915794373,
"learning_rate": 3.9534828814733e-06,
"loss": 0.144,
"step": 9750
},
{
"epoch": 0.24,
"grad_norm": 0.810478687286377,
"learning_rate": 3.943449347307146e-06,
"loss": 0.1512,
"step": 9800
},
{
"epoch": 0.24,
"grad_norm": 0.8357745409011841,
"learning_rate": 3.9333808138651265e-06,
"loss": 0.1475,
"step": 9850
},
{
"epoch": 0.24,
"grad_norm": 0.7317201495170593,
"learning_rate": 3.923277525278691e-06,
"loss": 0.1475,
"step": 9900
},
{
"epoch": 0.24,
"grad_norm": 0.9418444037437439,
"learning_rate": 3.913139726521993e-06,
"loss": 0.1431,
"step": 9950
},
{
"epoch": 0.25,
"grad_norm": 0.8349087238311768,
"learning_rate": 3.9029676634059565e-06,
"loss": 0.141,
"step": 10000
},
{
"epoch": 0.25,
"grad_norm": 0.7476102709770203,
"learning_rate": 3.89276158257231e-06,
"loss": 0.1404,
"step": 10050
},
{
"epoch": 0.25,
"grad_norm": 0.8648724555969238,
"learning_rate": 3.882521731487609e-06,
"loss": 0.1454,
"step": 10100
},
{
"epoch": 0.25,
"grad_norm": 0.8712032437324524,
"learning_rate": 3.872248358437236e-06,
"loss": 0.1438,
"step": 10150
},
{
"epoch": 0.25,
"grad_norm": 0.8766721487045288,
"learning_rate": 3.861941712519379e-06,
"loss": 0.1452,
"step": 10200
},
{
"epoch": 0.25,
"grad_norm": 0.9583675861358643,
"learning_rate": 3.8516020436389945e-06,
"loss": 0.1472,
"step": 10250
},
{
"epoch": 0.25,
"grad_norm": 0.777285635471344,
"learning_rate": 3.841229602501742e-06,
"loss": 0.1511,
"step": 10300
},
{
"epoch": 0.25,
"grad_norm": 0.874595046043396,
"learning_rate": 3.8308246406079116e-06,
"loss": 0.143,
"step": 10350
},
{
"epoch": 0.26,
"grad_norm": 0.8689395189285278,
"learning_rate": 3.820387410246324e-06,
"loss": 0.1471,
"step": 10400
},
{
"epoch": 0.26,
"grad_norm": 0.8526681065559387,
"learning_rate": 3.809918164488208e-06,
"loss": 0.1514,
"step": 10450
},
{
"epoch": 0.26,
"grad_norm": 0.8526254892349243,
"learning_rate": 3.7994171571810756e-06,
"loss": 0.1385,
"step": 10500
},
{
"epoch": 0.26,
"grad_norm": 0.9596450924873352,
"learning_rate": 3.788884642942555e-06,
"loss": 0.1436,
"step": 10550
},
{
"epoch": 0.26,
"grad_norm": 0.8162502646446228,
"learning_rate": 3.7783208771542237e-06,
"loss": 0.143,
"step": 10600
},
{
"epoch": 0.26,
"grad_norm": 0.7438496351242065,
"learning_rate": 3.7677261159554145e-06,
"loss": 0.1472,
"step": 10650
},
{
"epoch": 0.26,
"grad_norm": 0.918170154094696,
"learning_rate": 3.757100616237006e-06,
"loss": 0.1441,
"step": 10700
},
{
"epoch": 0.26,
"grad_norm": 0.8643715381622314,
"learning_rate": 3.746444635635191e-06,
"loss": 0.1425,
"step": 10750
},
{
"epoch": 0.27,
"grad_norm": 0.849587619304657,
"learning_rate": 3.735758432525234e-06,
"loss": 0.1456,
"step": 10800
},
{
"epoch": 0.27,
"grad_norm": 0.8149674534797668,
"learning_rate": 3.725042266015201e-06,
"loss": 0.1465,
"step": 10850
},
{
"epoch": 0.27,
"grad_norm": 0.8201355934143066,
"learning_rate": 3.7142963959396805e-06,
"loss": 0.1483,
"step": 10900
},
{
"epoch": 0.27,
"grad_norm": 0.7340033054351807,
"learning_rate": 3.7035210828534846e-06,
"loss": 0.1435,
"step": 10950
},
{
"epoch": 0.27,
"grad_norm": 0.8350439667701721,
"learning_rate": 3.692716588025327e-06,
"loss": 0.1413,
"step": 11000
},
{
"epoch": 0.27,
"grad_norm": 0.8975796103477478,
"learning_rate": 3.68188317343149e-06,
"loss": 0.1405,
"step": 11050
},
{
"epoch": 0.27,
"grad_norm": 0.820162832736969,
"learning_rate": 3.671021101749476e-06,
"loss": 0.1418,
"step": 11100
},
{
"epoch": 0.27,
"grad_norm": 0.8355934023857117,
"learning_rate": 3.6601306363516297e-06,
"loss": 0.1426,
"step": 11150
},
{
"epoch": 0.28,
"grad_norm": 0.8989911079406738,
"learning_rate": 3.649212041298763e-06,
"loss": 0.1435,
"step": 11200
},
{
"epoch": 0.28,
"grad_norm": 0.8896788954734802,
"learning_rate": 3.638265581333742e-06,
"loss": 0.1398,
"step": 11250
},
{
"epoch": 0.28,
"grad_norm": 0.924511730670929,
"learning_rate": 3.627291521875076e-06,
"loss": 0.1421,
"step": 11300
},
{
"epoch": 0.28,
"grad_norm": 0.8179125785827637,
"learning_rate": 3.616290129010476e-06,
"loss": 0.1423,
"step": 11350
},
{
"epoch": 0.28,
"grad_norm": 0.9186579585075378,
"learning_rate": 3.605261669490407e-06,
"loss": 0.1379,
"step": 11400
},
{
"epoch": 0.28,
"grad_norm": 0.8218767642974854,
"learning_rate": 3.5942064107216183e-06,
"loss": 0.1458,
"step": 11450
},
{
"epoch": 0.28,
"grad_norm": 0.73914635181427,
"learning_rate": 3.5831246207606597e-06,
"loss": 0.1385,
"step": 11500
},
{
"epoch": 0.28,
"grad_norm": 0.8135074377059937,
"learning_rate": 3.57201656830738e-06,
"loss": 0.1358,
"step": 11550
},
{
"epoch": 0.28,
"grad_norm": 0.8401809334754944,
"learning_rate": 3.5608825226984168e-06,
"loss": 0.141,
"step": 11600
},
{
"epoch": 0.29,
"grad_norm": 0.8126397728919983,
"learning_rate": 3.549722753900662e-06,
"loss": 0.1478,
"step": 11650
},
{
"epoch": 0.29,
"grad_norm": 0.8558050990104675,
"learning_rate": 3.5385375325047167e-06,
"loss": 0.1388,
"step": 11700
},
{
"epoch": 0.29,
"grad_norm": 0.9053475260734558,
"learning_rate": 3.5273271297183302e-06,
"loss": 0.1379,
"step": 11750
},
{
"epoch": 0.29,
"grad_norm": 0.9160645008087158,
"learning_rate": 3.516091817359825e-06,
"loss": 0.1438,
"step": 11800
},
{
"epoch": 0.29,
"grad_norm": 0.8678522109985352,
"learning_rate": 3.5048318678515052e-06,
"loss": 0.1444,
"step": 11850
},
{
"epoch": 0.29,
"grad_norm": 0.7352308630943298,
"learning_rate": 3.493547554213051e-06,
"loss": 0.1449,
"step": 11900
},
{
"epoch": 0.29,
"grad_norm": 0.8845266699790955,
"learning_rate": 3.482239150054898e-06,
"loss": 0.1413,
"step": 11950
},
{
"epoch": 0.29,
"grad_norm": 0.8861889839172363,
"learning_rate": 3.470906929571605e-06,
"loss": 0.1432,
"step": 12000
},
{
"epoch": 0.3,
"grad_norm": 0.7789872884750366,
"learning_rate": 3.459551167535205e-06,
"loss": 0.1384,
"step": 12050
},
{
"epoch": 0.3,
"grad_norm": 0.7118023037910461,
"learning_rate": 3.4481721392885415e-06,
"loss": 0.1397,
"step": 12100
},
{
"epoch": 0.3,
"grad_norm": 0.7726086974143982,
"learning_rate": 3.4367701207385944e-06,
"loss": 0.1412,
"step": 12150
},
{
"epoch": 0.3,
"grad_norm": 0.7531469464302063,
"learning_rate": 3.425345388349787e-06,
"loss": 0.135,
"step": 12200
},
{
"epoch": 0.3,
"grad_norm": 0.7321942448616028,
"learning_rate": 3.4138982191372838e-06,
"loss": 0.1371,
"step": 12250
},
{
"epoch": 0.3,
"grad_norm": 0.930092990398407,
"learning_rate": 3.402428890660279e-06,
"loss": 0.1402,
"step": 12300
},
{
"epoch": 0.3,
"grad_norm": 0.7779068946838379,
"learning_rate": 3.390937681015256e-06,
"loss": 0.1343,
"step": 12350
},
{
"epoch": 0.3,
"grad_norm": 0.8833639621734619,
"learning_rate": 3.379424868829254e-06,
"loss": 0.1403,
"step": 12400
},
{
"epoch": 0.31,
"grad_norm": 0.8694601058959961,
"learning_rate": 3.367890733253108e-06,
"loss": 0.144,
"step": 12450
},
{
"epoch": 0.31,
"grad_norm": 0.822138249874115,
"learning_rate": 3.3563355539546795e-06,
"loss": 0.1432,
"step": 12500
},
{
"epoch": 0.31,
"grad_norm": 0.9551856517791748,
"learning_rate": 3.3447596111120767e-06,
"loss": 0.1442,
"step": 12550
},
{
"epoch": 0.31,
"grad_norm": 0.8066045641899109,
"learning_rate": 3.333163185406861e-06,
"loss": 0.1386,
"step": 12600
},
{
"epoch": 0.31,
"grad_norm": 0.7707632184028625,
"learning_rate": 3.321546558017243e-06,
"loss": 0.1482,
"step": 12650
},
{
"epoch": 0.31,
"grad_norm": 0.8924170136451721,
"learning_rate": 3.309910010611259e-06,
"loss": 0.1393,
"step": 12700
},
{
"epoch": 0.31,
"grad_norm": 0.7188828587532043,
"learning_rate": 3.29825382533995e-06,
"loss": 0.1355,
"step": 12750
},
{
"epoch": 0.31,
"grad_norm": 0.816042423248291,
"learning_rate": 3.286578284830513e-06,
"loss": 0.1415,
"step": 12800
},
{
"epoch": 0.32,
"grad_norm": 0.7887862324714661,
"learning_rate": 3.2748836721794514e-06,
"loss": 0.1396,
"step": 12850
},
{
"epoch": 0.32,
"grad_norm": 0.9086594581604004,
"learning_rate": 3.263170270945709e-06,
"loss": 0.1434,
"step": 12900
},
{
"epoch": 0.32,
"grad_norm": 0.9660589694976807,
"learning_rate": 3.2514383651437987e-06,
"loss": 0.139,
"step": 12950
},
{
"epoch": 0.32,
"grad_norm": 0.9063233733177185,
"learning_rate": 3.239688239236911e-06,
"loss": 0.1403,
"step": 13000
},
{
"epoch": 0.32,
"grad_norm": 0.7167203426361084,
"learning_rate": 3.2279201781300206e-06,
"loss": 0.1453,
"step": 13050
},
{
"epoch": 0.32,
"grad_norm": 0.784888505935669,
"learning_rate": 3.2161344671629736e-06,
"loss": 0.1396,
"step": 13100
},
{
"epoch": 0.32,
"grad_norm": 0.7880986332893372,
"learning_rate": 3.2043313921035747e-06,
"loss": 0.141,
"step": 13150
},
{
"epoch": 0.32,
"grad_norm": 0.9474193453788757,
"learning_rate": 3.1925112391406534e-06,
"loss": 0.136,
"step": 13200
},
{
"epoch": 0.33,
"grad_norm": 0.757166862487793,
"learning_rate": 3.1806742948771276e-06,
"loss": 0.1408,
"step": 13250
},
{
"epoch": 0.33,
"grad_norm": 0.883098840713501,
"learning_rate": 3.168820846323053e-06,
"loss": 0.1378,
"step": 13300
},
{
"epoch": 0.33,
"grad_norm": 0.8062676191329956,
"learning_rate": 3.1569511808886633e-06,
"loss": 0.1374,
"step": 13350
},
{
"epoch": 0.33,
"grad_norm": 0.8611062169075012,
"learning_rate": 3.1450655863774053e-06,
"loss": 0.1398,
"step": 13400
},
{
"epoch": 0.33,
"grad_norm": 0.8143792152404785,
"learning_rate": 3.1331643509789553e-06,
"loss": 0.1383,
"step": 13450
},
{
"epoch": 0.33,
"grad_norm": 0.9166560173034668,
"learning_rate": 3.121247763262235e-06,
"loss": 0.1427,
"step": 13500
},
{
"epoch": 0.33,
"grad_norm": 0.8066914081573486,
"learning_rate": 3.1093161121684118e-06,
"loss": 0.1345,
"step": 13550
},
{
"epoch": 0.33,
"grad_norm": 0.7582921385765076,
"learning_rate": 3.097369687003896e-06,
"loss": 0.1451,
"step": 13600
},
{
"epoch": 0.34,
"grad_norm": 0.7752835750579834,
"learning_rate": 3.085408777433323e-06,
"loss": 0.1385,
"step": 13650
},
{
"epoch": 0.34,
"grad_norm": 0.667438268661499,
"learning_rate": 3.0734336734725327e-06,
"loss": 0.1371,
"step": 13700
},
{
"epoch": 0.34,
"grad_norm": 0.8165872097015381,
"learning_rate": 3.0614446654815346e-06,
"loss": 0.135,
"step": 13750
},
{
"epoch": 0.34,
"grad_norm": 0.7894701361656189,
"learning_rate": 3.049442044157469e-06,
"loss": 0.1365,
"step": 13800
},
{
"epoch": 0.34,
"grad_norm": 0.8602648377418518,
"learning_rate": 3.0374261005275606e-06,
"loss": 0.1388,
"step": 13850
},
{
"epoch": 0.34,
"grad_norm": 0.8391554355621338,
"learning_rate": 3.025397125942056e-06,
"loss": 0.1367,
"step": 13900
},
{
"epoch": 0.34,
"grad_norm": 0.8266942501068115,
"learning_rate": 3.0133554120671653e-06,
"loss": 0.1369,
"step": 13950
},
{
"epoch": 0.34,
"grad_norm": 0.8058880567550659,
"learning_rate": 3.001301250877987e-06,
"loss": 0.1356,
"step": 14000
},
{
"epoch": 0.34,
"grad_norm": 0.895440936088562,
"learning_rate": 2.9892349346514306e-06,
"loss": 0.1378,
"step": 14050
},
{
"epoch": 0.35,
"grad_norm": 0.8078271150588989,
"learning_rate": 2.977156755959126e-06,
"loss": 0.1352,
"step": 14100
},
{
"epoch": 0.35,
"grad_norm": 0.9933186769485474,
"learning_rate": 2.9650670076603342e-06,
"loss": 0.1405,
"step": 14150
},
{
"epoch": 0.35,
"grad_norm": 0.8396628499031067,
"learning_rate": 2.952965982894844e-06,
"loss": 0.1428,
"step": 14200
},
{
"epoch": 0.35,
"grad_norm": 0.7942902445793152,
"learning_rate": 2.9408539750758625e-06,
"loss": 0.1375,
"step": 14250
},
{
"epoch": 0.35,
"grad_norm": 0.8997230529785156,
"learning_rate": 2.9287312778829047e-06,
"loss": 0.1408,
"step": 14300
},
{
"epoch": 0.35,
"grad_norm": 0.9185475707054138,
"learning_rate": 2.9165981852546688e-06,
"loss": 0.1417,
"step": 14350
},
{
"epoch": 0.35,
"grad_norm": 0.8807886838912964,
"learning_rate": 2.9044549913819125e-06,
"loss": 0.1407,
"step": 14400
},
{
"epoch": 0.35,
"grad_norm": 0.8130568265914917,
"learning_rate": 2.892301990700316e-06,
"loss": 0.1331,
"step": 14450
},
{
"epoch": 0.36,
"grad_norm": 0.8746042251586914,
"learning_rate": 2.8801394778833475e-06,
"loss": 0.1365,
"step": 14500
},
{
"epoch": 0.36,
"grad_norm": 0.8365629315376282,
"learning_rate": 2.8679677478351147e-06,
"loss": 0.1418,
"step": 14550
},
{
"epoch": 0.36,
"grad_norm": 0.8888775706291199,
"learning_rate": 2.8557870956832135e-06,
"loss": 0.1346,
"step": 14600
},
{
"epoch": 0.36,
"grad_norm": 0.8894440531730652,
"learning_rate": 2.8435978167715753e-06,
"loss": 0.1349,
"step": 14650
},
{
"epoch": 0.36,
"grad_norm": 0.8231815695762634,
"learning_rate": 2.8314002066533053e-06,
"loss": 0.1351,
"step": 14700
},
{
"epoch": 0.36,
"grad_norm": 0.9063069820404053,
"learning_rate": 2.8191945610835138e-06,
"loss": 0.1371,
"step": 14750
},
{
"epoch": 0.36,
"grad_norm": 0.8974840641021729,
"learning_rate": 2.8069811760121463e-06,
"loss": 0.135,
"step": 14800
},
{
"epoch": 0.36,
"grad_norm": 0.9878427386283875,
"learning_rate": 2.794760347576809e-06,
"loss": 0.1424,
"step": 14850
},
{
"epoch": 0.37,
"grad_norm": 0.8393834233283997,
"learning_rate": 2.7825323720955853e-06,
"loss": 0.1407,
"step": 14900
},
{
"epoch": 0.37,
"grad_norm": 0.8515870571136475,
"learning_rate": 2.7702975460598545e-06,
"loss": 0.1397,
"step": 14950
},
{
"epoch": 0.37,
"grad_norm": 0.8178338408470154,
"learning_rate": 2.7580561661271015e-06,
"loss": 0.1341,
"step": 15000
},
{
"epoch": 0.37,
"grad_norm": 0.7944337725639343,
"learning_rate": 2.7458085291137213e-06,
"loss": 0.1365,
"step": 15050
},
{
"epoch": 0.37,
"grad_norm": 0.8307095766067505,
"learning_rate": 2.733554931987825e-06,
"loss": 0.1351,
"step": 15100
},
{
"epoch": 0.37,
"grad_norm": 0.8896898031234741,
"learning_rate": 2.7212956718620404e-06,
"loss": 0.1368,
"step": 15150
},
{
"epoch": 0.37,
"grad_norm": 0.8878371715545654,
"learning_rate": 2.709031045986302e-06,
"loss": 0.1369,
"step": 15200
},
{
"epoch": 0.37,
"grad_norm": 1.0110993385314941,
"learning_rate": 2.6967613517406514e-06,
"loss": 0.1315,
"step": 15250
},
{
"epoch": 0.38,
"grad_norm": 0.676525890827179,
"learning_rate": 2.68448688662802e-06,
"loss": 0.1356,
"step": 15300
},
{
"epoch": 0.38,
"grad_norm": 0.8003978729248047,
"learning_rate": 2.6722079482670196e-06,
"loss": 0.1332,
"step": 15350
},
{
"epoch": 0.38,
"grad_norm": 0.700488805770874,
"learning_rate": 2.6599248343847243e-06,
"loss": 0.1318,
"step": 15400
},
{
"epoch": 0.38,
"grad_norm": 0.844950258731842,
"learning_rate": 2.6476378428094523e-06,
"loss": 0.137,
"step": 15450
},
{
"epoch": 0.38,
"grad_norm": 0.7908570766448975,
"learning_rate": 2.6353472714635443e-06,
"loss": 0.137,
"step": 15500
},
{
"epoch": 0.38,
"grad_norm": 0.8521822094917297,
"learning_rate": 2.6230534183561385e-06,
"loss": 0.1348,
"step": 15550
},
{
"epoch": 0.38,
"grad_norm": 0.7813879251480103,
"learning_rate": 2.6107565815759473e-06,
"loss": 0.1337,
"step": 15600
},
{
"epoch": 0.38,
"grad_norm": 0.8601763844490051,
"learning_rate": 2.598457059284027e-06,
"loss": 0.1405,
"step": 15650
},
{
"epoch": 0.39,
"grad_norm": 0.9580998420715332,
"learning_rate": 2.5861551497065497e-06,
"loss": 0.1367,
"step": 15700
},
{
"epoch": 0.39,
"grad_norm": 0.8153945207595825,
"learning_rate": 2.5738511511275716e-06,
"loss": 0.1367,
"step": 15750
},
{
"epoch": 0.39,
"grad_norm": 0.8746289610862732,
"learning_rate": 2.5615453618818033e-06,
"loss": 0.1361,
"step": 15800
},
{
"epoch": 0.39,
"grad_norm": 0.8852072954177856,
"learning_rate": 2.5492380803473705e-06,
"loss": 0.1373,
"step": 15850
},
{
"epoch": 0.39,
"grad_norm": 0.8347833752632141,
"learning_rate": 2.5369296049385837e-06,
"loss": 0.1316,
"step": 15900
},
{
"epoch": 0.39,
"grad_norm": 0.8382280468940735,
"learning_rate": 2.5246202340987004e-06,
"loss": 0.1335,
"step": 15950
},
{
"epoch": 0.39,
"grad_norm": 0.8533499836921692,
"learning_rate": 2.5123102662926912e-06,
"loss": 0.1361,
"step": 16000
},
{
"epoch": 0.39,
"grad_norm": 0.6924258470535278,
"learning_rate": 2.5e-06,
"loss": 0.1371,
"step": 16050
},
{
"epoch": 0.4,
"grad_norm": 0.8405711650848389,
"learning_rate": 2.4876897337073105e-06,
"loss": 0.1367,
"step": 16100
},
{
"epoch": 0.4,
"grad_norm": 0.7599152326583862,
"learning_rate": 2.475379765901301e-06,
"loss": 0.1344,
"step": 16150
},
{
"epoch": 0.4,
"grad_norm": 0.8320915699005127,
"learning_rate": 2.4630703950614176e-06,
"loss": 0.1328,
"step": 16200
},
{
"epoch": 0.4,
"grad_norm": 0.8469275832176208,
"learning_rate": 2.45076191965263e-06,
"loss": 0.1337,
"step": 16250
},
{
"epoch": 0.4,
"grad_norm": 0.8450738191604614,
"learning_rate": 2.4384546381181975e-06,
"loss": 0.1324,
"step": 16300
},
{
"epoch": 0.4,
"grad_norm": 0.8245053887367249,
"learning_rate": 2.4261488488724284e-06,
"loss": 0.1355,
"step": 16350
},
{
"epoch": 0.4,
"grad_norm": 0.8532336950302124,
"learning_rate": 2.413844850293451e-06,
"loss": 0.1317,
"step": 16400
},
{
"epoch": 0.4,
"grad_norm": 0.8361835479736328,
"learning_rate": 2.4015429407159746e-06,
"loss": 0.1339,
"step": 16450
},
{
"epoch": 0.41,
"grad_norm": 0.8320481777191162,
"learning_rate": 2.3892434184240536e-06,
"loss": 0.1341,
"step": 16500
},
{
"epoch": 0.41,
"grad_norm": 0.9355595111846924,
"learning_rate": 2.3769465816438627e-06,
"loss": 0.1395,
"step": 16550
},
{
"epoch": 0.41,
"grad_norm": 0.8267269134521484,
"learning_rate": 2.3646527285364565e-06,
"loss": 0.1377,
"step": 16600
},
{
"epoch": 0.41,
"grad_norm": 0.7989060878753662,
"learning_rate": 2.3523621571905485e-06,
"loss": 0.1349,
"step": 16650
},
{
"epoch": 0.41,
"grad_norm": 0.7633017301559448,
"learning_rate": 2.340075165615276e-06,
"loss": 0.1298,
"step": 16700
},
{
"epoch": 0.41,
"grad_norm": 0.789726972579956,
"learning_rate": 2.3277920517329813e-06,
"loss": 0.1332,
"step": 16750
},
{
"epoch": 0.41,
"grad_norm": 0.9312193393707275,
"learning_rate": 2.315513113371981e-06,
"loss": 0.1332,
"step": 16800
},
{
"epoch": 0.41,
"grad_norm": 0.7161571979522705,
"learning_rate": 2.303238648259349e-06,
"loss": 0.1286,
"step": 16850
},
{
"epoch": 0.41,
"grad_norm": 0.8490265607833862,
"learning_rate": 2.2909689540136986e-06,
"loss": 0.1313,
"step": 16900
},
{
"epoch": 0.42,
"grad_norm": 0.7703916430473328,
"learning_rate": 2.27870432813796e-06,
"loss": 0.136,
"step": 16950
},
{
"epoch": 0.42,
"grad_norm": 0.8781723976135254,
"learning_rate": 2.2664450680121757e-06,
"loss": 0.1299,
"step": 17000
},
{
"epoch": 0.42,
"grad_norm": 0.88892662525177,
"learning_rate": 2.254191470886279e-06,
"loss": 0.1346,
"step": 17050
},
{
"epoch": 0.42,
"grad_norm": 0.9157165884971619,
"learning_rate": 2.241943833872899e-06,
"loss": 0.1361,
"step": 17100
},
{
"epoch": 0.42,
"grad_norm": 0.9071301221847534,
"learning_rate": 2.2297024539401463e-06,
"loss": 0.138,
"step": 17150
},
{
"epoch": 0.42,
"grad_norm": 0.7209107279777527,
"learning_rate": 2.2174676279044155e-06,
"loss": 0.1304,
"step": 17200
},
{
"epoch": 0.42,
"grad_norm": 0.8024885058403015,
"learning_rate": 2.2052396524231924e-06,
"loss": 0.1368,
"step": 17250
},
{
"epoch": 0.42,
"grad_norm": 0.8576869964599609,
"learning_rate": 2.193018823987854e-06,
"loss": 0.1339,
"step": 17300
},
{
"epoch": 0.43,
"grad_norm": 0.7553292512893677,
"learning_rate": 2.180805438916487e-06,
"loss": 0.1309,
"step": 17350
},
{
"epoch": 0.43,
"grad_norm": 0.7075461745262146,
"learning_rate": 2.1685997933466947e-06,
"loss": 0.1336,
"step": 17400
},
{
"epoch": 0.43,
"grad_norm": 0.8082211017608643,
"learning_rate": 2.1564021832284255e-06,
"loss": 0.1366,
"step": 17450
},
{
"epoch": 0.43,
"grad_norm": 0.904383659362793,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.1353,
"step": 17500
},
{
"epoch": 0.43,
"grad_norm": 0.7685850262641907,
"learning_rate": 2.1320322521648857e-06,
"loss": 0.1348,
"step": 17550
},
{
"epoch": 0.43,
"grad_norm": 0.9629550576210022,
"learning_rate": 2.119860522116653e-06,
"loss": 0.1355,
"step": 17600
},
{
"epoch": 0.43,
"grad_norm": 0.8548963665962219,
"learning_rate": 2.1076980092996845e-06,
"loss": 0.1314,
"step": 17650
},
{
"epoch": 0.43,
"grad_norm": 0.8409056067466736,
"learning_rate": 2.0955450086180883e-06,
"loss": 0.1333,
"step": 17700
},
{
"epoch": 0.44,
"grad_norm": 0.8298507928848267,
"learning_rate": 2.083401814745332e-06,
"loss": 0.1343,
"step": 17750
},
{
"epoch": 0.44,
"grad_norm": 0.8255162239074707,
"learning_rate": 2.071268722117096e-06,
"loss": 0.134,
"step": 17800
},
{
"epoch": 0.44,
"grad_norm": 0.8696017861366272,
"learning_rate": 2.0591460249241383e-06,
"loss": 0.134,
"step": 17850
},
{
"epoch": 0.44,
"grad_norm": 0.838620126247406,
"learning_rate": 2.0470340171051567e-06,
"loss": 0.135,
"step": 17900
},
{
"epoch": 0.44,
"grad_norm": 0.7810779213905334,
"learning_rate": 2.034932992339666e-06,
"loss": 0.1375,
"step": 17950
},
{
"epoch": 0.44,
"grad_norm": 0.7645576000213623,
"learning_rate": 2.022843244040874e-06,
"loss": 0.1351,
"step": 18000
},
{
"epoch": 0.44,
"grad_norm": 0.8255435824394226,
"learning_rate": 2.0107650653485707e-06,
"loss": 0.1349,
"step": 18050
},
{
"epoch": 0.44,
"grad_norm": 0.8783218264579773,
"learning_rate": 1.998698749122014e-06,
"loss": 0.1307,
"step": 18100
},
{
"epoch": 0.45,
"grad_norm": 0.9770956039428711,
"learning_rate": 1.986644587932835e-06,
"loss": 0.1369,
"step": 18150
},
{
"epoch": 0.45,
"grad_norm": 1.0978344678878784,
"learning_rate": 1.9746028740579453e-06,
"loss": 0.1367,
"step": 18200
},
{
"epoch": 0.45,
"grad_norm": 0.7640126943588257,
"learning_rate": 1.96257389947244e-06,
"loss": 0.1365,
"step": 18250
},
{
"epoch": 0.45,
"grad_norm": 0.7604764103889465,
"learning_rate": 1.9505579558425315e-06,
"loss": 0.1379,
"step": 18300
},
{
"epoch": 0.45,
"grad_norm": 0.7669352889060974,
"learning_rate": 1.938555334518466e-06,
"loss": 0.137,
"step": 18350
},
{
"epoch": 0.45,
"grad_norm": 0.8201314806938171,
"learning_rate": 1.926566326527468e-06,
"loss": 0.1303,
"step": 18400
},
{
"epoch": 0.45,
"grad_norm": 0.8664389252662659,
"learning_rate": 1.914591222566678e-06,
"loss": 0.1341,
"step": 18450
},
{
"epoch": 0.45,
"grad_norm": 0.9102927446365356,
"learning_rate": 1.9026303129961049e-06,
"loss": 0.1339,
"step": 18500
},
{
"epoch": 0.46,
"grad_norm": 0.8137445449829102,
"learning_rate": 1.8906838878315886e-06,
"loss": 0.13,
"step": 18550
},
{
"epoch": 0.46,
"grad_norm": 0.7645391225814819,
"learning_rate": 1.878752236737765e-06,
"loss": 0.1327,
"step": 18600
},
{
"epoch": 0.46,
"grad_norm": 0.8791195750236511,
"learning_rate": 1.8668356490210449e-06,
"loss": 0.1342,
"step": 18650
},
{
"epoch": 0.46,
"grad_norm": 0.8669970631599426,
"learning_rate": 1.8549344136225946e-06,
"loss": 0.1343,
"step": 18700
},
{
"epoch": 0.46,
"grad_norm": 0.8047628402709961,
"learning_rate": 1.8430488191113373e-06,
"loss": 0.1303,
"step": 18750
},
{
"epoch": 0.46,
"grad_norm": 0.9549434781074524,
"learning_rate": 1.8311791536769485e-06,
"loss": 0.1309,
"step": 18800
},
{
"epoch": 0.46,
"grad_norm": 0.9143491983413696,
"learning_rate": 1.819325705122873e-06,
"loss": 0.1327,
"step": 18850
},
{
"epoch": 0.46,
"grad_norm": 0.8247759342193604,
"learning_rate": 1.8074887608593477e-06,
"loss": 0.1332,
"step": 18900
},
{
"epoch": 0.47,
"grad_norm": 0.939913809299469,
"learning_rate": 1.7956686078964257e-06,
"loss": 0.1313,
"step": 18950
},
{
"epoch": 0.47,
"grad_norm": 0.9549752473831177,
"learning_rate": 1.7838655328370268e-06,
"loss": 0.1329,
"step": 19000
},
{
"epoch": 0.47,
"grad_norm": 0.8385360240936279,
"learning_rate": 1.7720798218699798e-06,
"loss": 0.1294,
"step": 19050
},
{
"epoch": 0.47,
"grad_norm": 0.7431653141975403,
"learning_rate": 1.7603117607630892e-06,
"loss": 0.1356,
"step": 19100
},
{
"epoch": 0.47,
"grad_norm": 0.6874954700469971,
"learning_rate": 1.7485616348562023e-06,
"loss": 0.13,
"step": 19150
},
{
"epoch": 0.47,
"grad_norm": 0.883647084236145,
"learning_rate": 1.7368297290542918e-06,
"loss": 0.1313,
"step": 19200
},
{
"epoch": 0.47,
"grad_norm": 0.9019405841827393,
"learning_rate": 1.72511632782055e-06,
"loss": 0.1319,
"step": 19250
},
{
"epoch": 0.47,
"grad_norm": 0.8680510520935059,
"learning_rate": 1.7134217151694873e-06,
"loss": 0.1331,
"step": 19300
},
{
"epoch": 0.48,
"grad_norm": 0.7705395817756653,
"learning_rate": 1.7017461746600506e-06,
"loss": 0.1318,
"step": 19350
},
{
"epoch": 0.48,
"grad_norm": 0.9004321694374084,
"learning_rate": 1.690089989388741e-06,
"loss": 0.1376,
"step": 19400
},
{
"epoch": 0.48,
"grad_norm": 0.8036971688270569,
"learning_rate": 1.678453441982758e-06,
"loss": 0.1319,
"step": 19450
},
{
"epoch": 0.48,
"grad_norm": 1.0252909660339355,
"learning_rate": 1.66683681459314e-06,
"loss": 0.1303,
"step": 19500
},
{
"epoch": 0.48,
"grad_norm": 0.9045292735099792,
"learning_rate": 1.6552403888879243e-06,
"loss": 0.1316,
"step": 19550
},
{
"epoch": 0.48,
"grad_norm": 0.8599112629890442,
"learning_rate": 1.6436644460453218e-06,
"loss": 0.1304,
"step": 19600
},
{
"epoch": 0.48,
"grad_norm": 0.8230019807815552,
"learning_rate": 1.6321092667468926e-06,
"loss": 0.1307,
"step": 19650
},
{
"epoch": 0.48,
"grad_norm": 0.8907728791236877,
"learning_rate": 1.6205751311707463e-06,
"loss": 0.1328,
"step": 19700
},
{
"epoch": 0.48,
"grad_norm": 0.8626874685287476,
"learning_rate": 1.6090623189847443e-06,
"loss": 0.1294,
"step": 19750
},
{
"epoch": 0.49,
"grad_norm": 1.0895453691482544,
"learning_rate": 1.5975711093397223e-06,
"loss": 0.1321,
"step": 19800
},
{
"epoch": 0.49,
"grad_norm": 0.7280532717704773,
"learning_rate": 1.5861017808627167e-06,
"loss": 0.1319,
"step": 19850
},
{
"epoch": 0.49,
"grad_norm": 0.7881760597229004,
"learning_rate": 1.574654611650214e-06,
"loss": 0.1336,
"step": 19900
},
{
"epoch": 0.49,
"grad_norm": 0.9470044374465942,
"learning_rate": 1.5632298792614064e-06,
"loss": 0.1351,
"step": 19950
},
{
"epoch": 0.49,
"grad_norm": 0.8242919445037842,
"learning_rate": 1.5518278607114585e-06,
"loss": 0.1316,
"step": 20000
},
{
"epoch": 0.49,
"grad_norm": 0.8981928825378418,
"learning_rate": 1.540448832464796e-06,
"loss": 0.1288,
"step": 20050
},
{
"epoch": 0.49,
"grad_norm": 0.8890678882598877,
"learning_rate": 1.5290930704283953e-06,
"loss": 0.1311,
"step": 20100
},
{
"epoch": 0.49,
"grad_norm": 1.0418756008148193,
"learning_rate": 1.517760849945103e-06,
"loss": 0.1275,
"step": 20150
},
{
"epoch": 0.5,
"grad_norm": 0.9723849892616272,
"learning_rate": 1.5064524457869506e-06,
"loss": 0.1291,
"step": 20200
},
{
"epoch": 0.5,
"grad_norm": 0.9670615792274475,
"learning_rate": 1.4951681321484952e-06,
"loss": 0.1307,
"step": 20250
},
{
"epoch": 0.5,
"grad_norm": 0.8188683390617371,
"learning_rate": 1.4839081826401756e-06,
"loss": 0.126,
"step": 20300
},
{
"epoch": 0.5,
"grad_norm": 0.9228366613388062,
"learning_rate": 1.47267287028167e-06,
"loss": 0.1331,
"step": 20350
},
{
"epoch": 0.5,
"grad_norm": 0.980305552482605,
"learning_rate": 1.4614624674952843e-06,
"loss": 0.1289,
"step": 20400
},
{
"epoch": 0.5,
"grad_norm": 0.803564727306366,
"learning_rate": 1.4502772460993387e-06,
"loss": 0.1292,
"step": 20450
},
{
"epoch": 0.5,
"grad_norm": 0.9198700785636902,
"learning_rate": 1.4391174773015836e-06,
"loss": 0.1267,
"step": 20500
},
{
"epoch": 0.5,
"grad_norm": 0.7790723443031311,
"learning_rate": 1.4279834316926217e-06,
"loss": 0.1253,
"step": 20550
},
{
"epoch": 0.51,
"grad_norm": 0.729775071144104,
"learning_rate": 1.4168753792393413e-06,
"loss": 0.134,
"step": 20600
},
{
"epoch": 0.51,
"grad_norm": 0.9813298583030701,
"learning_rate": 1.405793589278382e-06,
"loss": 0.1317,
"step": 20650
},
{
"epoch": 0.51,
"grad_norm": 0.9131382703781128,
"learning_rate": 1.394738330509593e-06,
"loss": 0.1333,
"step": 20700
},
{
"epoch": 0.51,
"grad_norm": 0.7938083410263062,
"learning_rate": 1.3837098709895246e-06,
"loss": 0.127,
"step": 20750
},
{
"epoch": 0.51,
"grad_norm": 0.8520334362983704,
"learning_rate": 1.3727084781249251e-06,
"loss": 0.1297,
"step": 20800
},
{
"epoch": 0.51,
"grad_norm": 0.9451938271522522,
"learning_rate": 1.3617344186662585e-06,
"loss": 0.1299,
"step": 20850
},
{
"epoch": 0.51,
"grad_norm": 0.7489553093910217,
"learning_rate": 1.3507879587012378e-06,
"loss": 0.1297,
"step": 20900
},
{
"epoch": 0.51,
"grad_norm": 0.9317044615745544,
"learning_rate": 1.3398693636483707e-06,
"loss": 0.1291,
"step": 20950
},
{
"epoch": 0.52,
"grad_norm": 0.9955152273178101,
"learning_rate": 1.328978898250525e-06,
"loss": 0.1342,
"step": 21000
},
{
"epoch": 0.52,
"grad_norm": 0.9062507748603821,
"learning_rate": 1.31811682656851e-06,
"loss": 0.1317,
"step": 21050
},
{
"epoch": 0.52,
"grad_norm": 0.781728982925415,
"learning_rate": 1.307283411974674e-06,
"loss": 0.1304,
"step": 21100
},
{
"epoch": 0.52,
"grad_norm": 0.8539147973060608,
"learning_rate": 1.2964789171465164e-06,
"loss": 0.1294,
"step": 21150
},
{
"epoch": 0.52,
"grad_norm": 1.017155647277832,
"learning_rate": 1.2857036040603204e-06,
"loss": 0.1277,
"step": 21200
},
{
"epoch": 0.52,
"grad_norm": 0.9131125211715698,
"learning_rate": 1.2749577339848007e-06,
"loss": 0.1294,
"step": 21250
},
{
"epoch": 0.52,
"grad_norm": 0.7943052649497986,
"learning_rate": 1.2642415674747675e-06,
"loss": 0.1263,
"step": 21300
},
{
"epoch": 0.52,
"grad_norm": 0.8513858914375305,
"learning_rate": 1.25355536436481e-06,
"loss": 0.1269,
"step": 21350
},
{
"epoch": 0.53,
"grad_norm": 0.8048068284988403,
"learning_rate": 1.2428993837629943e-06,
"loss": 0.1316,
"step": 21400
},
{
"epoch": 0.53,
"grad_norm": 0.8899339437484741,
"learning_rate": 1.2322738840445867e-06,
"loss": 0.1332,
"step": 21450
},
{
"epoch": 0.53,
"grad_norm": 0.7358732223510742,
"learning_rate": 1.2216791228457778e-06,
"loss": 0.1282,
"step": 21500
},
{
"epoch": 0.53,
"grad_norm": 1.0263513326644897,
"learning_rate": 1.2111153570574454e-06,
"loss": 0.1279,
"step": 21550
},
{
"epoch": 0.53,
"grad_norm": 0.9039693474769592,
"learning_rate": 1.2005828428189256e-06,
"loss": 0.1291,
"step": 21600
},
{
"epoch": 0.53,
"grad_norm": 0.8572480082511902,
"learning_rate": 1.1900818355117918e-06,
"loss": 0.127,
"step": 21650
},
{
"epoch": 0.53,
"grad_norm": 0.7938106656074524,
"learning_rate": 1.1796125897536782e-06,
"loss": 0.127,
"step": 21700
},
{
"epoch": 0.53,
"grad_norm": 0.7319739460945129,
"learning_rate": 1.1691753593920884e-06,
"loss": 0.1276,
"step": 21750
},
{
"epoch": 0.54,
"grad_norm": 0.8481085896492004,
"learning_rate": 1.1587703974982583e-06,
"loss": 0.1303,
"step": 21800
},
{
"epoch": 0.54,
"grad_norm": 0.9394869804382324,
"learning_rate": 1.148397956361007e-06,
"loss": 0.1256,
"step": 21850
},
{
"epoch": 0.54,
"grad_norm": 0.842454195022583,
"learning_rate": 1.1380582874806208e-06,
"loss": 0.1318,
"step": 21900
},
{
"epoch": 0.54,
"grad_norm": 0.8142734169960022,
"learning_rate": 1.127751641562765e-06,
"loss": 0.1338,
"step": 21950
},
{
"epoch": 0.54,
"grad_norm": 0.9196761250495911,
"learning_rate": 1.1174782685123919e-06,
"loss": 0.1301,
"step": 22000
},
{
"epoch": 0.54,
"grad_norm": 0.8244367837905884,
"learning_rate": 1.107238417427691e-06,
"loss": 0.1312,
"step": 22050
},
{
"epoch": 0.54,
"grad_norm": 0.8315288424491882,
"learning_rate": 1.0970323365940443e-06,
"loss": 0.1317,
"step": 22100
},
{
"epoch": 0.54,
"grad_norm": 0.9578205943107605,
"learning_rate": 1.0868602734780075e-06,
"loss": 0.1286,
"step": 22150
},
{
"epoch": 0.55,
"grad_norm": 0.8732833862304688,
"learning_rate": 1.0767224747213102e-06,
"loss": 0.1256,
"step": 22200
},
{
"epoch": 0.55,
"grad_norm": 0.9204590916633606,
"learning_rate": 1.0666191861348741e-06,
"loss": 0.1281,
"step": 22250
},
{
"epoch": 0.55,
"grad_norm": 0.8349716663360596,
"learning_rate": 1.0565506526928548e-06,
"loss": 0.1369,
"step": 22300
},
{
"epoch": 0.55,
"grad_norm": 0.9580523371696472,
"learning_rate": 1.0465171185267007e-06,
"loss": 0.1224,
"step": 22350
},
{
"epoch": 0.55,
"grad_norm": 0.8979122638702393,
"learning_rate": 1.036518826919233e-06,
"loss": 0.1296,
"step": 22400
},
{
"epoch": 0.55,
"grad_norm": 0.9922088384628296,
"learning_rate": 1.0265560202987474e-06,
"loss": 0.1324,
"step": 22450
},
{
"epoch": 0.55,
"grad_norm": 0.9371756911277771,
"learning_rate": 1.0166289402331391e-06,
"loss": 0.128,
"step": 22500
},
{
"epoch": 0.55,
"grad_norm": 0.8509624600410461,
"learning_rate": 1.006737827424038e-06,
"loss": 0.1324,
"step": 22550
},
{
"epoch": 0.55,
"grad_norm": 0.8830806612968445,
"learning_rate": 9.9688292170098e-07,
"loss": 0.1277,
"step": 22600
},
{
"epoch": 0.56,
"grad_norm": 0.7790393233299255,
"learning_rate": 9.870644620155878e-07,
"loss": 0.1312,
"step": 22650
},
{
"epoch": 0.56,
"grad_norm": 0.8227584958076477,
"learning_rate": 9.77282686435777e-07,
"loss": 0.1272,
"step": 22700
},
{
"epoch": 0.56,
"grad_norm": 0.907335102558136,
"learning_rate": 9.67537832139989e-07,
"loss": 0.1292,
"step": 22750
},
{
"epoch": 0.56,
"grad_norm": 0.9107279777526855,
"learning_rate": 9.578301354114292e-07,
"loss": 0.1317,
"step": 22800
},
{
"epoch": 0.56,
"grad_norm": 0.8445246815681458,
"learning_rate": 9.481598316323504e-07,
"loss": 0.1291,
"step": 22850
},
{
"epoch": 0.56,
"grad_norm": 0.9479144215583801,
"learning_rate": 9.385271552783376e-07,
"loss": 0.1292,
"step": 22900
},
{
"epoch": 0.56,
"grad_norm": 0.8376983404159546,
"learning_rate": 9.289323399126216e-07,
"loss": 0.1284,
"step": 22950
},
{
"epoch": 0.56,
"grad_norm": 0.9021371006965637,
"learning_rate": 9.193756181804248e-07,
"loss": 0.1347,
"step": 23000
},
{
"epoch": 0.57,
"grad_norm": 0.8589430451393127,
"learning_rate": 9.098572218033084e-07,
"loss": 0.1278,
"step": 23050
},
{
"epoch": 0.57,
"grad_norm": 0.9074499011039734,
"learning_rate": 9.003773815735644e-07,
"loss": 0.1307,
"step": 23100
},
{
"epoch": 0.57,
"grad_norm": 0.8730277419090271,
"learning_rate": 8.90936327348613e-07,
"loss": 0.1298,
"step": 23150
},
{
"epoch": 0.57,
"grad_norm": 0.8911556601524353,
"learning_rate": 8.815342880454312e-07,
"loss": 0.1296,
"step": 23200
},
{
"epoch": 0.57,
"grad_norm": 0.87279212474823,
"learning_rate": 8.721714916350019e-07,
"loss": 0.1276,
"step": 23250
},
{
"epoch": 0.57,
"grad_norm": 0.9037759900093079,
"learning_rate": 8.628481651367876e-07,
"loss": 0.1311,
"step": 23300
},
{
"epoch": 0.57,
"grad_norm": 1.0413986444473267,
"learning_rate": 8.535645346132246e-07,
"loss": 0.1306,
"step": 23350
},
{
"epoch": 0.57,
"grad_norm": 0.752288281917572,
"learning_rate": 8.443208251642418e-07,
"loss": 0.1267,
"step": 23400
},
{
"epoch": 0.58,
"grad_norm": 0.791836678981781,
"learning_rate": 8.351172609218033e-07,
"loss": 0.1266,
"step": 23450
},
{
"epoch": 0.58,
"grad_norm": 0.8868898153305054,
"learning_rate": 8.259540650444736e-07,
"loss": 0.1316,
"step": 23500
},
{
"epoch": 0.58,
"grad_norm": 0.9566688537597656,
"learning_rate": 8.168314597120059e-07,
"loss": 0.1282,
"step": 23550
},
{
"epoch": 0.58,
"grad_norm": 0.844053328037262,
"learning_rate": 8.077496661199557e-07,
"loss": 0.1228,
"step": 23600
},
{
"epoch": 0.58,
"grad_norm": 0.826914370059967,
"learning_rate": 7.987089044743182e-07,
"loss": 0.1304,
"step": 23650
},
{
"epoch": 0.58,
"grad_norm": 0.8881601691246033,
"learning_rate": 7.897093939861878e-07,
"loss": 0.1264,
"step": 23700
},
{
"epoch": 0.58,
"grad_norm": 0.8328256607055664,
"learning_rate": 7.807513528664415e-07,
"loss": 0.1262,
"step": 23750
},
{
"epoch": 0.58,
"grad_norm": 0.8944756388664246,
"learning_rate": 7.71834998320454e-07,
"loss": 0.13,
"step": 23800
},
{
"epoch": 0.59,
"grad_norm": 0.8959199786186218,
"learning_rate": 7.629605465428211e-07,
"loss": 0.1254,
"step": 23850
},
{
"epoch": 0.59,
"grad_norm": 0.8694652318954468,
"learning_rate": 7.541282127121291e-07,
"loss": 0.1277,
"step": 23900
},
{
"epoch": 0.59,
"grad_norm": 1.048399567604065,
"learning_rate": 7.453382109857269e-07,
"loss": 0.1299,
"step": 23950
},
{
"epoch": 0.59,
"grad_norm": 0.8690307140350342,
"learning_rate": 7.365907544945398e-07,
"loss": 0.1309,
"step": 24000
},
{
"epoch": 0.59,
"grad_norm": 1.0731440782546997,
"learning_rate": 7.27886055337902e-07,
"loss": 0.1281,
"step": 24050
},
{
"epoch": 0.59,
"grad_norm": 0.8248873949050903,
"learning_rate": 7.192243245784075e-07,
"loss": 0.1232,
"step": 24100
},
{
"epoch": 0.59,
"grad_norm": 0.8485476970672607,
"learning_rate": 7.106057722368012e-07,
"loss": 0.1246,
"step": 24150
},
{
"epoch": 0.59,
"grad_norm": 0.8483170866966248,
"learning_rate": 7.020306072868804e-07,
"loss": 0.1324,
"step": 24200
},
{
"epoch": 0.6,
"grad_norm": 0.7738081812858582,
"learning_rate": 6.934990376504269e-07,
"loss": 0.1262,
"step": 24250
},
{
"epoch": 0.6,
"grad_norm": 1.051750898361206,
"learning_rate": 6.850112701921735e-07,
"loss": 0.1258,
"step": 24300
},
{
"epoch": 0.6,
"grad_norm": 0.9033488035202026,
"learning_rate": 6.76567510714777e-07,
"loss": 0.1226,
"step": 24350
},
{
"epoch": 0.6,
"grad_norm": 0.9012821316719055,
"learning_rate": 6.681679639538388e-07,
"loss": 0.127,
"step": 24400
},
{
"epoch": 0.6,
"grad_norm": 0.8953605890274048,
"learning_rate": 6.598128335729332e-07,
"loss": 0.1235,
"step": 24450
},
{
"epoch": 0.6,
"grad_norm": 0.9088945984840393,
"learning_rate": 6.515023221586722e-07,
"loss": 0.1311,
"step": 24500
},
{
"epoch": 0.6,
"grad_norm": 0.9187101721763611,
"learning_rate": 6.432366312157933e-07,
"loss": 0.1306,
"step": 24550
},
{
"epoch": 0.6,
"grad_norm": 0.7924743890762329,
"learning_rate": 6.35015961162273e-07,
"loss": 0.1274,
"step": 24600
},
{
"epoch": 0.61,
"grad_norm": 0.8540875315666199,
"learning_rate": 6.268405113244677e-07,
"loss": 0.1278,
"step": 24650
},
{
"epoch": 0.61,
"grad_norm": 0.8939800262451172,
"learning_rate": 6.187104799322805e-07,
"loss": 0.1242,
"step": 24700
},
{
"epoch": 0.61,
"grad_norm": 0.871914803981781,
"learning_rate": 6.106260641143547e-07,
"loss": 0.1276,
"step": 24750
},
{
"epoch": 0.61,
"grad_norm": 1.014113426208496,
"learning_rate": 6.025874598932937e-07,
"loss": 0.1252,
"step": 24800
},
{
"epoch": 0.61,
"grad_norm": 0.8618780374526978,
"learning_rate": 5.945948621809092e-07,
"loss": 0.1299,
"step": 24850
},
{
"epoch": 0.61,
"grad_norm": 0.9357609748840332,
"learning_rate": 5.866484647734935e-07,
"loss": 0.1302,
"step": 24900
},
{
"epoch": 0.61,
"grad_norm": 0.8437809348106384,
"learning_rate": 5.787484603471221e-07,
"loss": 0.1226,
"step": 24950
},
{
"epoch": 0.61,
"grad_norm": 0.9190840125083923,
"learning_rate": 5.708950404529812e-07,
"loss": 0.1277,
"step": 25000
},
{
"epoch": 0.62,
"grad_norm": 0.8171085715293884,
"learning_rate": 5.630883955127211e-07,
"loss": 0.1243,
"step": 25050
},
{
"epoch": 0.62,
"grad_norm": 0.87744140625,
"learning_rate": 5.553287148138462e-07,
"loss": 0.1248,
"step": 25100
},
{
"epoch": 0.62,
"grad_norm": 0.9077624082565308,
"learning_rate": 5.47616186505113e-07,
"loss": 0.1311,
"step": 25150
},
{
"epoch": 0.62,
"grad_norm": 0.8664446473121643,
"learning_rate": 5.399509975919828e-07,
"loss": 0.1198,
"step": 25200
},
{
"epoch": 0.62,
"grad_norm": 0.869256317615509,
"learning_rate": 5.323333339320739e-07,
"loss": 0.1299,
"step": 25250
},
{
"epoch": 0.62,
"grad_norm": 0.851861834526062,
"learning_rate": 5.247633802306637e-07,
"loss": 0.1317,
"step": 25300
},
{
"epoch": 0.62,
"grad_norm": 0.8232414722442627,
"learning_rate": 5.172413200362092e-07,
"loss": 0.1298,
"step": 25350
},
{
"epoch": 0.62,
"grad_norm": 0.7558964490890503,
"learning_rate": 5.097673357358906e-07,
"loss": 0.1296,
"step": 25400
},
{
"epoch": 0.62,
"grad_norm": 0.9631604552268982,
"learning_rate": 5.023416085511976e-07,
"loss": 0.1262,
"step": 25450
},
{
"epoch": 0.63,
"grad_norm": 0.9081146717071533,
"learning_rate": 4.949643185335288e-07,
"loss": 0.125,
"step": 25500
},
{
"epoch": 0.63,
"grad_norm": 0.8329197764396667,
"learning_rate": 4.876356445598279e-07,
"loss": 0.1243,
"step": 25550
},
{
"epoch": 0.63,
"grad_norm": 0.9667308330535889,
"learning_rate": 4.803557643282486e-07,
"loss": 0.1292,
"step": 25600
},
{
"epoch": 0.63,
"grad_norm": 0.7930652499198914,
"learning_rate": 4.731248543538405e-07,
"loss": 0.1216,
"step": 25650
},
{
"epoch": 0.63,
"grad_norm": 0.9274152517318726,
"learning_rate": 4.6594308996427696e-07,
"loss": 0.1262,
"step": 25700
},
{
"epoch": 0.63,
"grad_norm": 1.0009911060333252,
"learning_rate": 4.588106452955973e-07,
"loss": 0.1253,
"step": 25750
},
{
"epoch": 0.63,
"grad_norm": 0.9739466309547424,
"learning_rate": 4.517276932879877e-07,
"loss": 0.1299,
"step": 25800
},
{
"epoch": 0.63,
"grad_norm": 0.7866214513778687,
"learning_rate": 4.446944056815866e-07,
"loss": 0.1243,
"step": 25850
},
{
"epoch": 0.64,
"grad_norm": 0.8441150784492493,
"learning_rate": 4.377109530123216e-07,
"loss": 0.1305,
"step": 25900
},
{
"epoch": 0.64,
"grad_norm": 0.8807822465896606,
"learning_rate": 4.307775046077739e-07,
"loss": 0.124,
"step": 25950
},
{
"epoch": 0.64,
"grad_norm": 0.9606176614761353,
"learning_rate": 4.2389422858307244e-07,
"loss": 0.1268,
"step": 26000
},
{
"epoch": 0.64,
"grad_norm": 0.7626886963844299,
"learning_rate": 4.1706129183681834e-07,
"loss": 0.1224,
"step": 26050
},
{
"epoch": 0.64,
"grad_norm": 0.9602739214897156,
"learning_rate": 4.10278860047037e-07,
"loss": 0.1285,
"step": 26100
},
{
"epoch": 0.64,
"grad_norm": 0.9486740827560425,
"learning_rate": 4.035470976671621e-07,
"loss": 0.126,
"step": 26150
},
{
"epoch": 0.64,
"grad_norm": 0.8300147652626038,
"learning_rate": 3.9686616792204677e-07,
"loss": 0.1274,
"step": 26200
},
{
"epoch": 0.64,
"grad_norm": 0.9949631094932556,
"learning_rate": 3.902362328040091e-07,
"loss": 0.1278,
"step": 26250
},
{
"epoch": 0.65,
"grad_norm": 1.00017511844635,
"learning_rate": 3.836574530688983e-07,
"loss": 0.1302,
"step": 26300
},
{
"epoch": 0.65,
"grad_norm": 1.0099530220031738,
"learning_rate": 3.7712998823220243e-07,
"loss": 0.1274,
"step": 26350
},
{
"epoch": 0.65,
"grad_norm": 0.8386552929878235,
"learning_rate": 3.7065399656517955e-07,
"loss": 0.1312,
"step": 26400
},
{
"epoch": 0.65,
"grad_norm": 0.8319630026817322,
"learning_rate": 3.6422963509101626e-07,
"loss": 0.1267,
"step": 26450
},
{
"epoch": 0.65,
"grad_norm": 0.9459981918334961,
"learning_rate": 3.578570595810274e-07,
"loss": 0.1288,
"step": 26500
},
{
"epoch": 0.65,
"grad_norm": 0.8560216426849365,
"learning_rate": 3.515364245508704e-07,
"loss": 0.1292,
"step": 26550
},
{
"epoch": 0.65,
"grad_norm": 0.8289064764976501,
"learning_rate": 3.452678832568071e-07,
"loss": 0.1227,
"step": 26600
},
{
"epoch": 0.65,
"grad_norm": 0.805689811706543,
"learning_rate": 3.390515876919831e-07,
"loss": 0.129,
"step": 26650
},
{
"epoch": 0.66,
"grad_norm": 1.0290043354034424,
"learning_rate": 3.328876885827406e-07,
"loss": 0.1286,
"step": 26700
},
{
"epoch": 0.66,
"grad_norm": 0.8204767107963562,
"learning_rate": 3.267763353849704e-07,
"loss": 0.1289,
"step": 26750
},
{
"epoch": 0.66,
"grad_norm": 0.7993561625480652,
"learning_rate": 3.207176762804814e-07,
"loss": 0.1257,
"step": 26800
},
{
"epoch": 0.66,
"grad_norm": 0.8801778554916382,
"learning_rate": 3.1471185817341153e-07,
"loss": 0.1261,
"step": 26850
},
{
"epoch": 0.66,
"grad_norm": 0.9060052633285522,
"learning_rate": 3.0875902668666386e-07,
"loss": 0.1256,
"step": 26900
},
{
"epoch": 0.66,
"grad_norm": 1.0080070495605469,
"learning_rate": 3.0285932615837646e-07,
"loss": 0.1265,
"step": 26950
},
{
"epoch": 0.66,
"grad_norm": 0.8546701669692993,
"learning_rate": 2.970128996384228e-07,
"loss": 0.1267,
"step": 27000
},
{
"epoch": 0.66,
"grad_norm": 0.8351757526397705,
"learning_rate": 2.9121988888494297e-07,
"loss": 0.1256,
"step": 27050
},
{
"epoch": 0.67,
"grad_norm": 0.887173593044281,
"learning_rate": 2.854804343609058e-07,
"loss": 0.1269,
"step": 27100
},
{
"epoch": 0.67,
"grad_norm": 0.7610448598861694,
"learning_rate": 2.7979467523070484e-07,
"loss": 0.1253,
"step": 27150
},
{
"epoch": 0.67,
"grad_norm": 0.8258687853813171,
"learning_rate": 2.741627493567822e-07,
"loss": 0.1307,
"step": 27200
},
{
"epoch": 0.67,
"grad_norm": 0.7776610255241394,
"learning_rate": 2.685847932962868e-07,
"loss": 0.1251,
"step": 27250
},
{
"epoch": 0.67,
"grad_norm": 0.936505138874054,
"learning_rate": 2.630609422977623e-07,
"loss": 0.1287,
"step": 27300
},
{
"epoch": 0.67,
"grad_norm": 0.9153953790664673,
"learning_rate": 2.575913302978697e-07,
"loss": 0.1209,
"step": 27350
},
{
"epoch": 0.67,
"grad_norm": 0.895073413848877,
"learning_rate": 2.5217608991813774e-07,
"loss": 0.1266,
"step": 27400
},
{
"epoch": 0.67,
"grad_norm": 0.8763769268989563,
"learning_rate": 2.468153524617478e-07,
"loss": 0.1289,
"step": 27450
},
{
"epoch": 0.68,
"grad_norm": 1.1015098094940186,
"learning_rate": 2.4150924791035037e-07,
"loss": 0.1275,
"step": 27500
},
{
"epoch": 0.68,
"grad_norm": 0.9541353583335876,
"learning_rate": 2.3625790492091544e-07,
"loss": 0.1249,
"step": 27550
},
{
"epoch": 0.68,
"grad_norm": 0.798875629901886,
"learning_rate": 2.3106145082260777e-07,
"loss": 0.1239,
"step": 27600
},
{
"epoch": 0.68,
"grad_norm": 0.9592541456222534,
"learning_rate": 2.2592001161370392e-07,
"loss": 0.1225,
"step": 27650
},
{
"epoch": 0.68,
"grad_norm": 0.9374776482582092,
"learning_rate": 2.2083371195853797e-07,
"loss": 0.1288,
"step": 27700
},
{
"epoch": 0.68,
"grad_norm": 0.9778282642364502,
"learning_rate": 2.158026751844733e-07,
"loss": 0.1266,
"step": 27750
},
{
"epoch": 0.68,
"grad_norm": 0.8233149647712708,
"learning_rate": 2.1082702327891918e-07,
"loss": 0.1275,
"step": 27800
},
{
"epoch": 0.68,
"grad_norm": 0.8023253083229065,
"learning_rate": 2.0590687688636619e-07,
"loss": 0.1245,
"step": 27850
},
{
"epoch": 0.69,
"grad_norm": 0.7281996607780457,
"learning_rate": 2.0104235530546745e-07,
"loss": 0.1298,
"step": 27900
},
{
"epoch": 0.69,
"grad_norm": 0.7590131163597107,
"learning_rate": 1.9623357648614088e-07,
"loss": 0.1274,
"step": 27950
},
{
"epoch": 0.69,
"grad_norm": 0.7430440187454224,
"learning_rate": 1.914806570267111e-07,
"loss": 0.1263,
"step": 28000
},
{
"epoch": 0.69,
"grad_norm": 0.8962095379829407,
"learning_rate": 1.8678371217108387e-07,
"loss": 0.1238,
"step": 28050
},
{
"epoch": 0.69,
"grad_norm": 0.9490410685539246,
"learning_rate": 1.821428558059493e-07,
"loss": 0.1291,
"step": 28100
},
{
"epoch": 0.69,
"grad_norm": 0.9238828420639038,
"learning_rate": 1.7755820045802146e-07,
"loss": 0.1256,
"step": 28150
},
{
"epoch": 0.69,
"grad_norm": 1.0077736377716064,
"learning_rate": 1.7302985729131e-07,
"loss": 0.1284,
"step": 28200
},
{
"epoch": 0.69,
"grad_norm": 0.7857496738433838,
"learning_rate": 1.6855793610442484e-07,
"loss": 0.1229,
"step": 28250
},
{
"epoch": 0.69,
"grad_norm": 0.9102024435997009,
"learning_rate": 1.6414254532791357e-07,
"loss": 0.1234,
"step": 28300
},
{
"epoch": 0.7,
"grad_norm": 0.8468128442764282,
"learning_rate": 1.5978379202163275e-07,
"loss": 0.1297,
"step": 28350
},
{
"epoch": 0.7,
"grad_norm": 1.1208685636520386,
"learning_rate": 1.554817818721513e-07,
"loss": 0.1263,
"step": 28400
},
{
"epoch": 0.7,
"grad_norm": 0.8233404159545898,
"learning_rate": 1.51236619190189e-07,
"loss": 0.1275,
"step": 28450
},
{
"epoch": 0.7,
"grad_norm": 0.7639409303665161,
"learning_rate": 1.4704840690808658e-07,
"loss": 0.1264,
"step": 28500
},
{
"epoch": 0.7,
"grad_norm": 0.8358595371246338,
"learning_rate": 1.4291724657730904e-07,
"loss": 0.1245,
"step": 28550
},
{
"epoch": 0.7,
"grad_norm": 0.754831075668335,
"learning_rate": 1.3884323836598656e-07,
"loss": 0.1217,
"step": 28600
},
{
"epoch": 0.7,
"grad_norm": 0.9194930195808411,
"learning_rate": 1.348264810564809e-07,
"loss": 0.126,
"step": 28650
},
{
"epoch": 0.7,
"grad_norm": 0.9364280700683594,
"learning_rate": 1.3086707204299415e-07,
"loss": 0.127,
"step": 28700
},
{
"epoch": 0.71,
"grad_norm": 1.0193439722061157,
"learning_rate": 1.269651073292058e-07,
"loss": 0.1277,
"step": 28750
},
{
"epoch": 0.71,
"grad_norm": 0.8932496309280396,
"learning_rate": 1.2312068152594448e-07,
"loss": 0.1326,
"step": 28800
},
{
"epoch": 0.71,
"grad_norm": 0.8845229148864746,
"learning_rate": 1.1933388784889617e-07,
"loss": 0.1268,
"step": 28850
},
{
"epoch": 0.71,
"grad_norm": 0.8222347497940063,
"learning_rate": 1.1560481811633911e-07,
"loss": 0.1249,
"step": 28900
},
{
"epoch": 0.71,
"grad_norm": 0.9118759036064148,
"learning_rate": 1.1193356274692424e-07,
"loss": 0.1248,
"step": 28950
},
{
"epoch": 0.71,
"grad_norm": 0.8872355222702026,
"learning_rate": 1.0832021075747712e-07,
"loss": 0.1273,
"step": 29000
},
{
"epoch": 0.71,
"grad_norm": 1.0349782705307007,
"learning_rate": 1.047648497608414e-07,
"loss": 0.1275,
"step": 29050
},
{
"epoch": 0.71,
"grad_norm": 0.8230103254318237,
"learning_rate": 1.0126756596375687e-07,
"loss": 0.1255,
"step": 29100
},
{
"epoch": 0.72,
"grad_norm": 0.8841921091079712,
"learning_rate": 9.782844416476423e-08,
"loss": 0.1267,
"step": 29150
},
{
"epoch": 0.72,
"grad_norm": 0.8764199614524841,
"learning_rate": 9.444756775215446e-08,
"loss": 0.1271,
"step": 29200
},
{
"epoch": 0.72,
"grad_norm": 0.7919709086418152,
"learning_rate": 9.112501870194273e-08,
"loss": 0.1265,
"step": 29250
},
{
"epoch": 0.72,
"grad_norm": 0.873029351234436,
"learning_rate": 8.786087757588269e-08,
"loss": 0.1269,
"step": 29300
},
{
"epoch": 0.72,
"grad_norm": 0.9057585597038269,
"learning_rate": 8.465522351951305e-08,
"loss": 0.124,
"step": 29350
},
{
"epoch": 0.72,
"grad_norm": 0.8580294251441956,
"learning_rate": 8.150813426023752e-08,
"loss": 0.1267,
"step": 29400
},
{
"epoch": 0.72,
"grad_norm": 0.8197963237762451,
"learning_rate": 7.841968610544121e-08,
"loss": 0.1281,
"step": 29450
},
{
"epoch": 0.72,
"grad_norm": 0.8754163980484009,
"learning_rate": 7.538995394063996e-08,
"loss": 0.125,
"step": 29500
},
{
"epoch": 0.73,
"grad_norm": 0.7816724181175232,
"learning_rate": 7.241901122766515e-08,
"loss": 0.1284,
"step": 29550
},
{
"epoch": 0.73,
"grad_norm": 0.8223375082015991,
"learning_rate": 6.950693000288056e-08,
"loss": 0.1237,
"step": 29600
},
{
"epoch": 0.73,
"grad_norm": 0.8464847207069397,
"learning_rate": 6.665378087543889e-08,
"loss": 0.1232,
"step": 29650
},
{
"epoch": 0.73,
"grad_norm": 0.9507189989089966,
"learning_rate": 6.385963302556642e-08,
"loss": 0.1291,
"step": 29700
},
{
"epoch": 0.73,
"grad_norm": 0.933513879776001,
"learning_rate": 6.112455420288821e-08,
"loss": 0.1275,
"step": 29750
},
{
"epoch": 0.73,
"grad_norm": 0.9379338026046753,
"learning_rate": 5.844861072478336e-08,
"loss": 0.1268,
"step": 29800
},
{
"epoch": 0.73,
"grad_norm": 0.9361172318458557,
"learning_rate": 5.583186747477848e-08,
"loss": 0.1265,
"step": 29850
},
{
"epoch": 0.73,
"grad_norm": 0.7861219048500061,
"learning_rate": 5.32743879009745e-08,
"loss": 0.1237,
"step": 29900
},
{
"epoch": 0.74,
"grad_norm": 0.9114620089530945,
"learning_rate": 5.077623401450599e-08,
"loss": 0.1239,
"step": 29950
},
{
"epoch": 0.74,
"grad_norm": 0.8975350856781006,
"learning_rate": 4.8337466388040935e-08,
"loss": 0.1271,
"step": 30000
},
{
"epoch": 0.74,
"grad_norm": 0.8856131434440613,
"learning_rate": 4.595814415430916e-08,
"loss": 0.1264,
"step": 30050
},
{
"epoch": 0.74,
"grad_norm": 0.9059573411941528,
"learning_rate": 4.3638325004670134e-08,
"loss": 0.1224,
"step": 30100
},
{
"epoch": 0.74,
"grad_norm": 0.9936742186546326,
"learning_rate": 4.1378065187714365e-08,
"loss": 0.1275,
"step": 30150
},
{
"epoch": 0.74,
"grad_norm": 0.8675227761268616,
"learning_rate": 3.917741950789727e-08,
"loss": 0.124,
"step": 30200
},
{
"epoch": 0.74,
"grad_norm": 0.9244627356529236,
"learning_rate": 3.703644132421386e-08,
"loss": 0.1239,
"step": 30250
},
{
"epoch": 0.74,
"grad_norm": 0.774068295955658,
"learning_rate": 3.4955182548901956e-08,
"loss": 0.1261,
"step": 30300
},
{
"epoch": 0.75,
"grad_norm": 0.9418141841888428,
"learning_rate": 3.293369364618465e-08,
"loss": 0.1257,
"step": 30350
},
{
"epoch": 0.75,
"grad_norm": 0.8905578851699829,
"learning_rate": 3.097202363104679e-08,
"loss": 0.1238,
"step": 30400
},
{
"epoch": 0.75,
"grad_norm": 0.8111874461174011,
"learning_rate": 2.9070220068045663e-08,
"loss": 0.1295,
"step": 30450
},
{
"epoch": 0.75,
"grad_norm": 0.8793387413024902,
"learning_rate": 2.722832907015971e-08,
"loss": 0.1315,
"step": 30500
},
{
"epoch": 0.75,
"grad_norm": 1.0154118537902832,
"learning_rate": 2.544639529766829e-08,
"loss": 0.1282,
"step": 30550
},
{
"epoch": 0.75,
"grad_norm": 0.7594370245933533,
"learning_rate": 2.3724461957068955e-08,
"loss": 0.1254,
"step": 30600
},
{
"epoch": 0.75,
"grad_norm": 0.9383516311645508,
"learning_rate": 2.206257080003188e-08,
"loss": 0.1217,
"step": 30650
},
{
"epoch": 0.75,
"grad_norm": 0.8974120616912842,
"learning_rate": 2.0460762122385124e-08,
"loss": 0.1279,
"step": 30700
},
{
"epoch": 0.76,
"grad_norm": 0.9664300680160522,
"learning_rate": 1.8919074763138757e-08,
"loss": 0.1236,
"step": 30750
},
{
"epoch": 0.76,
"grad_norm": 0.8901047706604004,
"learning_rate": 1.7437546103542814e-08,
"loss": 0.1307,
"step": 30800
},
{
"epoch": 0.76,
"grad_norm": 0.8124094605445862,
"learning_rate": 1.6016212066181368e-08,
"loss": 0.1217,
"step": 30850
},
{
"epoch": 0.76,
"grad_norm": 0.8659054040908813,
"learning_rate": 1.4655107114101008e-08,
"loss": 0.125,
"step": 30900
},
{
"epoch": 0.76,
"grad_norm": 0.9323762059211731,
"learning_rate": 1.3354264249975379e-08,
"loss": 0.1267,
"step": 30950
},
{
"epoch": 0.76,
"grad_norm": 1.0165247917175293,
"learning_rate": 1.2113715015304728e-08,
"loss": 0.1247,
"step": 31000
},
{
"epoch": 0.76,
"grad_norm": 0.8402089476585388,
"learning_rate": 1.0933489489651783e-08,
"loss": 0.1262,
"step": 31050
},
{
"epoch": 0.76,
"grad_norm": 0.8893020749092102,
"learning_rate": 9.81361628991151e-09,
"loss": 0.1211,
"step": 31100
},
{
"epoch": 0.76,
"grad_norm": 0.8441564440727234,
"learning_rate": 8.754122569618329e-09,
"loss": 0.1209,
"step": 31150
},
{
"epoch": 0.77,
"grad_norm": 0.8949625492095947,
"learning_rate": 7.755034018286644e-09,
"loss": 0.1269,
"step": 31200
},
{
"epoch": 0.77,
"grad_norm": 0.8971026539802551,
"learning_rate": 6.816374860788566e-09,
"loss": 0.1299,
"step": 31250
},
{
"epoch": 0.77,
"grad_norm": 0.7675482034683228,
"learning_rate": 5.938167856766319e-09,
"loss": 0.1257,
"step": 31300
},
{
"epoch": 0.77,
"grad_norm": 0.9060002565383911,
"learning_rate": 5.120434300080745e-09,
"loss": 0.1269,
"step": 31350
},
{
"epoch": 0.77,
"grad_norm": 0.8141520023345947,
"learning_rate": 4.363194018293937e-09,
"loss": 0.1236,
"step": 31400
},
{
"epoch": 0.77,
"grad_norm": 0.7773285508155823,
"learning_rate": 3.666465372190453e-09,
"loss": 0.1233,
"step": 31450
},
{
"epoch": 0.77,
"grad_norm": 0.6834737062454224,
"learning_rate": 3.030265255329623e-09,
"loss": 0.1232,
"step": 31500
},
{
"epoch": 0.77,
"grad_norm": 0.7221519947052002,
"learning_rate": 2.4546090936383717e-09,
"loss": 0.1229,
"step": 31550
},
{
"epoch": 0.78,
"grad_norm": 0.8374796509742737,
"learning_rate": 1.9395108450351308e-09,
"loss": 0.1198,
"step": 31600
},
{
"epoch": 0.78,
"grad_norm": 0.9449348449707031,
"learning_rate": 1.4849829990931653e-09,
"loss": 0.126,
"step": 31650
},
{
"epoch": 0.78,
"grad_norm": 0.9566857814788818,
"learning_rate": 1.0910365767358155e-09,
"loss": 0.1251,
"step": 31700
},
{
"epoch": 0.78,
"grad_norm": 1.0854554176330566,
"learning_rate": 7.576811299714326e-10,
"loss": 0.1256,
"step": 31750
},
{
"epoch": 0.78,
"grad_norm": 0.9413610696792603,
"learning_rate": 4.849247416599534e-10,
"loss": 0.1314,
"step": 31800
},
{
"epoch": 0.78,
"grad_norm": 1.0781753063201904,
"learning_rate": 2.727740253177791e-10,
"loss": 0.1254,
"step": 31850
},
{
"epoch": 0.78,
"grad_norm": 0.8698143362998962,
"learning_rate": 1.2123412495762543e-10,
"loss": 0.1233,
"step": 31900
},
{
"epoch": 0.78,
"grad_norm": 0.8988608717918396,
"learning_rate": 3.0308714963067644e-11,
"loss": 0.1257,
"step": 31950
},
{
"epoch": 0.79,
"grad_norm": 0.8895017504692078,
"learning_rate": 0.0,
"loss": 0.1238,
"step": 32000
}
],
"logging_steps": 50,
"max_steps": 32000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3200,
"total_flos": 2.2179230271031296e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}