{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5626695604991863, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017362995116657625, "grad_norm": 0.357421875, "learning_rate": 0.0001, "loss": 1.5607, "step": 5 }, { "epoch": 0.003472599023331525, "grad_norm": 0.1552734375, "learning_rate": 0.0001, "loss": 0.8611, "step": 10 }, { "epoch": 0.005208898534997287, "grad_norm": 0.13671875, "learning_rate": 0.0001, "loss": 0.746, "step": 15 }, { "epoch": 0.00694519804666305, "grad_norm": 0.10888671875, "learning_rate": 0.0001, "loss": 0.7252, "step": 20 }, { "epoch": 0.008681497558328812, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.7153, "step": 25 }, { "epoch": 0.010417797069994574, "grad_norm": 0.11279296875, "learning_rate": 0.0001, "loss": 0.666, "step": 30 }, { "epoch": 0.012154096581660336, "grad_norm": 0.11572265625, "learning_rate": 0.0001, "loss": 0.6443, "step": 35 }, { "epoch": 0.0138903960933261, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.6404, "step": 40 }, { "epoch": 0.01562669560499186, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.6227, "step": 45 }, { "epoch": 0.017362995116657624, "grad_norm": 0.12353515625, "learning_rate": 0.0001, "loss": 0.6077, "step": 50 }, { "epoch": 0.019099294628323386, "grad_norm": 0.1279296875, "learning_rate": 0.0001, "loss": 0.6814, "step": 55 }, { "epoch": 0.020835594139989148, "grad_norm": 0.1416015625, "learning_rate": 0.0001, "loss": 0.6831, "step": 60 }, { "epoch": 0.02257189365165491, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.6659, "step": 65 }, { "epoch": 0.02430819316332067, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.6521, "step": 70 }, { "epoch": 0.026044492674986434, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.6309, "step": 75 }, { "epoch": 0.0277807921866522, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.6263, "step": 80 }, { "epoch": 0.02951709169831796, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.6268, "step": 85 }, { "epoch": 0.03125339120998372, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.6309, "step": 90 }, { "epoch": 0.032989690721649485, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.6103, "step": 95 }, { "epoch": 0.03472599023331525, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.6125, "step": 100 }, { "epoch": 0.03646228974498101, "grad_norm": 0.12255859375, "learning_rate": 0.0001, "loss": 0.6828, "step": 105 }, { "epoch": 0.03819858925664677, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.6517, "step": 110 }, { "epoch": 0.03993488876831253, "grad_norm": 0.1396484375, "learning_rate": 0.0001, "loss": 0.6702, "step": 115 }, { "epoch": 0.041671188279978295, "grad_norm": 0.10302734375, "learning_rate": 0.0001, "loss": 0.6158, "step": 120 }, { "epoch": 0.04340748779164406, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.6157, "step": 125 }, { "epoch": 0.04514378730330982, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.6444, "step": 130 }, { "epoch": 0.04688008681497558, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.6053, "step": 135 }, { "epoch": 0.04861638632664134, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5974, "step": 140 }, { "epoch": 0.050352685838307105, "grad_norm": 0.10986328125, "learning_rate": 0.0001, "loss": 0.5874, "step": 145 }, { "epoch": 0.05208898534997287, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.5656, "step": 150 }, { "epoch": 0.05382528486163863, "grad_norm": 0.1142578125, "learning_rate": 0.0001, "loss": 0.7098, "step": 155 }, { "epoch": 0.0555615843733044, "grad_norm": 0.1611328125, "learning_rate": 0.0001, "loss": 0.6968, "step": 160 }, { "epoch": 0.05729788388497016, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.652, "step": 165 }, { "epoch": 0.05903418339663592, "grad_norm": 0.10986328125, "learning_rate": 0.0001, "loss": 0.6084, "step": 170 }, { "epoch": 0.060770482908301685, "grad_norm": 0.12060546875, "learning_rate": 0.0001, "loss": 0.6243, "step": 175 }, { "epoch": 0.06250678241996745, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.6087, "step": 180 }, { "epoch": 0.06424308193163321, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.6189, "step": 185 }, { "epoch": 0.06597938144329897, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5926, "step": 190 }, { "epoch": 0.06771568095496473, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5846, "step": 195 }, { "epoch": 0.0694519804666305, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.5668, "step": 200 }, { "epoch": 0.07118827997829626, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.6842, "step": 205 }, { "epoch": 0.07292457948996202, "grad_norm": 0.1162109375, "learning_rate": 0.0001, "loss": 0.6476, "step": 210 }, { "epoch": 0.07466087900162778, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.6269, "step": 215 }, { "epoch": 0.07639717851329354, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.6189, "step": 220 }, { "epoch": 0.0781334780249593, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.617, "step": 225 }, { "epoch": 0.07986977753662507, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.649, "step": 230 }, { "epoch": 0.08160607704829083, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5985, "step": 235 }, { "epoch": 0.08334237655995659, "grad_norm": 0.1083984375, "learning_rate": 0.0001, "loss": 0.5909, "step": 240 }, { "epoch": 0.08507867607162235, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.6015, "step": 245 }, { "epoch": 0.08681497558328811, "grad_norm": 0.11279296875, "learning_rate": 0.0001, "loss": 0.5706, "step": 250 }, { "epoch": 0.08855127509495388, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.6721, "step": 255 }, { "epoch": 0.09028757460661964, "grad_norm": 0.107421875, "learning_rate": 0.0001, "loss": 0.6523, "step": 260 }, { "epoch": 0.0920238741182854, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.6232, "step": 265 }, { "epoch": 0.09376017362995116, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.642, "step": 270 }, { "epoch": 0.09549647314161692, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.6214, "step": 275 }, { "epoch": 0.09723277265328269, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5981, "step": 280 }, { "epoch": 0.09896907216494845, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.6205, "step": 285 }, { "epoch": 0.10070537167661421, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.6008, "step": 290 }, { "epoch": 0.10244167118827997, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5768, "step": 295 }, { "epoch": 0.10417797069994574, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5863, "step": 300 }, { "epoch": 0.1059142702116115, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.6621, "step": 305 }, { "epoch": 0.10765056972327726, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.6285, "step": 310 }, { "epoch": 0.10938686923494302, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.6254, "step": 315 }, { "epoch": 0.1111231687466088, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.6164, "step": 320 }, { "epoch": 0.11285946825827456, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.623, "step": 325 }, { "epoch": 0.11459576776994032, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5782, "step": 330 }, { "epoch": 0.11633206728160608, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.5962, "step": 335 }, { "epoch": 0.11806836679327185, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5987, "step": 340 }, { "epoch": 0.11980466630493761, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5885, "step": 345 }, { "epoch": 0.12154096581660337, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5678, "step": 350 }, { "epoch": 0.12327726532826913, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.674, "step": 355 }, { "epoch": 0.1250135648399349, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.631, "step": 360 }, { "epoch": 0.12674986435160066, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.6447, "step": 365 }, { "epoch": 0.12848616386326642, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.6277, "step": 370 }, { "epoch": 0.13022246337493218, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5902, "step": 375 }, { "epoch": 0.13195876288659794, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.6211, "step": 380 }, { "epoch": 0.1336950623982637, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.6151, "step": 385 }, { "epoch": 0.13543136190992947, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.6097, "step": 390 }, { "epoch": 0.13716766142159523, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5842, "step": 395 }, { "epoch": 0.138903960933261, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5522, "step": 400 }, { "epoch": 0.14064026044492675, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.6664, "step": 405 }, { "epoch": 0.1423765599565925, "grad_norm": 0.111328125, "learning_rate": 0.0001, "loss": 0.647, "step": 410 }, { "epoch": 0.14411285946825828, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6205, "step": 415 }, { "epoch": 0.14584915897992404, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.615, "step": 420 }, { "epoch": 0.1475854584915898, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.605, "step": 425 }, { "epoch": 0.14932175800325556, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5884, "step": 430 }, { "epoch": 0.15105805751492132, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.6043, "step": 435 }, { "epoch": 0.15279435702658709, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5842, "step": 440 }, { "epoch": 0.15453065653825285, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5745, "step": 445 }, { "epoch": 0.1562669560499186, "grad_norm": 0.10302734375, "learning_rate": 0.0001, "loss": 0.5975, "step": 450 }, { "epoch": 0.15800325556158437, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.6604, "step": 455 }, { "epoch": 0.15973955507325013, "grad_norm": 0.107421875, "learning_rate": 0.0001, "loss": 0.6376, "step": 460 }, { "epoch": 0.1614758545849159, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.6241, "step": 465 }, { "epoch": 0.16321215409658166, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.6238, "step": 470 }, { "epoch": 0.16494845360824742, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5941, "step": 475 }, { "epoch": 0.16668475311991318, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5897, "step": 480 }, { "epoch": 0.16842105263157894, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.6106, "step": 485 }, { "epoch": 0.1701573521432447, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5948, "step": 490 }, { "epoch": 0.17189365165491047, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5811, "step": 495 }, { "epoch": 0.17362995116657623, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.5604, "step": 500 }, { "epoch": 0.175366250678242, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.6414, "step": 505 }, { "epoch": 0.17710255018990775, "grad_norm": 0.1279296875, "learning_rate": 0.0001, "loss": 0.6296, "step": 510 }, { "epoch": 0.17883884970157352, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.6161, "step": 515 }, { "epoch": 0.18057514921323928, "grad_norm": 0.07470703125, "learning_rate": 0.0001, "loss": 0.5981, "step": 520 }, { "epoch": 0.18231144872490504, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.6258, "step": 525 }, { "epoch": 0.1840477482365708, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.591, "step": 530 }, { "epoch": 0.18578404774823656, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5829, "step": 535 }, { "epoch": 0.18752034725990233, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5749, "step": 540 }, { "epoch": 0.1892566467715681, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5908, "step": 545 }, { "epoch": 0.19099294628323385, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.5533, "step": 550 }, { "epoch": 0.1927292457948996, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.6469, "step": 555 }, { "epoch": 0.19446554530656537, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.61, "step": 560 }, { "epoch": 0.19620184481823114, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.6342, "step": 565 }, { "epoch": 0.1979381443298969, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.6017, "step": 570 }, { "epoch": 0.19967444384156266, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5873, "step": 575 }, { "epoch": 0.20141074335322842, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5898, "step": 580 }, { "epoch": 0.20314704286489418, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5927, "step": 585 }, { "epoch": 0.20488334237655995, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5641, "step": 590 }, { "epoch": 0.2066196418882257, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5742, "step": 595 }, { "epoch": 0.20835594139989147, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.566, "step": 600 }, { "epoch": 0.21009224091155723, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.6651, "step": 605 }, { "epoch": 0.211828540423223, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.6181, "step": 610 }, { "epoch": 0.21356483993488876, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.6136, "step": 615 }, { "epoch": 0.21530113944655452, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.5951, "step": 620 }, { "epoch": 0.21703743895822028, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5696, "step": 625 }, { "epoch": 0.21877373846988604, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5912, "step": 630 }, { "epoch": 0.2205100379815518, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.587, "step": 635 }, { "epoch": 0.2222463374932176, "grad_norm": 0.07666015625, "learning_rate": 0.0001, "loss": 0.5574, "step": 640 }, { "epoch": 0.22398263700488336, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5815, "step": 645 }, { "epoch": 0.22571893651654912, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.5718, "step": 650 }, { "epoch": 0.22745523602821488, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.6307, "step": 655 }, { "epoch": 0.22919153553988064, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.6099, "step": 660 }, { "epoch": 0.2309278350515464, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.6032, "step": 665 }, { "epoch": 0.23266413456321217, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5917, "step": 670 }, { "epoch": 0.23440043407487793, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5869, "step": 675 }, { "epoch": 0.2361367335865437, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5737, "step": 680 }, { "epoch": 0.23787303309820945, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.586, "step": 685 }, { "epoch": 0.23960933260987521, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.5844, "step": 690 }, { "epoch": 0.24134563212154098, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5819, "step": 695 }, { "epoch": 0.24308193163320674, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5453, "step": 700 }, { "epoch": 0.2448182311448725, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.6409, "step": 705 }, { "epoch": 0.24655453065653826, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6099, "step": 710 }, { "epoch": 0.24829083016820402, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.6267, "step": 715 }, { "epoch": 0.2500271296798698, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.6319, "step": 720 }, { "epoch": 0.25176342919153555, "grad_norm": 0.07470703125, "learning_rate": 0.0001, "loss": 0.5841, "step": 725 }, { "epoch": 0.2534997287032013, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5665, "step": 730 }, { "epoch": 0.2552360282148671, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5881, "step": 735 }, { "epoch": 0.25697232772653283, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5599, "step": 740 }, { "epoch": 0.2587086272381986, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.5614, "step": 745 }, { "epoch": 0.26044492674986436, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.5683, "step": 750 }, { "epoch": 0.2621812262615301, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.6652, "step": 755 }, { "epoch": 0.2639175257731959, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.6104, "step": 760 }, { "epoch": 0.26565382528486164, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5999, "step": 765 }, { "epoch": 0.2673901247965274, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5882, "step": 770 }, { "epoch": 0.26912642430819317, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5826, "step": 775 }, { "epoch": 0.27086272381985893, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5648, "step": 780 }, { "epoch": 0.2725990233315247, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5883, "step": 785 }, { "epoch": 0.27433532284319045, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5872, "step": 790 }, { "epoch": 0.2760716223548562, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5411, "step": 795 }, { "epoch": 0.277807921866522, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.5518, "step": 800 }, { "epoch": 0.27954422137818774, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6338, "step": 805 }, { "epoch": 0.2812805208898535, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.6061, "step": 810 }, { "epoch": 0.28301682040151926, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.6213, "step": 815 }, { "epoch": 0.284753119913185, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5815, "step": 820 }, { "epoch": 0.2864894194248508, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.6088, "step": 825 }, { "epoch": 0.28822571893651655, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5935, "step": 830 }, { "epoch": 0.2899620184481823, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5708, "step": 835 }, { "epoch": 0.2916983179598481, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5919, "step": 840 }, { "epoch": 0.29343461747151384, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5402, "step": 845 }, { "epoch": 0.2951709169831796, "grad_norm": 0.11962890625, "learning_rate": 0.0001, "loss": 0.5421, "step": 850 }, { "epoch": 0.29690721649484536, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.6586, "step": 855 }, { "epoch": 0.2986435160065111, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.6076, "step": 860 }, { "epoch": 0.3003798155181769, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.6283, "step": 865 }, { "epoch": 0.30211611502984265, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.6065, "step": 870 }, { "epoch": 0.3038524145415084, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5651, "step": 875 }, { "epoch": 0.30558871405317417, "grad_norm": 0.1884765625, "learning_rate": 0.0001, "loss": 0.6081, "step": 880 }, { "epoch": 0.30732501356483993, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5631, "step": 885 }, { "epoch": 0.3090613130765057, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5562, "step": 890 }, { "epoch": 0.31079761258817146, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5865, "step": 895 }, { "epoch": 0.3125339120998372, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.546, "step": 900 }, { "epoch": 0.314270211611503, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.6489, "step": 905 }, { "epoch": 0.31600651112316874, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.6021, "step": 910 }, { "epoch": 0.3177428106348345, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.6069, "step": 915 }, { "epoch": 0.31947911014650027, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5969, "step": 920 }, { "epoch": 0.32121540965816603, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5781, "step": 925 }, { "epoch": 0.3229517091698318, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5975, "step": 930 }, { "epoch": 0.32468800868149755, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5856, "step": 935 }, { "epoch": 0.3264243081931633, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5709, "step": 940 }, { "epoch": 0.3281606077048291, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5523, "step": 945 }, { "epoch": 0.32989690721649484, "grad_norm": 0.107421875, "learning_rate": 0.0001, "loss": 0.5505, "step": 950 }, { "epoch": 0.3316332067281606, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.6636, "step": 955 }, { "epoch": 0.33336950623982636, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.6165, "step": 960 }, { "epoch": 0.3351058057514921, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.6182, "step": 965 }, { "epoch": 0.3368421052631579, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5859, "step": 970 }, { "epoch": 0.33857840477482365, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.6026, "step": 975 }, { "epoch": 0.3403147042864894, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.5777, "step": 980 }, { "epoch": 0.3420510037981552, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5612, "step": 985 }, { "epoch": 0.34378730330982094, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5838, "step": 990 }, { "epoch": 0.3455236028214867, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5567, "step": 995 }, { "epoch": 0.34725990233315246, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5563, "step": 1000 }, { "epoch": 0.3489962018448182, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.6221, "step": 1005 }, { "epoch": 0.350732501356484, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.613, "step": 1010 }, { "epoch": 0.35246880086814975, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.603, "step": 1015 }, { "epoch": 0.3542051003798155, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.6122, "step": 1020 }, { "epoch": 0.35594139989148127, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.6032, "step": 1025 }, { "epoch": 0.35767769940314703, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.6027, "step": 1030 }, { "epoch": 0.3594139989148128, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5592, "step": 1035 }, { "epoch": 0.36115029842647856, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5676, "step": 1040 }, { "epoch": 0.3628865979381443, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5578, "step": 1045 }, { "epoch": 0.3646228974498101, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5349, "step": 1050 }, { "epoch": 0.36635919696147584, "grad_norm": 0.11083984375, "learning_rate": 0.0001, "loss": 0.6616, "step": 1055 }, { "epoch": 0.3680954964731416, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.6331, "step": 1060 }, { "epoch": 0.36983179598480737, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.6057, "step": 1065 }, { "epoch": 0.3715680954964731, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.5951, "step": 1070 }, { "epoch": 0.3733043950081389, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.5747, "step": 1075 }, { "epoch": 0.37504069451980465, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5827, "step": 1080 }, { "epoch": 0.3767769940314704, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5855, "step": 1085 }, { "epoch": 0.3785132935431362, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5862, "step": 1090 }, { "epoch": 0.38024959305480194, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5525, "step": 1095 }, { "epoch": 0.3819858925664677, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.5491, "step": 1100 }, { "epoch": 0.38372219207813346, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.6284, "step": 1105 }, { "epoch": 0.3854584915897992, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5915, "step": 1110 }, { "epoch": 0.387194791101465, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.6242, "step": 1115 }, { "epoch": 0.38893109061313075, "grad_norm": 0.07470703125, "learning_rate": 0.0001, "loss": 0.5679, "step": 1120 }, { "epoch": 0.3906673901247965, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5795, "step": 1125 }, { "epoch": 0.39240368963646227, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5973, "step": 1130 }, { "epoch": 0.39413998914812803, "grad_norm": 0.12109375, "learning_rate": 0.0001, "loss": 0.5729, "step": 1135 }, { "epoch": 0.3958762886597938, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5757, "step": 1140 }, { "epoch": 0.39761258817145956, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5745, "step": 1145 }, { "epoch": 0.3993488876831253, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5563, "step": 1150 }, { "epoch": 0.4010851871947911, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.637, "step": 1155 }, { "epoch": 0.40282148670645684, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.597, "step": 1160 }, { "epoch": 0.4045577862181226, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.598, "step": 1165 }, { "epoch": 0.40629408572978837, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5984, "step": 1170 }, { "epoch": 0.40803038524145413, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.5883, "step": 1175 }, { "epoch": 0.4097666847531199, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5725, "step": 1180 }, { "epoch": 0.41150298426478565, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5848, "step": 1185 }, { "epoch": 0.4132392837764514, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5757, "step": 1190 }, { "epoch": 0.4149755832881172, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5538, "step": 1195 }, { "epoch": 0.41671188279978294, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.531, "step": 1200 }, { "epoch": 0.4184481823114487, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.6415, "step": 1205 }, { "epoch": 0.42018448182311446, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.6296, "step": 1210 }, { "epoch": 0.4219207813347802, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.6157, "step": 1215 }, { "epoch": 0.423657080846446, "grad_norm": 0.07373046875, "learning_rate": 0.0001, "loss": 0.5636, "step": 1220 }, { "epoch": 0.42539338035811175, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5655, "step": 1225 }, { "epoch": 0.4271296798697775, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5727, "step": 1230 }, { "epoch": 0.4288659793814433, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5615, "step": 1235 }, { "epoch": 0.43060227889310904, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5715, "step": 1240 }, { "epoch": 0.4323385784047748, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5566, "step": 1245 }, { "epoch": 0.43407487791644056, "grad_norm": 0.1025390625, "learning_rate": 0.0001, "loss": 0.5504, "step": 1250 }, { "epoch": 0.4358111774281063, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.6203, "step": 1255 }, { "epoch": 0.4375474769397721, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.6021, "step": 1260 }, { "epoch": 0.43928377645143785, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.6065, "step": 1265 }, { "epoch": 0.4410200759631036, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5844, "step": 1270 }, { "epoch": 0.44275637547476937, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5717, "step": 1275 }, { "epoch": 0.4444926749864352, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5893, "step": 1280 }, { "epoch": 0.44622897449810095, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5707, "step": 1285 }, { "epoch": 0.4479652740097667, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5494, "step": 1290 }, { "epoch": 0.4497015735214325, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5564, "step": 1295 }, { "epoch": 0.45143787303309824, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5536, "step": 1300 }, { "epoch": 0.453174172544764, "grad_norm": 0.07568359375, "learning_rate": 0.0001, "loss": 0.6417, "step": 1305 }, { "epoch": 0.45491047205642976, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.6055, "step": 1310 }, { "epoch": 0.4566467715680955, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5962, "step": 1315 }, { "epoch": 0.4583830710797613, "grad_norm": 0.0732421875, "learning_rate": 0.0001, "loss": 0.6115, "step": 1320 }, { "epoch": 0.46011937059142705, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5898, "step": 1325 }, { "epoch": 0.4618556701030928, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5834, "step": 1330 }, { "epoch": 0.46359196961475857, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5721, "step": 1335 }, { "epoch": 0.46532826912642433, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.561, "step": 1340 }, { "epoch": 0.4670645686380901, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5637, "step": 1345 }, { "epoch": 0.46880086814975586, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5528, "step": 1350 }, { "epoch": 0.4705371676614216, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.6466, "step": 1355 }, { "epoch": 0.4722734671730874, "grad_norm": 0.0771484375, "learning_rate": 0.0001, "loss": 0.6073, "step": 1360 }, { "epoch": 0.47400976668475314, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.6299, "step": 1365 }, { "epoch": 0.4757460661964189, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5787, "step": 1370 }, { "epoch": 0.47748236570808467, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5815, "step": 1375 }, { "epoch": 0.47921866521975043, "grad_norm": 0.2451171875, "learning_rate": 0.0001, "loss": 0.5751, "step": 1380 }, { "epoch": 0.4809549647314162, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5652, "step": 1385 }, { "epoch": 0.48269126424308195, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5488, "step": 1390 }, { "epoch": 0.4844275637547477, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5472, "step": 1395 }, { "epoch": 0.4861638632664135, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.5394, "step": 1400 }, { "epoch": 0.48790016277807924, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6331, "step": 1405 }, { "epoch": 0.489636462289745, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.6019, "step": 1410 }, { "epoch": 0.49137276180141076, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.6056, "step": 1415 }, { "epoch": 0.4931090613130765, "grad_norm": 0.076171875, "learning_rate": 0.0001, "loss": 0.6167, "step": 1420 }, { "epoch": 0.4948453608247423, "grad_norm": 0.076171875, "learning_rate": 0.0001, "loss": 0.5708, "step": 1425 }, { "epoch": 0.49658166033640805, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.591, "step": 1430 }, { "epoch": 0.4983179598480738, "grad_norm": 0.0703125, "learning_rate": 0.0001, "loss": 0.5364, "step": 1435 }, { "epoch": 0.5000542593597396, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5634, "step": 1440 }, { "epoch": 0.5017905588714053, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5374, "step": 1445 }, { "epoch": 0.5035268583830711, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.5533, "step": 1450 }, { "epoch": 0.5052631578947369, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.6605, "step": 1455 }, { "epoch": 0.5069994574064026, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.6167, "step": 1460 }, { "epoch": 0.5087357569180684, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.6013, "step": 1465 }, { "epoch": 0.5104720564297341, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5774, "step": 1470 }, { "epoch": 0.5122083559413999, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5837, "step": 1475 }, { "epoch": 0.5139446554530657, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5885, "step": 1480 }, { "epoch": 0.5156809549647314, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5565, "step": 1485 }, { "epoch": 0.5174172544763972, "grad_norm": 0.07568359375, "learning_rate": 0.0001, "loss": 0.5793, "step": 1490 }, { "epoch": 0.519153553988063, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5552, "step": 1495 }, { "epoch": 0.5208898534997287, "grad_norm": 0.1103515625, "learning_rate": 0.0001, "loss": 0.5376, "step": 1500 }, { "epoch": 0.5226261530113945, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.6368, "step": 1505 }, { "epoch": 0.5243624525230602, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.6176, "step": 1510 }, { "epoch": 0.526098752034726, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.5909, "step": 1515 }, { "epoch": 0.5278350515463918, "grad_norm": 0.076171875, "learning_rate": 0.0001, "loss": 0.5816, "step": 1520 }, { "epoch": 0.5295713510580575, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.6136, "step": 1525 }, { "epoch": 0.5313076505697233, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5724, "step": 1530 }, { "epoch": 0.533043950081389, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5657, "step": 1535 }, { "epoch": 0.5347802495930548, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5642, "step": 1540 }, { "epoch": 0.5365165491047206, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5711, "step": 1545 }, { "epoch": 0.5382528486163863, "grad_norm": 0.10888671875, "learning_rate": 0.0001, "loss": 0.514, "step": 1550 }, { "epoch": 0.5399891481280521, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.6508, "step": 1555 }, { "epoch": 0.5417254476397179, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.6109, "step": 1560 }, { "epoch": 0.5434617471513836, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.6041, "step": 1565 }, { "epoch": 0.5451980466630494, "grad_norm": 0.07373046875, "learning_rate": 0.0001, "loss": 0.5893, "step": 1570 }, { "epoch": 0.5469343461747151, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5753, "step": 1575 }, { "epoch": 0.5486706456863809, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5626, "step": 1580 }, { "epoch": 0.5504069451980467, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5912, "step": 1585 }, { "epoch": 0.5521432447097124, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5629, "step": 1590 }, { "epoch": 0.5538795442213782, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.581, "step": 1595 }, { "epoch": 0.555615843733044, "grad_norm": 0.1025390625, "learning_rate": 0.0001, "loss": 0.5413, "step": 1600 }, { "epoch": 0.5573521432447097, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.6606, "step": 1605 }, { "epoch": 0.5590884427563755, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5923, "step": 1610 }, { "epoch": 0.5608247422680412, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5958, "step": 1615 }, { "epoch": 0.562561041779707, "grad_norm": 0.07421875, "learning_rate": 0.0001, "loss": 0.5899, "step": 1620 }, { "epoch": 0.5642973412913728, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5817, "step": 1625 }, { "epoch": 0.5660336408030385, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5579, "step": 1630 }, { "epoch": 0.5677699403147043, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5677, "step": 1635 }, { "epoch": 0.56950623982637, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.566, "step": 1640 }, { "epoch": 0.5712425393380358, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5668, "step": 1645 }, { "epoch": 0.5729788388497016, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5334, "step": 1650 }, { "epoch": 0.5747151383613673, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.6322, "step": 1655 }, { "epoch": 0.5764514378730331, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.588, "step": 1660 }, { "epoch": 0.5781877373846989, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5929, "step": 1665 }, { "epoch": 0.5799240368963646, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.6142, "step": 1670 }, { "epoch": 0.5816603364080304, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5919, "step": 1675 }, { "epoch": 0.5833966359196961, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5811, "step": 1680 }, { "epoch": 0.5851329354313619, "grad_norm": 0.0771484375, "learning_rate": 0.0001, "loss": 0.5564, "step": 1685 }, { "epoch": 0.5868692349430277, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5658, "step": 1690 }, { "epoch": 0.5886055344546934, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5677, "step": 1695 }, { "epoch": 0.5903418339663592, "grad_norm": 0.107421875, "learning_rate": 0.0001, "loss": 0.5537, "step": 1700 }, { "epoch": 0.592078133478025, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.6258, "step": 1705 }, { "epoch": 0.5938144329896907, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5946, "step": 1710 }, { "epoch": 0.5955507325013565, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5991, "step": 1715 }, { "epoch": 0.5972870320130222, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.602, "step": 1720 }, { "epoch": 0.599023331524688, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5778, "step": 1725 }, { "epoch": 0.6007596310363538, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5531, "step": 1730 }, { "epoch": 0.6024959305480195, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.56, "step": 1735 }, { "epoch": 0.6042322300596853, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5679, "step": 1740 }, { "epoch": 0.6059685295713511, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.557, "step": 1745 }, { "epoch": 0.6077048290830168, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5345, "step": 1750 }, { "epoch": 0.6094411285946826, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.6289, "step": 1755 }, { "epoch": 0.6111774281063483, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.6079, "step": 1760 }, { "epoch": 0.6129137276180141, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5773, "step": 1765 }, { "epoch": 0.6146500271296799, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5802, "step": 1770 }, { "epoch": 0.6163863266413456, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5654, "step": 1775 }, { "epoch": 0.6181226261530114, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5679, "step": 1780 }, { "epoch": 0.6198589256646772, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5849, "step": 1785 }, { "epoch": 0.6215952251763429, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5674, "step": 1790 }, { "epoch": 0.6233315246880087, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.56, "step": 1795 }, { "epoch": 0.6250678241996744, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.527, "step": 1800 }, { "epoch": 0.6268041237113402, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.6374, "step": 1805 }, { "epoch": 0.628540423223006, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.5962, "step": 1810 }, { "epoch": 0.6302767227346717, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.6013, "step": 1815 }, { "epoch": 0.6320130222463375, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6017, "step": 1820 }, { "epoch": 0.6337493217580032, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5826, "step": 1825 }, { "epoch": 0.635485621269669, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5977, "step": 1830 }, { "epoch": 0.6372219207813348, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5981, "step": 1835 }, { "epoch": 0.6389582202930005, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5768, "step": 1840 }, { "epoch": 0.6406945198046663, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5415, "step": 1845 }, { "epoch": 0.6424308193163321, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5317, "step": 1850 }, { "epoch": 0.6441671188279978, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.6237, "step": 1855 }, { "epoch": 0.6459034183396636, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5886, "step": 1860 }, { "epoch": 0.6476397178513293, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5769, "step": 1865 }, { "epoch": 0.6493760173629951, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5793, "step": 1870 }, { "epoch": 0.6511123168746609, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5704, "step": 1875 }, { "epoch": 0.6528486163863266, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5597, "step": 1880 }, { "epoch": 0.6545849158979924, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5358, "step": 1885 }, { "epoch": 0.6563212154096582, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5479, "step": 1890 }, { "epoch": 0.6580575149213239, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5472, "step": 1895 }, { "epoch": 0.6597938144329897, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5398, "step": 1900 }, { "epoch": 0.6615301139446554, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.651, "step": 1905 }, { "epoch": 0.6632664134563212, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.5915, "step": 1910 }, { "epoch": 0.665002712967987, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.6007, "step": 1915 }, { "epoch": 0.6667390124796527, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5788, "step": 1920 }, { "epoch": 0.6684753119913185, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.568, "step": 1925 }, { "epoch": 0.6702116115029843, "grad_norm": 0.07568359375, "learning_rate": 0.0001, "loss": 0.5711, "step": 1930 }, { "epoch": 0.67194791101465, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5682, "step": 1935 }, { "epoch": 0.6736842105263158, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5764, "step": 1940 }, { "epoch": 0.6754205100379815, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5491, "step": 1945 }, { "epoch": 0.6771568095496473, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5629, "step": 1950 }, { "epoch": 0.6788931090613131, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.6379, "step": 1955 }, { "epoch": 0.6806294085729788, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.5983, "step": 1960 }, { "epoch": 0.6823657080846446, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5929, "step": 1965 }, { "epoch": 0.6841020075963103, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.588, "step": 1970 }, { "epoch": 0.6858383071079761, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5619, "step": 1975 }, { "epoch": 0.6875746066196419, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.5683, "step": 1980 }, { "epoch": 0.6893109061313076, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5808, "step": 1985 }, { "epoch": 0.6910472056429734, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5536, "step": 1990 }, { "epoch": 0.6927835051546392, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5608, "step": 1995 }, { "epoch": 0.6945198046663049, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.534, "step": 2000 }, { "epoch": 0.6962561041779707, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.6431, "step": 2005 }, { "epoch": 0.6979924036896364, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5889, "step": 2010 }, { "epoch": 0.6997287032013022, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5804, "step": 2015 }, { "epoch": 0.701465002712968, "grad_norm": 0.0703125, "learning_rate": 0.0001, "loss": 0.5916, "step": 2020 }, { "epoch": 0.7032013022246337, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5759, "step": 2025 }, { "epoch": 0.7049376017362995, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5853, "step": 2030 }, { "epoch": 0.7066739012479653, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5733, "step": 2035 }, { "epoch": 0.708410200759631, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5776, "step": 2040 }, { "epoch": 0.7101465002712968, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5489, "step": 2045 }, { "epoch": 0.7118827997829625, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5241, "step": 2050 }, { "epoch": 0.7136190992946283, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.6735, "step": 2055 }, { "epoch": 0.7153553988062941, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.6125, "step": 2060 }, { "epoch": 0.7170916983179598, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5805, "step": 2065 }, { "epoch": 0.7188279978296256, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.6037, "step": 2070 }, { "epoch": 0.7205642973412913, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5873, "step": 2075 }, { "epoch": 0.7223005968529571, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5727, "step": 2080 }, { "epoch": 0.7240368963646229, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5709, "step": 2085 }, { "epoch": 0.7257731958762886, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.519, "step": 2090 }, { "epoch": 0.7275094953879544, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5576, "step": 2095 }, { "epoch": 0.7292457948996202, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5285, "step": 2100 }, { "epoch": 0.7309820944112859, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.6482, "step": 2105 }, { "epoch": 0.7327183939229517, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.595, "step": 2110 }, { "epoch": 0.7344546934346174, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5953, "step": 2115 }, { "epoch": 0.7361909929462832, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5665, "step": 2120 }, { "epoch": 0.737927292457949, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5612, "step": 2125 }, { "epoch": 0.7396635919696147, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5645, "step": 2130 }, { "epoch": 0.7413998914812805, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5865, "step": 2135 }, { "epoch": 0.7431361909929463, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5658, "step": 2140 }, { "epoch": 0.744872490504612, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5523, "step": 2145 }, { "epoch": 0.7466087900162778, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5358, "step": 2150 }, { "epoch": 0.7483450895279435, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.6148, "step": 2155 }, { "epoch": 0.7500813890396093, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.6092, "step": 2160 }, { "epoch": 0.7518176885512751, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5939, "step": 2165 }, { "epoch": 0.7535539880629408, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5823, "step": 2170 }, { "epoch": 0.7552902875746066, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.5779, "step": 2175 }, { "epoch": 0.7570265870862724, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5697, "step": 2180 }, { "epoch": 0.7587628865979381, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.5716, "step": 2185 }, { "epoch": 0.7604991861096039, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.5569, "step": 2190 }, { "epoch": 0.7622354856212696, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.5544, "step": 2195 }, { "epoch": 0.7639717851329354, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5332, "step": 2200 }, { "epoch": 0.7657080846446012, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.6517, "step": 2205 }, { "epoch": 0.7674443841562669, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5991, "step": 2210 }, { "epoch": 0.7691806836679327, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5823, "step": 2215 }, { "epoch": 0.7709169831795984, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.6052, "step": 2220 }, { "epoch": 0.7726532826912642, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5711, "step": 2225 }, { "epoch": 0.77438958220293, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.5369, "step": 2230 }, { "epoch": 0.7761258817145957, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.5406, "step": 2235 }, { "epoch": 0.7778621812262615, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.604, "step": 2240 }, { "epoch": 0.7795984807379273, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5587, "step": 2245 }, { "epoch": 0.781334780249593, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5347, "step": 2250 }, { "epoch": 0.7830710797612588, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.6413, "step": 2255 }, { "epoch": 0.7848073792729245, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5683, "step": 2260 }, { "epoch": 0.7865436787845903, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.6132, "step": 2265 }, { "epoch": 0.7882799782962561, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5839, "step": 2270 }, { "epoch": 0.7900162778079218, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5857, "step": 2275 }, { "epoch": 0.7917525773195876, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5711, "step": 2280 }, { "epoch": 0.7934888768312534, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5748, "step": 2285 }, { "epoch": 0.7952251763429191, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5431, "step": 2290 }, { "epoch": 0.7969614758545849, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.549, "step": 2295 }, { "epoch": 0.7986977753662506, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.5471, "step": 2300 }, { "epoch": 0.8004340748779164, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.6297, "step": 2305 }, { "epoch": 0.8021703743895822, "grad_norm": 0.07666015625, "learning_rate": 0.0001, "loss": 0.5896, "step": 2310 }, { "epoch": 0.8039066739012479, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5866, "step": 2315 }, { "epoch": 0.8056429734129137, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5779, "step": 2320 }, { "epoch": 0.8073792729245794, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.58, "step": 2325 }, { "epoch": 0.8091155724362452, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5887, "step": 2330 }, { "epoch": 0.810851871947911, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5622, "step": 2335 }, { "epoch": 0.8125881714595767, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5402, "step": 2340 }, { "epoch": 0.8143244709712425, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5459, "step": 2345 }, { "epoch": 0.8160607704829083, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5197, "step": 2350 }, { "epoch": 0.817797069994574, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6183, "step": 2355 }, { "epoch": 0.8195333695062398, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.5875, "step": 2360 }, { "epoch": 0.8212696690179055, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.605, "step": 2365 }, { "epoch": 0.8230059685295713, "grad_norm": 0.07421875, "learning_rate": 0.0001, "loss": 0.583, "step": 2370 }, { "epoch": 0.8247422680412371, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5785, "step": 2375 }, { "epoch": 0.8264785675529028, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5611, "step": 2380 }, { "epoch": 0.8282148670645686, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5796, "step": 2385 }, { "epoch": 0.8299511665762344, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.548, "step": 2390 }, { "epoch": 0.8316874660879001, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5653, "step": 2395 }, { "epoch": 0.8334237655995659, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5371, "step": 2400 }, { "epoch": 0.8351600651112316, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6198, "step": 2405 }, { "epoch": 0.8368963646228974, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5979, "step": 2410 }, { "epoch": 0.8386326641345632, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5892, "step": 2415 }, { "epoch": 0.8403689636462289, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5635, "step": 2420 }, { "epoch": 0.8421052631578947, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5798, "step": 2425 }, { "epoch": 0.8438415626695605, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5589, "step": 2430 }, { "epoch": 0.8455778621812262, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5365, "step": 2435 }, { "epoch": 0.847314161692892, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5417, "step": 2440 }, { "epoch": 0.8490504612045577, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5503, "step": 2445 }, { "epoch": 0.8507867607162235, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5355, "step": 2450 }, { "epoch": 0.8525230602278893, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.6341, "step": 2455 }, { "epoch": 0.854259359739555, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5729, "step": 2460 }, { "epoch": 0.8559956592512208, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.584, "step": 2465 }, { "epoch": 0.8577319587628865, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5905, "step": 2470 }, { "epoch": 0.8594682582745523, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5782, "step": 2475 }, { "epoch": 0.8612045577862181, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.5476, "step": 2480 }, { "epoch": 0.8629408572978838, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5952, "step": 2485 }, { "epoch": 0.8646771568095496, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5347, "step": 2490 }, { "epoch": 0.8664134563212154, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.554, "step": 2495 }, { "epoch": 0.8681497558328811, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.54, "step": 2500 }, { "epoch": 0.8698860553445469, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.6187, "step": 2505 }, { "epoch": 0.8716223548562126, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5818, "step": 2510 }, { "epoch": 0.8733586543678784, "grad_norm": 0.07666015625, "learning_rate": 0.0001, "loss": 0.585, "step": 2515 }, { "epoch": 0.8750949538795442, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.5996, "step": 2520 }, { "epoch": 0.8768312533912099, "grad_norm": 0.07470703125, "learning_rate": 0.0001, "loss": 0.5727, "step": 2525 }, { "epoch": 0.8785675529028757, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5711, "step": 2530 }, { "epoch": 0.8803038524145415, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.5614, "step": 2535 }, { "epoch": 0.8820401519262072, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5451, "step": 2540 }, { "epoch": 0.883776451437873, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5539, "step": 2545 }, { "epoch": 0.8855127509495387, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5379, "step": 2550 }, { "epoch": 0.8872490504612045, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.6218, "step": 2555 }, { "epoch": 0.8889853499728704, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.6054, "step": 2560 }, { "epoch": 0.8907216494845361, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5887, "step": 2565 }, { "epoch": 0.8924579489962019, "grad_norm": 0.07666015625, "learning_rate": 0.0001, "loss": 0.5683, "step": 2570 }, { "epoch": 0.8941942485078677, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5753, "step": 2575 }, { "epoch": 0.8959305480195334, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5803, "step": 2580 }, { "epoch": 0.8976668475311992, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5634, "step": 2585 }, { "epoch": 0.899403147042865, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5674, "step": 2590 }, { "epoch": 0.9011394465545307, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5618, "step": 2595 }, { "epoch": 0.9028757460661965, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5553, "step": 2600 }, { "epoch": 0.9046120455778622, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.6392, "step": 2605 }, { "epoch": 0.906348345089528, "grad_norm": 0.0771484375, "learning_rate": 0.0001, "loss": 0.5888, "step": 2610 }, { "epoch": 0.9080846446011938, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5674, "step": 2615 }, { "epoch": 0.9098209441128595, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5688, "step": 2620 }, { "epoch": 0.9115572436245253, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5894, "step": 2625 }, { "epoch": 0.913293543136191, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5691, "step": 2630 }, { "epoch": 0.9150298426478568, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.5588, "step": 2635 }, { "epoch": 0.9167661421595226, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5453, "step": 2640 }, { "epoch": 0.9185024416711883, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5394, "step": 2645 }, { "epoch": 0.9202387411828541, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5313, "step": 2650 }, { "epoch": 0.9219750406945199, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.6438, "step": 2655 }, { "epoch": 0.9237113402061856, "grad_norm": 0.0771484375, "learning_rate": 0.0001, "loss": 0.5677, "step": 2660 }, { "epoch": 0.9254476397178514, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5836, "step": 2665 }, { "epoch": 0.9271839392295171, "grad_norm": 0.07568359375, "learning_rate": 0.0001, "loss": 0.5679, "step": 2670 }, { "epoch": 0.9289202387411829, "grad_norm": 0.07568359375, "learning_rate": 0.0001, "loss": 0.5684, "step": 2675 }, { "epoch": 0.9306565382528487, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.54, "step": 2680 }, { "epoch": 0.9323928377645144, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5564, "step": 2685 }, { "epoch": 0.9341291372761802, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5526, "step": 2690 }, { "epoch": 0.935865436787846, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5372, "step": 2695 }, { "epoch": 0.9376017362995117, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5369, "step": 2700 }, { "epoch": 0.9393380358111775, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.6357, "step": 2705 }, { "epoch": 0.9410743353228432, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.5584, "step": 2710 }, { "epoch": 0.942810634834509, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.6022, "step": 2715 }, { "epoch": 0.9445469343461748, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.566, "step": 2720 }, { "epoch": 0.9462832338578405, "grad_norm": 0.07958984375, "learning_rate": 0.0001, "loss": 0.5742, "step": 2725 }, { "epoch": 0.9480195333695063, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5775, "step": 2730 }, { "epoch": 0.949755832881172, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5666, "step": 2735 }, { "epoch": 0.9514921323928378, "grad_norm": 0.1083984375, "learning_rate": 0.0001, "loss": 0.5505, "step": 2740 }, { "epoch": 0.9532284319045036, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5276, "step": 2745 }, { "epoch": 0.9549647314161693, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5265, "step": 2750 }, { "epoch": 0.9567010309278351, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.6327, "step": 2755 }, { "epoch": 0.9584373304395009, "grad_norm": 0.0732421875, "learning_rate": 0.0001, "loss": 0.5947, "step": 2760 }, { "epoch": 0.9601736299511666, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5922, "step": 2765 }, { "epoch": 0.9619099294628324, "grad_norm": 0.07373046875, "learning_rate": 0.0001, "loss": 0.585, "step": 2770 }, { "epoch": 0.9636462289744981, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5712, "step": 2775 }, { "epoch": 0.9653825284861639, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5843, "step": 2780 }, { "epoch": 0.9671188279978297, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.551, "step": 2785 }, { "epoch": 0.9688551275094954, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5638, "step": 2790 }, { "epoch": 0.9705914270211612, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5495, "step": 2795 }, { "epoch": 0.972327726532827, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5238, "step": 2800 }, { "epoch": 0.9740640260444927, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.6252, "step": 2805 }, { "epoch": 0.9758003255561585, "grad_norm": 0.0712890625, "learning_rate": 0.0001, "loss": 0.6002, "step": 2810 }, { "epoch": 0.9775366250678242, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.5987, "step": 2815 }, { "epoch": 0.97927292457949, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.5911, "step": 2820 }, { "epoch": 0.9810092240911558, "grad_norm": 0.1533203125, "learning_rate": 0.0001, "loss": 0.5721, "step": 2825 }, { "epoch": 0.9827455236028215, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5606, "step": 2830 }, { "epoch": 0.9844818231144873, "grad_norm": 0.2177734375, "learning_rate": 0.0001, "loss": 0.5618, "step": 2835 }, { "epoch": 0.986218122626153, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5074, "step": 2840 }, { "epoch": 0.9879544221378188, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.5442, "step": 2845 }, { "epoch": 0.9896907216494846, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.556, "step": 2850 }, { "epoch": 0.9914270211611503, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.6428, "step": 2855 }, { "epoch": 0.9931633206728161, "grad_norm": 0.072265625, "learning_rate": 0.0001, "loss": 0.6115, "step": 2860 }, { "epoch": 0.9948996201844819, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5626, "step": 2865 }, { "epoch": 0.9966359196961476, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5743, "step": 2870 }, { "epoch": 0.9983722192078134, "grad_norm": 0.08447265625, "learning_rate": 0.0001, "loss": 0.5582, "step": 2875 }, { "epoch": 1.0001085187194791, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.5435, "step": 2880 }, { "epoch": 1.0018448182311448, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5697, "step": 2885 }, { "epoch": 1.0035811177428107, "grad_norm": 0.07763671875, "learning_rate": 0.0001, "loss": 0.5622, "step": 2890 }, { "epoch": 1.0053174172544763, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5732, "step": 2895 }, { "epoch": 1.0070537167661422, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5266, "step": 2900 }, { "epoch": 1.0087900162778078, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5615, "step": 2905 }, { "epoch": 1.0105263157894737, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5519, "step": 2910 }, { "epoch": 1.0122626153011394, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5355, "step": 2915 }, { "epoch": 1.0139989148128052, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5235, "step": 2920 }, { "epoch": 1.015735214324471, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.4988, "step": 2925 }, { "epoch": 1.0174715138361368, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5261, "step": 2930 }, { "epoch": 1.0192078133478024, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.6225, "step": 2935 }, { "epoch": 1.0209441128594683, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.555, "step": 2940 }, { "epoch": 1.022680412371134, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5522, "step": 2945 }, { "epoch": 1.0244167118827998, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5617, "step": 2950 }, { "epoch": 1.0261530113944655, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5503, "step": 2955 }, { "epoch": 1.0278893109061313, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5318, "step": 2960 }, { "epoch": 1.029625610417797, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5166, "step": 2965 }, { "epoch": 1.0313619099294629, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5373, "step": 2970 }, { "epoch": 1.0330982094411285, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.535, "step": 2975 }, { "epoch": 1.0348345089527944, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5341, "step": 2980 }, { "epoch": 1.03657080846446, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.5795, "step": 2985 }, { "epoch": 1.038307107976126, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5764, "step": 2990 }, { "epoch": 1.0400434074877916, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.6015, "step": 2995 }, { "epoch": 1.0417797069994574, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5517, "step": 3000 }, { "epoch": 1.043516006511123, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5322, "step": 3005 }, { "epoch": 1.045252306022789, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5317, "step": 3010 }, { "epoch": 1.0469886055344546, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5288, "step": 3015 }, { "epoch": 1.0487249050461205, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5166, "step": 3020 }, { "epoch": 1.0504612045577861, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5185, "step": 3025 }, { "epoch": 1.052197504069452, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5088, "step": 3030 }, { "epoch": 1.0539338035811177, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.6082, "step": 3035 }, { "epoch": 1.0556701030927835, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5536, "step": 3040 }, { "epoch": 1.0574064026044492, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5774, "step": 3045 }, { "epoch": 1.059142702116115, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5647, "step": 3050 }, { "epoch": 1.0608790016277807, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5461, "step": 3055 }, { "epoch": 1.0626153011394466, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5357, "step": 3060 }, { "epoch": 1.0643516006511122, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5344, "step": 3065 }, { "epoch": 1.066087900162778, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5371, "step": 3070 }, { "epoch": 1.0678241996744438, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5229, "step": 3075 }, { "epoch": 1.0695604991861096, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5132, "step": 3080 }, { "epoch": 1.0712967986977753, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.5932, "step": 3085 }, { "epoch": 1.0730330982094411, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5733, "step": 3090 }, { "epoch": 1.0747693977211068, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.5709, "step": 3095 }, { "epoch": 1.0765056972327727, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.557, "step": 3100 }, { "epoch": 1.0782419967444383, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5577, "step": 3105 }, { "epoch": 1.0799782962561042, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.521, "step": 3110 }, { "epoch": 1.0817145957677698, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5096, "step": 3115 }, { "epoch": 1.0834508952794357, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5221, "step": 3120 }, { "epoch": 1.0851871947911014, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5144, "step": 3125 }, { "epoch": 1.0869234943027672, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5188, "step": 3130 }, { "epoch": 1.088659793814433, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.6153, "step": 3135 }, { "epoch": 1.0903960933260988, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.5514, "step": 3140 }, { "epoch": 1.0921323928377644, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5448, "step": 3145 }, { "epoch": 1.0938686923494303, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5636, "step": 3150 }, { "epoch": 1.095604991861096, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5605, "step": 3155 }, { "epoch": 1.0973412913727618, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5305, "step": 3160 }, { "epoch": 1.0990775908844275, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5507, "step": 3165 }, { "epoch": 1.1008138903960933, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.4921, "step": 3170 }, { "epoch": 1.102550189907759, "grad_norm": 0.12109375, "learning_rate": 0.0001, "loss": 0.5324, "step": 3175 }, { "epoch": 1.1042864894194249, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5052, "step": 3180 }, { "epoch": 1.1060227889310905, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5938, "step": 3185 }, { "epoch": 1.1077590884427564, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5594, "step": 3190 }, { "epoch": 1.109495387954422, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.5513, "step": 3195 }, { "epoch": 1.111231687466088, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5478, "step": 3200 }, { "epoch": 1.1129679869777536, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5314, "step": 3205 }, { "epoch": 1.1147042864894194, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5351, "step": 3210 }, { "epoch": 1.116440586001085, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5434, "step": 3215 }, { "epoch": 1.118176885512751, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5408, "step": 3220 }, { "epoch": 1.1199131850244166, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5003, "step": 3225 }, { "epoch": 1.1216494845360825, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.5179, "step": 3230 }, { "epoch": 1.1233857840477481, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5892, "step": 3235 }, { "epoch": 1.125122083559414, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.5529, "step": 3240 }, { "epoch": 1.1268583830710797, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.5669, "step": 3245 }, { "epoch": 1.1285946825827455, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.565, "step": 3250 }, { "epoch": 1.1303309820944114, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5467, "step": 3255 }, { "epoch": 1.132067281606077, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5411, "step": 3260 }, { "epoch": 1.1338035811177427, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5108, "step": 3265 }, { "epoch": 1.1355398806294086, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.5241, "step": 3270 }, { "epoch": 1.1372761801410745, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5386, "step": 3275 }, { "epoch": 1.13901247965274, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5035, "step": 3280 }, { "epoch": 1.1407487791644058, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5958, "step": 3285 }, { "epoch": 1.1424850786760716, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5763, "step": 3290 }, { "epoch": 1.1442213781877375, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5484, "step": 3295 }, { "epoch": 1.1459576776994032, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5687, "step": 3300 }, { "epoch": 1.1476939772110688, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.55, "step": 3305 }, { "epoch": 1.1494302767227347, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5231, "step": 3310 }, { "epoch": 1.1511665762344006, "grad_norm": 0.10302734375, "learning_rate": 0.0001, "loss": 0.5541, "step": 3315 }, { "epoch": 1.1529028757460662, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.54, "step": 3320 }, { "epoch": 1.1546391752577319, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5213, "step": 3325 }, { "epoch": 1.1563754747693977, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5192, "step": 3330 }, { "epoch": 1.1581117742810636, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5946, "step": 3335 }, { "epoch": 1.1598480737927293, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.5822, "step": 3340 }, { "epoch": 1.161584373304395, "grad_norm": 0.11328125, "learning_rate": 0.0001, "loss": 0.5655, "step": 3345 }, { "epoch": 1.1633206728160608, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5687, "step": 3350 }, { "epoch": 1.1650569723277266, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5448, "step": 3355 }, { "epoch": 1.1667932718393923, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5355, "step": 3360 }, { "epoch": 1.168529571351058, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5321, "step": 3365 }, { "epoch": 1.1702658708627238, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5252, "step": 3370 }, { "epoch": 1.1720021703743897, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.5082, "step": 3375 }, { "epoch": 1.1737384698860553, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.4836, "step": 3380 }, { "epoch": 1.175474769397721, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.6026, "step": 3385 }, { "epoch": 1.1772110689093869, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5656, "step": 3390 }, { "epoch": 1.1789473684210527, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.563, "step": 3395 }, { "epoch": 1.1806836679327184, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.553, "step": 3400 }, { "epoch": 1.182419967444384, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5587, "step": 3405 }, { "epoch": 1.18415626695605, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5344, "step": 3410 }, { "epoch": 1.1858925664677158, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5537, "step": 3415 }, { "epoch": 1.1876288659793814, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5234, "step": 3420 }, { "epoch": 1.189365165491047, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.5108, "step": 3425 }, { "epoch": 1.191101465002713, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5186, "step": 3430 }, { "epoch": 1.1928377645143788, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.619, "step": 3435 }, { "epoch": 1.1945740640260445, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.568, "step": 3440 }, { "epoch": 1.1963103635377101, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5474, "step": 3445 }, { "epoch": 1.198046663049376, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.5505, "step": 3450 }, { "epoch": 1.1997829625610419, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.5279, "step": 3455 }, { "epoch": 1.2015192620727075, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5384, "step": 3460 }, { "epoch": 1.2032555615843732, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5143, "step": 3465 }, { "epoch": 1.204991861096039, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5401, "step": 3470 }, { "epoch": 1.206728160607705, "grad_norm": 0.15234375, "learning_rate": 0.0001, "loss": 0.526, "step": 3475 }, { "epoch": 1.2084644601193706, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.5094, "step": 3480 }, { "epoch": 1.2102007596310362, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.6154, "step": 3485 }, { "epoch": 1.2119370591427021, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.566, "step": 3490 }, { "epoch": 1.213673358654368, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5844, "step": 3495 }, { "epoch": 1.2154096581660336, "grad_norm": 0.10498046875, "learning_rate": 0.0001, "loss": 0.5497, "step": 3500 }, { "epoch": 1.2171459576776993, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5543, "step": 3505 }, { "epoch": 1.2188822571893652, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.543, "step": 3510 }, { "epoch": 1.220618556701031, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5354, "step": 3515 }, { "epoch": 1.2223548562126967, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.4978, "step": 3520 }, { "epoch": 1.2240911557243623, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.5197, "step": 3525 }, { "epoch": 1.2258274552360282, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5303, "step": 3530 }, { "epoch": 1.227563754747694, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.599, "step": 3535 }, { "epoch": 1.2293000542593597, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5683, "step": 3540 }, { "epoch": 1.2310363537710254, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.5718, "step": 3545 }, { "epoch": 1.2327726532826913, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5621, "step": 3550 }, { "epoch": 1.2345089527943571, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5643, "step": 3555 }, { "epoch": 1.2362452523060228, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.545, "step": 3560 }, { "epoch": 1.2379815518176884, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5316, "step": 3565 }, { "epoch": 1.2397178513293543, "grad_norm": 0.11962890625, "learning_rate": 0.0001, "loss": 0.5377, "step": 3570 }, { "epoch": 1.2414541508410202, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.5073, "step": 3575 }, { "epoch": 1.2431904503526858, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5154, "step": 3580 }, { "epoch": 1.2449267498643515, "grad_norm": 0.111328125, "learning_rate": 0.0001, "loss": 0.605, "step": 3585 }, { "epoch": 1.2466630493760174, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5731, "step": 3590 }, { "epoch": 1.2483993488876832, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5519, "step": 3595 }, { "epoch": 1.2501356483993489, "grad_norm": 0.08349609375, "learning_rate": 0.0001, "loss": 0.5507, "step": 3600 }, { "epoch": 1.2518719479110145, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5362, "step": 3605 }, { "epoch": 1.2536082474226804, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5723, "step": 3610 }, { "epoch": 1.2553445469343463, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.538, "step": 3615 }, { "epoch": 1.257080846446012, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.5143, "step": 3620 }, { "epoch": 1.2588171459576776, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.5278, "step": 3625 }, { "epoch": 1.2605534454693434, "grad_norm": 0.1025390625, "learning_rate": 0.0001, "loss": 0.5244, "step": 3630 }, { "epoch": 1.2622897449810093, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.5943, "step": 3635 }, { "epoch": 1.264026044492675, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5651, "step": 3640 }, { "epoch": 1.2657623440043406, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5607, "step": 3645 }, { "epoch": 1.2674986435160065, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5403, "step": 3650 }, { "epoch": 1.2692349430276724, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5385, "step": 3655 }, { "epoch": 1.270971242539338, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.528, "step": 3660 }, { "epoch": 1.2727075420510037, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5318, "step": 3665 }, { "epoch": 1.2744438415626695, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5335, "step": 3670 }, { "epoch": 1.2761801410743354, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5265, "step": 3675 }, { "epoch": 1.277916440586001, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.4965, "step": 3680 }, { "epoch": 1.2796527400976667, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.583, "step": 3685 }, { "epoch": 1.2813890396093326, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.5798, "step": 3690 }, { "epoch": 1.2831253391209985, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5799, "step": 3695 }, { "epoch": 1.2848616386326641, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5753, "step": 3700 }, { "epoch": 1.2865979381443298, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.555, "step": 3705 }, { "epoch": 1.2883342376559956, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5462, "step": 3710 }, { "epoch": 1.2900705371676615, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5345, "step": 3715 }, { "epoch": 1.2918068366793272, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5211, "step": 3720 }, { "epoch": 1.2935431361909928, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5173, "step": 3725 }, { "epoch": 1.2952794357026587, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.508, "step": 3730 }, { "epoch": 1.2970157352143246, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.6064, "step": 3735 }, { "epoch": 1.2987520347259902, "grad_norm": 0.083984375, "learning_rate": 0.0001, "loss": 0.5432, "step": 3740 }, { "epoch": 1.3004883342376559, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5654, "step": 3745 }, { "epoch": 1.3022246337493217, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5506, "step": 3750 }, { "epoch": 1.3039609332609876, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5412, "step": 3755 }, { "epoch": 1.3056972327726533, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5289, "step": 3760 }, { "epoch": 1.307433532284319, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5344, "step": 3765 }, { "epoch": 1.3091698317959848, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5394, "step": 3770 }, { "epoch": 1.3109061313076507, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.516, "step": 3775 }, { "epoch": 1.3126424308193163, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5243, "step": 3780 }, { "epoch": 1.314378730330982, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.6128, "step": 3785 }, { "epoch": 1.3161150298426478, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5888, "step": 3790 }, { "epoch": 1.3178513293543137, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.5755, "step": 3795 }, { "epoch": 1.3195876288659794, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5551, "step": 3800 }, { "epoch": 1.321323928377645, "grad_norm": 0.11083984375, "learning_rate": 0.0001, "loss": 0.5296, "step": 3805 }, { "epoch": 1.3230602278893109, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5562, "step": 3810 }, { "epoch": 1.3247965274009768, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5339, "step": 3815 }, { "epoch": 1.3265328269126424, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.526, "step": 3820 }, { "epoch": 1.328269126424308, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5213, "step": 3825 }, { "epoch": 1.330005425935974, "grad_norm": 0.1064453125, "learning_rate": 0.0001, "loss": 0.4972, "step": 3830 }, { "epoch": 1.3317417254476398, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5998, "step": 3835 }, { "epoch": 1.3334780249593055, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.5902, "step": 3840 }, { "epoch": 1.3352143244709713, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.585, "step": 3845 }, { "epoch": 1.336950623982637, "grad_norm": 0.1162109375, "learning_rate": 0.0001, "loss": 0.5746, "step": 3850 }, { "epoch": 1.3386869234943028, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.5543, "step": 3855 }, { "epoch": 1.3404232230059685, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5311, "step": 3860 }, { "epoch": 1.3421595225176344, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5246, "step": 3865 }, { "epoch": 1.3438958220293, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5152, "step": 3870 }, { "epoch": 1.345632121540966, "grad_norm": 0.11376953125, "learning_rate": 0.0001, "loss": 0.5267, "step": 3875 }, { "epoch": 1.3473684210526315, "grad_norm": 0.1162109375, "learning_rate": 0.0001, "loss": 0.5042, "step": 3880 }, { "epoch": 1.3491047205642974, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.6091, "step": 3885 }, { "epoch": 1.350841020075963, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5629, "step": 3890 }, { "epoch": 1.352577319587629, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5481, "step": 3895 }, { "epoch": 1.3543136190992946, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5349, "step": 3900 }, { "epoch": 1.3560499186109605, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5352, "step": 3905 }, { "epoch": 1.3577862181226261, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5529, "step": 3910 }, { "epoch": 1.359522517634292, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.55, "step": 3915 }, { "epoch": 1.3612588171459576, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5221, "step": 3920 }, { "epoch": 1.3629951166576235, "grad_norm": 0.291015625, "learning_rate": 0.0001, "loss": 0.5193, "step": 3925 }, { "epoch": 1.3647314161692892, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.5119, "step": 3930 }, { "epoch": 1.366467715680955, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5839, "step": 3935 }, { "epoch": 1.3682040151926207, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.5602, "step": 3940 }, { "epoch": 1.3699403147042866, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5468, "step": 3945 }, { "epoch": 1.3716766142159522, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.548, "step": 3950 }, { "epoch": 1.373412913727618, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5601, "step": 3955 }, { "epoch": 1.3751492132392837, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.5371, "step": 3960 }, { "epoch": 1.3768855127509496, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5431, "step": 3965 }, { "epoch": 1.3786218122626153, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5277, "step": 3970 }, { "epoch": 1.3803581117742811, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.5221, "step": 3975 }, { "epoch": 1.3820944112859468, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.5064, "step": 3980 }, { "epoch": 1.3838307107976127, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.6115, "step": 3985 }, { "epoch": 1.3855670103092783, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.573, "step": 3990 }, { "epoch": 1.3873033098209442, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.585, "step": 3995 }, { "epoch": 1.3890396093326098, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5566, "step": 4000 }, { "epoch": 1.3907759088442757, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5365, "step": 4005 }, { "epoch": 1.3925122083559414, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5477, "step": 4010 }, { "epoch": 1.3942485078676072, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5232, "step": 4015 }, { "epoch": 1.3959848073792729, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5557, "step": 4020 }, { "epoch": 1.3977211068909388, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5138, "step": 4025 }, { "epoch": 1.3994574064026044, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.5254, "step": 4030 }, { "epoch": 1.4011937059142703, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.604, "step": 4035 }, { "epoch": 1.402930005425936, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5583, "step": 4040 }, { "epoch": 1.4046663049376018, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.5735, "step": 4045 }, { "epoch": 1.4064026044492675, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5454, "step": 4050 }, { "epoch": 1.4081389039609333, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5569, "step": 4055 }, { "epoch": 1.409875203472599, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5459, "step": 4060 }, { "epoch": 1.4116115029842649, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5285, "step": 4065 }, { "epoch": 1.4133478024959305, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.5373, "step": 4070 }, { "epoch": 1.4150841020075964, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5274, "step": 4075 }, { "epoch": 1.416820401519262, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.4974, "step": 4080 }, { "epoch": 1.418556701030928, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.5757, "step": 4085 }, { "epoch": 1.4202930005425936, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5569, "step": 4090 }, { "epoch": 1.4220293000542594, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.5585, "step": 4095 }, { "epoch": 1.423765599565925, "grad_norm": 0.4375, "learning_rate": 0.0001, "loss": 0.5621, "step": 4100 }, { "epoch": 1.425501899077591, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.5391, "step": 4105 }, { "epoch": 1.4272381985892566, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5445, "step": 4110 }, { "epoch": 1.4289744981009225, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5327, "step": 4115 }, { "epoch": 1.4307107976125881, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.5027, "step": 4120 }, { "epoch": 1.432447097124254, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.5147, "step": 4125 }, { "epoch": 1.4341833966359196, "grad_norm": 0.10888671875, "learning_rate": 0.0001, "loss": 0.5243, "step": 4130 }, { "epoch": 1.4359196961475855, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5844, "step": 4135 }, { "epoch": 1.4376559956592512, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5563, "step": 4140 }, { "epoch": 1.439392295170917, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5539, "step": 4145 }, { "epoch": 1.4411285946825827, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.539, "step": 4150 }, { "epoch": 1.4428648941942486, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.539, "step": 4155 }, { "epoch": 1.4446011937059142, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5496, "step": 4160 }, { "epoch": 1.44633749321758, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.5393, "step": 4165 }, { "epoch": 1.4480737927292457, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.5233, "step": 4170 }, { "epoch": 1.4498100922409116, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5267, "step": 4175 }, { "epoch": 1.4515463917525773, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.529, "step": 4180 }, { "epoch": 1.4532826912642431, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.6023, "step": 4185 }, { "epoch": 1.4550189907759088, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5811, "step": 4190 }, { "epoch": 1.4567552902875747, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.5586, "step": 4195 }, { "epoch": 1.4584915897992403, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5704, "step": 4200 }, { "epoch": 1.4602278893109062, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5394, "step": 4205 }, { "epoch": 1.4619641888225718, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.5105, "step": 4210 }, { "epoch": 1.4637004883342377, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5327, "step": 4215 }, { "epoch": 1.4654367878459034, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5387, "step": 4220 }, { "epoch": 1.4671730873575692, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.518, "step": 4225 }, { "epoch": 1.468909386869235, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5103, "step": 4230 }, { "epoch": 1.4706456863809008, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.588, "step": 4235 }, { "epoch": 1.4723819858925664, "grad_norm": 0.10888671875, "learning_rate": 0.0001, "loss": 0.5676, "step": 4240 }, { "epoch": 1.4741182854042323, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5664, "step": 4245 }, { "epoch": 1.475854584915898, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.5609, "step": 4250 }, { "epoch": 1.4775908844275638, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.5587, "step": 4255 }, { "epoch": 1.4793271839392295, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5505, "step": 4260 }, { "epoch": 1.4810634834508953, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.5338, "step": 4265 }, { "epoch": 1.482799782962561, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.5416, "step": 4270 }, { "epoch": 1.4845360824742269, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5411, "step": 4275 }, { "epoch": 1.4862723819858925, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.5019, "step": 4280 }, { "epoch": 1.4880086814975584, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5715, "step": 4285 }, { "epoch": 1.489744981009224, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5747, "step": 4290 }, { "epoch": 1.49148128052089, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5858, "step": 4295 }, { "epoch": 1.4932175800325556, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5528, "step": 4300 }, { "epoch": 1.4949538795442214, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5545, "step": 4305 }, { "epoch": 1.496690179055887, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.5409, "step": 4310 }, { "epoch": 1.498426478567553, "grad_norm": 0.103515625, "learning_rate": 0.0001, "loss": 0.5582, "step": 4315 }, { "epoch": 1.5001627780792188, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.5204, "step": 4320 }, { "epoch": 1.5018990775908845, "grad_norm": 0.1083984375, "learning_rate": 0.0001, "loss": 0.5273, "step": 4325 }, { "epoch": 1.5036353771025501, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.5047, "step": 4330 }, { "epoch": 1.505371676614216, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.5932, "step": 4335 }, { "epoch": 1.5071079761258819, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.5626, "step": 4340 }, { "epoch": 1.5088442756375475, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5594, "step": 4345 }, { "epoch": 1.5105805751492132, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.5529, "step": 4350 }, { "epoch": 1.512316874660879, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.5572, "step": 4355 }, { "epoch": 1.514053174172545, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5454, "step": 4360 }, { "epoch": 1.5157894736842106, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.5368, "step": 4365 }, { "epoch": 1.5175257731958762, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.5221, "step": 4370 }, { "epoch": 1.519262072707542, "grad_norm": 0.10791015625, "learning_rate": 0.0001, "loss": 0.5348, "step": 4375 }, { "epoch": 1.520998372219208, "grad_norm": 0.10693359375, "learning_rate": 0.0001, "loss": 0.5132, "step": 4380 }, { "epoch": 1.5227346717308736, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.5832, "step": 4385 }, { "epoch": 1.5244709712425393, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5648, "step": 4390 }, { "epoch": 1.5262072707542051, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.568, "step": 4395 }, { "epoch": 1.527943570265871, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.539, "step": 4400 }, { "epoch": 1.5296798697775367, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5463, "step": 4405 }, { "epoch": 1.5314161692892023, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.5238, "step": 4410 }, { "epoch": 1.5331524688008682, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5254, "step": 4415 }, { "epoch": 1.534888768312534, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.535, "step": 4420 }, { "epoch": 1.5366250678241997, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5234, "step": 4425 }, { "epoch": 1.5383613673358654, "grad_norm": 0.1025390625, "learning_rate": 0.0001, "loss": 0.4957, "step": 4430 }, { "epoch": 1.5400976668475312, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.5816, "step": 4435 }, { "epoch": 1.5418339663591971, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.5775, "step": 4440 }, { "epoch": 1.5435702658708628, "grad_norm": 0.09228515625, "learning_rate": 0.0001, "loss": 0.5528, "step": 4445 }, { "epoch": 1.5453065653825284, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.5616, "step": 4450 }, { "epoch": 1.5470428648941943, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.5281, "step": 4455 }, { "epoch": 1.5487791644058602, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5411, "step": 4460 }, { "epoch": 1.5505154639175258, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.4934, "step": 4465 }, { "epoch": 1.5522517634291915, "grad_norm": 0.0986328125, "learning_rate": 0.0001, "loss": 0.5168, "step": 4470 }, { "epoch": 1.5539880629408573, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.5091, "step": 4475 }, { "epoch": 1.5557243624525232, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.5026, "step": 4480 }, { "epoch": 1.5574606619641889, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.6009, "step": 4485 }, { "epoch": 1.5591969614758545, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.5593, "step": 4490 }, { "epoch": 1.5609332609875204, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.5694, "step": 4495 }, { "epoch": 1.5626695604991863, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.5424, "step": 4500 }, { "epoch": 1.5626695604991863, "step": 4500, "total_flos": 4.510419270260736e+18, "train_loss": 0.5749385200606452, "train_runtime": 199956.6124, "train_samples_per_second": 1.44, "train_steps_per_second": 0.023 } ], "logging_steps": 5, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.510419270260736e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }