diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7123 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3665823527255398, + "eval_steps": 1000, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003665823527255398, + "grad_norm": 0.8778485808644284, + "learning_rate": 2e-07, + "loss": 2.1465, + "step": 10 + }, + { + "epoch": 0.0007331647054510796, + "grad_norm": 1.0370696683685088, + "learning_rate": 4e-07, + "loss": 2.1972, + "step": 20 + }, + { + "epoch": 0.0010997470581766194, + "grad_norm": 1.0006676078231553, + "learning_rate": 6e-07, + "loss": 2.1582, + "step": 30 + }, + { + "epoch": 0.0014663294109021592, + "grad_norm": 0.8854477289760336, + "learning_rate": 8e-07, + "loss": 2.1934, + "step": 40 + }, + { + "epoch": 0.001832911763627699, + "grad_norm": 0.8999727006888211, + "learning_rate": 1e-06, + "loss": 2.1904, + "step": 50 + }, + { + "epoch": 0.002199494116353239, + "grad_norm": 0.932364223152173, + "learning_rate": 9.999996672053607e-07, + "loss": 2.1706, + "step": 60 + }, + { + "epoch": 0.0025660764690787785, + "grad_norm": 1.0299012086021375, + "learning_rate": 9.999986688218858e-07, + "loss": 2.1958, + "step": 70 + }, + { + "epoch": 0.0029326588218043185, + "grad_norm": 0.9395158606106717, + "learning_rate": 9.999970048509042e-07, + "loss": 2.2273, + "step": 80 + }, + { + "epoch": 0.003299241174529858, + "grad_norm": 0.9869960358591985, + "learning_rate": 9.999946752946311e-07, + "loss": 2.1807, + "step": 90 + }, + { + "epoch": 0.003665823527255398, + "grad_norm": 0.98825421384792, + "learning_rate": 9.999916801561675e-07, + "loss": 2.1348, + "step": 100 + }, + { + "epoch": 0.004032405879980938, + "grad_norm": 1.1988395000442367, + "learning_rate": 9.999880194395004e-07, + "loss": 2.1377, + "step": 110 + }, + { + "epoch": 0.004398988232706478, + "grad_norm": 1.129064025809237, + "learning_rate": 9.99983693149503e-07, + "loss": 2.1565, + "step": 120 + }, + { + "epoch": 0.004765570585432017, + "grad_norm": 1.0050118479797396, + "learning_rate": 9.999787012919342e-07, + "loss": 2.1701, + "step": 130 + }, + { + "epoch": 0.005132152938157557, + "grad_norm": 0.9232759625522824, + "learning_rate": 9.999730438734393e-07, + "loss": 2.0963, + "step": 140 + }, + { + "epoch": 0.0054987352908830965, + "grad_norm": 1.0348403490845175, + "learning_rate": 9.999667209015492e-07, + "loss": 2.1989, + "step": 150 + }, + { + "epoch": 0.005865317643608637, + "grad_norm": 1.0493408122676058, + "learning_rate": 9.999597323846806e-07, + "loss": 2.1707, + "step": 160 + }, + { + "epoch": 0.0062318999963341766, + "grad_norm": 1.116513730433909, + "learning_rate": 9.99952078332137e-07, + "loss": 2.1614, + "step": 170 + }, + { + "epoch": 0.006598482349059716, + "grad_norm": 0.9558367370618089, + "learning_rate": 9.999437587541072e-07, + "loss": 2.1214, + "step": 180 + }, + { + "epoch": 0.006965064701785256, + "grad_norm": 1.0990453159310916, + "learning_rate": 9.999347736616657e-07, + "loss": 2.1514, + "step": 190 + }, + { + "epoch": 0.007331647054510796, + "grad_norm": 1.051146838955259, + "learning_rate": 9.999251230667734e-07, + "loss": 2.1672, + "step": 200 + }, + { + "epoch": 0.007698229407236336, + "grad_norm": 1.0528334484392676, + "learning_rate": 9.99914806982277e-07, + "loss": 2.1651, + "step": 210 + }, + { + "epoch": 0.008064811759961876, + "grad_norm": 1.0488001209067876, + "learning_rate": 9.999038254219094e-07, + "loss": 2.1269, + "step": 220 + }, + { + "epoch": 0.008431394112687415, + "grad_norm": 1.0423933094923075, + "learning_rate": 9.998921784002884e-07, + "loss": 2.1409, + "step": 230 + }, + { + "epoch": 0.008797976465412955, + "grad_norm": 1.2035163212207243, + "learning_rate": 9.998798659329188e-07, + "loss": 2.0949, + "step": 240 + }, + { + "epoch": 0.009164558818138494, + "grad_norm": 1.0311622443925152, + "learning_rate": 9.998668880361902e-07, + "loss": 2.1572, + "step": 250 + }, + { + "epoch": 0.009531141170864035, + "grad_norm": 1.0199238986570556, + "learning_rate": 9.99853244727379e-07, + "loss": 2.0908, + "step": 260 + }, + { + "epoch": 0.009897723523589575, + "grad_norm": 1.1052910194491554, + "learning_rate": 9.998389360246465e-07, + "loss": 2.1046, + "step": 270 + }, + { + "epoch": 0.010264305876315114, + "grad_norm": 1.0244380828171549, + "learning_rate": 9.998239619470404e-07, + "loss": 2.1351, + "step": 280 + }, + { + "epoch": 0.010630888229040654, + "grad_norm": 1.0080176905815665, + "learning_rate": 9.998083225144936e-07, + "loss": 2.089, + "step": 290 + }, + { + "epoch": 0.010997470581766193, + "grad_norm": 0.9588881775099163, + "learning_rate": 9.997920177478252e-07, + "loss": 2.0186, + "step": 300 + }, + { + "epoch": 0.011364052934491733, + "grad_norm": 1.0223619251237732, + "learning_rate": 9.997750476687394e-07, + "loss": 2.0966, + "step": 310 + }, + { + "epoch": 0.011730635287217274, + "grad_norm": 1.1940399230837102, + "learning_rate": 9.99757412299827e-07, + "loss": 2.1036, + "step": 320 + }, + { + "epoch": 0.012097217639942813, + "grad_norm": 0.9943487033980454, + "learning_rate": 9.997391116645635e-07, + "loss": 2.0628, + "step": 330 + }, + { + "epoch": 0.012463799992668353, + "grad_norm": 1.03891573450971, + "learning_rate": 9.997201457873104e-07, + "loss": 2.0691, + "step": 340 + }, + { + "epoch": 0.012830382345393894, + "grad_norm": 1.116344520158988, + "learning_rate": 9.997005146933144e-07, + "loss": 2.0524, + "step": 350 + }, + { + "epoch": 0.013196964698119432, + "grad_norm": 0.9966017657422209, + "learning_rate": 9.996802184087082e-07, + "loss": 2.0779, + "step": 360 + }, + { + "epoch": 0.013563547050844973, + "grad_norm": 1.0412743923430994, + "learning_rate": 9.996592569605099e-07, + "loss": 2.0376, + "step": 370 + }, + { + "epoch": 0.013930129403570512, + "grad_norm": 1.1118998023014073, + "learning_rate": 9.996376303766227e-07, + "loss": 2.015, + "step": 380 + }, + { + "epoch": 0.014296711756296052, + "grad_norm": 1.0325566872435106, + "learning_rate": 9.996153386858355e-07, + "loss": 2.0249, + "step": 390 + }, + { + "epoch": 0.014663294109021592, + "grad_norm": 0.9345504257678122, + "learning_rate": 9.995923819178226e-07, + "loss": 2.0451, + "step": 400 + }, + { + "epoch": 0.015029876461747131, + "grad_norm": 0.8875269101106378, + "learning_rate": 9.995687601031435e-07, + "loss": 2.0108, + "step": 410 + }, + { + "epoch": 0.015396458814472672, + "grad_norm": 1.0784341870798066, + "learning_rate": 9.99544473273243e-07, + "loss": 2.0201, + "step": 420 + }, + { + "epoch": 0.015763041167198212, + "grad_norm": 0.9379135038421763, + "learning_rate": 9.995195214604515e-07, + "loss": 1.941, + "step": 430 + }, + { + "epoch": 0.016129623519923753, + "grad_norm": 0.9126909079244707, + "learning_rate": 9.994939046979838e-07, + "loss": 1.9684, + "step": 440 + }, + { + "epoch": 0.01649620587264929, + "grad_norm": 0.8838022442791796, + "learning_rate": 9.994676230199407e-07, + "loss": 2.0389, + "step": 450 + }, + { + "epoch": 0.01686278822537483, + "grad_norm": 0.8836839199930503, + "learning_rate": 9.994406764613082e-07, + "loss": 1.9666, + "step": 460 + }, + { + "epoch": 0.01722937057810037, + "grad_norm": 1.0627568898996331, + "learning_rate": 9.994130650579563e-07, + "loss": 2.0156, + "step": 470 + }, + { + "epoch": 0.01759595293082591, + "grad_norm": 0.9141641845780258, + "learning_rate": 9.993847888466408e-07, + "loss": 1.9649, + "step": 480 + }, + { + "epoch": 0.01796253528355145, + "grad_norm": 0.9929808622960486, + "learning_rate": 9.993558478650027e-07, + "loss": 1.951, + "step": 490 + }, + { + "epoch": 0.01832911763627699, + "grad_norm": 0.9649106649125109, + "learning_rate": 9.993262421515677e-07, + "loss": 2.0194, + "step": 500 + }, + { + "epoch": 0.01869569998900253, + "grad_norm": 0.9646184299435382, + "learning_rate": 9.992959717457456e-07, + "loss": 2.0054, + "step": 510 + }, + { + "epoch": 0.01906228234172807, + "grad_norm": 0.9754107205971403, + "learning_rate": 9.992650366878326e-07, + "loss": 1.9614, + "step": 520 + }, + { + "epoch": 0.01942886469445361, + "grad_norm": 0.825876663123403, + "learning_rate": 9.99233437019008e-07, + "loss": 2.0141, + "step": 530 + }, + { + "epoch": 0.01979544704717915, + "grad_norm": 0.9898145517539251, + "learning_rate": 9.992011727813372e-07, + "loss": 1.9788, + "step": 540 + }, + { + "epoch": 0.020162029399904687, + "grad_norm": 1.1244188599069105, + "learning_rate": 9.991682440177694e-07, + "loss": 1.9034, + "step": 550 + }, + { + "epoch": 0.020528611752630228, + "grad_norm": 1.1497344942569774, + "learning_rate": 9.991346507721387e-07, + "loss": 1.9211, + "step": 560 + }, + { + "epoch": 0.020895194105355768, + "grad_norm": 0.9021316458842555, + "learning_rate": 9.991003930891637e-07, + "loss": 1.9182, + "step": 570 + }, + { + "epoch": 0.02126177645808131, + "grad_norm": 0.8307709564470201, + "learning_rate": 9.990654710144475e-07, + "loss": 1.9272, + "step": 580 + }, + { + "epoch": 0.02162835881080685, + "grad_norm": 0.8745951617052735, + "learning_rate": 9.990298845944777e-07, + "loss": 1.9499, + "step": 590 + }, + { + "epoch": 0.021994941163532386, + "grad_norm": 0.8243921045085457, + "learning_rate": 9.98993633876626e-07, + "loss": 1.9221, + "step": 600 + }, + { + "epoch": 0.022361523516257927, + "grad_norm": 0.9285168979863858, + "learning_rate": 9.989567189091486e-07, + "loss": 1.8804, + "step": 610 + }, + { + "epoch": 0.022728105868983467, + "grad_norm": 0.9675998606348684, + "learning_rate": 9.98919139741186e-07, + "loss": 1.9019, + "step": 620 + }, + { + "epoch": 0.023094688221709007, + "grad_norm": 0.8852104273861887, + "learning_rate": 9.988808964227629e-07, + "loss": 1.8772, + "step": 630 + }, + { + "epoch": 0.023461270574434548, + "grad_norm": 0.819719680853091, + "learning_rate": 9.988419890047877e-07, + "loss": 1.9171, + "step": 640 + }, + { + "epoch": 0.023827852927160085, + "grad_norm": 0.93140794342887, + "learning_rate": 9.988024175390533e-07, + "loss": 1.8467, + "step": 650 + }, + { + "epoch": 0.024194435279885625, + "grad_norm": 0.8360802933834758, + "learning_rate": 9.987621820782363e-07, + "loss": 1.9233, + "step": 660 + }, + { + "epoch": 0.024561017632611166, + "grad_norm": 0.8157180427592693, + "learning_rate": 9.987212826758975e-07, + "loss": 1.9473, + "step": 670 + }, + { + "epoch": 0.024927599985336706, + "grad_norm": 0.9793002573948607, + "learning_rate": 9.98679719386481e-07, + "loss": 1.8931, + "step": 680 + }, + { + "epoch": 0.025294182338062247, + "grad_norm": 0.8445420197840301, + "learning_rate": 9.986374922653154e-07, + "loss": 1.8686, + "step": 690 + }, + { + "epoch": 0.025660764690787787, + "grad_norm": 0.8584605142905422, + "learning_rate": 9.985946013686119e-07, + "loss": 1.8967, + "step": 700 + }, + { + "epoch": 0.026027347043513324, + "grad_norm": 0.98656156834715, + "learning_rate": 9.985510467534664e-07, + "loss": 1.8635, + "step": 710 + }, + { + "epoch": 0.026393929396238865, + "grad_norm": 0.9182458113746159, + "learning_rate": 9.985068284778577e-07, + "loss": 1.8693, + "step": 720 + }, + { + "epoch": 0.026760511748964405, + "grad_norm": 0.8330989668660308, + "learning_rate": 9.984619466006485e-07, + "loss": 1.8613, + "step": 730 + }, + { + "epoch": 0.027127094101689946, + "grad_norm": 0.8644736624360776, + "learning_rate": 9.98416401181584e-07, + "loss": 1.8628, + "step": 740 + }, + { + "epoch": 0.027493676454415486, + "grad_norm": 0.987168924150431, + "learning_rate": 9.98370192281294e-07, + "loss": 1.8943, + "step": 750 + }, + { + "epoch": 0.027860258807141023, + "grad_norm": 0.8720418625775509, + "learning_rate": 9.983233199612903e-07, + "loss": 1.9446, + "step": 760 + }, + { + "epoch": 0.028226841159866563, + "grad_norm": 0.7953663245922279, + "learning_rate": 9.982757842839687e-07, + "loss": 1.9014, + "step": 770 + }, + { + "epoch": 0.028593423512592104, + "grad_norm": 0.9296681817326182, + "learning_rate": 9.98227585312607e-07, + "loss": 1.8108, + "step": 780 + }, + { + "epoch": 0.028960005865317644, + "grad_norm": 0.8062000633701384, + "learning_rate": 9.981787231113675e-07, + "loss": 1.8345, + "step": 790 + }, + { + "epoch": 0.029326588218043185, + "grad_norm": 0.7938194156111642, + "learning_rate": 9.981291977452939e-07, + "loss": 1.8941, + "step": 800 + }, + { + "epoch": 0.029693170570768722, + "grad_norm": 0.9291321405470028, + "learning_rate": 9.980790092803135e-07, + "loss": 1.8403, + "step": 810 + }, + { + "epoch": 0.030059752923494262, + "grad_norm": 0.8275423223500764, + "learning_rate": 9.980281577832363e-07, + "loss": 1.8402, + "step": 820 + }, + { + "epoch": 0.030426335276219803, + "grad_norm": 0.8980283349268403, + "learning_rate": 9.979766433217545e-07, + "loss": 1.8691, + "step": 830 + }, + { + "epoch": 0.030792917628945343, + "grad_norm": 0.7768796883189981, + "learning_rate": 9.979244659644429e-07, + "loss": 1.888, + "step": 840 + }, + { + "epoch": 0.031159499981670884, + "grad_norm": 0.818398169635764, + "learning_rate": 9.978716257807593e-07, + "loss": 1.8814, + "step": 850 + }, + { + "epoch": 0.031526082334396424, + "grad_norm": 0.8442121417280394, + "learning_rate": 9.97818122841043e-07, + "loss": 1.8369, + "step": 860 + }, + { + "epoch": 0.031892664687121965, + "grad_norm": 0.8176757534156489, + "learning_rate": 9.977639572165162e-07, + "loss": 1.8591, + "step": 870 + }, + { + "epoch": 0.032259247039847505, + "grad_norm": 0.8029579269470367, + "learning_rate": 9.97709128979283e-07, + "loss": 1.8866, + "step": 880 + }, + { + "epoch": 0.03262582939257304, + "grad_norm": 0.8812915944662771, + "learning_rate": 9.976536382023294e-07, + "loss": 1.8366, + "step": 890 + }, + { + "epoch": 0.03299241174529858, + "grad_norm": 0.777876054228082, + "learning_rate": 9.97597484959524e-07, + "loss": 1.8322, + "step": 900 + }, + { + "epoch": 0.03335899409802412, + "grad_norm": 0.9073927568433396, + "learning_rate": 9.975406693256162e-07, + "loss": 1.8238, + "step": 910 + }, + { + "epoch": 0.03372557645074966, + "grad_norm": 1.154230547383887, + "learning_rate": 9.974831913762382e-07, + "loss": 1.8574, + "step": 920 + }, + { + "epoch": 0.0340921588034752, + "grad_norm": 0.8196714978615802, + "learning_rate": 9.974250511879031e-07, + "loss": 1.8423, + "step": 930 + }, + { + "epoch": 0.03445874115620074, + "grad_norm": 0.9288752746341313, + "learning_rate": 9.97366248838006e-07, + "loss": 1.8993, + "step": 940 + }, + { + "epoch": 0.03482532350892628, + "grad_norm": 0.7950657259868453, + "learning_rate": 9.973067844048235e-07, + "loss": 1.8741, + "step": 950 + }, + { + "epoch": 0.03519190586165182, + "grad_norm": 0.796086365915343, + "learning_rate": 9.972466579675131e-07, + "loss": 1.7832, + "step": 960 + }, + { + "epoch": 0.03555848821437736, + "grad_norm": 0.9066172708399791, + "learning_rate": 9.97185869606114e-07, + "loss": 1.8462, + "step": 970 + }, + { + "epoch": 0.0359250705671029, + "grad_norm": 1.038083569499433, + "learning_rate": 9.971244194015463e-07, + "loss": 1.858, + "step": 980 + }, + { + "epoch": 0.036291652919828436, + "grad_norm": 0.9051533251684815, + "learning_rate": 9.97062307435611e-07, + "loss": 1.8387, + "step": 990 + }, + { + "epoch": 0.03665823527255398, + "grad_norm": 0.8381523935993735, + "learning_rate": 9.969995337909908e-07, + "loss": 1.8361, + "step": 1000 + }, + { + "epoch": 0.03665823527255398, + "eval_accuracy": 0.5988169778677517, + "eval_loss": 1.8318405151367188, + "eval_runtime": 308.5555, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 0.894, + "step": 1000 + }, + { + "epoch": 0.03702481762527952, + "grad_norm": 0.8427628207388767, + "learning_rate": 9.969360985512478e-07, + "loss": 1.8265, + "step": 1010 + }, + { + "epoch": 0.03739139997800506, + "grad_norm": 0.8552215254960128, + "learning_rate": 9.968720018008264e-07, + "loss": 1.858, + "step": 1020 + }, + { + "epoch": 0.0377579823307306, + "grad_norm": 0.9770990446912831, + "learning_rate": 9.968072436250502e-07, + "loss": 1.8336, + "step": 1030 + }, + { + "epoch": 0.03812456468345614, + "grad_norm": 0.8749109462328284, + "learning_rate": 9.967418241101245e-07, + "loss": 1.8659, + "step": 1040 + }, + { + "epoch": 0.03849114703618168, + "grad_norm": 1.0370092544039358, + "learning_rate": 9.966757433431338e-07, + "loss": 1.7817, + "step": 1050 + }, + { + "epoch": 0.03885772938890722, + "grad_norm": 0.9115228378829131, + "learning_rate": 9.966090014120439e-07, + "loss": 1.8024, + "step": 1060 + }, + { + "epoch": 0.03922431174163276, + "grad_norm": 0.8868427346212977, + "learning_rate": 9.965415984056998e-07, + "loss": 1.8437, + "step": 1070 + }, + { + "epoch": 0.0395908940943583, + "grad_norm": 0.9053364161480404, + "learning_rate": 9.96473534413827e-07, + "loss": 1.817, + "step": 1080 + }, + { + "epoch": 0.039957476447083834, + "grad_norm": 0.9133195528454671, + "learning_rate": 9.964048095270312e-07, + "loss": 1.7877, + "step": 1090 + }, + { + "epoch": 0.040324058799809374, + "grad_norm": 1.0646101033232054, + "learning_rate": 9.963354238367971e-07, + "loss": 1.784, + "step": 1100 + }, + { + "epoch": 0.040690641152534915, + "grad_norm": 0.7708104862115812, + "learning_rate": 9.962653774354897e-07, + "loss": 1.8534, + "step": 1110 + }, + { + "epoch": 0.041057223505260455, + "grad_norm": 0.8675790148592712, + "learning_rate": 9.96194670416353e-07, + "loss": 1.8549, + "step": 1120 + }, + { + "epoch": 0.041423805857985996, + "grad_norm": 0.8417668918121122, + "learning_rate": 9.961233028735107e-07, + "loss": 1.816, + "step": 1130 + }, + { + "epoch": 0.041790388210711536, + "grad_norm": 0.8168288703880237, + "learning_rate": 9.960512749019661e-07, + "loss": 1.8512, + "step": 1140 + }, + { + "epoch": 0.04215697056343708, + "grad_norm": 0.8018545416660454, + "learning_rate": 9.95978586597601e-07, + "loss": 1.832, + "step": 1150 + }, + { + "epoch": 0.04252355291616262, + "grad_norm": 0.9865966895727584, + "learning_rate": 9.959052380571764e-07, + "loss": 1.853, + "step": 1160 + }, + { + "epoch": 0.04289013526888816, + "grad_norm": 0.8107907928839149, + "learning_rate": 9.958312293783327e-07, + "loss": 1.85, + "step": 1170 + }, + { + "epoch": 0.0432567176216137, + "grad_norm": 0.9230676080344427, + "learning_rate": 9.957565606595882e-07, + "loss": 1.7839, + "step": 1180 + }, + { + "epoch": 0.04362329997433924, + "grad_norm": 0.9011134249108275, + "learning_rate": 9.956812320003407e-07, + "loss": 1.7649, + "step": 1190 + }, + { + "epoch": 0.04398988232706477, + "grad_norm": 0.8877055310067349, + "learning_rate": 9.956052435008657e-07, + "loss": 1.8358, + "step": 1200 + }, + { + "epoch": 0.04435646467979031, + "grad_norm": 0.9441745533847735, + "learning_rate": 9.955285952623177e-07, + "loss": 1.8217, + "step": 1210 + }, + { + "epoch": 0.04472304703251585, + "grad_norm": 0.9280531244485228, + "learning_rate": 9.954512873867292e-07, + "loss": 1.8273, + "step": 1220 + }, + { + "epoch": 0.04508962938524139, + "grad_norm": 1.0733510489183336, + "learning_rate": 9.95373319977011e-07, + "loss": 1.8289, + "step": 1230 + }, + { + "epoch": 0.045456211737966934, + "grad_norm": 0.9194393203848475, + "learning_rate": 9.952946931369512e-07, + "loss": 1.8134, + "step": 1240 + }, + { + "epoch": 0.045822794090692474, + "grad_norm": 0.8924651164337065, + "learning_rate": 9.952154069712164e-07, + "loss": 1.8233, + "step": 1250 + }, + { + "epoch": 0.046189376443418015, + "grad_norm": 0.9645620934573451, + "learning_rate": 9.951354615853506e-07, + "loss": 1.7951, + "step": 1260 + }, + { + "epoch": 0.046555958796143555, + "grad_norm": 0.9514951845878826, + "learning_rate": 9.950548570857755e-07, + "loss": 1.8034, + "step": 1270 + }, + { + "epoch": 0.046922541148869096, + "grad_norm": 1.0861848487934576, + "learning_rate": 9.949735935797898e-07, + "loss": 1.7845, + "step": 1280 + }, + { + "epoch": 0.047289123501594636, + "grad_norm": 0.9444165617124335, + "learning_rate": 9.948916711755702e-07, + "loss": 1.8499, + "step": 1290 + }, + { + "epoch": 0.04765570585432017, + "grad_norm": 0.9296489213610688, + "learning_rate": 9.948090899821695e-07, + "loss": 1.8362, + "step": 1300 + }, + { + "epoch": 0.04802228820704571, + "grad_norm": 0.9031404187157595, + "learning_rate": 9.947258501095183e-07, + "loss": 1.7987, + "step": 1310 + }, + { + "epoch": 0.04838887055977125, + "grad_norm": 0.9893576898507132, + "learning_rate": 9.946419516684238e-07, + "loss": 1.7901, + "step": 1320 + }, + { + "epoch": 0.04875545291249679, + "grad_norm": 0.8312432281714202, + "learning_rate": 9.945573947705696e-07, + "loss": 1.7877, + "step": 1330 + }, + { + "epoch": 0.04912203526522233, + "grad_norm": 0.9503234488792208, + "learning_rate": 9.944721795285161e-07, + "loss": 1.7814, + "step": 1340 + }, + { + "epoch": 0.04948861761794787, + "grad_norm": 0.8138144516056374, + "learning_rate": 9.943863060557e-07, + "loss": 1.7973, + "step": 1350 + }, + { + "epoch": 0.04985519997067341, + "grad_norm": 1.0236050868655204, + "learning_rate": 9.942997744664346e-07, + "loss": 1.766, + "step": 1360 + }, + { + "epoch": 0.05022178232339895, + "grad_norm": 0.8876253030811799, + "learning_rate": 9.942125848759084e-07, + "loss": 1.8025, + "step": 1370 + }, + { + "epoch": 0.05058836467612449, + "grad_norm": 0.9143837255426513, + "learning_rate": 9.941247374001864e-07, + "loss": 1.8256, + "step": 1380 + }, + { + "epoch": 0.050954947028850034, + "grad_norm": 0.7919956208916636, + "learning_rate": 9.940362321562095e-07, + "loss": 1.7966, + "step": 1390 + }, + { + "epoch": 0.051321529381575574, + "grad_norm": 0.9593927463945575, + "learning_rate": 9.939470692617936e-07, + "loss": 1.756, + "step": 1400 + }, + { + "epoch": 0.05168811173430111, + "grad_norm": 1.0264148022637987, + "learning_rate": 9.938572488356309e-07, + "loss": 1.7938, + "step": 1410 + }, + { + "epoch": 0.05205469408702665, + "grad_norm": 1.0694910008156386, + "learning_rate": 9.937667709972882e-07, + "loss": 1.7151, + "step": 1420 + }, + { + "epoch": 0.05242127643975219, + "grad_norm": 1.106949179035861, + "learning_rate": 9.936756358672075e-07, + "loss": 1.7566, + "step": 1430 + }, + { + "epoch": 0.05278785879247773, + "grad_norm": 0.8484995009187619, + "learning_rate": 9.935838435667062e-07, + "loss": 1.8061, + "step": 1440 + }, + { + "epoch": 0.05315444114520327, + "grad_norm": 0.9442924790988804, + "learning_rate": 9.93491394217976e-07, + "loss": 1.7938, + "step": 1450 + }, + { + "epoch": 0.05352102349792881, + "grad_norm": 0.8835040984395444, + "learning_rate": 9.933982879440838e-07, + "loss": 1.7801, + "step": 1460 + }, + { + "epoch": 0.05388760585065435, + "grad_norm": 0.951681021528121, + "learning_rate": 9.933045248689704e-07, + "loss": 1.7839, + "step": 1470 + }, + { + "epoch": 0.05425418820337989, + "grad_norm": 0.8986214443009446, + "learning_rate": 9.932101051174513e-07, + "loss": 1.8251, + "step": 1480 + }, + { + "epoch": 0.05462077055610543, + "grad_norm": 0.8136477078651573, + "learning_rate": 9.93115028815216e-07, + "loss": 1.8429, + "step": 1490 + }, + { + "epoch": 0.05498735290883097, + "grad_norm": 1.0031260237221131, + "learning_rate": 9.93019296088828e-07, + "loss": 1.7663, + "step": 1500 + }, + { + "epoch": 0.055353935261556506, + "grad_norm": 0.9959012828848206, + "learning_rate": 9.92922907065725e-07, + "loss": 1.8269, + "step": 1510 + }, + { + "epoch": 0.055720517614282046, + "grad_norm": 0.8915575658825868, + "learning_rate": 9.928258618742176e-07, + "loss": 1.7696, + "step": 1520 + }, + { + "epoch": 0.056087099967007586, + "grad_norm": 0.9963782636445598, + "learning_rate": 9.927281606434902e-07, + "loss": 1.7738, + "step": 1530 + }, + { + "epoch": 0.05645368231973313, + "grad_norm": 0.9381564546633785, + "learning_rate": 9.92629803503601e-07, + "loss": 1.7333, + "step": 1540 + }, + { + "epoch": 0.05682026467245867, + "grad_norm": 1.0017202007335113, + "learning_rate": 9.925307905854807e-07, + "loss": 1.8095, + "step": 1550 + }, + { + "epoch": 0.05718684702518421, + "grad_norm": 1.0543725728983615, + "learning_rate": 9.924311220209332e-07, + "loss": 1.7571, + "step": 1560 + }, + { + "epoch": 0.05755342937790975, + "grad_norm": 1.0455383232236297, + "learning_rate": 9.92330797942635e-07, + "loss": 1.7605, + "step": 1570 + }, + { + "epoch": 0.05792001173063529, + "grad_norm": 0.8416991518569622, + "learning_rate": 9.922298184841356e-07, + "loss": 1.7703, + "step": 1580 + }, + { + "epoch": 0.05828659408336083, + "grad_norm": 0.92044213042727, + "learning_rate": 9.921281837798565e-07, + "loss": 1.7051, + "step": 1590 + }, + { + "epoch": 0.05865317643608637, + "grad_norm": 0.9422384532621354, + "learning_rate": 9.920258939650918e-07, + "loss": 1.7882, + "step": 1600 + }, + { + "epoch": 0.0590197587888119, + "grad_norm": 1.1464397608985724, + "learning_rate": 9.919229491760074e-07, + "loss": 1.7504, + "step": 1610 + }, + { + "epoch": 0.059386341141537444, + "grad_norm": 1.1503410560007548, + "learning_rate": 9.918193495496411e-07, + "loss": 1.7755, + "step": 1620 + }, + { + "epoch": 0.059752923494262984, + "grad_norm": 1.034854775422536, + "learning_rate": 9.917150952239028e-07, + "loss": 1.8109, + "step": 1630 + }, + { + "epoch": 0.060119505846988525, + "grad_norm": 0.9357240877838402, + "learning_rate": 9.916101863375734e-07, + "loss": 1.812, + "step": 1640 + }, + { + "epoch": 0.060486088199714065, + "grad_norm": 1.2613406348730127, + "learning_rate": 9.915046230303055e-07, + "loss": 1.7299, + "step": 1650 + }, + { + "epoch": 0.060852670552439606, + "grad_norm": 0.991269818479319, + "learning_rate": 9.913984054426226e-07, + "loss": 1.6839, + "step": 1660 + }, + { + "epoch": 0.061219252905165146, + "grad_norm": 1.0426302229265827, + "learning_rate": 9.91291533715919e-07, + "loss": 1.6983, + "step": 1670 + }, + { + "epoch": 0.061585835257890686, + "grad_norm": 1.0623577818006307, + "learning_rate": 9.911840079924607e-07, + "loss": 1.7586, + "step": 1680 + }, + { + "epoch": 0.06195241761061623, + "grad_norm": 0.9792793493189645, + "learning_rate": 9.910758284153834e-07, + "loss": 1.7863, + "step": 1690 + }, + { + "epoch": 0.06231899996334177, + "grad_norm": 1.1013133546227525, + "learning_rate": 9.90966995128693e-07, + "loss": 1.7586, + "step": 1700 + }, + { + "epoch": 0.0626855823160673, + "grad_norm": 1.2653001609685381, + "learning_rate": 9.908575082772664e-07, + "loss": 1.7087, + "step": 1710 + }, + { + "epoch": 0.06305216466879285, + "grad_norm": 1.2600949114865185, + "learning_rate": 9.907473680068501e-07, + "loss": 1.6974, + "step": 1720 + }, + { + "epoch": 0.06341874702151838, + "grad_norm": 1.0352843166386823, + "learning_rate": 9.906365744640605e-07, + "loss": 1.7247, + "step": 1730 + }, + { + "epoch": 0.06378532937424393, + "grad_norm": 1.0534586823177523, + "learning_rate": 9.905251277963838e-07, + "loss": 1.7989, + "step": 1740 + }, + { + "epoch": 0.06415191172696946, + "grad_norm": 1.0901888662447625, + "learning_rate": 9.904130281521749e-07, + "loss": 1.7495, + "step": 1750 + }, + { + "epoch": 0.06451849407969501, + "grad_norm": 1.0657237836075932, + "learning_rate": 9.903002756806589e-07, + "loss": 1.7393, + "step": 1760 + }, + { + "epoch": 0.06488507643242054, + "grad_norm": 1.0695629454280169, + "learning_rate": 9.901868705319291e-07, + "loss": 1.784, + "step": 1770 + }, + { + "epoch": 0.06525165878514608, + "grad_norm": 0.9206279700392275, + "learning_rate": 9.900728128569482e-07, + "loss": 1.758, + "step": 1780 + }, + { + "epoch": 0.06561824113787162, + "grad_norm": 1.0410164391482535, + "learning_rate": 9.899581028075473e-07, + "loss": 1.7252, + "step": 1790 + }, + { + "epoch": 0.06598482349059716, + "grad_norm": 0.9377493357256449, + "learning_rate": 9.898427405364262e-07, + "loss": 1.74, + "step": 1800 + }, + { + "epoch": 0.0663514058433227, + "grad_norm": 1.1272971880737597, + "learning_rate": 9.897267261971524e-07, + "loss": 1.7524, + "step": 1810 + }, + { + "epoch": 0.06671798819604824, + "grad_norm": 1.0979559562270786, + "learning_rate": 9.896100599441618e-07, + "loss": 1.6988, + "step": 1820 + }, + { + "epoch": 0.06708457054877379, + "grad_norm": 0.961855276743755, + "learning_rate": 9.894927419327576e-07, + "loss": 1.7327, + "step": 1830 + }, + { + "epoch": 0.06745115290149932, + "grad_norm": 0.97235897562474, + "learning_rate": 9.893747723191118e-07, + "loss": 1.7544, + "step": 1840 + }, + { + "epoch": 0.06781773525422487, + "grad_norm": 1.1764451813427488, + "learning_rate": 9.892561512602626e-07, + "loss": 1.7616, + "step": 1850 + }, + { + "epoch": 0.0681843176069504, + "grad_norm": 0.9690232157285822, + "learning_rate": 9.891368789141158e-07, + "loss": 1.7386, + "step": 1860 + }, + { + "epoch": 0.06855089995967593, + "grad_norm": 1.131145797735988, + "learning_rate": 9.89016955439444e-07, + "loss": 1.7473, + "step": 1870 + }, + { + "epoch": 0.06891748231240148, + "grad_norm": 1.1996910697441496, + "learning_rate": 9.88896380995887e-07, + "loss": 1.7502, + "step": 1880 + }, + { + "epoch": 0.06928406466512702, + "grad_norm": 1.2280647210603344, + "learning_rate": 9.887751557439513e-07, + "loss": 1.7547, + "step": 1890 + }, + { + "epoch": 0.06965064701785256, + "grad_norm": 1.0705375351848956, + "learning_rate": 9.886532798450085e-07, + "loss": 1.7577, + "step": 1900 + }, + { + "epoch": 0.0700172293705781, + "grad_norm": 1.0083918166967278, + "learning_rate": 9.88530753461298e-07, + "loss": 1.7193, + "step": 1910 + }, + { + "epoch": 0.07038381172330364, + "grad_norm": 1.0053388433251793, + "learning_rate": 9.884075767559236e-07, + "loss": 1.7635, + "step": 1920 + }, + { + "epoch": 0.07075039407602918, + "grad_norm": 1.1405257537860627, + "learning_rate": 9.88283749892856e-07, + "loss": 1.7859, + "step": 1930 + }, + { + "epoch": 0.07111697642875472, + "grad_norm": 1.3872222978621402, + "learning_rate": 9.881592730369305e-07, + "loss": 1.6823, + "step": 1940 + }, + { + "epoch": 0.07148355878148026, + "grad_norm": 1.0500974949147595, + "learning_rate": 9.880341463538483e-07, + "loss": 1.7268, + "step": 1950 + }, + { + "epoch": 0.0718501411342058, + "grad_norm": 1.1146107157958263, + "learning_rate": 9.879083700101754e-07, + "loss": 1.7324, + "step": 1960 + }, + { + "epoch": 0.07221672348693134, + "grad_norm": 1.0782444093138666, + "learning_rate": 9.877819441733421e-07, + "loss": 1.7219, + "step": 1970 + }, + { + "epoch": 0.07258330583965687, + "grad_norm": 1.1066515564824118, + "learning_rate": 9.876548690116443e-07, + "loss": 1.6974, + "step": 1980 + }, + { + "epoch": 0.07294988819238242, + "grad_norm": 1.0551270004207765, + "learning_rate": 9.875271446942416e-07, + "loss": 1.7086, + "step": 1990 + }, + { + "epoch": 0.07331647054510795, + "grad_norm": 1.0172022580059552, + "learning_rate": 9.873987713911579e-07, + "loss": 1.7281, + "step": 2000 + }, + { + "epoch": 0.07331647054510795, + "eval_accuracy": 0.6153943652920695, + "eval_loss": 1.7325148582458496, + "eval_runtime": 307.9034, + "eval_samples_per_second": 10.74, + "eval_steps_per_second": 0.896, + "step": 2000 + }, + { + "epoch": 0.0736830528978335, + "grad_norm": 1.0319650415221862, + "learning_rate": 9.872697492732805e-07, + "loss": 1.699, + "step": 2010 + }, + { + "epoch": 0.07404963525055903, + "grad_norm": 0.9982774529316707, + "learning_rate": 9.871400785123615e-07, + "loss": 1.7476, + "step": 2020 + }, + { + "epoch": 0.07441621760328458, + "grad_norm": 1.1272779709424325, + "learning_rate": 9.870097592810156e-07, + "loss": 1.7911, + "step": 2030 + }, + { + "epoch": 0.07478279995601012, + "grad_norm": 1.0356947186293473, + "learning_rate": 9.86878791752721e-07, + "loss": 1.7038, + "step": 2040 + }, + { + "epoch": 0.07514938230873566, + "grad_norm": 0.9227271241300935, + "learning_rate": 9.867471761018187e-07, + "loss": 1.789, + "step": 2050 + }, + { + "epoch": 0.0755159646614612, + "grad_norm": 1.1484518524699514, + "learning_rate": 9.86614912503513e-07, + "loss": 1.7706, + "step": 2060 + }, + { + "epoch": 0.07588254701418674, + "grad_norm": 0.8955923870076745, + "learning_rate": 9.864820011338698e-07, + "loss": 1.7543, + "step": 2070 + }, + { + "epoch": 0.07624912936691228, + "grad_norm": 1.1335067807492596, + "learning_rate": 9.863484421698182e-07, + "loss": 1.7155, + "step": 2080 + }, + { + "epoch": 0.07661571171963781, + "grad_norm": 1.1784649675887455, + "learning_rate": 9.86214235789149e-07, + "loss": 1.7198, + "step": 2090 + }, + { + "epoch": 0.07698229407236336, + "grad_norm": 0.9990776315852751, + "learning_rate": 9.860793821705153e-07, + "loss": 1.7088, + "step": 2100 + }, + { + "epoch": 0.07734887642508889, + "grad_norm": 1.8933737366748618, + "learning_rate": 9.859438814934306e-07, + "loss": 1.7815, + "step": 2110 + }, + { + "epoch": 0.07771545877781444, + "grad_norm": 1.0824373033670114, + "learning_rate": 9.858077339382708e-07, + "loss": 1.7056, + "step": 2120 + }, + { + "epoch": 0.07808204113053997, + "grad_norm": 1.0459040499217758, + "learning_rate": 9.856709396862727e-07, + "loss": 1.7587, + "step": 2130 + }, + { + "epoch": 0.07844862348326552, + "grad_norm": 1.1273027866420589, + "learning_rate": 9.855334989195338e-07, + "loss": 1.6718, + "step": 2140 + }, + { + "epoch": 0.07881520583599105, + "grad_norm": 1.1216307142085522, + "learning_rate": 9.853954118210124e-07, + "loss": 1.6925, + "step": 2150 + }, + { + "epoch": 0.0791817881887166, + "grad_norm": 1.2320479842440668, + "learning_rate": 9.852566785745269e-07, + "loss": 1.7128, + "step": 2160 + }, + { + "epoch": 0.07954837054144213, + "grad_norm": 1.0679388999130817, + "learning_rate": 9.851172993647562e-07, + "loss": 1.7063, + "step": 2170 + }, + { + "epoch": 0.07991495289416767, + "grad_norm": 1.2733808120999472, + "learning_rate": 9.849772743772387e-07, + "loss": 1.69, + "step": 2180 + }, + { + "epoch": 0.08028153524689322, + "grad_norm": 1.240045987921097, + "learning_rate": 9.848366037983728e-07, + "loss": 1.7382, + "step": 2190 + }, + { + "epoch": 0.08064811759961875, + "grad_norm": 1.0370629833579919, + "learning_rate": 9.846952878154162e-07, + "loss": 1.7135, + "step": 2200 + }, + { + "epoch": 0.0810146999523443, + "grad_norm": 1.1809158590474762, + "learning_rate": 9.845533266164856e-07, + "loss": 1.7197, + "step": 2210 + }, + { + "epoch": 0.08138128230506983, + "grad_norm": 1.0143562772242192, + "learning_rate": 9.844107203905567e-07, + "loss": 1.7062, + "step": 2220 + }, + { + "epoch": 0.08174786465779538, + "grad_norm": 1.1841441026483928, + "learning_rate": 9.842674693274639e-07, + "loss": 1.6766, + "step": 2230 + }, + { + "epoch": 0.08211444701052091, + "grad_norm": 1.1281564379658906, + "learning_rate": 9.841235736179e-07, + "loss": 1.6485, + "step": 2240 + }, + { + "epoch": 0.08248102936324646, + "grad_norm": 1.2660731034162191, + "learning_rate": 9.83979033453416e-07, + "loss": 1.7513, + "step": 2250 + }, + { + "epoch": 0.08284761171597199, + "grad_norm": 1.1670722746985231, + "learning_rate": 9.8383384902642e-07, + "loss": 1.7282, + "step": 2260 + }, + { + "epoch": 0.08321419406869754, + "grad_norm": 1.1924698170354644, + "learning_rate": 9.836880205301795e-07, + "loss": 1.7339, + "step": 2270 + }, + { + "epoch": 0.08358077642142307, + "grad_norm": 1.0522491790203259, + "learning_rate": 9.835415481588173e-07, + "loss": 1.6907, + "step": 2280 + }, + { + "epoch": 0.0839473587741486, + "grad_norm": 1.1650865835189006, + "learning_rate": 9.83394432107315e-07, + "loss": 1.718, + "step": 2290 + }, + { + "epoch": 0.08431394112687415, + "grad_norm": 0.9881537861019963, + "learning_rate": 9.832466725715097e-07, + "loss": 1.7423, + "step": 2300 + }, + { + "epoch": 0.08468052347959969, + "grad_norm": 1.0843420992658444, + "learning_rate": 9.830982697480958e-07, + "loss": 1.7112, + "step": 2310 + }, + { + "epoch": 0.08504710583232523, + "grad_norm": 1.1947303847486304, + "learning_rate": 9.829492238346244e-07, + "loss": 1.6813, + "step": 2320 + }, + { + "epoch": 0.08541368818505077, + "grad_norm": 1.04336555772043, + "learning_rate": 9.82799535029502e-07, + "loss": 1.6871, + "step": 2330 + }, + { + "epoch": 0.08578027053777632, + "grad_norm": 1.3465243494238373, + "learning_rate": 9.826492035319911e-07, + "loss": 1.7358, + "step": 2340 + }, + { + "epoch": 0.08614685289050185, + "grad_norm": 1.1173189734449491, + "learning_rate": 9.824982295422097e-07, + "loss": 1.7047, + "step": 2350 + }, + { + "epoch": 0.0865134352432274, + "grad_norm": 1.2520018391632697, + "learning_rate": 9.823466132611313e-07, + "loss": 1.6984, + "step": 2360 + }, + { + "epoch": 0.08688001759595293, + "grad_norm": 1.03470369404529, + "learning_rate": 9.82194354890584e-07, + "loss": 1.7278, + "step": 2370 + }, + { + "epoch": 0.08724659994867848, + "grad_norm": 1.0164204083388344, + "learning_rate": 9.820414546332513e-07, + "loss": 1.7458, + "step": 2380 + }, + { + "epoch": 0.08761318230140401, + "grad_norm": 1.2348821126024987, + "learning_rate": 9.818879126926701e-07, + "loss": 1.7343, + "step": 2390 + }, + { + "epoch": 0.08797976465412954, + "grad_norm": 1.0011105767660962, + "learning_rate": 9.817337292732328e-07, + "loss": 1.7131, + "step": 2400 + }, + { + "epoch": 0.08834634700685509, + "grad_norm": 1.0710762717577924, + "learning_rate": 9.815789045801847e-07, + "loss": 1.6617, + "step": 2410 + }, + { + "epoch": 0.08871292935958063, + "grad_norm": 1.1055970569118785, + "learning_rate": 9.814234388196252e-07, + "loss": 1.758, + "step": 2420 + }, + { + "epoch": 0.08907951171230617, + "grad_norm": 1.013594052614807, + "learning_rate": 9.81267332198507e-07, + "loss": 1.6906, + "step": 2430 + }, + { + "epoch": 0.0894460940650317, + "grad_norm": 1.0649424099545044, + "learning_rate": 9.811105849246359e-07, + "loss": 1.6896, + "step": 2440 + }, + { + "epoch": 0.08981267641775725, + "grad_norm": 1.7084885584877294, + "learning_rate": 9.809531972066705e-07, + "loss": 1.6614, + "step": 2450 + }, + { + "epoch": 0.09017925877048279, + "grad_norm": 1.5758236147361129, + "learning_rate": 9.807951692541217e-07, + "loss": 1.6952, + "step": 2460 + }, + { + "epoch": 0.09054584112320833, + "grad_norm": 1.3585874981966901, + "learning_rate": 9.806365012773532e-07, + "loss": 1.7113, + "step": 2470 + }, + { + "epoch": 0.09091242347593387, + "grad_norm": 1.3061869321513975, + "learning_rate": 9.804771934875807e-07, + "loss": 1.6796, + "step": 2480 + }, + { + "epoch": 0.09127900582865942, + "grad_norm": 1.1540286110201206, + "learning_rate": 9.803172460968705e-07, + "loss": 1.7097, + "step": 2490 + }, + { + "epoch": 0.09164558818138495, + "grad_norm": 1.2915686809771951, + "learning_rate": 9.80156659318142e-07, + "loss": 1.7138, + "step": 2500 + }, + { + "epoch": 0.09201217053411048, + "grad_norm": 1.1468908768097306, + "learning_rate": 9.799954333651642e-07, + "loss": 1.7038, + "step": 2510 + }, + { + "epoch": 0.09237875288683603, + "grad_norm": 1.257655656482852, + "learning_rate": 9.79833568452558e-07, + "loss": 1.677, + "step": 2520 + }, + { + "epoch": 0.09274533523956156, + "grad_norm": 1.6361492549326027, + "learning_rate": 9.796710647957944e-07, + "loss": 1.6155, + "step": 2530 + }, + { + "epoch": 0.09311191759228711, + "grad_norm": 1.1505717408841072, + "learning_rate": 9.795079226111949e-07, + "loss": 1.6811, + "step": 2540 + }, + { + "epoch": 0.09347849994501264, + "grad_norm": 1.1983166183129195, + "learning_rate": 9.793441421159308e-07, + "loss": 1.7203, + "step": 2550 + }, + { + "epoch": 0.09384508229773819, + "grad_norm": 1.1985818933727272, + "learning_rate": 9.79179723528023e-07, + "loss": 1.7232, + "step": 2560 + }, + { + "epoch": 0.09421166465046373, + "grad_norm": 1.0143700528752713, + "learning_rate": 9.790146670663422e-07, + "loss": 1.6916, + "step": 2570 + }, + { + "epoch": 0.09457824700318927, + "grad_norm": 1.121117592417204, + "learning_rate": 9.788489729506082e-07, + "loss": 1.6683, + "step": 2580 + }, + { + "epoch": 0.0949448293559148, + "grad_norm": 1.339002521581536, + "learning_rate": 9.78682641401389e-07, + "loss": 1.6622, + "step": 2590 + }, + { + "epoch": 0.09531141170864034, + "grad_norm": 1.1212646774920143, + "learning_rate": 9.785156726401019e-07, + "loss": 1.687, + "step": 2600 + }, + { + "epoch": 0.09567799406136589, + "grad_norm": 1.2061879994547406, + "learning_rate": 9.78348066889012e-07, + "loss": 1.6652, + "step": 2610 + }, + { + "epoch": 0.09604457641409142, + "grad_norm": 1.225185884537581, + "learning_rate": 9.781798243712326e-07, + "loss": 1.6948, + "step": 2620 + }, + { + "epoch": 0.09641115876681697, + "grad_norm": 1.0146497215382635, + "learning_rate": 9.780109453107245e-07, + "loss": 1.7009, + "step": 2630 + }, + { + "epoch": 0.0967777411195425, + "grad_norm": 1.2171300466801498, + "learning_rate": 9.77841429932296e-07, + "loss": 1.7087, + "step": 2640 + }, + { + "epoch": 0.09714432347226805, + "grad_norm": 1.0629828650910798, + "learning_rate": 9.77671278461602e-07, + "loss": 1.7316, + "step": 2650 + }, + { + "epoch": 0.09751090582499358, + "grad_norm": 1.1754432625786018, + "learning_rate": 9.775004911251448e-07, + "loss": 1.6953, + "step": 2660 + }, + { + "epoch": 0.09787748817771913, + "grad_norm": 1.3069724342535498, + "learning_rate": 9.773290681502727e-07, + "loss": 1.7057, + "step": 2670 + }, + { + "epoch": 0.09824407053044466, + "grad_norm": 1.3314679455466842, + "learning_rate": 9.7715700976518e-07, + "loss": 1.6842, + "step": 2680 + }, + { + "epoch": 0.09861065288317021, + "grad_norm": 1.3928937247531508, + "learning_rate": 9.769843161989079e-07, + "loss": 1.7052, + "step": 2690 + }, + { + "epoch": 0.09897723523589574, + "grad_norm": 1.3389115391442472, + "learning_rate": 9.768109876813417e-07, + "loss": 1.6905, + "step": 2700 + }, + { + "epoch": 0.09934381758862128, + "grad_norm": 1.2854315608533564, + "learning_rate": 9.76637024443213e-07, + "loss": 1.6806, + "step": 2710 + }, + { + "epoch": 0.09971039994134683, + "grad_norm": 1.24293956575573, + "learning_rate": 9.764624267160975e-07, + "loss": 1.6922, + "step": 2720 + }, + { + "epoch": 0.10007698229407236, + "grad_norm": 1.2809307536658918, + "learning_rate": 9.762871947324165e-07, + "loss": 1.7001, + "step": 2730 + }, + { + "epoch": 0.1004435646467979, + "grad_norm": 1.1615070632030087, + "learning_rate": 9.761113287254345e-07, + "loss": 1.6747, + "step": 2740 + }, + { + "epoch": 0.10081014699952344, + "grad_norm": 1.245140216818738, + "learning_rate": 9.75934828929261e-07, + "loss": 1.6469, + "step": 2750 + }, + { + "epoch": 0.10117672935224899, + "grad_norm": 1.152316966014997, + "learning_rate": 9.757576955788486e-07, + "loss": 1.6773, + "step": 2760 + }, + { + "epoch": 0.10154331170497452, + "grad_norm": 1.1064605629765938, + "learning_rate": 9.755799289099932e-07, + "loss": 1.6447, + "step": 2770 + }, + { + "epoch": 0.10190989405770007, + "grad_norm": 1.1150499110452152, + "learning_rate": 9.754015291593343e-07, + "loss": 1.7168, + "step": 2780 + }, + { + "epoch": 0.1022764764104256, + "grad_norm": 1.3016769905995789, + "learning_rate": 9.752224965643536e-07, + "loss": 1.7209, + "step": 2790 + }, + { + "epoch": 0.10264305876315115, + "grad_norm": 1.332321427009131, + "learning_rate": 9.750428313633757e-07, + "loss": 1.6247, + "step": 2800 + }, + { + "epoch": 0.10300964111587668, + "grad_norm": 1.311092146207188, + "learning_rate": 9.748625337955667e-07, + "loss": 1.6366, + "step": 2810 + }, + { + "epoch": 0.10337622346860222, + "grad_norm": 1.1634742047900515, + "learning_rate": 9.746816041009351e-07, + "loss": 1.7143, + "step": 2820 + }, + { + "epoch": 0.10374280582132776, + "grad_norm": 1.1916284602740692, + "learning_rate": 9.745000425203307e-07, + "loss": 1.6568, + "step": 2830 + }, + { + "epoch": 0.1041093881740533, + "grad_norm": 1.280571751055567, + "learning_rate": 9.743178492954442e-07, + "loss": 1.6303, + "step": 2840 + }, + { + "epoch": 0.10447597052677884, + "grad_norm": 1.3621017517970784, + "learning_rate": 9.741350246688076e-07, + "loss": 1.7569, + "step": 2850 + }, + { + "epoch": 0.10484255287950438, + "grad_norm": 1.1019913075705825, + "learning_rate": 9.739515688837927e-07, + "loss": 1.6934, + "step": 2860 + }, + { + "epoch": 0.10520913523222993, + "grad_norm": 1.3868159647800968, + "learning_rate": 9.73767482184612e-07, + "loss": 1.6267, + "step": 2870 + }, + { + "epoch": 0.10557571758495546, + "grad_norm": 1.4881189853618986, + "learning_rate": 9.73582764816318e-07, + "loss": 1.7354, + "step": 2880 + }, + { + "epoch": 0.105942299937681, + "grad_norm": 1.5118948532986631, + "learning_rate": 9.733974170248025e-07, + "loss": 1.6856, + "step": 2890 + }, + { + "epoch": 0.10630888229040654, + "grad_norm": 1.4796154680218983, + "learning_rate": 9.732114390567963e-07, + "loss": 1.7045, + "step": 2900 + }, + { + "epoch": 0.10667546464313207, + "grad_norm": 1.2560441338500297, + "learning_rate": 9.730248311598694e-07, + "loss": 1.6466, + "step": 2910 + }, + { + "epoch": 0.10704204699585762, + "grad_norm": 1.1595828068992133, + "learning_rate": 9.728375935824301e-07, + "loss": 1.6822, + "step": 2920 + }, + { + "epoch": 0.10740862934858315, + "grad_norm": 1.3126146065763922, + "learning_rate": 9.726497265737252e-07, + "loss": 1.6723, + "step": 2930 + }, + { + "epoch": 0.1077752117013087, + "grad_norm": 1.2296488317137073, + "learning_rate": 9.724612303838393e-07, + "loss": 1.6647, + "step": 2940 + }, + { + "epoch": 0.10814179405403423, + "grad_norm": 1.170972623285309, + "learning_rate": 9.722721052636944e-07, + "loss": 1.6955, + "step": 2950 + }, + { + "epoch": 0.10850837640675978, + "grad_norm": 1.2633141406462256, + "learning_rate": 9.720823514650495e-07, + "loss": 1.6332, + "step": 2960 + }, + { + "epoch": 0.10887495875948532, + "grad_norm": 1.2911934178837097, + "learning_rate": 9.718919692405014e-07, + "loss": 1.7218, + "step": 2970 + }, + { + "epoch": 0.10924154111221086, + "grad_norm": 1.1657180939495957, + "learning_rate": 9.717009588434822e-07, + "loss": 1.6067, + "step": 2980 + }, + { + "epoch": 0.1096081234649364, + "grad_norm": 1.239214562886889, + "learning_rate": 9.715093205282615e-07, + "loss": 1.7067, + "step": 2990 + }, + { + "epoch": 0.10997470581766194, + "grad_norm": 1.3619661984646028, + "learning_rate": 9.713170545499435e-07, + "loss": 1.6978, + "step": 3000 + }, + { + "epoch": 0.10997470581766194, + "eval_accuracy": 0.6262376782115725, + "eval_loss": 1.6762739419937134, + "eval_runtime": 309.1255, + "eval_samples_per_second": 10.698, + "eval_steps_per_second": 0.893, + "step": 3000 + }, + { + "epoch": 0.11034128817038748, + "grad_norm": 1.2670499181513593, + "learning_rate": 9.711241611644688e-07, + "loss": 1.677, + "step": 3010 + }, + { + "epoch": 0.11070787052311301, + "grad_norm": 1.2403940254412753, + "learning_rate": 9.709306406286129e-07, + "loss": 1.6604, + "step": 3020 + }, + { + "epoch": 0.11107445287583856, + "grad_norm": 1.3312898520587448, + "learning_rate": 9.707364931999864e-07, + "loss": 1.6867, + "step": 3030 + }, + { + "epoch": 0.11144103522856409, + "grad_norm": 1.3495930407749666, + "learning_rate": 9.70541719137034e-07, + "loss": 1.6617, + "step": 3040 + }, + { + "epoch": 0.11180761758128964, + "grad_norm": 1.1396532709110236, + "learning_rate": 9.703463186990346e-07, + "loss": 1.7035, + "step": 3050 + }, + { + "epoch": 0.11217419993401517, + "grad_norm": 1.2231802562577823, + "learning_rate": 9.701502921461013e-07, + "loss": 1.6723, + "step": 3060 + }, + { + "epoch": 0.11254078228674072, + "grad_norm": 1.3403523967021675, + "learning_rate": 9.699536397391806e-07, + "loss": 1.6698, + "step": 3070 + }, + { + "epoch": 0.11290736463946625, + "grad_norm": 1.3447918453958256, + "learning_rate": 9.697563617400516e-07, + "loss": 1.6716, + "step": 3080 + }, + { + "epoch": 0.1132739469921918, + "grad_norm": 1.2969348535087712, + "learning_rate": 9.695584584113267e-07, + "loss": 1.6949, + "step": 3090 + }, + { + "epoch": 0.11364052934491733, + "grad_norm": 1.1643584556065927, + "learning_rate": 9.693599300164508e-07, + "loss": 1.6713, + "step": 3100 + }, + { + "epoch": 0.11400711169764288, + "grad_norm": 1.2242377804664155, + "learning_rate": 9.691607768197002e-07, + "loss": 1.6386, + "step": 3110 + }, + { + "epoch": 0.11437369405036842, + "grad_norm": 1.319822492671326, + "learning_rate": 9.689609990861837e-07, + "loss": 1.6816, + "step": 3120 + }, + { + "epoch": 0.11474027640309395, + "grad_norm": 1.3781452196212938, + "learning_rate": 9.687605970818408e-07, + "loss": 1.6784, + "step": 3130 + }, + { + "epoch": 0.1151068587558195, + "grad_norm": 1.2168088100404522, + "learning_rate": 9.68559571073443e-07, + "loss": 1.6982, + "step": 3140 + }, + { + "epoch": 0.11547344110854503, + "grad_norm": 1.4540401524570652, + "learning_rate": 9.68357921328591e-07, + "loss": 1.6718, + "step": 3150 + }, + { + "epoch": 0.11584002346127058, + "grad_norm": 1.3143498063269197, + "learning_rate": 9.681556481157171e-07, + "loss": 1.6709, + "step": 3160 + }, + { + "epoch": 0.11620660581399611, + "grad_norm": 1.1946622719420839, + "learning_rate": 9.679527517040831e-07, + "loss": 1.6747, + "step": 3170 + }, + { + "epoch": 0.11657318816672166, + "grad_norm": 1.286257203814063, + "learning_rate": 9.6774923236378e-07, + "loss": 1.699, + "step": 3180 + }, + { + "epoch": 0.11693977051944719, + "grad_norm": 1.3969179686751765, + "learning_rate": 9.675450903657286e-07, + "loss": 1.6228, + "step": 3190 + }, + { + "epoch": 0.11730635287217274, + "grad_norm": 1.1607892230894732, + "learning_rate": 9.673403259816787e-07, + "loss": 1.6538, + "step": 3200 + }, + { + "epoch": 0.11767293522489827, + "grad_norm": 1.4009629932701972, + "learning_rate": 9.671349394842075e-07, + "loss": 1.6401, + "step": 3210 + }, + { + "epoch": 0.1180395175776238, + "grad_norm": 1.5024706182569632, + "learning_rate": 9.669289311467216e-07, + "loss": 1.6508, + "step": 3220 + }, + { + "epoch": 0.11840609993034935, + "grad_norm": 1.9466998313668968, + "learning_rate": 9.66722301243455e-07, + "loss": 1.6662, + "step": 3230 + }, + { + "epoch": 0.11877268228307489, + "grad_norm": 1.6928758946763174, + "learning_rate": 9.665150500494686e-07, + "loss": 1.681, + "step": 3240 + }, + { + "epoch": 0.11913926463580043, + "grad_norm": 1.5050927792757436, + "learning_rate": 9.66307177840651e-07, + "loss": 1.6669, + "step": 3250 + }, + { + "epoch": 0.11950584698852597, + "grad_norm": 1.179067981511082, + "learning_rate": 9.66098684893717e-07, + "loss": 1.6503, + "step": 3260 + }, + { + "epoch": 0.11987242934125152, + "grad_norm": 1.7279906281142485, + "learning_rate": 9.658895714862082e-07, + "loss": 1.6331, + "step": 3270 + }, + { + "epoch": 0.12023901169397705, + "grad_norm": 1.1891919657193728, + "learning_rate": 9.656798378964918e-07, + "loss": 1.6111, + "step": 3280 + }, + { + "epoch": 0.1206055940467026, + "grad_norm": 1.7749941957068498, + "learning_rate": 9.654694844037607e-07, + "loss": 1.666, + "step": 3290 + }, + { + "epoch": 0.12097217639942813, + "grad_norm": 1.5093366351881725, + "learning_rate": 9.65258511288033e-07, + "loss": 1.6569, + "step": 3300 + }, + { + "epoch": 0.12133875875215368, + "grad_norm": 1.2872309950824516, + "learning_rate": 9.650469188301512e-07, + "loss": 1.6697, + "step": 3310 + }, + { + "epoch": 0.12170534110487921, + "grad_norm": 1.2299002535631731, + "learning_rate": 9.648347073117832e-07, + "loss": 1.6413, + "step": 3320 + }, + { + "epoch": 0.12207192345760474, + "grad_norm": 1.407253463937065, + "learning_rate": 9.6462187701542e-07, + "loss": 1.6757, + "step": 3330 + }, + { + "epoch": 0.12243850581033029, + "grad_norm": 1.166071729039829, + "learning_rate": 9.644084282243768e-07, + "loss": 1.6654, + "step": 3340 + }, + { + "epoch": 0.12280508816305583, + "grad_norm": 1.558952263125209, + "learning_rate": 9.641943612227921e-07, + "loss": 1.6807, + "step": 3350 + }, + { + "epoch": 0.12317167051578137, + "grad_norm": 1.3374281457093373, + "learning_rate": 9.639796762956276e-07, + "loss": 1.6664, + "step": 3360 + }, + { + "epoch": 0.1235382528685069, + "grad_norm": 1.1902844247942133, + "learning_rate": 9.637643737286667e-07, + "loss": 1.6914, + "step": 3370 + }, + { + "epoch": 0.12390483522123245, + "grad_norm": 1.2998133772041194, + "learning_rate": 9.63548453808516e-07, + "loss": 1.7112, + "step": 3380 + }, + { + "epoch": 0.12427141757395799, + "grad_norm": 1.3162405748836254, + "learning_rate": 9.633319168226036e-07, + "loss": 1.6936, + "step": 3390 + }, + { + "epoch": 0.12463799992668353, + "grad_norm": 1.3677758198871173, + "learning_rate": 9.631147630591782e-07, + "loss": 1.6883, + "step": 3400 + }, + { + "epoch": 0.12500458227940908, + "grad_norm": 1.2054292111865461, + "learning_rate": 9.62896992807311e-07, + "loss": 1.6576, + "step": 3410 + }, + { + "epoch": 0.1253711646321346, + "grad_norm": 1.156101638091166, + "learning_rate": 9.626786063568925e-07, + "loss": 1.6667, + "step": 3420 + }, + { + "epoch": 0.12573774698486015, + "grad_norm": 1.3745543808654352, + "learning_rate": 9.624596039986343e-07, + "loss": 1.6712, + "step": 3430 + }, + { + "epoch": 0.1261043293375857, + "grad_norm": 1.178401890967186, + "learning_rate": 9.622399860240679e-07, + "loss": 1.6474, + "step": 3440 + }, + { + "epoch": 0.12647091169031122, + "grad_norm": 1.4332376083467566, + "learning_rate": 9.620197527255436e-07, + "loss": 1.6655, + "step": 3450 + }, + { + "epoch": 0.12683749404303676, + "grad_norm": 1.2402171846377348, + "learning_rate": 9.617989043962315e-07, + "loss": 1.6349, + "step": 3460 + }, + { + "epoch": 0.1272040763957623, + "grad_norm": 1.1586534075249035, + "learning_rate": 9.615774413301201e-07, + "loss": 1.6514, + "step": 3470 + }, + { + "epoch": 0.12757065874848786, + "grad_norm": 1.3594354851138566, + "learning_rate": 9.613553638220162e-07, + "loss": 1.6516, + "step": 3480 + }, + { + "epoch": 0.12793724110121338, + "grad_norm": 1.6613648157437189, + "learning_rate": 9.611326721675447e-07, + "loss": 1.6111, + "step": 3490 + }, + { + "epoch": 0.12830382345393893, + "grad_norm": 1.1659314128590663, + "learning_rate": 9.60909366663148e-07, + "loss": 1.6144, + "step": 3500 + }, + { + "epoch": 0.12867040580666447, + "grad_norm": 1.3825427999836462, + "learning_rate": 9.606854476060858e-07, + "loss": 1.6355, + "step": 3510 + }, + { + "epoch": 0.12903698815939002, + "grad_norm": 1.3221664320987678, + "learning_rate": 9.604609152944339e-07, + "loss": 1.6582, + "step": 3520 + }, + { + "epoch": 0.12940357051211554, + "grad_norm": 1.223865417664176, + "learning_rate": 9.602357700270848e-07, + "loss": 1.6629, + "step": 3530 + }, + { + "epoch": 0.1297701528648411, + "grad_norm": 1.2654800350319806, + "learning_rate": 9.600100121037478e-07, + "loss": 1.6746, + "step": 3540 + }, + { + "epoch": 0.13013673521756663, + "grad_norm": 1.5629673478694224, + "learning_rate": 9.597836418249463e-07, + "loss": 1.598, + "step": 3550 + }, + { + "epoch": 0.13050331757029215, + "grad_norm": 1.434783120339992, + "learning_rate": 9.5955665949202e-07, + "loss": 1.6667, + "step": 3560 + }, + { + "epoch": 0.1308698999230177, + "grad_norm": 1.391092196783546, + "learning_rate": 9.593290654071227e-07, + "loss": 1.6533, + "step": 3570 + }, + { + "epoch": 0.13123648227574325, + "grad_norm": 1.4923072292703214, + "learning_rate": 9.591008598732227e-07, + "loss": 1.6742, + "step": 3580 + }, + { + "epoch": 0.1316030646284688, + "grad_norm": 1.313620532521857, + "learning_rate": 9.588720431941024e-07, + "loss": 1.643, + "step": 3590 + }, + { + "epoch": 0.13196964698119432, + "grad_norm": 1.527900388849829, + "learning_rate": 9.586426156743576e-07, + "loss": 1.6466, + "step": 3600 + }, + { + "epoch": 0.13233622933391986, + "grad_norm": 1.3345529937125478, + "learning_rate": 9.584125776193977e-07, + "loss": 1.6242, + "step": 3610 + }, + { + "epoch": 0.1327028116866454, + "grad_norm": 1.1722053149478573, + "learning_rate": 9.581819293354437e-07, + "loss": 1.6361, + "step": 3620 + }, + { + "epoch": 0.13306939403937096, + "grad_norm": 1.448965551365503, + "learning_rate": 9.579506711295303e-07, + "loss": 1.6766, + "step": 3630 + }, + { + "epoch": 0.13343597639209648, + "grad_norm": 1.435539195626326, + "learning_rate": 9.57718803309503e-07, + "loss": 1.6639, + "step": 3640 + }, + { + "epoch": 0.13380255874482203, + "grad_norm": 1.5710598550118229, + "learning_rate": 9.574863261840195e-07, + "loss": 1.6821, + "step": 3650 + }, + { + "epoch": 0.13416914109754757, + "grad_norm": 1.3432388820323078, + "learning_rate": 9.572532400625486e-07, + "loss": 1.6578, + "step": 3660 + }, + { + "epoch": 0.1345357234502731, + "grad_norm": 1.4304292951831412, + "learning_rate": 9.570195452553692e-07, + "loss": 1.6683, + "step": 3670 + }, + { + "epoch": 0.13490230580299864, + "grad_norm": 1.293030659950829, + "learning_rate": 9.567852420735707e-07, + "loss": 1.6712, + "step": 3680 + }, + { + "epoch": 0.1352688881557242, + "grad_norm": 1.5727628914988818, + "learning_rate": 9.565503308290529e-07, + "loss": 1.6362, + "step": 3690 + }, + { + "epoch": 0.13563547050844973, + "grad_norm": 1.6929875598843593, + "learning_rate": 9.56314811834524e-07, + "loss": 1.6734, + "step": 3700 + }, + { + "epoch": 0.13600205286117525, + "grad_norm": 1.5989548687758315, + "learning_rate": 9.560786854035027e-07, + "loss": 1.6449, + "step": 3710 + }, + { + "epoch": 0.1363686352139008, + "grad_norm": 1.5032676879166582, + "learning_rate": 9.558419518503146e-07, + "loss": 1.6572, + "step": 3720 + }, + { + "epoch": 0.13673521756662635, + "grad_norm": 1.4171570128132858, + "learning_rate": 9.55604611490095e-07, + "loss": 1.6084, + "step": 3730 + }, + { + "epoch": 0.13710179991935187, + "grad_norm": 1.445587424899926, + "learning_rate": 9.553666646387859e-07, + "loss": 1.6226, + "step": 3740 + }, + { + "epoch": 0.13746838227207742, + "grad_norm": 1.3746442868420083, + "learning_rate": 9.55128111613137e-07, + "loss": 1.6244, + "step": 3750 + }, + { + "epoch": 0.13783496462480296, + "grad_norm": 1.379515983296158, + "learning_rate": 9.548889527307052e-07, + "loss": 1.6178, + "step": 3760 + }, + { + "epoch": 0.1382015469775285, + "grad_norm": 1.3571114141269711, + "learning_rate": 9.546491883098536e-07, + "loss": 1.6295, + "step": 3770 + }, + { + "epoch": 0.13856812933025403, + "grad_norm": 1.463273179907825, + "learning_rate": 9.544088186697514e-07, + "loss": 1.6252, + "step": 3780 + }, + { + "epoch": 0.13893471168297958, + "grad_norm": 1.409249057690562, + "learning_rate": 9.541678441303736e-07, + "loss": 1.6226, + "step": 3790 + }, + { + "epoch": 0.13930129403570513, + "grad_norm": 1.2549772425250405, + "learning_rate": 9.539262650125003e-07, + "loss": 1.6904, + "step": 3800 + }, + { + "epoch": 0.13966787638843067, + "grad_norm": 1.398529314496367, + "learning_rate": 9.536840816377163e-07, + "loss": 1.641, + "step": 3810 + }, + { + "epoch": 0.1400344587411562, + "grad_norm": 1.4089240361542354, + "learning_rate": 9.534412943284111e-07, + "loss": 1.6749, + "step": 3820 + }, + { + "epoch": 0.14040104109388174, + "grad_norm": 1.2690921990550241, + "learning_rate": 9.53197903407778e-07, + "loss": 1.6483, + "step": 3830 + }, + { + "epoch": 0.1407676234466073, + "grad_norm": 1.443019453596183, + "learning_rate": 9.529539091998138e-07, + "loss": 1.5942, + "step": 3840 + }, + { + "epoch": 0.1411342057993328, + "grad_norm": 1.3973353826502415, + "learning_rate": 9.527093120293179e-07, + "loss": 1.6637, + "step": 3850 + }, + { + "epoch": 0.14150078815205835, + "grad_norm": 1.612241752672322, + "learning_rate": 9.524641122218934e-07, + "loss": 1.6144, + "step": 3860 + }, + { + "epoch": 0.1418673705047839, + "grad_norm": 1.6392078912198202, + "learning_rate": 9.522183101039447e-07, + "loss": 1.599, + "step": 3870 + }, + { + "epoch": 0.14223395285750945, + "grad_norm": 1.3307238721886945, + "learning_rate": 9.519719060026784e-07, + "loss": 1.6692, + "step": 3880 + }, + { + "epoch": 0.14260053521023497, + "grad_norm": 1.3570795255125636, + "learning_rate": 9.517249002461023e-07, + "loss": 1.6871, + "step": 3890 + }, + { + "epoch": 0.14296711756296052, + "grad_norm": 1.4037736413570712, + "learning_rate": 9.514772931630253e-07, + "loss": 1.5922, + "step": 3900 + }, + { + "epoch": 0.14333369991568606, + "grad_norm": 1.6691508908927133, + "learning_rate": 9.512290850830564e-07, + "loss": 1.5939, + "step": 3910 + }, + { + "epoch": 0.1437002822684116, + "grad_norm": 1.2746936442730004, + "learning_rate": 9.509802763366052e-07, + "loss": 1.6376, + "step": 3920 + }, + { + "epoch": 0.14406686462113713, + "grad_norm": 1.7263750991736497, + "learning_rate": 9.507308672548803e-07, + "loss": 1.6251, + "step": 3930 + }, + { + "epoch": 0.14443344697386268, + "grad_norm": 1.6162337099963227, + "learning_rate": 9.504808581698898e-07, + "loss": 1.6855, + "step": 3940 + }, + { + "epoch": 0.14480002932658823, + "grad_norm": 1.4400774058967862, + "learning_rate": 9.502302494144405e-07, + "loss": 1.6688, + "step": 3950 + }, + { + "epoch": 0.14516661167931375, + "grad_norm": 1.4106971014212684, + "learning_rate": 9.499790413221372e-07, + "loss": 1.6212, + "step": 3960 + }, + { + "epoch": 0.1455331940320393, + "grad_norm": 1.549216443416639, + "learning_rate": 9.49727234227383e-07, + "loss": 1.6316, + "step": 3970 + }, + { + "epoch": 0.14589977638476484, + "grad_norm": 1.2499725096259189, + "learning_rate": 9.494748284653779e-07, + "loss": 1.6113, + "step": 3980 + }, + { + "epoch": 0.1462663587374904, + "grad_norm": 1.8429540203762498, + "learning_rate": 9.492218243721192e-07, + "loss": 1.6424, + "step": 3990 + }, + { + "epoch": 0.1466329410902159, + "grad_norm": 1.4097823826329705, + "learning_rate": 9.489682222844004e-07, + "loss": 1.5986, + "step": 4000 + }, + { + "epoch": 0.1466329410902159, + "eval_accuracy": 0.634133690356089, + "eval_loss": 1.6327084302902222, + "eval_runtime": 310.7367, + "eval_samples_per_second": 10.642, + "eval_steps_per_second": 0.888, + "step": 4000 + }, + { + "epoch": 0.14699952344294145, + "grad_norm": 1.4923503061339742, + "learning_rate": 9.487140225398112e-07, + "loss": 1.6354, + "step": 4010 + }, + { + "epoch": 0.147366105795667, + "grad_norm": 1.4794551483340477, + "learning_rate": 9.484592254767368e-07, + "loss": 1.6337, + "step": 4020 + }, + { + "epoch": 0.14773268814839255, + "grad_norm": 1.5712257291796352, + "learning_rate": 9.482038314343577e-07, + "loss": 1.6569, + "step": 4030 + }, + { + "epoch": 0.14809927050111807, + "grad_norm": 1.7977345143090582, + "learning_rate": 9.479478407526489e-07, + "loss": 1.6489, + "step": 4040 + }, + { + "epoch": 0.14846585285384362, + "grad_norm": 1.3741458319499518, + "learning_rate": 9.476912537723797e-07, + "loss": 1.6133, + "step": 4050 + }, + { + "epoch": 0.14883243520656916, + "grad_norm": 1.4690331639136838, + "learning_rate": 9.474340708351131e-07, + "loss": 1.6232, + "step": 4060 + }, + { + "epoch": 0.14919901755929468, + "grad_norm": 1.2959341038239927, + "learning_rate": 9.471762922832059e-07, + "loss": 1.6136, + "step": 4070 + }, + { + "epoch": 0.14956559991202023, + "grad_norm": 1.3662274482371721, + "learning_rate": 9.469179184598068e-07, + "loss": 1.6568, + "step": 4080 + }, + { + "epoch": 0.14993218226474578, + "grad_norm": 1.6303487241504246, + "learning_rate": 9.46658949708858e-07, + "loss": 1.5929, + "step": 4090 + }, + { + "epoch": 0.15029876461747133, + "grad_norm": 1.5690296034603222, + "learning_rate": 9.463993863750927e-07, + "loss": 1.6273, + "step": 4100 + }, + { + "epoch": 0.15066534697019685, + "grad_norm": 1.4565888691647535, + "learning_rate": 9.461392288040364e-07, + "loss": 1.6111, + "step": 4110 + }, + { + "epoch": 0.1510319293229224, + "grad_norm": 1.3399651168141258, + "learning_rate": 9.458784773420052e-07, + "loss": 1.6317, + "step": 4120 + }, + { + "epoch": 0.15139851167564794, + "grad_norm": 1.4314663401678571, + "learning_rate": 9.456171323361057e-07, + "loss": 1.6149, + "step": 4130 + }, + { + "epoch": 0.1517650940283735, + "grad_norm": 1.8610614612324794, + "learning_rate": 9.45355194134235e-07, + "loss": 1.6129, + "step": 4140 + }, + { + "epoch": 0.152131676381099, + "grad_norm": 1.4894532553388709, + "learning_rate": 9.450926630850795e-07, + "loss": 1.609, + "step": 4150 + }, + { + "epoch": 0.15249825873382455, + "grad_norm": 1.4046406522547454, + "learning_rate": 9.44829539538115e-07, + "loss": 1.5696, + "step": 4160 + }, + { + "epoch": 0.1528648410865501, + "grad_norm": 1.507747542986857, + "learning_rate": 9.445658238436056e-07, + "loss": 1.6105, + "step": 4170 + }, + { + "epoch": 0.15323142343927562, + "grad_norm": 1.5105255618831799, + "learning_rate": 9.443015163526043e-07, + "loss": 1.6656, + "step": 4180 + }, + { + "epoch": 0.15359800579200117, + "grad_norm": 1.409667843388443, + "learning_rate": 9.440366174169514e-07, + "loss": 1.6143, + "step": 4190 + }, + { + "epoch": 0.15396458814472672, + "grad_norm": 1.4899089219548238, + "learning_rate": 9.437711273892748e-07, + "loss": 1.6434, + "step": 4200 + }, + { + "epoch": 0.15433117049745226, + "grad_norm": 1.3835730704800184, + "learning_rate": 9.435050466229892e-07, + "loss": 1.5896, + "step": 4210 + }, + { + "epoch": 0.15469775285017778, + "grad_norm": 1.5192649294767298, + "learning_rate": 9.432383754722953e-07, + "loss": 1.5982, + "step": 4220 + }, + { + "epoch": 0.15506433520290333, + "grad_norm": 1.414847151501446, + "learning_rate": 9.429711142921804e-07, + "loss": 1.6195, + "step": 4230 + }, + { + "epoch": 0.15543091755562888, + "grad_norm": 1.6343731391974052, + "learning_rate": 9.427032634384166e-07, + "loss": 1.6571, + "step": 4240 + }, + { + "epoch": 0.15579749990835443, + "grad_norm": 1.3341873108704791, + "learning_rate": 9.424348232675612e-07, + "loss": 1.6592, + "step": 4250 + }, + { + "epoch": 0.15616408226107995, + "grad_norm": 1.6008064117545706, + "learning_rate": 9.421657941369561e-07, + "loss": 1.5976, + "step": 4260 + }, + { + "epoch": 0.1565306646138055, + "grad_norm": 1.5239464972441716, + "learning_rate": 9.418961764047271e-07, + "loss": 1.6696, + "step": 4270 + }, + { + "epoch": 0.15689724696653104, + "grad_norm": 1.4769248460119957, + "learning_rate": 9.416259704297836e-07, + "loss": 1.5887, + "step": 4280 + }, + { + "epoch": 0.15726382931925656, + "grad_norm": 1.5681596592695635, + "learning_rate": 9.413551765718178e-07, + "loss": 1.6013, + "step": 4290 + }, + { + "epoch": 0.1576304116719821, + "grad_norm": 1.631287334977878, + "learning_rate": 9.410837951913049e-07, + "loss": 1.5945, + "step": 4300 + }, + { + "epoch": 0.15799699402470765, + "grad_norm": 1.4050312863210865, + "learning_rate": 9.408118266495019e-07, + "loss": 1.6402, + "step": 4310 + }, + { + "epoch": 0.1583635763774332, + "grad_norm": 1.5578526902775003, + "learning_rate": 9.405392713084475e-07, + "loss": 1.5887, + "step": 4320 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 1.838536265304532, + "learning_rate": 9.402661295309613e-07, + "loss": 1.6579, + "step": 4330 + }, + { + "epoch": 0.15909674108288427, + "grad_norm": 1.399860997384879, + "learning_rate": 9.399924016806442e-07, + "loss": 1.6393, + "step": 4340 + }, + { + "epoch": 0.15946332343560982, + "grad_norm": 1.5068872354692342, + "learning_rate": 9.397180881218764e-07, + "loss": 1.615, + "step": 4350 + }, + { + "epoch": 0.15982990578833534, + "grad_norm": 1.3780932641355175, + "learning_rate": 9.394431892198187e-07, + "loss": 1.5897, + "step": 4360 + }, + { + "epoch": 0.16019648814106088, + "grad_norm": 1.3266983904985465, + "learning_rate": 9.391677053404102e-07, + "loss": 1.622, + "step": 4370 + }, + { + "epoch": 0.16056307049378643, + "grad_norm": 1.620877234564149, + "learning_rate": 9.388916368503695e-07, + "loss": 1.5967, + "step": 4380 + }, + { + "epoch": 0.16092965284651198, + "grad_norm": 1.4779982203811086, + "learning_rate": 9.386149841171927e-07, + "loss": 1.6698, + "step": 4390 + }, + { + "epoch": 0.1612962351992375, + "grad_norm": 1.8674907963100393, + "learning_rate": 9.38337747509154e-07, + "loss": 1.587, + "step": 4400 + }, + { + "epoch": 0.16166281755196305, + "grad_norm": 1.253158061665667, + "learning_rate": 9.380599273953052e-07, + "loss": 1.5428, + "step": 4410 + }, + { + "epoch": 0.1620293999046886, + "grad_norm": 1.3525050799204679, + "learning_rate": 9.37781524145474e-07, + "loss": 1.6247, + "step": 4420 + }, + { + "epoch": 0.16239598225741414, + "grad_norm": 1.4613300416955568, + "learning_rate": 9.375025381302654e-07, + "loss": 1.6224, + "step": 4430 + }, + { + "epoch": 0.16276256461013966, + "grad_norm": 1.2944336505844816, + "learning_rate": 9.372229697210592e-07, + "loss": 1.6073, + "step": 4440 + }, + { + "epoch": 0.1631291469628652, + "grad_norm": 1.5174622698952627, + "learning_rate": 9.369428192900108e-07, + "loss": 1.6071, + "step": 4450 + }, + { + "epoch": 0.16349572931559075, + "grad_norm": 1.338534858401422, + "learning_rate": 9.366620872100508e-07, + "loss": 1.6601, + "step": 4460 + }, + { + "epoch": 0.16386231166831627, + "grad_norm": 1.6728271928417346, + "learning_rate": 9.363807738548834e-07, + "loss": 1.551, + "step": 4470 + }, + { + "epoch": 0.16422889402104182, + "grad_norm": 1.302057455107361, + "learning_rate": 9.360988795989873e-07, + "loss": 1.6131, + "step": 4480 + }, + { + "epoch": 0.16459547637376737, + "grad_norm": 1.3688499844245678, + "learning_rate": 9.358164048176136e-07, + "loss": 1.6117, + "step": 4490 + }, + { + "epoch": 0.16496205872649292, + "grad_norm": 1.8246828901080199, + "learning_rate": 9.355333498867869e-07, + "loss": 1.5894, + "step": 4500 + }, + { + "epoch": 0.16532864107921844, + "grad_norm": 1.6028775096282735, + "learning_rate": 9.352497151833038e-07, + "loss": 1.614, + "step": 4510 + }, + { + "epoch": 0.16569522343194398, + "grad_norm": 1.4820831927771527, + "learning_rate": 9.349655010847329e-07, + "loss": 1.6046, + "step": 4520 + }, + { + "epoch": 0.16606180578466953, + "grad_norm": 1.7672157547664196, + "learning_rate": 9.346807079694139e-07, + "loss": 1.5998, + "step": 4530 + }, + { + "epoch": 0.16642838813739508, + "grad_norm": 1.399533793932768, + "learning_rate": 9.34395336216457e-07, + "loss": 1.6209, + "step": 4540 + }, + { + "epoch": 0.1667949704901206, + "grad_norm": 1.3639375879771105, + "learning_rate": 9.341093862057432e-07, + "loss": 1.6321, + "step": 4550 + }, + { + "epoch": 0.16716155284284615, + "grad_norm": 1.5049904120253712, + "learning_rate": 9.338228583179231e-07, + "loss": 1.5531, + "step": 4560 + }, + { + "epoch": 0.1675281351955717, + "grad_norm": 1.2985124195396522, + "learning_rate": 9.335357529344162e-07, + "loss": 1.5925, + "step": 4570 + }, + { + "epoch": 0.1678947175482972, + "grad_norm": 1.6446327484619145, + "learning_rate": 9.332480704374113e-07, + "loss": 1.5926, + "step": 4580 + }, + { + "epoch": 0.16826129990102276, + "grad_norm": 1.6322229820052805, + "learning_rate": 9.329598112098649e-07, + "loss": 1.6415, + "step": 4590 + }, + { + "epoch": 0.1686278822537483, + "grad_norm": 1.4469690988313273, + "learning_rate": 9.326709756355018e-07, + "loss": 1.5885, + "step": 4600 + }, + { + "epoch": 0.16899446460647385, + "grad_norm": 2.0102392352379415, + "learning_rate": 9.323815640988135e-07, + "loss": 1.559, + "step": 4610 + }, + { + "epoch": 0.16936104695919937, + "grad_norm": 2.121900247865438, + "learning_rate": 9.320915769850585e-07, + "loss": 1.628, + "step": 4620 + }, + { + "epoch": 0.16972762931192492, + "grad_norm": 1.6562713457587275, + "learning_rate": 9.318010146802615e-07, + "loss": 1.6442, + "step": 4630 + }, + { + "epoch": 0.17009421166465047, + "grad_norm": 1.825933954099794, + "learning_rate": 9.315098775712127e-07, + "loss": 1.5848, + "step": 4640 + }, + { + "epoch": 0.17046079401737602, + "grad_norm": 2.2902161148174445, + "learning_rate": 9.312181660454677e-07, + "loss": 1.5825, + "step": 4650 + }, + { + "epoch": 0.17082737637010154, + "grad_norm": 1.392734199429953, + "learning_rate": 9.309258804913465e-07, + "loss": 1.6126, + "step": 4660 + }, + { + "epoch": 0.17119395872282708, + "grad_norm": 1.565256666892175, + "learning_rate": 9.306330212979334e-07, + "loss": 1.6022, + "step": 4670 + }, + { + "epoch": 0.17156054107555263, + "grad_norm": 1.7600380550932417, + "learning_rate": 9.303395888550763e-07, + "loss": 1.5663, + "step": 4680 + }, + { + "epoch": 0.17192712342827815, + "grad_norm": 1.5247880984614344, + "learning_rate": 9.300455835533863e-07, + "loss": 1.6012, + "step": 4690 + }, + { + "epoch": 0.1722937057810037, + "grad_norm": 1.7352070019598504, + "learning_rate": 9.297510057842367e-07, + "loss": 1.5681, + "step": 4700 + }, + { + "epoch": 0.17266028813372924, + "grad_norm": 1.6435683033446582, + "learning_rate": 9.294558559397633e-07, + "loss": 1.6687, + "step": 4710 + }, + { + "epoch": 0.1730268704864548, + "grad_norm": 1.3964234370853204, + "learning_rate": 9.291601344128631e-07, + "loss": 1.5829, + "step": 4720 + }, + { + "epoch": 0.1733934528391803, + "grad_norm": 1.76715189072495, + "learning_rate": 9.288638415971944e-07, + "loss": 1.5724, + "step": 4730 + }, + { + "epoch": 0.17376003519190586, + "grad_norm": 1.3087839062281306, + "learning_rate": 9.285669778871758e-07, + "loss": 1.6033, + "step": 4740 + }, + { + "epoch": 0.1741266175446314, + "grad_norm": 1.7592015890177557, + "learning_rate": 9.282695436779857e-07, + "loss": 1.5787, + "step": 4750 + }, + { + "epoch": 0.17449319989735695, + "grad_norm": 1.5281595493710598, + "learning_rate": 9.279715393655625e-07, + "loss": 1.5593, + "step": 4760 + }, + { + "epoch": 0.17485978225008247, + "grad_norm": 1.738599325299021, + "learning_rate": 9.276729653466029e-07, + "loss": 1.5669, + "step": 4770 + }, + { + "epoch": 0.17522636460280802, + "grad_norm": 1.594132633669574, + "learning_rate": 9.273738220185624e-07, + "loss": 1.623, + "step": 4780 + }, + { + "epoch": 0.17559294695553357, + "grad_norm": 2.226861365359913, + "learning_rate": 9.27074109779654e-07, + "loss": 1.6368, + "step": 4790 + }, + { + "epoch": 0.1759595293082591, + "grad_norm": 1.7870988536401553, + "learning_rate": 9.267738290288484e-07, + "loss": 1.5905, + "step": 4800 + }, + { + "epoch": 0.17632611166098464, + "grad_norm": 1.6753244560734581, + "learning_rate": 9.264729801658726e-07, + "loss": 1.588, + "step": 4810 + }, + { + "epoch": 0.17669269401371018, + "grad_norm": 1.5163383708898754, + "learning_rate": 9.261715635912105e-07, + "loss": 1.6068, + "step": 4820 + }, + { + "epoch": 0.17705927636643573, + "grad_norm": 1.6054513357762625, + "learning_rate": 9.258695797061011e-07, + "loss": 1.5623, + "step": 4830 + }, + { + "epoch": 0.17742585871916125, + "grad_norm": 1.7549519455125482, + "learning_rate": 9.255670289125392e-07, + "loss": 1.6342, + "step": 4840 + }, + { + "epoch": 0.1777924410718868, + "grad_norm": 1.5524081159338652, + "learning_rate": 9.252639116132737e-07, + "loss": 1.5866, + "step": 4850 + }, + { + "epoch": 0.17815902342461234, + "grad_norm": 1.5466546969225983, + "learning_rate": 9.249602282118078e-07, + "loss": 1.6022, + "step": 4860 + }, + { + "epoch": 0.1785256057773379, + "grad_norm": 1.4959615382996556, + "learning_rate": 9.246559791123984e-07, + "loss": 1.6196, + "step": 4870 + }, + { + "epoch": 0.1788921881300634, + "grad_norm": 1.4914720900146645, + "learning_rate": 9.243511647200554e-07, + "loss": 1.5919, + "step": 4880 + }, + { + "epoch": 0.17925877048278896, + "grad_norm": 1.5337435868741187, + "learning_rate": 9.240457854405411e-07, + "loss": 1.6044, + "step": 4890 + }, + { + "epoch": 0.1796253528355145, + "grad_norm": 1.6816858785763387, + "learning_rate": 9.237398416803702e-07, + "loss": 1.5634, + "step": 4900 + }, + { + "epoch": 0.17999193518824003, + "grad_norm": 1.8428666379108207, + "learning_rate": 9.234333338468079e-07, + "loss": 1.5595, + "step": 4910 + }, + { + "epoch": 0.18035851754096557, + "grad_norm": 1.4112423758680814, + "learning_rate": 9.231262623478712e-07, + "loss": 1.5958, + "step": 4920 + }, + { + "epoch": 0.18072509989369112, + "grad_norm": 1.9379415330464052, + "learning_rate": 9.228186275923271e-07, + "loss": 1.6132, + "step": 4930 + }, + { + "epoch": 0.18109168224641667, + "grad_norm": 1.6478659028610085, + "learning_rate": 9.225104299896923e-07, + "loss": 1.5253, + "step": 4940 + }, + { + "epoch": 0.1814582645991422, + "grad_norm": 1.4723128432871142, + "learning_rate": 9.222016699502329e-07, + "loss": 1.6025, + "step": 4950 + }, + { + "epoch": 0.18182484695186774, + "grad_norm": 1.7186069161894069, + "learning_rate": 9.218923478849636e-07, + "loss": 1.5888, + "step": 4960 + }, + { + "epoch": 0.18219142930459328, + "grad_norm": 2.0518524516759706, + "learning_rate": 9.215824642056473e-07, + "loss": 1.6131, + "step": 4970 + }, + { + "epoch": 0.18255801165731883, + "grad_norm": 1.7336503978028492, + "learning_rate": 9.212720193247946e-07, + "loss": 1.5725, + "step": 4980 + }, + { + "epoch": 0.18292459401004435, + "grad_norm": 1.4722133429873332, + "learning_rate": 9.209610136556629e-07, + "loss": 1.5547, + "step": 4990 + }, + { + "epoch": 0.1832911763627699, + "grad_norm": 1.6753596780660358, + "learning_rate": 9.206494476122565e-07, + "loss": 1.5997, + "step": 5000 + }, + { + "epoch": 0.1832911763627699, + "eval_accuracy": 0.642745649510724, + "eval_loss": 1.587723731994629, + "eval_runtime": 309.6063, + "eval_samples_per_second": 10.681, + "eval_steps_per_second": 0.891, + "step": 5000 + }, + { + "epoch": 0.18365775871549544, + "grad_norm": 1.5685677710443469, + "learning_rate": 9.203373216093253e-07, + "loss": 1.5679, + "step": 5010 + }, + { + "epoch": 0.18402434106822096, + "grad_norm": 1.8335955057050302, + "learning_rate": 9.200246360623647e-07, + "loss": 1.5621, + "step": 5020 + }, + { + "epoch": 0.1843909234209465, + "grad_norm": 1.522191845438261, + "learning_rate": 9.19711391387615e-07, + "loss": 1.5729, + "step": 5030 + }, + { + "epoch": 0.18475750577367206, + "grad_norm": 1.6776006382527855, + "learning_rate": 9.193975880020609e-07, + "loss": 1.59, + "step": 5040 + }, + { + "epoch": 0.1851240881263976, + "grad_norm": 1.626198881855077, + "learning_rate": 9.190832263234307e-07, + "loss": 1.5274, + "step": 5050 + }, + { + "epoch": 0.18549067047912313, + "grad_norm": 1.7849118070867178, + "learning_rate": 9.18768306770196e-07, + "loss": 1.5976, + "step": 5060 + }, + { + "epoch": 0.18585725283184867, + "grad_norm": 1.6492509263028217, + "learning_rate": 9.184528297615706e-07, + "loss": 1.574, + "step": 5070 + }, + { + "epoch": 0.18622383518457422, + "grad_norm": 1.6650634512326183, + "learning_rate": 9.181367957175111e-07, + "loss": 1.6145, + "step": 5080 + }, + { + "epoch": 0.18659041753729974, + "grad_norm": 1.728522905813247, + "learning_rate": 9.178202050587152e-07, + "loss": 1.623, + "step": 5090 + }, + { + "epoch": 0.1869569998900253, + "grad_norm": 1.5996442049523565, + "learning_rate": 9.175030582066215e-07, + "loss": 1.5807, + "step": 5100 + }, + { + "epoch": 0.18732358224275084, + "grad_norm": 2.127736796999369, + "learning_rate": 9.17185355583409e-07, + "loss": 1.6288, + "step": 5110 + }, + { + "epoch": 0.18769016459547638, + "grad_norm": 1.7060344023543381, + "learning_rate": 9.16867097611997e-07, + "loss": 1.5706, + "step": 5120 + }, + { + "epoch": 0.1880567469482019, + "grad_norm": 1.6633154215840553, + "learning_rate": 9.165482847160433e-07, + "loss": 1.6202, + "step": 5130 + }, + { + "epoch": 0.18842332930092745, + "grad_norm": 2.008854546754292, + "learning_rate": 9.162289173199449e-07, + "loss": 1.5684, + "step": 5140 + }, + { + "epoch": 0.188789911653653, + "grad_norm": 1.8267125273776432, + "learning_rate": 9.159089958488368e-07, + "loss": 1.5463, + "step": 5150 + }, + { + "epoch": 0.18915649400637854, + "grad_norm": 1.5564239251002085, + "learning_rate": 9.155885207285917e-07, + "loss": 1.5432, + "step": 5160 + }, + { + "epoch": 0.18952307635910406, + "grad_norm": 1.6146271060205803, + "learning_rate": 9.152674923858192e-07, + "loss": 1.5524, + "step": 5170 + }, + { + "epoch": 0.1898896587118296, + "grad_norm": 1.5552810397285535, + "learning_rate": 9.149459112478653e-07, + "loss": 1.5704, + "step": 5180 + }, + { + "epoch": 0.19025624106455516, + "grad_norm": 1.5384519496242604, + "learning_rate": 9.146237777428119e-07, + "loss": 1.5832, + "step": 5190 + }, + { + "epoch": 0.19062282341728068, + "grad_norm": 2.017102331377888, + "learning_rate": 9.143010922994761e-07, + "loss": 1.5652, + "step": 5200 + }, + { + "epoch": 0.19098940577000623, + "grad_norm": 1.8257390842642465, + "learning_rate": 9.139778553474102e-07, + "loss": 1.6286, + "step": 5210 + }, + { + "epoch": 0.19135598812273177, + "grad_norm": 1.8375892545538077, + "learning_rate": 9.136540673169e-07, + "loss": 1.5999, + "step": 5220 + }, + { + "epoch": 0.19172257047545732, + "grad_norm": 2.0587302949543327, + "learning_rate": 9.133297286389652e-07, + "loss": 1.5976, + "step": 5230 + }, + { + "epoch": 0.19208915282818284, + "grad_norm": 2.011881523827466, + "learning_rate": 9.130048397453586e-07, + "loss": 1.5948, + "step": 5240 + }, + { + "epoch": 0.1924557351809084, + "grad_norm": 1.8390608792602066, + "learning_rate": 9.126794010685652e-07, + "loss": 1.6149, + "step": 5250 + }, + { + "epoch": 0.19282231753363394, + "grad_norm": 1.9246481251033047, + "learning_rate": 9.123534130418022e-07, + "loss": 1.5918, + "step": 5260 + }, + { + "epoch": 0.19318889988635948, + "grad_norm": 1.716961973736044, + "learning_rate": 9.120268760990177e-07, + "loss": 1.5423, + "step": 5270 + }, + { + "epoch": 0.193555482239085, + "grad_norm": 2.0653331266058053, + "learning_rate": 9.116997906748906e-07, + "loss": 1.5646, + "step": 5280 + }, + { + "epoch": 0.19392206459181055, + "grad_norm": 1.518359023904073, + "learning_rate": 9.113721572048303e-07, + "loss": 1.5893, + "step": 5290 + }, + { + "epoch": 0.1942886469445361, + "grad_norm": 1.5221964255305394, + "learning_rate": 9.110439761249752e-07, + "loss": 1.5944, + "step": 5300 + }, + { + "epoch": 0.19465522929726162, + "grad_norm": 1.591016019300809, + "learning_rate": 9.107152478721929e-07, + "loss": 1.5957, + "step": 5310 + }, + { + "epoch": 0.19502181164998716, + "grad_norm": 1.6048630337553804, + "learning_rate": 9.103859728840797e-07, + "loss": 1.5373, + "step": 5320 + }, + { + "epoch": 0.1953883940027127, + "grad_norm": 1.8089344462427293, + "learning_rate": 9.10056151598959e-07, + "loss": 1.5484, + "step": 5330 + }, + { + "epoch": 0.19575497635543826, + "grad_norm": 1.7077347921127968, + "learning_rate": 9.097257844558821e-07, + "loss": 1.5688, + "step": 5340 + }, + { + "epoch": 0.19612155870816378, + "grad_norm": 2.0584080275062706, + "learning_rate": 9.093948718946265e-07, + "loss": 1.5202, + "step": 5350 + }, + { + "epoch": 0.19648814106088933, + "grad_norm": 1.6275162784009292, + "learning_rate": 9.090634143556961e-07, + "loss": 1.5851, + "step": 5360 + }, + { + "epoch": 0.19685472341361487, + "grad_norm": 1.7941515009032263, + "learning_rate": 9.087314122803198e-07, + "loss": 1.5794, + "step": 5370 + }, + { + "epoch": 0.19722130576634042, + "grad_norm": 1.72604148825101, + "learning_rate": 9.083988661104519e-07, + "loss": 1.5966, + "step": 5380 + }, + { + "epoch": 0.19758788811906594, + "grad_norm": 1.7824620622659664, + "learning_rate": 9.080657762887706e-07, + "loss": 1.5893, + "step": 5390 + }, + { + "epoch": 0.1979544704717915, + "grad_norm": 1.710078177829696, + "learning_rate": 9.077321432586779e-07, + "loss": 1.5668, + "step": 5400 + }, + { + "epoch": 0.19832105282451704, + "grad_norm": 1.8516264946489545, + "learning_rate": 9.073979674642991e-07, + "loss": 1.6049, + "step": 5410 + }, + { + "epoch": 0.19868763517724256, + "grad_norm": 2.1561627747886583, + "learning_rate": 9.070632493504815e-07, + "loss": 1.585, + "step": 5420 + }, + { + "epoch": 0.1990542175299681, + "grad_norm": 1.912041110250784, + "learning_rate": 9.06727989362795e-07, + "loss": 1.5196, + "step": 5430 + }, + { + "epoch": 0.19942079988269365, + "grad_norm": 1.8404077118276456, + "learning_rate": 9.063921879475306e-07, + "loss": 1.611, + "step": 5440 + }, + { + "epoch": 0.1997873822354192, + "grad_norm": 1.5865821224681815, + "learning_rate": 9.060558455516996e-07, + "loss": 1.5739, + "step": 5450 + }, + { + "epoch": 0.20015396458814472, + "grad_norm": 1.9756512969668862, + "learning_rate": 9.057189626230341e-07, + "loss": 1.5002, + "step": 5460 + }, + { + "epoch": 0.20052054694087026, + "grad_norm": 1.5812577707350812, + "learning_rate": 9.053815396099851e-07, + "loss": 1.5869, + "step": 5470 + }, + { + "epoch": 0.2008871292935958, + "grad_norm": 2.0162867580185555, + "learning_rate": 9.050435769617231e-07, + "loss": 1.5559, + "step": 5480 + }, + { + "epoch": 0.20125371164632136, + "grad_norm": 1.899649598636165, + "learning_rate": 9.047050751281368e-07, + "loss": 1.5407, + "step": 5490 + }, + { + "epoch": 0.20162029399904688, + "grad_norm": 1.9101266806326496, + "learning_rate": 9.043660345598322e-07, + "loss": 1.5576, + "step": 5500 + }, + { + "epoch": 0.20198687635177243, + "grad_norm": 2.0420669589479403, + "learning_rate": 9.040264557081334e-07, + "loss": 1.557, + "step": 5510 + }, + { + "epoch": 0.20235345870449797, + "grad_norm": 1.9260883055795428, + "learning_rate": 9.036863390250801e-07, + "loss": 1.5521, + "step": 5520 + }, + { + "epoch": 0.2027200410572235, + "grad_norm": 1.6555197284342995, + "learning_rate": 9.033456849634284e-07, + "loss": 1.5717, + "step": 5530 + }, + { + "epoch": 0.20308662340994904, + "grad_norm": 2.153362825776131, + "learning_rate": 9.030044939766497e-07, + "loss": 1.5713, + "step": 5540 + }, + { + "epoch": 0.2034532057626746, + "grad_norm": 1.910089724316295, + "learning_rate": 9.026627665189303e-07, + "loss": 1.5697, + "step": 5550 + }, + { + "epoch": 0.20381978811540014, + "grad_norm": 1.7762617538543, + "learning_rate": 9.0232050304517e-07, + "loss": 1.5239, + "step": 5560 + }, + { + "epoch": 0.20418637046812566, + "grad_norm": 1.7174298843577596, + "learning_rate": 9.019777040109831e-07, + "loss": 1.5276, + "step": 5570 + }, + { + "epoch": 0.2045529528208512, + "grad_norm": 1.6862369469038345, + "learning_rate": 9.016343698726961e-07, + "loss": 1.5541, + "step": 5580 + }, + { + "epoch": 0.20491953517357675, + "grad_norm": 1.875834526669963, + "learning_rate": 9.01290501087348e-07, + "loss": 1.555, + "step": 5590 + }, + { + "epoch": 0.2052861175263023, + "grad_norm": 1.7840227955187389, + "learning_rate": 9.009460981126898e-07, + "loss": 1.5872, + "step": 5600 + }, + { + "epoch": 0.20565269987902782, + "grad_norm": 1.668168953110993, + "learning_rate": 9.006011614071829e-07, + "loss": 1.599, + "step": 5610 + }, + { + "epoch": 0.20601928223175336, + "grad_norm": 1.6951419814826267, + "learning_rate": 9.002556914300001e-07, + "loss": 1.5599, + "step": 5620 + }, + { + "epoch": 0.2063858645844789, + "grad_norm": 2.031183645077938, + "learning_rate": 8.999096886410234e-07, + "loss": 1.5697, + "step": 5630 + }, + { + "epoch": 0.20675244693720443, + "grad_norm": 2.2433698552413595, + "learning_rate": 8.995631535008442e-07, + "loss": 1.5751, + "step": 5640 + }, + { + "epoch": 0.20711902928992998, + "grad_norm": 1.96339871171306, + "learning_rate": 8.992160864707629e-07, + "loss": 1.5922, + "step": 5650 + }, + { + "epoch": 0.20748561164265553, + "grad_norm": 1.7341008984989021, + "learning_rate": 8.988684880127877e-07, + "loss": 1.5476, + "step": 5660 + }, + { + "epoch": 0.20785219399538107, + "grad_norm": 1.6011033018349554, + "learning_rate": 8.985203585896339e-07, + "loss": 1.5337, + "step": 5670 + }, + { + "epoch": 0.2082187763481066, + "grad_norm": 1.804008259917083, + "learning_rate": 8.981716986647241e-07, + "loss": 1.548, + "step": 5680 + }, + { + "epoch": 0.20858535870083214, + "grad_norm": 1.7644993504571036, + "learning_rate": 8.978225087021872e-07, + "loss": 1.5566, + "step": 5690 + }, + { + "epoch": 0.2089519410535577, + "grad_norm": 2.1995890332913812, + "learning_rate": 8.974727891668568e-07, + "loss": 1.509, + "step": 5700 + }, + { + "epoch": 0.2093185234062832, + "grad_norm": 1.7307439040874695, + "learning_rate": 8.971225405242724e-07, + "loss": 1.5792, + "step": 5710 + }, + { + "epoch": 0.20968510575900876, + "grad_norm": 1.8843347719325225, + "learning_rate": 8.967717632406775e-07, + "loss": 1.5745, + "step": 5720 + }, + { + "epoch": 0.2100516881117343, + "grad_norm": 1.8994279922279045, + "learning_rate": 8.964204577830193e-07, + "loss": 1.5346, + "step": 5730 + }, + { + "epoch": 0.21041827046445985, + "grad_norm": 2.0146207080838305, + "learning_rate": 8.960686246189479e-07, + "loss": 1.5724, + "step": 5740 + }, + { + "epoch": 0.21078485281718537, + "grad_norm": 1.9175010632666802, + "learning_rate": 8.957162642168164e-07, + "loss": 1.482, + "step": 5750 + }, + { + "epoch": 0.21115143516991092, + "grad_norm": 1.6492564643172203, + "learning_rate": 8.953633770456791e-07, + "loss": 1.5635, + "step": 5760 + }, + { + "epoch": 0.21151801752263646, + "grad_norm": 1.8913486368556613, + "learning_rate": 8.950099635752919e-07, + "loss": 1.5634, + "step": 5770 + }, + { + "epoch": 0.211884599875362, + "grad_norm": 1.7405053491856226, + "learning_rate": 8.946560242761114e-07, + "loss": 1.5475, + "step": 5780 + }, + { + "epoch": 0.21225118222808753, + "grad_norm": 1.7166883252641594, + "learning_rate": 8.943015596192938e-07, + "loss": 1.516, + "step": 5790 + }, + { + "epoch": 0.21261776458081308, + "grad_norm": 1.935712334758643, + "learning_rate": 8.93946570076695e-07, + "loss": 1.5575, + "step": 5800 + }, + { + "epoch": 0.21298434693353863, + "grad_norm": 1.9385604701128256, + "learning_rate": 8.935910561208693e-07, + "loss": 1.5634, + "step": 5810 + }, + { + "epoch": 0.21335092928626415, + "grad_norm": 2.557688500744313, + "learning_rate": 8.932350182250694e-07, + "loss": 1.5103, + "step": 5820 + }, + { + "epoch": 0.2137175116389897, + "grad_norm": 1.7120107495237882, + "learning_rate": 8.928784568632454e-07, + "loss": 1.5332, + "step": 5830 + }, + { + "epoch": 0.21408409399171524, + "grad_norm": 1.9120958570178155, + "learning_rate": 8.925213725100439e-07, + "loss": 1.5902, + "step": 5840 + }, + { + "epoch": 0.2144506763444408, + "grad_norm": 2.0551912368717984, + "learning_rate": 8.921637656408081e-07, + "loss": 1.5784, + "step": 5850 + }, + { + "epoch": 0.2148172586971663, + "grad_norm": 1.9480411905431083, + "learning_rate": 8.918056367315765e-07, + "loss": 1.5551, + "step": 5860 + }, + { + "epoch": 0.21518384104989186, + "grad_norm": 2.072902657734444, + "learning_rate": 8.914469862590825e-07, + "loss": 1.5555, + "step": 5870 + }, + { + "epoch": 0.2155504234026174, + "grad_norm": 1.9451661388320578, + "learning_rate": 8.910878147007544e-07, + "loss": 1.5513, + "step": 5880 + }, + { + "epoch": 0.21591700575534295, + "grad_norm": 2.0629785589418104, + "learning_rate": 8.907281225347132e-07, + "loss": 1.5553, + "step": 5890 + }, + { + "epoch": 0.21628358810806847, + "grad_norm": 1.863954721076218, + "learning_rate": 8.903679102397735e-07, + "loss": 1.5691, + "step": 5900 + }, + { + "epoch": 0.21665017046079402, + "grad_norm": 1.8545804685124208, + "learning_rate": 8.900071782954424e-07, + "loss": 1.5331, + "step": 5910 + }, + { + "epoch": 0.21701675281351956, + "grad_norm": 1.8522158136831326, + "learning_rate": 8.896459271819181e-07, + "loss": 1.5481, + "step": 5920 + }, + { + "epoch": 0.21738333516624508, + "grad_norm": 2.114169763199409, + "learning_rate": 8.892841573800909e-07, + "loss": 1.5574, + "step": 5930 + }, + { + "epoch": 0.21774991751897063, + "grad_norm": 2.2195708048317897, + "learning_rate": 8.889218693715405e-07, + "loss": 1.5632, + "step": 5940 + }, + { + "epoch": 0.21811649987169618, + "grad_norm": 1.9709151192601133, + "learning_rate": 8.885590636385373e-07, + "loss": 1.5861, + "step": 5950 + }, + { + "epoch": 0.21848308222442173, + "grad_norm": 1.9808333239294875, + "learning_rate": 8.881957406640402e-07, + "loss": 1.5065, + "step": 5960 + }, + { + "epoch": 0.21884966457714725, + "grad_norm": 2.442742784557856, + "learning_rate": 8.878319009316973e-07, + "loss": 1.5445, + "step": 5970 + }, + { + "epoch": 0.2192162469298728, + "grad_norm": 2.311119780435353, + "learning_rate": 8.874675449258439e-07, + "loss": 1.5483, + "step": 5980 + }, + { + "epoch": 0.21958282928259834, + "grad_norm": 2.0035864035930655, + "learning_rate": 8.871026731315031e-07, + "loss": 1.5516, + "step": 5990 + }, + { + "epoch": 0.2199494116353239, + "grad_norm": 1.9235134048584597, + "learning_rate": 8.867372860343843e-07, + "loss": 1.5841, + "step": 6000 + }, + { + "epoch": 0.2199494116353239, + "eval_accuracy": 0.6509060196907062, + "eval_loss": 1.540500521659851, + "eval_runtime": 311.0144, + "eval_samples_per_second": 10.633, + "eval_steps_per_second": 0.887, + "step": 6000 + }, + { + "epoch": 0.2203159939880494, + "grad_norm": 1.7524109005789064, + "learning_rate": 8.863713841208831e-07, + "loss": 1.5597, + "step": 6010 + }, + { + "epoch": 0.22068257634077496, + "grad_norm": 1.6692328056749952, + "learning_rate": 8.860049678780803e-07, + "loss": 1.4923, + "step": 6020 + }, + { + "epoch": 0.2210491586935005, + "grad_norm": 1.9399213197528828, + "learning_rate": 8.856380377937411e-07, + "loss": 1.552, + "step": 6030 + }, + { + "epoch": 0.22141574104622602, + "grad_norm": 2.2904467183798753, + "learning_rate": 8.852705943563153e-07, + "loss": 1.5254, + "step": 6040 + }, + { + "epoch": 0.22178232339895157, + "grad_norm": 1.8153750134894717, + "learning_rate": 8.849026380549354e-07, + "loss": 1.5141, + "step": 6050 + }, + { + "epoch": 0.22214890575167712, + "grad_norm": 2.618147882062693, + "learning_rate": 8.84534169379417e-07, + "loss": 1.5427, + "step": 6060 + }, + { + "epoch": 0.22251548810440266, + "grad_norm": 1.7910988941866253, + "learning_rate": 8.84165188820258e-07, + "loss": 1.5024, + "step": 6070 + }, + { + "epoch": 0.22288207045712818, + "grad_norm": 2.1174011777995565, + "learning_rate": 8.837956968686371e-07, + "loss": 1.5354, + "step": 6080 + }, + { + "epoch": 0.22324865280985373, + "grad_norm": 1.9009206870385398, + "learning_rate": 8.834256940164142e-07, + "loss": 1.5147, + "step": 6090 + }, + { + "epoch": 0.22361523516257928, + "grad_norm": 1.8496325535415874, + "learning_rate": 8.830551807561291e-07, + "loss": 1.5179, + "step": 6100 + }, + { + "epoch": 0.22398181751530483, + "grad_norm": 1.662570964745413, + "learning_rate": 8.826841575810011e-07, + "loss": 1.5187, + "step": 6110 + }, + { + "epoch": 0.22434839986803035, + "grad_norm": 1.8932960142147148, + "learning_rate": 8.823126249849283e-07, + "loss": 1.511, + "step": 6120 + }, + { + "epoch": 0.2247149822207559, + "grad_norm": 2.055911875635135, + "learning_rate": 8.819405834624869e-07, + "loss": 1.5155, + "step": 6130 + }, + { + "epoch": 0.22508156457348144, + "grad_norm": 2.0651755539958603, + "learning_rate": 8.815680335089308e-07, + "loss": 1.4753, + "step": 6140 + }, + { + "epoch": 0.22544814692620696, + "grad_norm": 2.0717254734315405, + "learning_rate": 8.811949756201902e-07, + "loss": 1.5565, + "step": 6150 + }, + { + "epoch": 0.2258147292789325, + "grad_norm": 1.9847422671401158, + "learning_rate": 8.808214102928721e-07, + "loss": 1.5438, + "step": 6160 + }, + { + "epoch": 0.22618131163165806, + "grad_norm": 2.4190623603018806, + "learning_rate": 8.804473380242583e-07, + "loss": 1.5399, + "step": 6170 + }, + { + "epoch": 0.2265478939843836, + "grad_norm": 2.20009570928599, + "learning_rate": 8.80072759312306e-07, + "loss": 1.5398, + "step": 6180 + }, + { + "epoch": 0.22691447633710912, + "grad_norm": 1.9921790637181438, + "learning_rate": 8.796976746556462e-07, + "loss": 1.4771, + "step": 6190 + }, + { + "epoch": 0.22728105868983467, + "grad_norm": 2.0203680363068344, + "learning_rate": 8.793220845535838e-07, + "loss": 1.5176, + "step": 6200 + }, + { + "epoch": 0.22764764104256022, + "grad_norm": 2.7532988176359754, + "learning_rate": 8.789459895060962e-07, + "loss": 1.5371, + "step": 6210 + }, + { + "epoch": 0.22801422339528576, + "grad_norm": 1.937352911027064, + "learning_rate": 8.785693900138329e-07, + "loss": 1.5356, + "step": 6220 + }, + { + "epoch": 0.22838080574801128, + "grad_norm": 1.9964616803134492, + "learning_rate": 8.781922865781151e-07, + "loss": 1.56, + "step": 6230 + }, + { + "epoch": 0.22874738810073683, + "grad_norm": 2.106377863408321, + "learning_rate": 8.778146797009349e-07, + "loss": 1.559, + "step": 6240 + }, + { + "epoch": 0.22911397045346238, + "grad_norm": 1.6409859726466804, + "learning_rate": 8.774365698849547e-07, + "loss": 1.5116, + "step": 6250 + }, + { + "epoch": 0.2294805528061879, + "grad_norm": 2.305691070208384, + "learning_rate": 8.770579576335058e-07, + "loss": 1.5683, + "step": 6260 + }, + { + "epoch": 0.22984713515891345, + "grad_norm": 1.7207294769909895, + "learning_rate": 8.766788434505887e-07, + "loss": 1.4618, + "step": 6270 + }, + { + "epoch": 0.230213717511639, + "grad_norm": 1.9323445658200624, + "learning_rate": 8.762992278408723e-07, + "loss": 1.5618, + "step": 6280 + }, + { + "epoch": 0.23058029986436454, + "grad_norm": 1.999152732092489, + "learning_rate": 8.759191113096927e-07, + "loss": 1.5569, + "step": 6290 + }, + { + "epoch": 0.23094688221709006, + "grad_norm": 1.8502749258838977, + "learning_rate": 8.755384943630529e-07, + "loss": 1.5114, + "step": 6300 + }, + { + "epoch": 0.2313134645698156, + "grad_norm": 2.0061014414371003, + "learning_rate": 8.751573775076219e-07, + "loss": 1.5011, + "step": 6310 + }, + { + "epoch": 0.23168004692254116, + "grad_norm": 2.064565021271191, + "learning_rate": 8.747757612507345e-07, + "loss": 1.5588, + "step": 6320 + }, + { + "epoch": 0.23204662927526667, + "grad_norm": 1.878533236916369, + "learning_rate": 8.743936461003898e-07, + "loss": 1.5179, + "step": 6330 + }, + { + "epoch": 0.23241321162799222, + "grad_norm": 2.080116702687917, + "learning_rate": 8.740110325652515e-07, + "loss": 1.5211, + "step": 6340 + }, + { + "epoch": 0.23277979398071777, + "grad_norm": 2.2534624739469433, + "learning_rate": 8.736279211546465e-07, + "loss": 1.5077, + "step": 6350 + }, + { + "epoch": 0.23314637633344332, + "grad_norm": 2.1778452457873527, + "learning_rate": 8.732443123785644e-07, + "loss": 1.5385, + "step": 6360 + }, + { + "epoch": 0.23351295868616884, + "grad_norm": 2.0802562378092317, + "learning_rate": 8.72860206747657e-07, + "loss": 1.5053, + "step": 6370 + }, + { + "epoch": 0.23387954103889438, + "grad_norm": 2.197133342414823, + "learning_rate": 8.724756047732376e-07, + "loss": 1.5223, + "step": 6380 + }, + { + "epoch": 0.23424612339161993, + "grad_norm": 2.3786394596220437, + "learning_rate": 8.720905069672799e-07, + "loss": 1.5124, + "step": 6390 + }, + { + "epoch": 0.23461270574434548, + "grad_norm": 1.8455501641424978, + "learning_rate": 8.717049138424182e-07, + "loss": 1.525, + "step": 6400 + }, + { + "epoch": 0.234979288097071, + "grad_norm": 2.0418699202678727, + "learning_rate": 8.713188259119452e-07, + "loss": 1.5082, + "step": 6410 + }, + { + "epoch": 0.23534587044979655, + "grad_norm": 1.8308136052916946, + "learning_rate": 8.709322436898135e-07, + "loss": 1.4779, + "step": 6420 + }, + { + "epoch": 0.2357124528025221, + "grad_norm": 2.155105815758525, + "learning_rate": 8.705451676906328e-07, + "loss": 1.5101, + "step": 6430 + }, + { + "epoch": 0.2360790351552476, + "grad_norm": 1.9647757860923412, + "learning_rate": 8.701575984296702e-07, + "loss": 1.5105, + "step": 6440 + }, + { + "epoch": 0.23644561750797316, + "grad_norm": 2.051510082680593, + "learning_rate": 8.6976953642285e-07, + "loss": 1.503, + "step": 6450 + }, + { + "epoch": 0.2368121998606987, + "grad_norm": 2.1386714707947534, + "learning_rate": 8.693809821867517e-07, + "loss": 1.5282, + "step": 6460 + }, + { + "epoch": 0.23717878221342426, + "grad_norm": 2.1401411616284167, + "learning_rate": 8.689919362386104e-07, + "loss": 1.4949, + "step": 6470 + }, + { + "epoch": 0.23754536456614977, + "grad_norm": 1.956666297999974, + "learning_rate": 8.686023990963157e-07, + "loss": 1.4993, + "step": 6480 + }, + { + "epoch": 0.23791194691887532, + "grad_norm": 2.0257118859168672, + "learning_rate": 8.682123712784112e-07, + "loss": 1.5186, + "step": 6490 + }, + { + "epoch": 0.23827852927160087, + "grad_norm": 1.895169068962553, + "learning_rate": 8.678218533040937e-07, + "loss": 1.526, + "step": 6500 + }, + { + "epoch": 0.23864511162432642, + "grad_norm": 6.529056788123207, + "learning_rate": 8.67430845693212e-07, + "loss": 1.4975, + "step": 6510 + }, + { + "epoch": 0.23901169397705194, + "grad_norm": 2.078820041783562, + "learning_rate": 8.670393489662673e-07, + "loss": 1.5147, + "step": 6520 + }, + { + "epoch": 0.23937827632977748, + "grad_norm": 2.313941233193865, + "learning_rate": 8.666473636444116e-07, + "loss": 1.5103, + "step": 6530 + }, + { + "epoch": 0.23974485868250303, + "grad_norm": 2.204068052979437, + "learning_rate": 8.662548902494473e-07, + "loss": 1.5197, + "step": 6540 + }, + { + "epoch": 0.24011144103522855, + "grad_norm": 2.6677538134182033, + "learning_rate": 8.658619293038265e-07, + "loss": 1.4539, + "step": 6550 + }, + { + "epoch": 0.2404780233879541, + "grad_norm": 2.1826711924398876, + "learning_rate": 8.654684813306508e-07, + "loss": 1.4569, + "step": 6560 + }, + { + "epoch": 0.24084460574067965, + "grad_norm": 2.4513733249404037, + "learning_rate": 8.650745468536691e-07, + "loss": 1.472, + "step": 6570 + }, + { + "epoch": 0.2412111880934052, + "grad_norm": 1.9341316559705668, + "learning_rate": 8.64680126397279e-07, + "loss": 1.5128, + "step": 6580 + }, + { + "epoch": 0.2415777704461307, + "grad_norm": 2.2183441842361753, + "learning_rate": 8.642852204865243e-07, + "loss": 1.5409, + "step": 6590 + }, + { + "epoch": 0.24194435279885626, + "grad_norm": 2.270638521627112, + "learning_rate": 8.638898296470953e-07, + "loss": 1.4992, + "step": 6600 + }, + { + "epoch": 0.2423109351515818, + "grad_norm": 2.6732843475957146, + "learning_rate": 8.634939544053279e-07, + "loss": 1.5335, + "step": 6610 + }, + { + "epoch": 0.24267751750430736, + "grad_norm": 1.9291920434342291, + "learning_rate": 8.630975952882026e-07, + "loss": 1.4627, + "step": 6620 + }, + { + "epoch": 0.24304409985703287, + "grad_norm": 2.05169281240212, + "learning_rate": 8.627007528233445e-07, + "loss": 1.5257, + "step": 6630 + }, + { + "epoch": 0.24341068220975842, + "grad_norm": 2.42497111676382, + "learning_rate": 8.623034275390214e-07, + "loss": 1.5445, + "step": 6640 + }, + { + "epoch": 0.24377726456248397, + "grad_norm": 2.1919485638499903, + "learning_rate": 8.619056199641444e-07, + "loss": 1.5115, + "step": 6650 + }, + { + "epoch": 0.2441438469152095, + "grad_norm": 2.3664261903908343, + "learning_rate": 8.615073306282663e-07, + "loss": 1.4846, + "step": 6660 + }, + { + "epoch": 0.24451042926793504, + "grad_norm": 2.7278440906317387, + "learning_rate": 8.611085600615812e-07, + "loss": 1.5419, + "step": 6670 + }, + { + "epoch": 0.24487701162066058, + "grad_norm": 2.326361941668607, + "learning_rate": 8.607093087949244e-07, + "loss": 1.5447, + "step": 6680 + }, + { + "epoch": 0.24524359397338613, + "grad_norm": 2.101465809666948, + "learning_rate": 8.603095773597702e-07, + "loss": 1.5147, + "step": 6690 + }, + { + "epoch": 0.24561017632611165, + "grad_norm": 2.121131443755951, + "learning_rate": 8.599093662882326e-07, + "loss": 1.5046, + "step": 6700 + }, + { + "epoch": 0.2459767586788372, + "grad_norm": 2.004374535392673, + "learning_rate": 8.595086761130641e-07, + "loss": 1.5104, + "step": 6710 + }, + { + "epoch": 0.24634334103156275, + "grad_norm": 2.330571487353144, + "learning_rate": 8.591075073676548e-07, + "loss": 1.489, + "step": 6720 + }, + { + "epoch": 0.2467099233842883, + "grad_norm": 1.954097712061658, + "learning_rate": 8.587058605860319e-07, + "loss": 1.4628, + "step": 6730 + }, + { + "epoch": 0.2470765057370138, + "grad_norm": 2.287871494329092, + "learning_rate": 8.583037363028591e-07, + "loss": 1.4966, + "step": 6740 + }, + { + "epoch": 0.24744308808973936, + "grad_norm": 2.2507921472351837, + "learning_rate": 8.579011350534355e-07, + "loss": 1.5148, + "step": 6750 + }, + { + "epoch": 0.2478096704424649, + "grad_norm": 2.2811051866364034, + "learning_rate": 8.574980573736951e-07, + "loss": 1.5123, + "step": 6760 + }, + { + "epoch": 0.24817625279519043, + "grad_norm": 2.0762345472822106, + "learning_rate": 8.570945038002066e-07, + "loss": 1.5538, + "step": 6770 + }, + { + "epoch": 0.24854283514791597, + "grad_norm": 2.0481616873032618, + "learning_rate": 8.566904748701718e-07, + "loss": 1.5162, + "step": 6780 + }, + { + "epoch": 0.24890941750064152, + "grad_norm": 1.977911548805274, + "learning_rate": 8.562859711214252e-07, + "loss": 1.4945, + "step": 6790 + }, + { + "epoch": 0.24927599985336707, + "grad_norm": 2.166946374211255, + "learning_rate": 8.558809930924336e-07, + "loss": 1.5143, + "step": 6800 + }, + { + "epoch": 0.2496425822060926, + "grad_norm": 2.265635068798512, + "learning_rate": 8.554755413222952e-07, + "loss": 1.5079, + "step": 6810 + }, + { + "epoch": 0.25000916455881816, + "grad_norm": 2.376856602321205, + "learning_rate": 8.550696163507384e-07, + "loss": 1.5187, + "step": 6820 + }, + { + "epoch": 0.2503757469115437, + "grad_norm": 2.329411952961872, + "learning_rate": 8.54663218718122e-07, + "loss": 1.4985, + "step": 6830 + }, + { + "epoch": 0.2507423292642692, + "grad_norm": 2.127867609490789, + "learning_rate": 8.542563489654337e-07, + "loss": 1.5249, + "step": 6840 + }, + { + "epoch": 0.2511089116169948, + "grad_norm": 2.3846188422530545, + "learning_rate": 8.5384900763429e-07, + "loss": 1.5157, + "step": 6850 + }, + { + "epoch": 0.2514754939697203, + "grad_norm": 1.9837481727043949, + "learning_rate": 8.534411952669348e-07, + "loss": 1.5185, + "step": 6860 + }, + { + "epoch": 0.2518420763224458, + "grad_norm": 2.0300743472877776, + "learning_rate": 8.530329124062392e-07, + "loss": 1.4726, + "step": 6870 + }, + { + "epoch": 0.2522086586751714, + "grad_norm": 3.41153757527899, + "learning_rate": 8.526241595957007e-07, + "loss": 1.482, + "step": 6880 + }, + { + "epoch": 0.2525752410278969, + "grad_norm": 2.7170854102243043, + "learning_rate": 8.52214937379442e-07, + "loss": 1.4518, + "step": 6890 + }, + { + "epoch": 0.25294182338062243, + "grad_norm": 2.5040883653748294, + "learning_rate": 8.518052463022112e-07, + "loss": 1.4506, + "step": 6900 + }, + { + "epoch": 0.253308405733348, + "grad_norm": 2.1362380301717807, + "learning_rate": 8.513950869093802e-07, + "loss": 1.4975, + "step": 6910 + }, + { + "epoch": 0.2536749880860735, + "grad_norm": 56.61497948468882, + "learning_rate": 8.509844597469442e-07, + "loss": 1.5211, + "step": 6920 + }, + { + "epoch": 0.2540415704387991, + "grad_norm": 2.161248343347086, + "learning_rate": 8.505733653615217e-07, + "loss": 1.5123, + "step": 6930 + }, + { + "epoch": 0.2544081527915246, + "grad_norm": 2.197831076147601, + "learning_rate": 8.501618043003522e-07, + "loss": 1.4735, + "step": 6940 + }, + { + "epoch": 0.25477473514425014, + "grad_norm": 2.730731478650521, + "learning_rate": 8.497497771112975e-07, + "loss": 1.5154, + "step": 6950 + }, + { + "epoch": 0.2551413174969757, + "grad_norm": 2.625261843658038, + "learning_rate": 8.49337284342839e-07, + "loss": 1.4642, + "step": 6960 + }, + { + "epoch": 0.25550789984970124, + "grad_norm": 3.6302229703502302, + "learning_rate": 8.489243265440785e-07, + "loss": 1.4339, + "step": 6970 + }, + { + "epoch": 0.25587448220242676, + "grad_norm": 2.2912655831406408, + "learning_rate": 8.485109042647361e-07, + "loss": 1.5021, + "step": 6980 + }, + { + "epoch": 0.25624106455515233, + "grad_norm": 8.005970124630041, + "learning_rate": 8.48097018055151e-07, + "loss": 1.4777, + "step": 6990 + }, + { + "epoch": 0.25660764690787785, + "grad_norm": 2.2515437376163097, + "learning_rate": 8.476826684662797e-07, + "loss": 1.5096, + "step": 7000 + }, + { + "epoch": 0.25660764690787785, + "eval_accuracy": 0.6611285662580546, + "eval_loss": 1.4870213270187378, + "eval_runtime": 310.8369, + "eval_samples_per_second": 10.639, + "eval_steps_per_second": 0.888, + "step": 7000 + }, + { + "epoch": 0.25697422926060337, + "grad_norm": 2.531506922529387, + "learning_rate": 8.472678560496955e-07, + "loss": 1.4718, + "step": 7010 + }, + { + "epoch": 0.25734081161332895, + "grad_norm": 2.6738422568666778, + "learning_rate": 8.468525813575875e-07, + "loss": 1.4849, + "step": 7020 + }, + { + "epoch": 0.25770739396605447, + "grad_norm": 2.3045631257315256, + "learning_rate": 8.464368449427608e-07, + "loss": 1.3982, + "step": 7030 + }, + { + "epoch": 0.25807397631878004, + "grad_norm": 2.3127941331475586, + "learning_rate": 8.460206473586347e-07, + "loss": 1.4584, + "step": 7040 + }, + { + "epoch": 0.25844055867150556, + "grad_norm": 2.624025522294039, + "learning_rate": 8.456039891592424e-07, + "loss": 1.5064, + "step": 7050 + }, + { + "epoch": 0.2588071410242311, + "grad_norm": 2.4392755048359906, + "learning_rate": 8.451868708992305e-07, + "loss": 1.4744, + "step": 7060 + }, + { + "epoch": 0.25917372337695666, + "grad_norm": 2.244873049339989, + "learning_rate": 8.447692931338577e-07, + "loss": 1.4866, + "step": 7070 + }, + { + "epoch": 0.2595403057296822, + "grad_norm": 2.7693601328533846, + "learning_rate": 8.443512564189947e-07, + "loss": 1.4264, + "step": 7080 + }, + { + "epoch": 0.2599068880824077, + "grad_norm": 2.18123288795935, + "learning_rate": 8.439327613111231e-07, + "loss": 1.4487, + "step": 7090 + }, + { + "epoch": 0.26027347043513327, + "grad_norm": 2.770780437192883, + "learning_rate": 8.435138083673343e-07, + "loss": 1.5298, + "step": 7100 + }, + { + "epoch": 0.2606400527878588, + "grad_norm": 2.2581904540642737, + "learning_rate": 8.430943981453298e-07, + "loss": 1.4801, + "step": 7110 + }, + { + "epoch": 0.2610066351405843, + "grad_norm": 2.3222299759291674, + "learning_rate": 8.426745312034192e-07, + "loss": 1.4896, + "step": 7120 + }, + { + "epoch": 0.2613732174933099, + "grad_norm": 2.0280868196158908, + "learning_rate": 8.422542081005209e-07, + "loss": 1.4466, + "step": 7130 + }, + { + "epoch": 0.2617397998460354, + "grad_norm": 2.224282133830904, + "learning_rate": 8.418334293961593e-07, + "loss": 1.5286, + "step": 7140 + }, + { + "epoch": 0.262106382198761, + "grad_norm": 2.223919368251033, + "learning_rate": 8.414121956504665e-07, + "loss": 1.5043, + "step": 7150 + }, + { + "epoch": 0.2624729645514865, + "grad_norm": 2.505467964910925, + "learning_rate": 8.409905074241796e-07, + "loss": 1.4781, + "step": 7160 + }, + { + "epoch": 0.262839546904212, + "grad_norm": 2.0986445187287077, + "learning_rate": 8.405683652786411e-07, + "loss": 1.4804, + "step": 7170 + }, + { + "epoch": 0.2632061292569376, + "grad_norm": 2.490412539205642, + "learning_rate": 8.401457697757972e-07, + "loss": 1.518, + "step": 7180 + }, + { + "epoch": 0.2635727116096631, + "grad_norm": 2.6915376209294917, + "learning_rate": 8.397227214781983e-07, + "loss": 1.4812, + "step": 7190 + }, + { + "epoch": 0.26393929396238863, + "grad_norm": 2.3046153435535235, + "learning_rate": 8.392992209489973e-07, + "loss": 1.5159, + "step": 7200 + }, + { + "epoch": 0.2643058763151142, + "grad_norm": 2.508127660367551, + "learning_rate": 8.388752687519489e-07, + "loss": 1.4451, + "step": 7210 + }, + { + "epoch": 0.2646724586678397, + "grad_norm": 3.1862145718553245, + "learning_rate": 8.384508654514091e-07, + "loss": 1.4609, + "step": 7220 + }, + { + "epoch": 0.26503904102056525, + "grad_norm": 2.5580838478505803, + "learning_rate": 8.380260116123343e-07, + "loss": 1.4331, + "step": 7230 + }, + { + "epoch": 0.2654056233732908, + "grad_norm": 2.257862509636175, + "learning_rate": 8.376007078002813e-07, + "loss": 1.45, + "step": 7240 + }, + { + "epoch": 0.26577220572601634, + "grad_norm": 2.288080123372639, + "learning_rate": 8.371749545814051e-07, + "loss": 1.4389, + "step": 7250 + }, + { + "epoch": 0.2661387880787419, + "grad_norm": 2.396647723381076, + "learning_rate": 8.367487525224592e-07, + "loss": 1.4366, + "step": 7260 + }, + { + "epoch": 0.26650537043146744, + "grad_norm": 2.2979084143372868, + "learning_rate": 8.363221021907949e-07, + "loss": 1.4818, + "step": 7270 + }, + { + "epoch": 0.26687195278419296, + "grad_norm": 2.1808515998354694, + "learning_rate": 8.358950041543598e-07, + "loss": 1.4542, + "step": 7280 + }, + { + "epoch": 0.26723853513691853, + "grad_norm": 2.230268806261455, + "learning_rate": 8.354674589816977e-07, + "loss": 1.4329, + "step": 7290 + }, + { + "epoch": 0.26760511748964405, + "grad_norm": 2.927648869466954, + "learning_rate": 8.350394672419474e-07, + "loss": 1.5225, + "step": 7300 + }, + { + "epoch": 0.26797169984236957, + "grad_norm": 2.112114910370922, + "learning_rate": 8.346110295048425e-07, + "loss": 1.4225, + "step": 7310 + }, + { + "epoch": 0.26833828219509515, + "grad_norm": 2.660467378126346, + "learning_rate": 8.341821463407101e-07, + "loss": 1.5031, + "step": 7320 + }, + { + "epoch": 0.26870486454782067, + "grad_norm": 3.003354330326063, + "learning_rate": 8.337528183204704e-07, + "loss": 1.4707, + "step": 7330 + }, + { + "epoch": 0.2690714469005462, + "grad_norm": 2.623779251977545, + "learning_rate": 8.333230460156355e-07, + "loss": 1.4794, + "step": 7340 + }, + { + "epoch": 0.26943802925327176, + "grad_norm": 3.101895766048754, + "learning_rate": 8.32892829998309e-07, + "loss": 1.4667, + "step": 7350 + }, + { + "epoch": 0.2698046116059973, + "grad_norm": 2.960369047027641, + "learning_rate": 8.324621708411854e-07, + "loss": 1.5522, + "step": 7360 + }, + { + "epoch": 0.2701711939587228, + "grad_norm": 2.524100342925903, + "learning_rate": 8.320310691175489e-07, + "loss": 1.4526, + "step": 7370 + }, + { + "epoch": 0.2705377763114484, + "grad_norm": 2.62363195310582, + "learning_rate": 8.315995254012726e-07, + "loss": 1.4018, + "step": 7380 + }, + { + "epoch": 0.2709043586641739, + "grad_norm": 1.9920146887682115, + "learning_rate": 8.311675402668188e-07, + "loss": 1.3965, + "step": 7390 + }, + { + "epoch": 0.27127094101689947, + "grad_norm": 2.18110821192289, + "learning_rate": 8.307351142892364e-07, + "loss": 1.4842, + "step": 7400 + }, + { + "epoch": 0.271637523369625, + "grad_norm": 2.2188567896520497, + "learning_rate": 8.303022480441617e-07, + "loss": 1.4159, + "step": 7410 + }, + { + "epoch": 0.2720041057223505, + "grad_norm": 2.858166839750072, + "learning_rate": 8.298689421078171e-07, + "loss": 1.3954, + "step": 7420 + }, + { + "epoch": 0.2723706880750761, + "grad_norm": 2.740212521082454, + "learning_rate": 8.294351970570099e-07, + "loss": 1.4861, + "step": 7430 + }, + { + "epoch": 0.2727372704278016, + "grad_norm": 3.419233012340433, + "learning_rate": 8.290010134691326e-07, + "loss": 1.4824, + "step": 7440 + }, + { + "epoch": 0.2731038527805271, + "grad_norm": 2.4809215592986966, + "learning_rate": 8.285663919221606e-07, + "loss": 1.4938, + "step": 7450 + }, + { + "epoch": 0.2734704351332527, + "grad_norm": 2.607478119047904, + "learning_rate": 8.281313329946531e-07, + "loss": 1.419, + "step": 7460 + }, + { + "epoch": 0.2738370174859782, + "grad_norm": 2.8279213303777753, + "learning_rate": 8.276958372657512e-07, + "loss": 1.4801, + "step": 7470 + }, + { + "epoch": 0.27420359983870374, + "grad_norm": 2.585541966605194, + "learning_rate": 8.272599053151774e-07, + "loss": 1.4154, + "step": 7480 + }, + { + "epoch": 0.2745701821914293, + "grad_norm": 2.7236239018595336, + "learning_rate": 8.268235377232351e-07, + "loss": 1.4741, + "step": 7490 + }, + { + "epoch": 0.27493676454415483, + "grad_norm": 2.2739375571211844, + "learning_rate": 8.263867350708072e-07, + "loss": 1.4447, + "step": 7500 + }, + { + "epoch": 0.2753033468968804, + "grad_norm": 2.936703619541737, + "learning_rate": 8.259494979393562e-07, + "loss": 1.4811, + "step": 7510 + }, + { + "epoch": 0.2756699292496059, + "grad_norm": 2.644051786280347, + "learning_rate": 8.255118269109229e-07, + "loss": 1.4359, + "step": 7520 + }, + { + "epoch": 0.27603651160233145, + "grad_norm": 2.814370164816269, + "learning_rate": 8.250737225681254e-07, + "loss": 1.4697, + "step": 7530 + }, + { + "epoch": 0.276403093955057, + "grad_norm": 2.7487477516640664, + "learning_rate": 8.246351854941589e-07, + "loss": 1.4677, + "step": 7540 + }, + { + "epoch": 0.27676967630778254, + "grad_norm": 2.7840690479403807, + "learning_rate": 8.241962162727946e-07, + "loss": 1.462, + "step": 7550 + }, + { + "epoch": 0.27713625866050806, + "grad_norm": 2.9784690105392366, + "learning_rate": 8.237568154883788e-07, + "loss": 1.4439, + "step": 7560 + }, + { + "epoch": 0.27750284101323364, + "grad_norm": 2.8948634927350105, + "learning_rate": 8.233169837258325e-07, + "loss": 1.4705, + "step": 7570 + }, + { + "epoch": 0.27786942336595916, + "grad_norm": 2.612491147603324, + "learning_rate": 8.228767215706503e-07, + "loss": 1.467, + "step": 7580 + }, + { + "epoch": 0.2782360057186847, + "grad_norm": 2.8002040163179736, + "learning_rate": 8.224360296088995e-07, + "loss": 1.4573, + "step": 7590 + }, + { + "epoch": 0.27860258807141025, + "grad_norm": 2.8029823959562155, + "learning_rate": 8.219949084272201e-07, + "loss": 1.4804, + "step": 7600 + }, + { + "epoch": 0.27896917042413577, + "grad_norm": 2.6888372781846375, + "learning_rate": 8.21553358612823e-07, + "loss": 1.4633, + "step": 7610 + }, + { + "epoch": 0.27933575277686135, + "grad_norm": 2.279721839418087, + "learning_rate": 8.2111138075349e-07, + "loss": 1.4713, + "step": 7620 + }, + { + "epoch": 0.27970233512958687, + "grad_norm": 2.3829035564919807, + "learning_rate": 8.206689754375724e-07, + "loss": 1.4387, + "step": 7630 + }, + { + "epoch": 0.2800689174823124, + "grad_norm": 3.7962407630882384, + "learning_rate": 8.202261432539907e-07, + "loss": 1.4025, + "step": 7640 + }, + { + "epoch": 0.28043549983503796, + "grad_norm": 2.797043930833034, + "learning_rate": 8.197828847922337e-07, + "loss": 1.4576, + "step": 7650 + }, + { + "epoch": 0.2808020821877635, + "grad_norm": 3.256545613051792, + "learning_rate": 8.193392006423574e-07, + "loss": 1.432, + "step": 7660 + }, + { + "epoch": 0.281168664540489, + "grad_norm": 2.432668523438971, + "learning_rate": 8.188950913949848e-07, + "loss": 1.456, + "step": 7670 + }, + { + "epoch": 0.2815352468932146, + "grad_norm": 2.4546993774133856, + "learning_rate": 8.184505576413043e-07, + "loss": 1.392, + "step": 7680 + }, + { + "epoch": 0.2819018292459401, + "grad_norm": 3.0030506631971776, + "learning_rate": 8.180055999730702e-07, + "loss": 1.365, + "step": 7690 + }, + { + "epoch": 0.2822684115986656, + "grad_norm": 2.9439493487762465, + "learning_rate": 8.175602189826001e-07, + "loss": 1.4292, + "step": 7700 + }, + { + "epoch": 0.2826349939513912, + "grad_norm": 2.620909787731563, + "learning_rate": 8.171144152627761e-07, + "loss": 1.4251, + "step": 7710 + }, + { + "epoch": 0.2830015763041167, + "grad_norm": 3.263683256322055, + "learning_rate": 8.16668189407042e-07, + "loss": 1.3899, + "step": 7720 + }, + { + "epoch": 0.2833681586568423, + "grad_norm": 2.5437523385064953, + "learning_rate": 8.162215420094045e-07, + "loss": 1.3683, + "step": 7730 + }, + { + "epoch": 0.2837347410095678, + "grad_norm": 2.4580551613838844, + "learning_rate": 8.15774473664431e-07, + "loss": 1.3732, + "step": 7740 + }, + { + "epoch": 0.2841013233622933, + "grad_norm": 2.8279077970597184, + "learning_rate": 8.153269849672493e-07, + "loss": 1.419, + "step": 7750 + }, + { + "epoch": 0.2844679057150189, + "grad_norm": 3.041958703900493, + "learning_rate": 8.148790765135465e-07, + "loss": 1.4356, + "step": 7760 + }, + { + "epoch": 0.2848344880677444, + "grad_norm": 2.4582661578514426, + "learning_rate": 8.144307488995689e-07, + "loss": 1.4378, + "step": 7770 + }, + { + "epoch": 0.28520107042046994, + "grad_norm": 2.8361019596271726, + "learning_rate": 8.139820027221208e-07, + "loss": 1.4111, + "step": 7780 + }, + { + "epoch": 0.2855676527731955, + "grad_norm": 2.4415137770737427, + "learning_rate": 8.135328385785631e-07, + "loss": 1.4996, + "step": 7790 + }, + { + "epoch": 0.28593423512592103, + "grad_norm": 2.1392002967653094, + "learning_rate": 8.130832570668139e-07, + "loss": 1.433, + "step": 7800 + }, + { + "epoch": 0.28630081747864655, + "grad_norm": 3.061322031102369, + "learning_rate": 8.126332587853462e-07, + "loss": 1.4051, + "step": 7810 + }, + { + "epoch": 0.2866673998313721, + "grad_norm": 3.2748819767509354, + "learning_rate": 8.12182844333188e-07, + "loss": 1.3863, + "step": 7820 + }, + { + "epoch": 0.28703398218409765, + "grad_norm": 3.1866933217967603, + "learning_rate": 8.117320143099216e-07, + "loss": 1.4173, + "step": 7830 + }, + { + "epoch": 0.2874005645368232, + "grad_norm": 2.9290211285749175, + "learning_rate": 8.11280769315682e-07, + "loss": 1.4395, + "step": 7840 + }, + { + "epoch": 0.28776714688954874, + "grad_norm": 2.7212160772193474, + "learning_rate": 8.108291099511571e-07, + "loss": 1.4503, + "step": 7850 + }, + { + "epoch": 0.28813372924227426, + "grad_norm": 2.3892746869258317, + "learning_rate": 8.10377036817586e-07, + "loss": 1.4368, + "step": 7860 + }, + { + "epoch": 0.28850031159499984, + "grad_norm": 3.4107926691510277, + "learning_rate": 8.099245505167589e-07, + "loss": 1.4623, + "step": 7870 + }, + { + "epoch": 0.28886689394772536, + "grad_norm": 3.1259277735027307, + "learning_rate": 8.094716516510156e-07, + "loss": 1.4412, + "step": 7880 + }, + { + "epoch": 0.2892334763004509, + "grad_norm": 2.9135343767151154, + "learning_rate": 8.090183408232459e-07, + "loss": 1.4187, + "step": 7890 + }, + { + "epoch": 0.28960005865317645, + "grad_norm": 3.30617041516701, + "learning_rate": 8.085646186368867e-07, + "loss": 1.4176, + "step": 7900 + }, + { + "epoch": 0.28996664100590197, + "grad_norm": 3.1801194693312556, + "learning_rate": 8.081104856959238e-07, + "loss": 1.4534, + "step": 7910 + }, + { + "epoch": 0.2903332233586275, + "grad_norm": 3.2431476470574983, + "learning_rate": 8.07655942604889e-07, + "loss": 1.3469, + "step": 7920 + }, + { + "epoch": 0.29069980571135307, + "grad_norm": 3.1005913247685237, + "learning_rate": 8.072009899688605e-07, + "loss": 1.417, + "step": 7930 + }, + { + "epoch": 0.2910663880640786, + "grad_norm": 2.953054099149132, + "learning_rate": 8.067456283934614e-07, + "loss": 1.4252, + "step": 7940 + }, + { + "epoch": 0.29143297041680416, + "grad_norm": 2.6363992565855803, + "learning_rate": 8.062898584848592e-07, + "loss": 1.4499, + "step": 7950 + }, + { + "epoch": 0.2917995527695297, + "grad_norm": 2.7290690238502635, + "learning_rate": 8.05833680849765e-07, + "loss": 1.4716, + "step": 7960 + }, + { + "epoch": 0.2921661351222552, + "grad_norm": 3.21591143424738, + "learning_rate": 8.053770960954328e-07, + "loss": 1.3969, + "step": 7970 + }, + { + "epoch": 0.2925327174749808, + "grad_norm": 3.8732639515812575, + "learning_rate": 8.049201048296585e-07, + "loss": 1.463, + "step": 7980 + }, + { + "epoch": 0.2928992998277063, + "grad_norm": 2.9966394441630126, + "learning_rate": 8.044627076607789e-07, + "loss": 1.4545, + "step": 7990 + }, + { + "epoch": 0.2932658821804318, + "grad_norm": 3.1577560282041017, + "learning_rate": 8.040049051976713e-07, + "loss": 1.4682, + "step": 8000 + }, + { + "epoch": 0.2932658821804318, + "eval_accuracy": 0.6739903313977985, + "eval_loss": 1.4271955490112305, + "eval_runtime": 311.2156, + "eval_samples_per_second": 10.626, + "eval_steps_per_second": 0.887, + "step": 8000 + }, + { + "epoch": 0.2936324645331574, + "grad_norm": 2.957786000444244, + "learning_rate": 8.035466980497526e-07, + "loss": 1.4592, + "step": 8010 + }, + { + "epoch": 0.2939990468858829, + "grad_norm": 2.765279941343725, + "learning_rate": 8.030880868269785e-07, + "loss": 1.4404, + "step": 8020 + }, + { + "epoch": 0.29436562923860843, + "grad_norm": 2.803405395861366, + "learning_rate": 8.026290721398421e-07, + "loss": 1.3642, + "step": 8030 + }, + { + "epoch": 0.294732211591334, + "grad_norm": 3.134947642226663, + "learning_rate": 8.02169654599374e-07, + "loss": 1.4662, + "step": 8040 + }, + { + "epoch": 0.2950987939440595, + "grad_norm": 3.3888445829207923, + "learning_rate": 8.017098348171411e-07, + "loss": 1.4092, + "step": 8050 + }, + { + "epoch": 0.2954653762967851, + "grad_norm": 2.595961601811049, + "learning_rate": 8.012496134052457e-07, + "loss": 1.3772, + "step": 8060 + }, + { + "epoch": 0.2958319586495106, + "grad_norm": 3.724884065568925, + "learning_rate": 8.007889909763246e-07, + "loss": 1.3862, + "step": 8070 + }, + { + "epoch": 0.29619854100223614, + "grad_norm": 3.6608857589920754, + "learning_rate": 8.003279681435482e-07, + "loss": 1.444, + "step": 8080 + }, + { + "epoch": 0.2965651233549617, + "grad_norm": 2.7154240671865213, + "learning_rate": 7.998665455206206e-07, + "loss": 1.4285, + "step": 8090 + }, + { + "epoch": 0.29693170570768723, + "grad_norm": 2.7151538150939927, + "learning_rate": 7.994047237217776e-07, + "loss": 1.4489, + "step": 8100 + }, + { + "epoch": 0.29729828806041275, + "grad_norm": 2.9729575587995742, + "learning_rate": 7.989425033617863e-07, + "loss": 1.4289, + "step": 8110 + }, + { + "epoch": 0.2976648704131383, + "grad_norm": 3.298808013574498, + "learning_rate": 7.984798850559447e-07, + "loss": 1.4607, + "step": 8120 + }, + { + "epoch": 0.29803145276586385, + "grad_norm": 3.1491445672684866, + "learning_rate": 7.980168694200804e-07, + "loss": 1.4097, + "step": 8130 + }, + { + "epoch": 0.29839803511858937, + "grad_norm": 3.6399703354293007, + "learning_rate": 7.975534570705497e-07, + "loss": 1.3743, + "step": 8140 + }, + { + "epoch": 0.29876461747131494, + "grad_norm": 3.2547493183004974, + "learning_rate": 7.970896486242374e-07, + "loss": 1.4346, + "step": 8150 + }, + { + "epoch": 0.29913119982404046, + "grad_norm": 3.421650269839234, + "learning_rate": 7.966254446985553e-07, + "loss": 1.43, + "step": 8160 + }, + { + "epoch": 0.29949778217676604, + "grad_norm": 3.797293850962011, + "learning_rate": 7.961608459114416e-07, + "loss": 1.4651, + "step": 8170 + }, + { + "epoch": 0.29986436452949156, + "grad_norm": 3.5920498224364508, + "learning_rate": 7.956958528813604e-07, + "loss": 1.3738, + "step": 8180 + }, + { + "epoch": 0.3002309468822171, + "grad_norm": 3.238482918382144, + "learning_rate": 7.952304662273003e-07, + "loss": 1.3987, + "step": 8190 + }, + { + "epoch": 0.30059752923494265, + "grad_norm": 2.7498611423368176, + "learning_rate": 7.947646865687742e-07, + "loss": 1.4181, + "step": 8200 + }, + { + "epoch": 0.30096411158766817, + "grad_norm": 4.031428344222072, + "learning_rate": 7.942985145258179e-07, + "loss": 1.4294, + "step": 8210 + }, + { + "epoch": 0.3013306939403937, + "grad_norm": 2.643218639323195, + "learning_rate": 7.938319507189894e-07, + "loss": 1.4302, + "step": 8220 + }, + { + "epoch": 0.30169727629311927, + "grad_norm": 3.1275133100531227, + "learning_rate": 7.933649957693689e-07, + "loss": 1.348, + "step": 8230 + }, + { + "epoch": 0.3020638586458448, + "grad_norm": 3.521399879217592, + "learning_rate": 7.928976502985565e-07, + "loss": 1.4328, + "step": 8240 + }, + { + "epoch": 0.3024304409985703, + "grad_norm": 3.1834120547065665, + "learning_rate": 7.924299149286725e-07, + "loss": 1.4742, + "step": 8250 + }, + { + "epoch": 0.3027970233512959, + "grad_norm": 3.631213709741295, + "learning_rate": 7.919617902823563e-07, + "loss": 1.4068, + "step": 8260 + }, + { + "epoch": 0.3031636057040214, + "grad_norm": 2.726938578010126, + "learning_rate": 7.914932769827653e-07, + "loss": 1.4359, + "step": 8270 + }, + { + "epoch": 0.303530188056747, + "grad_norm": 3.7017959652425882, + "learning_rate": 7.910243756535744e-07, + "loss": 1.3344, + "step": 8280 + }, + { + "epoch": 0.3038967704094725, + "grad_norm": 3.3417669291832066, + "learning_rate": 7.90555086918975e-07, + "loss": 1.4121, + "step": 8290 + }, + { + "epoch": 0.304263352762198, + "grad_norm": 2.733351967687222, + "learning_rate": 7.900854114036743e-07, + "loss": 1.3732, + "step": 8300 + }, + { + "epoch": 0.3046299351149236, + "grad_norm": 3.1756478835337476, + "learning_rate": 7.89615349732894e-07, + "loss": 1.4007, + "step": 8310 + }, + { + "epoch": 0.3049965174676491, + "grad_norm": 3.238758242953075, + "learning_rate": 7.891449025323703e-07, + "loss": 1.4288, + "step": 8320 + }, + { + "epoch": 0.30536309982037463, + "grad_norm": 2.6053607033892043, + "learning_rate": 7.886740704283525e-07, + "loss": 1.4156, + "step": 8330 + }, + { + "epoch": 0.3057296821731002, + "grad_norm": 3.4053915363354417, + "learning_rate": 7.88202854047602e-07, + "loss": 1.3763, + "step": 8340 + }, + { + "epoch": 0.3060962645258257, + "grad_norm": 3.715425460301463, + "learning_rate": 7.877312540173922e-07, + "loss": 1.4036, + "step": 8350 + }, + { + "epoch": 0.30646284687855124, + "grad_norm": 2.9427971805533697, + "learning_rate": 7.872592709655066e-07, + "loss": 1.4385, + "step": 8360 + }, + { + "epoch": 0.3068294292312768, + "grad_norm": 3.5845846532616426, + "learning_rate": 7.867869055202392e-07, + "loss": 1.415, + "step": 8370 + }, + { + "epoch": 0.30719601158400234, + "grad_norm": 3.331222139254396, + "learning_rate": 7.863141583103927e-07, + "loss": 1.4126, + "step": 8380 + }, + { + "epoch": 0.3075625939367279, + "grad_norm": 3.1984388430808406, + "learning_rate": 7.85841029965278e-07, + "loss": 1.3826, + "step": 8390 + }, + { + "epoch": 0.30792917628945343, + "grad_norm": 3.1255012278404615, + "learning_rate": 7.853675211147134e-07, + "loss": 1.383, + "step": 8400 + }, + { + "epoch": 0.30829575864217895, + "grad_norm": 3.329583698840508, + "learning_rate": 7.848936323890239e-07, + "loss": 1.3931, + "step": 8410 + }, + { + "epoch": 0.3086623409949045, + "grad_norm": 3.9347250968462055, + "learning_rate": 7.844193644190396e-07, + "loss": 1.415, + "step": 8420 + }, + { + "epoch": 0.30902892334763005, + "grad_norm": 4.137255951707039, + "learning_rate": 7.839447178360963e-07, + "loss": 1.3998, + "step": 8430 + }, + { + "epoch": 0.30939550570035557, + "grad_norm": 2.6794621566293917, + "learning_rate": 7.834696932720331e-07, + "loss": 1.4228, + "step": 8440 + }, + { + "epoch": 0.30976208805308114, + "grad_norm": 2.726588078339754, + "learning_rate": 7.829942913591925e-07, + "loss": 1.4486, + "step": 8450 + }, + { + "epoch": 0.31012867040580666, + "grad_norm": 3.6162463016794026, + "learning_rate": 7.825185127304194e-07, + "loss": 1.4051, + "step": 8460 + }, + { + "epoch": 0.3104952527585322, + "grad_norm": 2.910711368055256, + "learning_rate": 7.820423580190603e-07, + "loss": 1.41, + "step": 8470 + }, + { + "epoch": 0.31086183511125776, + "grad_norm": 4.136385316326493, + "learning_rate": 7.815658278589619e-07, + "loss": 1.3859, + "step": 8480 + }, + { + "epoch": 0.3112284174639833, + "grad_norm": 2.1538443576824404, + "learning_rate": 7.810889228844708e-07, + "loss": 1.4113, + "step": 8490 + }, + { + "epoch": 0.31159499981670885, + "grad_norm": 3.1055419264140727, + "learning_rate": 7.806116437304331e-07, + "loss": 1.4327, + "step": 8500 + }, + { + "epoch": 0.31196158216943437, + "grad_norm": 3.183052960747229, + "learning_rate": 7.801339910321922e-07, + "loss": 1.4179, + "step": 8510 + }, + { + "epoch": 0.3123281645221599, + "grad_norm": 4.6955784323633925, + "learning_rate": 7.796559654255894e-07, + "loss": 1.3961, + "step": 8520 + }, + { + "epoch": 0.31269474687488547, + "grad_norm": 3.227174794853267, + "learning_rate": 7.79177567546962e-07, + "loss": 1.4082, + "step": 8530 + }, + { + "epoch": 0.313061329227611, + "grad_norm": 2.8264595214995243, + "learning_rate": 7.78698798033143e-07, + "loss": 1.4136, + "step": 8540 + }, + { + "epoch": 0.3134279115803365, + "grad_norm": 3.7915043909577624, + "learning_rate": 7.782196575214601e-07, + "loss": 1.3758, + "step": 8550 + }, + { + "epoch": 0.3137944939330621, + "grad_norm": 4.070976938559408, + "learning_rate": 7.777401466497349e-07, + "loss": 1.3915, + "step": 8560 + }, + { + "epoch": 0.3141610762857876, + "grad_norm": 3.3538502722425916, + "learning_rate": 7.772602660562819e-07, + "loss": 1.3718, + "step": 8570 + }, + { + "epoch": 0.3145276586385131, + "grad_norm": 3.230342363406807, + "learning_rate": 7.767800163799081e-07, + "loss": 1.3408, + "step": 8580 + }, + { + "epoch": 0.3148942409912387, + "grad_norm": 3.6144160833487415, + "learning_rate": 7.762993982599113e-07, + "loss": 1.4296, + "step": 8590 + }, + { + "epoch": 0.3152608233439642, + "grad_norm": 3.1182771552970374, + "learning_rate": 7.758184123360803e-07, + "loss": 1.3858, + "step": 8600 + }, + { + "epoch": 0.3156274056966898, + "grad_norm": 3.5319206230022977, + "learning_rate": 7.75337059248693e-07, + "loss": 1.4342, + "step": 8610 + }, + { + "epoch": 0.3159939880494153, + "grad_norm": 4.327639493570607, + "learning_rate": 7.748553396385163e-07, + "loss": 1.3915, + "step": 8620 + }, + { + "epoch": 0.31636057040214083, + "grad_norm": 3.9982142503751326, + "learning_rate": 7.743732541468053e-07, + "loss": 1.363, + "step": 8630 + }, + { + "epoch": 0.3167271527548664, + "grad_norm": 2.8786530129074728, + "learning_rate": 7.738908034153015e-07, + "loss": 1.3589, + "step": 8640 + }, + { + "epoch": 0.3170937351075919, + "grad_norm": 4.4947342914569095, + "learning_rate": 7.734079880862333e-07, + "loss": 1.3506, + "step": 8650 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 3.1518608629753477, + "learning_rate": 7.729248088023139e-07, + "loss": 1.3847, + "step": 8660 + }, + { + "epoch": 0.317826899813043, + "grad_norm": 3.8964914548994534, + "learning_rate": 7.724412662067415e-07, + "loss": 1.3616, + "step": 8670 + }, + { + "epoch": 0.31819348216576854, + "grad_norm": 4.158332163473049, + "learning_rate": 7.719573609431971e-07, + "loss": 1.3477, + "step": 8680 + }, + { + "epoch": 0.31856006451849406, + "grad_norm": 5.31244346458908, + "learning_rate": 7.714730936558455e-07, + "loss": 1.3885, + "step": 8690 + }, + { + "epoch": 0.31892664687121963, + "grad_norm": 3.5750048314109946, + "learning_rate": 7.709884649893328e-07, + "loss": 1.3763, + "step": 8700 + }, + { + "epoch": 0.31929322922394515, + "grad_norm": 3.5013927398683444, + "learning_rate": 7.70503475588786e-07, + "loss": 1.3437, + "step": 8710 + }, + { + "epoch": 0.31965981157667067, + "grad_norm": 3.772854937898392, + "learning_rate": 7.700181260998131e-07, + "loss": 1.434, + "step": 8720 + }, + { + "epoch": 0.32002639392939625, + "grad_norm": 3.939247516045474, + "learning_rate": 7.695324171685004e-07, + "loss": 1.384, + "step": 8730 + }, + { + "epoch": 0.32039297628212177, + "grad_norm": 3.3160045433400334, + "learning_rate": 7.690463494414137e-07, + "loss": 1.3681, + "step": 8740 + }, + { + "epoch": 0.32075955863484734, + "grad_norm": 3.2760601494452533, + "learning_rate": 7.685599235655955e-07, + "loss": 1.3576, + "step": 8750 + }, + { + "epoch": 0.32112614098757286, + "grad_norm": 3.917398028616676, + "learning_rate": 7.680731401885658e-07, + "loss": 1.4109, + "step": 8760 + }, + { + "epoch": 0.3214927233402984, + "grad_norm": 4.3801775022523355, + "learning_rate": 7.675859999583202e-07, + "loss": 1.3688, + "step": 8770 + }, + { + "epoch": 0.32185930569302396, + "grad_norm": 3.52546033919284, + "learning_rate": 7.670985035233291e-07, + "loss": 1.3803, + "step": 8780 + }, + { + "epoch": 0.3222258880457495, + "grad_norm": 3.4568824402601925, + "learning_rate": 7.666106515325374e-07, + "loss": 1.3615, + "step": 8790 + }, + { + "epoch": 0.322592470398475, + "grad_norm": 2.7983015500958826, + "learning_rate": 7.661224446353634e-07, + "loss": 1.3767, + "step": 8800 + }, + { + "epoch": 0.32295905275120057, + "grad_norm": 3.4581919245368904, + "learning_rate": 7.656338834816976e-07, + "loss": 1.3768, + "step": 8810 + }, + { + "epoch": 0.3233256351039261, + "grad_norm": 3.7176544154346054, + "learning_rate": 7.651449687219018e-07, + "loss": 1.3312, + "step": 8820 + }, + { + "epoch": 0.3236922174566516, + "grad_norm": 3.6712040176600502, + "learning_rate": 7.646557010068091e-07, + "loss": 1.3981, + "step": 8830 + }, + { + "epoch": 0.3240587998093772, + "grad_norm": 2.8962404949789637, + "learning_rate": 7.641660809877222e-07, + "loss": 1.4085, + "step": 8840 + }, + { + "epoch": 0.3244253821621027, + "grad_norm": 5.2069626245172635, + "learning_rate": 7.636761093164126e-07, + "loss": 1.3489, + "step": 8850 + }, + { + "epoch": 0.3247919645148283, + "grad_norm": 3.3614052591604793, + "learning_rate": 7.631857866451204e-07, + "loss": 1.391, + "step": 8860 + }, + { + "epoch": 0.3251585468675538, + "grad_norm": 3.1183008582079417, + "learning_rate": 7.626951136265523e-07, + "loss": 1.3966, + "step": 8870 + }, + { + "epoch": 0.3255251292202793, + "grad_norm": 4.337276600886146, + "learning_rate": 7.622040909138818e-07, + "loss": 1.3566, + "step": 8880 + }, + { + "epoch": 0.3258917115730049, + "grad_norm": 4.083650404603487, + "learning_rate": 7.617127191607479e-07, + "loss": 1.3928, + "step": 8890 + }, + { + "epoch": 0.3262582939257304, + "grad_norm": 3.847428171873619, + "learning_rate": 7.612209990212543e-07, + "loss": 1.3259, + "step": 8900 + }, + { + "epoch": 0.32662487627845593, + "grad_norm": 3.2197146488177384, + "learning_rate": 7.607289311499678e-07, + "loss": 1.376, + "step": 8910 + }, + { + "epoch": 0.3269914586311815, + "grad_norm": 3.4983962191005373, + "learning_rate": 7.60236516201919e-07, + "loss": 1.3927, + "step": 8920 + }, + { + "epoch": 0.32735804098390703, + "grad_norm": 3.610610377134006, + "learning_rate": 7.597437548326002e-07, + "loss": 1.3792, + "step": 8930 + }, + { + "epoch": 0.32772462333663255, + "grad_norm": 5.095826376758547, + "learning_rate": 7.592506476979644e-07, + "loss": 1.358, + "step": 8940 + }, + { + "epoch": 0.3280912056893581, + "grad_norm": 3.3863305431901183, + "learning_rate": 7.587571954544254e-07, + "loss": 1.3983, + "step": 8950 + }, + { + "epoch": 0.32845778804208364, + "grad_norm": 3.5975350890244067, + "learning_rate": 7.582633987588563e-07, + "loss": 1.4057, + "step": 8960 + }, + { + "epoch": 0.3288243703948092, + "grad_norm": 3.848485096118636, + "learning_rate": 7.577692582685886e-07, + "loss": 1.3814, + "step": 8970 + }, + { + "epoch": 0.32919095274753474, + "grad_norm": 3.157404479059578, + "learning_rate": 7.572747746414117e-07, + "loss": 1.4095, + "step": 8980 + }, + { + "epoch": 0.32955753510026026, + "grad_norm": 4.1043127446716285, + "learning_rate": 7.567799485355715e-07, + "loss": 1.3755, + "step": 8990 + }, + { + "epoch": 0.32992411745298583, + "grad_norm": 3.7156219870736615, + "learning_rate": 7.562847806097696e-07, + "loss": 1.3526, + "step": 9000 + }, + { + "epoch": 0.32992411745298583, + "eval_accuracy": 0.688625248964108, + "eval_loss": 1.3686386346817017, + "eval_runtime": 311.2444, + "eval_samples_per_second": 10.625, + "eval_steps_per_second": 0.887, + "step": 9000 + }, + { + "epoch": 0.33029069980571135, + "grad_norm": 4.016168592808031, + "learning_rate": 7.557892715231634e-07, + "loss": 1.3607, + "step": 9010 + }, + { + "epoch": 0.33065728215843687, + "grad_norm": 3.504820069720998, + "learning_rate": 7.552934219353638e-07, + "loss": 1.3833, + "step": 9020 + }, + { + "epoch": 0.33102386451116245, + "grad_norm": 3.3563895186210875, + "learning_rate": 7.547972325064351e-07, + "loss": 1.393, + "step": 9030 + }, + { + "epoch": 0.33139044686388797, + "grad_norm": 3.401944814988902, + "learning_rate": 7.543007038968939e-07, + "loss": 1.3708, + "step": 9040 + }, + { + "epoch": 0.3317570292166135, + "grad_norm": 4.8917426491539935, + "learning_rate": 7.538038367677087e-07, + "loss": 1.329, + "step": 9050 + }, + { + "epoch": 0.33212361156933906, + "grad_norm": 4.014824315681244, + "learning_rate": 7.53306631780298e-07, + "loss": 1.3464, + "step": 9060 + }, + { + "epoch": 0.3324901939220646, + "grad_norm": 3.9395593086417637, + "learning_rate": 7.52809089596531e-07, + "loss": 1.4059, + "step": 9070 + }, + { + "epoch": 0.33285677627479016, + "grad_norm": 3.5141323515233274, + "learning_rate": 7.523112108787247e-07, + "loss": 1.3467, + "step": 9080 + }, + { + "epoch": 0.3332233586275157, + "grad_norm": 4.310837199551292, + "learning_rate": 7.518129962896448e-07, + "loss": 1.3432, + "step": 9090 + }, + { + "epoch": 0.3335899409802412, + "grad_norm": 4.049279934012434, + "learning_rate": 7.513144464925036e-07, + "loss": 1.4107, + "step": 9100 + }, + { + "epoch": 0.33395652333296677, + "grad_norm": 5.43599736913238, + "learning_rate": 7.508155621509603e-07, + "loss": 1.3779, + "step": 9110 + }, + { + "epoch": 0.3343231056856923, + "grad_norm": 4.312594101718665, + "learning_rate": 7.503163439291187e-07, + "loss": 1.3279, + "step": 9120 + }, + { + "epoch": 0.3346896880384178, + "grad_norm": 3.7888042986131794, + "learning_rate": 7.498167924915276e-07, + "loss": 1.3422, + "step": 9130 + }, + { + "epoch": 0.3350562703911434, + "grad_norm": 4.6227274755808665, + "learning_rate": 7.493169085031791e-07, + "loss": 1.3489, + "step": 9140 + }, + { + "epoch": 0.3354228527438689, + "grad_norm": 4.440746888404653, + "learning_rate": 7.48816692629508e-07, + "loss": 1.3955, + "step": 9150 + }, + { + "epoch": 0.3357894350965944, + "grad_norm": 3.1422454499623753, + "learning_rate": 7.483161455363909e-07, + "loss": 1.3613, + "step": 9160 + }, + { + "epoch": 0.33615601744932, + "grad_norm": 3.894653506327936, + "learning_rate": 7.478152678901455e-07, + "loss": 1.4148, + "step": 9170 + }, + { + "epoch": 0.3365225998020455, + "grad_norm": 5.433033949859381, + "learning_rate": 7.473140603575294e-07, + "loss": 1.3144, + "step": 9180 + }, + { + "epoch": 0.3368891821547711, + "grad_norm": 3.975951714183405, + "learning_rate": 7.468125236057392e-07, + "loss": 1.3691, + "step": 9190 + }, + { + "epoch": 0.3372557645074966, + "grad_norm": 4.918343199781564, + "learning_rate": 7.463106583024099e-07, + "loss": 1.3848, + "step": 9200 + }, + { + "epoch": 0.33762234686022213, + "grad_norm": 4.865872631877682, + "learning_rate": 7.458084651156138e-07, + "loss": 1.3612, + "step": 9210 + }, + { + "epoch": 0.3379889292129477, + "grad_norm": 4.124355883120795, + "learning_rate": 7.453059447138597e-07, + "loss": 1.3922, + "step": 9220 + }, + { + "epoch": 0.33835551156567323, + "grad_norm": 3.4927433175723968, + "learning_rate": 7.448030977660921e-07, + "loss": 1.3209, + "step": 9230 + }, + { + "epoch": 0.33872209391839875, + "grad_norm": 3.5565740075352887, + "learning_rate": 7.4429992494169e-07, + "loss": 1.3137, + "step": 9240 + }, + { + "epoch": 0.3390886762711243, + "grad_norm": 3.2292820179583335, + "learning_rate": 7.437964269104663e-07, + "loss": 1.3469, + "step": 9250 + }, + { + "epoch": 0.33945525862384984, + "grad_norm": 5.260253752526274, + "learning_rate": 7.432926043426668e-07, + "loss": 1.3067, + "step": 9260 + }, + { + "epoch": 0.33982184097657536, + "grad_norm": 4.394976349303848, + "learning_rate": 7.427884579089691e-07, + "loss": 1.3423, + "step": 9270 + }, + { + "epoch": 0.34018842332930094, + "grad_norm": 3.396422180187779, + "learning_rate": 7.422839882804825e-07, + "loss": 1.3449, + "step": 9280 + }, + { + "epoch": 0.34055500568202646, + "grad_norm": 4.387777704799267, + "learning_rate": 7.417791961287457e-07, + "loss": 1.3274, + "step": 9290 + }, + { + "epoch": 0.34092158803475203, + "grad_norm": 4.664699242153168, + "learning_rate": 7.412740821257275e-07, + "loss": 1.3147, + "step": 9300 + }, + { + "epoch": 0.34128817038747755, + "grad_norm": 3.393736360787831, + "learning_rate": 7.407686469438248e-07, + "loss": 1.3934, + "step": 9310 + }, + { + "epoch": 0.34165475274020307, + "grad_norm": 4.750927708757991, + "learning_rate": 7.40262891255862e-07, + "loss": 1.4067, + "step": 9320 + }, + { + "epoch": 0.34202133509292865, + "grad_norm": 3.428169411059033, + "learning_rate": 7.397568157350903e-07, + "loss": 1.3411, + "step": 9330 + }, + { + "epoch": 0.34238791744565417, + "grad_norm": 4.302469394811799, + "learning_rate": 7.392504210551865e-07, + "loss": 1.299, + "step": 9340 + }, + { + "epoch": 0.3427544997983797, + "grad_norm": 7.00981557963566, + "learning_rate": 7.387437078902523e-07, + "loss": 1.3573, + "step": 9350 + }, + { + "epoch": 0.34312108215110526, + "grad_norm": 5.566063359486336, + "learning_rate": 7.382366769148136e-07, + "loss": 1.3497, + "step": 9360 + }, + { + "epoch": 0.3434876645038308, + "grad_norm": 3.4660448886166244, + "learning_rate": 7.37729328803819e-07, + "loss": 1.4092, + "step": 9370 + }, + { + "epoch": 0.3438542468565563, + "grad_norm": 3.702869545438875, + "learning_rate": 7.372216642326394e-07, + "loss": 1.3603, + "step": 9380 + }, + { + "epoch": 0.3442208292092819, + "grad_norm": 4.231146103126798, + "learning_rate": 7.367136838770671e-07, + "loss": 1.3428, + "step": 9390 + }, + { + "epoch": 0.3445874115620074, + "grad_norm": 4.554271919619236, + "learning_rate": 7.362053884133146e-07, + "loss": 1.3311, + "step": 9400 + }, + { + "epoch": 0.34495399391473297, + "grad_norm": 4.041325390537124, + "learning_rate": 7.35696778518014e-07, + "loss": 1.3471, + "step": 9410 + }, + { + "epoch": 0.3453205762674585, + "grad_norm": 5.283681695413367, + "learning_rate": 7.351878548682155e-07, + "loss": 1.3334, + "step": 9420 + }, + { + "epoch": 0.345687158620184, + "grad_norm": 4.104429136831335, + "learning_rate": 7.34678618141388e-07, + "loss": 1.3443, + "step": 9430 + }, + { + "epoch": 0.3460537409729096, + "grad_norm": 4.637839526253117, + "learning_rate": 7.341690690154161e-07, + "loss": 1.3383, + "step": 9440 + }, + { + "epoch": 0.3464203233256351, + "grad_norm": 6.447434633082354, + "learning_rate": 7.336592081686007e-07, + "loss": 1.3769, + "step": 9450 + }, + { + "epoch": 0.3467869056783606, + "grad_norm": 4.989354934531907, + "learning_rate": 7.331490362796579e-07, + "loss": 1.3651, + "step": 9460 + }, + { + "epoch": 0.3471534880310862, + "grad_norm": 4.121285832330203, + "learning_rate": 7.326385540277171e-07, + "loss": 1.319, + "step": 9470 + }, + { + "epoch": 0.3475200703838117, + "grad_norm": 3.7909593948348284, + "learning_rate": 7.321277620923217e-07, + "loss": 1.3743, + "step": 9480 + }, + { + "epoch": 0.34788665273653724, + "grad_norm": 3.3733089497346853, + "learning_rate": 7.316166611534267e-07, + "loss": 1.3743, + "step": 9490 + }, + { + "epoch": 0.3482532350892628, + "grad_norm": 3.7253741770570823, + "learning_rate": 7.311052518913989e-07, + "loss": 1.2903, + "step": 9500 + }, + { + "epoch": 0.34861981744198833, + "grad_norm": 4.039793671210928, + "learning_rate": 7.305935349870155e-07, + "loss": 1.2862, + "step": 9510 + }, + { + "epoch": 0.3489863997947139, + "grad_norm": 4.342535349346429, + "learning_rate": 7.300815111214628e-07, + "loss": 1.3808, + "step": 9520 + }, + { + "epoch": 0.34935298214743943, + "grad_norm": 5.42799281760455, + "learning_rate": 7.29569180976336e-07, + "loss": 1.3523, + "step": 9530 + }, + { + "epoch": 0.34971956450016495, + "grad_norm": 5.020277916958928, + "learning_rate": 7.290565452336381e-07, + "loss": 1.3256, + "step": 9540 + }, + { + "epoch": 0.3500861468528905, + "grad_norm": 4.373712918374428, + "learning_rate": 7.285436045757789e-07, + "loss": 1.2827, + "step": 9550 + }, + { + "epoch": 0.35045272920561604, + "grad_norm": 6.179796353095443, + "learning_rate": 7.280303596855737e-07, + "loss": 1.3197, + "step": 9560 + }, + { + "epoch": 0.35081931155834156, + "grad_norm": 5.167300912494304, + "learning_rate": 7.275168112462433e-07, + "loss": 1.331, + "step": 9570 + }, + { + "epoch": 0.35118589391106714, + "grad_norm": 4.118700000532668, + "learning_rate": 7.270029599414125e-07, + "loss": 1.3529, + "step": 9580 + }, + { + "epoch": 0.35155247626379266, + "grad_norm": 3.6038833094843516, + "learning_rate": 7.264888064551089e-07, + "loss": 1.3258, + "step": 9590 + }, + { + "epoch": 0.3519190586165182, + "grad_norm": 3.5142758374979524, + "learning_rate": 7.259743514717627e-07, + "loss": 1.3377, + "step": 9600 + }, + { + "epoch": 0.35228564096924375, + "grad_norm": 4.1250041287694685, + "learning_rate": 7.254595956762053e-07, + "loss": 1.3135, + "step": 9610 + }, + { + "epoch": 0.35265222332196927, + "grad_norm": 3.132058137932181, + "learning_rate": 7.249445397536686e-07, + "loss": 1.3349, + "step": 9620 + }, + { + "epoch": 0.35301880567469485, + "grad_norm": 3.399519224329254, + "learning_rate": 7.244291843897839e-07, + "loss": 1.3052, + "step": 9630 + }, + { + "epoch": 0.35338538802742037, + "grad_norm": 4.712619284275666, + "learning_rate": 7.239135302705816e-07, + "loss": 1.3065, + "step": 9640 + }, + { + "epoch": 0.3537519703801459, + "grad_norm": 3.734161433235809, + "learning_rate": 7.23397578082489e-07, + "loss": 1.3094, + "step": 9650 + }, + { + "epoch": 0.35411855273287146, + "grad_norm": 5.100823292959423, + "learning_rate": 7.228813285123308e-07, + "loss": 1.3331, + "step": 9660 + }, + { + "epoch": 0.354485135085597, + "grad_norm": 4.534677424827633, + "learning_rate": 7.223647822473271e-07, + "loss": 1.3912, + "step": 9670 + }, + { + "epoch": 0.3548517174383225, + "grad_norm": 3.470979394380451, + "learning_rate": 7.218479399750934e-07, + "loss": 1.3476, + "step": 9680 + }, + { + "epoch": 0.3552182997910481, + "grad_norm": 4.753775104454421, + "learning_rate": 7.21330802383639e-07, + "loss": 1.3167, + "step": 9690 + }, + { + "epoch": 0.3555848821437736, + "grad_norm": 3.412263014571041, + "learning_rate": 7.208133701613665e-07, + "loss": 1.3358, + "step": 9700 + }, + { + "epoch": 0.3559514644964991, + "grad_norm": 4.131601355517602, + "learning_rate": 7.202956439970704e-07, + "loss": 1.3244, + "step": 9710 + }, + { + "epoch": 0.3563180468492247, + "grad_norm": 5.122163472630932, + "learning_rate": 7.197776245799367e-07, + "loss": 1.2796, + "step": 9720 + }, + { + "epoch": 0.3566846292019502, + "grad_norm": 5.335391466451254, + "learning_rate": 7.192593125995418e-07, + "loss": 1.3161, + "step": 9730 + }, + { + "epoch": 0.3570512115546758, + "grad_norm": 4.103339016303858, + "learning_rate": 7.187407087458518e-07, + "loss": 1.4146, + "step": 9740 + }, + { + "epoch": 0.3574177939074013, + "grad_norm": 5.904708913785668, + "learning_rate": 7.182218137092204e-07, + "loss": 1.3092, + "step": 9750 + }, + { + "epoch": 0.3577843762601268, + "grad_norm": 4.187532290173183, + "learning_rate": 7.1770262818039e-07, + "loss": 1.2946, + "step": 9760 + }, + { + "epoch": 0.3581509586128524, + "grad_norm": 4.6467762537942, + "learning_rate": 7.17183152850489e-07, + "loss": 1.3212, + "step": 9770 + }, + { + "epoch": 0.3585175409655779, + "grad_norm": 4.424491675585427, + "learning_rate": 7.16663388411032e-07, + "loss": 1.3167, + "step": 9780 + }, + { + "epoch": 0.35888412331830344, + "grad_norm": 4.460602913760459, + "learning_rate": 7.161433355539181e-07, + "loss": 1.3514, + "step": 9790 + }, + { + "epoch": 0.359250705671029, + "grad_norm": 7.380392542181771, + "learning_rate": 7.156229949714307e-07, + "loss": 1.305, + "step": 9800 + }, + { + "epoch": 0.35961728802375453, + "grad_norm": 3.677155226574757, + "learning_rate": 7.15102367356236e-07, + "loss": 1.3175, + "step": 9810 + }, + { + "epoch": 0.35998387037648005, + "grad_norm": 2.995203775176967, + "learning_rate": 7.145814534013821e-07, + "loss": 1.3833, + "step": 9820 + }, + { + "epoch": 0.36035045272920563, + "grad_norm": 3.5086546677463364, + "learning_rate": 7.140602538002989e-07, + "loss": 1.3858, + "step": 9830 + }, + { + "epoch": 0.36071703508193115, + "grad_norm": 3.523795917156669, + "learning_rate": 7.135387692467957e-07, + "loss": 1.3375, + "step": 9840 + }, + { + "epoch": 0.3610836174346567, + "grad_norm": 3.7313877963514, + "learning_rate": 7.130170004350617e-07, + "loss": 1.3094, + "step": 9850 + }, + { + "epoch": 0.36145019978738224, + "grad_norm": 4.442532041857861, + "learning_rate": 7.124949480596644e-07, + "loss": 1.3121, + "step": 9860 + }, + { + "epoch": 0.36181678214010776, + "grad_norm": 5.641090705197642, + "learning_rate": 7.119726128155487e-07, + "loss": 1.3387, + "step": 9870 + }, + { + "epoch": 0.36218336449283334, + "grad_norm": 9.369536303911914, + "learning_rate": 7.114499953980362e-07, + "loss": 1.3413, + "step": 9880 + }, + { + "epoch": 0.36254994684555886, + "grad_norm": 4.32109030408511, + "learning_rate": 7.109270965028238e-07, + "loss": 1.3636, + "step": 9890 + }, + { + "epoch": 0.3629165291982844, + "grad_norm": 6.871086039775216, + "learning_rate": 7.104039168259834e-07, + "loss": 1.352, + "step": 9900 + }, + { + "epoch": 0.36328311155100995, + "grad_norm": 4.509944406939018, + "learning_rate": 7.098804570639605e-07, + "loss": 1.2874, + "step": 9910 + }, + { + "epoch": 0.36364969390373547, + "grad_norm": 4.612863347134658, + "learning_rate": 7.093567179135738e-07, + "loss": 1.2676, + "step": 9920 + }, + { + "epoch": 0.364016276256461, + "grad_norm": 4.091094769005595, + "learning_rate": 7.088327000720131e-07, + "loss": 1.3038, + "step": 9930 + }, + { + "epoch": 0.36438285860918657, + "grad_norm": 4.977334963231582, + "learning_rate": 7.083084042368401e-07, + "loss": 1.3008, + "step": 9940 + }, + { + "epoch": 0.3647494409619121, + "grad_norm": 5.166826475680081, + "learning_rate": 7.077838311059862e-07, + "loss": 1.2881, + "step": 9950 + }, + { + "epoch": 0.36511602331463766, + "grad_norm": 4.01832965003142, + "learning_rate": 7.072589813777518e-07, + "loss": 1.3523, + "step": 9960 + }, + { + "epoch": 0.3654826056673632, + "grad_norm": 3.8045628665321214, + "learning_rate": 7.067338557508055e-07, + "loss": 1.3155, + "step": 9970 + }, + { + "epoch": 0.3658491880200887, + "grad_norm": 4.344284713227578, + "learning_rate": 7.062084549241833e-07, + "loss": 1.3314, + "step": 9980 + }, + { + "epoch": 0.3662157703728143, + "grad_norm": 4.559382806632024, + "learning_rate": 7.056827795972876e-07, + "loss": 1.3242, + "step": 9990 + }, + { + "epoch": 0.3665823527255398, + "grad_norm": 8.960735940046002, + "learning_rate": 7.051568304698862e-07, + "loss": 1.2563, + "step": 10000 + }, + { + "epoch": 0.3665823527255398, + "eval_accuracy": 0.7009188125309459, + "eval_loss": 1.3158118724822998, + "eval_runtime": 311.2198, + "eval_samples_per_second": 10.626, + "eval_steps_per_second": 0.887, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 27279, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1085213557587968.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}