{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1521, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01972386587771203, "grad_norm": 8.260985845744274, "learning_rate": 6.493506493506493e-07, "loss": 0.7556, "step": 10 }, { "epoch": 0.03944773175542406, "grad_norm": 3.285262290141762, "learning_rate": 1.2987012987012986e-06, "loss": 0.6243, "step": 20 }, { "epoch": 0.05917159763313609, "grad_norm": 1.8240152669785012, "learning_rate": 1.9480519480519483e-06, "loss": 0.527, "step": 30 }, { "epoch": 0.07889546351084813, "grad_norm": 2.484918347587673, "learning_rate": 2.597402597402597e-06, "loss": 0.4858, "step": 40 }, { "epoch": 0.09861932938856016, "grad_norm": 1.6071106659008887, "learning_rate": 3.246753246753247e-06, "loss": 0.4616, "step": 50 }, { "epoch": 0.11834319526627218, "grad_norm": 1.7753595146102916, "learning_rate": 3.896103896103897e-06, "loss": 0.4467, "step": 60 }, { "epoch": 0.13806706114398423, "grad_norm": 1.9952784096044014, "learning_rate": 4.5454545454545455e-06, "loss": 0.4355, "step": 70 }, { "epoch": 0.15779092702169625, "grad_norm": 2.0090088525329057, "learning_rate": 4.999952075361122e-06, "loss": 0.4303, "step": 80 }, { "epoch": 0.17751479289940827, "grad_norm": 1.9429866993343683, "learning_rate": 4.99910013857428e-06, "loss": 0.4213, "step": 90 }, { "epoch": 0.19723865877712032, "grad_norm": 2.2150406308730166, "learning_rate": 4.997183673954895e-06, "loss": 0.4205, "step": 100 }, { "epoch": 0.21696252465483234, "grad_norm": 2.3280715715799105, "learning_rate": 4.994203588590157e-06, "loss": 0.4132, "step": 110 }, { "epoch": 0.23668639053254437, "grad_norm": 2.0514718162160617, "learning_rate": 4.9901612929925455e-06, "loss": 0.4097, "step": 120 }, { "epoch": 0.2564102564102564, "grad_norm": 2.248051724393392, "learning_rate": 4.985058700432217e-06, "loss": 0.4078, "step": 130 }, { "epoch": 0.27613412228796846, "grad_norm": 2.4477065193392114, "learning_rate": 4.978898226031426e-06, "loss": 0.4035, "step": 140 }, { "epoch": 0.2958579881656805, "grad_norm": 2.3530821376592317, "learning_rate": 4.97168278562142e-06, "loss": 0.3988, "step": 150 }, { "epoch": 0.3155818540433925, "grad_norm": 2.0658208779463796, "learning_rate": 4.9634157943623345e-06, "loss": 0.4008, "step": 160 }, { "epoch": 0.33530571992110453, "grad_norm": 1.6308701318103827, "learning_rate": 4.954101165126764e-06, "loss": 0.3955, "step": 170 }, { "epoch": 0.35502958579881655, "grad_norm": 1.8767575875235638, "learning_rate": 4.943743306647738e-06, "loss": 0.3964, "step": 180 }, { "epoch": 0.3747534516765286, "grad_norm": 2.158851334024998, "learning_rate": 4.932347121432018e-06, "loss": 0.3955, "step": 190 }, { "epoch": 0.39447731755424065, "grad_norm": 2.2424601067528367, "learning_rate": 4.919918003439677e-06, "loss": 0.3929, "step": 200 }, { "epoch": 0.41420118343195267, "grad_norm": 1.4704562127782181, "learning_rate": 4.9064618355310694e-06, "loss": 0.3951, "step": 210 }, { "epoch": 0.4339250493096647, "grad_norm": 1.5325962055467024, "learning_rate": 4.8919849866823955e-06, "loss": 0.3936, "step": 220 }, { "epoch": 0.4536489151873767, "grad_norm": 1.752553432251344, "learning_rate": 4.8764943089711876e-06, "loss": 0.3894, "step": 230 }, { "epoch": 0.47337278106508873, "grad_norm": 3.582185649197669, "learning_rate": 4.859997134333133e-06, "loss": 0.39, "step": 240 }, { "epoch": 0.4930966469428008, "grad_norm": 2.283623608488685, "learning_rate": 4.842501271091773e-06, "loss": 0.3845, "step": 250 }, { "epoch": 0.5128205128205128, "grad_norm": 2.954635543178996, "learning_rate": 4.8240150002627285e-06, "loss": 0.3853, "step": 260 }, { "epoch": 0.5325443786982249, "grad_norm": 2.621411991175976, "learning_rate": 4.80454707163418e-06, "loss": 0.3802, "step": 270 }, { "epoch": 0.5522682445759369, "grad_norm": 3.0076538937186554, "learning_rate": 4.784106699625493e-06, "loss": 0.3778, "step": 280 }, { "epoch": 0.571992110453649, "grad_norm": 2.620788244299813, "learning_rate": 4.762703558925907e-06, "loss": 0.381, "step": 290 }, { "epoch": 0.591715976331361, "grad_norm": 2.600774288511616, "learning_rate": 4.740347779915384e-06, "loss": 0.3795, "step": 300 }, { "epoch": 0.611439842209073, "grad_norm": 2.825934593188172, "learning_rate": 4.717049943869774e-06, "loss": 0.3754, "step": 310 }, { "epoch": 0.631163708086785, "grad_norm": 1.9636455738063043, "learning_rate": 4.692821077952556e-06, "loss": 0.3709, "step": 320 }, { "epoch": 0.650887573964497, "grad_norm": 1.465934093555826, "learning_rate": 4.667672649995539e-06, "loss": 0.3686, "step": 330 }, { "epoch": 0.6706114398422091, "grad_norm": 1.6730733158146738, "learning_rate": 4.641616563071003e-06, "loss": 0.374, "step": 340 }, { "epoch": 0.6903353057199211, "grad_norm": 1.6420981338152472, "learning_rate": 4.6146651498578095e-06, "loss": 0.3725, "step": 350 }, { "epoch": 0.7100591715976331, "grad_norm": 1.7081311753490396, "learning_rate": 4.586831166804191e-06, "loss": 0.3723, "step": 360 }, { "epoch": 0.7297830374753451, "grad_norm": 1.7004420091082517, "learning_rate": 4.558127788089966e-06, "loss": 0.3685, "step": 370 }, { "epoch": 0.7495069033530573, "grad_norm": 1.813129623101683, "learning_rate": 4.5285685993910246e-06, "loss": 0.3693, "step": 380 }, { "epoch": 0.7692307692307693, "grad_norm": 1.441392302489358, "learning_rate": 4.49816759144906e-06, "loss": 0.3672, "step": 390 }, { "epoch": 0.7889546351084813, "grad_norm": 1.743528342139816, "learning_rate": 4.466939153449565e-06, "loss": 0.3629, "step": 400 }, { "epoch": 0.8086785009861933, "grad_norm": 1.5505480061250534, "learning_rate": 4.434898066211255e-06, "loss": 0.3647, "step": 410 }, { "epoch": 0.8284023668639053, "grad_norm": 1.748134152515452, "learning_rate": 4.402059495190112e-06, "loss": 0.3687, "step": 420 }, { "epoch": 0.8481262327416174, "grad_norm": 1.888131474531523, "learning_rate": 4.368438983301382e-06, "loss": 0.368, "step": 430 }, { "epoch": 0.8678500986193294, "grad_norm": 1.3077877777100417, "learning_rate": 4.334052443562914e-06, "loss": 0.364, "step": 440 }, { "epoch": 0.8875739644970414, "grad_norm": 1.7143497390643974, "learning_rate": 4.298916151563324e-06, "loss": 0.3662, "step": 450 }, { "epoch": 0.9072978303747534, "grad_norm": 1.2650560376490414, "learning_rate": 4.263046737758557e-06, "loss": 0.3634, "step": 460 }, { "epoch": 0.9270216962524654, "grad_norm": 1.325272234023546, "learning_rate": 4.226461179600474e-06, "loss": 0.3647, "step": 470 }, { "epoch": 0.9467455621301775, "grad_norm": 1.7799396783443953, "learning_rate": 4.189176793501208e-06, "loss": 0.3601, "step": 480 }, { "epoch": 0.9664694280078896, "grad_norm": 1.6138030010077298, "learning_rate": 4.151211226637083e-06, "loss": 0.3639, "step": 490 }, { "epoch": 0.9861932938856016, "grad_norm": 1.6475058606657829, "learning_rate": 4.112582448595989e-06, "loss": 0.3631, "step": 500 }, { "epoch": 1.0, "eval_loss": 0.3610161542892456, "eval_runtime": 46.5378, "eval_samples_per_second": 293.095, "eval_steps_per_second": 1.16, "step": 507 }, { "epoch": 1.0059171597633136, "grad_norm": 2.318083617694004, "learning_rate": 4.073308742872136e-06, "loss": 0.339, "step": 510 }, { "epoch": 1.0256410256410255, "grad_norm": 2.26507527796031, "learning_rate": 4.033408698212244e-06, "loss": 0.2904, "step": 520 }, { "epoch": 1.0453648915187377, "grad_norm": 2.129210352759771, "learning_rate": 3.99290119981726e-06, "loss": 0.2845, "step": 530 }, { "epoch": 1.0650887573964498, "grad_norm": 2.0458511034566897, "learning_rate": 3.95180542040374e-06, "loss": 0.2826, "step": 540 }, { "epoch": 1.0848126232741617, "grad_norm": 2.34540520465628, "learning_rate": 3.910140811129166e-06, "loss": 0.2817, "step": 550 }, { "epoch": 1.1045364891518739, "grad_norm": 1.5731137478504271, "learning_rate": 3.8679270923854596e-06, "loss": 0.2816, "step": 560 }, { "epoch": 1.1242603550295858, "grad_norm": 1.9641564243584235, "learning_rate": 3.825184244465071e-06, "loss": 0.2833, "step": 570 }, { "epoch": 1.143984220907298, "grad_norm": 1.5653763677552233, "learning_rate": 3.7819324981040517e-06, "loss": 0.2835, "step": 580 }, { "epoch": 1.1637080867850098, "grad_norm": 1.4455902546137582, "learning_rate": 3.7381923249065838e-06, "loss": 0.2806, "step": 590 }, { "epoch": 1.183431952662722, "grad_norm": 1.4589441051909717, "learning_rate": 3.6939844276555146e-06, "loss": 0.2842, "step": 600 }, { "epoch": 1.2031558185404339, "grad_norm": 1.4737079619190827, "learning_rate": 3.649329730513461e-06, "loss": 0.2818, "step": 610 }, { "epoch": 1.222879684418146, "grad_norm": 1.424470321783783, "learning_rate": 3.6042493691191377e-06, "loss": 0.2835, "step": 620 }, { "epoch": 1.242603550295858, "grad_norm": 1.43822809638539, "learning_rate": 3.558764680583589e-06, "loss": 0.2829, "step": 630 }, { "epoch": 1.26232741617357, "grad_norm": 1.4491877471048427, "learning_rate": 3.51289719339106e-06, "loss": 0.2823, "step": 640 }, { "epoch": 1.282051282051282, "grad_norm": 1.4979353903583295, "learning_rate": 3.4666686172092927e-06, "loss": 0.2859, "step": 650 }, { "epoch": 1.301775147928994, "grad_norm": 1.4793881592613725, "learning_rate": 3.4201008326140596e-06, "loss": 0.2849, "step": 660 }, { "epoch": 1.3214990138067062, "grad_norm": 1.6343693105840815, "learning_rate": 3.3732158807328116e-06, "loss": 0.2875, "step": 670 }, { "epoch": 1.3412228796844181, "grad_norm": 1.5638318327999918, "learning_rate": 3.3260359528123266e-06, "loss": 0.2877, "step": 680 }, { "epoch": 1.3609467455621302, "grad_norm": 1.434550639059279, "learning_rate": 3.2785833797153115e-06, "loss": 0.2817, "step": 690 }, { "epoch": 1.3806706114398422, "grad_norm": 1.3783604211664602, "learning_rate": 3.2308806213509204e-06, "loss": 0.2809, "step": 700 }, { "epoch": 1.4003944773175543, "grad_norm": 1.7104337243982326, "learning_rate": 3.182950256044188e-06, "loss": 0.2825, "step": 710 }, { "epoch": 1.4201183431952662, "grad_norm": 1.9527331404429782, "learning_rate": 3.1348149698494233e-06, "loss": 0.2827, "step": 720 }, { "epoch": 1.4398422090729783, "grad_norm": 1.5082040480125063, "learning_rate": 3.0864975458126158e-06, "loss": 0.2857, "step": 730 }, { "epoch": 1.4595660749506902, "grad_norm": 1.5939434329404958, "learning_rate": 3.038020853187914e-06, "loss": 0.2831, "step": 740 }, { "epoch": 1.4792899408284024, "grad_norm": 1.425454732201556, "learning_rate": 2.98940783661333e-06, "loss": 0.2802, "step": 750 }, { "epoch": 1.4990138067061145, "grad_norm": 1.4324944544127631, "learning_rate": 2.940681505250742e-06, "loss": 0.2848, "step": 760 }, { "epoch": 1.5187376725838264, "grad_norm": 1.4082984304420074, "learning_rate": 2.8918649218953624e-06, "loss": 0.2801, "step": 770 }, { "epoch": 1.5384615384615383, "grad_norm": 1.5895657718154816, "learning_rate": 2.84298119205983e-06, "loss": 0.2807, "step": 780 }, { "epoch": 1.5581854043392505, "grad_norm": 1.6080440377232041, "learning_rate": 2.7940534530380666e-06, "loss": 0.2835, "step": 790 }, { "epoch": 1.5779092702169626, "grad_norm": 1.404915797241871, "learning_rate": 2.7451048629541045e-06, "loss": 0.2808, "step": 800 }, { "epoch": 1.5976331360946747, "grad_norm": 1.4879672080505235, "learning_rate": 2.6961585898010523e-06, "loss": 0.2806, "step": 810 }, { "epoch": 1.6173570019723866, "grad_norm": 1.3888602093522253, "learning_rate": 2.647237800475384e-06, "loss": 0.2832, "step": 820 }, { "epoch": 1.6370808678500985, "grad_norm": 1.3670120148082392, "learning_rate": 2.5983656498117525e-06, "loss": 0.2825, "step": 830 }, { "epoch": 1.6568047337278107, "grad_norm": 1.2812642080517738, "learning_rate": 2.54956526962351e-06, "loss": 0.279, "step": 840 }, { "epoch": 1.6765285996055228, "grad_norm": 1.252430854449729, "learning_rate": 2.5008597577541288e-06, "loss": 0.2814, "step": 850 }, { "epoch": 1.6962524654832347, "grad_norm": 1.2750427994477165, "learning_rate": 2.45227216714469e-06, "loss": 0.2792, "step": 860 }, { "epoch": 1.7159763313609466, "grad_norm": 1.354377403404739, "learning_rate": 2.403825494922636e-06, "loss": 0.282, "step": 870 }, { "epoch": 1.7357001972386588, "grad_norm": 1.4267990848182481, "learning_rate": 2.3555426715169396e-06, "loss": 0.2791, "step": 880 }, { "epoch": 1.755424063116371, "grad_norm": 1.252857555239978, "learning_rate": 2.3074465498048303e-06, "loss": 0.2826, "step": 890 }, { "epoch": 1.7751479289940828, "grad_norm": 1.2876786054611615, "learning_rate": 2.259559894295244e-06, "loss": 0.2789, "step": 900 }, { "epoch": 1.7948717948717947, "grad_norm": 1.2629901820145135, "learning_rate": 2.2119053703540866e-06, "loss": 0.2791, "step": 910 }, { "epoch": 1.8145956607495068, "grad_norm": 1.3562733049556417, "learning_rate": 2.1645055334764237e-06, "loss": 0.2807, "step": 920 }, { "epoch": 1.834319526627219, "grad_norm": 1.3132542320273741, "learning_rate": 2.1173828186106828e-06, "loss": 0.2782, "step": 930 }, { "epoch": 1.854043392504931, "grad_norm": 1.372645351488049, "learning_rate": 2.0705595295399e-06, "loss": 0.28, "step": 940 }, { "epoch": 1.873767258382643, "grad_norm": 1.286506818666612, "learning_rate": 2.0240578283250596e-06, "loss": 0.2788, "step": 950 }, { "epoch": 1.893491124260355, "grad_norm": 1.343985774681719, "learning_rate": 1.9778997248155013e-06, "loss": 0.2779, "step": 960 }, { "epoch": 1.913214990138067, "grad_norm": 1.3873943864064089, "learning_rate": 1.9321070662313824e-06, "loss": 0.2768, "step": 970 }, { "epoch": 1.9329388560157792, "grad_norm": 1.3822544572854645, "learning_rate": 1.88670152682311e-06, "loss": 0.2753, "step": 980 }, { "epoch": 1.952662721893491, "grad_norm": 1.3724554338840655, "learning_rate": 1.8417045976126347e-06, "loss": 0.274, "step": 990 }, { "epoch": 1.972386587771203, "grad_norm": 1.428387339598408, "learning_rate": 1.797137576221482e-06, "loss": 0.2775, "step": 1000 }, { "epoch": 1.9921104536489151, "grad_norm": 1.2370547509299645, "learning_rate": 1.753021556790314e-06, "loss": 0.2746, "step": 1010 }, { "epoch": 2.0, "eval_loss": 0.3482723832130432, "eval_runtime": 46.4255, "eval_samples_per_second": 293.804, "eval_steps_per_second": 1.163, "step": 1014 }, { "epoch": 2.0118343195266273, "grad_norm": 1.9502351693684774, "learning_rate": 1.7093774199948004e-06, "loss": 0.2309, "step": 1020 }, { "epoch": 2.0315581854043394, "grad_norm": 1.5862323859503984, "learning_rate": 1.6662258231625331e-06, "loss": 0.2026, "step": 1030 }, { "epoch": 2.051282051282051, "grad_norm": 1.3292614459089434, "learning_rate": 1.6235871904956431e-06, "loss": 0.2034, "step": 1040 }, { "epoch": 2.0710059171597632, "grad_norm": 1.2370582334736997, "learning_rate": 1.5814817034037715e-06, "loss": 0.2008, "step": 1050 }, { "epoch": 2.0907297830374754, "grad_norm": 1.325897622024457, "learning_rate": 1.5399292909519422e-06, "loss": 0.2042, "step": 1060 }, { "epoch": 2.1104536489151875, "grad_norm": 1.4548395791353137, "learning_rate": 1.4989496204278897e-06, "loss": 0.2025, "step": 1070 }, { "epoch": 2.1301775147928996, "grad_norm": 1.36179677292465, "learning_rate": 1.458562088033273e-06, "loss": 0.1978, "step": 1080 }, { "epoch": 2.1499013806706113, "grad_norm": 1.4589926591648759, "learning_rate": 1.4187858097032086e-06, "loss": 0.2024, "step": 1090 }, { "epoch": 2.1696252465483234, "grad_norm": 1.3095440667780154, "learning_rate": 1.3796396120584576e-06, "loss": 0.2032, "step": 1100 }, { "epoch": 2.1893491124260356, "grad_norm": 1.3522834520399176, "learning_rate": 1.341142023494537e-06, "loss": 0.1992, "step": 1110 }, { "epoch": 2.2090729783037477, "grad_norm": 1.3914925068585928, "learning_rate": 1.3033112654120032e-06, "loss": 0.2029, "step": 1120 }, { "epoch": 2.2287968441814594, "grad_norm": 1.2392072409116117, "learning_rate": 1.266165243592024e-06, "loss": 0.2019, "step": 1130 }, { "epoch": 2.2485207100591715, "grad_norm": 1.450828785906611, "learning_rate": 1.2297215397213442e-06, "loss": 0.2029, "step": 1140 }, { "epoch": 2.2682445759368837, "grad_norm": 1.3539897715774756, "learning_rate": 1.1939974030706499e-06, "loss": 0.1989, "step": 1150 }, { "epoch": 2.287968441814596, "grad_norm": 1.3124427663284721, "learning_rate": 1.1590097423302681e-06, "loss": 0.2013, "step": 1160 }, { "epoch": 2.3076923076923075, "grad_norm": 1.2751387286158546, "learning_rate": 1.1247751176070688e-06, "loss": 0.2003, "step": 1170 }, { "epoch": 2.3274161735700196, "grad_norm": 1.2826788452929796, "learning_rate": 1.0913097325863526e-06, "loss": 0.2013, "step": 1180 }, { "epoch": 2.3471400394477318, "grad_norm": 1.3449233167779666, "learning_rate": 1.0586294268624391e-06, "loss": 0.2031, "step": 1190 }, { "epoch": 2.366863905325444, "grad_norm": 1.3034368496811286, "learning_rate": 1.026749668441587e-06, "loss": 0.1994, "step": 1200 }, { "epoch": 2.386587771203156, "grad_norm": 1.3565807097213252, "learning_rate": 9.956855464207873e-07, "loss": 0.2, "step": 1210 }, { "epoch": 2.4063116370808677, "grad_norm": 1.451004027193357, "learning_rate": 9.654517638459015e-07, "loss": 0.1996, "step": 1220 }, { "epoch": 2.42603550295858, "grad_norm": 1.3107553476519733, "learning_rate": 9.360626307525231e-07, "loss": 0.2004, "step": 1230 }, { "epoch": 2.445759368836292, "grad_norm": 1.2866100592193557, "learning_rate": 9.075320573928513e-07, "loss": 0.2026, "step": 1240 }, { "epoch": 2.465483234714004, "grad_norm": 1.3169876215045113, "learning_rate": 8.798735476517964e-07, "loss": 0.2027, "step": 1250 }, { "epoch": 2.485207100591716, "grad_norm": 1.2821201625196061, "learning_rate": 8.531001926554134e-07, "loss": 0.2011, "step": 1260 }, { "epoch": 2.504930966469428, "grad_norm": 1.315132765819279, "learning_rate": 8.272246645747072e-07, "loss": 0.199, "step": 1270 }, { "epoch": 2.52465483234714, "grad_norm": 1.276154658164099, "learning_rate": 8.022592106277332e-07, "loss": 0.2008, "step": 1280 }, { "epoch": 2.544378698224852, "grad_norm": 1.2274421062761773, "learning_rate": 7.782156472828299e-07, "loss": 0.1998, "step": 1290 }, { "epoch": 2.564102564102564, "grad_norm": 1.2435720383981574, "learning_rate": 7.551053546657356e-07, "loss": 0.1995, "step": 1300 }, { "epoch": 2.583826429980276, "grad_norm": 1.2327909078947592, "learning_rate": 7.329392711732278e-07, "loss": 0.2024, "step": 1310 }, { "epoch": 2.603550295857988, "grad_norm": 1.1783489485507048, "learning_rate": 7.117278882958421e-07, "loss": 0.2003, "step": 1320 }, { "epoch": 2.6232741617357003, "grad_norm": 1.2687230261577986, "learning_rate": 6.914812456521138e-07, "loss": 0.2006, "step": 1330 }, { "epoch": 2.6429980276134124, "grad_norm": 1.2646158919927277, "learning_rate": 6.722089262366993e-07, "loss": 0.1982, "step": 1340 }, { "epoch": 2.662721893491124, "grad_norm": 1.2236131305338422, "learning_rate": 6.539200518846226e-07, "loss": 0.2001, "step": 1350 }, { "epoch": 2.6824457593688362, "grad_norm": 1.2428023457207789, "learning_rate": 6.366232789537923e-07, "loss": 0.2048, "step": 1360 }, { "epoch": 2.7021696252465484, "grad_norm": 1.2559417256017682, "learning_rate": 6.203267942278395e-07, "loss": 0.2012, "step": 1370 }, { "epoch": 2.7218934911242605, "grad_norm": 1.2572564112264348, "learning_rate": 6.050383110412069e-07, "loss": 0.1994, "step": 1380 }, { "epoch": 2.7416173570019726, "grad_norm": 1.1764889460619852, "learning_rate": 5.907650656283289e-07, "loss": 0.2002, "step": 1390 }, { "epoch": 2.7613412228796843, "grad_norm": 1.2804661059833917, "learning_rate": 5.775138136986298e-07, "loss": 0.2002, "step": 1400 }, { "epoch": 2.7810650887573964, "grad_norm": 1.3077263435732718, "learning_rate": 5.652908272389604e-07, "loss": 0.1995, "step": 1410 }, { "epoch": 2.8007889546351086, "grad_norm": 1.231137370296971, "learning_rate": 5.541018915449863e-07, "loss": 0.1989, "step": 1420 }, { "epoch": 2.8205128205128203, "grad_norm": 1.3443797697665705, "learning_rate": 5.439523024829335e-07, "loss": 0.1983, "step": 1430 }, { "epoch": 2.8402366863905324, "grad_norm": 1.2092638219767884, "learning_rate": 5.348468639829871e-07, "loss": 0.2007, "step": 1440 }, { "epoch": 2.8599605522682445, "grad_norm": 1.2392545674361426, "learning_rate": 5.267898857655307e-07, "loss": 0.201, "step": 1450 }, { "epoch": 2.8796844181459567, "grad_norm": 1.255507262390408, "learning_rate": 5.19785181301299e-07, "loss": 0.2008, "step": 1460 }, { "epoch": 2.899408284023669, "grad_norm": 1.2545629120536586, "learning_rate": 5.138360660064146e-07, "loss": 0.1979, "step": 1470 }, { "epoch": 2.9191321499013805, "grad_norm": 1.2279624795193589, "learning_rate": 5.08945355673159e-07, "loss": 0.201, "step": 1480 }, { "epoch": 2.9388560157790926, "grad_norm": 1.2395946923655343, "learning_rate": 5.05115365137222e-07, "loss": 0.1999, "step": 1490 }, { "epoch": 2.9585798816568047, "grad_norm": 1.2212433583596156, "learning_rate": 5.023479071820607e-07, "loss": 0.1989, "step": 1500 }, { "epoch": 2.978303747534517, "grad_norm": 1.298954785136158, "learning_rate": 5.006442916808849e-07, "loss": 0.2019, "step": 1510 }, { "epoch": 2.998027613412229, "grad_norm": 1.3586461216494594, "learning_rate": 5.000053249766787e-07, "loss": 0.1999, "step": 1520 }, { "epoch": 3.0, "eval_loss": 0.37075862288475037, "eval_runtime": 53.9042, "eval_samples_per_second": 253.041, "eval_steps_per_second": 1.002, "step": 1521 }, { "epoch": 3.0, "step": 1521, "total_flos": 2547731650314240.0, "train_loss": 0.2954053143131192, "train_runtime": 9131.3778, "train_samples_per_second": 85.143, "train_steps_per_second": 0.167 } ], "logging_steps": 10, "max_steps": 1521, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2547731650314240.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }