{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 2038, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004906771344455349, "grad_norm": 0.1728515625, "learning_rate": 5e-06, "loss": 0.6955, "step": 10 }, { "epoch": 0.009813542688910697, "grad_norm": 0.291015625, "learning_rate": 1e-05, "loss": 0.6952, "step": 20 }, { "epoch": 0.014720314033366046, "grad_norm": 0.25, "learning_rate": 1.5e-05, "loss": 0.6964, "step": 30 }, { "epoch": 0.019627085377821395, "grad_norm": 0.212890625, "learning_rate": 2e-05, "loss": 0.6966, "step": 40 }, { "epoch": 0.02453385672227674, "grad_norm": 0.302734375, "learning_rate": 2.5e-05, "loss": 0.6944, "step": 50 }, { "epoch": 0.02453385672227674, "eval_loss": 0.6937512159347534, "eval_runtime": 28.3512, "eval_samples_per_second": 7.054, "eval_steps_per_second": 1.764, "step": 50 }, { "epoch": 0.029440628066732092, "grad_norm": 0.2353515625, "learning_rate": 3e-05, "loss": 0.6944, "step": 60 }, { "epoch": 0.03434739941118744, "grad_norm": 0.14453125, "learning_rate": 3.5e-05, "loss": 0.694, "step": 70 }, { "epoch": 0.03925417075564279, "grad_norm": 0.26953125, "learning_rate": 4e-05, "loss": 0.6938, "step": 80 }, { "epoch": 0.04416094210009813, "grad_norm": 0.279296875, "learning_rate": 4.5e-05, "loss": 0.6928, "step": 90 }, { "epoch": 0.04906771344455348, "grad_norm": 0.2490234375, "learning_rate": 5e-05, "loss": 0.6928, "step": 100 }, { "epoch": 0.04906771344455348, "eval_loss": 0.6929062008857727, "eval_runtime": 28.6139, "eval_samples_per_second": 6.99, "eval_steps_per_second": 1.747, "step": 100 }, { "epoch": 0.053974484789008834, "grad_norm": 0.296875, "learning_rate": 4.974200206398349e-05, "loss": 0.692, "step": 110 }, { "epoch": 0.058881256133464184, "grad_norm": 0.33984375, "learning_rate": 4.948400412796697e-05, "loss": 0.6924, "step": 120 }, { "epoch": 0.06378802747791953, "grad_norm": 0.3359375, "learning_rate": 4.922600619195047e-05, "loss": 0.6921, "step": 130 }, { "epoch": 0.06869479882237488, "grad_norm": 0.267578125, "learning_rate": 4.896800825593396e-05, "loss": 0.6927, "step": 140 }, { "epoch": 0.07360157016683022, "grad_norm": 0.2236328125, "learning_rate": 4.8710010319917446e-05, "loss": 0.6914, "step": 150 }, { "epoch": 0.07360157016683022, "eval_loss": 0.6915245056152344, "eval_runtime": 28.6003, "eval_samples_per_second": 6.993, "eval_steps_per_second": 1.748, "step": 150 }, { "epoch": 0.07850834151128558, "grad_norm": 0.345703125, "learning_rate": 4.845201238390093e-05, "loss": 0.6923, "step": 160 }, { "epoch": 0.08341511285574092, "grad_norm": 0.3046875, "learning_rate": 4.819401444788442e-05, "loss": 0.6893, "step": 170 }, { "epoch": 0.08832188420019627, "grad_norm": 0.28125, "learning_rate": 4.793601651186791e-05, "loss": 0.6865, "step": 180 }, { "epoch": 0.09322865554465162, "grad_norm": 0.43359375, "learning_rate": 4.7678018575851394e-05, "loss": 0.6877, "step": 190 }, { "epoch": 0.09813542688910697, "grad_norm": 0.42578125, "learning_rate": 4.742002063983488e-05, "loss": 0.6852, "step": 200 }, { "epoch": 0.09813542688910697, "eval_loss": 0.6849114894866943, "eval_runtime": 28.416, "eval_samples_per_second": 7.038, "eval_steps_per_second": 1.76, "step": 200 }, { "epoch": 0.10304219823356231, "grad_norm": 0.39453125, "learning_rate": 4.716202270381837e-05, "loss": 0.6767, "step": 210 }, { "epoch": 0.10794896957801767, "grad_norm": 0.45703125, "learning_rate": 4.690402476780186e-05, "loss": 0.6804, "step": 220 }, { "epoch": 0.11285574092247301, "grad_norm": 0.78125, "learning_rate": 4.664602683178535e-05, "loss": 0.6814, "step": 230 }, { "epoch": 0.11776251226692837, "grad_norm": 2.046875, "learning_rate": 4.638802889576884e-05, "loss": 0.6586, "step": 240 }, { "epoch": 0.12266928361138371, "grad_norm": 1.828125, "learning_rate": 4.613003095975233e-05, "loss": 0.6478, "step": 250 }, { "epoch": 0.12266928361138371, "eval_loss": 0.6532555222511292, "eval_runtime": 28.4259, "eval_samples_per_second": 7.036, "eval_steps_per_second": 1.759, "step": 250 }, { "epoch": 0.12757605495583907, "grad_norm": 1.90625, "learning_rate": 4.587203302373581e-05, "loss": 0.6473, "step": 260 }, { "epoch": 0.1324828263002944, "grad_norm": 1.84375, "learning_rate": 4.56140350877193e-05, "loss": 0.6397, "step": 270 }, { "epoch": 0.13738959764474976, "grad_norm": 2.046875, "learning_rate": 4.535603715170279e-05, "loss": 0.6243, "step": 280 }, { "epoch": 0.1422963689892051, "grad_norm": 1.640625, "learning_rate": 4.5098039215686275e-05, "loss": 0.6304, "step": 290 }, { "epoch": 0.14720314033366044, "grad_norm": 1.84375, "learning_rate": 4.4840041279669764e-05, "loss": 0.6278, "step": 300 }, { "epoch": 0.14720314033366044, "eval_loss": 0.6153883337974548, "eval_runtime": 28.6968, "eval_samples_per_second": 6.969, "eval_steps_per_second": 1.742, "step": 300 }, { "epoch": 0.1521099116781158, "grad_norm": 1.5, "learning_rate": 4.458204334365325e-05, "loss": 0.6067, "step": 310 }, { "epoch": 0.15701668302257116, "grad_norm": 1.7734375, "learning_rate": 4.432404540763674e-05, "loss": 0.5956, "step": 320 }, { "epoch": 0.1619234543670265, "grad_norm": 1.640625, "learning_rate": 4.406604747162023e-05, "loss": 0.6263, "step": 330 }, { "epoch": 0.16683022571148184, "grad_norm": 1.5, "learning_rate": 4.380804953560372e-05, "loss": 0.5947, "step": 340 }, { "epoch": 0.1717369970559372, "grad_norm": 1.0390625, "learning_rate": 4.355005159958721e-05, "loss": 0.609, "step": 350 }, { "epoch": 0.1717369970559372, "eval_loss": 0.6010438799858093, "eval_runtime": 28.6965, "eval_samples_per_second": 6.969, "eval_steps_per_second": 1.742, "step": 350 }, { "epoch": 0.17664376840039253, "grad_norm": 1.90625, "learning_rate": 4.329205366357069e-05, "loss": 0.5948, "step": 360 }, { "epoch": 0.1815505397448479, "grad_norm": 1.546875, "learning_rate": 4.303405572755418e-05, "loss": 0.6043, "step": 370 }, { "epoch": 0.18645731108930325, "grad_norm": 1.71875, "learning_rate": 4.2776057791537674e-05, "loss": 0.5791, "step": 380 }, { "epoch": 0.19136408243375858, "grad_norm": 1.2109375, "learning_rate": 4.2518059855521156e-05, "loss": 0.56, "step": 390 }, { "epoch": 0.19627085377821393, "grad_norm": 1.7734375, "learning_rate": 4.2260061919504645e-05, "loss": 0.5801, "step": 400 }, { "epoch": 0.19627085377821393, "eval_loss": 0.5847232937812805, "eval_runtime": 28.6639, "eval_samples_per_second": 6.977, "eval_steps_per_second": 1.744, "step": 400 }, { "epoch": 0.2011776251226693, "grad_norm": 1.703125, "learning_rate": 4.200206398348813e-05, "loss": 0.5517, "step": 410 }, { "epoch": 0.20608439646712462, "grad_norm": 1.296875, "learning_rate": 4.174406604747162e-05, "loss": 0.5799, "step": 420 }, { "epoch": 0.21099116781157998, "grad_norm": 1.1953125, "learning_rate": 4.148606811145511e-05, "loss": 0.54, "step": 430 }, { "epoch": 0.21589793915603533, "grad_norm": 3.859375, "learning_rate": 4.12280701754386e-05, "loss": 0.5591, "step": 440 }, { "epoch": 0.22080471050049066, "grad_norm": 1.1484375, "learning_rate": 4.097007223942209e-05, "loss": 0.5558, "step": 450 }, { "epoch": 0.22080471050049066, "eval_loss": 0.5724604725837708, "eval_runtime": 28.6438, "eval_samples_per_second": 6.982, "eval_steps_per_second": 1.746, "step": 450 }, { "epoch": 0.22571148184494602, "grad_norm": 1.2265625, "learning_rate": 4.071207430340557e-05, "loss": 0.6203, "step": 460 }, { "epoch": 0.23061825318940138, "grad_norm": 1.5234375, "learning_rate": 4.0454076367389066e-05, "loss": 0.5813, "step": 470 }, { "epoch": 0.23552502453385674, "grad_norm": 2.5, "learning_rate": 4.0196078431372555e-05, "loss": 0.5946, "step": 480 }, { "epoch": 0.24043179587831207, "grad_norm": 1.671875, "learning_rate": 3.9938080495356037e-05, "loss": 0.5635, "step": 490 }, { "epoch": 0.24533856722276742, "grad_norm": 1.3125, "learning_rate": 3.9680082559339525e-05, "loss": 0.5625, "step": 500 }, { "epoch": 0.24533856722276742, "eval_loss": 0.5732089877128601, "eval_runtime": 28.6521, "eval_samples_per_second": 6.98, "eval_steps_per_second": 1.745, "step": 500 }, { "epoch": 0.25024533856722275, "grad_norm": 1.453125, "learning_rate": 3.9422084623323014e-05, "loss": 0.5378, "step": 510 }, { "epoch": 0.25515210991167814, "grad_norm": 1.046875, "learning_rate": 3.91640866873065e-05, "loss": 0.5908, "step": 520 }, { "epoch": 0.26005888125613347, "grad_norm": 1.2421875, "learning_rate": 3.890608875128999e-05, "loss": 0.576, "step": 530 }, { "epoch": 0.2649656526005888, "grad_norm": 0.87109375, "learning_rate": 3.864809081527348e-05, "loss": 0.5248, "step": 540 }, { "epoch": 0.2698724239450442, "grad_norm": 2.046875, "learning_rate": 3.839009287925697e-05, "loss": 0.5448, "step": 550 }, { "epoch": 0.2698724239450442, "eval_loss": 0.5607851147651672, "eval_runtime": 28.5118, "eval_samples_per_second": 7.015, "eval_steps_per_second": 1.754, "step": 550 }, { "epoch": 0.2747791952894995, "grad_norm": 1.828125, "learning_rate": 3.813209494324045e-05, "loss": 0.5693, "step": 560 }, { "epoch": 0.27968596663395484, "grad_norm": 0.65234375, "learning_rate": 3.7874097007223946e-05, "loss": 0.5248, "step": 570 }, { "epoch": 0.2845927379784102, "grad_norm": 1.4453125, "learning_rate": 3.7616099071207435e-05, "loss": 0.5815, "step": 580 }, { "epoch": 0.28949950932286556, "grad_norm": 1.0078125, "learning_rate": 3.735810113519092e-05, "loss": 0.5524, "step": 590 }, { "epoch": 0.2944062806673209, "grad_norm": 1.3984375, "learning_rate": 3.7100103199174406e-05, "loss": 0.5517, "step": 600 }, { "epoch": 0.2944062806673209, "eval_loss": 0.5586492419242859, "eval_runtime": 28.5258, "eval_samples_per_second": 7.011, "eval_steps_per_second": 1.753, "step": 600 }, { "epoch": 0.29931305201177627, "grad_norm": 1.375, "learning_rate": 3.6842105263157895e-05, "loss": 0.5308, "step": 610 }, { "epoch": 0.3042198233562316, "grad_norm": 1.3046875, "learning_rate": 3.658410732714139e-05, "loss": 0.5938, "step": 620 }, { "epoch": 0.30912659470068693, "grad_norm": 1.78125, "learning_rate": 3.632610939112487e-05, "loss": 0.5611, "step": 630 }, { "epoch": 0.3140333660451423, "grad_norm": 0.74609375, "learning_rate": 3.606811145510836e-05, "loss": 0.5709, "step": 640 }, { "epoch": 0.31894013738959764, "grad_norm": 0.99609375, "learning_rate": 3.581011351909185e-05, "loss": 0.6033, "step": 650 }, { "epoch": 0.31894013738959764, "eval_loss": 0.564706027507782, "eval_runtime": 28.5712, "eval_samples_per_second": 7.0, "eval_steps_per_second": 1.75, "step": 650 }, { "epoch": 0.323846908734053, "grad_norm": 1.2578125, "learning_rate": 3.555211558307533e-05, "loss": 0.5123, "step": 660 }, { "epoch": 0.32875368007850836, "grad_norm": 0.97265625, "learning_rate": 3.529411764705883e-05, "loss": 0.5205, "step": 670 }, { "epoch": 0.3336604514229637, "grad_norm": 1.6953125, "learning_rate": 3.5036119711042316e-05, "loss": 0.5854, "step": 680 }, { "epoch": 0.338567222767419, "grad_norm": 1.453125, "learning_rate": 3.4778121775025805e-05, "loss": 0.5375, "step": 690 }, { "epoch": 0.3434739941118744, "grad_norm": 1.03125, "learning_rate": 3.452012383900929e-05, "loss": 0.5336, "step": 700 }, { "epoch": 0.3434739941118744, "eval_loss": 0.5557608008384705, "eval_runtime": 28.4609, "eval_samples_per_second": 7.027, "eval_steps_per_second": 1.757, "step": 700 }, { "epoch": 0.34838076545632973, "grad_norm": 1.5859375, "learning_rate": 3.4262125902992775e-05, "loss": 0.5533, "step": 710 }, { "epoch": 0.35328753680078506, "grad_norm": 0.8984375, "learning_rate": 3.400412796697627e-05, "loss": 0.5333, "step": 720 }, { "epoch": 0.35819430814524045, "grad_norm": 1.09375, "learning_rate": 3.374613003095975e-05, "loss": 0.5536, "step": 730 }, { "epoch": 0.3631010794896958, "grad_norm": 1.1171875, "learning_rate": 3.348813209494324e-05, "loss": 0.5448, "step": 740 }, { "epoch": 0.3680078508341511, "grad_norm": 0.96875, "learning_rate": 3.323013415892673e-05, "loss": 0.532, "step": 750 }, { "epoch": 0.3680078508341511, "eval_loss": 0.5521541833877563, "eval_runtime": 28.4595, "eval_samples_per_second": 7.028, "eval_steps_per_second": 1.757, "step": 750 }, { "epoch": 0.3729146221786065, "grad_norm": 0.90625, "learning_rate": 3.297213622291022e-05, "loss": 0.565, "step": 760 }, { "epoch": 0.3778213935230618, "grad_norm": 1.4140625, "learning_rate": 3.271413828689371e-05, "loss": 0.5278, "step": 770 }, { "epoch": 0.38272816486751715, "grad_norm": 1.3359375, "learning_rate": 3.24561403508772e-05, "loss": 0.5582, "step": 780 }, { "epoch": 0.38763493621197254, "grad_norm": 0.84375, "learning_rate": 3.2198142414860685e-05, "loss": 0.5253, "step": 790 }, { "epoch": 0.39254170755642787, "grad_norm": 1.4296875, "learning_rate": 3.194014447884417e-05, "loss": 0.5596, "step": 800 }, { "epoch": 0.39254170755642787, "eval_loss": 0.551950216293335, "eval_runtime": 28.6541, "eval_samples_per_second": 6.98, "eval_steps_per_second": 1.745, "step": 800 }, { "epoch": 0.3974484789008832, "grad_norm": 1.484375, "learning_rate": 3.1682146542827656e-05, "loss": 0.5513, "step": 810 }, { "epoch": 0.4023552502453386, "grad_norm": 1.21875, "learning_rate": 3.142414860681115e-05, "loss": 0.5023, "step": 820 }, { "epoch": 0.4072620215897939, "grad_norm": 1.109375, "learning_rate": 3.1166150670794634e-05, "loss": 0.5519, "step": 830 }, { "epoch": 0.41216879293424924, "grad_norm": 1.3984375, "learning_rate": 3.090815273477812e-05, "loss": 0.51, "step": 840 }, { "epoch": 0.4170755642787046, "grad_norm": 1.0859375, "learning_rate": 3.065015479876161e-05, "loss": 0.5496, "step": 850 }, { "epoch": 0.4170755642787046, "eval_loss": 0.5526400208473206, "eval_runtime": 28.572, "eval_samples_per_second": 7.0, "eval_steps_per_second": 1.75, "step": 850 }, { "epoch": 0.42198233562315995, "grad_norm": 1.2265625, "learning_rate": 3.0392156862745097e-05, "loss": 0.5372, "step": 860 }, { "epoch": 0.4268891069676153, "grad_norm": 1.9140625, "learning_rate": 3.013415892672859e-05, "loss": 0.5384, "step": 870 }, { "epoch": 0.43179587831207067, "grad_norm": 0.99609375, "learning_rate": 2.9876160990712077e-05, "loss": 0.4739, "step": 880 }, { "epoch": 0.436702649656526, "grad_norm": 1.15625, "learning_rate": 2.9618163054695563e-05, "loss": 0.4766, "step": 890 }, { "epoch": 0.44160942100098133, "grad_norm": 1.7734375, "learning_rate": 2.936016511867905e-05, "loss": 0.5321, "step": 900 }, { "epoch": 0.44160942100098133, "eval_loss": 0.5481389164924622, "eval_runtime": 28.6453, "eval_samples_per_second": 6.982, "eval_steps_per_second": 1.745, "step": 900 }, { "epoch": 0.4465161923454367, "grad_norm": 0.9453125, "learning_rate": 2.9102167182662537e-05, "loss": 0.5654, "step": 910 }, { "epoch": 0.45142296368989204, "grad_norm": 1.09375, "learning_rate": 2.884416924664603e-05, "loss": 0.5514, "step": 920 }, { "epoch": 0.4563297350343474, "grad_norm": 1.1875, "learning_rate": 2.8586171310629518e-05, "loss": 0.4833, "step": 930 }, { "epoch": 0.46123650637880276, "grad_norm": 1.4609375, "learning_rate": 2.8328173374613003e-05, "loss": 0.549, "step": 940 }, { "epoch": 0.4661432777232581, "grad_norm": 0.9453125, "learning_rate": 2.8070175438596492e-05, "loss": 0.5194, "step": 950 }, { "epoch": 0.4661432777232581, "eval_loss": 0.5417376160621643, "eval_runtime": 28.3949, "eval_samples_per_second": 7.044, "eval_steps_per_second": 1.761, "step": 950 }, { "epoch": 0.47105004906771347, "grad_norm": 1.4296875, "learning_rate": 2.7812177502579977e-05, "loss": 0.5468, "step": 960 }, { "epoch": 0.4759568204121688, "grad_norm": 0.859375, "learning_rate": 2.755417956656347e-05, "loss": 0.5307, "step": 970 }, { "epoch": 0.48086359175662413, "grad_norm": 1.6640625, "learning_rate": 2.7296181630546958e-05, "loss": 0.5379, "step": 980 }, { "epoch": 0.4857703631010795, "grad_norm": 1.8828125, "learning_rate": 2.7038183694530443e-05, "loss": 0.5197, "step": 990 }, { "epoch": 0.49067713444553485, "grad_norm": 1.265625, "learning_rate": 2.6780185758513932e-05, "loss": 0.5557, "step": 1000 }, { "epoch": 0.49067713444553485, "eval_loss": 0.5419167876243591, "eval_runtime": 28.5704, "eval_samples_per_second": 7.0, "eval_steps_per_second": 1.75, "step": 1000 }, { "epoch": 0.4955839057899902, "grad_norm": 0.984375, "learning_rate": 2.6522187822497424e-05, "loss": 0.5121, "step": 1010 }, { "epoch": 0.5004906771344455, "grad_norm": 1.7421875, "learning_rate": 2.626418988648091e-05, "loss": 0.4932, "step": 1020 }, { "epoch": 0.5053974484789009, "grad_norm": 1.1015625, "learning_rate": 2.60061919504644e-05, "loss": 0.5361, "step": 1030 }, { "epoch": 0.5103042198233563, "grad_norm": 1.1015625, "learning_rate": 2.5748194014447884e-05, "loss": 0.5107, "step": 1040 }, { "epoch": 0.5152109911678115, "grad_norm": 1.359375, "learning_rate": 2.5490196078431373e-05, "loss": 0.5111, "step": 1050 }, { "epoch": 0.5152109911678115, "eval_loss": 0.5413140654563904, "eval_runtime": 28.49, "eval_samples_per_second": 7.02, "eval_steps_per_second": 1.755, "step": 1050 }, { "epoch": 0.5201177625122669, "grad_norm": 0.57421875, "learning_rate": 2.5232198142414865e-05, "loss": 0.5302, "step": 1060 }, { "epoch": 0.5250245338567223, "grad_norm": 1.859375, "learning_rate": 2.497420020639835e-05, "loss": 0.5089, "step": 1070 }, { "epoch": 0.5299313052011776, "grad_norm": 1.78125, "learning_rate": 2.471620227038184e-05, "loss": 0.5451, "step": 1080 }, { "epoch": 0.534838076545633, "grad_norm": 1.0546875, "learning_rate": 2.4458204334365324e-05, "loss": 0.5736, "step": 1090 }, { "epoch": 0.5397448478900884, "grad_norm": 1.28125, "learning_rate": 2.4200206398348816e-05, "loss": 0.5065, "step": 1100 }, { "epoch": 0.5397448478900884, "eval_loss": 0.5397326350212097, "eval_runtime": 28.806, "eval_samples_per_second": 6.943, "eval_steps_per_second": 1.736, "step": 1100 }, { "epoch": 0.5446516192345436, "grad_norm": 1.046875, "learning_rate": 2.39422084623323e-05, "loss": 0.5719, "step": 1110 }, { "epoch": 0.549558390578999, "grad_norm": 0.75390625, "learning_rate": 2.368421052631579e-05, "loss": 0.5343, "step": 1120 }, { "epoch": 0.5544651619234544, "grad_norm": 1.6484375, "learning_rate": 2.342621259029928e-05, "loss": 0.523, "step": 1130 }, { "epoch": 0.5593719332679097, "grad_norm": 0.9765625, "learning_rate": 2.3168214654282765e-05, "loss": 0.5345, "step": 1140 }, { "epoch": 0.5642787046123651, "grad_norm": 0.8359375, "learning_rate": 2.2910216718266257e-05, "loss": 0.5264, "step": 1150 }, { "epoch": 0.5642787046123651, "eval_loss": 0.5375524759292603, "eval_runtime": 28.5508, "eval_samples_per_second": 7.005, "eval_steps_per_second": 1.751, "step": 1150 }, { "epoch": 0.5691854759568205, "grad_norm": 0.9609375, "learning_rate": 2.2652218782249742e-05, "loss": 0.4863, "step": 1160 }, { "epoch": 0.5740922473012757, "grad_norm": 1.6875, "learning_rate": 2.2394220846233234e-05, "loss": 0.4676, "step": 1170 }, { "epoch": 0.5789990186457311, "grad_norm": 1.3984375, "learning_rate": 2.213622291021672e-05, "loss": 0.5751, "step": 1180 }, { "epoch": 0.5839057899901865, "grad_norm": 2.15625, "learning_rate": 2.1878224974200205e-05, "loss": 0.5694, "step": 1190 }, { "epoch": 0.5888125613346418, "grad_norm": 0.98828125, "learning_rate": 2.1620227038183697e-05, "loss": 0.5529, "step": 1200 }, { "epoch": 0.5888125613346418, "eval_loss": 0.5384119153022766, "eval_runtime": 28.624, "eval_samples_per_second": 6.987, "eval_steps_per_second": 1.747, "step": 1200 }, { "epoch": 0.5937193326790972, "grad_norm": 1.984375, "learning_rate": 2.1362229102167182e-05, "loss": 0.5438, "step": 1210 }, { "epoch": 0.5986261040235525, "grad_norm": 1.3203125, "learning_rate": 2.1104231166150675e-05, "loss": 0.5174, "step": 1220 }, { "epoch": 0.6035328753680078, "grad_norm": 1.5078125, "learning_rate": 2.084623323013416e-05, "loss": 0.5421, "step": 1230 }, { "epoch": 0.6084396467124632, "grad_norm": 1.234375, "learning_rate": 2.058823529411765e-05, "loss": 0.5427, "step": 1240 }, { "epoch": 0.6133464180569186, "grad_norm": 0.95703125, "learning_rate": 2.0330237358101137e-05, "loss": 0.5356, "step": 1250 }, { "epoch": 0.6133464180569186, "eval_loss": 0.5382110476493835, "eval_runtime": 28.6288, "eval_samples_per_second": 6.986, "eval_steps_per_second": 1.746, "step": 1250 }, { "epoch": 0.6182531894013739, "grad_norm": 1.53125, "learning_rate": 2.0072239422084623e-05, "loss": 0.5667, "step": 1260 }, { "epoch": 0.6231599607458292, "grad_norm": 1.5078125, "learning_rate": 1.9814241486068115e-05, "loss": 0.5421, "step": 1270 }, { "epoch": 0.6280667320902846, "grad_norm": 1.6015625, "learning_rate": 1.95562435500516e-05, "loss": 0.4955, "step": 1280 }, { "epoch": 0.6329735034347399, "grad_norm": 1.2890625, "learning_rate": 1.929824561403509e-05, "loss": 0.4995, "step": 1290 }, { "epoch": 0.6378802747791953, "grad_norm": 1.4140625, "learning_rate": 1.9040247678018578e-05, "loss": 0.5029, "step": 1300 }, { "epoch": 0.6378802747791953, "eval_loss": 0.5337280035018921, "eval_runtime": 28.4662, "eval_samples_per_second": 7.026, "eval_steps_per_second": 1.756, "step": 1300 }, { "epoch": 0.6427870461236507, "grad_norm": 1.3203125, "learning_rate": 1.8782249742002063e-05, "loss": 0.5149, "step": 1310 }, { "epoch": 0.647693817468106, "grad_norm": 0.91796875, "learning_rate": 1.8524251805985555e-05, "loss": 0.5011, "step": 1320 }, { "epoch": 0.6526005888125613, "grad_norm": 1.2890625, "learning_rate": 1.826625386996904e-05, "loss": 0.5474, "step": 1330 }, { "epoch": 0.6575073601570167, "grad_norm": 1.4375, "learning_rate": 1.800825593395253e-05, "loss": 0.5195, "step": 1340 }, { "epoch": 0.662414131501472, "grad_norm": 1.375, "learning_rate": 1.7750257997936018e-05, "loss": 0.4843, "step": 1350 }, { "epoch": 0.662414131501472, "eval_loss": 0.5355111360549927, "eval_runtime": 28.5183, "eval_samples_per_second": 7.013, "eval_steps_per_second": 1.753, "step": 1350 }, { "epoch": 0.6673209028459274, "grad_norm": 1.4140625, "learning_rate": 1.7492260061919503e-05, "loss": 0.5007, "step": 1360 }, { "epoch": 0.6722276741903828, "grad_norm": 1.640625, "learning_rate": 1.7234262125902996e-05, "loss": 0.5328, "step": 1370 }, { "epoch": 0.677134445534838, "grad_norm": 0.921875, "learning_rate": 1.697626418988648e-05, "loss": 0.5312, "step": 1380 }, { "epoch": 0.6820412168792934, "grad_norm": 1.8359375, "learning_rate": 1.671826625386997e-05, "loss": 0.5863, "step": 1390 }, { "epoch": 0.6869479882237488, "grad_norm": 0.87109375, "learning_rate": 1.646026831785346e-05, "loss": 0.5408, "step": 1400 }, { "epoch": 0.6869479882237488, "eval_loss": 0.5326699614524841, "eval_runtime": 28.6461, "eval_samples_per_second": 6.982, "eval_steps_per_second": 1.745, "step": 1400 }, { "epoch": 0.6918547595682041, "grad_norm": 1.203125, "learning_rate": 1.6202270381836944e-05, "loss": 0.4963, "step": 1410 }, { "epoch": 0.6967615309126595, "grad_norm": 1.2734375, "learning_rate": 1.5944272445820436e-05, "loss": 0.5244, "step": 1420 }, { "epoch": 0.7016683022571149, "grad_norm": 1.328125, "learning_rate": 1.568627450980392e-05, "loss": 0.5301, "step": 1430 }, { "epoch": 0.7065750736015701, "grad_norm": 0.8984375, "learning_rate": 1.542827657378741e-05, "loss": 0.4815, "step": 1440 }, { "epoch": 0.7114818449460255, "grad_norm": 0.56640625, "learning_rate": 1.5170278637770899e-05, "loss": 0.5156, "step": 1450 }, { "epoch": 0.7114818449460255, "eval_loss": 0.532850980758667, "eval_runtime": 28.7098, "eval_samples_per_second": 6.966, "eval_steps_per_second": 1.742, "step": 1450 }, { "epoch": 0.7163886162904809, "grad_norm": 1.203125, "learning_rate": 1.4912280701754386e-05, "loss": 0.5187, "step": 1460 }, { "epoch": 0.7212953876349362, "grad_norm": 2.828125, "learning_rate": 1.4654282765737876e-05, "loss": 0.4884, "step": 1470 }, { "epoch": 0.7262021589793916, "grad_norm": 1.2109375, "learning_rate": 1.4396284829721363e-05, "loss": 0.5434, "step": 1480 }, { "epoch": 0.7311089303238469, "grad_norm": 0.96875, "learning_rate": 1.4138286893704852e-05, "loss": 0.5154, "step": 1490 }, { "epoch": 0.7360157016683022, "grad_norm": 1.1640625, "learning_rate": 1.388028895768834e-05, "loss": 0.5312, "step": 1500 }, { "epoch": 0.7360157016683022, "eval_loss": 0.5351821184158325, "eval_runtime": 28.5695, "eval_samples_per_second": 7.0, "eval_steps_per_second": 1.75, "step": 1500 }, { "epoch": 0.7409224730127576, "grad_norm": 1.328125, "learning_rate": 1.3622291021671826e-05, "loss": 0.5394, "step": 1510 }, { "epoch": 0.745829244357213, "grad_norm": 2.078125, "learning_rate": 1.3364293085655317e-05, "loss": 0.5081, "step": 1520 }, { "epoch": 0.7507360157016683, "grad_norm": 1.09375, "learning_rate": 1.3106295149638804e-05, "loss": 0.5498, "step": 1530 }, { "epoch": 0.7556427870461236, "grad_norm": 0.71484375, "learning_rate": 1.2848297213622292e-05, "loss": 0.4704, "step": 1540 }, { "epoch": 0.760549558390579, "grad_norm": 1.0078125, "learning_rate": 1.259029927760578e-05, "loss": 0.5095, "step": 1550 }, { "epoch": 0.760549558390579, "eval_loss": 0.5329477787017822, "eval_runtime": 28.5741, "eval_samples_per_second": 6.999, "eval_steps_per_second": 1.75, "step": 1550 }, { "epoch": 0.7654563297350343, "grad_norm": 0.86328125, "learning_rate": 1.2332301341589268e-05, "loss": 0.5021, "step": 1560 }, { "epoch": 0.7703631010794897, "grad_norm": 1.3671875, "learning_rate": 1.2074303405572757e-05, "loss": 0.5578, "step": 1570 }, { "epoch": 0.7752698724239451, "grad_norm": 1.296875, "learning_rate": 1.1816305469556244e-05, "loss": 0.5031, "step": 1580 }, { "epoch": 0.7801766437684003, "grad_norm": 0.87890625, "learning_rate": 1.1558307533539733e-05, "loss": 0.5225, "step": 1590 }, { "epoch": 0.7850834151128557, "grad_norm": 1.34375, "learning_rate": 1.130030959752322e-05, "loss": 0.4909, "step": 1600 }, { "epoch": 0.7850834151128557, "eval_loss": 0.5333244204521179, "eval_runtime": 28.4593, "eval_samples_per_second": 7.028, "eval_steps_per_second": 1.757, "step": 1600 }, { "epoch": 0.7899901864573111, "grad_norm": 1.2734375, "learning_rate": 1.1042311661506709e-05, "loss": 0.5154, "step": 1610 }, { "epoch": 0.7948969578017664, "grad_norm": 2.0, "learning_rate": 1.0784313725490197e-05, "loss": 0.5048, "step": 1620 }, { "epoch": 0.7998037291462218, "grad_norm": 1.078125, "learning_rate": 1.0526315789473684e-05, "loss": 0.4927, "step": 1630 }, { "epoch": 0.8047105004906772, "grad_norm": 1.2265625, "learning_rate": 1.0268317853457173e-05, "loss": 0.5636, "step": 1640 }, { "epoch": 0.8096172718351324, "grad_norm": 1.1796875, "learning_rate": 1.001031991744066e-05, "loss": 0.5423, "step": 1650 }, { "epoch": 0.8096172718351324, "eval_loss": 0.5323337316513062, "eval_runtime": 28.6741, "eval_samples_per_second": 6.975, "eval_steps_per_second": 1.744, "step": 1650 }, { "epoch": 0.8145240431795878, "grad_norm": 1.546875, "learning_rate": 9.752321981424149e-06, "loss": 0.4747, "step": 1660 }, { "epoch": 0.8194308145240432, "grad_norm": 1.0078125, "learning_rate": 9.494324045407638e-06, "loss": 0.4759, "step": 1670 }, { "epoch": 0.8243375858684985, "grad_norm": 0.87890625, "learning_rate": 9.236326109391125e-06, "loss": 0.5072, "step": 1680 }, { "epoch": 0.8292443572129539, "grad_norm": 0.83984375, "learning_rate": 8.978328173374614e-06, "loss": 0.4662, "step": 1690 }, { "epoch": 0.8341511285574092, "grad_norm": 0.95703125, "learning_rate": 8.7203302373581e-06, "loss": 0.5161, "step": 1700 }, { "epoch": 0.8341511285574092, "eval_loss": 0.5313096046447754, "eval_runtime": 28.6753, "eval_samples_per_second": 6.975, "eval_steps_per_second": 1.744, "step": 1700 }, { "epoch": 0.8390578999018645, "grad_norm": 1.3203125, "learning_rate": 8.46233230134159e-06, "loss": 0.4837, "step": 1710 }, { "epoch": 0.8439646712463199, "grad_norm": 1.1171875, "learning_rate": 8.204334365325078e-06, "loss": 0.49, "step": 1720 }, { "epoch": 0.8488714425907753, "grad_norm": 1.515625, "learning_rate": 7.946336429308567e-06, "loss": 0.5959, "step": 1730 }, { "epoch": 0.8537782139352306, "grad_norm": 1.1171875, "learning_rate": 7.688338493292054e-06, "loss": 0.5113, "step": 1740 }, { "epoch": 0.858684985279686, "grad_norm": 1.4375, "learning_rate": 7.430340557275542e-06, "loss": 0.5343, "step": 1750 }, { "epoch": 0.858684985279686, "eval_loss": 0.530957043170929, "eval_runtime": 28.7519, "eval_samples_per_second": 6.956, "eval_steps_per_second": 1.739, "step": 1750 }, { "epoch": 0.8635917566241413, "grad_norm": 1.109375, "learning_rate": 7.1723426212590306e-06, "loss": 0.5282, "step": 1760 }, { "epoch": 0.8684985279685966, "grad_norm": 1.5, "learning_rate": 6.9143446852425185e-06, "loss": 0.4993, "step": 1770 }, { "epoch": 0.873405299313052, "grad_norm": 1.1328125, "learning_rate": 6.656346749226007e-06, "loss": 0.4911, "step": 1780 }, { "epoch": 0.8783120706575074, "grad_norm": 1.1640625, "learning_rate": 6.398348813209494e-06, "loss": 0.4945, "step": 1790 }, { "epoch": 0.8832188420019627, "grad_norm": 0.6796875, "learning_rate": 6.140350877192982e-06, "loss": 0.5435, "step": 1800 }, { "epoch": 0.8832188420019627, "eval_loss": 0.5307140350341797, "eval_runtime": 28.6268, "eval_samples_per_second": 6.986, "eval_steps_per_second": 1.747, "step": 1800 }, { "epoch": 0.888125613346418, "grad_norm": 0.7890625, "learning_rate": 5.882352941176471e-06, "loss": 0.5001, "step": 1810 }, { "epoch": 0.8930323846908734, "grad_norm": 1.265625, "learning_rate": 5.624355005159959e-06, "loss": 0.5042, "step": 1820 }, { "epoch": 0.8979391560353287, "grad_norm": 1.1875, "learning_rate": 5.366357069143447e-06, "loss": 0.5642, "step": 1830 }, { "epoch": 0.9028459273797841, "grad_norm": 1.2265625, "learning_rate": 5.1083591331269355e-06, "loss": 0.5587, "step": 1840 }, { "epoch": 0.9077526987242395, "grad_norm": 1.921875, "learning_rate": 4.850361197110423e-06, "loss": 0.539, "step": 1850 }, { "epoch": 0.9077526987242395, "eval_loss": 0.5304572582244873, "eval_runtime": 28.4933, "eval_samples_per_second": 7.019, "eval_steps_per_second": 1.755, "step": 1850 }, { "epoch": 0.9126594700686947, "grad_norm": 1.375, "learning_rate": 4.592363261093911e-06, "loss": 0.5004, "step": 1860 }, { "epoch": 0.9175662414131501, "grad_norm": 1.234375, "learning_rate": 4.3343653250774e-06, "loss": 0.4948, "step": 1870 }, { "epoch": 0.9224730127576055, "grad_norm": 1.2109375, "learning_rate": 4.076367389060888e-06, "loss": 0.5392, "step": 1880 }, { "epoch": 0.9273797841020608, "grad_norm": 1.1171875, "learning_rate": 3.818369453044376e-06, "loss": 0.5441, "step": 1890 }, { "epoch": 0.9322865554465162, "grad_norm": 2.65625, "learning_rate": 3.560371517027864e-06, "loss": 0.5096, "step": 1900 }, { "epoch": 0.9322865554465162, "eval_loss": 0.5307794809341431, "eval_runtime": 28.4936, "eval_samples_per_second": 7.019, "eval_steps_per_second": 1.755, "step": 1900 }, { "epoch": 0.9371933267909716, "grad_norm": 0.46875, "learning_rate": 3.3023735810113516e-06, "loss": 0.5092, "step": 1910 }, { "epoch": 0.9421000981354269, "grad_norm": 1.0546875, "learning_rate": 3.0443756449948404e-06, "loss": 0.5825, "step": 1920 }, { "epoch": 0.9470068694798822, "grad_norm": 1.5078125, "learning_rate": 2.7863777089783283e-06, "loss": 0.5331, "step": 1930 }, { "epoch": 0.9519136408243376, "grad_norm": 1.03125, "learning_rate": 2.5283797729618166e-06, "loss": 0.5464, "step": 1940 }, { "epoch": 0.956820412168793, "grad_norm": 1.1640625, "learning_rate": 2.2703818369453045e-06, "loss": 0.5155, "step": 1950 }, { "epoch": 0.956820412168793, "eval_loss": 0.5306495428085327, "eval_runtime": 28.7357, "eval_samples_per_second": 6.96, "eval_steps_per_second": 1.74, "step": 1950 }, { "epoch": 0.9617271835132483, "grad_norm": 0.9921875, "learning_rate": 2.012383900928793e-06, "loss": 0.5282, "step": 1960 }, { "epoch": 0.9666339548577036, "grad_norm": 0.40234375, "learning_rate": 1.7543859649122807e-06, "loss": 0.5208, "step": 1970 }, { "epoch": 0.971540726202159, "grad_norm": 0.78515625, "learning_rate": 1.4963880288957689e-06, "loss": 0.5216, "step": 1980 }, { "epoch": 0.9764474975466143, "grad_norm": 1.3359375, "learning_rate": 1.2383900928792572e-06, "loss": 0.516, "step": 1990 }, { "epoch": 0.9813542688910697, "grad_norm": 3.125, "learning_rate": 9.80392156862745e-07, "loss": 0.529, "step": 2000 }, { "epoch": 0.9813542688910697, "eval_loss": 0.5309420228004456, "eval_runtime": 28.7264, "eval_samples_per_second": 6.962, "eval_steps_per_second": 1.741, "step": 2000 }, { "epoch": 0.9862610402355251, "grad_norm": 1.3046875, "learning_rate": 7.223942208462333e-07, "loss": 0.5101, "step": 2010 }, { "epoch": 0.9911678115799804, "grad_norm": 1.109375, "learning_rate": 4.6439628482972136e-07, "loss": 0.4883, "step": 2020 }, { "epoch": 0.9960745829244357, "grad_norm": 0.71484375, "learning_rate": 2.0639834881320948e-07, "loss": 0.5151, "step": 2030 } ], "logging_steps": 10, "max_steps": 2038, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.79578628062624e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }