|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 50, |
|
"global_step": 2038, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004906771344455349, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6955, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009813542688910697, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6952, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.014720314033366046, |
|
"grad_norm": 0.25, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.6964, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019627085377821395, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6966, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02453385672227674, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.6944, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02453385672227674, |
|
"eval_loss": 0.6937512159347534, |
|
"eval_runtime": 28.3512, |
|
"eval_samples_per_second": 7.054, |
|
"eval_steps_per_second": 1.764, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.029440628066732092, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6944, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03434739941118744, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.694, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03925417075564279, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6938, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04416094210009813, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.6928, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04906771344455348, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6928, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04906771344455348, |
|
"eval_loss": 0.6929062008857727, |
|
"eval_runtime": 28.6139, |
|
"eval_samples_per_second": 6.99, |
|
"eval_steps_per_second": 1.747, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.053974484789008834, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.974200206398349e-05, |
|
"loss": 0.692, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.058881256133464184, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.948400412796697e-05, |
|
"loss": 0.6924, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06378802747791953, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.922600619195047e-05, |
|
"loss": 0.6921, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06869479882237488, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 4.896800825593396e-05, |
|
"loss": 0.6927, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07360157016683022, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 4.8710010319917446e-05, |
|
"loss": 0.6914, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07360157016683022, |
|
"eval_loss": 0.6915245056152344, |
|
"eval_runtime": 28.6003, |
|
"eval_samples_per_second": 6.993, |
|
"eval_steps_per_second": 1.748, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07850834151128558, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.845201238390093e-05, |
|
"loss": 0.6923, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08341511285574092, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 4.819401444788442e-05, |
|
"loss": 0.6893, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08832188420019627, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 4.793601651186791e-05, |
|
"loss": 0.6865, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09322865554465162, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 4.7678018575851394e-05, |
|
"loss": 0.6877, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09813542688910697, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 4.742002063983488e-05, |
|
"loss": 0.6852, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09813542688910697, |
|
"eval_loss": 0.6849114894866943, |
|
"eval_runtime": 28.416, |
|
"eval_samples_per_second": 7.038, |
|
"eval_steps_per_second": 1.76, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10304219823356231, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.716202270381837e-05, |
|
"loss": 0.6767, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10794896957801767, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 4.690402476780186e-05, |
|
"loss": 0.6804, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.11285574092247301, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.664602683178535e-05, |
|
"loss": 0.6814, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11776251226692837, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.638802889576884e-05, |
|
"loss": 0.6586, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.12266928361138371, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.613003095975233e-05, |
|
"loss": 0.6478, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12266928361138371, |
|
"eval_loss": 0.6532555222511292, |
|
"eval_runtime": 28.4259, |
|
"eval_samples_per_second": 7.036, |
|
"eval_steps_per_second": 1.759, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12757605495583907, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.587203302373581e-05, |
|
"loss": 0.6473, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1324828263002944, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.56140350877193e-05, |
|
"loss": 0.6397, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13738959764474976, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.535603715170279e-05, |
|
"loss": 0.6243, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1422963689892051, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.5098039215686275e-05, |
|
"loss": 0.6304, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14720314033366044, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.4840041279669764e-05, |
|
"loss": 0.6278, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14720314033366044, |
|
"eval_loss": 0.6153883337974548, |
|
"eval_runtime": 28.6968, |
|
"eval_samples_per_second": 6.969, |
|
"eval_steps_per_second": 1.742, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1521099116781158, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.458204334365325e-05, |
|
"loss": 0.6067, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15701668302257116, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.432404540763674e-05, |
|
"loss": 0.5956, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.1619234543670265, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.406604747162023e-05, |
|
"loss": 0.6263, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16683022571148184, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.380804953560372e-05, |
|
"loss": 0.5947, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1717369970559372, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.355005159958721e-05, |
|
"loss": 0.609, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1717369970559372, |
|
"eval_loss": 0.6010438799858093, |
|
"eval_runtime": 28.6965, |
|
"eval_samples_per_second": 6.969, |
|
"eval_steps_per_second": 1.742, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17664376840039253, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.329205366357069e-05, |
|
"loss": 0.5948, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1815505397448479, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.303405572755418e-05, |
|
"loss": 0.6043, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.18645731108930325, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.2776057791537674e-05, |
|
"loss": 0.5791, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.19136408243375858, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.2518059855521156e-05, |
|
"loss": 0.56, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19627085377821393, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.2260061919504645e-05, |
|
"loss": 0.5801, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19627085377821393, |
|
"eval_loss": 0.5847232937812805, |
|
"eval_runtime": 28.6639, |
|
"eval_samples_per_second": 6.977, |
|
"eval_steps_per_second": 1.744, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2011776251226693, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.200206398348813e-05, |
|
"loss": 0.5517, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.20608439646712462, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.174406604747162e-05, |
|
"loss": 0.5799, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.21099116781157998, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.148606811145511e-05, |
|
"loss": 0.54, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.21589793915603533, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 4.12280701754386e-05, |
|
"loss": 0.5591, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.22080471050049066, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.097007223942209e-05, |
|
"loss": 0.5558, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.22080471050049066, |
|
"eval_loss": 0.5724604725837708, |
|
"eval_runtime": 28.6438, |
|
"eval_samples_per_second": 6.982, |
|
"eval_steps_per_second": 1.746, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.22571148184494602, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.071207430340557e-05, |
|
"loss": 0.6203, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.23061825318940138, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.0454076367389066e-05, |
|
"loss": 0.5813, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.23552502453385674, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.0196078431372555e-05, |
|
"loss": 0.5946, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.24043179587831207, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.9938080495356037e-05, |
|
"loss": 0.5635, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.24533856722276742, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 3.9680082559339525e-05, |
|
"loss": 0.5625, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24533856722276742, |
|
"eval_loss": 0.5732089877128601, |
|
"eval_runtime": 28.6521, |
|
"eval_samples_per_second": 6.98, |
|
"eval_steps_per_second": 1.745, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.25024533856722275, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 3.9422084623323014e-05, |
|
"loss": 0.5378, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.25515210991167814, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.91640866873065e-05, |
|
"loss": 0.5908, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.26005888125613347, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.890608875128999e-05, |
|
"loss": 0.576, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.2649656526005888, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.864809081527348e-05, |
|
"loss": 0.5248, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2698724239450442, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.839009287925697e-05, |
|
"loss": 0.5448, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2698724239450442, |
|
"eval_loss": 0.5607851147651672, |
|
"eval_runtime": 28.5118, |
|
"eval_samples_per_second": 7.015, |
|
"eval_steps_per_second": 1.754, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2747791952894995, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.813209494324045e-05, |
|
"loss": 0.5693, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.27968596663395484, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 3.7874097007223946e-05, |
|
"loss": 0.5248, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2845927379784102, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.7616099071207435e-05, |
|
"loss": 0.5815, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.28949950932286556, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.735810113519092e-05, |
|
"loss": 0.5524, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.2944062806673209, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.7100103199174406e-05, |
|
"loss": 0.5517, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2944062806673209, |
|
"eval_loss": 0.5586492419242859, |
|
"eval_runtime": 28.5258, |
|
"eval_samples_per_second": 7.011, |
|
"eval_steps_per_second": 1.753, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.29931305201177627, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.6842105263157895e-05, |
|
"loss": 0.5308, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3042198233562316, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.658410732714139e-05, |
|
"loss": 0.5938, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.30912659470068693, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.632610939112487e-05, |
|
"loss": 0.5611, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3140333660451423, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.606811145510836e-05, |
|
"loss": 0.5709, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.31894013738959764, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.581011351909185e-05, |
|
"loss": 0.6033, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.31894013738959764, |
|
"eval_loss": 0.564706027507782, |
|
"eval_runtime": 28.5712, |
|
"eval_samples_per_second": 7.0, |
|
"eval_steps_per_second": 1.75, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.323846908734053, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.555211558307533e-05, |
|
"loss": 0.5123, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.32875368007850836, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 3.529411764705883e-05, |
|
"loss": 0.5205, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3336604514229637, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.5036119711042316e-05, |
|
"loss": 0.5854, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.338567222767419, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 3.4778121775025805e-05, |
|
"loss": 0.5375, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.3434739941118744, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.452012383900929e-05, |
|
"loss": 0.5336, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3434739941118744, |
|
"eval_loss": 0.5557608008384705, |
|
"eval_runtime": 28.4609, |
|
"eval_samples_per_second": 7.027, |
|
"eval_steps_per_second": 1.757, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.34838076545632973, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 3.4262125902992775e-05, |
|
"loss": 0.5533, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.35328753680078506, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.400412796697627e-05, |
|
"loss": 0.5333, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.35819430814524045, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.374613003095975e-05, |
|
"loss": 0.5536, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3631010794896958, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 3.348813209494324e-05, |
|
"loss": 0.5448, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.3680078508341511, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 3.323013415892673e-05, |
|
"loss": 0.532, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3680078508341511, |
|
"eval_loss": 0.5521541833877563, |
|
"eval_runtime": 28.4595, |
|
"eval_samples_per_second": 7.028, |
|
"eval_steps_per_second": 1.757, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3729146221786065, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.297213622291022e-05, |
|
"loss": 0.565, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3778213935230618, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 3.271413828689371e-05, |
|
"loss": 0.5278, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.38272816486751715, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.24561403508772e-05, |
|
"loss": 0.5582, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.38763493621197254, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.2198142414860685e-05, |
|
"loss": 0.5253, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.39254170755642787, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 3.194014447884417e-05, |
|
"loss": 0.5596, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.39254170755642787, |
|
"eval_loss": 0.551950216293335, |
|
"eval_runtime": 28.6541, |
|
"eval_samples_per_second": 6.98, |
|
"eval_steps_per_second": 1.745, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3974484789008832, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 3.1682146542827656e-05, |
|
"loss": 0.5513, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.4023552502453386, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.142414860681115e-05, |
|
"loss": 0.5023, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4072620215897939, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.1166150670794634e-05, |
|
"loss": 0.5519, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.41216879293424924, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.090815273477812e-05, |
|
"loss": 0.51, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.4170755642787046, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.065015479876161e-05, |
|
"loss": 0.5496, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4170755642787046, |
|
"eval_loss": 0.5526400208473206, |
|
"eval_runtime": 28.572, |
|
"eval_samples_per_second": 7.0, |
|
"eval_steps_per_second": 1.75, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.42198233562315995, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.0392156862745097e-05, |
|
"loss": 0.5372, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4268891069676153, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 3.013415892672859e-05, |
|
"loss": 0.5384, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.43179587831207067, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 2.9876160990712077e-05, |
|
"loss": 0.4739, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.436702649656526, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 2.9618163054695563e-05, |
|
"loss": 0.4766, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.44160942100098133, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 2.936016511867905e-05, |
|
"loss": 0.5321, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.44160942100098133, |
|
"eval_loss": 0.5481389164924622, |
|
"eval_runtime": 28.6453, |
|
"eval_samples_per_second": 6.982, |
|
"eval_steps_per_second": 1.745, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4465161923454367, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 2.9102167182662537e-05, |
|
"loss": 0.5654, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.45142296368989204, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.884416924664603e-05, |
|
"loss": 0.5514, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4563297350343474, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.8586171310629518e-05, |
|
"loss": 0.4833, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.46123650637880276, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.8328173374613003e-05, |
|
"loss": 0.549, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.4661432777232581, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 2.8070175438596492e-05, |
|
"loss": 0.5194, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4661432777232581, |
|
"eval_loss": 0.5417376160621643, |
|
"eval_runtime": 28.3949, |
|
"eval_samples_per_second": 7.044, |
|
"eval_steps_per_second": 1.761, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.47105004906771347, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.7812177502579977e-05, |
|
"loss": 0.5468, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.4759568204121688, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.755417956656347e-05, |
|
"loss": 0.5307, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.48086359175662413, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.7296181630546958e-05, |
|
"loss": 0.5379, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.4857703631010795, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 2.7038183694530443e-05, |
|
"loss": 0.5197, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.49067713444553485, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.6780185758513932e-05, |
|
"loss": 0.5557, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.49067713444553485, |
|
"eval_loss": 0.5419167876243591, |
|
"eval_runtime": 28.5704, |
|
"eval_samples_per_second": 7.0, |
|
"eval_steps_per_second": 1.75, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4955839057899902, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 2.6522187822497424e-05, |
|
"loss": 0.5121, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.5004906771344455, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.626418988648091e-05, |
|
"loss": 0.4932, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5053974484789009, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.60061919504644e-05, |
|
"loss": 0.5361, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5103042198233563, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.5748194014447884e-05, |
|
"loss": 0.5107, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.5152109911678115, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.5490196078431373e-05, |
|
"loss": 0.5111, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5152109911678115, |
|
"eval_loss": 0.5413140654563904, |
|
"eval_runtime": 28.49, |
|
"eval_samples_per_second": 7.02, |
|
"eval_steps_per_second": 1.755, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5201177625122669, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 2.5232198142414865e-05, |
|
"loss": 0.5302, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5250245338567223, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 2.497420020639835e-05, |
|
"loss": 0.5089, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5299313052011776, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.471620227038184e-05, |
|
"loss": 0.5451, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.534838076545633, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 2.4458204334365324e-05, |
|
"loss": 0.5736, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5397448478900884, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.4200206398348816e-05, |
|
"loss": 0.5065, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5397448478900884, |
|
"eval_loss": 0.5397326350212097, |
|
"eval_runtime": 28.806, |
|
"eval_samples_per_second": 6.943, |
|
"eval_steps_per_second": 1.736, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5446516192345436, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.39422084623323e-05, |
|
"loss": 0.5719, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.549558390578999, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.368421052631579e-05, |
|
"loss": 0.5343, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5544651619234544, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.342621259029928e-05, |
|
"loss": 0.523, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5593719332679097, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 2.3168214654282765e-05, |
|
"loss": 0.5345, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5642787046123651, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.2910216718266257e-05, |
|
"loss": 0.5264, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5642787046123651, |
|
"eval_loss": 0.5375524759292603, |
|
"eval_runtime": 28.5508, |
|
"eval_samples_per_second": 7.005, |
|
"eval_steps_per_second": 1.751, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5691854759568205, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 2.2652218782249742e-05, |
|
"loss": 0.4863, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5740922473012757, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.2394220846233234e-05, |
|
"loss": 0.4676, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5789990186457311, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.213622291021672e-05, |
|
"loss": 0.5751, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5839057899901865, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.1878224974200205e-05, |
|
"loss": 0.5694, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5888125613346418, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.1620227038183697e-05, |
|
"loss": 0.5529, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5888125613346418, |
|
"eval_loss": 0.5384119153022766, |
|
"eval_runtime": 28.624, |
|
"eval_samples_per_second": 6.987, |
|
"eval_steps_per_second": 1.747, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5937193326790972, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 2.1362229102167182e-05, |
|
"loss": 0.5438, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5986261040235525, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 2.1104231166150675e-05, |
|
"loss": 0.5174, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.6035328753680078, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.084623323013416e-05, |
|
"loss": 0.5421, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6084396467124632, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.058823529411765e-05, |
|
"loss": 0.5427, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.6133464180569186, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 2.0330237358101137e-05, |
|
"loss": 0.5356, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6133464180569186, |
|
"eval_loss": 0.5382110476493835, |
|
"eval_runtime": 28.6288, |
|
"eval_samples_per_second": 6.986, |
|
"eval_steps_per_second": 1.746, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6182531894013739, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.0072239422084623e-05, |
|
"loss": 0.5667, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.6231599607458292, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.9814241486068115e-05, |
|
"loss": 0.5421, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6280667320902846, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.95562435500516e-05, |
|
"loss": 0.4955, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6329735034347399, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.929824561403509e-05, |
|
"loss": 0.4995, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6378802747791953, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.9040247678018578e-05, |
|
"loss": 0.5029, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6378802747791953, |
|
"eval_loss": 0.5337280035018921, |
|
"eval_runtime": 28.4662, |
|
"eval_samples_per_second": 7.026, |
|
"eval_steps_per_second": 1.756, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6427870461236507, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.8782249742002063e-05, |
|
"loss": 0.5149, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.647693817468106, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.8524251805985555e-05, |
|
"loss": 0.5011, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6526005888125613, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.826625386996904e-05, |
|
"loss": 0.5474, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6575073601570167, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.800825593395253e-05, |
|
"loss": 0.5195, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.662414131501472, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.7750257997936018e-05, |
|
"loss": 0.4843, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.662414131501472, |
|
"eval_loss": 0.5355111360549927, |
|
"eval_runtime": 28.5183, |
|
"eval_samples_per_second": 7.013, |
|
"eval_steps_per_second": 1.753, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6673209028459274, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.7492260061919503e-05, |
|
"loss": 0.5007, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6722276741903828, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.7234262125902996e-05, |
|
"loss": 0.5328, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.677134445534838, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 1.697626418988648e-05, |
|
"loss": 0.5312, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6820412168792934, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.671826625386997e-05, |
|
"loss": 0.5863, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6869479882237488, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 1.646026831785346e-05, |
|
"loss": 0.5408, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6869479882237488, |
|
"eval_loss": 0.5326699614524841, |
|
"eval_runtime": 28.6461, |
|
"eval_samples_per_second": 6.982, |
|
"eval_steps_per_second": 1.745, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6918547595682041, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.6202270381836944e-05, |
|
"loss": 0.4963, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.6967615309126595, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.5944272445820436e-05, |
|
"loss": 0.5244, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7016683022571149, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.568627450980392e-05, |
|
"loss": 0.5301, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.7065750736015701, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 1.542827657378741e-05, |
|
"loss": 0.4815, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7114818449460255, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.5170278637770899e-05, |
|
"loss": 0.5156, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7114818449460255, |
|
"eval_loss": 0.532850980758667, |
|
"eval_runtime": 28.7098, |
|
"eval_samples_per_second": 6.966, |
|
"eval_steps_per_second": 1.742, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7163886162904809, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.4912280701754386e-05, |
|
"loss": 0.5187, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7212953876349362, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.4654282765737876e-05, |
|
"loss": 0.4884, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7262021589793916, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.4396284829721363e-05, |
|
"loss": 0.5434, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7311089303238469, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.4138286893704852e-05, |
|
"loss": 0.5154, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7360157016683022, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.388028895768834e-05, |
|
"loss": 0.5312, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7360157016683022, |
|
"eval_loss": 0.5351821184158325, |
|
"eval_runtime": 28.5695, |
|
"eval_samples_per_second": 7.0, |
|
"eval_steps_per_second": 1.75, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7409224730127576, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.3622291021671826e-05, |
|
"loss": 0.5394, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.745829244357213, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.3364293085655317e-05, |
|
"loss": 0.5081, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.7507360157016683, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.3106295149638804e-05, |
|
"loss": 0.5498, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.7556427870461236, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.2848297213622292e-05, |
|
"loss": 0.4704, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.760549558390579, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 1.259029927760578e-05, |
|
"loss": 0.5095, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.760549558390579, |
|
"eval_loss": 0.5329477787017822, |
|
"eval_runtime": 28.5741, |
|
"eval_samples_per_second": 6.999, |
|
"eval_steps_per_second": 1.75, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7654563297350343, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 1.2332301341589268e-05, |
|
"loss": 0.5021, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.7703631010794897, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.2074303405572757e-05, |
|
"loss": 0.5578, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7752698724239451, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.1816305469556244e-05, |
|
"loss": 0.5031, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.7801766437684003, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 1.1558307533539733e-05, |
|
"loss": 0.5225, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.7850834151128557, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.130030959752322e-05, |
|
"loss": 0.4909, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7850834151128557, |
|
"eval_loss": 0.5333244204521179, |
|
"eval_runtime": 28.4593, |
|
"eval_samples_per_second": 7.028, |
|
"eval_steps_per_second": 1.757, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7899901864573111, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.1042311661506709e-05, |
|
"loss": 0.5154, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.7948969578017664, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.0784313725490197e-05, |
|
"loss": 0.5048, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.7998037291462218, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 0.4927, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.8047105004906772, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.0268317853457173e-05, |
|
"loss": 0.5636, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.8096172718351324, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.001031991744066e-05, |
|
"loss": 0.5423, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8096172718351324, |
|
"eval_loss": 0.5323337316513062, |
|
"eval_runtime": 28.6741, |
|
"eval_samples_per_second": 6.975, |
|
"eval_steps_per_second": 1.744, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8145240431795878, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 9.752321981424149e-06, |
|
"loss": 0.4747, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8194308145240432, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.494324045407638e-06, |
|
"loss": 0.4759, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8243375858684985, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 9.236326109391125e-06, |
|
"loss": 0.5072, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.8292443572129539, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 8.978328173374614e-06, |
|
"loss": 0.4662, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.8341511285574092, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 8.7203302373581e-06, |
|
"loss": 0.5161, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8341511285574092, |
|
"eval_loss": 0.5313096046447754, |
|
"eval_runtime": 28.6753, |
|
"eval_samples_per_second": 6.975, |
|
"eval_steps_per_second": 1.744, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8390578999018645, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 8.46233230134159e-06, |
|
"loss": 0.4837, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8439646712463199, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 8.204334365325078e-06, |
|
"loss": 0.49, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8488714425907753, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 7.946336429308567e-06, |
|
"loss": 0.5959, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.8537782139352306, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.688338493292054e-06, |
|
"loss": 0.5113, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.858684985279686, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.430340557275542e-06, |
|
"loss": 0.5343, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.858684985279686, |
|
"eval_loss": 0.530957043170929, |
|
"eval_runtime": 28.7519, |
|
"eval_samples_per_second": 6.956, |
|
"eval_steps_per_second": 1.739, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.8635917566241413, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.1723426212590306e-06, |
|
"loss": 0.5282, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.8684985279685966, |
|
"grad_norm": 1.5, |
|
"learning_rate": 6.9143446852425185e-06, |
|
"loss": 0.4993, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.873405299313052, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 6.656346749226007e-06, |
|
"loss": 0.4911, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.8783120706575074, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 6.398348813209494e-06, |
|
"loss": 0.4945, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8832188420019627, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 6.140350877192982e-06, |
|
"loss": 0.5435, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8832188420019627, |
|
"eval_loss": 0.5307140350341797, |
|
"eval_runtime": 28.6268, |
|
"eval_samples_per_second": 6.986, |
|
"eval_steps_per_second": 1.747, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.888125613346418, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.5001, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.8930323846908734, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 5.624355005159959e-06, |
|
"loss": 0.5042, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.8979391560353287, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 5.366357069143447e-06, |
|
"loss": 0.5642, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.9028459273797841, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 5.1083591331269355e-06, |
|
"loss": 0.5587, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.9077526987242395, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.850361197110423e-06, |
|
"loss": 0.539, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9077526987242395, |
|
"eval_loss": 0.5304572582244873, |
|
"eval_runtime": 28.4933, |
|
"eval_samples_per_second": 7.019, |
|
"eval_steps_per_second": 1.755, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9126594700686947, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.592363261093911e-06, |
|
"loss": 0.5004, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.9175662414131501, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.3343653250774e-06, |
|
"loss": 0.4948, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.9224730127576055, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.076367389060888e-06, |
|
"loss": 0.5392, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9273797841020608, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 3.818369453044376e-06, |
|
"loss": 0.5441, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.9322865554465162, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3.560371517027864e-06, |
|
"loss": 0.5096, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9322865554465162, |
|
"eval_loss": 0.5307794809341431, |
|
"eval_runtime": 28.4936, |
|
"eval_samples_per_second": 7.019, |
|
"eval_steps_per_second": 1.755, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9371933267909716, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 3.3023735810113516e-06, |
|
"loss": 0.5092, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.9421000981354269, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.0443756449948404e-06, |
|
"loss": 0.5825, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9470068694798822, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 2.7863777089783283e-06, |
|
"loss": 0.5331, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9519136408243376, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.5283797729618166e-06, |
|
"loss": 0.5464, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.956820412168793, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.2703818369453045e-06, |
|
"loss": 0.5155, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.956820412168793, |
|
"eval_loss": 0.5306495428085327, |
|
"eval_runtime": 28.7357, |
|
"eval_samples_per_second": 6.96, |
|
"eval_steps_per_second": 1.74, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9617271835132483, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.012383900928793e-06, |
|
"loss": 0.5282, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.9666339548577036, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.7543859649122807e-06, |
|
"loss": 0.5208, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.971540726202159, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.4963880288957689e-06, |
|
"loss": 0.5216, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9764474975466143, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.2383900928792572e-06, |
|
"loss": 0.516, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9813542688910697, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.80392156862745e-07, |
|
"loss": 0.529, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9813542688910697, |
|
"eval_loss": 0.5309420228004456, |
|
"eval_runtime": 28.7264, |
|
"eval_samples_per_second": 6.962, |
|
"eval_steps_per_second": 1.741, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9862610402355251, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.223942208462333e-07, |
|
"loss": 0.5101, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.9911678115799804, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.6439628482972136e-07, |
|
"loss": 0.4883, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9960745829244357, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.0639834881320948e-07, |
|
"loss": 0.5151, |
|
"step": 2030 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2038, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.79578628062624e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|