|
{ |
|
"best_metric": 0.08158940076828003, |
|
"best_model_checkpoint": "./fine-tuned/checkpoint-12500", |
|
"epoch": 2.56, |
|
"eval_steps": 500, |
|
"global_step": 16000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 14499.107421875, |
|
"learning_rate": 2.9919999999999998e-05, |
|
"loss": 0.3351, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 9562.1748046875, |
|
"learning_rate": 2.9840000000000002e-05, |
|
"loss": 0.0964, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 11098.59375, |
|
"learning_rate": 2.976e-05, |
|
"loss": 0.0895, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 9281.0146484375, |
|
"learning_rate": 2.968e-05, |
|
"loss": 0.0797, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 10050.3623046875, |
|
"learning_rate": 2.96e-05, |
|
"loss": 0.0812, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 7611.0849609375, |
|
"learning_rate": 2.9520000000000002e-05, |
|
"loss": 0.0755, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 9915.1259765625, |
|
"learning_rate": 2.944e-05, |
|
"loss": 0.0793, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 10182.263671875, |
|
"learning_rate": 2.936e-05, |
|
"loss": 0.0775, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 11287.8271484375, |
|
"learning_rate": 2.928e-05, |
|
"loss": 0.0782, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6672.08251953125, |
|
"learning_rate": 2.92e-05, |
|
"loss": 0.0811, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.09235642850399017, |
|
"eval_runtime": 116.7651, |
|
"eval_samples_per_second": 17.128, |
|
"eval_steps_per_second": 2.141, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 6587.6513671875, |
|
"learning_rate": 2.9120000000000002e-05, |
|
"loss": 0.0815, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 6632.0947265625, |
|
"learning_rate": 2.904e-05, |
|
"loss": 0.0794, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 9301.228515625, |
|
"learning_rate": 2.896e-05, |
|
"loss": 0.076, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 10575.0791015625, |
|
"learning_rate": 2.888e-05, |
|
"loss": 0.0791, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8609.86328125, |
|
"learning_rate": 2.88e-05, |
|
"loss": 0.0799, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 11379.4423828125, |
|
"learning_rate": 2.8720000000000003e-05, |
|
"loss": 0.0759, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 8489.6904296875, |
|
"learning_rate": 2.864e-05, |
|
"loss": 0.0753, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 12353.6279296875, |
|
"learning_rate": 2.856e-05, |
|
"loss": 0.075, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 11535.3994140625, |
|
"learning_rate": 2.8480000000000002e-05, |
|
"loss": 0.0757, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8291.939453125, |
|
"learning_rate": 2.84e-05, |
|
"loss": 0.0753, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.08949962258338928, |
|
"eval_runtime": 116.7407, |
|
"eval_samples_per_second": 17.132, |
|
"eval_steps_per_second": 2.141, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 8266.658203125, |
|
"learning_rate": 2.832e-05, |
|
"loss": 0.0767, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 6160.548828125, |
|
"learning_rate": 2.824e-05, |
|
"loss": 0.067, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 7343.408203125, |
|
"learning_rate": 2.816e-05, |
|
"loss": 0.0717, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 5661.76318359375, |
|
"learning_rate": 2.8080000000000002e-05, |
|
"loss": 0.0733, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8678.46484375, |
|
"learning_rate": 2.8e-05, |
|
"loss": 0.0737, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 6331.21533203125, |
|
"learning_rate": 2.792e-05, |
|
"loss": 0.0696, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 10563.5400390625, |
|
"learning_rate": 2.784e-05, |
|
"loss": 0.0747, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 7221.74365234375, |
|
"learning_rate": 2.7760000000000002e-05, |
|
"loss": 0.0716, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 6486.46142578125, |
|
"learning_rate": 2.768e-05, |
|
"loss": 0.0711, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6838.505859375, |
|
"learning_rate": 2.7600000000000003e-05, |
|
"loss": 0.0703, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.08808805048465729, |
|
"eval_runtime": 116.8722, |
|
"eval_samples_per_second": 17.113, |
|
"eval_steps_per_second": 2.139, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 6751.6494140625, |
|
"learning_rate": 2.752e-05, |
|
"loss": 0.0781, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 5040.9033203125, |
|
"learning_rate": 2.7439999999999998e-05, |
|
"loss": 0.0686, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 8748.07421875, |
|
"learning_rate": 2.7360000000000002e-05, |
|
"loss": 0.0689, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 5971.705078125, |
|
"learning_rate": 2.728e-05, |
|
"loss": 0.0671, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 10833.1357421875, |
|
"learning_rate": 2.72e-05, |
|
"loss": 0.0734, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 10036.919921875, |
|
"learning_rate": 2.712e-05, |
|
"loss": 0.0715, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 7755.1669921875, |
|
"learning_rate": 2.704e-05, |
|
"loss": 0.0669, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 7584.822265625, |
|
"learning_rate": 2.696e-05, |
|
"loss": 0.0699, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 10103.142578125, |
|
"learning_rate": 2.688e-05, |
|
"loss": 0.07, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5768.24267578125, |
|
"learning_rate": 2.68e-05, |
|
"loss": 0.0709, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.08704760670661926, |
|
"eval_runtime": 116.8362, |
|
"eval_samples_per_second": 17.118, |
|
"eval_steps_per_second": 2.14, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 6016.46826171875, |
|
"learning_rate": 2.672e-05, |
|
"loss": 0.0663, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 6869.53076171875, |
|
"learning_rate": 2.6640000000000002e-05, |
|
"loss": 0.073, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 6099.595703125, |
|
"learning_rate": 2.656e-05, |
|
"loss": 0.0667, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 6923.919921875, |
|
"learning_rate": 2.648e-05, |
|
"loss": 0.0653, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8005.85595703125, |
|
"learning_rate": 2.64e-05, |
|
"loss": 0.0685, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 6473.466796875, |
|
"learning_rate": 2.632e-05, |
|
"loss": 0.0678, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 7177.6328125, |
|
"learning_rate": 2.6240000000000003e-05, |
|
"loss": 0.0637, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 5574.75439453125, |
|
"learning_rate": 2.616e-05, |
|
"loss": 0.0698, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 6910.39599609375, |
|
"learning_rate": 2.608e-05, |
|
"loss": 0.0645, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5913.9775390625, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.068, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.08615937829017639, |
|
"eval_runtime": 116.9591, |
|
"eval_samples_per_second": 17.1, |
|
"eval_steps_per_second": 2.137, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 7447.5625, |
|
"learning_rate": 2.592e-05, |
|
"loss": 0.0672, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 7057.10009765625, |
|
"learning_rate": 2.584e-05, |
|
"loss": 0.0683, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 8279.7392578125, |
|
"learning_rate": 2.576e-05, |
|
"loss": 0.0631, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 7663.275390625, |
|
"learning_rate": 2.568e-05, |
|
"loss": 0.0698, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7116.74609375, |
|
"learning_rate": 2.5600000000000002e-05, |
|
"loss": 0.0703, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 8839.5986328125, |
|
"learning_rate": 2.552e-05, |
|
"loss": 0.0654, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 7157.17333984375, |
|
"learning_rate": 2.544e-05, |
|
"loss": 0.0628, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 7690.267578125, |
|
"learning_rate": 2.536e-05, |
|
"loss": 0.0694, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 5030.39501953125, |
|
"learning_rate": 2.5280000000000002e-05, |
|
"loss": 0.0654, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 7269.51171875, |
|
"learning_rate": 2.52e-05, |
|
"loss": 0.0732, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.08551913499832153, |
|
"eval_runtime": 116.545, |
|
"eval_samples_per_second": 17.161, |
|
"eval_steps_per_second": 2.145, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 7060.21826171875, |
|
"learning_rate": 2.5120000000000003e-05, |
|
"loss": 0.0684, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 7841.55322265625, |
|
"learning_rate": 2.504e-05, |
|
"loss": 0.0653, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 5290.3271484375, |
|
"learning_rate": 2.4959999999999998e-05, |
|
"loss": 0.0668, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 6200.4853515625, |
|
"learning_rate": 2.4880000000000002e-05, |
|
"loss": 0.0665, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 6859.83544921875, |
|
"learning_rate": 2.48e-05, |
|
"loss": 0.0678, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 7718.70068359375, |
|
"learning_rate": 2.472e-05, |
|
"loss": 0.0679, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 10752.4873046875, |
|
"learning_rate": 2.464e-05, |
|
"loss": 0.062, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 6991.5087890625, |
|
"learning_rate": 2.456e-05, |
|
"loss": 0.0659, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 6204.99658203125, |
|
"learning_rate": 2.448e-05, |
|
"loss": 0.0636, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 13521.5908203125, |
|
"learning_rate": 2.44e-05, |
|
"loss": 0.0671, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.08540560305118561, |
|
"eval_runtime": 116.9131, |
|
"eval_samples_per_second": 17.107, |
|
"eval_steps_per_second": 2.138, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 6408.47265625, |
|
"learning_rate": 2.432e-05, |
|
"loss": 0.0652, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 5537.69287109375, |
|
"learning_rate": 2.4240000000000002e-05, |
|
"loss": 0.0633, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 7664.20703125, |
|
"learning_rate": 2.4160000000000002e-05, |
|
"loss": 0.0652, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 5726.9697265625, |
|
"learning_rate": 2.408e-05, |
|
"loss": 0.0667, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 6898.275390625, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.0675, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 9309.822265625, |
|
"learning_rate": 2.392e-05, |
|
"loss": 0.0668, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 8566.080078125, |
|
"learning_rate": 2.384e-05, |
|
"loss": 0.064, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 5729.54833984375, |
|
"learning_rate": 2.3760000000000003e-05, |
|
"loss": 0.0635, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 9562.8701171875, |
|
"learning_rate": 2.368e-05, |
|
"loss": 0.0643, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4704.76025390625, |
|
"learning_rate": 2.3599999999999998e-05, |
|
"loss": 0.0649, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.08466340601444244, |
|
"eval_runtime": 116.6411, |
|
"eval_samples_per_second": 17.147, |
|
"eval_steps_per_second": 2.143, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 7243.01611328125, |
|
"learning_rate": 2.3520000000000002e-05, |
|
"loss": 0.0622, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 7986.32568359375, |
|
"learning_rate": 2.344e-05, |
|
"loss": 0.0678, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 9114.8974609375, |
|
"learning_rate": 2.336e-05, |
|
"loss": 0.0671, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 8830.62109375, |
|
"learning_rate": 2.328e-05, |
|
"loss": 0.0679, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 9311.2412109375, |
|
"learning_rate": 2.32e-05, |
|
"loss": 0.063, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 31307.103515625, |
|
"learning_rate": 2.3120000000000002e-05, |
|
"loss": 0.0649, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 9040.0126953125, |
|
"learning_rate": 2.304e-05, |
|
"loss": 0.0633, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 7183.91650390625, |
|
"learning_rate": 2.296e-05, |
|
"loss": 0.0582, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 6460.2998046875, |
|
"learning_rate": 2.288e-05, |
|
"loss": 0.0672, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 6104.8671875, |
|
"learning_rate": 2.2800000000000002e-05, |
|
"loss": 0.0597, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.0842796117067337, |
|
"eval_runtime": 116.9361, |
|
"eval_samples_per_second": 17.103, |
|
"eval_steps_per_second": 2.138, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 7553.5556640625, |
|
"learning_rate": 2.272e-05, |
|
"loss": 0.063, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 7194.16162109375, |
|
"learning_rate": 2.2640000000000003e-05, |
|
"loss": 0.0597, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 7578.23583984375, |
|
"learning_rate": 2.256e-05, |
|
"loss": 0.0627, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 7874.51904296875, |
|
"learning_rate": 2.2479999999999998e-05, |
|
"loss": 0.0628, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 6014.06640625, |
|
"learning_rate": 2.2400000000000002e-05, |
|
"loss": 0.0651, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 7170.10400390625, |
|
"learning_rate": 2.232e-05, |
|
"loss": 0.0656, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 7596.84326171875, |
|
"learning_rate": 2.224e-05, |
|
"loss": 0.0598, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 7802.14990234375, |
|
"learning_rate": 2.216e-05, |
|
"loss": 0.0605, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 5468.1845703125, |
|
"learning_rate": 2.208e-05, |
|
"loss": 0.0594, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 5185.58642578125, |
|
"learning_rate": 2.2e-05, |
|
"loss": 0.0586, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.08396206796169281, |
|
"eval_runtime": 116.8224, |
|
"eval_samples_per_second": 17.12, |
|
"eval_steps_per_second": 2.14, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 6047.43359375, |
|
"learning_rate": 2.192e-05, |
|
"loss": 0.0673, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 6286.21484375, |
|
"learning_rate": 2.184e-05, |
|
"loss": 0.0609, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 6187.03369140625, |
|
"learning_rate": 2.1760000000000002e-05, |
|
"loss": 0.0628, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 4476.73095703125, |
|
"learning_rate": 2.1680000000000002e-05, |
|
"loss": 0.0626, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 6180.27490234375, |
|
"learning_rate": 2.16e-05, |
|
"loss": 0.061, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 8477.626953125, |
|
"learning_rate": 2.152e-05, |
|
"loss": 0.0638, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 11541.119140625, |
|
"learning_rate": 2.144e-05, |
|
"loss": 0.0602, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 6183.49609375, |
|
"learning_rate": 2.136e-05, |
|
"loss": 0.0645, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 7597.5810546875, |
|
"learning_rate": 2.1280000000000003e-05, |
|
"loss": 0.067, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 8438.478515625, |
|
"learning_rate": 2.12e-05, |
|
"loss": 0.0628, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.08360794186592102, |
|
"eval_runtime": 116.6576, |
|
"eval_samples_per_second": 17.144, |
|
"eval_steps_per_second": 2.143, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 8200.35546875, |
|
"learning_rate": 2.1119999999999998e-05, |
|
"loss": 0.0676, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 8816.8076171875, |
|
"learning_rate": 2.1040000000000002e-05, |
|
"loss": 0.0626, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 8886.630859375, |
|
"learning_rate": 2.096e-05, |
|
"loss": 0.0657, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 8212.525390625, |
|
"learning_rate": 2.088e-05, |
|
"loss": 0.0619, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 5723.00439453125, |
|
"learning_rate": 2.08e-05, |
|
"loss": 0.0623, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 8616.3349609375, |
|
"learning_rate": 2.072e-05, |
|
"loss": 0.063, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 7717.373046875, |
|
"learning_rate": 2.064e-05, |
|
"loss": 0.063, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 6325.8193359375, |
|
"learning_rate": 2.056e-05, |
|
"loss": 0.0628, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 6938.89111328125, |
|
"learning_rate": 2.048e-05, |
|
"loss": 0.0585, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 8704.166015625, |
|
"learning_rate": 2.04e-05, |
|
"loss": 0.0634, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.08321517705917358, |
|
"eval_runtime": 116.6701, |
|
"eval_samples_per_second": 17.142, |
|
"eval_steps_per_second": 2.143, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 5835.19189453125, |
|
"learning_rate": 2.0320000000000002e-05, |
|
"loss": 0.0643, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 5896.76318359375, |
|
"learning_rate": 2.024e-05, |
|
"loss": 0.0625, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 6958.45751953125, |
|
"learning_rate": 2.016e-05, |
|
"loss": 0.0657, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 4680.04736328125, |
|
"learning_rate": 2.008e-05, |
|
"loss": 0.0632, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 8230.8056640625, |
|
"learning_rate": 1.9999999999999998e-05, |
|
"loss": 0.0603, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 5693.77001953125, |
|
"learning_rate": 1.9920000000000002e-05, |
|
"loss": 0.0574, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 14030.3583984375, |
|
"learning_rate": 1.984e-05, |
|
"loss": 0.0563, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 11693.09375, |
|
"learning_rate": 1.976e-05, |
|
"loss": 0.0558, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 5772.1845703125, |
|
"learning_rate": 1.968e-05, |
|
"loss": 0.0544, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 8641.919921875, |
|
"learning_rate": 1.96e-05, |
|
"loss": 0.0606, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 0.08356834203004837, |
|
"eval_runtime": 116.7914, |
|
"eval_samples_per_second": 17.125, |
|
"eval_steps_per_second": 2.141, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 6437.4033203125, |
|
"learning_rate": 1.952e-05, |
|
"loss": 0.0567, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 5099.38330078125, |
|
"learning_rate": 1.944e-05, |
|
"loss": 0.0553, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 5254.07275390625, |
|
"learning_rate": 1.936e-05, |
|
"loss": 0.0564, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 7453.3330078125, |
|
"learning_rate": 1.9280000000000002e-05, |
|
"loss": 0.0573, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3853.006103515625, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.0607, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 8804.1083984375, |
|
"learning_rate": 1.912e-05, |
|
"loss": 0.0578, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 5899.22021484375, |
|
"learning_rate": 1.904e-05, |
|
"loss": 0.0555, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 8429.76171875, |
|
"learning_rate": 1.896e-05, |
|
"loss": 0.0539, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 9160.4794921875, |
|
"learning_rate": 1.888e-05, |
|
"loss": 0.0572, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 4707.27099609375, |
|
"learning_rate": 1.8800000000000003e-05, |
|
"loss": 0.0563, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.08350159972906113, |
|
"eval_runtime": 116.6938, |
|
"eval_samples_per_second": 17.139, |
|
"eval_steps_per_second": 2.142, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 5663.18603515625, |
|
"learning_rate": 1.872e-05, |
|
"loss": 0.0537, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 9569.765625, |
|
"learning_rate": 1.8639999999999998e-05, |
|
"loss": 0.0607, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 7370.98046875, |
|
"learning_rate": 1.8560000000000002e-05, |
|
"loss": 0.0607, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 5951.6533203125, |
|
"learning_rate": 1.848e-05, |
|
"loss": 0.0547, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 8285.0830078125, |
|
"learning_rate": 1.84e-05, |
|
"loss": 0.0589, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 7549.8271484375, |
|
"learning_rate": 1.832e-05, |
|
"loss": 0.0587, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 7480.25927734375, |
|
"learning_rate": 1.824e-05, |
|
"loss": 0.058, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 35994.15234375, |
|
"learning_rate": 1.816e-05, |
|
"loss": 0.0585, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 7489.05859375, |
|
"learning_rate": 1.808e-05, |
|
"loss": 0.0616, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 6134.80126953125, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.0572, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.08285799622535706, |
|
"eval_runtime": 116.9169, |
|
"eval_samples_per_second": 17.106, |
|
"eval_steps_per_second": 2.138, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 4982.9521484375, |
|
"learning_rate": 1.792e-05, |
|
"loss": 0.0569, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 5407.9384765625, |
|
"learning_rate": 1.7840000000000002e-05, |
|
"loss": 0.0579, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 6399.041015625, |
|
"learning_rate": 1.776e-05, |
|
"loss": 0.0569, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 6688.9658203125, |
|
"learning_rate": 1.768e-05, |
|
"loss": 0.0598, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 10116.4990234375, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.0538, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 7761.98876953125, |
|
"learning_rate": 1.7519999999999998e-05, |
|
"loss": 0.0549, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 5940.802734375, |
|
"learning_rate": 1.7440000000000002e-05, |
|
"loss": 0.0537, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 7946.06787109375, |
|
"learning_rate": 1.736e-05, |
|
"loss": 0.0548, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 8282.916015625, |
|
"learning_rate": 1.728e-05, |
|
"loss": 0.0539, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 6786.72509765625, |
|
"learning_rate": 1.72e-05, |
|
"loss": 0.0573, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_loss": 0.08285758644342422, |
|
"eval_runtime": 116.7577, |
|
"eval_samples_per_second": 17.129, |
|
"eval_steps_per_second": 2.141, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 6129.27783203125, |
|
"learning_rate": 1.712e-05, |
|
"loss": 0.0578, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 6502.31298828125, |
|
"learning_rate": 1.704e-05, |
|
"loss": 0.0513, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 10347.439453125, |
|
"learning_rate": 1.696e-05, |
|
"loss": 0.0527, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 7870.1796875, |
|
"learning_rate": 1.688e-05, |
|
"loss": 0.0565, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 7197.3447265625, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 0.0538, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 5525.79931640625, |
|
"learning_rate": 1.672e-05, |
|
"loss": 0.0579, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 5812.7490234375, |
|
"learning_rate": 1.664e-05, |
|
"loss": 0.0543, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 5728.1904296875, |
|
"learning_rate": 1.656e-05, |
|
"loss": 0.0572, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 6965.53759765625, |
|
"learning_rate": 1.648e-05, |
|
"loss": 0.0535, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 6986.52783203125, |
|
"learning_rate": 1.64e-05, |
|
"loss": 0.0549, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"eval_loss": 0.08279111981391907, |
|
"eval_runtime": 116.6629, |
|
"eval_samples_per_second": 17.143, |
|
"eval_steps_per_second": 2.143, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 6076.61865234375, |
|
"learning_rate": 1.6320000000000003e-05, |
|
"loss": 0.0566, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 6356.2578125, |
|
"learning_rate": 1.624e-05, |
|
"loss": 0.0527, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 8593.4482421875, |
|
"learning_rate": 1.6159999999999998e-05, |
|
"loss": 0.0611, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 6525.712890625, |
|
"learning_rate": 1.6080000000000002e-05, |
|
"loss": 0.0508, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 6376.82177734375, |
|
"learning_rate": 1.6e-05, |
|
"loss": 0.0554, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 7890.4990234375, |
|
"learning_rate": 1.592e-05, |
|
"loss": 0.0546, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 5426.74267578125, |
|
"learning_rate": 1.584e-05, |
|
"loss": 0.0558, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 8708.7294921875, |
|
"learning_rate": 1.576e-05, |
|
"loss": 0.0597, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 7744.2490234375, |
|
"learning_rate": 1.568e-05, |
|
"loss": 0.0553, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 4320.080078125, |
|
"learning_rate": 1.56e-05, |
|
"loss": 0.0602, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 0.08268450945615768, |
|
"eval_runtime": 116.8196, |
|
"eval_samples_per_second": 17.12, |
|
"eval_steps_per_second": 2.14, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 5681.900390625, |
|
"learning_rate": 1.552e-05, |
|
"loss": 0.0549, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 5498.50048828125, |
|
"learning_rate": 1.544e-05, |
|
"loss": 0.0551, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 7044.8017578125, |
|
"learning_rate": 1.5360000000000002e-05, |
|
"loss": 0.0557, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 8311.8076171875, |
|
"learning_rate": 1.528e-05, |
|
"loss": 0.0559, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 10259.4189453125, |
|
"learning_rate": 1.5200000000000002e-05, |
|
"loss": 0.057, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 7944.630859375, |
|
"learning_rate": 1.5120000000000001e-05, |
|
"loss": 0.0541, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 9513.1875, |
|
"learning_rate": 1.504e-05, |
|
"loss": 0.0558, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 6013.54296875, |
|
"learning_rate": 1.4959999999999999e-05, |
|
"loss": 0.0532, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 7162.22314453125, |
|
"learning_rate": 1.488e-05, |
|
"loss": 0.0553, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 6351.9833984375, |
|
"learning_rate": 1.48e-05, |
|
"loss": 0.0548, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 0.08252418041229248, |
|
"eval_runtime": 116.7082, |
|
"eval_samples_per_second": 17.137, |
|
"eval_steps_per_second": 2.142, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 6762.00244140625, |
|
"learning_rate": 1.472e-05, |
|
"loss": 0.0529, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 7704.66748046875, |
|
"learning_rate": 1.464e-05, |
|
"loss": 0.0576, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 5400.18798828125, |
|
"learning_rate": 1.4560000000000001e-05, |
|
"loss": 0.0556, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 6167.47216796875, |
|
"learning_rate": 1.448e-05, |
|
"loss": 0.0547, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 5073.39892578125, |
|
"learning_rate": 1.44e-05, |
|
"loss": 0.0544, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 6849.08447265625, |
|
"learning_rate": 1.432e-05, |
|
"loss": 0.0571, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 6866.765625, |
|
"learning_rate": 1.4240000000000001e-05, |
|
"loss": 0.0518, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 8185.33740234375, |
|
"learning_rate": 1.416e-05, |
|
"loss": 0.0605, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 7759.45361328125, |
|
"learning_rate": 1.408e-05, |
|
"loss": 0.0581, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 5736.8740234375, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.0582, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.08249519765377045, |
|
"eval_runtime": 116.9496, |
|
"eval_samples_per_second": 17.101, |
|
"eval_steps_per_second": 2.138, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 5240.361328125, |
|
"learning_rate": 1.392e-05, |
|
"loss": 0.0546, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 7000.00927734375, |
|
"learning_rate": 1.384e-05, |
|
"loss": 0.0535, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.624, |
|
"grad_norm": 8141.75048828125, |
|
"learning_rate": 1.376e-05, |
|
"loss": 0.0555, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 6566.3662109375, |
|
"learning_rate": 1.3680000000000001e-05, |
|
"loss": 0.0518, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 7028.8935546875, |
|
"learning_rate": 1.36e-05, |
|
"loss": 0.0572, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 13007.5703125, |
|
"learning_rate": 1.352e-05, |
|
"loss": 0.0567, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 6286.06640625, |
|
"learning_rate": 1.344e-05, |
|
"loss": 0.0529, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 6360.68408203125, |
|
"learning_rate": 1.336e-05, |
|
"loss": 0.054, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 8098.84228515625, |
|
"learning_rate": 1.328e-05, |
|
"loss": 0.0592, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 6886.65283203125, |
|
"learning_rate": 1.32e-05, |
|
"loss": 0.0524, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"eval_loss": 0.08225961029529572, |
|
"eval_runtime": 116.8647, |
|
"eval_samples_per_second": 17.114, |
|
"eval_steps_per_second": 2.139, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 5443.7119140625, |
|
"learning_rate": 1.3120000000000001e-05, |
|
"loss": 0.0554, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 6497.8408203125, |
|
"learning_rate": 1.304e-05, |
|
"loss": 0.057, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 5618.49853515625, |
|
"learning_rate": 1.296e-05, |
|
"loss": 0.0498, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 7447.96728515625, |
|
"learning_rate": 1.288e-05, |
|
"loss": 0.0568, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 8283.306640625, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.0566, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 7497.0419921875, |
|
"learning_rate": 1.272e-05, |
|
"loss": 0.0502, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 8445.2421875, |
|
"learning_rate": 1.2640000000000001e-05, |
|
"loss": 0.0562, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 15980.0498046875, |
|
"learning_rate": 1.2560000000000002e-05, |
|
"loss": 0.0588, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 5444.55615234375, |
|
"learning_rate": 1.2479999999999999e-05, |
|
"loss": 0.0564, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 7009.3037109375, |
|
"learning_rate": 1.24e-05, |
|
"loss": 0.0549, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.08220627158880234, |
|
"eval_runtime": 116.957, |
|
"eval_samples_per_second": 17.1, |
|
"eval_steps_per_second": 2.138, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 5123.0029296875, |
|
"learning_rate": 1.232e-05, |
|
"loss": 0.0562, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 7975.41064453125, |
|
"learning_rate": 1.224e-05, |
|
"loss": 0.0515, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 5846.47705078125, |
|
"learning_rate": 1.216e-05, |
|
"loss": 0.054, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 7158.12109375, |
|
"learning_rate": 1.2080000000000001e-05, |
|
"loss": 0.0577, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 5405.5224609375, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.0538, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 7155.9677734375, |
|
"learning_rate": 1.192e-05, |
|
"loss": 0.0539, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 6886.369140625, |
|
"learning_rate": 1.184e-05, |
|
"loss": 0.0565, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 7139.15283203125, |
|
"learning_rate": 1.1760000000000001e-05, |
|
"loss": 0.0539, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 5965.82666015625, |
|
"learning_rate": 1.168e-05, |
|
"loss": 0.0587, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 6557.6708984375, |
|
"learning_rate": 1.16e-05, |
|
"loss": 0.0552, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"eval_loss": 0.08207839727401733, |
|
"eval_runtime": 116.751, |
|
"eval_samples_per_second": 17.13, |
|
"eval_steps_per_second": 2.141, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 5619.83984375, |
|
"learning_rate": 1.152e-05, |
|
"loss": 0.0563, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 92426.8046875, |
|
"learning_rate": 1.144e-05, |
|
"loss": 0.0588, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 7583.005859375, |
|
"learning_rate": 1.136e-05, |
|
"loss": 0.0559, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 6395.92578125, |
|
"learning_rate": 1.128e-05, |
|
"loss": 0.0552, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 9939.912109375, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.0523, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 5679.93212890625, |
|
"learning_rate": 1.112e-05, |
|
"loss": 0.0585, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 6536.05419921875, |
|
"learning_rate": 1.104e-05, |
|
"loss": 0.0533, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 7333.63330078125, |
|
"learning_rate": 1.096e-05, |
|
"loss": 0.0566, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 7345.85009765625, |
|
"learning_rate": 1.0880000000000001e-05, |
|
"loss": 0.0555, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 21337.044921875, |
|
"learning_rate": 1.08e-05, |
|
"loss": 0.0576, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 0.08194975554943085, |
|
"eval_runtime": 116.8029, |
|
"eval_samples_per_second": 17.123, |
|
"eval_steps_per_second": 2.14, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 6469.14306640625, |
|
"learning_rate": 1.072e-05, |
|
"loss": 0.0584, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 7579.2998046875, |
|
"learning_rate": 1.0640000000000001e-05, |
|
"loss": 0.0573, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.944, |
|
"grad_norm": 8114.94921875, |
|
"learning_rate": 1.0559999999999999e-05, |
|
"loss": 0.0523, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 7263.44384765625, |
|
"learning_rate": 1.048e-05, |
|
"loss": 0.0517, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 8325.9580078125, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.0524, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 6577.01318359375, |
|
"learning_rate": 1.032e-05, |
|
"loss": 0.0533, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 6278.1826171875, |
|
"learning_rate": 1.024e-05, |
|
"loss": 0.0532, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 7769.2333984375, |
|
"learning_rate": 1.0160000000000001e-05, |
|
"loss": 0.0532, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 10089.91796875, |
|
"learning_rate": 1.008e-05, |
|
"loss": 0.0539, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 9177.8115234375, |
|
"learning_rate": 9.999999999999999e-06, |
|
"loss": 0.0588, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.08158940076828003, |
|
"eval_runtime": 116.7903, |
|
"eval_samples_per_second": 17.125, |
|
"eval_steps_per_second": 2.141, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.008, |
|
"grad_norm": 6336.53076171875, |
|
"learning_rate": 9.92e-06, |
|
"loss": 0.0466, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 4880.88330078125, |
|
"learning_rate": 9.84e-06, |
|
"loss": 0.0531, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 2.024, |
|
"grad_norm": 6478.1640625, |
|
"learning_rate": 9.76e-06, |
|
"loss": 0.0516, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 6105.318359375, |
|
"learning_rate": 9.68e-06, |
|
"loss": 0.0492, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 6270.1318359375, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.0511, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 5914.5458984375, |
|
"learning_rate": 9.52e-06, |
|
"loss": 0.0522, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 2.056, |
|
"grad_norm": 6194.03076171875, |
|
"learning_rate": 9.44e-06, |
|
"loss": 0.0535, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 7986.248046875, |
|
"learning_rate": 9.36e-06, |
|
"loss": 0.0529, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 2.072, |
|
"grad_norm": 10384.2099609375, |
|
"learning_rate": 9.280000000000001e-06, |
|
"loss": 0.0471, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 8849.5703125, |
|
"learning_rate": 9.2e-06, |
|
"loss": 0.0502, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 0.08202869445085526, |
|
"eval_runtime": 117.0019, |
|
"eval_samples_per_second": 17.094, |
|
"eval_steps_per_second": 2.137, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.088, |
|
"grad_norm": 7875.97900390625, |
|
"learning_rate": 9.12e-06, |
|
"loss": 0.049, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 6825.78076171875, |
|
"learning_rate": 9.04e-06, |
|
"loss": 0.0465, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 2.104, |
|
"grad_norm": 5515.30322265625, |
|
"learning_rate": 8.96e-06, |
|
"loss": 0.0535, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 8940.48828125, |
|
"learning_rate": 8.88e-06, |
|
"loss": 0.0564, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 5110.7119140625, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.0509, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 8984.7353515625, |
|
"learning_rate": 8.720000000000001e-06, |
|
"loss": 0.0479, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 2.136, |
|
"grad_norm": 8438.55078125, |
|
"learning_rate": 8.64e-06, |
|
"loss": 0.0502, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 5724.0849609375, |
|
"learning_rate": 8.56e-06, |
|
"loss": 0.0501, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 2.152, |
|
"grad_norm": 7649.28955078125, |
|
"learning_rate": 8.48e-06, |
|
"loss": 0.0569, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 8429.0166015625, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.053, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_loss": 0.08213882148265839, |
|
"eval_runtime": 116.6956, |
|
"eval_samples_per_second": 17.139, |
|
"eval_steps_per_second": 2.142, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.168, |
|
"grad_norm": 4402.388671875, |
|
"learning_rate": 8.32e-06, |
|
"loss": 0.0499, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 9858.970703125, |
|
"learning_rate": 8.24e-06, |
|
"loss": 0.0506, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 2.184, |
|
"grad_norm": 6748.5732421875, |
|
"learning_rate": 8.160000000000001e-06, |
|
"loss": 0.05, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 7720.3994140625, |
|
"learning_rate": 8.079999999999999e-06, |
|
"loss": 0.0504, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 5066.37060546875, |
|
"learning_rate": 8e-06, |
|
"loss": 0.0533, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 7975.1376953125, |
|
"learning_rate": 7.92e-06, |
|
"loss": 0.0482, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 2.216, |
|
"grad_norm": 6690.85302734375, |
|
"learning_rate": 7.84e-06, |
|
"loss": 0.0518, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 8501.337890625, |
|
"learning_rate": 7.76e-06, |
|
"loss": 0.0534, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 2.232, |
|
"grad_norm": 15215.427734375, |
|
"learning_rate": 7.680000000000001e-06, |
|
"loss": 0.0488, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 6265.7568359375, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.0468, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_loss": 0.08207998424768448, |
|
"eval_runtime": 116.7104, |
|
"eval_samples_per_second": 17.136, |
|
"eval_steps_per_second": 2.142, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.248, |
|
"grad_norm": 5661.556640625, |
|
"learning_rate": 7.52e-06, |
|
"loss": 0.0516, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 6117.46728515625, |
|
"learning_rate": 7.44e-06, |
|
"loss": 0.0535, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 2.2640000000000002, |
|
"grad_norm": 5083.50634765625, |
|
"learning_rate": 7.36e-06, |
|
"loss": 0.0514, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 6597.24365234375, |
|
"learning_rate": 7.280000000000001e-06, |
|
"loss": 0.0566, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 7306.90185546875, |
|
"learning_rate": 7.2e-06, |
|
"loss": 0.0523, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 6694.41552734375, |
|
"learning_rate": 7.1200000000000004e-06, |
|
"loss": 0.0475, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 2.296, |
|
"grad_norm": 3753.303466796875, |
|
"learning_rate": 7.04e-06, |
|
"loss": 0.0501, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 5714.30078125, |
|
"learning_rate": 6.96e-06, |
|
"loss": 0.0485, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 2.312, |
|
"grad_norm": 7579.119140625, |
|
"learning_rate": 6.88e-06, |
|
"loss": 0.0504, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 6103.64599609375, |
|
"learning_rate": 6.8e-06, |
|
"loss": 0.0531, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"eval_loss": 0.08199251443147659, |
|
"eval_runtime": 116.661, |
|
"eval_samples_per_second": 17.144, |
|
"eval_steps_per_second": 2.143, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.328, |
|
"grad_norm": 7419.63623046875, |
|
"learning_rate": 6.72e-06, |
|
"loss": 0.0527, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 6152.6513671875, |
|
"learning_rate": 6.64e-06, |
|
"loss": 0.048, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 2.344, |
|
"grad_norm": 6703.68994140625, |
|
"learning_rate": 6.560000000000001e-06, |
|
"loss": 0.0537, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 8612.31640625, |
|
"learning_rate": 6.48e-06, |
|
"loss": 0.0512, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 6183.3798828125, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.0499, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 7795.396484375, |
|
"learning_rate": 6.3200000000000005e-06, |
|
"loss": 0.0525, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 2.376, |
|
"grad_norm": 6911.2099609375, |
|
"learning_rate": 6.2399999999999995e-06, |
|
"loss": 0.0503, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 9744.9267578125, |
|
"learning_rate": 6.16e-06, |
|
"loss": 0.0509, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 2.392, |
|
"grad_norm": 4487.8115234375, |
|
"learning_rate": 6.08e-06, |
|
"loss": 0.0504, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 6276.47607421875, |
|
"learning_rate": 6e-06, |
|
"loss": 0.0505, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.08178989589214325, |
|
"eval_runtime": 116.6529, |
|
"eval_samples_per_second": 17.145, |
|
"eval_steps_per_second": 2.143, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.408, |
|
"grad_norm": 7706.4375, |
|
"learning_rate": 5.92e-06, |
|
"loss": 0.0513, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 6188.396484375, |
|
"learning_rate": 5.84e-06, |
|
"loss": 0.0511, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 2.424, |
|
"grad_norm": 6621.79345703125, |
|
"learning_rate": 5.76e-06, |
|
"loss": 0.0506, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 5284.65185546875, |
|
"learning_rate": 5.68e-06, |
|
"loss": 0.0486, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 6653.84716796875, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.053, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 6338.93505859375, |
|
"learning_rate": 5.52e-06, |
|
"loss": 0.0517, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 2.456, |
|
"grad_norm": 6020.87548828125, |
|
"learning_rate": 5.4400000000000004e-06, |
|
"loss": 0.0524, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 7275.64697265625, |
|
"learning_rate": 5.36e-06, |
|
"loss": 0.0516, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 2.472, |
|
"grad_norm": 5086.87744140625, |
|
"learning_rate": 5.279999999999999e-06, |
|
"loss": 0.0514, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 4989.05078125, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.0526, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"eval_loss": 0.08169461041688919, |
|
"eval_runtime": 116.7302, |
|
"eval_samples_per_second": 17.134, |
|
"eval_steps_per_second": 2.142, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.488, |
|
"grad_norm": 6472.25537109375, |
|
"learning_rate": 5.12e-06, |
|
"loss": 0.0496, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 6369.4833984375, |
|
"learning_rate": 5.04e-06, |
|
"loss": 0.0518, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 2.504, |
|
"grad_norm": 8784.1083984375, |
|
"learning_rate": 4.96e-06, |
|
"loss": 0.0547, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 8509.6650390625, |
|
"learning_rate": 4.88e-06, |
|
"loss": 0.0555, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 7856.84716796875, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.0513, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 6816.51123046875, |
|
"learning_rate": 4.72e-06, |
|
"loss": 0.0493, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 2.536, |
|
"grad_norm": 6773.06884765625, |
|
"learning_rate": 4.6400000000000005e-06, |
|
"loss": 0.05, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 9726.3818359375, |
|
"learning_rate": 4.56e-06, |
|
"loss": 0.0518, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 2.552, |
|
"grad_norm": 8707.6591796875, |
|
"learning_rate": 4.48e-06, |
|
"loss": 0.0499, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 4772.958984375, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.0522, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_loss": 0.08175843954086304, |
|
"eval_runtime": 116.8011, |
|
"eval_samples_per_second": 17.123, |
|
"eval_steps_per_second": 2.14, |
|
"step": 16000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 18750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.794660999168e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|