{
"best_metric": 0.08158940076828003,
"best_model_checkpoint": "./fine-tuned/checkpoint-12500",
"epoch": 2.56,
"eval_steps": 500,
"global_step": 16000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008,
"grad_norm": 14499.107421875,
"learning_rate": 2.9919999999999998e-05,
"loss": 0.3351,
"step": 50
},
{
"epoch": 0.016,
"grad_norm": 9562.1748046875,
"learning_rate": 2.9840000000000002e-05,
"loss": 0.0964,
"step": 100
},
{
"epoch": 0.024,
"grad_norm": 11098.59375,
"learning_rate": 2.976e-05,
"loss": 0.0895,
"step": 150
},
{
"epoch": 0.032,
"grad_norm": 9281.0146484375,
"learning_rate": 2.968e-05,
"loss": 0.0797,
"step": 200
},
{
"epoch": 0.04,
"grad_norm": 10050.3623046875,
"learning_rate": 2.96e-05,
"loss": 0.0812,
"step": 250
},
{
"epoch": 0.048,
"grad_norm": 7611.0849609375,
"learning_rate": 2.9520000000000002e-05,
"loss": 0.0755,
"step": 300
},
{
"epoch": 0.056,
"grad_norm": 9915.1259765625,
"learning_rate": 2.944e-05,
"loss": 0.0793,
"step": 350
},
{
"epoch": 0.064,
"grad_norm": 10182.263671875,
"learning_rate": 2.936e-05,
"loss": 0.0775,
"step": 400
},
{
"epoch": 0.072,
"grad_norm": 11287.8271484375,
"learning_rate": 2.928e-05,
"loss": 0.0782,
"step": 450
},
{
"epoch": 0.08,
"grad_norm": 6672.08251953125,
"learning_rate": 2.92e-05,
"loss": 0.0811,
"step": 500
},
{
"epoch": 0.08,
"eval_loss": 0.09235642850399017,
"eval_runtime": 116.7651,
"eval_samples_per_second": 17.128,
"eval_steps_per_second": 2.141,
"step": 500
},
{
"epoch": 0.088,
"grad_norm": 6587.6513671875,
"learning_rate": 2.9120000000000002e-05,
"loss": 0.0815,
"step": 550
},
{
"epoch": 0.096,
"grad_norm": 6632.0947265625,
"learning_rate": 2.904e-05,
"loss": 0.0794,
"step": 600
},
{
"epoch": 0.104,
"grad_norm": 9301.228515625,
"learning_rate": 2.896e-05,
"loss": 0.076,
"step": 650
},
{
"epoch": 0.112,
"grad_norm": 10575.0791015625,
"learning_rate": 2.888e-05,
"loss": 0.0791,
"step": 700
},
{
"epoch": 0.12,
"grad_norm": 8609.86328125,
"learning_rate": 2.88e-05,
"loss": 0.0799,
"step": 750
},
{
"epoch": 0.128,
"grad_norm": 11379.4423828125,
"learning_rate": 2.8720000000000003e-05,
"loss": 0.0759,
"step": 800
},
{
"epoch": 0.136,
"grad_norm": 8489.6904296875,
"learning_rate": 2.864e-05,
"loss": 0.0753,
"step": 850
},
{
"epoch": 0.144,
"grad_norm": 12353.6279296875,
"learning_rate": 2.856e-05,
"loss": 0.075,
"step": 900
},
{
"epoch": 0.152,
"grad_norm": 11535.3994140625,
"learning_rate": 2.8480000000000002e-05,
"loss": 0.0757,
"step": 950
},
{
"epoch": 0.16,
"grad_norm": 8291.939453125,
"learning_rate": 2.84e-05,
"loss": 0.0753,
"step": 1000
},
{
"epoch": 0.16,
"eval_loss": 0.08949962258338928,
"eval_runtime": 116.7407,
"eval_samples_per_second": 17.132,
"eval_steps_per_second": 2.141,
"step": 1000
},
{
"epoch": 0.168,
"grad_norm": 8266.658203125,
"learning_rate": 2.832e-05,
"loss": 0.0767,
"step": 1050
},
{
"epoch": 0.176,
"grad_norm": 6160.548828125,
"learning_rate": 2.824e-05,
"loss": 0.067,
"step": 1100
},
{
"epoch": 0.184,
"grad_norm": 7343.408203125,
"learning_rate": 2.816e-05,
"loss": 0.0717,
"step": 1150
},
{
"epoch": 0.192,
"grad_norm": 5661.76318359375,
"learning_rate": 2.8080000000000002e-05,
"loss": 0.0733,
"step": 1200
},
{
"epoch": 0.2,
"grad_norm": 8678.46484375,
"learning_rate": 2.8e-05,
"loss": 0.0737,
"step": 1250
},
{
"epoch": 0.208,
"grad_norm": 6331.21533203125,
"learning_rate": 2.792e-05,
"loss": 0.0696,
"step": 1300
},
{
"epoch": 0.216,
"grad_norm": 10563.5400390625,
"learning_rate": 2.784e-05,
"loss": 0.0747,
"step": 1350
},
{
"epoch": 0.224,
"grad_norm": 7221.74365234375,
"learning_rate": 2.7760000000000002e-05,
"loss": 0.0716,
"step": 1400
},
{
"epoch": 0.232,
"grad_norm": 6486.46142578125,
"learning_rate": 2.768e-05,
"loss": 0.0711,
"step": 1450
},
{
"epoch": 0.24,
"grad_norm": 6838.505859375,
"learning_rate": 2.7600000000000003e-05,
"loss": 0.0703,
"step": 1500
},
{
"epoch": 0.24,
"eval_loss": 0.08808805048465729,
"eval_runtime": 116.8722,
"eval_samples_per_second": 17.113,
"eval_steps_per_second": 2.139,
"step": 1500
},
{
"epoch": 0.248,
"grad_norm": 6751.6494140625,
"learning_rate": 2.752e-05,
"loss": 0.0781,
"step": 1550
},
{
"epoch": 0.256,
"grad_norm": 5040.9033203125,
"learning_rate": 2.7439999999999998e-05,
"loss": 0.0686,
"step": 1600
},
{
"epoch": 0.264,
"grad_norm": 8748.07421875,
"learning_rate": 2.7360000000000002e-05,
"loss": 0.0689,
"step": 1650
},
{
"epoch": 0.272,
"grad_norm": 5971.705078125,
"learning_rate": 2.728e-05,
"loss": 0.0671,
"step": 1700
},
{
"epoch": 0.28,
"grad_norm": 10833.1357421875,
"learning_rate": 2.72e-05,
"loss": 0.0734,
"step": 1750
},
{
"epoch": 0.288,
"grad_norm": 10036.919921875,
"learning_rate": 2.712e-05,
"loss": 0.0715,
"step": 1800
},
{
"epoch": 0.296,
"grad_norm": 7755.1669921875,
"learning_rate": 2.704e-05,
"loss": 0.0669,
"step": 1850
},
{
"epoch": 0.304,
"grad_norm": 7584.822265625,
"learning_rate": 2.696e-05,
"loss": 0.0699,
"step": 1900
},
{
"epoch": 0.312,
"grad_norm": 10103.142578125,
"learning_rate": 2.688e-05,
"loss": 0.07,
"step": 1950
},
{
"epoch": 0.32,
"grad_norm": 5768.24267578125,
"learning_rate": 2.68e-05,
"loss": 0.0709,
"step": 2000
},
{
"epoch": 0.32,
"eval_loss": 0.08704760670661926,
"eval_runtime": 116.8362,
"eval_samples_per_second": 17.118,
"eval_steps_per_second": 2.14,
"step": 2000
},
{
"epoch": 0.328,
"grad_norm": 6016.46826171875,
"learning_rate": 2.672e-05,
"loss": 0.0663,
"step": 2050
},
{
"epoch": 0.336,
"grad_norm": 6869.53076171875,
"learning_rate": 2.6640000000000002e-05,
"loss": 0.073,
"step": 2100
},
{
"epoch": 0.344,
"grad_norm": 6099.595703125,
"learning_rate": 2.656e-05,
"loss": 0.0667,
"step": 2150
},
{
"epoch": 0.352,
"grad_norm": 6923.919921875,
"learning_rate": 2.648e-05,
"loss": 0.0653,
"step": 2200
},
{
"epoch": 0.36,
"grad_norm": 8005.85595703125,
"learning_rate": 2.64e-05,
"loss": 0.0685,
"step": 2250
},
{
"epoch": 0.368,
"grad_norm": 6473.466796875,
"learning_rate": 2.632e-05,
"loss": 0.0678,
"step": 2300
},
{
"epoch": 0.376,
"grad_norm": 7177.6328125,
"learning_rate": 2.6240000000000003e-05,
"loss": 0.0637,
"step": 2350
},
{
"epoch": 0.384,
"grad_norm": 5574.75439453125,
"learning_rate": 2.616e-05,
"loss": 0.0698,
"step": 2400
},
{
"epoch": 0.392,
"grad_norm": 6910.39599609375,
"learning_rate": 2.608e-05,
"loss": 0.0645,
"step": 2450
},
{
"epoch": 0.4,
"grad_norm": 5913.9775390625,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.068,
"step": 2500
},
{
"epoch": 0.4,
"eval_loss": 0.08615937829017639,
"eval_runtime": 116.9591,
"eval_samples_per_second": 17.1,
"eval_steps_per_second": 2.137,
"step": 2500
},
{
"epoch": 0.408,
"grad_norm": 7447.5625,
"learning_rate": 2.592e-05,
"loss": 0.0672,
"step": 2550
},
{
"epoch": 0.416,
"grad_norm": 7057.10009765625,
"learning_rate": 2.584e-05,
"loss": 0.0683,
"step": 2600
},
{
"epoch": 0.424,
"grad_norm": 8279.7392578125,
"learning_rate": 2.576e-05,
"loss": 0.0631,
"step": 2650
},
{
"epoch": 0.432,
"grad_norm": 7663.275390625,
"learning_rate": 2.568e-05,
"loss": 0.0698,
"step": 2700
},
{
"epoch": 0.44,
"grad_norm": 7116.74609375,
"learning_rate": 2.5600000000000002e-05,
"loss": 0.0703,
"step": 2750
},
{
"epoch": 0.448,
"grad_norm": 8839.5986328125,
"learning_rate": 2.552e-05,
"loss": 0.0654,
"step": 2800
},
{
"epoch": 0.456,
"grad_norm": 7157.17333984375,
"learning_rate": 2.544e-05,
"loss": 0.0628,
"step": 2850
},
{
"epoch": 0.464,
"grad_norm": 7690.267578125,
"learning_rate": 2.536e-05,
"loss": 0.0694,
"step": 2900
},
{
"epoch": 0.472,
"grad_norm": 5030.39501953125,
"learning_rate": 2.5280000000000002e-05,
"loss": 0.0654,
"step": 2950
},
{
"epoch": 0.48,
"grad_norm": 7269.51171875,
"learning_rate": 2.52e-05,
"loss": 0.0732,
"step": 3000
},
{
"epoch": 0.48,
"eval_loss": 0.08551913499832153,
"eval_runtime": 116.545,
"eval_samples_per_second": 17.161,
"eval_steps_per_second": 2.145,
"step": 3000
},
{
"epoch": 0.488,
"grad_norm": 7060.21826171875,
"learning_rate": 2.5120000000000003e-05,
"loss": 0.0684,
"step": 3050
},
{
"epoch": 0.496,
"grad_norm": 7841.55322265625,
"learning_rate": 2.504e-05,
"loss": 0.0653,
"step": 3100
},
{
"epoch": 0.504,
"grad_norm": 5290.3271484375,
"learning_rate": 2.4959999999999998e-05,
"loss": 0.0668,
"step": 3150
},
{
"epoch": 0.512,
"grad_norm": 6200.4853515625,
"learning_rate": 2.4880000000000002e-05,
"loss": 0.0665,
"step": 3200
},
{
"epoch": 0.52,
"grad_norm": 6859.83544921875,
"learning_rate": 2.48e-05,
"loss": 0.0678,
"step": 3250
},
{
"epoch": 0.528,
"grad_norm": 7718.70068359375,
"learning_rate": 2.472e-05,
"loss": 0.0679,
"step": 3300
},
{
"epoch": 0.536,
"grad_norm": 10752.4873046875,
"learning_rate": 2.464e-05,
"loss": 0.062,
"step": 3350
},
{
"epoch": 0.544,
"grad_norm": 6991.5087890625,
"learning_rate": 2.456e-05,
"loss": 0.0659,
"step": 3400
},
{
"epoch": 0.552,
"grad_norm": 6204.99658203125,
"learning_rate": 2.448e-05,
"loss": 0.0636,
"step": 3450
},
{
"epoch": 0.56,
"grad_norm": 13521.5908203125,
"learning_rate": 2.44e-05,
"loss": 0.0671,
"step": 3500
},
{
"epoch": 0.56,
"eval_loss": 0.08540560305118561,
"eval_runtime": 116.9131,
"eval_samples_per_second": 17.107,
"eval_steps_per_second": 2.138,
"step": 3500
},
{
"epoch": 0.568,
"grad_norm": 6408.47265625,
"learning_rate": 2.432e-05,
"loss": 0.0652,
"step": 3550
},
{
"epoch": 0.576,
"grad_norm": 5537.69287109375,
"learning_rate": 2.4240000000000002e-05,
"loss": 0.0633,
"step": 3600
},
{
"epoch": 0.584,
"grad_norm": 7664.20703125,
"learning_rate": 2.4160000000000002e-05,
"loss": 0.0652,
"step": 3650
},
{
"epoch": 0.592,
"grad_norm": 5726.9697265625,
"learning_rate": 2.408e-05,
"loss": 0.0667,
"step": 3700
},
{
"epoch": 0.6,
"grad_norm": 6898.275390625,
"learning_rate": 2.4e-05,
"loss": 0.0675,
"step": 3750
},
{
"epoch": 0.608,
"grad_norm": 9309.822265625,
"learning_rate": 2.392e-05,
"loss": 0.0668,
"step": 3800
},
{
"epoch": 0.616,
"grad_norm": 8566.080078125,
"learning_rate": 2.384e-05,
"loss": 0.064,
"step": 3850
},
{
"epoch": 0.624,
"grad_norm": 5729.54833984375,
"learning_rate": 2.3760000000000003e-05,
"loss": 0.0635,
"step": 3900
},
{
"epoch": 0.632,
"grad_norm": 9562.8701171875,
"learning_rate": 2.368e-05,
"loss": 0.0643,
"step": 3950
},
{
"epoch": 0.64,
"grad_norm": 4704.76025390625,
"learning_rate": 2.3599999999999998e-05,
"loss": 0.0649,
"step": 4000
},
{
"epoch": 0.64,
"eval_loss": 0.08466340601444244,
"eval_runtime": 116.6411,
"eval_samples_per_second": 17.147,
"eval_steps_per_second": 2.143,
"step": 4000
},
{
"epoch": 0.648,
"grad_norm": 7243.01611328125,
"learning_rate": 2.3520000000000002e-05,
"loss": 0.0622,
"step": 4050
},
{
"epoch": 0.656,
"grad_norm": 7986.32568359375,
"learning_rate": 2.344e-05,
"loss": 0.0678,
"step": 4100
},
{
"epoch": 0.664,
"grad_norm": 9114.8974609375,
"learning_rate": 2.336e-05,
"loss": 0.0671,
"step": 4150
},
{
"epoch": 0.672,
"grad_norm": 8830.62109375,
"learning_rate": 2.328e-05,
"loss": 0.0679,
"step": 4200
},
{
"epoch": 0.68,
"grad_norm": 9311.2412109375,
"learning_rate": 2.32e-05,
"loss": 0.063,
"step": 4250
},
{
"epoch": 0.688,
"grad_norm": 31307.103515625,
"learning_rate": 2.3120000000000002e-05,
"loss": 0.0649,
"step": 4300
},
{
"epoch": 0.696,
"grad_norm": 9040.0126953125,
"learning_rate": 2.304e-05,
"loss": 0.0633,
"step": 4350
},
{
"epoch": 0.704,
"grad_norm": 7183.91650390625,
"learning_rate": 2.296e-05,
"loss": 0.0582,
"step": 4400
},
{
"epoch": 0.712,
"grad_norm": 6460.2998046875,
"learning_rate": 2.288e-05,
"loss": 0.0672,
"step": 4450
},
{
"epoch": 0.72,
"grad_norm": 6104.8671875,
"learning_rate": 2.2800000000000002e-05,
"loss": 0.0597,
"step": 4500
},
{
"epoch": 0.72,
"eval_loss": 0.0842796117067337,
"eval_runtime": 116.9361,
"eval_samples_per_second": 17.103,
"eval_steps_per_second": 2.138,
"step": 4500
},
{
"epoch": 0.728,
"grad_norm": 7553.5556640625,
"learning_rate": 2.272e-05,
"loss": 0.063,
"step": 4550
},
{
"epoch": 0.736,
"grad_norm": 7194.16162109375,
"learning_rate": 2.2640000000000003e-05,
"loss": 0.0597,
"step": 4600
},
{
"epoch": 0.744,
"grad_norm": 7578.23583984375,
"learning_rate": 2.256e-05,
"loss": 0.0627,
"step": 4650
},
{
"epoch": 0.752,
"grad_norm": 7874.51904296875,
"learning_rate": 2.2479999999999998e-05,
"loss": 0.0628,
"step": 4700
},
{
"epoch": 0.76,
"grad_norm": 6014.06640625,
"learning_rate": 2.2400000000000002e-05,
"loss": 0.0651,
"step": 4750
},
{
"epoch": 0.768,
"grad_norm": 7170.10400390625,
"learning_rate": 2.232e-05,
"loss": 0.0656,
"step": 4800
},
{
"epoch": 0.776,
"grad_norm": 7596.84326171875,
"learning_rate": 2.224e-05,
"loss": 0.0598,
"step": 4850
},
{
"epoch": 0.784,
"grad_norm": 7802.14990234375,
"learning_rate": 2.216e-05,
"loss": 0.0605,
"step": 4900
},
{
"epoch": 0.792,
"grad_norm": 5468.1845703125,
"learning_rate": 2.208e-05,
"loss": 0.0594,
"step": 4950
},
{
"epoch": 0.8,
"grad_norm": 5185.58642578125,
"learning_rate": 2.2e-05,
"loss": 0.0586,
"step": 5000
},
{
"epoch": 0.8,
"eval_loss": 0.08396206796169281,
"eval_runtime": 116.8224,
"eval_samples_per_second": 17.12,
"eval_steps_per_second": 2.14,
"step": 5000
},
{
"epoch": 0.808,
"grad_norm": 6047.43359375,
"learning_rate": 2.192e-05,
"loss": 0.0673,
"step": 5050
},
{
"epoch": 0.816,
"grad_norm": 6286.21484375,
"learning_rate": 2.184e-05,
"loss": 0.0609,
"step": 5100
},
{
"epoch": 0.824,
"grad_norm": 6187.03369140625,
"learning_rate": 2.1760000000000002e-05,
"loss": 0.0628,
"step": 5150
},
{
"epoch": 0.832,
"grad_norm": 4476.73095703125,
"learning_rate": 2.1680000000000002e-05,
"loss": 0.0626,
"step": 5200
},
{
"epoch": 0.84,
"grad_norm": 6180.27490234375,
"learning_rate": 2.16e-05,
"loss": 0.061,
"step": 5250
},
{
"epoch": 0.848,
"grad_norm": 8477.626953125,
"learning_rate": 2.152e-05,
"loss": 0.0638,
"step": 5300
},
{
"epoch": 0.856,
"grad_norm": 11541.119140625,
"learning_rate": 2.144e-05,
"loss": 0.0602,
"step": 5350
},
{
"epoch": 0.864,
"grad_norm": 6183.49609375,
"learning_rate": 2.136e-05,
"loss": 0.0645,
"step": 5400
},
{
"epoch": 0.872,
"grad_norm": 7597.5810546875,
"learning_rate": 2.1280000000000003e-05,
"loss": 0.067,
"step": 5450
},
{
"epoch": 0.88,
"grad_norm": 8438.478515625,
"learning_rate": 2.12e-05,
"loss": 0.0628,
"step": 5500
},
{
"epoch": 0.88,
"eval_loss": 0.08360794186592102,
"eval_runtime": 116.6576,
"eval_samples_per_second": 17.144,
"eval_steps_per_second": 2.143,
"step": 5500
},
{
"epoch": 0.888,
"grad_norm": 8200.35546875,
"learning_rate": 2.1119999999999998e-05,
"loss": 0.0676,
"step": 5550
},
{
"epoch": 0.896,
"grad_norm": 8816.8076171875,
"learning_rate": 2.1040000000000002e-05,
"loss": 0.0626,
"step": 5600
},
{
"epoch": 0.904,
"grad_norm": 8886.630859375,
"learning_rate": 2.096e-05,
"loss": 0.0657,
"step": 5650
},
{
"epoch": 0.912,
"grad_norm": 8212.525390625,
"learning_rate": 2.088e-05,
"loss": 0.0619,
"step": 5700
},
{
"epoch": 0.92,
"grad_norm": 5723.00439453125,
"learning_rate": 2.08e-05,
"loss": 0.0623,
"step": 5750
},
{
"epoch": 0.928,
"grad_norm": 8616.3349609375,
"learning_rate": 2.072e-05,
"loss": 0.063,
"step": 5800
},
{
"epoch": 0.936,
"grad_norm": 7717.373046875,
"learning_rate": 2.064e-05,
"loss": 0.063,
"step": 5850
},
{
"epoch": 0.944,
"grad_norm": 6325.8193359375,
"learning_rate": 2.056e-05,
"loss": 0.0628,
"step": 5900
},
{
"epoch": 0.952,
"grad_norm": 6938.89111328125,
"learning_rate": 2.048e-05,
"loss": 0.0585,
"step": 5950
},
{
"epoch": 0.96,
"grad_norm": 8704.166015625,
"learning_rate": 2.04e-05,
"loss": 0.0634,
"step": 6000
},
{
"epoch": 0.96,
"eval_loss": 0.08321517705917358,
"eval_runtime": 116.6701,
"eval_samples_per_second": 17.142,
"eval_steps_per_second": 2.143,
"step": 6000
},
{
"epoch": 0.968,
"grad_norm": 5835.19189453125,
"learning_rate": 2.0320000000000002e-05,
"loss": 0.0643,
"step": 6050
},
{
"epoch": 0.976,
"grad_norm": 5896.76318359375,
"learning_rate": 2.024e-05,
"loss": 0.0625,
"step": 6100
},
{
"epoch": 0.984,
"grad_norm": 6958.45751953125,
"learning_rate": 2.016e-05,
"loss": 0.0657,
"step": 6150
},
{
"epoch": 0.992,
"grad_norm": 4680.04736328125,
"learning_rate": 2.008e-05,
"loss": 0.0632,
"step": 6200
},
{
"epoch": 1.0,
"grad_norm": 8230.8056640625,
"learning_rate": 1.9999999999999998e-05,
"loss": 0.0603,
"step": 6250
},
{
"epoch": 1.008,
"grad_norm": 5693.77001953125,
"learning_rate": 1.9920000000000002e-05,
"loss": 0.0574,
"step": 6300
},
{
"epoch": 1.016,
"grad_norm": 14030.3583984375,
"learning_rate": 1.984e-05,
"loss": 0.0563,
"step": 6350
},
{
"epoch": 1.024,
"grad_norm": 11693.09375,
"learning_rate": 1.976e-05,
"loss": 0.0558,
"step": 6400
},
{
"epoch": 1.032,
"grad_norm": 5772.1845703125,
"learning_rate": 1.968e-05,
"loss": 0.0544,
"step": 6450
},
{
"epoch": 1.04,
"grad_norm": 8641.919921875,
"learning_rate": 1.96e-05,
"loss": 0.0606,
"step": 6500
},
{
"epoch": 1.04,
"eval_loss": 0.08356834203004837,
"eval_runtime": 116.7914,
"eval_samples_per_second": 17.125,
"eval_steps_per_second": 2.141,
"step": 6500
},
{
"epoch": 1.048,
"grad_norm": 6437.4033203125,
"learning_rate": 1.952e-05,
"loss": 0.0567,
"step": 6550
},
{
"epoch": 1.056,
"grad_norm": 5099.38330078125,
"learning_rate": 1.944e-05,
"loss": 0.0553,
"step": 6600
},
{
"epoch": 1.064,
"grad_norm": 5254.07275390625,
"learning_rate": 1.936e-05,
"loss": 0.0564,
"step": 6650
},
{
"epoch": 1.072,
"grad_norm": 7453.3330078125,
"learning_rate": 1.9280000000000002e-05,
"loss": 0.0573,
"step": 6700
},
{
"epoch": 1.08,
"grad_norm": 3853.006103515625,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.0607,
"step": 6750
},
{
"epoch": 1.088,
"grad_norm": 8804.1083984375,
"learning_rate": 1.912e-05,
"loss": 0.0578,
"step": 6800
},
{
"epoch": 1.096,
"grad_norm": 5899.22021484375,
"learning_rate": 1.904e-05,
"loss": 0.0555,
"step": 6850
},
{
"epoch": 1.104,
"grad_norm": 8429.76171875,
"learning_rate": 1.896e-05,
"loss": 0.0539,
"step": 6900
},
{
"epoch": 1.112,
"grad_norm": 9160.4794921875,
"learning_rate": 1.888e-05,
"loss": 0.0572,
"step": 6950
},
{
"epoch": 1.12,
"grad_norm": 4707.27099609375,
"learning_rate": 1.8800000000000003e-05,
"loss": 0.0563,
"step": 7000
},
{
"epoch": 1.12,
"eval_loss": 0.08350159972906113,
"eval_runtime": 116.6938,
"eval_samples_per_second": 17.139,
"eval_steps_per_second": 2.142,
"step": 7000
},
{
"epoch": 1.1280000000000001,
"grad_norm": 5663.18603515625,
"learning_rate": 1.872e-05,
"loss": 0.0537,
"step": 7050
},
{
"epoch": 1.1360000000000001,
"grad_norm": 9569.765625,
"learning_rate": 1.8639999999999998e-05,
"loss": 0.0607,
"step": 7100
},
{
"epoch": 1.144,
"grad_norm": 7370.98046875,
"learning_rate": 1.8560000000000002e-05,
"loss": 0.0607,
"step": 7150
},
{
"epoch": 1.152,
"grad_norm": 5951.6533203125,
"learning_rate": 1.848e-05,
"loss": 0.0547,
"step": 7200
},
{
"epoch": 1.16,
"grad_norm": 8285.0830078125,
"learning_rate": 1.84e-05,
"loss": 0.0589,
"step": 7250
},
{
"epoch": 1.168,
"grad_norm": 7549.8271484375,
"learning_rate": 1.832e-05,
"loss": 0.0587,
"step": 7300
},
{
"epoch": 1.176,
"grad_norm": 7480.25927734375,
"learning_rate": 1.824e-05,
"loss": 0.058,
"step": 7350
},
{
"epoch": 1.184,
"grad_norm": 35994.15234375,
"learning_rate": 1.816e-05,
"loss": 0.0585,
"step": 7400
},
{
"epoch": 1.192,
"grad_norm": 7489.05859375,
"learning_rate": 1.808e-05,
"loss": 0.0616,
"step": 7450
},
{
"epoch": 1.2,
"grad_norm": 6134.80126953125,
"learning_rate": 1.8e-05,
"loss": 0.0572,
"step": 7500
},
{
"epoch": 1.2,
"eval_loss": 0.08285799622535706,
"eval_runtime": 116.9169,
"eval_samples_per_second": 17.106,
"eval_steps_per_second": 2.138,
"step": 7500
},
{
"epoch": 1.208,
"grad_norm": 4982.9521484375,
"learning_rate": 1.792e-05,
"loss": 0.0569,
"step": 7550
},
{
"epoch": 1.216,
"grad_norm": 5407.9384765625,
"learning_rate": 1.7840000000000002e-05,
"loss": 0.0579,
"step": 7600
},
{
"epoch": 1.224,
"grad_norm": 6399.041015625,
"learning_rate": 1.776e-05,
"loss": 0.0569,
"step": 7650
},
{
"epoch": 1.232,
"grad_norm": 6688.9658203125,
"learning_rate": 1.768e-05,
"loss": 0.0598,
"step": 7700
},
{
"epoch": 1.24,
"grad_norm": 10116.4990234375,
"learning_rate": 1.76e-05,
"loss": 0.0538,
"step": 7750
},
{
"epoch": 1.248,
"grad_norm": 7761.98876953125,
"learning_rate": 1.7519999999999998e-05,
"loss": 0.0549,
"step": 7800
},
{
"epoch": 1.256,
"grad_norm": 5940.802734375,
"learning_rate": 1.7440000000000002e-05,
"loss": 0.0537,
"step": 7850
},
{
"epoch": 1.264,
"grad_norm": 7946.06787109375,
"learning_rate": 1.736e-05,
"loss": 0.0548,
"step": 7900
},
{
"epoch": 1.272,
"grad_norm": 8282.916015625,
"learning_rate": 1.728e-05,
"loss": 0.0539,
"step": 7950
},
{
"epoch": 1.28,
"grad_norm": 6786.72509765625,
"learning_rate": 1.72e-05,
"loss": 0.0573,
"step": 8000
},
{
"epoch": 1.28,
"eval_loss": 0.08285758644342422,
"eval_runtime": 116.7577,
"eval_samples_per_second": 17.129,
"eval_steps_per_second": 2.141,
"step": 8000
},
{
"epoch": 1.288,
"grad_norm": 6129.27783203125,
"learning_rate": 1.712e-05,
"loss": 0.0578,
"step": 8050
},
{
"epoch": 1.296,
"grad_norm": 6502.31298828125,
"learning_rate": 1.704e-05,
"loss": 0.0513,
"step": 8100
},
{
"epoch": 1.304,
"grad_norm": 10347.439453125,
"learning_rate": 1.696e-05,
"loss": 0.0527,
"step": 8150
},
{
"epoch": 1.312,
"grad_norm": 7870.1796875,
"learning_rate": 1.688e-05,
"loss": 0.0565,
"step": 8200
},
{
"epoch": 1.32,
"grad_norm": 7197.3447265625,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.0538,
"step": 8250
},
{
"epoch": 1.328,
"grad_norm": 5525.79931640625,
"learning_rate": 1.672e-05,
"loss": 0.0579,
"step": 8300
},
{
"epoch": 1.336,
"grad_norm": 5812.7490234375,
"learning_rate": 1.664e-05,
"loss": 0.0543,
"step": 8350
},
{
"epoch": 1.3439999999999999,
"grad_norm": 5728.1904296875,
"learning_rate": 1.656e-05,
"loss": 0.0572,
"step": 8400
},
{
"epoch": 1.3519999999999999,
"grad_norm": 6965.53759765625,
"learning_rate": 1.648e-05,
"loss": 0.0535,
"step": 8450
},
{
"epoch": 1.3599999999999999,
"grad_norm": 6986.52783203125,
"learning_rate": 1.64e-05,
"loss": 0.0549,
"step": 8500
},
{
"epoch": 1.3599999999999999,
"eval_loss": 0.08279111981391907,
"eval_runtime": 116.6629,
"eval_samples_per_second": 17.143,
"eval_steps_per_second": 2.143,
"step": 8500
},
{
"epoch": 1.3679999999999999,
"grad_norm": 6076.61865234375,
"learning_rate": 1.6320000000000003e-05,
"loss": 0.0566,
"step": 8550
},
{
"epoch": 1.376,
"grad_norm": 6356.2578125,
"learning_rate": 1.624e-05,
"loss": 0.0527,
"step": 8600
},
{
"epoch": 1.384,
"grad_norm": 8593.4482421875,
"learning_rate": 1.6159999999999998e-05,
"loss": 0.0611,
"step": 8650
},
{
"epoch": 1.392,
"grad_norm": 6525.712890625,
"learning_rate": 1.6080000000000002e-05,
"loss": 0.0508,
"step": 8700
},
{
"epoch": 1.4,
"grad_norm": 6376.82177734375,
"learning_rate": 1.6e-05,
"loss": 0.0554,
"step": 8750
},
{
"epoch": 1.408,
"grad_norm": 7890.4990234375,
"learning_rate": 1.592e-05,
"loss": 0.0546,
"step": 8800
},
{
"epoch": 1.416,
"grad_norm": 5426.74267578125,
"learning_rate": 1.584e-05,
"loss": 0.0558,
"step": 8850
},
{
"epoch": 1.424,
"grad_norm": 8708.7294921875,
"learning_rate": 1.576e-05,
"loss": 0.0597,
"step": 8900
},
{
"epoch": 1.432,
"grad_norm": 7744.2490234375,
"learning_rate": 1.568e-05,
"loss": 0.0553,
"step": 8950
},
{
"epoch": 1.44,
"grad_norm": 4320.080078125,
"learning_rate": 1.56e-05,
"loss": 0.0602,
"step": 9000
},
{
"epoch": 1.44,
"eval_loss": 0.08268450945615768,
"eval_runtime": 116.8196,
"eval_samples_per_second": 17.12,
"eval_steps_per_second": 2.14,
"step": 9000
},
{
"epoch": 1.448,
"grad_norm": 5681.900390625,
"learning_rate": 1.552e-05,
"loss": 0.0549,
"step": 9050
},
{
"epoch": 1.456,
"grad_norm": 5498.50048828125,
"learning_rate": 1.544e-05,
"loss": 0.0551,
"step": 9100
},
{
"epoch": 1.464,
"grad_norm": 7044.8017578125,
"learning_rate": 1.5360000000000002e-05,
"loss": 0.0557,
"step": 9150
},
{
"epoch": 1.472,
"grad_norm": 8311.8076171875,
"learning_rate": 1.528e-05,
"loss": 0.0559,
"step": 9200
},
{
"epoch": 1.48,
"grad_norm": 10259.4189453125,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.057,
"step": 9250
},
{
"epoch": 1.488,
"grad_norm": 7944.630859375,
"learning_rate": 1.5120000000000001e-05,
"loss": 0.0541,
"step": 9300
},
{
"epoch": 1.496,
"grad_norm": 9513.1875,
"learning_rate": 1.504e-05,
"loss": 0.0558,
"step": 9350
},
{
"epoch": 1.504,
"grad_norm": 6013.54296875,
"learning_rate": 1.4959999999999999e-05,
"loss": 0.0532,
"step": 9400
},
{
"epoch": 1.512,
"grad_norm": 7162.22314453125,
"learning_rate": 1.488e-05,
"loss": 0.0553,
"step": 9450
},
{
"epoch": 1.52,
"grad_norm": 6351.9833984375,
"learning_rate": 1.48e-05,
"loss": 0.0548,
"step": 9500
},
{
"epoch": 1.52,
"eval_loss": 0.08252418041229248,
"eval_runtime": 116.7082,
"eval_samples_per_second": 17.137,
"eval_steps_per_second": 2.142,
"step": 9500
},
{
"epoch": 1.528,
"grad_norm": 6762.00244140625,
"learning_rate": 1.472e-05,
"loss": 0.0529,
"step": 9550
},
{
"epoch": 1.536,
"grad_norm": 7704.66748046875,
"learning_rate": 1.464e-05,
"loss": 0.0576,
"step": 9600
},
{
"epoch": 1.544,
"grad_norm": 5400.18798828125,
"learning_rate": 1.4560000000000001e-05,
"loss": 0.0556,
"step": 9650
},
{
"epoch": 1.552,
"grad_norm": 6167.47216796875,
"learning_rate": 1.448e-05,
"loss": 0.0547,
"step": 9700
},
{
"epoch": 1.56,
"grad_norm": 5073.39892578125,
"learning_rate": 1.44e-05,
"loss": 0.0544,
"step": 9750
},
{
"epoch": 1.568,
"grad_norm": 6849.08447265625,
"learning_rate": 1.432e-05,
"loss": 0.0571,
"step": 9800
},
{
"epoch": 1.576,
"grad_norm": 6866.765625,
"learning_rate": 1.4240000000000001e-05,
"loss": 0.0518,
"step": 9850
},
{
"epoch": 1.584,
"grad_norm": 8185.33740234375,
"learning_rate": 1.416e-05,
"loss": 0.0605,
"step": 9900
},
{
"epoch": 1.592,
"grad_norm": 7759.45361328125,
"learning_rate": 1.408e-05,
"loss": 0.0581,
"step": 9950
},
{
"epoch": 1.6,
"grad_norm": 5736.8740234375,
"learning_rate": 1.4e-05,
"loss": 0.0582,
"step": 10000
},
{
"epoch": 1.6,
"eval_loss": 0.08249519765377045,
"eval_runtime": 116.9496,
"eval_samples_per_second": 17.101,
"eval_steps_per_second": 2.138,
"step": 10000
},
{
"epoch": 1.608,
"grad_norm": 5240.361328125,
"learning_rate": 1.392e-05,
"loss": 0.0546,
"step": 10050
},
{
"epoch": 1.616,
"grad_norm": 7000.00927734375,
"learning_rate": 1.384e-05,
"loss": 0.0535,
"step": 10100
},
{
"epoch": 1.624,
"grad_norm": 8141.75048828125,
"learning_rate": 1.376e-05,
"loss": 0.0555,
"step": 10150
},
{
"epoch": 1.6320000000000001,
"grad_norm": 6566.3662109375,
"learning_rate": 1.3680000000000001e-05,
"loss": 0.0518,
"step": 10200
},
{
"epoch": 1.6400000000000001,
"grad_norm": 7028.8935546875,
"learning_rate": 1.36e-05,
"loss": 0.0572,
"step": 10250
},
{
"epoch": 1.6480000000000001,
"grad_norm": 13007.5703125,
"learning_rate": 1.352e-05,
"loss": 0.0567,
"step": 10300
},
{
"epoch": 1.6560000000000001,
"grad_norm": 6286.06640625,
"learning_rate": 1.344e-05,
"loss": 0.0529,
"step": 10350
},
{
"epoch": 1.6640000000000001,
"grad_norm": 6360.68408203125,
"learning_rate": 1.336e-05,
"loss": 0.054,
"step": 10400
},
{
"epoch": 1.6720000000000002,
"grad_norm": 8098.84228515625,
"learning_rate": 1.328e-05,
"loss": 0.0592,
"step": 10450
},
{
"epoch": 1.6800000000000002,
"grad_norm": 6886.65283203125,
"learning_rate": 1.32e-05,
"loss": 0.0524,
"step": 10500
},
{
"epoch": 1.6800000000000002,
"eval_loss": 0.08225961029529572,
"eval_runtime": 116.8647,
"eval_samples_per_second": 17.114,
"eval_steps_per_second": 2.139,
"step": 10500
},
{
"epoch": 1.688,
"grad_norm": 5443.7119140625,
"learning_rate": 1.3120000000000001e-05,
"loss": 0.0554,
"step": 10550
},
{
"epoch": 1.696,
"grad_norm": 6497.8408203125,
"learning_rate": 1.304e-05,
"loss": 0.057,
"step": 10600
},
{
"epoch": 1.704,
"grad_norm": 5618.49853515625,
"learning_rate": 1.296e-05,
"loss": 0.0498,
"step": 10650
},
{
"epoch": 1.712,
"grad_norm": 7447.96728515625,
"learning_rate": 1.288e-05,
"loss": 0.0568,
"step": 10700
},
{
"epoch": 1.72,
"grad_norm": 8283.306640625,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.0566,
"step": 10750
},
{
"epoch": 1.728,
"grad_norm": 7497.0419921875,
"learning_rate": 1.272e-05,
"loss": 0.0502,
"step": 10800
},
{
"epoch": 1.736,
"grad_norm": 8445.2421875,
"learning_rate": 1.2640000000000001e-05,
"loss": 0.0562,
"step": 10850
},
{
"epoch": 1.744,
"grad_norm": 15980.0498046875,
"learning_rate": 1.2560000000000002e-05,
"loss": 0.0588,
"step": 10900
},
{
"epoch": 1.752,
"grad_norm": 5444.55615234375,
"learning_rate": 1.2479999999999999e-05,
"loss": 0.0564,
"step": 10950
},
{
"epoch": 1.76,
"grad_norm": 7009.3037109375,
"learning_rate": 1.24e-05,
"loss": 0.0549,
"step": 11000
},
{
"epoch": 1.76,
"eval_loss": 0.08220627158880234,
"eval_runtime": 116.957,
"eval_samples_per_second": 17.1,
"eval_steps_per_second": 2.138,
"step": 11000
},
{
"epoch": 1.768,
"grad_norm": 5123.0029296875,
"learning_rate": 1.232e-05,
"loss": 0.0562,
"step": 11050
},
{
"epoch": 1.776,
"grad_norm": 7975.41064453125,
"learning_rate": 1.224e-05,
"loss": 0.0515,
"step": 11100
},
{
"epoch": 1.784,
"grad_norm": 5846.47705078125,
"learning_rate": 1.216e-05,
"loss": 0.054,
"step": 11150
},
{
"epoch": 1.792,
"grad_norm": 7158.12109375,
"learning_rate": 1.2080000000000001e-05,
"loss": 0.0577,
"step": 11200
},
{
"epoch": 1.8,
"grad_norm": 5405.5224609375,
"learning_rate": 1.2e-05,
"loss": 0.0538,
"step": 11250
},
{
"epoch": 1.808,
"grad_norm": 7155.9677734375,
"learning_rate": 1.192e-05,
"loss": 0.0539,
"step": 11300
},
{
"epoch": 1.8159999999999998,
"grad_norm": 6886.369140625,
"learning_rate": 1.184e-05,
"loss": 0.0565,
"step": 11350
},
{
"epoch": 1.8239999999999998,
"grad_norm": 7139.15283203125,
"learning_rate": 1.1760000000000001e-05,
"loss": 0.0539,
"step": 11400
},
{
"epoch": 1.8319999999999999,
"grad_norm": 5965.82666015625,
"learning_rate": 1.168e-05,
"loss": 0.0587,
"step": 11450
},
{
"epoch": 1.8399999999999999,
"grad_norm": 6557.6708984375,
"learning_rate": 1.16e-05,
"loss": 0.0552,
"step": 11500
},
{
"epoch": 1.8399999999999999,
"eval_loss": 0.08207839727401733,
"eval_runtime": 116.751,
"eval_samples_per_second": 17.13,
"eval_steps_per_second": 2.141,
"step": 11500
},
{
"epoch": 1.8479999999999999,
"grad_norm": 5619.83984375,
"learning_rate": 1.152e-05,
"loss": 0.0563,
"step": 11550
},
{
"epoch": 1.8559999999999999,
"grad_norm": 92426.8046875,
"learning_rate": 1.144e-05,
"loss": 0.0588,
"step": 11600
},
{
"epoch": 1.8639999999999999,
"grad_norm": 7583.005859375,
"learning_rate": 1.136e-05,
"loss": 0.0559,
"step": 11650
},
{
"epoch": 1.8719999999999999,
"grad_norm": 6395.92578125,
"learning_rate": 1.128e-05,
"loss": 0.0552,
"step": 11700
},
{
"epoch": 1.88,
"grad_norm": 9939.912109375,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.0523,
"step": 11750
},
{
"epoch": 1.888,
"grad_norm": 5679.93212890625,
"learning_rate": 1.112e-05,
"loss": 0.0585,
"step": 11800
},
{
"epoch": 1.896,
"grad_norm": 6536.05419921875,
"learning_rate": 1.104e-05,
"loss": 0.0533,
"step": 11850
},
{
"epoch": 1.904,
"grad_norm": 7333.63330078125,
"learning_rate": 1.096e-05,
"loss": 0.0566,
"step": 11900
},
{
"epoch": 1.912,
"grad_norm": 7345.85009765625,
"learning_rate": 1.0880000000000001e-05,
"loss": 0.0555,
"step": 11950
},
{
"epoch": 1.92,
"grad_norm": 21337.044921875,
"learning_rate": 1.08e-05,
"loss": 0.0576,
"step": 12000
},
{
"epoch": 1.92,
"eval_loss": 0.08194975554943085,
"eval_runtime": 116.8029,
"eval_samples_per_second": 17.123,
"eval_steps_per_second": 2.14,
"step": 12000
},
{
"epoch": 1.928,
"grad_norm": 6469.14306640625,
"learning_rate": 1.072e-05,
"loss": 0.0584,
"step": 12050
},
{
"epoch": 1.936,
"grad_norm": 7579.2998046875,
"learning_rate": 1.0640000000000001e-05,
"loss": 0.0573,
"step": 12100
},
{
"epoch": 1.944,
"grad_norm": 8114.94921875,
"learning_rate": 1.0559999999999999e-05,
"loss": 0.0523,
"step": 12150
},
{
"epoch": 1.952,
"grad_norm": 7263.44384765625,
"learning_rate": 1.048e-05,
"loss": 0.0517,
"step": 12200
},
{
"epoch": 1.96,
"grad_norm": 8325.9580078125,
"learning_rate": 1.04e-05,
"loss": 0.0524,
"step": 12250
},
{
"epoch": 1.968,
"grad_norm": 6577.01318359375,
"learning_rate": 1.032e-05,
"loss": 0.0533,
"step": 12300
},
{
"epoch": 1.976,
"grad_norm": 6278.1826171875,
"learning_rate": 1.024e-05,
"loss": 0.0532,
"step": 12350
},
{
"epoch": 1.984,
"grad_norm": 7769.2333984375,
"learning_rate": 1.0160000000000001e-05,
"loss": 0.0532,
"step": 12400
},
{
"epoch": 1.992,
"grad_norm": 10089.91796875,
"learning_rate": 1.008e-05,
"loss": 0.0539,
"step": 12450
},
{
"epoch": 2.0,
"grad_norm": 9177.8115234375,
"learning_rate": 9.999999999999999e-06,
"loss": 0.0588,
"step": 12500
},
{
"epoch": 2.0,
"eval_loss": 0.08158940076828003,
"eval_runtime": 116.7903,
"eval_samples_per_second": 17.125,
"eval_steps_per_second": 2.141,
"step": 12500
},
{
"epoch": 2.008,
"grad_norm": 6336.53076171875,
"learning_rate": 9.92e-06,
"loss": 0.0466,
"step": 12550
},
{
"epoch": 2.016,
"grad_norm": 4880.88330078125,
"learning_rate": 9.84e-06,
"loss": 0.0531,
"step": 12600
},
{
"epoch": 2.024,
"grad_norm": 6478.1640625,
"learning_rate": 9.76e-06,
"loss": 0.0516,
"step": 12650
},
{
"epoch": 2.032,
"grad_norm": 6105.318359375,
"learning_rate": 9.68e-06,
"loss": 0.0492,
"step": 12700
},
{
"epoch": 2.04,
"grad_norm": 6270.1318359375,
"learning_rate": 9.600000000000001e-06,
"loss": 0.0511,
"step": 12750
},
{
"epoch": 2.048,
"grad_norm": 5914.5458984375,
"learning_rate": 9.52e-06,
"loss": 0.0522,
"step": 12800
},
{
"epoch": 2.056,
"grad_norm": 6194.03076171875,
"learning_rate": 9.44e-06,
"loss": 0.0535,
"step": 12850
},
{
"epoch": 2.064,
"grad_norm": 7986.248046875,
"learning_rate": 9.36e-06,
"loss": 0.0529,
"step": 12900
},
{
"epoch": 2.072,
"grad_norm": 10384.2099609375,
"learning_rate": 9.280000000000001e-06,
"loss": 0.0471,
"step": 12950
},
{
"epoch": 2.08,
"grad_norm": 8849.5703125,
"learning_rate": 9.2e-06,
"loss": 0.0502,
"step": 13000
},
{
"epoch": 2.08,
"eval_loss": 0.08202869445085526,
"eval_runtime": 117.0019,
"eval_samples_per_second": 17.094,
"eval_steps_per_second": 2.137,
"step": 13000
},
{
"epoch": 2.088,
"grad_norm": 7875.97900390625,
"learning_rate": 9.12e-06,
"loss": 0.049,
"step": 13050
},
{
"epoch": 2.096,
"grad_norm": 6825.78076171875,
"learning_rate": 9.04e-06,
"loss": 0.0465,
"step": 13100
},
{
"epoch": 2.104,
"grad_norm": 5515.30322265625,
"learning_rate": 8.96e-06,
"loss": 0.0535,
"step": 13150
},
{
"epoch": 2.112,
"grad_norm": 8940.48828125,
"learning_rate": 8.88e-06,
"loss": 0.0564,
"step": 13200
},
{
"epoch": 2.12,
"grad_norm": 5110.7119140625,
"learning_rate": 8.8e-06,
"loss": 0.0509,
"step": 13250
},
{
"epoch": 2.128,
"grad_norm": 8984.7353515625,
"learning_rate": 8.720000000000001e-06,
"loss": 0.0479,
"step": 13300
},
{
"epoch": 2.136,
"grad_norm": 8438.55078125,
"learning_rate": 8.64e-06,
"loss": 0.0502,
"step": 13350
},
{
"epoch": 2.144,
"grad_norm": 5724.0849609375,
"learning_rate": 8.56e-06,
"loss": 0.0501,
"step": 13400
},
{
"epoch": 2.152,
"grad_norm": 7649.28955078125,
"learning_rate": 8.48e-06,
"loss": 0.0569,
"step": 13450
},
{
"epoch": 2.16,
"grad_norm": 8429.0166015625,
"learning_rate": 8.400000000000001e-06,
"loss": 0.053,
"step": 13500
},
{
"epoch": 2.16,
"eval_loss": 0.08213882148265839,
"eval_runtime": 116.6956,
"eval_samples_per_second": 17.139,
"eval_steps_per_second": 2.142,
"step": 13500
},
{
"epoch": 2.168,
"grad_norm": 4402.388671875,
"learning_rate": 8.32e-06,
"loss": 0.0499,
"step": 13550
},
{
"epoch": 2.176,
"grad_norm": 9858.970703125,
"learning_rate": 8.24e-06,
"loss": 0.0506,
"step": 13600
},
{
"epoch": 2.184,
"grad_norm": 6748.5732421875,
"learning_rate": 8.160000000000001e-06,
"loss": 0.05,
"step": 13650
},
{
"epoch": 2.192,
"grad_norm": 7720.3994140625,
"learning_rate": 8.079999999999999e-06,
"loss": 0.0504,
"step": 13700
},
{
"epoch": 2.2,
"grad_norm": 5066.37060546875,
"learning_rate": 8e-06,
"loss": 0.0533,
"step": 13750
},
{
"epoch": 2.208,
"grad_norm": 7975.1376953125,
"learning_rate": 7.92e-06,
"loss": 0.0482,
"step": 13800
},
{
"epoch": 2.216,
"grad_norm": 6690.85302734375,
"learning_rate": 7.84e-06,
"loss": 0.0518,
"step": 13850
},
{
"epoch": 2.224,
"grad_norm": 8501.337890625,
"learning_rate": 7.76e-06,
"loss": 0.0534,
"step": 13900
},
{
"epoch": 2.232,
"grad_norm": 15215.427734375,
"learning_rate": 7.680000000000001e-06,
"loss": 0.0488,
"step": 13950
},
{
"epoch": 2.24,
"grad_norm": 6265.7568359375,
"learning_rate": 7.600000000000001e-06,
"loss": 0.0468,
"step": 14000
},
{
"epoch": 2.24,
"eval_loss": 0.08207998424768448,
"eval_runtime": 116.7104,
"eval_samples_per_second": 17.136,
"eval_steps_per_second": 2.142,
"step": 14000
},
{
"epoch": 2.248,
"grad_norm": 5661.556640625,
"learning_rate": 7.52e-06,
"loss": 0.0516,
"step": 14050
},
{
"epoch": 2.2560000000000002,
"grad_norm": 6117.46728515625,
"learning_rate": 7.44e-06,
"loss": 0.0535,
"step": 14100
},
{
"epoch": 2.2640000000000002,
"grad_norm": 5083.50634765625,
"learning_rate": 7.36e-06,
"loss": 0.0514,
"step": 14150
},
{
"epoch": 2.2720000000000002,
"grad_norm": 6597.24365234375,
"learning_rate": 7.280000000000001e-06,
"loss": 0.0566,
"step": 14200
},
{
"epoch": 2.2800000000000002,
"grad_norm": 7306.90185546875,
"learning_rate": 7.2e-06,
"loss": 0.0523,
"step": 14250
},
{
"epoch": 2.288,
"grad_norm": 6694.41552734375,
"learning_rate": 7.1200000000000004e-06,
"loss": 0.0475,
"step": 14300
},
{
"epoch": 2.296,
"grad_norm": 3753.303466796875,
"learning_rate": 7.04e-06,
"loss": 0.0501,
"step": 14350
},
{
"epoch": 2.304,
"grad_norm": 5714.30078125,
"learning_rate": 6.96e-06,
"loss": 0.0485,
"step": 14400
},
{
"epoch": 2.312,
"grad_norm": 7579.119140625,
"learning_rate": 6.88e-06,
"loss": 0.0504,
"step": 14450
},
{
"epoch": 2.32,
"grad_norm": 6103.64599609375,
"learning_rate": 6.8e-06,
"loss": 0.0531,
"step": 14500
},
{
"epoch": 2.32,
"eval_loss": 0.08199251443147659,
"eval_runtime": 116.661,
"eval_samples_per_second": 17.144,
"eval_steps_per_second": 2.143,
"step": 14500
},
{
"epoch": 2.328,
"grad_norm": 7419.63623046875,
"learning_rate": 6.72e-06,
"loss": 0.0527,
"step": 14550
},
{
"epoch": 2.336,
"grad_norm": 6152.6513671875,
"learning_rate": 6.64e-06,
"loss": 0.048,
"step": 14600
},
{
"epoch": 2.344,
"grad_norm": 6703.68994140625,
"learning_rate": 6.560000000000001e-06,
"loss": 0.0537,
"step": 14650
},
{
"epoch": 2.352,
"grad_norm": 8612.31640625,
"learning_rate": 6.48e-06,
"loss": 0.0512,
"step": 14700
},
{
"epoch": 2.36,
"grad_norm": 6183.3798828125,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.0499,
"step": 14750
},
{
"epoch": 2.368,
"grad_norm": 7795.396484375,
"learning_rate": 6.3200000000000005e-06,
"loss": 0.0525,
"step": 14800
},
{
"epoch": 2.376,
"grad_norm": 6911.2099609375,
"learning_rate": 6.2399999999999995e-06,
"loss": 0.0503,
"step": 14850
},
{
"epoch": 2.384,
"grad_norm": 9744.9267578125,
"learning_rate": 6.16e-06,
"loss": 0.0509,
"step": 14900
},
{
"epoch": 2.392,
"grad_norm": 4487.8115234375,
"learning_rate": 6.08e-06,
"loss": 0.0504,
"step": 14950
},
{
"epoch": 2.4,
"grad_norm": 6276.47607421875,
"learning_rate": 6e-06,
"loss": 0.0505,
"step": 15000
},
{
"epoch": 2.4,
"eval_loss": 0.08178989589214325,
"eval_runtime": 116.6529,
"eval_samples_per_second": 17.145,
"eval_steps_per_second": 2.143,
"step": 15000
},
{
"epoch": 2.408,
"grad_norm": 7706.4375,
"learning_rate": 5.92e-06,
"loss": 0.0513,
"step": 15050
},
{
"epoch": 2.416,
"grad_norm": 6188.396484375,
"learning_rate": 5.84e-06,
"loss": 0.0511,
"step": 15100
},
{
"epoch": 2.424,
"grad_norm": 6621.79345703125,
"learning_rate": 5.76e-06,
"loss": 0.0506,
"step": 15150
},
{
"epoch": 2.432,
"grad_norm": 5284.65185546875,
"learning_rate": 5.68e-06,
"loss": 0.0486,
"step": 15200
},
{
"epoch": 2.44,
"grad_norm": 6653.84716796875,
"learning_rate": 5.600000000000001e-06,
"loss": 0.053,
"step": 15250
},
{
"epoch": 2.448,
"grad_norm": 6338.93505859375,
"learning_rate": 5.52e-06,
"loss": 0.0517,
"step": 15300
},
{
"epoch": 2.456,
"grad_norm": 6020.87548828125,
"learning_rate": 5.4400000000000004e-06,
"loss": 0.0524,
"step": 15350
},
{
"epoch": 2.464,
"grad_norm": 7275.64697265625,
"learning_rate": 5.36e-06,
"loss": 0.0516,
"step": 15400
},
{
"epoch": 2.472,
"grad_norm": 5086.87744140625,
"learning_rate": 5.279999999999999e-06,
"loss": 0.0514,
"step": 15450
},
{
"epoch": 2.48,
"grad_norm": 4989.05078125,
"learning_rate": 5.2e-06,
"loss": 0.0526,
"step": 15500
},
{
"epoch": 2.48,
"eval_loss": 0.08169461041688919,
"eval_runtime": 116.7302,
"eval_samples_per_second": 17.134,
"eval_steps_per_second": 2.142,
"step": 15500
},
{
"epoch": 2.488,
"grad_norm": 6472.25537109375,
"learning_rate": 5.12e-06,
"loss": 0.0496,
"step": 15550
},
{
"epoch": 2.496,
"grad_norm": 6369.4833984375,
"learning_rate": 5.04e-06,
"loss": 0.0518,
"step": 15600
},
{
"epoch": 2.504,
"grad_norm": 8784.1083984375,
"learning_rate": 4.96e-06,
"loss": 0.0547,
"step": 15650
},
{
"epoch": 2.512,
"grad_norm": 8509.6650390625,
"learning_rate": 4.88e-06,
"loss": 0.0555,
"step": 15700
},
{
"epoch": 2.52,
"grad_norm": 7856.84716796875,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0513,
"step": 15750
},
{
"epoch": 2.528,
"grad_norm": 6816.51123046875,
"learning_rate": 4.72e-06,
"loss": 0.0493,
"step": 15800
},
{
"epoch": 2.536,
"grad_norm": 6773.06884765625,
"learning_rate": 4.6400000000000005e-06,
"loss": 0.05,
"step": 15850
},
{
"epoch": 2.544,
"grad_norm": 9726.3818359375,
"learning_rate": 4.56e-06,
"loss": 0.0518,
"step": 15900
},
{
"epoch": 2.552,
"grad_norm": 8707.6591796875,
"learning_rate": 4.48e-06,
"loss": 0.0499,
"step": 15950
},
{
"epoch": 2.56,
"grad_norm": 4772.958984375,
"learning_rate": 4.4e-06,
"loss": 0.0522,
"step": 16000
},
{
"epoch": 2.56,
"eval_loss": 0.08175843954086304,
"eval_runtime": 116.8011,
"eval_samples_per_second": 17.123,
"eval_steps_per_second": 2.14,
"step": 16000
}
],
"logging_steps": 50,
"max_steps": 18750,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.794660999168e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
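
The JSON above is a standard Hugging Face Trainer state file. Below is a minimal sketch (not part of the checkpoint itself) of how one might load it and summarize the evaluation history; the file path is an assumption for illustration.

    import json

    # Assumed location of the file shown above; adjust to the actual checkpoint directory.
    with open("fine-tuned/checkpoint-16000/trainer_state.json") as f:
        state = json.load(f)

    # Entries containing "eval_loss" are evaluation records; the rest are training-loss logs.
    evals = [e for e in state["log_history"] if "eval_loss" in e]
    best = min(evals, key=lambda e: e["eval_loss"])

    print(f"evaluations logged: {len(evals)}")
    print(f"lowest eval_loss: {best['eval_loss']:.5f} at step {best['step']}")
    print(f"best_model_checkpoint: {state['best_model_checkpoint']}")

For this file, the lowest eval_loss (0.08159 at step 12500) matches best_metric and best_model_checkpoint, so the step-16000 checkpoint is not the best one recorded so far.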