gpt_train_12_128 / trainer_state.json
gokulsrinivasagan's picture
End of training
3c1649d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.025573961057832027,
"eval_steps": 1,
"global_step": 352,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.265329845975008e-05,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 1
},
{
"epoch": 7.265329845975008e-05,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 267.5371,
"eval_samples_per_second": 126.214,
"eval_steps_per_second": 2.631,
"step": 1
},
{
"epoch": 0.00014530659691950015,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 2
},
{
"epoch": 0.00014530659691950015,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 267.9301,
"eval_samples_per_second": 126.029,
"eval_steps_per_second": 2.628,
"step": 2
},
{
"epoch": 0.00021795989537925023,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 3
},
{
"epoch": 0.00021795989537925023,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 267.74,
"eval_samples_per_second": 126.119,
"eval_steps_per_second": 2.629,
"step": 3
},
{
"epoch": 0.0002906131938390003,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 4
},
{
"epoch": 0.0002906131938390003,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 267.3271,
"eval_samples_per_second": 126.313,
"eval_steps_per_second": 2.633,
"step": 4
},
{
"epoch": 0.0003632664922987504,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 5
},
{
"epoch": 0.0003632664922987504,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 267.207,
"eval_samples_per_second": 126.37,
"eval_steps_per_second": 2.635,
"step": 5
},
{
"epoch": 0.00043591979075850045,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 6
},
{
"epoch": 0.00043591979075850045,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 268.9045,
"eval_samples_per_second": 125.572,
"eval_steps_per_second": 2.618,
"step": 6
},
{
"epoch": 0.0005085730892182505,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 7
},
{
"epoch": 0.0005085730892182505,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 269.4441,
"eval_samples_per_second": 125.321,
"eval_steps_per_second": 2.613,
"step": 7
},
{
"epoch": 0.0005812263876780006,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 8
},
{
"epoch": 0.0005812263876780006,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.7225,
"eval_samples_per_second": 127.076,
"eval_steps_per_second": 2.649,
"step": 8
},
{
"epoch": 0.0006538796861377507,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 9
},
{
"epoch": 0.0006538796861377507,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.6793,
"eval_samples_per_second": 127.097,
"eval_steps_per_second": 2.65,
"step": 9
},
{
"epoch": 0.0007265329845975008,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 10
},
{
"epoch": 0.0007265329845975008,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 266.2481,
"eval_samples_per_second": 126.825,
"eval_steps_per_second": 2.644,
"step": 10
},
{
"epoch": 0.0007991862830572508,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 11
},
{
"epoch": 0.0007991862830572508,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.3262,
"eval_samples_per_second": 127.266,
"eval_steps_per_second": 2.653,
"step": 11
},
{
"epoch": 0.0008718395815170009,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 12
},
{
"epoch": 0.0008718395815170009,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.9213,
"eval_samples_per_second": 126.981,
"eval_steps_per_second": 2.647,
"step": 12
},
{
"epoch": 0.000944492879976751,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 13
},
{
"epoch": 0.000944492879976751,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 266.6935,
"eval_samples_per_second": 126.614,
"eval_steps_per_second": 2.64,
"step": 13
},
{
"epoch": 0.001017146178436501,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 14
},
{
"epoch": 0.001017146178436501,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.5702,
"eval_samples_per_second": 127.149,
"eval_steps_per_second": 2.651,
"step": 14
},
{
"epoch": 0.001089799476896251,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 15
},
{
"epoch": 0.001089799476896251,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 266.1801,
"eval_samples_per_second": 126.858,
"eval_steps_per_second": 2.645,
"step": 15
},
{
"epoch": 0.0011624527753560012,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8438,
"step": 16
},
{
"epoch": 0.0011624527753560012,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.2069,
"eval_samples_per_second": 127.323,
"eval_steps_per_second": 2.655,
"step": 16
},
{
"epoch": 0.0012351060738157512,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8359,
"step": 17
},
{
"epoch": 0.0012351060738157512,
"eval_accuracy": 0.010348185357762373,
"eval_loss": 10.84375,
"eval_runtime": 265.4707,
"eval_samples_per_second": 127.197,
"eval_steps_per_second": 2.652,
"step": 17
},
{
"epoch": 0.0013077593722755014,
"grad_norm": 2.8743269443511963,
"learning_rate": 9.999992734670155e-06,
"loss": 10.8438,
"step": 18
},
{
"epoch": 0.0013077593722755014,
"eval_accuracy": 0.011272349663430095,
"eval_loss": 10.828125,
"eval_runtime": 265.1122,
"eval_samples_per_second": 127.369,
"eval_steps_per_second": 2.655,
"step": 18
},
{
"epoch": 0.0013804126707352513,
"grad_norm": 2.8402953147888184,
"learning_rate": 9.999985469340309e-06,
"loss": 10.8203,
"step": 19
},
{
"epoch": 0.0013804126707352513,
"eval_accuracy": 0.011634905538764718,
"eval_loss": 10.8125,
"eval_runtime": 265.8546,
"eval_samples_per_second": 127.013,
"eval_steps_per_second": 2.648,
"step": 19
},
{
"epoch": 0.0014530659691950015,
"grad_norm": 2.8661510944366455,
"learning_rate": 9.999978204010463e-06,
"loss": 10.8203,
"step": 20
},
{
"epoch": 0.0014530659691950015,
"eval_accuracy": 0.01173857150727105,
"eval_loss": 10.8046875,
"eval_runtime": 265.3217,
"eval_samples_per_second": 127.268,
"eval_steps_per_second": 2.653,
"step": 20
},
{
"epoch": 0.0015257192676547515,
"grad_norm": 2.8541078567504883,
"learning_rate": 9.999970938680617e-06,
"loss": 10.8047,
"step": 21
},
{
"epoch": 0.0015257192676547515,
"eval_accuracy": 0.011759009422313067,
"eval_loss": 10.7890625,
"eval_runtime": 265.864,
"eval_samples_per_second": 127.009,
"eval_steps_per_second": 2.648,
"step": 21
},
{
"epoch": 0.0015983725661145017,
"grad_norm": 2.8900887966156006,
"learning_rate": 9.99996367335077e-06,
"loss": 10.7969,
"step": 22
},
{
"epoch": 0.0015983725661145017,
"eval_accuracy": 0.011769199430945915,
"eval_loss": 10.7734375,
"eval_runtime": 266.281,
"eval_samples_per_second": 126.81,
"eval_steps_per_second": 2.644,
"step": 22
},
{
"epoch": 0.0016710258645742516,
"grad_norm": 2.884963035583496,
"learning_rate": 9.999956408020926e-06,
"loss": 10.7812,
"step": 23
},
{
"epoch": 0.0016710258645742516,
"eval_accuracy": 0.01179959576351549,
"eval_loss": 10.765625,
"eval_runtime": 266.3138,
"eval_samples_per_second": 126.794,
"eval_steps_per_second": 2.643,
"step": 23
},
{
"epoch": 0.0017436791630340018,
"grad_norm": 2.8954319953918457,
"learning_rate": 9.999949142691078e-06,
"loss": 10.7656,
"step": 24
},
{
"epoch": 0.0017436791630340018,
"eval_accuracy": 0.011878857819301676,
"eval_loss": 10.75,
"eval_runtime": 265.8429,
"eval_samples_per_second": 127.019,
"eval_steps_per_second": 2.648,
"step": 24
},
{
"epoch": 0.0018163324614937518,
"grad_norm": 2.713453769683838,
"learning_rate": 9.999941877361234e-06,
"loss": 10.7578,
"step": 25
},
{
"epoch": 0.0018163324614937518,
"eval_accuracy": 0.012068501985647663,
"eval_loss": 10.734375,
"eval_runtime": 266.534,
"eval_samples_per_second": 126.689,
"eval_steps_per_second": 2.641,
"step": 25
},
{
"epoch": 0.001888985759953502,
"grad_norm": 2.663592576980591,
"learning_rate": 9.999934612031386e-06,
"loss": 10.75,
"step": 26
},
{
"epoch": 0.001888985759953502,
"eval_accuracy": 0.012414296454736778,
"eval_loss": 10.7265625,
"eval_runtime": 265.503,
"eval_samples_per_second": 127.181,
"eval_steps_per_second": 2.652,
"step": 26
},
{
"epoch": 0.001961639058413252,
"grad_norm": 2.4643020629882812,
"learning_rate": 9.999927346701542e-06,
"loss": 10.7344,
"step": 27
},
{
"epoch": 0.001961639058413252,
"eval_accuracy": 0.013084492164563661,
"eval_loss": 10.71875,
"eval_runtime": 264.7117,
"eval_samples_per_second": 127.561,
"eval_steps_per_second": 2.659,
"step": 27
},
{
"epoch": 0.002034292356873002,
"grad_norm": 2.2399826049804688,
"learning_rate": 9.999920081371694e-06,
"loss": 10.7266,
"step": 28
},
{
"epoch": 0.002034292356873002,
"eval_accuracy": 0.014443063485982847,
"eval_loss": 10.703125,
"eval_runtime": 264.8704,
"eval_samples_per_second": 127.485,
"eval_steps_per_second": 2.658,
"step": 28
},
{
"epoch": 0.0021069456553327523,
"grad_norm": 2.138185977935791,
"learning_rate": 9.99991281604185e-06,
"loss": 10.7109,
"step": 29
},
{
"epoch": 0.0021069456553327523,
"eval_accuracy": 0.016544752766507735,
"eval_loss": 10.6953125,
"eval_runtime": 264.0295,
"eval_samples_per_second": 127.891,
"eval_steps_per_second": 2.666,
"step": 29
},
{
"epoch": 0.002179598953792502,
"grad_norm": 1.8671512603759766,
"learning_rate": 9.999905550712004e-06,
"loss": 10.7031,
"step": 30
},
{
"epoch": 0.002179598953792502,
"eval_accuracy": 0.01964894204406536,
"eval_loss": 10.6875,
"eval_runtime": 264.1126,
"eval_samples_per_second": 127.851,
"eval_steps_per_second": 2.666,
"step": 30
},
{
"epoch": 0.0022522522522522522,
"grad_norm": 1.72816002368927,
"learning_rate": 9.999898285382156e-06,
"loss": 10.7031,
"step": 31
},
{
"epoch": 0.0022522522522522522,
"eval_accuracy": 0.023590159473924593,
"eval_loss": 10.6796875,
"eval_runtime": 265.2939,
"eval_samples_per_second": 127.281,
"eval_steps_per_second": 2.654,
"step": 31
},
{
"epoch": 0.0023249055507120024,
"grad_norm": 1.6541900634765625,
"learning_rate": 9.999891020052312e-06,
"loss": 10.6875,
"step": 32
},
{
"epoch": 0.0023249055507120024,
"eval_accuracy": 0.028234313806121365,
"eval_loss": 10.671875,
"eval_runtime": 264.9943,
"eval_samples_per_second": 127.425,
"eval_steps_per_second": 2.657,
"step": 32
},
{
"epoch": 0.0023975588491717526,
"grad_norm": 1.4378719329833984,
"learning_rate": 9.999883754722464e-06,
"loss": 10.6797,
"step": 33
},
{
"epoch": 0.0023975588491717526,
"eval_accuracy": 0.03299875076862917,
"eval_loss": 10.6640625,
"eval_runtime": 264.4617,
"eval_samples_per_second": 127.682,
"eval_steps_per_second": 2.662,
"step": 33
},
{
"epoch": 0.0024702121476315024,
"grad_norm": 1.3948858976364136,
"learning_rate": 9.99987648939262e-06,
"loss": 10.6719,
"step": 34
},
{
"epoch": 0.0024702121476315024,
"eval_accuracy": 0.03746090344095459,
"eval_loss": 10.6640625,
"eval_runtime": 263.8541,
"eval_samples_per_second": 127.976,
"eval_steps_per_second": 2.668,
"step": 34
},
{
"epoch": 0.0025428654460912525,
"grad_norm": 1.2194068431854248,
"learning_rate": 9.999869224062774e-06,
"loss": 10.6719,
"step": 35
},
{
"epoch": 0.0025428654460912525,
"eval_accuracy": 0.04094886812886922,
"eval_loss": 10.65625,
"eval_runtime": 263.4632,
"eval_samples_per_second": 128.166,
"eval_steps_per_second": 2.672,
"step": 35
},
{
"epoch": 0.0026155187445510027,
"grad_norm": 1.2569856643676758,
"learning_rate": 9.999861958732927e-06,
"loss": 10.6719,
"step": 36
},
{
"epoch": 0.0026155187445510027,
"eval_accuracy": 0.04373679080326246,
"eval_loss": 10.6484375,
"eval_runtime": 265.8278,
"eval_samples_per_second": 127.026,
"eval_steps_per_second": 2.648,
"step": 36
},
{
"epoch": 0.002688172043010753,
"grad_norm": 1.16013503074646,
"learning_rate": 9.999854693403081e-06,
"loss": 10.6484,
"step": 37
},
{
"epoch": 0.002688172043010753,
"eval_accuracy": 0.046063847178124624,
"eval_loss": 10.6484375,
"eval_runtime": 266.8235,
"eval_samples_per_second": 126.552,
"eval_steps_per_second": 2.638,
"step": 37
},
{
"epoch": 0.0027608253414705027,
"grad_norm": 1.1432477235794067,
"learning_rate": 9.999847428073235e-06,
"loss": 10.6562,
"step": 38
},
{
"epoch": 0.0027608253414705027,
"eval_accuracy": 0.04811105465112957,
"eval_loss": 10.640625,
"eval_runtime": 266.3014,
"eval_samples_per_second": 126.8,
"eval_steps_per_second": 2.644,
"step": 38
},
{
"epoch": 0.002833478639930253,
"grad_norm": 1.071315050125122,
"learning_rate": 9.99984016274339e-06,
"loss": 10.6484,
"step": 39
},
{
"epoch": 0.002833478639930253,
"eval_accuracy": 0.04980491199523524,
"eval_loss": 10.640625,
"eval_runtime": 266.126,
"eval_samples_per_second": 126.883,
"eval_steps_per_second": 2.645,
"step": 39
},
{
"epoch": 0.002906131938390003,
"grad_norm": 1.0130771398544312,
"learning_rate": 9.999832897413543e-06,
"loss": 10.6484,
"step": 40
},
{
"epoch": 0.002906131938390003,
"eval_accuracy": 0.05107903940988734,
"eval_loss": 10.6328125,
"eval_runtime": 265.8854,
"eval_samples_per_second": 126.998,
"eval_steps_per_second": 2.648,
"step": 40
},
{
"epoch": 0.0029787852368497528,
"grad_norm": 1.014347791671753,
"learning_rate": 9.999825632083697e-06,
"loss": 10.6406,
"step": 41
},
{
"epoch": 0.0029787852368497528,
"eval_accuracy": 0.052067614991714396,
"eval_loss": 10.6328125,
"eval_runtime": 266.8406,
"eval_samples_per_second": 126.544,
"eval_steps_per_second": 2.638,
"step": 41
},
{
"epoch": 0.003051438535309503,
"grad_norm": 1.0095568895339966,
"learning_rate": 9.999818366753851e-06,
"loss": 10.6406,
"step": 42
},
{
"epoch": 0.003051438535309503,
"eval_accuracy": 0.05287193090039351,
"eval_loss": 10.625,
"eval_runtime": 266.7685,
"eval_samples_per_second": 126.578,
"eval_steps_per_second": 2.639,
"step": 42
},
{
"epoch": 0.003124091833769253,
"grad_norm": 0.9412463307380676,
"learning_rate": 9.999811101424005e-06,
"loss": 10.6406,
"step": 43
},
{
"epoch": 0.003124091833769253,
"eval_accuracy": 0.053470536009796996,
"eval_loss": 10.625,
"eval_runtime": 267.0069,
"eval_samples_per_second": 126.465,
"eval_steps_per_second": 2.637,
"step": 43
},
{
"epoch": 0.0031967451322290033,
"grad_norm": 0.952081561088562,
"learning_rate": 9.999803836094159e-06,
"loss": 10.625,
"step": 44
},
{
"epoch": 0.0031967451322290033,
"eval_accuracy": 0.05392726261832098,
"eval_loss": 10.6171875,
"eval_runtime": 269.2581,
"eval_samples_per_second": 125.408,
"eval_steps_per_second": 2.615,
"step": 44
},
{
"epoch": 0.003269398430688753,
"grad_norm": 0.9194355607032776,
"learning_rate": 9.999796570764313e-06,
"loss": 10.625,
"step": 45
},
{
"epoch": 0.003269398430688753,
"eval_accuracy": 0.05422120962871285,
"eval_loss": 10.6171875,
"eval_runtime": 269.3844,
"eval_samples_per_second": 125.349,
"eval_steps_per_second": 2.613,
"step": 45
},
{
"epoch": 0.0033420517291485033,
"grad_norm": 0.9257526993751526,
"learning_rate": 9.999789305434467e-06,
"loss": 10.625,
"step": 46
},
{
"epoch": 0.0033420517291485033,
"eval_accuracy": 0.054314019764158616,
"eval_loss": 10.6171875,
"eval_runtime": 269.7126,
"eval_samples_per_second": 125.196,
"eval_steps_per_second": 2.61,
"step": 46
},
{
"epoch": 0.0034147050276082534,
"grad_norm": 0.9701704382896423,
"learning_rate": 9.999782040104623e-06,
"loss": 10.6172,
"step": 47
},
{
"epoch": 0.0034147050276082534,
"eval_accuracy": 0.05444489768753676,
"eval_loss": 10.609375,
"eval_runtime": 269.4275,
"eval_samples_per_second": 125.329,
"eval_steps_per_second": 2.613,
"step": 47
},
{
"epoch": 0.0034873583260680036,
"grad_norm": 0.8972945809364319,
"learning_rate": 9.999774774774775e-06,
"loss": 10.625,
"step": 48
},
{
"epoch": 0.0034873583260680036,
"eval_accuracy": 0.05449599247514181,
"eval_loss": 10.609375,
"eval_runtime": 268.4057,
"eval_samples_per_second": 125.806,
"eval_steps_per_second": 2.623,
"step": 48
},
{
"epoch": 0.0035600116245277534,
"grad_norm": 0.9347382187843323,
"learning_rate": 9.99976750944493e-06,
"loss": 10.6172,
"step": 49
},
{
"epoch": 0.0035600116245277534,
"eval_accuracy": 0.05453342338753463,
"eval_loss": 10.6015625,
"eval_runtime": 269.5777,
"eval_samples_per_second": 125.259,
"eval_steps_per_second": 2.611,
"step": 49
},
{
"epoch": 0.0036326649229875036,
"grad_norm": 0.9273884892463684,
"learning_rate": 9.999760244115083e-06,
"loss": 10.6016,
"step": 50
},
{
"epoch": 0.0036326649229875036,
"eval_accuracy": 0.05452213332115164,
"eval_loss": 10.6015625,
"eval_runtime": 268.6914,
"eval_samples_per_second": 125.672,
"eval_steps_per_second": 2.62,
"step": 50
},
{
"epoch": 0.0037053182214472537,
"grad_norm": 0.9508588910102844,
"learning_rate": 9.999752978785238e-06,
"loss": 10.6016,
"step": 51
},
{
"epoch": 0.0037053182214472537,
"eval_accuracy": 0.054549547918240585,
"eval_loss": 10.6015625,
"eval_runtime": 269.6867,
"eval_samples_per_second": 125.208,
"eval_steps_per_second": 2.61,
"step": 51
},
{
"epoch": 0.003777971519907004,
"grad_norm": 0.97487872838974,
"learning_rate": 9.999745713455392e-06,
"loss": 10.6016,
"step": 52
},
{
"epoch": 0.003777971519907004,
"eval_accuracy": 0.05455154539152372,
"eval_loss": 10.59375,
"eval_runtime": 269.8783,
"eval_samples_per_second": 125.119,
"eval_steps_per_second": 2.609,
"step": 52
},
{
"epoch": 0.0038506248183667537,
"grad_norm": 1.050345540046692,
"learning_rate": 9.999738448125546e-06,
"loss": 10.6016,
"step": 53
},
{
"epoch": 0.0038506248183667537,
"eval_accuracy": 0.054549403173799776,
"eval_loss": 10.59375,
"eval_runtime": 269.476,
"eval_samples_per_second": 125.306,
"eval_steps_per_second": 2.612,
"step": 53
},
{
"epoch": 0.003923278116826504,
"grad_norm": 0.9317484498023987,
"learning_rate": 9.9997311827957e-06,
"loss": 10.5938,
"step": 54
},
{
"epoch": 0.003923278116826504,
"eval_accuracy": 0.05454671092720075,
"eval_loss": 10.59375,
"eval_runtime": 268.2865,
"eval_samples_per_second": 125.862,
"eval_steps_per_second": 2.624,
"step": 54
},
{
"epoch": 0.003995931415286254,
"grad_norm": 0.9053019285202026,
"learning_rate": 9.999723917465854e-06,
"loss": 10.6016,
"step": 55
},
{
"epoch": 0.003995931415286254,
"eval_accuracy": 0.054510206379229105,
"eval_loss": 10.5859375,
"eval_runtime": 268.6755,
"eval_samples_per_second": 125.68,
"eval_steps_per_second": 2.62,
"step": 55
},
{
"epoch": 0.004068584713746004,
"grad_norm": 1.051640272140503,
"learning_rate": 9.999716652136008e-06,
"loss": 10.5859,
"step": 56
},
{
"epoch": 0.004068584713746004,
"eval_accuracy": 0.05450499557936003,
"eval_loss": 10.5859375,
"eval_runtime": 268.9754,
"eval_samples_per_second": 125.539,
"eval_steps_per_second": 2.617,
"step": 56
},
{
"epoch": 0.004141238012205754,
"grad_norm": 0.8980646729469299,
"learning_rate": 9.999709386806162e-06,
"loss": 10.6016,
"step": 57
},
{
"epoch": 0.004141238012205754,
"eval_accuracy": 0.05452688093881013,
"eval_loss": 10.5859375,
"eval_runtime": 269.557,
"eval_samples_per_second": 125.269,
"eval_steps_per_second": 2.612,
"step": 57
},
{
"epoch": 0.004213891310665505,
"grad_norm": 0.9363867044448853,
"learning_rate": 9.999702121476316e-06,
"loss": 10.5859,
"step": 58
},
{
"epoch": 0.004213891310665505,
"eval_accuracy": 0.054599542648095495,
"eval_loss": 10.5859375,
"eval_runtime": 268.413,
"eval_samples_per_second": 125.802,
"eval_steps_per_second": 2.623,
"step": 58
},
{
"epoch": 0.004286544609125254,
"grad_norm": 0.9355424642562866,
"learning_rate": 9.99969485614647e-06,
"loss": 10.5859,
"step": 59
},
{
"epoch": 0.004286544609125254,
"eval_accuracy": 0.05472584664714412,
"eval_loss": 10.578125,
"eval_runtime": 267.1942,
"eval_samples_per_second": 126.376,
"eval_steps_per_second": 2.635,
"step": 59
},
{
"epoch": 0.004359197907585004,
"grad_norm": 0.9955667853355408,
"learning_rate": 9.999687590816624e-06,
"loss": 10.5781,
"step": 60
},
{
"epoch": 0.004359197907585004,
"eval_accuracy": 0.05484039739759917,
"eval_loss": 10.578125,
"eval_runtime": 268.4725,
"eval_samples_per_second": 125.774,
"eval_steps_per_second": 2.622,
"step": 60
},
{
"epoch": 0.004431851206044755,
"grad_norm": 0.9198755025863647,
"learning_rate": 9.999680325486778e-06,
"loss": 10.5781,
"step": 61
},
{
"epoch": 0.004431851206044755,
"eval_accuracy": 0.054993913351519604,
"eval_loss": 10.578125,
"eval_runtime": 268.5476,
"eval_samples_per_second": 125.739,
"eval_steps_per_second": 2.622,
"step": 61
},
{
"epoch": 0.0045045045045045045,
"grad_norm": 0.9875515699386597,
"learning_rate": 9.999673060156932e-06,
"loss": 10.5781,
"step": 62
},
{
"epoch": 0.0045045045045045045,
"eval_accuracy": 0.055311424756874936,
"eval_loss": 10.5703125,
"eval_runtime": 267.2035,
"eval_samples_per_second": 126.372,
"eval_steps_per_second": 2.635,
"step": 62
},
{
"epoch": 0.004577157802964254,
"grad_norm": 0.9037775993347168,
"learning_rate": 9.999665794827086e-06,
"loss": 10.5781,
"step": 63
},
{
"epoch": 0.004577157802964254,
"eval_accuracy": 0.05571670919113593,
"eval_loss": 10.5703125,
"eval_runtime": 267.7809,
"eval_samples_per_second": 126.099,
"eval_steps_per_second": 2.629,
"step": 63
},
{
"epoch": 0.004649811101424005,
"grad_norm": 0.9087035655975342,
"learning_rate": 9.99965852949724e-06,
"loss": 10.5703,
"step": 64
},
{
"epoch": 0.004649811101424005,
"eval_accuracy": 0.056142518387103435,
"eval_loss": 10.5703125,
"eval_runtime": 267.3757,
"eval_samples_per_second": 126.29,
"eval_steps_per_second": 2.633,
"step": 64
},
{
"epoch": 0.004722464399883755,
"grad_norm": 0.8892097473144531,
"learning_rate": 9.999651264167394e-06,
"loss": 10.5781,
"step": 65
},
{
"epoch": 0.004722464399883755,
"eval_accuracy": 0.05656430368761649,
"eval_loss": 10.5625,
"eval_runtime": 266.879,
"eval_samples_per_second": 126.525,
"eval_steps_per_second": 2.638,
"step": 65
},
{
"epoch": 0.004795117698343505,
"grad_norm": 0.9172134399414062,
"learning_rate": 9.999643998837548e-06,
"loss": 10.5625,
"step": 66
},
{
"epoch": 0.004795117698343505,
"eval_accuracy": 0.0569930367212883,
"eval_loss": 10.5625,
"eval_runtime": 266.0917,
"eval_samples_per_second": 126.9,
"eval_steps_per_second": 2.646,
"step": 66
},
{
"epoch": 0.004867770996803255,
"grad_norm": 0.9037718176841736,
"learning_rate": 9.999636733507701e-06,
"loss": 10.5781,
"step": 67
},
{
"epoch": 0.004867770996803255,
"eval_accuracy": 0.05732160660192132,
"eval_loss": 10.5625,
"eval_runtime": 267.3756,
"eval_samples_per_second": 126.291,
"eval_steps_per_second": 2.633,
"step": 67
},
{
"epoch": 0.004940424295263005,
"grad_norm": 0.8923665881156921,
"learning_rate": 9.999629468177855e-06,
"loss": 10.5703,
"step": 68
},
{
"epoch": 0.004940424295263005,
"eval_accuracy": 0.05754022860531697,
"eval_loss": 10.5546875,
"eval_runtime": 267.5062,
"eval_samples_per_second": 126.229,
"eval_steps_per_second": 2.632,
"step": 68
},
{
"epoch": 0.005013077593722755,
"grad_norm": 0.9167753458023071,
"learning_rate": 9.999622202848011e-06,
"loss": 10.5625,
"step": 69
},
{
"epoch": 0.005013077593722755,
"eval_accuracy": 0.0576851756883416,
"eval_loss": 10.5546875,
"eval_runtime": 269.0923,
"eval_samples_per_second": 125.485,
"eval_steps_per_second": 2.616,
"step": 69
},
{
"epoch": 0.005085730892182505,
"grad_norm": 0.9031029343605042,
"learning_rate": 9.999614937518163e-06,
"loss": 10.5625,
"step": 70
},
{
"epoch": 0.005085730892182505,
"eval_accuracy": 0.05778192287257733,
"eval_loss": 10.5546875,
"eval_runtime": 269.1242,
"eval_samples_per_second": 125.47,
"eval_steps_per_second": 2.616,
"step": 70
},
{
"epoch": 0.005158384190642255,
"grad_norm": 0.8912838101387024,
"learning_rate": 9.999607672188319e-06,
"loss": 10.5625,
"step": 71
},
{
"epoch": 0.005158384190642255,
"eval_accuracy": 0.05789719734523642,
"eval_loss": 10.5546875,
"eval_runtime": 269.7373,
"eval_samples_per_second": 125.185,
"eval_steps_per_second": 2.61,
"step": 71
},
{
"epoch": 0.0052310374891020054,
"grad_norm": 0.8998405933380127,
"learning_rate": 9.999600406858471e-06,
"loss": 10.5547,
"step": 72
},
{
"epoch": 0.0052310374891020054,
"eval_accuracy": 0.057985607249681645,
"eval_loss": 10.546875,
"eval_runtime": 269.5868,
"eval_samples_per_second": 125.255,
"eval_steps_per_second": 2.611,
"step": 72
},
{
"epoch": 0.005303690787561755,
"grad_norm": 0.9078417420387268,
"learning_rate": 9.999593141528627e-06,
"loss": 10.5469,
"step": 73
},
{
"epoch": 0.005303690787561755,
"eval_accuracy": 0.05800511880030249,
"eval_loss": 10.546875,
"eval_runtime": 271.0227,
"eval_samples_per_second": 124.591,
"eval_steps_per_second": 2.598,
"step": 73
},
{
"epoch": 0.005376344086021506,
"grad_norm": 0.8995553851127625,
"learning_rate": 9.99958587619878e-06,
"loss": 10.5469,
"step": 74
},
{
"epoch": 0.005376344086021506,
"eval_accuracy": 0.05802399347538379,
"eval_loss": 10.546875,
"eval_runtime": 270.6941,
"eval_samples_per_second": 124.742,
"eval_steps_per_second": 2.601,
"step": 74
},
{
"epoch": 0.0054489973844812556,
"grad_norm": 0.8786413073539734,
"learning_rate": 9.999578610868935e-06,
"loss": 10.5547,
"step": 75
},
{
"epoch": 0.0054489973844812556,
"eval_accuracy": 0.05797920954539795,
"eval_loss": 10.5390625,
"eval_runtime": 268.937,
"eval_samples_per_second": 125.557,
"eval_steps_per_second": 2.618,
"step": 75
},
{
"epoch": 0.005521650682941005,
"grad_norm": 0.9166957139968872,
"learning_rate": 9.999571345539089e-06,
"loss": 10.5547,
"step": 76
},
{
"epoch": 0.005521650682941005,
"eval_accuracy": 0.05800387399811155,
"eval_loss": 10.5390625,
"eval_runtime": 270.6425,
"eval_samples_per_second": 124.766,
"eval_steps_per_second": 2.601,
"step": 76
},
{
"epoch": 0.005594303981400756,
"grad_norm": 0.9106067419052124,
"learning_rate": 9.999564080209243e-06,
"loss": 10.5469,
"step": 77
},
{
"epoch": 0.005594303981400756,
"eval_accuracy": 0.058152873925478785,
"eval_loss": 10.5390625,
"eval_runtime": 268.7053,
"eval_samples_per_second": 125.666,
"eval_steps_per_second": 2.62,
"step": 77
},
{
"epoch": 0.005666957279860506,
"grad_norm": 0.9021939039230347,
"learning_rate": 9.999556814879397e-06,
"loss": 10.5469,
"step": 78
},
{
"epoch": 0.005666957279860506,
"eval_accuracy": 0.05821331920396,
"eval_loss": 10.5390625,
"eval_runtime": 268.8573,
"eval_samples_per_second": 125.594,
"eval_steps_per_second": 2.618,
"step": 78
},
{
"epoch": 0.005739610578320255,
"grad_norm": 0.935400664806366,
"learning_rate": 9.99954954954955e-06,
"loss": 10.5312,
"step": 79
},
{
"epoch": 0.005739610578320255,
"eval_accuracy": 0.05838906790398846,
"eval_loss": 10.53125,
"eval_runtime": 270.5471,
"eval_samples_per_second": 124.81,
"eval_steps_per_second": 2.602,
"step": 79
},
{
"epoch": 0.005812263876780006,
"grad_norm": 0.9361926317214966,
"learning_rate": 9.999542284219704e-06,
"loss": 10.5312,
"step": 80
},
{
"epoch": 0.005812263876780006,
"eval_accuracy": 0.05863825993328266,
"eval_loss": 10.53125,
"eval_runtime": 270.2854,
"eval_samples_per_second": 124.931,
"eval_steps_per_second": 2.605,
"step": 80
},
{
"epoch": 0.005884917175239756,
"grad_norm": 0.9991462826728821,
"learning_rate": 9.999535018889858e-06,
"loss": 10.5312,
"step": 81
},
{
"epoch": 0.005884917175239756,
"eval_accuracy": 0.058984343891253385,
"eval_loss": 10.53125,
"eval_runtime": 269.5128,
"eval_samples_per_second": 125.289,
"eval_steps_per_second": 2.612,
"step": 81
},
{
"epoch": 0.0059575704736995055,
"grad_norm": 0.8942323327064514,
"learning_rate": 9.999527753560012e-06,
"loss": 10.5312,
"step": 82
},
{
"epoch": 0.0059575704736995055,
"eval_accuracy": 0.05927577234837521,
"eval_loss": 10.53125,
"eval_runtime": 267.9968,
"eval_samples_per_second": 125.998,
"eval_steps_per_second": 2.627,
"step": 82
},
{
"epoch": 0.006030223772159256,
"grad_norm": 0.9410443902015686,
"learning_rate": 9.999520488230166e-06,
"loss": 10.5312,
"step": 83
},
{
"epoch": 0.006030223772159256,
"eval_accuracy": 0.05966313742086423,
"eval_loss": 10.5234375,
"eval_runtime": 269.3505,
"eval_samples_per_second": 125.365,
"eval_steps_per_second": 2.614,
"step": 83
},
{
"epoch": 0.006102877070619006,
"grad_norm": 0.9418770670890808,
"learning_rate": 9.99951322290032e-06,
"loss": 10.5234,
"step": 84
},
{
"epoch": 0.006102877070619006,
"eval_accuracy": 0.06000398163007773,
"eval_loss": 10.5234375,
"eval_runtime": 268.3417,
"eval_samples_per_second": 125.836,
"eval_steps_per_second": 2.624,
"step": 84
},
{
"epoch": 0.0061755303690787565,
"grad_norm": 0.8822703957557678,
"learning_rate": 9.999505957570474e-06,
"loss": 10.5312,
"step": 85
},
{
"epoch": 0.0061755303690787565,
"eval_accuracy": 0.060200255091812704,
"eval_loss": 10.5234375,
"eval_runtime": 269.365,
"eval_samples_per_second": 125.358,
"eval_steps_per_second": 2.614,
"step": 85
},
{
"epoch": 0.006248183667538506,
"grad_norm": 0.8689332604408264,
"learning_rate": 9.999498692240628e-06,
"loss": 10.5312,
"step": 86
},
{
"epoch": 0.006248183667538506,
"eval_accuracy": 0.06028594380077074,
"eval_loss": 10.5234375,
"eval_runtime": 267.6304,
"eval_samples_per_second": 126.17,
"eval_steps_per_second": 2.63,
"step": 86
},
{
"epoch": 0.006320836965998256,
"grad_norm": 0.8931795954704285,
"learning_rate": 9.999491426910782e-06,
"loss": 10.5234,
"step": 87
},
{
"epoch": 0.006320836965998256,
"eval_accuracy": 0.060404489497792084,
"eval_loss": 10.515625,
"eval_runtime": 267.5381,
"eval_samples_per_second": 126.214,
"eval_steps_per_second": 2.631,
"step": 87
},
{
"epoch": 0.006393490264458007,
"grad_norm": 0.8975218534469604,
"learning_rate": 9.999484161580936e-06,
"loss": 10.5156,
"step": 88
},
{
"epoch": 0.006393490264458007,
"eval_accuracy": 0.06048001714700543,
"eval_loss": 10.515625,
"eval_runtime": 266.4763,
"eval_samples_per_second": 126.717,
"eval_steps_per_second": 2.642,
"step": 88
},
{
"epoch": 0.006466143562917756,
"grad_norm": 0.8878839015960693,
"learning_rate": 9.99947689625109e-06,
"loss": 10.5234,
"step": 89
},
{
"epoch": 0.006466143562917756,
"eval_accuracy": 0.060569556058088954,
"eval_loss": 10.515625,
"eval_runtime": 265.4792,
"eval_samples_per_second": 127.193,
"eval_steps_per_second": 2.652,
"step": 89
},
{
"epoch": 0.006538796861377506,
"grad_norm": 0.8937884569168091,
"learning_rate": 9.999469630921244e-06,
"loss": 10.5156,
"step": 90
},
{
"epoch": 0.006538796861377506,
"eval_accuracy": 0.06063920708300553,
"eval_loss": 10.515625,
"eval_runtime": 264.9514,
"eval_samples_per_second": 127.446,
"eval_steps_per_second": 2.657,
"step": 90
},
{
"epoch": 0.006611450159837257,
"grad_norm": 0.9820625185966492,
"learning_rate": 9.9994623655914e-06,
"loss": 10.5156,
"step": 91
},
{
"epoch": 0.006611450159837257,
"eval_accuracy": 0.06055398155625807,
"eval_loss": 10.5078125,
"eval_runtime": 265.7628,
"eval_samples_per_second": 127.057,
"eval_steps_per_second": 2.649,
"step": 91
},
{
"epoch": 0.0066841034582970065,
"grad_norm": 0.8808642029762268,
"learning_rate": 9.999455100261552e-06,
"loss": 10.5156,
"step": 92
},
{
"epoch": 0.0066841034582970065,
"eval_accuracy": 0.060472056202761026,
"eval_loss": 10.5078125,
"eval_runtime": 266.9758,
"eval_samples_per_second": 126.48,
"eval_steps_per_second": 2.637,
"step": 92
},
{
"epoch": 0.006756756756756757,
"grad_norm": 0.8804787993431091,
"learning_rate": 9.999447834931707e-06,
"loss": 10.5156,
"step": 93
},
{
"epoch": 0.006756756756756757,
"eval_accuracy": 0.06028976505400806,
"eval_loss": 10.5078125,
"eval_runtime": 267.6988,
"eval_samples_per_second": 126.138,
"eval_steps_per_second": 2.63,
"step": 93
},
{
"epoch": 0.006829410055216507,
"grad_norm": 0.8913342356681824,
"learning_rate": 9.99944056960186e-06,
"loss": 10.5156,
"step": 94
},
{
"epoch": 0.006829410055216507,
"eval_accuracy": 0.06017234836362501,
"eval_loss": 10.5078125,
"eval_runtime": 268.8716,
"eval_samples_per_second": 125.588,
"eval_steps_per_second": 2.618,
"step": 94
},
{
"epoch": 0.006902063353676257,
"grad_norm": 0.8503950834274292,
"learning_rate": 9.999433304272015e-06,
"loss": 10.5234,
"step": 95
},
{
"epoch": 0.006902063353676257,
"eval_accuracy": 0.0601403308933184,
"eval_loss": 10.5,
"eval_runtime": 267.2705,
"eval_samples_per_second": 126.34,
"eval_steps_per_second": 2.634,
"step": 95
},
{
"epoch": 0.006974716652136007,
"grad_norm": 0.912339985370636,
"learning_rate": 9.99942603894217e-06,
"loss": 10.5156,
"step": 96
},
{
"epoch": 0.006974716652136007,
"eval_accuracy": 0.06016896134371012,
"eval_loss": 10.5,
"eval_runtime": 268.9999,
"eval_samples_per_second": 125.528,
"eval_steps_per_second": 2.617,
"step": 96
},
{
"epoch": 0.007047369950595757,
"grad_norm": 0.8949794769287109,
"learning_rate": 9.999418773612323e-06,
"loss": 10.5078,
"step": 97
},
{
"epoch": 0.007047369950595757,
"eval_accuracy": 0.06026235045691912,
"eval_loss": 10.5,
"eval_runtime": 267.9858,
"eval_samples_per_second": 126.003,
"eval_steps_per_second": 2.627,
"step": 97
},
{
"epoch": 0.007120023249055507,
"grad_norm": 0.8988801836967468,
"learning_rate": 9.999411508282477e-06,
"loss": 10.5,
"step": 98
},
{
"epoch": 0.007120023249055507,
"eval_accuracy": 0.06031005822460927,
"eval_loss": 10.5,
"eval_runtime": 268.4028,
"eval_samples_per_second": 125.807,
"eval_steps_per_second": 2.623,
"step": 98
},
{
"epoch": 0.007192676547515257,
"grad_norm": 0.8954498767852783,
"learning_rate": 9.999404242952631e-06,
"loss": 10.5078,
"step": 99
},
{
"epoch": 0.007192676547515257,
"eval_accuracy": 0.06044918658111344,
"eval_loss": 10.5,
"eval_runtime": 266.438,
"eval_samples_per_second": 126.735,
"eval_steps_per_second": 2.642,
"step": 99
},
{
"epoch": 0.007265329845975007,
"grad_norm": 0.8816587328910828,
"learning_rate": 9.999396977622785e-06,
"loss": 10.5078,
"step": 100
},
{
"epoch": 0.007265329845975007,
"eval_accuracy": 0.060579746066721805,
"eval_loss": 10.4921875,
"eval_runtime": 268.404,
"eval_samples_per_second": 125.807,
"eval_steps_per_second": 2.623,
"step": 100
},
{
"epoch": 0.007337983144434758,
"grad_norm": 0.9182707071304321,
"learning_rate": 9.999389712292939e-06,
"loss": 10.5,
"step": 101
},
{
"epoch": 0.007337983144434758,
"eval_accuracy": 0.0607209587431736,
"eval_loss": 10.4921875,
"eval_runtime": 269.5331,
"eval_samples_per_second": 125.28,
"eval_steps_per_second": 2.612,
"step": 101
},
{
"epoch": 0.0074106364428945075,
"grad_norm": 0.9346348643302917,
"learning_rate": 9.999382446963093e-06,
"loss": 10.4922,
"step": 102
},
{
"epoch": 0.0074106364428945075,
"eval_accuracy": 0.06087233247937008,
"eval_loss": 10.4921875,
"eval_runtime": 268.6739,
"eval_samples_per_second": 125.68,
"eval_steps_per_second": 2.62,
"step": 102
},
{
"epoch": 0.007483289741354257,
"grad_norm": 1.0120168924331665,
"learning_rate": 9.999375181633247e-06,
"loss": 10.4922,
"step": 103
},
{
"epoch": 0.007483289741354257,
"eval_accuracy": 0.06115695794777395,
"eval_loss": 10.4921875,
"eval_runtime": 269.4069,
"eval_samples_per_second": 125.338,
"eval_steps_per_second": 2.613,
"step": 103
},
{
"epoch": 0.007555943039814008,
"grad_norm": 0.9036211967468262,
"learning_rate": 9.999367916303401e-06,
"loss": 10.4844,
"step": 104
},
{
"epoch": 0.007555943039814008,
"eval_accuracy": 0.06140849483700922,
"eval_loss": 10.484375,
"eval_runtime": 267.3403,
"eval_samples_per_second": 126.307,
"eval_steps_per_second": 2.633,
"step": 104
},
{
"epoch": 0.007628596338273758,
"grad_norm": 0.895473837852478,
"learning_rate": 9.999360650973555e-06,
"loss": 10.4922,
"step": 105
},
{
"epoch": 0.007628596338273758,
"eval_accuracy": 0.0616540103575069,
"eval_loss": 10.484375,
"eval_runtime": 268.6172,
"eval_samples_per_second": 125.707,
"eval_steps_per_second": 2.621,
"step": 105
},
{
"epoch": 0.007701249636733507,
"grad_norm": 0.908990204334259,
"learning_rate": 9.999353385643709e-06,
"loss": 10.4922,
"step": 106
},
{
"epoch": 0.007701249636733507,
"eval_accuracy": 0.06192879320393586,
"eval_loss": 10.484375,
"eval_runtime": 269.3754,
"eval_samples_per_second": 125.353,
"eval_steps_per_second": 2.613,
"step": 106
},
{
"epoch": 0.007773902935193258,
"grad_norm": 0.929440975189209,
"learning_rate": 9.999346120313863e-06,
"loss": 10.4844,
"step": 107
},
{
"epoch": 0.007773902935193258,
"eval_accuracy": 0.062228703685288995,
"eval_loss": 10.484375,
"eval_runtime": 268.7245,
"eval_samples_per_second": 125.657,
"eval_steps_per_second": 2.62,
"step": 107
},
{
"epoch": 0.007846556233653008,
"grad_norm": 0.9397541880607605,
"learning_rate": 9.999338854984018e-06,
"loss": 10.4922,
"step": 108
},
{
"epoch": 0.007846556233653008,
"eval_accuracy": 0.06252207171791763,
"eval_loss": 10.4765625,
"eval_runtime": 267.6862,
"eval_samples_per_second": 126.144,
"eval_steps_per_second": 2.63,
"step": 108
},
{
"epoch": 0.007919209532112758,
"grad_norm": 0.9697725772857666,
"learning_rate": 9.99933158965417e-06,
"loss": 10.4844,
"step": 109
},
{
"epoch": 0.007919209532112758,
"eval_accuracy": 0.06282742459024514,
"eval_loss": 10.4765625,
"eval_runtime": 266.1334,
"eval_samples_per_second": 126.88,
"eval_steps_per_second": 2.645,
"step": 109
},
{
"epoch": 0.007991862830572507,
"grad_norm": 0.8942638039588928,
"learning_rate": 9.999324324324326e-06,
"loss": 10.4766,
"step": 110
},
{
"epoch": 0.007991862830572507,
"eval_accuracy": 0.06300488127467513,
"eval_loss": 10.4765625,
"eval_runtime": 267.3455,
"eval_samples_per_second": 126.305,
"eval_steps_per_second": 2.633,
"step": 110
},
{
"epoch": 0.008064516129032258,
"grad_norm": 0.8714835047721863,
"learning_rate": 9.999317058994478e-06,
"loss": 10.4844,
"step": 111
},
{
"epoch": 0.008064516129032258,
"eval_accuracy": 0.06321102630727317,
"eval_loss": 10.4765625,
"eval_runtime": 266.4815,
"eval_samples_per_second": 126.714,
"eval_steps_per_second": 2.642,
"step": 111
},
{
"epoch": 0.008137169427492008,
"grad_norm": 0.8855974078178406,
"learning_rate": 9.999309793664634e-06,
"loss": 10.4766,
"step": 112
},
{
"epoch": 0.008137169427492008,
"eval_accuracy": 0.06342374273748387,
"eval_loss": 10.4765625,
"eval_runtime": 265.4409,
"eval_samples_per_second": 127.211,
"eval_steps_per_second": 2.652,
"step": 112
},
{
"epoch": 0.008209822725951759,
"grad_norm": 0.8999938368797302,
"learning_rate": 9.999302528334788e-06,
"loss": 10.4844,
"step": 113
},
{
"epoch": 0.008209822725951759,
"eval_accuracy": 0.06359850717531484,
"eval_loss": 10.46875,
"eval_runtime": 264.9726,
"eval_samples_per_second": 127.436,
"eval_steps_per_second": 2.657,
"step": 113
},
{
"epoch": 0.008282476024411508,
"grad_norm": 0.8735718727111816,
"learning_rate": 9.99929526300494e-06,
"loss": 10.4766,
"step": 114
},
{
"epoch": 0.008282476024411508,
"eval_accuracy": 0.0637932173970891,
"eval_loss": 10.46875,
"eval_runtime": 264.7038,
"eval_samples_per_second": 127.565,
"eval_steps_per_second": 2.66,
"step": 114
},
{
"epoch": 0.008355129322871259,
"grad_norm": 0.9054996371269226,
"learning_rate": 9.999287997675096e-06,
"loss": 10.4766,
"step": 115
},
{
"epoch": 0.008355129322871259,
"eval_accuracy": 0.06404524641742311,
"eval_loss": 10.46875,
"eval_runtime": 265.6768,
"eval_samples_per_second": 127.098,
"eval_steps_per_second": 2.65,
"step": 115
},
{
"epoch": 0.00842778262133101,
"grad_norm": 0.9062832593917847,
"learning_rate": 9.999280732345248e-06,
"loss": 10.4844,
"step": 116
},
{
"epoch": 0.00842778262133101,
"eval_accuracy": 0.06427657698272166,
"eval_loss": 10.46875,
"eval_runtime": 264.6617,
"eval_samples_per_second": 127.586,
"eval_steps_per_second": 2.66,
"step": 116
},
{
"epoch": 0.008500435919790758,
"grad_norm": 0.9465892910957336,
"learning_rate": 9.999273467015404e-06,
"loss": 10.4531,
"step": 117
},
{
"epoch": 0.008500435919790758,
"eval_accuracy": 0.06444393050518328,
"eval_loss": 10.4609375,
"eval_runtime": 265.0601,
"eval_samples_per_second": 127.394,
"eval_steps_per_second": 2.656,
"step": 117
},
{
"epoch": 0.008573089218250509,
"grad_norm": 0.9980528354644775,
"learning_rate": 9.999266201685556e-06,
"loss": 10.4609,
"step": 118
},
{
"epoch": 0.008573089218250509,
"eval_accuracy": 0.06469538054775407,
"eval_loss": 10.4609375,
"eval_runtime": 265.9103,
"eval_samples_per_second": 126.986,
"eval_steps_per_second": 2.648,
"step": 118
},
{
"epoch": 0.00864574251671026,
"grad_norm": 0.9110475778579712,
"learning_rate": 9.999258936355712e-06,
"loss": 10.4609,
"step": 119
},
{
"epoch": 0.00864574251671026,
"eval_accuracy": 0.06482475312894781,
"eval_loss": 10.4609375,
"eval_runtime": 268.5579,
"eval_samples_per_second": 125.735,
"eval_steps_per_second": 2.621,
"step": 119
},
{
"epoch": 0.008718395815170008,
"grad_norm": 0.8688368797302246,
"learning_rate": 9.999251671025866e-06,
"loss": 10.4688,
"step": 120
},
{
"epoch": 0.008718395815170008,
"eval_accuracy": 0.06493464310840887,
"eval_loss": 10.4609375,
"eval_runtime": 268.4968,
"eval_samples_per_second": 125.763,
"eval_steps_per_second": 2.622,
"step": 120
},
{
"epoch": 0.008791049113629759,
"grad_norm": 0.8964656591415405,
"learning_rate": 9.99924440569602e-06,
"loss": 10.4609,
"step": 121
},
{
"epoch": 0.008791049113629759,
"eval_accuracy": 0.06505721270088466,
"eval_loss": 10.4609375,
"eval_runtime": 268.2266,
"eval_samples_per_second": 125.89,
"eval_steps_per_second": 2.625,
"step": 121
},
{
"epoch": 0.00886370241208951,
"grad_norm": 0.917405903339386,
"learning_rate": 9.999237140366174e-06,
"loss": 10.4609,
"step": 122
},
{
"epoch": 0.00886370241208951,
"eval_accuracy": 0.06532562679191808,
"eval_loss": 10.453125,
"eval_runtime": 267.3486,
"eval_samples_per_second": 126.303,
"eval_steps_per_second": 2.633,
"step": 122
},
{
"epoch": 0.008936355710549258,
"grad_norm": 0.9321388602256775,
"learning_rate": 9.999229875036328e-06,
"loss": 10.4531,
"step": 123
},
{
"epoch": 0.008936355710549258,
"eval_accuracy": 0.06563454037748945,
"eval_loss": 10.453125,
"eval_runtime": 268.6664,
"eval_samples_per_second": 125.684,
"eval_steps_per_second": 2.62,
"step": 123
},
{
"epoch": 0.009009009009009009,
"grad_norm": 1.0306340456008911,
"learning_rate": 9.999222609706481e-06,
"loss": 10.4531,
"step": 124
},
{
"epoch": 0.009009009009009009,
"eval_accuracy": 0.06589062224216607,
"eval_loss": 10.453125,
"eval_runtime": 269.4306,
"eval_samples_per_second": 125.327,
"eval_steps_per_second": 2.613,
"step": 124
},
{
"epoch": 0.00908166230746876,
"grad_norm": 0.90235435962677,
"learning_rate": 9.999215344376635e-06,
"loss": 10.4531,
"step": 125
},
{
"epoch": 0.00908166230746876,
"eval_accuracy": 0.06600340711044328,
"eval_loss": 10.453125,
"eval_runtime": 268.5113,
"eval_samples_per_second": 125.756,
"eval_steps_per_second": 2.622,
"step": 125
},
{
"epoch": 0.009154315605928508,
"grad_norm": 0.8829610347747803,
"learning_rate": 9.99920807904679e-06,
"loss": 10.4531,
"step": 126
},
{
"epoch": 0.009154315605928508,
"eval_accuracy": 0.06620975478525845,
"eval_loss": 10.4453125,
"eval_runtime": 268.5597,
"eval_samples_per_second": 125.734,
"eval_steps_per_second": 2.621,
"step": 126
},
{
"epoch": 0.009226968904388259,
"grad_norm": 0.9231570959091187,
"learning_rate": 9.999200813716943e-06,
"loss": 10.4531,
"step": 127
},
{
"epoch": 0.009226968904388259,
"eval_accuracy": 0.06642637931537096,
"eval_loss": 10.4453125,
"eval_runtime": 266.401,
"eval_samples_per_second": 126.753,
"eval_steps_per_second": 2.643,
"step": 127
},
{
"epoch": 0.00929962220284801,
"grad_norm": 0.9046792984008789,
"learning_rate": 9.999193548387097e-06,
"loss": 10.4453,
"step": 128
},
{
"epoch": 0.00929962220284801,
"eval_accuracy": 0.06673063212994831,
"eval_loss": 10.4453125,
"eval_runtime": 268.2718,
"eval_samples_per_second": 125.869,
"eval_steps_per_second": 2.624,
"step": 128
},
{
"epoch": 0.009372275501307759,
"grad_norm": 0.9026487469673157,
"learning_rate": 9.999186283057251e-06,
"loss": 10.4531,
"step": 129
},
{
"epoch": 0.009372275501307759,
"eval_accuracy": 0.06701204427176626,
"eval_loss": 10.4453125,
"eval_runtime": 266.0718,
"eval_samples_per_second": 126.909,
"eval_steps_per_second": 2.646,
"step": 129
},
{
"epoch": 0.00944492879976751,
"grad_norm": 0.918516218662262,
"learning_rate": 9.999179017727407e-06,
"loss": 10.4375,
"step": 130
},
{
"epoch": 0.00944492879976751,
"eval_accuracy": 0.0673045727866382,
"eval_loss": 10.4453125,
"eval_runtime": 267.2937,
"eval_samples_per_second": 126.329,
"eval_steps_per_second": 2.634,
"step": 130
},
{
"epoch": 0.00951758209822726,
"grad_norm": 0.9067806601524353,
"learning_rate": 9.999171752397559e-06,
"loss": 10.4453,
"step": 131
},
{
"epoch": 0.00951758209822726,
"eval_accuracy": 0.06758094782191605,
"eval_loss": 10.4375,
"eval_runtime": 266.3825,
"eval_samples_per_second": 126.761,
"eval_steps_per_second": 2.643,
"step": 131
},
{
"epoch": 0.00959023539668701,
"grad_norm": 0.9461184740066528,
"learning_rate": 9.999164487067715e-06,
"loss": 10.4375,
"step": 132
},
{
"epoch": 0.00959023539668701,
"eval_accuracy": 0.06776827607720912,
"eval_loss": 10.4375,
"eval_runtime": 266.9747,
"eval_samples_per_second": 126.48,
"eval_steps_per_second": 2.637,
"step": 132
},
{
"epoch": 0.00966288869514676,
"grad_norm": 0.9130184054374695,
"learning_rate": 9.999157221737867e-06,
"loss": 10.4375,
"step": 133
},
{
"epoch": 0.00966288869514676,
"eval_accuracy": 0.06790248312272583,
"eval_loss": 10.4375,
"eval_runtime": 267.3131,
"eval_samples_per_second": 126.32,
"eval_steps_per_second": 2.634,
"step": 133
},
{
"epoch": 0.00973554199360651,
"grad_norm": 0.9464238882064819,
"learning_rate": 9.999149956408023e-06,
"loss": 10.4297,
"step": 134
},
{
"epoch": 0.00973554199360651,
"eval_accuracy": 0.06788108989437448,
"eval_loss": 10.4375,
"eval_runtime": 267.1185,
"eval_samples_per_second": 126.412,
"eval_steps_per_second": 2.636,
"step": 134
},
{
"epoch": 0.00980819529206626,
"grad_norm": 0.8799238204956055,
"learning_rate": 9.999142691078175e-06,
"loss": 10.4453,
"step": 135
},
{
"epoch": 0.00980819529206626,
"eval_accuracy": 0.06779870135866685,
"eval_loss": 10.4296875,
"eval_runtime": 266.9212,
"eval_samples_per_second": 126.505,
"eval_steps_per_second": 2.637,
"step": 135
},
{
"epoch": 0.00988084859052601,
"grad_norm": 0.8914628624916077,
"learning_rate": 9.99913542574833e-06,
"loss": 10.4375,
"step": 136
},
{
"epoch": 0.00988084859052601,
"eval_accuracy": 0.06771660231184085,
"eval_loss": 10.4296875,
"eval_runtime": 265.5018,
"eval_samples_per_second": 127.182,
"eval_steps_per_second": 2.652,
"step": 136
},
{
"epoch": 0.00995350188898576,
"grad_norm": 0.9166758060455322,
"learning_rate": 9.999128160418484e-06,
"loss": 10.4375,
"step": 137
},
{
"epoch": 0.00995350188898576,
"eval_accuracy": 0.0677075992076226,
"eval_loss": 10.4296875,
"eval_runtime": 266.035,
"eval_samples_per_second": 126.927,
"eval_steps_per_second": 2.646,
"step": 137
},
{
"epoch": 0.01002615518744551,
"grad_norm": 0.9371738433837891,
"learning_rate": 9.999120895088638e-06,
"loss": 10.4219,
"step": 138
},
{
"epoch": 0.01002615518744551,
"eval_accuracy": 0.0677291950781911,
"eval_loss": 10.4296875,
"eval_runtime": 266.3525,
"eval_samples_per_second": 126.776,
"eval_steps_per_second": 2.643,
"step": 138
},
{
"epoch": 0.01009880848590526,
"grad_norm": 0.8814043998718262,
"learning_rate": 9.999113629758792e-06,
"loss": 10.4375,
"step": 139
},
{
"epoch": 0.01009880848590526,
"eval_accuracy": 0.06779661703871923,
"eval_loss": 10.421875,
"eval_runtime": 266.1042,
"eval_samples_per_second": 126.894,
"eval_steps_per_second": 2.646,
"step": 139
},
{
"epoch": 0.01017146178436501,
"grad_norm": 0.9055945873260498,
"learning_rate": 9.999106364428946e-06,
"loss": 10.4297,
"step": 140
},
{
"epoch": 0.01017146178436501,
"eval_accuracy": 0.06796996298103028,
"eval_loss": 10.421875,
"eval_runtime": 267.0593,
"eval_samples_per_second": 126.44,
"eval_steps_per_second": 2.636,
"step": 140
},
{
"epoch": 0.01024411508282476,
"grad_norm": 0.8938325643539429,
"learning_rate": 9.9990990990991e-06,
"loss": 10.4297,
"step": 141
},
{
"epoch": 0.01024411508282476,
"eval_accuracy": 0.06815485952971778,
"eval_loss": 10.421875,
"eval_runtime": 267.7346,
"eval_samples_per_second": 126.121,
"eval_steps_per_second": 2.629,
"step": 141
},
{
"epoch": 0.01031676838128451,
"grad_norm": 0.9098795056343079,
"learning_rate": 9.999091833769254e-06,
"loss": 10.4219,
"step": 142
},
{
"epoch": 0.01031676838128451,
"eval_accuracy": 0.06841157826993396,
"eval_loss": 10.421875,
"eval_runtime": 268.1424,
"eval_samples_per_second": 125.929,
"eval_steps_per_second": 2.625,
"step": 142
},
{
"epoch": 0.01038942167974426,
"grad_norm": 0.907673716545105,
"learning_rate": 9.999084568439408e-06,
"loss": 10.4219,
"step": 143
},
{
"epoch": 0.01038942167974426,
"eval_accuracy": 0.06867860281433565,
"eval_loss": 10.421875,
"eval_runtime": 268.3436,
"eval_samples_per_second": 125.835,
"eval_steps_per_second": 2.624,
"step": 143
},
{
"epoch": 0.010462074978204011,
"grad_norm": 0.9119425415992737,
"learning_rate": 9.999077303109562e-06,
"loss": 10.4219,
"step": 144
},
{
"epoch": 0.010462074978204011,
"eval_accuracy": 0.06893219507463037,
"eval_loss": 10.4140625,
"eval_runtime": 267.8804,
"eval_samples_per_second": 126.053,
"eval_steps_per_second": 2.628,
"step": 144
},
{
"epoch": 0.01053472827666376,
"grad_norm": 0.8991184830665588,
"learning_rate": 9.999070037779716e-06,
"loss": 10.4219,
"step": 145
},
{
"epoch": 0.01053472827666376,
"eval_accuracy": 0.0691721234597129,
"eval_loss": 10.4140625,
"eval_runtime": 268.0954,
"eval_samples_per_second": 125.951,
"eval_steps_per_second": 2.626,
"step": 145
},
{
"epoch": 0.01060738157512351,
"grad_norm": 1.0204856395721436,
"learning_rate": 9.99906277244987e-06,
"loss": 10.4141,
"step": 146
},
{
"epoch": 0.01060738157512351,
"eval_accuracy": 0.06934083757991812,
"eval_loss": 10.4140625,
"eval_runtime": 267.5128,
"eval_samples_per_second": 126.226,
"eval_steps_per_second": 2.632,
"step": 146
},
{
"epoch": 0.010680034873583261,
"grad_norm": 0.9581719040870667,
"learning_rate": 9.999055507120024e-06,
"loss": 10.4062,
"step": 147
},
{
"epoch": 0.010680034873583261,
"eval_accuracy": 0.06947478408544137,
"eval_loss": 10.4140625,
"eval_runtime": 267.7655,
"eval_samples_per_second": 126.107,
"eval_steps_per_second": 2.629,
"step": 147
},
{
"epoch": 0.010752688172043012,
"grad_norm": 0.9252108931541443,
"learning_rate": 9.999048241790178e-06,
"loss": 10.4141,
"step": 148
},
{
"epoch": 0.010752688172043012,
"eval_accuracy": 0.06958163443164546,
"eval_loss": 10.40625,
"eval_runtime": 266.9258,
"eval_samples_per_second": 126.503,
"eval_steps_per_second": 2.637,
"step": 148
},
{
"epoch": 0.01082534147050276,
"grad_norm": 0.8792810440063477,
"learning_rate": 9.999040976460332e-06,
"loss": 10.4141,
"step": 149
},
{
"epoch": 0.01082534147050276,
"eval_accuracy": 0.0696740971804333,
"eval_loss": 10.40625,
"eval_runtime": 267.4905,
"eval_samples_per_second": 126.236,
"eval_steps_per_second": 2.632,
"step": 149
},
{
"epoch": 0.010897994768962511,
"grad_norm": 0.8908605575561523,
"learning_rate": 9.999033711130486e-06,
"loss": 10.4219,
"step": 150
},
{
"epoch": 0.010897994768962511,
"eval_accuracy": 0.06974991431852827,
"eval_loss": 10.40625,
"eval_runtime": 268.0407,
"eval_samples_per_second": 125.977,
"eval_steps_per_second": 2.626,
"step": 150
},
{
"epoch": 0.010970648067422262,
"grad_norm": 0.9047368764877319,
"learning_rate": 9.99902644580064e-06,
"loss": 10.4062,
"step": 151
},
{
"epoch": 0.010970648067422262,
"eval_accuracy": 0.06984810894717207,
"eval_loss": 10.40625,
"eval_runtime": 268.9337,
"eval_samples_per_second": 125.559,
"eval_steps_per_second": 2.618,
"step": 151
},
{
"epoch": 0.01104330136588201,
"grad_norm": 0.9523606300354004,
"learning_rate": 9.999019180470794e-06,
"loss": 10.4141,
"step": 152
},
{
"epoch": 0.01104330136588201,
"eval_accuracy": 0.06996532299533799,
"eval_loss": 10.40625,
"eval_runtime": 268.7819,
"eval_samples_per_second": 125.63,
"eval_steps_per_second": 2.619,
"step": 152
},
{
"epoch": 0.011115954664341761,
"grad_norm": 0.9536779522895813,
"learning_rate": 9.999011915140948e-06,
"loss": 10.4141,
"step": 153
},
{
"epoch": 0.011115954664341761,
"eval_accuracy": 0.0701369030554712,
"eval_loss": 10.3984375,
"eval_runtime": 267.3646,
"eval_samples_per_second": 126.296,
"eval_steps_per_second": 2.633,
"step": 153
},
{
"epoch": 0.011188607962801512,
"grad_norm": 0.8978484272956848,
"learning_rate": 9.999004649811103e-06,
"loss": 10.4219,
"step": 154
},
{
"epoch": 0.011188607962801512,
"eval_accuracy": 0.07024372445278713,
"eval_loss": 10.3984375,
"eval_runtime": 267.0372,
"eval_samples_per_second": 126.451,
"eval_steps_per_second": 2.636,
"step": 154
},
{
"epoch": 0.01126126126126126,
"grad_norm": 0.9067463874816895,
"learning_rate": 9.998997384481255e-06,
"loss": 10.4141,
"step": 155
},
{
"epoch": 0.01126126126126126,
"eval_accuracy": 0.07035488818332729,
"eval_loss": 10.3984375,
"eval_runtime": 266.8902,
"eval_samples_per_second": 126.52,
"eval_steps_per_second": 2.638,
"step": 155
},
{
"epoch": 0.011333914559721011,
"grad_norm": 0.8969941735267639,
"learning_rate": 9.998990119151411e-06,
"loss": 10.4062,
"step": 156
},
{
"epoch": 0.011333914559721011,
"eval_accuracy": 0.07053625296765909,
"eval_loss": 10.3984375,
"eval_runtime": 266.2466,
"eval_samples_per_second": 126.826,
"eval_steps_per_second": 2.644,
"step": 156
},
{
"epoch": 0.011406567858180762,
"grad_norm": 0.9160457253456116,
"learning_rate": 9.998982853821563e-06,
"loss": 10.4062,
"step": 157
},
{
"epoch": 0.011406567858180762,
"eval_accuracy": 0.07071909414528711,
"eval_loss": 10.390625,
"eval_runtime": 266.1843,
"eval_samples_per_second": 126.856,
"eval_steps_per_second": 2.645,
"step": 157
},
{
"epoch": 0.01147922115664051,
"grad_norm": 0.9947687387466431,
"learning_rate": 9.998975588491719e-06,
"loss": 10.3906,
"step": 158
},
{
"epoch": 0.01147922115664051,
"eval_accuracy": 0.07083940572448631,
"eval_loss": 10.390625,
"eval_runtime": 266.0309,
"eval_samples_per_second": 126.929,
"eval_steps_per_second": 2.646,
"step": 158
},
{
"epoch": 0.011551874455100261,
"grad_norm": 0.9923911690711975,
"learning_rate": 9.998968323161873e-06,
"loss": 10.3906,
"step": 159
},
{
"epoch": 0.011551874455100261,
"eval_accuracy": 0.07099616395388084,
"eval_loss": 10.390625,
"eval_runtime": 265.8514,
"eval_samples_per_second": 127.015,
"eval_steps_per_second": 2.648,
"step": 159
},
{
"epoch": 0.011624527753560012,
"grad_norm": 0.8818120360374451,
"learning_rate": 9.998961057832027e-06,
"loss": 10.3984,
"step": 160
},
{
"epoch": 0.011624527753560012,
"eval_accuracy": 0.07109409804253118,
"eval_loss": 10.390625,
"eval_runtime": 265.1784,
"eval_samples_per_second": 127.337,
"eval_steps_per_second": 2.655,
"step": 160
},
{
"epoch": 0.011697181052019761,
"grad_norm": 0.8733471035957336,
"learning_rate": 9.99895379250218e-06,
"loss": 10.3984,
"step": 161
},
{
"epoch": 0.011697181052019761,
"eval_accuracy": 0.07113795560809585,
"eval_loss": 10.390625,
"eval_runtime": 266.4251,
"eval_samples_per_second": 126.741,
"eval_steps_per_second": 2.642,
"step": 161
},
{
"epoch": 0.011769834350479512,
"grad_norm": 0.8769287467002869,
"learning_rate": 9.998946527172335e-06,
"loss": 10.3906,
"step": 162
},
{
"epoch": 0.011769834350479512,
"eval_accuracy": 0.07117098628948813,
"eval_loss": 10.3828125,
"eval_runtime": 266.1962,
"eval_samples_per_second": 126.85,
"eval_steps_per_second": 2.645,
"step": 162
},
{
"epoch": 0.011842487648939262,
"grad_norm": 0.8957408666610718,
"learning_rate": 9.998939261842489e-06,
"loss": 10.3906,
"step": 163
},
{
"epoch": 0.011842487648939262,
"eval_accuracy": 0.07124170842326667,
"eval_loss": 10.3828125,
"eval_runtime": 266.0683,
"eval_samples_per_second": 126.911,
"eval_steps_per_second": 2.646,
"step": 163
},
{
"epoch": 0.011915140947399011,
"grad_norm": 0.9207865595817566,
"learning_rate": 9.998931996512643e-06,
"loss": 10.3906,
"step": 164
},
{
"epoch": 0.011915140947399011,
"eval_accuracy": 0.07135113522051714,
"eval_loss": 10.3828125,
"eval_runtime": 266.3681,
"eval_samples_per_second": 126.768,
"eval_steps_per_second": 2.643,
"step": 164
},
{
"epoch": 0.011987794245858762,
"grad_norm": 1.0381028652191162,
"learning_rate": 9.998924731182797e-06,
"loss": 10.3828,
"step": 165
},
{
"epoch": 0.011987794245858762,
"eval_accuracy": 0.0714838079749613,
"eval_loss": 10.3828125,
"eval_runtime": 266.8907,
"eval_samples_per_second": 126.52,
"eval_steps_per_second": 2.638,
"step": 165
},
{
"epoch": 0.012060447544318512,
"grad_norm": 1.0229851007461548,
"learning_rate": 9.99891746585295e-06,
"loss": 10.375,
"step": 166
},
{
"epoch": 0.012060447544318512,
"eval_accuracy": 0.07162267579147201,
"eval_loss": 10.375,
"eval_runtime": 268.0765,
"eval_samples_per_second": 125.96,
"eval_steps_per_second": 2.626,
"step": 166
},
{
"epoch": 0.012133100842778263,
"grad_norm": 0.8751774430274963,
"learning_rate": 9.998910200523105e-06,
"loss": 10.3828,
"step": 167
},
{
"epoch": 0.012133100842778263,
"eval_accuracy": 0.07171681757577321,
"eval_loss": 10.375,
"eval_runtime": 266.5398,
"eval_samples_per_second": 126.687,
"eval_steps_per_second": 2.641,
"step": 167
},
{
"epoch": 0.012205754141238012,
"grad_norm": 0.9067946672439575,
"learning_rate": 9.998902935193258e-06,
"loss": 10.3828,
"step": 168
},
{
"epoch": 0.012205754141238012,
"eval_accuracy": 0.07183573960834065,
"eval_loss": 10.375,
"eval_runtime": 269.0409,
"eval_samples_per_second": 125.509,
"eval_steps_per_second": 2.617,
"step": 168
},
{
"epoch": 0.012278407439697762,
"grad_norm": 0.8819664120674133,
"learning_rate": 9.998895669863412e-06,
"loss": 10.3828,
"step": 169
},
{
"epoch": 0.012278407439697762,
"eval_accuracy": 0.07194832183440072,
"eval_loss": 10.375,
"eval_runtime": 269.4359,
"eval_samples_per_second": 125.325,
"eval_steps_per_second": 2.613,
"step": 169
},
{
"epoch": 0.012351060738157513,
"grad_norm": 0.8808407783508301,
"learning_rate": 9.998888404533566e-06,
"loss": 10.3828,
"step": 170
},
{
"epoch": 0.012351060738157513,
"eval_accuracy": 0.0720716440979687,
"eval_loss": 10.375,
"eval_runtime": 269.1418,
"eval_samples_per_second": 125.462,
"eval_steps_per_second": 2.616,
"step": 170
},
{
"epoch": 0.012423714036617262,
"grad_norm": 0.9482495188713074,
"learning_rate": 9.99888113920372e-06,
"loss": 10.3672,
"step": 171
},
{
"epoch": 0.012423714036617262,
"eval_accuracy": 0.07210840918593382,
"eval_loss": 10.3671875,
"eval_runtime": 268.1791,
"eval_samples_per_second": 125.912,
"eval_steps_per_second": 2.625,
"step": 171
},
{
"epoch": 0.012496367335077013,
"grad_norm": 0.8792570233345032,
"learning_rate": 9.998873873873874e-06,
"loss": 10.375,
"step": 172
},
{
"epoch": 0.012496367335077013,
"eval_accuracy": 0.07209656909067576,
"eval_loss": 10.3671875,
"eval_runtime": 267.4096,
"eval_samples_per_second": 126.274,
"eval_steps_per_second": 2.633,
"step": 172
},
{
"epoch": 0.012569020633536763,
"grad_norm": 1.0035219192504883,
"learning_rate": 9.998866608544028e-06,
"loss": 10.3594,
"step": 173
},
{
"epoch": 0.012569020633536763,
"eval_accuracy": 0.07205164041624912,
"eval_loss": 10.3671875,
"eval_runtime": 267.2327,
"eval_samples_per_second": 126.358,
"eval_steps_per_second": 2.634,
"step": 173
},
{
"epoch": 0.012641673931996512,
"grad_norm": 0.9102580547332764,
"learning_rate": 9.998859343214182e-06,
"loss": 10.375,
"step": 174
},
{
"epoch": 0.012641673931996512,
"eval_accuracy": 0.07202917607903579,
"eval_loss": 10.3671875,
"eval_runtime": 267.9463,
"eval_samples_per_second": 126.022,
"eval_steps_per_second": 2.627,
"step": 174
},
{
"epoch": 0.012714327230456263,
"grad_norm": 0.9545760154724121,
"learning_rate": 9.998852077884336e-06,
"loss": 10.3594,
"step": 175
},
{
"epoch": 0.012714327230456263,
"eval_accuracy": 0.0720582697116381,
"eval_loss": 10.359375,
"eval_runtime": 266.8569,
"eval_samples_per_second": 126.536,
"eval_steps_per_second": 2.638,
"step": 175
},
{
"epoch": 0.012786980528916013,
"grad_norm": 0.9329653978347778,
"learning_rate": 9.998844812554492e-06,
"loss": 10.3672,
"step": 176
},
{
"epoch": 0.012786980528916013,
"eval_accuracy": 0.07217727859087003,
"eval_loss": 10.359375,
"eval_runtime": 265.4766,
"eval_samples_per_second": 127.194,
"eval_steps_per_second": 2.652,
"step": 176
},
{
"epoch": 0.012859633827375762,
"grad_norm": 0.903916597366333,
"learning_rate": 9.998837547224644e-06,
"loss": 10.375,
"step": 177
},
{
"epoch": 0.012859633827375762,
"eval_accuracy": 0.07231886760286792,
"eval_loss": 10.359375,
"eval_runtime": 266.8553,
"eval_samples_per_second": 126.537,
"eval_steps_per_second": 2.638,
"step": 177
},
{
"epoch": 0.012932287125835513,
"grad_norm": 0.9190238118171692,
"learning_rate": 9.9988302818948e-06,
"loss": 10.3672,
"step": 178
},
{
"epoch": 0.012932287125835513,
"eval_accuracy": 0.07257023079877423,
"eval_loss": 10.359375,
"eval_runtime": 266.7383,
"eval_samples_per_second": 126.592,
"eval_steps_per_second": 2.639,
"step": 178
},
{
"epoch": 0.013004940424295263,
"grad_norm": 0.8731828927993774,
"learning_rate": 9.998823016564952e-06,
"loss": 10.3672,
"step": 179
},
{
"epoch": 0.013004940424295263,
"eval_accuracy": 0.0727295075814388,
"eval_loss": 10.359375,
"eval_runtime": 266.7924,
"eval_samples_per_second": 126.567,
"eval_steps_per_second": 2.639,
"step": 179
},
{
"epoch": 0.013077593722755012,
"grad_norm": 0.8964665532112122,
"learning_rate": 9.998815751235107e-06,
"loss": 10.3594,
"step": 180
},
{
"epoch": 0.013077593722755012,
"eval_accuracy": 0.0728463452940586,
"eval_loss": 10.3515625,
"eval_runtime": 264.1125,
"eval_samples_per_second": 127.851,
"eval_steps_per_second": 2.666,
"step": 180
},
{
"epoch": 0.013150247021214763,
"grad_norm": 0.8609874844551086,
"learning_rate": 9.998808485905261e-06,
"loss": 10.3672,
"step": 181
},
{
"epoch": 0.013150247021214763,
"eval_accuracy": 0.07293759218954365,
"eval_loss": 10.3515625,
"eval_runtime": 264.4015,
"eval_samples_per_second": 127.711,
"eval_steps_per_second": 2.663,
"step": 181
},
{
"epoch": 0.013222900319674514,
"grad_norm": 0.9078623056411743,
"learning_rate": 9.998801220575415e-06,
"loss": 10.3594,
"step": 182
},
{
"epoch": 0.013222900319674514,
"eval_accuracy": 0.0730243230584755,
"eval_loss": 10.3515625,
"eval_runtime": 265.1597,
"eval_samples_per_second": 127.346,
"eval_steps_per_second": 2.655,
"step": 182
},
{
"epoch": 0.013295553618134264,
"grad_norm": 0.9211888909339905,
"learning_rate": 9.99879395524557e-06,
"loss": 10.3516,
"step": 183
},
{
"epoch": 0.013295553618134264,
"eval_accuracy": 0.07311939120719788,
"eval_loss": 10.3515625,
"eval_runtime": 265.0761,
"eval_samples_per_second": 127.386,
"eval_steps_per_second": 2.656,
"step": 183
},
{
"epoch": 0.013368206916594013,
"grad_norm": 0.9223811030387878,
"learning_rate": 9.998786689915723e-06,
"loss": 10.3594,
"step": 184
},
{
"epoch": 0.013368206916594013,
"eval_accuracy": 0.07324948751059565,
"eval_loss": 10.3515625,
"eval_runtime": 267.2168,
"eval_samples_per_second": 126.366,
"eval_steps_per_second": 2.635,
"step": 184
},
{
"epoch": 0.013440860215053764,
"grad_norm": 0.8940264582633972,
"learning_rate": 9.998779424585877e-06,
"loss": 10.3516,
"step": 185
},
{
"epoch": 0.013440860215053764,
"eval_accuracy": 0.07333268661517181,
"eval_loss": 10.34375,
"eval_runtime": 266.2508,
"eval_samples_per_second": 126.824,
"eval_steps_per_second": 2.644,
"step": 185
},
{
"epoch": 0.013513513513513514,
"grad_norm": 0.8663957118988037,
"learning_rate": 9.998772159256031e-06,
"loss": 10.3516,
"step": 186
},
{
"epoch": 0.013513513513513514,
"eval_accuracy": 0.07334869535032512,
"eval_loss": 10.34375,
"eval_runtime": 268.3928,
"eval_samples_per_second": 125.812,
"eval_steps_per_second": 2.623,
"step": 186
},
{
"epoch": 0.013586166811973263,
"grad_norm": 0.899961531162262,
"learning_rate": 9.998764893926185e-06,
"loss": 10.3438,
"step": 187
},
{
"epoch": 0.013586166811973263,
"eval_accuracy": 0.07337804952292087,
"eval_loss": 10.34375,
"eval_runtime": 268.2225,
"eval_samples_per_second": 125.892,
"eval_steps_per_second": 2.625,
"step": 187
},
{
"epoch": 0.013658820110433014,
"grad_norm": 0.8721891045570374,
"learning_rate": 9.998757628596339e-06,
"loss": 10.3516,
"step": 188
},
{
"epoch": 0.013658820110433014,
"eval_accuracy": 0.07342506251729515,
"eval_loss": 10.34375,
"eval_runtime": 268.3043,
"eval_samples_per_second": 125.853,
"eval_steps_per_second": 2.624,
"step": 188
},
{
"epoch": 0.013731473408892764,
"grad_norm": 0.9019783735275269,
"learning_rate": 9.998750363266493e-06,
"loss": 10.3516,
"step": 189
},
{
"epoch": 0.013731473408892764,
"eval_accuracy": 0.07348243921363125,
"eval_loss": 10.3359375,
"eval_runtime": 267.616,
"eval_samples_per_second": 126.177,
"eval_steps_per_second": 2.631,
"step": 189
},
{
"epoch": 0.013804126707352513,
"grad_norm": 0.9109626412391663,
"learning_rate": 9.998743097936647e-06,
"loss": 10.3438,
"step": 190
},
{
"epoch": 0.013804126707352513,
"eval_accuracy": 0.07349436615555378,
"eval_loss": 10.3359375,
"eval_runtime": 268.802,
"eval_samples_per_second": 125.62,
"eval_steps_per_second": 2.619,
"step": 190
},
{
"epoch": 0.013876780005812264,
"grad_norm": 0.8707013130187988,
"learning_rate": 9.998735832606801e-06,
"loss": 10.3516,
"step": 191
},
{
"epoch": 0.013876780005812264,
"eval_accuracy": 0.07354867426974475,
"eval_loss": 10.3359375,
"eval_runtime": 269.0333,
"eval_samples_per_second": 125.512,
"eval_steps_per_second": 2.617,
"step": 191
},
{
"epoch": 0.013949433304272014,
"grad_norm": 0.9611899256706238,
"learning_rate": 9.998728567276955e-06,
"loss": 10.3359,
"step": 192
},
{
"epoch": 0.013949433304272014,
"eval_accuracy": 0.07366001169361389,
"eval_loss": 10.3359375,
"eval_runtime": 266.4167,
"eval_samples_per_second": 126.745,
"eval_steps_per_second": 2.642,
"step": 192
},
{
"epoch": 0.014022086602731763,
"grad_norm": 0.8997408151626587,
"learning_rate": 9.99872130194711e-06,
"loss": 10.3359,
"step": 193
},
{
"epoch": 0.014022086602731763,
"eval_accuracy": 0.07367448613769463,
"eval_loss": 10.3359375,
"eval_runtime": 267.3071,
"eval_samples_per_second": 126.323,
"eval_steps_per_second": 2.634,
"step": 193
},
{
"epoch": 0.014094739901191514,
"grad_norm": 0.8796170949935913,
"learning_rate": 9.998714036617263e-06,
"loss": 10.3359,
"step": 194
},
{
"epoch": 0.014094739901191514,
"eval_accuracy": 0.07364093437631546,
"eval_loss": 10.328125,
"eval_runtime": 267.1483,
"eval_samples_per_second": 126.398,
"eval_steps_per_second": 2.635,
"step": 194
},
{
"epoch": 0.014167393199651265,
"grad_norm": 0.9038819670677185,
"learning_rate": 9.998706771287417e-06,
"loss": 10.3359,
"step": 195
},
{
"epoch": 0.014167393199651265,
"eval_accuracy": 0.0735885658376313,
"eval_loss": 10.328125,
"eval_runtime": 267.3865,
"eval_samples_per_second": 126.285,
"eval_steps_per_second": 2.633,
"step": 195
},
{
"epoch": 0.014240046498111014,
"grad_norm": 0.9091231822967529,
"learning_rate": 9.99869950595757e-06,
"loss": 10.3359,
"step": 196
},
{
"epoch": 0.014240046498111014,
"eval_accuracy": 0.0736078168482587,
"eval_loss": 10.328125,
"eval_runtime": 267.9665,
"eval_samples_per_second": 126.012,
"eval_steps_per_second": 2.627,
"step": 196
},
{
"epoch": 0.014312699796570764,
"grad_norm": 0.9065552949905396,
"learning_rate": 9.998692240627725e-06,
"loss": 10.3281,
"step": 197
},
{
"epoch": 0.014312699796570764,
"eval_accuracy": 0.07369920848818455,
"eval_loss": 10.328125,
"eval_runtime": 267.2862,
"eval_samples_per_second": 126.333,
"eval_steps_per_second": 2.634,
"step": 197
},
{
"epoch": 0.014385353095030515,
"grad_norm": 0.8987371921539307,
"learning_rate": 9.99868497529788e-06,
"loss": 10.3359,
"step": 198
},
{
"epoch": 0.014385353095030515,
"eval_accuracy": 0.07384696361336085,
"eval_loss": 10.328125,
"eval_runtime": 267.5117,
"eval_samples_per_second": 126.226,
"eval_steps_per_second": 2.632,
"step": 198
},
{
"epoch": 0.014458006393490264,
"grad_norm": 0.891822338104248,
"learning_rate": 9.998677709968032e-06,
"loss": 10.3203,
"step": 199
},
{
"epoch": 0.014458006393490264,
"eval_accuracy": 0.07396643567480336,
"eval_loss": 10.3203125,
"eval_runtime": 265.9448,
"eval_samples_per_second": 126.97,
"eval_steps_per_second": 2.647,
"step": 199
},
{
"epoch": 0.014530659691950014,
"grad_norm": 0.8724116683006287,
"learning_rate": 9.998670444638188e-06,
"loss": 10.3359,
"step": 200
},
{
"epoch": 0.014530659691950014,
"eval_accuracy": 0.07408226017633752,
"eval_loss": 10.3203125,
"eval_runtime": 266.1838,
"eval_samples_per_second": 126.856,
"eval_steps_per_second": 2.645,
"step": 200
},
{
"epoch": 0.014603312990409765,
"grad_norm": 0.8940464854240417,
"learning_rate": 9.99866317930834e-06,
"loss": 10.3359,
"step": 201
},
{
"epoch": 0.014603312990409765,
"eval_accuracy": 0.07420095061779967,
"eval_loss": 10.3203125,
"eval_runtime": 264.9987,
"eval_samples_per_second": 127.423,
"eval_steps_per_second": 2.657,
"step": 201
},
{
"epoch": 0.014675966288869515,
"grad_norm": 0.9207845330238342,
"learning_rate": 9.998655913978496e-06,
"loss": 10.3281,
"step": 202
},
{
"epoch": 0.014675966288869515,
"eval_accuracy": 0.0742804732135793,
"eval_loss": 10.3203125,
"eval_runtime": 264.7467,
"eval_samples_per_second": 127.545,
"eval_steps_per_second": 2.659,
"step": 202
},
{
"epoch": 0.014748619587329264,
"grad_norm": 0.8840688467025757,
"learning_rate": 9.998648648648648e-06,
"loss": 10.3203,
"step": 203
},
{
"epoch": 0.014748619587329264,
"eval_accuracy": 0.07432360705693994,
"eval_loss": 10.3125,
"eval_runtime": 264.0804,
"eval_samples_per_second": 127.866,
"eval_steps_per_second": 2.666,
"step": 203
},
{
"epoch": 0.014821272885789015,
"grad_norm": 0.8885826468467712,
"learning_rate": 9.998641383318804e-06,
"loss": 10.3203,
"step": 204
},
{
"epoch": 0.014821272885789015,
"eval_accuracy": 0.07433359442335566,
"eval_loss": 10.3125,
"eval_runtime": 266.5522,
"eval_samples_per_second": 126.681,
"eval_steps_per_second": 2.641,
"step": 204
},
{
"epoch": 0.014893926184248766,
"grad_norm": 0.897081732749939,
"learning_rate": 9.998634117988958e-06,
"loss": 10.3281,
"step": 205
},
{
"epoch": 0.014893926184248766,
"eval_accuracy": 0.07427410445818378,
"eval_loss": 10.3125,
"eval_runtime": 267.4051,
"eval_samples_per_second": 126.277,
"eval_steps_per_second": 2.633,
"step": 205
},
{
"epoch": 0.014966579482708514,
"grad_norm": 0.9606081247329712,
"learning_rate": 9.998626852659112e-06,
"loss": 10.3125,
"step": 206
},
{
"epoch": 0.014966579482708514,
"eval_accuracy": 0.07412779677741556,
"eval_loss": 10.3125,
"eval_runtime": 265.1816,
"eval_samples_per_second": 127.335,
"eval_steps_per_second": 2.655,
"step": 206
},
{
"epoch": 0.015039232781168265,
"grad_norm": 0.9314731955528259,
"learning_rate": 9.998619587329266e-06,
"loss": 10.3125,
"step": 207
},
{
"epoch": 0.015039232781168265,
"eval_accuracy": 0.07401284074252625,
"eval_loss": 10.3125,
"eval_runtime": 266.8876,
"eval_samples_per_second": 126.521,
"eval_steps_per_second": 2.638,
"step": 207
},
{
"epoch": 0.015111886079628016,
"grad_norm": 0.9583424925804138,
"learning_rate": 9.99861232199942e-06,
"loss": 10.3047,
"step": 208
},
{
"epoch": 0.015111886079628016,
"eval_accuracy": 0.07404071852182577,
"eval_loss": 10.3046875,
"eval_runtime": 266.703,
"eval_samples_per_second": 126.609,
"eval_steps_per_second": 2.64,
"step": 208
},
{
"epoch": 0.015184539378087765,
"grad_norm": 0.9071934223175049,
"learning_rate": 9.998605056669574e-06,
"loss": 10.3125,
"step": 209
},
{
"epoch": 0.015184539378087765,
"eval_accuracy": 0.07412215174422407,
"eval_loss": 10.3046875,
"eval_runtime": 266.5993,
"eval_samples_per_second": 126.658,
"eval_steps_per_second": 2.641,
"step": 209
},
{
"epoch": 0.015257192676547515,
"grad_norm": 0.8879753351211548,
"learning_rate": 9.998597791339728e-06,
"loss": 10.3125,
"step": 210
},
{
"epoch": 0.015257192676547515,
"eval_accuracy": 0.07420225331776693,
"eval_loss": 10.3046875,
"eval_runtime": 267.0745,
"eval_samples_per_second": 126.433,
"eval_steps_per_second": 2.636,
"step": 210
},
{
"epoch": 0.015329845975007266,
"grad_norm": 0.9064663052558899,
"learning_rate": 9.998590526009882e-06,
"loss": 10.3203,
"step": 211
},
{
"epoch": 0.015329845975007266,
"eval_accuracy": 0.07434856099853515,
"eval_loss": 10.3046875,
"eval_runtime": 267.2364,
"eval_samples_per_second": 126.356,
"eval_steps_per_second": 2.634,
"step": 211
},
{
"epoch": 0.015402499273467015,
"grad_norm": 0.888227105140686,
"learning_rate": 9.998583260680035e-06,
"loss": 10.3047,
"step": 212
},
{
"epoch": 0.015402499273467015,
"eval_accuracy": 0.07441022213031916,
"eval_loss": 10.3046875,
"eval_runtime": 267.6856,
"eval_samples_per_second": 126.144,
"eval_steps_per_second": 2.63,
"step": 212
},
{
"epoch": 0.015475152571926765,
"grad_norm": 0.873029887676239,
"learning_rate": 9.99857599535019e-06,
"loss": 10.3203,
"step": 213
},
{
"epoch": 0.015475152571926765,
"eval_accuracy": 0.07452069108754343,
"eval_loss": 10.296875,
"eval_runtime": 267.3911,
"eval_samples_per_second": 126.283,
"eval_steps_per_second": 2.633,
"step": 213
},
{
"epoch": 0.015547805870386516,
"grad_norm": 0.9147621989250183,
"learning_rate": 9.998568730020343e-06,
"loss": 10.3125,
"step": 214
},
{
"epoch": 0.015547805870386516,
"eval_accuracy": 0.07470987207167884,
"eval_loss": 10.296875,
"eval_runtime": 267.6606,
"eval_samples_per_second": 126.156,
"eval_steps_per_second": 2.63,
"step": 214
},
{
"epoch": 0.015620459168846265,
"grad_norm": 0.9260271787643433,
"learning_rate": 9.998561464690499e-06,
"loss": 10.3047,
"step": 215
},
{
"epoch": 0.015620459168846265,
"eval_accuracy": 0.07488478125395062,
"eval_loss": 10.296875,
"eval_runtime": 266.1126,
"eval_samples_per_second": 126.89,
"eval_steps_per_second": 2.645,
"step": 215
},
{
"epoch": 0.015693112467306015,
"grad_norm": 0.9096031785011292,
"learning_rate": 9.998554199360651e-06,
"loss": 10.2969,
"step": 216
},
{
"epoch": 0.015693112467306015,
"eval_accuracy": 0.07498534969142368,
"eval_loss": 10.296875,
"eval_runtime": 266.7055,
"eval_samples_per_second": 126.608,
"eval_steps_per_second": 2.64,
"step": 216
},
{
"epoch": 0.015765765765765764,
"grad_norm": 0.9063442945480347,
"learning_rate": 9.998546934030807e-06,
"loss": 10.3047,
"step": 217
},
{
"epoch": 0.015765765765765764,
"eval_accuracy": 0.07498459702033147,
"eval_loss": 10.296875,
"eval_runtime": 267.4598,
"eval_samples_per_second": 126.251,
"eval_steps_per_second": 2.632,
"step": 217
},
{
"epoch": 0.015838419064225517,
"grad_norm": 0.9258260130882263,
"learning_rate": 9.998539668700959e-06,
"loss": 10.2969,
"step": 218
},
{
"epoch": 0.015838419064225517,
"eval_accuracy": 0.0748771966452523,
"eval_loss": 10.2890625,
"eval_runtime": 267.3776,
"eval_samples_per_second": 126.29,
"eval_steps_per_second": 2.633,
"step": 218
},
{
"epoch": 0.015911072362685266,
"grad_norm": 0.9510604739189148,
"learning_rate": 9.998532403371115e-06,
"loss": 10.2891,
"step": 219
},
{
"epoch": 0.015911072362685266,
"eval_accuracy": 0.07466393018616653,
"eval_loss": 10.2890625,
"eval_runtime": 266.9849,
"eval_samples_per_second": 126.475,
"eval_steps_per_second": 2.637,
"step": 219
},
{
"epoch": 0.015983725661145014,
"grad_norm": 0.888974130153656,
"learning_rate": 9.998525138041267e-06,
"loss": 10.2969,
"step": 220
},
{
"epoch": 0.015983725661145014,
"eval_accuracy": 0.07440356388604201,
"eval_loss": 10.2890625,
"eval_runtime": 265.0185,
"eval_samples_per_second": 127.414,
"eval_steps_per_second": 2.656,
"step": 220
},
{
"epoch": 0.016056378959604767,
"grad_norm": 0.9004181623458862,
"learning_rate": 9.998517872711423e-06,
"loss": 10.2969,
"step": 221
},
{
"epoch": 0.016056378959604767,
"eval_accuracy": 0.07420781150429394,
"eval_loss": 10.2890625,
"eval_runtime": 265.4243,
"eval_samples_per_second": 127.219,
"eval_steps_per_second": 2.652,
"step": 221
},
{
"epoch": 0.016129032258064516,
"grad_norm": 0.8965704441070557,
"learning_rate": 9.998510607381577e-06,
"loss": 10.2891,
"step": 222
},
{
"epoch": 0.016129032258064516,
"eval_accuracy": 0.07409245018497036,
"eval_loss": 10.2890625,
"eval_runtime": 265.3546,
"eval_samples_per_second": 127.252,
"eval_steps_per_second": 2.653,
"step": 222
},
{
"epoch": 0.016201685556524265,
"grad_norm": 0.9500789046287537,
"learning_rate": 9.998503342051729e-06,
"loss": 10.2891,
"step": 223
},
{
"epoch": 0.016201685556524265,
"eval_accuracy": 0.07416858576083511,
"eval_loss": 10.28125,
"eval_runtime": 265.6881,
"eval_samples_per_second": 127.093,
"eval_steps_per_second": 2.65,
"step": 223
},
{
"epoch": 0.016274338854984017,
"grad_norm": 0.9081275463104248,
"learning_rate": 9.998496076721884e-06,
"loss": 10.2891,
"step": 224
},
{
"epoch": 0.016274338854984017,
"eval_accuracy": 0.07432068321923563,
"eval_loss": 10.28125,
"eval_runtime": 265.8739,
"eval_samples_per_second": 127.004,
"eval_steps_per_second": 2.648,
"step": 224
},
{
"epoch": 0.016346992153443766,
"grad_norm": 0.9124018549919128,
"learning_rate": 9.998488811392037e-06,
"loss": 10.2891,
"step": 225
},
{
"epoch": 0.016346992153443766,
"eval_accuracy": 0.07457499920173441,
"eval_loss": 10.28125,
"eval_runtime": 265.6262,
"eval_samples_per_second": 127.122,
"eval_steps_per_second": 2.65,
"step": 225
},
{
"epoch": 0.016419645451903518,
"grad_norm": 0.8811033368110657,
"learning_rate": 9.998481546062192e-06,
"loss": 10.2969,
"step": 226
},
{
"epoch": 0.016419645451903518,
"eval_accuracy": 0.07480795090477,
"eval_loss": 10.28125,
"eval_runtime": 265.1697,
"eval_samples_per_second": 127.341,
"eval_steps_per_second": 2.655,
"step": 226
},
{
"epoch": 0.016492298750363267,
"grad_norm": 0.8931852579116821,
"learning_rate": 9.998474280732346e-06,
"loss": 10.2812,
"step": 227
},
{
"epoch": 0.016492298750363267,
"eval_accuracy": 0.07488660503390479,
"eval_loss": 10.2734375,
"eval_runtime": 265.5292,
"eval_samples_per_second": 127.169,
"eval_steps_per_second": 2.651,
"step": 227
},
{
"epoch": 0.016564952048823016,
"grad_norm": 0.9159207344055176,
"learning_rate": 9.9984670154025e-06,
"loss": 10.2891,
"step": 228
},
{
"epoch": 0.016564952048823016,
"eval_accuracy": 0.07503638658125239,
"eval_loss": 10.2734375,
"eval_runtime": 264.9834,
"eval_samples_per_second": 127.431,
"eval_steps_per_second": 2.657,
"step": 228
},
{
"epoch": 0.01663760534728277,
"grad_norm": 0.9291688203811646,
"learning_rate": 9.998459750072654e-06,
"loss": 10.2734,
"step": 229
},
{
"epoch": 0.01663760534728277,
"eval_accuracy": 0.0751111905082617,
"eval_loss": 10.2734375,
"eval_runtime": 265.3402,
"eval_samples_per_second": 127.259,
"eval_steps_per_second": 2.653,
"step": 229
},
{
"epoch": 0.016710258645742517,
"grad_norm": 0.8605514168739319,
"learning_rate": 9.998452484742808e-06,
"loss": 10.2969,
"step": 230
},
{
"epoch": 0.016710258645742517,
"eval_accuracy": 0.075049760967583,
"eval_loss": 10.2734375,
"eval_runtime": 266.7587,
"eval_samples_per_second": 126.583,
"eval_steps_per_second": 2.639,
"step": 230
},
{
"epoch": 0.016782911944202266,
"grad_norm": 0.9553351998329163,
"learning_rate": 9.998445219412962e-06,
"loss": 10.2656,
"step": 231
},
{
"epoch": 0.016782911944202266,
"eval_accuracy": 0.07486694873884313,
"eval_loss": 10.2734375,
"eval_runtime": 266.6123,
"eval_samples_per_second": 126.652,
"eval_steps_per_second": 2.641,
"step": 231
},
{
"epoch": 0.01685556524266202,
"grad_norm": 0.9175562262535095,
"learning_rate": 9.998437954083116e-06,
"loss": 10.2734,
"step": 232
},
{
"epoch": 0.01685556524266202,
"eval_accuracy": 0.07473213376667504,
"eval_loss": 10.265625,
"eval_runtime": 267.4978,
"eval_samples_per_second": 126.233,
"eval_steps_per_second": 2.632,
"step": 232
},
{
"epoch": 0.016928218541121767,
"grad_norm": 0.9021575450897217,
"learning_rate": 9.99843068875327e-06,
"loss": 10.2734,
"step": 233
},
{
"epoch": 0.016928218541121767,
"eval_accuracy": 0.0746572719418894,
"eval_loss": 10.265625,
"eval_runtime": 267.5513,
"eval_samples_per_second": 126.208,
"eval_steps_per_second": 2.631,
"step": 233
},
{
"epoch": 0.017000871839581516,
"grad_norm": 0.8851971626281738,
"learning_rate": 9.998423423423424e-06,
"loss": 10.2734,
"step": 234
},
{
"epoch": 0.017000871839581516,
"eval_accuracy": 0.07463700772017634,
"eval_loss": 10.265625,
"eval_runtime": 267.13,
"eval_samples_per_second": 126.407,
"eval_steps_per_second": 2.635,
"step": 234
},
{
"epoch": 0.01707352513804127,
"grad_norm": 0.9394397139549255,
"learning_rate": 9.998416158093578e-06,
"loss": 10.2656,
"step": 235
},
{
"epoch": 0.01707352513804127,
"eval_accuracy": 0.07467038578822655,
"eval_loss": 10.265625,
"eval_runtime": 265.7106,
"eval_samples_per_second": 127.082,
"eval_steps_per_second": 2.649,
"step": 235
},
{
"epoch": 0.017146178436501017,
"grad_norm": 0.9121464490890503,
"learning_rate": 9.998408892763732e-06,
"loss": 10.2656,
"step": 236
},
{
"epoch": 0.017146178436501017,
"eval_accuracy": 0.07478282326984581,
"eval_loss": 10.265625,
"eval_runtime": 267.6727,
"eval_samples_per_second": 126.15,
"eval_steps_per_second": 2.63,
"step": 236
},
{
"epoch": 0.017218831734960766,
"grad_norm": 0.8910766839981079,
"learning_rate": 9.998401627433887e-06,
"loss": 10.2734,
"step": 237
},
{
"epoch": 0.017218831734960766,
"eval_accuracy": 0.07491986730640236,
"eval_loss": 10.2578125,
"eval_runtime": 268.4345,
"eval_samples_per_second": 125.792,
"eval_steps_per_second": 2.623,
"step": 237
},
{
"epoch": 0.01729148503342052,
"grad_norm": 0.9403276443481445,
"learning_rate": 9.99839436210404e-06,
"loss": 10.2656,
"step": 238
},
{
"epoch": 0.01729148503342052,
"eval_accuracy": 0.07518078363540195,
"eval_loss": 10.2578125,
"eval_runtime": 267.0058,
"eval_samples_per_second": 126.465,
"eval_steps_per_second": 2.637,
"step": 238
},
{
"epoch": 0.017364138331880268,
"grad_norm": 0.8892084956169128,
"learning_rate": 9.998387096774195e-06,
"loss": 10.2734,
"step": 239
},
{
"epoch": 0.017364138331880268,
"eval_accuracy": 0.07545047147751449,
"eval_loss": 10.2578125,
"eval_runtime": 265.2789,
"eval_samples_per_second": 127.289,
"eval_steps_per_second": 2.654,
"step": 239
},
{
"epoch": 0.017436791630340016,
"grad_norm": 0.9249821305274963,
"learning_rate": 9.998379831444348e-06,
"loss": 10.2578,
"step": 240
},
{
"epoch": 0.017436791630340016,
"eval_accuracy": 0.07559730023826962,
"eval_loss": 10.2578125,
"eval_runtime": 265.5188,
"eval_samples_per_second": 127.174,
"eval_steps_per_second": 2.651,
"step": 240
},
{
"epoch": 0.01750944492879977,
"grad_norm": 0.8785547614097595,
"learning_rate": 9.998372566114503e-06,
"loss": 10.2734,
"step": 241
},
{
"epoch": 0.01750944492879977,
"eval_accuracy": 0.07564480536374263,
"eval_loss": 10.2578125,
"eval_runtime": 265.4778,
"eval_samples_per_second": 127.193,
"eval_steps_per_second": 2.652,
"step": 241
},
{
"epoch": 0.017582098227259518,
"grad_norm": 0.9142479300498962,
"learning_rate": 9.998365300784656e-06,
"loss": 10.2656,
"step": 242
},
{
"epoch": 0.017582098227259518,
"eval_accuracy": 0.07562578594422054,
"eval_loss": 10.25,
"eval_runtime": 265.3917,
"eval_samples_per_second": 127.235,
"eval_steps_per_second": 2.653,
"step": 242
},
{
"epoch": 0.017654751525719267,
"grad_norm": 0.924387514591217,
"learning_rate": 9.998358035454811e-06,
"loss": 10.2578,
"step": 243
},
{
"epoch": 0.017654751525719267,
"eval_accuracy": 0.07555361636603392,
"eval_loss": 10.25,
"eval_runtime": 263.7272,
"eval_samples_per_second": 128.038,
"eval_steps_per_second": 2.669,
"step": 243
},
{
"epoch": 0.01772740482417902,
"grad_norm": 0.9198188185691833,
"learning_rate": 9.998350770124965e-06,
"loss": 10.2578,
"step": 244
},
{
"epoch": 0.01772740482417902,
"eval_accuracy": 0.07558369426083371,
"eval_loss": 10.25,
"eval_runtime": 264.6264,
"eval_samples_per_second": 127.603,
"eval_steps_per_second": 2.66,
"step": 244
},
{
"epoch": 0.017800058122638768,
"grad_norm": 0.9178450703620911,
"learning_rate": 9.998343504795119e-06,
"loss": 10.2578,
"step": 245
},
{
"epoch": 0.017800058122638768,
"eval_accuracy": 0.07555222681940216,
"eval_loss": 10.25,
"eval_runtime": 264.3962,
"eval_samples_per_second": 127.714,
"eval_steps_per_second": 2.663,
"step": 245
},
{
"epoch": 0.017872711421098517,
"grad_norm": 0.8939234614372253,
"learning_rate": 9.998336239465273e-06,
"loss": 10.2578,
"step": 246
},
{
"epoch": 0.017872711421098517,
"eval_accuracy": 0.07556374847689043,
"eval_loss": 10.25,
"eval_runtime": 264.4976,
"eval_samples_per_second": 127.665,
"eval_steps_per_second": 2.662,
"step": 246
},
{
"epoch": 0.01794536471955827,
"grad_norm": 0.9162428379058838,
"learning_rate": 9.998328974135427e-06,
"loss": 10.2578,
"step": 247
},
{
"epoch": 0.01794536471955827,
"eval_accuracy": 0.07567158308529202,
"eval_loss": 10.2421875,
"eval_runtime": 264.7261,
"eval_samples_per_second": 127.554,
"eval_steps_per_second": 2.659,
"step": 247
},
{
"epoch": 0.018018018018018018,
"grad_norm": 0.8900968432426453,
"learning_rate": 9.998321708805581e-06,
"loss": 10.2578,
"step": 248
},
{
"epoch": 0.018018018018018018,
"eval_accuracy": 0.07578150201364124,
"eval_loss": 10.2421875,
"eval_runtime": 264.8491,
"eval_samples_per_second": 127.495,
"eval_steps_per_second": 2.658,
"step": 248
},
{
"epoch": 0.018090671316477767,
"grad_norm": 0.9296072721481323,
"learning_rate": 9.998314443475735e-06,
"loss": 10.2422,
"step": 249
},
{
"epoch": 0.018090671316477767,
"eval_accuracy": 0.07588282312220648,
"eval_loss": 10.2421875,
"eval_runtime": 265.9596,
"eval_samples_per_second": 126.963,
"eval_steps_per_second": 2.647,
"step": 249
},
{
"epoch": 0.01816332461493752,
"grad_norm": 0.9414094686508179,
"learning_rate": 9.998307178145889e-06,
"loss": 10.2422,
"step": 250
},
{
"epoch": 0.01816332461493752,
"eval_accuracy": 0.07589431583080661,
"eval_loss": 10.2421875,
"eval_runtime": 265.5852,
"eval_samples_per_second": 127.142,
"eval_steps_per_second": 2.651,
"step": 250
},
{
"epoch": 0.018235977913397268,
"grad_norm": 0.9078280329704285,
"learning_rate": 9.998299912816043e-06,
"loss": 10.2422,
"step": 251
},
{
"epoch": 0.018235977913397268,
"eval_accuracy": 0.07588244678666038,
"eval_loss": 10.2421875,
"eval_runtime": 266.4837,
"eval_samples_per_second": 126.713,
"eval_steps_per_second": 2.642,
"step": 251
},
{
"epoch": 0.018308631211857017,
"grad_norm": 0.9042601585388184,
"learning_rate": 9.998292647486197e-06,
"loss": 10.2422,
"step": 252
},
{
"epoch": 0.018308631211857017,
"eval_accuracy": 0.07590384001501174,
"eval_loss": 10.234375,
"eval_runtime": 265.9021,
"eval_samples_per_second": 126.99,
"eval_steps_per_second": 2.648,
"step": 252
},
{
"epoch": 0.01838128451031677,
"grad_norm": 0.9029207825660706,
"learning_rate": 9.99828538215635e-06,
"loss": 10.2422,
"step": 253
},
{
"epoch": 0.01838128451031677,
"eval_accuracy": 0.0758776991690019,
"eval_loss": 10.234375,
"eval_runtime": 264.8706,
"eval_samples_per_second": 127.485,
"eval_steps_per_second": 2.658,
"step": 253
},
{
"epoch": 0.018453937808776518,
"grad_norm": 0.901042640209198,
"learning_rate": 9.998278116826505e-06,
"loss": 10.2422,
"step": 254
},
{
"epoch": 0.018453937808776518,
"eval_accuracy": 0.075944397407326,
"eval_loss": 10.234375,
"eval_runtime": 266.8196,
"eval_samples_per_second": 126.554,
"eval_steps_per_second": 2.638,
"step": 254
},
{
"epoch": 0.018526591107236267,
"grad_norm": 0.921441376209259,
"learning_rate": 9.998270851496658e-06,
"loss": 10.2422,
"step": 255
},
{
"epoch": 0.018526591107236267,
"eval_accuracy": 0.07608468371935663,
"eval_loss": 10.234375,
"eval_runtime": 266.5157,
"eval_samples_per_second": 126.698,
"eval_steps_per_second": 2.641,
"step": 255
},
{
"epoch": 0.01859924440569602,
"grad_norm": 0.9235514998435974,
"learning_rate": 9.998263586166812e-06,
"loss": 10.2422,
"step": 256
},
{
"epoch": 0.01859924440569602,
"eval_accuracy": 0.07609151565696273,
"eval_loss": 10.234375,
"eval_runtime": 265.5975,
"eval_samples_per_second": 127.136,
"eval_steps_per_second": 2.651,
"step": 256
},
{
"epoch": 0.018671897704155768,
"grad_norm": 0.8886791467666626,
"learning_rate": 9.998256320836966e-06,
"loss": 10.2422,
"step": 257
},
{
"epoch": 0.018671897704155768,
"eval_accuracy": 0.07604354734927914,
"eval_loss": 10.2265625,
"eval_runtime": 266.183,
"eval_samples_per_second": 126.856,
"eval_steps_per_second": 2.645,
"step": 257
},
{
"epoch": 0.018744551002615517,
"grad_norm": 0.8807479739189148,
"learning_rate": 9.99824905550712e-06,
"loss": 10.2422,
"step": 258
},
{
"epoch": 0.018744551002615517,
"eval_accuracy": 0.07599326313054261,
"eval_loss": 10.2265625,
"eval_runtime": 266.5367,
"eval_samples_per_second": 126.688,
"eval_steps_per_second": 2.641,
"step": 258
},
{
"epoch": 0.01881720430107527,
"grad_norm": 0.902275025844574,
"learning_rate": 9.998241790177274e-06,
"loss": 10.2344,
"step": 259
},
{
"epoch": 0.01881720430107527,
"eval_accuracy": 0.07592763600108049,
"eval_loss": 10.2265625,
"eval_runtime": 265.7243,
"eval_samples_per_second": 127.075,
"eval_steps_per_second": 2.649,
"step": 259
},
{
"epoch": 0.01888985759953502,
"grad_norm": 0.8911043405532837,
"learning_rate": 9.998234524847428e-06,
"loss": 10.2344,
"step": 260
},
{
"epoch": 0.01888985759953502,
"eval_accuracy": 0.07592697017665277,
"eval_loss": 10.2265625,
"eval_runtime": 265.6968,
"eval_samples_per_second": 127.088,
"eval_steps_per_second": 2.65,
"step": 260
},
{
"epoch": 0.01896251089799477,
"grad_norm": 0.9092383980751038,
"learning_rate": 9.998227259517584e-06,
"loss": 10.2266,
"step": 261
},
{
"epoch": 0.01896251089799477,
"eval_accuracy": 0.0759951448082731,
"eval_loss": 10.2265625,
"eval_runtime": 265.7199,
"eval_samples_per_second": 127.077,
"eval_steps_per_second": 2.649,
"step": 261
},
{
"epoch": 0.01903516419645452,
"grad_norm": 0.928420901298523,
"learning_rate": 9.998219994187736e-06,
"loss": 10.2188,
"step": 262
},
{
"epoch": 0.01903516419645452,
"eval_accuracy": 0.07604878709803636,
"eval_loss": 10.21875,
"eval_runtime": 265.3942,
"eval_samples_per_second": 127.233,
"eval_steps_per_second": 2.653,
"step": 262
},
{
"epoch": 0.01910781749491427,
"grad_norm": 0.9022119641304016,
"learning_rate": 9.998212728857892e-06,
"loss": 10.2266,
"step": 263
},
{
"epoch": 0.01910781749491427,
"eval_accuracy": 0.07615966133969491,
"eval_loss": 10.21875,
"eval_runtime": 265.8823,
"eval_samples_per_second": 127.0,
"eval_steps_per_second": 2.648,
"step": 263
},
{
"epoch": 0.01918047079337402,
"grad_norm": 0.8958231210708618,
"learning_rate": 9.998205463528044e-06,
"loss": 10.2266,
"step": 264
},
{
"epoch": 0.01918047079337402,
"eval_accuracy": 0.07618090982360545,
"eval_loss": 10.21875,
"eval_runtime": 264.9067,
"eval_samples_per_second": 127.468,
"eval_steps_per_second": 2.658,
"step": 264
},
{
"epoch": 0.01925312409183377,
"grad_norm": 0.9452428817749023,
"learning_rate": 9.9981981981982e-06,
"loss": 10.2188,
"step": 265
},
{
"epoch": 0.01925312409183377,
"eval_accuracy": 0.07618519425905335,
"eval_loss": 10.21875,
"eval_runtime": 265.6026,
"eval_samples_per_second": 127.134,
"eval_steps_per_second": 2.651,
"step": 265
},
{
"epoch": 0.01932577739029352,
"grad_norm": 0.8848786354064941,
"learning_rate": 9.998190932868354e-06,
"loss": 10.2266,
"step": 266
},
{
"epoch": 0.01932577739029352,
"eval_accuracy": 0.07619544216546252,
"eval_loss": 10.21875,
"eval_runtime": 264.5225,
"eval_samples_per_second": 127.653,
"eval_steps_per_second": 2.661,
"step": 266
},
{
"epoch": 0.01939843068875327,
"grad_norm": 0.89435875415802,
"learning_rate": 9.998183667538508e-06,
"loss": 10.2188,
"step": 267
},
{
"epoch": 0.01939843068875327,
"eval_accuracy": 0.07622743068688098,
"eval_loss": 10.2109375,
"eval_runtime": 263.8523,
"eval_samples_per_second": 127.977,
"eval_steps_per_second": 2.668,
"step": 267
},
{
"epoch": 0.01947108398721302,
"grad_norm": 0.9147275686264038,
"learning_rate": 9.998176402208661e-06,
"loss": 10.2109,
"step": 268
},
{
"epoch": 0.01947108398721302,
"eval_accuracy": 0.076288020709803,
"eval_loss": 10.2109375,
"eval_runtime": 264.7583,
"eval_samples_per_second": 127.539,
"eval_steps_per_second": 2.659,
"step": 268
},
{
"epoch": 0.01954373728567277,
"grad_norm": 0.9651392102241516,
"learning_rate": 9.998169136878815e-06,
"loss": 10.2109,
"step": 269
},
{
"epoch": 0.01954373728567277,
"eval_accuracy": 0.07618924710339596,
"eval_loss": 10.2109375,
"eval_runtime": 263.97,
"eval_samples_per_second": 127.92,
"eval_steps_per_second": 2.667,
"step": 269
},
{
"epoch": 0.01961639058413252,
"grad_norm": 0.9311045408248901,
"learning_rate": 9.99816187154897e-06,
"loss": 10.2109,
"step": 270
},
{
"epoch": 0.01961639058413252,
"eval_accuracy": 0.07608592852154757,
"eval_loss": 10.2109375,
"eval_runtime": 263.7922,
"eval_samples_per_second": 128.006,
"eval_steps_per_second": 2.669,
"step": 270
},
{
"epoch": 0.01968904388259227,
"grad_norm": 0.9008721113204956,
"learning_rate": 9.998154606219123e-06,
"loss": 10.2188,
"step": 271
},
{
"epoch": 0.01968904388259227,
"eval_accuracy": 0.07606844339309803,
"eval_loss": 10.2109375,
"eval_runtime": 263.8126,
"eval_samples_per_second": 127.996,
"eval_steps_per_second": 2.669,
"step": 271
},
{
"epoch": 0.01976169718105202,
"grad_norm": 0.9026838541030884,
"learning_rate": 9.998147340889277e-06,
"loss": 10.2109,
"step": 272
},
{
"epoch": 0.01976169718105202,
"eval_accuracy": 0.07600967715013018,
"eval_loss": 10.203125,
"eval_runtime": 265.1164,
"eval_samples_per_second": 127.367,
"eval_steps_per_second": 2.655,
"step": 272
},
{
"epoch": 0.01983435047951177,
"grad_norm": 0.9332795143127441,
"learning_rate": 9.998140075559431e-06,
"loss": 10.2188,
"step": 273
},
{
"epoch": 0.01983435047951177,
"eval_accuracy": 0.07610422421886563,
"eval_loss": 10.203125,
"eval_runtime": 266.0904,
"eval_samples_per_second": 126.9,
"eval_steps_per_second": 2.646,
"step": 273
},
{
"epoch": 0.01990700377797152,
"grad_norm": 0.8622159361839294,
"learning_rate": 9.998132810229585e-06,
"loss": 10.2266,
"step": 274
},
{
"epoch": 0.01990700377797152,
"eval_accuracy": 0.07618247306356617,
"eval_loss": 10.203125,
"eval_runtime": 265.3346,
"eval_samples_per_second": 127.262,
"eval_steps_per_second": 2.653,
"step": 274
},
{
"epoch": 0.01997965707643127,
"grad_norm": 0.8802500367164612,
"learning_rate": 9.998125544899739e-06,
"loss": 10.2188,
"step": 275
},
{
"epoch": 0.01997965707643127,
"eval_accuracy": 0.07621292729391207,
"eval_loss": 10.203125,
"eval_runtime": 263.5634,
"eval_samples_per_second": 128.117,
"eval_steps_per_second": 2.671,
"step": 275
},
{
"epoch": 0.02005231037489102,
"grad_norm": 0.8940539956092834,
"learning_rate": 9.998118279569893e-06,
"loss": 10.2109,
"step": 276
},
{
"epoch": 0.02005231037489102,
"eval_accuracy": 0.07612616747609205,
"eval_loss": 10.1953125,
"eval_runtime": 264.8033,
"eval_samples_per_second": 127.517,
"eval_steps_per_second": 2.659,
"step": 276
},
{
"epoch": 0.02012496367335077,
"grad_norm": 0.9146431684494019,
"learning_rate": 9.998111014240047e-06,
"loss": 10.2109,
"step": 277
},
{
"epoch": 0.02012496367335077,
"eval_accuracy": 0.07617022768387385,
"eval_loss": 10.1953125,
"eval_runtime": 264.0677,
"eval_samples_per_second": 127.873,
"eval_steps_per_second": 2.666,
"step": 277
},
{
"epoch": 0.02019761697181052,
"grad_norm": 0.9410712122917175,
"learning_rate": 9.998103748910201e-06,
"loss": 10.1953,
"step": 278
},
{
"epoch": 0.02019761697181052,
"eval_accuracy": 0.07624436578645546,
"eval_loss": 10.1953125,
"eval_runtime": 263.8716,
"eval_samples_per_second": 127.968,
"eval_steps_per_second": 2.668,
"step": 278
},
{
"epoch": 0.02027027027027027,
"grad_norm": 0.8908507227897644,
"learning_rate": 9.998096483580355e-06,
"loss": 10.2031,
"step": 279
},
{
"epoch": 0.02027027027027027,
"eval_accuracy": 0.07630177143167971,
"eval_loss": 10.1953125,
"eval_runtime": 263.3209,
"eval_samples_per_second": 128.235,
"eval_steps_per_second": 2.674,
"step": 279
},
{
"epoch": 0.02034292356873002,
"grad_norm": 0.9145093560218811,
"learning_rate": 9.998089218250509e-06,
"loss": 10.2188,
"step": 280
},
{
"epoch": 0.02034292356873002,
"eval_accuracy": 0.07648229669825482,
"eval_loss": 10.1953125,
"eval_runtime": 262.7765,
"eval_samples_per_second": 128.501,
"eval_steps_per_second": 2.679,
"step": 280
},
{
"epoch": 0.02041557686718977,
"grad_norm": 0.9509057402610779,
"learning_rate": 9.998081952920663e-06,
"loss": 10.1953,
"step": 281
},
{
"epoch": 0.02041557686718977,
"eval_accuracy": 0.07658419678458331,
"eval_loss": 10.1875,
"eval_runtime": 264.7797,
"eval_samples_per_second": 127.529,
"eval_steps_per_second": 2.659,
"step": 281
},
{
"epoch": 0.02048823016564952,
"grad_norm": 0.9156680107116699,
"learning_rate": 9.998074687590817e-06,
"loss": 10.1953,
"step": 282
},
{
"epoch": 0.02048823016564952,
"eval_accuracy": 0.07667509629341042,
"eval_loss": 10.1875,
"eval_runtime": 264.7774,
"eval_samples_per_second": 127.53,
"eval_steps_per_second": 2.659,
"step": 282
},
{
"epoch": 0.02056088346410927,
"grad_norm": 0.9039434194564819,
"learning_rate": 9.998067422260972e-06,
"loss": 10.2031,
"step": 283
},
{
"epoch": 0.02056088346410927,
"eval_accuracy": 0.07671203507470449,
"eval_loss": 10.1875,
"eval_runtime": 264.8864,
"eval_samples_per_second": 127.477,
"eval_steps_per_second": 2.658,
"step": 283
},
{
"epoch": 0.02063353676256902,
"grad_norm": 0.9945496320724487,
"learning_rate": 9.998060156931125e-06,
"loss": 10.1797,
"step": 284
},
{
"epoch": 0.02063353676256902,
"eval_accuracy": 0.0766314992678392,
"eval_loss": 10.1875,
"eval_runtime": 264.5241,
"eval_samples_per_second": 127.652,
"eval_steps_per_second": 2.661,
"step": 284
},
{
"epoch": 0.02070619006102877,
"grad_norm": 1.0437395572662354,
"learning_rate": 9.99805289160128e-06,
"loss": 10.1953,
"step": 285
},
{
"epoch": 0.02070619006102877,
"eval_accuracy": 0.07648548107595259,
"eval_loss": 10.1875,
"eval_runtime": 265.3496,
"eval_samples_per_second": 127.255,
"eval_steps_per_second": 2.653,
"step": 285
},
{
"epoch": 0.02077884335948852,
"grad_norm": 0.9568849802017212,
"learning_rate": 9.998045626271433e-06,
"loss": 10.1953,
"step": 286
},
{
"epoch": 0.02077884335948852,
"eval_accuracy": 0.07641924601983908,
"eval_loss": 10.1796875,
"eval_runtime": 264.6514,
"eval_samples_per_second": 127.59,
"eval_steps_per_second": 2.66,
"step": 286
},
{
"epoch": 0.02085149665794827,
"grad_norm": 0.9541803002357483,
"learning_rate": 9.998038360941588e-06,
"loss": 10.1875,
"step": 287
},
{
"epoch": 0.02085149665794827,
"eval_accuracy": 0.07642651219076761,
"eval_loss": 10.1796875,
"eval_runtime": 264.1269,
"eval_samples_per_second": 127.844,
"eval_steps_per_second": 2.665,
"step": 287
},
{
"epoch": 0.020924149956408022,
"grad_norm": 0.8972413539886475,
"learning_rate": 9.998031095611742e-06,
"loss": 10.1953,
"step": 288
},
{
"epoch": 0.020924149956408022,
"eval_accuracy": 0.0764929209402101,
"eval_loss": 10.1796875,
"eval_runtime": 262.3613,
"eval_samples_per_second": 128.704,
"eval_steps_per_second": 2.683,
"step": 288
},
{
"epoch": 0.02099680325486777,
"grad_norm": 0.9032208323478699,
"learning_rate": 9.998023830281896e-06,
"loss": 10.1875,
"step": 289
},
{
"epoch": 0.02099680325486777,
"eval_accuracy": 0.07653272566143215,
"eval_loss": 10.1796875,
"eval_runtime": 264.0713,
"eval_samples_per_second": 127.871,
"eval_steps_per_second": 2.666,
"step": 289
},
{
"epoch": 0.02106945655332752,
"grad_norm": 0.9714264869689941,
"learning_rate": 9.99801656495205e-06,
"loss": 10.1875,
"step": 290
},
{
"epoch": 0.02106945655332752,
"eval_accuracy": 0.07676037971793419,
"eval_loss": 10.1796875,
"eval_runtime": 265.0307,
"eval_samples_per_second": 127.408,
"eval_steps_per_second": 2.656,
"step": 290
},
{
"epoch": 0.021142109851787272,
"grad_norm": 0.9713578820228577,
"learning_rate": 9.998009299622204e-06,
"loss": 10.1797,
"step": 291
},
{
"epoch": 0.021142109851787272,
"eval_accuracy": 0.07695943227293267,
"eval_loss": 10.171875,
"eval_runtime": 264.6088,
"eval_samples_per_second": 127.611,
"eval_steps_per_second": 2.661,
"step": 291
},
{
"epoch": 0.02121476315024702,
"grad_norm": 0.947812557220459,
"learning_rate": 9.998002034292358e-06,
"loss": 10.1719,
"step": 292
},
{
"epoch": 0.02121476315024702,
"eval_accuracy": 0.07713083863973691,
"eval_loss": 10.171875,
"eval_runtime": 263.4033,
"eval_samples_per_second": 128.195,
"eval_steps_per_second": 2.673,
"step": 292
},
{
"epoch": 0.02128741644870677,
"grad_norm": 0.980165421962738,
"learning_rate": 9.997994768962512e-06,
"loss": 10.1719,
"step": 293
},
{
"epoch": 0.02128741644870677,
"eval_accuracy": 0.07723297031717068,
"eval_loss": 10.171875,
"eval_runtime": 264.0562,
"eval_samples_per_second": 127.878,
"eval_steps_per_second": 2.666,
"step": 293
},
{
"epoch": 0.021360069747166522,
"grad_norm": 0.9119016528129578,
"learning_rate": 9.997987503632666e-06,
"loss": 10.1797,
"step": 294
},
{
"epoch": 0.021360069747166522,
"eval_accuracy": 0.07731897746389849,
"eval_loss": 10.171875,
"eval_runtime": 265.096,
"eval_samples_per_second": 127.377,
"eval_steps_per_second": 2.656,
"step": 294
},
{
"epoch": 0.02143272304562627,
"grad_norm": 0.9215472936630249,
"learning_rate": 9.99798023830282e-06,
"loss": 10.1797,
"step": 295
},
{
"epoch": 0.02143272304562627,
"eval_accuracy": 0.07734911325647462,
"eval_loss": 10.171875,
"eval_runtime": 264.1241,
"eval_samples_per_second": 127.845,
"eval_steps_per_second": 2.665,
"step": 295
},
{
"epoch": 0.021505376344086023,
"grad_norm": 0.915708601474762,
"learning_rate": 9.997972972972974e-06,
"loss": 10.1641,
"step": 296
},
{
"epoch": 0.021505376344086023,
"eval_accuracy": 0.07730580571978501,
"eval_loss": 10.1640625,
"eval_runtime": 262.6233,
"eval_samples_per_second": 128.576,
"eval_steps_per_second": 2.681,
"step": 296
},
{
"epoch": 0.021578029642545772,
"grad_norm": 0.9310121536254883,
"learning_rate": 9.997965707643128e-06,
"loss": 10.1719,
"step": 297
},
{
"epoch": 0.021578029642545772,
"eval_accuracy": 0.07730317137096232,
"eval_loss": 10.1640625,
"eval_runtime": 263.411,
"eval_samples_per_second": 128.191,
"eval_steps_per_second": 2.673,
"step": 297
},
{
"epoch": 0.02165068294100552,
"grad_norm": 0.9275549650192261,
"learning_rate": 9.997958442313282e-06,
"loss": 10.1719,
"step": 298
},
{
"epoch": 0.02165068294100552,
"eval_accuracy": 0.07726713000520125,
"eval_loss": 10.1640625,
"eval_runtime": 263.956,
"eval_samples_per_second": 127.927,
"eval_steps_per_second": 2.667,
"step": 298
},
{
"epoch": 0.021723336239465273,
"grad_norm": 0.9178668260574341,
"learning_rate": 9.997951176983435e-06,
"loss": 10.1719,
"step": 299
},
{
"epoch": 0.021723336239465273,
"eval_accuracy": 0.07729367613564535,
"eval_loss": 10.1640625,
"eval_runtime": 263.6087,
"eval_samples_per_second": 128.095,
"eval_steps_per_second": 2.671,
"step": 299
},
{
"epoch": 0.021795989537925022,
"grad_norm": 0.9181063175201416,
"learning_rate": 9.997943911653591e-06,
"loss": 10.1719,
"step": 300
},
{
"epoch": 0.021795989537925022,
"eval_accuracy": 0.0773372442123284,
"eval_loss": 10.1640625,
"eval_runtime": 264.6871,
"eval_samples_per_second": 127.573,
"eval_steps_per_second": 2.66,
"step": 300
},
{
"epoch": 0.02186864283638477,
"grad_norm": 0.9063278436660767,
"learning_rate": 9.997936646323743e-06,
"loss": 10.1641,
"step": 301
},
{
"epoch": 0.02186864283638477,
"eval_accuracy": 0.07729164971347403,
"eval_loss": 10.1640625,
"eval_runtime": 263.4469,
"eval_samples_per_second": 128.174,
"eval_steps_per_second": 2.672,
"step": 301
},
{
"epoch": 0.021941296134844523,
"grad_norm": 0.9040680527687073,
"learning_rate": 9.997929380993899e-06,
"loss": 10.1562,
"step": 302
},
{
"epoch": 0.021941296134844523,
"eval_accuracy": 0.07716485358332667,
"eval_loss": 10.15625,
"eval_runtime": 264.4717,
"eval_samples_per_second": 127.677,
"eval_steps_per_second": 2.662,
"step": 302
},
{
"epoch": 0.022013949433304272,
"grad_norm": 0.9027392864227295,
"learning_rate": 9.997922115664051e-06,
"loss": 10.1719,
"step": 303
},
{
"epoch": 0.022013949433304272,
"eval_accuracy": 0.0771164799912088,
"eval_loss": 10.15625,
"eval_runtime": 263.5132,
"eval_samples_per_second": 128.142,
"eval_steps_per_second": 2.672,
"step": 303
},
{
"epoch": 0.02208660273176402,
"grad_norm": 0.9688916802406311,
"learning_rate": 9.997914850334205e-06,
"loss": 10.1562,
"step": 304
},
{
"epoch": 0.02208660273176402,
"eval_accuracy": 0.07716798006324811,
"eval_loss": 10.15625,
"eval_runtime": 265.6792,
"eval_samples_per_second": 127.097,
"eval_steps_per_second": 2.65,
"step": 304
},
{
"epoch": 0.022159256030223774,
"grad_norm": 0.9013357162475586,
"learning_rate": 9.997907585004361e-06,
"loss": 10.1641,
"step": 305
},
{
"epoch": 0.022159256030223774,
"eval_accuracy": 0.07729283661788866,
"eval_loss": 10.15625,
"eval_runtime": 263.3689,
"eval_samples_per_second": 128.212,
"eval_steps_per_second": 2.673,
"step": 305
},
{
"epoch": 0.022231909328683522,
"grad_norm": 0.9209669828414917,
"learning_rate": 9.997900319674513e-06,
"loss": 10.1562,
"step": 306
},
{
"epoch": 0.022231909328683522,
"eval_accuracy": 0.07727607521164315,
"eval_loss": 10.1484375,
"eval_runtime": 262.7117,
"eval_samples_per_second": 128.533,
"eval_steps_per_second": 2.68,
"step": 306
},
{
"epoch": 0.02230456262714327,
"grad_norm": 0.9404518604278564,
"learning_rate": 9.997893054344669e-06,
"loss": 10.1641,
"step": 307
},
{
"epoch": 0.02230456262714327,
"eval_accuracy": 0.07732798056811672,
"eval_loss": 10.1484375,
"eval_runtime": 264.5099,
"eval_samples_per_second": 127.659,
"eval_steps_per_second": 2.662,
"step": 307
},
{
"epoch": 0.022377215925603024,
"grad_norm": 0.8949778079986572,
"learning_rate": 9.997885789014821e-06,
"loss": 10.1719,
"step": 308
},
{
"epoch": 0.022377215925603024,
"eval_accuracy": 0.07748395717753088,
"eval_loss": 10.1484375,
"eval_runtime": 265.2866,
"eval_samples_per_second": 127.285,
"eval_steps_per_second": 2.654,
"step": 308
},
{
"epoch": 0.022449869224062773,
"grad_norm": 0.9001926183700562,
"learning_rate": 9.997878523684977e-06,
"loss": 10.1562,
"step": 309
},
{
"epoch": 0.022449869224062773,
"eval_accuracy": 0.07751218234348835,
"eval_loss": 10.1484375,
"eval_runtime": 265.0408,
"eval_samples_per_second": 127.403,
"eval_steps_per_second": 2.656,
"step": 309
},
{
"epoch": 0.02252252252252252,
"grad_norm": 0.9069272875785828,
"learning_rate": 9.997871258355129e-06,
"loss": 10.1719,
"step": 310
},
{
"epoch": 0.02252252252252252,
"eval_accuracy": 0.07750650836140868,
"eval_loss": 10.1484375,
"eval_runtime": 265.3373,
"eval_samples_per_second": 127.261,
"eval_steps_per_second": 2.653,
"step": 310
},
{
"epoch": 0.022595175820982274,
"grad_norm": 0.92779940366745,
"learning_rate": 9.997863993025285e-06,
"loss": 10.1562,
"step": 311
},
{
"epoch": 0.022595175820982274,
"eval_accuracy": 0.07742909903446483,
"eval_loss": 10.140625,
"eval_runtime": 264.244,
"eval_samples_per_second": 127.787,
"eval_steps_per_second": 2.664,
"step": 311
},
{
"epoch": 0.022667829119442023,
"grad_norm": 0.9007747769355774,
"learning_rate": 9.997856727695438e-06,
"loss": 10.1562,
"step": 312
},
{
"epoch": 0.022667829119442023,
"eval_accuracy": 0.07736051911841024,
"eval_loss": 10.140625,
"eval_runtime": 265.1787,
"eval_samples_per_second": 127.337,
"eval_steps_per_second": 2.655,
"step": 312
},
{
"epoch": 0.02274048241790177,
"grad_norm": 0.9027653336524963,
"learning_rate": 9.997849462365592e-06,
"loss": 10.1562,
"step": 313
},
{
"epoch": 0.02274048241790177,
"eval_accuracy": 0.07730719526641676,
"eval_loss": 10.140625,
"eval_runtime": 262.9549,
"eval_samples_per_second": 128.414,
"eval_steps_per_second": 2.677,
"step": 313
},
{
"epoch": 0.022813135716361524,
"grad_norm": 0.9862774610519409,
"learning_rate": 9.997842197035746e-06,
"loss": 10.1406,
"step": 314
},
{
"epoch": 0.022813135716361524,
"eval_accuracy": 0.07735834795179813,
"eval_loss": 10.140625,
"eval_runtime": 263.4521,
"eval_samples_per_second": 128.171,
"eval_steps_per_second": 2.672,
"step": 314
},
{
"epoch": 0.022885789014821273,
"grad_norm": 0.9319806694984436,
"learning_rate": 9.9978349317059e-06,
"loss": 10.1406,
"step": 315
},
{
"epoch": 0.022885789014821273,
"eval_accuracy": 0.07738877323325587,
"eval_loss": 10.140625,
"eval_runtime": 262.7183,
"eval_samples_per_second": 128.529,
"eval_steps_per_second": 2.68,
"step": 315
},
{
"epoch": 0.02295844231328102,
"grad_norm": 0.9190651774406433,
"learning_rate": 9.997827666376054e-06,
"loss": 10.1406,
"step": 316
},
{
"epoch": 0.02295844231328102,
"eval_accuracy": 0.07740159759071141,
"eval_loss": 10.140625,
"eval_runtime": 264.3693,
"eval_samples_per_second": 127.727,
"eval_steps_per_second": 2.663,
"step": 316
},
{
"epoch": 0.023031095611740774,
"grad_norm": 0.9385405778884888,
"learning_rate": 9.997820401046208e-06,
"loss": 10.1328,
"step": 317
},
{
"epoch": 0.023031095611740774,
"eval_accuracy": 0.07745408192494821,
"eval_loss": 10.1328125,
"eval_runtime": 263.8288,
"eval_samples_per_second": 127.988,
"eval_steps_per_second": 2.668,
"step": 317
},
{
"epoch": 0.023103748910200523,
"grad_norm": 0.9594412446022034,
"learning_rate": 9.997813135716362e-06,
"loss": 10.1484,
"step": 318
},
{
"epoch": 0.023103748910200523,
"eval_accuracy": 0.07749652099499298,
"eval_loss": 10.1328125,
"eval_runtime": 262.8505,
"eval_samples_per_second": 128.465,
"eval_steps_per_second": 2.678,
"step": 318
},
{
"epoch": 0.023176402208660272,
"grad_norm": 0.9393614530563354,
"learning_rate": 9.997805870386516e-06,
"loss": 10.1328,
"step": 319
},
{
"epoch": 0.023176402208660272,
"eval_accuracy": 0.07747440404443759,
"eval_loss": 10.1328125,
"eval_runtime": 263.8311,
"eval_samples_per_second": 127.987,
"eval_steps_per_second": 2.668,
"step": 319
},
{
"epoch": 0.023249055507120024,
"grad_norm": 0.9211113452911377,
"learning_rate": 9.99779860505667e-06,
"loss": 10.1328,
"step": 320
},
{
"epoch": 0.023249055507120024,
"eval_accuracy": 0.07746320082471908,
"eval_loss": 10.1328125,
"eval_runtime": 264.0976,
"eval_samples_per_second": 127.858,
"eval_steps_per_second": 2.666,
"step": 320
},
{
"epoch": 0.023321708805579773,
"grad_norm": 0.9568068385124207,
"learning_rate": 9.997791339726824e-06,
"loss": 10.125,
"step": 321
},
{
"epoch": 0.023321708805579773,
"eval_accuracy": 0.07749640519944033,
"eval_loss": 10.1328125,
"eval_runtime": 264.3338,
"eval_samples_per_second": 127.744,
"eval_steps_per_second": 2.663,
"step": 321
},
{
"epoch": 0.023394362104039522,
"grad_norm": 0.9372284412384033,
"learning_rate": 9.99778407439698e-06,
"loss": 10.1406,
"step": 322
},
{
"epoch": 0.023394362104039522,
"eval_accuracy": 0.07760533986559205,
"eval_loss": 10.125,
"eval_runtime": 264.2267,
"eval_samples_per_second": 127.796,
"eval_steps_per_second": 2.664,
"step": 322
},
{
"epoch": 0.023467015402499274,
"grad_norm": 0.9022813439369202,
"learning_rate": 9.997776809067132e-06,
"loss": 10.1328,
"step": 323
},
{
"epoch": 0.023467015402499274,
"eval_accuracy": 0.07770527142752555,
"eval_loss": 10.125,
"eval_runtime": 263.5487,
"eval_samples_per_second": 128.124,
"eval_steps_per_second": 2.671,
"step": 323
},
{
"epoch": 0.023539668700959023,
"grad_norm": 0.9569028615951538,
"learning_rate": 9.997769543737288e-06,
"loss": 10.125,
"step": 324
},
{
"epoch": 0.023539668700959023,
"eval_accuracy": 0.07776036116169688,
"eval_loss": 10.125,
"eval_runtime": 264.1074,
"eval_samples_per_second": 127.853,
"eval_steps_per_second": 2.666,
"step": 324
},
{
"epoch": 0.023612321999418772,
"grad_norm": 0.926621675491333,
"learning_rate": 9.99776227840744e-06,
"loss": 10.125,
"step": 325
},
{
"epoch": 0.023612321999418772,
"eval_accuracy": 0.07774163123105639,
"eval_loss": 10.125,
"eval_runtime": 264.9077,
"eval_samples_per_second": 127.467,
"eval_steps_per_second": 2.658,
"step": 325
},
{
"epoch": 0.023684975297878524,
"grad_norm": 0.8989631533622742,
"learning_rate": 9.997755013077595e-06,
"loss": 10.125,
"step": 326
},
{
"epoch": 0.023684975297878524,
"eval_accuracy": 0.0777147087650662,
"eval_loss": 10.125,
"eval_runtime": 262.9955,
"eval_samples_per_second": 128.394,
"eval_steps_per_second": 2.677,
"step": 326
},
{
"epoch": 0.023757628596338273,
"grad_norm": 0.918336033821106,
"learning_rate": 9.997747747747748e-06,
"loss": 10.1328,
"step": 327
},
{
"epoch": 0.023757628596338273,
"eval_accuracy": 0.07771042432961829,
"eval_loss": 10.1171875,
"eval_runtime": 264.3981,
"eval_samples_per_second": 127.713,
"eval_steps_per_second": 2.663,
"step": 327
},
{
"epoch": 0.023830281894798022,
"grad_norm": 0.9403995275497437,
"learning_rate": 9.997740482417903e-06,
"loss": 10.1172,
"step": 328
},
{
"epoch": 0.023830281894798022,
"eval_accuracy": 0.07768630990577977,
"eval_loss": 10.1171875,
"eval_runtime": 262.3713,
"eval_samples_per_second": 128.699,
"eval_steps_per_second": 2.683,
"step": 328
},
{
"epoch": 0.023902935193257775,
"grad_norm": 0.9186561703681946,
"learning_rate": 9.997733217088057e-06,
"loss": 10.1172,
"step": 329
},
{
"epoch": 0.023902935193257775,
"eval_accuracy": 0.07772889372026533,
"eval_loss": 10.1171875,
"eval_runtime": 264.6641,
"eval_samples_per_second": 127.584,
"eval_steps_per_second": 2.66,
"step": 329
},
{
"epoch": 0.023975588491717523,
"grad_norm": 0.9268199801445007,
"learning_rate": 9.997725951758211e-06,
"loss": 10.125,
"step": 330
},
{
"epoch": 0.023975588491717523,
"eval_accuracy": 0.07778919425430574,
"eval_loss": 10.1171875,
"eval_runtime": 264.2577,
"eval_samples_per_second": 127.781,
"eval_steps_per_second": 2.664,
"step": 330
},
{
"epoch": 0.024048241790177276,
"grad_norm": 0.9123356342315674,
"learning_rate": 9.997718686428365e-06,
"loss": 10.1094,
"step": 331
},
{
"epoch": 0.024048241790177276,
"eval_accuracy": 0.07782868053775802,
"eval_loss": 10.1171875,
"eval_runtime": 264.6854,
"eval_samples_per_second": 127.574,
"eval_steps_per_second": 2.66,
"step": 331
},
{
"epoch": 0.024120895088637025,
"grad_norm": 0.9475653767585754,
"learning_rate": 9.997711421098517e-06,
"loss": 10.1094,
"step": 332
},
{
"epoch": 0.024120895088637025,
"eval_accuracy": 0.07771977482049446,
"eval_loss": 10.109375,
"eval_runtime": 264.176,
"eval_samples_per_second": 127.82,
"eval_steps_per_second": 2.665,
"step": 332
},
{
"epoch": 0.024193548387096774,
"grad_norm": 0.9262251853942871,
"learning_rate": 9.997704155768673e-06,
"loss": 10.1094,
"step": 333
},
{
"epoch": 0.024193548387096774,
"eval_accuracy": 0.07764413137572845,
"eval_loss": 10.109375,
"eval_runtime": 266.5418,
"eval_samples_per_second": 126.686,
"eval_steps_per_second": 2.641,
"step": 333
},
{
"epoch": 0.024266201685556526,
"grad_norm": 0.9046162962913513,
"learning_rate": 9.997696890438827e-06,
"loss": 10.1172,
"step": 334
},
{
"epoch": 0.024266201685556526,
"eval_accuracy": 0.07753328608295808,
"eval_loss": 10.109375,
"eval_runtime": 264.8485,
"eval_samples_per_second": 127.496,
"eval_steps_per_second": 2.658,
"step": 334
},
{
"epoch": 0.024338854984016275,
"grad_norm": 0.8864550590515137,
"learning_rate": 9.997689625108981e-06,
"loss": 10.125,
"step": 335
},
{
"epoch": 0.024338854984016275,
"eval_accuracy": 0.07736963801818111,
"eval_loss": 10.109375,
"eval_runtime": 262.4932,
"eval_samples_per_second": 128.64,
"eval_steps_per_second": 2.682,
"step": 335
},
{
"epoch": 0.024411508282476024,
"grad_norm": 0.8957669138908386,
"learning_rate": 9.997682359779135e-06,
"loss": 10.1172,
"step": 336
},
{
"epoch": 0.024411508282476024,
"eval_accuracy": 0.07715223186808826,
"eval_loss": 10.109375,
"eval_runtime": 264.8989,
"eval_samples_per_second": 127.471,
"eval_steps_per_second": 2.658,
"step": 336
},
{
"epoch": 0.024484161580935776,
"grad_norm": 0.9608045816421509,
"learning_rate": 9.997675094449289e-06,
"loss": 10.1016,
"step": 337
},
{
"epoch": 0.024484161580935776,
"eval_accuracy": 0.07713784427067198,
"eval_loss": 10.1015625,
"eval_runtime": 264.4843,
"eval_samples_per_second": 127.671,
"eval_steps_per_second": 2.662,
"step": 337
},
{
"epoch": 0.024556814879395525,
"grad_norm": 0.9367948770523071,
"learning_rate": 9.997667829119443e-06,
"loss": 10.1094,
"step": 338
},
{
"epoch": 0.024556814879395525,
"eval_accuracy": 0.0773351309434926,
"eval_loss": 10.1015625,
"eval_runtime": 264.3155,
"eval_samples_per_second": 127.753,
"eval_steps_per_second": 2.663,
"step": 338
},
{
"epoch": 0.024629468177855274,
"grad_norm": 0.9086586833000183,
"learning_rate": 9.997660563789597e-06,
"loss": 10.1172,
"step": 339
},
{
"epoch": 0.024629468177855274,
"eval_accuracy": 0.07753942324724832,
"eval_loss": 10.1015625,
"eval_runtime": 263.7541,
"eval_samples_per_second": 128.025,
"eval_steps_per_second": 2.669,
"step": 339
},
{
"epoch": 0.024702121476315026,
"grad_norm": 0.936314046382904,
"learning_rate": 9.99765329845975e-06,
"loss": 10.1094,
"step": 340
},
{
"epoch": 0.024702121476315026,
"eval_accuracy": 0.07769429979891233,
"eval_loss": 10.1015625,
"eval_runtime": 264.8284,
"eval_samples_per_second": 127.505,
"eval_steps_per_second": 2.658,
"step": 340
},
{
"epoch": 0.024774774774774775,
"grad_norm": 0.8729653358459473,
"learning_rate": 9.997646033129905e-06,
"loss": 10.1172,
"step": 341
},
{
"epoch": 0.024774774774774775,
"eval_accuracy": 0.07776137437278254,
"eval_loss": 10.1015625,
"eval_runtime": 264.1666,
"eval_samples_per_second": 127.825,
"eval_steps_per_second": 2.665,
"step": 341
},
{
"epoch": 0.024847428073234524,
"grad_norm": 0.9122793078422546,
"learning_rate": 9.997638767800059e-06,
"loss": 10.0938,
"step": 342
},
{
"epoch": 0.024847428073234524,
"eval_accuracy": 0.07787676464099427,
"eval_loss": 10.09375,
"eval_runtime": 264.3725,
"eval_samples_per_second": 127.725,
"eval_steps_per_second": 2.663,
"step": 342
},
{
"epoch": 0.024920081371694276,
"grad_norm": 0.9096229076385498,
"learning_rate": 9.997631502470212e-06,
"loss": 10.1016,
"step": 343
},
{
"epoch": 0.024920081371694276,
"eval_accuracy": 0.077991257493673,
"eval_loss": 10.09375,
"eval_runtime": 265.889,
"eval_samples_per_second": 126.997,
"eval_steps_per_second": 2.648,
"step": 343
},
{
"epoch": 0.024992734670154025,
"grad_norm": 0.9116566181182861,
"learning_rate": 9.997624237140366e-06,
"loss": 10.0938,
"step": 344
},
{
"epoch": 0.024992734670154025,
"eval_accuracy": 0.07803271230152027,
"eval_loss": 10.09375,
"eval_runtime": 263.9618,
"eval_samples_per_second": 127.924,
"eval_steps_per_second": 2.667,
"step": 344
},
{
"epoch": 0.025065387968613774,
"grad_norm": 0.9252493381500244,
"learning_rate": 9.99761697181052e-06,
"loss": 10.0938,
"step": 345
},
{
"epoch": 0.025065387968613774,
"eval_accuracy": 0.0780303674415792,
"eval_loss": 10.09375,
"eval_runtime": 264.8551,
"eval_samples_per_second": 127.492,
"eval_steps_per_second": 2.658,
"step": 345
},
{
"epoch": 0.025138041267073526,
"grad_norm": 0.8922543525695801,
"learning_rate": 9.997609706480676e-06,
"loss": 10.1016,
"step": 346
},
{
"epoch": 0.025138041267073526,
"eval_accuracy": 0.07808099904697366,
"eval_loss": 10.09375,
"eval_runtime": 264.1901,
"eval_samples_per_second": 127.813,
"eval_steps_per_second": 2.665,
"step": 346
},
{
"epoch": 0.025210694565533275,
"grad_norm": 0.8663190603256226,
"learning_rate": 9.997602441150828e-06,
"loss": 10.1094,
"step": 347
},
{
"epoch": 0.025210694565533275,
"eval_accuracy": 0.07801609563971557,
"eval_loss": 10.0859375,
"eval_runtime": 262.4011,
"eval_samples_per_second": 128.685,
"eval_steps_per_second": 2.683,
"step": 347
},
{
"epoch": 0.025283347863993024,
"grad_norm": 0.9128501415252686,
"learning_rate": 9.997595175820984e-06,
"loss": 10.0938,
"step": 348
},
{
"epoch": 0.025283347863993024,
"eval_accuracy": 0.07798708885377775,
"eval_loss": 10.0859375,
"eval_runtime": 263.7953,
"eval_samples_per_second": 128.005,
"eval_steps_per_second": 2.669,
"step": 348
},
{
"epoch": 0.025356001162452776,
"grad_norm": 0.9011194705963135,
"learning_rate": 9.997587910491136e-06,
"loss": 10.0938,
"step": 349
},
{
"epoch": 0.025356001162452776,
"eval_accuracy": 0.07796170067886012,
"eval_loss": 10.0859375,
"eval_runtime": 263.0115,
"eval_samples_per_second": 128.386,
"eval_steps_per_second": 2.677,
"step": 349
},
{
"epoch": 0.025428654460912525,
"grad_norm": 0.9395301342010498,
"learning_rate": 9.997580645161292e-06,
"loss": 10.0859,
"step": 350
},
{
"epoch": 0.025428654460912525,
"eval_accuracy": 0.0779401337571798,
"eval_loss": 10.0859375,
"eval_runtime": 265.4304,
"eval_samples_per_second": 127.216,
"eval_steps_per_second": 2.652,
"step": 350
},
{
"epoch": 0.025501307759372274,
"grad_norm": 0.9046230316162109,
"learning_rate": 9.997573379831446e-06,
"loss": 10.0859,
"step": 351
},
{
"epoch": 0.025501307759372274,
"eval_accuracy": 0.07796905369645313,
"eval_loss": 10.0859375,
"eval_runtime": 264.1028,
"eval_samples_per_second": 127.856,
"eval_steps_per_second": 2.666,
"step": 351
},
{
"epoch": 0.025573961057832027,
"grad_norm": 0.9076169729232788,
"learning_rate": 9.9975661145016e-06,
"loss": 10.0938,
"step": 352
},
{
"epoch": 0.025573961057832027,
"eval_accuracy": 0.07807518032045319,
"eval_loss": 10.078125,
"eval_runtime": 263.5124,
"eval_samples_per_second": 128.142,
"eval_steps_per_second": 2.672,
"step": 352
},
{
"epoch": 0.025573961057832027,
"step": 352,
"total_flos": 247015648788480.0,
"train_loss": 10.390092329545455,
"train_runtime": 94034.2968,
"train_samples_per_second": 702.555,
"train_steps_per_second": 14.637
}
],
"logging_steps": 1,
"max_steps": 1376400,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 247015648788480.0,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}