akoul_whitehorseliquidity_25c / trainer_state.json
{
"best_metric": 0.014902754686772823,
"best_model_checkpoint": "/home/paperspace/Data/models/akoul_whitehorseliquidity_25c/llm3br256/checkpoint-400",
"epoch": 5.0,
"eval_steps": 5,
"global_step": 540,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009259259259259259,
"grad_norm": 0.29716095328330994,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.1002,
"step": 1
},
{
"epoch": 0.018518518518518517,
"grad_norm": 0.2648535370826721,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.0936,
"step": 2
},
{
"epoch": 0.027777777777777776,
"grad_norm": 0.24819649755954742,
"learning_rate": 5.555555555555556e-06,
"loss": 0.0898,
"step": 3
},
{
"epoch": 0.037037037037037035,
"grad_norm": 0.23442289233207703,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.087,
"step": 4
},
{
"epoch": 0.046296296296296294,
"grad_norm": 0.26300737261772156,
"learning_rate": 9.259259259259259e-06,
"loss": 0.0904,
"step": 5
},
{
"epoch": 0.046296296296296294,
"eval_loss": 0.0950983464717865,
"eval_runtime": 11.9584,
"eval_samples_per_second": 4.181,
"eval_steps_per_second": 1.087,
"step": 5
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.18399731814861298,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0805,
"step": 6
},
{
"epoch": 0.06481481481481481,
"grad_norm": 0.19827856123447418,
"learning_rate": 1.2962962962962962e-05,
"loss": 0.0782,
"step": 7
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.13050280511379242,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.0636,
"step": 8
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.12110771238803864,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.056,
"step": 9
},
{
"epoch": 0.09259259259259259,
"grad_norm": 0.1111820638179779,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.053,
"step": 10
},
{
"epoch": 0.09259259259259259,
"eval_loss": 0.04887561500072479,
"eval_runtime": 9.1057,
"eval_samples_per_second": 5.491,
"eval_steps_per_second": 1.428,
"step": 10
},
{
"epoch": 0.10185185185185185,
"grad_norm": 0.0779903382062912,
"learning_rate": 2.037037037037037e-05,
"loss": 0.0538,
"step": 11
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.08193033933639526,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.0398,
"step": 12
},
{
"epoch": 0.12037037037037036,
"grad_norm": 0.0821649506688118,
"learning_rate": 2.4074074074074074e-05,
"loss": 0.0473,
"step": 13
},
{
"epoch": 0.12962962962962962,
"grad_norm": 0.07107188552618027,
"learning_rate": 2.5925925925925925e-05,
"loss": 0.0386,
"step": 14
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.05971238389611244,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0417,
"step": 15
},
{
"epoch": 0.1388888888888889,
"eval_loss": 0.04156189784407616,
"eval_runtime": 9.1211,
"eval_samples_per_second": 5.482,
"eval_steps_per_second": 1.425,
"step": 15
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.05262186750769615,
"learning_rate": 2.962962962962963e-05,
"loss": 0.0384,
"step": 16
},
{
"epoch": 0.1574074074074074,
"grad_norm": 0.05361900106072426,
"learning_rate": 3.148148148148148e-05,
"loss": 0.0378,
"step": 17
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.05355929210782051,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0399,
"step": 18
},
{
"epoch": 0.17592592592592593,
"grad_norm": 0.04563885182142258,
"learning_rate": 3.518518518518519e-05,
"loss": 0.0368,
"step": 19
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.060624465346336365,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.0396,
"step": 20
},
{
"epoch": 0.18518518518518517,
"eval_loss": 0.03584723547101021,
"eval_runtime": 9.1162,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 20
},
{
"epoch": 0.19444444444444445,
"grad_norm": 0.0525534488260746,
"learning_rate": 3.888888888888889e-05,
"loss": 0.0364,
"step": 21
},
{
"epoch": 0.2037037037037037,
"grad_norm": 0.041657958179712296,
"learning_rate": 4.074074074074074e-05,
"loss": 0.034,
"step": 22
},
{
"epoch": 0.21296296296296297,
"grad_norm": 0.04589791223406792,
"learning_rate": 4.259259259259259e-05,
"loss": 0.0317,
"step": 23
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.04220304638147354,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.0339,
"step": 24
},
{
"epoch": 0.23148148148148148,
"grad_norm": 0.03630352392792702,
"learning_rate": 4.62962962962963e-05,
"loss": 0.029,
"step": 25
},
{
"epoch": 0.23148148148148148,
"eval_loss": 0.03286580368876457,
"eval_runtime": 9.1191,
"eval_samples_per_second": 5.483,
"eval_steps_per_second": 1.426,
"step": 25
},
{
"epoch": 0.24074074074074073,
"grad_norm": 0.04235522821545601,
"learning_rate": 4.814814814814815e-05,
"loss": 0.0326,
"step": 26
},
{
"epoch": 0.25,
"grad_norm": 0.04675336927175522,
"learning_rate": 5e-05,
"loss": 0.03,
"step": 27
},
{
"epoch": 0.25925925925925924,
"grad_norm": 0.039461418986320496,
"learning_rate": 5.185185185185185e-05,
"loss": 0.0328,
"step": 28
},
{
"epoch": 0.26851851851851855,
"grad_norm": 0.044042930006980896,
"learning_rate": 5.370370370370371e-05,
"loss": 0.0294,
"step": 29
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.044502489268779755,
"learning_rate": 5.555555555555556e-05,
"loss": 0.0311,
"step": 30
},
{
"epoch": 0.2777777777777778,
"eval_loss": 0.030865700915455818,
"eval_runtime": 9.1099,
"eval_samples_per_second": 5.489,
"eval_steps_per_second": 1.427,
"step": 30
},
{
"epoch": 0.28703703703703703,
"grad_norm": 0.04979817569255829,
"learning_rate": 5.740740740740741e-05,
"loss": 0.0292,
"step": 31
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.04573828727006912,
"learning_rate": 5.925925925925926e-05,
"loss": 0.0346,
"step": 32
},
{
"epoch": 0.3055555555555556,
"grad_norm": 0.0410350002348423,
"learning_rate": 6.111111111111112e-05,
"loss": 0.0295,
"step": 33
},
{
"epoch": 0.3148148148148148,
"grad_norm": 0.0416686087846756,
"learning_rate": 6.296296296296296e-05,
"loss": 0.0267,
"step": 34
},
{
"epoch": 0.32407407407407407,
"grad_norm": 0.042319901287555695,
"learning_rate": 6.481481481481482e-05,
"loss": 0.0295,
"step": 35
},
{
"epoch": 0.32407407407407407,
"eval_loss": 0.028042705729603767,
"eval_runtime": 9.1376,
"eval_samples_per_second": 5.472,
"eval_steps_per_second": 1.423,
"step": 35
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.037845220416784286,
"learning_rate": 6.666666666666667e-05,
"loss": 0.0319,
"step": 36
},
{
"epoch": 0.3425925925925926,
"grad_norm": 0.03568718954920769,
"learning_rate": 6.851851851851852e-05,
"loss": 0.0346,
"step": 37
},
{
"epoch": 0.35185185185185186,
"grad_norm": 0.037281136959791183,
"learning_rate": 7.037037037037038e-05,
"loss": 0.031,
"step": 38
},
{
"epoch": 0.3611111111111111,
"grad_norm": 0.03607446327805519,
"learning_rate": 7.222222222222222e-05,
"loss": 0.0335,
"step": 39
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.03654631972312927,
"learning_rate": 7.407407407407407e-05,
"loss": 0.0262,
"step": 40
},
{
"epoch": 0.37037037037037035,
"eval_loss": 0.026602942496538162,
"eval_runtime": 9.1124,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.427,
"step": 40
},
{
"epoch": 0.37962962962962965,
"grad_norm": 0.039490777999162674,
"learning_rate": 7.592592592592593e-05,
"loss": 0.0252,
"step": 41
},
{
"epoch": 0.3888888888888889,
"grad_norm": 0.036680739372968674,
"learning_rate": 7.777777777777778e-05,
"loss": 0.0242,
"step": 42
},
{
"epoch": 0.39814814814814814,
"grad_norm": 0.040739599615335464,
"learning_rate": 7.962962962962964e-05,
"loss": 0.025,
"step": 43
},
{
"epoch": 0.4074074074074074,
"grad_norm": 0.04679260402917862,
"learning_rate": 8.148148148148148e-05,
"loss": 0.0212,
"step": 44
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.04656214639544487,
"learning_rate": 8.333333333333334e-05,
"loss": 0.0272,
"step": 45
},
{
"epoch": 0.4166666666666667,
"eval_loss": 0.02608887106180191,
"eval_runtime": 9.1343,
"eval_samples_per_second": 5.474,
"eval_steps_per_second": 1.423,
"step": 45
},
{
"epoch": 0.42592592592592593,
"grad_norm": 0.04525485262274742,
"learning_rate": 8.518518518518518e-05,
"loss": 0.0274,
"step": 46
},
{
"epoch": 0.4351851851851852,
"grad_norm": 0.03210742771625519,
"learning_rate": 8.703703703703704e-05,
"loss": 0.0283,
"step": 47
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.03675089031457901,
"learning_rate": 8.888888888888889e-05,
"loss": 0.0242,
"step": 48
},
{
"epoch": 0.4537037037037037,
"grad_norm": 0.03396710753440857,
"learning_rate": 9.074074074074075e-05,
"loss": 0.0239,
"step": 49
},
{
"epoch": 0.46296296296296297,
"grad_norm": 0.02745971269905567,
"learning_rate": 9.25925925925926e-05,
"loss": 0.0224,
"step": 50
},
{
"epoch": 0.46296296296296297,
"eval_loss": 0.02490057609975338,
"eval_runtime": 9.1102,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 50
},
{
"epoch": 0.4722222222222222,
"grad_norm": 0.04084627702832222,
"learning_rate": 9.444444444444444e-05,
"loss": 0.0252,
"step": 51
},
{
"epoch": 0.48148148148148145,
"grad_norm": 0.033021993935108185,
"learning_rate": 9.62962962962963e-05,
"loss": 0.0228,
"step": 52
},
{
"epoch": 0.49074074074074076,
"grad_norm": 0.034785784780979156,
"learning_rate": 9.814814814814815e-05,
"loss": 0.0259,
"step": 53
},
{
"epoch": 0.5,
"grad_norm": 0.03407888114452362,
"learning_rate": 0.0001,
"loss": 0.0239,
"step": 54
},
{
"epoch": 0.5092592592592593,
"grad_norm": 0.03268973529338837,
"learning_rate": 9.99989553622803e-05,
"loss": 0.0229,
"step": 55
},
{
"epoch": 0.5092592592592593,
"eval_loss": 0.02450372651219368,
"eval_runtime": 9.1421,
"eval_samples_per_second": 5.469,
"eval_steps_per_second": 1.422,
"step": 55
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.032378531992435455,
"learning_rate": 9.999582149277187e-05,
"loss": 0.0219,
"step": 56
},
{
"epoch": 0.5277777777777778,
"grad_norm": 0.03997437283396721,
"learning_rate": 9.999059852242507e-05,
"loss": 0.0248,
"step": 57
},
{
"epoch": 0.5370370370370371,
"grad_norm": 0.04024836793541908,
"learning_rate": 9.998328666948438e-05,
"loss": 0.0194,
"step": 58
},
{
"epoch": 0.5462962962962963,
"grad_norm": 0.03850249573588371,
"learning_rate": 9.997388623947928e-05,
"loss": 0.0251,
"step": 59
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.03326913341879845,
"learning_rate": 9.996239762521151e-05,
"loss": 0.0233,
"step": 60
},
{
"epoch": 0.5555555555555556,
"eval_loss": 0.023316912353038788,
"eval_runtime": 9.1353,
"eval_samples_per_second": 5.473,
"eval_steps_per_second": 1.423,
"step": 60
},
{
"epoch": 0.5648148148148148,
"grad_norm": 0.034179024398326874,
"learning_rate": 9.994882130673868e-05,
"loss": 0.0222,
"step": 61
},
{
"epoch": 0.5740740740740741,
"grad_norm": 0.031797800213098526,
"learning_rate": 9.993315785135416e-05,
"loss": 0.0272,
"step": 62
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.03183833882212639,
"learning_rate": 9.991540791356342e-05,
"loss": 0.0241,
"step": 63
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.025173548609018326,
"learning_rate": 9.989557223505661e-05,
"loss": 0.0216,
"step": 64
},
{
"epoch": 0.6018518518518519,
"grad_norm": 0.04935009032487869,
"learning_rate": 9.987365164467767e-05,
"loss": 0.0217,
"step": 65
},
{
"epoch": 0.6018518518518519,
"eval_loss": 0.02255990356206894,
"eval_runtime": 9.1207,
"eval_samples_per_second": 5.482,
"eval_steps_per_second": 1.425,
"step": 65
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.02904060110449791,
"learning_rate": 9.98496470583896e-05,
"loss": 0.0213,
"step": 66
},
{
"epoch": 0.6203703703703703,
"grad_norm": 0.046014755964279175,
"learning_rate": 9.982355947923629e-05,
"loss": 0.018,
"step": 67
},
{
"epoch": 0.6296296296296297,
"grad_norm": 0.0354795977473259,
"learning_rate": 9.979538999730047e-05,
"loss": 0.0199,
"step": 68
},
{
"epoch": 0.6388888888888888,
"grad_norm": 0.03308796137571335,
"learning_rate": 9.976513978965829e-05,
"loss": 0.0239,
"step": 69
},
{
"epoch": 0.6481481481481481,
"grad_norm": 0.03860899433493614,
"learning_rate": 9.973281012033007e-05,
"loss": 0.0247,
"step": 70
},
{
"epoch": 0.6481481481481481,
"eval_loss": 0.022898558527231216,
"eval_runtime": 9.1074,
"eval_samples_per_second": 5.49,
"eval_steps_per_second": 1.427,
"step": 70
},
{
"epoch": 0.6574074074074074,
"grad_norm": 0.028213078156113625,
"learning_rate": 9.969840234022749e-05,
"loss": 0.0197,
"step": 71
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.024581043049693108,
"learning_rate": 9.966191788709716e-05,
"loss": 0.0207,
"step": 72
},
{
"epoch": 0.6759259259259259,
"grad_norm": 0.026658454909920692,
"learning_rate": 9.962335828546048e-05,
"loss": 0.0214,
"step": 73
},
{
"epoch": 0.6851851851851852,
"grad_norm": 0.034941576421260834,
"learning_rate": 9.958272514655006e-05,
"loss": 0.0205,
"step": 74
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.03060038387775421,
"learning_rate": 9.954002016824227e-05,
"loss": 0.0193,
"step": 75
},
{
"epoch": 0.6944444444444444,
"eval_loss": 0.02283317781984806,
"eval_runtime": 9.1512,
"eval_samples_per_second": 5.464,
"eval_steps_per_second": 1.421,
"step": 75
},
{
"epoch": 0.7037037037037037,
"grad_norm": 0.0313015952706337,
"learning_rate": 9.949524513498636e-05,
"loss": 0.0206,
"step": 76
},
{
"epoch": 0.7129629629629629,
"grad_norm": 0.03317766636610031,
"learning_rate": 9.944840191772987e-05,
"loss": 0.0217,
"step": 77
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.027911782264709473,
"learning_rate": 9.939949247384046e-05,
"loss": 0.0196,
"step": 78
},
{
"epoch": 0.7314814814814815,
"grad_norm": 0.028807291761040688,
"learning_rate": 9.934851884702414e-05,
"loss": 0.0223,
"step": 79
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.03152855485677719,
"learning_rate": 9.929548316723982e-05,
"loss": 0.0173,
"step": 80
},
{
"epoch": 0.7407407407407407,
"eval_loss": 0.021335698664188385,
"eval_runtime": 9.1689,
"eval_samples_per_second": 5.453,
"eval_steps_per_second": 1.418,
"step": 80
},
{
"epoch": 0.75,
"grad_norm": 0.03250882402062416,
"learning_rate": 9.924038765061042e-05,
"loss": 0.0231,
"step": 81
},
{
"epoch": 0.7592592592592593,
"grad_norm": 0.030853938311338425,
"learning_rate": 9.918323459933005e-05,
"loss": 0.0224,
"step": 82
},
{
"epoch": 0.7685185185185185,
"grad_norm": 0.03431202098727226,
"learning_rate": 9.912402640156811e-05,
"loss": 0.0223,
"step": 83
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.027050426229834557,
"learning_rate": 9.906276553136923e-05,
"loss": 0.0198,
"step": 84
},
{
"epoch": 0.7870370370370371,
"grad_norm": 0.03224191442131996,
"learning_rate": 9.899945454855006e-05,
"loss": 0.0207,
"step": 85
},
{
"epoch": 0.7870370370370371,
"eval_loss": 0.020375357940793037,
"eval_runtime": 9.1362,
"eval_samples_per_second": 5.473,
"eval_steps_per_second": 1.423,
"step": 85
},
{
"epoch": 0.7962962962962963,
"grad_norm": 0.028706278651952744,
"learning_rate": 9.893409609859222e-05,
"loss": 0.0197,
"step": 86
},
{
"epoch": 0.8055555555555556,
"grad_norm": 0.02814578451216221,
"learning_rate": 9.88666929125318e-05,
"loss": 0.0199,
"step": 87
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.028775395825505257,
"learning_rate": 9.879724780684519e-05,
"loss": 0.0169,
"step": 88
},
{
"epoch": 0.8240740740740741,
"grad_norm": 0.030078047886490822,
"learning_rate": 9.872576368333151e-05,
"loss": 0.0209,
"step": 89
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.031860969960689545,
"learning_rate": 9.865224352899119e-05,
"loss": 0.0213,
"step": 90
},
{
"epoch": 0.8333333333333334,
"eval_loss": 0.019939038902521133,
"eval_runtime": 9.1287,
"eval_samples_per_second": 5.477,
"eval_steps_per_second": 1.424,
"step": 90
},
{
"epoch": 0.8425925925925926,
"grad_norm": 0.03415157273411751,
"learning_rate": 9.857669041590134e-05,
"loss": 0.021,
"step": 91
},
{
"epoch": 0.8518518518518519,
"grad_norm": 0.032674115151166916,
"learning_rate": 9.849910750108717e-05,
"loss": 0.0207,
"step": 92
},
{
"epoch": 0.8611111111111112,
"grad_norm": 0.02941475249826908,
"learning_rate": 9.84194980263903e-05,
"loss": 0.0196,
"step": 93
},
{
"epoch": 0.8703703703703703,
"grad_norm": 0.036115583032369614,
"learning_rate": 9.83378653183331e-05,
"loss": 0.0178,
"step": 94
},
{
"epoch": 0.8796296296296297,
"grad_norm": 0.03358744457364082,
"learning_rate": 9.825421278797983e-05,
"loss": 0.0199,
"step": 95
},
{
"epoch": 0.8796296296296297,
"eval_loss": 0.020193172618746758,
"eval_runtime": 9.1141,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 95
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.029014358296990395,
"learning_rate": 9.816854393079403e-05,
"loss": 0.0219,
"step": 96
},
{
"epoch": 0.8981481481481481,
"grad_norm": 0.042931754142045975,
"learning_rate": 9.808086232649246e-05,
"loss": 0.0185,
"step": 97
},
{
"epoch": 0.9074074074074074,
"grad_norm": 0.029089825227856636,
"learning_rate": 9.799117163889559e-05,
"loss": 0.021,
"step": 98
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.03154176101088524,
"learning_rate": 9.789947561577445e-05,
"loss": 0.02,
"step": 99
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.027786221355199814,
"learning_rate": 9.780577808869398e-05,
"loss": 0.0188,
"step": 100
},
{
"epoch": 0.9259259259259259,
"eval_loss": 0.02070247381925583,
"eval_runtime": 9.1159,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 100
},
{
"epoch": 0.9351851851851852,
"grad_norm": 0.030518539249897003,
"learning_rate": 9.771008297285307e-05,
"loss": 0.0218,
"step": 101
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.024817178025841713,
"learning_rate": 9.761239426692077e-05,
"loss": 0.0202,
"step": 102
},
{
"epoch": 0.9537037037037037,
"grad_norm": 0.025192229077219963,
"learning_rate": 9.751271605286941e-05,
"loss": 0.0197,
"step": 103
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.02538897655904293,
"learning_rate": 9.741105249580383e-05,
"loss": 0.02,
"step": 104
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.025440450757741928,
"learning_rate": 9.730740784378753e-05,
"loss": 0.0193,
"step": 105
},
{
"epoch": 0.9722222222222222,
"eval_loss": 0.020300446078181267,
"eval_runtime": 9.126,
"eval_samples_per_second": 5.479,
"eval_steps_per_second": 1.425,
"step": 105
},
{
"epoch": 0.9814814814814815,
"grad_norm": 0.02362542785704136,
"learning_rate": 9.7201786427665e-05,
"loss": 0.0202,
"step": 106
},
{
"epoch": 0.9907407407407407,
"grad_norm": 0.022390421479940414,
"learning_rate": 9.709419266088086e-05,
"loss": 0.0188,
"step": 107
},
{
"epoch": 1.0,
"grad_norm": 0.026193244382739067,
"learning_rate": 9.698463103929542e-05,
"loss": 0.022,
"step": 108
},
{
"epoch": 1.0092592592592593,
"grad_norm": 0.028253022581338882,
"learning_rate": 9.687310614099675e-05,
"loss": 0.0159,
"step": 109
},
{
"epoch": 1.0185185185185186,
"grad_norm": 0.02241157554090023,
"learning_rate": 9.67596226261095e-05,
"loss": 0.016,
"step": 110
},
{
"epoch": 1.0185185185185186,
"eval_loss": 0.01969613879919052,
"eval_runtime": 9.1053,
"eval_samples_per_second": 5.491,
"eval_steps_per_second": 1.428,
"step": 110
},
{
"epoch": 1.0277777777777777,
"grad_norm": 0.027405373752117157,
"learning_rate": 9.664418523660004e-05,
"loss": 0.014,
"step": 111
},
{
"epoch": 1.037037037037037,
"grad_norm": 0.032646384090185165,
"learning_rate": 9.652679879607843e-05,
"loss": 0.0172,
"step": 112
},
{
"epoch": 1.0462962962962963,
"grad_norm": 0.02552163228392601,
"learning_rate": 9.640746820959684e-05,
"loss": 0.014,
"step": 113
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.022228199988603592,
"learning_rate": 9.628619846344454e-05,
"loss": 0.0172,
"step": 114
},
{
"epoch": 1.0648148148148149,
"grad_norm": 0.028009962290525436,
"learning_rate": 9.616299462493952e-05,
"loss": 0.0166,
"step": 115
},
{
"epoch": 1.0648148148148149,
"eval_loss": 0.019864549860358238,
"eval_runtime": 9.122,
"eval_samples_per_second": 5.481,
"eval_steps_per_second": 1.425,
"step": 115
},
{
"epoch": 1.074074074074074,
"grad_norm": 0.025030331686139107,
"learning_rate": 9.603786184221693e-05,
"loss": 0.0195,
"step": 116
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.030586065724492073,
"learning_rate": 9.591080534401371e-05,
"loss": 0.015,
"step": 117
},
{
"epoch": 1.0925925925925926,
"grad_norm": 0.02425476722419262,
"learning_rate": 9.57818304394503e-05,
"loss": 0.0183,
"step": 118
},
{
"epoch": 1.1018518518518519,
"grad_norm": 0.03203345090150833,
"learning_rate": 9.565094251780871e-05,
"loss": 0.0172,
"step": 119
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.03028124012053013,
"learning_rate": 9.551814704830734e-05,
"loss": 0.0189,
"step": 120
},
{
"epoch": 1.1111111111111112,
"eval_loss": 0.019504941999912262,
"eval_runtime": 9.1171,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 120
},
{
"epoch": 1.1203703703703705,
"grad_norm": 0.026934562250971794,
"learning_rate": 9.538344957987244e-05,
"loss": 0.0132,
"step": 121
},
{
"epoch": 1.1296296296296295,
"grad_norm": 0.02392655238509178,
"learning_rate": 9.524685574090627e-05,
"loss": 0.0184,
"step": 122
},
{
"epoch": 1.1388888888888888,
"grad_norm": 0.02336742728948593,
"learning_rate": 9.51083712390519e-05,
"loss": 0.0155,
"step": 123
},
{
"epoch": 1.1481481481481481,
"grad_norm": 0.025306498631834984,
"learning_rate": 9.496800186095466e-05,
"loss": 0.0156,
"step": 124
},
{
"epoch": 1.1574074074074074,
"grad_norm": 0.02764940820634365,
"learning_rate": 9.482575347202047e-05,
"loss": 0.0211,
"step": 125
},
{
"epoch": 1.1574074074074074,
"eval_loss": 0.018362991511821747,
"eval_runtime": 9.1297,
"eval_samples_per_second": 5.477,
"eval_steps_per_second": 1.424,
"step": 125
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.02213912270963192,
"learning_rate": 9.468163201617062e-05,
"loss": 0.0178,
"step": 126
},
{
"epoch": 1.175925925925926,
"grad_norm": 0.03320689871907234,
"learning_rate": 9.453564351559348e-05,
"loss": 0.0148,
"step": 127
},
{
"epoch": 1.1851851851851851,
"grad_norm": 0.023370925337076187,
"learning_rate": 9.438779407049281e-05,
"loss": 0.0174,
"step": 128
},
{
"epoch": 1.1944444444444444,
"grad_norm": 0.02848099358379841,
"learning_rate": 9.423808985883289e-05,
"loss": 0.0174,
"step": 129
},
{
"epoch": 1.2037037037037037,
"grad_norm": 0.02608056552708149,
"learning_rate": 9.40865371360804e-05,
"loss": 0.0171,
"step": 130
},
{
"epoch": 1.2037037037037037,
"eval_loss": 0.018851976841688156,
"eval_runtime": 9.1046,
"eval_samples_per_second": 5.492,
"eval_steps_per_second": 1.428,
"step": 130
},
{
"epoch": 1.212962962962963,
"grad_norm": 0.02152630314230919,
"learning_rate": 9.393314223494296e-05,
"loss": 0.0172,
"step": 131
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.02550230175256729,
"learning_rate": 9.377791156510455e-05,
"loss": 0.016,
"step": 132
},
{
"epoch": 1.2314814814814814,
"grad_norm": 0.025004474446177483,
"learning_rate": 9.362085161295769e-05,
"loss": 0.0163,
"step": 133
},
{
"epoch": 1.2407407407407407,
"grad_norm": 0.026416007429361343,
"learning_rate": 9.346196894133239e-05,
"loss": 0.0165,
"step": 134
},
{
"epoch": 1.25,
"grad_norm": 0.029432326555252075,
"learning_rate": 9.330127018922194e-05,
"loss": 0.0191,
"step": 135
},
{
"epoch": 1.25,
"eval_loss": 0.019194327294826508,
"eval_runtime": 9.1131,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.427,
"step": 135
},
{
"epoch": 1.2592592592592593,
"grad_norm": 0.03440408781170845,
"learning_rate": 9.313876207150543e-05,
"loss": 0.0165,
"step": 136
},
{
"epoch": 1.2685185185185186,
"grad_norm": 0.025614989921450615,
"learning_rate": 9.297445137866727e-05,
"loss": 0.0162,
"step": 137
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.02456337958574295,
"learning_rate": 9.280834497651334e-05,
"loss": 0.0192,
"step": 138
},
{
"epoch": 1.287037037037037,
"grad_norm": 0.051101330667734146,
"learning_rate": 9.264044980588416e-05,
"loss": 0.015,
"step": 139
},
{
"epoch": 1.2962962962962963,
"grad_norm": 0.03369716554880142,
"learning_rate": 9.247077288236488e-05,
"loss": 0.0184,
"step": 140
},
{
"epoch": 1.2962962962962963,
"eval_loss": 0.018648317083716393,
"eval_runtime": 9.1079,
"eval_samples_per_second": 5.49,
"eval_steps_per_second": 1.427,
"step": 140
},
{
"epoch": 1.3055555555555556,
"grad_norm": 0.024168213829398155,
"learning_rate": 9.229932129599205e-05,
"loss": 0.0166,
"step": 141
},
{
"epoch": 1.3148148148148149,
"grad_norm": 0.027960045263171196,
"learning_rate": 9.212610221095748e-05,
"loss": 0.0157,
"step": 142
},
{
"epoch": 1.324074074074074,
"grad_norm": 0.023985836654901505,
"learning_rate": 9.195112286530873e-05,
"loss": 0.0178,
"step": 143
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.026084545999765396,
"learning_rate": 9.177439057064683e-05,
"loss": 0.0164,
"step": 144
},
{
"epoch": 1.3425925925925926,
"grad_norm": 0.022582337260246277,
"learning_rate": 9.159591271182058e-05,
"loss": 0.0162,
"step": 145
},
{
"epoch": 1.3425925925925926,
"eval_loss": 0.018656810745596886,
"eval_runtime": 9.1149,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 145
},
{
"epoch": 1.3518518518518519,
"grad_norm": 0.030290907248854637,
"learning_rate": 9.141569674661817e-05,
"loss": 0.021,
"step": 146
},
{
"epoch": 1.3611111111111112,
"grad_norm": 0.026109322905540466,
"learning_rate": 9.123375020545535e-05,
"loss": 0.0162,
"step": 147
},
{
"epoch": 1.3703703703703702,
"grad_norm": 0.02652176469564438,
"learning_rate": 9.105008069106093e-05,
"loss": 0.0169,
"step": 148
},
{
"epoch": 1.3796296296296298,
"grad_norm": 0.024147020652890205,
"learning_rate": 9.086469587815904e-05,
"loss": 0.0162,
"step": 149
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.021294649690389633,
"learning_rate": 9.067760351314838e-05,
"loss": 0.0165,
"step": 150
},
{
"epoch": 1.3888888888888888,
"eval_loss": 0.018213987350463867,
"eval_runtime": 9.1247,
"eval_samples_per_second": 5.48,
"eval_steps_per_second": 1.425,
"step": 150
},
{
"epoch": 1.3981481481481481,
"grad_norm": 0.02462903782725334,
"learning_rate": 9.048881141377863e-05,
"loss": 0.0204,
"step": 151
},
{
"epoch": 1.4074074074074074,
"grad_norm": 0.024652326479554176,
"learning_rate": 9.029832746882371e-05,
"loss": 0.0164,
"step": 152
},
{
"epoch": 1.4166666666666667,
"grad_norm": 0.026834659278392792,
"learning_rate": 9.01061596377522e-05,
"loss": 0.018,
"step": 153
},
{
"epoch": 1.425925925925926,
"grad_norm": 0.02342064492404461,
"learning_rate": 8.991231595039465e-05,
"loss": 0.0156,
"step": 154
},
{
"epoch": 1.4351851851851851,
"grad_norm": 0.026441222056746483,
"learning_rate": 8.97168045066082e-05,
"loss": 0.0157,
"step": 155
},
{
"epoch": 1.4351851851851851,
"eval_loss": 0.01855114847421646,
"eval_runtime": 9.124,
"eval_samples_per_second": 5.48,
"eval_steps_per_second": 1.425,
"step": 155
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.01796615496277809,
"learning_rate": 8.951963347593797e-05,
"loss": 0.0165,
"step": 156
},
{
"epoch": 1.4537037037037037,
"grad_norm": 0.02256671153008938,
"learning_rate": 8.932081109727582e-05,
"loss": 0.0201,
"step": 157
},
{
"epoch": 1.462962962962963,
"grad_norm": 0.028528334572911263,
"learning_rate": 8.912034567851599e-05,
"loss": 0.0182,
"step": 158
},
{
"epoch": 1.4722222222222223,
"grad_norm": 0.029104968532919884,
"learning_rate": 8.891824559620801e-05,
"loss": 0.0153,
"step": 159
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.02003669925034046,
"learning_rate": 8.871451929520663e-05,
"loss": 0.0159,
"step": 160
},
{
"epoch": 1.4814814814814814,
"eval_loss": 0.01888095587491989,
"eval_runtime": 9.1172,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 160
},
{
"epoch": 1.4907407407407407,
"grad_norm": 0.019447356462478638,
"learning_rate": 8.850917528831899e-05,
"loss": 0.0163,
"step": 161
},
{
"epoch": 1.5,
"grad_norm": 0.03438901901245117,
"learning_rate": 8.83022221559489e-05,
"loss": 0.0125,
"step": 162
},
{
"epoch": 1.5092592592592593,
"grad_norm": 0.026535626500844955,
"learning_rate": 8.809366854573831e-05,
"loss": 0.0175,
"step": 163
},
{
"epoch": 1.5185185185185186,
"grad_norm": 0.029025647789239883,
"learning_rate": 8.78835231722059e-05,
"loss": 0.0164,
"step": 164
},
{
"epoch": 1.5277777777777777,
"grad_norm": 0.025528129190206528,
"learning_rate": 8.767179481638303e-05,
"loss": 0.0174,
"step": 165
},
{
"epoch": 1.5277777777777777,
"eval_loss": 0.018690049648284912,
"eval_runtime": 9.1481,
"eval_samples_per_second": 5.466,
"eval_steps_per_second": 1.421,
"step": 165
},
{
"epoch": 1.5370370370370372,
"grad_norm": 0.025675086304545403,
"learning_rate": 8.745849232544681e-05,
"loss": 0.0179,
"step": 166
},
{
"epoch": 1.5462962962962963,
"grad_norm": 0.027451254427433014,
"learning_rate": 8.724362461235029e-05,
"loss": 0.0169,
"step": 167
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.026652028784155846,
"learning_rate": 8.702720065545024e-05,
"loss": 0.0168,
"step": 168
},
{
"epoch": 1.5648148148148149,
"grad_norm": 0.030202018097043037,
"learning_rate": 8.680922949813178e-05,
"loss": 0.0162,
"step": 169
},
{
"epoch": 1.574074074074074,
"grad_norm": 0.027389824390411377,
"learning_rate": 8.658972024843062e-05,
"loss": 0.0184,
"step": 170
},
{
"epoch": 1.574074074074074,
"eval_loss": 0.018272995948791504,
"eval_runtime": 9.1448,
"eval_samples_per_second": 5.468,
"eval_steps_per_second": 1.422,
"step": 170
},
{
"epoch": 1.5833333333333335,
"grad_norm": 0.025648167356848717,
"learning_rate": 8.636868207865244e-05,
"loss": 0.0152,
"step": 171
},
{
"epoch": 1.5925925925925926,
"grad_norm": 0.02472120150923729,
"learning_rate": 8.614612422498964e-05,
"loss": 0.0153,
"step": 172
},
{
"epoch": 1.6018518518518519,
"grad_norm": 0.020042769610881805,
"learning_rate": 8.592205598713539e-05,
"loss": 0.017,
"step": 173
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.029423648491501808,
"learning_rate": 8.569648672789497e-05,
"loss": 0.0158,
"step": 174
},
{
"epoch": 1.6203703703703702,
"grad_norm": 0.02159775421023369,
"learning_rate": 8.546942587279465e-05,
"loss": 0.0165,
"step": 175
},
{
"epoch": 1.6203703703703702,
"eval_loss": 0.018273252993822098,
"eval_runtime": 9.118,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 175
},
{
"epoch": 1.6296296296296298,
"grad_norm": 0.024837305769324303,
"learning_rate": 8.524088290968781e-05,
"loss": 0.0187,
"step": 176
},
{
"epoch": 1.6388888888888888,
"grad_norm": 0.02383432537317276,
"learning_rate": 8.501086738835843e-05,
"loss": 0.0181,
"step": 177
},
{
"epoch": 1.6481481481481481,
"grad_norm": 0.025743911042809486,
"learning_rate": 8.47793889201221e-05,
"loss": 0.0171,
"step": 178
},
{
"epoch": 1.6574074074074074,
"grad_norm": 0.023100929334759712,
"learning_rate": 8.45464571774244e-05,
"loss": 0.021,
"step": 179
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.02667200192809105,
"learning_rate": 8.43120818934367e-05,
"loss": 0.0173,
"step": 180
},
{
"epoch": 1.6666666666666665,
"eval_loss": 0.01778573729097843,
"eval_runtime": 9.1324,
"eval_samples_per_second": 5.475,
"eval_steps_per_second": 1.424,
"step": 180
},
{
"epoch": 1.675925925925926,
"grad_norm": 0.02880384773015976,
"learning_rate": 8.407627286164948e-05,
"loss": 0.015,
"step": 181
},
{
"epoch": 1.6851851851851851,
"grad_norm": 0.030301645398139954,
"learning_rate": 8.383903993546311e-05,
"loss": 0.0157,
"step": 182
},
{
"epoch": 1.6944444444444444,
"grad_norm": 0.021445374935865402,
"learning_rate": 8.360039302777612e-05,
"loss": 0.0181,
"step": 183
},
{
"epoch": 1.7037037037037037,
"grad_norm": 0.023577649146318436,
"learning_rate": 8.336034211057098e-05,
"loss": 0.0153,
"step": 184
},
{
"epoch": 1.7129629629629628,
"grad_norm": 0.02492811530828476,
"learning_rate": 8.31188972144974e-05,
"loss": 0.0131,
"step": 185
},
{
"epoch": 1.7129629629629628,
"eval_loss": 0.017187727615237236,
"eval_runtime": 9.1252,
"eval_samples_per_second": 5.479,
"eval_steps_per_second": 1.425,
"step": 185
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.023155970498919487,
"learning_rate": 8.28760684284532e-05,
"loss": 0.0162,
"step": 186
},
{
"epoch": 1.7314814814814814,
"grad_norm": 0.02491271123290062,
"learning_rate": 8.263186589916273e-05,
"loss": 0.0137,
"step": 187
},
{
"epoch": 1.7407407407407407,
"grad_norm": 0.02165275253355503,
"learning_rate": 8.238629983075294e-05,
"loss": 0.0143,
"step": 188
},
{
"epoch": 1.75,
"grad_norm": 0.024284129962325096,
"learning_rate": 8.213938048432697e-05,
"loss": 0.0144,
"step": 189
},
{
"epoch": 1.7592592592592593,
"grad_norm": 0.027395077049732208,
"learning_rate": 8.18911181775353e-05,
"loss": 0.0132,
"step": 190
},
{
"epoch": 1.7592592592592593,
"eval_loss": 0.018012873828411102,
"eval_runtime": 9.1149,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 190
},
{
"epoch": 1.7685185185185186,
"grad_norm": 0.02639261819422245,
"learning_rate": 8.164152328414476e-05,
"loss": 0.0156,
"step": 191
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.02319464646279812,
"learning_rate": 8.139060623360493e-05,
"loss": 0.0121,
"step": 192
},
{
"epoch": 1.7870370370370372,
"grad_norm": 0.020444169640541077,
"learning_rate": 8.113837751061246e-05,
"loss": 0.0156,
"step": 193
},
{
"epoch": 1.7962962962962963,
"grad_norm": 0.03843529522418976,
"learning_rate": 8.088484765467286e-05,
"loss": 0.0202,
"step": 194
},
{
"epoch": 1.8055555555555556,
"grad_norm": 0.03014414757490158,
"learning_rate": 8.063002725966015e-05,
"loss": 0.0157,
"step": 195
},
{
"epoch": 1.8055555555555556,
"eval_loss": 0.018071575090289116,
"eval_runtime": 9.1428,
"eval_samples_per_second": 5.469,
"eval_steps_per_second": 1.422,
"step": 195
},
{
"epoch": 1.8148148148148149,
"grad_norm": 0.028225911781191826,
"learning_rate": 8.037392697337418e-05,
"loss": 0.0152,
"step": 196
},
{
"epoch": 1.824074074074074,
"grad_norm": 0.022350864484906197,
"learning_rate": 8.011655749709575e-05,
"loss": 0.0147,
"step": 197
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.023073699325323105,
"learning_rate": 7.985792958513931e-05,
"loss": 0.0142,
"step": 198
},
{
"epoch": 1.8425925925925926,
"grad_norm": 0.027160046622157097,
"learning_rate": 7.95980540444038e-05,
"loss": 0.0181,
"step": 199
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.02501911297440529,
"learning_rate": 7.93369417339209e-05,
"loss": 0.0154,
"step": 200
},
{
"epoch": 1.8518518518518519,
"eval_loss": 0.01711750030517578,
"eval_runtime": 9.1469,
"eval_samples_per_second": 5.466,
"eval_steps_per_second": 1.421,
"step": 200
},
{
"epoch": 1.8611111111111112,
"grad_norm": 0.02209513448178768,
"learning_rate": 7.907460356440133e-05,
"loss": 0.0156,
"step": 201
},
{
"epoch": 1.8703703703703702,
"grad_norm": 0.022372853010892868,
"learning_rate": 7.881105049777901e-05,
"loss": 0.0182,
"step": 202
},
{
"epoch": 1.8796296296296298,
"grad_norm": 0.02874351665377617,
"learning_rate": 7.854629354675291e-05,
"loss": 0.0145,
"step": 203
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.025754928588867188,
"learning_rate": 7.828034377432693e-05,
"loss": 0.0161,
"step": 204
},
{
"epoch": 1.8981481481481481,
"grad_norm": 0.023868247866630554,
"learning_rate": 7.801321229334764e-05,
"loss": 0.0139,
"step": 205
},
{
"epoch": 1.8981481481481481,
"eval_loss": 0.01687374897301197,
"eval_runtime": 9.1148,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 205
},
{
"epoch": 1.9074074074074074,
"grad_norm": 0.02167942002415657,
"learning_rate": 7.774491026603985e-05,
"loss": 0.0172,
"step": 206
},
{
"epoch": 1.9166666666666665,
"grad_norm": 0.028955647721886635,
"learning_rate": 7.74754489035403e-05,
"loss": 0.0182,
"step": 207
},
{
"epoch": 1.925925925925926,
"grad_norm": 0.023490311577916145,
"learning_rate": 7.720483946542914e-05,
"loss": 0.0176,
"step": 208
},
{
"epoch": 1.9351851851851851,
"grad_norm": 0.02635806053876877,
"learning_rate": 7.69330932592594e-05,
"loss": 0.0149,
"step": 209
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.02554040215909481,
"learning_rate": 7.666022164008457e-05,
"loss": 0.0169,
"step": 210
},
{
"epoch": 1.9444444444444444,
"eval_loss": 0.016974864527583122,
"eval_runtime": 9.1008,
"eval_samples_per_second": 5.494,
"eval_steps_per_second": 1.428,
"step": 210
},
{
"epoch": 1.9537037037037037,
"grad_norm": 0.02924305759370327,
"learning_rate": 7.63862360099841e-05,
"loss": 0.0148,
"step": 211
},
{
"epoch": 1.9629629629629628,
"grad_norm": 0.020948631688952446,
"learning_rate": 7.611114781758692e-05,
"loss": 0.0158,
"step": 212
},
{
"epoch": 1.9722222222222223,
"grad_norm": 0.021703558042645454,
"learning_rate": 7.583496855759316e-05,
"loss": 0.0172,
"step": 213
},
{
"epoch": 1.9814814814814814,
"grad_norm": 0.022922605276107788,
"learning_rate": 7.555770977029367e-05,
"loss": 0.0149,
"step": 214
},
{
"epoch": 1.9907407407407407,
"grad_norm": 0.025769095867872238,
"learning_rate": 7.527938304108795e-05,
"loss": 0.0158,
"step": 215
},
{
"epoch": 1.9907407407407407,
"eval_loss": 0.017042405903339386,
"eval_runtime": 9.1168,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 215
},
{
"epoch": 2.0,
"grad_norm": 0.03371057286858559,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0126,
"step": 216
},
{
"epoch": 2.009259259259259,
"grad_norm": 0.01711084321141243,
"learning_rate": 7.471957232119234e-05,
"loss": 0.0142,
"step": 217
},
{
"epoch": 2.0185185185185186,
"grad_norm": 0.023618614301085472,
"learning_rate": 7.443811172247821e-05,
"loss": 0.0151,
"step": 218
},
{
"epoch": 2.0277777777777777,
"grad_norm": 0.02181304432451725,
"learning_rate": 7.415562996483192e-05,
"loss": 0.0132,
"step": 219
},
{
"epoch": 2.037037037037037,
"grad_norm": 0.020521776750683784,
"learning_rate": 7.387213885189746e-05,
"loss": 0.0139,
"step": 220
},
{
"epoch": 2.037037037037037,
"eval_loss": 0.01702064275741577,
"eval_runtime": 9.1369,
"eval_samples_per_second": 5.472,
"eval_steps_per_second": 1.423,
"step": 220
},
{
"epoch": 2.0462962962962963,
"grad_norm": 0.022209780290722847,
"learning_rate": 7.358765022949519e-05,
"loss": 0.0152,
"step": 221
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.02240665629506111,
"learning_rate": 7.330217598512695e-05,
"loss": 0.0136,
"step": 222
},
{
"epoch": 2.064814814814815,
"grad_norm": 0.024021176621317863,
"learning_rate": 7.30157280474793e-05,
"loss": 0.0134,
"step": 223
},
{
"epoch": 2.074074074074074,
"grad_norm": 0.022297382354736328,
"learning_rate": 7.272831838592503e-05,
"loss": 0.0158,
"step": 224
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.023189576342701912,
"learning_rate": 7.243995901002312e-05,
"loss": 0.0146,
"step": 225
},
{
"epoch": 2.0833333333333335,
"eval_loss": 0.017011733725667,
"eval_runtime": 9.1385,
"eval_samples_per_second": 5.471,
"eval_steps_per_second": 1.423,
"step": 225
},
{
"epoch": 2.0925925925925926,
"grad_norm": 0.02641259878873825,
"learning_rate": 7.215066196901676e-05,
"loss": 0.0149,
"step": 226
},
{
"epoch": 2.1018518518518516,
"grad_norm": 0.02105395309627056,
"learning_rate": 7.186043935133005e-05,
"loss": 0.0105,
"step": 227
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.020818866789340973,
"learning_rate": 7.156930328406268e-05,
"loss": 0.0144,
"step": 228
},
{
"epoch": 2.1203703703703702,
"grad_norm": 0.028699271380901337,
"learning_rate": 7.127726593248337e-05,
"loss": 0.0134,
"step": 229
},
{
"epoch": 2.1296296296296298,
"grad_norm": 0.025844816118478775,
"learning_rate": 7.098433949952146e-05,
"loss": 0.0115,
"step": 230
},
{
"epoch": 2.1296296296296298,
"eval_loss": 0.017404422163963318,
"eval_runtime": 9.1138,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 230
},
{
"epoch": 2.138888888888889,
"grad_norm": 0.02628181129693985,
"learning_rate": 7.069053622525696e-05,
"loss": 0.0135,
"step": 231
},
{
"epoch": 2.148148148148148,
"grad_norm": 0.03826741501688957,
"learning_rate": 7.039586838640919e-05,
"loss": 0.013,
"step": 232
},
{
"epoch": 2.1574074074074074,
"grad_norm": 0.02549687772989273,
"learning_rate": 7.01003482958237e-05,
"loss": 0.0112,
"step": 233
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.02850032038986683,
"learning_rate": 6.980398830195785e-05,
"loss": 0.0114,
"step": 234
},
{
"epoch": 2.175925925925926,
"grad_norm": 0.028789905831217766,
"learning_rate": 6.950680078836474e-05,
"loss": 0.0138,
"step": 235
},
{
"epoch": 2.175925925925926,
"eval_loss": 0.016838619485497475,
"eval_runtime": 9.1141,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 235
},
{
"epoch": 2.185185185185185,
"grad_norm": 0.024276968091726303,
"learning_rate": 6.920879817317589e-05,
"loss": 0.0156,
"step": 236
},
{
"epoch": 2.1944444444444446,
"grad_norm": 0.02652347832918167,
"learning_rate": 6.890999290858214e-05,
"loss": 0.0111,
"step": 237
},
{
"epoch": 2.2037037037037037,
"grad_norm": 0.03363705053925514,
"learning_rate": 6.861039748031351e-05,
"loss": 0.0155,
"step": 238
},
{
"epoch": 2.212962962962963,
"grad_norm": 0.025364842265844345,
"learning_rate": 6.83100244071174e-05,
"loss": 0.0127,
"step": 239
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.024912815541028976,
"learning_rate": 6.800888624023553e-05,
"loss": 0.0138,
"step": 240
},
{
"epoch": 2.2222222222222223,
"eval_loss": 0.017057882621884346,
"eval_runtime": 9.1505,
"eval_samples_per_second": 5.464,
"eval_steps_per_second": 1.421,
"step": 240
},
{
"epoch": 2.2314814814814814,
"grad_norm": 0.031296826899051666,
"learning_rate": 6.770699556287939e-05,
"loss": 0.0138,
"step": 241
},
{
"epoch": 2.240740740740741,
"grad_norm": 0.03207860141992569,
"learning_rate": 6.740436498970452e-05,
"loss": 0.0128,
"step": 242
},
{
"epoch": 2.25,
"grad_norm": 0.027626443654298782,
"learning_rate": 6.710100716628344e-05,
"loss": 0.0142,
"step": 243
},
{
"epoch": 2.259259259259259,
"grad_norm": 0.025963863357901573,
"learning_rate": 6.679693476857711e-05,
"loss": 0.0137,
"step": 244
},
{
"epoch": 2.2685185185185186,
"grad_norm": 0.022552739828824997,
"learning_rate": 6.649216050240539e-05,
"loss": 0.0134,
"step": 245
},
{
"epoch": 2.2685185185185186,
"eval_loss": 0.016679909080266953,
"eval_runtime": 9.1095,
"eval_samples_per_second": 5.489,
"eval_steps_per_second": 1.427,
"step": 245
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.0247825738042593,
"learning_rate": 6.618669710291606e-05,
"loss": 0.0116,
"step": 246
},
{
"epoch": 2.287037037037037,
"grad_norm": 0.021808508783578873,
"learning_rate": 6.588055733405266e-05,
"loss": 0.014,
"step": 247
},
{
"epoch": 2.2962962962962963,
"grad_norm": 0.025087367743253708,
"learning_rate": 6.557375398802123e-05,
"loss": 0.0167,
"step": 248
},
{
"epoch": 2.3055555555555554,
"grad_norm": 0.022722622379660606,
"learning_rate": 6.526629988475567e-05,
"loss": 0.013,
"step": 249
},
{
"epoch": 2.314814814814815,
"grad_norm": 0.023495636880397797,
"learning_rate": 6.495820787138209e-05,
"loss": 0.0167,
"step": 250
},
{
"epoch": 2.314814814814815,
"eval_loss": 0.016377143561840057,
"eval_runtime": 9.1133,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 250
},
{
"epoch": 2.324074074074074,
"grad_norm": 0.021211953833699226,
"learning_rate": 6.464949082168204e-05,
"loss": 0.0125,
"step": 251
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.022748148068785667,
"learning_rate": 6.434016163555452e-05,
"loss": 0.0121,
"step": 252
},
{
"epoch": 2.3425925925925926,
"grad_norm": 0.021960506215691566,
"learning_rate": 6.403023323847695e-05,
"loss": 0.0159,
"step": 253
},
{
"epoch": 2.351851851851852,
"grad_norm": 0.02572719193994999,
"learning_rate": 6.371971858096508e-05,
"loss": 0.0137,
"step": 254
},
{
"epoch": 2.361111111111111,
"grad_norm": 0.027611717581748962,
"learning_rate": 6.340863063803188e-05,
"loss": 0.0123,
"step": 255
},
{
"epoch": 2.361111111111111,
"eval_loss": 0.016414109617471695,
"eval_runtime": 9.1093,
"eval_samples_per_second": 5.489,
"eval_steps_per_second": 1.427,
"step": 255
},
{
"epoch": 2.3703703703703702,
"grad_norm": 0.026147907599806786,
"learning_rate": 6.30969824086453e-05,
"loss": 0.012,
"step": 256
},
{
"epoch": 2.3796296296296298,
"grad_norm": 0.026667073369026184,
"learning_rate": 6.27847869151852e-05,
"loss": 0.0127,
"step": 257
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.023840012028813362,
"learning_rate": 6.247205720289907e-05,
"loss": 0.0141,
"step": 258
},
{
"epoch": 2.398148148148148,
"grad_norm": 0.028697500005364418,
"learning_rate": 6.215880633935708e-05,
"loss": 0.0135,
"step": 259
},
{
"epoch": 2.4074074074074074,
"grad_norm": 0.029124466702342033,
"learning_rate": 6.184504741390596e-05,
"loss": 0.0139,
"step": 260
},
{
"epoch": 2.4074074074074074,
"eval_loss": 0.016279693692922592,
"eval_runtime": 9.1162,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 260
},
{
"epoch": 2.4166666666666665,
"grad_norm": 0.020265506580471992,
"learning_rate": 6.153079353712201e-05,
"loss": 0.0129,
"step": 261
},
{
"epoch": 2.425925925925926,
"grad_norm": 0.020486822351813316,
"learning_rate": 6.121605784026339e-05,
"loss": 0.0114,
"step": 262
},
{
"epoch": 2.435185185185185,
"grad_norm": 0.02432914823293686,
"learning_rate": 6.09008534747213e-05,
"loss": 0.0138,
"step": 263
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.027614833787083626,
"learning_rate": 6.058519361147055e-05,
"loss": 0.0118,
"step": 264
},
{
"epoch": 2.4537037037037037,
"grad_norm": 0.03493235632777214,
"learning_rate": 6.02690914405191e-05,
"loss": 0.0125,
"step": 265
},
{
"epoch": 2.4537037037037037,
"eval_loss": 0.016143780201673508,
"eval_runtime": 9.2054,
"eval_samples_per_second": 5.432,
"eval_steps_per_second": 1.412,
"step": 265
},
{
"epoch": 2.462962962962963,
"grad_norm": 0.024250265210866928,
"learning_rate": 5.995256017035703e-05,
"loss": 0.0139,
"step": 266
},
{
"epoch": 2.4722222222222223,
"grad_norm": 0.022808292880654335,
"learning_rate": 5.963561302740449e-05,
"loss": 0.0162,
"step": 267
},
{
"epoch": 2.4814814814814814,
"grad_norm": 0.03109206259250641,
"learning_rate": 5.9318263255459116e-05,
"loss": 0.0123,
"step": 268
},
{
"epoch": 2.490740740740741,
"grad_norm": 0.02985144406557083,
"learning_rate": 5.900052411514257e-05,
"loss": 0.015,
"step": 269
},
{
"epoch": 2.5,
"grad_norm": 0.024866314604878426,
"learning_rate": 5.868240888334653e-05,
"loss": 0.0126,
"step": 270
},
{
"epoch": 2.5,
"eval_loss": 0.016046511009335518,
"eval_runtime": 9.1128,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.427,
"step": 270
},
{
"epoch": 2.5092592592592595,
"grad_norm": 0.0215854924172163,
"learning_rate": 5.836393085267776e-05,
"loss": 0.0133,
"step": 271
},
{
"epoch": 2.5185185185185186,
"grad_norm": 0.02321489341557026,
"learning_rate": 5.804510333090287e-05,
"loss": 0.0175,
"step": 272
},
{
"epoch": 2.5277777777777777,
"grad_norm": 0.024908283725380898,
"learning_rate": 5.772593964039203e-05,
"loss": 0.0116,
"step": 273
},
{
"epoch": 2.537037037037037,
"grad_norm": 0.02571980282664299,
"learning_rate": 5.740645311756245e-05,
"loss": 0.0125,
"step": 274
},
{
"epoch": 2.5462962962962963,
"grad_norm": 0.022897284477949142,
"learning_rate": 5.708665711232103e-05,
"loss": 0.0138,
"step": 275
},
{
"epoch": 2.5462962962962963,
"eval_loss": 0.016013609245419502,
"eval_runtime": 9.1743,
"eval_samples_per_second": 5.45,
"eval_steps_per_second": 1.417,
"step": 275
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.023732876405119896,
"learning_rate": 5.6766564987506566e-05,
"loss": 0.0136,
"step": 276
},
{
"epoch": 2.564814814814815,
"grad_norm": 0.024980880320072174,
"learning_rate": 5.644619011833133e-05,
"loss": 0.0131,
"step": 277
},
{
"epoch": 2.574074074074074,
"grad_norm": 0.023262949660420418,
"learning_rate": 5.6125545891822274e-05,
"loss": 0.0143,
"step": 278
},
{
"epoch": 2.5833333333333335,
"grad_norm": 0.024468230083584785,
"learning_rate": 5.5804645706261514e-05,
"loss": 0.0148,
"step": 279
},
{
"epoch": 2.5925925925925926,
"grad_norm": 0.020350055769085884,
"learning_rate": 5.548350297062659e-05,
"loss": 0.0125,
"step": 280
},
{
"epoch": 2.5925925925925926,
"eval_loss": 0.015153205953538418,
"eval_runtime": 9.1126,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.427,
"step": 280
},
{
"epoch": 2.601851851851852,
"grad_norm": 0.027165360748767853,
"learning_rate": 5.516213110403009e-05,
"loss": 0.0093,
"step": 281
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.021070580929517746,
"learning_rate": 5.484054353515896e-05,
"loss": 0.0138,
"step": 282
},
{
"epoch": 2.6203703703703702,
"grad_norm": 0.025997430086135864,
"learning_rate": 5.451875370171341e-05,
"loss": 0.0121,
"step": 283
},
{
"epoch": 2.6296296296296298,
"grad_norm": 0.02517426759004593,
"learning_rate": 5.419677504984534e-05,
"loss": 0.0126,
"step": 284
},
{
"epoch": 2.638888888888889,
"grad_norm": 0.025812286883592606,
"learning_rate": 5.387462103359655e-05,
"loss": 0.0133,
"step": 285
},
{
"epoch": 2.638888888888889,
"eval_loss": 0.016152961179614067,
"eval_runtime": 9.1127,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.427,
"step": 285
},
{
"epoch": 2.648148148148148,
"grad_norm": 0.02393972873687744,
"learning_rate": 5.355230511433651e-05,
"loss": 0.0136,
"step": 286
},
{
"epoch": 2.6574074074074074,
"grad_norm": 0.021706297993659973,
"learning_rate": 5.32298407601999e-05,
"loss": 0.0133,
"step": 287
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.026299407705664635,
"learning_rate": 5.290724144552379e-05,
"loss": 0.0143,
"step": 288
},
{
"epoch": 2.675925925925926,
"grad_norm": 0.030511364340782166,
"learning_rate": 5.258452065028473e-05,
"loss": 0.0137,
"step": 289
},
{
"epoch": 2.685185185185185,
"grad_norm": 0.024854540824890137,
"learning_rate": 5.226169185953532e-05,
"loss": 0.0125,
"step": 290
},
{
"epoch": 2.685185185185185,
"eval_loss": 0.016076602041721344,
"eval_runtime": 9.1632,
"eval_samples_per_second": 5.457,
"eval_steps_per_second": 1.419,
"step": 290
},
{
"epoch": 2.6944444444444446,
"grad_norm": 0.022800520062446594,
"learning_rate": 5.193876856284085e-05,
"loss": 0.012,
"step": 291
},
{
"epoch": 2.7037037037037037,
"grad_norm": 0.021870015189051628,
"learning_rate": 5.1615764253715536e-05,
"loss": 0.0136,
"step": 292
},
{
"epoch": 2.712962962962963,
"grad_norm": 0.020156167447566986,
"learning_rate": 5.129269242905882e-05,
"loss": 0.012,
"step": 293
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.019064266234636307,
"learning_rate": 5.096956658859122e-05,
"loss": 0.0137,
"step": 294
},
{
"epoch": 2.7314814814814814,
"grad_norm": 0.027288921177387238,
"learning_rate": 5.064640023429043e-05,
"loss": 0.0147,
"step": 295
},
{
"epoch": 2.7314814814814814,
"eval_loss": 0.01584070920944214,
"eval_runtime": 9.1151,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 295
},
{
"epoch": 2.7407407407407405,
"grad_norm": 0.02484748885035515,
"learning_rate": 5.0323206869826966e-05,
"loss": 0.0111,
"step": 296
},
{
"epoch": 2.75,
"grad_norm": 0.02521962858736515,
"learning_rate": 5e-05,
"loss": 0.0134,
"step": 297
},
{
"epoch": 2.7592592592592595,
"grad_norm": 0.023346634581685066,
"learning_rate": 4.967679313017303e-05,
"loss": 0.0124,
"step": 298
},
{
"epoch": 2.7685185185185186,
"grad_norm": 0.021654650568962097,
"learning_rate": 4.9353599765709584e-05,
"loss": 0.0144,
"step": 299
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.021227596327662468,
"learning_rate": 4.903043341140879e-05,
"loss": 0.0134,
"step": 300
},
{
"epoch": 2.7777777777777777,
"eval_loss": 0.016122175380587578,
"eval_runtime": 9.1019,
"eval_samples_per_second": 5.493,
"eval_steps_per_second": 1.428,
"step": 300
},
{
"epoch": 2.787037037037037,
"grad_norm": 0.024656914174556732,
"learning_rate": 4.870730757094121e-05,
"loss": 0.0123,
"step": 301
},
{
"epoch": 2.7962962962962963,
"grad_norm": 0.02583468146622181,
"learning_rate": 4.8384235746284476e-05,
"loss": 0.015,
"step": 302
},
{
"epoch": 2.8055555555555554,
"grad_norm": 0.022909915074706078,
"learning_rate": 4.806123143715916e-05,
"loss": 0.0142,
"step": 303
},
{
"epoch": 2.814814814814815,
"grad_norm": 0.02014041878283024,
"learning_rate": 4.7738308140464685e-05,
"loss": 0.0131,
"step": 304
},
{
"epoch": 2.824074074074074,
"grad_norm": 0.022683143615722656,
"learning_rate": 4.7415479349715275e-05,
"loss": 0.0124,
"step": 305
},
{
"epoch": 2.824074074074074,
"eval_loss": 0.015797268599271774,
"eval_runtime": 9.1281,
"eval_samples_per_second": 5.478,
"eval_steps_per_second": 1.424,
"step": 305
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.025906002148985863,
"learning_rate": 4.709275855447621e-05,
"loss": 0.0154,
"step": 306
},
{
"epoch": 2.8425925925925926,
"grad_norm": 0.027820315212011337,
"learning_rate": 4.677015923980011e-05,
"loss": 0.0138,
"step": 307
},
{
"epoch": 2.851851851851852,
"grad_norm": 0.023744860664010048,
"learning_rate": 4.6447694885663514e-05,
"loss": 0.0124,
"step": 308
},
{
"epoch": 2.861111111111111,
"grad_norm": 0.026518192142248154,
"learning_rate": 4.612537896640346e-05,
"loss": 0.0155,
"step": 309
},
{
"epoch": 2.8703703703703702,
"grad_norm": 0.020426657050848007,
"learning_rate": 4.5803224950154656e-05,
"loss": 0.0132,
"step": 310
},
{
"epoch": 2.8703703703703702,
"eval_loss": 0.015400240197777748,
"eval_runtime": 9.1185,
"eval_samples_per_second": 5.483,
"eval_steps_per_second": 1.426,
"step": 310
},
{
"epoch": 2.8796296296296298,
"grad_norm": 0.022766800597310066,
"learning_rate": 4.54812462982866e-05,
"loss": 0.0139,
"step": 311
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.021728193387389183,
"learning_rate": 4.515945646484105e-05,
"loss": 0.0133,
"step": 312
},
{
"epoch": 2.898148148148148,
"grad_norm": 0.0226016603410244,
"learning_rate": 4.4837868895969936e-05,
"loss": 0.0126,
"step": 313
},
{
"epoch": 2.9074074074074074,
"grad_norm": 0.027723975479602814,
"learning_rate": 4.451649702937342e-05,
"loss": 0.0106,
"step": 314
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.01856391504406929,
"learning_rate": 4.4195354293738484e-05,
"loss": 0.0146,
"step": 315
},
{
"epoch": 2.9166666666666665,
"eval_loss": 0.015166966244578362,
"eval_runtime": 9.1172,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 315
},
{
"epoch": 2.925925925925926,
"grad_norm": 0.019857853651046753,
"learning_rate": 4.387445410817774e-05,
"loss": 0.0124,
"step": 316
},
{
"epoch": 2.935185185185185,
"grad_norm": 0.025410892441868782,
"learning_rate": 4.355380988166867e-05,
"loss": 0.0119,
"step": 317
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.02312655746936798,
"learning_rate": 4.323343501249346e-05,
"loss": 0.0144,
"step": 318
},
{
"epoch": 2.9537037037037037,
"grad_norm": 0.022076064720749855,
"learning_rate": 4.2913342887678985e-05,
"loss": 0.0117,
"step": 319
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.023769903928041458,
"learning_rate": 4.259354688243757e-05,
"loss": 0.014,
"step": 320
},
{
"epoch": 2.962962962962963,
"eval_loss": 0.014957955107092857,
"eval_runtime": 9.1101,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 320
},
{
"epoch": 2.9722222222222223,
"grad_norm": 0.023904340341687202,
"learning_rate": 4.227406035960798e-05,
"loss": 0.0121,
"step": 321
},
{
"epoch": 2.9814814814814814,
"grad_norm": 0.02383498102426529,
"learning_rate": 4.195489666909713e-05,
"loss": 0.0119,
"step": 322
},
{
"epoch": 2.9907407407407405,
"grad_norm": 0.03048449568450451,
"learning_rate": 4.1636069147322246e-05,
"loss": 0.0136,
"step": 323
},
{
"epoch": 3.0,
"grad_norm": 0.023879334330558777,
"learning_rate": 4.131759111665349e-05,
"loss": 0.0137,
"step": 324
},
{
"epoch": 3.009259259259259,
"grad_norm": 0.025208691135048866,
"learning_rate": 4.099947588485744e-05,
"loss": 0.0122,
"step": 325
},
{
"epoch": 3.009259259259259,
"eval_loss": 0.015089023858308792,
"eval_runtime": 9.116,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 325
},
{
"epoch": 3.0185185185185186,
"grad_norm": 0.020718788728117943,
"learning_rate": 4.06817367445409e-05,
"loss": 0.0095,
"step": 326
},
{
"epoch": 3.0277777777777777,
"grad_norm": 0.024810951203107834,
"learning_rate": 4.036438697259551e-05,
"loss": 0.0134,
"step": 327
},
{
"epoch": 3.037037037037037,
"grad_norm": 0.019842958077788353,
"learning_rate": 4.004743982964298e-05,
"loss": 0.0122,
"step": 328
},
{
"epoch": 3.0462962962962963,
"grad_norm": 0.01818239875137806,
"learning_rate": 3.97309085594809e-05,
"loss": 0.0101,
"step": 329
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.022604303434491158,
"learning_rate": 3.941480638852948e-05,
"loss": 0.0118,
"step": 330
},
{
"epoch": 3.0555555555555554,
"eval_loss": 0.015503546223044395,
"eval_runtime": 9.1063,
"eval_samples_per_second": 5.491,
"eval_steps_per_second": 1.428,
"step": 330
},
{
"epoch": 3.064814814814815,
"grad_norm": 0.024690452963113785,
"learning_rate": 3.909914652527871e-05,
"loss": 0.0109,
"step": 331
},
{
"epoch": 3.074074074074074,
"grad_norm": 0.02343621291220188,
"learning_rate": 3.878394215973663e-05,
"loss": 0.0123,
"step": 332
},
{
"epoch": 3.0833333333333335,
"grad_norm": 0.026170087978243828,
"learning_rate": 3.846920646287799e-05,
"loss": 0.0122,
"step": 333
},
{
"epoch": 3.0925925925925926,
"grad_norm": 0.024799769744277,
"learning_rate": 3.815495258609404e-05,
"loss": 0.0125,
"step": 334
},
{
"epoch": 3.1018518518518516,
"grad_norm": 0.02072787657380104,
"learning_rate": 3.784119366064293e-05,
"loss": 0.0108,
"step": 335
},
{
"epoch": 3.1018518518518516,
"eval_loss": 0.0155374426394701,
"eval_runtime": 9.1152,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 335
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.021989421918988228,
"learning_rate": 3.752794279710094e-05,
"loss": 0.0114,
"step": 336
},
{
"epoch": 3.1203703703703702,
"grad_norm": 0.03829918056726456,
"learning_rate": 3.721521308481482e-05,
"loss": 0.0101,
"step": 337
},
{
"epoch": 3.1296296296296298,
"grad_norm": 0.029835987836122513,
"learning_rate": 3.6903017591354706e-05,
"loss": 0.0107,
"step": 338
},
{
"epoch": 3.138888888888889,
"grad_norm": 0.02231847681105137,
"learning_rate": 3.6591369361968124e-05,
"loss": 0.012,
"step": 339
},
{
"epoch": 3.148148148148148,
"grad_norm": 0.02263280376791954,
"learning_rate": 3.628028141903493e-05,
"loss": 0.0103,
"step": 340
},
{
"epoch": 3.148148148148148,
"eval_loss": 0.01546421181410551,
"eval_runtime": 9.1199,
"eval_samples_per_second": 5.483,
"eval_steps_per_second": 1.425,
"step": 340
},
{
"epoch": 3.1574074074074074,
"grad_norm": 0.023618226870894432,
"learning_rate": 3.596976676152306e-05,
"loss": 0.0116,
"step": 341
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.02577986940741539,
"learning_rate": 3.5659838364445505e-05,
"loss": 0.0108,
"step": 342
},
{
"epoch": 3.175925925925926,
"grad_norm": 0.026071948930621147,
"learning_rate": 3.535050917831797e-05,
"loss": 0.0108,
"step": 343
},
{
"epoch": 3.185185185185185,
"grad_norm": 0.038238752633333206,
"learning_rate": 3.5041792128617927e-05,
"loss": 0.0094,
"step": 344
},
{
"epoch": 3.1944444444444446,
"grad_norm": 0.029051663354039192,
"learning_rate": 3.473370011524435e-05,
"loss": 0.0099,
"step": 345
},
{
"epoch": 3.1944444444444446,
"eval_loss": 0.015372861176729202,
"eval_runtime": 9.1378,
"eval_samples_per_second": 5.472,
"eval_steps_per_second": 1.423,
"step": 345
},
{
"epoch": 3.2037037037037037,
"grad_norm": 0.022384386509656906,
"learning_rate": 3.442624601197877e-05,
"loss": 0.0096,
"step": 346
},
{
"epoch": 3.212962962962963,
"grad_norm": 0.024341940879821777,
"learning_rate": 3.4119442665947344e-05,
"loss": 0.0094,
"step": 347
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.02119499258697033,
"learning_rate": 3.381330289708396e-05,
"loss": 0.011,
"step": 348
},
{
"epoch": 3.2314814814814814,
"grad_norm": 0.025269504636526108,
"learning_rate": 3.350783949759462e-05,
"loss": 0.0105,
"step": 349
},
{
"epoch": 3.240740740740741,
"grad_norm": 0.02428189478814602,
"learning_rate": 3.3203065231422904e-05,
"loss": 0.0115,
"step": 350
},
{
"epoch": 3.240740740740741,
"eval_loss": 0.015474287793040276,
"eval_runtime": 9.1142,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 350
},
{
"epoch": 3.25,
"grad_norm": 0.027830710634589195,
"learning_rate": 3.289899283371657e-05,
"loss": 0.014,
"step": 351
},
{
"epoch": 3.259259259259259,
"grad_norm": 0.026644067838788033,
"learning_rate": 3.2595635010295475e-05,
"loss": 0.0132,
"step": 352
},
{
"epoch": 3.2685185185185186,
"grad_norm": 0.028307707980275154,
"learning_rate": 3.2293004437120624e-05,
"loss": 0.0093,
"step": 353
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.03480321913957596,
"learning_rate": 3.199111375976449e-05,
"loss": 0.0107,
"step": 354
},
{
"epoch": 3.287037037037037,
"grad_norm": 0.029546814039349556,
"learning_rate": 3.1689975592882603e-05,
"loss": 0.0099,
"step": 355
},
{
"epoch": 3.287037037037037,
"eval_loss": 0.015444349497556686,
"eval_runtime": 9.1458,
"eval_samples_per_second": 5.467,
"eval_steps_per_second": 1.421,
"step": 355
},
{
"epoch": 3.2962962962962963,
"grad_norm": 0.02437739446759224,
"learning_rate": 3.1389602519686515e-05,
"loss": 0.0118,
"step": 356
},
{
"epoch": 3.3055555555555554,
"grad_norm": 0.029530519619584084,
"learning_rate": 3.109000709141788e-05,
"loss": 0.0121,
"step": 357
},
{
"epoch": 3.314814814814815,
"grad_norm": 0.029449855908751488,
"learning_rate": 3.079120182682412e-05,
"loss": 0.0099,
"step": 358
},
{
"epoch": 3.324074074074074,
"grad_norm": 0.020589128136634827,
"learning_rate": 3.049319921163526e-05,
"loss": 0.0119,
"step": 359
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.02450876496732235,
"learning_rate": 3.019601169804216e-05,
"loss": 0.0129,
"step": 360
},
{
"epoch": 3.3333333333333335,
"eval_loss": 0.0157760102301836,
"eval_runtime": 9.1103,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 360
},
{
"epoch": 3.3425925925925926,
"grad_norm": 0.0208604596555233,
"learning_rate": 2.9899651704176325e-05,
"loss": 0.011,
"step": 361
},
{
"epoch": 3.351851851851852,
"grad_norm": 0.025153055787086487,
"learning_rate": 2.9604131613590824e-05,
"loss": 0.0109,
"step": 362
},
{
"epoch": 3.361111111111111,
"grad_norm": 0.021455859765410423,
"learning_rate": 2.9309463774743046e-05,
"loss": 0.0122,
"step": 363
},
{
"epoch": 3.3703703703703702,
"grad_norm": 0.01964252069592476,
"learning_rate": 2.901566050047855e-05,
"loss": 0.0113,
"step": 364
},
{
"epoch": 3.3796296296296298,
"grad_norm": 0.020809266716241837,
"learning_rate": 2.872273406751664e-05,
"loss": 0.0105,
"step": 365
},
{
"epoch": 3.3796296296296298,
"eval_loss": 0.015391937457025051,
"eval_runtime": 9.111,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 365
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.025048566982150078,
"learning_rate": 2.8430696715937337e-05,
"loss": 0.0107,
"step": 366
},
{
"epoch": 3.398148148148148,
"grad_norm": 0.024674881249666214,
"learning_rate": 2.8139560648669962e-05,
"loss": 0.0113,
"step": 367
},
{
"epoch": 3.4074074074074074,
"grad_norm": 0.025468124076724052,
"learning_rate": 2.7849338030983257e-05,
"loss": 0.012,
"step": 368
},
{
"epoch": 3.4166666666666665,
"grad_norm": 0.022864418104290962,
"learning_rate": 2.7560040989976892e-05,
"loss": 0.01,
"step": 369
},
{
"epoch": 3.425925925925926,
"grad_norm": 0.02258789725601673,
"learning_rate": 2.7271681614074973e-05,
"loss": 0.0121,
"step": 370
},
{
"epoch": 3.425925925925926,
"eval_loss": 0.015503110364079475,
"eval_runtime": 9.1077,
"eval_samples_per_second": 5.49,
"eval_steps_per_second": 1.427,
"step": 370
},
{
"epoch": 3.435185185185185,
"grad_norm": 0.025097696110606194,
"learning_rate": 2.6984271952520722e-05,
"loss": 0.0104,
"step": 371
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.028177309781312943,
"learning_rate": 2.6697824014873075e-05,
"loss": 0.0132,
"step": 372
},
{
"epoch": 3.4537037037037037,
"grad_norm": 0.026587417349219322,
"learning_rate": 2.641234977050484e-05,
"loss": 0.0085,
"step": 373
},
{
"epoch": 3.462962962962963,
"grad_norm": 0.0189076978713274,
"learning_rate": 2.612786114810255e-05,
"loss": 0.0096,
"step": 374
},
{
"epoch": 3.4722222222222223,
"grad_norm": 0.029332995414733887,
"learning_rate": 2.5844370035168073e-05,
"loss": 0.0096,
"step": 375
},
{
"epoch": 3.4722222222222223,
"eval_loss": 0.015461472794413567,
"eval_runtime": 9.1144,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 375
},
{
"epoch": 3.4814814814814814,
"grad_norm": 0.02185731939971447,
"learning_rate": 2.5561888277521794e-05,
"loss": 0.0098,
"step": 376
},
{
"epoch": 3.490740740740741,
"grad_norm": 0.026887575164437294,
"learning_rate": 2.528042767880766e-05,
"loss": 0.0114,
"step": 377
},
{
"epoch": 3.5,
"grad_norm": 0.023131586611270905,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0112,
"step": 378
},
{
"epoch": 3.5092592592592595,
"grad_norm": 0.028937749564647675,
"learning_rate": 2.4720616958912053e-05,
"loss": 0.0121,
"step": 379
},
{
"epoch": 3.5185185185185186,
"grad_norm": 0.032668791711330414,
"learning_rate": 2.4442290229706344e-05,
"loss": 0.0112,
"step": 380
},
{
"epoch": 3.5185185185185186,
"eval_loss": 0.015212837606668472,
"eval_runtime": 9.1177,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 380
},
{
"epoch": 3.5277777777777777,
"grad_norm": 0.02449023723602295,
"learning_rate": 2.4165031442406855e-05,
"loss": 0.0117,
"step": 381
},
{
"epoch": 3.537037037037037,
"grad_norm": 0.025157004594802856,
"learning_rate": 2.3888852182413085e-05,
"loss": 0.0091,
"step": 382
},
{
"epoch": 3.5462962962962963,
"grad_norm": 0.03108743578195572,
"learning_rate": 2.361376399001592e-05,
"loss": 0.0108,
"step": 383
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.021932488307356834,
"learning_rate": 2.333977835991545e-05,
"loss": 0.0093,
"step": 384
},
{
"epoch": 3.564814814814815,
"grad_norm": 0.026496881619095802,
"learning_rate": 2.3066906740740623e-05,
"loss": 0.0118,
"step": 385
},
{
"epoch": 3.564814814814815,
"eval_loss": 0.01467986311763525,
"eval_runtime": 9.1127,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.427,
"step": 385
},
{
"epoch": 3.574074074074074,
"grad_norm": 0.024211710318922997,
"learning_rate": 2.2795160534570864e-05,
"loss": 0.0086,
"step": 386
},
{
"epoch": 3.5833333333333335,
"grad_norm": 0.023977207019925117,
"learning_rate": 2.25245510964597e-05,
"loss": 0.0128,
"step": 387
},
{
"epoch": 3.5925925925925926,
"grad_norm": 0.02136526070535183,
"learning_rate": 2.225508973396016e-05,
"loss": 0.0121,
"step": 388
},
{
"epoch": 3.601851851851852,
"grad_norm": 0.026328187435865402,
"learning_rate": 2.198678770665238e-05,
"loss": 0.0108,
"step": 389
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.02159940078854561,
"learning_rate": 2.171965622567308e-05,
"loss": 0.0082,
"step": 390
},
{
"epoch": 3.611111111111111,
"eval_loss": 0.014544774778187275,
"eval_runtime": 9.1133,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 1.426,
"step": 390
},
{
"epoch": 3.6203703703703702,
"grad_norm": 0.02303987927734852,
"learning_rate": 2.1453706453247087e-05,
"loss": 0.0092,
"step": 391
},
{
"epoch": 3.6296296296296298,
"grad_norm": 0.027734337374567986,
"learning_rate": 2.1188949502220983e-05,
"loss": 0.0101,
"step": 392
},
{
"epoch": 3.638888888888889,
"grad_norm": 0.02069096453487873,
"learning_rate": 2.0925396435598664e-05,
"loss": 0.0111,
"step": 393
},
{
"epoch": 3.648148148148148,
"grad_norm": 0.02777431532740593,
"learning_rate": 2.066305826607911e-05,
"loss": 0.0091,
"step": 394
},
{
"epoch": 3.6574074074074074,
"grad_norm": 0.02333620935678482,
"learning_rate": 2.0401945955596206e-05,
"loss": 0.0112,
"step": 395
},
{
"epoch": 3.6574074074074074,
"eval_loss": 0.01460795197635889,
"eval_runtime": 9.1059,
"eval_samples_per_second": 5.491,
"eval_steps_per_second": 1.428,
"step": 395
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.022142188623547554,
"learning_rate": 2.0142070414860704e-05,
"loss": 0.01,
"step": 396
},
{
"epoch": 3.675925925925926,
"grad_norm": 0.01749616675078869,
"learning_rate": 1.9883442502904283e-05,
"loss": 0.0095,
"step": 397
},
{
"epoch": 3.685185185185185,
"grad_norm": 0.02393367514014244,
"learning_rate": 1.9626073026625818e-05,
"loss": 0.0095,
"step": 398
},
{
"epoch": 3.6944444444444446,
"grad_norm": 0.023465050384402275,
"learning_rate": 1.936997274033986e-05,
"loss": 0.0108,
"step": 399
},
{
"epoch": 3.7037037037037037,
"grad_norm": 0.023157304152846336,
"learning_rate": 1.9115152345327152e-05,
"loss": 0.0086,
"step": 400
},
{
"epoch": 3.7037037037037037,
"eval_loss": 0.014902754686772823,
"eval_runtime": 9.1616,
"eval_samples_per_second": 5.458,
"eval_steps_per_second": 1.419,
"step": 400
},
{
"epoch": 3.712962962962963,
"grad_norm": 0.021799901500344276,
"learning_rate": 1.8861622489387555e-05,
"loss": 0.0128,
"step": 401
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.03070679120719433,
"learning_rate": 1.8609393766395085e-05,
"loss": 0.0123,
"step": 402
},
{
"epoch": 3.7314814814814814,
"grad_norm": 0.02543518878519535,
"learning_rate": 1.835847671585526e-05,
"loss": 0.0114,
"step": 403
},
{
"epoch": 3.7407407407407405,
"grad_norm": 0.027585655450820923,
"learning_rate": 1.8108881822464696e-05,
"loss": 0.0099,
"step": 404
},
{
"epoch": 3.75,
"grad_norm": 0.02352389506995678,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.0102,
"step": 405
},
{
"epoch": 3.75,
"eval_loss": 0.014981208369135857,
"eval_runtime": 9.1106,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 405
},
{
"epoch": 3.7592592592592595,
"grad_norm": 0.02560283988714218,
"learning_rate": 1.7613700169247056e-05,
"loss": 0.012,
"step": 406
},
{
"epoch": 3.7685185185185186,
"grad_norm": 0.026089752092957497,
"learning_rate": 1.7368134100837287e-05,
"loss": 0.0088,
"step": 407
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.030365899205207825,
"learning_rate": 1.7123931571546827e-05,
"loss": 0.0119,
"step": 408
},
{
"epoch": 3.787037037037037,
"grad_norm": 0.031558796763420105,
"learning_rate": 1.6881102785502616e-05,
"loss": 0.011,
"step": 409
},
{
"epoch": 3.7962962962962963,
"grad_norm": 0.030366325750947,
"learning_rate": 1.6639657889429018e-05,
"loss": 0.0116,
"step": 410
},
{
"epoch": 3.7962962962962963,
"eval_loss": 0.014859426766633987,
"eval_runtime": 9.1059,
"eval_samples_per_second": 5.491,
"eval_steps_per_second": 1.428,
"step": 410
},
{
"epoch": 3.8055555555555554,
"grad_norm": 0.025008074939250946,
"learning_rate": 1.639960697222388e-05,
"loss": 0.0106,
"step": 411
},
{
"epoch": 3.814814814814815,
"grad_norm": 0.028196556493639946,
"learning_rate": 1.6160960064536908e-05,
"loss": 0.0113,
"step": 412
},
{
"epoch": 3.824074074074074,
"grad_norm": 0.02165764756500721,
"learning_rate": 1.592372713835055e-05,
"loss": 0.0115,
"step": 413
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.020175475627183914,
"learning_rate": 1.5687918106563326e-05,
"loss": 0.0112,
"step": 414
},
{
"epoch": 3.8425925925925926,
"grad_norm": 0.027304671704769135,
"learning_rate": 1.545354282257562e-05,
"loss": 0.0126,
"step": 415
},
{
"epoch": 3.8425925925925926,
"eval_loss": 0.014735485427081585,
"eval_runtime": 9.198,
"eval_samples_per_second": 5.436,
"eval_steps_per_second": 1.413,
"step": 415
},
{
"epoch": 3.851851851851852,
"grad_norm": 0.026429716497659683,
"learning_rate": 1.52206110798779e-05,
"loss": 0.0103,
"step": 416
},
{
"epoch": 3.861111111111111,
"grad_norm": 0.02409077063202858,
"learning_rate": 1.4989132611641576e-05,
"loss": 0.012,
"step": 417
},
{
"epoch": 3.8703703703703702,
"grad_norm": 0.02310461364686489,
"learning_rate": 1.4759117090312197e-05,
"loss": 0.0096,
"step": 418
},
{
"epoch": 3.8796296296296298,
"grad_norm": 0.026219584047794342,
"learning_rate": 1.453057412720536e-05,
"loss": 0.0094,
"step": 419
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.027541201561689377,
"learning_rate": 1.4303513272105057e-05,
"loss": 0.0112,
"step": 420
},
{
"epoch": 3.888888888888889,
"eval_loss": 0.014594363048672676,
"eval_runtime": 9.1304,
"eval_samples_per_second": 5.476,
"eval_steps_per_second": 1.424,
"step": 420
},
{
"epoch": 3.898148148148148,
"grad_norm": 0.024942217394709587,
"learning_rate": 1.4077944012864636e-05,
"loss": 0.0093,
"step": 421
},
{
"epoch": 3.9074074074074074,
"grad_norm": 0.018137283623218536,
"learning_rate": 1.3853875775010355e-05,
"loss": 0.0102,
"step": 422
},
{
"epoch": 3.9166666666666665,
"grad_norm": 0.021817779168486595,
"learning_rate": 1.3631317921347563e-05,
"loss": 0.0084,
"step": 423
},
{
"epoch": 3.925925925925926,
"grad_norm": 0.023799235001206398,
"learning_rate": 1.3410279751569399e-05,
"loss": 0.0122,
"step": 424
},
{
"epoch": 3.935185185185185,
"grad_norm": 0.030764896422624588,
"learning_rate": 1.3190770501868243e-05,
"loss": 0.0107,
"step": 425
},
{
"epoch": 3.935185185185185,
"eval_loss": 0.014631365425884724,
"eval_runtime": 9.1149,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 425
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.022886106744408607,
"learning_rate": 1.297279934454978e-05,
"loss": 0.0096,
"step": 426
},
{
"epoch": 3.9537037037037037,
"grad_norm": 0.03152737021446228,
"learning_rate": 1.2756375387649716e-05,
"loss": 0.0124,
"step": 427
},
{
"epoch": 3.962962962962963,
"grad_norm": 0.02872036211192608,
"learning_rate": 1.25415076745532e-05,
"loss": 0.0091,
"step": 428
},
{
"epoch": 3.9722222222222223,
"grad_norm": 0.021184636279940605,
"learning_rate": 1.2328205183616965e-05,
"loss": 0.0105,
"step": 429
},
{
"epoch": 3.9814814814814814,
"grad_norm": 0.02112959884107113,
"learning_rate": 1.2116476827794104e-05,
"loss": 0.0113,
"step": 430
},
{
"epoch": 3.9814814814814814,
"eval_loss": 0.01471536885946989,
"eval_runtime": 9.116,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 430
},
{
"epoch": 3.9907407407407405,
"grad_norm": 0.019945990294218063,
"learning_rate": 1.1906331454261704e-05,
"loss": 0.0093,
"step": 431
},
{
"epoch": 4.0,
"grad_norm": 0.023910805583000183,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.011,
"step": 432
},
{
"epoch": 4.0092592592592595,
"grad_norm": 0.01957758143544197,
"learning_rate": 1.1490824711681025e-05,
"loss": 0.0094,
"step": 433
},
{
"epoch": 4.018518518518518,
"grad_norm": 0.02563118375837803,
"learning_rate": 1.1285480704793377e-05,
"loss": 0.0093,
"step": 434
},
{
"epoch": 4.027777777777778,
"grad_norm": 0.026251764968037605,
"learning_rate": 1.1081754403791999e-05,
"loss": 0.0091,
"step": 435
},
{
"epoch": 4.027777777777778,
"eval_loss": 0.014734329655766487,
"eval_runtime": 9.1592,
"eval_samples_per_second": 5.459,
"eval_steps_per_second": 1.419,
"step": 435
},
{
"epoch": 4.037037037037037,
"grad_norm": 0.025834446772933006,
"learning_rate": 1.0879654321484012e-05,
"loss": 0.0067,
"step": 436
},
{
"epoch": 4.046296296296297,
"grad_norm": 0.0185233224183321,
"learning_rate": 1.0679188902724191e-05,
"loss": 0.0108,
"step": 437
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.021918736398220062,
"learning_rate": 1.0480366524062042e-05,
"loss": 0.0088,
"step": 438
},
{
"epoch": 4.064814814814815,
"grad_norm": 0.03142661973834038,
"learning_rate": 1.0283195493391823e-05,
"loss": 0.0103,
"step": 439
},
{
"epoch": 4.074074074074074,
"grad_norm": 0.023410873487591743,
"learning_rate": 1.008768404960535e-05,
"loss": 0.0094,
"step": 440
},
{
"epoch": 4.074074074074074,
"eval_loss": 0.014965096488595009,
"eval_runtime": 9.1135,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 440
},
{
"epoch": 4.083333333333333,
"grad_norm": 0.02943902276456356,
"learning_rate": 9.893840362247809e-06,
"loss": 0.0056,
"step": 441
},
{
"epoch": 4.092592592592593,
"grad_norm": 0.021431270986795425,
"learning_rate": 9.701672531176286e-06,
"loss": 0.0089,
"step": 442
},
{
"epoch": 4.101851851851852,
"grad_norm": 0.02797669917345047,
"learning_rate": 9.511188586221376e-06,
"loss": 0.0092,
"step": 443
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.02437691204249859,
"learning_rate": 9.322396486851626e-06,
"loss": 0.0104,
"step": 444
},
{
"epoch": 4.12037037037037,
"grad_norm": 0.024811841547489166,
"learning_rate": 9.135304121840976e-06,
"loss": 0.0096,
"step": 445
},
{
"epoch": 4.12037037037037,
"eval_loss": 0.014996801503002644,
"eval_runtime": 9.1094,
"eval_samples_per_second": 5.489,
"eval_steps_per_second": 1.427,
"step": 445
},
{
"epoch": 4.12962962962963,
"grad_norm": 0.0309213325381279,
"learning_rate": 8.949919308939082e-06,
"loss": 0.0109,
"step": 446
},
{
"epoch": 4.138888888888889,
"grad_norm": 0.023763932287693024,
"learning_rate": 8.766249794544662e-06,
"loss": 0.0073,
"step": 447
},
{
"epoch": 4.148148148148148,
"grad_norm": 0.023741643875837326,
"learning_rate": 8.584303253381847e-06,
"loss": 0.0105,
"step": 448
},
{
"epoch": 4.157407407407407,
"grad_norm": 0.02090543322265148,
"learning_rate": 8.404087288179424e-06,
"loss": 0.0096,
"step": 449
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.026315612718462944,
"learning_rate": 8.225609429353187e-06,
"loss": 0.0091,
"step": 450
},
{
"epoch": 4.166666666666667,
"eval_loss": 0.015186839736998081,
"eval_runtime": 9.1241,
"eval_samples_per_second": 5.48,
"eval_steps_per_second": 1.425,
"step": 450
},
{
"epoch": 4.175925925925926,
"grad_norm": 0.023099206387996674,
"learning_rate": 8.048877134691268e-06,
"loss": 0.0091,
"step": 451
},
{
"epoch": 4.185185185185185,
"grad_norm": 0.027901167050004005,
"learning_rate": 7.873897789042523e-06,
"loss": 0.0092,
"step": 452
},
{
"epoch": 4.194444444444445,
"grad_norm": 0.025486482307314873,
"learning_rate": 7.700678704007947e-06,
"loss": 0.0077,
"step": 453
},
{
"epoch": 4.203703703703703,
"grad_norm": 0.0233286302536726,
"learning_rate": 7.529227117635135e-06,
"loss": 0.0077,
"step": 454
},
{
"epoch": 4.212962962962963,
"grad_norm": 0.023314587771892548,
"learning_rate": 7.35955019411585e-06,
"loss": 0.0089,
"step": 455
},
{
"epoch": 4.212962962962963,
"eval_loss": 0.015497377142310143,
"eval_runtime": 9.1064,
"eval_samples_per_second": 5.491,
"eval_steps_per_second": 1.428,
"step": 455
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.021640775725245476,
"learning_rate": 7.191655023486682e-06,
"loss": 0.01,
"step": 456
},
{
"epoch": 4.231481481481482,
"grad_norm": 0.027831410989165306,
"learning_rate": 7.02554862133275e-06,
"loss": 0.0105,
"step": 457
},
{
"epoch": 4.2407407407407405,
"grad_norm": 0.023242153227329254,
"learning_rate": 6.861237928494579e-06,
"loss": 0.009,
"step": 458
},
{
"epoch": 4.25,
"grad_norm": 0.02775505743920803,
"learning_rate": 6.698729810778065e-06,
"loss": 0.0102,
"step": 459
},
{
"epoch": 4.2592592592592595,
"grad_norm": 0.0267843846231699,
"learning_rate": 6.53803105866761e-06,
"loss": 0.0063,
"step": 460
},
{
"epoch": 4.2592592592592595,
"eval_loss": 0.01563325710594654,
"eval_runtime": 9.111,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 460
},
{
"epoch": 4.268518518518518,
"grad_norm": 0.02488654851913452,
"learning_rate": 6.379148387042316e-06,
"loss": 0.01,
"step": 461
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.024208445101976395,
"learning_rate": 6.222088434895462e-06,
"loss": 0.0072,
"step": 462
},
{
"epoch": 4.287037037037037,
"grad_norm": 0.023147890344262123,
"learning_rate": 6.066857765057055e-06,
"loss": 0.0088,
"step": 463
},
{
"epoch": 4.296296296296296,
"grad_norm": 0.029451172798871994,
"learning_rate": 5.9134628639196e-06,
"loss": 0.0085,
"step": 464
},
{
"epoch": 4.305555555555555,
"grad_norm": 0.02764413133263588,
"learning_rate": 5.7619101411671095e-06,
"loss": 0.0099,
"step": 465
},
{
"epoch": 4.305555555555555,
"eval_loss": 0.015693385154008865,
"eval_runtime": 9.1176,
"eval_samples_per_second": 5.484,
"eval_steps_per_second": 1.426,
"step": 465
},
{
"epoch": 4.314814814814815,
"grad_norm": 0.021906448528170586,
"learning_rate": 5.6122059295072085e-06,
"loss": 0.0096,
"step": 466
},
{
"epoch": 4.324074074074074,
"grad_norm": 0.02385389618575573,
"learning_rate": 5.464356484406535e-06,
"loss": 0.0072,
"step": 467
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.026357507333159447,
"learning_rate": 5.318367983829392e-06,
"loss": 0.0079,
"step": 468
},
{
"epoch": 4.342592592592593,
"grad_norm": 0.026002187281847,
"learning_rate": 5.174246527979531e-06,
"loss": 0.0095,
"step": 469
},
{
"epoch": 4.351851851851852,
"grad_norm": 0.02679777517914772,
"learning_rate": 5.031998139045352e-06,
"loss": 0.0085,
"step": 470
},
{
"epoch": 4.351851851851852,
"eval_loss": 0.015615792945027351,
"eval_runtime": 9.1365,
"eval_samples_per_second": 5.473,
"eval_steps_per_second": 1.423,
"step": 470
},
{
"epoch": 4.361111111111111,
"grad_norm": 0.023431269451975822,
"learning_rate": 4.891628760948114e-06,
"loss": 0.009,
"step": 471
},
{
"epoch": 4.37037037037037,
"grad_norm": 0.02848837524652481,
"learning_rate": 4.7531442590937335e-06,
"loss": 0.0102,
"step": 472
},
{
"epoch": 4.37962962962963,
"grad_norm": 0.026586227118968964,
"learning_rate": 4.616550420127563e-06,
"loss": 0.0078,
"step": 473
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.025660747662186623,
"learning_rate": 4.4818529516926726e-06,
"loss": 0.0086,
"step": 474
},
{
"epoch": 4.398148148148148,
"grad_norm": 0.02436869405210018,
"learning_rate": 4.349057482191299e-06,
"loss": 0.011,
"step": 475
},
{
"epoch": 4.398148148148148,
"eval_loss": 0.015554042533040047,
"eval_runtime": 9.1142,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 475
},
{
"epoch": 4.407407407407407,
"grad_norm": 0.02513139322400093,
"learning_rate": 4.218169560549706e-06,
"loss": 0.0108,
"step": 476
},
{
"epoch": 4.416666666666667,
"grad_norm": 0.027343349531292915,
"learning_rate": 4.089194655986306e-06,
"loss": 0.0099,
"step": 477
},
{
"epoch": 4.425925925925926,
"grad_norm": 0.02374204248189926,
"learning_rate": 3.962138157783085e-06,
"loss": 0.0095,
"step": 478
},
{
"epoch": 4.435185185185185,
"grad_norm": 0.04114212468266487,
"learning_rate": 3.837005375060482e-06,
"loss": 0.0089,
"step": 479
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.024016965180635452,
"learning_rate": 3.7138015365554833e-06,
"loss": 0.0067,
"step": 480
},
{
"epoch": 4.444444444444445,
"eval_loss": 0.01539613213390112,
"eval_runtime": 9.1246,
"eval_samples_per_second": 5.48,
"eval_steps_per_second": 1.425,
"step": 480
},
{
"epoch": 4.453703703703704,
"grad_norm": 0.02901994250714779,
"learning_rate": 3.5925317904031587e-06,
"loss": 0.0087,
"step": 481
},
{
"epoch": 4.462962962962963,
"grad_norm": 0.020981522276997566,
"learning_rate": 3.4732012039215776e-06,
"loss": 0.011,
"step": 482
},
{
"epoch": 4.472222222222222,
"grad_norm": 0.023783011361956596,
"learning_rate": 3.3558147633999728e-06,
"loss": 0.0096,
"step": 483
},
{
"epoch": 4.481481481481482,
"grad_norm": 0.02081628330051899,
"learning_rate": 3.2403773738905187e-06,
"loss": 0.0087,
"step": 484
},
{
"epoch": 4.4907407407407405,
"grad_norm": 0.024986054748296738,
"learning_rate": 3.126893859003249e-06,
"loss": 0.0092,
"step": 485
},
{
"epoch": 4.4907407407407405,
"eval_loss": 0.015287145972251892,
"eval_runtime": 9.1097,
"eval_samples_per_second": 5.489,
"eval_steps_per_second": 1.427,
"step": 485
},
{
"epoch": 4.5,
"grad_norm": 0.032323963940143585,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.0086,
"step": 486
},
{
"epoch": 4.5092592592592595,
"grad_norm": 0.02963520959019661,
"learning_rate": 2.9058073391191375e-06,
"loss": 0.0068,
"step": 487
},
{
"epoch": 4.518518518518518,
"grad_norm": 0.035344675183296204,
"learning_rate": 2.798213572335001e-06,
"loss": 0.0062,
"step": 488
},
{
"epoch": 4.527777777777778,
"grad_norm": 0.026800939813256264,
"learning_rate": 2.692592156212487e-06,
"loss": 0.0092,
"step": 489
},
{
"epoch": 4.537037037037037,
"grad_norm": 0.024116506800055504,
"learning_rate": 2.5889475041961765e-06,
"loss": 0.0072,
"step": 490
},
{
"epoch": 4.537037037037037,
"eval_loss": 0.015211592428386211,
"eval_runtime": 9.1184,
"eval_samples_per_second": 5.483,
"eval_steps_per_second": 1.426,
"step": 490
},
{
"epoch": 4.546296296296296,
"grad_norm": 0.027498748153448105,
"learning_rate": 2.4872839471306084e-06,
"loss": 0.0082,
"step": 491
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.026998436078429222,
"learning_rate": 2.3876057330792346e-06,
"loss": 0.008,
"step": 492
},
{
"epoch": 4.564814814814815,
"grad_norm": 0.023703446611762047,
"learning_rate": 2.2899170271469428e-06,
"loss": 0.011,
"step": 493
},
{
"epoch": 4.574074074074074,
"grad_norm": 0.019968930631875992,
"learning_rate": 2.1942219113060212e-06,
"loss": 0.0075,
"step": 494
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.02214980125427246,
"learning_rate": 2.100524384225555e-06,
"loss": 0.0078,
"step": 495
},
{
"epoch": 4.583333333333333,
"eval_loss": 0.015181516297161579,
"eval_runtime": 9.1214,
"eval_samples_per_second": 5.482,
"eval_steps_per_second": 1.425,
"step": 495
},
{
"epoch": 4.592592592592593,
"grad_norm": 0.025330157950520515,
"learning_rate": 2.0088283611044036e-06,
"loss": 0.0062,
"step": 496
},
{
"epoch": 4.601851851851852,
"grad_norm": 0.019013626500964165,
"learning_rate": 1.9191376735075427e-06,
"loss": 0.0088,
"step": 497
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.022145694121718407,
"learning_rate": 1.8314560692059835e-06,
"loss": 0.0089,
"step": 498
},
{
"epoch": 4.62037037037037,
"grad_norm": 0.023724934086203575,
"learning_rate": 1.7457872120201779e-06,
"loss": 0.0086,
"step": 499
},
{
"epoch": 4.62962962962963,
"grad_norm": 0.020578699186444283,
"learning_rate": 1.6621346816668992e-06,
"loss": 0.0091,
"step": 500
},
{
"epoch": 4.62962962962963,
"eval_loss": 0.015207822434604168,
"eval_runtime": 9.1136,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 500
},
{
"epoch": 4.638888888888889,
"grad_norm": 0.024306217208504677,
"learning_rate": 1.5805019736097104e-06,
"loss": 0.009,
"step": 501
},
{
"epoch": 4.648148148148148,
"grad_norm": 0.020744021981954575,
"learning_rate": 1.5008924989128258e-06,
"loss": 0.0089,
"step": 502
},
{
"epoch": 4.657407407407407,
"grad_norm": 0.02516799047589302,
"learning_rate": 1.4233095840986753e-06,
"loss": 0.0093,
"step": 503
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.024567998945713043,
"learning_rate": 1.3477564710088098e-06,
"loss": 0.0094,
"step": 504
},
{
"epoch": 4.675925925925926,
"grad_norm": 0.024358859285712242,
"learning_rate": 1.2742363166685034e-06,
"loss": 0.007,
"step": 505
},
{
"epoch": 4.675925925925926,
"eval_loss": 0.015200878493487835,
"eval_runtime": 9.1155,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 505
},
{
"epoch": 4.685185185185185,
"grad_norm": 0.023163504898548126,
"learning_rate": 1.2027521931548214e-06,
"loss": 0.0074,
"step": 506
},
{
"epoch": 4.694444444444445,
"grad_norm": 0.023604586720466614,
"learning_rate": 1.1333070874682216e-06,
"loss": 0.0093,
"step": 507
},
{
"epoch": 4.703703703703704,
"grad_norm": 0.02068418823182583,
"learning_rate": 1.0659039014077944e-06,
"loss": 0.0084,
"step": 508
},
{
"epoch": 4.712962962962963,
"grad_norm": 0.02598651312291622,
"learning_rate": 1.0005454514499414e-06,
"loss": 0.0088,
"step": 509
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.02512424811720848,
"learning_rate": 9.372344686307655e-07,
"loss": 0.0064,
"step": 510
},
{
"epoch": 4.722222222222222,
"eval_loss": 0.01521637849509716,
"eval_runtime": 9.1143,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 510
},
{
"epoch": 4.731481481481482,
"grad_norm": 0.021041063591837883,
"learning_rate": 8.759735984318895e-07,
"loss": 0.0096,
"step": 511
},
{
"epoch": 4.7407407407407405,
"grad_norm": 0.025718161836266518,
"learning_rate": 8.167654006699443e-07,
"loss": 0.0077,
"step": 512
},
{
"epoch": 4.75,
"grad_norm": 0.02913082391023636,
"learning_rate": 7.596123493895991e-07,
"loss": 0.0072,
"step": 513
},
{
"epoch": 4.7592592592592595,
"grad_norm": 0.026588505133986473,
"learning_rate": 7.04516832760177e-07,
"loss": 0.0094,
"step": 514
},
{
"epoch": 4.768518518518518,
"grad_norm": 0.023728126659989357,
"learning_rate": 6.514811529758747e-07,
"loss": 0.0099,
"step": 515
},
{
"epoch": 4.768518518518518,
"eval_loss": 0.01521516963839531,
"eval_runtime": 9.1511,
"eval_samples_per_second": 5.464,
"eval_steps_per_second": 1.421,
"step": 515
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.03438512608408928,
"learning_rate": 6.005075261595494e-07,
"loss": 0.0086,
"step": 516
},
{
"epoch": 4.787037037037037,
"grad_norm": 0.019554298371076584,
"learning_rate": 5.515980822701439e-07,
"loss": 0.0092,
"step": 517
},
{
"epoch": 4.796296296296296,
"grad_norm": 0.0235204566270113,
"learning_rate": 5.047548650136513e-07,
"loss": 0.009,
"step": 518
},
{
"epoch": 4.805555555555555,
"grad_norm": 0.023747643455863,
"learning_rate": 4.5997983175773417e-07,
"loss": 0.0092,
"step": 519
},
{
"epoch": 4.814814814814815,
"grad_norm": 0.02751827985048294,
"learning_rate": 4.1727485344994486e-07,
"loss": 0.0088,
"step": 520
},
{
"epoch": 4.814814814814815,
"eval_loss": 0.015235532075166702,
"eval_runtime": 9.1256,
"eval_samples_per_second": 5.479,
"eval_steps_per_second": 1.425,
"step": 520
},
{
"epoch": 4.824074074074074,
"grad_norm": 0.026621591299772263,
"learning_rate": 3.766417145395218e-07,
"loss": 0.0086,
"step": 521
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.01991841197013855,
"learning_rate": 3.380821129028489e-07,
"loss": 0.0084,
"step": 522
},
{
"epoch": 4.842592592592593,
"grad_norm": 0.023508219048380852,
"learning_rate": 3.0159765977250673e-07,
"loss": 0.0103,
"step": 523
},
{
"epoch": 4.851851851851852,
"grad_norm": 0.02976732887327671,
"learning_rate": 2.671898796699268e-07,
"loss": 0.0084,
"step": 524
},
{
"epoch": 4.861111111111111,
"grad_norm": 0.02255621738731861,
"learning_rate": 2.3486021034170857e-07,
"loss": 0.0089,
"step": 525
},
{
"epoch": 4.861111111111111,
"eval_loss": 0.015216498635709286,
"eval_runtime": 9.1106,
"eval_samples_per_second": 5.488,
"eval_steps_per_second": 1.427,
"step": 525
},
{
"epoch": 4.87037037037037,
"grad_norm": 0.025215914472937584,
"learning_rate": 2.0461000269953456e-07,
"loss": 0.0075,
"step": 526
},
{
"epoch": 4.87962962962963,
"grad_norm": 0.02554066851735115,
"learning_rate": 1.7644052076371542e-07,
"loss": 0.0083,
"step": 527
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.02162836864590645,
"learning_rate": 1.503529416103988e-07,
"loss": 0.009,
"step": 528
},
{
"epoch": 4.898148148148148,
"grad_norm": 0.02335723116993904,
"learning_rate": 1.2634835532233657e-07,
"loss": 0.0093,
"step": 529
},
{
"epoch": 4.907407407407407,
"grad_norm": 0.02844967506825924,
"learning_rate": 1.044277649433989e-07,
"loss": 0.0083,
"step": 530
},
{
"epoch": 4.907407407407407,
"eval_loss": 0.015229844488203526,
"eval_runtime": 9.1406,
"eval_samples_per_second": 5.47,
"eval_steps_per_second": 1.422,
"step": 530
},
{
"epoch": 4.916666666666667,
"grad_norm": 0.02188325859606266,
"learning_rate": 8.459208643659122e-08,
"loss": 0.0084,
"step": 531
},
{
"epoch": 4.925925925925926,
"grad_norm": 0.026782654225826263,
"learning_rate": 6.684214864584038e-08,
"loss": 0.009,
"step": 532
},
{
"epoch": 4.935185185185185,
"grad_norm": 0.024010982364416122,
"learning_rate": 5.11786932613223e-08,
"loss": 0.0055,
"step": 533
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.02621973119676113,
"learning_rate": 3.760237478849793e-08,
"loss": 0.0093,
"step": 534
},
{
"epoch": 4.953703703703704,
"grad_norm": 0.02257387712597847,
"learning_rate": 2.6113760520735108e-08,
"loss": 0.0103,
"step": 535
},
{
"epoch": 4.953703703703704,
"eval_loss": 0.015256751328706741,
"eval_runtime": 9.1156,
"eval_samples_per_second": 5.485,
"eval_steps_per_second": 1.426,
"step": 535
},
{
"epoch": 4.962962962962963,
"grad_norm": 0.02289225161075592,
"learning_rate": 1.6713330515627513e-08,
"loss": 0.0106,
"step": 536
},
{
"epoch": 4.972222222222222,
"grad_norm": 0.032289694994688034,
"learning_rate": 9.401477574932926e-09,
"loss": 0.0074,
"step": 537
},
{
"epoch": 4.981481481481482,
"grad_norm": 0.0215620007365942,
"learning_rate": 4.178507228136397e-09,
"loss": 0.0082,
"step": 538
},
{
"epoch": 4.9907407407407405,
"grad_norm": 0.02391226962208748,
"learning_rate": 1.0446377197104173e-09,
"loss": 0.0085,
"step": 539
},
{
"epoch": 5.0,
"grad_norm": 0.0241775494068861,
"learning_rate": 0.0,
"loss": 0.0092,
"step": 540
},
{
"epoch": 5.0,
"eval_loss": 0.01526525616645813,
"eval_runtime": 9.1149,
"eval_samples_per_second": 5.486,
"eval_steps_per_second": 1.426,
"step": 540
},
{
"epoch": 5.0,
"step": 540,
"total_flos": 1.2254685925518213e+18,
"train_loss": 0.016027936152251506,
"train_runtime": 9839.9649,
"train_samples_per_second": 1.756,
"train_steps_per_second": 0.055
}
],
"logging_steps": 1,
"max_steps": 540,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2254685925518213e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}