|
{ |
|
"best_metric": 0.014902754686772823, |
|
"best_model_checkpoint": "/home/paperspace/Data/models/akoul_whitehorseliquidity_25c/llm3br256/checkpoint-400", |
|
"epoch": 5.0, |
|
"eval_steps": 5, |
|
"global_step": 540, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009259259259259259, |
|
"grad_norm": 0.29716095328330994, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.1002, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.018518518518518517, |
|
"grad_norm": 0.2648535370826721, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.0936, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.027777777777777776, |
|
"grad_norm": 0.24819649755954742, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.0898, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.037037037037037035, |
|
"grad_norm": 0.23442289233207703, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.087, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.046296296296296294, |
|
"grad_norm": 0.26300737261772156, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.0904, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.046296296296296294, |
|
"eval_loss": 0.0950983464717865, |
|
"eval_runtime": 11.9584, |
|
"eval_samples_per_second": 4.181, |
|
"eval_steps_per_second": 1.087, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 0.18399731814861298, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.0805, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06481481481481481, |
|
"grad_norm": 0.19827856123447418, |
|
"learning_rate": 1.2962962962962962e-05, |
|
"loss": 0.0782, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.13050280511379242, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 0.0636, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 0.12110771238803864, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.056, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09259259259259259, |
|
"grad_norm": 0.1111820638179779, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.053, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09259259259259259, |
|
"eval_loss": 0.04887561500072479, |
|
"eval_runtime": 9.1057, |
|
"eval_samples_per_second": 5.491, |
|
"eval_steps_per_second": 1.428, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10185185185185185, |
|
"grad_norm": 0.0779903382062912, |
|
"learning_rate": 2.037037037037037e-05, |
|
"loss": 0.0538, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 0.08193033933639526, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.0398, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12037037037037036, |
|
"grad_norm": 0.0821649506688118, |
|
"learning_rate": 2.4074074074074074e-05, |
|
"loss": 0.0473, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.12962962962962962, |
|
"grad_norm": 0.07107188552618027, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 0.0386, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"grad_norm": 0.05971238389611244, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.0417, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"eval_loss": 0.04156189784407616, |
|
"eval_runtime": 9.1211, |
|
"eval_samples_per_second": 5.482, |
|
"eval_steps_per_second": 1.425, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.05262186750769615, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.0384, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1574074074074074, |
|
"grad_norm": 0.05361900106072426, |
|
"learning_rate": 3.148148148148148e-05, |
|
"loss": 0.0378, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.05355929210782051, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.0399, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.17592592592592593, |
|
"grad_norm": 0.04563885182142258, |
|
"learning_rate": 3.518518518518519e-05, |
|
"loss": 0.0368, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.18518518518518517, |
|
"grad_norm": 0.060624465346336365, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.0396, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.18518518518518517, |
|
"eval_loss": 0.03584723547101021, |
|
"eval_runtime": 9.1162, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19444444444444445, |
|
"grad_norm": 0.0525534488260746, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.0364, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2037037037037037, |
|
"grad_norm": 0.041657958179712296, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 0.034, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.21296296296296297, |
|
"grad_norm": 0.04589791223406792, |
|
"learning_rate": 4.259259259259259e-05, |
|
"loss": 0.0317, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.04220304638147354, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.0339, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.23148148148148148, |
|
"grad_norm": 0.03630352392792702, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.029, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.23148148148148148, |
|
"eval_loss": 0.03286580368876457, |
|
"eval_runtime": 9.1191, |
|
"eval_samples_per_second": 5.483, |
|
"eval_steps_per_second": 1.426, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.24074074074074073, |
|
"grad_norm": 0.04235522821545601, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 0.0326, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.04675336927175522, |
|
"learning_rate": 5e-05, |
|
"loss": 0.03, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.25925925925925924, |
|
"grad_norm": 0.039461418986320496, |
|
"learning_rate": 5.185185185185185e-05, |
|
"loss": 0.0328, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.26851851851851855, |
|
"grad_norm": 0.044042930006980896, |
|
"learning_rate": 5.370370370370371e-05, |
|
"loss": 0.0294, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 0.044502489268779755, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.0311, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"eval_loss": 0.030865700915455818, |
|
"eval_runtime": 9.1099, |
|
"eval_samples_per_second": 5.489, |
|
"eval_steps_per_second": 1.427, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.28703703703703703, |
|
"grad_norm": 0.04979817569255829, |
|
"learning_rate": 5.740740740740741e-05, |
|
"loss": 0.0292, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.04573828727006912, |
|
"learning_rate": 5.925925925925926e-05, |
|
"loss": 0.0346, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3055555555555556, |
|
"grad_norm": 0.0410350002348423, |
|
"learning_rate": 6.111111111111112e-05, |
|
"loss": 0.0295, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3148148148148148, |
|
"grad_norm": 0.0416686087846756, |
|
"learning_rate": 6.296296296296296e-05, |
|
"loss": 0.0267, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.32407407407407407, |
|
"grad_norm": 0.042319901287555695, |
|
"learning_rate": 6.481481481481482e-05, |
|
"loss": 0.0295, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.32407407407407407, |
|
"eval_loss": 0.028042705729603767, |
|
"eval_runtime": 9.1376, |
|
"eval_samples_per_second": 5.472, |
|
"eval_steps_per_second": 1.423, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.037845220416784286, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.0319, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3425925925925926, |
|
"grad_norm": 0.03568718954920769, |
|
"learning_rate": 6.851851851851852e-05, |
|
"loss": 0.0346, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.35185185185185186, |
|
"grad_norm": 0.037281136959791183, |
|
"learning_rate": 7.037037037037038e-05, |
|
"loss": 0.031, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3611111111111111, |
|
"grad_norm": 0.03607446327805519, |
|
"learning_rate": 7.222222222222222e-05, |
|
"loss": 0.0335, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.03654631972312927, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 0.0262, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"eval_loss": 0.026602942496538162, |
|
"eval_runtime": 9.1124, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.427, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.37962962962962965, |
|
"grad_norm": 0.039490777999162674, |
|
"learning_rate": 7.592592592592593e-05, |
|
"loss": 0.0252, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 0.036680739372968674, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 0.0242, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.39814814814814814, |
|
"grad_norm": 0.040739599615335464, |
|
"learning_rate": 7.962962962962964e-05, |
|
"loss": 0.025, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4074074074074074, |
|
"grad_norm": 0.04679260402917862, |
|
"learning_rate": 8.148148148148148e-05, |
|
"loss": 0.0212, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.04656214639544487, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.0272, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"eval_loss": 0.02608887106180191, |
|
"eval_runtime": 9.1343, |
|
"eval_samples_per_second": 5.474, |
|
"eval_steps_per_second": 1.423, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.42592592592592593, |
|
"grad_norm": 0.04525485262274742, |
|
"learning_rate": 8.518518518518518e-05, |
|
"loss": 0.0274, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4351851851851852, |
|
"grad_norm": 0.03210742771625519, |
|
"learning_rate": 8.703703703703704e-05, |
|
"loss": 0.0283, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.03675089031457901, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.0242, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.4537037037037037, |
|
"grad_norm": 0.03396710753440857, |
|
"learning_rate": 9.074074074074075e-05, |
|
"loss": 0.0239, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.46296296296296297, |
|
"grad_norm": 0.02745971269905567, |
|
"learning_rate": 9.25925925925926e-05, |
|
"loss": 0.0224, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.46296296296296297, |
|
"eval_loss": 0.02490057609975338, |
|
"eval_runtime": 9.1102, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4722222222222222, |
|
"grad_norm": 0.04084627702832222, |
|
"learning_rate": 9.444444444444444e-05, |
|
"loss": 0.0252, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.48148148148148145, |
|
"grad_norm": 0.033021993935108185, |
|
"learning_rate": 9.62962962962963e-05, |
|
"loss": 0.0228, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.49074074074074076, |
|
"grad_norm": 0.034785784780979156, |
|
"learning_rate": 9.814814814814815e-05, |
|
"loss": 0.0259, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.03407888114452362, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0239, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5092592592592593, |
|
"grad_norm": 0.03268973529338837, |
|
"learning_rate": 9.99989553622803e-05, |
|
"loss": 0.0229, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5092592592592593, |
|
"eval_loss": 0.02450372651219368, |
|
"eval_runtime": 9.1421, |
|
"eval_samples_per_second": 5.469, |
|
"eval_steps_per_second": 1.422, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.032378531992435455, |
|
"learning_rate": 9.999582149277187e-05, |
|
"loss": 0.0219, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5277777777777778, |
|
"grad_norm": 0.03997437283396721, |
|
"learning_rate": 9.999059852242507e-05, |
|
"loss": 0.0248, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5370370370370371, |
|
"grad_norm": 0.04024836793541908, |
|
"learning_rate": 9.998328666948438e-05, |
|
"loss": 0.0194, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5462962962962963, |
|
"grad_norm": 0.03850249573588371, |
|
"learning_rate": 9.997388623947928e-05, |
|
"loss": 0.0251, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.03326913341879845, |
|
"learning_rate": 9.996239762521151e-05, |
|
"loss": 0.0233, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"eval_loss": 0.023316912353038788, |
|
"eval_runtime": 9.1353, |
|
"eval_samples_per_second": 5.473, |
|
"eval_steps_per_second": 1.423, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5648148148148148, |
|
"grad_norm": 0.034179024398326874, |
|
"learning_rate": 9.994882130673868e-05, |
|
"loss": 0.0222, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5740740740740741, |
|
"grad_norm": 0.031797800213098526, |
|
"learning_rate": 9.993315785135416e-05, |
|
"loss": 0.0272, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 0.03183833882212639, |
|
"learning_rate": 9.991540791356342e-05, |
|
"loss": 0.0241, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.025173548609018326, |
|
"learning_rate": 9.989557223505661e-05, |
|
"loss": 0.0216, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6018518518518519, |
|
"grad_norm": 0.04935009032487869, |
|
"learning_rate": 9.987365164467767e-05, |
|
"loss": 0.0217, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6018518518518519, |
|
"eval_loss": 0.02255990356206894, |
|
"eval_runtime": 9.1207, |
|
"eval_samples_per_second": 5.482, |
|
"eval_steps_per_second": 1.425, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 0.02904060110449791, |
|
"learning_rate": 9.98496470583896e-05, |
|
"loss": 0.0213, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6203703703703703, |
|
"grad_norm": 0.046014755964279175, |
|
"learning_rate": 9.982355947923629e-05, |
|
"loss": 0.018, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6296296296296297, |
|
"grad_norm": 0.0354795977473259, |
|
"learning_rate": 9.979538999730047e-05, |
|
"loss": 0.0199, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6388888888888888, |
|
"grad_norm": 0.03308796137571335, |
|
"learning_rate": 9.976513978965829e-05, |
|
"loss": 0.0239, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6481481481481481, |
|
"grad_norm": 0.03860899433493614, |
|
"learning_rate": 9.973281012033007e-05, |
|
"loss": 0.0247, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6481481481481481, |
|
"eval_loss": 0.022898558527231216, |
|
"eval_runtime": 9.1074, |
|
"eval_samples_per_second": 5.49, |
|
"eval_steps_per_second": 1.427, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6574074074074074, |
|
"grad_norm": 0.028213078156113625, |
|
"learning_rate": 9.969840234022749e-05, |
|
"loss": 0.0197, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.024581043049693108, |
|
"learning_rate": 9.966191788709716e-05, |
|
"loss": 0.0207, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.6759259259259259, |
|
"grad_norm": 0.026658454909920692, |
|
"learning_rate": 9.962335828546048e-05, |
|
"loss": 0.0214, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6851851851851852, |
|
"grad_norm": 0.034941576421260834, |
|
"learning_rate": 9.958272514655006e-05, |
|
"loss": 0.0205, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 0.03060038387775421, |
|
"learning_rate": 9.954002016824227e-05, |
|
"loss": 0.0193, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"eval_loss": 0.02283317781984806, |
|
"eval_runtime": 9.1512, |
|
"eval_samples_per_second": 5.464, |
|
"eval_steps_per_second": 1.421, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7037037037037037, |
|
"grad_norm": 0.0313015952706337, |
|
"learning_rate": 9.949524513498636e-05, |
|
"loss": 0.0206, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7129629629629629, |
|
"grad_norm": 0.03317766636610031, |
|
"learning_rate": 9.944840191772987e-05, |
|
"loss": 0.0217, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 0.027911782264709473, |
|
"learning_rate": 9.939949247384046e-05, |
|
"loss": 0.0196, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7314814814814815, |
|
"grad_norm": 0.028807291761040688, |
|
"learning_rate": 9.934851884702414e-05, |
|
"loss": 0.0223, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.03152855485677719, |
|
"learning_rate": 9.929548316723982e-05, |
|
"loss": 0.0173, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"eval_loss": 0.021335698664188385, |
|
"eval_runtime": 9.1689, |
|
"eval_samples_per_second": 5.453, |
|
"eval_steps_per_second": 1.418, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.03250882402062416, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.0231, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7592592592592593, |
|
"grad_norm": 0.030853938311338425, |
|
"learning_rate": 9.918323459933005e-05, |
|
"loss": 0.0224, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.7685185185185185, |
|
"grad_norm": 0.03431202098727226, |
|
"learning_rate": 9.912402640156811e-05, |
|
"loss": 0.0223, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 0.027050426229834557, |
|
"learning_rate": 9.906276553136923e-05, |
|
"loss": 0.0198, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.7870370370370371, |
|
"grad_norm": 0.03224191442131996, |
|
"learning_rate": 9.899945454855006e-05, |
|
"loss": 0.0207, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7870370370370371, |
|
"eval_loss": 0.020375357940793037, |
|
"eval_runtime": 9.1362, |
|
"eval_samples_per_second": 5.473, |
|
"eval_steps_per_second": 1.423, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7962962962962963, |
|
"grad_norm": 0.028706278651952744, |
|
"learning_rate": 9.893409609859222e-05, |
|
"loss": 0.0197, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8055555555555556, |
|
"grad_norm": 0.02814578451216221, |
|
"learning_rate": 9.88666929125318e-05, |
|
"loss": 0.0199, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 0.028775395825505257, |
|
"learning_rate": 9.879724780684519e-05, |
|
"loss": 0.0169, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8240740740740741, |
|
"grad_norm": 0.030078047886490822, |
|
"learning_rate": 9.872576368333151e-05, |
|
"loss": 0.0209, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.031860969960689545, |
|
"learning_rate": 9.865224352899119e-05, |
|
"loss": 0.0213, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"eval_loss": 0.019939038902521133, |
|
"eval_runtime": 9.1287, |
|
"eval_samples_per_second": 5.477, |
|
"eval_steps_per_second": 1.424, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8425925925925926, |
|
"grad_norm": 0.03415157273411751, |
|
"learning_rate": 9.857669041590134e-05, |
|
"loss": 0.021, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8518518518518519, |
|
"grad_norm": 0.032674115151166916, |
|
"learning_rate": 9.849910750108717e-05, |
|
"loss": 0.0207, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.8611111111111112, |
|
"grad_norm": 0.02941475249826908, |
|
"learning_rate": 9.84194980263903e-05, |
|
"loss": 0.0196, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.8703703703703703, |
|
"grad_norm": 0.036115583032369614, |
|
"learning_rate": 9.83378653183331e-05, |
|
"loss": 0.0178, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.8796296296296297, |
|
"grad_norm": 0.03358744457364082, |
|
"learning_rate": 9.825421278797983e-05, |
|
"loss": 0.0199, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8796296296296297, |
|
"eval_loss": 0.020193172618746758, |
|
"eval_runtime": 9.1141, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.029014358296990395, |
|
"learning_rate": 9.816854393079403e-05, |
|
"loss": 0.0219, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.8981481481481481, |
|
"grad_norm": 0.042931754142045975, |
|
"learning_rate": 9.808086232649246e-05, |
|
"loss": 0.0185, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9074074074074074, |
|
"grad_norm": 0.029089825227856636, |
|
"learning_rate": 9.799117163889559e-05, |
|
"loss": 0.021, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9166666666666666, |
|
"grad_norm": 0.03154176101088524, |
|
"learning_rate": 9.789947561577445e-05, |
|
"loss": 0.02, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"grad_norm": 0.027786221355199814, |
|
"learning_rate": 9.780577808869398e-05, |
|
"loss": 0.0188, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"eval_loss": 0.02070247381925583, |
|
"eval_runtime": 9.1159, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9351851851851852, |
|
"grad_norm": 0.030518539249897003, |
|
"learning_rate": 9.771008297285307e-05, |
|
"loss": 0.0218, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 0.024817178025841713, |
|
"learning_rate": 9.761239426692077e-05, |
|
"loss": 0.0202, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9537037037037037, |
|
"grad_norm": 0.025192229077219963, |
|
"learning_rate": 9.751271605286941e-05, |
|
"loss": 0.0197, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 0.02538897655904293, |
|
"learning_rate": 9.741105249580383e-05, |
|
"loss": 0.02, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"grad_norm": 0.025440450757741928, |
|
"learning_rate": 9.730740784378753e-05, |
|
"loss": 0.0193, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"eval_loss": 0.020300446078181267, |
|
"eval_runtime": 9.126, |
|
"eval_samples_per_second": 5.479, |
|
"eval_steps_per_second": 1.425, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9814814814814815, |
|
"grad_norm": 0.02362542785704136, |
|
"learning_rate": 9.7201786427665e-05, |
|
"loss": 0.0202, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.9907407407407407, |
|
"grad_norm": 0.022390421479940414, |
|
"learning_rate": 9.709419266088086e-05, |
|
"loss": 0.0188, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.026193244382739067, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.022, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.0092592592592593, |
|
"grad_norm": 0.028253022581338882, |
|
"learning_rate": 9.687310614099675e-05, |
|
"loss": 0.0159, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0185185185185186, |
|
"grad_norm": 0.02241157554090023, |
|
"learning_rate": 9.67596226261095e-05, |
|
"loss": 0.016, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0185185185185186, |
|
"eval_loss": 0.01969613879919052, |
|
"eval_runtime": 9.1053, |
|
"eval_samples_per_second": 5.491, |
|
"eval_steps_per_second": 1.428, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0277777777777777, |
|
"grad_norm": 0.027405373752117157, |
|
"learning_rate": 9.664418523660004e-05, |
|
"loss": 0.014, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.037037037037037, |
|
"grad_norm": 0.032646384090185165, |
|
"learning_rate": 9.652679879607843e-05, |
|
"loss": 0.0172, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0462962962962963, |
|
"grad_norm": 0.02552163228392601, |
|
"learning_rate": 9.640746820959684e-05, |
|
"loss": 0.014, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0555555555555556, |
|
"grad_norm": 0.022228199988603592, |
|
"learning_rate": 9.628619846344454e-05, |
|
"loss": 0.0172, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.0648148148148149, |
|
"grad_norm": 0.028009962290525436, |
|
"learning_rate": 9.616299462493952e-05, |
|
"loss": 0.0166, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0648148148148149, |
|
"eval_loss": 0.019864549860358238, |
|
"eval_runtime": 9.122, |
|
"eval_samples_per_second": 5.481, |
|
"eval_steps_per_second": 1.425, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.074074074074074, |
|
"grad_norm": 0.025030331686139107, |
|
"learning_rate": 9.603786184221693e-05, |
|
"loss": 0.0195, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.0833333333333333, |
|
"grad_norm": 0.030586065724492073, |
|
"learning_rate": 9.591080534401371e-05, |
|
"loss": 0.015, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.0925925925925926, |
|
"grad_norm": 0.02425476722419262, |
|
"learning_rate": 9.57818304394503e-05, |
|
"loss": 0.0183, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.1018518518518519, |
|
"grad_norm": 0.03203345090150833, |
|
"learning_rate": 9.565094251780871e-05, |
|
"loss": 0.0172, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.03028124012053013, |
|
"learning_rate": 9.551814704830734e-05, |
|
"loss": 0.0189, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"eval_loss": 0.019504941999912262, |
|
"eval_runtime": 9.1171, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1203703703703705, |
|
"grad_norm": 0.026934562250971794, |
|
"learning_rate": 9.538344957987244e-05, |
|
"loss": 0.0132, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.1296296296296295, |
|
"grad_norm": 0.02392655238509178, |
|
"learning_rate": 9.524685574090627e-05, |
|
"loss": 0.0184, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.1388888888888888, |
|
"grad_norm": 0.02336742728948593, |
|
"learning_rate": 9.51083712390519e-05, |
|
"loss": 0.0155, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.1481481481481481, |
|
"grad_norm": 0.025306498631834984, |
|
"learning_rate": 9.496800186095466e-05, |
|
"loss": 0.0156, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1574074074074074, |
|
"grad_norm": 0.02764940820634365, |
|
"learning_rate": 9.482575347202047e-05, |
|
"loss": 0.0211, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1574074074074074, |
|
"eval_loss": 0.018362991511821747, |
|
"eval_runtime": 9.1297, |
|
"eval_samples_per_second": 5.477, |
|
"eval_steps_per_second": 1.424, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.02213912270963192, |
|
"learning_rate": 9.468163201617062e-05, |
|
"loss": 0.0178, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.175925925925926, |
|
"grad_norm": 0.03320689871907234, |
|
"learning_rate": 9.453564351559348e-05, |
|
"loss": 0.0148, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 0.023370925337076187, |
|
"learning_rate": 9.438779407049281e-05, |
|
"loss": 0.0174, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.1944444444444444, |
|
"grad_norm": 0.02848099358379841, |
|
"learning_rate": 9.423808985883289e-05, |
|
"loss": 0.0174, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.2037037037037037, |
|
"grad_norm": 0.02608056552708149, |
|
"learning_rate": 9.40865371360804e-05, |
|
"loss": 0.0171, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2037037037037037, |
|
"eval_loss": 0.018851976841688156, |
|
"eval_runtime": 9.1046, |
|
"eval_samples_per_second": 5.492, |
|
"eval_steps_per_second": 1.428, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.212962962962963, |
|
"grad_norm": 0.02152630314230919, |
|
"learning_rate": 9.393314223494296e-05, |
|
"loss": 0.0172, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 0.02550230175256729, |
|
"learning_rate": 9.377791156510455e-05, |
|
"loss": 0.016, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.2314814814814814, |
|
"grad_norm": 0.025004474446177483, |
|
"learning_rate": 9.362085161295769e-05, |
|
"loss": 0.0163, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.2407407407407407, |
|
"grad_norm": 0.026416007429361343, |
|
"learning_rate": 9.346196894133239e-05, |
|
"loss": 0.0165, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.029432326555252075, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.0191, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.019194327294826508, |
|
"eval_runtime": 9.1131, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.427, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2592592592592593, |
|
"grad_norm": 0.03440408781170845, |
|
"learning_rate": 9.313876207150543e-05, |
|
"loss": 0.0165, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.2685185185185186, |
|
"grad_norm": 0.025614989921450615, |
|
"learning_rate": 9.297445137866727e-05, |
|
"loss": 0.0162, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.2777777777777777, |
|
"grad_norm": 0.02456337958574295, |
|
"learning_rate": 9.280834497651334e-05, |
|
"loss": 0.0192, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.287037037037037, |
|
"grad_norm": 0.051101330667734146, |
|
"learning_rate": 9.264044980588416e-05, |
|
"loss": 0.015, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.2962962962962963, |
|
"grad_norm": 0.03369716554880142, |
|
"learning_rate": 9.247077288236488e-05, |
|
"loss": 0.0184, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2962962962962963, |
|
"eval_loss": 0.018648317083716393, |
|
"eval_runtime": 9.1079, |
|
"eval_samples_per_second": 5.49, |
|
"eval_steps_per_second": 1.427, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.3055555555555556, |
|
"grad_norm": 0.024168213829398155, |
|
"learning_rate": 9.229932129599205e-05, |
|
"loss": 0.0166, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.3148148148148149, |
|
"grad_norm": 0.027960045263171196, |
|
"learning_rate": 9.212610221095748e-05, |
|
"loss": 0.0157, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.324074074074074, |
|
"grad_norm": 0.023985836654901505, |
|
"learning_rate": 9.195112286530873e-05, |
|
"loss": 0.0178, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.026084545999765396, |
|
"learning_rate": 9.177439057064683e-05, |
|
"loss": 0.0164, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.3425925925925926, |
|
"grad_norm": 0.022582337260246277, |
|
"learning_rate": 9.159591271182058e-05, |
|
"loss": 0.0162, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3425925925925926, |
|
"eval_loss": 0.018656810745596886, |
|
"eval_runtime": 9.1149, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3518518518518519, |
|
"grad_norm": 0.030290907248854637, |
|
"learning_rate": 9.141569674661817e-05, |
|
"loss": 0.021, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.3611111111111112, |
|
"grad_norm": 0.026109322905540466, |
|
"learning_rate": 9.123375020545535e-05, |
|
"loss": 0.0162, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.3703703703703702, |
|
"grad_norm": 0.02652176469564438, |
|
"learning_rate": 9.105008069106093e-05, |
|
"loss": 0.0169, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.3796296296296298, |
|
"grad_norm": 0.024147020652890205, |
|
"learning_rate": 9.086469587815904e-05, |
|
"loss": 0.0162, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 0.021294649690389633, |
|
"learning_rate": 9.067760351314838e-05, |
|
"loss": 0.0165, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"eval_loss": 0.018213987350463867, |
|
"eval_runtime": 9.1247, |
|
"eval_samples_per_second": 5.48, |
|
"eval_steps_per_second": 1.425, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3981481481481481, |
|
"grad_norm": 0.02462903782725334, |
|
"learning_rate": 9.048881141377863e-05, |
|
"loss": 0.0204, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.4074074074074074, |
|
"grad_norm": 0.024652326479554176, |
|
"learning_rate": 9.029832746882371e-05, |
|
"loss": 0.0164, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.4166666666666667, |
|
"grad_norm": 0.026834659278392792, |
|
"learning_rate": 9.01061596377522e-05, |
|
"loss": 0.018, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.425925925925926, |
|
"grad_norm": 0.02342064492404461, |
|
"learning_rate": 8.991231595039465e-05, |
|
"loss": 0.0156, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.4351851851851851, |
|
"grad_norm": 0.026441222056746483, |
|
"learning_rate": 8.97168045066082e-05, |
|
"loss": 0.0157, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4351851851851851, |
|
"eval_loss": 0.01855114847421646, |
|
"eval_runtime": 9.124, |
|
"eval_samples_per_second": 5.48, |
|
"eval_steps_per_second": 1.425, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 0.01796615496277809, |
|
"learning_rate": 8.951963347593797e-05, |
|
"loss": 0.0165, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4537037037037037, |
|
"grad_norm": 0.02256671153008938, |
|
"learning_rate": 8.932081109727582e-05, |
|
"loss": 0.0201, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.462962962962963, |
|
"grad_norm": 0.028528334572911263, |
|
"learning_rate": 8.912034567851599e-05, |
|
"loss": 0.0182, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.4722222222222223, |
|
"grad_norm": 0.029104968532919884, |
|
"learning_rate": 8.891824559620801e-05, |
|
"loss": 0.0153, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.02003669925034046, |
|
"learning_rate": 8.871451929520663e-05, |
|
"loss": 0.0159, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"eval_loss": 0.01888095587491989, |
|
"eval_runtime": 9.1172, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4907407407407407, |
|
"grad_norm": 0.019447356462478638, |
|
"learning_rate": 8.850917528831899e-05, |
|
"loss": 0.0163, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.03438901901245117, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.0125, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.5092592592592593, |
|
"grad_norm": 0.026535626500844955, |
|
"learning_rate": 8.809366854573831e-05, |
|
"loss": 0.0175, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.5185185185185186, |
|
"grad_norm": 0.029025647789239883, |
|
"learning_rate": 8.78835231722059e-05, |
|
"loss": 0.0164, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.5277777777777777, |
|
"grad_norm": 0.025528129190206528, |
|
"learning_rate": 8.767179481638303e-05, |
|
"loss": 0.0174, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5277777777777777, |
|
"eval_loss": 0.018690049648284912, |
|
"eval_runtime": 9.1481, |
|
"eval_samples_per_second": 5.466, |
|
"eval_steps_per_second": 1.421, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5370370370370372, |
|
"grad_norm": 0.025675086304545403, |
|
"learning_rate": 8.745849232544681e-05, |
|
"loss": 0.0179, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.5462962962962963, |
|
"grad_norm": 0.027451254427433014, |
|
"learning_rate": 8.724362461235029e-05, |
|
"loss": 0.0169, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.026652028784155846, |
|
"learning_rate": 8.702720065545024e-05, |
|
"loss": 0.0168, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.5648148148148149, |
|
"grad_norm": 0.030202018097043037, |
|
"learning_rate": 8.680922949813178e-05, |
|
"loss": 0.0162, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.574074074074074, |
|
"grad_norm": 0.027389824390411377, |
|
"learning_rate": 8.658972024843062e-05, |
|
"loss": 0.0184, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.574074074074074, |
|
"eval_loss": 0.018272995948791504, |
|
"eval_runtime": 9.1448, |
|
"eval_samples_per_second": 5.468, |
|
"eval_steps_per_second": 1.422, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.5833333333333335, |
|
"grad_norm": 0.025648167356848717, |
|
"learning_rate": 8.636868207865244e-05, |
|
"loss": 0.0152, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.5925925925925926, |
|
"grad_norm": 0.02472120150923729, |
|
"learning_rate": 8.614612422498964e-05, |
|
"loss": 0.0153, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.6018518518518519, |
|
"grad_norm": 0.020042769610881805, |
|
"learning_rate": 8.592205598713539e-05, |
|
"loss": 0.017, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.6111111111111112, |
|
"grad_norm": 0.029423648491501808, |
|
"learning_rate": 8.569648672789497e-05, |
|
"loss": 0.0158, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.6203703703703702, |
|
"grad_norm": 0.02159775421023369, |
|
"learning_rate": 8.546942587279465e-05, |
|
"loss": 0.0165, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6203703703703702, |
|
"eval_loss": 0.018273252993822098, |
|
"eval_runtime": 9.118, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6296296296296298, |
|
"grad_norm": 0.024837305769324303, |
|
"learning_rate": 8.524088290968781e-05, |
|
"loss": 0.0187, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.6388888888888888, |
|
"grad_norm": 0.02383432537317276, |
|
"learning_rate": 8.501086738835843e-05, |
|
"loss": 0.0181, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.6481481481481481, |
|
"grad_norm": 0.025743911042809486, |
|
"learning_rate": 8.47793889201221e-05, |
|
"loss": 0.0171, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.6574074074074074, |
|
"grad_norm": 0.023100929334759712, |
|
"learning_rate": 8.45464571774244e-05, |
|
"loss": 0.021, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.02667200192809105, |
|
"learning_rate": 8.43120818934367e-05, |
|
"loss": 0.0173, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_loss": 0.01778573729097843, |
|
"eval_runtime": 9.1324, |
|
"eval_samples_per_second": 5.475, |
|
"eval_steps_per_second": 1.424, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.675925925925926, |
|
"grad_norm": 0.02880384773015976, |
|
"learning_rate": 8.407627286164948e-05, |
|
"loss": 0.015, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.6851851851851851, |
|
"grad_norm": 0.030301645398139954, |
|
"learning_rate": 8.383903993546311e-05, |
|
"loss": 0.0157, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.6944444444444444, |
|
"grad_norm": 0.021445374935865402, |
|
"learning_rate": 8.360039302777612e-05, |
|
"loss": 0.0181, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.7037037037037037, |
|
"grad_norm": 0.023577649146318436, |
|
"learning_rate": 8.336034211057098e-05, |
|
"loss": 0.0153, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.7129629629629628, |
|
"grad_norm": 0.02492811530828476, |
|
"learning_rate": 8.31188972144974e-05, |
|
"loss": 0.0131, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7129629629629628, |
|
"eval_loss": 0.017187727615237236, |
|
"eval_runtime": 9.1252, |
|
"eval_samples_per_second": 5.479, |
|
"eval_steps_per_second": 1.425, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7222222222222223, |
|
"grad_norm": 0.023155970498919487, |
|
"learning_rate": 8.28760684284532e-05, |
|
"loss": 0.0162, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.7314814814814814, |
|
"grad_norm": 0.02491271123290062, |
|
"learning_rate": 8.263186589916273e-05, |
|
"loss": 0.0137, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.7407407407407407, |
|
"grad_norm": 0.02165275253355503, |
|
"learning_rate": 8.238629983075294e-05, |
|
"loss": 0.0143, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.024284129962325096, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.0144, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.7592592592592593, |
|
"grad_norm": 0.027395077049732208, |
|
"learning_rate": 8.18911181775353e-05, |
|
"loss": 0.0132, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7592592592592593, |
|
"eval_loss": 0.018012873828411102, |
|
"eval_runtime": 9.1149, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7685185185185186, |
|
"grad_norm": 0.02639261819422245, |
|
"learning_rate": 8.164152328414476e-05, |
|
"loss": 0.0156, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.02319464646279812, |
|
"learning_rate": 8.139060623360493e-05, |
|
"loss": 0.0121, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.7870370370370372, |
|
"grad_norm": 0.020444169640541077, |
|
"learning_rate": 8.113837751061246e-05, |
|
"loss": 0.0156, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.7962962962962963, |
|
"grad_norm": 0.03843529522418976, |
|
"learning_rate": 8.088484765467286e-05, |
|
"loss": 0.0202, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.8055555555555556, |
|
"grad_norm": 0.03014414757490158, |
|
"learning_rate": 8.063002725966015e-05, |
|
"loss": 0.0157, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8055555555555556, |
|
"eval_loss": 0.018071575090289116, |
|
"eval_runtime": 9.1428, |
|
"eval_samples_per_second": 5.469, |
|
"eval_steps_per_second": 1.422, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8148148148148149, |
|
"grad_norm": 0.028225911781191826, |
|
"learning_rate": 8.037392697337418e-05, |
|
"loss": 0.0152, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.824074074074074, |
|
"grad_norm": 0.022350864484906197, |
|
"learning_rate": 8.011655749709575e-05, |
|
"loss": 0.0147, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.023073699325323105, |
|
"learning_rate": 7.985792958513931e-05, |
|
"loss": 0.0142, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.8425925925925926, |
|
"grad_norm": 0.027160046622157097, |
|
"learning_rate": 7.95980540444038e-05, |
|
"loss": 0.0181, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 0.02501911297440529, |
|
"learning_rate": 7.93369417339209e-05, |
|
"loss": 0.0154, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"eval_loss": 0.01711750030517578, |
|
"eval_runtime": 9.1469, |
|
"eval_samples_per_second": 5.466, |
|
"eval_steps_per_second": 1.421, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8611111111111112, |
|
"grad_norm": 0.02209513448178768, |
|
"learning_rate": 7.907460356440133e-05, |
|
"loss": 0.0156, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.8703703703703702, |
|
"grad_norm": 0.022372853010892868, |
|
"learning_rate": 7.881105049777901e-05, |
|
"loss": 0.0182, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.8796296296296298, |
|
"grad_norm": 0.02874351665377617, |
|
"learning_rate": 7.854629354675291e-05, |
|
"loss": 0.0145, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.025754928588867188, |
|
"learning_rate": 7.828034377432693e-05, |
|
"loss": 0.0161, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.8981481481481481, |
|
"grad_norm": 0.023868247866630554, |
|
"learning_rate": 7.801321229334764e-05, |
|
"loss": 0.0139, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.8981481481481481, |
|
"eval_loss": 0.01687374897301197, |
|
"eval_runtime": 9.1148, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.9074074074074074, |
|
"grad_norm": 0.02167942002415657, |
|
"learning_rate": 7.774491026603985e-05, |
|
"loss": 0.0172, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.9166666666666665, |
|
"grad_norm": 0.028955647721886635, |
|
"learning_rate": 7.74754489035403e-05, |
|
"loss": 0.0182, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.925925925925926, |
|
"grad_norm": 0.023490311577916145, |
|
"learning_rate": 7.720483946542914e-05, |
|
"loss": 0.0176, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9351851851851851, |
|
"grad_norm": 0.02635806053876877, |
|
"learning_rate": 7.69330932592594e-05, |
|
"loss": 0.0149, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"grad_norm": 0.02554040215909481, |
|
"learning_rate": 7.666022164008457e-05, |
|
"loss": 0.0169, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"eval_loss": 0.016974864527583122, |
|
"eval_runtime": 9.1008, |
|
"eval_samples_per_second": 5.494, |
|
"eval_steps_per_second": 1.428, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9537037037037037, |
|
"grad_norm": 0.02924305759370327, |
|
"learning_rate": 7.63862360099841e-05, |
|
"loss": 0.0148, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.9629629629629628, |
|
"grad_norm": 0.020948631688952446, |
|
"learning_rate": 7.611114781758692e-05, |
|
"loss": 0.0158, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.9722222222222223, |
|
"grad_norm": 0.021703558042645454, |
|
"learning_rate": 7.583496855759316e-05, |
|
"loss": 0.0172, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.9814814814814814, |
|
"grad_norm": 0.022922605276107788, |
|
"learning_rate": 7.555770977029367e-05, |
|
"loss": 0.0149, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.9907407407407407, |
|
"grad_norm": 0.025769095867872238, |
|
"learning_rate": 7.527938304108795e-05, |
|
"loss": 0.0158, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.9907407407407407, |
|
"eval_loss": 0.017042405903339386, |
|
"eval_runtime": 9.1168, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.03371057286858559, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0126, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.009259259259259, |
|
"grad_norm": 0.01711084321141243, |
|
"learning_rate": 7.471957232119234e-05, |
|
"loss": 0.0142, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.0185185185185186, |
|
"grad_norm": 0.023618614301085472, |
|
"learning_rate": 7.443811172247821e-05, |
|
"loss": 0.0151, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.0277777777777777, |
|
"grad_norm": 0.02181304432451725, |
|
"learning_rate": 7.415562996483192e-05, |
|
"loss": 0.0132, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.037037037037037, |
|
"grad_norm": 0.020521776750683784, |
|
"learning_rate": 7.387213885189746e-05, |
|
"loss": 0.0139, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.037037037037037, |
|
"eval_loss": 0.01702064275741577, |
|
"eval_runtime": 9.1369, |
|
"eval_samples_per_second": 5.472, |
|
"eval_steps_per_second": 1.423, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.0462962962962963, |
|
"grad_norm": 0.022209780290722847, |
|
"learning_rate": 7.358765022949519e-05, |
|
"loss": 0.0152, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.0555555555555554, |
|
"grad_norm": 0.02240665629506111, |
|
"learning_rate": 7.330217598512695e-05, |
|
"loss": 0.0136, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.064814814814815, |
|
"grad_norm": 0.024021176621317863, |
|
"learning_rate": 7.30157280474793e-05, |
|
"loss": 0.0134, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.074074074074074, |
|
"grad_norm": 0.022297382354736328, |
|
"learning_rate": 7.272831838592503e-05, |
|
"loss": 0.0158, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 0.023189576342701912, |
|
"learning_rate": 7.243995901002312e-05, |
|
"loss": 0.0146, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"eval_loss": 0.017011733725667, |
|
"eval_runtime": 9.1385, |
|
"eval_samples_per_second": 5.471, |
|
"eval_steps_per_second": 1.423, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0925925925925926, |
|
"grad_norm": 0.02641259878873825, |
|
"learning_rate": 7.215066196901676e-05, |
|
"loss": 0.0149, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.1018518518518516, |
|
"grad_norm": 0.02105395309627056, |
|
"learning_rate": 7.186043935133005e-05, |
|
"loss": 0.0105, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.111111111111111, |
|
"grad_norm": 0.020818866789340973, |
|
"learning_rate": 7.156930328406268e-05, |
|
"loss": 0.0144, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.1203703703703702, |
|
"grad_norm": 0.028699271380901337, |
|
"learning_rate": 7.127726593248337e-05, |
|
"loss": 0.0134, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.1296296296296298, |
|
"grad_norm": 0.025844816118478775, |
|
"learning_rate": 7.098433949952146e-05, |
|
"loss": 0.0115, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.1296296296296298, |
|
"eval_loss": 0.017404422163963318, |
|
"eval_runtime": 9.1138, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.138888888888889, |
|
"grad_norm": 0.02628181129693985, |
|
"learning_rate": 7.069053622525696e-05, |
|
"loss": 0.0135, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.148148148148148, |
|
"grad_norm": 0.03826741501688957, |
|
"learning_rate": 7.039586838640919e-05, |
|
"loss": 0.013, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.1574074074074074, |
|
"grad_norm": 0.02549687772989273, |
|
"learning_rate": 7.01003482958237e-05, |
|
"loss": 0.0112, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.1666666666666665, |
|
"grad_norm": 0.02850032038986683, |
|
"learning_rate": 6.980398830195785e-05, |
|
"loss": 0.0114, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.175925925925926, |
|
"grad_norm": 0.028789905831217766, |
|
"learning_rate": 6.950680078836474e-05, |
|
"loss": 0.0138, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.175925925925926, |
|
"eval_loss": 0.016838619485497475, |
|
"eval_runtime": 9.1141, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.185185185185185, |
|
"grad_norm": 0.024276968091726303, |
|
"learning_rate": 6.920879817317589e-05, |
|
"loss": 0.0156, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.1944444444444446, |
|
"grad_norm": 0.02652347832918167, |
|
"learning_rate": 6.890999290858214e-05, |
|
"loss": 0.0111, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.2037037037037037, |
|
"grad_norm": 0.03363705053925514, |
|
"learning_rate": 6.861039748031351e-05, |
|
"loss": 0.0155, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.212962962962963, |
|
"grad_norm": 0.025364842265844345, |
|
"learning_rate": 6.83100244071174e-05, |
|
"loss": 0.0127, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.024912815541028976, |
|
"learning_rate": 6.800888624023553e-05, |
|
"loss": 0.0138, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"eval_loss": 0.017057882621884346, |
|
"eval_runtime": 9.1505, |
|
"eval_samples_per_second": 5.464, |
|
"eval_steps_per_second": 1.421, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.2314814814814814, |
|
"grad_norm": 0.031296826899051666, |
|
"learning_rate": 6.770699556287939e-05, |
|
"loss": 0.0138, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.240740740740741, |
|
"grad_norm": 0.03207860141992569, |
|
"learning_rate": 6.740436498970452e-05, |
|
"loss": 0.0128, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.027626443654298782, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.0142, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.259259259259259, |
|
"grad_norm": 0.025963863357901573, |
|
"learning_rate": 6.679693476857711e-05, |
|
"loss": 0.0137, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.2685185185185186, |
|
"grad_norm": 0.022552739828824997, |
|
"learning_rate": 6.649216050240539e-05, |
|
"loss": 0.0134, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.2685185185185186, |
|
"eval_loss": 0.016679909080266953, |
|
"eval_runtime": 9.1095, |
|
"eval_samples_per_second": 5.489, |
|
"eval_steps_per_second": 1.427, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.2777777777777777, |
|
"grad_norm": 0.0247825738042593, |
|
"learning_rate": 6.618669710291606e-05, |
|
"loss": 0.0116, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.287037037037037, |
|
"grad_norm": 0.021808508783578873, |
|
"learning_rate": 6.588055733405266e-05, |
|
"loss": 0.014, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.2962962962962963, |
|
"grad_norm": 0.025087367743253708, |
|
"learning_rate": 6.557375398802123e-05, |
|
"loss": 0.0167, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.3055555555555554, |
|
"grad_norm": 0.022722622379660606, |
|
"learning_rate": 6.526629988475567e-05, |
|
"loss": 0.013, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.314814814814815, |
|
"grad_norm": 0.023495636880397797, |
|
"learning_rate": 6.495820787138209e-05, |
|
"loss": 0.0167, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.314814814814815, |
|
"eval_loss": 0.016377143561840057, |
|
"eval_runtime": 9.1133, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.324074074074074, |
|
"grad_norm": 0.021211953833699226, |
|
"learning_rate": 6.464949082168204e-05, |
|
"loss": 0.0125, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 0.022748148068785667, |
|
"learning_rate": 6.434016163555452e-05, |
|
"loss": 0.0121, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.3425925925925926, |
|
"grad_norm": 0.021960506215691566, |
|
"learning_rate": 6.403023323847695e-05, |
|
"loss": 0.0159, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.351851851851852, |
|
"grad_norm": 0.02572719193994999, |
|
"learning_rate": 6.371971858096508e-05, |
|
"loss": 0.0137, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.361111111111111, |
|
"grad_norm": 0.027611717581748962, |
|
"learning_rate": 6.340863063803188e-05, |
|
"loss": 0.0123, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.361111111111111, |
|
"eval_loss": 0.016414109617471695, |
|
"eval_runtime": 9.1093, |
|
"eval_samples_per_second": 5.489, |
|
"eval_steps_per_second": 1.427, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.3703703703703702, |
|
"grad_norm": 0.026147907599806786, |
|
"learning_rate": 6.30969824086453e-05, |
|
"loss": 0.012, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.3796296296296298, |
|
"grad_norm": 0.026667073369026184, |
|
"learning_rate": 6.27847869151852e-05, |
|
"loss": 0.0127, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.388888888888889, |
|
"grad_norm": 0.023840012028813362, |
|
"learning_rate": 6.247205720289907e-05, |
|
"loss": 0.0141, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.398148148148148, |
|
"grad_norm": 0.028697500005364418, |
|
"learning_rate": 6.215880633935708e-05, |
|
"loss": 0.0135, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.4074074074074074, |
|
"grad_norm": 0.029124466702342033, |
|
"learning_rate": 6.184504741390596e-05, |
|
"loss": 0.0139, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.4074074074074074, |
|
"eval_loss": 0.016279693692922592, |
|
"eval_runtime": 9.1162, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.4166666666666665, |
|
"grad_norm": 0.020265506580471992, |
|
"learning_rate": 6.153079353712201e-05, |
|
"loss": 0.0129, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.425925925925926, |
|
"grad_norm": 0.020486822351813316, |
|
"learning_rate": 6.121605784026339e-05, |
|
"loss": 0.0114, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.435185185185185, |
|
"grad_norm": 0.02432914823293686, |
|
"learning_rate": 6.09008534747213e-05, |
|
"loss": 0.0138, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.4444444444444446, |
|
"grad_norm": 0.027614833787083626, |
|
"learning_rate": 6.058519361147055e-05, |
|
"loss": 0.0118, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.4537037037037037, |
|
"grad_norm": 0.03493235632777214, |
|
"learning_rate": 6.02690914405191e-05, |
|
"loss": 0.0125, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.4537037037037037, |
|
"eval_loss": 0.016143780201673508, |
|
"eval_runtime": 9.2054, |
|
"eval_samples_per_second": 5.432, |
|
"eval_steps_per_second": 1.412, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.462962962962963, |
|
"grad_norm": 0.024250265210866928, |
|
"learning_rate": 5.995256017035703e-05, |
|
"loss": 0.0139, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.4722222222222223, |
|
"grad_norm": 0.022808292880654335, |
|
"learning_rate": 5.963561302740449e-05, |
|
"loss": 0.0162, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.4814814814814814, |
|
"grad_norm": 0.03109206259250641, |
|
"learning_rate": 5.9318263255459116e-05, |
|
"loss": 0.0123, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.490740740740741, |
|
"grad_norm": 0.02985144406557083, |
|
"learning_rate": 5.900052411514257e-05, |
|
"loss": 0.015, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.024866314604878426, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.0126, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.016046511009335518, |
|
"eval_runtime": 9.1128, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.427, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.5092592592592595, |
|
"grad_norm": 0.0215854924172163, |
|
"learning_rate": 5.836393085267776e-05, |
|
"loss": 0.0133, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.5185185185185186, |
|
"grad_norm": 0.02321489341557026, |
|
"learning_rate": 5.804510333090287e-05, |
|
"loss": 0.0175, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.5277777777777777, |
|
"grad_norm": 0.024908283725380898, |
|
"learning_rate": 5.772593964039203e-05, |
|
"loss": 0.0116, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.537037037037037, |
|
"grad_norm": 0.02571980282664299, |
|
"learning_rate": 5.740645311756245e-05, |
|
"loss": 0.0125, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.5462962962962963, |
|
"grad_norm": 0.022897284477949142, |
|
"learning_rate": 5.708665711232103e-05, |
|
"loss": 0.0138, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.5462962962962963, |
|
"eval_loss": 0.016013609245419502, |
|
"eval_runtime": 9.1743, |
|
"eval_samples_per_second": 5.45, |
|
"eval_steps_per_second": 1.417, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.5555555555555554, |
|
"grad_norm": 0.023732876405119896, |
|
"learning_rate": 5.6766564987506566e-05, |
|
"loss": 0.0136, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.564814814814815, |
|
"grad_norm": 0.024980880320072174, |
|
"learning_rate": 5.644619011833133e-05, |
|
"loss": 0.0131, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.574074074074074, |
|
"grad_norm": 0.023262949660420418, |
|
"learning_rate": 5.6125545891822274e-05, |
|
"loss": 0.0143, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.5833333333333335, |
|
"grad_norm": 0.024468230083584785, |
|
"learning_rate": 5.5804645706261514e-05, |
|
"loss": 0.0148, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.5925925925925926, |
|
"grad_norm": 0.020350055769085884, |
|
"learning_rate": 5.548350297062659e-05, |
|
"loss": 0.0125, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.5925925925925926, |
|
"eval_loss": 0.015153205953538418, |
|
"eval_runtime": 9.1126, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.427, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.601851851851852, |
|
"grad_norm": 0.027165360748767853, |
|
"learning_rate": 5.516213110403009e-05, |
|
"loss": 0.0093, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.611111111111111, |
|
"grad_norm": 0.021070580929517746, |
|
"learning_rate": 5.484054353515896e-05, |
|
"loss": 0.0138, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.6203703703703702, |
|
"grad_norm": 0.025997430086135864, |
|
"learning_rate": 5.451875370171341e-05, |
|
"loss": 0.0121, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.6296296296296298, |
|
"grad_norm": 0.02517426759004593, |
|
"learning_rate": 5.419677504984534e-05, |
|
"loss": 0.0126, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.638888888888889, |
|
"grad_norm": 0.025812286883592606, |
|
"learning_rate": 5.387462103359655e-05, |
|
"loss": 0.0133, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.638888888888889, |
|
"eval_loss": 0.016152961179614067, |
|
"eval_runtime": 9.1127, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.427, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.648148148148148, |
|
"grad_norm": 0.02393972873687744, |
|
"learning_rate": 5.355230511433651e-05, |
|
"loss": 0.0136, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.6574074074074074, |
|
"grad_norm": 0.021706297993659973, |
|
"learning_rate": 5.32298407601999e-05, |
|
"loss": 0.0133, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.026299407705664635, |
|
"learning_rate": 5.290724144552379e-05, |
|
"loss": 0.0143, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.675925925925926, |
|
"grad_norm": 0.030511364340782166, |
|
"learning_rate": 5.258452065028473e-05, |
|
"loss": 0.0137, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.685185185185185, |
|
"grad_norm": 0.024854540824890137, |
|
"learning_rate": 5.226169185953532e-05, |
|
"loss": 0.0125, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.685185185185185, |
|
"eval_loss": 0.016076602041721344, |
|
"eval_runtime": 9.1632, |
|
"eval_samples_per_second": 5.457, |
|
"eval_steps_per_second": 1.419, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.6944444444444446, |
|
"grad_norm": 0.022800520062446594, |
|
"learning_rate": 5.193876856284085e-05, |
|
"loss": 0.012, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.7037037037037037, |
|
"grad_norm": 0.021870015189051628, |
|
"learning_rate": 5.1615764253715536e-05, |
|
"loss": 0.0136, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.712962962962963, |
|
"grad_norm": 0.020156167447566986, |
|
"learning_rate": 5.129269242905882e-05, |
|
"loss": 0.012, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.7222222222222223, |
|
"grad_norm": 0.019064266234636307, |
|
"learning_rate": 5.096956658859122e-05, |
|
"loss": 0.0137, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.7314814814814814, |
|
"grad_norm": 0.027288921177387238, |
|
"learning_rate": 5.064640023429043e-05, |
|
"loss": 0.0147, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.7314814814814814, |
|
"eval_loss": 0.01584070920944214, |
|
"eval_runtime": 9.1151, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.7407407407407405, |
|
"grad_norm": 0.02484748885035515, |
|
"learning_rate": 5.0323206869826966e-05, |
|
"loss": 0.0111, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.02521962858736515, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0134, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.7592592592592595, |
|
"grad_norm": 0.023346634581685066, |
|
"learning_rate": 4.967679313017303e-05, |
|
"loss": 0.0124, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.7685185185185186, |
|
"grad_norm": 0.021654650568962097, |
|
"learning_rate": 4.9353599765709584e-05, |
|
"loss": 0.0144, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 0.021227596327662468, |
|
"learning_rate": 4.903043341140879e-05, |
|
"loss": 0.0134, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"eval_loss": 0.016122175380587578, |
|
"eval_runtime": 9.1019, |
|
"eval_samples_per_second": 5.493, |
|
"eval_steps_per_second": 1.428, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.787037037037037, |
|
"grad_norm": 0.024656914174556732, |
|
"learning_rate": 4.870730757094121e-05, |
|
"loss": 0.0123, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.7962962962962963, |
|
"grad_norm": 0.02583468146622181, |
|
"learning_rate": 4.8384235746284476e-05, |
|
"loss": 0.015, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.8055555555555554, |
|
"grad_norm": 0.022909915074706078, |
|
"learning_rate": 4.806123143715916e-05, |
|
"loss": 0.0142, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.814814814814815, |
|
"grad_norm": 0.02014041878283024, |
|
"learning_rate": 4.7738308140464685e-05, |
|
"loss": 0.0131, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.824074074074074, |
|
"grad_norm": 0.022683143615722656, |
|
"learning_rate": 4.7415479349715275e-05, |
|
"loss": 0.0124, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.824074074074074, |
|
"eval_loss": 0.015797268599271774, |
|
"eval_runtime": 9.1281, |
|
"eval_samples_per_second": 5.478, |
|
"eval_steps_per_second": 1.424, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.8333333333333335, |
|
"grad_norm": 0.025906002148985863, |
|
"learning_rate": 4.709275855447621e-05, |
|
"loss": 0.0154, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.8425925925925926, |
|
"grad_norm": 0.027820315212011337, |
|
"learning_rate": 4.677015923980011e-05, |
|
"loss": 0.0138, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.851851851851852, |
|
"grad_norm": 0.023744860664010048, |
|
"learning_rate": 4.6447694885663514e-05, |
|
"loss": 0.0124, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.861111111111111, |
|
"grad_norm": 0.026518192142248154, |
|
"learning_rate": 4.612537896640346e-05, |
|
"loss": 0.0155, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.8703703703703702, |
|
"grad_norm": 0.020426657050848007, |
|
"learning_rate": 4.5803224950154656e-05, |
|
"loss": 0.0132, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.8703703703703702, |
|
"eval_loss": 0.015400240197777748, |
|
"eval_runtime": 9.1185, |
|
"eval_samples_per_second": 5.483, |
|
"eval_steps_per_second": 1.426, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.8796296296296298, |
|
"grad_norm": 0.022766800597310066, |
|
"learning_rate": 4.54812462982866e-05, |
|
"loss": 0.0139, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.888888888888889, |
|
"grad_norm": 0.021728193387389183, |
|
"learning_rate": 4.515945646484105e-05, |
|
"loss": 0.0133, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.898148148148148, |
|
"grad_norm": 0.0226016603410244, |
|
"learning_rate": 4.4837868895969936e-05, |
|
"loss": 0.0126, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.9074074074074074, |
|
"grad_norm": 0.027723975479602814, |
|
"learning_rate": 4.451649702937342e-05, |
|
"loss": 0.0106, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 0.01856391504406929, |
|
"learning_rate": 4.4195354293738484e-05, |
|
"loss": 0.0146, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"eval_loss": 0.015166966244578362, |
|
"eval_runtime": 9.1172, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.925925925925926, |
|
"grad_norm": 0.019857853651046753, |
|
"learning_rate": 4.387445410817774e-05, |
|
"loss": 0.0124, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.935185185185185, |
|
"grad_norm": 0.025410892441868782, |
|
"learning_rate": 4.355380988166867e-05, |
|
"loss": 0.0119, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.9444444444444446, |
|
"grad_norm": 0.02312655746936798, |
|
"learning_rate": 4.323343501249346e-05, |
|
"loss": 0.0144, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.9537037037037037, |
|
"grad_norm": 0.022076064720749855, |
|
"learning_rate": 4.2913342887678985e-05, |
|
"loss": 0.0117, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 0.023769903928041458, |
|
"learning_rate": 4.259354688243757e-05, |
|
"loss": 0.014, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"eval_loss": 0.014957955107092857, |
|
"eval_runtime": 9.1101, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.9722222222222223, |
|
"grad_norm": 0.023904340341687202, |
|
"learning_rate": 4.227406035960798e-05, |
|
"loss": 0.0121, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.9814814814814814, |
|
"grad_norm": 0.02383498102426529, |
|
"learning_rate": 4.195489666909713e-05, |
|
"loss": 0.0119, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.9907407407407405, |
|
"grad_norm": 0.03048449568450451, |
|
"learning_rate": 4.1636069147322246e-05, |
|
"loss": 0.0136, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.023879334330558777, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.0137, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.009259259259259, |
|
"grad_norm": 0.025208691135048866, |
|
"learning_rate": 4.099947588485744e-05, |
|
"loss": 0.0122, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.009259259259259, |
|
"eval_loss": 0.015089023858308792, |
|
"eval_runtime": 9.116, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.0185185185185186, |
|
"grad_norm": 0.020718788728117943, |
|
"learning_rate": 4.06817367445409e-05, |
|
"loss": 0.0095, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.0277777777777777, |
|
"grad_norm": 0.024810951203107834, |
|
"learning_rate": 4.036438697259551e-05, |
|
"loss": 0.0134, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.037037037037037, |
|
"grad_norm": 0.019842958077788353, |
|
"learning_rate": 4.004743982964298e-05, |
|
"loss": 0.0122, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.0462962962962963, |
|
"grad_norm": 0.01818239875137806, |
|
"learning_rate": 3.97309085594809e-05, |
|
"loss": 0.0101, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 3.0555555555555554, |
|
"grad_norm": 0.022604303434491158, |
|
"learning_rate": 3.941480638852948e-05, |
|
"loss": 0.0118, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.0555555555555554, |
|
"eval_loss": 0.015503546223044395, |
|
"eval_runtime": 9.1063, |
|
"eval_samples_per_second": 5.491, |
|
"eval_steps_per_second": 1.428, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.064814814814815, |
|
"grad_norm": 0.024690452963113785, |
|
"learning_rate": 3.909914652527871e-05, |
|
"loss": 0.0109, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 3.074074074074074, |
|
"grad_norm": 0.02343621291220188, |
|
"learning_rate": 3.878394215973663e-05, |
|
"loss": 0.0123, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.0833333333333335, |
|
"grad_norm": 0.026170087978243828, |
|
"learning_rate": 3.846920646287799e-05, |
|
"loss": 0.0122, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.0925925925925926, |
|
"grad_norm": 0.024799769744277, |
|
"learning_rate": 3.815495258609404e-05, |
|
"loss": 0.0125, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.1018518518518516, |
|
"grad_norm": 0.02072787657380104, |
|
"learning_rate": 3.784119366064293e-05, |
|
"loss": 0.0108, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.1018518518518516, |
|
"eval_loss": 0.0155374426394701, |
|
"eval_runtime": 9.1152, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 0.021989421918988228, |
|
"learning_rate": 3.752794279710094e-05, |
|
"loss": 0.0114, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.1203703703703702, |
|
"grad_norm": 0.03829918056726456, |
|
"learning_rate": 3.721521308481482e-05, |
|
"loss": 0.0101, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.1296296296296298, |
|
"grad_norm": 0.029835987836122513, |
|
"learning_rate": 3.6903017591354706e-05, |
|
"loss": 0.0107, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.138888888888889, |
|
"grad_norm": 0.02231847681105137, |
|
"learning_rate": 3.6591369361968124e-05, |
|
"loss": 0.012, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.148148148148148, |
|
"grad_norm": 0.02263280376791954, |
|
"learning_rate": 3.628028141903493e-05, |
|
"loss": 0.0103, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.148148148148148, |
|
"eval_loss": 0.01546421181410551, |
|
"eval_runtime": 9.1199, |
|
"eval_samples_per_second": 5.483, |
|
"eval_steps_per_second": 1.425, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.1574074074074074, |
|
"grad_norm": 0.023618226870894432, |
|
"learning_rate": 3.596976676152306e-05, |
|
"loss": 0.0116, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.1666666666666665, |
|
"grad_norm": 0.02577986940741539, |
|
"learning_rate": 3.5659838364445505e-05, |
|
"loss": 0.0108, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.175925925925926, |
|
"grad_norm": 0.026071948930621147, |
|
"learning_rate": 3.535050917831797e-05, |
|
"loss": 0.0108, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.185185185185185, |
|
"grad_norm": 0.038238752633333206, |
|
"learning_rate": 3.5041792128617927e-05, |
|
"loss": 0.0094, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.1944444444444446, |
|
"grad_norm": 0.029051663354039192, |
|
"learning_rate": 3.473370011524435e-05, |
|
"loss": 0.0099, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.1944444444444446, |
|
"eval_loss": 0.015372861176729202, |
|
"eval_runtime": 9.1378, |
|
"eval_samples_per_second": 5.472, |
|
"eval_steps_per_second": 1.423, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.2037037037037037, |
|
"grad_norm": 0.022384386509656906, |
|
"learning_rate": 3.442624601197877e-05, |
|
"loss": 0.0096, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.212962962962963, |
|
"grad_norm": 0.024341940879821777, |
|
"learning_rate": 3.4119442665947344e-05, |
|
"loss": 0.0094, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 3.2222222222222223, |
|
"grad_norm": 0.02119499258697033, |
|
"learning_rate": 3.381330289708396e-05, |
|
"loss": 0.011, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.2314814814814814, |
|
"grad_norm": 0.025269504636526108, |
|
"learning_rate": 3.350783949759462e-05, |
|
"loss": 0.0105, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 3.240740740740741, |
|
"grad_norm": 0.02428189478814602, |
|
"learning_rate": 3.3203065231422904e-05, |
|
"loss": 0.0115, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.240740740740741, |
|
"eval_loss": 0.015474287793040276, |
|
"eval_runtime": 9.1142, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.027830710634589195, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.014, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 3.259259259259259, |
|
"grad_norm": 0.026644067838788033, |
|
"learning_rate": 3.2595635010295475e-05, |
|
"loss": 0.0132, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.2685185185185186, |
|
"grad_norm": 0.028307707980275154, |
|
"learning_rate": 3.2293004437120624e-05, |
|
"loss": 0.0093, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 3.2777777777777777, |
|
"grad_norm": 0.03480321913957596, |
|
"learning_rate": 3.199111375976449e-05, |
|
"loss": 0.0107, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.287037037037037, |
|
"grad_norm": 0.029546814039349556, |
|
"learning_rate": 3.1689975592882603e-05, |
|
"loss": 0.0099, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.287037037037037, |
|
"eval_loss": 0.015444349497556686, |
|
"eval_runtime": 9.1458, |
|
"eval_samples_per_second": 5.467, |
|
"eval_steps_per_second": 1.421, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.2962962962962963, |
|
"grad_norm": 0.02437739446759224, |
|
"learning_rate": 3.1389602519686515e-05, |
|
"loss": 0.0118, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.3055555555555554, |
|
"grad_norm": 0.029530519619584084, |
|
"learning_rate": 3.109000709141788e-05, |
|
"loss": 0.0121, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.314814814814815, |
|
"grad_norm": 0.029449855908751488, |
|
"learning_rate": 3.079120182682412e-05, |
|
"loss": 0.0099, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.324074074074074, |
|
"grad_norm": 0.020589128136634827, |
|
"learning_rate": 3.049319921163526e-05, |
|
"loss": 0.0119, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.02450876496732235, |
|
"learning_rate": 3.019601169804216e-05, |
|
"loss": 0.0129, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"eval_loss": 0.0157760102301836, |
|
"eval_runtime": 9.1103, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.3425925925925926, |
|
"grad_norm": 0.0208604596555233, |
|
"learning_rate": 2.9899651704176325e-05, |
|
"loss": 0.011, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.351851851851852, |
|
"grad_norm": 0.025153055787086487, |
|
"learning_rate": 2.9604131613590824e-05, |
|
"loss": 0.0109, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.361111111111111, |
|
"grad_norm": 0.021455859765410423, |
|
"learning_rate": 2.9309463774743046e-05, |
|
"loss": 0.0122, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.3703703703703702, |
|
"grad_norm": 0.01964252069592476, |
|
"learning_rate": 2.901566050047855e-05, |
|
"loss": 0.0113, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.3796296296296298, |
|
"grad_norm": 0.020809266716241837, |
|
"learning_rate": 2.872273406751664e-05, |
|
"loss": 0.0105, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.3796296296296298, |
|
"eval_loss": 0.015391937457025051, |
|
"eval_runtime": 9.111, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.388888888888889, |
|
"grad_norm": 0.025048566982150078, |
|
"learning_rate": 2.8430696715937337e-05, |
|
"loss": 0.0107, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.398148148148148, |
|
"grad_norm": 0.024674881249666214, |
|
"learning_rate": 2.8139560648669962e-05, |
|
"loss": 0.0113, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.4074074074074074, |
|
"grad_norm": 0.025468124076724052, |
|
"learning_rate": 2.7849338030983257e-05, |
|
"loss": 0.012, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.4166666666666665, |
|
"grad_norm": 0.022864418104290962, |
|
"learning_rate": 2.7560040989976892e-05, |
|
"loss": 0.01, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.425925925925926, |
|
"grad_norm": 0.02258789725601673, |
|
"learning_rate": 2.7271681614074973e-05, |
|
"loss": 0.0121, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.425925925925926, |
|
"eval_loss": 0.015503110364079475, |
|
"eval_runtime": 9.1077, |
|
"eval_samples_per_second": 5.49, |
|
"eval_steps_per_second": 1.427, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.435185185185185, |
|
"grad_norm": 0.025097696110606194, |
|
"learning_rate": 2.6984271952520722e-05, |
|
"loss": 0.0104, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.4444444444444446, |
|
"grad_norm": 0.028177309781312943, |
|
"learning_rate": 2.6697824014873075e-05, |
|
"loss": 0.0132, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.4537037037037037, |
|
"grad_norm": 0.026587417349219322, |
|
"learning_rate": 2.641234977050484e-05, |
|
"loss": 0.0085, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.462962962962963, |
|
"grad_norm": 0.0189076978713274, |
|
"learning_rate": 2.612786114810255e-05, |
|
"loss": 0.0096, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.4722222222222223, |
|
"grad_norm": 0.029332995414733887, |
|
"learning_rate": 2.5844370035168073e-05, |
|
"loss": 0.0096, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.4722222222222223, |
|
"eval_loss": 0.015461472794413567, |
|
"eval_runtime": 9.1144, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.4814814814814814, |
|
"grad_norm": 0.02185731939971447, |
|
"learning_rate": 2.5561888277521794e-05, |
|
"loss": 0.0098, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.490740740740741, |
|
"grad_norm": 0.026887575164437294, |
|
"learning_rate": 2.528042767880766e-05, |
|
"loss": 0.0114, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.023131586611270905, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.0112, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.5092592592592595, |
|
"grad_norm": 0.028937749564647675, |
|
"learning_rate": 2.4720616958912053e-05, |
|
"loss": 0.0121, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.5185185185185186, |
|
"grad_norm": 0.032668791711330414, |
|
"learning_rate": 2.4442290229706344e-05, |
|
"loss": 0.0112, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.5185185185185186, |
|
"eval_loss": 0.015212837606668472, |
|
"eval_runtime": 9.1177, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.5277777777777777, |
|
"grad_norm": 0.02449023723602295, |
|
"learning_rate": 2.4165031442406855e-05, |
|
"loss": 0.0117, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.537037037037037, |
|
"grad_norm": 0.025157004594802856, |
|
"learning_rate": 2.3888852182413085e-05, |
|
"loss": 0.0091, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.5462962962962963, |
|
"grad_norm": 0.03108743578195572, |
|
"learning_rate": 2.361376399001592e-05, |
|
"loss": 0.0108, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 0.021932488307356834, |
|
"learning_rate": 2.333977835991545e-05, |
|
"loss": 0.0093, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.564814814814815, |
|
"grad_norm": 0.026496881619095802, |
|
"learning_rate": 2.3066906740740623e-05, |
|
"loss": 0.0118, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.564814814814815, |
|
"eval_loss": 0.01467986311763525, |
|
"eval_runtime": 9.1127, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.427, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.574074074074074, |
|
"grad_norm": 0.024211710318922997, |
|
"learning_rate": 2.2795160534570864e-05, |
|
"loss": 0.0086, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.5833333333333335, |
|
"grad_norm": 0.023977207019925117, |
|
"learning_rate": 2.25245510964597e-05, |
|
"loss": 0.0128, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.5925925925925926, |
|
"grad_norm": 0.02136526070535183, |
|
"learning_rate": 2.225508973396016e-05, |
|
"loss": 0.0121, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.601851851851852, |
|
"grad_norm": 0.026328187435865402, |
|
"learning_rate": 2.198678770665238e-05, |
|
"loss": 0.0108, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.611111111111111, |
|
"grad_norm": 0.02159940078854561, |
|
"learning_rate": 2.171965622567308e-05, |
|
"loss": 0.0082, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.611111111111111, |
|
"eval_loss": 0.014544774778187275, |
|
"eval_runtime": 9.1133, |
|
"eval_samples_per_second": 5.487, |
|
"eval_steps_per_second": 1.426, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.6203703703703702, |
|
"grad_norm": 0.02303987927734852, |
|
"learning_rate": 2.1453706453247087e-05, |
|
"loss": 0.0092, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.6296296296296298, |
|
"grad_norm": 0.027734337374567986, |
|
"learning_rate": 2.1188949502220983e-05, |
|
"loss": 0.0101, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.638888888888889, |
|
"grad_norm": 0.02069096453487873, |
|
"learning_rate": 2.0925396435598664e-05, |
|
"loss": 0.0111, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.648148148148148, |
|
"grad_norm": 0.02777431532740593, |
|
"learning_rate": 2.066305826607911e-05, |
|
"loss": 0.0091, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.6574074074074074, |
|
"grad_norm": 0.02333620935678482, |
|
"learning_rate": 2.0401945955596206e-05, |
|
"loss": 0.0112, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.6574074074074074, |
|
"eval_loss": 0.01460795197635889, |
|
"eval_runtime": 9.1059, |
|
"eval_samples_per_second": 5.491, |
|
"eval_steps_per_second": 1.428, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.6666666666666665, |
|
"grad_norm": 0.022142188623547554, |
|
"learning_rate": 2.0142070414860704e-05, |
|
"loss": 0.01, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.675925925925926, |
|
"grad_norm": 0.01749616675078869, |
|
"learning_rate": 1.9883442502904283e-05, |
|
"loss": 0.0095, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.685185185185185, |
|
"grad_norm": 0.02393367514014244, |
|
"learning_rate": 1.9626073026625818e-05, |
|
"loss": 0.0095, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.6944444444444446, |
|
"grad_norm": 0.023465050384402275, |
|
"learning_rate": 1.936997274033986e-05, |
|
"loss": 0.0108, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.7037037037037037, |
|
"grad_norm": 0.023157304152846336, |
|
"learning_rate": 1.9115152345327152e-05, |
|
"loss": 0.0086, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.7037037037037037, |
|
"eval_loss": 0.014902754686772823, |
|
"eval_runtime": 9.1616, |
|
"eval_samples_per_second": 5.458, |
|
"eval_steps_per_second": 1.419, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.712962962962963, |
|
"grad_norm": 0.021799901500344276, |
|
"learning_rate": 1.8861622489387555e-05, |
|
"loss": 0.0128, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.7222222222222223, |
|
"grad_norm": 0.03070679120719433, |
|
"learning_rate": 1.8609393766395085e-05, |
|
"loss": 0.0123, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.7314814814814814, |
|
"grad_norm": 0.02543518878519535, |
|
"learning_rate": 1.835847671585526e-05, |
|
"loss": 0.0114, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.7407407407407405, |
|
"grad_norm": 0.027585655450820923, |
|
"learning_rate": 1.8108881822464696e-05, |
|
"loss": 0.0099, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.02352389506995678, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.0102, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.014981208369135857, |
|
"eval_runtime": 9.1106, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.7592592592592595, |
|
"grad_norm": 0.02560283988714218, |
|
"learning_rate": 1.7613700169247056e-05, |
|
"loss": 0.012, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.7685185185185186, |
|
"grad_norm": 0.026089752092957497, |
|
"learning_rate": 1.7368134100837287e-05, |
|
"loss": 0.0088, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.7777777777777777, |
|
"grad_norm": 0.030365899205207825, |
|
"learning_rate": 1.7123931571546827e-05, |
|
"loss": 0.0119, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.787037037037037, |
|
"grad_norm": 0.031558796763420105, |
|
"learning_rate": 1.6881102785502616e-05, |
|
"loss": 0.011, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.7962962962962963, |
|
"grad_norm": 0.030366325750947, |
|
"learning_rate": 1.6639657889429018e-05, |
|
"loss": 0.0116, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.7962962962962963, |
|
"eval_loss": 0.014859426766633987, |
|
"eval_runtime": 9.1059, |
|
"eval_samples_per_second": 5.491, |
|
"eval_steps_per_second": 1.428, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.8055555555555554, |
|
"grad_norm": 0.025008074939250946, |
|
"learning_rate": 1.639960697222388e-05, |
|
"loss": 0.0106, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.814814814814815, |
|
"grad_norm": 0.028196556493639946, |
|
"learning_rate": 1.6160960064536908e-05, |
|
"loss": 0.0113, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.824074074074074, |
|
"grad_norm": 0.02165764756500721, |
|
"learning_rate": 1.592372713835055e-05, |
|
"loss": 0.0115, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.8333333333333335, |
|
"grad_norm": 0.020175475627183914, |
|
"learning_rate": 1.5687918106563326e-05, |
|
"loss": 0.0112, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.8425925925925926, |
|
"grad_norm": 0.027304671704769135, |
|
"learning_rate": 1.545354282257562e-05, |
|
"loss": 0.0126, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.8425925925925926, |
|
"eval_loss": 0.014735485427081585, |
|
"eval_runtime": 9.198, |
|
"eval_samples_per_second": 5.436, |
|
"eval_steps_per_second": 1.413, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.851851851851852, |
|
"grad_norm": 0.026429716497659683, |
|
"learning_rate": 1.52206110798779e-05, |
|
"loss": 0.0103, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.861111111111111, |
|
"grad_norm": 0.02409077063202858, |
|
"learning_rate": 1.4989132611641576e-05, |
|
"loss": 0.012, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.8703703703703702, |
|
"grad_norm": 0.02310461364686489, |
|
"learning_rate": 1.4759117090312197e-05, |
|
"loss": 0.0096, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.8796296296296298, |
|
"grad_norm": 0.026219584047794342, |
|
"learning_rate": 1.453057412720536e-05, |
|
"loss": 0.0094, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.888888888888889, |
|
"grad_norm": 0.027541201561689377, |
|
"learning_rate": 1.4303513272105057e-05, |
|
"loss": 0.0112, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.888888888888889, |
|
"eval_loss": 0.014594363048672676, |
|
"eval_runtime": 9.1304, |
|
"eval_samples_per_second": 5.476, |
|
"eval_steps_per_second": 1.424, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.898148148148148, |
|
"grad_norm": 0.024942217394709587, |
|
"learning_rate": 1.4077944012864636e-05, |
|
"loss": 0.0093, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.9074074074074074, |
|
"grad_norm": 0.018137283623218536, |
|
"learning_rate": 1.3853875775010355e-05, |
|
"loss": 0.0102, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.9166666666666665, |
|
"grad_norm": 0.021817779168486595, |
|
"learning_rate": 1.3631317921347563e-05, |
|
"loss": 0.0084, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.925925925925926, |
|
"grad_norm": 0.023799235001206398, |
|
"learning_rate": 1.3410279751569399e-05, |
|
"loss": 0.0122, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.935185185185185, |
|
"grad_norm": 0.030764896422624588, |
|
"learning_rate": 1.3190770501868243e-05, |
|
"loss": 0.0107, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.935185185185185, |
|
"eval_loss": 0.014631365425884724, |
|
"eval_runtime": 9.1149, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.9444444444444446, |
|
"grad_norm": 0.022886106744408607, |
|
"learning_rate": 1.297279934454978e-05, |
|
"loss": 0.0096, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.9537037037037037, |
|
"grad_norm": 0.03152737021446228, |
|
"learning_rate": 1.2756375387649716e-05, |
|
"loss": 0.0124, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.962962962962963, |
|
"grad_norm": 0.02872036211192608, |
|
"learning_rate": 1.25415076745532e-05, |
|
"loss": 0.0091, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.9722222222222223, |
|
"grad_norm": 0.021184636279940605, |
|
"learning_rate": 1.2328205183616965e-05, |
|
"loss": 0.0105, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.9814814814814814, |
|
"grad_norm": 0.02112959884107113, |
|
"learning_rate": 1.2116476827794104e-05, |
|
"loss": 0.0113, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.9814814814814814, |
|
"eval_loss": 0.01471536885946989, |
|
"eval_runtime": 9.116, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.9907407407407405, |
|
"grad_norm": 0.019945990294218063, |
|
"learning_rate": 1.1906331454261704e-05, |
|
"loss": 0.0093, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.023910805583000183, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.011, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 4.0092592592592595, |
|
"grad_norm": 0.01957758143544197, |
|
"learning_rate": 1.1490824711681025e-05, |
|
"loss": 0.0094, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 4.018518518518518, |
|
"grad_norm": 0.02563118375837803, |
|
"learning_rate": 1.1285480704793377e-05, |
|
"loss": 0.0093, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 4.027777777777778, |
|
"grad_norm": 0.026251764968037605, |
|
"learning_rate": 1.1081754403791999e-05, |
|
"loss": 0.0091, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 4.027777777777778, |
|
"eval_loss": 0.014734329655766487, |
|
"eval_runtime": 9.1592, |
|
"eval_samples_per_second": 5.459, |
|
"eval_steps_per_second": 1.419, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 4.037037037037037, |
|
"grad_norm": 0.025834446772933006, |
|
"learning_rate": 1.0879654321484012e-05, |
|
"loss": 0.0067, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 4.046296296296297, |
|
"grad_norm": 0.0185233224183321, |
|
"learning_rate": 1.0679188902724191e-05, |
|
"loss": 0.0108, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 4.055555555555555, |
|
"grad_norm": 0.021918736398220062, |
|
"learning_rate": 1.0480366524062042e-05, |
|
"loss": 0.0088, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 4.064814814814815, |
|
"grad_norm": 0.03142661973834038, |
|
"learning_rate": 1.0283195493391823e-05, |
|
"loss": 0.0103, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 4.074074074074074, |
|
"grad_norm": 0.023410873487591743, |
|
"learning_rate": 1.008768404960535e-05, |
|
"loss": 0.0094, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.074074074074074, |
|
"eval_loss": 0.014965096488595009, |
|
"eval_runtime": 9.1135, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.083333333333333, |
|
"grad_norm": 0.02943902276456356, |
|
"learning_rate": 9.893840362247809e-06, |
|
"loss": 0.0056, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 4.092592592592593, |
|
"grad_norm": 0.021431270986795425, |
|
"learning_rate": 9.701672531176286e-06, |
|
"loss": 0.0089, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 4.101851851851852, |
|
"grad_norm": 0.02797669917345047, |
|
"learning_rate": 9.511188586221376e-06, |
|
"loss": 0.0092, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 4.111111111111111, |
|
"grad_norm": 0.02437691204249859, |
|
"learning_rate": 9.322396486851626e-06, |
|
"loss": 0.0104, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 4.12037037037037, |
|
"grad_norm": 0.024811841547489166, |
|
"learning_rate": 9.135304121840976e-06, |
|
"loss": 0.0096, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.12037037037037, |
|
"eval_loss": 0.014996801503002644, |
|
"eval_runtime": 9.1094, |
|
"eval_samples_per_second": 5.489, |
|
"eval_steps_per_second": 1.427, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.12962962962963, |
|
"grad_norm": 0.0309213325381279, |
|
"learning_rate": 8.949919308939082e-06, |
|
"loss": 0.0109, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 4.138888888888889, |
|
"grad_norm": 0.023763932287693024, |
|
"learning_rate": 8.766249794544662e-06, |
|
"loss": 0.0073, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 4.148148148148148, |
|
"grad_norm": 0.023741643875837326, |
|
"learning_rate": 8.584303253381847e-06, |
|
"loss": 0.0105, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 4.157407407407407, |
|
"grad_norm": 0.02090543322265148, |
|
"learning_rate": 8.404087288179424e-06, |
|
"loss": 0.0096, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 0.026315612718462944, |
|
"learning_rate": 8.225609429353187e-06, |
|
"loss": 0.0091, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"eval_loss": 0.015186839736998081, |
|
"eval_runtime": 9.1241, |
|
"eval_samples_per_second": 5.48, |
|
"eval_steps_per_second": 1.425, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.175925925925926, |
|
"grad_norm": 0.023099206387996674, |
|
"learning_rate": 8.048877134691268e-06, |
|
"loss": 0.0091, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 4.185185185185185, |
|
"grad_norm": 0.027901167050004005, |
|
"learning_rate": 7.873897789042523e-06, |
|
"loss": 0.0092, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 4.194444444444445, |
|
"grad_norm": 0.025486482307314873, |
|
"learning_rate": 7.700678704007947e-06, |
|
"loss": 0.0077, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 4.203703703703703, |
|
"grad_norm": 0.0233286302536726, |
|
"learning_rate": 7.529227117635135e-06, |
|
"loss": 0.0077, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 4.212962962962963, |
|
"grad_norm": 0.023314587771892548, |
|
"learning_rate": 7.35955019411585e-06, |
|
"loss": 0.0089, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.212962962962963, |
|
"eval_loss": 0.015497377142310143, |
|
"eval_runtime": 9.1064, |
|
"eval_samples_per_second": 5.491, |
|
"eval_steps_per_second": 1.428, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.222222222222222, |
|
"grad_norm": 0.021640775725245476, |
|
"learning_rate": 7.191655023486682e-06, |
|
"loss": 0.01, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 4.231481481481482, |
|
"grad_norm": 0.027831410989165306, |
|
"learning_rate": 7.02554862133275e-06, |
|
"loss": 0.0105, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 4.2407407407407405, |
|
"grad_norm": 0.023242153227329254, |
|
"learning_rate": 6.861237928494579e-06, |
|
"loss": 0.009, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.02775505743920803, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 0.0102, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 4.2592592592592595, |
|
"grad_norm": 0.0267843846231699, |
|
"learning_rate": 6.53803105866761e-06, |
|
"loss": 0.0063, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.2592592592592595, |
|
"eval_loss": 0.01563325710594654, |
|
"eval_runtime": 9.111, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.268518518518518, |
|
"grad_norm": 0.02488654851913452, |
|
"learning_rate": 6.379148387042316e-06, |
|
"loss": 0.01, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 4.277777777777778, |
|
"grad_norm": 0.024208445101976395, |
|
"learning_rate": 6.222088434895462e-06, |
|
"loss": 0.0072, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 4.287037037037037, |
|
"grad_norm": 0.023147890344262123, |
|
"learning_rate": 6.066857765057055e-06, |
|
"loss": 0.0088, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 4.296296296296296, |
|
"grad_norm": 0.029451172798871994, |
|
"learning_rate": 5.9134628639196e-06, |
|
"loss": 0.0085, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 4.305555555555555, |
|
"grad_norm": 0.02764413133263588, |
|
"learning_rate": 5.7619101411671095e-06, |
|
"loss": 0.0099, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.305555555555555, |
|
"eval_loss": 0.015693385154008865, |
|
"eval_runtime": 9.1176, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 1.426, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.314814814814815, |
|
"grad_norm": 0.021906448528170586, |
|
"learning_rate": 5.6122059295072085e-06, |
|
"loss": 0.0096, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 4.324074074074074, |
|
"grad_norm": 0.02385389618575573, |
|
"learning_rate": 5.464356484406535e-06, |
|
"loss": 0.0072, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 4.333333333333333, |
|
"grad_norm": 0.026357507333159447, |
|
"learning_rate": 5.318367983829392e-06, |
|
"loss": 0.0079, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 4.342592592592593, |
|
"grad_norm": 0.026002187281847, |
|
"learning_rate": 5.174246527979531e-06, |
|
"loss": 0.0095, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 4.351851851851852, |
|
"grad_norm": 0.02679777517914772, |
|
"learning_rate": 5.031998139045352e-06, |
|
"loss": 0.0085, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.351851851851852, |
|
"eval_loss": 0.015615792945027351, |
|
"eval_runtime": 9.1365, |
|
"eval_samples_per_second": 5.473, |
|
"eval_steps_per_second": 1.423, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.361111111111111, |
|
"grad_norm": 0.023431269451975822, |
|
"learning_rate": 4.891628760948114e-06, |
|
"loss": 0.009, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 4.37037037037037, |
|
"grad_norm": 0.02848837524652481, |
|
"learning_rate": 4.7531442590937335e-06, |
|
"loss": 0.0102, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 4.37962962962963, |
|
"grad_norm": 0.026586227118968964, |
|
"learning_rate": 4.616550420127563e-06, |
|
"loss": 0.0078, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 4.388888888888889, |
|
"grad_norm": 0.025660747662186623, |
|
"learning_rate": 4.4818529516926726e-06, |
|
"loss": 0.0086, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 4.398148148148148, |
|
"grad_norm": 0.02436869405210018, |
|
"learning_rate": 4.349057482191299e-06, |
|
"loss": 0.011, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.398148148148148, |
|
"eval_loss": 0.015554042533040047, |
|
"eval_runtime": 9.1142, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.407407407407407, |
|
"grad_norm": 0.02513139322400093, |
|
"learning_rate": 4.218169560549706e-06, |
|
"loss": 0.0108, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 4.416666666666667, |
|
"grad_norm": 0.027343349531292915, |
|
"learning_rate": 4.089194655986306e-06, |
|
"loss": 0.0099, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 4.425925925925926, |
|
"grad_norm": 0.02374204248189926, |
|
"learning_rate": 3.962138157783085e-06, |
|
"loss": 0.0095, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 4.435185185185185, |
|
"grad_norm": 0.04114212468266487, |
|
"learning_rate": 3.837005375060482e-06, |
|
"loss": 0.0089, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.024016965180635452, |
|
"learning_rate": 3.7138015365554833e-06, |
|
"loss": 0.0067, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"eval_loss": 0.01539613213390112, |
|
"eval_runtime": 9.1246, |
|
"eval_samples_per_second": 5.48, |
|
"eval_steps_per_second": 1.425, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.453703703703704, |
|
"grad_norm": 0.02901994250714779, |
|
"learning_rate": 3.5925317904031587e-06, |
|
"loss": 0.0087, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 4.462962962962963, |
|
"grad_norm": 0.020981522276997566, |
|
"learning_rate": 3.4732012039215776e-06, |
|
"loss": 0.011, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 4.472222222222222, |
|
"grad_norm": 0.023783011361956596, |
|
"learning_rate": 3.3558147633999728e-06, |
|
"loss": 0.0096, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 4.481481481481482, |
|
"grad_norm": 0.02081628330051899, |
|
"learning_rate": 3.2403773738905187e-06, |
|
"loss": 0.0087, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 4.4907407407407405, |
|
"grad_norm": 0.024986054748296738, |
|
"learning_rate": 3.126893859003249e-06, |
|
"loss": 0.0092, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 4.4907407407407405, |
|
"eval_loss": 0.015287145972251892, |
|
"eval_runtime": 9.1097, |
|
"eval_samples_per_second": 5.489, |
|
"eval_steps_per_second": 1.427, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.032323963940143585, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.0086, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 4.5092592592592595, |
|
"grad_norm": 0.02963520959019661, |
|
"learning_rate": 2.9058073391191375e-06, |
|
"loss": 0.0068, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 4.518518518518518, |
|
"grad_norm": 0.035344675183296204, |
|
"learning_rate": 2.798213572335001e-06, |
|
"loss": 0.0062, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 4.527777777777778, |
|
"grad_norm": 0.026800939813256264, |
|
"learning_rate": 2.692592156212487e-06, |
|
"loss": 0.0092, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 4.537037037037037, |
|
"grad_norm": 0.024116506800055504, |
|
"learning_rate": 2.5889475041961765e-06, |
|
"loss": 0.0072, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.537037037037037, |
|
"eval_loss": 0.015211592428386211, |
|
"eval_runtime": 9.1184, |
|
"eval_samples_per_second": 5.483, |
|
"eval_steps_per_second": 1.426, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.546296296296296, |
|
"grad_norm": 0.027498748153448105, |
|
"learning_rate": 2.4872839471306084e-06, |
|
"loss": 0.0082, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 4.555555555555555, |
|
"grad_norm": 0.026998436078429222, |
|
"learning_rate": 2.3876057330792346e-06, |
|
"loss": 0.008, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 4.564814814814815, |
|
"grad_norm": 0.023703446611762047, |
|
"learning_rate": 2.2899170271469428e-06, |
|
"loss": 0.011, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 4.574074074074074, |
|
"grad_norm": 0.019968930631875992, |
|
"learning_rate": 2.1942219113060212e-06, |
|
"loss": 0.0075, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 4.583333333333333, |
|
"grad_norm": 0.02214980125427246, |
|
"learning_rate": 2.100524384225555e-06, |
|
"loss": 0.0078, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.583333333333333, |
|
"eval_loss": 0.015181516297161579, |
|
"eval_runtime": 9.1214, |
|
"eval_samples_per_second": 5.482, |
|
"eval_steps_per_second": 1.425, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.592592592592593, |
|
"grad_norm": 0.025330157950520515, |
|
"learning_rate": 2.0088283611044036e-06, |
|
"loss": 0.0062, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 4.601851851851852, |
|
"grad_norm": 0.019013626500964165, |
|
"learning_rate": 1.9191376735075427e-06, |
|
"loss": 0.0088, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 4.611111111111111, |
|
"grad_norm": 0.022145694121718407, |
|
"learning_rate": 1.8314560692059835e-06, |
|
"loss": 0.0089, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 4.62037037037037, |
|
"grad_norm": 0.023724934086203575, |
|
"learning_rate": 1.7457872120201779e-06, |
|
"loss": 0.0086, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.62962962962963, |
|
"grad_norm": 0.020578699186444283, |
|
"learning_rate": 1.6621346816668992e-06, |
|
"loss": 0.0091, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.62962962962963, |
|
"eval_loss": 0.015207822434604168, |
|
"eval_runtime": 9.1136, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.638888888888889, |
|
"grad_norm": 0.024306217208504677, |
|
"learning_rate": 1.5805019736097104e-06, |
|
"loss": 0.009, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 4.648148148148148, |
|
"grad_norm": 0.020744021981954575, |
|
"learning_rate": 1.5008924989128258e-06, |
|
"loss": 0.0089, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 4.657407407407407, |
|
"grad_norm": 0.02516799047589302, |
|
"learning_rate": 1.4233095840986753e-06, |
|
"loss": 0.0093, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 4.666666666666667, |
|
"grad_norm": 0.024567998945713043, |
|
"learning_rate": 1.3477564710088098e-06, |
|
"loss": 0.0094, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 4.675925925925926, |
|
"grad_norm": 0.024358859285712242, |
|
"learning_rate": 1.2742363166685034e-06, |
|
"loss": 0.007, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.675925925925926, |
|
"eval_loss": 0.015200878493487835, |
|
"eval_runtime": 9.1155, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.685185185185185, |
|
"grad_norm": 0.023163504898548126, |
|
"learning_rate": 1.2027521931548214e-06, |
|
"loss": 0.0074, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 4.694444444444445, |
|
"grad_norm": 0.023604586720466614, |
|
"learning_rate": 1.1333070874682216e-06, |
|
"loss": 0.0093, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 4.703703703703704, |
|
"grad_norm": 0.02068418823182583, |
|
"learning_rate": 1.0659039014077944e-06, |
|
"loss": 0.0084, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 4.712962962962963, |
|
"grad_norm": 0.02598651312291622, |
|
"learning_rate": 1.0005454514499414e-06, |
|
"loss": 0.0088, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 4.722222222222222, |
|
"grad_norm": 0.02512424811720848, |
|
"learning_rate": 9.372344686307655e-07, |
|
"loss": 0.0064, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.722222222222222, |
|
"eval_loss": 0.01521637849509716, |
|
"eval_runtime": 9.1143, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.731481481481482, |
|
"grad_norm": 0.021041063591837883, |
|
"learning_rate": 8.759735984318895e-07, |
|
"loss": 0.0096, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 4.7407407407407405, |
|
"grad_norm": 0.025718161836266518, |
|
"learning_rate": 8.167654006699443e-07, |
|
"loss": 0.0077, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.02913082391023636, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 0.0072, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 4.7592592592592595, |
|
"grad_norm": 0.026588505133986473, |
|
"learning_rate": 7.04516832760177e-07, |
|
"loss": 0.0094, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 4.768518518518518, |
|
"grad_norm": 0.023728126659989357, |
|
"learning_rate": 6.514811529758747e-07, |
|
"loss": 0.0099, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.768518518518518, |
|
"eval_loss": 0.01521516963839531, |
|
"eval_runtime": 9.1511, |
|
"eval_samples_per_second": 5.464, |
|
"eval_steps_per_second": 1.421, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.777777777777778, |
|
"grad_norm": 0.03438512608408928, |
|
"learning_rate": 6.005075261595494e-07, |
|
"loss": 0.0086, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 4.787037037037037, |
|
"grad_norm": 0.019554298371076584, |
|
"learning_rate": 5.515980822701439e-07, |
|
"loss": 0.0092, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 4.796296296296296, |
|
"grad_norm": 0.0235204566270113, |
|
"learning_rate": 5.047548650136513e-07, |
|
"loss": 0.009, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 4.805555555555555, |
|
"grad_norm": 0.023747643455863, |
|
"learning_rate": 4.5997983175773417e-07, |
|
"loss": 0.0092, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 4.814814814814815, |
|
"grad_norm": 0.02751827985048294, |
|
"learning_rate": 4.1727485344994486e-07, |
|
"loss": 0.0088, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.814814814814815, |
|
"eval_loss": 0.015235532075166702, |
|
"eval_runtime": 9.1256, |
|
"eval_samples_per_second": 5.479, |
|
"eval_steps_per_second": 1.425, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.824074074074074, |
|
"grad_norm": 0.026621591299772263, |
|
"learning_rate": 3.766417145395218e-07, |
|
"loss": 0.0086, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 4.833333333333333, |
|
"grad_norm": 0.01991841197013855, |
|
"learning_rate": 3.380821129028489e-07, |
|
"loss": 0.0084, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 4.842592592592593, |
|
"grad_norm": 0.023508219048380852, |
|
"learning_rate": 3.0159765977250673e-07, |
|
"loss": 0.0103, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 4.851851851851852, |
|
"grad_norm": 0.02976732887327671, |
|
"learning_rate": 2.671898796699268e-07, |
|
"loss": 0.0084, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 4.861111111111111, |
|
"grad_norm": 0.02255621738731861, |
|
"learning_rate": 2.3486021034170857e-07, |
|
"loss": 0.0089, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.861111111111111, |
|
"eval_loss": 0.015216498635709286, |
|
"eval_runtime": 9.1106, |
|
"eval_samples_per_second": 5.488, |
|
"eval_steps_per_second": 1.427, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.87037037037037, |
|
"grad_norm": 0.025215914472937584, |
|
"learning_rate": 2.0461000269953456e-07, |
|
"loss": 0.0075, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 4.87962962962963, |
|
"grad_norm": 0.02554066851735115, |
|
"learning_rate": 1.7644052076371542e-07, |
|
"loss": 0.0083, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 0.02162836864590645, |
|
"learning_rate": 1.503529416103988e-07, |
|
"loss": 0.009, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 4.898148148148148, |
|
"grad_norm": 0.02335723116993904, |
|
"learning_rate": 1.2634835532233657e-07, |
|
"loss": 0.0093, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 4.907407407407407, |
|
"grad_norm": 0.02844967506825924, |
|
"learning_rate": 1.044277649433989e-07, |
|
"loss": 0.0083, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.907407407407407, |
|
"eval_loss": 0.015229844488203526, |
|
"eval_runtime": 9.1406, |
|
"eval_samples_per_second": 5.47, |
|
"eval_steps_per_second": 1.422, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.916666666666667, |
|
"grad_norm": 0.02188325859606266, |
|
"learning_rate": 8.459208643659122e-08, |
|
"loss": 0.0084, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 4.925925925925926, |
|
"grad_norm": 0.026782654225826263, |
|
"learning_rate": 6.684214864584038e-08, |
|
"loss": 0.009, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 4.935185185185185, |
|
"grad_norm": 0.024010982364416122, |
|
"learning_rate": 5.11786932613223e-08, |
|
"loss": 0.0055, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 4.944444444444445, |
|
"grad_norm": 0.02621973119676113, |
|
"learning_rate": 3.760237478849793e-08, |
|
"loss": 0.0093, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 4.953703703703704, |
|
"grad_norm": 0.02257387712597847, |
|
"learning_rate": 2.6113760520735108e-08, |
|
"loss": 0.0103, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.953703703703704, |
|
"eval_loss": 0.015256751328706741, |
|
"eval_runtime": 9.1156, |
|
"eval_samples_per_second": 5.485, |
|
"eval_steps_per_second": 1.426, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.962962962962963, |
|
"grad_norm": 0.02289225161075592, |
|
"learning_rate": 1.6713330515627513e-08, |
|
"loss": 0.0106, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 4.972222222222222, |
|
"grad_norm": 0.032289694994688034, |
|
"learning_rate": 9.401477574932926e-09, |
|
"loss": 0.0074, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 4.981481481481482, |
|
"grad_norm": 0.0215620007365942, |
|
"learning_rate": 4.178507228136397e-09, |
|
"loss": 0.0082, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 4.9907407407407405, |
|
"grad_norm": 0.02391226962208748, |
|
"learning_rate": 1.0446377197104173e-09, |
|
"loss": 0.0085, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.0241775494068861, |
|
"learning_rate": 0.0, |
|
"loss": 0.0092, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.01526525616645813, |
|
"eval_runtime": 9.1149, |
|
"eval_samples_per_second": 5.486, |
|
"eval_steps_per_second": 1.426, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 540, |
|
"total_flos": 1.2254685925518213e+18, |
|
"train_loss": 0.016027936152251506, |
|
"train_runtime": 9839.9649, |
|
"train_samples_per_second": 1.756, |
|
"train_steps_per_second": 0.055 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 540, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2254685925518213e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|