diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,4685 +1,7553 @@ { - "best_metric": 0.014902754686772823, - "best_model_checkpoint": "/home/paperspace/Data/models/akoul_whitehorseliquidity_25c/llm3br256/checkpoint-400", - "epoch": 5.0, - "eval_steps": 5, - "global_step": 540, + "best_metric": 0.0008496911614201963, + "best_model_checkpoint": "/home/paperspace/Data/models/akoul_whitehorseliquidity_25c/llm3br256/checkpoint-900", + "epoch": 3.313131313131313, + "eval_steps": 25, + "global_step": 1025, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.009259259259259259, - "grad_norm": 0.29716095328330994, - "learning_rate": 1.8518518518518519e-06, - "loss": 0.1002, + "epoch": 0.0032323232323232323, + "grad_norm": 0.03562889248132706, + "learning_rate": 1.2936610608020701e-07, + "loss": 0.0088, "step": 1 }, { - "epoch": 0.018518518518518517, - "grad_norm": 0.2648535370826721, - "learning_rate": 3.7037037037037037e-06, - "loss": 0.0936, + "epoch": 0.006464646464646465, + "grad_norm": 0.03539334237575531, + "learning_rate": 2.5873221216041403e-07, + "loss": 0.0098, "step": 2 }, { - "epoch": 0.027777777777777776, - "grad_norm": 0.24819649755954742, - "learning_rate": 5.555555555555556e-06, - "loss": 0.0898, + "epoch": 0.009696969696969697, + "grad_norm": 0.03888387605547905, + "learning_rate": 3.8809831824062096e-07, + "loss": 0.0103, "step": 3 }, { - "epoch": 0.037037037037037035, - "grad_norm": 0.23442289233207703, - "learning_rate": 7.4074074074074075e-06, - "loss": 0.087, + "epoch": 0.01292929292929293, + "grad_norm": 0.03283314406871796, + "learning_rate": 5.174644243208281e-07, + "loss": 0.0123, "step": 4 }, { - "epoch": 0.046296296296296294, - "grad_norm": 0.26300737261772156, - "learning_rate": 9.259259259259259e-06, - "loss": 0.0904, - "step": 5 - }, - { - "epoch": 0.046296296296296294, - "eval_loss": 0.0950983464717865, - "eval_runtime": 11.9584, - "eval_samples_per_second": 4.181, - "eval_steps_per_second": 1.087, + "epoch": 0.01616161616161616, + "grad_norm": 0.033392902463674545, + "learning_rate": 6.468305304010349e-07, + "loss": 0.0104, "step": 5 }, { - "epoch": 0.05555555555555555, - "grad_norm": 0.18399731814861298, - "learning_rate": 1.1111111111111112e-05, - "loss": 0.0805, + "epoch": 0.019393939393939394, + "grad_norm": 0.041360873728990555, + "learning_rate": 7.761966364812419e-07, + "loss": 0.0135, "step": 6 }, { - "epoch": 0.06481481481481481, - "grad_norm": 0.19827856123447418, - "learning_rate": 1.2962962962962962e-05, - "loss": 0.0782, + "epoch": 0.022626262626262626, + "grad_norm": 0.03297152742743492, + "learning_rate": 9.055627425614489e-07, + "loss": 0.01, "step": 7 }, { - "epoch": 0.07407407407407407, - "grad_norm": 0.13050280511379242, - "learning_rate": 1.4814814814814815e-05, - "loss": 0.0636, + "epoch": 0.02585858585858586, + "grad_norm": 0.03385542333126068, + "learning_rate": 1.0349288486416561e-06, + "loss": 0.0092, "step": 8 }, { - "epoch": 0.08333333333333333, - "grad_norm": 0.12110771238803864, - "learning_rate": 1.6666666666666667e-05, - "loss": 0.056, + "epoch": 0.02909090909090909, + "grad_norm": 0.038460493087768555, + "learning_rate": 1.164294954721863e-06, + "loss": 0.0132, "step": 9 }, { - "epoch": 0.09259259259259259, - "grad_norm": 0.1111820638179779, - "learning_rate": 1.8518518518518518e-05, - "loss": 0.053, - "step": 10 - }, - { - "epoch": 0.09259259259259259, - "eval_loss": 0.04887561500072479, - "eval_runtime": 9.1057, - "eval_samples_per_second": 5.491, - "eval_steps_per_second": 1.428, + "epoch": 0.03232323232323232, + "grad_norm": 0.030043406412005424, + "learning_rate": 1.2936610608020699e-06, + "loss": 0.0088, "step": 10 }, { - "epoch": 0.10185185185185185, - "grad_norm": 0.0779903382062912, - "learning_rate": 2.037037037037037e-05, - "loss": 0.0538, + "epoch": 0.035555555555555556, + "grad_norm": 0.039398446679115295, + "learning_rate": 1.423027166882277e-06, + "loss": 0.0118, "step": 11 }, { - "epoch": 0.1111111111111111, - "grad_norm": 0.08193033933639526, - "learning_rate": 2.2222222222222223e-05, - "loss": 0.0398, + "epoch": 0.03878787878787879, + "grad_norm": 0.03809528797864914, + "learning_rate": 1.5523932729624839e-06, + "loss": 0.0146, "step": 12 }, { - "epoch": 0.12037037037037036, - "grad_norm": 0.0821649506688118, - "learning_rate": 2.4074074074074074e-05, - "loss": 0.0473, + "epoch": 0.04202020202020202, + "grad_norm": 0.0388377383351326, + "learning_rate": 1.6817593790426907e-06, + "loss": 0.0102, "step": 13 }, { - "epoch": 0.12962962962962962, - "grad_norm": 0.07107188552618027, - "learning_rate": 2.5925925925925925e-05, - "loss": 0.0386, + "epoch": 0.04525252525252525, + "grad_norm": 0.043977439403533936, + "learning_rate": 1.8111254851228978e-06, + "loss": 0.0099, "step": 14 }, { - "epoch": 0.1388888888888889, - "grad_norm": 0.05971238389611244, - "learning_rate": 2.777777777777778e-05, - "loss": 0.0417, - "step": 15 - }, - { - "epoch": 0.1388888888888889, - "eval_loss": 0.04156189784407616, - "eval_runtime": 9.1211, - "eval_samples_per_second": 5.482, - "eval_steps_per_second": 1.425, + "epoch": 0.048484848484848485, + "grad_norm": 0.036720160394907, + "learning_rate": 1.9404915912031045e-06, + "loss": 0.0143, "step": 15 }, { - "epoch": 0.14814814814814814, - "grad_norm": 0.05262186750769615, - "learning_rate": 2.962962962962963e-05, - "loss": 0.0384, + "epoch": 0.05171717171717172, + "grad_norm": 0.03513036668300629, + "learning_rate": 2.0698576972833122e-06, + "loss": 0.0092, "step": 16 }, { - "epoch": 0.1574074074074074, - "grad_norm": 0.05361900106072426, - "learning_rate": 3.148148148148148e-05, - "loss": 0.0378, + "epoch": 0.05494949494949495, + "grad_norm": 0.03713906928896904, + "learning_rate": 2.199223803363519e-06, + "loss": 0.0144, "step": 17 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.05355929210782051, - "learning_rate": 3.3333333333333335e-05, - "loss": 0.0399, + "epoch": 0.05818181818181818, + "grad_norm": 0.0338338240981102, + "learning_rate": 2.328589909443726e-06, + "loss": 0.0133, "step": 18 }, { - "epoch": 0.17592592592592593, - "grad_norm": 0.04563885182142258, - "learning_rate": 3.518518518518519e-05, - "loss": 0.0368, + "epoch": 0.061414141414141414, + "grad_norm": 0.0263836532831192, + "learning_rate": 2.457956015523933e-06, + "loss": 0.0113, "step": 19 }, { - "epoch": 0.18518518518518517, - "grad_norm": 0.060624465346336365, - "learning_rate": 3.7037037037037037e-05, - "loss": 0.0396, - "step": 20 - }, - { - "epoch": 0.18518518518518517, - "eval_loss": 0.03584723547101021, - "eval_runtime": 9.1162, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 0.06464646464646465, + "grad_norm": 0.01817137934267521, + "learning_rate": 2.5873221216041398e-06, + "loss": 0.0074, "step": 20 }, { - "epoch": 0.19444444444444445, - "grad_norm": 0.0525534488260746, - "learning_rate": 3.888888888888889e-05, - "loss": 0.0364, + "epoch": 0.06787878787878789, + "grad_norm": 0.018132060766220093, + "learning_rate": 2.716688227684347e-06, + "loss": 0.008, "step": 21 }, { - "epoch": 0.2037037037037037, - "grad_norm": 0.041657958179712296, - "learning_rate": 4.074074074074074e-05, - "loss": 0.034, + "epoch": 0.07111111111111111, + "grad_norm": 0.02117246761918068, + "learning_rate": 2.846054333764554e-06, + "loss": 0.0091, "step": 22 }, { - "epoch": 0.21296296296296297, - "grad_norm": 0.04589791223406792, - "learning_rate": 4.259259259259259e-05, - "loss": 0.0317, + "epoch": 0.07434343434343435, + "grad_norm": 0.01761500909924507, + "learning_rate": 2.975420439844761e-06, + "loss": 0.008, "step": 23 }, { - "epoch": 0.2222222222222222, - "grad_norm": 0.04220304638147354, - "learning_rate": 4.4444444444444447e-05, - "loss": 0.0339, + "epoch": 0.07757575757575758, + "grad_norm": 0.01908932812511921, + "learning_rate": 3.1047865459249677e-06, + "loss": 0.008, "step": 24 }, { - "epoch": 0.23148148148148148, - "grad_norm": 0.03630352392792702, - "learning_rate": 4.62962962962963e-05, - "loss": 0.029, + "epoch": 0.08080808080808081, + "grad_norm": 0.01619753986597061, + "learning_rate": 3.234152652005175e-06, + "loss": 0.0078, "step": 25 }, { - "epoch": 0.23148148148148148, - "eval_loss": 0.03286580368876457, - "eval_runtime": 9.1191, - "eval_samples_per_second": 5.483, - "eval_steps_per_second": 1.426, + "epoch": 0.08080808080808081, + "eval_loss": 0.007939654402434826, + "eval_runtime": 20.513, + "eval_samples_per_second": 4.875, + "eval_steps_per_second": 1.219, "step": 25 }, { - "epoch": 0.24074074074074073, - "grad_norm": 0.04235522821545601, - "learning_rate": 4.814814814814815e-05, - "loss": 0.0326, + "epoch": 0.08404040404040404, + "grad_norm": 0.01418386958539486, + "learning_rate": 3.3635187580853815e-06, + "loss": 0.0067, "step": 26 }, { - "epoch": 0.25, - "grad_norm": 0.04675336927175522, - "learning_rate": 5e-05, - "loss": 0.03, + "epoch": 0.08727272727272728, + "grad_norm": 0.018947051838040352, + "learning_rate": 3.492884864165589e-06, + "loss": 0.0086, "step": 27 }, { - "epoch": 0.25925925925925924, - "grad_norm": 0.039461418986320496, - "learning_rate": 5.185185185185185e-05, - "loss": 0.0328, + "epoch": 0.0905050505050505, + "grad_norm": 0.016921700909733772, + "learning_rate": 3.6222509702457957e-06, + "loss": 0.0085, "step": 28 }, { - "epoch": 0.26851851851851855, - "grad_norm": 0.044042930006980896, - "learning_rate": 5.370370370370371e-05, - "loss": 0.0294, + "epoch": 0.09373737373737374, + "grad_norm": 0.01551737543195486, + "learning_rate": 3.751617076326003e-06, + "loss": 0.0084, "step": 29 }, { - "epoch": 0.2777777777777778, - "grad_norm": 0.044502489268779755, - "learning_rate": 5.555555555555556e-05, - "loss": 0.0311, - "step": 30 - }, - { - "epoch": 0.2777777777777778, - "eval_loss": 0.030865700915455818, - "eval_runtime": 9.1099, - "eval_samples_per_second": 5.489, - "eval_steps_per_second": 1.427, + "epoch": 0.09696969696969697, + "grad_norm": 0.014143792912364006, + "learning_rate": 3.880983182406209e-06, + "loss": 0.0078, "step": 30 }, { - "epoch": 0.28703703703703703, - "grad_norm": 0.04979817569255829, - "learning_rate": 5.740740740740741e-05, - "loss": 0.0292, + "epoch": 0.10020202020202021, + "grad_norm": 0.011397017166018486, + "learning_rate": 4.010349288486417e-06, + "loss": 0.0053, "step": 31 }, { - "epoch": 0.2962962962962963, - "grad_norm": 0.04573828727006912, - "learning_rate": 5.925925925925926e-05, - "loss": 0.0346, + "epoch": 0.10343434343434343, + "grad_norm": 0.015161341056227684, + "learning_rate": 4.1397153945666245e-06, + "loss": 0.0073, "step": 32 }, { - "epoch": 0.3055555555555556, - "grad_norm": 0.0410350002348423, - "learning_rate": 6.111111111111112e-05, - "loss": 0.0295, + "epoch": 0.10666666666666667, + "grad_norm": 0.012180107645690441, + "learning_rate": 4.2690815006468305e-06, + "loss": 0.0063, "step": 33 }, { - "epoch": 0.3148148148148148, - "grad_norm": 0.0416686087846756, - "learning_rate": 6.296296296296296e-05, - "loss": 0.0267, + "epoch": 0.1098989898989899, + "grad_norm": 0.012770959176123142, + "learning_rate": 4.398447606727038e-06, + "loss": 0.0079, "step": 34 }, { - "epoch": 0.32407407407407407, - "grad_norm": 0.042319901287555695, - "learning_rate": 6.481481481481482e-05, - "loss": 0.0295, - "step": 35 - }, - { - "epoch": 0.32407407407407407, - "eval_loss": 0.028042705729603767, - "eval_runtime": 9.1376, - "eval_samples_per_second": 5.472, - "eval_steps_per_second": 1.423, + "epoch": 0.11313131313131314, + "grad_norm": 0.014727453701198101, + "learning_rate": 4.527813712807244e-06, + "loss": 0.0122, "step": 35 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.037845220416784286, - "learning_rate": 6.666666666666667e-05, - "loss": 0.0319, + "epoch": 0.11636363636363636, + "grad_norm": 0.012778300791978836, + "learning_rate": 4.657179818887452e-06, + "loss": 0.0071, "step": 36 }, { - "epoch": 0.3425925925925926, - "grad_norm": 0.03568718954920769, - "learning_rate": 6.851851851851852e-05, - "loss": 0.0346, + "epoch": 0.1195959595959596, + "grad_norm": 0.012685113586485386, + "learning_rate": 4.786545924967659e-06, + "loss": 0.0065, "step": 37 }, { - "epoch": 0.35185185185185186, - "grad_norm": 0.037281136959791183, - "learning_rate": 7.037037037037038e-05, - "loss": 0.031, + "epoch": 0.12282828282828283, + "grad_norm": 0.014156874269247055, + "learning_rate": 4.915912031047866e-06, + "loss": 0.0082, "step": 38 }, { - "epoch": 0.3611111111111111, - "grad_norm": 0.03607446327805519, - "learning_rate": 7.222222222222222e-05, - "loss": 0.0335, + "epoch": 0.12606060606060607, + "grad_norm": 0.01115910243242979, + "learning_rate": 5.045278137128073e-06, + "loss": 0.007, "step": 39 }, { - "epoch": 0.37037037037037035, - "grad_norm": 0.03654631972312927, - "learning_rate": 7.407407407407407e-05, - "loss": 0.0262, - "step": 40 - }, - { - "epoch": 0.37037037037037035, - "eval_loss": 0.026602942496538162, - "eval_runtime": 9.1124, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.427, + "epoch": 0.1292929292929293, + "grad_norm": 0.014089164324104786, + "learning_rate": 5.1746442432082795e-06, + "loss": 0.0073, "step": 40 }, { - "epoch": 0.37962962962962965, - "grad_norm": 0.039490777999162674, - "learning_rate": 7.592592592592593e-05, - "loss": 0.0252, + "epoch": 0.13252525252525252, + "grad_norm": 0.013440214097499847, + "learning_rate": 5.304010349288486e-06, + "loss": 0.0082, "step": 41 }, { - "epoch": 0.3888888888888889, - "grad_norm": 0.036680739372968674, - "learning_rate": 7.777777777777778e-05, - "loss": 0.0242, + "epoch": 0.13575757575757577, + "grad_norm": 0.01083499938249588, + "learning_rate": 5.433376455368694e-06, + "loss": 0.006, "step": 42 }, { - "epoch": 0.39814814814814814, - "grad_norm": 0.040739599615335464, - "learning_rate": 7.962962962962964e-05, - "loss": 0.025, + "epoch": 0.138989898989899, + "grad_norm": 0.011082631535828114, + "learning_rate": 5.5627425614489e-06, + "loss": 0.007, "step": 43 }, { - "epoch": 0.4074074074074074, - "grad_norm": 0.04679260402917862, - "learning_rate": 8.148148148148148e-05, - "loss": 0.0212, + "epoch": 0.14222222222222222, + "grad_norm": 0.012490961700677872, + "learning_rate": 5.692108667529108e-06, + "loss": 0.0062, "step": 44 }, { - "epoch": 0.4166666666666667, - "grad_norm": 0.04656214639544487, - "learning_rate": 8.333333333333334e-05, - "loss": 0.0272, - "step": 45 - }, - { - "epoch": 0.4166666666666667, - "eval_loss": 0.02608887106180191, - "eval_runtime": 9.1343, - "eval_samples_per_second": 5.474, - "eval_steps_per_second": 1.423, + "epoch": 0.14545454545454545, + "grad_norm": 0.012171825394034386, + "learning_rate": 5.821474773609315e-06, + "loss": 0.0066, "step": 45 }, { - "epoch": 0.42592592592592593, - "grad_norm": 0.04525485262274742, - "learning_rate": 8.518518518518518e-05, - "loss": 0.0274, + "epoch": 0.1486868686868687, + "grad_norm": 0.010148804634809494, + "learning_rate": 5.950840879689522e-06, + "loss": 0.0042, "step": 46 }, { - "epoch": 0.4351851851851852, - "grad_norm": 0.03210742771625519, - "learning_rate": 8.703703703703704e-05, - "loss": 0.0283, + "epoch": 0.15191919191919193, + "grad_norm": 0.013537143357098103, + "learning_rate": 6.0802069857697286e-06, + "loss": 0.0073, "step": 47 }, { - "epoch": 0.4444444444444444, - "grad_norm": 0.03675089031457901, - "learning_rate": 8.888888888888889e-05, - "loss": 0.0242, + "epoch": 0.15515151515151515, + "grad_norm": 0.011110929772257805, + "learning_rate": 6.2095730918499354e-06, + "loss": 0.0072, "step": 48 }, { - "epoch": 0.4537037037037037, - "grad_norm": 0.03396710753440857, - "learning_rate": 9.074074074074075e-05, - "loss": 0.0239, + "epoch": 0.15838383838383838, + "grad_norm": 0.00937278475612402, + "learning_rate": 6.338939197930142e-06, + "loss": 0.0054, "step": 49 }, { - "epoch": 0.46296296296296297, - "grad_norm": 0.02745971269905567, - "learning_rate": 9.25925925925926e-05, - "loss": 0.0224, + "epoch": 0.16161616161616163, + "grad_norm": 0.011824984103441238, + "learning_rate": 6.46830530401035e-06, + "loss": 0.0119, "step": 50 }, { - "epoch": 0.46296296296296297, - "eval_loss": 0.02490057609975338, - "eval_runtime": 9.1102, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 0.16161616161616163, + "eval_loss": 0.005149205215275288, + "eval_runtime": 18.6062, + "eval_samples_per_second": 5.375, + "eval_steps_per_second": 1.344, "step": 50 }, { - "epoch": 0.4722222222222222, - "grad_norm": 0.04084627702832222, - "learning_rate": 9.444444444444444e-05, - "loss": 0.0252, + "epoch": 0.16484848484848486, + "grad_norm": 0.009701536037027836, + "learning_rate": 6.597671410090557e-06, + "loss": 0.0054, "step": 51 }, { - "epoch": 0.48148148148148145, - "grad_norm": 0.033021993935108185, - "learning_rate": 9.62962962962963e-05, - "loss": 0.0228, + "epoch": 0.16808080808080808, + "grad_norm": 0.010364706628024578, + "learning_rate": 6.727037516170763e-06, + "loss": 0.0049, "step": 52 }, { - "epoch": 0.49074074074074076, - "grad_norm": 0.034785784780979156, - "learning_rate": 9.814814814814815e-05, - "loss": 0.0259, + "epoch": 0.1713131313131313, + "grad_norm": 0.009346776641905308, + "learning_rate": 6.856403622250971e-06, + "loss": 0.0054, "step": 53 }, { - "epoch": 0.5, - "grad_norm": 0.03407888114452362, - "learning_rate": 0.0001, - "loss": 0.0239, + "epoch": 0.17454545454545456, + "grad_norm": 0.008815059438347816, + "learning_rate": 6.985769728331178e-06, + "loss": 0.0069, "step": 54 }, { - "epoch": 0.5092592592592593, - "grad_norm": 0.03268973529338837, - "learning_rate": 9.99989553622803e-05, - "loss": 0.0229, - "step": 55 - }, - { - "epoch": 0.5092592592592593, - "eval_loss": 0.02450372651219368, - "eval_runtime": 9.1421, - "eval_samples_per_second": 5.469, - "eval_steps_per_second": 1.422, + "epoch": 0.17777777777777778, + "grad_norm": 0.009431697428226471, + "learning_rate": 7.115135834411385e-06, + "loss": 0.0083, "step": 55 }, { - "epoch": 0.5185185185185185, - "grad_norm": 0.032378531992435455, - "learning_rate": 9.999582149277187e-05, - "loss": 0.0219, + "epoch": 0.181010101010101, + "grad_norm": 0.012181616388261318, + "learning_rate": 7.244501940491591e-06, + "loss": 0.0065, "step": 56 }, { - "epoch": 0.5277777777777778, - "grad_norm": 0.03997437283396721, - "learning_rate": 9.999059852242507e-05, - "loss": 0.0248, + "epoch": 0.18424242424242424, + "grad_norm": 0.009227719157934189, + "learning_rate": 7.373868046571798e-06, + "loss": 0.0052, "step": 57 }, { - "epoch": 0.5370370370370371, - "grad_norm": 0.04024836793541908, - "learning_rate": 9.998328666948438e-05, - "loss": 0.0194, + "epoch": 0.1874747474747475, + "grad_norm": 0.009855546988546848, + "learning_rate": 7.503234152652006e-06, + "loss": 0.0082, "step": 58 }, { - "epoch": 0.5462962962962963, - "grad_norm": 0.03850249573588371, - "learning_rate": 9.997388623947928e-05, - "loss": 0.0251, + "epoch": 0.1907070707070707, + "grad_norm": 0.008454745635390282, + "learning_rate": 7.632600258732213e-06, + "loss": 0.0044, "step": 59 }, { - "epoch": 0.5555555555555556, - "grad_norm": 0.03326913341879845, - "learning_rate": 9.996239762521151e-05, - "loss": 0.0233, - "step": 60 - }, - { - "epoch": 0.5555555555555556, - "eval_loss": 0.023316912353038788, - "eval_runtime": 9.1353, - "eval_samples_per_second": 5.473, - "eval_steps_per_second": 1.423, + "epoch": 0.19393939393939394, + "grad_norm": 0.010128876194357872, + "learning_rate": 7.761966364812418e-06, + "loss": 0.0092, "step": 60 }, { - "epoch": 0.5648148148148148, - "grad_norm": 0.034179024398326874, - "learning_rate": 9.994882130673868e-05, - "loss": 0.0222, + "epoch": 0.19717171717171716, + "grad_norm": 0.009267722256481647, + "learning_rate": 7.891332470892627e-06, + "loss": 0.0049, "step": 61 }, { - "epoch": 0.5740740740740741, - "grad_norm": 0.031797800213098526, - "learning_rate": 9.993315785135416e-05, - "loss": 0.0272, + "epoch": 0.20040404040404042, + "grad_norm": 0.009394655004143715, + "learning_rate": 8.020698576972833e-06, + "loss": 0.0059, "step": 62 }, { - "epoch": 0.5833333333333334, - "grad_norm": 0.03183833882212639, - "learning_rate": 9.991540791356342e-05, - "loss": 0.0241, + "epoch": 0.20363636363636364, + "grad_norm": 0.008983040228486061, + "learning_rate": 8.15006468305304e-06, + "loss": 0.0041, "step": 63 }, { - "epoch": 0.5925925925925926, - "grad_norm": 0.025173548609018326, - "learning_rate": 9.989557223505661e-05, - "loss": 0.0216, + "epoch": 0.20686868686868687, + "grad_norm": 0.009920783340930939, + "learning_rate": 8.279430789133249e-06, + "loss": 0.0058, "step": 64 }, { - "epoch": 0.6018518518518519, - "grad_norm": 0.04935009032487869, - "learning_rate": 9.987365164467767e-05, - "loss": 0.0217, - "step": 65 - }, - { - "epoch": 0.6018518518518519, - "eval_loss": 0.02255990356206894, - "eval_runtime": 9.1207, - "eval_samples_per_second": 5.482, - "eval_steps_per_second": 1.425, + "epoch": 0.2101010101010101, + "grad_norm": 0.007694128435105085, + "learning_rate": 8.408796895213454e-06, + "loss": 0.0033, "step": 65 }, { - "epoch": 0.6111111111111112, - "grad_norm": 0.02904060110449791, - "learning_rate": 9.98496470583896e-05, - "loss": 0.0213, + "epoch": 0.21333333333333335, + "grad_norm": 0.01226822566241026, + "learning_rate": 8.538163001293661e-06, + "loss": 0.0099, "step": 66 }, { - "epoch": 0.6203703703703703, - "grad_norm": 0.046014755964279175, - "learning_rate": 9.982355947923629e-05, - "loss": 0.018, + "epoch": 0.21656565656565657, + "grad_norm": 0.010597337037324905, + "learning_rate": 8.66752910737387e-06, + "loss": 0.0081, "step": 67 }, { - "epoch": 0.6296296296296297, - "grad_norm": 0.0354795977473259, - "learning_rate": 9.979538999730047e-05, - "loss": 0.0199, + "epoch": 0.2197979797979798, + "grad_norm": 0.009066218510270119, + "learning_rate": 8.796895213454076e-06, + "loss": 0.0044, "step": 68 }, { - "epoch": 0.6388888888888888, - "grad_norm": 0.03308796137571335, - "learning_rate": 9.976513978965829e-05, - "loss": 0.0239, + "epoch": 0.22303030303030302, + "grad_norm": 0.01163114607334137, + "learning_rate": 8.926261319534282e-06, + "loss": 0.0129, "step": 69 }, { - "epoch": 0.6481481481481481, - "grad_norm": 0.03860899433493614, - "learning_rate": 9.973281012033007e-05, - "loss": 0.0247, - "step": 70 - }, - { - "epoch": 0.6481481481481481, - "eval_loss": 0.022898558527231216, - "eval_runtime": 9.1074, - "eval_samples_per_second": 5.49, - "eval_steps_per_second": 1.427, + "epoch": 0.22626262626262628, + "grad_norm": 0.012832598760724068, + "learning_rate": 9.055627425614489e-06, + "loss": 0.0071, "step": 70 }, { - "epoch": 0.6574074074074074, - "grad_norm": 0.028213078156113625, - "learning_rate": 9.969840234022749e-05, - "loss": 0.0197, + "epoch": 0.2294949494949495, + "grad_norm": 0.010620299726724625, + "learning_rate": 9.184993531694697e-06, + "loss": 0.004, "step": 71 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.024581043049693108, - "learning_rate": 9.966191788709716e-05, - "loss": 0.0207, + "epoch": 0.23272727272727273, + "grad_norm": 0.00913357175886631, + "learning_rate": 9.314359637774904e-06, + "loss": 0.0058, "step": 72 }, { - "epoch": 0.6759259259259259, - "grad_norm": 0.026658454909920692, - "learning_rate": 9.962335828546048e-05, - "loss": 0.0214, + "epoch": 0.23595959595959595, + "grad_norm": 0.011772734113037586, + "learning_rate": 9.44372574385511e-06, + "loss": 0.0036, "step": 73 }, { - "epoch": 0.6851851851851852, - "grad_norm": 0.034941576421260834, - "learning_rate": 9.958272514655006e-05, - "loss": 0.0205, + "epoch": 0.2391919191919192, + "grad_norm": 0.011326112784445286, + "learning_rate": 9.573091849935318e-06, + "loss": 0.0071, "step": 74 }, { - "epoch": 0.6944444444444444, - "grad_norm": 0.03060038387775421, - "learning_rate": 9.954002016824227e-05, - "loss": 0.0193, + "epoch": 0.24242424242424243, + "grad_norm": 0.010667567141354084, + "learning_rate": 9.702457956015525e-06, + "loss": 0.0036, "step": 75 }, { - "epoch": 0.6944444444444444, - "eval_loss": 0.02283317781984806, - "eval_runtime": 9.1512, - "eval_samples_per_second": 5.464, - "eval_steps_per_second": 1.421, + "epoch": 0.24242424242424243, + "eval_loss": 0.0031650287564843893, + "eval_runtime": 18.6391, + "eval_samples_per_second": 5.365, + "eval_steps_per_second": 1.341, "step": 75 }, { - "epoch": 0.7037037037037037, - "grad_norm": 0.0313015952706337, - "learning_rate": 9.949524513498636e-05, - "loss": 0.0206, + "epoch": 0.24565656565656566, + "grad_norm": 0.007108451332896948, + "learning_rate": 9.831824062095732e-06, + "loss": 0.0023, "step": 76 }, { - "epoch": 0.7129629629629629, - "grad_norm": 0.03317766636610031, - "learning_rate": 9.944840191772987e-05, - "loss": 0.0217, + "epoch": 0.24888888888888888, + "grad_norm": 0.008278781548142433, + "learning_rate": 9.961190168175938e-06, + "loss": 0.0026, "step": 77 }, { - "epoch": 0.7222222222222222, - "grad_norm": 0.027911782264709473, - "learning_rate": 9.939949247384046e-05, - "loss": 0.0196, + "epoch": 0.25212121212121213, + "grad_norm": 0.010979422368109226, + "learning_rate": 1.0090556274256145e-05, + "loss": 0.0036, "step": 78 }, { - "epoch": 0.7314814814814815, - "grad_norm": 0.028807291761040688, - "learning_rate": 9.934851884702414e-05, - "loss": 0.0223, + "epoch": 0.25535353535353533, + "grad_norm": 0.007666402496397495, + "learning_rate": 1.0219922380336352e-05, + "loss": 0.0015, "step": 79 }, { - "epoch": 0.7407407407407407, - "grad_norm": 0.03152855485677719, - "learning_rate": 9.929548316723982e-05, - "loss": 0.0173, - "step": 80 - }, - { - "epoch": 0.7407407407407407, - "eval_loss": 0.021335698664188385, - "eval_runtime": 9.1689, - "eval_samples_per_second": 5.453, - "eval_steps_per_second": 1.418, + "epoch": 0.2585858585858586, + "grad_norm": 0.009133332408964634, + "learning_rate": 1.0349288486416559e-05, + "loss": 0.0021, "step": 80 }, { - "epoch": 0.75, - "grad_norm": 0.03250882402062416, - "learning_rate": 9.924038765061042e-05, - "loss": 0.0231, + "epoch": 0.26181818181818184, + "grad_norm": 0.009566927328705788, + "learning_rate": 1.0478654592496766e-05, + "loss": 0.0038, "step": 81 }, { - "epoch": 0.7592592592592593, - "grad_norm": 0.030853938311338425, - "learning_rate": 9.918323459933005e-05, - "loss": 0.0224, + "epoch": 0.26505050505050504, + "grad_norm": 0.011765814386308193, + "learning_rate": 1.0608020698576973e-05, + "loss": 0.0061, "step": 82 }, { - "epoch": 0.7685185185185185, - "grad_norm": 0.03431202098727226, - "learning_rate": 9.912402640156811e-05, - "loss": 0.0223, + "epoch": 0.2682828282828283, + "grad_norm": 0.009488740935921669, + "learning_rate": 1.073738680465718e-05, + "loss": 0.0023, "step": 83 }, { - "epoch": 0.7777777777777778, - "grad_norm": 0.027050426229834557, - "learning_rate": 9.906276553136923e-05, - "loss": 0.0198, + "epoch": 0.27151515151515154, + "grad_norm": 0.007137450389564037, + "learning_rate": 1.0866752910737388e-05, + "loss": 0.0021, "step": 84 }, { - "epoch": 0.7870370370370371, - "grad_norm": 0.03224191442131996, - "learning_rate": 9.899945454855006e-05, - "loss": 0.0207, + "epoch": 0.27474747474747474, + "grad_norm": 0.008984182961285114, + "learning_rate": 1.0996119016817593e-05, + "loss": 0.0028, "step": 85 }, { - "epoch": 0.7870370370370371, - "eval_loss": 0.020375357940793037, - "eval_runtime": 9.1362, - "eval_samples_per_second": 5.473, - "eval_steps_per_second": 1.423, - "step": 85 - }, - { - "epoch": 0.7962962962962963, - "grad_norm": 0.028706278651952744, - "learning_rate": 9.893409609859222e-05, - "loss": 0.0197, + "epoch": 0.277979797979798, + "grad_norm": 0.010403821244835854, + "learning_rate": 1.11254851228978e-05, + "loss": 0.0057, "step": 86 }, { - "epoch": 0.8055555555555556, - "grad_norm": 0.02814578451216221, - "learning_rate": 9.88666929125318e-05, - "loss": 0.0199, + "epoch": 0.2812121212121212, + "grad_norm": 0.0071556540206074715, + "learning_rate": 1.1254851228978009e-05, + "loss": 0.0017, "step": 87 }, { - "epoch": 0.8148148148148148, - "grad_norm": 0.028775395825505257, - "learning_rate": 9.879724780684519e-05, - "loss": 0.0169, + "epoch": 0.28444444444444444, + "grad_norm": 0.012338937260210514, + "learning_rate": 1.1384217335058216e-05, + "loss": 0.0029, "step": 88 }, { - "epoch": 0.8240740740740741, - "grad_norm": 0.030078047886490822, - "learning_rate": 9.872576368333151e-05, - "loss": 0.0209, + "epoch": 0.2876767676767677, + "grad_norm": 0.010343515314161777, + "learning_rate": 1.1513583441138421e-05, + "loss": 0.0032, "step": 89 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.031860969960689545, - "learning_rate": 9.865224352899119e-05, - "loss": 0.0213, - "step": 90 - }, - { - "epoch": 0.8333333333333334, - "eval_loss": 0.019939038902521133, - "eval_runtime": 9.1287, - "eval_samples_per_second": 5.477, - "eval_steps_per_second": 1.424, + "epoch": 0.2909090909090909, + "grad_norm": 0.009991390630602837, + "learning_rate": 1.164294954721863e-05, + "loss": 0.0035, "step": 90 }, { - "epoch": 0.8425925925925926, - "grad_norm": 0.03415157273411751, - "learning_rate": 9.857669041590134e-05, - "loss": 0.021, + "epoch": 0.29414141414141415, + "grad_norm": 0.009020160883665085, + "learning_rate": 1.1772315653298836e-05, + "loss": 0.0033, "step": 91 }, { - "epoch": 0.8518518518518519, - "grad_norm": 0.032674115151166916, - "learning_rate": 9.849910750108717e-05, - "loss": 0.0207, + "epoch": 0.2973737373737374, + "grad_norm": 0.0067694829776883125, + "learning_rate": 1.1901681759379043e-05, + "loss": 0.0019, "step": 92 }, { - "epoch": 0.8611111111111112, - "grad_norm": 0.02941475249826908, - "learning_rate": 9.84194980263903e-05, - "loss": 0.0196, + "epoch": 0.3006060606060606, + "grad_norm": 0.013036763295531273, + "learning_rate": 1.203104786545925e-05, + "loss": 0.0041, "step": 93 }, { - "epoch": 0.8703703703703703, - "grad_norm": 0.036115583032369614, - "learning_rate": 9.83378653183331e-05, - "loss": 0.0178, + "epoch": 0.30383838383838385, + "grad_norm": 0.009171461686491966, + "learning_rate": 1.2160413971539457e-05, + "loss": 0.006, "step": 94 }, { - "epoch": 0.8796296296296297, - "grad_norm": 0.03358744457364082, - "learning_rate": 9.825421278797983e-05, - "loss": 0.0199, - "step": 95 - }, - { - "epoch": 0.8796296296296297, - "eval_loss": 0.020193172618746758, - "eval_runtime": 9.1141, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 0.30707070707070705, + "grad_norm": 0.007229423616081476, + "learning_rate": 1.2289780077619664e-05, + "loss": 0.002, "step": 95 }, { - "epoch": 0.8888888888888888, - "grad_norm": 0.029014358296990395, - "learning_rate": 9.816854393079403e-05, - "loss": 0.0219, + "epoch": 0.3103030303030303, + "grad_norm": 0.009065698832273483, + "learning_rate": 1.2419146183699871e-05, + "loss": 0.0028, "step": 96 }, { - "epoch": 0.8981481481481481, - "grad_norm": 0.042931754142045975, - "learning_rate": 9.808086232649246e-05, - "loss": 0.0185, + "epoch": 0.31353535353535356, + "grad_norm": 0.0068460931070148945, + "learning_rate": 1.254851228978008e-05, + "loss": 0.0024, "step": 97 }, { - "epoch": 0.9074074074074074, - "grad_norm": 0.029089825227856636, - "learning_rate": 9.799117163889559e-05, - "loss": 0.021, + "epoch": 0.31676767676767675, + "grad_norm": 0.12772586941719055, + "learning_rate": 1.2677878395860285e-05, + "loss": 0.0046, "step": 98 }, { - "epoch": 0.9166666666666666, - "grad_norm": 0.03154176101088524, - "learning_rate": 9.789947561577445e-05, - "loss": 0.02, + "epoch": 0.32, + "grad_norm": 0.008569791913032532, + "learning_rate": 1.2807244501940493e-05, + "loss": 0.0024, "step": 99 }, { - "epoch": 0.9259259259259259, - "grad_norm": 0.027786221355199814, - "learning_rate": 9.780577808869398e-05, - "loss": 0.0188, + "epoch": 0.32323232323232326, + "grad_norm": 0.010226168669760227, + "learning_rate": 1.29366106080207e-05, + "loss": 0.004, "step": 100 }, { - "epoch": 0.9259259259259259, - "eval_loss": 0.02070247381925583, - "eval_runtime": 9.1159, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 0.32323232323232326, + "eval_loss": 0.002474932000041008, + "eval_runtime": 18.5931, + "eval_samples_per_second": 5.378, + "eval_steps_per_second": 1.345, "step": 100 }, { - "epoch": 0.9351851851851852, - "grad_norm": 0.030518539249897003, - "learning_rate": 9.771008297285307e-05, - "loss": 0.0218, + "epoch": 0.32646464646464646, + "grad_norm": 0.010962710715830326, + "learning_rate": 1.3065976714100905e-05, + "loss": 0.0018, "step": 101 }, { - "epoch": 0.9444444444444444, - "grad_norm": 0.024817178025841713, - "learning_rate": 9.761239426692077e-05, - "loss": 0.0202, + "epoch": 0.3296969696969697, + "grad_norm": 0.01257998775690794, + "learning_rate": 1.3195342820181114e-05, + "loss": 0.0033, "step": 102 }, { - "epoch": 0.9537037037037037, - "grad_norm": 0.025192229077219963, - "learning_rate": 9.751271605286941e-05, - "loss": 0.0197, + "epoch": 0.3329292929292929, + "grad_norm": 0.013269903138279915, + "learning_rate": 1.332470892626132e-05, + "loss": 0.0036, "step": 103 }, { - "epoch": 0.9629629629629629, - "grad_norm": 0.02538897655904293, - "learning_rate": 9.741105249580383e-05, - "loss": 0.02, + "epoch": 0.33616161616161616, + "grad_norm": 0.009509469382464886, + "learning_rate": 1.3454075032341526e-05, + "loss": 0.0034, "step": 104 }, { - "epoch": 0.9722222222222222, - "grad_norm": 0.025440450757741928, - "learning_rate": 9.730740784378753e-05, - "loss": 0.0193, - "step": 105 - }, - { - "epoch": 0.9722222222222222, - "eval_loss": 0.020300446078181267, - "eval_runtime": 9.126, - "eval_samples_per_second": 5.479, - "eval_steps_per_second": 1.425, + "epoch": 0.3393939393939394, + "grad_norm": 0.00902635883539915, + "learning_rate": 1.3583441138421735e-05, + "loss": 0.0028, "step": 105 }, { - "epoch": 0.9814814814814815, - "grad_norm": 0.02362542785704136, - "learning_rate": 9.7201786427665e-05, - "loss": 0.0202, + "epoch": 0.3426262626262626, + "grad_norm": 0.007760872133076191, + "learning_rate": 1.3712807244501941e-05, + "loss": 0.0024, "step": 106 }, { - "epoch": 0.9907407407407407, - "grad_norm": 0.022390421479940414, - "learning_rate": 9.709419266088086e-05, - "loss": 0.0188, + "epoch": 0.34585858585858587, + "grad_norm": 0.009263481013476849, + "learning_rate": 1.384217335058215e-05, + "loss": 0.0031, "step": 107 }, { - "epoch": 1.0, - "grad_norm": 0.026193244382739067, - "learning_rate": 9.698463103929542e-05, - "loss": 0.022, + "epoch": 0.3490909090909091, + "grad_norm": 0.00697364890947938, + "learning_rate": 1.3971539456662355e-05, + "loss": 0.0021, "step": 108 }, { - "epoch": 1.0092592592592593, - "grad_norm": 0.028253022581338882, - "learning_rate": 9.687310614099675e-05, - "loss": 0.0159, + "epoch": 0.3523232323232323, + "grad_norm": 0.007600918412208557, + "learning_rate": 1.4100905562742562e-05, + "loss": 0.0018, "step": 109 }, { - "epoch": 1.0185185185185186, - "grad_norm": 0.02241157554090023, - "learning_rate": 9.67596226261095e-05, - "loss": 0.016, - "step": 110 - }, - { - "epoch": 1.0185185185185186, - "eval_loss": 0.01969613879919052, - "eval_runtime": 9.1053, - "eval_samples_per_second": 5.491, - "eval_steps_per_second": 1.428, + "epoch": 0.35555555555555557, + "grad_norm": 0.00856352224946022, + "learning_rate": 1.423027166882277e-05, + "loss": 0.0031, "step": 110 }, { - "epoch": 1.0277777777777777, - "grad_norm": 0.027405373752117157, - "learning_rate": 9.664418523660004e-05, - "loss": 0.014, + "epoch": 0.35878787878787877, + "grad_norm": 0.007947358302772045, + "learning_rate": 1.4359637774902976e-05, + "loss": 0.0025, "step": 111 }, { - "epoch": 1.037037037037037, - "grad_norm": 0.032646384090185165, - "learning_rate": 9.652679879607843e-05, - "loss": 0.0172, + "epoch": 0.362020202020202, + "grad_norm": 0.007706194184720516, + "learning_rate": 1.4489003880983183e-05, + "loss": 0.0023, "step": 112 }, { - "epoch": 1.0462962962962963, - "grad_norm": 0.02552163228392601, - "learning_rate": 9.640746820959684e-05, - "loss": 0.014, + "epoch": 0.3652525252525253, + "grad_norm": 0.008598407730460167, + "learning_rate": 1.4618369987063391e-05, + "loss": 0.0028, "step": 113 }, { - "epoch": 1.0555555555555556, - "grad_norm": 0.022228199988603592, - "learning_rate": 9.628619846344454e-05, - "loss": 0.0172, + "epoch": 0.36848484848484847, + "grad_norm": 0.00845644623041153, + "learning_rate": 1.4747736093143596e-05, + "loss": 0.0029, "step": 114 }, { - "epoch": 1.0648148148148149, - "grad_norm": 0.028009962290525436, - "learning_rate": 9.616299462493952e-05, - "loss": 0.0166, + "epoch": 0.3717171717171717, + "grad_norm": 0.008533057756721973, + "learning_rate": 1.4877102199223805e-05, + "loss": 0.0039, "step": 115 }, { - "epoch": 1.0648148148148149, - "eval_loss": 0.019864549860358238, - "eval_runtime": 9.122, - "eval_samples_per_second": 5.481, - "eval_steps_per_second": 1.425, - "step": 115 - }, - { - "epoch": 1.074074074074074, - "grad_norm": 0.025030331686139107, - "learning_rate": 9.603786184221693e-05, - "loss": 0.0195, + "epoch": 0.374949494949495, + "grad_norm": 0.007429471705108881, + "learning_rate": 1.5006468305304012e-05, + "loss": 0.0024, "step": 116 }, { - "epoch": 1.0833333333333333, - "grad_norm": 0.030586065724492073, - "learning_rate": 9.591080534401371e-05, - "loss": 0.015, + "epoch": 0.3781818181818182, + "grad_norm": 0.0069627161137759686, + "learning_rate": 1.5135834411384217e-05, + "loss": 0.0018, "step": 117 }, { - "epoch": 1.0925925925925926, - "grad_norm": 0.02425476722419262, - "learning_rate": 9.57818304394503e-05, - "loss": 0.0183, + "epoch": 0.3814141414141414, + "grad_norm": 0.007471222430467606, + "learning_rate": 1.5265200517464426e-05, + "loss": 0.0024, "step": 118 }, { - "epoch": 1.1018518518518519, - "grad_norm": 0.03203345090150833, - "learning_rate": 9.565094251780871e-05, - "loss": 0.0172, + "epoch": 0.3846464646464646, + "grad_norm": 0.006873416714370251, + "learning_rate": 1.5394566623544633e-05, + "loss": 0.0019, "step": 119 }, { - "epoch": 1.1111111111111112, - "grad_norm": 0.03028124012053013, - "learning_rate": 9.551814704830734e-05, - "loss": 0.0189, - "step": 120 - }, - { - "epoch": 1.1111111111111112, - "eval_loss": 0.019504941999912262, - "eval_runtime": 9.1171, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 0.3878787878787879, + "grad_norm": 0.009251467883586884, + "learning_rate": 1.5523932729624836e-05, + "loss": 0.0027, "step": 120 }, { - "epoch": 1.1203703703703705, - "grad_norm": 0.026934562250971794, - "learning_rate": 9.538344957987244e-05, - "loss": 0.0132, + "epoch": 0.39111111111111113, + "grad_norm": 0.007302634883671999, + "learning_rate": 1.5653298835705046e-05, + "loss": 0.0017, "step": 121 }, { - "epoch": 1.1296296296296295, - "grad_norm": 0.02392655238509178, - "learning_rate": 9.524685574090627e-05, - "loss": 0.0184, + "epoch": 0.39434343434343433, + "grad_norm": 0.006746005266904831, + "learning_rate": 1.5782664941785253e-05, + "loss": 0.0019, "step": 122 }, { - "epoch": 1.1388888888888888, - "grad_norm": 0.02336742728948593, - "learning_rate": 9.51083712390519e-05, - "loss": 0.0155, + "epoch": 0.3975757575757576, + "grad_norm": 0.008011849597096443, + "learning_rate": 1.591203104786546e-05, + "loss": 0.0023, "step": 123 }, { - "epoch": 1.1481481481481481, - "grad_norm": 0.025306498631834984, - "learning_rate": 9.496800186095466e-05, - "loss": 0.0156, + "epoch": 0.40080808080808084, + "grad_norm": 0.008799983188509941, + "learning_rate": 1.6041397153945667e-05, + "loss": 0.0023, "step": 124 }, { - "epoch": 1.1574074074074074, - "grad_norm": 0.02764940820634365, - "learning_rate": 9.482575347202047e-05, - "loss": 0.0211, + "epoch": 0.40404040404040403, + "grad_norm": 0.010617449879646301, + "learning_rate": 1.6170763260025874e-05, + "loss": 0.0019, "step": 125 }, { - "epoch": 1.1574074074074074, - "eval_loss": 0.018362991511821747, - "eval_runtime": 9.1297, - "eval_samples_per_second": 5.477, - "eval_steps_per_second": 1.424, + "epoch": 0.40404040404040403, + "eval_loss": 0.002037045545876026, + "eval_runtime": 18.5899, + "eval_samples_per_second": 5.379, + "eval_steps_per_second": 1.345, "step": 125 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.02213912270963192, - "learning_rate": 9.468163201617062e-05, - "loss": 0.0178, + "epoch": 0.4072727272727273, + "grad_norm": 0.006759752053767443, + "learning_rate": 1.630012936610608e-05, + "loss": 0.0018, "step": 126 }, { - "epoch": 1.175925925925926, - "grad_norm": 0.03320689871907234, - "learning_rate": 9.453564351559348e-05, - "loss": 0.0148, + "epoch": 0.4105050505050505, + "grad_norm": 0.008995631709694862, + "learning_rate": 1.6429495472186288e-05, + "loss": 0.0022, "step": 127 }, { - "epoch": 1.1851851851851851, - "grad_norm": 0.023370925337076187, - "learning_rate": 9.438779407049281e-05, - "loss": 0.0174, + "epoch": 0.41373737373737374, + "grad_norm": 0.00835257675498724, + "learning_rate": 1.6558861578266498e-05, + "loss": 0.0026, "step": 128 }, { - "epoch": 1.1944444444444444, - "grad_norm": 0.02848099358379841, - "learning_rate": 9.423808985883289e-05, - "loss": 0.0174, + "epoch": 0.416969696969697, + "grad_norm": 0.009772442281246185, + "learning_rate": 1.66882276843467e-05, + "loss": 0.0028, "step": 129 }, { - "epoch": 1.2037037037037037, - "grad_norm": 0.02608056552708149, - "learning_rate": 9.40865371360804e-05, - "loss": 0.0171, - "step": 130 - }, - { - "epoch": 1.2037037037037037, - "eval_loss": 0.018851976841688156, - "eval_runtime": 9.1046, - "eval_samples_per_second": 5.492, - "eval_steps_per_second": 1.428, + "epoch": 0.4202020202020202, + "grad_norm": 0.006931178271770477, + "learning_rate": 1.6817593790426908e-05, + "loss": 0.002, "step": 130 }, { - "epoch": 1.212962962962963, - "grad_norm": 0.02152630314230919, - "learning_rate": 9.393314223494296e-05, - "loss": 0.0172, + "epoch": 0.42343434343434344, + "grad_norm": 0.012796151451766491, + "learning_rate": 1.694695989650712e-05, + "loss": 0.0021, "step": 131 }, { - "epoch": 1.2222222222222223, - "grad_norm": 0.02550230175256729, - "learning_rate": 9.377791156510455e-05, - "loss": 0.016, + "epoch": 0.4266666666666667, + "grad_norm": 0.007226724177598953, + "learning_rate": 1.7076326002587322e-05, + "loss": 0.0033, "step": 132 }, { - "epoch": 1.2314814814814814, - "grad_norm": 0.025004474446177483, - "learning_rate": 9.362085161295769e-05, - "loss": 0.0163, + "epoch": 0.4298989898989899, + "grad_norm": 0.0066243987530469894, + "learning_rate": 1.720569210866753e-05, + "loss": 0.002, "step": 133 }, { - "epoch": 1.2407407407407407, - "grad_norm": 0.026416007429361343, - "learning_rate": 9.346196894133239e-05, - "loss": 0.0165, + "epoch": 0.43313131313131314, + "grad_norm": 0.007128287572413683, + "learning_rate": 1.733505821474774e-05, + "loss": 0.0021, "step": 134 }, { - "epoch": 1.25, - "grad_norm": 0.029432326555252075, - "learning_rate": 9.330127018922194e-05, - "loss": 0.0191, - "step": 135 - }, - { - "epoch": 1.25, - "eval_loss": 0.019194327294826508, - "eval_runtime": 9.1131, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.427, + "epoch": 0.43636363636363634, + "grad_norm": 0.007028148043900728, + "learning_rate": 1.7464424320827943e-05, + "loss": 0.0028, "step": 135 }, { - "epoch": 1.2592592592592593, - "grad_norm": 0.03440408781170845, - "learning_rate": 9.313876207150543e-05, - "loss": 0.0165, + "epoch": 0.4395959595959596, + "grad_norm": 0.00866060983389616, + "learning_rate": 1.7593790426908153e-05, + "loss": 0.0053, "step": 136 }, { - "epoch": 1.2685185185185186, - "grad_norm": 0.025614989921450615, - "learning_rate": 9.297445137866727e-05, - "loss": 0.0162, + "epoch": 0.44282828282828285, + "grad_norm": 0.005735491868108511, + "learning_rate": 1.7723156532988356e-05, + "loss": 0.0023, "step": 137 }, { - "epoch": 1.2777777777777777, - "grad_norm": 0.02456337958574295, - "learning_rate": 9.280834497651334e-05, - "loss": 0.0192, + "epoch": 0.44606060606060605, + "grad_norm": 0.005858046934008598, + "learning_rate": 1.7852522639068563e-05, + "loss": 0.0019, "step": 138 }, { - "epoch": 1.287037037037037, - "grad_norm": 0.051101330667734146, - "learning_rate": 9.264044980588416e-05, - "loss": 0.015, + "epoch": 0.4492929292929293, + "grad_norm": 0.007398343179374933, + "learning_rate": 1.7981888745148774e-05, + "loss": 0.0024, "step": 139 }, { - "epoch": 1.2962962962962963, - "grad_norm": 0.03369716554880142, - "learning_rate": 9.247077288236488e-05, - "loss": 0.0184, - "step": 140 - }, - { - "epoch": 1.2962962962962963, - "eval_loss": 0.018648317083716393, - "eval_runtime": 9.1079, - "eval_samples_per_second": 5.49, - "eval_steps_per_second": 1.427, + "epoch": 0.45252525252525255, + "grad_norm": 0.009320907294750214, + "learning_rate": 1.8111254851228977e-05, + "loss": 0.0063, "step": 140 }, { - "epoch": 1.3055555555555556, - "grad_norm": 0.024168213829398155, - "learning_rate": 9.229932129599205e-05, - "loss": 0.0166, + "epoch": 0.45575757575757575, + "grad_norm": 0.008654161356389523, + "learning_rate": 1.8240620957309184e-05, + "loss": 0.0025, "step": 141 }, { - "epoch": 1.3148148148148149, - "grad_norm": 0.027960045263171196, - "learning_rate": 9.212610221095748e-05, - "loss": 0.0157, + "epoch": 0.458989898989899, + "grad_norm": 0.004506159573793411, + "learning_rate": 1.8369987063389394e-05, + "loss": 0.0013, "step": 142 }, { - "epoch": 1.324074074074074, - "grad_norm": 0.023985836654901505, - "learning_rate": 9.195112286530873e-05, - "loss": 0.0178, + "epoch": 0.4622222222222222, + "grad_norm": 0.009703525342047215, + "learning_rate": 1.8499353169469598e-05, + "loss": 0.0034, "step": 143 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.026084545999765396, - "learning_rate": 9.177439057064683e-05, - "loss": 0.0164, + "epoch": 0.46545454545454545, + "grad_norm": 0.007734385784715414, + "learning_rate": 1.8628719275549808e-05, + "loss": 0.0029, "step": 144 }, { - "epoch": 1.3425925925925926, - "grad_norm": 0.022582337260246277, - "learning_rate": 9.159591271182058e-05, - "loss": 0.0162, - "step": 145 - }, - { - "epoch": 1.3425925925925926, - "eval_loss": 0.018656810745596886, - "eval_runtime": 9.1149, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 0.4686868686868687, + "grad_norm": 0.010392666794359684, + "learning_rate": 1.8758085381630015e-05, + "loss": 0.0028, "step": 145 }, { - "epoch": 1.3518518518518519, - "grad_norm": 0.030290907248854637, - "learning_rate": 9.141569674661817e-05, - "loss": 0.021, + "epoch": 0.4719191919191919, + "grad_norm": 0.01011224091053009, + "learning_rate": 1.888745148771022e-05, + "loss": 0.0036, "step": 146 }, { - "epoch": 1.3611111111111112, - "grad_norm": 0.026109322905540466, - "learning_rate": 9.123375020545535e-05, - "loss": 0.0162, + "epoch": 0.47515151515151516, + "grad_norm": 0.005182855296880007, + "learning_rate": 1.901681759379043e-05, + "loss": 0.0014, "step": 147 }, { - "epoch": 1.3703703703703702, - "grad_norm": 0.02652176469564438, - "learning_rate": 9.105008069106093e-05, - "loss": 0.0169, + "epoch": 0.4783838383838384, + "grad_norm": 0.009993299841880798, + "learning_rate": 1.9146183699870636e-05, + "loss": 0.003, "step": 148 }, { - "epoch": 1.3796296296296298, - "grad_norm": 0.024147020652890205, - "learning_rate": 9.086469587815904e-05, - "loss": 0.0162, + "epoch": 0.4816161616161616, + "grad_norm": 0.008879208005964756, + "learning_rate": 1.927554980595084e-05, + "loss": 0.002, "step": 149 }, { - "epoch": 1.3888888888888888, - "grad_norm": 0.021294649690389633, - "learning_rate": 9.067760351314838e-05, - "loss": 0.0165, + "epoch": 0.48484848484848486, + "grad_norm": 0.006573604419827461, + "learning_rate": 1.940491591203105e-05, + "loss": 0.0021, "step": 150 }, { - "epoch": 1.3888888888888888, - "eval_loss": 0.018213987350463867, - "eval_runtime": 9.1247, - "eval_samples_per_second": 5.48, - "eval_steps_per_second": 1.425, + "epoch": 0.48484848484848486, + "eval_loss": 0.0017716821748763323, + "eval_runtime": 18.6403, + "eval_samples_per_second": 5.365, + "eval_steps_per_second": 1.341, "step": 150 }, { - "epoch": 1.3981481481481481, - "grad_norm": 0.02462903782725334, - "learning_rate": 9.048881141377863e-05, - "loss": 0.0204, + "epoch": 0.48808080808080806, + "grad_norm": 0.005542725790292025, + "learning_rate": 1.9534282018111256e-05, + "loss": 0.0017, "step": 151 }, { - "epoch": 1.4074074074074074, - "grad_norm": 0.024652326479554176, - "learning_rate": 9.029832746882371e-05, - "loss": 0.0164, + "epoch": 0.4913131313131313, + "grad_norm": 0.008134805597364902, + "learning_rate": 1.9663648124191463e-05, + "loss": 0.002, "step": 152 }, { - "epoch": 1.4166666666666667, - "grad_norm": 0.026834659278392792, - "learning_rate": 9.01061596377522e-05, - "loss": 0.018, + "epoch": 0.49454545454545457, + "grad_norm": 0.009408293291926384, + "learning_rate": 1.979301423027167e-05, + "loss": 0.0031, "step": 153 }, { - "epoch": 1.425925925925926, - "grad_norm": 0.02342064492404461, - "learning_rate": 8.991231595039465e-05, - "loss": 0.0156, + "epoch": 0.49777777777777776, + "grad_norm": 0.007265687920153141, + "learning_rate": 1.9922380336351877e-05, + "loss": 0.0021, "step": 154 }, { - "epoch": 1.4351851851851851, - "grad_norm": 0.026441222056746483, - "learning_rate": 8.97168045066082e-05, - "loss": 0.0157, + "epoch": 0.501010101010101, + "grad_norm": 0.009999910369515419, + "learning_rate": 2.0051746442432084e-05, + "loss": 0.0014, "step": 155 }, { - "epoch": 1.4351851851851851, - "eval_loss": 0.01855114847421646, - "eval_runtime": 9.124, - "eval_samples_per_second": 5.48, - "eval_steps_per_second": 1.425, - "step": 155 - }, - { - "epoch": 1.4444444444444444, - "grad_norm": 0.01796615496277809, - "learning_rate": 8.951963347593797e-05, - "loss": 0.0165, + "epoch": 0.5042424242424243, + "grad_norm": 0.007147953379899263, + "learning_rate": 2.018111254851229e-05, + "loss": 0.0022, "step": 156 }, { - "epoch": 1.4537037037037037, - "grad_norm": 0.02256671153008938, - "learning_rate": 8.932081109727582e-05, - "loss": 0.0201, + "epoch": 0.5074747474747475, + "grad_norm": 0.0072359428741037846, + "learning_rate": 2.0310478654592497e-05, + "loss": 0.0034, "step": 157 }, { - "epoch": 1.462962962962963, - "grad_norm": 0.028528334572911263, - "learning_rate": 8.912034567851599e-05, - "loss": 0.0182, + "epoch": 0.5107070707070707, + "grad_norm": 0.005821447819471359, + "learning_rate": 2.0439844760672704e-05, + "loss": 0.0018, "step": 158 }, { - "epoch": 1.4722222222222223, - "grad_norm": 0.029104968532919884, - "learning_rate": 8.891824559620801e-05, - "loss": 0.0153, + "epoch": 0.5139393939393939, + "grad_norm": 0.007256666664034128, + "learning_rate": 2.056921086675291e-05, + "loss": 0.0027, "step": 159 }, { - "epoch": 1.4814814814814814, - "grad_norm": 0.02003669925034046, - "learning_rate": 8.871451929520663e-05, - "loss": 0.0159, - "step": 160 - }, - { - "epoch": 1.4814814814814814, - "eval_loss": 0.01888095587491989, - "eval_runtime": 9.1172, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 0.5171717171717172, + "grad_norm": 0.00663521233946085, + "learning_rate": 2.0698576972833118e-05, + "loss": 0.0023, "step": 160 }, { - "epoch": 1.4907407407407407, - "grad_norm": 0.019447356462478638, - "learning_rate": 8.850917528831899e-05, - "loss": 0.0163, + "epoch": 0.5204040404040404, + "grad_norm": 0.007504627574235201, + "learning_rate": 2.0827943078913325e-05, + "loss": 0.004, "step": 161 }, { - "epoch": 1.5, - "grad_norm": 0.03438901901245117, - "learning_rate": 8.83022221559489e-05, - "loss": 0.0125, + "epoch": 0.5236363636363637, + "grad_norm": 0.005167617462575436, + "learning_rate": 2.0957309184993532e-05, + "loss": 0.0015, "step": 162 }, { - "epoch": 1.5092592592592593, - "grad_norm": 0.026535626500844955, - "learning_rate": 8.809366854573831e-05, - "loss": 0.0175, + "epoch": 0.5268686868686868, + "grad_norm": 0.00587807409465313, + "learning_rate": 2.108667529107374e-05, + "loss": 0.0016, "step": 163 }, { - "epoch": 1.5185185185185186, - "grad_norm": 0.029025647789239883, - "learning_rate": 8.78835231722059e-05, - "loss": 0.0164, + "epoch": 0.5301010101010101, + "grad_norm": 0.005344127304852009, + "learning_rate": 2.1216041397153946e-05, + "loss": 0.0011, "step": 164 }, { - "epoch": 1.5277777777777777, - "grad_norm": 0.025528129190206528, - "learning_rate": 8.767179481638303e-05, - "loss": 0.0174, - "step": 165 - }, - { - "epoch": 1.5277777777777777, - "eval_loss": 0.018690049648284912, - "eval_runtime": 9.1481, - "eval_samples_per_second": 5.466, - "eval_steps_per_second": 1.421, + "epoch": 0.5333333333333333, + "grad_norm": 0.005351161118596792, + "learning_rate": 2.1345407503234156e-05, + "loss": 0.0016, "step": 165 }, { - "epoch": 1.5370370370370372, - "grad_norm": 0.025675086304545403, - "learning_rate": 8.745849232544681e-05, - "loss": 0.0179, + "epoch": 0.5365656565656566, + "grad_norm": 0.006603873800486326, + "learning_rate": 2.147477360931436e-05, + "loss": 0.0017, "step": 166 }, { - "epoch": 1.5462962962962963, - "grad_norm": 0.027451254427433014, - "learning_rate": 8.724362461235029e-05, - "loss": 0.0169, + "epoch": 0.5397979797979798, + "grad_norm": 0.006440309341996908, + "learning_rate": 2.1604139715394566e-05, + "loss": 0.0016, "step": 167 }, { - "epoch": 1.5555555555555556, - "grad_norm": 0.026652028784155846, - "learning_rate": 8.702720065545024e-05, - "loss": 0.0168, + "epoch": 0.5430303030303031, + "grad_norm": 0.007731405086815357, + "learning_rate": 2.1733505821474777e-05, + "loss": 0.0024, "step": 168 }, { - "epoch": 1.5648148148148149, - "grad_norm": 0.030202018097043037, - "learning_rate": 8.680922949813178e-05, - "loss": 0.0162, + "epoch": 0.5462626262626262, + "grad_norm": 0.0070298160426318645, + "learning_rate": 2.186287192755498e-05, + "loss": 0.0019, "step": 169 }, { - "epoch": 1.574074074074074, - "grad_norm": 0.027389824390411377, - "learning_rate": 8.658972024843062e-05, - "loss": 0.0184, - "step": 170 - }, - { - "epoch": 1.574074074074074, - "eval_loss": 0.018272995948791504, - "eval_runtime": 9.1448, - "eval_samples_per_second": 5.468, - "eval_steps_per_second": 1.422, + "epoch": 0.5494949494949495, + "grad_norm": 0.008382032625377178, + "learning_rate": 2.1992238033635187e-05, + "loss": 0.0017, "step": 170 }, { - "epoch": 1.5833333333333335, - "grad_norm": 0.025648167356848717, - "learning_rate": 8.636868207865244e-05, - "loss": 0.0152, + "epoch": 0.5527272727272727, + "grad_norm": 0.007320054341107607, + "learning_rate": 2.2121604139715397e-05, + "loss": 0.0017, "step": 171 }, { - "epoch": 1.5925925925925926, - "grad_norm": 0.02472120150923729, - "learning_rate": 8.614612422498964e-05, - "loss": 0.0153, + "epoch": 0.555959595959596, + "grad_norm": 0.008367806673049927, + "learning_rate": 2.22509702457956e-05, + "loss": 0.0036, "step": 172 }, { - "epoch": 1.6018518518518519, - "grad_norm": 0.020042769610881805, - "learning_rate": 8.592205598713539e-05, - "loss": 0.017, + "epoch": 0.5591919191919192, + "grad_norm": 0.008995486423373222, + "learning_rate": 2.238033635187581e-05, + "loss": 0.0019, "step": 173 }, { - "epoch": 1.6111111111111112, - "grad_norm": 0.029423648491501808, - "learning_rate": 8.569648672789497e-05, - "loss": 0.0158, + "epoch": 0.5624242424242424, + "grad_norm": 0.007300530560314655, + "learning_rate": 2.2509702457956018e-05, + "loss": 0.0036, "step": 174 }, { - "epoch": 1.6203703703703702, - "grad_norm": 0.02159775421023369, - "learning_rate": 8.546942587279465e-05, - "loss": 0.0165, + "epoch": 0.5656565656565656, + "grad_norm": 0.005388753954321146, + "learning_rate": 2.263906856403622e-05, + "loss": 0.0016, "step": 175 }, { - "epoch": 1.6203703703703702, - "eval_loss": 0.018273252993822098, - "eval_runtime": 9.118, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 0.5656565656565656, + "eval_loss": 0.0016459682956337929, + "eval_runtime": 18.6118, + "eval_samples_per_second": 5.373, + "eval_steps_per_second": 1.343, "step": 175 }, { - "epoch": 1.6296296296296298, - "grad_norm": 0.024837305769324303, - "learning_rate": 8.524088290968781e-05, - "loss": 0.0187, + "epoch": 0.5688888888888889, + "grad_norm": 0.005545719526708126, + "learning_rate": 2.276843467011643e-05, + "loss": 0.0016, "step": 176 }, { - "epoch": 1.6388888888888888, - "grad_norm": 0.02383432537317276, - "learning_rate": 8.501086738835843e-05, - "loss": 0.0181, + "epoch": 0.5721212121212121, + "grad_norm": 0.007394594140350819, + "learning_rate": 2.289780077619664e-05, + "loss": 0.0021, "step": 177 }, { - "epoch": 1.6481481481481481, - "grad_norm": 0.025743911042809486, - "learning_rate": 8.47793889201221e-05, - "loss": 0.0171, + "epoch": 0.5753535353535354, + "grad_norm": 0.005089296959340572, + "learning_rate": 2.3027166882276842e-05, + "loss": 0.0013, "step": 178 }, { - "epoch": 1.6574074074074074, - "grad_norm": 0.023100929334759712, - "learning_rate": 8.45464571774244e-05, - "loss": 0.021, + "epoch": 0.5785858585858585, + "grad_norm": 0.00883107353001833, + "learning_rate": 2.3156532988357052e-05, + "loss": 0.0022, "step": 179 }, { - "epoch": 1.6666666666666665, - "grad_norm": 0.02667200192809105, - "learning_rate": 8.43120818934367e-05, - "loss": 0.0173, + "epoch": 0.5818181818181818, + "grad_norm": 0.012213426642119884, + "learning_rate": 2.328589909443726e-05, + "loss": 0.005, "step": 180 }, { - "epoch": 1.6666666666666665, - "eval_loss": 0.01778573729097843, - "eval_runtime": 9.1324, - "eval_samples_per_second": 5.475, - "eval_steps_per_second": 1.424, - "step": 180 - }, - { - "epoch": 1.675925925925926, - "grad_norm": 0.02880384773015976, - "learning_rate": 8.407627286164948e-05, - "loss": 0.015, + "epoch": 0.585050505050505, + "grad_norm": 0.007255645003169775, + "learning_rate": 2.3415265200517466e-05, + "loss": 0.001, "step": 181 }, { - "epoch": 1.6851851851851851, - "grad_norm": 0.030301645398139954, - "learning_rate": 8.383903993546311e-05, - "loss": 0.0157, + "epoch": 0.5882828282828283, + "grad_norm": 0.006818380672484636, + "learning_rate": 2.3544631306597673e-05, + "loss": 0.0015, "step": 182 }, { - "epoch": 1.6944444444444444, - "grad_norm": 0.021445374935865402, - "learning_rate": 8.360039302777612e-05, - "loss": 0.0181, + "epoch": 0.5915151515151515, + "grad_norm": 0.006983195431530476, + "learning_rate": 2.367399741267788e-05, + "loss": 0.0027, "step": 183 }, { - "epoch": 1.7037037037037037, - "grad_norm": 0.023577649146318436, - "learning_rate": 8.336034211057098e-05, - "loss": 0.0153, + "epoch": 0.5947474747474748, + "grad_norm": 0.012661872431635857, + "learning_rate": 2.3803363518758087e-05, + "loss": 0.0053, "step": 184 }, { - "epoch": 1.7129629629629628, - "grad_norm": 0.02492811530828476, - "learning_rate": 8.31188972144974e-05, - "loss": 0.0131, - "step": 185 - }, - { - "epoch": 1.7129629629629628, - "eval_loss": 0.017187727615237236, - "eval_runtime": 9.1252, - "eval_samples_per_second": 5.479, - "eval_steps_per_second": 1.425, + "epoch": 0.597979797979798, + "grad_norm": 0.005046526901423931, + "learning_rate": 2.3932729624838294e-05, + "loss": 0.0019, "step": 185 }, { - "epoch": 1.7222222222222223, - "grad_norm": 0.023155970498919487, - "learning_rate": 8.28760684284532e-05, - "loss": 0.0162, + "epoch": 0.6012121212121212, + "grad_norm": 0.004457033704966307, + "learning_rate": 2.40620957309185e-05, + "loss": 0.0012, "step": 186 }, { - "epoch": 1.7314814814814814, - "grad_norm": 0.02491271123290062, - "learning_rate": 8.263186589916273e-05, - "loss": 0.0137, + "epoch": 0.6044444444444445, + "grad_norm": 0.00732004176825285, + "learning_rate": 2.4191461836998707e-05, + "loss": 0.0017, "step": 187 }, { - "epoch": 1.7407407407407407, - "grad_norm": 0.02165275253355503, - "learning_rate": 8.238629983075294e-05, - "loss": 0.0143, + "epoch": 0.6076767676767677, + "grad_norm": 0.006394708063453436, + "learning_rate": 2.4320827943078914e-05, + "loss": 0.0019, "step": 188 }, { - "epoch": 1.75, - "grad_norm": 0.024284129962325096, - "learning_rate": 8.213938048432697e-05, - "loss": 0.0144, + "epoch": 0.610909090909091, + "grad_norm": 0.007961345836520195, + "learning_rate": 2.445019404915912e-05, + "loss": 0.0034, "step": 189 }, { - "epoch": 1.7592592592592593, - "grad_norm": 0.027395077049732208, - "learning_rate": 8.18911181775353e-05, - "loss": 0.0132, - "step": 190 - }, - { - "epoch": 1.7592592592592593, - "eval_loss": 0.018012873828411102, - "eval_runtime": 9.1149, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 0.6141414141414141, + "grad_norm": 0.0068709347397089005, + "learning_rate": 2.4579560155239328e-05, + "loss": 0.0013, "step": 190 }, { - "epoch": 1.7685185185185186, - "grad_norm": 0.02639261819422245, - "learning_rate": 8.164152328414476e-05, - "loss": 0.0156, + "epoch": 0.6173737373737374, + "grad_norm": 0.009581703692674637, + "learning_rate": 2.4708926261319535e-05, + "loss": 0.0026, "step": 191 }, { - "epoch": 1.7777777777777777, - "grad_norm": 0.02319464646279812, - "learning_rate": 8.139060623360493e-05, - "loss": 0.0121, + "epoch": 0.6206060606060606, + "grad_norm": 0.007400548551231623, + "learning_rate": 2.4838292367399742e-05, + "loss": 0.0024, "step": 192 }, { - "epoch": 1.7870370370370372, - "grad_norm": 0.020444169640541077, - "learning_rate": 8.113837751061246e-05, - "loss": 0.0156, + "epoch": 0.6238383838383839, + "grad_norm": 0.00970857497304678, + "learning_rate": 2.496765847347995e-05, + "loss": 0.0013, "step": 193 }, { - "epoch": 1.7962962962962963, - "grad_norm": 0.03843529522418976, - "learning_rate": 8.088484765467286e-05, - "loss": 0.0202, + "epoch": 0.6270707070707071, + "grad_norm": 0.007368095684796572, + "learning_rate": 2.509702457956016e-05, + "loss": 0.0016, "step": 194 }, { - "epoch": 1.8055555555555556, - "grad_norm": 0.03014414757490158, - "learning_rate": 8.063002725966015e-05, - "loss": 0.0157, - "step": 195 - }, - { - "epoch": 1.8055555555555556, - "eval_loss": 0.018071575090289116, - "eval_runtime": 9.1428, - "eval_samples_per_second": 5.469, - "eval_steps_per_second": 1.422, + "epoch": 0.6303030303030303, + "grad_norm": 0.007306639105081558, + "learning_rate": 2.5226390685640362e-05, + "loss": 0.0015, "step": 195 }, { - "epoch": 1.8148148148148149, - "grad_norm": 0.028225911781191826, - "learning_rate": 8.037392697337418e-05, - "loss": 0.0152, + "epoch": 0.6335353535353535, + "grad_norm": 0.008257709443569183, + "learning_rate": 2.535575679172057e-05, + "loss": 0.0017, "step": 196 }, { - "epoch": 1.824074074074074, - "grad_norm": 0.022350864484906197, - "learning_rate": 8.011655749709575e-05, - "loss": 0.0147, + "epoch": 0.6367676767676768, + "grad_norm": 0.0050062090158462524, + "learning_rate": 2.548512289780078e-05, + "loss": 0.0012, "step": 197 }, { - "epoch": 1.8333333333333335, - "grad_norm": 0.023073699325323105, - "learning_rate": 7.985792958513931e-05, - "loss": 0.0142, + "epoch": 0.64, + "grad_norm": 0.007328143808990717, + "learning_rate": 2.5614489003880986e-05, + "loss": 0.0025, "step": 198 }, { - "epoch": 1.8425925925925926, - "grad_norm": 0.027160046622157097, - "learning_rate": 7.95980540444038e-05, - "loss": 0.0181, + "epoch": 0.6432323232323233, + "grad_norm": 0.007282217964529991, + "learning_rate": 2.574385510996119e-05, + "loss": 0.0018, "step": 199 }, { - "epoch": 1.8518518518518519, - "grad_norm": 0.02501911297440529, - "learning_rate": 7.93369417339209e-05, - "loss": 0.0154, + "epoch": 0.6464646464646465, + "grad_norm": 0.007711055688560009, + "learning_rate": 2.58732212160414e-05, + "loss": 0.0013, "step": 200 }, { - "epoch": 1.8518518518518519, - "eval_loss": 0.01711750030517578, - "eval_runtime": 9.1469, - "eval_samples_per_second": 5.466, - "eval_steps_per_second": 1.421, + "epoch": 0.6464646464646465, + "eval_loss": 0.0015150802209973335, + "eval_runtime": 18.6339, + "eval_samples_per_second": 5.367, + "eval_steps_per_second": 1.342, "step": 200 }, { - "epoch": 1.8611111111111112, - "grad_norm": 0.02209513448178768, - "learning_rate": 7.907460356440133e-05, - "loss": 0.0156, + "epoch": 0.6496969696969697, + "grad_norm": 0.006153900176286697, + "learning_rate": 2.6002587322121607e-05, + "loss": 0.0016, "step": 201 }, { - "epoch": 1.8703703703703702, - "grad_norm": 0.022372853010892868, - "learning_rate": 7.881105049777901e-05, - "loss": 0.0182, + "epoch": 0.6529292929292929, + "grad_norm": 0.0070861754938960075, + "learning_rate": 2.613195342820181e-05, + "loss": 0.002, "step": 202 }, { - "epoch": 1.8796296296296298, - "grad_norm": 0.02874351665377617, - "learning_rate": 7.854629354675291e-05, - "loss": 0.0145, + "epoch": 0.6561616161616162, + "grad_norm": 0.0048012156039476395, + "learning_rate": 2.626131953428202e-05, + "loss": 0.0012, "step": 203 }, { - "epoch": 1.8888888888888888, - "grad_norm": 0.025754928588867188, - "learning_rate": 7.828034377432693e-05, - "loss": 0.0161, + "epoch": 0.6593939393939394, + "grad_norm": 0.006167956627905369, + "learning_rate": 2.6390685640362228e-05, + "loss": 0.0018, "step": 204 }, { - "epoch": 1.8981481481481481, - "grad_norm": 0.023868247866630554, - "learning_rate": 7.801321229334764e-05, - "loss": 0.0139, + "epoch": 0.6626262626262627, + "grad_norm": 0.0051431735046207905, + "learning_rate": 2.652005174644243e-05, + "loss": 0.0013, "step": 205 }, { - "epoch": 1.8981481481481481, - "eval_loss": 0.01687374897301197, - "eval_runtime": 9.1148, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, - "step": 205 - }, - { - "epoch": 1.9074074074074074, - "grad_norm": 0.02167942002415657, - "learning_rate": 7.774491026603985e-05, - "loss": 0.0172, + "epoch": 0.6658585858585858, + "grad_norm": 0.009899413213133812, + "learning_rate": 2.664941785252264e-05, + "loss": 0.0031, "step": 206 }, { - "epoch": 1.9166666666666665, - "grad_norm": 0.028955647721886635, - "learning_rate": 7.74754489035403e-05, - "loss": 0.0182, + "epoch": 0.6690909090909091, + "grad_norm": 0.007686274591833353, + "learning_rate": 2.677878395860285e-05, + "loss": 0.0022, "step": 207 }, { - "epoch": 1.925925925925926, - "grad_norm": 0.023490311577916145, - "learning_rate": 7.720483946542914e-05, - "loss": 0.0176, + "epoch": 0.6723232323232323, + "grad_norm": 0.004816455766558647, + "learning_rate": 2.6908150064683052e-05, + "loss": 0.0011, "step": 208 }, { - "epoch": 1.9351851851851851, - "grad_norm": 0.02635806053876877, - "learning_rate": 7.69330932592594e-05, - "loss": 0.0149, + "epoch": 0.6755555555555556, + "grad_norm": 0.004873145837336779, + "learning_rate": 2.7037516170763262e-05, + "loss": 0.0011, "step": 209 }, { - "epoch": 1.9444444444444444, - "grad_norm": 0.02554040215909481, - "learning_rate": 7.666022164008457e-05, - "loss": 0.0169, - "step": 210 - }, - { - "epoch": 1.9444444444444444, - "eval_loss": 0.016974864527583122, - "eval_runtime": 9.1008, - "eval_samples_per_second": 5.494, - "eval_steps_per_second": 1.428, + "epoch": 0.6787878787878788, + "grad_norm": 0.006136384792625904, + "learning_rate": 2.716688227684347e-05, + "loss": 0.0016, "step": 210 }, { - "epoch": 1.9537037037037037, - "grad_norm": 0.02924305759370327, - "learning_rate": 7.63862360099841e-05, - "loss": 0.0148, + "epoch": 0.682020202020202, + "grad_norm": 0.004508585669100285, + "learning_rate": 2.7296248382923673e-05, + "loss": 0.0012, "step": 211 }, { - "epoch": 1.9629629629629628, - "grad_norm": 0.020948631688952446, - "learning_rate": 7.611114781758692e-05, - "loss": 0.0158, + "epoch": 0.6852525252525252, + "grad_norm": 0.005723617039620876, + "learning_rate": 2.7425614489003883e-05, + "loss": 0.0016, "step": 212 }, { - "epoch": 1.9722222222222223, - "grad_norm": 0.021703558042645454, - "learning_rate": 7.583496855759316e-05, - "loss": 0.0172, + "epoch": 0.6884848484848485, + "grad_norm": 0.0072562843561172485, + "learning_rate": 2.755498059508409e-05, + "loss": 0.0015, "step": 213 }, { - "epoch": 1.9814814814814814, - "grad_norm": 0.022922605276107788, - "learning_rate": 7.555770977029367e-05, - "loss": 0.0149, + "epoch": 0.6917171717171717, + "grad_norm": 0.006079181097447872, + "learning_rate": 2.76843467011643e-05, + "loss": 0.0017, "step": 214 }, { - "epoch": 1.9907407407407407, - "grad_norm": 0.025769095867872238, - "learning_rate": 7.527938304108795e-05, - "loss": 0.0158, - "step": 215 - }, - { - "epoch": 1.9907407407407407, - "eval_loss": 0.017042405903339386, - "eval_runtime": 9.1168, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 0.694949494949495, + "grad_norm": 0.011977693997323513, + "learning_rate": 2.7813712807244503e-05, + "loss": 0.0048, "step": 215 }, { - "epoch": 2.0, - "grad_norm": 0.03371057286858559, - "learning_rate": 7.500000000000001e-05, - "loss": 0.0126, + "epoch": 0.6981818181818182, + "grad_norm": 0.007406435441225767, + "learning_rate": 2.794307891332471e-05, + "loss": 0.0022, "step": 216 }, { - "epoch": 2.009259259259259, - "grad_norm": 0.01711084321141243, - "learning_rate": 7.471957232119234e-05, - "loss": 0.0142, + "epoch": 0.7014141414141414, + "grad_norm": 0.006909268908202648, + "learning_rate": 2.807244501940492e-05, + "loss": 0.0022, "step": 217 }, { - "epoch": 2.0185185185185186, - "grad_norm": 0.023618614301085472, - "learning_rate": 7.443811172247821e-05, - "loss": 0.0151, + "epoch": 0.7046464646464646, + "grad_norm": 0.0051758866757154465, + "learning_rate": 2.8201811125485124e-05, + "loss": 0.0012, "step": 218 }, { - "epoch": 2.0277777777777777, - "grad_norm": 0.02181304432451725, - "learning_rate": 7.415562996483192e-05, - "loss": 0.0132, + "epoch": 0.7078787878787879, + "grad_norm": 0.00657995231449604, + "learning_rate": 2.833117723156533e-05, + "loss": 0.0014, "step": 219 }, { - "epoch": 2.037037037037037, - "grad_norm": 0.020521776750683784, - "learning_rate": 7.387213885189746e-05, - "loss": 0.0139, - "step": 220 - }, - { - "epoch": 2.037037037037037, - "eval_loss": 0.01702064275741577, - "eval_runtime": 9.1369, - "eval_samples_per_second": 5.472, - "eval_steps_per_second": 1.423, + "epoch": 0.7111111111111111, + "grad_norm": 0.006561241112649441, + "learning_rate": 2.846054333764554e-05, + "loss": 0.0018, "step": 220 }, { - "epoch": 2.0462962962962963, - "grad_norm": 0.022209780290722847, - "learning_rate": 7.358765022949519e-05, - "loss": 0.0152, + "epoch": 0.7143434343434344, + "grad_norm": 0.005906842183321714, + "learning_rate": 2.8589909443725745e-05, + "loss": 0.0013, "step": 221 }, { - "epoch": 2.0555555555555554, - "grad_norm": 0.02240665629506111, - "learning_rate": 7.330217598512695e-05, - "loss": 0.0136, + "epoch": 0.7175757575757575, + "grad_norm": 0.004000976216048002, + "learning_rate": 2.871927554980595e-05, + "loss": 0.0009, "step": 222 }, { - "epoch": 2.064814814814815, - "grad_norm": 0.024021176621317863, - "learning_rate": 7.30157280474793e-05, - "loss": 0.0134, + "epoch": 0.7208080808080808, + "grad_norm": 0.006313610821962357, + "learning_rate": 2.8848641655886162e-05, + "loss": 0.0015, "step": 223 }, { - "epoch": 2.074074074074074, - "grad_norm": 0.022297382354736328, - "learning_rate": 7.272831838592503e-05, - "loss": 0.0158, + "epoch": 0.724040404040404, + "grad_norm": 0.005339731462299824, + "learning_rate": 2.8978007761966365e-05, + "loss": 0.0014, "step": 224 }, { - "epoch": 2.0833333333333335, - "grad_norm": 0.023189576342701912, - "learning_rate": 7.243995901002312e-05, - "loss": 0.0146, + "epoch": 0.7272727272727273, + "grad_norm": 0.00704893097281456, + "learning_rate": 2.9107373868046572e-05, + "loss": 0.0017, "step": 225 }, { - "epoch": 2.0833333333333335, - "eval_loss": 0.017011733725667, - "eval_runtime": 9.1385, - "eval_samples_per_second": 5.471, - "eval_steps_per_second": 1.423, + "epoch": 0.7272727272727273, + "eval_loss": 0.0014526441227644682, + "eval_runtime": 18.5916, + "eval_samples_per_second": 5.379, + "eval_steps_per_second": 1.345, "step": 225 }, { - "epoch": 2.0925925925925926, - "grad_norm": 0.02641259878873825, - "learning_rate": 7.215066196901676e-05, - "loss": 0.0149, + "epoch": 0.7305050505050505, + "grad_norm": 0.007863117381930351, + "learning_rate": 2.9236739974126783e-05, + "loss": 0.002, "step": 226 }, { - "epoch": 2.1018518518518516, - "grad_norm": 0.02105395309627056, - "learning_rate": 7.186043935133005e-05, - "loss": 0.0105, + "epoch": 0.7337373737373737, + "grad_norm": 0.007929647341370583, + "learning_rate": 2.936610608020699e-05, + "loss": 0.0014, "step": 227 }, { - "epoch": 2.111111111111111, - "grad_norm": 0.020818866789340973, - "learning_rate": 7.156930328406268e-05, - "loss": 0.0144, + "epoch": 0.7369696969696969, + "grad_norm": 0.008834928274154663, + "learning_rate": 2.9495472186287193e-05, + "loss": 0.0033, "step": 228 }, { - "epoch": 2.1203703703703702, - "grad_norm": 0.028699271380901337, - "learning_rate": 7.127726593248337e-05, - "loss": 0.0134, + "epoch": 0.7402020202020202, + "grad_norm": 0.005479221232235432, + "learning_rate": 2.9624838292367403e-05, + "loss": 0.0013, "step": 229 }, { - "epoch": 2.1296296296296298, - "grad_norm": 0.025844816118478775, - "learning_rate": 7.098433949952146e-05, - "loss": 0.0115, - "step": 230 - }, - { - "epoch": 2.1296296296296298, - "eval_loss": 0.017404422163963318, - "eval_runtime": 9.1138, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 0.7434343434343434, + "grad_norm": 0.008690214715898037, + "learning_rate": 2.975420439844761e-05, + "loss": 0.0031, "step": 230 }, { - "epoch": 2.138888888888889, - "grad_norm": 0.02628181129693985, - "learning_rate": 7.069053622525696e-05, - "loss": 0.0135, + "epoch": 0.7466666666666667, + "grad_norm": 0.006373909767717123, + "learning_rate": 2.9883570504527814e-05, + "loss": 0.0013, "step": 231 }, { - "epoch": 2.148148148148148, - "grad_norm": 0.03826741501688957, - "learning_rate": 7.039586838640919e-05, - "loss": 0.013, + "epoch": 0.74989898989899, + "grad_norm": 0.005924658849835396, + "learning_rate": 3.0012936610608024e-05, + "loss": 0.0014, "step": 232 }, { - "epoch": 2.1574074074074074, - "grad_norm": 0.02549687772989273, - "learning_rate": 7.01003482958237e-05, - "loss": 0.0112, + "epoch": 0.7531313131313131, + "grad_norm": 0.007938825525343418, + "learning_rate": 3.014230271668823e-05, + "loss": 0.0027, "step": 233 }, { - "epoch": 2.1666666666666665, - "grad_norm": 0.02850032038986683, - "learning_rate": 6.980398830195785e-05, - "loss": 0.0114, + "epoch": 0.7563636363636363, + "grad_norm": 0.007831827737390995, + "learning_rate": 3.0271668822768434e-05, + "loss": 0.0025, "step": 234 }, { - "epoch": 2.175925925925926, - "grad_norm": 0.028789905831217766, - "learning_rate": 6.950680078836474e-05, - "loss": 0.0138, - "step": 235 - }, - { - "epoch": 2.175925925925926, - "eval_loss": 0.016838619485497475, - "eval_runtime": 9.1141, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 0.7595959595959596, + "grad_norm": 0.008570835925638676, + "learning_rate": 3.0401034928848644e-05, + "loss": 0.003, "step": 235 }, { - "epoch": 2.185185185185185, - "grad_norm": 0.024276968091726303, - "learning_rate": 6.920879817317589e-05, - "loss": 0.0156, + "epoch": 0.7628282828282829, + "grad_norm": 0.005871222820132971, + "learning_rate": 3.053040103492885e-05, + "loss": 0.0013, "step": 236 }, { - "epoch": 2.1944444444444446, - "grad_norm": 0.02652347832918167, - "learning_rate": 6.890999290858214e-05, - "loss": 0.0111, + "epoch": 0.7660606060606061, + "grad_norm": 0.007041990291327238, + "learning_rate": 3.0659767141009055e-05, + "loss": 0.0028, "step": 237 }, { - "epoch": 2.2037037037037037, - "grad_norm": 0.03363705053925514, - "learning_rate": 6.861039748031351e-05, - "loss": 0.0155, + "epoch": 0.7692929292929293, + "grad_norm": 0.0073891859501600266, + "learning_rate": 3.0789133247089265e-05, + "loss": 0.0023, "step": 238 }, { - "epoch": 2.212962962962963, - "grad_norm": 0.025364842265844345, - "learning_rate": 6.83100244071174e-05, - "loss": 0.0127, + "epoch": 0.7725252525252525, + "grad_norm": 0.005527487024664879, + "learning_rate": 3.0918499353169475e-05, + "loss": 0.0014, "step": 239 }, { - "epoch": 2.2222222222222223, - "grad_norm": 0.024912815541028976, - "learning_rate": 6.800888624023553e-05, - "loss": 0.0138, - "step": 240 - }, - { - "epoch": 2.2222222222222223, - "eval_loss": 0.017057882621884346, - "eval_runtime": 9.1505, - "eval_samples_per_second": 5.464, - "eval_steps_per_second": 1.421, + "epoch": 0.7757575757575758, + "grad_norm": 0.007551091257482767, + "learning_rate": 3.104786545924967e-05, + "loss": 0.002, "step": 240 }, { - "epoch": 2.2314814814814814, - "grad_norm": 0.031296826899051666, - "learning_rate": 6.770699556287939e-05, - "loss": 0.0138, + "epoch": 0.778989898989899, + "grad_norm": 0.005599314346909523, + "learning_rate": 3.117723156532988e-05, + "loss": 0.002, "step": 241 }, { - "epoch": 2.240740740740741, - "grad_norm": 0.03207860141992569, - "learning_rate": 6.740436498970452e-05, - "loss": 0.0128, + "epoch": 0.7822222222222223, + "grad_norm": 0.007462210953235626, + "learning_rate": 3.130659767141009e-05, + "loss": 0.0019, "step": 242 }, { - "epoch": 2.25, - "grad_norm": 0.027626443654298782, - "learning_rate": 6.710100716628344e-05, - "loss": 0.0142, + "epoch": 0.7854545454545454, + "grad_norm": 0.004866201896220446, + "learning_rate": 3.14359637774903e-05, + "loss": 0.0012, "step": 243 }, { - "epoch": 2.259259259259259, - "grad_norm": 0.025963863357901573, - "learning_rate": 6.679693476857711e-05, - "loss": 0.0137, + "epoch": 0.7886868686868687, + "grad_norm": 0.005268635228276253, + "learning_rate": 3.1565329883570506e-05, + "loss": 0.0013, "step": 244 }, { - "epoch": 2.2685185185185186, - "grad_norm": 0.022552739828824997, - "learning_rate": 6.649216050240539e-05, - "loss": 0.0134, - "step": 245 - }, - { - "epoch": 2.2685185185185186, - "eval_loss": 0.016679909080266953, - "eval_runtime": 9.1095, - "eval_samples_per_second": 5.489, - "eval_steps_per_second": 1.427, + "epoch": 0.7919191919191919, + "grad_norm": 0.007261104416102171, + "learning_rate": 3.169469598965072e-05, + "loss": 0.0031, "step": 245 }, { - "epoch": 2.2777777777777777, - "grad_norm": 0.0247825738042593, - "learning_rate": 6.618669710291606e-05, - "loss": 0.0116, + "epoch": 0.7951515151515152, + "grad_norm": 0.006308354903012514, + "learning_rate": 3.182406209573092e-05, + "loss": 0.0014, "step": 246 }, { - "epoch": 2.287037037037037, - "grad_norm": 0.021808508783578873, - "learning_rate": 6.588055733405266e-05, - "loss": 0.014, + "epoch": 0.7983838383838384, + "grad_norm": 0.0074616689234972, + "learning_rate": 3.1953428201811124e-05, + "loss": 0.0015, "step": 247 }, { - "epoch": 2.2962962962962963, - "grad_norm": 0.025087367743253708, - "learning_rate": 6.557375398802123e-05, - "loss": 0.0167, + "epoch": 0.8016161616161617, + "grad_norm": 0.007333788555115461, + "learning_rate": 3.2082794307891334e-05, + "loss": 0.0014, "step": 248 }, { - "epoch": 2.3055555555555554, - "grad_norm": 0.022722622379660606, - "learning_rate": 6.526629988475567e-05, - "loss": 0.013, + "epoch": 0.8048484848484848, + "grad_norm": 0.006965926848351955, + "learning_rate": 3.2212160413971544e-05, + "loss": 0.0018, "step": 249 }, { - "epoch": 2.314814814814815, - "grad_norm": 0.023495636880397797, - "learning_rate": 6.495820787138209e-05, - "loss": 0.0167, + "epoch": 0.8080808080808081, + "grad_norm": 0.00535194855183363, + "learning_rate": 3.234152652005175e-05, + "loss": 0.0015, "step": 250 }, { - "epoch": 2.314814814814815, - "eval_loss": 0.016377143561840057, - "eval_runtime": 9.1133, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 0.8080808080808081, + "eval_loss": 0.0013973835157230496, + "eval_runtime": 18.6113, + "eval_samples_per_second": 5.373, + "eval_steps_per_second": 1.343, "step": 250 }, { - "epoch": 2.324074074074074, - "grad_norm": 0.021211953833699226, - "learning_rate": 6.464949082168204e-05, - "loss": 0.0125, + "epoch": 0.8113131313131313, + "grad_norm": 0.006415734998881817, + "learning_rate": 3.247089262613196e-05, + "loss": 0.0014, "step": 251 }, { - "epoch": 2.3333333333333335, - "grad_norm": 0.022748148068785667, - "learning_rate": 6.434016163555452e-05, - "loss": 0.0121, + "epoch": 0.8145454545454546, + "grad_norm": 0.00617300346493721, + "learning_rate": 3.260025873221216e-05, + "loss": 0.0017, "step": 252 }, { - "epoch": 2.3425925925925926, - "grad_norm": 0.021960506215691566, - "learning_rate": 6.403023323847695e-05, - "loss": 0.0159, + "epoch": 0.8177777777777778, + "grad_norm": 0.007527848239988089, + "learning_rate": 3.2729624838292365e-05, + "loss": 0.0017, "step": 253 }, { - "epoch": 2.351851851851852, - "grad_norm": 0.02572719193994999, - "learning_rate": 6.371971858096508e-05, - "loss": 0.0137, + "epoch": 0.821010101010101, + "grad_norm": 0.00747877499088645, + "learning_rate": 3.2858990944372575e-05, + "loss": 0.0014, "step": 254 }, { - "epoch": 2.361111111111111, - "grad_norm": 0.027611717581748962, - "learning_rate": 6.340863063803188e-05, - "loss": 0.0123, - "step": 255 - }, - { - "epoch": 2.361111111111111, - "eval_loss": 0.016414109617471695, - "eval_runtime": 9.1093, - "eval_samples_per_second": 5.489, - "eval_steps_per_second": 1.427, + "epoch": 0.8242424242424242, + "grad_norm": 0.005451089236885309, + "learning_rate": 3.2988357050452786e-05, + "loss": 0.0012, "step": 255 }, { - "epoch": 2.3703703703703702, - "grad_norm": 0.026147907599806786, - "learning_rate": 6.30969824086453e-05, - "loss": 0.012, + "epoch": 0.8274747474747475, + "grad_norm": 0.006235205102711916, + "learning_rate": 3.3117723156532996e-05, + "loss": 0.0014, "step": 256 }, { - "epoch": 2.3796296296296298, - "grad_norm": 0.026667073369026184, - "learning_rate": 6.27847869151852e-05, - "loss": 0.0127, + "epoch": 0.8307070707070707, + "grad_norm": 0.005764327012002468, + "learning_rate": 3.324708926261319e-05, + "loss": 0.0013, "step": 257 }, { - "epoch": 2.388888888888889, - "grad_norm": 0.023840012028813362, - "learning_rate": 6.247205720289907e-05, - "loss": 0.0141, + "epoch": 0.833939393939394, + "grad_norm": 0.007408145349472761, + "learning_rate": 3.33764553686934e-05, + "loss": 0.0013, "step": 258 }, { - "epoch": 2.398148148148148, - "grad_norm": 0.028697500005364418, - "learning_rate": 6.215880633935708e-05, - "loss": 0.0135, + "epoch": 0.8371717171717171, + "grad_norm": 0.004855205304920673, + "learning_rate": 3.350582147477361e-05, + "loss": 0.0009, "step": 259 }, { - "epoch": 2.4074074074074074, - "grad_norm": 0.029124466702342033, - "learning_rate": 6.184504741390596e-05, - "loss": 0.0139, - "step": 260 - }, - { - "epoch": 2.4074074074074074, - "eval_loss": 0.016279693692922592, - "eval_runtime": 9.1162, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 0.8404040404040404, + "grad_norm": 0.010664681904017925, + "learning_rate": 3.3635187580853817e-05, + "loss": 0.0016, "step": 260 }, { - "epoch": 2.4166666666666665, - "grad_norm": 0.020265506580471992, - "learning_rate": 6.153079353712201e-05, - "loss": 0.0129, + "epoch": 0.8436363636363636, + "grad_norm": 0.006055675912648439, + "learning_rate": 3.376455368693403e-05, + "loss": 0.0016, "step": 261 }, { - "epoch": 2.425925925925926, - "grad_norm": 0.020486822351813316, - "learning_rate": 6.121605784026339e-05, - "loss": 0.0114, + "epoch": 0.8468686868686869, + "grad_norm": 0.01056759525090456, + "learning_rate": 3.389391979301424e-05, + "loss": 0.0016, "step": 262 }, { - "epoch": 2.435185185185185, - "grad_norm": 0.02432914823293686, - "learning_rate": 6.09008534747213e-05, - "loss": 0.0138, + "epoch": 0.8501010101010101, + "grad_norm": 0.007450331002473831, + "learning_rate": 3.4023285899094434e-05, + "loss": 0.0016, "step": 263 }, { - "epoch": 2.4444444444444446, - "grad_norm": 0.027614833787083626, - "learning_rate": 6.058519361147055e-05, - "loss": 0.0118, + "epoch": 0.8533333333333334, + "grad_norm": 0.0053154826164245605, + "learning_rate": 3.4152652005174644e-05, + "loss": 0.0011, "step": 264 }, { - "epoch": 2.4537037037037037, - "grad_norm": 0.03493235632777214, - "learning_rate": 6.02690914405191e-05, - "loss": 0.0125, - "step": 265 - }, - { - "epoch": 2.4537037037037037, - "eval_loss": 0.016143780201673508, - "eval_runtime": 9.2054, - "eval_samples_per_second": 5.432, - "eval_steps_per_second": 1.412, + "epoch": 0.8565656565656565, + "grad_norm": 0.0060371761210262775, + "learning_rate": 3.4282018111254854e-05, + "loss": 0.0013, "step": 265 }, { - "epoch": 2.462962962962963, - "grad_norm": 0.024250265210866928, - "learning_rate": 5.995256017035703e-05, - "loss": 0.0139, + "epoch": 0.8597979797979798, + "grad_norm": 0.005466979928314686, + "learning_rate": 3.441138421733506e-05, + "loss": 0.0012, "step": 266 }, { - "epoch": 2.4722222222222223, - "grad_norm": 0.022808292880654335, - "learning_rate": 5.963561302740449e-05, - "loss": 0.0162, + "epoch": 0.863030303030303, + "grad_norm": 0.00831522885710001, + "learning_rate": 3.454075032341527e-05, + "loss": 0.0014, "step": 267 }, { - "epoch": 2.4814814814814814, - "grad_norm": 0.03109206259250641, - "learning_rate": 5.9318263255459116e-05, - "loss": 0.0123, + "epoch": 0.8662626262626263, + "grad_norm": 0.007963420823216438, + "learning_rate": 3.467011642949548e-05, + "loss": 0.0025, "step": 268 }, { - "epoch": 2.490740740740741, - "grad_norm": 0.02985144406557083, - "learning_rate": 5.900052411514257e-05, - "loss": 0.015, + "epoch": 0.8694949494949495, + "grad_norm": 0.004937573801726103, + "learning_rate": 3.4799482535575675e-05, + "loss": 0.0011, "step": 269 }, { - "epoch": 2.5, - "grad_norm": 0.024866314604878426, - "learning_rate": 5.868240888334653e-05, - "loss": 0.0126, - "step": 270 - }, - { - "epoch": 2.5, - "eval_loss": 0.016046511009335518, - "eval_runtime": 9.1128, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.427, + "epoch": 0.8727272727272727, + "grad_norm": 0.0058564417995512486, + "learning_rate": 3.4928848641655885e-05, + "loss": 0.0016, "step": 270 }, { - "epoch": 2.5092592592592595, - "grad_norm": 0.0215854924172163, - "learning_rate": 5.836393085267776e-05, - "loss": 0.0133, + "epoch": 0.8759595959595959, + "grad_norm": 0.007912451401352882, + "learning_rate": 3.5058214747736096e-05, + "loss": 0.0026, "step": 271 }, { - "epoch": 2.5185185185185186, - "grad_norm": 0.02321489341557026, - "learning_rate": 5.804510333090287e-05, - "loss": 0.0175, + "epoch": 0.8791919191919192, + "grad_norm": 0.003929544240236282, + "learning_rate": 3.5187580853816306e-05, + "loss": 0.0011, "step": 272 }, { - "epoch": 2.5277777777777777, - "grad_norm": 0.024908283725380898, - "learning_rate": 5.772593964039203e-05, - "loss": 0.0116, + "epoch": 0.8824242424242424, + "grad_norm": 0.008150831796228886, + "learning_rate": 3.531694695989651e-05, + "loss": 0.0018, "step": 273 }, { - "epoch": 2.537037037037037, - "grad_norm": 0.02571980282664299, - "learning_rate": 5.740645311756245e-05, - "loss": 0.0125, + "epoch": 0.8856565656565657, + "grad_norm": 0.009232031181454659, + "learning_rate": 3.544631306597671e-05, + "loss": 0.0042, "step": 274 }, { - "epoch": 2.5462962962962963, - "grad_norm": 0.022897284477949142, - "learning_rate": 5.708665711232103e-05, - "loss": 0.0138, + "epoch": 0.8888888888888888, + "grad_norm": 0.007469587959349155, + "learning_rate": 3.557567917205692e-05, + "loss": 0.0023, "step": 275 }, { - "epoch": 2.5462962962962963, - "eval_loss": 0.016013609245419502, - "eval_runtime": 9.1743, - "eval_samples_per_second": 5.45, - "eval_steps_per_second": 1.417, + "epoch": 0.8888888888888888, + "eval_loss": 0.0013014678843319416, + "eval_runtime": 18.7103, + "eval_samples_per_second": 5.345, + "eval_steps_per_second": 1.336, "step": 275 }, { - "epoch": 2.5555555555555554, - "grad_norm": 0.023732876405119896, - "learning_rate": 5.6766564987506566e-05, - "loss": 0.0136, + "epoch": 0.8921212121212121, + "grad_norm": 0.006450500804930925, + "learning_rate": 3.570504527813713e-05, + "loss": 0.0014, "step": 276 }, { - "epoch": 2.564814814814815, - "grad_norm": 0.024980880320072174, - "learning_rate": 5.644619011833133e-05, - "loss": 0.0131, + "epoch": 0.8953535353535353, + "grad_norm": 0.004984740167856216, + "learning_rate": 3.583441138421734e-05, + "loss": 0.0014, "step": 277 }, { - "epoch": 2.574074074074074, - "grad_norm": 0.023262949660420418, - "learning_rate": 5.6125545891822274e-05, - "loss": 0.0143, + "epoch": 0.8985858585858586, + "grad_norm": 0.006635539699345827, + "learning_rate": 3.596377749029755e-05, + "loss": 0.0016, "step": 278 }, { - "epoch": 2.5833333333333335, - "grad_norm": 0.024468230083584785, - "learning_rate": 5.5804645706261514e-05, - "loss": 0.0148, + "epoch": 0.9018181818181819, + "grad_norm": 0.010307137854397297, + "learning_rate": 3.609314359637775e-05, + "loss": 0.0025, "step": 279 }, { - "epoch": 2.5925925925925926, - "grad_norm": 0.020350055769085884, - "learning_rate": 5.548350297062659e-05, - "loss": 0.0125, - "step": 280 - }, - { - "epoch": 2.5925925925925926, - "eval_loss": 0.015153205953538418, - "eval_runtime": 9.1126, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.427, + "epoch": 0.9050505050505051, + "grad_norm": 0.008318118751049042, + "learning_rate": 3.6222509702457954e-05, + "loss": 0.0028, "step": 280 }, { - "epoch": 2.601851851851852, - "grad_norm": 0.027165360748767853, - "learning_rate": 5.516213110403009e-05, - "loss": 0.0093, + "epoch": 0.9082828282828282, + "grad_norm": 0.007523260544985533, + "learning_rate": 3.6351875808538164e-05, + "loss": 0.0022, "step": 281 }, { - "epoch": 2.611111111111111, - "grad_norm": 0.021070580929517746, - "learning_rate": 5.484054353515896e-05, - "loss": 0.0138, + "epoch": 0.9115151515151515, + "grad_norm": 0.00529656745493412, + "learning_rate": 3.648124191461837e-05, + "loss": 0.001, "step": 282 }, { - "epoch": 2.6203703703703702, - "grad_norm": 0.025997430086135864, - "learning_rate": 5.451875370171341e-05, - "loss": 0.0121, + "epoch": 0.9147474747474748, + "grad_norm": 0.0050112344324588776, + "learning_rate": 3.661060802069858e-05, + "loss": 0.0012, "step": 283 }, { - "epoch": 2.6296296296296298, - "grad_norm": 0.02517426759004593, - "learning_rate": 5.419677504984534e-05, - "loss": 0.0126, + "epoch": 0.917979797979798, + "grad_norm": 0.006318412255495787, + "learning_rate": 3.673997412677879e-05, + "loss": 0.0017, "step": 284 }, { - "epoch": 2.638888888888889, - "grad_norm": 0.025812286883592606, - "learning_rate": 5.387462103359655e-05, - "loss": 0.0133, - "step": 285 - }, - { - "epoch": 2.638888888888889, - "eval_loss": 0.016152961179614067, - "eval_runtime": 9.1127, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.427, + "epoch": 0.9212121212121213, + "grad_norm": 0.006407878361642361, + "learning_rate": 3.6869340232859e-05, + "loss": 0.0022, "step": 285 }, { - "epoch": 2.648148148148148, - "grad_norm": 0.02393972873687744, - "learning_rate": 5.355230511433651e-05, - "loss": 0.0136, + "epoch": 0.9244444444444444, + "grad_norm": 0.004356731195002794, + "learning_rate": 3.6998706338939195e-05, + "loss": 0.0007, "step": 286 }, { - "epoch": 2.6574074074074074, - "grad_norm": 0.021706297993659973, - "learning_rate": 5.32298407601999e-05, - "loss": 0.0133, + "epoch": 0.9276767676767677, + "grad_norm": 0.004966625012457371, + "learning_rate": 3.7128072445019406e-05, + "loss": 0.0012, "step": 287 }, { - "epoch": 2.6666666666666665, - "grad_norm": 0.026299407705664635, - "learning_rate": 5.290724144552379e-05, - "loss": 0.0143, + "epoch": 0.9309090909090909, + "grad_norm": 0.0055012330412864685, + "learning_rate": 3.7257438551099616e-05, + "loss": 0.0011, "step": 288 }, { - "epoch": 2.675925925925926, - "grad_norm": 0.030511364340782166, - "learning_rate": 5.258452065028473e-05, - "loss": 0.0137, + "epoch": 0.9341414141414142, + "grad_norm": 0.006766812410205603, + "learning_rate": 3.738680465717982e-05, + "loss": 0.0017, "step": 289 }, { - "epoch": 2.685185185185185, - "grad_norm": 0.024854540824890137, - "learning_rate": 5.226169185953532e-05, - "loss": 0.0125, - "step": 290 - }, - { - "epoch": 2.685185185185185, - "eval_loss": 0.016076602041721344, - "eval_runtime": 9.1632, - "eval_samples_per_second": 5.457, - "eval_steps_per_second": 1.419, + "epoch": 0.9373737373737374, + "grad_norm": 0.006011553108692169, + "learning_rate": 3.751617076326003e-05, + "loss": 0.0022, "step": 290 }, { - "epoch": 2.6944444444444446, - "grad_norm": 0.022800520062446594, - "learning_rate": 5.193876856284085e-05, - "loss": 0.012, + "epoch": 0.9406060606060606, + "grad_norm": 0.0056586177088320255, + "learning_rate": 3.764553686934023e-05, + "loss": 0.0014, "step": 291 }, { - "epoch": 2.7037037037037037, - "grad_norm": 0.021870015189051628, - "learning_rate": 5.1615764253715536e-05, - "loss": 0.0136, + "epoch": 0.9438383838383838, + "grad_norm": 0.0064941453747451305, + "learning_rate": 3.777490297542044e-05, + "loss": 0.0011, "step": 292 }, { - "epoch": 2.712962962962963, - "grad_norm": 0.020156167447566986, - "learning_rate": 5.129269242905882e-05, - "loss": 0.012, + "epoch": 0.9470707070707071, + "grad_norm": 0.006994554307311773, + "learning_rate": 3.790426908150065e-05, + "loss": 0.0021, "step": 293 }, { - "epoch": 2.7222222222222223, - "grad_norm": 0.019064266234636307, - "learning_rate": 5.096956658859122e-05, - "loss": 0.0137, + "epoch": 0.9503030303030303, + "grad_norm": 0.006325817201286554, + "learning_rate": 3.803363518758086e-05, + "loss": 0.0014, "step": 294 }, { - "epoch": 2.7314814814814814, - "grad_norm": 0.027288921177387238, - "learning_rate": 5.064640023429043e-05, - "loss": 0.0147, - "step": 295 - }, - { - "epoch": 2.7314814814814814, - "eval_loss": 0.01584070920944214, - "eval_runtime": 9.1151, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 0.9535353535353536, + "grad_norm": 0.006501290015876293, + "learning_rate": 3.816300129366106e-05, + "loss": 0.0017, "step": 295 }, { - "epoch": 2.7407407407407405, - "grad_norm": 0.02484748885035515, - "learning_rate": 5.0323206869826966e-05, - "loss": 0.0111, + "epoch": 0.9567676767676768, + "grad_norm": 0.006014223676174879, + "learning_rate": 3.829236739974127e-05, + "loss": 0.0013, "step": 296 }, { - "epoch": 2.75, - "grad_norm": 0.02521962858736515, - "learning_rate": 5e-05, - "loss": 0.0134, + "epoch": 0.96, + "grad_norm": 0.005024549085646868, + "learning_rate": 3.8421733505821475e-05, + "loss": 0.0013, "step": 297 }, { - "epoch": 2.7592592592592595, - "grad_norm": 0.023346634581685066, - "learning_rate": 4.967679313017303e-05, - "loss": 0.0124, + "epoch": 0.9632323232323232, + "grad_norm": 0.00840625911951065, + "learning_rate": 3.855109961190168e-05, + "loss": 0.0017, "step": 298 }, { - "epoch": 2.7685185185185186, - "grad_norm": 0.021654650568962097, - "learning_rate": 4.9353599765709584e-05, - "loss": 0.0144, + "epoch": 0.9664646464646465, + "grad_norm": 0.005791939329355955, + "learning_rate": 3.868046571798189e-05, + "loss": 0.0014, "step": 299 }, { - "epoch": 2.7777777777777777, - "grad_norm": 0.021227596327662468, - "learning_rate": 4.903043341140879e-05, - "loss": 0.0134, + "epoch": 0.9696969696969697, + "grad_norm": 0.0047224657610058784, + "learning_rate": 3.88098318240621e-05, + "loss": 0.0012, "step": 300 }, { - "epoch": 2.7777777777777777, - "eval_loss": 0.016122175380587578, - "eval_runtime": 9.1019, - "eval_samples_per_second": 5.493, - "eval_steps_per_second": 1.428, + "epoch": 0.9696969696969697, + "eval_loss": 0.0012624365044757724, + "eval_runtime": 18.7089, + "eval_samples_per_second": 5.345, + "eval_steps_per_second": 1.336, "step": 300 }, { - "epoch": 2.787037037037037, - "grad_norm": 0.024656914174556732, - "learning_rate": 4.870730757094121e-05, - "loss": 0.0123, + "epoch": 0.972929292929293, + "grad_norm": 0.005727376788854599, + "learning_rate": 3.893919793014231e-05, + "loss": 0.0013, "step": 301 }, { - "epoch": 2.7962962962962963, - "grad_norm": 0.02583468146622181, - "learning_rate": 4.8384235746284476e-05, - "loss": 0.015, + "epoch": 0.9761616161616161, + "grad_norm": 0.007024977821856737, + "learning_rate": 3.906856403622251e-05, + "loss": 0.0018, "step": 302 }, { - "epoch": 2.8055555555555554, - "grad_norm": 0.022909915074706078, - "learning_rate": 4.806123143715916e-05, - "loss": 0.0142, + "epoch": 0.9793939393939394, + "grad_norm": 0.004745373502373695, + "learning_rate": 3.9197930142302716e-05, + "loss": 0.001, "step": 303 }, { - "epoch": 2.814814814814815, - "grad_norm": 0.02014041878283024, - "learning_rate": 4.7738308140464685e-05, - "loss": 0.0131, + "epoch": 0.9826262626262626, + "grad_norm": 0.0050310962833464146, + "learning_rate": 3.9327296248382926e-05, + "loss": 0.0013, "step": 304 }, { - "epoch": 2.824074074074074, - "grad_norm": 0.022683143615722656, - "learning_rate": 4.7415479349715275e-05, - "loss": 0.0124, - "step": 305 - }, - { - "epoch": 2.824074074074074, - "eval_loss": 0.015797268599271774, - "eval_runtime": 9.1281, - "eval_samples_per_second": 5.478, - "eval_steps_per_second": 1.424, + "epoch": 0.9858585858585859, + "grad_norm": 0.008561142720282078, + "learning_rate": 3.945666235446313e-05, + "loss": 0.0018, "step": 305 }, { - "epoch": 2.8333333333333335, - "grad_norm": 0.025906002148985863, - "learning_rate": 4.709275855447621e-05, - "loss": 0.0154, + "epoch": 0.9890909090909091, + "grad_norm": 0.0055971029214560986, + "learning_rate": 3.958602846054334e-05, + "loss": 0.0013, "step": 306 }, { - "epoch": 2.8425925925925926, - "grad_norm": 0.027820315212011337, - "learning_rate": 4.677015923980011e-05, - "loss": 0.0138, + "epoch": 0.9923232323232323, + "grad_norm": 0.007602418772876263, + "learning_rate": 3.971539456662355e-05, + "loss": 0.0014, "step": 307 }, { - "epoch": 2.851851851851852, - "grad_norm": 0.023744860664010048, - "learning_rate": 4.6447694885663514e-05, - "loss": 0.0124, + "epoch": 0.9955555555555555, + "grad_norm": 0.0054479725658893585, + "learning_rate": 3.9844760672703754e-05, + "loss": 0.0014, "step": 308 }, { - "epoch": 2.861111111111111, - "grad_norm": 0.026518192142248154, - "learning_rate": 4.612537896640346e-05, - "loss": 0.0155, + "epoch": 0.9987878787878788, + "grad_norm": 0.006376450881361961, + "learning_rate": 3.997412677878396e-05, + "loss": 0.0013, "step": 309 }, { - "epoch": 2.8703703703703702, - "grad_norm": 0.020426657050848007, - "learning_rate": 4.5803224950154656e-05, - "loss": 0.0132, - "step": 310 - }, - { - "epoch": 2.8703703703703702, - "eval_loss": 0.015400240197777748, - "eval_runtime": 9.1185, - "eval_samples_per_second": 5.483, - "eval_steps_per_second": 1.426, + "epoch": 1.002020202020202, + "grad_norm": 0.00963524729013443, + "learning_rate": 4.010349288486417e-05, + "loss": 0.0025, "step": 310 }, { - "epoch": 2.8796296296296298, - "grad_norm": 0.022766800597310066, - "learning_rate": 4.54812462982866e-05, - "loss": 0.0139, + "epoch": 1.0052525252525253, + "grad_norm": 0.004950121510773897, + "learning_rate": 4.023285899094437e-05, + "loss": 0.0009, "step": 311 }, { - "epoch": 2.888888888888889, - "grad_norm": 0.021728193387389183, - "learning_rate": 4.515945646484105e-05, - "loss": 0.0133, + "epoch": 1.0084848484848485, + "grad_norm": 0.0047010756097733974, + "learning_rate": 4.036222509702458e-05, + "loss": 0.0012, "step": 312 }, { - "epoch": 2.898148148148148, - "grad_norm": 0.0226016603410244, - "learning_rate": 4.4837868895969936e-05, - "loss": 0.0126, + "epoch": 1.0117171717171718, + "grad_norm": 0.004061982501298189, + "learning_rate": 4.049159120310479e-05, + "loss": 0.0008, "step": 313 }, { - "epoch": 2.9074074074074074, - "grad_norm": 0.027723975479602814, - "learning_rate": 4.451649702937342e-05, - "loss": 0.0106, + "epoch": 1.014949494949495, + "grad_norm": 0.006061443593353033, + "learning_rate": 4.0620957309184995e-05, + "loss": 0.0022, "step": 314 }, { - "epoch": 2.9166666666666665, - "grad_norm": 0.01856391504406929, - "learning_rate": 4.4195354293738484e-05, - "loss": 0.0146, - "step": 315 - }, - { - "epoch": 2.9166666666666665, - "eval_loss": 0.015166966244578362, - "eval_runtime": 9.1172, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 1.018181818181818, + "grad_norm": 0.005154592916369438, + "learning_rate": 4.07503234152652e-05, + "loss": 0.0023, "step": 315 }, { - "epoch": 2.925925925925926, - "grad_norm": 0.019857853651046753, - "learning_rate": 4.387445410817774e-05, - "loss": 0.0124, + "epoch": 1.0214141414141413, + "grad_norm": 0.004801798611879349, + "learning_rate": 4.087968952134541e-05, + "loss": 0.0012, "step": 316 }, { - "epoch": 2.935185185185185, - "grad_norm": 0.025410892441868782, - "learning_rate": 4.355380988166867e-05, - "loss": 0.0119, + "epoch": 1.0246464646464646, + "grad_norm": 0.004275497514754534, + "learning_rate": 4.100905562742562e-05, + "loss": 0.0012, "step": 317 }, { - "epoch": 2.9444444444444446, - "grad_norm": 0.02312655746936798, - "learning_rate": 4.323343501249346e-05, - "loss": 0.0144, + "epoch": 1.0278787878787878, + "grad_norm": 0.006128865294158459, + "learning_rate": 4.113842173350582e-05, + "loss": 0.0022, "step": 318 }, { - "epoch": 2.9537037037037037, - "grad_norm": 0.022076064720749855, - "learning_rate": 4.2913342887678985e-05, - "loss": 0.0117, + "epoch": 1.031111111111111, + "grad_norm": 0.004465613514184952, + "learning_rate": 4.126778783958603e-05, + "loss": 0.0019, "step": 319 }, { - "epoch": 2.962962962962963, - "grad_norm": 0.023769903928041458, - "learning_rate": 4.259354688243757e-05, - "loss": 0.014, - "step": 320 - }, - { - "epoch": 2.962962962962963, - "eval_loss": 0.014957955107092857, - "eval_runtime": 9.1101, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 1.0343434343434343, + "grad_norm": 0.004612160846590996, + "learning_rate": 4.1397153945666236e-05, + "loss": 0.0013, "step": 320 }, { - "epoch": 2.9722222222222223, - "grad_norm": 0.023904340341687202, - "learning_rate": 4.227406035960798e-05, - "loss": 0.0121, + "epoch": 1.0375757575757576, + "grad_norm": 0.00512789748609066, + "learning_rate": 4.152652005174644e-05, + "loss": 0.0012, "step": 321 }, { - "epoch": 2.9814814814814814, - "grad_norm": 0.02383498102426529, - "learning_rate": 4.195489666909713e-05, - "loss": 0.0119, + "epoch": 1.0408080808080808, + "grad_norm": 0.006197880953550339, + "learning_rate": 4.165588615782665e-05, + "loss": 0.0015, "step": 322 }, { - "epoch": 2.9907407407407405, - "grad_norm": 0.03048449568450451, - "learning_rate": 4.1636069147322246e-05, - "loss": 0.0136, + "epoch": 1.044040404040404, + "grad_norm": 0.005857696291059256, + "learning_rate": 4.178525226390686e-05, + "loss": 0.0014, "step": 323 }, { - "epoch": 3.0, - "grad_norm": 0.023879334330558777, - "learning_rate": 4.131759111665349e-05, - "loss": 0.0137, + "epoch": 1.0472727272727274, + "grad_norm": 0.010157118551433086, + "learning_rate": 4.1914618369987064e-05, + "loss": 0.0015, "step": 324 }, { - "epoch": 3.009259259259259, - "grad_norm": 0.025208691135048866, - "learning_rate": 4.099947588485744e-05, - "loss": 0.0122, + "epoch": 1.0505050505050506, + "grad_norm": 0.006577750667929649, + "learning_rate": 4.2043984476067274e-05, + "loss": 0.0011, "step": 325 }, { - "epoch": 3.009259259259259, - "eval_loss": 0.015089023858308792, - "eval_runtime": 9.116, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 1.0505050505050506, + "eval_loss": 0.0012751913163810968, + "eval_runtime": 18.6453, + "eval_samples_per_second": 5.363, + "eval_steps_per_second": 1.341, "step": 325 }, { - "epoch": 3.0185185185185186, - "grad_norm": 0.020718788728117943, - "learning_rate": 4.06817367445409e-05, - "loss": 0.0095, + "epoch": 1.0537373737373736, + "grad_norm": 0.004863920155912638, + "learning_rate": 4.217335058214748e-05, + "loss": 0.0009, "step": 326 }, { - "epoch": 3.0277777777777777, - "grad_norm": 0.024810951203107834, - "learning_rate": 4.036438697259551e-05, - "loss": 0.0134, + "epoch": 1.056969696969697, + "grad_norm": 0.005144843365997076, + "learning_rate": 4.230271668822768e-05, + "loss": 0.0013, "step": 327 }, { - "epoch": 3.037037037037037, - "grad_norm": 0.019842958077788353, - "learning_rate": 4.004743982964298e-05, - "loss": 0.0122, + "epoch": 1.0602020202020201, + "grad_norm": 0.005289267282932997, + "learning_rate": 4.243208279430789e-05, + "loss": 0.0012, "step": 328 }, { - "epoch": 3.0462962962962963, - "grad_norm": 0.01818239875137806, - "learning_rate": 3.97309085594809e-05, - "loss": 0.0101, + "epoch": 1.0634343434343434, + "grad_norm": 0.005019436590373516, + "learning_rate": 4.25614489003881e-05, + "loss": 0.0011, "step": 329 }, { - "epoch": 3.0555555555555554, - "grad_norm": 0.022604303434491158, - "learning_rate": 3.941480638852948e-05, - "loss": 0.0118, - "step": 330 - }, - { - "epoch": 3.0555555555555554, - "eval_loss": 0.015503546223044395, - "eval_runtime": 9.1063, - "eval_samples_per_second": 5.491, - "eval_steps_per_second": 1.428, + "epoch": 1.0666666666666667, + "grad_norm": 0.0038786993827670813, + "learning_rate": 4.269081500646831e-05, + "loss": 0.0008, "step": 330 }, { - "epoch": 3.064814814814815, - "grad_norm": 0.024690452963113785, - "learning_rate": 3.909914652527871e-05, - "loss": 0.0109, + "epoch": 1.06989898989899, + "grad_norm": 0.005253227427601814, + "learning_rate": 4.2820181112548515e-05, + "loss": 0.0012, "step": 331 }, { - "epoch": 3.074074074074074, - "grad_norm": 0.02343621291220188, - "learning_rate": 3.878394215973663e-05, - "loss": 0.0123, + "epoch": 1.0731313131313132, + "grad_norm": 0.005880107171833515, + "learning_rate": 4.294954721862872e-05, + "loss": 0.0016, "step": 332 }, { - "epoch": 3.0833333333333335, - "grad_norm": 0.026170087978243828, - "learning_rate": 3.846920646287799e-05, - "loss": 0.0122, + "epoch": 1.0763636363636364, + "grad_norm": 0.006426130421459675, + "learning_rate": 4.307891332470893e-05, + "loss": 0.0009, "step": 333 }, { - "epoch": 3.0925925925925926, - "grad_norm": 0.024799769744277, - "learning_rate": 3.815495258609404e-05, - "loss": 0.0125, + "epoch": 1.0795959595959597, + "grad_norm": 0.0038312741089612246, + "learning_rate": 4.320827943078913e-05, + "loss": 0.0009, "step": 334 }, { - "epoch": 3.1018518518518516, - "grad_norm": 0.02072787657380104, - "learning_rate": 3.784119366064293e-05, - "loss": 0.0108, - "step": 335 - }, - { - "epoch": 3.1018518518518516, - "eval_loss": 0.0155374426394701, - "eval_runtime": 9.1152, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 1.082828282828283, + "grad_norm": 0.005139067303389311, + "learning_rate": 4.333764553686934e-05, + "loss": 0.0011, "step": 335 }, { - "epoch": 3.111111111111111, - "grad_norm": 0.021989421918988228, - "learning_rate": 3.752794279710094e-05, - "loss": 0.0114, + "epoch": 1.086060606060606, + "grad_norm": 0.006610156502574682, + "learning_rate": 4.346701164294955e-05, + "loss": 0.0019, "step": 336 }, { - "epoch": 3.1203703703703702, - "grad_norm": 0.03829918056726456, - "learning_rate": 3.721521308481482e-05, - "loss": 0.0101, + "epoch": 1.0892929292929292, + "grad_norm": 0.003907322883605957, + "learning_rate": 4.359637774902976e-05, + "loss": 0.001, "step": 337 }, { - "epoch": 3.1296296296296298, - "grad_norm": 0.029835987836122513, - "learning_rate": 3.6903017591354706e-05, - "loss": 0.0107, + "epoch": 1.0925252525252525, + "grad_norm": 0.006776052061468363, + "learning_rate": 4.372574385510996e-05, + "loss": 0.0011, "step": 338 }, { - "epoch": 3.138888888888889, - "grad_norm": 0.02231847681105137, - "learning_rate": 3.6591369361968124e-05, - "loss": 0.012, + "epoch": 1.0957575757575757, + "grad_norm": 0.007625575177371502, + "learning_rate": 4.385510996119017e-05, + "loss": 0.0016, "step": 339 }, { - "epoch": 3.148148148148148, - "grad_norm": 0.02263280376791954, - "learning_rate": 3.628028141903493e-05, - "loss": 0.0103, - "step": 340 - }, - { - "epoch": 3.148148148148148, - "eval_loss": 0.01546421181410551, - "eval_runtime": 9.1199, - "eval_samples_per_second": 5.483, - "eval_steps_per_second": 1.425, + "epoch": 1.098989898989899, + "grad_norm": 0.012086856178939342, + "learning_rate": 4.3984476067270374e-05, + "loss": 0.0035, "step": 340 }, { - "epoch": 3.1574074074074074, - "grad_norm": 0.023618226870894432, - "learning_rate": 3.596976676152306e-05, - "loss": 0.0116, + "epoch": 1.1022222222222222, + "grad_norm": 0.007610958535224199, + "learning_rate": 4.4113842173350584e-05, + "loss": 0.0014, "step": 341 }, { - "epoch": 3.1666666666666665, - "grad_norm": 0.02577986940741539, - "learning_rate": 3.5659838364445505e-05, - "loss": 0.0108, + "epoch": 1.1054545454545455, + "grad_norm": 0.006693006958812475, + "learning_rate": 4.4243208279430794e-05, + "loss": 0.0013, "step": 342 }, { - "epoch": 3.175925925925926, - "grad_norm": 0.026071948930621147, - "learning_rate": 3.535050917831797e-05, - "loss": 0.0108, + "epoch": 1.1086868686868687, + "grad_norm": 0.0045067970640957355, + "learning_rate": 4.4372574385511e-05, + "loss": 0.0011, "step": 343 }, { - "epoch": 3.185185185185185, - "grad_norm": 0.038238752633333206, - "learning_rate": 3.5041792128617927e-05, - "loss": 0.0094, + "epoch": 1.111919191919192, + "grad_norm": 0.006877266336232424, + "learning_rate": 4.45019404915912e-05, + "loss": 0.001, "step": 344 }, { - "epoch": 3.1944444444444446, - "grad_norm": 0.029051663354039192, - "learning_rate": 3.473370011524435e-05, - "loss": 0.0099, - "step": 345 - }, - { - "epoch": 3.1944444444444446, - "eval_loss": 0.015372861176729202, - "eval_runtime": 9.1378, - "eval_samples_per_second": 5.472, - "eval_steps_per_second": 1.423, + "epoch": 1.1151515151515152, + "grad_norm": 0.0043833632953464985, + "learning_rate": 4.463130659767141e-05, + "loss": 0.0011, "step": 345 }, { - "epoch": 3.2037037037037037, - "grad_norm": 0.022384386509656906, - "learning_rate": 3.442624601197877e-05, - "loss": 0.0096, + "epoch": 1.1183838383838385, + "grad_norm": 0.004103951156139374, + "learning_rate": 4.476067270375162e-05, + "loss": 0.0009, "step": 346 }, { - "epoch": 3.212962962962963, - "grad_norm": 0.024341940879821777, - "learning_rate": 3.4119442665947344e-05, - "loss": 0.0094, + "epoch": 1.1216161616161617, + "grad_norm": 0.006397966295480728, + "learning_rate": 4.4890038809831825e-05, + "loss": 0.0008, "step": 347 }, { - "epoch": 3.2222222222222223, - "grad_norm": 0.02119499258697033, - "learning_rate": 3.381330289708396e-05, - "loss": 0.011, + "epoch": 1.1248484848484848, + "grad_norm": 0.007373814936727285, + "learning_rate": 4.5019404915912036e-05, + "loss": 0.0016, "step": 348 }, { - "epoch": 3.2314814814814814, - "grad_norm": 0.025269504636526108, - "learning_rate": 3.350783949759462e-05, - "loss": 0.0105, + "epoch": 1.128080808080808, + "grad_norm": 0.005406413692981005, + "learning_rate": 4.514877102199224e-05, + "loss": 0.0012, "step": 349 }, { - "epoch": 3.240740740740741, - "grad_norm": 0.02428189478814602, - "learning_rate": 3.3203065231422904e-05, - "loss": 0.0115, + "epoch": 1.1313131313131313, + "grad_norm": 0.0059000300243496895, + "learning_rate": 4.527813712807244e-05, + "loss": 0.0011, "step": 350 }, { - "epoch": 3.240740740740741, - "eval_loss": 0.015474287793040276, - "eval_runtime": 9.1142, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 1.1313131313131313, + "eval_loss": 0.0012826790334656835, + "eval_runtime": 18.7277, + "eval_samples_per_second": 5.34, + "eval_steps_per_second": 1.335, "step": 350 }, { - "epoch": 3.25, - "grad_norm": 0.027830710634589195, - "learning_rate": 3.289899283371657e-05, - "loss": 0.014, + "epoch": 1.1345454545454545, + "grad_norm": 0.007405865006148815, + "learning_rate": 4.540750323415265e-05, + "loss": 0.0026, "step": 351 }, { - "epoch": 3.259259259259259, - "grad_norm": 0.026644067838788033, - "learning_rate": 3.2595635010295475e-05, - "loss": 0.0132, + "epoch": 1.1377777777777778, + "grad_norm": 0.005577345844358206, + "learning_rate": 4.553686934023286e-05, + "loss": 0.0018, "step": 352 }, { - "epoch": 3.2685185185185186, - "grad_norm": 0.028307707980275154, - "learning_rate": 3.2293004437120624e-05, - "loss": 0.0093, + "epoch": 1.141010101010101, + "grad_norm": 0.006122430320829153, + "learning_rate": 4.566623544631307e-05, + "loss": 0.0015, "step": 353 }, { - "epoch": 3.2777777777777777, - "grad_norm": 0.03480321913957596, - "learning_rate": 3.199111375976449e-05, - "loss": 0.0107, + "epoch": 1.1442424242424243, + "grad_norm": 0.007260697893798351, + "learning_rate": 4.579560155239328e-05, + "loss": 0.0012, "step": 354 }, { - "epoch": 3.287037037037037, - "grad_norm": 0.029546814039349556, - "learning_rate": 3.1689975592882603e-05, - "loss": 0.0099, - "step": 355 - }, - { - "epoch": 3.287037037037037, - "eval_loss": 0.015444349497556686, - "eval_runtime": 9.1458, - "eval_samples_per_second": 5.467, - "eval_steps_per_second": 1.421, + "epoch": 1.1474747474747475, + "grad_norm": 0.007470736745744944, + "learning_rate": 4.592496765847348e-05, + "loss": 0.0032, "step": 355 }, { - "epoch": 3.2962962962962963, - "grad_norm": 0.02437739446759224, - "learning_rate": 3.1389602519686515e-05, - "loss": 0.0118, + "epoch": 1.1507070707070708, + "grad_norm": 0.0061160847544670105, + "learning_rate": 4.6054333764553684e-05, + "loss": 0.0013, "step": 356 }, { - "epoch": 3.3055555555555554, - "grad_norm": 0.029530519619584084, - "learning_rate": 3.109000709141788e-05, - "loss": 0.0121, + "epoch": 1.1539393939393938, + "grad_norm": 0.006069364957511425, + "learning_rate": 4.6183699870633894e-05, + "loss": 0.0011, "step": 357 }, { - "epoch": 3.314814814814815, - "grad_norm": 0.029449855908751488, - "learning_rate": 3.079120182682412e-05, - "loss": 0.0099, + "epoch": 1.157171717171717, + "grad_norm": 0.004093985538929701, + "learning_rate": 4.6313065976714105e-05, + "loss": 0.0005, "step": 358 }, { - "epoch": 3.324074074074074, - "grad_norm": 0.020589128136634827, - "learning_rate": 3.049319921163526e-05, - "loss": 0.0119, + "epoch": 1.1604040404040403, + "grad_norm": 0.0061876364052295685, + "learning_rate": 4.6442432082794315e-05, + "loss": 0.0017, "step": 359 }, { - "epoch": 3.3333333333333335, - "grad_norm": 0.02450876496732235, - "learning_rate": 3.019601169804216e-05, - "loss": 0.0129, - "step": 360 - }, - { - "epoch": 3.3333333333333335, - "eval_loss": 0.0157760102301836, - "eval_runtime": 9.1103, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 1.1636363636363636, + "grad_norm": 0.0048703462816774845, + "learning_rate": 4.657179818887452e-05, + "loss": 0.0009, "step": 360 }, { - "epoch": 3.3425925925925926, - "grad_norm": 0.0208604596555233, - "learning_rate": 2.9899651704176325e-05, - "loss": 0.011, + "epoch": 1.1668686868686868, + "grad_norm": 0.007618950214236975, + "learning_rate": 4.670116429495472e-05, + "loss": 0.0019, "step": 361 }, { - "epoch": 3.351851851851852, - "grad_norm": 0.025153055787086487, - "learning_rate": 2.9604131613590824e-05, - "loss": 0.0109, + "epoch": 1.17010101010101, + "grad_norm": 0.004160857293754816, + "learning_rate": 4.683053040103493e-05, + "loss": 0.001, "step": 362 }, { - "epoch": 3.361111111111111, - "grad_norm": 0.021455859765410423, - "learning_rate": 2.9309463774743046e-05, - "loss": 0.0122, + "epoch": 1.1733333333333333, + "grad_norm": 0.007051107473671436, + "learning_rate": 4.6959896507115136e-05, + "loss": 0.0019, "step": 363 }, { - "epoch": 3.3703703703703702, - "grad_norm": 0.01964252069592476, - "learning_rate": 2.901566050047855e-05, - "loss": 0.0113, + "epoch": 1.1765656565656566, + "grad_norm": 0.004829261917620897, + "learning_rate": 4.7089262613195346e-05, + "loss": 0.001, "step": 364 }, { - "epoch": 3.3796296296296298, - "grad_norm": 0.020809266716241837, - "learning_rate": 2.872273406751664e-05, - "loss": 0.0105, - "step": 365 - }, - { - "epoch": 3.3796296296296298, - "eval_loss": 0.015391937457025051, - "eval_runtime": 9.111, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 1.1797979797979798, + "grad_norm": 0.005092508625239134, + "learning_rate": 4.7218628719275556e-05, + "loss": 0.0023, "step": 365 }, { - "epoch": 3.388888888888889, - "grad_norm": 0.025048566982150078, - "learning_rate": 2.8430696715937337e-05, - "loss": 0.0107, + "epoch": 1.183030303030303, + "grad_norm": 0.004417267628014088, + "learning_rate": 4.734799482535576e-05, + "loss": 0.001, "step": 366 }, { - "epoch": 3.398148148148148, - "grad_norm": 0.024674881249666214, - "learning_rate": 2.8139560648669962e-05, - "loss": 0.0113, + "epoch": 1.1862626262626264, + "grad_norm": 0.005750073119997978, + "learning_rate": 4.747736093143596e-05, + "loss": 0.0013, "step": 367 }, { - "epoch": 3.4074074074074074, - "grad_norm": 0.025468124076724052, - "learning_rate": 2.7849338030983257e-05, - "loss": 0.012, + "epoch": 1.1894949494949496, + "grad_norm": 0.0033691064454615116, + "learning_rate": 4.760672703751617e-05, + "loss": 0.0008, "step": 368 }, { - "epoch": 3.4166666666666665, - "grad_norm": 0.022864418104290962, - "learning_rate": 2.7560040989976892e-05, - "loss": 0.01, + "epoch": 1.1927272727272726, + "grad_norm": 0.0044763232581317425, + "learning_rate": 4.773609314359638e-05, + "loss": 0.0009, "step": 369 }, { - "epoch": 3.425925925925926, - "grad_norm": 0.02258789725601673, - "learning_rate": 2.7271681614074973e-05, - "loss": 0.0121, - "step": 370 - }, - { - "epoch": 3.425925925925926, - "eval_loss": 0.015503110364079475, - "eval_runtime": 9.1077, - "eval_samples_per_second": 5.49, - "eval_steps_per_second": 1.427, + "epoch": 1.195959595959596, + "grad_norm": 0.007102371193468571, + "learning_rate": 4.786545924967659e-05, + "loss": 0.0012, "step": 370 }, { - "epoch": 3.435185185185185, - "grad_norm": 0.025097696110606194, - "learning_rate": 2.6984271952520722e-05, - "loss": 0.0104, + "epoch": 1.1991919191919191, + "grad_norm": 0.0041382270865142345, + "learning_rate": 4.79948253557568e-05, + "loss": 0.001, "step": 371 }, { - "epoch": 3.4444444444444446, - "grad_norm": 0.028177309781312943, - "learning_rate": 2.6697824014873075e-05, - "loss": 0.0132, + "epoch": 1.2024242424242424, + "grad_norm": 0.006266591139137745, + "learning_rate": 4.8124191461837e-05, + "loss": 0.0015, "step": 372 }, { - "epoch": 3.4537037037037037, - "grad_norm": 0.026587417349219322, - "learning_rate": 2.641234977050484e-05, - "loss": 0.0085, + "epoch": 1.2056565656565656, + "grad_norm": 0.00487930653616786, + "learning_rate": 4.8253557567917204e-05, + "loss": 0.0012, "step": 373 }, { - "epoch": 3.462962962962963, - "grad_norm": 0.0189076978713274, - "learning_rate": 2.612786114810255e-05, - "loss": 0.0096, + "epoch": 1.208888888888889, + "grad_norm": 0.005311232525855303, + "learning_rate": 4.8382923673997415e-05, + "loss": 0.0014, "step": 374 }, { - "epoch": 3.4722222222222223, - "grad_norm": 0.029332995414733887, - "learning_rate": 2.5844370035168073e-05, - "loss": 0.0096, + "epoch": 1.2121212121212122, + "grad_norm": 0.004289830103516579, + "learning_rate": 4.8512289780077625e-05, + "loss": 0.0009, "step": 375 }, { - "epoch": 3.4722222222222223, - "eval_loss": 0.015461472794413567, - "eval_runtime": 9.1144, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 1.2121212121212122, + "eval_loss": 0.0011982680298388004, + "eval_runtime": 18.7329, + "eval_samples_per_second": 5.338, + "eval_steps_per_second": 1.335, "step": 375 }, { - "epoch": 3.4814814814814814, - "grad_norm": 0.02185731939971447, - "learning_rate": 2.5561888277521794e-05, - "loss": 0.0098, + "epoch": 1.2153535353535354, + "grad_norm": 0.0062402524054050446, + "learning_rate": 4.864165588615783e-05, + "loss": 0.0011, "step": 376 }, { - "epoch": 3.490740740740741, - "grad_norm": 0.026887575164437294, - "learning_rate": 2.528042767880766e-05, - "loss": 0.0114, + "epoch": 1.2185858585858587, + "grad_norm": 0.0034229829907417297, + "learning_rate": 4.877102199223804e-05, + "loss": 0.0008, "step": 377 }, { - "epoch": 3.5, - "grad_norm": 0.023131586611270905, - "learning_rate": 2.500000000000001e-05, - "loss": 0.0112, + "epoch": 1.221818181818182, + "grad_norm": 0.0063551426865160465, + "learning_rate": 4.890038809831824e-05, + "loss": 0.0027, "step": 378 }, { - "epoch": 3.5092592592592595, - "grad_norm": 0.028937749564647675, - "learning_rate": 2.4720616958912053e-05, - "loss": 0.0121, + "epoch": 1.225050505050505, + "grad_norm": 0.004902714863419533, + "learning_rate": 4.9029754204398446e-05, + "loss": 0.0015, "step": 379 }, { - "epoch": 3.5185185185185186, - "grad_norm": 0.032668791711330414, - "learning_rate": 2.4442290229706344e-05, - "loss": 0.0112, - "step": 380 - }, - { - "epoch": 3.5185185185185186, - "eval_loss": 0.015212837606668472, - "eval_runtime": 9.1177, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 1.2282828282828282, + "grad_norm": 0.0044431439600884914, + "learning_rate": 4.9159120310478656e-05, + "loss": 0.0005, "step": 380 }, { - "epoch": 3.5277777777777777, - "grad_norm": 0.02449023723602295, - "learning_rate": 2.4165031442406855e-05, - "loss": 0.0117, + "epoch": 1.2315151515151515, + "grad_norm": 0.007968113757669926, + "learning_rate": 4.9288486416558866e-05, + "loss": 0.0013, "step": 381 }, { - "epoch": 3.537037037037037, - "grad_norm": 0.025157004594802856, - "learning_rate": 2.3888852182413085e-05, - "loss": 0.0091, + "epoch": 1.2347474747474747, + "grad_norm": 0.007964645512402058, + "learning_rate": 4.941785252263907e-05, + "loss": 0.0033, "step": 382 }, { - "epoch": 3.5462962962962963, - "grad_norm": 0.03108743578195572, - "learning_rate": 2.361376399001592e-05, - "loss": 0.0108, + "epoch": 1.237979797979798, + "grad_norm": 0.004984153900295496, + "learning_rate": 4.954721862871928e-05, + "loss": 0.0008, "step": 383 }, { - "epoch": 3.5555555555555554, - "grad_norm": 0.021932488307356834, - "learning_rate": 2.333977835991545e-05, - "loss": 0.0093, + "epoch": 1.2412121212121212, + "grad_norm": 0.005040554329752922, + "learning_rate": 4.9676584734799483e-05, + "loss": 0.0008, "step": 384 }, { - "epoch": 3.564814814814815, - "grad_norm": 0.026496881619095802, - "learning_rate": 2.3066906740740623e-05, - "loss": 0.0118, - "step": 385 - }, - { - "epoch": 3.564814814814815, - "eval_loss": 0.01467986311763525, - "eval_runtime": 9.1127, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.427, + "epoch": 1.2444444444444445, + "grad_norm": 0.005269059911370277, + "learning_rate": 4.980595084087969e-05, + "loss": 0.0016, "step": 385 }, { - "epoch": 3.574074074074074, - "grad_norm": 0.024211710318922997, - "learning_rate": 2.2795160534570864e-05, - "loss": 0.0086, + "epoch": 1.2476767676767677, + "grad_norm": 0.004820747766643763, + "learning_rate": 4.99353169469599e-05, + "loss": 0.0013, "step": 386 }, { - "epoch": 3.5833333333333335, - "grad_norm": 0.023977207019925117, - "learning_rate": 2.25245510964597e-05, - "loss": 0.0128, + "epoch": 1.250909090909091, + "grad_norm": 0.003650473430752754, + "learning_rate": 5.006468305304011e-05, + "loss": 0.001, "step": 387 }, { - "epoch": 3.5925925925925926, - "grad_norm": 0.02136526070535183, - "learning_rate": 2.225508973396016e-05, - "loss": 0.0121, + "epoch": 1.2541414141414142, + "grad_norm": 0.003410038072615862, + "learning_rate": 5.019404915912032e-05, + "loss": 0.0007, "step": 388 }, { - "epoch": 3.601851851851852, - "grad_norm": 0.026328187435865402, - "learning_rate": 2.198678770665238e-05, - "loss": 0.0108, + "epoch": 1.2573737373737375, + "grad_norm": 0.00503958947956562, + "learning_rate": 5.032341526520052e-05, + "loss": 0.0013, "step": 389 }, { - "epoch": 3.611111111111111, - "grad_norm": 0.02159940078854561, - "learning_rate": 2.171965622567308e-05, - "loss": 0.0082, - "step": 390 - }, - { - "epoch": 3.611111111111111, - "eval_loss": 0.014544774778187275, - "eval_runtime": 9.1133, - "eval_samples_per_second": 5.487, - "eval_steps_per_second": 1.426, + "epoch": 1.2606060606060607, + "grad_norm": 0.006551372352987528, + "learning_rate": 5.0452781371280725e-05, + "loss": 0.0017, "step": 390 }, { - "epoch": 3.6203703703703702, - "grad_norm": 0.02303987927734852, - "learning_rate": 2.1453706453247087e-05, - "loss": 0.0092, + "epoch": 1.2638383838383838, + "grad_norm": 0.004784473218023777, + "learning_rate": 5.058214747736093e-05, + "loss": 0.0011, "step": 391 }, { - "epoch": 3.6296296296296298, - "grad_norm": 0.027734337374567986, - "learning_rate": 2.1188949502220983e-05, - "loss": 0.0101, + "epoch": 1.267070707070707, + "grad_norm": 0.009283591993153095, + "learning_rate": 5.071151358344114e-05, + "loss": 0.0017, "step": 392 }, { - "epoch": 3.638888888888889, - "grad_norm": 0.02069096453487873, - "learning_rate": 2.0925396435598664e-05, - "loss": 0.0111, + "epoch": 1.2703030303030303, + "grad_norm": 0.005054951179772615, + "learning_rate": 5.084087968952135e-05, + "loss": 0.0014, "step": 393 }, { - "epoch": 3.648148148148148, - "grad_norm": 0.02777431532740593, - "learning_rate": 2.066305826607911e-05, - "loss": 0.0091, + "epoch": 1.2735353535353535, + "grad_norm": 0.007423613220453262, + "learning_rate": 5.097024579560156e-05, + "loss": 0.0012, "step": 394 }, { - "epoch": 3.6574074074074074, - "grad_norm": 0.02333620935678482, - "learning_rate": 2.0401945955596206e-05, - "loss": 0.0112, - "step": 395 - }, - { - "epoch": 3.6574074074074074, - "eval_loss": 0.01460795197635889, - "eval_runtime": 9.1059, - "eval_samples_per_second": 5.491, - "eval_steps_per_second": 1.428, + "epoch": 1.2767676767676768, + "grad_norm": 0.004986994434148073, + "learning_rate": 5.109961190168176e-05, + "loss": 0.0017, "step": 395 }, { - "epoch": 3.6666666666666665, - "grad_norm": 0.022142188623547554, - "learning_rate": 2.0142070414860704e-05, - "loss": 0.01, + "epoch": 1.28, + "grad_norm": 0.004347431473433971, + "learning_rate": 5.122897800776197e-05, + "loss": 0.0012, "step": 396 }, { - "epoch": 3.675925925925926, - "grad_norm": 0.01749616675078869, - "learning_rate": 1.9883442502904283e-05, - "loss": 0.0095, + "epoch": 1.2832323232323233, + "grad_norm": 0.006622066255658865, + "learning_rate": 5.135834411384217e-05, + "loss": 0.0015, "step": 397 }, { - "epoch": 3.685185185185185, - "grad_norm": 0.02393367514014244, - "learning_rate": 1.9626073026625818e-05, - "loss": 0.0095, + "epoch": 1.2864646464646465, + "grad_norm": 0.006094200070947409, + "learning_rate": 5.148771021992238e-05, + "loss": 0.0014, "step": 398 }, { - "epoch": 3.6944444444444446, - "grad_norm": 0.023465050384402275, - "learning_rate": 1.936997274033986e-05, - "loss": 0.0108, + "epoch": 1.2896969696969696, + "grad_norm": 0.004648863337934017, + "learning_rate": 5.161707632600259e-05, + "loss": 0.0012, "step": 399 }, { - "epoch": 3.7037037037037037, - "grad_norm": 0.023157304152846336, - "learning_rate": 1.9115152345327152e-05, - "loss": 0.0086, + "epoch": 1.2929292929292928, + "grad_norm": 0.005023527424782515, + "learning_rate": 5.17464424320828e-05, + "loss": 0.0015, "step": 400 }, { - "epoch": 3.7037037037037037, - "eval_loss": 0.014902754686772823, - "eval_runtime": 9.1616, - "eval_samples_per_second": 5.458, - "eval_steps_per_second": 1.419, + "epoch": 1.2929292929292928, + "eval_loss": 0.001097235712222755, + "eval_runtime": 18.6509, + "eval_samples_per_second": 5.362, + "eval_steps_per_second": 1.34, "step": 400 }, { - "epoch": 3.712962962962963, - "grad_norm": 0.021799901500344276, - "learning_rate": 1.8861622489387555e-05, - "loss": 0.0128, + "epoch": 1.296161616161616, + "grad_norm": 0.004932883661240339, + "learning_rate": 5.1875808538163004e-05, + "loss": 0.0012, "step": 401 }, { - "epoch": 3.7222222222222223, - "grad_norm": 0.03070679120719433, - "learning_rate": 1.8609393766395085e-05, - "loss": 0.0123, + "epoch": 1.2993939393939393, + "grad_norm": 0.004028505180031061, + "learning_rate": 5.2005174644243214e-05, + "loss": 0.0013, "step": 402 }, { - "epoch": 3.7314814814814814, - "grad_norm": 0.02543518878519535, - "learning_rate": 1.835847671585526e-05, - "loss": 0.0114, + "epoch": 1.3026262626262626, + "grad_norm": 0.004800683818757534, + "learning_rate": 5.213454075032341e-05, + "loss": 0.0014, "step": 403 }, { - "epoch": 3.7407407407407405, - "grad_norm": 0.027585655450820923, - "learning_rate": 1.8108881822464696e-05, - "loss": 0.0099, + "epoch": 1.3058585858585858, + "grad_norm": 0.0042937519028782845, + "learning_rate": 5.226390685640362e-05, + "loss": 0.001, "step": 404 }, { - "epoch": 3.75, - "grad_norm": 0.02352389506995678, - "learning_rate": 1.7860619515673033e-05, - "loss": 0.0102, - "step": 405 - }, - { - "epoch": 3.75, - "eval_loss": 0.014981208369135857, - "eval_runtime": 9.1106, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 1.309090909090909, + "grad_norm": 0.004741652403026819, + "learning_rate": 5.239327296248383e-05, + "loss": 0.0011, "step": 405 }, { - "epoch": 3.7592592592592595, - "grad_norm": 0.02560283988714218, - "learning_rate": 1.7613700169247056e-05, - "loss": 0.012, + "epoch": 1.3123232323232323, + "grad_norm": 0.004214128013700247, + "learning_rate": 5.252263906856404e-05, + "loss": 0.0009, "step": 406 }, { - "epoch": 3.7685185185185186, - "grad_norm": 0.026089752092957497, - "learning_rate": 1.7368134100837287e-05, - "loss": 0.0088, + "epoch": 1.3155555555555556, + "grad_norm": 0.004764182958751917, + "learning_rate": 5.2652005174644245e-05, + "loss": 0.0011, "step": 407 }, { - "epoch": 3.7777777777777777, - "grad_norm": 0.030365899205207825, - "learning_rate": 1.7123931571546827e-05, - "loss": 0.0119, + "epoch": 1.3187878787878788, + "grad_norm": 0.005405279342085123, + "learning_rate": 5.2781371280724455e-05, + "loss": 0.0011, "step": 408 }, { - "epoch": 3.787037037037037, - "grad_norm": 0.031558796763420105, - "learning_rate": 1.6881102785502616e-05, - "loss": 0.011, + "epoch": 1.322020202020202, + "grad_norm": 0.005355632398277521, + "learning_rate": 5.2910737386804666e-05, + "loss": 0.0012, "step": 409 }, { - "epoch": 3.7962962962962963, - "grad_norm": 0.030366325750947, - "learning_rate": 1.6639657889429018e-05, - "loss": 0.0116, - "step": 410 - }, - { - "epoch": 3.7962962962962963, - "eval_loss": 0.014859426766633987, - "eval_runtime": 9.1059, - "eval_samples_per_second": 5.491, - "eval_steps_per_second": 1.428, + "epoch": 1.3252525252525253, + "grad_norm": 0.004391840659081936, + "learning_rate": 5.304010349288486e-05, + "loss": 0.0011, "step": 410 }, { - "epoch": 3.8055555555555554, - "grad_norm": 0.025008074939250946, - "learning_rate": 1.639960697222388e-05, - "loss": 0.0106, + "epoch": 1.3284848484848486, + "grad_norm": 0.002719137817621231, + "learning_rate": 5.316946959896507e-05, + "loss": 0.0005, "step": 411 }, { - "epoch": 3.814814814814815, - "grad_norm": 0.028196556493639946, - "learning_rate": 1.6160960064536908e-05, - "loss": 0.0113, + "epoch": 1.3317171717171719, + "grad_norm": 0.005910648964345455, + "learning_rate": 5.329883570504528e-05, + "loss": 0.0014, "step": 412 }, { - "epoch": 3.824074074074074, - "grad_norm": 0.02165764756500721, - "learning_rate": 1.592372713835055e-05, - "loss": 0.0115, + "epoch": 1.3349494949494949, + "grad_norm": 0.004575495608150959, + "learning_rate": 5.3428201811125486e-05, + "loss": 0.0013, "step": 413 }, { - "epoch": 3.8333333333333335, - "grad_norm": 0.020175475627183914, - "learning_rate": 1.5687918106563326e-05, - "loss": 0.0112, + "epoch": 1.3381818181818181, + "grad_norm": 0.004968920256942511, + "learning_rate": 5.35575679172057e-05, + "loss": 0.0012, "step": 414 }, { - "epoch": 3.8425925925925926, - "grad_norm": 0.027304671704769135, - "learning_rate": 1.545354282257562e-05, - "loss": 0.0126, - "step": 415 - }, - { - "epoch": 3.8425925925925926, - "eval_loss": 0.014735485427081585, - "eval_runtime": 9.198, - "eval_samples_per_second": 5.436, - "eval_steps_per_second": 1.413, + "epoch": 1.3414141414141414, + "grad_norm": 0.005680674687027931, + "learning_rate": 5.368693402328591e-05, + "loss": 0.0015, "step": 415 }, { - "epoch": 3.851851851851852, - "grad_norm": 0.026429716497659683, - "learning_rate": 1.52206110798779e-05, - "loss": 0.0103, + "epoch": 1.3446464646464646, + "grad_norm": 0.005805244669318199, + "learning_rate": 5.3816300129366104e-05, + "loss": 0.0017, "step": 416 }, { - "epoch": 3.861111111111111, - "grad_norm": 0.02409077063202858, - "learning_rate": 1.4989132611641576e-05, - "loss": 0.012, + "epoch": 1.347878787878788, + "grad_norm": 0.004365504253655672, + "learning_rate": 5.3945666235446314e-05, + "loss": 0.001, "step": 417 }, { - "epoch": 3.8703703703703702, - "grad_norm": 0.02310461364686489, - "learning_rate": 1.4759117090312197e-05, - "loss": 0.0096, + "epoch": 1.3511111111111112, + "grad_norm": 0.004444981925189495, + "learning_rate": 5.4075032341526524e-05, + "loss": 0.0011, "step": 418 }, { - "epoch": 3.8796296296296298, - "grad_norm": 0.026219584047794342, - "learning_rate": 1.453057412720536e-05, - "loss": 0.0094, + "epoch": 1.3543434343434344, + "grad_norm": 0.0048050908371806145, + "learning_rate": 5.420439844760673e-05, + "loss": 0.0011, "step": 419 }, { - "epoch": 3.888888888888889, - "grad_norm": 0.027541201561689377, - "learning_rate": 1.4303513272105057e-05, - "loss": 0.0112, - "step": 420 - }, - { - "epoch": 3.888888888888889, - "eval_loss": 0.014594363048672676, - "eval_runtime": 9.1304, - "eval_samples_per_second": 5.476, - "eval_steps_per_second": 1.424, + "epoch": 1.3575757575757577, + "grad_norm": 0.0112074613571167, + "learning_rate": 5.433376455368694e-05, + "loss": 0.0009, "step": 420 }, { - "epoch": 3.898148148148148, - "grad_norm": 0.024942217394709587, - "learning_rate": 1.4077944012864636e-05, - "loss": 0.0093, + "epoch": 1.3608080808080807, + "grad_norm": 0.005805397406220436, + "learning_rate": 5.446313065976715e-05, + "loss": 0.001, "step": 421 }, { - "epoch": 3.9074074074074074, - "grad_norm": 0.018137283623218536, - "learning_rate": 1.3853875775010355e-05, - "loss": 0.0102, + "epoch": 1.364040404040404, + "grad_norm": 0.006793915294110775, + "learning_rate": 5.4592496765847345e-05, + "loss": 0.0018, "step": 422 }, { - "epoch": 3.9166666666666665, - "grad_norm": 0.021817779168486595, - "learning_rate": 1.3631317921347563e-05, - "loss": 0.0084, + "epoch": 1.3672727272727272, + "grad_norm": 0.005133500322699547, + "learning_rate": 5.4721862871927555e-05, + "loss": 0.0015, "step": 423 }, { - "epoch": 3.925925925925926, - "grad_norm": 0.023799235001206398, - "learning_rate": 1.3410279751569399e-05, - "loss": 0.0122, + "epoch": 1.3705050505050504, + "grad_norm": 0.0034578884951770306, + "learning_rate": 5.4851228978007766e-05, + "loss": 0.0008, "step": 424 }, { - "epoch": 3.935185185185185, - "grad_norm": 0.030764896422624588, - "learning_rate": 1.3190770501868243e-05, - "loss": 0.0107, + "epoch": 1.3737373737373737, + "grad_norm": 0.005579716991633177, + "learning_rate": 5.498059508408797e-05, + "loss": 0.0025, "step": 425 }, { - "epoch": 3.935185185185185, - "eval_loss": 0.014631365425884724, - "eval_runtime": 9.1149, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 1.3737373737373737, + "eval_loss": 0.0011115286033600569, + "eval_runtime": 18.7263, + "eval_samples_per_second": 5.34, + "eval_steps_per_second": 1.335, "step": 425 }, { - "epoch": 3.9444444444444446, - "grad_norm": 0.022886106744408607, - "learning_rate": 1.297279934454978e-05, - "loss": 0.0096, + "epoch": 1.376969696969697, + "grad_norm": 0.005262942984700203, + "learning_rate": 5.510996119016818e-05, + "loss": 0.0011, "step": 426 }, { - "epoch": 3.9537037037037037, - "grad_norm": 0.03152737021446228, - "learning_rate": 1.2756375387649716e-05, - "loss": 0.0124, + "epoch": 1.3802020202020202, + "grad_norm": 0.004813515581190586, + "learning_rate": 5.523932729624839e-05, + "loss": 0.0011, "step": 427 }, { - "epoch": 3.962962962962963, - "grad_norm": 0.02872036211192608, - "learning_rate": 1.25415076745532e-05, - "loss": 0.0091, + "epoch": 1.3834343434343435, + "grad_norm": 0.005556623917073011, + "learning_rate": 5.53686934023286e-05, + "loss": 0.0014, "step": 428 }, { - "epoch": 3.9722222222222223, - "grad_norm": 0.021184636279940605, - "learning_rate": 1.2328205183616965e-05, - "loss": 0.0105, + "epoch": 1.3866666666666667, + "grad_norm": 0.0028185530100017786, + "learning_rate": 5.5498059508408797e-05, + "loss": 0.0007, "step": 429 }, { - "epoch": 3.9814814814814814, - "grad_norm": 0.02112959884107113, - "learning_rate": 1.2116476827794104e-05, - "loss": 0.0113, - "step": 430 - }, - { - "epoch": 3.9814814814814814, - "eval_loss": 0.01471536885946989, - "eval_runtime": 9.116, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 1.38989898989899, + "grad_norm": 0.004296639934182167, + "learning_rate": 5.562742561448901e-05, + "loss": 0.001, "step": 430 }, { - "epoch": 3.9907407407407405, - "grad_norm": 0.019945990294218063, - "learning_rate": 1.1906331454261704e-05, - "loss": 0.0093, + "epoch": 1.3931313131313132, + "grad_norm": 0.004092990420758724, + "learning_rate": 5.575679172056921e-05, + "loss": 0.001, "step": 431 }, { - "epoch": 4.0, - "grad_norm": 0.023910805583000183, - "learning_rate": 1.1697777844051105e-05, - "loss": 0.011, + "epoch": 1.3963636363636365, + "grad_norm": 0.005713030230253935, + "learning_rate": 5.588615782664942e-05, + "loss": 0.0015, "step": 432 }, { - "epoch": 4.0092592592592595, - "grad_norm": 0.01957758143544197, - "learning_rate": 1.1490824711681025e-05, - "loss": 0.0094, + "epoch": 1.3995959595959597, + "grad_norm": 0.005011020693928003, + "learning_rate": 5.601552393272963e-05, + "loss": 0.0012, "step": 433 }, { - "epoch": 4.018518518518518, - "grad_norm": 0.02563118375837803, - "learning_rate": 1.1285480704793377e-05, - "loss": 0.0093, + "epoch": 1.4028282828282828, + "grad_norm": 0.004393043462187052, + "learning_rate": 5.614489003880984e-05, + "loss": 0.0009, "step": 434 }, { - "epoch": 4.027777777777778, - "grad_norm": 0.026251764968037605, - "learning_rate": 1.1081754403791999e-05, - "loss": 0.0091, - "step": 435 - }, - { - "epoch": 4.027777777777778, - "eval_loss": 0.014734329655766487, - "eval_runtime": 9.1592, - "eval_samples_per_second": 5.459, - "eval_steps_per_second": 1.419, + "epoch": 1.406060606060606, + "grad_norm": 0.0047727166675031185, + "learning_rate": 5.627425614489004e-05, + "loss": 0.0011, "step": 435 }, { - "epoch": 4.037037037037037, - "grad_norm": 0.025834446772933006, - "learning_rate": 1.0879654321484012e-05, - "loss": 0.0067, + "epoch": 1.4092929292929293, + "grad_norm": 0.0034502504859119654, + "learning_rate": 5.640362225097025e-05, + "loss": 0.001, "step": 436 }, { - "epoch": 4.046296296296297, - "grad_norm": 0.0185233224183321, - "learning_rate": 1.0679188902724191e-05, - "loss": 0.0108, + "epoch": 1.4125252525252525, + "grad_norm": 0.004155455157160759, + "learning_rate": 5.653298835705045e-05, + "loss": 0.0008, "step": 437 }, { - "epoch": 4.055555555555555, - "grad_norm": 0.021918736398220062, - "learning_rate": 1.0480366524062042e-05, - "loss": 0.0088, + "epoch": 1.4157575757575758, + "grad_norm": 0.005455946549773216, + "learning_rate": 5.666235446313066e-05, + "loss": 0.0012, "step": 438 }, { - "epoch": 4.064814814814815, - "grad_norm": 0.03142661973834038, - "learning_rate": 1.0283195493391823e-05, - "loss": 0.0103, + "epoch": 1.418989898989899, + "grad_norm": 0.005093955434858799, + "learning_rate": 5.679172056921087e-05, + "loss": 0.0015, "step": 439 }, { - "epoch": 4.074074074074074, - "grad_norm": 0.023410873487591743, - "learning_rate": 1.008768404960535e-05, - "loss": 0.0094, + "epoch": 1.4222222222222223, + "grad_norm": 0.004076706245541573, + "learning_rate": 5.692108667529108e-05, + "loss": 0.0012, "step": 440 }, { - "epoch": 4.074074074074074, - "eval_loss": 0.014965096488595009, - "eval_runtime": 9.1135, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, - "step": 440 - }, - { - "epoch": 4.083333333333333, - "grad_norm": 0.02943902276456356, - "learning_rate": 9.893840362247809e-06, - "loss": 0.0056, + "epoch": 1.4254545454545455, + "grad_norm": 0.004360303282737732, + "learning_rate": 5.7050452781371286e-05, + "loss": 0.001, "step": 441 }, { - "epoch": 4.092592592592593, - "grad_norm": 0.021431270986795425, - "learning_rate": 9.701672531176286e-06, - "loss": 0.0089, + "epoch": 1.4286868686868686, + "grad_norm": 0.005434586200863123, + "learning_rate": 5.717981888745149e-05, + "loss": 0.002, "step": 442 }, { - "epoch": 4.101851851851852, - "grad_norm": 0.02797669917345047, - "learning_rate": 9.511188586221376e-06, - "loss": 0.0092, + "epoch": 1.4319191919191918, + "grad_norm": 0.006644332781434059, + "learning_rate": 5.730918499353169e-05, + "loss": 0.0033, "step": 443 }, { - "epoch": 4.111111111111111, - "grad_norm": 0.02437691204249859, - "learning_rate": 9.322396486851626e-06, - "loss": 0.0104, + "epoch": 1.435151515151515, + "grad_norm": 0.006485591642558575, + "learning_rate": 5.74385510996119e-05, + "loss": 0.002, "step": 444 }, { - "epoch": 4.12037037037037, - "grad_norm": 0.024811841547489166, - "learning_rate": 9.135304121840976e-06, - "loss": 0.0096, - "step": 445 - }, - { - "epoch": 4.12037037037037, - "eval_loss": 0.014996801503002644, - "eval_runtime": 9.1094, - "eval_samples_per_second": 5.489, - "eval_steps_per_second": 1.427, + "epoch": 1.4383838383838383, + "grad_norm": 0.006898663938045502, + "learning_rate": 5.7567917205692113e-05, + "loss": 0.001, "step": 445 }, { - "epoch": 4.12962962962963, - "grad_norm": 0.0309213325381279, - "learning_rate": 8.949919308939082e-06, - "loss": 0.0109, + "epoch": 1.4416161616161616, + "grad_norm": 0.006443925201892853, + "learning_rate": 5.7697283311772324e-05, + "loss": 0.0015, "step": 446 }, { - "epoch": 4.138888888888889, - "grad_norm": 0.023763932287693024, - "learning_rate": 8.766249794544662e-06, - "loss": 0.0073, + "epoch": 1.4448484848484848, + "grad_norm": 0.003106453223153949, + "learning_rate": 5.782664941785253e-05, + "loss": 0.0009, "step": 447 }, { - "epoch": 4.148148148148148, - "grad_norm": 0.023741643875837326, - "learning_rate": 8.584303253381847e-06, - "loss": 0.0105, + "epoch": 1.448080808080808, + "grad_norm": 0.0036977077834308147, + "learning_rate": 5.795601552393273e-05, + "loss": 0.0009, "step": 448 }, { - "epoch": 4.157407407407407, - "grad_norm": 0.02090543322265148, - "learning_rate": 8.404087288179424e-06, - "loss": 0.0096, + "epoch": 1.4513131313131313, + "grad_norm": 0.0050973836332559586, + "learning_rate": 5.8085381630012934e-05, + "loss": 0.0014, "step": 449 }, { - "epoch": 4.166666666666667, - "grad_norm": 0.026315612718462944, - "learning_rate": 8.225609429353187e-06, - "loss": 0.0091, + "epoch": 1.4545454545454546, + "grad_norm": 0.0057764300145208836, + "learning_rate": 5.8214747736093145e-05, + "loss": 0.0016, "step": 450 }, { - "epoch": 4.166666666666667, - "eval_loss": 0.015186839736998081, - "eval_runtime": 9.1241, - "eval_samples_per_second": 5.48, - "eval_steps_per_second": 1.425, + "epoch": 1.4545454545454546, + "eval_loss": 0.0010863245697692037, + "eval_runtime": 18.7321, + "eval_samples_per_second": 5.338, + "eval_steps_per_second": 1.335, "step": 450 }, { - "epoch": 4.175925925925926, - "grad_norm": 0.023099206387996674, - "learning_rate": 8.048877134691268e-06, - "loss": 0.0091, + "epoch": 1.4577777777777778, + "grad_norm": 0.004166171886026859, + "learning_rate": 5.8344113842173355e-05, + "loss": 0.0009, "step": 451 }, { - "epoch": 4.185185185185185, - "grad_norm": 0.027901167050004005, - "learning_rate": 7.873897789042523e-06, - "loss": 0.0092, + "epoch": 1.461010101010101, + "grad_norm": 0.004900315310806036, + "learning_rate": 5.8473479948253565e-05, + "loss": 0.0013, "step": 452 }, { - "epoch": 4.194444444444445, - "grad_norm": 0.025486482307314873, - "learning_rate": 7.700678704007947e-06, - "loss": 0.0077, + "epoch": 1.4642424242424243, + "grad_norm": 0.003387290518730879, + "learning_rate": 5.860284605433377e-05, + "loss": 0.0007, "step": 453 }, { - "epoch": 4.203703703703703, - "grad_norm": 0.0233286302536726, - "learning_rate": 7.529227117635135e-06, - "loss": 0.0077, + "epoch": 1.4674747474747476, + "grad_norm": 0.004271840676665306, + "learning_rate": 5.873221216041398e-05, + "loss": 0.0011, "step": 454 }, { - "epoch": 4.212962962962963, - "grad_norm": 0.023314587771892548, - "learning_rate": 7.35955019411585e-06, - "loss": 0.0089, + "epoch": 1.4707070707070706, + "grad_norm": 0.0032306541688740253, + "learning_rate": 5.8861578266494176e-05, + "loss": 0.0007, "step": 455 }, { - "epoch": 4.212962962962963, - "eval_loss": 0.015497377142310143, - "eval_runtime": 9.1064, - "eval_samples_per_second": 5.491, - "eval_steps_per_second": 1.428, - "step": 455 - }, - { - "epoch": 4.222222222222222, - "grad_norm": 0.021640775725245476, - "learning_rate": 7.191655023486682e-06, - "loss": 0.01, + "epoch": 1.4739393939393939, + "grad_norm": 0.0022131800651550293, + "learning_rate": 5.8990944372574386e-05, + "loss": 0.0006, "step": 456 }, { - "epoch": 4.231481481481482, - "grad_norm": 0.027831410989165306, - "learning_rate": 7.02554862133275e-06, - "loss": 0.0105, + "epoch": 1.4771717171717171, + "grad_norm": 0.005533823277801275, + "learning_rate": 5.9120310478654596e-05, + "loss": 0.0017, "step": 457 }, { - "epoch": 4.2407407407407405, - "grad_norm": 0.023242153227329254, - "learning_rate": 6.861237928494579e-06, - "loss": 0.009, + "epoch": 1.4804040404040404, + "grad_norm": 0.004672409035265446, + "learning_rate": 5.9249676584734806e-05, + "loss": 0.0014, "step": 458 }, { - "epoch": 4.25, - "grad_norm": 0.02775505743920803, - "learning_rate": 6.698729810778065e-06, - "loss": 0.0102, + "epoch": 1.4836363636363636, + "grad_norm": 0.006192138884216547, + "learning_rate": 5.937904269081501e-05, + "loss": 0.0015, "step": 459 }, { - "epoch": 4.2592592592592595, - "grad_norm": 0.0267843846231699, - "learning_rate": 6.53803105866761e-06, - "loss": 0.0063, - "step": 460 - }, - { - "epoch": 4.2592592592592595, - "eval_loss": 0.01563325710594654, - "eval_runtime": 9.111, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 1.486868686868687, + "grad_norm": 0.011036567389965057, + "learning_rate": 5.950840879689522e-05, + "loss": 0.0025, "step": 460 }, { - "epoch": 4.268518518518518, - "grad_norm": 0.02488654851913452, - "learning_rate": 6.379148387042316e-06, - "loss": 0.01, + "epoch": 1.4901010101010101, + "grad_norm": 0.0055129267275333405, + "learning_rate": 5.963777490297542e-05, + "loss": 0.0015, "step": 461 }, { - "epoch": 4.277777777777778, - "grad_norm": 0.024208445101976395, - "learning_rate": 6.222088434895462e-06, - "loss": 0.0072, + "epoch": 1.4933333333333334, + "grad_norm": 0.005369866266846657, + "learning_rate": 5.976714100905563e-05, + "loss": 0.0027, "step": 462 }, { - "epoch": 4.287037037037037, - "grad_norm": 0.023147890344262123, - "learning_rate": 6.066857765057055e-06, - "loss": 0.0088, + "epoch": 1.4965656565656564, + "grad_norm": 0.005017601884901524, + "learning_rate": 5.989650711513584e-05, + "loss": 0.0011, "step": 463 }, { - "epoch": 4.296296296296296, - "grad_norm": 0.029451172798871994, - "learning_rate": 5.9134628639196e-06, - "loss": 0.0085, + "epoch": 1.4997979797979797, + "grad_norm": 0.00623415969312191, + "learning_rate": 6.002587322121605e-05, + "loss": 0.0028, "step": 464 }, { - "epoch": 4.305555555555555, - "grad_norm": 0.02764413133263588, - "learning_rate": 5.7619101411671095e-06, - "loss": 0.0099, - "step": 465 - }, - { - "epoch": 4.305555555555555, - "eval_loss": 0.015693385154008865, - "eval_runtime": 9.1176, - "eval_samples_per_second": 5.484, - "eval_steps_per_second": 1.426, + "epoch": 1.503030303030303, + "grad_norm": 0.005604151636362076, + "learning_rate": 6.015523932729625e-05, + "loss": 0.0023, "step": 465 }, { - "epoch": 4.314814814814815, - "grad_norm": 0.021906448528170586, - "learning_rate": 5.6122059295072085e-06, - "loss": 0.0096, + "epoch": 1.5062626262626262, + "grad_norm": 0.004366429056972265, + "learning_rate": 6.028460543337646e-05, + "loss": 0.001, "step": 466 }, { - "epoch": 4.324074074074074, - "grad_norm": 0.02385389618575573, - "learning_rate": 5.464356484406535e-06, - "loss": 0.0072, + "epoch": 1.5094949494949494, + "grad_norm": 0.004982110112905502, + "learning_rate": 6.041397153945667e-05, + "loss": 0.0014, "step": 467 }, { - "epoch": 4.333333333333333, - "grad_norm": 0.026357507333159447, - "learning_rate": 5.318367983829392e-06, - "loss": 0.0079, + "epoch": 1.5127272727272727, + "grad_norm": 0.005147982854396105, + "learning_rate": 6.054333764553687e-05, + "loss": 0.0013, "step": 468 }, { - "epoch": 4.342592592592593, - "grad_norm": 0.026002187281847, - "learning_rate": 5.174246527979531e-06, - "loss": 0.0095, + "epoch": 1.515959595959596, + "grad_norm": 0.004790551029145718, + "learning_rate": 6.067270375161708e-05, + "loss": 0.0011, "step": 469 }, { - "epoch": 4.351851851851852, - "grad_norm": 0.02679777517914772, - "learning_rate": 5.031998139045352e-06, - "loss": 0.0085, - "step": 470 - }, - { - "epoch": 4.351851851851852, - "eval_loss": 0.015615792945027351, - "eval_runtime": 9.1365, - "eval_samples_per_second": 5.473, - "eval_steps_per_second": 1.423, + "epoch": 1.5191919191919192, + "grad_norm": 0.0038922505918890238, + "learning_rate": 6.080206985769729e-05, + "loss": 0.0011, "step": 470 }, { - "epoch": 4.361111111111111, - "grad_norm": 0.023431269451975822, - "learning_rate": 4.891628760948114e-06, - "loss": 0.009, + "epoch": 1.5224242424242425, + "grad_norm": 0.006303661502897739, + "learning_rate": 6.093143596377749e-05, + "loss": 0.0023, "step": 471 }, { - "epoch": 4.37037037037037, - "grad_norm": 0.02848837524652481, - "learning_rate": 4.7531442590937335e-06, - "loss": 0.0102, + "epoch": 1.5256565656565657, + "grad_norm": 0.003573813708499074, + "learning_rate": 6.10608020698577e-05, + "loss": 0.0011, "step": 472 }, { - "epoch": 4.37962962962963, - "grad_norm": 0.026586227118968964, - "learning_rate": 4.616550420127563e-06, - "loss": 0.0078, + "epoch": 1.528888888888889, + "grad_norm": 0.005556274671107531, + "learning_rate": 6.119016817593791e-05, + "loss": 0.0033, "step": 473 }, { - "epoch": 4.388888888888889, - "grad_norm": 0.025660747662186623, - "learning_rate": 4.4818529516926726e-06, - "loss": 0.0086, + "epoch": 1.5321212121212122, + "grad_norm": 0.004455295857042074, + "learning_rate": 6.131953428201811e-05, + "loss": 0.0014, "step": 474 }, { - "epoch": 4.398148148148148, - "grad_norm": 0.02436869405210018, - "learning_rate": 4.349057482191299e-06, - "loss": 0.011, + "epoch": 1.5353535353535355, + "grad_norm": 0.003466435242444277, + "learning_rate": 6.144890038809832e-05, + "loss": 0.001, "step": 475 }, { - "epoch": 4.398148148148148, - "eval_loss": 0.015554042533040047, - "eval_runtime": 9.1142, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 1.5353535353535355, + "eval_loss": 0.0010681893909350038, + "eval_runtime": 18.6082, + "eval_samples_per_second": 5.374, + "eval_steps_per_second": 1.343, "step": 475 }, { - "epoch": 4.407407407407407, - "grad_norm": 0.02513139322400093, - "learning_rate": 4.218169560549706e-06, - "loss": 0.0108, + "epoch": 1.5385858585858587, + "grad_norm": 0.004641688894480467, + "learning_rate": 6.157826649417853e-05, + "loss": 0.0029, "step": 476 }, { - "epoch": 4.416666666666667, - "grad_norm": 0.027343349531292915, - "learning_rate": 4.089194655986306e-06, - "loss": 0.0099, + "epoch": 1.541818181818182, + "grad_norm": 0.004081512801349163, + "learning_rate": 6.170763260025874e-05, + "loss": 0.0011, "step": 477 }, { - "epoch": 4.425925925925926, - "grad_norm": 0.02374204248189926, - "learning_rate": 3.962138157783085e-06, - "loss": 0.0095, + "epoch": 1.545050505050505, + "grad_norm": 0.005640064366161823, + "learning_rate": 6.183699870633895e-05, + "loss": 0.0024, "step": 478 }, { - "epoch": 4.435185185185185, - "grad_norm": 0.04114212468266487, - "learning_rate": 3.837005375060482e-06, - "loss": 0.0089, + "epoch": 1.5482828282828283, + "grad_norm": 0.0031765501480549574, + "learning_rate": 6.196636481241915e-05, + "loss": 0.0009, "step": 479 }, { - "epoch": 4.444444444444445, - "grad_norm": 0.024016965180635452, - "learning_rate": 3.7138015365554833e-06, - "loss": 0.0067, - "step": 480 - }, - { - "epoch": 4.444444444444445, - "eval_loss": 0.01539613213390112, - "eval_runtime": 9.1246, - "eval_samples_per_second": 5.48, - "eval_steps_per_second": 1.425, + "epoch": 1.5515151515151515, + "grad_norm": 0.0049113016575574875, + "learning_rate": 6.209573091849934e-05, + "loss": 0.0019, "step": 480 }, { - "epoch": 4.453703703703704, - "grad_norm": 0.02901994250714779, - "learning_rate": 3.5925317904031587e-06, - "loss": 0.0087, + "epoch": 1.5547474747474748, + "grad_norm": 0.003700861008837819, + "learning_rate": 6.222509702457955e-05, + "loss": 0.0012, "step": 481 }, { - "epoch": 4.462962962962963, - "grad_norm": 0.020981522276997566, - "learning_rate": 3.4732012039215776e-06, - "loss": 0.011, + "epoch": 1.557979797979798, + "grad_norm": 0.004294991493225098, + "learning_rate": 6.235446313065976e-05, + "loss": 0.0014, "step": 482 }, { - "epoch": 4.472222222222222, - "grad_norm": 0.023783011361956596, - "learning_rate": 3.3558147633999728e-06, - "loss": 0.0096, + "epoch": 1.561212121212121, + "grad_norm": 0.003475453006103635, + "learning_rate": 6.248382923673998e-05, + "loss": 0.0007, "step": 483 }, { - "epoch": 4.481481481481482, - "grad_norm": 0.02081628330051899, - "learning_rate": 3.2403773738905187e-06, - "loss": 0.0087, + "epoch": 1.5644444444444443, + "grad_norm": 0.007765649352222681, + "learning_rate": 6.261319534282019e-05, + "loss": 0.001, "step": 484 }, { - "epoch": 4.4907407407407405, - "grad_norm": 0.024986054748296738, - "learning_rate": 3.126893859003249e-06, - "loss": 0.0092, - "step": 485 - }, - { - "epoch": 4.4907407407407405, - "eval_loss": 0.015287145972251892, - "eval_runtime": 9.1097, - "eval_samples_per_second": 5.489, - "eval_steps_per_second": 1.427, + "epoch": 1.5676767676767676, + "grad_norm": 0.0032908658031374216, + "learning_rate": 6.27425614489004e-05, + "loss": 0.0008, "step": 485 }, { - "epoch": 4.5, - "grad_norm": 0.032323963940143585, - "learning_rate": 3.0153689607045845e-06, - "loss": 0.0086, + "epoch": 1.5709090909090908, + "grad_norm": 0.00453177560120821, + "learning_rate": 6.28719275549806e-05, + "loss": 0.0013, "step": 486 }, { - "epoch": 4.5092592592592595, - "grad_norm": 0.02963520959019661, - "learning_rate": 2.9058073391191375e-06, - "loss": 0.0068, + "epoch": 1.574141414141414, + "grad_norm": 0.0038091707974672318, + "learning_rate": 6.30012936610608e-05, + "loss": 0.0008, "step": 487 }, { - "epoch": 4.518518518518518, - "grad_norm": 0.035344675183296204, - "learning_rate": 2.798213572335001e-06, - "loss": 0.0062, + "epoch": 1.5773737373737373, + "grad_norm": 0.004123839549720287, + "learning_rate": 6.313065976714101e-05, + "loss": 0.0011, "step": 488 }, { - "epoch": 4.527777777777778, - "grad_norm": 0.026800939813256264, - "learning_rate": 2.692592156212487e-06, - "loss": 0.0092, + "epoch": 1.5806060606060606, + "grad_norm": 0.003308449639007449, + "learning_rate": 6.326002587322122e-05, + "loss": 0.0012, "step": 489 }, { - "epoch": 4.537037037037037, - "grad_norm": 0.024116506800055504, - "learning_rate": 2.5889475041961765e-06, - "loss": 0.0072, - "step": 490 - }, - { - "epoch": 4.537037037037037, - "eval_loss": 0.015211592428386211, - "eval_runtime": 9.1184, - "eval_samples_per_second": 5.483, - "eval_steps_per_second": 1.426, + "epoch": 1.5838383838383838, + "grad_norm": 0.005438206251710653, + "learning_rate": 6.338939197930143e-05, + "loss": 0.0023, "step": 490 }, { - "epoch": 4.546296296296296, - "grad_norm": 0.027498748153448105, - "learning_rate": 2.4872839471306084e-06, - "loss": 0.0082, + "epoch": 1.587070707070707, + "grad_norm": 0.005823109764605761, + "learning_rate": 6.351875808538163e-05, + "loss": 0.0014, "step": 491 }, { - "epoch": 4.555555555555555, - "grad_norm": 0.026998436078429222, - "learning_rate": 2.3876057330792346e-06, - "loss": 0.008, + "epoch": 1.5903030303030303, + "grad_norm": 0.0052726129069924355, + "learning_rate": 6.364812419146184e-05, + "loss": 0.0012, "step": 492 }, { - "epoch": 4.564814814814815, - "grad_norm": 0.023703446611762047, - "learning_rate": 2.2899170271469428e-06, - "loss": 0.011, + "epoch": 1.5935353535353536, + "grad_norm": 0.003648497397080064, + "learning_rate": 6.377749029754204e-05, + "loss": 0.0013, "step": 493 }, { - "epoch": 4.574074074074074, - "grad_norm": 0.019968930631875992, - "learning_rate": 2.1942219113060212e-06, - "loss": 0.0075, + "epoch": 1.5967676767676768, + "grad_norm": 0.003007176099345088, + "learning_rate": 6.390685640362225e-05, + "loss": 0.0007, "step": 494 }, { - "epoch": 4.583333333333333, - "grad_norm": 0.02214980125427246, - "learning_rate": 2.100524384225555e-06, - "loss": 0.0078, - "step": 495 - }, - { - "epoch": 4.583333333333333, - "eval_loss": 0.015181516297161579, - "eval_runtime": 9.1214, - "eval_samples_per_second": 5.482, - "eval_steps_per_second": 1.425, + "epoch": 1.6, + "grad_norm": 0.004037541802972555, + "learning_rate": 6.403622250970246e-05, + "loss": 0.0011, "step": 495 }, { - "epoch": 4.592592592592593, - "grad_norm": 0.025330157950520515, - "learning_rate": 2.0088283611044036e-06, - "loss": 0.0062, + "epoch": 1.6032323232323233, + "grad_norm": 0.0036643114872276783, + "learning_rate": 6.416558861578267e-05, + "loss": 0.0009, "step": 496 }, { - "epoch": 4.601851851851852, - "grad_norm": 0.019013626500964165, - "learning_rate": 1.9191376735075427e-06, - "loss": 0.0088, + "epoch": 1.6064646464646466, + "grad_norm": 0.0038799517787992954, + "learning_rate": 6.429495472186288e-05, + "loss": 0.001, "step": 497 }, { - "epoch": 4.611111111111111, - "grad_norm": 0.022145694121718407, - "learning_rate": 1.8314560692059835e-06, - "loss": 0.0089, + "epoch": 1.6096969696969698, + "grad_norm": 0.002598103601485491, + "learning_rate": 6.442432082794309e-05, + "loss": 0.0005, "step": 498 }, { - "epoch": 4.62037037037037, - "grad_norm": 0.023724934086203575, - "learning_rate": 1.7457872120201779e-06, - "loss": 0.0086, + "epoch": 1.6129292929292929, + "grad_norm": 0.006756095215678215, + "learning_rate": 6.45536869340233e-05, + "loss": 0.0014, "step": 499 }, { - "epoch": 4.62962962962963, - "grad_norm": 0.020578699186444283, - "learning_rate": 1.6621346816668992e-06, - "loss": 0.0091, + "epoch": 1.6161616161616161, + "grad_norm": 0.0038995088543742895, + "learning_rate": 6.46830530401035e-05, + "loss": 0.0007, "step": 500 }, { - "epoch": 4.62962962962963, - "eval_loss": 0.015207822434604168, - "eval_runtime": 9.1136, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 1.6161616161616161, + "eval_loss": 0.0010785168269649148, + "eval_runtime": 18.596, + "eval_samples_per_second": 5.378, + "eval_steps_per_second": 1.344, "step": 500 }, { - "epoch": 4.638888888888889, - "grad_norm": 0.024306217208504677, - "learning_rate": 1.5805019736097104e-06, - "loss": 0.009, + "epoch": 1.6193939393939394, + "grad_norm": 0.004936838988214731, + "learning_rate": 6.48124191461837e-05, + "loss": 0.0012, "step": 501 }, { - "epoch": 4.648148148148148, - "grad_norm": 0.020744021981954575, - "learning_rate": 1.5008924989128258e-06, - "loss": 0.0089, + "epoch": 1.6226262626262626, + "grad_norm": 0.006074307020753622, + "learning_rate": 6.494178525226392e-05, + "loss": 0.0022, "step": 502 }, { - "epoch": 4.657407407407407, - "grad_norm": 0.02516799047589302, - "learning_rate": 1.4233095840986753e-06, - "loss": 0.0093, + "epoch": 1.625858585858586, + "grad_norm": 0.0044588991440832615, + "learning_rate": 6.507115135834411e-05, + "loss": 0.0011, "step": 503 }, { - "epoch": 4.666666666666667, - "grad_norm": 0.024567998945713043, - "learning_rate": 1.3477564710088098e-06, - "loss": 0.0094, + "epoch": 1.6290909090909091, + "grad_norm": 0.004507533740252256, + "learning_rate": 6.520051746442432e-05, + "loss": 0.0011, "step": 504 }, { - "epoch": 4.675925925925926, - "grad_norm": 0.024358859285712242, - "learning_rate": 1.2742363166685034e-06, - "loss": 0.007, - "step": 505 - }, - { - "epoch": 4.675925925925926, - "eval_loss": 0.015200878493487835, - "eval_runtime": 9.1155, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 1.6323232323232322, + "grad_norm": 0.0048494781367480755, + "learning_rate": 6.532988357050453e-05, + "loss": 0.0008, "step": 505 }, { - "epoch": 4.685185185185185, - "grad_norm": 0.023163504898548126, - "learning_rate": 1.2027521931548214e-06, - "loss": 0.0074, + "epoch": 1.6355555555555554, + "grad_norm": 0.0033034745138138533, + "learning_rate": 6.545924967658473e-05, + "loss": 0.0007, "step": 506 }, { - "epoch": 4.694444444444445, - "grad_norm": 0.023604586720466614, - "learning_rate": 1.1333070874682216e-06, - "loss": 0.0093, + "epoch": 1.6387878787878787, + "grad_norm": 0.0053002117201685905, + "learning_rate": 6.558861578266494e-05, + "loss": 0.0009, "step": 507 }, { - "epoch": 4.703703703703704, - "grad_norm": 0.02068418823182583, - "learning_rate": 1.0659039014077944e-06, - "loss": 0.0084, + "epoch": 1.642020202020202, + "grad_norm": 0.00404641218483448, + "learning_rate": 6.571798188874515e-05, + "loss": 0.0008, "step": 508 }, { - "epoch": 4.712962962962963, - "grad_norm": 0.02598651312291622, - "learning_rate": 1.0005454514499414e-06, - "loss": 0.0088, + "epoch": 1.6452525252525252, + "grad_norm": 0.0036740771029144526, + "learning_rate": 6.584734799482536e-05, + "loss": 0.0009, "step": 509 }, { - "epoch": 4.722222222222222, - "grad_norm": 0.02512424811720848, - "learning_rate": 9.372344686307655e-07, - "loss": 0.0064, - "step": 510 - }, - { - "epoch": 4.722222222222222, - "eval_loss": 0.01521637849509716, - "eval_runtime": 9.1143, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, + "epoch": 1.6484848484848484, + "grad_norm": 0.005331697873771191, + "learning_rate": 6.597671410090557e-05, + "loss": 0.0014, "step": 510 }, { - "epoch": 4.731481481481482, - "grad_norm": 0.021041063591837883, - "learning_rate": 8.759735984318895e-07, - "loss": 0.0096, + "epoch": 1.6517171717171717, + "grad_norm": 0.004965492524206638, + "learning_rate": 6.610608020698578e-05, + "loss": 0.0019, "step": 511 }, { - "epoch": 4.7407407407407405, - "grad_norm": 0.025718161836266518, - "learning_rate": 8.167654006699443e-07, - "loss": 0.0077, + "epoch": 1.654949494949495, + "grad_norm": 0.0037727411836385727, + "learning_rate": 6.623544631306599e-05, + "loss": 0.001, "step": 512 }, { - "epoch": 4.75, - "grad_norm": 0.02913082391023636, - "learning_rate": 7.596123493895991e-07, - "loss": 0.0072, + "epoch": 1.6581818181818182, + "grad_norm": 0.0048839072696864605, + "learning_rate": 6.636481241914619e-05, + "loss": 0.0013, "step": 513 }, { - "epoch": 4.7592592592592595, - "grad_norm": 0.026588505133986473, - "learning_rate": 7.04516832760177e-07, - "loss": 0.0094, + "epoch": 1.6614141414141415, + "grad_norm": 0.004607974551618099, + "learning_rate": 6.649417852522638e-05, + "loss": 0.0017, "step": 514 }, { - "epoch": 4.768518518518518, - "grad_norm": 0.023728126659989357, - "learning_rate": 6.514811529758747e-07, - "loss": 0.0099, - "step": 515 - }, - { - "epoch": 4.768518518518518, - "eval_loss": 0.01521516963839531, - "eval_runtime": 9.1511, - "eval_samples_per_second": 5.464, - "eval_steps_per_second": 1.421, + "epoch": 1.6646464646464647, + "grad_norm": 0.004647100809961557, + "learning_rate": 6.66235446313066e-05, + "loss": 0.0024, "step": 515 }, { - "epoch": 4.777777777777778, - "grad_norm": 0.03438512608408928, - "learning_rate": 6.005075261595494e-07, - "loss": 0.0086, + "epoch": 1.667878787878788, + "grad_norm": 0.004482895601540804, + "learning_rate": 6.67529107373868e-05, + "loss": 0.0023, "step": 516 }, { - "epoch": 4.787037037037037, - "grad_norm": 0.019554298371076584, - "learning_rate": 5.515980822701439e-07, - "loss": 0.0092, + "epoch": 1.6711111111111112, + "grad_norm": 0.004681951366364956, + "learning_rate": 6.688227684346702e-05, + "loss": 0.0031, "step": 517 }, { - "epoch": 4.796296296296296, - "grad_norm": 0.0235204566270113, - "learning_rate": 5.047548650136513e-07, - "loss": 0.009, + "epoch": 1.6743434343434345, + "grad_norm": 0.004061279818415642, + "learning_rate": 6.701164294954723e-05, + "loss": 0.0012, "step": 518 }, { - "epoch": 4.805555555555555, - "grad_norm": 0.023747643455863, - "learning_rate": 4.5997983175773417e-07, - "loss": 0.0092, + "epoch": 1.6775757575757577, + "grad_norm": 0.005936305969953537, + "learning_rate": 6.714100905562742e-05, + "loss": 0.0018, "step": 519 }, { - "epoch": 4.814814814814815, - "grad_norm": 0.02751827985048294, - "learning_rate": 4.1727485344994486e-07, - "loss": 0.0088, - "step": 520 - }, - { - "epoch": 4.814814814814815, - "eval_loss": 0.015235532075166702, - "eval_runtime": 9.1256, - "eval_samples_per_second": 5.479, - "eval_steps_per_second": 1.425, + "epoch": 1.680808080808081, + "grad_norm": 0.003031841479241848, + "learning_rate": 6.727037516170763e-05, + "loss": 0.0009, "step": 520 }, { - "epoch": 4.824074074074074, - "grad_norm": 0.026621591299772263, - "learning_rate": 3.766417145395218e-07, - "loss": 0.0086, + "epoch": 1.684040404040404, + "grad_norm": 0.0044912113808095455, + "learning_rate": 6.739974126778784e-05, + "loss": 0.0013, "step": 521 }, { - "epoch": 4.833333333333333, - "grad_norm": 0.01991841197013855, - "learning_rate": 3.380821129028489e-07, - "loss": 0.0084, + "epoch": 1.6872727272727273, + "grad_norm": 0.003882101271301508, + "learning_rate": 6.752910737386805e-05, + "loss": 0.0014, "step": 522 }, { - "epoch": 4.842592592592593, - "grad_norm": 0.023508219048380852, - "learning_rate": 3.0159765977250673e-07, - "loss": 0.0103, + "epoch": 1.6905050505050505, + "grad_norm": 0.0031570433638989925, + "learning_rate": 6.765847347994826e-05, + "loss": 0.0011, "step": 523 }, { - "epoch": 4.851851851851852, - "grad_norm": 0.02976732887327671, - "learning_rate": 2.671898796699268e-07, - "loss": 0.0084, + "epoch": 1.6937373737373738, + "grad_norm": 0.004184515681117773, + "learning_rate": 6.778783958602847e-05, + "loss": 0.0015, "step": 524 }, { - "epoch": 4.861111111111111, - "grad_norm": 0.02255621738731861, - "learning_rate": 2.3486021034170857e-07, - "loss": 0.0089, + "epoch": 1.696969696969697, + "grad_norm": 0.002800683258101344, + "learning_rate": 6.791720569210867e-05, + "loss": 0.0008, "step": 525 }, { - "epoch": 4.861111111111111, - "eval_loss": 0.015216498635709286, - "eval_runtime": 9.1106, - "eval_samples_per_second": 5.488, - "eval_steps_per_second": 1.427, + "epoch": 1.696969696969697, + "eval_loss": 0.0010542384115979075, + "eval_runtime": 18.5989, + "eval_samples_per_second": 5.377, + "eval_steps_per_second": 1.344, "step": 525 }, { - "epoch": 4.87037037037037, - "grad_norm": 0.025215914472937584, - "learning_rate": 2.0461000269953456e-07, - "loss": 0.0075, + "epoch": 1.70020202020202, + "grad_norm": 0.003550174878910184, + "learning_rate": 6.804657179818887e-05, + "loss": 0.0014, "step": 526 }, { - "epoch": 4.87962962962963, - "grad_norm": 0.02554066851735115, - "learning_rate": 1.7644052076371542e-07, - "loss": 0.0083, + "epoch": 1.7034343434343433, + "grad_norm": 0.0030985972844064236, + "learning_rate": 6.817593790426908e-05, + "loss": 0.0008, "step": 527 }, { - "epoch": 4.888888888888889, - "grad_norm": 0.02162836864590645, - "learning_rate": 1.503529416103988e-07, - "loss": 0.009, + "epoch": 1.7066666666666666, + "grad_norm": 0.0048317620530724525, + "learning_rate": 6.830530401034929e-05, + "loss": 0.0018, "step": 528 }, { - "epoch": 4.898148148148148, - "grad_norm": 0.02335723116993904, - "learning_rate": 1.2634835532233657e-07, - "loss": 0.0093, + "epoch": 1.7098989898989898, + "grad_norm": 0.005726094823330641, + "learning_rate": 6.84346701164295e-05, + "loss": 0.0025, "step": 529 }, { - "epoch": 4.907407407407407, - "grad_norm": 0.02844967506825924, - "learning_rate": 1.044277649433989e-07, - "loss": 0.0083, - "step": 530 - }, - { - "epoch": 4.907407407407407, - "eval_loss": 0.015229844488203526, - "eval_runtime": 9.1406, - "eval_samples_per_second": 5.47, - "eval_steps_per_second": 1.422, + "epoch": 1.713131313131313, + "grad_norm": 0.0024808107409626245, + "learning_rate": 6.856403622250971e-05, + "loss": 0.0007, "step": 530 }, { - "epoch": 4.916666666666667, - "grad_norm": 0.02188325859606266, - "learning_rate": 8.459208643659122e-08, - "loss": 0.0084, + "epoch": 1.7163636363636363, + "grad_norm": 0.003422652604058385, + "learning_rate": 6.869340232858992e-05, + "loss": 0.001, "step": 531 }, { - "epoch": 4.925925925925926, - "grad_norm": 0.026782654225826263, - "learning_rate": 6.684214864584038e-08, - "loss": 0.009, + "epoch": 1.7195959595959596, + "grad_norm": 0.0037957008462399244, + "learning_rate": 6.882276843467012e-05, + "loss": 0.0009, "step": 532 }, { - "epoch": 4.935185185185185, - "grad_norm": 0.024010982364416122, - "learning_rate": 5.11786932613223e-08, - "loss": 0.0055, + "epoch": 1.7228282828282828, + "grad_norm": 0.0028711955528706312, + "learning_rate": 6.895213454075033e-05, + "loss": 0.0008, "step": 533 }, { - "epoch": 4.944444444444445, - "grad_norm": 0.02621973119676113, - "learning_rate": 3.760237478849793e-08, - "loss": 0.0093, + "epoch": 1.726060606060606, + "grad_norm": 0.002812835620716214, + "learning_rate": 6.908150064683054e-05, + "loss": 0.0009, "step": 534 }, { - "epoch": 4.953703703703704, - "grad_norm": 0.02257387712597847, - "learning_rate": 2.6113760520735108e-08, - "loss": 0.0103, - "step": 535 - }, - { - "epoch": 4.953703703703704, - "eval_loss": 0.015256751328706741, - "eval_runtime": 9.1156, - "eval_samples_per_second": 5.485, - "eval_steps_per_second": 1.426, + "epoch": 1.7292929292929293, + "grad_norm": 0.003739473642781377, + "learning_rate": 6.921086675291075e-05, + "loss": 0.0016, "step": 535 }, { - "epoch": 4.962962962962963, - "grad_norm": 0.02289225161075592, - "learning_rate": 1.6713330515627513e-08, - "loss": 0.0106, + "epoch": 1.7325252525252526, + "grad_norm": 0.004807054530829191, + "learning_rate": 6.934023285899096e-05, + "loss": 0.0023, "step": 536 }, { - "epoch": 4.972222222222222, - "grad_norm": 0.032289694994688034, - "learning_rate": 9.401477574932926e-09, - "loss": 0.0074, + "epoch": 1.7357575757575758, + "grad_norm": 0.004576352424919605, + "learning_rate": 6.946959896507115e-05, + "loss": 0.0024, "step": 537 }, { - "epoch": 4.981481481481482, - "grad_norm": 0.0215620007365942, - "learning_rate": 4.178507228136397e-09, - "loss": 0.0082, + "epoch": 1.738989898989899, + "grad_norm": 0.0030553669203072786, + "learning_rate": 6.959896507115135e-05, + "loss": 0.0009, "step": 538 }, { - "epoch": 4.9907407407407405, - "grad_norm": 0.02391226962208748, - "learning_rate": 1.0446377197104173e-09, - "loss": 0.0085, + "epoch": 1.7422222222222223, + "grad_norm": 0.003879109164699912, + "learning_rate": 6.972833117723156e-05, + "loss": 0.0012, "step": 539 }, { - "epoch": 5.0, - "grad_norm": 0.0241775494068861, - "learning_rate": 0.0, - "loss": 0.0092, + "epoch": 1.7454545454545456, + "grad_norm": 0.0031888741068542004, + "learning_rate": 6.985769728331177e-05, + "loss": 0.0013, "step": 540 }, { - "epoch": 5.0, - "eval_loss": 0.01526525616645813, - "eval_runtime": 9.1149, - "eval_samples_per_second": 5.486, - "eval_steps_per_second": 1.426, - "step": 540 + "epoch": 1.7486868686868688, + "grad_norm": 0.0037957336753606796, + "learning_rate": 6.998706338939198e-05, + "loss": 0.0008, + "step": 541 }, { - "epoch": 5.0, - "step": 540, - "total_flos": 1.2254685925518213e+18, - "train_loss": 0.016027936152251506, - "train_runtime": 9839.9649, - "train_samples_per_second": 1.756, - "train_steps_per_second": 0.055 - } - ], - "logging_steps": 1, - "max_steps": 540, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 50, - "stateful_callbacks": { + "epoch": 1.7519191919191919, + "grad_norm": 0.002901956904679537, + "learning_rate": 7.011642949547219e-05, + "loss": 0.0007, + "step": 542 + }, + { + "epoch": 1.7551515151515151, + "grad_norm": 0.0029531833715736866, + "learning_rate": 7.02457956015524e-05, + "loss": 0.0008, + "step": 543 + }, + { + "epoch": 1.7583838383838384, + "grad_norm": 0.004063300788402557, + "learning_rate": 7.037516170763261e-05, + "loss": 0.0012, + "step": 544 + }, + { + "epoch": 1.7616161616161616, + "grad_norm": 0.0041914028115570545, + "learning_rate": 7.050452781371281e-05, + "loss": 0.001, + "step": 545 + }, + { + "epoch": 1.7648484848484849, + "grad_norm": 0.004884886089712381, + "learning_rate": 7.063389391979302e-05, + "loss": 0.0021, + "step": 546 + }, + { + "epoch": 1.768080808080808, + "grad_norm": 0.005683641415089369, + "learning_rate": 7.076326002587323e-05, + "loss": 0.0024, + "step": 547 + }, + { + "epoch": 1.7713131313131312, + "grad_norm": 0.0042356885969638824, + "learning_rate": 7.089262613195343e-05, + "loss": 0.0015, + "step": 548 + }, + { + "epoch": 1.7745454545454544, + "grad_norm": 0.005650636274367571, + "learning_rate": 7.102199223803364e-05, + "loss": 0.0013, + "step": 549 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.00460450816899538, + "learning_rate": 7.115135834411385e-05, + "loss": 0.001, + "step": 550 + }, + { + "epoch": 1.7777777777777777, + "eval_loss": 0.0010012522106990218, + "eval_runtime": 18.6807, + "eval_samples_per_second": 5.353, + "eval_steps_per_second": 1.338, + "step": 550 + }, + { + "epoch": 1.781010101010101, + "grad_norm": 0.004403329454362392, + "learning_rate": 7.128072445019404e-05, + "loss": 0.0012, + "step": 551 + }, + { + "epoch": 1.7842424242424242, + "grad_norm": 0.0032920974772423506, + "learning_rate": 7.141009055627425e-05, + "loss": 0.0008, + "step": 552 + }, + { + "epoch": 1.7874747474747474, + "grad_norm": 0.0038138548843562603, + "learning_rate": 7.153945666235446e-05, + "loss": 0.0011, + "step": 553 + }, + { + "epoch": 1.7907070707070707, + "grad_norm": 0.00296188285574317, + "learning_rate": 7.166882276843467e-05, + "loss": 0.0008, + "step": 554 + }, + { + "epoch": 1.793939393939394, + "grad_norm": 0.0031317227985709906, + "learning_rate": 7.179818887451488e-05, + "loss": 0.001, + "step": 555 + }, + { + "epoch": 1.7971717171717172, + "grad_norm": 0.003491588868200779, + "learning_rate": 7.19275549805951e-05, + "loss": 0.0007, + "step": 556 + }, + { + "epoch": 1.8004040404040405, + "grad_norm": 0.0031343346927314997, + "learning_rate": 7.20569210866753e-05, + "loss": 0.0005, + "step": 557 + }, + { + "epoch": 1.8036363636363637, + "grad_norm": 0.004438635893166065, + "learning_rate": 7.21862871927555e-05, + "loss": 0.0011, + "step": 558 + }, + { + "epoch": 1.806868686868687, + "grad_norm": 0.00411741528660059, + "learning_rate": 7.231565329883571e-05, + "loss": 0.0008, + "step": 559 + }, + { + "epoch": 1.8101010101010102, + "grad_norm": 0.004590868018567562, + "learning_rate": 7.244501940491591e-05, + "loss": 0.0009, + "step": 560 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 0.00454489141702652, + "learning_rate": 7.257438551099612e-05, + "loss": 0.001, + "step": 561 + }, + { + "epoch": 1.8165656565656567, + "grad_norm": 0.004015767015516758, + "learning_rate": 7.270375161707633e-05, + "loss": 0.0007, + "step": 562 + }, + { + "epoch": 1.8197979797979797, + "grad_norm": 0.003563391976058483, + "learning_rate": 7.283311772315654e-05, + "loss": 0.0008, + "step": 563 + }, + { + "epoch": 1.823030303030303, + "grad_norm": 0.0037166401743888855, + "learning_rate": 7.296248382923674e-05, + "loss": 0.0009, + "step": 564 + }, + { + "epoch": 1.8262626262626263, + "grad_norm": 0.004116456024348736, + "learning_rate": 7.309184993531695e-05, + "loss": 0.0009, + "step": 565 + }, + { + "epoch": 1.8294949494949495, + "grad_norm": 0.004104538355022669, + "learning_rate": 7.322121604139716e-05, + "loss": 0.0013, + "step": 566 + }, + { + "epoch": 1.8327272727272728, + "grad_norm": 0.005509045906364918, + "learning_rate": 7.335058214747737e-05, + "loss": 0.0023, + "step": 567 + }, + { + "epoch": 1.835959595959596, + "grad_norm": 0.0035526566207408905, + "learning_rate": 7.347994825355758e-05, + "loss": 0.0008, + "step": 568 + }, + { + "epoch": 1.839191919191919, + "grad_norm": 0.003440872300416231, + "learning_rate": 7.360931435963779e-05, + "loss": 0.0007, + "step": 569 + }, + { + "epoch": 1.8424242424242423, + "grad_norm": 0.0029369727708399296, + "learning_rate": 7.3738680465718e-05, + "loss": 0.0008, + "step": 570 + }, + { + "epoch": 1.8456565656565656, + "grad_norm": 0.004200720228254795, + "learning_rate": 7.38680465717982e-05, + "loss": 0.0011, + "step": 571 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 0.005561929661780596, + "learning_rate": 7.399741267787839e-05, + "loss": 0.0018, + "step": 572 + }, + { + "epoch": 1.852121212121212, + "grad_norm": 0.0033285904210060835, + "learning_rate": 7.41267787839586e-05, + "loss": 0.0009, + "step": 573 + }, + { + "epoch": 1.8553535353535353, + "grad_norm": 0.004428067244589329, + "learning_rate": 7.425614489003881e-05, + "loss": 0.0015, + "step": 574 + }, + { + "epoch": 1.8585858585858586, + "grad_norm": 0.003229588968679309, + "learning_rate": 7.438551099611902e-05, + "loss": 0.0007, + "step": 575 + }, + { + "epoch": 1.8585858585858586, + "eval_loss": 0.0009781663538888097, + "eval_runtime": 18.7511, + "eval_samples_per_second": 5.333, + "eval_steps_per_second": 1.333, + "step": 575 + }, + { + "epoch": 1.8618181818181818, + "grad_norm": 0.003516717813909054, + "learning_rate": 7.451487710219923e-05, + "loss": 0.0011, + "step": 576 + }, + { + "epoch": 1.865050505050505, + "grad_norm": 0.002692221663892269, + "learning_rate": 7.464424320827943e-05, + "loss": 0.0008, + "step": 577 + }, + { + "epoch": 1.8682828282828283, + "grad_norm": 0.003864881582558155, + "learning_rate": 7.477360931435964e-05, + "loss": 0.0013, + "step": 578 + }, + { + "epoch": 1.8715151515151516, + "grad_norm": 0.003865283913910389, + "learning_rate": 7.490297542043985e-05, + "loss": 0.0015, + "step": 579 + }, + { + "epoch": 1.8747474747474748, + "grad_norm": 0.0035206254106014967, + "learning_rate": 7.503234152652006e-05, + "loss": 0.0012, + "step": 580 + }, + { + "epoch": 1.877979797979798, + "grad_norm": 0.004602121654897928, + "learning_rate": 7.516170763260027e-05, + "loss": 0.0018, + "step": 581 + }, + { + "epoch": 1.8812121212121213, + "grad_norm": 0.0029543524142354727, + "learning_rate": 7.529107373868047e-05, + "loss": 0.0009, + "step": 582 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 0.0023640701547265053, + "learning_rate": 7.542043984476068e-05, + "loss": 0.0007, + "step": 583 + }, + { + "epoch": 1.8876767676767678, + "grad_norm": 0.0040098559111356735, + "learning_rate": 7.554980595084087e-05, + "loss": 0.0012, + "step": 584 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 0.004274715203791857, + "learning_rate": 7.567917205692108e-05, + "loss": 0.0015, + "step": 585 + }, + { + "epoch": 1.8941414141414141, + "grad_norm": 0.0031054418068379164, + "learning_rate": 7.58085381630013e-05, + "loss": 0.0008, + "step": 586 + }, + { + "epoch": 1.8973737373737374, + "grad_norm": 0.003954428713768721, + "learning_rate": 7.59379042690815e-05, + "loss": 0.0009, + "step": 587 + }, + { + "epoch": 1.9006060606060606, + "grad_norm": 0.003786911489441991, + "learning_rate": 7.606727037516171e-05, + "loss": 0.0006, + "step": 588 + }, + { + "epoch": 1.9038383838383839, + "grad_norm": 0.0041464087553322315, + "learning_rate": 7.619663648124192e-05, + "loss": 0.0011, + "step": 589 + }, + { + "epoch": 1.907070707070707, + "grad_norm": 0.004568018950521946, + "learning_rate": 7.632600258732212e-05, + "loss": 0.0019, + "step": 590 + }, + { + "epoch": 1.9103030303030302, + "grad_norm": 0.003288006642833352, + "learning_rate": 7.645536869340233e-05, + "loss": 0.0009, + "step": 591 + }, + { + "epoch": 1.9135353535353534, + "grad_norm": 0.004277890548110008, + "learning_rate": 7.658473479948254e-05, + "loss": 0.0008, + "step": 592 + }, + { + "epoch": 1.9167676767676767, + "grad_norm": 0.004379922058433294, + "learning_rate": 7.671410090556275e-05, + "loss": 0.001, + "step": 593 + }, + { + "epoch": 1.92, + "grad_norm": 0.003021504729986191, + "learning_rate": 7.684346701164295e-05, + "loss": 0.0006, + "step": 594 + }, + { + "epoch": 1.9232323232323232, + "grad_norm": 0.0032772128470242023, + "learning_rate": 7.697283311772316e-05, + "loss": 0.0008, + "step": 595 + }, + { + "epoch": 1.9264646464646464, + "grad_norm": 0.004533900413662195, + "learning_rate": 7.710219922380336e-05, + "loss": 0.0015, + "step": 596 + }, + { + "epoch": 1.9296969696969697, + "grad_norm": 0.005284740123897791, + "learning_rate": 7.723156532988357e-05, + "loss": 0.0011, + "step": 597 + }, + { + "epoch": 1.932929292929293, + "grad_norm": 0.005875818431377411, + "learning_rate": 7.736093143596378e-05, + "loss": 0.0013, + "step": 598 + }, + { + "epoch": 1.9361616161616162, + "grad_norm": 0.004483737051486969, + "learning_rate": 7.749029754204399e-05, + "loss": 0.0009, + "step": 599 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.004435641225427389, + "learning_rate": 7.76196636481242e-05, + "loss": 0.0013, + "step": 600 + }, + { + "epoch": 1.9393939393939394, + "eval_loss": 0.0009266917477361858, + "eval_runtime": 18.7646, + "eval_samples_per_second": 5.329, + "eval_steps_per_second": 1.332, + "step": 600 + }, + { + "epoch": 1.9426262626262627, + "grad_norm": 0.003876154311001301, + "learning_rate": 7.774902975420441e-05, + "loss": 0.001, + "step": 601 + }, + { + "epoch": 1.945858585858586, + "grad_norm": 0.004851337987929583, + "learning_rate": 7.787839586028462e-05, + "loss": 0.0014, + "step": 602 + }, + { + "epoch": 1.9490909090909092, + "grad_norm": 0.003333737375214696, + "learning_rate": 7.800776196636481e-05, + "loss": 0.001, + "step": 603 + }, + { + "epoch": 1.9523232323232325, + "grad_norm": 0.0032944250851869583, + "learning_rate": 7.813712807244502e-05, + "loss": 0.0007, + "step": 604 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.004762569442391396, + "learning_rate": 7.826649417852523e-05, + "loss": 0.0027, + "step": 605 + }, + { + "epoch": 1.9587878787878787, + "grad_norm": 0.0037507396191358566, + "learning_rate": 7.839586028460543e-05, + "loss": 0.0014, + "step": 606 + }, + { + "epoch": 1.962020202020202, + "grad_norm": 0.005207899492233992, + "learning_rate": 7.852522639068564e-05, + "loss": 0.0031, + "step": 607 + }, + { + "epoch": 1.9652525252525253, + "grad_norm": 0.0035236128605902195, + "learning_rate": 7.865459249676585e-05, + "loss": 0.001, + "step": 608 + }, + { + "epoch": 1.9684848484848485, + "grad_norm": 0.004186858423054218, + "learning_rate": 7.878395860284605e-05, + "loss": 0.0012, + "step": 609 + }, + { + "epoch": 1.9717171717171718, + "grad_norm": 0.0038515774067491293, + "learning_rate": 7.891332470892626e-05, + "loss": 0.0022, + "step": 610 + }, + { + "epoch": 1.9749494949494948, + "grad_norm": 0.0030782243702560663, + "learning_rate": 7.904269081500647e-05, + "loss": 0.0006, + "step": 611 + }, + { + "epoch": 1.978181818181818, + "grad_norm": 0.005162171553820372, + "learning_rate": 7.917205692108668e-05, + "loss": 0.0013, + "step": 612 + }, + { + "epoch": 1.9814141414141413, + "grad_norm": 0.004226456396281719, + "learning_rate": 7.930142302716689e-05, + "loss": 0.0009, + "step": 613 + }, + { + "epoch": 1.9846464646464645, + "grad_norm": 0.003615841967985034, + "learning_rate": 7.94307891332471e-05, + "loss": 0.0022, + "step": 614 + }, + { + "epoch": 1.9878787878787878, + "grad_norm": 0.00408519571647048, + "learning_rate": 7.956015523932731e-05, + "loss": 0.0015, + "step": 615 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 0.0042512728832662106, + "learning_rate": 7.968952134540751e-05, + "loss": 0.0014, + "step": 616 + }, + { + "epoch": 1.9943434343434343, + "grad_norm": 0.003178700339049101, + "learning_rate": 7.981888745148772e-05, + "loss": 0.0008, + "step": 617 + }, + { + "epoch": 1.9975757575757576, + "grad_norm": 0.003822662867605686, + "learning_rate": 7.994825355756791e-05, + "loss": 0.001, + "step": 618 + }, + { + "epoch": 2.000808080808081, + "grad_norm": 0.0046024019829928875, + "learning_rate": 8.007761966364812e-05, + "loss": 0.0011, + "step": 619 + }, + { + "epoch": 2.004040404040404, + "grad_norm": 0.0038011916913092136, + "learning_rate": 8.020698576972833e-05, + "loss": 0.0008, + "step": 620 + }, + { + "epoch": 2.0072727272727273, + "grad_norm": 0.006909184157848358, + "learning_rate": 8.033635187580855e-05, + "loss": 0.0026, + "step": 621 + }, + { + "epoch": 2.0105050505050506, + "grad_norm": 0.005042599979788065, + "learning_rate": 8.046571798188874e-05, + "loss": 0.0011, + "step": 622 + }, + { + "epoch": 2.013737373737374, + "grad_norm": 0.004002322442829609, + "learning_rate": 8.059508408796895e-05, + "loss": 0.0011, + "step": 623 + }, + { + "epoch": 2.016969696969697, + "grad_norm": 0.0031209783628582954, + "learning_rate": 8.072445019404916e-05, + "loss": 0.0009, + "step": 624 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 0.0026158462278544903, + "learning_rate": 8.085381630012937e-05, + "loss": 0.0007, + "step": 625 + }, + { + "epoch": 2.0202020202020203, + "eval_loss": 0.0009721739334054291, + "eval_runtime": 18.7204, + "eval_samples_per_second": 5.342, + "eval_steps_per_second": 1.335, + "step": 625 + }, + { + "epoch": 2.0234343434343436, + "grad_norm": 0.003943873103708029, + "learning_rate": 8.098318240620958e-05, + "loss": 0.001, + "step": 626 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 0.0038578941021114588, + "learning_rate": 8.111254851228979e-05, + "loss": 0.0017, + "step": 627 + }, + { + "epoch": 2.02989898989899, + "grad_norm": 0.002991423010826111, + "learning_rate": 8.124191461836999e-05, + "loss": 0.0007, + "step": 628 + }, + { + "epoch": 2.0331313131313133, + "grad_norm": 0.002815463813021779, + "learning_rate": 8.13712807244502e-05, + "loss": 0.0009, + "step": 629 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 0.0025943731889128685, + "learning_rate": 8.15006468305304e-05, + "loss": 0.0006, + "step": 630 + }, + { + "epoch": 2.0395959595959594, + "grad_norm": 0.0029918155632913113, + "learning_rate": 8.163001293661061e-05, + "loss": 0.0008, + "step": 631 + }, + { + "epoch": 2.0428282828282827, + "grad_norm": 0.00398919777944684, + "learning_rate": 8.175937904269082e-05, + "loss": 0.0011, + "step": 632 + }, + { + "epoch": 2.046060606060606, + "grad_norm": 0.003045483957976103, + "learning_rate": 8.188874514877103e-05, + "loss": 0.0007, + "step": 633 + }, + { + "epoch": 2.049292929292929, + "grad_norm": 0.005803203675895929, + "learning_rate": 8.201811125485124e-05, + "loss": 0.0008, + "step": 634 + }, + { + "epoch": 2.0525252525252524, + "grad_norm": 0.003953828942030668, + "learning_rate": 8.214747736093143e-05, + "loss": 0.001, + "step": 635 + }, + { + "epoch": 2.0557575757575757, + "grad_norm": 0.003973971586674452, + "learning_rate": 8.227684346701164e-05, + "loss": 0.0006, + "step": 636 + }, + { + "epoch": 2.058989898989899, + "grad_norm": 0.0025888276286423206, + "learning_rate": 8.240620957309186e-05, + "loss": 0.0005, + "step": 637 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 0.003989651799201965, + "learning_rate": 8.253557567917207e-05, + "loss": 0.0008, + "step": 638 + }, + { + "epoch": 2.0654545454545454, + "grad_norm": 0.0030330433510243893, + "learning_rate": 8.266494178525228e-05, + "loss": 0.0009, + "step": 639 + }, + { + "epoch": 2.0686868686868687, + "grad_norm": 0.0044325897470116615, + "learning_rate": 8.279430789133247e-05, + "loss": 0.0016, + "step": 640 + }, + { + "epoch": 2.071919191919192, + "grad_norm": 0.002264161128550768, + "learning_rate": 8.292367399741268e-05, + "loss": 0.0006, + "step": 641 + }, + { + "epoch": 2.075151515151515, + "grad_norm": 0.0033110452350229025, + "learning_rate": 8.305304010349288e-05, + "loss": 0.001, + "step": 642 + }, + { + "epoch": 2.0783838383838384, + "grad_norm": 0.0029199772980064154, + "learning_rate": 8.318240620957309e-05, + "loss": 0.0006, + "step": 643 + }, + { + "epoch": 2.0816161616161617, + "grad_norm": 0.0031028217636048794, + "learning_rate": 8.33117723156533e-05, + "loss": 0.0007, + "step": 644 + }, + { + "epoch": 2.084848484848485, + "grad_norm": 0.004011626821011305, + "learning_rate": 8.344113842173351e-05, + "loss": 0.0009, + "step": 645 + }, + { + "epoch": 2.088080808080808, + "grad_norm": 0.003449072130024433, + "learning_rate": 8.357050452781372e-05, + "loss": 0.0009, + "step": 646 + }, + { + "epoch": 2.0913131313131315, + "grad_norm": 0.003367091529071331, + "learning_rate": 8.369987063389393e-05, + "loss": 0.001, + "step": 647 + }, + { + "epoch": 2.0945454545454547, + "grad_norm": 0.003321894910186529, + "learning_rate": 8.382923673997413e-05, + "loss": 0.0016, + "step": 648 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 0.003046546597033739, + "learning_rate": 8.395860284605434e-05, + "loss": 0.0008, + "step": 649 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 0.0023006205447018147, + "learning_rate": 8.408796895213455e-05, + "loss": 0.0006, + "step": 650 + }, + { + "epoch": 2.101010101010101, + "eval_loss": 0.0009433354716747999, + "eval_runtime": 18.7374, + "eval_samples_per_second": 5.337, + "eval_steps_per_second": 1.334, + "step": 650 + }, + { + "epoch": 2.1042424242424245, + "grad_norm": 0.004013998433947563, + "learning_rate": 8.421733505821476e-05, + "loss": 0.001, + "step": 651 + }, + { + "epoch": 2.1074747474747473, + "grad_norm": 0.0035437876358628273, + "learning_rate": 8.434670116429496e-05, + "loss": 0.0007, + "step": 652 + }, + { + "epoch": 2.1107070707070705, + "grad_norm": 0.004152386449277401, + "learning_rate": 8.447606727037517e-05, + "loss": 0.001, + "step": 653 + }, + { + "epoch": 2.113939393939394, + "grad_norm": 0.0031009165104478598, + "learning_rate": 8.460543337645536e-05, + "loss": 0.0007, + "step": 654 + }, + { + "epoch": 2.117171717171717, + "grad_norm": 0.002030389616265893, + "learning_rate": 8.473479948253557e-05, + "loss": 0.0005, + "step": 655 + }, + { + "epoch": 2.1204040404040403, + "grad_norm": 0.003986351191997528, + "learning_rate": 8.486416558861578e-05, + "loss": 0.0011, + "step": 656 + }, + { + "epoch": 2.1236363636363635, + "grad_norm": 0.0037633986212313175, + "learning_rate": 8.499353169469599e-05, + "loss": 0.0017, + "step": 657 + }, + { + "epoch": 2.126868686868687, + "grad_norm": 0.003191509749740362, + "learning_rate": 8.51228978007762e-05, + "loss": 0.0009, + "step": 658 + }, + { + "epoch": 2.13010101010101, + "grad_norm": 0.003234416712075472, + "learning_rate": 8.525226390685641e-05, + "loss": 0.0015, + "step": 659 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.00314782140776515, + "learning_rate": 8.538163001293662e-05, + "loss": 0.0008, + "step": 660 + }, + { + "epoch": 2.1365656565656566, + "grad_norm": 0.004626644309610128, + "learning_rate": 8.551099611901682e-05, + "loss": 0.0017, + "step": 661 + }, + { + "epoch": 2.13979797979798, + "grad_norm": 0.0037515638396143913, + "learning_rate": 8.564036222509703e-05, + "loss": 0.001, + "step": 662 + }, + { + "epoch": 2.143030303030303, + "grad_norm": 0.0028777304105460644, + "learning_rate": 8.576972833117724e-05, + "loss": 0.0009, + "step": 663 + }, + { + "epoch": 2.1462626262626263, + "grad_norm": 0.0035881204530596733, + "learning_rate": 8.589909443725744e-05, + "loss": 0.0008, + "step": 664 + }, + { + "epoch": 2.1494949494949496, + "grad_norm": 0.002268304582685232, + "learning_rate": 8.602846054333765e-05, + "loss": 0.0005, + "step": 665 + }, + { + "epoch": 2.152727272727273, + "grad_norm": 0.003100323723629117, + "learning_rate": 8.615782664941786e-05, + "loss": 0.0008, + "step": 666 + }, + { + "epoch": 2.155959595959596, + "grad_norm": 0.003520137397572398, + "learning_rate": 8.628719275549805e-05, + "loss": 0.0009, + "step": 667 + }, + { + "epoch": 2.1591919191919193, + "grad_norm": 0.0042273253202438354, + "learning_rate": 8.641655886157827e-05, + "loss": 0.0015, + "step": 668 + }, + { + "epoch": 2.1624242424242426, + "grad_norm": 0.0030146201606839895, + "learning_rate": 8.654592496765848e-05, + "loss": 0.0009, + "step": 669 + }, + { + "epoch": 2.165656565656566, + "grad_norm": 0.0055503263138234615, + "learning_rate": 8.667529107373869e-05, + "loss": 0.0025, + "step": 670 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 0.0036939766723662615, + "learning_rate": 8.68046571798189e-05, + "loss": 0.002, + "step": 671 + }, + { + "epoch": 2.172121212121212, + "grad_norm": 0.0032185425516217947, + "learning_rate": 8.69340232858991e-05, + "loss": 0.0009, + "step": 672 + }, + { + "epoch": 2.175353535353535, + "grad_norm": 0.0030356363859027624, + "learning_rate": 8.706338939197932e-05, + "loss": 0.0007, + "step": 673 + }, + { + "epoch": 2.1785858585858584, + "grad_norm": 0.006692877039313316, + "learning_rate": 8.719275549805951e-05, + "loss": 0.0008, + "step": 674 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.0026474855840206146, + "learning_rate": 8.732212160413972e-05, + "loss": 0.0007, + "step": 675 + }, + { + "epoch": 2.1818181818181817, + "eval_loss": 0.0009410877246409655, + "eval_runtime": 18.7748, + "eval_samples_per_second": 5.326, + "eval_steps_per_second": 1.332, + "step": 675 + }, + { + "epoch": 2.185050505050505, + "grad_norm": 0.003672214224934578, + "learning_rate": 8.745148771021992e-05, + "loss": 0.0018, + "step": 676 + }, + { + "epoch": 2.188282828282828, + "grad_norm": 0.0031880387105047703, + "learning_rate": 8.758085381630013e-05, + "loss": 0.0012, + "step": 677 + }, + { + "epoch": 2.1915151515151514, + "grad_norm": 0.0033309224527329206, + "learning_rate": 8.771021992238034e-05, + "loss": 0.001, + "step": 678 + }, + { + "epoch": 2.1947474747474747, + "grad_norm": 0.003027291502803564, + "learning_rate": 8.783958602846055e-05, + "loss": 0.0008, + "step": 679 + }, + { + "epoch": 2.197979797979798, + "grad_norm": 0.0036887172609567642, + "learning_rate": 8.796895213454075e-05, + "loss": 0.001, + "step": 680 + }, + { + "epoch": 2.201212121212121, + "grad_norm": 0.003997828811407089, + "learning_rate": 8.809831824062096e-05, + "loss": 0.0011, + "step": 681 + }, + { + "epoch": 2.2044444444444444, + "grad_norm": 0.003314135130494833, + "learning_rate": 8.822768434670117e-05, + "loss": 0.0017, + "step": 682 + }, + { + "epoch": 2.2076767676767677, + "grad_norm": 0.0036913359072059393, + "learning_rate": 8.835705045278138e-05, + "loss": 0.0009, + "step": 683 + }, + { + "epoch": 2.210909090909091, + "grad_norm": 0.0030470637138932943, + "learning_rate": 8.848641655886159e-05, + "loss": 0.0008, + "step": 684 + }, + { + "epoch": 2.214141414141414, + "grad_norm": 0.004008392803370953, + "learning_rate": 8.86157826649418e-05, + "loss": 0.0012, + "step": 685 + }, + { + "epoch": 2.2173737373737374, + "grad_norm": 0.0055717285722494125, + "learning_rate": 8.8745148771022e-05, + "loss": 0.0019, + "step": 686 + }, + { + "epoch": 2.2206060606060607, + "grad_norm": 0.002990501932799816, + "learning_rate": 8.88745148771022e-05, + "loss": 0.0009, + "step": 687 + }, + { + "epoch": 2.223838383838384, + "grad_norm": 0.002291604643687606, + "learning_rate": 8.90038809831824e-05, + "loss": 0.0006, + "step": 688 + }, + { + "epoch": 2.227070707070707, + "grad_norm": 0.002231605350971222, + "learning_rate": 8.913324708926261e-05, + "loss": 0.0006, + "step": 689 + }, + { + "epoch": 2.2303030303030305, + "grad_norm": 0.002650222275406122, + "learning_rate": 8.926261319534282e-05, + "loss": 0.0007, + "step": 690 + }, + { + "epoch": 2.2335353535353537, + "grad_norm": 0.0019864251371473074, + "learning_rate": 8.939197930142303e-05, + "loss": 0.0006, + "step": 691 + }, + { + "epoch": 2.236767676767677, + "grad_norm": 0.0028250280302017927, + "learning_rate": 8.952134540750324e-05, + "loss": 0.0006, + "step": 692 + }, + { + "epoch": 2.24, + "grad_norm": 0.0035562533885240555, + "learning_rate": 8.965071151358344e-05, + "loss": 0.0018, + "step": 693 + }, + { + "epoch": 2.2432323232323235, + "grad_norm": 0.0030060771387070417, + "learning_rate": 8.978007761966365e-05, + "loss": 0.0007, + "step": 694 + }, + { + "epoch": 2.2464646464646463, + "grad_norm": 0.002113576978445053, + "learning_rate": 8.990944372574386e-05, + "loss": 0.0005, + "step": 695 + }, + { + "epoch": 2.2496969696969695, + "grad_norm": 0.002114512724801898, + "learning_rate": 9.003880983182407e-05, + "loss": 0.0006, + "step": 696 + }, + { + "epoch": 2.252929292929293, + "grad_norm": 0.0029473064932972193, + "learning_rate": 9.016817593790428e-05, + "loss": 0.0006, + "step": 697 + }, + { + "epoch": 2.256161616161616, + "grad_norm": 0.004192824941128492, + "learning_rate": 9.029754204398448e-05, + "loss": 0.001, + "step": 698 + }, + { + "epoch": 2.2593939393939393, + "grad_norm": 0.003509392263367772, + "learning_rate": 9.042690815006469e-05, + "loss": 0.0009, + "step": 699 + }, + { + "epoch": 2.2626262626262625, + "grad_norm": 0.004787352867424488, + "learning_rate": 9.055627425614489e-05, + "loss": 0.001, + "step": 700 + }, + { + "epoch": 2.2626262626262625, + "eval_loss": 0.000903558568097651, + "eval_runtime": 18.694, + "eval_samples_per_second": 5.349, + "eval_steps_per_second": 1.337, + "step": 700 + }, + { + "epoch": 2.265858585858586, + "grad_norm": 0.0027677167672663927, + "learning_rate": 9.06856403622251e-05, + "loss": 0.0009, + "step": 701 + }, + { + "epoch": 2.269090909090909, + "grad_norm": 0.0026491908356547356, + "learning_rate": 9.08150064683053e-05, + "loss": 0.0007, + "step": 702 + }, + { + "epoch": 2.2723232323232323, + "grad_norm": 0.0025004090275615454, + "learning_rate": 9.094437257438552e-05, + "loss": 0.0007, + "step": 703 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 0.0028262247797101736, + "learning_rate": 9.107373868046573e-05, + "loss": 0.0008, + "step": 704 + }, + { + "epoch": 2.278787878787879, + "grad_norm": 0.003089368110522628, + "learning_rate": 9.120310478654594e-05, + "loss": 0.0009, + "step": 705 + }, + { + "epoch": 2.282020202020202, + "grad_norm": 0.002824244322255254, + "learning_rate": 9.133247089262613e-05, + "loss": 0.0008, + "step": 706 + }, + { + "epoch": 2.2852525252525253, + "grad_norm": 0.0023714362177997828, + "learning_rate": 9.146183699870634e-05, + "loss": 0.0007, + "step": 707 + }, + { + "epoch": 2.2884848484848486, + "grad_norm": 0.0033874395303428173, + "learning_rate": 9.159120310478655e-05, + "loss": 0.0017, + "step": 708 + }, + { + "epoch": 2.291717171717172, + "grad_norm": 0.0033185749780386686, + "learning_rate": 9.172056921086676e-05, + "loss": 0.0017, + "step": 709 + }, + { + "epoch": 2.294949494949495, + "grad_norm": 0.00360478856600821, + "learning_rate": 9.184993531694696e-05, + "loss": 0.0009, + "step": 710 + }, + { + "epoch": 2.2981818181818183, + "grad_norm": 0.0032836326863616705, + "learning_rate": 9.197930142302717e-05, + "loss": 0.0009, + "step": 711 + }, + { + "epoch": 2.3014141414141416, + "grad_norm": 0.0029353760182857513, + "learning_rate": 9.210866752910737e-05, + "loss": 0.0008, + "step": 712 + }, + { + "epoch": 2.304646464646465, + "grad_norm": 0.004789955448359251, + "learning_rate": 9.223803363518758e-05, + "loss": 0.0015, + "step": 713 + }, + { + "epoch": 2.3078787878787876, + "grad_norm": 0.00258410326205194, + "learning_rate": 9.236739974126779e-05, + "loss": 0.0007, + "step": 714 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.003211255418136716, + "learning_rate": 9.2496765847348e-05, + "loss": 0.0009, + "step": 715 + }, + { + "epoch": 2.314343434343434, + "grad_norm": 0.003458111546933651, + "learning_rate": 9.262613195342821e-05, + "loss": 0.0008, + "step": 716 + }, + { + "epoch": 2.3175757575757574, + "grad_norm": 0.0020549860782921314, + "learning_rate": 9.275549805950842e-05, + "loss": 0.0005, + "step": 717 + }, + { + "epoch": 2.3208080808080807, + "grad_norm": 0.0034006584901362658, + "learning_rate": 9.288486416558863e-05, + "loss": 0.0011, + "step": 718 + }, + { + "epoch": 2.324040404040404, + "grad_norm": 0.003893098793923855, + "learning_rate": 9.301423027166883e-05, + "loss": 0.0008, + "step": 719 + }, + { + "epoch": 2.327272727272727, + "grad_norm": 0.004374553449451923, + "learning_rate": 9.314359637774904e-05, + "loss": 0.0014, + "step": 720 + }, + { + "epoch": 2.3305050505050504, + "grad_norm": 0.004447360523045063, + "learning_rate": 9.327296248382925e-05, + "loss": 0.0016, + "step": 721 + }, + { + "epoch": 2.3337373737373737, + "grad_norm": 0.003019913798198104, + "learning_rate": 9.340232858990944e-05, + "loss": 0.0007, + "step": 722 + }, + { + "epoch": 2.336969696969697, + "grad_norm": 0.00266702170483768, + "learning_rate": 9.353169469598965e-05, + "loss": 0.0006, + "step": 723 + }, + { + "epoch": 2.34020202020202, + "grad_norm": 0.004037540405988693, + "learning_rate": 9.366106080206986e-05, + "loss": 0.001, + "step": 724 + }, + { + "epoch": 2.3434343434343434, + "grad_norm": 0.003830693429335952, + "learning_rate": 9.379042690815006e-05, + "loss": 0.0015, + "step": 725 + }, + { + "epoch": 2.3434343434343434, + "eval_loss": 0.0009221473592333496, + "eval_runtime": 18.682, + "eval_samples_per_second": 5.353, + "eval_steps_per_second": 1.338, + "step": 725 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 0.002714785747230053, + "learning_rate": 9.391979301423027e-05, + "loss": 0.0007, + "step": 726 + }, + { + "epoch": 2.34989898989899, + "grad_norm": 0.0034484020434319973, + "learning_rate": 9.404915912031048e-05, + "loss": 0.0021, + "step": 727 + }, + { + "epoch": 2.353131313131313, + "grad_norm": 0.0027550137601792812, + "learning_rate": 9.417852522639069e-05, + "loss": 0.0014, + "step": 728 + }, + { + "epoch": 2.3563636363636364, + "grad_norm": 0.004323527216911316, + "learning_rate": 9.43078913324709e-05, + "loss": 0.0021, + "step": 729 + }, + { + "epoch": 2.3595959595959597, + "grad_norm": 0.0036909414920955896, + "learning_rate": 9.443725743855111e-05, + "loss": 0.0009, + "step": 730 + }, + { + "epoch": 2.362828282828283, + "grad_norm": 0.0026773421559482813, + "learning_rate": 9.456662354463132e-05, + "loss": 0.0007, + "step": 731 + }, + { + "epoch": 2.366060606060606, + "grad_norm": 0.0027335931081324816, + "learning_rate": 9.469598965071152e-05, + "loss": 0.0011, + "step": 732 + }, + { + "epoch": 2.3692929292929295, + "grad_norm": 0.0037821868900209665, + "learning_rate": 9.482535575679173e-05, + "loss": 0.0009, + "step": 733 + }, + { + "epoch": 2.3725252525252527, + "grad_norm": 0.0027637695893645287, + "learning_rate": 9.495472186287193e-05, + "loss": 0.0008, + "step": 734 + }, + { + "epoch": 2.375757575757576, + "grad_norm": 0.0028088942635804415, + "learning_rate": 9.508408796895214e-05, + "loss": 0.0008, + "step": 735 + }, + { + "epoch": 2.378989898989899, + "grad_norm": 0.004683246370404959, + "learning_rate": 9.521345407503235e-05, + "loss": 0.0011, + "step": 736 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 0.0035344541538506746, + "learning_rate": 9.534282018111256e-05, + "loss": 0.001, + "step": 737 + }, + { + "epoch": 2.3854545454545453, + "grad_norm": 0.003326730104163289, + "learning_rate": 9.547218628719275e-05, + "loss": 0.001, + "step": 738 + }, + { + "epoch": 2.3886868686868685, + "grad_norm": 0.0027331975288689137, + "learning_rate": 9.560155239327296e-05, + "loss": 0.001, + "step": 739 + }, + { + "epoch": 2.391919191919192, + "grad_norm": 0.0023180183488875628, + "learning_rate": 9.573091849935317e-05, + "loss": 0.0005, + "step": 740 + }, + { + "epoch": 2.395151515151515, + "grad_norm": 0.00545561034232378, + "learning_rate": 9.586028460543338e-05, + "loss": 0.0019, + "step": 741 + }, + { + "epoch": 2.3983838383838383, + "grad_norm": 0.002278296509757638, + "learning_rate": 9.59896507115136e-05, + "loss": 0.0008, + "step": 742 + }, + { + "epoch": 2.4016161616161615, + "grad_norm": 0.0036029706243425608, + "learning_rate": 9.61190168175938e-05, + "loss": 0.0011, + "step": 743 + }, + { + "epoch": 2.404848484848485, + "grad_norm": 0.0028302932623773813, + "learning_rate": 9.6248382923674e-05, + "loss": 0.001, + "step": 744 + }, + { + "epoch": 2.408080808080808, + "grad_norm": 0.005623773206025362, + "learning_rate": 9.63777490297542e-05, + "loss": 0.0025, + "step": 745 + }, + { + "epoch": 2.4113131313131313, + "grad_norm": 0.003791616763919592, + "learning_rate": 9.650711513583441e-05, + "loss": 0.001, + "step": 746 + }, + { + "epoch": 2.4145454545454546, + "grad_norm": 0.003847538959234953, + "learning_rate": 9.663648124191462e-05, + "loss": 0.002, + "step": 747 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 0.003144781803712249, + "learning_rate": 9.676584734799483e-05, + "loss": 0.0008, + "step": 748 + }, + { + "epoch": 2.421010101010101, + "grad_norm": 0.0030476360116153955, + "learning_rate": 9.689521345407504e-05, + "loss": 0.002, + "step": 749 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.023542512208223343, + "learning_rate": 9.702457956015525e-05, + "loss": 0.0012, + "step": 750 + }, + { + "epoch": 2.4242424242424243, + "eval_loss": 0.0009755383944138885, + "eval_runtime": 18.7174, + "eval_samples_per_second": 5.343, + "eval_steps_per_second": 1.336, + "step": 750 + }, + { + "epoch": 2.4274747474747476, + "grad_norm": 0.005616334266960621, + "learning_rate": 9.715394566623545e-05, + "loss": 0.0009, + "step": 751 + }, + { + "epoch": 2.430707070707071, + "grad_norm": 0.007717492058873177, + "learning_rate": 9.728331177231566e-05, + "loss": 0.0008, + "step": 752 + }, + { + "epoch": 2.433939393939394, + "grad_norm": 0.008256220258772373, + "learning_rate": 9.741267787839587e-05, + "loss": 0.0011, + "step": 753 + }, + { + "epoch": 2.4371717171717173, + "grad_norm": 0.005972269922494888, + "learning_rate": 9.754204398447608e-05, + "loss": 0.0004, + "step": 754 + }, + { + "epoch": 2.4404040404040406, + "grad_norm": 0.006538939196616411, + "learning_rate": 9.767141009055629e-05, + "loss": 0.0009, + "step": 755 + }, + { + "epoch": 2.443636363636364, + "grad_norm": 0.003234976204112172, + "learning_rate": 9.780077619663648e-05, + "loss": 0.0011, + "step": 756 + }, + { + "epoch": 2.4468686868686866, + "grad_norm": 0.0033180294558405876, + "learning_rate": 9.793014230271668e-05, + "loss": 0.0009, + "step": 757 + }, + { + "epoch": 2.45010101010101, + "grad_norm": 0.0037402757443487644, + "learning_rate": 9.805950840879689e-05, + "loss": 0.0007, + "step": 758 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 0.0028039240278303623, + "learning_rate": 9.81888745148771e-05, + "loss": 0.0008, + "step": 759 + }, + { + "epoch": 2.4565656565656564, + "grad_norm": 0.0029848841950297356, + "learning_rate": 9.831824062095731e-05, + "loss": 0.0007, + "step": 760 + }, + { + "epoch": 2.4597979797979797, + "grad_norm": 0.002809871220961213, + "learning_rate": 9.844760672703752e-05, + "loss": 0.0007, + "step": 761 + }, + { + "epoch": 2.463030303030303, + "grad_norm": 0.004293316043913364, + "learning_rate": 9.857697283311773e-05, + "loss": 0.0012, + "step": 762 + }, + { + "epoch": 2.466262626262626, + "grad_norm": 0.003171822987496853, + "learning_rate": 9.870633893919794e-05, + "loss": 0.0008, + "step": 763 + }, + { + "epoch": 2.4694949494949494, + "grad_norm": 0.0035177527461200953, + "learning_rate": 9.883570504527814e-05, + "loss": 0.0008, + "step": 764 + }, + { + "epoch": 2.4727272727272727, + "grad_norm": 0.0037470462266355753, + "learning_rate": 9.896507115135835e-05, + "loss": 0.0011, + "step": 765 + }, + { + "epoch": 2.475959595959596, + "grad_norm": 0.004131955560296774, + "learning_rate": 9.909443725743856e-05, + "loss": 0.0015, + "step": 766 + }, + { + "epoch": 2.479191919191919, + "grad_norm": 0.002823730930685997, + "learning_rate": 9.922380336351877e-05, + "loss": 0.0007, + "step": 767 + }, + { + "epoch": 2.4824242424242424, + "grad_norm": 0.0037240665405988693, + "learning_rate": 9.935316946959897e-05, + "loss": 0.001, + "step": 768 + }, + { + "epoch": 2.4856565656565657, + "grad_norm": 0.0038744148332625628, + "learning_rate": 9.948253557567918e-05, + "loss": 0.0019, + "step": 769 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.003629435319453478, + "learning_rate": 9.961190168175937e-05, + "loss": 0.0008, + "step": 770 + }, + { + "epoch": 2.492121212121212, + "grad_norm": 0.004831044003367424, + "learning_rate": 9.974126778783958e-05, + "loss": 0.0011, + "step": 771 + }, + { + "epoch": 2.4953535353535354, + "grad_norm": 0.003547330852597952, + "learning_rate": 9.98706338939198e-05, + "loss": 0.0009, + "step": 772 + }, + { + "epoch": 2.4985858585858587, + "grad_norm": 0.003263116115704179, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 773 + }, + { + "epoch": 2.501818181818182, + "grad_norm": 0.0030386094003915787, + "learning_rate": 9.999999489471233e-05, + "loss": 0.0007, + "step": 774 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.0036390256136655807, + "learning_rate": 9.99999795788503e-05, + "loss": 0.0012, + "step": 775 + }, + { + "epoch": 2.505050505050505, + "eval_loss": 0.000891694042365998, + "eval_runtime": 18.7365, + "eval_samples_per_second": 5.337, + "eval_steps_per_second": 1.334, + "step": 775 + }, + { + "epoch": 2.5082828282828284, + "grad_norm": 0.0027272645384073257, + "learning_rate": 9.99999540524171e-05, + "loss": 0.0006, + "step": 776 + }, + { + "epoch": 2.5115151515151517, + "grad_norm": 0.004212843254208565, + "learning_rate": 9.999991831541789e-05, + "loss": 0.0023, + "step": 777 + }, + { + "epoch": 2.514747474747475, + "grad_norm": 0.0034173503518104553, + "learning_rate": 9.999987236786e-05, + "loss": 0.0008, + "step": 778 + }, + { + "epoch": 2.517979797979798, + "grad_norm": 0.0028145266696810722, + "learning_rate": 9.999981620975281e-05, + "loss": 0.0007, + "step": 779 + }, + { + "epoch": 2.5212121212121215, + "grad_norm": 0.0030887024477124214, + "learning_rate": 9.999974984110779e-05, + "loss": 0.0009, + "step": 780 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 0.003968552686274052, + "learning_rate": 9.999967326193847e-05, + "loss": 0.0009, + "step": 781 + }, + { + "epoch": 2.5276767676767675, + "grad_norm": 0.00264795683324337, + "learning_rate": 9.999958647226049e-05, + "loss": 0.0006, + "step": 782 + }, + { + "epoch": 2.5309090909090908, + "grad_norm": 0.004550054203718901, + "learning_rate": 9.999948947209162e-05, + "loss": 0.002, + "step": 783 + }, + { + "epoch": 2.534141414141414, + "grad_norm": 0.0030430385377258062, + "learning_rate": 9.999938226145161e-05, + "loss": 0.0007, + "step": 784 + }, + { + "epoch": 2.5373737373737373, + "grad_norm": 0.003766052657738328, + "learning_rate": 9.999926484036237e-05, + "loss": 0.001, + "step": 785 + }, + { + "epoch": 2.5406060606060605, + "grad_norm": 0.003455114783719182, + "learning_rate": 9.999913720884791e-05, + "loss": 0.002, + "step": 786 + }, + { + "epoch": 2.543838383838384, + "grad_norm": 0.002563537796959281, + "learning_rate": 9.999899936693426e-05, + "loss": 0.0008, + "step": 787 + }, + { + "epoch": 2.547070707070707, + "grad_norm": 0.0025122894439846277, + "learning_rate": 9.99988513146496e-05, + "loss": 0.0006, + "step": 788 + }, + { + "epoch": 2.5503030303030303, + "grad_norm": 0.004589984659105539, + "learning_rate": 9.999869305202412e-05, + "loss": 0.0015, + "step": 789 + }, + { + "epoch": 2.5535353535353535, + "grad_norm": 0.0041996450163424015, + "learning_rate": 9.999852457909018e-05, + "loss": 0.0019, + "step": 790 + }, + { + "epoch": 2.556767676767677, + "grad_norm": 0.0033383311238139868, + "learning_rate": 9.999834589588217e-05, + "loss": 0.0013, + "step": 791 + }, + { + "epoch": 2.56, + "grad_norm": 0.0018662047805264592, + "learning_rate": 9.999815700243656e-05, + "loss": 0.0006, + "step": 792 + }, + { + "epoch": 2.5632323232323233, + "grad_norm": 0.0032810529228299856, + "learning_rate": 9.999795789879196e-05, + "loss": 0.0009, + "step": 793 + }, + { + "epoch": 2.5664646464646466, + "grad_norm": 0.003441553795710206, + "learning_rate": 9.9997748584989e-05, + "loss": 0.0013, + "step": 794 + }, + { + "epoch": 2.56969696969697, + "grad_norm": 0.0025200124364346266, + "learning_rate": 9.999752906107042e-05, + "loss": 0.0008, + "step": 795 + }, + { + "epoch": 2.572929292929293, + "grad_norm": 0.0030850358307361603, + "learning_rate": 9.999729932708109e-05, + "loss": 0.0009, + "step": 796 + }, + { + "epoch": 2.5761616161616163, + "grad_norm": 0.0029367755632847548, + "learning_rate": 9.999705938306789e-05, + "loss": 0.0007, + "step": 797 + }, + { + "epoch": 2.579393939393939, + "grad_norm": 0.003588201245293021, + "learning_rate": 9.999680922907982e-05, + "loss": 0.0026, + "step": 798 + }, + { + "epoch": 2.5826262626262624, + "grad_norm": 0.0031253646593540907, + "learning_rate": 9.999654886516798e-05, + "loss": 0.0009, + "step": 799 + }, + { + "epoch": 2.5858585858585856, + "grad_norm": 0.0036176196299493313, + "learning_rate": 9.999627829138554e-05, + "loss": 0.0015, + "step": 800 + }, + { + "epoch": 2.5858585858585856, + "eval_loss": 0.0009534513228572905, + "eval_runtime": 18.6223, + "eval_samples_per_second": 5.37, + "eval_steps_per_second": 1.342, + "step": 800 + }, + { + "epoch": 2.589090909090909, + "grad_norm": 0.00325636169873178, + "learning_rate": 9.999599750778772e-05, + "loss": 0.0007, + "step": 801 + }, + { + "epoch": 2.592323232323232, + "grad_norm": 0.003968099132180214, + "learning_rate": 9.999570651443191e-05, + "loss": 0.0011, + "step": 802 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 0.00244711060076952, + "learning_rate": 9.99954053113775e-05, + "loss": 0.0006, + "step": 803 + }, + { + "epoch": 2.5987878787878786, + "grad_norm": 0.003342804964631796, + "learning_rate": 9.9995093898686e-05, + "loss": 0.0014, + "step": 804 + }, + { + "epoch": 2.602020202020202, + "grad_norm": 0.006182527635246515, + "learning_rate": 9.999477227642103e-05, + "loss": 0.0025, + "step": 805 + }, + { + "epoch": 2.605252525252525, + "grad_norm": 0.0032223479356616735, + "learning_rate": 9.999444044464823e-05, + "loss": 0.0007, + "step": 806 + }, + { + "epoch": 2.6084848484848484, + "grad_norm": 0.002208409830927849, + "learning_rate": 9.999409840343539e-05, + "loss": 0.0007, + "step": 807 + }, + { + "epoch": 2.6117171717171717, + "grad_norm": 0.002247196389362216, + "learning_rate": 9.999374615285236e-05, + "loss": 0.0007, + "step": 808 + }, + { + "epoch": 2.614949494949495, + "grad_norm": 0.0024588643573224545, + "learning_rate": 9.999338369297106e-05, + "loss": 0.0005, + "step": 809 + }, + { + "epoch": 2.618181818181818, + "grad_norm": 0.003213444259017706, + "learning_rate": 9.999301102386553e-05, + "loss": 0.0008, + "step": 810 + }, + { + "epoch": 2.6214141414141414, + "grad_norm": 0.0028623330872505903, + "learning_rate": 9.999262814561185e-05, + "loss": 0.0009, + "step": 811 + }, + { + "epoch": 2.6246464646464647, + "grad_norm": 0.0031820102594792843, + "learning_rate": 9.999223505828821e-05, + "loss": 0.001, + "step": 812 + }, + { + "epoch": 2.627878787878788, + "grad_norm": 0.0017483988776803017, + "learning_rate": 9.999183176197491e-05, + "loss": 0.0006, + "step": 813 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 0.0020935633219778538, + "learning_rate": 9.999141825675426e-05, + "loss": 0.0006, + "step": 814 + }, + { + "epoch": 2.6343434343434344, + "grad_norm": 0.003117109416052699, + "learning_rate": 9.999099454271074e-05, + "loss": 0.0008, + "step": 815 + }, + { + "epoch": 2.6375757575757577, + "grad_norm": 0.0037443467881530523, + "learning_rate": 9.999056061993089e-05, + "loss": 0.001, + "step": 816 + }, + { + "epoch": 2.640808080808081, + "grad_norm": 0.0032844438683241606, + "learning_rate": 9.999011648850329e-05, + "loss": 0.0009, + "step": 817 + }, + { + "epoch": 2.644040404040404, + "grad_norm": 0.0027299323119223118, + "learning_rate": 9.998966214851864e-05, + "loss": 0.001, + "step": 818 + }, + { + "epoch": 2.6472727272727274, + "grad_norm": 0.003157002152875066, + "learning_rate": 9.998919760006972e-05, + "loss": 0.001, + "step": 819 + }, + { + "epoch": 2.6505050505050507, + "grad_norm": 0.003021983429789543, + "learning_rate": 9.998872284325142e-05, + "loss": 0.0011, + "step": 820 + }, + { + "epoch": 2.653737373737374, + "grad_norm": 0.0036499700509011745, + "learning_rate": 9.998823787816066e-05, + "loss": 0.001, + "step": 821 + }, + { + "epoch": 2.656969696969697, + "grad_norm": 0.0031615635380148888, + "learning_rate": 9.99877427048965e-05, + "loss": 0.0009, + "step": 822 + }, + { + "epoch": 2.6602020202020205, + "grad_norm": 0.0032300378661602736, + "learning_rate": 9.998723732356006e-05, + "loss": 0.0009, + "step": 823 + }, + { + "epoch": 2.6634343434343437, + "grad_norm": 0.00247188750654459, + "learning_rate": 9.998672173425452e-05, + "loss": 0.0008, + "step": 824 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.004531141836196184, + "learning_rate": 9.998619593708518e-05, + "loss": 0.0011, + "step": 825 + }, + { + "epoch": 2.6666666666666665, + "eval_loss": 0.0008804297540336847, + "eval_runtime": 18.6211, + "eval_samples_per_second": 5.37, + "eval_steps_per_second": 1.343, + "step": 825 + }, + { + "epoch": 2.6698989898989898, + "grad_norm": 0.0034359728451818228, + "learning_rate": 9.998565993215943e-05, + "loss": 0.0009, + "step": 826 + }, + { + "epoch": 2.673131313131313, + "grad_norm": 0.002785489894449711, + "learning_rate": 9.998511371958672e-05, + "loss": 0.0007, + "step": 827 + }, + { + "epoch": 2.6763636363636363, + "grad_norm": 0.0025934847071766853, + "learning_rate": 9.998455729947858e-05, + "loss": 0.0008, + "step": 828 + }, + { + "epoch": 2.6795959595959595, + "grad_norm": 0.0026367492973804474, + "learning_rate": 9.998399067194864e-05, + "loss": 0.0008, + "step": 829 + }, + { + "epoch": 2.682828282828283, + "grad_norm": 0.003330858191475272, + "learning_rate": 9.998341383711263e-05, + "loss": 0.0007, + "step": 830 + }, + { + "epoch": 2.686060606060606, + "grad_norm": 0.003155591432005167, + "learning_rate": 9.998282679508835e-05, + "loss": 0.0017, + "step": 831 + }, + { + "epoch": 2.6892929292929293, + "grad_norm": 0.0030347639694809914, + "learning_rate": 9.998222954599563e-05, + "loss": 0.0007, + "step": 832 + }, + { + "epoch": 2.6925252525252525, + "grad_norm": 0.0032036558259278536, + "learning_rate": 9.99816220899565e-05, + "loss": 0.0009, + "step": 833 + }, + { + "epoch": 2.695757575757576, + "grad_norm": 0.003590774955227971, + "learning_rate": 9.998100442709497e-05, + "loss": 0.0009, + "step": 834 + }, + { + "epoch": 2.698989898989899, + "grad_norm": 0.002222040668129921, + "learning_rate": 9.998037655753717e-05, + "loss": 0.0009, + "step": 835 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 0.0027599751483649015, + "learning_rate": 9.997973848141137e-05, + "loss": 0.0009, + "step": 836 + }, + { + "epoch": 2.7054545454545456, + "grad_norm": 0.00284950970672071, + "learning_rate": 9.997909019884781e-05, + "loss": 0.0008, + "step": 837 + }, + { + "epoch": 2.708686868686869, + "grad_norm": 0.0024073810782283545, + "learning_rate": 9.99784317099789e-05, + "loss": 0.001, + "step": 838 + }, + { + "epoch": 2.711919191919192, + "grad_norm": 0.0033184473868459463, + "learning_rate": 9.997776301493914e-05, + "loss": 0.0016, + "step": 839 + }, + { + "epoch": 2.7151515151515153, + "grad_norm": 0.0023880742955952883, + "learning_rate": 9.997708411386501e-05, + "loss": 0.0015, + "step": 840 + }, + { + "epoch": 2.718383838383838, + "grad_norm": 0.0031732122879475355, + "learning_rate": 9.997639500689523e-05, + "loss": 0.0007, + "step": 841 + }, + { + "epoch": 2.7216161616161614, + "grad_norm": 0.0036892895586788654, + "learning_rate": 9.997569569417049e-05, + "loss": 0.002, + "step": 842 + }, + { + "epoch": 2.7248484848484846, + "grad_norm": 0.002280700486153364, + "learning_rate": 9.997498617583358e-05, + "loss": 0.0018, + "step": 843 + }, + { + "epoch": 2.728080808080808, + "grad_norm": 0.003923532087355852, + "learning_rate": 9.997426645202943e-05, + "loss": 0.0006, + "step": 844 + }, + { + "epoch": 2.731313131313131, + "grad_norm": 0.0025437732692807913, + "learning_rate": 9.9973536522905e-05, + "loss": 0.0009, + "step": 845 + }, + { + "epoch": 2.7345454545454544, + "grad_norm": 0.002573117846623063, + "learning_rate": 9.997279638860933e-05, + "loss": 0.0005, + "step": 846 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 0.0021492561791092157, + "learning_rate": 9.99720460492936e-05, + "loss": 0.0008, + "step": 847 + }, + { + "epoch": 2.741010101010101, + "grad_norm": 0.0021558962762355804, + "learning_rate": 9.997128550511099e-05, + "loss": 0.0006, + "step": 848 + }, + { + "epoch": 2.744242424242424, + "grad_norm": 0.0031675281934440136, + "learning_rate": 9.997051475621687e-05, + "loss": 0.001, + "step": 849 + }, + { + "epoch": 2.7474747474747474, + "grad_norm": 0.003528911853209138, + "learning_rate": 9.996973380276857e-05, + "loss": 0.0007, + "step": 850 + }, + { + "epoch": 2.7474747474747474, + "eval_loss": 0.000889240182004869, + "eval_runtime": 18.6302, + "eval_samples_per_second": 5.368, + "eval_steps_per_second": 1.342, + "step": 850 + }, + { + "epoch": 2.7507070707070707, + "grad_norm": 0.003921836614608765, + "learning_rate": 9.996894264492563e-05, + "loss": 0.001, + "step": 851 + }, + { + "epoch": 2.753939393939394, + "grad_norm": 0.002436129143461585, + "learning_rate": 9.99681412828496e-05, + "loss": 0.0007, + "step": 852 + }, + { + "epoch": 2.757171717171717, + "grad_norm": 0.002964465180411935, + "learning_rate": 9.996732971670408e-05, + "loss": 0.0008, + "step": 853 + }, + { + "epoch": 2.7604040404040404, + "grad_norm": 0.0034572421573102474, + "learning_rate": 9.996650794665487e-05, + "loss": 0.0007, + "step": 854 + }, + { + "epoch": 2.7636363636363637, + "grad_norm": 0.0021866310853511095, + "learning_rate": 9.996567597286974e-05, + "loss": 0.0005, + "step": 855 + }, + { + "epoch": 2.766868686868687, + "grad_norm": 0.0026556740049272776, + "learning_rate": 9.996483379551861e-05, + "loss": 0.0014, + "step": 856 + }, + { + "epoch": 2.77010101010101, + "grad_norm": 0.003023721743375063, + "learning_rate": 9.996398141477344e-05, + "loss": 0.0007, + "step": 857 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 0.0022941052448004484, + "learning_rate": 9.996311883080832e-05, + "loss": 0.0006, + "step": 858 + }, + { + "epoch": 2.7765656565656567, + "grad_norm": 0.003141100751236081, + "learning_rate": 9.996224604379938e-05, + "loss": 0.0007, + "step": 859 + }, + { + "epoch": 2.77979797979798, + "grad_norm": 0.002024593763053417, + "learning_rate": 9.996136305392487e-05, + "loss": 0.0006, + "step": 860 + }, + { + "epoch": 2.783030303030303, + "grad_norm": 0.004872876685112715, + "learning_rate": 9.996046986136509e-05, + "loss": 0.0026, + "step": 861 + }, + { + "epoch": 2.7862626262626264, + "grad_norm": 0.002601329004392028, + "learning_rate": 9.995956646630246e-05, + "loss": 0.0009, + "step": 862 + }, + { + "epoch": 2.7894949494949497, + "grad_norm": 0.0028132593724876642, + "learning_rate": 9.995865286892145e-05, + "loss": 0.0009, + "step": 863 + }, + { + "epoch": 2.792727272727273, + "grad_norm": 0.002662686165422201, + "learning_rate": 9.995772906940864e-05, + "loss": 0.0007, + "step": 864 + }, + { + "epoch": 2.795959595959596, + "grad_norm": 0.0027195930015295744, + "learning_rate": 9.995679506795264e-05, + "loss": 0.0007, + "step": 865 + }, + { + "epoch": 2.7991919191919195, + "grad_norm": 0.003179864026606083, + "learning_rate": 9.995585086474424e-05, + "loss": 0.0014, + "step": 866 + }, + { + "epoch": 2.8024242424242423, + "grad_norm": 0.0034151843283325434, + "learning_rate": 9.995489645997622e-05, + "loss": 0.0009, + "step": 867 + }, + { + "epoch": 2.8056565656565655, + "grad_norm": 0.0026823675725609064, + "learning_rate": 9.99539318538435e-05, + "loss": 0.0007, + "step": 868 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 0.0021664374507963657, + "learning_rate": 9.995295704654304e-05, + "loss": 0.0012, + "step": 869 + }, + { + "epoch": 2.812121212121212, + "grad_norm": 0.003440326079726219, + "learning_rate": 9.995197203827393e-05, + "loss": 0.0009, + "step": 870 + }, + { + "epoch": 2.8153535353535353, + "grad_norm": 0.003980646841228008, + "learning_rate": 9.995097682923733e-05, + "loss": 0.0018, + "step": 871 + }, + { + "epoch": 2.8185858585858585, + "grad_norm": 0.003389423480257392, + "learning_rate": 9.994997141963644e-05, + "loss": 0.001, + "step": 872 + }, + { + "epoch": 2.821818181818182, + "grad_norm": 0.0027842579875141382, + "learning_rate": 9.994895580967658e-05, + "loss": 0.0007, + "step": 873 + }, + { + "epoch": 2.825050505050505, + "grad_norm": 0.0033308009151369333, + "learning_rate": 9.994792999956518e-05, + "loss": 0.0011, + "step": 874 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.002514815656468272, + "learning_rate": 9.994689398951169e-05, + "loss": 0.0009, + "step": 875 + }, + { + "epoch": 2.8282828282828283, + "eval_loss": 0.0008539481204934418, + "eval_runtime": 18.6059, + "eval_samples_per_second": 5.375, + "eval_steps_per_second": 1.344, + "step": 875 + }, + { + "epoch": 2.8315151515151515, + "grad_norm": 0.0024293591268360615, + "learning_rate": 9.994584777972769e-05, + "loss": 0.0008, + "step": 876 + }, + { + "epoch": 2.834747474747475, + "grad_norm": 0.0035905223339796066, + "learning_rate": 9.994479137042683e-05, + "loss": 0.002, + "step": 877 + }, + { + "epoch": 2.837979797979798, + "grad_norm": 0.002918825950473547, + "learning_rate": 9.994372476182484e-05, + "loss": 0.0009, + "step": 878 + }, + { + "epoch": 2.8412121212121213, + "grad_norm": 0.0031527606770396233, + "learning_rate": 9.994264795413953e-05, + "loss": 0.0008, + "step": 879 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 0.0035410267300903797, + "learning_rate": 9.99415609475908e-05, + "loss": 0.0021, + "step": 880 + }, + { + "epoch": 2.847676767676768, + "grad_norm": 0.0020397889893501997, + "learning_rate": 9.994046374240062e-05, + "loss": 0.0007, + "step": 881 + }, + { + "epoch": 2.850909090909091, + "grad_norm": 0.002342833438888192, + "learning_rate": 9.993935633879306e-05, + "loss": 0.0007, + "step": 882 + }, + { + "epoch": 2.854141414141414, + "grad_norm": 0.003542246064171195, + "learning_rate": 9.993823873699426e-05, + "loss": 0.0008, + "step": 883 + }, + { + "epoch": 2.857373737373737, + "grad_norm": 0.0024878752883523703, + "learning_rate": 9.993711093723245e-05, + "loss": 0.0008, + "step": 884 + }, + { + "epoch": 2.8606060606060604, + "grad_norm": 0.0030815114732831717, + "learning_rate": 9.993597293973796e-05, + "loss": 0.0018, + "step": 885 + }, + { + "epoch": 2.8638383838383836, + "grad_norm": 0.004479612223803997, + "learning_rate": 9.993482474474314e-05, + "loss": 0.0009, + "step": 886 + }, + { + "epoch": 2.867070707070707, + "grad_norm": 0.0033372335601598024, + "learning_rate": 9.99336663524825e-05, + "loss": 0.0009, + "step": 887 + }, + { + "epoch": 2.87030303030303, + "grad_norm": 0.0021939794532954693, + "learning_rate": 9.993249776319258e-05, + "loss": 0.0006, + "step": 888 + }, + { + "epoch": 2.8735353535353534, + "grad_norm": 0.0028632464818656445, + "learning_rate": 9.993131897711202e-05, + "loss": 0.0009, + "step": 889 + }, + { + "epoch": 2.8767676767676766, + "grad_norm": 0.002615751465782523, + "learning_rate": 9.993012999448154e-05, + "loss": 0.0009, + "step": 890 + }, + { + "epoch": 2.88, + "grad_norm": 0.0017363366205245256, + "learning_rate": 9.992893081554397e-05, + "loss": 0.0005, + "step": 891 + }, + { + "epoch": 2.883232323232323, + "grad_norm": 0.0032067778520286083, + "learning_rate": 9.992772144054415e-05, + "loss": 0.0009, + "step": 892 + }, + { + "epoch": 2.8864646464646464, + "grad_norm": 0.0025459511671215296, + "learning_rate": 9.992650186972909e-05, + "loss": 0.0008, + "step": 893 + }, + { + "epoch": 2.8896969696969697, + "grad_norm": 0.001871303771622479, + "learning_rate": 9.99252721033478e-05, + "loss": 0.0008, + "step": 894 + }, + { + "epoch": 2.892929292929293, + "grad_norm": 0.001998218474909663, + "learning_rate": 9.992403214165147e-05, + "loss": 0.0005, + "step": 895 + }, + { + "epoch": 2.896161616161616, + "grad_norm": 0.0023115125950425863, + "learning_rate": 9.992278198489327e-05, + "loss": 0.0007, + "step": 896 + }, + { + "epoch": 2.8993939393939394, + "grad_norm": 0.0019588919822126627, + "learning_rate": 9.99215216333285e-05, + "loss": 0.0007, + "step": 897 + }, + { + "epoch": 2.9026262626262627, + "grad_norm": 0.001903701457194984, + "learning_rate": 9.992025108721454e-05, + "loss": 0.0004, + "step": 898 + }, + { + "epoch": 2.905858585858586, + "grad_norm": 0.0023024296388030052, + "learning_rate": 9.991897034681087e-05, + "loss": 0.0006, + "step": 899 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.002626319881528616, + "learning_rate": 9.9917679412379e-05, + "loss": 0.0009, + "step": 900 + }, + { + "epoch": 2.909090909090909, + "eval_loss": 0.0008496911614201963, + "eval_runtime": 18.6252, + "eval_samples_per_second": 5.369, + "eval_steps_per_second": 1.342, + "step": 900 + }, + { + "epoch": 2.9123232323232324, + "grad_norm": 0.0029552706982940435, + "learning_rate": 9.99163782841826e-05, + "loss": 0.001, + "step": 901 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 0.002366641303524375, + "learning_rate": 9.991506696248731e-05, + "loss": 0.0006, + "step": 902 + }, + { + "epoch": 2.918787878787879, + "grad_norm": 0.0030002701096236706, + "learning_rate": 9.991374544756098e-05, + "loss": 0.0009, + "step": 903 + }, + { + "epoch": 2.922020202020202, + "grad_norm": 0.002418822841718793, + "learning_rate": 9.991241373967344e-05, + "loss": 0.0005, + "step": 904 + }, + { + "epoch": 2.9252525252525254, + "grad_norm": 0.0027566729113459587, + "learning_rate": 9.991107183909664e-05, + "loss": 0.002, + "step": 905 + }, + { + "epoch": 2.9284848484848487, + "grad_norm": 0.003200074890628457, + "learning_rate": 9.990971974610466e-05, + "loss": 0.0009, + "step": 906 + }, + { + "epoch": 2.931717171717172, + "grad_norm": 0.00368666322901845, + "learning_rate": 9.990835746097356e-05, + "loss": 0.0008, + "step": 907 + }, + { + "epoch": 2.934949494949495, + "grad_norm": 0.0036671683192253113, + "learning_rate": 9.990698498398155e-05, + "loss": 0.001, + "step": 908 + }, + { + "epoch": 2.9381818181818184, + "grad_norm": 0.0037938845343887806, + "learning_rate": 9.990560231540889e-05, + "loss": 0.0007, + "step": 909 + }, + { + "epoch": 2.9414141414141413, + "grad_norm": 0.0027589588426053524, + "learning_rate": 9.990420945553797e-05, + "loss": 0.0006, + "step": 910 + }, + { + "epoch": 2.9446464646464645, + "grad_norm": 0.003152150195091963, + "learning_rate": 9.990280640465321e-05, + "loss": 0.0008, + "step": 911 + }, + { + "epoch": 2.9478787878787878, + "grad_norm": 0.002495568012818694, + "learning_rate": 9.990139316304112e-05, + "loss": 0.0007, + "step": 912 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 0.0025980276986956596, + "learning_rate": 9.989996973099032e-05, + "loss": 0.0008, + "step": 913 + }, + { + "epoch": 2.9543434343434343, + "grad_norm": 0.0013909428380429745, + "learning_rate": 9.989853610879147e-05, + "loss": 0.0004, + "step": 914 + }, + { + "epoch": 2.9575757575757575, + "grad_norm": 0.0025712710339576006, + "learning_rate": 9.989709229673736e-05, + "loss": 0.0008, + "step": 915 + }, + { + "epoch": 2.9608080808080808, + "grad_norm": 0.0029310788959264755, + "learning_rate": 9.98956382951228e-05, + "loss": 0.0007, + "step": 916 + }, + { + "epoch": 2.964040404040404, + "grad_norm": 0.0041606370359659195, + "learning_rate": 9.989417410424475e-05, + "loss": 0.0027, + "step": 917 + }, + { + "epoch": 2.9672727272727273, + "grad_norm": 0.001812944421544671, + "learning_rate": 9.98926997244022e-05, + "loss": 0.0006, + "step": 918 + }, + { + "epoch": 2.9705050505050505, + "grad_norm": 0.0026405269745737314, + "learning_rate": 9.989121515589622e-05, + "loss": 0.0011, + "step": 919 + }, + { + "epoch": 2.973737373737374, + "grad_norm": 0.003111011115834117, + "learning_rate": 9.988972039902997e-05, + "loss": 0.0009, + "step": 920 + }, + { + "epoch": 2.976969696969697, + "grad_norm": 0.0038792812265455723, + "learning_rate": 9.988821545410874e-05, + "loss": 0.0018, + "step": 921 + }, + { + "epoch": 2.9802020202020203, + "grad_norm": 0.0031774002127349377, + "learning_rate": 9.988670032143981e-05, + "loss": 0.0011, + "step": 922 + }, + { + "epoch": 2.9834343434343435, + "grad_norm": 0.002835791325196624, + "learning_rate": 9.988517500133262e-05, + "loss": 0.0007, + "step": 923 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 0.003195343306288123, + "learning_rate": 9.988363949409865e-05, + "loss": 0.0008, + "step": 924 + }, + { + "epoch": 2.98989898989899, + "grad_norm": 0.004220789764076471, + "learning_rate": 9.988209380005144e-05, + "loss": 0.001, + "step": 925 + }, + { + "epoch": 2.98989898989899, + "eval_loss": 0.000867326685693115, + "eval_runtime": 18.6533, + "eval_samples_per_second": 5.361, + "eval_steps_per_second": 1.34, + "step": 925 + }, + { + "epoch": 2.993131313131313, + "grad_norm": 0.0027963484171777964, + "learning_rate": 9.98805379195067e-05, + "loss": 0.001, + "step": 926 + }, + { + "epoch": 2.996363636363636, + "grad_norm": 0.002367103472352028, + "learning_rate": 9.987897185278208e-05, + "loss": 0.0006, + "step": 927 + }, + { + "epoch": 2.9995959595959594, + "grad_norm": 0.008955973200500011, + "learning_rate": 9.987739560019746e-05, + "loss": 0.0042, + "step": 928 + }, + { + "epoch": 3.0028282828282826, + "grad_norm": 0.0027045756578445435, + "learning_rate": 9.987580916207468e-05, + "loss": 0.0019, + "step": 929 + }, + { + "epoch": 3.006060606060606, + "grad_norm": 0.002136245835572481, + "learning_rate": 9.987421253873775e-05, + "loss": 0.0013, + "step": 930 + }, + { + "epoch": 3.009292929292929, + "grad_norm": 0.0018668243428692222, + "learning_rate": 9.987260573051269e-05, + "loss": 0.0006, + "step": 931 + }, + { + "epoch": 3.0125252525252524, + "grad_norm": 0.0018371654441580176, + "learning_rate": 9.987098873772763e-05, + "loss": 0.0004, + "step": 932 + }, + { + "epoch": 3.0157575757575756, + "grad_norm": 0.0020080492831766605, + "learning_rate": 9.986936156071278e-05, + "loss": 0.0007, + "step": 933 + }, + { + "epoch": 3.018989898989899, + "grad_norm": 0.0019176624482497573, + "learning_rate": 9.986772419980044e-05, + "loss": 0.0006, + "step": 934 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 0.0024317919742316008, + "learning_rate": 9.986607665532497e-05, + "loss": 0.0006, + "step": 935 + }, + { + "epoch": 3.0254545454545454, + "grad_norm": 0.002591727999970317, + "learning_rate": 9.986441892762281e-05, + "loss": 0.0006, + "step": 936 + }, + { + "epoch": 3.0286868686868686, + "grad_norm": 0.0026875538751482964, + "learning_rate": 9.98627510170325e-05, + "loss": 0.0008, + "step": 937 + }, + { + "epoch": 3.031919191919192, + "grad_norm": 0.003272917354479432, + "learning_rate": 9.986107292389464e-05, + "loss": 0.0007, + "step": 938 + }, + { + "epoch": 3.035151515151515, + "grad_norm": 0.002936862874776125, + "learning_rate": 9.985938464855191e-05, + "loss": 0.0008, + "step": 939 + }, + { + "epoch": 3.0383838383838384, + "grad_norm": 0.0021727578714489937, + "learning_rate": 9.985768619134909e-05, + "loss": 0.0007, + "step": 940 + }, + { + "epoch": 3.0416161616161617, + "grad_norm": 0.002251671627163887, + "learning_rate": 9.985597755263302e-05, + "loss": 0.0006, + "step": 941 + }, + { + "epoch": 3.044848484848485, + "grad_norm": 0.0035938192158937454, + "learning_rate": 9.985425873275263e-05, + "loss": 0.0011, + "step": 942 + }, + { + "epoch": 3.048080808080808, + "grad_norm": 0.0024208389222621918, + "learning_rate": 9.98525297320589e-05, + "loss": 0.0007, + "step": 943 + }, + { + "epoch": 3.0513131313131314, + "grad_norm": 0.0019874447025358677, + "learning_rate": 9.985079055090493e-05, + "loss": 0.0005, + "step": 944 + }, + { + "epoch": 3.0545454545454547, + "grad_norm": 0.0014314191648736596, + "learning_rate": 9.984904118964588e-05, + "loss": 0.0004, + "step": 945 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 0.0019143620738759637, + "learning_rate": 9.984728164863898e-05, + "loss": 0.0006, + "step": 946 + }, + { + "epoch": 3.061010101010101, + "grad_norm": 0.003406926291063428, + "learning_rate": 9.984551192824355e-05, + "loss": 0.0011, + "step": 947 + }, + { + "epoch": 3.0642424242424244, + "grad_norm": 0.0015149825485423207, + "learning_rate": 9.9843732028821e-05, + "loss": 0.0004, + "step": 948 + }, + { + "epoch": 3.0674747474747477, + "grad_norm": 0.0028568189591169357, + "learning_rate": 9.98419419507348e-05, + "loss": 0.0008, + "step": 949 + }, + { + "epoch": 3.0707070707070705, + "grad_norm": 0.00383683480322361, + "learning_rate": 9.98401416943505e-05, + "loss": 0.0006, + "step": 950 + }, + { + "epoch": 3.0707070707070705, + "eval_loss": 0.0009233049931935966, + "eval_runtime": 18.6291, + "eval_samples_per_second": 5.368, + "eval_steps_per_second": 1.342, + "step": 950 + }, + { + "epoch": 3.0739393939393937, + "grad_norm": 0.004385147709399462, + "learning_rate": 9.983833126003572e-05, + "loss": 0.0009, + "step": 951 + }, + { + "epoch": 3.077171717171717, + "grad_norm": 0.0024896373506635427, + "learning_rate": 9.98365106481602e-05, + "loss": 0.0006, + "step": 952 + }, + { + "epoch": 3.0804040404040403, + "grad_norm": 0.002700845478102565, + "learning_rate": 9.983467985909573e-05, + "loss": 0.0006, + "step": 953 + }, + { + "epoch": 3.0836363636363635, + "grad_norm": 0.003025912679731846, + "learning_rate": 9.983283889321615e-05, + "loss": 0.0008, + "step": 954 + }, + { + "epoch": 3.0868686868686868, + "grad_norm": 0.003908303566277027, + "learning_rate": 9.983098775089742e-05, + "loss": 0.0017, + "step": 955 + }, + { + "epoch": 3.09010101010101, + "grad_norm": 0.0035844456870108843, + "learning_rate": 9.982912643251757e-05, + "loss": 0.0024, + "step": 956 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 0.0022256888914853334, + "learning_rate": 9.98272549384567e-05, + "loss": 0.0005, + "step": 957 + }, + { + "epoch": 3.0965656565656565, + "grad_norm": 0.0024845930747687817, + "learning_rate": 9.982537326909697e-05, + "loss": 0.0007, + "step": 958 + }, + { + "epoch": 3.0997979797979798, + "grad_norm": 0.0021472277585417032, + "learning_rate": 9.982348142482269e-05, + "loss": 0.0006, + "step": 959 + }, + { + "epoch": 3.103030303030303, + "grad_norm": 0.0027109517250210047, + "learning_rate": 9.982157940602014e-05, + "loss": 0.0007, + "step": 960 + }, + { + "epoch": 3.1062626262626263, + "grad_norm": 0.0016069613629952073, + "learning_rate": 9.981966721307778e-05, + "loss": 0.0005, + "step": 961 + }, + { + "epoch": 3.1094949494949495, + "grad_norm": 0.002237373497337103, + "learning_rate": 9.981774484638606e-05, + "loss": 0.0005, + "step": 962 + }, + { + "epoch": 3.112727272727273, + "grad_norm": 0.002959214383736253, + "learning_rate": 9.981581230633758e-05, + "loss": 0.001, + "step": 963 + }, + { + "epoch": 3.115959595959596, + "grad_norm": 0.0031158181373029947, + "learning_rate": 9.981386959332697e-05, + "loss": 0.0008, + "step": 964 + }, + { + "epoch": 3.1191919191919193, + "grad_norm": 0.007239287253469229, + "learning_rate": 9.981191670775097e-05, + "loss": 0.0037, + "step": 965 + }, + { + "epoch": 3.1224242424242425, + "grad_norm": 0.0029164832085371017, + "learning_rate": 9.980995365000836e-05, + "loss": 0.0006, + "step": 966 + }, + { + "epoch": 3.125656565656566, + "grad_norm": 0.0018486580811440945, + "learning_rate": 9.980798042050004e-05, + "loss": 0.0004, + "step": 967 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 0.0032664432656019926, + "learning_rate": 9.980599701962896e-05, + "loss": 0.0009, + "step": 968 + }, + { + "epoch": 3.1321212121212123, + "grad_norm": 0.0037746732123196125, + "learning_rate": 9.980400344780015e-05, + "loss": 0.0015, + "step": 969 + }, + { + "epoch": 3.1353535353535356, + "grad_norm": 0.002780639799311757, + "learning_rate": 9.98019997054207e-05, + "loss": 0.0006, + "step": 970 + }, + { + "epoch": 3.1385858585858584, + "grad_norm": 0.0026836313772946596, + "learning_rate": 9.979998579289984e-05, + "loss": 0.0006, + "step": 971 + }, + { + "epoch": 3.1418181818181816, + "grad_norm": 0.002338019199669361, + "learning_rate": 9.979796171064881e-05, + "loss": 0.0007, + "step": 972 + }, + { + "epoch": 3.145050505050505, + "grad_norm": 0.0024902368895709515, + "learning_rate": 9.979592745908095e-05, + "loss": 0.0011, + "step": 973 + }, + { + "epoch": 3.148282828282828, + "grad_norm": 0.0033587226644158363, + "learning_rate": 9.979388303861169e-05, + "loss": 0.0006, + "step": 974 + }, + { + "epoch": 3.1515151515151514, + "grad_norm": 0.002750332234427333, + "learning_rate": 9.97918284496585e-05, + "loss": 0.0006, + "step": 975 + }, + { + "epoch": 3.1515151515151514, + "eval_loss": 0.0009188714902848005, + "eval_runtime": 18.6436, + "eval_samples_per_second": 5.364, + "eval_steps_per_second": 1.341, + "step": 975 + }, + { + "epoch": 3.1547474747474746, + "grad_norm": 0.004522048868238926, + "learning_rate": 9.978976369264098e-05, + "loss": 0.0009, + "step": 976 + }, + { + "epoch": 3.157979797979798, + "grad_norm": 0.0020558724645525217, + "learning_rate": 9.978768876798075e-05, + "loss": 0.0004, + "step": 977 + }, + { + "epoch": 3.161212121212121, + "grad_norm": 0.0017784403171390295, + "learning_rate": 9.978560367610156e-05, + "loss": 0.0004, + "step": 978 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 0.0039355335757136345, + "learning_rate": 9.978350841742919e-05, + "loss": 0.0017, + "step": 979 + }, + { + "epoch": 3.1676767676767676, + "grad_norm": 0.0031672257464379072, + "learning_rate": 9.978140299239152e-05, + "loss": 0.0017, + "step": 980 + }, + { + "epoch": 3.170909090909091, + "grad_norm": 0.0048868078738451, + "learning_rate": 9.977928740141851e-05, + "loss": 0.001, + "step": 981 + }, + { + "epoch": 3.174141414141414, + "grad_norm": 0.002355673350393772, + "learning_rate": 9.977716164494217e-05, + "loss": 0.0005, + "step": 982 + }, + { + "epoch": 3.1773737373737374, + "grad_norm": 0.0035993049386888742, + "learning_rate": 9.977502572339664e-05, + "loss": 0.001, + "step": 983 + }, + { + "epoch": 3.1806060606060607, + "grad_norm": 0.00291255209594965, + "learning_rate": 9.977287963721804e-05, + "loss": 0.0007, + "step": 984 + }, + { + "epoch": 3.183838383838384, + "grad_norm": 0.002027621492743492, + "learning_rate": 9.977072338684469e-05, + "loss": 0.0006, + "step": 985 + }, + { + "epoch": 3.187070707070707, + "grad_norm": 0.0021479360293596983, + "learning_rate": 9.976855697271689e-05, + "loss": 0.0006, + "step": 986 + }, + { + "epoch": 3.1903030303030304, + "grad_norm": 0.002248652745038271, + "learning_rate": 9.976638039527704e-05, + "loss": 0.0005, + "step": 987 + }, + { + "epoch": 3.1935353535353537, + "grad_norm": 0.002659781137481332, + "learning_rate": 9.976419365496963e-05, + "loss": 0.0008, + "step": 988 + }, + { + "epoch": 3.196767676767677, + "grad_norm": 0.0037245461717247963, + "learning_rate": 9.976199675224123e-05, + "loss": 0.001, + "step": 989 + }, + { + "epoch": 3.2, + "grad_norm": 0.004409831948578358, + "learning_rate": 9.975978968754045e-05, + "loss": 0.0022, + "step": 990 + }, + { + "epoch": 3.2032323232323234, + "grad_norm": 0.0030354245100170374, + "learning_rate": 9.975757246131803e-05, + "loss": 0.0006, + "step": 991 + }, + { + "epoch": 3.2064646464646467, + "grad_norm": 0.0026450392324477434, + "learning_rate": 9.975534507402671e-05, + "loss": 0.0008, + "step": 992 + }, + { + "epoch": 3.2096969696969695, + "grad_norm": 0.003626378020271659, + "learning_rate": 9.975310752612137e-05, + "loss": 0.0004, + "step": 993 + }, + { + "epoch": 3.2129292929292927, + "grad_norm": 0.0026495913043618202, + "learning_rate": 9.975085981805897e-05, + "loss": 0.0018, + "step": 994 + }, + { + "epoch": 3.216161616161616, + "grad_norm": 0.0032507823780179024, + "learning_rate": 9.974860195029847e-05, + "loss": 0.0012, + "step": 995 + }, + { + "epoch": 3.2193939393939393, + "grad_norm": 0.002977850381284952, + "learning_rate": 9.974633392330097e-05, + "loss": 0.0006, + "step": 996 + }, + { + "epoch": 3.2226262626262625, + "grad_norm": 0.002434935886412859, + "learning_rate": 9.974405573752965e-05, + "loss": 0.0006, + "step": 997 + }, + { + "epoch": 3.2258585858585858, + "grad_norm": 0.002726243808865547, + "learning_rate": 9.974176739344971e-05, + "loss": 0.0007, + "step": 998 + }, + { + "epoch": 3.229090909090909, + "grad_norm": 0.003465299028903246, + "learning_rate": 9.973946889152847e-05, + "loss": 0.0006, + "step": 999 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.002784241922199726, + "learning_rate": 9.973716023223531e-05, + "loss": 0.0007, + "step": 1000 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.0009049187647178769, + "eval_runtime": 18.6293, + "eval_samples_per_second": 5.368, + "eval_steps_per_second": 1.342, + "step": 1000 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 0.0021909528877586126, + "learning_rate": 9.97348414160417e-05, + "loss": 0.0005, + "step": 1001 + }, + { + "epoch": 3.2387878787878788, + "grad_norm": 0.003046189434826374, + "learning_rate": 9.973251244342114e-05, + "loss": 0.0009, + "step": 1002 + }, + { + "epoch": 3.242020202020202, + "grad_norm": 0.004089404363185167, + "learning_rate": 9.973017331484926e-05, + "loss": 0.0019, + "step": 1003 + }, + { + "epoch": 3.2452525252525253, + "grad_norm": 0.002897520549595356, + "learning_rate": 9.972782403080372e-05, + "loss": 0.0009, + "step": 1004 + }, + { + "epoch": 3.2484848484848485, + "grad_norm": 0.0027832165360450745, + "learning_rate": 9.972546459176425e-05, + "loss": 0.0006, + "step": 1005 + }, + { + "epoch": 3.251717171717172, + "grad_norm": 0.0021642649080604315, + "learning_rate": 9.972309499821273e-05, + "loss": 0.0005, + "step": 1006 + }, + { + "epoch": 3.254949494949495, + "grad_norm": 0.0030184059869498014, + "learning_rate": 9.972071525063303e-05, + "loss": 0.0019, + "step": 1007 + }, + { + "epoch": 3.2581818181818183, + "grad_norm": 0.0022592165041714907, + "learning_rate": 9.971832534951108e-05, + "loss": 0.0005, + "step": 1008 + }, + { + "epoch": 3.2614141414141415, + "grad_norm": 0.0013180490350350738, + "learning_rate": 9.9715925295335e-05, + "loss": 0.0004, + "step": 1009 + }, + { + "epoch": 3.264646464646465, + "grad_norm": 0.00403400557115674, + "learning_rate": 9.971351508859488e-05, + "loss": 0.0007, + "step": 1010 + }, + { + "epoch": 3.267878787878788, + "grad_norm": 0.003028259379789233, + "learning_rate": 9.971109472978288e-05, + "loss": 0.0013, + "step": 1011 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 0.004238371271640062, + "learning_rate": 9.97086642193933e-05, + "loss": 0.0008, + "step": 1012 + }, + { + "epoch": 3.274343434343434, + "grad_norm": 0.003420765744522214, + "learning_rate": 9.970622355792247e-05, + "loss": 0.002, + "step": 1013 + }, + { + "epoch": 3.2775757575757574, + "grad_norm": 0.002874986035749316, + "learning_rate": 9.970377274586879e-05, + "loss": 0.0007, + "step": 1014 + }, + { + "epoch": 3.2808080808080806, + "grad_norm": 0.001124391914345324, + "learning_rate": 9.970131178373277e-05, + "loss": 0.0003, + "step": 1015 + }, + { + "epoch": 3.284040404040404, + "grad_norm": 0.0029891314916312695, + "learning_rate": 9.969884067201695e-05, + "loss": 0.0007, + "step": 1016 + }, + { + "epoch": 3.287272727272727, + "grad_norm": 0.0026401756331324577, + "learning_rate": 9.969635941122595e-05, + "loss": 0.0007, + "step": 1017 + }, + { + "epoch": 3.2905050505050504, + "grad_norm": 0.001897217589430511, + "learning_rate": 9.969386800186649e-05, + "loss": 0.0004, + "step": 1018 + }, + { + "epoch": 3.2937373737373736, + "grad_norm": 0.002726042177528143, + "learning_rate": 9.969136644444731e-05, + "loss": 0.0008, + "step": 1019 + }, + { + "epoch": 3.296969696969697, + "grad_norm": 0.0030468441545963287, + "learning_rate": 9.968885473947932e-05, + "loss": 0.0008, + "step": 1020 + }, + { + "epoch": 3.30020202020202, + "grad_norm": 0.0016252384521067142, + "learning_rate": 9.968633288747539e-05, + "loss": 0.0004, + "step": 1021 + }, + { + "epoch": 3.3034343434343434, + "grad_norm": 0.002642759820446372, + "learning_rate": 9.968380088895052e-05, + "loss": 0.0007, + "step": 1022 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 0.003615351626649499, + "learning_rate": 9.968125874442179e-05, + "loss": 0.0008, + "step": 1023 + }, + { + "epoch": 3.30989898989899, + "grad_norm": 0.0026971250772476196, + "learning_rate": 9.96787064544083e-05, + "loss": 0.0007, + "step": 1024 + }, + { + "epoch": 3.313131313131313, + "grad_norm": 0.001658923109062016, + "learning_rate": 9.96761440194313e-05, + "loss": 0.0004, + "step": 1025 + }, + { + "epoch": 3.313131313131313, + "eval_loss": 0.0008755004382692277, + "eval_runtime": 18.6451, + "eval_samples_per_second": 5.363, + "eval_steps_per_second": 1.341, + "step": 1025 + }, + { + "epoch": 3.313131313131313, + "step": 1025, + "total_flos": 2.5838521494131835e+18, + "train_loss": 0.001856716921518943, + "train_runtime": 20010.9872, + "train_samples_per_second": 12.368, + "train_steps_per_second": 0.386 + } + ], + "logging_steps": 1, + "max_steps": 7725, + "num_input_tokens_seen": 0, + "num_train_epochs": 25, + "save_steps": 100, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 4 + } + }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} } }, - "total_flos": 1.2254685925518213e+18, + "total_flos": 2.5838521494131835e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null