diff --git "a/results/checkpoint-8922/trainer_state.json" "b/results/checkpoint-8922/trainer_state.json" new file mode 100644--- /dev/null +++ "b/results/checkpoint-8922/trainer_state.json" @@ -0,0 +1,6413 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 8922, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006724949562878279, + "grad_norm": 7.708007335662842, + "learning_rate": 1.0000000000000002e-06, + "loss": 5.3528, + "step": 10 + }, + { + "epoch": 0.013449899125756557, + "grad_norm": 4.7876434326171875, + "learning_rate": 2.0000000000000003e-06, + "loss": 5.3265, + "step": 20 + }, + { + "epoch": 0.020174848688634835, + "grad_norm": 4.998968124389648, + "learning_rate": 3e-06, + "loss": 5.2177, + "step": 30 + }, + { + "epoch": 0.026899798251513115, + "grad_norm": 4.760256290435791, + "learning_rate": 3.9e-06, + "loss": 5.2683, + "step": 40 + }, + { + "epoch": 0.03362474781439139, + "grad_norm": 6.136503219604492, + "learning_rate": 4.9000000000000005e-06, + "loss": 5.0256, + "step": 50 + }, + { + "epoch": 0.04034969737726967, + "grad_norm": 4.262901782989502, + "learning_rate": 5.9e-06, + "loss": 4.9378, + "step": 60 + }, + { + "epoch": 0.04707464694014795, + "grad_norm": 3.3165853023529053, + "learning_rate": 6.900000000000001e-06, + "loss": 4.6847, + "step": 70 + }, + { + "epoch": 0.05379959650302623, + "grad_norm": 3.6227917671203613, + "learning_rate": 7.9e-06, + "loss": 4.573, + "step": 80 + }, + { + "epoch": 0.0605245460659045, + "grad_norm": 3.023857593536377, + "learning_rate": 8.8e-06, + "loss": 4.2629, + "step": 90 + }, + { + "epoch": 0.06724949562878278, + "grad_norm": 2.9306392669677734, + "learning_rate": 9.800000000000001e-06, + "loss": 3.9944, + "step": 100 + }, + { + "epoch": 0.07397444519166106, + "grad_norm": 2.7784805297851562, + "learning_rate": 1.0700000000000001e-05, + "loss": 3.5371, + "step": 110 + }, + { + "epoch": 0.08069939475453934, + "grad_norm": 2.2752227783203125, + "learning_rate": 1.1700000000000001e-05, + "loss": 3.5498, + "step": 120 + }, + { + "epoch": 0.08742434431741762, + "grad_norm": 2.1082851886749268, + "learning_rate": 1.27e-05, + "loss": 3.3127, + "step": 130 + }, + { + "epoch": 0.0941492938802959, + "grad_norm": 2.923049211502075, + "learning_rate": 1.3700000000000001e-05, + "loss": 3.102, + "step": 140 + }, + { + "epoch": 0.10087424344317418, + "grad_norm": 2.354473352432251, + "learning_rate": 1.47e-05, + "loss": 2.9127, + "step": 150 + }, + { + "epoch": 0.10759919300605246, + "grad_norm": 2.0444324016571045, + "learning_rate": 1.5700000000000002e-05, + "loss": 2.8566, + "step": 160 + }, + { + "epoch": 0.11432414256893074, + "grad_norm": 3.295562267303467, + "learning_rate": 1.6700000000000003e-05, + "loss": 2.6766, + "step": 170 + }, + { + "epoch": 0.121049092131809, + "grad_norm": 21.337621688842773, + "learning_rate": 1.77e-05, + "loss": 2.4635, + "step": 180 + }, + { + "epoch": 0.12777404169468728, + "grad_norm": 2.203986167907715, + "learning_rate": 1.87e-05, + "loss": 2.4118, + "step": 190 + }, + { + "epoch": 0.13449899125756556, + "grad_norm": 2.7684690952301025, + "learning_rate": 1.97e-05, + "loss": 2.2789, + "step": 200 + }, + { + "epoch": 0.14122394082044384, + "grad_norm": 2.3202223777770996, + "learning_rate": 2.07e-05, + "loss": 2.1204, + "step": 210 + }, + { + "epoch": 0.14794889038332212, + "grad_norm": 2.229384660720825, + "learning_rate": 2.1700000000000002e-05, + "loss": 1.9699, + "step": 220 + }, + { + "epoch": 0.1546738399462004, + "grad_norm": 2.0208075046539307, + "learning_rate": 2.2700000000000003e-05, + "loss": 1.935, + "step": 230 + }, + { + "epoch": 0.16139878950907868, + "grad_norm": 2.1106619834899902, + "learning_rate": 2.37e-05, + "loss": 1.8648, + "step": 240 + }, + { + "epoch": 0.16812373907195696, + "grad_norm": 2.151031017303467, + "learning_rate": 2.47e-05, + "loss": 1.7949, + "step": 250 + }, + { + "epoch": 0.17484868863483524, + "grad_norm": 2.3149192333221436, + "learning_rate": 2.57e-05, + "loss": 1.6146, + "step": 260 + }, + { + "epoch": 0.18157363819771352, + "grad_norm": 2.266303539276123, + "learning_rate": 2.6700000000000002e-05, + "loss": 1.6113, + "step": 270 + }, + { + "epoch": 0.1882985877605918, + "grad_norm": NaN, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.5469, + "step": 280 + }, + { + "epoch": 0.19502353732347008, + "grad_norm": 2.277637243270874, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.5391, + "step": 290 + }, + { + "epoch": 0.20174848688634836, + "grad_norm": 2.165311574935913, + "learning_rate": 2.95e-05, + "loss": 1.407, + "step": 300 + }, + { + "epoch": 0.20847343644922664, + "grad_norm": 2.2575840950012207, + "learning_rate": 3.05e-05, + "loss": 1.2985, + "step": 310 + }, + { + "epoch": 0.21519838601210492, + "grad_norm": 2.3073315620422363, + "learning_rate": 3.15e-05, + "loss": 1.3224, + "step": 320 + }, + { + "epoch": 0.2219233355749832, + "grad_norm": 2.1623892784118652, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.314, + "step": 330 + }, + { + "epoch": 0.22864828513786148, + "grad_norm": 1.951863408088684, + "learning_rate": 3.3400000000000005e-05, + "loss": 1.257, + "step": 340 + }, + { + "epoch": 0.23537323470073973, + "grad_norm": 2.1350021362304688, + "learning_rate": 3.4399999999999996e-05, + "loss": 1.1251, + "step": 350 + }, + { + "epoch": 0.242098184263618, + "grad_norm": 2.1534841060638428, + "learning_rate": 3.53e-05, + "loss": 1.1653, + "step": 360 + }, + { + "epoch": 0.2488231338264963, + "grad_norm": 1.8336793184280396, + "learning_rate": 3.62e-05, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.25554808338937457, + "grad_norm": 1.8173967599868774, + "learning_rate": 3.72e-05, + "loss": 1.065, + "step": 380 + }, + { + "epoch": 0.26227303295225285, + "grad_norm": 1.9319089651107788, + "learning_rate": 3.82e-05, + "loss": 1.0498, + "step": 390 + }, + { + "epoch": 0.26899798251513113, + "grad_norm": 5.745934009552002, + "learning_rate": 3.9200000000000004e-05, + "loss": 1.0344, + "step": 400 + }, + { + "epoch": 0.2757229320780094, + "grad_norm": 1.7822935581207275, + "learning_rate": 4.02e-05, + "loss": 1.0393, + "step": 410 + }, + { + "epoch": 0.2824478816408877, + "grad_norm": 1.6039519309997559, + "learning_rate": 4.12e-05, + "loss": 0.9708, + "step": 420 + }, + { + "epoch": 0.28917283120376597, + "grad_norm": 2.8318278789520264, + "learning_rate": 4.22e-05, + "loss": 0.9958, + "step": 430 + }, + { + "epoch": 0.29589778076664425, + "grad_norm": 2.0125811100006104, + "learning_rate": 4.32e-05, + "loss": 0.9762, + "step": 440 + }, + { + "epoch": 0.3026227303295225, + "grad_norm": 1.8111716508865356, + "learning_rate": 4.4200000000000004e-05, + "loss": 0.8681, + "step": 450 + }, + { + "epoch": 0.3093476798924008, + "grad_norm": 1.900375247001648, + "learning_rate": 4.52e-05, + "loss": 0.8475, + "step": 460 + }, + { + "epoch": 0.3160726294552791, + "grad_norm": 2.1886966228485107, + "learning_rate": 4.61e-05, + "loss": 0.9559, + "step": 470 + }, + { + "epoch": 0.32279757901815737, + "grad_norm": 1.6458282470703125, + "learning_rate": 4.71e-05, + "loss": 0.83, + "step": 480 + }, + { + "epoch": 0.32952252858103565, + "grad_norm": 1.9217121601104736, + "learning_rate": 4.8100000000000004e-05, + "loss": 0.8023, + "step": 490 + }, + { + "epoch": 0.3362474781439139, + "grad_norm": 1.5879414081573486, + "learning_rate": 4.91e-05, + "loss": 0.7311, + "step": 500 + }, + { + "epoch": 0.3362474781439139, + "eval_loss": 0.6057256460189819, + "eval_runtime": 14.7732, + "eval_samples_per_second": 178.973, + "eval_steps_per_second": 22.405, + "step": 500 + }, + { + "epoch": 0.3429724277067922, + "grad_norm": 1.7010548114776611, + "learning_rate": 4.9994063167893614e-05, + "loss": 0.8544, + "step": 510 + }, + { + "epoch": 0.3496973772696705, + "grad_norm": 1.644062876701355, + "learning_rate": 4.993469484682973e-05, + "loss": 0.7183, + "step": 520 + }, + { + "epoch": 0.35642232683254876, + "grad_norm": 1.8534233570098877, + "learning_rate": 4.987532652576585e-05, + "loss": 0.7635, + "step": 530 + }, + { + "epoch": 0.36314727639542704, + "grad_norm": 1.9692680835723877, + "learning_rate": 4.9815958204701975e-05, + "loss": 0.7794, + "step": 540 + }, + { + "epoch": 0.3698722259583053, + "grad_norm": 2.2176966667175293, + "learning_rate": 4.975658988363809e-05, + "loss": 0.6438, + "step": 550 + }, + { + "epoch": 0.3765971755211836, + "grad_norm": 1.6217116117477417, + "learning_rate": 4.969722156257421e-05, + "loss": 0.7305, + "step": 560 + }, + { + "epoch": 0.3833221250840619, + "grad_norm": 1.4970368146896362, + "learning_rate": 4.9637853241510336e-05, + "loss": 0.6838, + "step": 570 + }, + { + "epoch": 0.39004707464694016, + "grad_norm": 1.6664597988128662, + "learning_rate": 4.957848492044645e-05, + "loss": 0.6605, + "step": 580 + }, + { + "epoch": 0.39677202420981844, + "grad_norm": 1.796850323677063, + "learning_rate": 4.951911659938257e-05, + "loss": 0.5982, + "step": 590 + }, + { + "epoch": 0.4034969737726967, + "grad_norm": 1.7564682960510254, + "learning_rate": 4.945974827831869e-05, + "loss": 0.6208, + "step": 600 + }, + { + "epoch": 0.410221923335575, + "grad_norm": 1.770803689956665, + "learning_rate": 4.940037995725481e-05, + "loss": 0.6699, + "step": 610 + }, + { + "epoch": 0.4169468728984533, + "grad_norm": 1.7247834205627441, + "learning_rate": 4.934101163619093e-05, + "loss": 0.6711, + "step": 620 + }, + { + "epoch": 0.42367182246133156, + "grad_norm": 1.7561867237091064, + "learning_rate": 4.928164331512705e-05, + "loss": 0.6286, + "step": 630 + }, + { + "epoch": 0.43039677202420984, + "grad_norm": 1.5144267082214355, + "learning_rate": 4.922227499406317e-05, + "loss": 0.6011, + "step": 640 + }, + { + "epoch": 0.4371217215870881, + "grad_norm": 1.731772780418396, + "learning_rate": 4.9162906672999294e-05, + "loss": 0.5068, + "step": 650 + }, + { + "epoch": 0.4438466711499664, + "grad_norm": 1.797025203704834, + "learning_rate": 4.9103538351935406e-05, + "loss": 0.5727, + "step": 660 + }, + { + "epoch": 0.4505716207128447, + "grad_norm": 1.615449070930481, + "learning_rate": 4.904417003087153e-05, + "loss": 0.6116, + "step": 670 + }, + { + "epoch": 0.45729657027572296, + "grad_norm": 1.2763652801513672, + "learning_rate": 4.898480170980765e-05, + "loss": 0.5898, + "step": 680 + }, + { + "epoch": 0.46402151983860124, + "grad_norm": 1.553597331047058, + "learning_rate": 4.8925433388743767e-05, + "loss": 0.5822, + "step": 690 + }, + { + "epoch": 0.47074646940147946, + "grad_norm": 1.478072166442871, + "learning_rate": 4.886606506767989e-05, + "loss": 0.565, + "step": 700 + }, + { + "epoch": 0.47747141896435774, + "grad_norm": 1.3752771615982056, + "learning_rate": 4.880669674661601e-05, + "loss": 0.5575, + "step": 710 + }, + { + "epoch": 0.484196368527236, + "grad_norm": 1.5757756233215332, + "learning_rate": 4.874732842555213e-05, + "loss": 0.5506, + "step": 720 + }, + { + "epoch": 0.4909213180901143, + "grad_norm": 1.8003489971160889, + "learning_rate": 4.8687960104488246e-05, + "loss": 0.5185, + "step": 730 + }, + { + "epoch": 0.4976462676529926, + "grad_norm": 1.8582769632339478, + "learning_rate": 4.8628591783424364e-05, + "loss": 0.4633, + "step": 740 + }, + { + "epoch": 0.5043712172158709, + "grad_norm": 1.8770896196365356, + "learning_rate": 4.856922346236049e-05, + "loss": 0.5392, + "step": 750 + }, + { + "epoch": 0.5110961667787491, + "grad_norm": 1.782762885093689, + "learning_rate": 4.8509855141296607e-05, + "loss": 0.5292, + "step": 760 + }, + { + "epoch": 0.5178211163416274, + "grad_norm": 1.5999263525009155, + "learning_rate": 4.8450486820232725e-05, + "loss": 0.5244, + "step": 770 + }, + { + "epoch": 0.5245460659045057, + "grad_norm": 1.4285874366760254, + "learning_rate": 4.839111849916885e-05, + "loss": 0.5169, + "step": 780 + }, + { + "epoch": 0.531271015467384, + "grad_norm": 1.814850091934204, + "learning_rate": 4.833175017810497e-05, + "loss": 0.4731, + "step": 790 + }, + { + "epoch": 0.5379959650302623, + "grad_norm": 1.5589072704315186, + "learning_rate": 4.8272381857041086e-05, + "loss": 0.5805, + "step": 800 + }, + { + "epoch": 0.5447209145931405, + "grad_norm": 1.4412167072296143, + "learning_rate": 4.8213013535977204e-05, + "loss": 0.5044, + "step": 810 + }, + { + "epoch": 0.5514458641560188, + "grad_norm": 1.56436288356781, + "learning_rate": 4.815364521491332e-05, + "loss": 0.4496, + "step": 820 + }, + { + "epoch": 0.5581708137188971, + "grad_norm": 1.2886687517166138, + "learning_rate": 4.8094276893849447e-05, + "loss": 0.4655, + "step": 830 + }, + { + "epoch": 0.5648957632817754, + "grad_norm": 1.4733515977859497, + "learning_rate": 4.8034908572785565e-05, + "loss": 0.4916, + "step": 840 + }, + { + "epoch": 0.5716207128446537, + "grad_norm": 1.6645103693008423, + "learning_rate": 4.797554025172168e-05, + "loss": 0.5046, + "step": 850 + }, + { + "epoch": 0.5783456624075319, + "grad_norm": 1.0857338905334473, + "learning_rate": 4.791617193065781e-05, + "loss": 0.4996, + "step": 860 + }, + { + "epoch": 0.5850706119704102, + "grad_norm": 1.4320952892303467, + "learning_rate": 4.7856803609593926e-05, + "loss": 0.4788, + "step": 870 + }, + { + "epoch": 0.5917955615332885, + "grad_norm": 1.6821132898330688, + "learning_rate": 4.7797435288530044e-05, + "loss": 0.5284, + "step": 880 + }, + { + "epoch": 0.5985205110961668, + "grad_norm": 1.1405266523361206, + "learning_rate": 4.773806696746616e-05, + "loss": 0.5179, + "step": 890 + }, + { + "epoch": 0.605245460659045, + "grad_norm": 1.2930107116699219, + "learning_rate": 4.767869864640228e-05, + "loss": 0.4225, + "step": 900 + }, + { + "epoch": 0.6119704102219233, + "grad_norm": 1.2401055097579956, + "learning_rate": 4.7619330325338405e-05, + "loss": 0.4995, + "step": 910 + }, + { + "epoch": 0.6186953597848016, + "grad_norm": 1.5603152513504028, + "learning_rate": 4.755996200427452e-05, + "loss": 0.4895, + "step": 920 + }, + { + "epoch": 0.6254203093476799, + "grad_norm": 1.407322883605957, + "learning_rate": 4.750059368321064e-05, + "loss": 0.3682, + "step": 930 + }, + { + "epoch": 0.6321452589105582, + "grad_norm": 1.2464901208877563, + "learning_rate": 4.7441225362146766e-05, + "loss": 0.4341, + "step": 940 + }, + { + "epoch": 0.6388702084734365, + "grad_norm": 1.2959908246994019, + "learning_rate": 4.738185704108288e-05, + "loss": 0.4965, + "step": 950 + }, + { + "epoch": 0.6455951580363147, + "grad_norm": 1.4426651000976562, + "learning_rate": 4.7322488720018995e-05, + "loss": 0.4894, + "step": 960 + }, + { + "epoch": 0.652320107599193, + "grad_norm": 1.3427236080169678, + "learning_rate": 4.726312039895512e-05, + "loss": 0.4193, + "step": 970 + }, + { + "epoch": 0.6590450571620713, + "grad_norm": 1.3153373003005981, + "learning_rate": 4.720375207789124e-05, + "loss": 0.4719, + "step": 980 + }, + { + "epoch": 0.6657700067249496, + "grad_norm": 1.2846280336380005, + "learning_rate": 4.714438375682736e-05, + "loss": 0.4027, + "step": 990 + }, + { + "epoch": 0.6724949562878278, + "grad_norm": 1.559920072555542, + "learning_rate": 4.708501543576348e-05, + "loss": 0.4952, + "step": 1000 + }, + { + "epoch": 0.6724949562878278, + "eval_loss": 0.3311997950077057, + "eval_runtime": 14.4715, + "eval_samples_per_second": 182.704, + "eval_steps_per_second": 22.873, + "step": 1000 + }, + { + "epoch": 0.6792199058507061, + "grad_norm": 1.4075298309326172, + "learning_rate": 4.70256471146996e-05, + "loss": 0.392, + "step": 1010 + }, + { + "epoch": 0.6859448554135844, + "grad_norm": 1.2165625095367432, + "learning_rate": 4.6966278793635724e-05, + "loss": 0.3714, + "step": 1020 + }, + { + "epoch": 0.6926698049764627, + "grad_norm": 1.2659556865692139, + "learning_rate": 4.6906910472571835e-05, + "loss": 0.4422, + "step": 1030 + }, + { + "epoch": 0.699394754539341, + "grad_norm": 1.6019095182418823, + "learning_rate": 4.684754215150795e-05, + "loss": 0.4335, + "step": 1040 + }, + { + "epoch": 0.7061197041022192, + "grad_norm": 1.350982904434204, + "learning_rate": 4.678817383044408e-05, + "loss": 0.4364, + "step": 1050 + }, + { + "epoch": 0.7128446536650975, + "grad_norm": 1.059822678565979, + "learning_rate": 4.6728805509380196e-05, + "loss": 0.3972, + "step": 1060 + }, + { + "epoch": 0.7195696032279758, + "grad_norm": 1.1738590002059937, + "learning_rate": 4.6669437188316314e-05, + "loss": 0.4753, + "step": 1070 + }, + { + "epoch": 0.7262945527908541, + "grad_norm": 1.3485381603240967, + "learning_rate": 4.661006886725244e-05, + "loss": 0.4033, + "step": 1080 + }, + { + "epoch": 0.7330195023537324, + "grad_norm": 1.2352713346481323, + "learning_rate": 4.655070054618856e-05, + "loss": 0.3844, + "step": 1090 + }, + { + "epoch": 0.7397444519166106, + "grad_norm": 0.9524800181388855, + "learning_rate": 4.6491332225124675e-05, + "loss": 0.3292, + "step": 1100 + }, + { + "epoch": 0.7464694014794889, + "grad_norm": 1.5953115224838257, + "learning_rate": 4.643196390406079e-05, + "loss": 0.4262, + "step": 1110 + }, + { + "epoch": 0.7531943510423672, + "grad_norm": 1.0904059410095215, + "learning_rate": 4.637259558299691e-05, + "loss": 0.3794, + "step": 1120 + }, + { + "epoch": 0.7599193006052455, + "grad_norm": 1.1243659257888794, + "learning_rate": 4.6313227261933036e-05, + "loss": 0.3931, + "step": 1130 + }, + { + "epoch": 0.7666442501681238, + "grad_norm": 1.7865381240844727, + "learning_rate": 4.6253858940869154e-05, + "loss": 0.3758, + "step": 1140 + }, + { + "epoch": 0.773369199731002, + "grad_norm": 1.322513222694397, + "learning_rate": 4.619449061980527e-05, + "loss": 0.3805, + "step": 1150 + }, + { + "epoch": 0.7800941492938803, + "grad_norm": 1.1556380987167358, + "learning_rate": 4.61351222987414e-05, + "loss": 0.4068, + "step": 1160 + }, + { + "epoch": 0.7868190988567586, + "grad_norm": 4.545334339141846, + "learning_rate": 4.6075753977677515e-05, + "loss": 0.3823, + "step": 1170 + }, + { + "epoch": 0.7935440484196369, + "grad_norm": 1.322127103805542, + "learning_rate": 4.601638565661363e-05, + "loss": 0.4151, + "step": 1180 + }, + { + "epoch": 0.8002689979825152, + "grad_norm": 0.9689624905586243, + "learning_rate": 4.595701733554975e-05, + "loss": 0.3771, + "step": 1190 + }, + { + "epoch": 0.8069939475453934, + "grad_norm": 1.1350356340408325, + "learning_rate": 4.589764901448587e-05, + "loss": 0.3708, + "step": 1200 + }, + { + "epoch": 0.8137188971082717, + "grad_norm": 1.15723717212677, + "learning_rate": 4.5838280693421994e-05, + "loss": 0.4063, + "step": 1210 + }, + { + "epoch": 0.82044384667115, + "grad_norm": 1.017109990119934, + "learning_rate": 4.577891237235811e-05, + "loss": 0.4116, + "step": 1220 + }, + { + "epoch": 0.8271687962340283, + "grad_norm": 1.16199791431427, + "learning_rate": 4.571954405129423e-05, + "loss": 0.4085, + "step": 1230 + }, + { + "epoch": 0.8338937457969066, + "grad_norm": 1.7634154558181763, + "learning_rate": 4.5660175730230355e-05, + "loss": 0.369, + "step": 1240 + }, + { + "epoch": 0.8406186953597848, + "grad_norm": 1.144355297088623, + "learning_rate": 4.5600807409166466e-05, + "loss": 0.3807, + "step": 1250 + }, + { + "epoch": 0.8473436449226631, + "grad_norm": 1.0942720174789429, + "learning_rate": 4.554143908810259e-05, + "loss": 0.3907, + "step": 1260 + }, + { + "epoch": 0.8540685944855414, + "grad_norm": 1.1029889583587646, + "learning_rate": 4.548207076703871e-05, + "loss": 0.3803, + "step": 1270 + }, + { + "epoch": 0.8607935440484197, + "grad_norm": 0.9530211687088013, + "learning_rate": 4.542270244597483e-05, + "loss": 0.3334, + "step": 1280 + }, + { + "epoch": 0.867518493611298, + "grad_norm": 1.1777485609054565, + "learning_rate": 4.536333412491095e-05, + "loss": 0.3641, + "step": 1290 + }, + { + "epoch": 0.8742434431741762, + "grad_norm": 1.2794089317321777, + "learning_rate": 4.530396580384707e-05, + "loss": 0.3405, + "step": 1300 + }, + { + "epoch": 0.8809683927370545, + "grad_norm": 1.0702439546585083, + "learning_rate": 4.524459748278319e-05, + "loss": 0.396, + "step": 1310 + }, + { + "epoch": 0.8876933422999328, + "grad_norm": 1.5103455781936646, + "learning_rate": 4.518522916171931e-05, + "loss": 0.4143, + "step": 1320 + }, + { + "epoch": 0.8944182918628111, + "grad_norm": 1.0924547910690308, + "learning_rate": 4.5125860840655424e-05, + "loss": 0.3117, + "step": 1330 + }, + { + "epoch": 0.9011432414256894, + "grad_norm": 1.2703629732131958, + "learning_rate": 4.506649251959155e-05, + "loss": 0.3284, + "step": 1340 + }, + { + "epoch": 0.9078681909885676, + "grad_norm": 1.138601541519165, + "learning_rate": 4.500712419852767e-05, + "loss": 0.325, + "step": 1350 + }, + { + "epoch": 0.9145931405514459, + "grad_norm": 1.1391545534133911, + "learning_rate": 4.4947755877463785e-05, + "loss": 0.3468, + "step": 1360 + }, + { + "epoch": 0.9213180901143242, + "grad_norm": 1.068265676498413, + "learning_rate": 4.488838755639991e-05, + "loss": 0.4056, + "step": 1370 + }, + { + "epoch": 0.9280430396772025, + "grad_norm": 1.1206265687942505, + "learning_rate": 4.482901923533603e-05, + "loss": 0.3516, + "step": 1380 + }, + { + "epoch": 0.9347679892400808, + "grad_norm": 1.178755283355713, + "learning_rate": 4.4769650914272146e-05, + "loss": 0.3368, + "step": 1390 + }, + { + "epoch": 0.9414929388029589, + "grad_norm": 1.0818071365356445, + "learning_rate": 4.4710282593208264e-05, + "loss": 0.3453, + "step": 1400 + }, + { + "epoch": 0.9482178883658372, + "grad_norm": 1.2789335250854492, + "learning_rate": 4.465091427214438e-05, + "loss": 0.3647, + "step": 1410 + }, + { + "epoch": 0.9549428379287155, + "grad_norm": 1.2054712772369385, + "learning_rate": 4.459154595108051e-05, + "loss": 0.361, + "step": 1420 + }, + { + "epoch": 0.9616677874915938, + "grad_norm": 1.1972471475601196, + "learning_rate": 4.4532177630016625e-05, + "loss": 0.3843, + "step": 1430 + }, + { + "epoch": 0.968392737054472, + "grad_norm": 1.224511981010437, + "learning_rate": 4.4472809308952743e-05, + "loss": 0.3281, + "step": 1440 + }, + { + "epoch": 0.9751176866173503, + "grad_norm": 1.6475602388381958, + "learning_rate": 4.441344098788887e-05, + "loss": 0.3414, + "step": 1450 + }, + { + "epoch": 0.9818426361802286, + "grad_norm": 1.2841564416885376, + "learning_rate": 4.4354072666824986e-05, + "loss": 0.326, + "step": 1460 + }, + { + "epoch": 0.9885675857431069, + "grad_norm": 1.0296547412872314, + "learning_rate": 4.4294704345761104e-05, + "loss": 0.3432, + "step": 1470 + }, + { + "epoch": 0.9952925353059852, + "grad_norm": 1.5025538206100464, + "learning_rate": 4.423533602469722e-05, + "loss": 0.3704, + "step": 1480 + }, + { + "epoch": 1.0020174848688634, + "grad_norm": 0.8826779723167419, + "learning_rate": 4.417596770363334e-05, + "loss": 0.3386, + "step": 1490 + }, + { + "epoch": 1.0087424344317417, + "grad_norm": 1.1697226762771606, + "learning_rate": 4.4116599382569465e-05, + "loss": 0.3141, + "step": 1500 + }, + { + "epoch": 1.0087424344317417, + "eval_loss": 0.26416271924972534, + "eval_runtime": 14.3375, + "eval_samples_per_second": 184.411, + "eval_steps_per_second": 23.086, + "step": 1500 + }, + { + "epoch": 1.01546738399462, + "grad_norm": 1.0717393159866333, + "learning_rate": 4.4057231061505583e-05, + "loss": 0.3196, + "step": 1510 + }, + { + "epoch": 1.0221923335574983, + "grad_norm": 1.0421812534332275, + "learning_rate": 4.39978627404417e-05, + "loss": 0.3434, + "step": 1520 + }, + { + "epoch": 1.0289172831203766, + "grad_norm": 1.326445460319519, + "learning_rate": 4.3938494419377826e-05, + "loss": 0.3639, + "step": 1530 + }, + { + "epoch": 1.0356422326832548, + "grad_norm": 0.7816503047943115, + "learning_rate": 4.3879126098313944e-05, + "loss": 0.3278, + "step": 1540 + }, + { + "epoch": 1.0423671822461331, + "grad_norm": 1.133551836013794, + "learning_rate": 4.3819757777250056e-05, + "loss": 0.3527, + "step": 1550 + }, + { + "epoch": 1.0490921318090114, + "grad_norm": 1.0650932788848877, + "learning_rate": 4.376038945618618e-05, + "loss": 0.3401, + "step": 1560 + }, + { + "epoch": 1.0558170813718897, + "grad_norm": 1.3639334440231323, + "learning_rate": 4.37010211351223e-05, + "loss": 0.372, + "step": 1570 + }, + { + "epoch": 1.062542030934768, + "grad_norm": 1.162127137184143, + "learning_rate": 4.3641652814058423e-05, + "loss": 0.3862, + "step": 1580 + }, + { + "epoch": 1.0692669804976462, + "grad_norm": 1.0656379461288452, + "learning_rate": 4.358228449299454e-05, + "loss": 0.302, + "step": 1590 + }, + { + "epoch": 1.0759919300605245, + "grad_norm": 1.2670329809188843, + "learning_rate": 4.352291617193066e-05, + "loss": 0.3013, + "step": 1600 + }, + { + "epoch": 1.0827168796234028, + "grad_norm": 1.1603080034255981, + "learning_rate": 4.3463547850866784e-05, + "loss": 0.2655, + "step": 1610 + }, + { + "epoch": 1.089441829186281, + "grad_norm": 1.0313270092010498, + "learning_rate": 4.34041795298029e-05, + "loss": 0.3272, + "step": 1620 + }, + { + "epoch": 1.0961667787491594, + "grad_norm": 0.8739041090011597, + "learning_rate": 4.3344811208739014e-05, + "loss": 0.3031, + "step": 1630 + }, + { + "epoch": 1.1028917283120376, + "grad_norm": 1.0449531078338623, + "learning_rate": 4.328544288767514e-05, + "loss": 0.2969, + "step": 1640 + }, + { + "epoch": 1.109616677874916, + "grad_norm": 1.065719485282898, + "learning_rate": 4.322607456661126e-05, + "loss": 0.3422, + "step": 1650 + }, + { + "epoch": 1.1163416274377942, + "grad_norm": 1.2147116661071777, + "learning_rate": 4.3166706245547375e-05, + "loss": 0.309, + "step": 1660 + }, + { + "epoch": 1.1230665770006725, + "grad_norm": 0.8757501840591431, + "learning_rate": 4.31073379244835e-05, + "loss": 0.2781, + "step": 1670 + }, + { + "epoch": 1.1297915265635508, + "grad_norm": 1.0971019268035889, + "learning_rate": 4.304796960341962e-05, + "loss": 0.2931, + "step": 1680 + }, + { + "epoch": 1.136516476126429, + "grad_norm": 1.1437770128250122, + "learning_rate": 4.298860128235574e-05, + "loss": 0.2595, + "step": 1690 + }, + { + "epoch": 1.1432414256893073, + "grad_norm": 1.025486707687378, + "learning_rate": 4.2929232961291854e-05, + "loss": 0.2598, + "step": 1700 + }, + { + "epoch": 1.1499663752521856, + "grad_norm": 1.2916457653045654, + "learning_rate": 4.286986464022797e-05, + "loss": 0.2544, + "step": 1710 + }, + { + "epoch": 1.1566913248150639, + "grad_norm": 1.05109441280365, + "learning_rate": 4.28104963191641e-05, + "loss": 0.3443, + "step": 1720 + }, + { + "epoch": 1.1634162743779422, + "grad_norm": 0.974449634552002, + "learning_rate": 4.2751127998100215e-05, + "loss": 0.2725, + "step": 1730 + }, + { + "epoch": 1.1701412239408204, + "grad_norm": 0.9677799940109253, + "learning_rate": 4.269175967703633e-05, + "loss": 0.3194, + "step": 1740 + }, + { + "epoch": 1.1768661735036987, + "grad_norm": 1.2493693828582764, + "learning_rate": 4.263239135597246e-05, + "loss": 0.3004, + "step": 1750 + }, + { + "epoch": 1.183591123066577, + "grad_norm": 1.0732406377792358, + "learning_rate": 4.2573023034908576e-05, + "loss": 0.2952, + "step": 1760 + }, + { + "epoch": 1.1903160726294553, + "grad_norm": 1.2802927494049072, + "learning_rate": 4.2513654713844694e-05, + "loss": 0.2894, + "step": 1770 + }, + { + "epoch": 1.1970410221923335, + "grad_norm": 0.8654187917709351, + "learning_rate": 4.245428639278081e-05, + "loss": 0.2714, + "step": 1780 + }, + { + "epoch": 1.2037659717552118, + "grad_norm": 1.3110740184783936, + "learning_rate": 4.239491807171693e-05, + "loss": 0.2993, + "step": 1790 + }, + { + "epoch": 1.21049092131809, + "grad_norm": 1.2888758182525635, + "learning_rate": 4.2335549750653055e-05, + "loss": 0.3015, + "step": 1800 + }, + { + "epoch": 1.2172158708809684, + "grad_norm": 1.1518973112106323, + "learning_rate": 4.227618142958917e-05, + "loss": 0.2795, + "step": 1810 + }, + { + "epoch": 1.2239408204438467, + "grad_norm": 1.0696250200271606, + "learning_rate": 4.221681310852529e-05, + "loss": 0.2756, + "step": 1820 + }, + { + "epoch": 1.230665770006725, + "grad_norm": 0.879612147808075, + "learning_rate": 4.2157444787461416e-05, + "loss": 0.3179, + "step": 1830 + }, + { + "epoch": 1.2373907195696032, + "grad_norm": 0.8837944269180298, + "learning_rate": 4.2098076466397534e-05, + "loss": 0.2439, + "step": 1840 + }, + { + "epoch": 1.2441156691324815, + "grad_norm": 1.0999325513839722, + "learning_rate": 4.203870814533365e-05, + "loss": 0.3235, + "step": 1850 + }, + { + "epoch": 1.2508406186953598, + "grad_norm": 1.1159007549285889, + "learning_rate": 4.197933982426977e-05, + "loss": 0.2855, + "step": 1860 + }, + { + "epoch": 1.257565568258238, + "grad_norm": 0.933502733707428, + "learning_rate": 4.191997150320589e-05, + "loss": 0.277, + "step": 1870 + }, + { + "epoch": 1.2642905178211163, + "grad_norm": 1.038533091545105, + "learning_rate": 4.186060318214201e-05, + "loss": 0.3594, + "step": 1880 + }, + { + "epoch": 1.2710154673839946, + "grad_norm": 1.0289478302001953, + "learning_rate": 4.180123486107813e-05, + "loss": 0.271, + "step": 1890 + }, + { + "epoch": 1.277740416946873, + "grad_norm": 0.97165846824646, + "learning_rate": 4.174186654001425e-05, + "loss": 0.2784, + "step": 1900 + }, + { + "epoch": 1.2844653665097512, + "grad_norm": 1.3086190223693848, + "learning_rate": 4.1682498218950374e-05, + "loss": 0.2883, + "step": 1910 + }, + { + "epoch": 1.2911903160726295, + "grad_norm": 0.9382519721984863, + "learning_rate": 4.162312989788649e-05, + "loss": 0.2936, + "step": 1920 + }, + { + "epoch": 1.2979152656355077, + "grad_norm": 0.9957454800605774, + "learning_rate": 4.156376157682261e-05, + "loss": 0.3137, + "step": 1930 + }, + { + "epoch": 1.304640215198386, + "grad_norm": 0.8603612184524536, + "learning_rate": 4.150439325575873e-05, + "loss": 0.2735, + "step": 1940 + }, + { + "epoch": 1.3113651647612643, + "grad_norm": 0.9475612044334412, + "learning_rate": 4.1445024934694846e-05, + "loss": 0.2763, + "step": 1950 + }, + { + "epoch": 1.3180901143241426, + "grad_norm": 1.2513048648834229, + "learning_rate": 4.138565661363097e-05, + "loss": 0.254, + "step": 1960 + }, + { + "epoch": 1.3248150638870209, + "grad_norm": 1.067569375038147, + "learning_rate": 4.132628829256709e-05, + "loss": 0.235, + "step": 1970 + }, + { + "epoch": 1.3315400134498991, + "grad_norm": 1.2715383768081665, + "learning_rate": 4.126691997150321e-05, + "loss": 0.2932, + "step": 1980 + }, + { + "epoch": 1.3382649630127774, + "grad_norm": 0.8481950163841248, + "learning_rate": 4.120755165043933e-05, + "loss": 0.2962, + "step": 1990 + }, + { + "epoch": 1.3449899125756557, + "grad_norm": 0.9146913290023804, + "learning_rate": 4.114818332937544e-05, + "loss": 0.2988, + "step": 2000 + }, + { + "epoch": 1.3449899125756557, + "eval_loss": 0.22625228762626648, + "eval_runtime": 14.3844, + "eval_samples_per_second": 183.81, + "eval_steps_per_second": 23.011, + "step": 2000 + }, + { + "epoch": 1.351714862138534, + "grad_norm": 1.3533391952514648, + "learning_rate": 4.108881500831157e-05, + "loss": 0.2837, + "step": 2010 + }, + { + "epoch": 1.3584398117014123, + "grad_norm": 1.1099708080291748, + "learning_rate": 4.1029446687247686e-05, + "loss": 0.2536, + "step": 2020 + }, + { + "epoch": 1.3651647612642905, + "grad_norm": 0.7431579828262329, + "learning_rate": 4.0970078366183804e-05, + "loss": 0.2192, + "step": 2030 + }, + { + "epoch": 1.3718897108271688, + "grad_norm": 0.8818496465682983, + "learning_rate": 4.091071004511993e-05, + "loss": 0.2197, + "step": 2040 + }, + { + "epoch": 1.378614660390047, + "grad_norm": 1.0587776899337769, + "learning_rate": 4.085134172405605e-05, + "loss": 0.2696, + "step": 2050 + }, + { + "epoch": 1.3853396099529254, + "grad_norm": 1.0173654556274414, + "learning_rate": 4.0791973402992165e-05, + "loss": 0.2839, + "step": 2060 + }, + { + "epoch": 1.3920645595158037, + "grad_norm": 1.1157002449035645, + "learning_rate": 4.073260508192828e-05, + "loss": 0.2893, + "step": 2070 + }, + { + "epoch": 1.398789509078682, + "grad_norm": 0.9677891135215759, + "learning_rate": 4.06732367608644e-05, + "loss": 0.2471, + "step": 2080 + }, + { + "epoch": 1.4055144586415602, + "grad_norm": 0.6478651762008667, + "learning_rate": 4.0613868439800526e-05, + "loss": 0.222, + "step": 2090 + }, + { + "epoch": 1.4122394082044385, + "grad_norm": 0.9797048568725586, + "learning_rate": 4.0554500118736644e-05, + "loss": 0.2671, + "step": 2100 + }, + { + "epoch": 1.4189643577673168, + "grad_norm": 0.9667022824287415, + "learning_rate": 4.049513179767276e-05, + "loss": 0.2672, + "step": 2110 + }, + { + "epoch": 1.425689307330195, + "grad_norm": 0.7505651712417603, + "learning_rate": 4.043576347660889e-05, + "loss": 0.2854, + "step": 2120 + }, + { + "epoch": 1.4324142568930733, + "grad_norm": 0.9649038910865784, + "learning_rate": 4.0376395155545005e-05, + "loss": 0.2819, + "step": 2130 + }, + { + "epoch": 1.4391392064559516, + "grad_norm": 1.242560863494873, + "learning_rate": 4.031702683448112e-05, + "loss": 0.2844, + "step": 2140 + }, + { + "epoch": 1.44586415601883, + "grad_norm": 1.0524150133132935, + "learning_rate": 4.025765851341724e-05, + "loss": 0.2925, + "step": 2150 + }, + { + "epoch": 1.4525891055817082, + "grad_norm": 1.135675072669983, + "learning_rate": 4.019829019235336e-05, + "loss": 0.2961, + "step": 2160 + }, + { + "epoch": 1.4593140551445865, + "grad_norm": 1.071516752243042, + "learning_rate": 4.0138921871289484e-05, + "loss": 0.2502, + "step": 2170 + }, + { + "epoch": 1.4660390047074647, + "grad_norm": 0.862398087978363, + "learning_rate": 4.00795535502256e-05, + "loss": 0.2646, + "step": 2180 + }, + { + "epoch": 1.472763954270343, + "grad_norm": 1.5514510869979858, + "learning_rate": 4.002018522916172e-05, + "loss": 0.2775, + "step": 2190 + }, + { + "epoch": 1.4794889038332213, + "grad_norm": 1.1586170196533203, + "learning_rate": 3.9960816908097845e-05, + "loss": 0.2749, + "step": 2200 + }, + { + "epoch": 1.4862138533960996, + "grad_norm": 0.7154861688613892, + "learning_rate": 3.990144858703396e-05, + "loss": 0.2357, + "step": 2210 + }, + { + "epoch": 1.4929388029589779, + "grad_norm": 0.9317541122436523, + "learning_rate": 3.9842080265970075e-05, + "loss": 0.2601, + "step": 2220 + }, + { + "epoch": 1.4996637525218561, + "grad_norm": 1.2377835512161255, + "learning_rate": 3.97827119449062e-05, + "loss": 0.2765, + "step": 2230 + }, + { + "epoch": 1.5063887020847344, + "grad_norm": 0.704967200756073, + "learning_rate": 3.972334362384232e-05, + "loss": 0.2587, + "step": 2240 + }, + { + "epoch": 1.5131136516476127, + "grad_norm": 1.0047144889831543, + "learning_rate": 3.9663975302778436e-05, + "loss": 0.2535, + "step": 2250 + }, + { + "epoch": 1.519838601210491, + "grad_norm": 1.10822331905365, + "learning_rate": 3.960460698171456e-05, + "loss": 0.2447, + "step": 2260 + }, + { + "epoch": 1.5265635507733692, + "grad_norm": 1.041707992553711, + "learning_rate": 3.954523866065068e-05, + "loss": 0.2761, + "step": 2270 + }, + { + "epoch": 1.5332885003362475, + "grad_norm": 0.8031961917877197, + "learning_rate": 3.94858703395868e-05, + "loss": 0.2171, + "step": 2280 + }, + { + "epoch": 1.5400134498991258, + "grad_norm": 0.9082476496696472, + "learning_rate": 3.942650201852292e-05, + "loss": 0.2653, + "step": 2290 + }, + { + "epoch": 1.546738399462004, + "grad_norm": 0.9983156323432922, + "learning_rate": 3.936713369745903e-05, + "loss": 0.2718, + "step": 2300 + }, + { + "epoch": 1.5534633490248824, + "grad_norm": 0.7857641577720642, + "learning_rate": 3.930776537639516e-05, + "loss": 0.2479, + "step": 2310 + }, + { + "epoch": 1.5601882985877606, + "grad_norm": 0.7820498943328857, + "learning_rate": 3.9248397055331276e-05, + "loss": 0.2826, + "step": 2320 + }, + { + "epoch": 1.5669132481506387, + "grad_norm": 0.9067456722259521, + "learning_rate": 3.9189028734267394e-05, + "loss": 0.3029, + "step": 2330 + }, + { + "epoch": 1.573638197713517, + "grad_norm": 0.7091119289398193, + "learning_rate": 3.912966041320352e-05, + "loss": 0.222, + "step": 2340 + }, + { + "epoch": 1.5803631472763953, + "grad_norm": 1.0687780380249023, + "learning_rate": 3.9070292092139637e-05, + "loss": 0.2741, + "step": 2350 + }, + { + "epoch": 1.5870880968392735, + "grad_norm": 0.7274528741836548, + "learning_rate": 3.9010923771075755e-05, + "loss": 0.275, + "step": 2360 + }, + { + "epoch": 1.5938130464021518, + "grad_norm": 0.7796964645385742, + "learning_rate": 3.895155545001187e-05, + "loss": 0.2754, + "step": 2370 + }, + { + "epoch": 1.60053799596503, + "grad_norm": 0.9450284838676453, + "learning_rate": 3.889218712894799e-05, + "loss": 0.2318, + "step": 2380 + }, + { + "epoch": 1.6072629455279084, + "grad_norm": 0.9865807890892029, + "learning_rate": 3.8832818807884116e-05, + "loss": 0.2851, + "step": 2390 + }, + { + "epoch": 1.6139878950907867, + "grad_norm": 0.9114990830421448, + "learning_rate": 3.8773450486820234e-05, + "loss": 0.2232, + "step": 2400 + }, + { + "epoch": 1.620712844653665, + "grad_norm": 1.1912641525268555, + "learning_rate": 3.871408216575635e-05, + "loss": 0.2125, + "step": 2410 + }, + { + "epoch": 1.6274377942165432, + "grad_norm": 0.9785919189453125, + "learning_rate": 3.8654713844692477e-05, + "loss": 0.2779, + "step": 2420 + }, + { + "epoch": 1.6341627437794215, + "grad_norm": 1.6103061437606812, + "learning_rate": 3.8595345523628595e-05, + "loss": 0.242, + "step": 2430 + }, + { + "epoch": 1.6408876933422998, + "grad_norm": 1.161123275756836, + "learning_rate": 3.853597720256471e-05, + "loss": 0.2297, + "step": 2440 + }, + { + "epoch": 1.647612642905178, + "grad_norm": 0.8608954548835754, + "learning_rate": 3.847660888150083e-05, + "loss": 0.2907, + "step": 2450 + }, + { + "epoch": 1.6543375924680563, + "grad_norm": 0.7341004610061646, + "learning_rate": 3.841724056043695e-05, + "loss": 0.2325, + "step": 2460 + }, + { + "epoch": 1.6610625420309346, + "grad_norm": 0.976824939250946, + "learning_rate": 3.8357872239373074e-05, + "loss": 0.2631, + "step": 2470 + }, + { + "epoch": 1.667787491593813, + "grad_norm": 0.9651874303817749, + "learning_rate": 3.829850391830919e-05, + "loss": 0.2731, + "step": 2480 + }, + { + "epoch": 1.6745124411566912, + "grad_norm": 0.856626570224762, + "learning_rate": 3.823913559724531e-05, + "loss": 0.221, + "step": 2490 + }, + { + "epoch": 1.6812373907195695, + "grad_norm": 0.990326464176178, + "learning_rate": 3.8179767276181435e-05, + "loss": 0.3172, + "step": 2500 + }, + { + "epoch": 1.6812373907195695, + "eval_loss": 0.2056342214345932, + "eval_runtime": 14.2733, + "eval_samples_per_second": 185.241, + "eval_steps_per_second": 23.19, + "step": 2500 + }, + { + "epoch": 1.6879623402824477, + "grad_norm": 0.8716689348220825, + "learning_rate": 3.812039895511755e-05, + "loss": 0.242, + "step": 2510 + }, + { + "epoch": 1.694687289845326, + "grad_norm": 1.0509111881256104, + "learning_rate": 3.806103063405367e-05, + "loss": 0.2674, + "step": 2520 + }, + { + "epoch": 1.7014122394082043, + "grad_norm": 0.7472453713417053, + "learning_rate": 3.800166231298979e-05, + "loss": 0.2856, + "step": 2530 + }, + { + "epoch": 1.7081371889710826, + "grad_norm": 0.9000864624977112, + "learning_rate": 3.794229399192591e-05, + "loss": 0.3039, + "step": 2540 + }, + { + "epoch": 1.7148621385339609, + "grad_norm": 0.8913015127182007, + "learning_rate": 3.788292567086203e-05, + "loss": 0.3029, + "step": 2550 + }, + { + "epoch": 1.7215870880968391, + "grad_norm": 0.8225172758102417, + "learning_rate": 3.782355734979815e-05, + "loss": 0.2137, + "step": 2560 + }, + { + "epoch": 1.7283120376597174, + "grad_norm": 0.9405450224876404, + "learning_rate": 3.776418902873427e-05, + "loss": 0.2064, + "step": 2570 + }, + { + "epoch": 1.7350369872225957, + "grad_norm": 0.9194093942642212, + "learning_rate": 3.770482070767039e-05, + "loss": 0.2301, + "step": 2580 + }, + { + "epoch": 1.741761936785474, + "grad_norm": 0.8586764931678772, + "learning_rate": 3.764545238660651e-05, + "loss": 0.2553, + "step": 2590 + }, + { + "epoch": 1.7484868863483523, + "grad_norm": 1.3633416891098022, + "learning_rate": 3.758608406554263e-05, + "loss": 0.2307, + "step": 2600 + }, + { + "epoch": 1.7552118359112305, + "grad_norm": 0.7213522791862488, + "learning_rate": 3.752671574447875e-05, + "loss": 0.251, + "step": 2610 + }, + { + "epoch": 1.7619367854741088, + "grad_norm": 1.1806386709213257, + "learning_rate": 3.7467347423414865e-05, + "loss": 0.2753, + "step": 2620 + }, + { + "epoch": 1.768661735036987, + "grad_norm": 1.203027367591858, + "learning_rate": 3.740797910235099e-05, + "loss": 0.2878, + "step": 2630 + }, + { + "epoch": 1.7753866845998654, + "grad_norm": 0.9682397842407227, + "learning_rate": 3.734861078128711e-05, + "loss": 0.2722, + "step": 2640 + }, + { + "epoch": 1.7821116341627437, + "grad_norm": 1.2024784088134766, + "learning_rate": 3.7289242460223226e-05, + "loss": 0.1792, + "step": 2650 + }, + { + "epoch": 1.788836583725622, + "grad_norm": 0.9232836365699768, + "learning_rate": 3.722987413915935e-05, + "loss": 0.2255, + "step": 2660 + }, + { + "epoch": 1.7955615332885002, + "grad_norm": 0.8811613917350769, + "learning_rate": 3.717050581809546e-05, + "loss": 0.2608, + "step": 2670 + }, + { + "epoch": 1.8022864828513785, + "grad_norm": 0.9594545364379883, + "learning_rate": 3.711113749703159e-05, + "loss": 0.2906, + "step": 2680 + }, + { + "epoch": 1.8090114324142568, + "grad_norm": 0.8096931576728821, + "learning_rate": 3.7051769175967705e-05, + "loss": 0.2148, + "step": 2690 + }, + { + "epoch": 1.815736381977135, + "grad_norm": 1.0215779542922974, + "learning_rate": 3.699240085490382e-05, + "loss": 0.2833, + "step": 2700 + }, + { + "epoch": 1.8224613315400133, + "grad_norm": 0.8041989207267761, + "learning_rate": 3.693303253383995e-05, + "loss": 0.1985, + "step": 2710 + }, + { + "epoch": 1.8291862811028916, + "grad_norm": 1.1806946992874146, + "learning_rate": 3.6873664212776066e-05, + "loss": 0.2714, + "step": 2720 + }, + { + "epoch": 1.8359112306657699, + "grad_norm": 0.9091224670410156, + "learning_rate": 3.6814295891712184e-05, + "loss": 0.2034, + "step": 2730 + }, + { + "epoch": 1.8426361802286482, + "grad_norm": 0.7493769526481628, + "learning_rate": 3.675492757064831e-05, + "loss": 0.2219, + "step": 2740 + }, + { + "epoch": 1.8493611297915264, + "grad_norm": 0.8476701974868774, + "learning_rate": 3.669555924958442e-05, + "loss": 0.2407, + "step": 2750 + }, + { + "epoch": 1.8560860793544047, + "grad_norm": 0.8047271370887756, + "learning_rate": 3.6636190928520545e-05, + "loss": 0.2388, + "step": 2760 + }, + { + "epoch": 1.862811028917283, + "grad_norm": 0.8351836800575256, + "learning_rate": 3.657682260745666e-05, + "loss": 0.2393, + "step": 2770 + }, + { + "epoch": 1.8695359784801613, + "grad_norm": 0.8370159268379211, + "learning_rate": 3.651745428639278e-05, + "loss": 0.2252, + "step": 2780 + }, + { + "epoch": 1.8762609280430396, + "grad_norm": 1.0017287731170654, + "learning_rate": 3.6458085965328906e-05, + "loss": 0.2861, + "step": 2790 + }, + { + "epoch": 1.8829858776059178, + "grad_norm": 0.6298861503601074, + "learning_rate": 3.6398717644265024e-05, + "loss": 0.2139, + "step": 2800 + }, + { + "epoch": 1.8897108271687961, + "grad_norm": 0.7649684548377991, + "learning_rate": 3.633934932320114e-05, + "loss": 0.2377, + "step": 2810 + }, + { + "epoch": 1.8964357767316744, + "grad_norm": 0.9333203434944153, + "learning_rate": 3.627998100213726e-05, + "loss": 0.2546, + "step": 2820 + }, + { + "epoch": 1.9031607262945527, + "grad_norm": 0.8119333386421204, + "learning_rate": 3.622061268107338e-05, + "loss": 0.2792, + "step": 2830 + }, + { + "epoch": 1.909885675857431, + "grad_norm": 0.8402873873710632, + "learning_rate": 3.6161244360009496e-05, + "loss": 0.2776, + "step": 2840 + }, + { + "epoch": 1.9166106254203092, + "grad_norm": 0.6565370559692383, + "learning_rate": 3.610187603894562e-05, + "loss": 0.2081, + "step": 2850 + }, + { + "epoch": 1.9233355749831875, + "grad_norm": 0.818225085735321, + "learning_rate": 3.604250771788174e-05, + "loss": 0.2271, + "step": 2860 + }, + { + "epoch": 1.9300605245460658, + "grad_norm": 0.6470263004302979, + "learning_rate": 3.5983139396817864e-05, + "loss": 0.2026, + "step": 2870 + }, + { + "epoch": 1.936785474108944, + "grad_norm": 0.8524275422096252, + "learning_rate": 3.592377107575398e-05, + "loss": 0.2628, + "step": 2880 + }, + { + "epoch": 1.9435104236718224, + "grad_norm": 0.7875599265098572, + "learning_rate": 3.58644027546901e-05, + "loss": 0.281, + "step": 2890 + }, + { + "epoch": 1.9502353732347006, + "grad_norm": 0.9621008038520813, + "learning_rate": 3.580503443362622e-05, + "loss": 0.2349, + "step": 2900 + }, + { + "epoch": 1.956960322797579, + "grad_norm": 0.8459897041320801, + "learning_rate": 3.5745666112562336e-05, + "loss": 0.2372, + "step": 2910 + }, + { + "epoch": 1.9636852723604572, + "grad_norm": 0.9341444969177246, + "learning_rate": 3.5686297791498454e-05, + "loss": 0.2533, + "step": 2920 + }, + { + "epoch": 1.9704102219233355, + "grad_norm": 0.6931284666061401, + "learning_rate": 3.562692947043458e-05, + "loss": 0.1903, + "step": 2930 + }, + { + "epoch": 1.9771351714862138, + "grad_norm": 0.8395922780036926, + "learning_rate": 3.55675611493707e-05, + "loss": 0.2997, + "step": 2940 + }, + { + "epoch": 1.983860121049092, + "grad_norm": 1.1813199520111084, + "learning_rate": 3.5508192828306815e-05, + "loss": 0.2437, + "step": 2950 + }, + { + "epoch": 1.9905850706119703, + "grad_norm": 1.0604130029678345, + "learning_rate": 3.544882450724294e-05, + "loss": 0.2402, + "step": 2960 + }, + { + "epoch": 1.9973100201748486, + "grad_norm": 0.7427442073822021, + "learning_rate": 3.538945618617905e-05, + "loss": 0.2111, + "step": 2970 + }, + { + "epoch": 2.004034969737727, + "grad_norm": 0.7738786339759827, + "learning_rate": 3.5330087865115176e-05, + "loss": 0.2503, + "step": 2980 + }, + { + "epoch": 2.010759919300605, + "grad_norm": 1.0489563941955566, + "learning_rate": 3.5270719544051294e-05, + "loss": 0.2051, + "step": 2990 + }, + { + "epoch": 2.0174848688634834, + "grad_norm": 1.0015239715576172, + "learning_rate": 3.521135122298741e-05, + "loss": 0.2148, + "step": 3000 + }, + { + "epoch": 2.0174848688634834, + "eval_loss": 0.19253146648406982, + "eval_runtime": 14.4364, + "eval_samples_per_second": 183.148, + "eval_steps_per_second": 22.928, + "step": 3000 + }, + { + "epoch": 2.0242098184263617, + "grad_norm": 0.7455950975418091, + "learning_rate": 3.515198290192354e-05, + "loss": 0.2406, + "step": 3010 + }, + { + "epoch": 2.03093476798924, + "grad_norm": 1.0238157510757446, + "learning_rate": 3.5092614580859655e-05, + "loss": 0.2556, + "step": 3020 + }, + { + "epoch": 2.0376597175521183, + "grad_norm": 0.7209094166755676, + "learning_rate": 3.5033246259795774e-05, + "loss": 0.2255, + "step": 3030 + }, + { + "epoch": 2.0443846671149966, + "grad_norm": 0.7461678981781006, + "learning_rate": 3.49738779387319e-05, + "loss": 0.1926, + "step": 3040 + }, + { + "epoch": 2.051109616677875, + "grad_norm": 0.7917340993881226, + "learning_rate": 3.491450961766801e-05, + "loss": 0.28, + "step": 3050 + }, + { + "epoch": 2.057834566240753, + "grad_norm": 0.7438961863517761, + "learning_rate": 3.4855141296604135e-05, + "loss": 0.1955, + "step": 3060 + }, + { + "epoch": 2.0645595158036314, + "grad_norm": 0.8378620147705078, + "learning_rate": 3.479577297554025e-05, + "loss": 0.1863, + "step": 3070 + }, + { + "epoch": 2.0712844653665097, + "grad_norm": 0.8809778690338135, + "learning_rate": 3.473640465447637e-05, + "loss": 0.2263, + "step": 3080 + }, + { + "epoch": 2.078009414929388, + "grad_norm": 0.6806073784828186, + "learning_rate": 3.4677036333412495e-05, + "loss": 0.1948, + "step": 3090 + }, + { + "epoch": 2.0847343644922662, + "grad_norm": 1.061790108680725, + "learning_rate": 3.4617668012348614e-05, + "loss": 0.2436, + "step": 3100 + }, + { + "epoch": 2.0914593140551445, + "grad_norm": 0.9499590992927551, + "learning_rate": 3.455829969128473e-05, + "loss": 0.2388, + "step": 3110 + }, + { + "epoch": 2.098184263618023, + "grad_norm": 0.8609486222267151, + "learning_rate": 3.449893137022085e-05, + "loss": 0.2675, + "step": 3120 + }, + { + "epoch": 2.104909213180901, + "grad_norm": 0.7025455236434937, + "learning_rate": 3.443956304915697e-05, + "loss": 0.2014, + "step": 3130 + }, + { + "epoch": 2.1116341627437794, + "grad_norm": 0.783028244972229, + "learning_rate": 3.438019472809309e-05, + "loss": 0.1954, + "step": 3140 + }, + { + "epoch": 2.1183591123066576, + "grad_norm": 0.767192542552948, + "learning_rate": 3.432082640702921e-05, + "loss": 0.2294, + "step": 3150 + }, + { + "epoch": 2.125084061869536, + "grad_norm": 1.0050126314163208, + "learning_rate": 3.426145808596533e-05, + "loss": 0.2217, + "step": 3160 + }, + { + "epoch": 2.131809011432414, + "grad_norm": 0.9885833263397217, + "learning_rate": 3.4202089764901454e-05, + "loss": 0.2299, + "step": 3170 + }, + { + "epoch": 2.1385339609952925, + "grad_norm": 0.959437906742096, + "learning_rate": 3.414272144383757e-05, + "loss": 0.1762, + "step": 3180 + }, + { + "epoch": 2.1452589105581708, + "grad_norm": 0.8083593249320984, + "learning_rate": 3.408335312277369e-05, + "loss": 0.1829, + "step": 3190 + }, + { + "epoch": 2.151983860121049, + "grad_norm": 1.081933856010437, + "learning_rate": 3.402398480170981e-05, + "loss": 0.2263, + "step": 3200 + }, + { + "epoch": 2.1587088096839273, + "grad_norm": 1.1069368124008179, + "learning_rate": 3.3964616480645926e-05, + "loss": 0.2492, + "step": 3210 + }, + { + "epoch": 2.1654337592468056, + "grad_norm": 0.8485414385795593, + "learning_rate": 3.390524815958205e-05, + "loss": 0.2201, + "step": 3220 + }, + { + "epoch": 2.172158708809684, + "grad_norm": 1.1156219244003296, + "learning_rate": 3.384587983851817e-05, + "loss": 0.2494, + "step": 3230 + }, + { + "epoch": 2.178883658372562, + "grad_norm": 0.8588371276855469, + "learning_rate": 3.378651151745429e-05, + "loss": 0.2784, + "step": 3240 + }, + { + "epoch": 2.1856086079354404, + "grad_norm": 0.9314185380935669, + "learning_rate": 3.372714319639041e-05, + "loss": 0.2127, + "step": 3250 + }, + { + "epoch": 2.1923335574983187, + "grad_norm": 1.0111339092254639, + "learning_rate": 3.366777487532653e-05, + "loss": 0.2082, + "step": 3260 + }, + { + "epoch": 2.199058507061197, + "grad_norm": 0.8263574242591858, + "learning_rate": 3.360840655426265e-05, + "loss": 0.2052, + "step": 3270 + }, + { + "epoch": 2.2057834566240753, + "grad_norm": 0.9010327458381653, + "learning_rate": 3.3549038233198766e-05, + "loss": 0.246, + "step": 3280 + }, + { + "epoch": 2.2125084061869535, + "grad_norm": 1.0510506629943848, + "learning_rate": 3.3489669912134884e-05, + "loss": 0.2138, + "step": 3290 + }, + { + "epoch": 2.219233355749832, + "grad_norm": 1.0803182125091553, + "learning_rate": 3.343030159107101e-05, + "loss": 0.2478, + "step": 3300 + }, + { + "epoch": 2.22595830531271, + "grad_norm": 0.7982410788536072, + "learning_rate": 3.337093327000713e-05, + "loss": 0.2249, + "step": 3310 + }, + { + "epoch": 2.2326832548755884, + "grad_norm": 0.7922587990760803, + "learning_rate": 3.3311564948943245e-05, + "loss": 0.2598, + "step": 3320 + }, + { + "epoch": 2.2394082044384667, + "grad_norm": 0.752295732498169, + "learning_rate": 3.325219662787937e-05, + "loss": 0.2036, + "step": 3330 + }, + { + "epoch": 2.246133154001345, + "grad_norm": 0.9120996594429016, + "learning_rate": 3.319282830681548e-05, + "loss": 0.2507, + "step": 3340 + }, + { + "epoch": 2.2528581035642232, + "grad_norm": 1.0284003019332886, + "learning_rate": 3.3133459985751606e-05, + "loss": 0.2191, + "step": 3350 + }, + { + "epoch": 2.2595830531271015, + "grad_norm": 0.6306409239768982, + "learning_rate": 3.3074091664687724e-05, + "loss": 0.2093, + "step": 3360 + }, + { + "epoch": 2.26630800268998, + "grad_norm": 0.8642512559890747, + "learning_rate": 3.301472334362384e-05, + "loss": 0.2246, + "step": 3370 + }, + { + "epoch": 2.273032952252858, + "grad_norm": 0.7505747079849243, + "learning_rate": 3.295535502255997e-05, + "loss": 0.1806, + "step": 3380 + }, + { + "epoch": 2.2797579018157363, + "grad_norm": 0.7727899551391602, + "learning_rate": 3.2901923533602466e-05, + "loss": 0.204, + "step": 3390 + }, + { + "epoch": 2.2864828513786146, + "grad_norm": 0.7602193355560303, + "learning_rate": 3.284255521253859e-05, + "loss": 0.243, + "step": 3400 + }, + { + "epoch": 2.293207800941493, + "grad_norm": 0.6114295125007629, + "learning_rate": 3.278318689147471e-05, + "loss": 0.2212, + "step": 3410 + }, + { + "epoch": 2.299932750504371, + "grad_norm": 0.8132253289222717, + "learning_rate": 3.272381857041083e-05, + "loss": 0.2095, + "step": 3420 + }, + { + "epoch": 2.3066577000672495, + "grad_norm": 1.0195751190185547, + "learning_rate": 3.266445024934695e-05, + "loss": 0.2549, + "step": 3430 + }, + { + "epoch": 2.3133826496301277, + "grad_norm": 0.8007863163948059, + "learning_rate": 3.260508192828307e-05, + "loss": 0.2355, + "step": 3440 + }, + { + "epoch": 2.320107599193006, + "grad_norm": 0.736564040184021, + "learning_rate": 3.254571360721919e-05, + "loss": 0.2084, + "step": 3450 + }, + { + "epoch": 2.3268325487558843, + "grad_norm": 0.7039320468902588, + "learning_rate": 3.2492282118261695e-05, + "loss": 0.187, + "step": 3460 + }, + { + "epoch": 2.3335574983187626, + "grad_norm": 0.7210522890090942, + "learning_rate": 3.243291379719782e-05, + "loss": 0.2183, + "step": 3470 + }, + { + "epoch": 2.340282447881641, + "grad_norm": 1.0226560831069946, + "learning_rate": 3.237354547613394e-05, + "loss": 0.1965, + "step": 3480 + }, + { + "epoch": 2.347007397444519, + "grad_norm": 0.9947569966316223, + "learning_rate": 3.2314177155070056e-05, + "loss": 0.283, + "step": 3490 + }, + { + "epoch": 2.3537323470073974, + "grad_norm": 0.68961101770401, + "learning_rate": 3.225480883400618e-05, + "loss": 0.1731, + "step": 3500 + }, + { + "epoch": 2.3537323470073974, + "eval_loss": 0.18062594532966614, + "eval_runtime": 14.5269, + "eval_samples_per_second": 182.007, + "eval_steps_per_second": 22.785, + "step": 3500 + }, + { + "epoch": 2.3604572965702757, + "grad_norm": 1.0577141046524048, + "learning_rate": 3.219544051294229e-05, + "loss": 0.2143, + "step": 3510 + }, + { + "epoch": 2.367182246133154, + "grad_norm": 0.7405422925949097, + "learning_rate": 3.2136072191878416e-05, + "loss": 0.1852, + "step": 3520 + }, + { + "epoch": 2.3739071956960323, + "grad_norm": 0.7497440576553345, + "learning_rate": 3.2076703870814535e-05, + "loss": 0.2103, + "step": 3530 + }, + { + "epoch": 2.3806321452589105, + "grad_norm": 1.0656136274337769, + "learning_rate": 3.201733554975065e-05, + "loss": 0.2264, + "step": 3540 + }, + { + "epoch": 2.387357094821789, + "grad_norm": 0.7102001905441284, + "learning_rate": 3.195796722868678e-05, + "loss": 0.212, + "step": 3550 + }, + { + "epoch": 2.394082044384667, + "grad_norm": 0.681854248046875, + "learning_rate": 3.1898598907622896e-05, + "loss": 0.2084, + "step": 3560 + }, + { + "epoch": 2.4008069939475454, + "grad_norm": 0.8357053995132446, + "learning_rate": 3.1839230586559014e-05, + "loss": 0.2552, + "step": 3570 + }, + { + "epoch": 2.4075319435104237, + "grad_norm": 0.9674760699272156, + "learning_rate": 3.177986226549513e-05, + "loss": 0.1648, + "step": 3580 + }, + { + "epoch": 2.414256893073302, + "grad_norm": 0.9936577081680298, + "learning_rate": 3.172049394443125e-05, + "loss": 0.281, + "step": 3590 + }, + { + "epoch": 2.42098184263618, + "grad_norm": 0.8406476974487305, + "learning_rate": 3.1661125623367375e-05, + "loss": 0.2074, + "step": 3600 + }, + { + "epoch": 2.4277067921990585, + "grad_norm": 0.6320979595184326, + "learning_rate": 3.160175730230349e-05, + "loss": 0.2265, + "step": 3610 + }, + { + "epoch": 2.4344317417619368, + "grad_norm": 1.0120488405227661, + "learning_rate": 3.154238898123961e-05, + "loss": 0.2178, + "step": 3620 + }, + { + "epoch": 2.441156691324815, + "grad_norm": 0.6850121021270752, + "learning_rate": 3.1483020660175736e-05, + "loss": 0.1898, + "step": 3630 + }, + { + "epoch": 2.4478816408876933, + "grad_norm": 0.7465779185295105, + "learning_rate": 3.1423652339111854e-05, + "loss": 0.2227, + "step": 3640 + }, + { + "epoch": 2.4546065904505716, + "grad_norm": 0.769567608833313, + "learning_rate": 3.136428401804797e-05, + "loss": 0.2123, + "step": 3650 + }, + { + "epoch": 2.46133154001345, + "grad_norm": 0.7620592713356018, + "learning_rate": 3.130491569698409e-05, + "loss": 0.2213, + "step": 3660 + }, + { + "epoch": 2.468056489576328, + "grad_norm": 0.6534302234649658, + "learning_rate": 3.124554737592021e-05, + "loss": 0.182, + "step": 3670 + }, + { + "epoch": 2.4747814391392065, + "grad_norm": 1.0276516675949097, + "learning_rate": 3.118617905485633e-05, + "loss": 0.2901, + "step": 3680 + }, + { + "epoch": 2.4815063887020847, + "grad_norm": 0.6156563758850098, + "learning_rate": 3.112681073379245e-05, + "loss": 0.195, + "step": 3690 + }, + { + "epoch": 2.488231338264963, + "grad_norm": 0.9594860672950745, + "learning_rate": 3.106744241272857e-05, + "loss": 0.1914, + "step": 3700 + }, + { + "epoch": 2.4949562878278413, + "grad_norm": 0.935619592666626, + "learning_rate": 3.1008074091664694e-05, + "loss": 0.1992, + "step": 3710 + }, + { + "epoch": 2.5016812373907196, + "grad_norm": 1.0625261068344116, + "learning_rate": 3.094870577060081e-05, + "loss": 0.1935, + "step": 3720 + }, + { + "epoch": 2.508406186953598, + "grad_norm": 0.7582225799560547, + "learning_rate": 3.088933744953692e-05, + "loss": 0.2272, + "step": 3730 + }, + { + "epoch": 2.515131136516476, + "grad_norm": 0.5857738852500916, + "learning_rate": 3.082996912847305e-05, + "loss": 0.1747, + "step": 3740 + }, + { + "epoch": 2.5218560860793544, + "grad_norm": 0.735443651676178, + "learning_rate": 3.0770600807409166e-05, + "loss": 0.2151, + "step": 3750 + }, + { + "epoch": 2.5285810356422327, + "grad_norm": 0.7247041463851929, + "learning_rate": 3.0711232486345284e-05, + "loss": 0.1893, + "step": 3760 + }, + { + "epoch": 2.535305985205111, + "grad_norm": 0.6171685457229614, + "learning_rate": 3.065186416528141e-05, + "loss": 0.2087, + "step": 3770 + }, + { + "epoch": 2.5420309347679892, + "grad_norm": 0.8305767178535461, + "learning_rate": 3.059249584421753e-05, + "loss": 0.2012, + "step": 3780 + }, + { + "epoch": 2.5487558843308675, + "grad_norm": 0.7687706351280212, + "learning_rate": 3.053312752315365e-05, + "loss": 0.1859, + "step": 3790 + }, + { + "epoch": 2.555480833893746, + "grad_norm": 0.7182943224906921, + "learning_rate": 3.047375920208977e-05, + "loss": 0.2004, + "step": 3800 + }, + { + "epoch": 2.562205783456624, + "grad_norm": 0.8720729947090149, + "learning_rate": 3.0414390881025884e-05, + "loss": 0.1964, + "step": 3810 + }, + { + "epoch": 2.5689307330195024, + "grad_norm": 0.7959709167480469, + "learning_rate": 3.0355022559962003e-05, + "loss": 0.21, + "step": 3820 + }, + { + "epoch": 2.5756556825823806, + "grad_norm": 0.7620719075202942, + "learning_rate": 3.0295654238898124e-05, + "loss": 0.2158, + "step": 3830 + }, + { + "epoch": 2.582380632145259, + "grad_norm": 0.6374610662460327, + "learning_rate": 3.0236285917834245e-05, + "loss": 0.1785, + "step": 3840 + }, + { + "epoch": 2.589105581708137, + "grad_norm": 0.9034950137138367, + "learning_rate": 3.0176917596770367e-05, + "loss": 0.2177, + "step": 3850 + }, + { + "epoch": 2.5958305312710155, + "grad_norm": 0.7911028861999512, + "learning_rate": 3.0117549275706485e-05, + "loss": 0.1848, + "step": 3860 + }, + { + "epoch": 2.6025554808338938, + "grad_norm": 0.7024137377738953, + "learning_rate": 3.0058180954642606e-05, + "loss": 0.1929, + "step": 3870 + }, + { + "epoch": 2.609280430396772, + "grad_norm": 0.7845777273178101, + "learning_rate": 2.999881263357872e-05, + "loss": 0.2002, + "step": 3880 + }, + { + "epoch": 2.6160053799596503, + "grad_norm": 0.8540692329406738, + "learning_rate": 2.9939444312514843e-05, + "loss": 0.2275, + "step": 3890 + }, + { + "epoch": 2.6227303295225286, + "grad_norm": 0.707000732421875, + "learning_rate": 2.988007599145096e-05, + "loss": 0.2423, + "step": 3900 + }, + { + "epoch": 2.629455279085407, + "grad_norm": 0.8943628668785095, + "learning_rate": 2.9820707670387082e-05, + "loss": 0.1889, + "step": 3910 + }, + { + "epoch": 2.636180228648285, + "grad_norm": 0.7365798354148865, + "learning_rate": 2.9761339349323204e-05, + "loss": 0.1885, + "step": 3920 + }, + { + "epoch": 2.6429051782111634, + "grad_norm": 0.8606659173965454, + "learning_rate": 2.9701971028259325e-05, + "loss": 0.2033, + "step": 3930 + }, + { + "epoch": 2.6496301277740417, + "grad_norm": 1.0965065956115723, + "learning_rate": 2.9642602707195443e-05, + "loss": 0.2234, + "step": 3940 + }, + { + "epoch": 2.65635507733692, + "grad_norm": 0.8194990754127502, + "learning_rate": 2.9583234386131564e-05, + "loss": 0.1973, + "step": 3950 + }, + { + "epoch": 2.6630800268997983, + "grad_norm": 0.9065597653388977, + "learning_rate": 2.952386606506768e-05, + "loss": 0.2189, + "step": 3960 + }, + { + "epoch": 2.6698049764626766, + "grad_norm": 0.7854435443878174, + "learning_rate": 2.94644977440038e-05, + "loss": 0.2179, + "step": 3970 + }, + { + "epoch": 2.676529926025555, + "grad_norm": 0.741438627243042, + "learning_rate": 2.940512942293992e-05, + "loss": 0.2321, + "step": 3980 + }, + { + "epoch": 2.683254875588433, + "grad_norm": 0.6715734601020813, + "learning_rate": 2.934576110187604e-05, + "loss": 0.2344, + "step": 3990 + }, + { + "epoch": 2.6899798251513114, + "grad_norm": 0.94840407371521, + "learning_rate": 2.928639278081216e-05, + "loss": 0.2516, + "step": 4000 + }, + { + "epoch": 2.6899798251513114, + "eval_loss": 0.17142708599567413, + "eval_runtime": 14.4655, + "eval_samples_per_second": 182.78, + "eval_steps_per_second": 22.882, + "step": 4000 + }, + { + "epoch": 2.6967047747141897, + "grad_norm": 1.161207914352417, + "learning_rate": 2.922702445974828e-05, + "loss": 0.1899, + "step": 4010 + }, + { + "epoch": 2.703429724277068, + "grad_norm": 1.0559875965118408, + "learning_rate": 2.91676561386844e-05, + "loss": 0.2161, + "step": 4020 + }, + { + "epoch": 2.7101546738399462, + "grad_norm": 1.0273319482803345, + "learning_rate": 2.9108287817620516e-05, + "loss": 0.2025, + "step": 4030 + }, + { + "epoch": 2.7168796234028245, + "grad_norm": 0.8442862033843994, + "learning_rate": 2.9048919496556637e-05, + "loss": 0.1655, + "step": 4040 + }, + { + "epoch": 2.723604572965703, + "grad_norm": 0.8517313003540039, + "learning_rate": 2.898955117549276e-05, + "loss": 0.2349, + "step": 4050 + }, + { + "epoch": 2.730329522528581, + "grad_norm": 0.7692966461181641, + "learning_rate": 2.8930182854428877e-05, + "loss": 0.2297, + "step": 4060 + }, + { + "epoch": 2.7370544720914594, + "grad_norm": 0.6031805276870728, + "learning_rate": 2.8870814533364998e-05, + "loss": 0.1809, + "step": 4070 + }, + { + "epoch": 2.7437794216543376, + "grad_norm": 0.9225636720657349, + "learning_rate": 2.881144621230112e-05, + "loss": 0.1832, + "step": 4080 + }, + { + "epoch": 2.750504371217216, + "grad_norm": 0.8155885338783264, + "learning_rate": 2.8752077891237238e-05, + "loss": 0.1758, + "step": 4090 + }, + { + "epoch": 2.757229320780094, + "grad_norm": 0.7248073220252991, + "learning_rate": 2.869270957017336e-05, + "loss": 0.1832, + "step": 4100 + }, + { + "epoch": 2.7639542703429725, + "grad_norm": 0.6624840497970581, + "learning_rate": 2.8633341249109474e-05, + "loss": 0.1891, + "step": 4110 + }, + { + "epoch": 2.7706792199058508, + "grad_norm": 0.4667346179485321, + "learning_rate": 2.8573972928045595e-05, + "loss": 0.21, + "step": 4120 + }, + { + "epoch": 2.777404169468729, + "grad_norm": 0.8075745105743408, + "learning_rate": 2.8514604606981717e-05, + "loss": 0.205, + "step": 4130 + }, + { + "epoch": 2.7841291190316073, + "grad_norm": 0.8244682550430298, + "learning_rate": 2.8455236285917835e-05, + "loss": 0.1878, + "step": 4140 + }, + { + "epoch": 2.7908540685944856, + "grad_norm": 0.8454658389091492, + "learning_rate": 2.8395867964853956e-05, + "loss": 0.1849, + "step": 4150 + }, + { + "epoch": 2.797579018157364, + "grad_norm": 0.8751473426818848, + "learning_rate": 2.8336499643790078e-05, + "loss": 0.2139, + "step": 4160 + }, + { + "epoch": 2.804303967720242, + "grad_norm": 0.8629645705223083, + "learning_rate": 2.8277131322726196e-05, + "loss": 0.1862, + "step": 4170 + }, + { + "epoch": 2.8110289172831204, + "grad_norm": 0.6888182759284973, + "learning_rate": 2.821776300166231e-05, + "loss": 0.2068, + "step": 4180 + }, + { + "epoch": 2.8177538668459987, + "grad_norm": 0.7422506809234619, + "learning_rate": 2.8158394680598432e-05, + "loss": 0.1572, + "step": 4190 + }, + { + "epoch": 2.824478816408877, + "grad_norm": 0.6318463683128357, + "learning_rate": 2.8099026359534553e-05, + "loss": 0.2644, + "step": 4200 + }, + { + "epoch": 2.8312037659717553, + "grad_norm": 0.81563800573349, + "learning_rate": 2.803965803847067e-05, + "loss": 0.2106, + "step": 4210 + }, + { + "epoch": 2.8379287155346336, + "grad_norm": 0.8201348781585693, + "learning_rate": 2.7980289717406793e-05, + "loss": 0.2318, + "step": 4220 + }, + { + "epoch": 2.844653665097512, + "grad_norm": 0.8415096402168274, + "learning_rate": 2.7920921396342914e-05, + "loss": 0.1859, + "step": 4230 + }, + { + "epoch": 2.85137861466039, + "grad_norm": 0.7930416464805603, + "learning_rate": 2.7861553075279036e-05, + "loss": 0.2404, + "step": 4240 + }, + { + "epoch": 2.8581035642232684, + "grad_norm": 0.7852760553359985, + "learning_rate": 2.7802184754215154e-05, + "loss": 0.1959, + "step": 4250 + }, + { + "epoch": 2.8648285137861467, + "grad_norm": 0.6727051138877869, + "learning_rate": 2.774281643315127e-05, + "loss": 0.1696, + "step": 4260 + }, + { + "epoch": 2.871553463349025, + "grad_norm": 0.7168065905570984, + "learning_rate": 2.768344811208739e-05, + "loss": 0.1896, + "step": 4270 + }, + { + "epoch": 2.8782784129119032, + "grad_norm": 0.6939849853515625, + "learning_rate": 2.762407979102351e-05, + "loss": 0.2072, + "step": 4280 + }, + { + "epoch": 2.8850033624747815, + "grad_norm": 1.2583644390106201, + "learning_rate": 2.756471146995963e-05, + "loss": 0.2281, + "step": 4290 + }, + { + "epoch": 2.89172831203766, + "grad_norm": 0.8161798119544983, + "learning_rate": 2.750534314889575e-05, + "loss": 0.1871, + "step": 4300 + }, + { + "epoch": 2.898453261600538, + "grad_norm": 0.7994545102119446, + "learning_rate": 2.7445974827831872e-05, + "loss": 0.1638, + "step": 4310 + }, + { + "epoch": 2.9051782111634163, + "grad_norm": 0.6566575169563293, + "learning_rate": 2.738660650676799e-05, + "loss": 0.1552, + "step": 4320 + }, + { + "epoch": 2.9119031607262946, + "grad_norm": 1.0358943939208984, + "learning_rate": 2.732723818570411e-05, + "loss": 0.1851, + "step": 4330 + }, + { + "epoch": 2.918628110289173, + "grad_norm": 0.6944819092750549, + "learning_rate": 2.7267869864640227e-05, + "loss": 0.1477, + "step": 4340 + }, + { + "epoch": 2.925353059852051, + "grad_norm": 0.88713139295578, + "learning_rate": 2.7208501543576348e-05, + "loss": 0.1927, + "step": 4350 + }, + { + "epoch": 2.9320780094149295, + "grad_norm": 0.8105150461196899, + "learning_rate": 2.714913322251247e-05, + "loss": 0.1958, + "step": 4360 + }, + { + "epoch": 2.9388029589778077, + "grad_norm": 0.6686846613883972, + "learning_rate": 2.7089764901448588e-05, + "loss": 0.1935, + "step": 4370 + }, + { + "epoch": 2.945527908540686, + "grad_norm": 0.9116830229759216, + "learning_rate": 2.703039658038471e-05, + "loss": 0.1814, + "step": 4380 + }, + { + "epoch": 2.9522528581035643, + "grad_norm": 0.5856004357337952, + "learning_rate": 2.697102825932083e-05, + "loss": 0.1746, + "step": 4390 + }, + { + "epoch": 2.9589778076664426, + "grad_norm": 0.8741239309310913, + "learning_rate": 2.691165993825695e-05, + "loss": 0.1816, + "step": 4400 + }, + { + "epoch": 2.965702757229321, + "grad_norm": 0.7971295714378357, + "learning_rate": 2.6852291617193063e-05, + "loss": 0.2379, + "step": 4410 + }, + { + "epoch": 2.972427706792199, + "grad_norm": 0.657518744468689, + "learning_rate": 2.6792923296129185e-05, + "loss": 0.2032, + "step": 4420 + }, + { + "epoch": 2.9791526563550774, + "grad_norm": 0.6854186058044434, + "learning_rate": 2.6733554975065306e-05, + "loss": 0.2078, + "step": 4430 + }, + { + "epoch": 2.9858776059179557, + "grad_norm": 0.9783342480659485, + "learning_rate": 2.6674186654001428e-05, + "loss": 0.1903, + "step": 4440 + }, + { + "epoch": 2.992602555480834, + "grad_norm": 0.8605186939239502, + "learning_rate": 2.6614818332937546e-05, + "loss": 0.2001, + "step": 4450 + }, + { + "epoch": 2.9993275050437123, + "grad_norm": 0.4880738854408264, + "learning_rate": 2.6555450011873667e-05, + "loss": 0.2122, + "step": 4460 + }, + { + "epoch": 3.0060524546065905, + "grad_norm": 0.7420046329498291, + "learning_rate": 2.649608169080979e-05, + "loss": 0.2074, + "step": 4470 + }, + { + "epoch": 3.012777404169469, + "grad_norm": 0.8112709522247314, + "learning_rate": 2.6436713369745903e-05, + "loss": 0.1728, + "step": 4480 + }, + { + "epoch": 3.019502353732347, + "grad_norm": 0.5927948355674744, + "learning_rate": 2.637734504868202e-05, + "loss": 0.1838, + "step": 4490 + }, + { + "epoch": 3.0262273032952254, + "grad_norm": 0.7261831164360046, + "learning_rate": 2.6317976727618143e-05, + "loss": 0.1431, + "step": 4500 + }, + { + "epoch": 3.0262273032952254, + "eval_loss": 0.16571946442127228, + "eval_runtime": 14.5789, + "eval_samples_per_second": 181.358, + "eval_steps_per_second": 22.704, + "step": 4500 + }, + { + "epoch": 3.0329522528581037, + "grad_norm": 0.6484703421592712, + "learning_rate": 2.6258608406554264e-05, + "loss": 0.1987, + "step": 4510 + }, + { + "epoch": 3.039677202420982, + "grad_norm": 0.7991098165512085, + "learning_rate": 2.6199240085490386e-05, + "loss": 0.2086, + "step": 4520 + }, + { + "epoch": 3.04640215198386, + "grad_norm": 0.7324103713035583, + "learning_rate": 2.6139871764426504e-05, + "loss": 0.1974, + "step": 4530 + }, + { + "epoch": 3.0531271015467385, + "grad_norm": 0.6129199266433716, + "learning_rate": 2.6080503443362625e-05, + "loss": 0.2471, + "step": 4540 + }, + { + "epoch": 3.0598520511096168, + "grad_norm": 0.7959029674530029, + "learning_rate": 2.6021135122298747e-05, + "loss": 0.1996, + "step": 4550 + }, + { + "epoch": 3.066577000672495, + "grad_norm": 0.8219399452209473, + "learning_rate": 2.596176680123486e-05, + "loss": 0.1978, + "step": 4560 + }, + { + "epoch": 3.0733019502353733, + "grad_norm": 0.7939678430557251, + "learning_rate": 2.590239848017098e-05, + "loss": 0.2257, + "step": 4570 + }, + { + "epoch": 3.0800268997982516, + "grad_norm": 0.5718218088150024, + "learning_rate": 2.58430301591071e-05, + "loss": 0.1844, + "step": 4580 + }, + { + "epoch": 3.08675184936113, + "grad_norm": 0.8061422109603882, + "learning_rate": 2.5783661838043222e-05, + "loss": 0.2007, + "step": 4590 + }, + { + "epoch": 3.093476798924008, + "grad_norm": 0.6342537999153137, + "learning_rate": 2.572429351697934e-05, + "loss": 0.1582, + "step": 4600 + }, + { + "epoch": 3.1002017484868865, + "grad_norm": 0.8753138780593872, + "learning_rate": 2.5664925195915462e-05, + "loss": 0.1767, + "step": 4610 + }, + { + "epoch": 3.1069266980497647, + "grad_norm": 0.5665940642356873, + "learning_rate": 2.5605556874851583e-05, + "loss": 0.2006, + "step": 4620 + }, + { + "epoch": 3.113651647612643, + "grad_norm": 0.7274763584136963, + "learning_rate": 2.5546188553787698e-05, + "loss": 0.197, + "step": 4630 + }, + { + "epoch": 3.1203765971755213, + "grad_norm": 0.9588426351547241, + "learning_rate": 2.548682023272382e-05, + "loss": 0.1797, + "step": 4640 + }, + { + "epoch": 3.1271015467383996, + "grad_norm": 0.7508196830749512, + "learning_rate": 2.5427451911659938e-05, + "loss": 0.1681, + "step": 4650 + }, + { + "epoch": 3.133826496301278, + "grad_norm": 0.9038055539131165, + "learning_rate": 2.536808359059606e-05, + "loss": 0.1689, + "step": 4660 + }, + { + "epoch": 3.140551445864156, + "grad_norm": 0.8027380108833313, + "learning_rate": 2.530871526953218e-05, + "loss": 0.2027, + "step": 4670 + }, + { + "epoch": 3.1472763954270344, + "grad_norm": 0.8716439008712769, + "learning_rate": 2.52493469484683e-05, + "loss": 0.2026, + "step": 4680 + }, + { + "epoch": 3.1540013449899127, + "grad_norm": 0.6943103075027466, + "learning_rate": 2.518997862740442e-05, + "loss": 0.1811, + "step": 4690 + }, + { + "epoch": 3.160726294552791, + "grad_norm": 0.6706501245498657, + "learning_rate": 2.5130610306340535e-05, + "loss": 0.1751, + "step": 4700 + }, + { + "epoch": 3.1674512441156693, + "grad_norm": 0.7474800944328308, + "learning_rate": 2.5071241985276656e-05, + "loss": 0.1824, + "step": 4710 + }, + { + "epoch": 3.1741761936785475, + "grad_norm": 0.7841882109642029, + "learning_rate": 2.5011873664212778e-05, + "loss": 0.1923, + "step": 4720 + }, + { + "epoch": 3.180901143241426, + "grad_norm": 0.7595344185829163, + "learning_rate": 2.4952505343148896e-05, + "loss": 0.1989, + "step": 4730 + }, + { + "epoch": 3.187626092804304, + "grad_norm": 0.7175992727279663, + "learning_rate": 2.4893137022085017e-05, + "loss": 0.2077, + "step": 4740 + }, + { + "epoch": 3.1943510423671824, + "grad_norm": 0.8540729880332947, + "learning_rate": 2.483376870102114e-05, + "loss": 0.2082, + "step": 4750 + }, + { + "epoch": 3.2010759919300606, + "grad_norm": 0.8834501504898071, + "learning_rate": 2.4774400379957257e-05, + "loss": 0.1883, + "step": 4760 + }, + { + "epoch": 3.207800941492939, + "grad_norm": 1.0018720626831055, + "learning_rate": 2.4715032058893375e-05, + "loss": 0.1939, + "step": 4770 + }, + { + "epoch": 3.214525891055817, + "grad_norm": 0.8706895709037781, + "learning_rate": 2.4655663737829496e-05, + "loss": 0.171, + "step": 4780 + }, + { + "epoch": 3.2212508406186955, + "grad_norm": 0.9072573781013489, + "learning_rate": 2.4596295416765618e-05, + "loss": 0.1868, + "step": 4790 + }, + { + "epoch": 3.2279757901815738, + "grad_norm": 0.7303891777992249, + "learning_rate": 2.4536927095701732e-05, + "loss": 0.1806, + "step": 4800 + }, + { + "epoch": 3.234700739744452, + "grad_norm": 0.7761268615722656, + "learning_rate": 2.4477558774637854e-05, + "loss": 0.1857, + "step": 4810 + }, + { + "epoch": 3.2414256893073303, + "grad_norm": 0.5720176100730896, + "learning_rate": 2.4418190453573975e-05, + "loss": 0.1915, + "step": 4820 + }, + { + "epoch": 3.2481506388702086, + "grad_norm": 0.7509124875068665, + "learning_rate": 2.4358822132510097e-05, + "loss": 0.1643, + "step": 4830 + }, + { + "epoch": 3.254875588433087, + "grad_norm": 0.963586688041687, + "learning_rate": 2.429945381144621e-05, + "loss": 0.1942, + "step": 4840 + }, + { + "epoch": 3.261600537995965, + "grad_norm": 0.688130259513855, + "learning_rate": 2.4240085490382333e-05, + "loss": 0.1746, + "step": 4850 + }, + { + "epoch": 3.2683254875588434, + "grad_norm": 0.6827595233917236, + "learning_rate": 2.4180717169318454e-05, + "loss": 0.1877, + "step": 4860 + }, + { + "epoch": 3.2750504371217217, + "grad_norm": 0.8125810027122498, + "learning_rate": 2.4121348848254572e-05, + "loss": 0.1902, + "step": 4870 + }, + { + "epoch": 3.2817753866846, + "grad_norm": 0.6463695764541626, + "learning_rate": 2.406198052719069e-05, + "loss": 0.2067, + "step": 4880 + }, + { + "epoch": 3.2885003362474783, + "grad_norm": 0.3192298412322998, + "learning_rate": 2.4002612206126812e-05, + "loss": 0.1757, + "step": 4890 + }, + { + "epoch": 3.2952252858103566, + "grad_norm": 0.9306578040122986, + "learning_rate": 2.3943243885062933e-05, + "loss": 0.1811, + "step": 4900 + }, + { + "epoch": 3.301950235373235, + "grad_norm": 1.080837607383728, + "learning_rate": 2.388387556399905e-05, + "loss": 0.2022, + "step": 4910 + }, + { + "epoch": 3.308675184936113, + "grad_norm": 0.6997596621513367, + "learning_rate": 2.382450724293517e-05, + "loss": 0.1729, + "step": 4920 + }, + { + "epoch": 3.3154001344989914, + "grad_norm": 0.6311511993408203, + "learning_rate": 2.376513892187129e-05, + "loss": 0.1493, + "step": 4930 + }, + { + "epoch": 3.3221250840618697, + "grad_norm": 0.7515596747398376, + "learning_rate": 2.3705770600807412e-05, + "loss": 0.1829, + "step": 4940 + }, + { + "epoch": 3.328850033624748, + "grad_norm": 0.6540161371231079, + "learning_rate": 2.364640227974353e-05, + "loss": 0.2014, + "step": 4950 + }, + { + "epoch": 3.3355749831876262, + "grad_norm": 0.6269936561584473, + "learning_rate": 2.358703395867965e-05, + "loss": 0.1896, + "step": 4960 + }, + { + "epoch": 3.3422999327505045, + "grad_norm": 0.871743381023407, + "learning_rate": 2.352766563761577e-05, + "loss": 0.1899, + "step": 4970 + }, + { + "epoch": 3.349024882313383, + "grad_norm": 0.6379810571670532, + "learning_rate": 2.3468297316551888e-05, + "loss": 0.1796, + "step": 4980 + }, + { + "epoch": 3.355749831876261, + "grad_norm": 0.6702184677124023, + "learning_rate": 2.340892899548801e-05, + "loss": 0.239, + "step": 4990 + }, + { + "epoch": 3.3624747814391394, + "grad_norm": 0.649553120136261, + "learning_rate": 2.3349560674424127e-05, + "loss": 0.1607, + "step": 5000 + }, + { + "epoch": 3.3624747814391394, + "eval_loss": 0.16000543534755707, + "eval_runtime": 14.3892, + "eval_samples_per_second": 183.75, + "eval_steps_per_second": 23.003, + "step": 5000 + }, + { + "epoch": 3.3691997310020176, + "grad_norm": 0.7338672280311584, + "learning_rate": 2.329019235336025e-05, + "loss": 0.1769, + "step": 5010 + }, + { + "epoch": 3.375924680564896, + "grad_norm": 0.9036253690719604, + "learning_rate": 2.3230824032296367e-05, + "loss": 0.1823, + "step": 5020 + }, + { + "epoch": 3.382649630127774, + "grad_norm": 0.8734234571456909, + "learning_rate": 2.317145571123249e-05, + "loss": 0.1667, + "step": 5030 + }, + { + "epoch": 3.3893745796906525, + "grad_norm": 0.9217347502708435, + "learning_rate": 2.3112087390168607e-05, + "loss": 0.1559, + "step": 5040 + }, + { + "epoch": 3.3960995292535308, + "grad_norm": 0.7761852741241455, + "learning_rate": 2.3052719069104728e-05, + "loss": 0.1663, + "step": 5050 + }, + { + "epoch": 3.402824478816409, + "grad_norm": 0.8161054849624634, + "learning_rate": 2.2993350748040846e-05, + "loss": 0.1958, + "step": 5060 + }, + { + "epoch": 3.4095494283792873, + "grad_norm": 0.8758025765419006, + "learning_rate": 2.2933982426976968e-05, + "loss": 0.1977, + "step": 5070 + }, + { + "epoch": 3.4162743779421656, + "grad_norm": 0.9359439611434937, + "learning_rate": 2.2874614105913086e-05, + "loss": 0.1823, + "step": 5080 + }, + { + "epoch": 3.422999327505044, + "grad_norm": 0.5680366158485413, + "learning_rate": 2.2815245784849207e-05, + "loss": 0.1989, + "step": 5090 + }, + { + "epoch": 3.429724277067922, + "grad_norm": 0.8269837498664856, + "learning_rate": 2.2755877463785325e-05, + "loss": 0.1878, + "step": 5100 + }, + { + "epoch": 3.4364492266308004, + "grad_norm": 0.6142411828041077, + "learning_rate": 2.2696509142721447e-05, + "loss": 0.2008, + "step": 5110 + }, + { + "epoch": 3.4431741761936787, + "grad_norm": 0.730137050151825, + "learning_rate": 2.2637140821657565e-05, + "loss": 0.184, + "step": 5120 + }, + { + "epoch": 3.449899125756557, + "grad_norm": 0.6891915798187256, + "learning_rate": 2.2577772500593683e-05, + "loss": 0.1677, + "step": 5130 + }, + { + "epoch": 3.4566240753194353, + "grad_norm": 0.7893393039703369, + "learning_rate": 2.2518404179529804e-05, + "loss": 0.2016, + "step": 5140 + }, + { + "epoch": 3.4633490248823136, + "grad_norm": 0.9286131262779236, + "learning_rate": 2.246497269057231e-05, + "loss": 0.2178, + "step": 5150 + }, + { + "epoch": 3.470073974445192, + "grad_norm": 0.9323815703392029, + "learning_rate": 2.2405604369508432e-05, + "loss": 0.16, + "step": 5160 + }, + { + "epoch": 3.47679892400807, + "grad_norm": 0.6662344336509705, + "learning_rate": 2.2346236048444553e-05, + "loss": 0.1703, + "step": 5170 + }, + { + "epoch": 3.4835238735709484, + "grad_norm": 0.6510077118873596, + "learning_rate": 2.228686772738067e-05, + "loss": 0.1631, + "step": 5180 + }, + { + "epoch": 3.4902488231338267, + "grad_norm": 0.7270941734313965, + "learning_rate": 2.222749940631679e-05, + "loss": 0.1555, + "step": 5190 + }, + { + "epoch": 3.496973772696705, + "grad_norm": 0.7204037308692932, + "learning_rate": 2.216813108525291e-05, + "loss": 0.2068, + "step": 5200 + }, + { + "epoch": 3.503698722259583, + "grad_norm": 0.4908240735530853, + "learning_rate": 2.2108762764189032e-05, + "loss": 0.1468, + "step": 5210 + }, + { + "epoch": 3.510423671822461, + "grad_norm": 0.9486116766929626, + "learning_rate": 2.2049394443125147e-05, + "loss": 0.156, + "step": 5220 + }, + { + "epoch": 3.5171486213853393, + "grad_norm": 0.8911501169204712, + "learning_rate": 2.199002612206127e-05, + "loss": 0.1853, + "step": 5230 + }, + { + "epoch": 3.5238735709482176, + "grad_norm": 0.7860977649688721, + "learning_rate": 2.193065780099739e-05, + "loss": 0.1714, + "step": 5240 + }, + { + "epoch": 3.530598520511096, + "grad_norm": 0.9978109002113342, + "learning_rate": 2.1871289479933508e-05, + "loss": 0.1569, + "step": 5250 + }, + { + "epoch": 3.537323470073974, + "grad_norm": 0.8278188109397888, + "learning_rate": 2.1811921158869626e-05, + "loss": 0.1977, + "step": 5260 + }, + { + "epoch": 3.5440484196368525, + "grad_norm": 0.75886470079422, + "learning_rate": 2.1752552837805748e-05, + "loss": 0.2145, + "step": 5270 + }, + { + "epoch": 3.5507733691997307, + "grad_norm": 0.608791172504425, + "learning_rate": 2.169318451674187e-05, + "loss": 0.1586, + "step": 5280 + }, + { + "epoch": 3.557498318762609, + "grad_norm": 0.8277237415313721, + "learning_rate": 2.1633816195677987e-05, + "loss": 0.213, + "step": 5290 + }, + { + "epoch": 3.5642232683254873, + "grad_norm": 0.6545449495315552, + "learning_rate": 2.1574447874614105e-05, + "loss": 0.1797, + "step": 5300 + }, + { + "epoch": 3.5709482178883656, + "grad_norm": 0.8395527005195618, + "learning_rate": 2.1515079553550227e-05, + "loss": 0.1776, + "step": 5310 + }, + { + "epoch": 3.577673167451244, + "grad_norm": 0.6380318403244019, + "learning_rate": 2.1455711232486348e-05, + "loss": 0.1342, + "step": 5320 + }, + { + "epoch": 3.584398117014122, + "grad_norm": 0.751994252204895, + "learning_rate": 2.1396342911422466e-05, + "loss": 0.1525, + "step": 5330 + }, + { + "epoch": 3.5911230665770004, + "grad_norm": 0.6051616668701172, + "learning_rate": 2.1336974590358584e-05, + "loss": 0.1682, + "step": 5340 + }, + { + "epoch": 3.5978480161398787, + "grad_norm": 0.8227441906929016, + "learning_rate": 2.1277606269294706e-05, + "loss": 0.2026, + "step": 5350 + }, + { + "epoch": 3.604572965702757, + "grad_norm": 0.6999370455741882, + "learning_rate": 2.1218237948230827e-05, + "loss": 0.1954, + "step": 5360 + }, + { + "epoch": 3.6112979152656353, + "grad_norm": 0.6701903343200684, + "learning_rate": 2.1158869627166945e-05, + "loss": 0.1559, + "step": 5370 + }, + { + "epoch": 3.6180228648285135, + "grad_norm": 1.0471676588058472, + "learning_rate": 2.1099501306103063e-05, + "loss": 0.1504, + "step": 5380 + }, + { + "epoch": 3.624747814391392, + "grad_norm": 0.8120239973068237, + "learning_rate": 2.1040132985039185e-05, + "loss": 0.1757, + "step": 5390 + }, + { + "epoch": 3.63147276395427, + "grad_norm": 0.7743443846702576, + "learning_rate": 2.0980764663975303e-05, + "loss": 0.2083, + "step": 5400 + }, + { + "epoch": 3.6381977135171484, + "grad_norm": 0.630879282951355, + "learning_rate": 2.0921396342911424e-05, + "loss": 0.1435, + "step": 5410 + }, + { + "epoch": 3.6449226630800267, + "grad_norm": 1.7099244594573975, + "learning_rate": 2.0862028021847542e-05, + "loss": 0.2272, + "step": 5420 + }, + { + "epoch": 3.651647612642905, + "grad_norm": 0.7191898226737976, + "learning_rate": 2.0802659700783664e-05, + "loss": 0.2123, + "step": 5430 + }, + { + "epoch": 3.658372562205783, + "grad_norm": 0.5158668160438538, + "learning_rate": 2.0743291379719782e-05, + "loss": 0.1714, + "step": 5440 + }, + { + "epoch": 3.6650975117686615, + "grad_norm": 0.591200590133667, + "learning_rate": 2.0683923058655903e-05, + "loss": 0.1435, + "step": 5450 + }, + { + "epoch": 3.6718224613315398, + "grad_norm": 0.6342896819114685, + "learning_rate": 2.062455473759202e-05, + "loss": 0.1615, + "step": 5460 + }, + { + "epoch": 3.678547410894418, + "grad_norm": 0.6707589030265808, + "learning_rate": 2.0565186416528143e-05, + "loss": 0.1646, + "step": 5470 + }, + { + "epoch": 3.6852723604572963, + "grad_norm": 0.9689118266105652, + "learning_rate": 2.050581809546426e-05, + "loss": 0.2129, + "step": 5480 + }, + { + "epoch": 3.6919973100201746, + "grad_norm": 0.7816817760467529, + "learning_rate": 2.0446449774400382e-05, + "loss": 0.1631, + "step": 5490 + }, + { + "epoch": 3.698722259583053, + "grad_norm": 0.7102280855178833, + "learning_rate": 2.03870814533365e-05, + "loss": 0.1895, + "step": 5500 + }, + { + "epoch": 3.698722259583053, + "eval_loss": 0.1554509997367859, + "eval_runtime": 14.4049, + "eval_samples_per_second": 183.549, + "eval_steps_per_second": 22.978, + "step": 5500 + }, + { + "epoch": 3.705447209145931, + "grad_norm": 0.8417555689811707, + "learning_rate": 2.0327713132272622e-05, + "loss": 0.2083, + "step": 5510 + }, + { + "epoch": 3.7121721587088095, + "grad_norm": 0.6126737594604492, + "learning_rate": 2.026834481120874e-05, + "loss": 0.1675, + "step": 5520 + }, + { + "epoch": 3.7188971082716877, + "grad_norm": 0.8237993717193604, + "learning_rate": 2.020897649014486e-05, + "loss": 0.1737, + "step": 5530 + }, + { + "epoch": 3.725622057834566, + "grad_norm": 0.531822144985199, + "learning_rate": 2.014960816908098e-05, + "loss": 0.1557, + "step": 5540 + }, + { + "epoch": 3.7323470073974443, + "grad_norm": 0.8391734957695007, + "learning_rate": 2.0090239848017097e-05, + "loss": 0.1905, + "step": 5550 + }, + { + "epoch": 3.7390719569603226, + "grad_norm": 0.7321840524673462, + "learning_rate": 2.003087152695322e-05, + "loss": 0.2007, + "step": 5560 + }, + { + "epoch": 3.745796906523201, + "grad_norm": 1.057217001914978, + "learning_rate": 1.997150320588934e-05, + "loss": 0.2366, + "step": 5570 + }, + { + "epoch": 3.752521856086079, + "grad_norm": 0.48040035367012024, + "learning_rate": 1.991213488482546e-05, + "loss": 0.1783, + "step": 5580 + }, + { + "epoch": 3.7592468056489574, + "grad_norm": 0.9027122855186462, + "learning_rate": 1.9852766563761576e-05, + "loss": 0.1978, + "step": 5590 + }, + { + "epoch": 3.7659717552118357, + "grad_norm": 0.8584364652633667, + "learning_rate": 1.9793398242697698e-05, + "loss": 0.1962, + "step": 5600 + }, + { + "epoch": 3.772696704774714, + "grad_norm": 0.8134893178939819, + "learning_rate": 1.9734029921633816e-05, + "loss": 0.1544, + "step": 5610 + }, + { + "epoch": 3.7794216543375923, + "grad_norm": 0.7096035480499268, + "learning_rate": 1.9674661600569937e-05, + "loss": 0.1601, + "step": 5620 + }, + { + "epoch": 3.7861466039004705, + "grad_norm": 0.7845041751861572, + "learning_rate": 1.9615293279506056e-05, + "loss": 0.2081, + "step": 5630 + }, + { + "epoch": 3.792871553463349, + "grad_norm": 0.5299544334411621, + "learning_rate": 1.9555924958442177e-05, + "loss": 0.1587, + "step": 5640 + }, + { + "epoch": 3.799596503026227, + "grad_norm": 0.5104948878288269, + "learning_rate": 1.9496556637378295e-05, + "loss": 0.2026, + "step": 5650 + }, + { + "epoch": 3.8063214525891054, + "grad_norm": 0.7379271984100342, + "learning_rate": 1.9437188316314416e-05, + "loss": 0.1811, + "step": 5660 + }, + { + "epoch": 3.8130464021519836, + "grad_norm": 0.686158299446106, + "learning_rate": 1.9377819995250535e-05, + "loss": 0.1743, + "step": 5670 + }, + { + "epoch": 3.819771351714862, + "grad_norm": 0.7175692915916443, + "learning_rate": 1.9318451674186656e-05, + "loss": 0.1751, + "step": 5680 + }, + { + "epoch": 3.82649630127774, + "grad_norm": 0.6327086687088013, + "learning_rate": 1.9259083353122774e-05, + "loss": 0.1471, + "step": 5690 + }, + { + "epoch": 3.8332212508406185, + "grad_norm": 0.8163675665855408, + "learning_rate": 1.9199715032058892e-05, + "loss": 0.1989, + "step": 5700 + }, + { + "epoch": 3.8399462004034968, + "grad_norm": 0.6851422190666199, + "learning_rate": 1.9140346710995014e-05, + "loss": 0.182, + "step": 5710 + }, + { + "epoch": 3.846671149966375, + "grad_norm": 0.9112792611122131, + "learning_rate": 1.9080978389931135e-05, + "loss": 0.2003, + "step": 5720 + }, + { + "epoch": 3.8533960995292533, + "grad_norm": 0.7452694773674011, + "learning_rate": 1.9021610068867253e-05, + "loss": 0.1823, + "step": 5730 + }, + { + "epoch": 3.8601210490921316, + "grad_norm": 0.8061573505401611, + "learning_rate": 1.896224174780337e-05, + "loss": 0.2005, + "step": 5740 + }, + { + "epoch": 3.86684599865501, + "grad_norm": 0.8620453476905823, + "learning_rate": 1.8902873426739493e-05, + "loss": 0.1951, + "step": 5750 + }, + { + "epoch": 3.873570948217888, + "grad_norm": 0.7951462864875793, + "learning_rate": 1.8843505105675614e-05, + "loss": 0.1879, + "step": 5760 + }, + { + "epoch": 3.8802958977807664, + "grad_norm": 0.787812352180481, + "learning_rate": 1.8784136784611732e-05, + "loss": 0.1486, + "step": 5770 + }, + { + "epoch": 3.8870208473436447, + "grad_norm": 0.8405473232269287, + "learning_rate": 1.872476846354785e-05, + "loss": 0.1864, + "step": 5780 + }, + { + "epoch": 3.893745796906523, + "grad_norm": 0.7713287472724915, + "learning_rate": 1.866540014248397e-05, + "loss": 0.1632, + "step": 5790 + }, + { + "epoch": 3.9004707464694013, + "grad_norm": 1.1021727323532104, + "learning_rate": 1.8606031821420093e-05, + "loss": 0.1763, + "step": 5800 + }, + { + "epoch": 3.9071956960322796, + "grad_norm": 0.9041099548339844, + "learning_rate": 1.8546663500356208e-05, + "loss": 0.177, + "step": 5810 + }, + { + "epoch": 3.913920645595158, + "grad_norm": 0.607937216758728, + "learning_rate": 1.848729517929233e-05, + "loss": 0.1498, + "step": 5820 + }, + { + "epoch": 3.920645595158036, + "grad_norm": 0.8985883593559265, + "learning_rate": 1.842792685822845e-05, + "loss": 0.1786, + "step": 5830 + }, + { + "epoch": 3.9273705447209144, + "grad_norm": 0.7832037210464478, + "learning_rate": 1.8368558537164572e-05, + "loss": 0.2394, + "step": 5840 + }, + { + "epoch": 3.9340954942837927, + "grad_norm": 0.8359716534614563, + "learning_rate": 1.8309190216100687e-05, + "loss": 0.1774, + "step": 5850 + }, + { + "epoch": 3.940820443846671, + "grad_norm": 0.7045271992683411, + "learning_rate": 1.824982189503681e-05, + "loss": 0.1667, + "step": 5860 + }, + { + "epoch": 3.9475453934095492, + "grad_norm": 0.48406705260276794, + "learning_rate": 1.819045357397293e-05, + "loss": 0.1927, + "step": 5870 + }, + { + "epoch": 3.9542703429724275, + "grad_norm": 0.6424108743667603, + "learning_rate": 1.813108525290905e-05, + "loss": 0.1612, + "step": 5880 + }, + { + "epoch": 3.960995292535306, + "grad_norm": 0.9854601621627808, + "learning_rate": 1.8071716931845166e-05, + "loss": 0.1902, + "step": 5890 + }, + { + "epoch": 3.967720242098184, + "grad_norm": 0.596906304359436, + "learning_rate": 1.8012348610781287e-05, + "loss": 0.1659, + "step": 5900 + }, + { + "epoch": 3.9744451916610624, + "grad_norm": 0.5105352401733398, + "learning_rate": 1.795298028971741e-05, + "loss": 0.1498, + "step": 5910 + }, + { + "epoch": 3.9811701412239406, + "grad_norm": 0.7021915316581726, + "learning_rate": 1.789361196865353e-05, + "loss": 0.1908, + "step": 5920 + }, + { + "epoch": 3.987895090786819, + "grad_norm": 0.6894943714141846, + "learning_rate": 1.7834243647589645e-05, + "loss": 0.1869, + "step": 5930 + }, + { + "epoch": 3.994620040349697, + "grad_norm": 0.6956019997596741, + "learning_rate": 1.7774875326525766e-05, + "loss": 0.1761, + "step": 5940 + }, + { + "epoch": 4.0013449899125755, + "grad_norm": 0.7093386650085449, + "learning_rate": 1.7715507005461888e-05, + "loss": 0.1433, + "step": 5950 + }, + { + "epoch": 4.008069939475454, + "grad_norm": 0.8302282094955444, + "learning_rate": 1.7656138684398006e-05, + "loss": 0.1663, + "step": 5960 + }, + { + "epoch": 4.014794889038332, + "grad_norm": 0.7187122702598572, + "learning_rate": 1.7596770363334124e-05, + "loss": 0.1556, + "step": 5970 + }, + { + "epoch": 4.02151983860121, + "grad_norm": 0.7609972357749939, + "learning_rate": 1.7537402042270245e-05, + "loss": 0.2126, + "step": 5980 + }, + { + "epoch": 4.028244788164089, + "grad_norm": 0.7078454494476318, + "learning_rate": 1.7478033721206367e-05, + "loss": 0.178, + "step": 5990 + }, + { + "epoch": 4.034969737726967, + "grad_norm": 0.7371071577072144, + "learning_rate": 1.7418665400142485e-05, + "loss": 0.1715, + "step": 6000 + }, + { + "epoch": 4.034969737726967, + "eval_loss": 0.15210556983947754, + "eval_runtime": 14.4339, + "eval_samples_per_second": 183.18, + "eval_steps_per_second": 22.932, + "step": 6000 + }, + { + "epoch": 4.041694687289845, + "grad_norm": 0.9265919327735901, + "learning_rate": 1.7359297079078603e-05, + "loss": 0.1895, + "step": 6010 + }, + { + "epoch": 4.048419636852723, + "grad_norm": 0.5793859362602234, + "learning_rate": 1.7299928758014724e-05, + "loss": 0.1478, + "step": 6020 + }, + { + "epoch": 4.055144586415602, + "grad_norm": 0.622623085975647, + "learning_rate": 1.7240560436950846e-05, + "loss": 0.1858, + "step": 6030 + }, + { + "epoch": 4.06186953597848, + "grad_norm": 0.9503956437110901, + "learning_rate": 1.7181192115886964e-05, + "loss": 0.1935, + "step": 6040 + }, + { + "epoch": 4.068594485541358, + "grad_norm": 0.6633917689323425, + "learning_rate": 1.7121823794823082e-05, + "loss": 0.1561, + "step": 6050 + }, + { + "epoch": 4.0753194351042366, + "grad_norm": 0.7264837622642517, + "learning_rate": 1.7062455473759204e-05, + "loss": 0.1657, + "step": 6060 + }, + { + "epoch": 4.082044384667115, + "grad_norm": 0.739024817943573, + "learning_rate": 1.7003087152695325e-05, + "loss": 0.1573, + "step": 6070 + }, + { + "epoch": 4.088769334229993, + "grad_norm": 0.7400847673416138, + "learning_rate": 1.6943718831631443e-05, + "loss": 0.1804, + "step": 6080 + }, + { + "epoch": 4.095494283792871, + "grad_norm": 0.6965304017066956, + "learning_rate": 1.688435051056756e-05, + "loss": 0.1962, + "step": 6090 + }, + { + "epoch": 4.10221923335575, + "grad_norm": 0.6275157928466797, + "learning_rate": 1.6824982189503683e-05, + "loss": 0.1547, + "step": 6100 + }, + { + "epoch": 4.108944182918628, + "grad_norm": 0.677739679813385, + "learning_rate": 1.67656138684398e-05, + "loss": 0.1491, + "step": 6110 + }, + { + "epoch": 4.115669132481506, + "grad_norm": 0.8116381764411926, + "learning_rate": 1.6706245547375922e-05, + "loss": 0.1611, + "step": 6120 + }, + { + "epoch": 4.1223940820443845, + "grad_norm": 0.809904158115387, + "learning_rate": 1.664687722631204e-05, + "loss": 0.2239, + "step": 6130 + }, + { + "epoch": 4.129119031607263, + "grad_norm": 0.7069700956344604, + "learning_rate": 1.658750890524816e-05, + "loss": 0.1372, + "step": 6140 + }, + { + "epoch": 4.135843981170141, + "grad_norm": 0.8203169703483582, + "learning_rate": 1.652814058418428e-05, + "loss": 0.1438, + "step": 6150 + }, + { + "epoch": 4.142568930733019, + "grad_norm": 0.8196160197257996, + "learning_rate": 1.64687722631204e-05, + "loss": 0.1926, + "step": 6160 + }, + { + "epoch": 4.149293880295898, + "grad_norm": 0.8232800364494324, + "learning_rate": 1.640940394205652e-05, + "loss": 0.1669, + "step": 6170 + }, + { + "epoch": 4.156018829858776, + "grad_norm": 0.5236471891403198, + "learning_rate": 1.635003562099264e-05, + "loss": 0.1396, + "step": 6180 + }, + { + "epoch": 4.162743779421654, + "grad_norm": 0.8701181411743164, + "learning_rate": 1.629066729992876e-05, + "loss": 0.1826, + "step": 6190 + }, + { + "epoch": 4.1694687289845325, + "grad_norm": 0.7005910277366638, + "learning_rate": 1.6231298978864877e-05, + "loss": 0.1656, + "step": 6200 + }, + { + "epoch": 4.176193678547411, + "grad_norm": 0.8140985369682312, + "learning_rate": 1.6171930657800998e-05, + "loss": 0.1595, + "step": 6210 + }, + { + "epoch": 4.182918628110289, + "grad_norm": 0.7422236204147339, + "learning_rate": 1.6112562336737116e-05, + "loss": 0.158, + "step": 6220 + }, + { + "epoch": 4.189643577673167, + "grad_norm": 0.7477454543113708, + "learning_rate": 1.6053194015673238e-05, + "loss": 0.1812, + "step": 6230 + }, + { + "epoch": 4.196368527236046, + "grad_norm": 0.897112250328064, + "learning_rate": 1.5993825694609356e-05, + "loss": 0.1797, + "step": 6240 + }, + { + "epoch": 4.203093476798924, + "grad_norm": 0.5914967656135559, + "learning_rate": 1.5934457373545477e-05, + "loss": 0.1567, + "step": 6250 + }, + { + "epoch": 4.209818426361802, + "grad_norm": 0.8895167112350464, + "learning_rate": 1.5875089052481595e-05, + "loss": 0.1777, + "step": 6260 + }, + { + "epoch": 4.21654337592468, + "grad_norm": 0.9183871150016785, + "learning_rate": 1.5815720731417717e-05, + "loss": 0.1546, + "step": 6270 + }, + { + "epoch": 4.223268325487559, + "grad_norm": 0.7540081739425659, + "learning_rate": 1.5756352410353835e-05, + "loss": 0.1457, + "step": 6280 + }, + { + "epoch": 4.229993275050437, + "grad_norm": 0.7206098437309265, + "learning_rate": 1.5696984089289956e-05, + "loss": 0.1653, + "step": 6290 + }, + { + "epoch": 4.236718224613315, + "grad_norm": 0.7160133719444275, + "learning_rate": 1.5637615768226074e-05, + "loss": 0.1619, + "step": 6300 + }, + { + "epoch": 4.2434431741761935, + "grad_norm": 0.7305388450622559, + "learning_rate": 1.5578247447162196e-05, + "loss": 0.1402, + "step": 6310 + }, + { + "epoch": 4.250168123739072, + "grad_norm": 0.8916037082672119, + "learning_rate": 1.5518879126098314e-05, + "loss": 0.1731, + "step": 6320 + }, + { + "epoch": 4.25689307330195, + "grad_norm": 0.9328885078430176, + "learning_rate": 1.5459510805034435e-05, + "loss": 0.2115, + "step": 6330 + }, + { + "epoch": 4.263618022864828, + "grad_norm": 0.667841374874115, + "learning_rate": 1.5400142483970553e-05, + "loss": 0.1815, + "step": 6340 + }, + { + "epoch": 4.270342972427707, + "grad_norm": 0.7957150340080261, + "learning_rate": 1.5340774162906675e-05, + "loss": 0.1469, + "step": 6350 + }, + { + "epoch": 4.277067921990585, + "grad_norm": 0.4576648771762848, + "learning_rate": 1.5281405841842793e-05, + "loss": 0.1527, + "step": 6360 + }, + { + "epoch": 4.283792871553463, + "grad_norm": 0.6559432148933411, + "learning_rate": 1.5222037520778911e-05, + "loss": 0.1891, + "step": 6370 + }, + { + "epoch": 4.2905178211163415, + "grad_norm": 0.8562697768211365, + "learning_rate": 1.5162669199715032e-05, + "loss": 0.1845, + "step": 6380 + }, + { + "epoch": 4.29724277067922, + "grad_norm": 0.6783193945884705, + "learning_rate": 1.5103300878651152e-05, + "loss": 0.1537, + "step": 6390 + }, + { + "epoch": 4.303967720242098, + "grad_norm": 0.5136592388153076, + "learning_rate": 1.5043932557587274e-05, + "loss": 0.1328, + "step": 6400 + }, + { + "epoch": 4.310692669804976, + "grad_norm": 0.5365772843360901, + "learning_rate": 1.498456423652339e-05, + "loss": 0.1574, + "step": 6410 + }, + { + "epoch": 4.317417619367855, + "grad_norm": 0.7236724495887756, + "learning_rate": 1.4925195915459512e-05, + "loss": 0.1378, + "step": 6420 + }, + { + "epoch": 4.324142568930733, + "grad_norm": 0.6155509948730469, + "learning_rate": 1.4865827594395631e-05, + "loss": 0.1645, + "step": 6430 + }, + { + "epoch": 4.330867518493611, + "grad_norm": 1.0150562524795532, + "learning_rate": 1.4806459273331753e-05, + "loss": 0.1757, + "step": 6440 + }, + { + "epoch": 4.3375924680564895, + "grad_norm": 0.5872318148612976, + "learning_rate": 1.4747090952267869e-05, + "loss": 0.185, + "step": 6450 + }, + { + "epoch": 4.344317417619368, + "grad_norm": 0.6680309772491455, + "learning_rate": 1.468772263120399e-05, + "loss": 0.1732, + "step": 6460 + }, + { + "epoch": 4.351042367182246, + "grad_norm": 0.6384024024009705, + "learning_rate": 1.462835431014011e-05, + "loss": 0.1821, + "step": 6470 + }, + { + "epoch": 4.357767316745124, + "grad_norm": 0.8072052001953125, + "learning_rate": 1.456898598907623e-05, + "loss": 0.1928, + "step": 6480 + }, + { + "epoch": 4.364492266308003, + "grad_norm": 0.5495839715003967, + "learning_rate": 1.4509617668012348e-05, + "loss": 0.1674, + "step": 6490 + }, + { + "epoch": 4.371217215870881, + "grad_norm": 0.9522832632064819, + "learning_rate": 1.445024934694847e-05, + "loss": 0.1604, + "step": 6500 + }, + { + "epoch": 4.371217215870881, + "eval_loss": 0.1505712866783142, + "eval_runtime": 14.3577, + "eval_samples_per_second": 184.152, + "eval_steps_per_second": 23.054, + "step": 6500 + }, + { + "epoch": 4.377942165433759, + "grad_norm": 0.8312244415283203, + "learning_rate": 1.439088102588459e-05, + "loss": 0.1754, + "step": 6510 + }, + { + "epoch": 4.384667114996637, + "grad_norm": 0.6350352764129639, + "learning_rate": 1.4331512704820707e-05, + "loss": 0.1438, + "step": 6520 + }, + { + "epoch": 4.391392064559516, + "grad_norm": 0.8806383609771729, + "learning_rate": 1.4272144383756827e-05, + "loss": 0.1789, + "step": 6530 + }, + { + "epoch": 4.398117014122394, + "grad_norm": 0.6274639964103699, + "learning_rate": 1.4212776062692949e-05, + "loss": 0.1364, + "step": 6540 + }, + { + "epoch": 4.404841963685272, + "grad_norm": 0.6913734674453735, + "learning_rate": 1.4153407741629068e-05, + "loss": 0.1934, + "step": 6550 + }, + { + "epoch": 4.4115669132481505, + "grad_norm": 0.6295379400253296, + "learning_rate": 1.4094039420565186e-05, + "loss": 0.1629, + "step": 6560 + }, + { + "epoch": 4.418291862811029, + "grad_norm": 0.5869171023368835, + "learning_rate": 1.4040607931607694e-05, + "loss": 0.1667, + "step": 6570 + }, + { + "epoch": 4.425016812373907, + "grad_norm": 0.9569641351699829, + "learning_rate": 1.3981239610543814e-05, + "loss": 0.1467, + "step": 6580 + }, + { + "epoch": 4.431741761936785, + "grad_norm": 1.2260146141052246, + "learning_rate": 1.3921871289479934e-05, + "loss": 0.1812, + "step": 6590 + }, + { + "epoch": 4.438466711499664, + "grad_norm": 0.7992416620254517, + "learning_rate": 1.3862502968416055e-05, + "loss": 0.1325, + "step": 6600 + }, + { + "epoch": 4.445191661062542, + "grad_norm": 0.5538522601127625, + "learning_rate": 1.3803134647352173e-05, + "loss": 0.2007, + "step": 6610 + }, + { + "epoch": 4.45191661062542, + "grad_norm": 0.6790997982025146, + "learning_rate": 1.3743766326288293e-05, + "loss": 0.1648, + "step": 6620 + }, + { + "epoch": 4.4586415601882985, + "grad_norm": 0.7720995545387268, + "learning_rate": 1.3684398005224413e-05, + "loss": 0.1498, + "step": 6630 + }, + { + "epoch": 4.465366509751177, + "grad_norm": 0.7530688643455505, + "learning_rate": 1.3625029684160531e-05, + "loss": 0.1279, + "step": 6640 + }, + { + "epoch": 4.472091459314055, + "grad_norm": 0.8694820404052734, + "learning_rate": 1.3565661363096653e-05, + "loss": 0.1605, + "step": 6650 + }, + { + "epoch": 4.478816408876933, + "grad_norm": 0.6193166971206665, + "learning_rate": 1.3506293042032772e-05, + "loss": 0.195, + "step": 6660 + }, + { + "epoch": 4.485541358439812, + "grad_norm": 0.7043260931968689, + "learning_rate": 1.3446924720968892e-05, + "loss": 0.1727, + "step": 6670 + }, + { + "epoch": 4.49226630800269, + "grad_norm": 0.9062890410423279, + "learning_rate": 1.338755639990501e-05, + "loss": 0.1667, + "step": 6680 + }, + { + "epoch": 4.498991257565568, + "grad_norm": 0.7411687970161438, + "learning_rate": 1.332818807884113e-05, + "loss": 0.1906, + "step": 6690 + }, + { + "epoch": 4.5057162071284464, + "grad_norm": 0.8305112719535828, + "learning_rate": 1.3268819757777251e-05, + "loss": 0.1809, + "step": 6700 + }, + { + "epoch": 4.512441156691325, + "grad_norm": 0.6523670554161072, + "learning_rate": 1.3209451436713371e-05, + "loss": 0.2148, + "step": 6710 + }, + { + "epoch": 4.519166106254203, + "grad_norm": 0.8279176354408264, + "learning_rate": 1.3150083115649489e-05, + "loss": 0.1497, + "step": 6720 + }, + { + "epoch": 4.525891055817081, + "grad_norm": 0.4953828454017639, + "learning_rate": 1.3090714794585609e-05, + "loss": 0.1579, + "step": 6730 + }, + { + "epoch": 4.53261600537996, + "grad_norm": 0.9850824475288391, + "learning_rate": 1.303134647352173e-05, + "loss": 0.1906, + "step": 6740 + }, + { + "epoch": 4.539340954942838, + "grad_norm": 0.872776985168457, + "learning_rate": 1.297197815245785e-05, + "loss": 0.1642, + "step": 6750 + }, + { + "epoch": 4.546065904505716, + "grad_norm": 0.548650860786438, + "learning_rate": 1.2912609831393968e-05, + "loss": 0.145, + "step": 6760 + }, + { + "epoch": 4.552790854068594, + "grad_norm": 0.5439342856407166, + "learning_rate": 1.2853241510330088e-05, + "loss": 0.1569, + "step": 6770 + }, + { + "epoch": 4.559515803631473, + "grad_norm": 0.543531596660614, + "learning_rate": 1.279387318926621e-05, + "loss": 0.1542, + "step": 6780 + }, + { + "epoch": 4.566240753194351, + "grad_norm": 0.5698922276496887, + "learning_rate": 1.2734504868202326e-05, + "loss": 0.1577, + "step": 6790 + }, + { + "epoch": 4.572965702757229, + "grad_norm": 1.2096151113510132, + "learning_rate": 1.2675136547138447e-05, + "loss": 0.1717, + "step": 6800 + }, + { + "epoch": 4.5796906523201075, + "grad_norm": 0.664527416229248, + "learning_rate": 1.2615768226074567e-05, + "loss": 0.1918, + "step": 6810 + }, + { + "epoch": 4.586415601882986, + "grad_norm": 1.1124138832092285, + "learning_rate": 1.2556399905010688e-05, + "loss": 0.1684, + "step": 6820 + }, + { + "epoch": 4.593140551445864, + "grad_norm": 0.7068644165992737, + "learning_rate": 1.2497031583946807e-05, + "loss": 0.1421, + "step": 6830 + }, + { + "epoch": 4.599865501008742, + "grad_norm": 0.8015205264091492, + "learning_rate": 1.2437663262882926e-05, + "loss": 0.1561, + "step": 6840 + }, + { + "epoch": 4.606590450571621, + "grad_norm": 0.7207475304603577, + "learning_rate": 1.2378294941819046e-05, + "loss": 0.156, + "step": 6850 + }, + { + "epoch": 4.613315400134499, + "grad_norm": 0.8778402209281921, + "learning_rate": 1.2318926620755166e-05, + "loss": 0.17, + "step": 6860 + }, + { + "epoch": 4.620040349697377, + "grad_norm": 0.8167802095413208, + "learning_rate": 1.2259558299691286e-05, + "loss": 0.188, + "step": 6870 + }, + { + "epoch": 4.6267652992602555, + "grad_norm": 0.6325931549072266, + "learning_rate": 1.2200189978627405e-05, + "loss": 0.1491, + "step": 6880 + }, + { + "epoch": 4.633490248823134, + "grad_norm": 0.8713521957397461, + "learning_rate": 1.2140821657563525e-05, + "loss": 0.1405, + "step": 6890 + }, + { + "epoch": 4.640215198386012, + "grad_norm": 0.617329478263855, + "learning_rate": 1.2081453336499645e-05, + "loss": 0.1627, + "step": 6900 + }, + { + "epoch": 4.64694014794889, + "grad_norm": 0.683966338634491, + "learning_rate": 1.2022085015435763e-05, + "loss": 0.1395, + "step": 6910 + }, + { + "epoch": 4.653665097511769, + "grad_norm": 0.7148402333259583, + "learning_rate": 1.1962716694371884e-05, + "loss": 0.1865, + "step": 6920 + }, + { + "epoch": 4.660390047074647, + "grad_norm": 0.616771399974823, + "learning_rate": 1.1903348373308002e-05, + "loss": 0.1566, + "step": 6930 + }, + { + "epoch": 4.667114996637525, + "grad_norm": 0.6236411333084106, + "learning_rate": 1.1843980052244124e-05, + "loss": 0.1986, + "step": 6940 + }, + { + "epoch": 4.673839946200403, + "grad_norm": 0.8944858312606812, + "learning_rate": 1.1784611731180242e-05, + "loss": 0.1612, + "step": 6950 + }, + { + "epoch": 4.680564895763282, + "grad_norm": 0.7702111601829529, + "learning_rate": 1.1725243410116363e-05, + "loss": 0.1892, + "step": 6960 + }, + { + "epoch": 4.68728984532616, + "grad_norm": 0.6784073710441589, + "learning_rate": 1.1665875089052481e-05, + "loss": 0.2, + "step": 6970 + }, + { + "epoch": 4.694014794889038, + "grad_norm": 1.0141595602035522, + "learning_rate": 1.1606506767988603e-05, + "loss": 0.1485, + "step": 6980 + }, + { + "epoch": 4.700739744451917, + "grad_norm": 0.7536572813987732, + "learning_rate": 1.1547138446924721e-05, + "loss": 0.1602, + "step": 6990 + }, + { + "epoch": 4.707464694014795, + "grad_norm": 0.7322705984115601, + "learning_rate": 1.1487770125860842e-05, + "loss": 0.1422, + "step": 7000 + }, + { + "epoch": 4.707464694014795, + "eval_loss": 0.1484505832195282, + "eval_runtime": 14.4562, + "eval_samples_per_second": 182.897, + "eval_steps_per_second": 22.897, + "step": 7000 + }, + { + "epoch": 4.714189643577673, + "grad_norm": 0.7850281000137329, + "learning_rate": 1.142840180479696e-05, + "loss": 0.1565, + "step": 7010 + }, + { + "epoch": 4.720914593140551, + "grad_norm": 0.656444787979126, + "learning_rate": 1.1369033483733082e-05, + "loss": 0.1388, + "step": 7020 + }, + { + "epoch": 4.72763954270343, + "grad_norm": 0.8533453345298767, + "learning_rate": 1.13096651626692e-05, + "loss": 0.2173, + "step": 7030 + }, + { + "epoch": 4.734364492266308, + "grad_norm": 0.5954863429069519, + "learning_rate": 1.125029684160532e-05, + "loss": 0.1494, + "step": 7040 + }, + { + "epoch": 4.741089441829186, + "grad_norm": 0.7292872667312622, + "learning_rate": 1.119092852054144e-05, + "loss": 0.1808, + "step": 7050 + }, + { + "epoch": 4.7478143913920645, + "grad_norm": 0.5989605784416199, + "learning_rate": 1.113156019947756e-05, + "loss": 0.1474, + "step": 7060 + }, + { + "epoch": 4.754539340954943, + "grad_norm": 0.870201826095581, + "learning_rate": 1.1072191878413679e-05, + "loss": 0.1767, + "step": 7070 + }, + { + "epoch": 4.761264290517821, + "grad_norm": 0.7545794248580933, + "learning_rate": 1.1012823557349799e-05, + "loss": 0.1726, + "step": 7080 + }, + { + "epoch": 4.767989240080699, + "grad_norm": 0.5619848370552063, + "learning_rate": 1.0953455236285919e-05, + "loss": 0.154, + "step": 7090 + }, + { + "epoch": 4.774714189643578, + "grad_norm": 0.6141572594642639, + "learning_rate": 1.0894086915222038e-05, + "loss": 0.1506, + "step": 7100 + }, + { + "epoch": 4.781439139206456, + "grad_norm": 0.5468729734420776, + "learning_rate": 1.0834718594158158e-05, + "loss": 0.1809, + "step": 7110 + }, + { + "epoch": 4.788164088769334, + "grad_norm": 0.7549372315406799, + "learning_rate": 1.0775350273094278e-05, + "loss": 0.1434, + "step": 7120 + }, + { + "epoch": 4.7948890383322125, + "grad_norm": 0.5954113602638245, + "learning_rate": 1.0715981952030398e-05, + "loss": 0.1773, + "step": 7130 + }, + { + "epoch": 4.801613987895091, + "grad_norm": 0.9040542840957642, + "learning_rate": 1.0656613630966517e-05, + "loss": 0.1802, + "step": 7140 + }, + { + "epoch": 4.808338937457969, + "grad_norm": 0.8996601700782776, + "learning_rate": 1.0597245309902637e-05, + "loss": 0.1771, + "step": 7150 + }, + { + "epoch": 4.815063887020847, + "grad_norm": 0.5393683910369873, + "learning_rate": 1.0537876988838755e-05, + "loss": 0.1463, + "step": 7160 + }, + { + "epoch": 4.821788836583726, + "grad_norm": 0.5825424194335938, + "learning_rate": 1.0478508667774877e-05, + "loss": 0.17, + "step": 7170 + }, + { + "epoch": 4.828513786146604, + "grad_norm": 0.7498705983161926, + "learning_rate": 1.0419140346710995e-05, + "loss": 0.1753, + "step": 7180 + }, + { + "epoch": 4.835238735709482, + "grad_norm": 0.6035088300704956, + "learning_rate": 1.0359772025647114e-05, + "loss": 0.1459, + "step": 7190 + }, + { + "epoch": 4.84196368527236, + "grad_norm": 0.7723816633224487, + "learning_rate": 1.0300403704583234e-05, + "loss": 0.1585, + "step": 7200 + }, + { + "epoch": 4.848688634835239, + "grad_norm": 0.6356052160263062, + "learning_rate": 1.0241035383519354e-05, + "loss": 0.1658, + "step": 7210 + }, + { + "epoch": 4.855413584398117, + "grad_norm": 0.9170238971710205, + "learning_rate": 1.0181667062455474e-05, + "loss": 0.1738, + "step": 7220 + }, + { + "epoch": 4.862138533960995, + "grad_norm": 0.5520994067192078, + "learning_rate": 1.0122298741391594e-05, + "loss": 0.1839, + "step": 7230 + }, + { + "epoch": 4.8688634835238735, + "grad_norm": 0.6071659326553345, + "learning_rate": 1.0062930420327713e-05, + "loss": 0.1802, + "step": 7240 + }, + { + "epoch": 4.875588433086752, + "grad_norm": 0.7233278751373291, + "learning_rate": 1.0003562099263833e-05, + "loss": 0.1741, + "step": 7250 + }, + { + "epoch": 4.88231338264963, + "grad_norm": 0.7937821745872498, + "learning_rate": 9.944193778199953e-06, + "loss": 0.1784, + "step": 7260 + }, + { + "epoch": 4.889038332212508, + "grad_norm": 0.5812748670578003, + "learning_rate": 9.884825457136073e-06, + "loss": 0.1916, + "step": 7270 + }, + { + "epoch": 4.895763281775387, + "grad_norm": 0.5226556062698364, + "learning_rate": 9.825457136072192e-06, + "loss": 0.1482, + "step": 7280 + }, + { + "epoch": 4.902488231338265, + "grad_norm": 0.7629415392875671, + "learning_rate": 9.766088815008312e-06, + "loss": 0.2192, + "step": 7290 + }, + { + "epoch": 4.909213180901143, + "grad_norm": 0.7357354760169983, + "learning_rate": 9.706720493944432e-06, + "loss": 0.1695, + "step": 7300 + }, + { + "epoch": 4.9159381304640215, + "grad_norm": 0.6955385804176331, + "learning_rate": 9.647352172880552e-06, + "loss": 0.1841, + "step": 7310 + }, + { + "epoch": 4.9226630800269, + "grad_norm": 0.803431510925293, + "learning_rate": 9.587983851816671e-06, + "loss": 0.1883, + "step": 7320 + }, + { + "epoch": 4.929388029589778, + "grad_norm": 0.5103302001953125, + "learning_rate": 9.528615530752791e-06, + "loss": 0.1581, + "step": 7330 + }, + { + "epoch": 4.936112979152656, + "grad_norm": 0.7879159450531006, + "learning_rate": 9.46924720968891e-06, + "loss": 0.1355, + "step": 7340 + }, + { + "epoch": 4.942837928715535, + "grad_norm": 0.6451155543327332, + "learning_rate": 9.40987888862503e-06, + "loss": 0.1633, + "step": 7350 + }, + { + "epoch": 4.949562878278413, + "grad_norm": 0.8154376745223999, + "learning_rate": 9.350510567561149e-06, + "loss": 0.1468, + "step": 7360 + }, + { + "epoch": 4.956287827841291, + "grad_norm": 0.6419748663902283, + "learning_rate": 9.29114224649727e-06, + "loss": 0.178, + "step": 7370 + }, + { + "epoch": 4.9630127774041695, + "grad_norm": 0.8760988116264343, + "learning_rate": 9.231773925433388e-06, + "loss": 0.1849, + "step": 7380 + }, + { + "epoch": 4.969737726967048, + "grad_norm": 0.8139458894729614, + "learning_rate": 9.17240560436951e-06, + "loss": 0.149, + "step": 7390 + }, + { + "epoch": 4.976462676529926, + "grad_norm": 0.565104603767395, + "learning_rate": 9.113037283305628e-06, + "loss": 0.1271, + "step": 7400 + }, + { + "epoch": 4.983187626092804, + "grad_norm": 0.8110491037368774, + "learning_rate": 9.05366896224175e-06, + "loss": 0.1803, + "step": 7410 + }, + { + "epoch": 4.989912575655683, + "grad_norm": 1.0213757753372192, + "learning_rate": 8.994300641177867e-06, + "loss": 0.1923, + "step": 7420 + }, + { + "epoch": 4.996637525218561, + "grad_norm": 0.7190690636634827, + "learning_rate": 8.934932320113989e-06, + "loss": 0.1763, + "step": 7430 + }, + { + "epoch": 5.003362474781439, + "grad_norm": 0.8246272206306458, + "learning_rate": 8.875563999050107e-06, + "loss": 0.1486, + "step": 7440 + }, + { + "epoch": 5.010087424344317, + "grad_norm": 0.5950284600257874, + "learning_rate": 8.816195677986228e-06, + "loss": 0.2139, + "step": 7450 + }, + { + "epoch": 5.016812373907196, + "grad_norm": 0.7800911664962769, + "learning_rate": 8.756827356922346e-06, + "loss": 0.1364, + "step": 7460 + }, + { + "epoch": 5.023537323470074, + "grad_norm": 0.8360269069671631, + "learning_rate": 8.697459035858466e-06, + "loss": 0.1636, + "step": 7470 + }, + { + "epoch": 5.030262273032952, + "grad_norm": 0.5287578105926514, + "learning_rate": 8.638090714794586e-06, + "loss": 0.1711, + "step": 7480 + }, + { + "epoch": 5.0369872225958305, + "grad_norm": 0.907789945602417, + "learning_rate": 8.578722393730706e-06, + "loss": 0.176, + "step": 7490 + }, + { + "epoch": 5.043712172158709, + "grad_norm": 0.6482927799224854, + "learning_rate": 8.519354072666825e-06, + "loss": 0.1526, + "step": 7500 + }, + { + "epoch": 5.043712172158709, + "eval_loss": 0.14650234580039978, + "eval_runtime": 14.2858, + "eval_samples_per_second": 185.079, + "eval_steps_per_second": 23.17, + "step": 7500 + }, + { + "epoch": 5.050437121721587, + "grad_norm": 0.7564139366149902, + "learning_rate": 8.459985751602945e-06, + "loss": 0.178, + "step": 7510 + }, + { + "epoch": 5.057162071284465, + "grad_norm": 0.7940858602523804, + "learning_rate": 8.400617430539065e-06, + "loss": 0.1412, + "step": 7520 + }, + { + "epoch": 5.063887020847344, + "grad_norm": 0.9502273201942444, + "learning_rate": 8.341249109475185e-06, + "loss": 0.1552, + "step": 7530 + }, + { + "epoch": 5.070611970410222, + "grad_norm": 0.530156135559082, + "learning_rate": 8.281880788411304e-06, + "loss": 0.1297, + "step": 7540 + }, + { + "epoch": 5.0773369199731, + "grad_norm": 0.632544755935669, + "learning_rate": 8.222512467347424e-06, + "loss": 0.1596, + "step": 7550 + }, + { + "epoch": 5.0840618695359785, + "grad_norm": 0.6239969730377197, + "learning_rate": 8.163144146283544e-06, + "loss": 0.1603, + "step": 7560 + }, + { + "epoch": 5.090786819098857, + "grad_norm": 0.6600353121757507, + "learning_rate": 8.103775825219664e-06, + "loss": 0.1512, + "step": 7570 + }, + { + "epoch": 5.097511768661735, + "grad_norm": 0.6660837531089783, + "learning_rate": 8.044407504155783e-06, + "loss": 0.1666, + "step": 7580 + }, + { + "epoch": 5.104236718224613, + "grad_norm": 0.5615241527557373, + "learning_rate": 7.985039183091903e-06, + "loss": 0.1626, + "step": 7590 + }, + { + "epoch": 5.110961667787492, + "grad_norm": 0.8264844417572021, + "learning_rate": 7.925670862028023e-06, + "loss": 0.1772, + "step": 7600 + }, + { + "epoch": 5.11768661735037, + "grad_norm": 0.8056089282035828, + "learning_rate": 7.866302540964143e-06, + "loss": 0.2064, + "step": 7610 + }, + { + "epoch": 5.124411566913248, + "grad_norm": 0.6176988482475281, + "learning_rate": 7.80693421990026e-06, + "loss": 0.162, + "step": 7620 + }, + { + "epoch": 5.1311365164761265, + "grad_norm": 0.6516183018684387, + "learning_rate": 7.74756589883638e-06, + "loss": 0.1632, + "step": 7630 + }, + { + "epoch": 5.137861466039005, + "grad_norm": 0.7382626533508301, + "learning_rate": 7.6881975777725e-06, + "loss": 0.1617, + "step": 7640 + }, + { + "epoch": 5.144586415601883, + "grad_norm": 0.7340312600135803, + "learning_rate": 7.628829256708621e-06, + "loss": 0.1519, + "step": 7650 + }, + { + "epoch": 5.151311365164761, + "grad_norm": 0.700930655002594, + "learning_rate": 7.56946093564474e-06, + "loss": 0.1409, + "step": 7660 + }, + { + "epoch": 5.15803631472764, + "grad_norm": 0.8751327395439148, + "learning_rate": 7.5100926145808605e-06, + "loss": 0.1915, + "step": 7670 + }, + { + "epoch": 5.164761264290518, + "grad_norm": 0.5034623146057129, + "learning_rate": 7.450724293516979e-06, + "loss": 0.1334, + "step": 7680 + }, + { + "epoch": 5.171486213853396, + "grad_norm": 0.7203763127326965, + "learning_rate": 7.3913559724531e-06, + "loss": 0.1682, + "step": 7690 + }, + { + "epoch": 5.178211163416274, + "grad_norm": 0.6495651006698608, + "learning_rate": 7.331987651389219e-06, + "loss": 0.1692, + "step": 7700 + }, + { + "epoch": 5.184936112979153, + "grad_norm": 0.6066912412643433, + "learning_rate": 7.2726193303253395e-06, + "loss": 0.1388, + "step": 7710 + }, + { + "epoch": 5.191661062542031, + "grad_norm": 0.6121117472648621, + "learning_rate": 7.213251009261458e-06, + "loss": 0.2234, + "step": 7720 + }, + { + "epoch": 5.198386012104909, + "grad_norm": 0.5731104612350464, + "learning_rate": 7.153882688197579e-06, + "loss": 0.1626, + "step": 7730 + }, + { + "epoch": 5.2051109616677875, + "grad_norm": 0.7278406023979187, + "learning_rate": 7.094514367133698e-06, + "loss": 0.1439, + "step": 7740 + }, + { + "epoch": 5.211835911230666, + "grad_norm": 0.7343350648880005, + "learning_rate": 7.035146046069817e-06, + "loss": 0.1619, + "step": 7750 + }, + { + "epoch": 5.218560860793544, + "grad_norm": 0.7979176640510559, + "learning_rate": 6.9757777250059375e-06, + "loss": 0.1873, + "step": 7760 + }, + { + "epoch": 5.225285810356422, + "grad_norm": 0.7361353039741516, + "learning_rate": 6.916409403942056e-06, + "loss": 0.1744, + "step": 7770 + }, + { + "epoch": 5.232010759919301, + "grad_norm": 1.0325379371643066, + "learning_rate": 6.857041082878177e-06, + "loss": 0.1671, + "step": 7780 + }, + { + "epoch": 5.238735709482179, + "grad_norm": 0.5549724102020264, + "learning_rate": 6.797672761814296e-06, + "loss": 0.1389, + "step": 7790 + }, + { + "epoch": 5.245460659045057, + "grad_norm": 0.6693004965782166, + "learning_rate": 6.7383044407504165e-06, + "loss": 0.1227, + "step": 7800 + }, + { + "epoch": 5.2521856086079355, + "grad_norm": 0.6957047581672668, + "learning_rate": 6.678936119686535e-06, + "loss": 0.142, + "step": 7810 + }, + { + "epoch": 5.258910558170814, + "grad_norm": 0.5779728889465332, + "learning_rate": 6.619567798622656e-06, + "loss": 0.1951, + "step": 7820 + }, + { + "epoch": 5.265635507733692, + "grad_norm": 0.5068587064743042, + "learning_rate": 6.560199477558775e-06, + "loss": 0.1603, + "step": 7830 + }, + { + "epoch": 5.27236045729657, + "grad_norm": 0.8560962080955505, + "learning_rate": 6.500831156494895e-06, + "loss": 0.1657, + "step": 7840 + }, + { + "epoch": 5.279085406859449, + "grad_norm": 0.9463856816291809, + "learning_rate": 6.4414628354310145e-06, + "loss": 0.1372, + "step": 7850 + }, + { + "epoch": 5.285810356422327, + "grad_norm": 0.5416259765625, + "learning_rate": 6.382094514367134e-06, + "loss": 0.1368, + "step": 7860 + }, + { + "epoch": 5.292535305985205, + "grad_norm": 0.35940733551979065, + "learning_rate": 6.322726193303254e-06, + "loss": 0.1313, + "step": 7870 + }, + { + "epoch": 5.299260255548083, + "grad_norm": 0.5176042914390564, + "learning_rate": 6.263357872239374e-06, + "loss": 0.1549, + "step": 7880 + }, + { + "epoch": 5.305985205110962, + "grad_norm": 0.8194878101348877, + "learning_rate": 6.2039895511754935e-06, + "loss": 0.1332, + "step": 7890 + }, + { + "epoch": 5.31271015467384, + "grad_norm": 0.718500018119812, + "learning_rate": 6.144621230111612e-06, + "loss": 0.1632, + "step": 7900 + }, + { + "epoch": 5.319435104236718, + "grad_norm": 0.6662378907203674, + "learning_rate": 6.085252909047732e-06, + "loss": 0.1731, + "step": 7910 + }, + { + "epoch": 5.326160053799597, + "grad_norm": 0.5502232313156128, + "learning_rate": 6.025884587983852e-06, + "loss": 0.179, + "step": 7920 + }, + { + "epoch": 5.332885003362475, + "grad_norm": 0.7982697486877441, + "learning_rate": 5.966516266919972e-06, + "loss": 0.1705, + "step": 7930 + }, + { + "epoch": 5.339609952925353, + "grad_norm": 0.6752830147743225, + "learning_rate": 5.9071479458560915e-06, + "loss": 0.1539, + "step": 7940 + }, + { + "epoch": 5.346334902488231, + "grad_norm": 0.7998142242431641, + "learning_rate": 5.847779624792211e-06, + "loss": 0.1536, + "step": 7950 + }, + { + "epoch": 5.35305985205111, + "grad_norm": 0.8491772413253784, + "learning_rate": 5.78841130372833e-06, + "loss": 0.1715, + "step": 7960 + }, + { + "epoch": 5.359784801613988, + "grad_norm": 0.6874935626983643, + "learning_rate": 5.72904298266445e-06, + "loss": 0.1375, + "step": 7970 + }, + { + "epoch": 5.366509751176866, + "grad_norm": 0.5897092223167419, + "learning_rate": 5.66967466160057e-06, + "loss": 0.1329, + "step": 7980 + }, + { + "epoch": 5.3732347007397445, + "grad_norm": 0.6634560227394104, + "learning_rate": 5.610306340536689e-06, + "loss": 0.1531, + "step": 7990 + }, + { + "epoch": 5.379959650302623, + "grad_norm": 0.7213748097419739, + "learning_rate": 5.550938019472809e-06, + "loss": 0.1606, + "step": 8000 + }, + { + "epoch": 5.379959650302623, + "eval_loss": 0.1459866315126419, + "eval_runtime": 14.4136, + "eval_samples_per_second": 183.437, + "eval_steps_per_second": 22.964, + "step": 8000 + }, + { + "epoch": 5.386684599865501, + "grad_norm": 0.7976545095443726, + "learning_rate": 5.491569698408929e-06, + "loss": 0.1638, + "step": 8010 + }, + { + "epoch": 5.393409549428379, + "grad_norm": 0.6053023338317871, + "learning_rate": 5.432201377345049e-06, + "loss": 0.1468, + "step": 8020 + }, + { + "epoch": 5.400134498991258, + "grad_norm": 0.6205041408538818, + "learning_rate": 5.3728330562811684e-06, + "loss": 0.1819, + "step": 8030 + }, + { + "epoch": 5.406859448554136, + "grad_norm": 0.8174419403076172, + "learning_rate": 5.313464735217288e-06, + "loss": 0.1687, + "step": 8040 + }, + { + "epoch": 5.413584398117014, + "grad_norm": 0.7457543611526489, + "learning_rate": 5.254096414153408e-06, + "loss": 0.1437, + "step": 8050 + }, + { + "epoch": 5.4203093476798925, + "grad_norm": 0.4972638487815857, + "learning_rate": 5.194728093089528e-06, + "loss": 0.2305, + "step": 8060 + }, + { + "epoch": 5.427034297242771, + "grad_norm": 0.49321138858795166, + "learning_rate": 5.1353597720256475e-06, + "loss": 0.1395, + "step": 8070 + }, + { + "epoch": 5.433759246805649, + "grad_norm": 0.6880173683166504, + "learning_rate": 5.075991450961767e-06, + "loss": 0.1116, + "step": 8080 + }, + { + "epoch": 5.440484196368527, + "grad_norm": 0.6798646450042725, + "learning_rate": 5.016623129897887e-06, + "loss": 0.1767, + "step": 8090 + }, + { + "epoch": 5.447209145931406, + "grad_norm": 0.5538681745529175, + "learning_rate": 4.957254808834006e-06, + "loss": 0.1596, + "step": 8100 + }, + { + "epoch": 5.453934095494284, + "grad_norm": 0.5008218884468079, + "learning_rate": 4.897886487770126e-06, + "loss": 0.1554, + "step": 8110 + }, + { + "epoch": 5.460659045057162, + "grad_norm": 0.6006589531898499, + "learning_rate": 4.8385181667062454e-06, + "loss": 0.1496, + "step": 8120 + }, + { + "epoch": 5.46738399462004, + "grad_norm": 0.7322587966918945, + "learning_rate": 4.779149845642365e-06, + "loss": 0.1592, + "step": 8130 + }, + { + "epoch": 5.474108944182919, + "grad_norm": 0.7133284211158752, + "learning_rate": 4.719781524578485e-06, + "loss": 0.128, + "step": 8140 + }, + { + "epoch": 5.480833893745797, + "grad_norm": 0.5079299807548523, + "learning_rate": 4.660413203514605e-06, + "loss": 0.1412, + "step": 8150 + }, + { + "epoch": 5.487558843308675, + "grad_norm": 0.4279893636703491, + "learning_rate": 4.6010448824507245e-06, + "loss": 0.1681, + "step": 8160 + }, + { + "epoch": 5.4942837928715536, + "grad_norm": 0.6305129528045654, + "learning_rate": 4.541676561386844e-06, + "loss": 0.1574, + "step": 8170 + }, + { + "epoch": 5.501008742434432, + "grad_norm": 0.6287931203842163, + "learning_rate": 4.482308240322964e-06, + "loss": 0.1691, + "step": 8180 + }, + { + "epoch": 5.50773369199731, + "grad_norm": 0.6833222508430481, + "learning_rate": 4.422939919259084e-06, + "loss": 0.1763, + "step": 8190 + }, + { + "epoch": 5.514458641560188, + "grad_norm": 0.621030867099762, + "learning_rate": 4.3635715981952035e-06, + "loss": 0.1394, + "step": 8200 + }, + { + "epoch": 5.521183591123067, + "grad_norm": 0.9984930753707886, + "learning_rate": 4.304203277131323e-06, + "loss": 0.1822, + "step": 8210 + }, + { + "epoch": 5.527908540685945, + "grad_norm": 0.7831975221633911, + "learning_rate": 4.244834956067443e-06, + "loss": 0.1812, + "step": 8220 + }, + { + "epoch": 5.534633490248823, + "grad_norm": 0.6752682328224182, + "learning_rate": 4.185466635003563e-06, + "loss": 0.1622, + "step": 8230 + }, + { + "epoch": 5.5413584398117015, + "grad_norm": 0.6233469843864441, + "learning_rate": 4.126098313939682e-06, + "loss": 0.1404, + "step": 8240 + }, + { + "epoch": 5.54808338937458, + "grad_norm": 0.825937032699585, + "learning_rate": 4.0667299928758015e-06, + "loss": 0.1266, + "step": 8250 + }, + { + "epoch": 5.554808338937458, + "grad_norm": 0.587255597114563, + "learning_rate": 4.007361671811921e-06, + "loss": 0.1724, + "step": 8260 + }, + { + "epoch": 5.561533288500336, + "grad_norm": 0.7307100892066956, + "learning_rate": 3.947993350748041e-06, + "loss": 0.1809, + "step": 8270 + }, + { + "epoch": 5.568258238063215, + "grad_norm": 0.6363770365715027, + "learning_rate": 3.888625029684161e-06, + "loss": 0.1444, + "step": 8280 + }, + { + "epoch": 5.574983187626093, + "grad_norm": 0.7173252701759338, + "learning_rate": 3.8292567086202805e-06, + "loss": 0.159, + "step": 8290 + }, + { + "epoch": 5.581708137188971, + "grad_norm": 0.6081172227859497, + "learning_rate": 3.7698883875564e-06, + "loss": 0.1433, + "step": 8300 + }, + { + "epoch": 5.5884330867518495, + "grad_norm": 1.3353235721588135, + "learning_rate": 3.7105200664925196e-06, + "loss": 0.1883, + "step": 8310 + }, + { + "epoch": 5.595158036314728, + "grad_norm": 0.4842171370983124, + "learning_rate": 3.6511517454286394e-06, + "loss": 0.1457, + "step": 8320 + }, + { + "epoch": 5.601882985877606, + "grad_norm": 0.8106813430786133, + "learning_rate": 3.591783424364759e-06, + "loss": 0.1681, + "step": 8330 + }, + { + "epoch": 5.608607935440484, + "grad_norm": 0.838200032711029, + "learning_rate": 3.532415103300879e-06, + "loss": 0.1572, + "step": 8340 + }, + { + "epoch": 5.615332885003363, + "grad_norm": 0.42966827750205994, + "learning_rate": 3.4730467822369987e-06, + "loss": 0.1395, + "step": 8350 + }, + { + "epoch": 5.622057834566241, + "grad_norm": 0.806328296661377, + "learning_rate": 3.4136784611731184e-06, + "loss": 0.2106, + "step": 8360 + }, + { + "epoch": 5.628782784129119, + "grad_norm": 0.6712527275085449, + "learning_rate": 3.354310140109238e-06, + "loss": 0.1519, + "step": 8370 + }, + { + "epoch": 5.635507733691997, + "grad_norm": 0.708582878112793, + "learning_rate": 3.294941819045357e-06, + "loss": 0.1394, + "step": 8380 + }, + { + "epoch": 5.642232683254876, + "grad_norm": 0.6497735977172852, + "learning_rate": 3.235573497981477e-06, + "loss": 0.1647, + "step": 8390 + }, + { + "epoch": 5.648957632817754, + "grad_norm": 0.6724647283554077, + "learning_rate": 3.1762051769175966e-06, + "loss": 0.1555, + "step": 8400 + }, + { + "epoch": 5.655682582380632, + "grad_norm": 0.7497691512107849, + "learning_rate": 3.116836855853717e-06, + "loss": 0.1468, + "step": 8410 + }, + { + "epoch": 5.6624075319435105, + "grad_norm": 0.6995181441307068, + "learning_rate": 3.057468534789836e-06, + "loss": 0.1491, + "step": 8420 + }, + { + "epoch": 5.669132481506389, + "grad_norm": 0.6918231248855591, + "learning_rate": 2.998100213725956e-06, + "loss": 0.1567, + "step": 8430 + }, + { + "epoch": 5.675857431069267, + "grad_norm": 0.700425386428833, + "learning_rate": 2.9387318926620757e-06, + "loss": 0.1493, + "step": 8440 + }, + { + "epoch": 5.682582380632145, + "grad_norm": 0.5970633029937744, + "learning_rate": 2.8793635715981954e-06, + "loss": 0.1779, + "step": 8450 + }, + { + "epoch": 5.689307330195024, + "grad_norm": 0.703995943069458, + "learning_rate": 2.819995250534315e-06, + "loss": 0.1482, + "step": 8460 + }, + { + "epoch": 5.696032279757902, + "grad_norm": 0.624579906463623, + "learning_rate": 2.760626929470435e-06, + "loss": 0.1697, + "step": 8470 + }, + { + "epoch": 5.70275722932078, + "grad_norm": 0.6102532744407654, + "learning_rate": 2.7012586084065547e-06, + "loss": 0.1518, + "step": 8480 + }, + { + "epoch": 5.7094821788836585, + "grad_norm": 0.7269166707992554, + "learning_rate": 2.641890287342674e-06, + "loss": 0.1841, + "step": 8490 + }, + { + "epoch": 5.716207128446537, + "grad_norm": 0.5745391249656677, + "learning_rate": 2.582521966278794e-06, + "loss": 0.1512, + "step": 8500 + }, + { + "epoch": 5.716207128446537, + "eval_loss": 0.144563689827919, + "eval_runtime": 14.4525, + "eval_samples_per_second": 182.944, + "eval_steps_per_second": 22.903, + "step": 8500 + }, + { + "epoch": 5.722932078009415, + "grad_norm": 0.89340740442276, + "learning_rate": 2.5231536452149136e-06, + "loss": 0.1528, + "step": 8510 + }, + { + "epoch": 5.729657027572293, + "grad_norm": 0.8557310700416565, + "learning_rate": 2.4637853241510333e-06, + "loss": 0.1803, + "step": 8520 + }, + { + "epoch": 5.736381977135172, + "grad_norm": 0.7893068790435791, + "learning_rate": 2.404417003087153e-06, + "loss": 0.145, + "step": 8530 + }, + { + "epoch": 5.74310692669805, + "grad_norm": 0.548139214515686, + "learning_rate": 2.3450486820232724e-06, + "loss": 0.1753, + "step": 8540 + }, + { + "epoch": 5.749831876260928, + "grad_norm": 0.7231383323669434, + "learning_rate": 2.285680360959392e-06, + "loss": 0.1417, + "step": 8550 + }, + { + "epoch": 5.7565568258238065, + "grad_norm": 0.7758724689483643, + "learning_rate": 2.226312039895512e-06, + "loss": 0.1606, + "step": 8560 + }, + { + "epoch": 5.763281775386685, + "grad_norm": 0.692232072353363, + "learning_rate": 2.1669437188316313e-06, + "loss": 0.1343, + "step": 8570 + }, + { + "epoch": 5.770006724949563, + "grad_norm": 0.6432099938392639, + "learning_rate": 2.107575397767751e-06, + "loss": 0.187, + "step": 8580 + }, + { + "epoch": 5.776731674512441, + "grad_norm": 0.581702709197998, + "learning_rate": 2.048207076703871e-06, + "loss": 0.1344, + "step": 8590 + }, + { + "epoch": 5.78345662407532, + "grad_norm": 0.7130106091499329, + "learning_rate": 1.9888387556399906e-06, + "loss": 0.1501, + "step": 8600 + }, + { + "epoch": 5.790181573638198, + "grad_norm": 0.7084644436836243, + "learning_rate": 1.9294704345761103e-06, + "loss": 0.1725, + "step": 8610 + }, + { + "epoch": 5.796906523201076, + "grad_norm": 0.6888975501060486, + "learning_rate": 1.87010211351223e-06, + "loss": 0.184, + "step": 8620 + }, + { + "epoch": 5.803631472763954, + "grad_norm": 0.636211633682251, + "learning_rate": 1.8107337924483494e-06, + "loss": 0.17, + "step": 8630 + }, + { + "epoch": 5.810356422326833, + "grad_norm": 0.4802153408527374, + "learning_rate": 1.7513654713844692e-06, + "loss": 0.1428, + "step": 8640 + }, + { + "epoch": 5.817081371889711, + "grad_norm": 0.5505760908126831, + "learning_rate": 1.691997150320589e-06, + "loss": 0.177, + "step": 8650 + }, + { + "epoch": 5.823806321452589, + "grad_norm": 0.7836665511131287, + "learning_rate": 1.6326288292567087e-06, + "loss": 0.1725, + "step": 8660 + }, + { + "epoch": 5.8305312710154675, + "grad_norm": 0.5304895043373108, + "learning_rate": 1.5732605081928285e-06, + "loss": 0.1339, + "step": 8670 + }, + { + "epoch": 5.837256220578346, + "grad_norm": 0.7424430251121521, + "learning_rate": 1.513892187128948e-06, + "loss": 0.1441, + "step": 8680 + }, + { + "epoch": 5.843981170141224, + "grad_norm": 0.9562227129936218, + "learning_rate": 1.4545238660650678e-06, + "loss": 0.1688, + "step": 8690 + }, + { + "epoch": 5.850706119704102, + "grad_norm": 0.6445603966712952, + "learning_rate": 1.3951555450011875e-06, + "loss": 0.148, + "step": 8700 + }, + { + "epoch": 5.857431069266981, + "grad_norm": 1.0529707670211792, + "learning_rate": 1.335787223937307e-06, + "loss": 0.1908, + "step": 8710 + }, + { + "epoch": 5.864156018829859, + "grad_norm": 1.1476075649261475, + "learning_rate": 1.2764189028734268e-06, + "loss": 0.1497, + "step": 8720 + }, + { + "epoch": 5.870880968392737, + "grad_norm": 0.548938512802124, + "learning_rate": 1.2170505818095466e-06, + "loss": 0.1498, + "step": 8730 + }, + { + "epoch": 5.8776059179556155, + "grad_norm": 0.5004345178604126, + "learning_rate": 1.1576822607456662e-06, + "loss": 0.1912, + "step": 8740 + }, + { + "epoch": 5.884330867518494, + "grad_norm": 0.8999865055084229, + "learning_rate": 1.0983139396817857e-06, + "loss": 0.1627, + "step": 8750 + }, + { + "epoch": 5.891055817081372, + "grad_norm": 0.8366420269012451, + "learning_rate": 1.0389456186179055e-06, + "loss": 0.1417, + "step": 8760 + }, + { + "epoch": 5.89778076664425, + "grad_norm": 0.7696868181228638, + "learning_rate": 9.795772975540252e-07, + "loss": 0.1651, + "step": 8770 + }, + { + "epoch": 5.904505716207129, + "grad_norm": 0.8378667831420898, + "learning_rate": 9.202089764901449e-07, + "loss": 0.1524, + "step": 8780 + }, + { + "epoch": 5.911230665770007, + "grad_norm": 0.5676979422569275, + "learning_rate": 8.608406554262645e-07, + "loss": 0.1355, + "step": 8790 + }, + { + "epoch": 5.917955615332885, + "grad_norm": 0.8161872625350952, + "learning_rate": 8.014723343623843e-07, + "loss": 0.1469, + "step": 8800 + }, + { + "epoch": 5.9246805648957634, + "grad_norm": 0.6396122574806213, + "learning_rate": 7.421040132985039e-07, + "loss": 0.1343, + "step": 8810 + }, + { + "epoch": 5.931405514458642, + "grad_norm": 0.49894294142723083, + "learning_rate": 6.827356922346237e-07, + "loss": 0.1436, + "step": 8820 + }, + { + "epoch": 5.93813046402152, + "grad_norm": 1.815820336341858, + "learning_rate": 6.233673711707434e-07, + "loss": 0.1587, + "step": 8830 + }, + { + "epoch": 5.944855413584398, + "grad_norm": 0.7993642091751099, + "learning_rate": 5.63999050106863e-07, + "loss": 0.1967, + "step": 8840 + }, + { + "epoch": 5.951580363147277, + "grad_norm": 0.9096683859825134, + "learning_rate": 5.046307290429827e-07, + "loss": 0.1467, + "step": 8850 + }, + { + "epoch": 5.958305312710155, + "grad_norm": 0.8135556578636169, + "learning_rate": 4.452624079791023e-07, + "loss": 0.1708, + "step": 8860 + }, + { + "epoch": 5.965030262273033, + "grad_norm": 0.699492335319519, + "learning_rate": 3.8589408691522203e-07, + "loss": 0.144, + "step": 8870 + }, + { + "epoch": 5.971755211835911, + "grad_norm": 0.707664430141449, + "learning_rate": 3.2652576585134174e-07, + "loss": 0.1314, + "step": 8880 + }, + { + "epoch": 5.97848016139879, + "grad_norm": 0.7655701637268066, + "learning_rate": 2.671574447874614e-07, + "loss": 0.1449, + "step": 8890 + }, + { + "epoch": 5.985205110961668, + "grad_norm": 0.7005723714828491, + "learning_rate": 2.077891237235811e-07, + "loss": 0.1502, + "step": 8900 + }, + { + "epoch": 5.991930060524546, + "grad_norm": 0.5278902649879456, + "learning_rate": 1.4842080265970078e-07, + "loss": 0.1591, + "step": 8910 + }, + { + "epoch": 5.9986550100874245, + "grad_norm": 0.9299124479293823, + "learning_rate": 8.905248159582048e-08, + "loss": 0.1779, + "step": 8920 + } + ], + "logging_steps": 10, + "max_steps": 8922, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.506527567364096e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}