diff --git "a/e3.0/trainer_state.json" "b/e3.0/trainer_state.json" new file mode 100644--- /dev/null +++ "b/e3.0/trainer_state.json" @@ -0,0 +1,33283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.984078371321436, + "eval_steps": 500, + "global_step": 9500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000628227025541355, + "grad_norm": 37.5, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.9002, + "step": 2 + }, + { + "epoch": 0.00125645405108271, + "grad_norm": 5.65625, + "learning_rate": 5.000000000000001e-07, + "loss": 1.6234, + "step": 4 + }, + { + "epoch": 0.001884681076624065, + "grad_norm": 5.15625, + "learning_rate": 7.5e-07, + "loss": 1.6223, + "step": 6 + }, + { + "epoch": 0.00251290810216542, + "grad_norm": 4.8125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.6607, + "step": 8 + }, + { + "epoch": 0.003141135127706775, + "grad_norm": 5.03125, + "learning_rate": 1.25e-06, + "loss": 1.6644, + "step": 10 + }, + { + "epoch": 0.00376936215324813, + "grad_norm": 5.71875, + "learning_rate": 1.5e-06, + "loss": 1.8307, + "step": 12 + }, + { + "epoch": 0.004397589178789485, + "grad_norm": 5.28125, + "learning_rate": 1.75e-06, + "loss": 1.623, + "step": 14 + }, + { + "epoch": 0.00502581620433084, + "grad_norm": 3.625, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6915, + "step": 16 + }, + { + "epoch": 0.005654043229872195, + "grad_norm": 3.609375, + "learning_rate": 2.25e-06, + "loss": 1.8222, + "step": 18 + }, + { + "epoch": 0.00628227025541355, + "grad_norm": 3.21875, + "learning_rate": 2.5e-06, + "loss": 1.6702, + "step": 20 + }, + { + "epoch": 0.006910497280954905, + "grad_norm": 2.140625, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6118, + "step": 22 + }, + { + "epoch": 0.00753872430649626, + "grad_norm": 2.890625, + "learning_rate": 3e-06, + "loss": 1.6499, + "step": 24 + }, + { + "epoch": 0.008166951332037615, + "grad_norm": 2.421875, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.4785, + "step": 26 + }, + { + "epoch": 0.00879517835757897, + "grad_norm": 2.421875, + "learning_rate": 3.5e-06, + "loss": 1.7235, + "step": 28 + }, + { + "epoch": 0.009423405383120325, + "grad_norm": 2.25, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.6245, + "step": 30 + }, + { + "epoch": 0.01005163240866168, + "grad_norm": 1.6640625, + "learning_rate": 4.000000000000001e-06, + "loss": 1.5486, + "step": 32 + }, + { + "epoch": 0.010679859434203035, + "grad_norm": 2.09375, + "learning_rate": 4.25e-06, + "loss": 1.4448, + "step": 34 + }, + { + "epoch": 0.01130808645974439, + "grad_norm": 1.5078125, + "learning_rate": 4.5e-06, + "loss": 1.48, + "step": 36 + }, + { + "epoch": 0.011936313485285744, + "grad_norm": 1.6796875, + "learning_rate": 4.75e-06, + "loss": 1.5128, + "step": 38 + }, + { + "epoch": 0.0125645405108271, + "grad_norm": 1.5234375, + "learning_rate": 5e-06, + "loss": 1.4774, + "step": 40 + }, + { + "epoch": 0.013192767536368456, + "grad_norm": 1.546875, + "learning_rate": 5.2500000000000006e-06, + "loss": 1.3692, + "step": 42 + }, + { + "epoch": 0.01382099456190981, + "grad_norm": 1.34375, + "learning_rate": 5.500000000000001e-06, + "loss": 1.5056, + "step": 44 + }, + { + "epoch": 0.014449221587451166, + "grad_norm": 1.21875, + "learning_rate": 5.75e-06, + "loss": 1.4744, + "step": 46 + }, + { + "epoch": 0.01507744861299252, + "grad_norm": 1.0078125, + "learning_rate": 6e-06, + "loss": 1.5776, + "step": 48 + }, + { + "epoch": 0.015705675638533874, + "grad_norm": 1.1015625, + "learning_rate": 6.25e-06, + "loss": 1.486, + "step": 50 + }, + { + "epoch": 0.01633390266407523, + "grad_norm": 0.85546875, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.5541, + "step": 52 + }, + { + "epoch": 0.016962129689616585, + "grad_norm": 0.984375, + "learning_rate": 6.750000000000001e-06, + "loss": 1.3999, + "step": 54 + }, + { + "epoch": 0.01759035671515794, + "grad_norm": 0.96875, + "learning_rate": 7e-06, + "loss": 1.4455, + "step": 56 + }, + { + "epoch": 0.018218583740699294, + "grad_norm": 1.0546875, + "learning_rate": 7.25e-06, + "loss": 1.5221, + "step": 58 + }, + { + "epoch": 0.01884681076624065, + "grad_norm": 0.88671875, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4798, + "step": 60 + }, + { + "epoch": 0.019475037791782005, + "grad_norm": 0.9140625, + "learning_rate": 7.75e-06, + "loss": 1.4334, + "step": 62 + }, + { + "epoch": 0.02010326481732336, + "grad_norm": 0.98046875, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3293, + "step": 64 + }, + { + "epoch": 0.020731491842864717, + "grad_norm": 0.83984375, + "learning_rate": 8.25e-06, + "loss": 1.4174, + "step": 66 + }, + { + "epoch": 0.02135971886840607, + "grad_norm": 0.859375, + "learning_rate": 8.5e-06, + "loss": 1.4177, + "step": 68 + }, + { + "epoch": 0.021987945893947425, + "grad_norm": 0.84765625, + "learning_rate": 8.750000000000001e-06, + "loss": 1.3708, + "step": 70 + }, + { + "epoch": 0.02261617291948878, + "grad_norm": 0.90234375, + "learning_rate": 9e-06, + "loss": 1.4062, + "step": 72 + }, + { + "epoch": 0.023244399945030136, + "grad_norm": 0.83984375, + "learning_rate": 9.250000000000001e-06, + "loss": 1.3829, + "step": 74 + }, + { + "epoch": 0.02387262697057149, + "grad_norm": 0.9375, + "learning_rate": 9.5e-06, + "loss": 1.3441, + "step": 76 + }, + { + "epoch": 0.024500853996112845, + "grad_norm": 0.90234375, + "learning_rate": 9.75e-06, + "loss": 1.5625, + "step": 78 + }, + { + "epoch": 0.0251290810216542, + "grad_norm": 0.81640625, + "learning_rate": 1e-05, + "loss": 1.4504, + "step": 80 + }, + { + "epoch": 0.025757308047195556, + "grad_norm": 0.83984375, + "learning_rate": 1.025e-05, + "loss": 1.4672, + "step": 82 + }, + { + "epoch": 0.026385535072736912, + "grad_norm": 0.82421875, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.4405, + "step": 84 + }, + { + "epoch": 0.027013762098278264, + "grad_norm": 1.0546875, + "learning_rate": 1.075e-05, + "loss": 1.3557, + "step": 86 + }, + { + "epoch": 0.02764198912381962, + "grad_norm": 0.90234375, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.4309, + "step": 88 + }, + { + "epoch": 0.028270216149360976, + "grad_norm": 0.78125, + "learning_rate": 1.125e-05, + "loss": 1.3528, + "step": 90 + }, + { + "epoch": 0.02889844317490233, + "grad_norm": 0.96875, + "learning_rate": 1.15e-05, + "loss": 1.4093, + "step": 92 + }, + { + "epoch": 0.029526670200443684, + "grad_norm": 0.87890625, + "learning_rate": 1.1750000000000001e-05, + "loss": 1.4324, + "step": 94 + }, + { + "epoch": 0.03015489722598504, + "grad_norm": 0.875, + "learning_rate": 1.2e-05, + "loss": 1.4622, + "step": 96 + }, + { + "epoch": 0.030783124251526395, + "grad_norm": 0.85546875, + "learning_rate": 1.2250000000000001e-05, + "loss": 1.5166, + "step": 98 + }, + { + "epoch": 0.03141135127706775, + "grad_norm": 0.76953125, + "learning_rate": 1.25e-05, + "loss": 1.4729, + "step": 100 + }, + { + "epoch": 0.032039578302609104, + "grad_norm": 0.8828125, + "learning_rate": 1.275e-05, + "loss": 1.4201, + "step": 102 + }, + { + "epoch": 0.03266780532815046, + "grad_norm": 0.97265625, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3646, + "step": 104 + }, + { + "epoch": 0.033296032353691815, + "grad_norm": 0.859375, + "learning_rate": 1.325e-05, + "loss": 1.3105, + "step": 106 + }, + { + "epoch": 0.03392425937923317, + "grad_norm": 0.78125, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.5302, + "step": 108 + }, + { + "epoch": 0.03455248640477453, + "grad_norm": 0.875, + "learning_rate": 1.375e-05, + "loss": 1.3979, + "step": 110 + }, + { + "epoch": 0.03518071343031588, + "grad_norm": 0.796875, + "learning_rate": 1.4e-05, + "loss": 1.3961, + "step": 112 + }, + { + "epoch": 0.03580894045585724, + "grad_norm": 0.796875, + "learning_rate": 1.425e-05, + "loss": 1.3645, + "step": 114 + }, + { + "epoch": 0.03643716748139859, + "grad_norm": 0.7421875, + "learning_rate": 1.45e-05, + "loss": 1.306, + "step": 116 + }, + { + "epoch": 0.03706539450693994, + "grad_norm": 0.8828125, + "learning_rate": 1.4750000000000003e-05, + "loss": 1.3799, + "step": 118 + }, + { + "epoch": 0.0376936215324813, + "grad_norm": 0.73828125, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3281, + "step": 120 + }, + { + "epoch": 0.038321848558022654, + "grad_norm": 0.87890625, + "learning_rate": 1.525e-05, + "loss": 1.4052, + "step": 122 + }, + { + "epoch": 0.03895007558356401, + "grad_norm": 0.8203125, + "learning_rate": 1.55e-05, + "loss": 1.4946, + "step": 124 + }, + { + "epoch": 0.039578302609105366, + "grad_norm": 0.80859375, + "learning_rate": 1.575e-05, + "loss": 1.4292, + "step": 126 + }, + { + "epoch": 0.04020652963464672, + "grad_norm": 1.0078125, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.4858, + "step": 128 + }, + { + "epoch": 0.04083475666018808, + "grad_norm": 0.9765625, + "learning_rate": 1.6250000000000002e-05, + "loss": 1.2745, + "step": 130 + }, + { + "epoch": 0.04146298368572943, + "grad_norm": 0.8046875, + "learning_rate": 1.65e-05, + "loss": 1.4684, + "step": 132 + }, + { + "epoch": 0.04209121071127078, + "grad_norm": 0.80859375, + "learning_rate": 1.675e-05, + "loss": 1.4275, + "step": 134 + }, + { + "epoch": 0.04271943773681214, + "grad_norm": 0.90625, + "learning_rate": 1.7e-05, + "loss": 1.2831, + "step": 136 + }, + { + "epoch": 0.043347664762353494, + "grad_norm": 0.953125, + "learning_rate": 1.7250000000000003e-05, + "loss": 1.445, + "step": 138 + }, + { + "epoch": 0.04397589178789485, + "grad_norm": 0.80859375, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.3457, + "step": 140 + }, + { + "epoch": 0.044604118813436205, + "grad_norm": 0.8046875, + "learning_rate": 1.775e-05, + "loss": 1.3961, + "step": 142 + }, + { + "epoch": 0.04523234583897756, + "grad_norm": 0.75, + "learning_rate": 1.8e-05, + "loss": 1.2985, + "step": 144 + }, + { + "epoch": 0.04586057286451892, + "grad_norm": 0.81640625, + "learning_rate": 1.825e-05, + "loss": 1.3075, + "step": 146 + }, + { + "epoch": 0.04648879989006027, + "grad_norm": 0.76953125, + "learning_rate": 1.8500000000000002e-05, + "loss": 1.3602, + "step": 148 + }, + { + "epoch": 0.04711702691560163, + "grad_norm": 0.8828125, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.4481, + "step": 150 + }, + { + "epoch": 0.04774525394114298, + "grad_norm": 0.80078125, + "learning_rate": 1.9e-05, + "loss": 1.409, + "step": 152 + }, + { + "epoch": 0.04837348096668433, + "grad_norm": 0.80859375, + "learning_rate": 1.925e-05, + "loss": 1.357, + "step": 154 + }, + { + "epoch": 0.04900170799222569, + "grad_norm": 0.77734375, + "learning_rate": 1.95e-05, + "loss": 1.2841, + "step": 156 + }, + { + "epoch": 0.049629935017767045, + "grad_norm": 0.7421875, + "learning_rate": 1.9750000000000002e-05, + "loss": 1.4336, + "step": 158 + }, + { + "epoch": 0.0502581620433084, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 1.3853, + "step": 160 + }, + { + "epoch": 0.050886389068849756, + "grad_norm": 0.89453125, + "learning_rate": 1.9997461123452876e-05, + "loss": 1.3465, + "step": 162 + }, + { + "epoch": 0.05151461609439111, + "grad_norm": 0.84375, + "learning_rate": 1.9994922246905744e-05, + "loss": 1.3669, + "step": 164 + }, + { + "epoch": 0.05214284311993247, + "grad_norm": 0.7421875, + "learning_rate": 1.999238337035862e-05, + "loss": 1.6046, + "step": 166 + }, + { + "epoch": 0.052771070145473824, + "grad_norm": 0.81640625, + "learning_rate": 1.998984449381149e-05, + "loss": 1.2993, + "step": 168 + }, + { + "epoch": 0.05339929717101517, + "grad_norm": 0.79296875, + "learning_rate": 1.9987305617264362e-05, + "loss": 1.4495, + "step": 170 + }, + { + "epoch": 0.05402752419655653, + "grad_norm": 0.7734375, + "learning_rate": 1.9984766740717233e-05, + "loss": 1.3141, + "step": 172 + }, + { + "epoch": 0.054655751222097884, + "grad_norm": 0.71484375, + "learning_rate": 1.9982227864170108e-05, + "loss": 1.4852, + "step": 174 + }, + { + "epoch": 0.05528397824763924, + "grad_norm": 0.875, + "learning_rate": 1.997968898762298e-05, + "loss": 1.4228, + "step": 176 + }, + { + "epoch": 0.055912205273180596, + "grad_norm": 0.80078125, + "learning_rate": 1.997715011107585e-05, + "loss": 1.5617, + "step": 178 + }, + { + "epoch": 0.05654043229872195, + "grad_norm": 0.82421875, + "learning_rate": 1.9974611234528722e-05, + "loss": 1.2938, + "step": 180 + }, + { + "epoch": 0.05716865932426331, + "grad_norm": 0.7421875, + "learning_rate": 1.9972072357981597e-05, + "loss": 1.469, + "step": 182 + }, + { + "epoch": 0.05779688634980466, + "grad_norm": 0.82421875, + "learning_rate": 1.9969533481434465e-05, + "loss": 1.41, + "step": 184 + }, + { + "epoch": 0.05842511337534602, + "grad_norm": 0.7265625, + "learning_rate": 1.996699460488734e-05, + "loss": 1.4414, + "step": 186 + }, + { + "epoch": 0.05905334040088737, + "grad_norm": 0.8828125, + "learning_rate": 1.996445572834021e-05, + "loss": 1.2717, + "step": 188 + }, + { + "epoch": 0.059681567426428724, + "grad_norm": 0.7890625, + "learning_rate": 1.9961916851793083e-05, + "loss": 1.3179, + "step": 190 + }, + { + "epoch": 0.06030979445197008, + "grad_norm": 0.88671875, + "learning_rate": 1.9959377975245954e-05, + "loss": 1.4353, + "step": 192 + }, + { + "epoch": 0.060938021477511435, + "grad_norm": 0.79296875, + "learning_rate": 1.995683909869883e-05, + "loss": 1.4721, + "step": 194 + }, + { + "epoch": 0.06156624850305279, + "grad_norm": 0.7890625, + "learning_rate": 1.99543002221517e-05, + "loss": 1.4394, + "step": 196 + }, + { + "epoch": 0.06219447552859415, + "grad_norm": 0.765625, + "learning_rate": 1.995176134560457e-05, + "loss": 1.4004, + "step": 198 + }, + { + "epoch": 0.0628227025541355, + "grad_norm": 0.703125, + "learning_rate": 1.9949222469057443e-05, + "loss": 1.3159, + "step": 200 + }, + { + "epoch": 0.06345092957967685, + "grad_norm": 0.7890625, + "learning_rate": 1.9946683592510318e-05, + "loss": 1.408, + "step": 202 + }, + { + "epoch": 0.06407915660521821, + "grad_norm": 0.77734375, + "learning_rate": 1.994414471596319e-05, + "loss": 1.362, + "step": 204 + }, + { + "epoch": 0.06470738363075956, + "grad_norm": 0.7421875, + "learning_rate": 1.994160583941606e-05, + "loss": 1.321, + "step": 206 + }, + { + "epoch": 0.06533561065630092, + "grad_norm": 0.76953125, + "learning_rate": 1.9939066962868932e-05, + "loss": 1.3576, + "step": 208 + }, + { + "epoch": 0.06596383768184227, + "grad_norm": 0.77734375, + "learning_rate": 1.9936528086321803e-05, + "loss": 1.4552, + "step": 210 + }, + { + "epoch": 0.06659206470738363, + "grad_norm": 0.875, + "learning_rate": 1.9933989209774675e-05, + "loss": 1.3144, + "step": 212 + }, + { + "epoch": 0.06722029173292499, + "grad_norm": 0.8515625, + "learning_rate": 1.993145033322755e-05, + "loss": 1.3575, + "step": 214 + }, + { + "epoch": 0.06784851875846634, + "grad_norm": 0.828125, + "learning_rate": 1.992891145668042e-05, + "loss": 1.25, + "step": 216 + }, + { + "epoch": 0.0684767457840077, + "grad_norm": 0.75, + "learning_rate": 1.9926372580133292e-05, + "loss": 1.4611, + "step": 218 + }, + { + "epoch": 0.06910497280954905, + "grad_norm": 0.73828125, + "learning_rate": 1.9923833703586163e-05, + "loss": 1.2994, + "step": 220 + }, + { + "epoch": 0.06973319983509041, + "grad_norm": 0.9375, + "learning_rate": 1.9921294827039038e-05, + "loss": 1.2697, + "step": 222 + }, + { + "epoch": 0.07036142686063176, + "grad_norm": 0.81640625, + "learning_rate": 1.991875595049191e-05, + "loss": 1.4699, + "step": 224 + }, + { + "epoch": 0.07098965388617312, + "grad_norm": 0.92578125, + "learning_rate": 1.991621707394478e-05, + "loss": 1.4276, + "step": 226 + }, + { + "epoch": 0.07161788091171448, + "grad_norm": 0.74609375, + "learning_rate": 1.9913678197397652e-05, + "loss": 1.4052, + "step": 228 + }, + { + "epoch": 0.07224610793725583, + "grad_norm": 0.8203125, + "learning_rate": 1.9911139320850527e-05, + "loss": 1.3744, + "step": 230 + }, + { + "epoch": 0.07287433496279717, + "grad_norm": 0.87109375, + "learning_rate": 1.9908600444303395e-05, + "loss": 1.363, + "step": 232 + }, + { + "epoch": 0.07350256198833853, + "grad_norm": 0.859375, + "learning_rate": 1.990606156775627e-05, + "loss": 1.421, + "step": 234 + }, + { + "epoch": 0.07413078901387989, + "grad_norm": 0.76171875, + "learning_rate": 1.990352269120914e-05, + "loss": 1.5131, + "step": 236 + }, + { + "epoch": 0.07475901603942124, + "grad_norm": 1.5625, + "learning_rate": 1.9900983814662013e-05, + "loss": 1.3155, + "step": 238 + }, + { + "epoch": 0.0753872430649626, + "grad_norm": 0.78125, + "learning_rate": 1.9898444938114884e-05, + "loss": 1.3595, + "step": 240 + }, + { + "epoch": 0.07601547009050395, + "grad_norm": 0.7890625, + "learning_rate": 1.989590606156776e-05, + "loss": 1.3706, + "step": 242 + }, + { + "epoch": 0.07664369711604531, + "grad_norm": 0.72265625, + "learning_rate": 1.989336718502063e-05, + "loss": 1.3058, + "step": 244 + }, + { + "epoch": 0.07727192414158666, + "grad_norm": 0.71484375, + "learning_rate": 1.98908283084735e-05, + "loss": 1.3404, + "step": 246 + }, + { + "epoch": 0.07790015116712802, + "grad_norm": 0.9453125, + "learning_rate": 1.9888289431926376e-05, + "loss": 1.235, + "step": 248 + }, + { + "epoch": 0.07852837819266938, + "grad_norm": 0.84765625, + "learning_rate": 1.9885750555379248e-05, + "loss": 1.3668, + "step": 250 + }, + { + "epoch": 0.07915660521821073, + "grad_norm": 0.71875, + "learning_rate": 1.988321167883212e-05, + "loss": 1.3602, + "step": 252 + }, + { + "epoch": 0.07978483224375209, + "grad_norm": 0.828125, + "learning_rate": 1.988067280228499e-05, + "loss": 1.3833, + "step": 254 + }, + { + "epoch": 0.08041305926929344, + "grad_norm": 0.796875, + "learning_rate": 1.9878133925737865e-05, + "loss": 1.4476, + "step": 256 + }, + { + "epoch": 0.0810412862948348, + "grad_norm": 0.68359375, + "learning_rate": 1.9875595049190733e-05, + "loss": 1.4111, + "step": 258 + }, + { + "epoch": 0.08166951332037616, + "grad_norm": 0.88671875, + "learning_rate": 1.9873056172643608e-05, + "loss": 1.3636, + "step": 260 + }, + { + "epoch": 0.08229774034591751, + "grad_norm": 0.78515625, + "learning_rate": 1.987051729609648e-05, + "loss": 1.2524, + "step": 262 + }, + { + "epoch": 0.08292596737145887, + "grad_norm": 0.9609375, + "learning_rate": 1.986797841954935e-05, + "loss": 1.4048, + "step": 264 + }, + { + "epoch": 0.08355419439700022, + "grad_norm": 0.7109375, + "learning_rate": 1.9865439543002222e-05, + "loss": 1.3619, + "step": 266 + }, + { + "epoch": 0.08418242142254156, + "grad_norm": 0.796875, + "learning_rate": 1.9862900666455097e-05, + "loss": 1.4125, + "step": 268 + }, + { + "epoch": 0.08481064844808292, + "grad_norm": 0.6875, + "learning_rate": 1.986036178990797e-05, + "loss": 1.4536, + "step": 270 + }, + { + "epoch": 0.08543887547362428, + "grad_norm": 0.8125, + "learning_rate": 1.985782291336084e-05, + "loss": 1.392, + "step": 272 + }, + { + "epoch": 0.08606710249916563, + "grad_norm": 0.77734375, + "learning_rate": 1.985528403681371e-05, + "loss": 1.393, + "step": 274 + }, + { + "epoch": 0.08669532952470699, + "grad_norm": 0.91015625, + "learning_rate": 1.9852745160266586e-05, + "loss": 1.3635, + "step": 276 + }, + { + "epoch": 0.08732355655024834, + "grad_norm": 0.75, + "learning_rate": 1.9850206283719454e-05, + "loss": 1.4626, + "step": 278 + }, + { + "epoch": 0.0879517835757897, + "grad_norm": 0.8671875, + "learning_rate": 1.984766740717233e-05, + "loss": 1.3507, + "step": 280 + }, + { + "epoch": 0.08858001060133106, + "grad_norm": 0.83203125, + "learning_rate": 1.98451285306252e-05, + "loss": 1.4432, + "step": 282 + }, + { + "epoch": 0.08920823762687241, + "grad_norm": 0.83203125, + "learning_rate": 1.984258965407807e-05, + "loss": 1.3932, + "step": 284 + }, + { + "epoch": 0.08983646465241377, + "grad_norm": 0.8203125, + "learning_rate": 1.9840050777530943e-05, + "loss": 1.391, + "step": 286 + }, + { + "epoch": 0.09046469167795512, + "grad_norm": 0.7109375, + "learning_rate": 1.9837511900983818e-05, + "loss": 1.4163, + "step": 288 + }, + { + "epoch": 0.09109291870349648, + "grad_norm": 1.171875, + "learning_rate": 1.983497302443669e-05, + "loss": 1.4135, + "step": 290 + }, + { + "epoch": 0.09172114572903783, + "grad_norm": 0.8515625, + "learning_rate": 1.983243414788956e-05, + "loss": 1.4099, + "step": 292 + }, + { + "epoch": 0.09234937275457919, + "grad_norm": 0.76171875, + "learning_rate": 1.982989527134243e-05, + "loss": 1.2512, + "step": 294 + }, + { + "epoch": 0.09297759978012055, + "grad_norm": 0.734375, + "learning_rate": 1.9827356394795306e-05, + "loss": 1.255, + "step": 296 + }, + { + "epoch": 0.0936058268056619, + "grad_norm": 0.87109375, + "learning_rate": 1.9824817518248174e-05, + "loss": 1.2295, + "step": 298 + }, + { + "epoch": 0.09423405383120326, + "grad_norm": 0.765625, + "learning_rate": 1.982227864170105e-05, + "loss": 1.4514, + "step": 300 + }, + { + "epoch": 0.09486228085674461, + "grad_norm": 0.8828125, + "learning_rate": 1.981973976515392e-05, + "loss": 1.3137, + "step": 302 + }, + { + "epoch": 0.09549050788228595, + "grad_norm": 0.86328125, + "learning_rate": 1.9817200888606792e-05, + "loss": 1.3511, + "step": 304 + }, + { + "epoch": 0.09611873490782731, + "grad_norm": 0.85546875, + "learning_rate": 1.9814662012059663e-05, + "loss": 1.3035, + "step": 306 + }, + { + "epoch": 0.09674696193336867, + "grad_norm": 0.734375, + "learning_rate": 1.9812123135512538e-05, + "loss": 1.4151, + "step": 308 + }, + { + "epoch": 0.09737518895891002, + "grad_norm": 0.79296875, + "learning_rate": 1.980958425896541e-05, + "loss": 1.3819, + "step": 310 + }, + { + "epoch": 0.09800341598445138, + "grad_norm": 0.76953125, + "learning_rate": 1.980704538241828e-05, + "loss": 1.3212, + "step": 312 + }, + { + "epoch": 0.09863164300999273, + "grad_norm": 0.86328125, + "learning_rate": 1.9804506505871152e-05, + "loss": 1.4313, + "step": 314 + }, + { + "epoch": 0.09925987003553409, + "grad_norm": 0.828125, + "learning_rate": 1.9801967629324027e-05, + "loss": 1.4021, + "step": 316 + }, + { + "epoch": 0.09988809706107545, + "grad_norm": 0.7578125, + "learning_rate": 1.97994287527769e-05, + "loss": 1.3191, + "step": 318 + }, + { + "epoch": 0.1005163240866168, + "grad_norm": 0.734375, + "learning_rate": 1.979688987622977e-05, + "loss": 1.3138, + "step": 320 + }, + { + "epoch": 0.10114455111215816, + "grad_norm": 0.78125, + "learning_rate": 1.979435099968264e-05, + "loss": 1.3881, + "step": 322 + }, + { + "epoch": 0.10177277813769951, + "grad_norm": 0.7265625, + "learning_rate": 1.9791812123135513e-05, + "loss": 1.1786, + "step": 324 + }, + { + "epoch": 0.10240100516324087, + "grad_norm": 0.73828125, + "learning_rate": 1.9789273246588384e-05, + "loss": 1.246, + "step": 326 + }, + { + "epoch": 0.10302923218878222, + "grad_norm": 0.8515625, + "learning_rate": 1.978673437004126e-05, + "loss": 1.359, + "step": 328 + }, + { + "epoch": 0.10365745921432358, + "grad_norm": 0.91796875, + "learning_rate": 1.978419549349413e-05, + "loss": 1.271, + "step": 330 + }, + { + "epoch": 0.10428568623986494, + "grad_norm": 0.75, + "learning_rate": 1.9781656616947e-05, + "loss": 1.4137, + "step": 332 + }, + { + "epoch": 0.10491391326540629, + "grad_norm": 0.75390625, + "learning_rate": 1.9779117740399876e-05, + "loss": 1.3471, + "step": 334 + }, + { + "epoch": 0.10554214029094765, + "grad_norm": 0.70703125, + "learning_rate": 1.9776578863852748e-05, + "loss": 1.4421, + "step": 336 + }, + { + "epoch": 0.10617036731648899, + "grad_norm": 0.73828125, + "learning_rate": 1.977403998730562e-05, + "loss": 1.2823, + "step": 338 + }, + { + "epoch": 0.10679859434203035, + "grad_norm": 0.76171875, + "learning_rate": 1.977150111075849e-05, + "loss": 1.463, + "step": 340 + }, + { + "epoch": 0.1074268213675717, + "grad_norm": 0.74609375, + "learning_rate": 1.9768962234211365e-05, + "loss": 1.2987, + "step": 342 + }, + { + "epoch": 0.10805504839311306, + "grad_norm": 0.88671875, + "learning_rate": 1.9766423357664237e-05, + "loss": 1.4113, + "step": 344 + }, + { + "epoch": 0.10868327541865441, + "grad_norm": 0.7578125, + "learning_rate": 1.9763884481117108e-05, + "loss": 1.4153, + "step": 346 + }, + { + "epoch": 0.10931150244419577, + "grad_norm": 0.703125, + "learning_rate": 1.976134560456998e-05, + "loss": 1.3976, + "step": 348 + }, + { + "epoch": 0.10993972946973712, + "grad_norm": 0.78515625, + "learning_rate": 1.975880672802285e-05, + "loss": 1.395, + "step": 350 + }, + { + "epoch": 0.11056795649527848, + "grad_norm": 0.83984375, + "learning_rate": 1.9756267851475722e-05, + "loss": 1.533, + "step": 352 + }, + { + "epoch": 0.11119618352081984, + "grad_norm": 0.796875, + "learning_rate": 1.9753728974928597e-05, + "loss": 1.3265, + "step": 354 + }, + { + "epoch": 0.11182441054636119, + "grad_norm": 0.76171875, + "learning_rate": 1.9751190098381468e-05, + "loss": 1.4088, + "step": 356 + }, + { + "epoch": 0.11245263757190255, + "grad_norm": 0.8671875, + "learning_rate": 1.974865122183434e-05, + "loss": 1.4432, + "step": 358 + }, + { + "epoch": 0.1130808645974439, + "grad_norm": 0.984375, + "learning_rate": 1.974611234528721e-05, + "loss": 1.2292, + "step": 360 + }, + { + "epoch": 0.11370909162298526, + "grad_norm": 0.73828125, + "learning_rate": 1.9743573468740086e-05, + "loss": 1.3708, + "step": 362 + }, + { + "epoch": 0.11433731864852661, + "grad_norm": 0.73046875, + "learning_rate": 1.9741034592192957e-05, + "loss": 1.2918, + "step": 364 + }, + { + "epoch": 0.11496554567406797, + "grad_norm": 0.79296875, + "learning_rate": 1.973849571564583e-05, + "loss": 1.4335, + "step": 366 + }, + { + "epoch": 0.11559377269960933, + "grad_norm": 0.78515625, + "learning_rate": 1.97359568390987e-05, + "loss": 1.2187, + "step": 368 + }, + { + "epoch": 0.11622199972515068, + "grad_norm": 10.625, + "learning_rate": 1.9733417962551575e-05, + "loss": 1.2494, + "step": 370 + }, + { + "epoch": 0.11685022675069204, + "grad_norm": 0.75, + "learning_rate": 1.9730879086004443e-05, + "loss": 1.2348, + "step": 372 + }, + { + "epoch": 0.11747845377623338, + "grad_norm": 0.6953125, + "learning_rate": 1.9728340209457317e-05, + "loss": 1.3933, + "step": 374 + }, + { + "epoch": 0.11810668080177474, + "grad_norm": 0.8515625, + "learning_rate": 1.972580133291019e-05, + "loss": 1.445, + "step": 376 + }, + { + "epoch": 0.11873490782731609, + "grad_norm": 0.8515625, + "learning_rate": 1.972326245636306e-05, + "loss": 1.3521, + "step": 378 + }, + { + "epoch": 0.11936313485285745, + "grad_norm": 0.69921875, + "learning_rate": 1.972072357981593e-05, + "loss": 1.4533, + "step": 380 + }, + { + "epoch": 0.1199913618783988, + "grad_norm": 0.78515625, + "learning_rate": 1.9718184703268806e-05, + "loss": 1.2909, + "step": 382 + }, + { + "epoch": 0.12061958890394016, + "grad_norm": 0.8203125, + "learning_rate": 1.9715645826721678e-05, + "loss": 1.5502, + "step": 384 + }, + { + "epoch": 0.12124781592948151, + "grad_norm": 0.84765625, + "learning_rate": 1.971310695017455e-05, + "loss": 1.3525, + "step": 386 + }, + { + "epoch": 0.12187604295502287, + "grad_norm": 0.796875, + "learning_rate": 1.971056807362742e-05, + "loss": 1.5028, + "step": 388 + }, + { + "epoch": 0.12250426998056423, + "grad_norm": 6.21875, + "learning_rate": 1.9708029197080295e-05, + "loss": 1.3943, + "step": 390 + }, + { + "epoch": 0.12313249700610558, + "grad_norm": 0.765625, + "learning_rate": 1.9705490320533163e-05, + "loss": 1.5042, + "step": 392 + }, + { + "epoch": 0.12376072403164694, + "grad_norm": 0.79296875, + "learning_rate": 1.9702951443986038e-05, + "loss": 1.3527, + "step": 394 + }, + { + "epoch": 0.1243889510571883, + "grad_norm": 0.734375, + "learning_rate": 1.970041256743891e-05, + "loss": 1.5268, + "step": 396 + }, + { + "epoch": 0.12501717808272964, + "grad_norm": 0.77734375, + "learning_rate": 1.969787369089178e-05, + "loss": 1.2923, + "step": 398 + }, + { + "epoch": 0.125645405108271, + "grad_norm": 0.85546875, + "learning_rate": 1.9695334814344652e-05, + "loss": 1.4865, + "step": 400 + }, + { + "epoch": 0.12627363213381235, + "grad_norm": 0.828125, + "learning_rate": 1.9692795937797527e-05, + "loss": 1.2926, + "step": 402 + }, + { + "epoch": 0.1269018591593537, + "grad_norm": 0.796875, + "learning_rate": 1.96902570612504e-05, + "loss": 1.3763, + "step": 404 + }, + { + "epoch": 0.12753008618489506, + "grad_norm": 0.77734375, + "learning_rate": 1.968771818470327e-05, + "loss": 1.4208, + "step": 406 + }, + { + "epoch": 0.12815831321043641, + "grad_norm": 0.8515625, + "learning_rate": 1.968517930815614e-05, + "loss": 1.2802, + "step": 408 + }, + { + "epoch": 0.12878654023597777, + "grad_norm": 0.7578125, + "learning_rate": 1.9682640431609016e-05, + "loss": 1.3137, + "step": 410 + }, + { + "epoch": 0.12941476726151913, + "grad_norm": 0.734375, + "learning_rate": 1.9680101555061887e-05, + "loss": 1.2313, + "step": 412 + }, + { + "epoch": 0.13004299428706048, + "grad_norm": 0.73828125, + "learning_rate": 1.967756267851476e-05, + "loss": 1.3286, + "step": 414 + }, + { + "epoch": 0.13067122131260184, + "grad_norm": 0.86328125, + "learning_rate": 1.967502380196763e-05, + "loss": 1.3544, + "step": 416 + }, + { + "epoch": 0.1312994483381432, + "grad_norm": 0.796875, + "learning_rate": 1.96724849254205e-05, + "loss": 1.4726, + "step": 418 + }, + { + "epoch": 0.13192767536368455, + "grad_norm": 0.71875, + "learning_rate": 1.9669946048873376e-05, + "loss": 1.3215, + "step": 420 + }, + { + "epoch": 0.1325559023892259, + "grad_norm": 0.78515625, + "learning_rate": 1.9667407172326248e-05, + "loss": 1.5521, + "step": 422 + }, + { + "epoch": 0.13318412941476726, + "grad_norm": 0.796875, + "learning_rate": 1.966486829577912e-05, + "loss": 1.3127, + "step": 424 + }, + { + "epoch": 0.13381235644030862, + "grad_norm": 0.8359375, + "learning_rate": 1.966232941923199e-05, + "loss": 1.2696, + "step": 426 + }, + { + "epoch": 0.13444058346584997, + "grad_norm": 0.8046875, + "learning_rate": 1.9659790542684865e-05, + "loss": 1.2138, + "step": 428 + }, + { + "epoch": 0.13506881049139133, + "grad_norm": 0.7890625, + "learning_rate": 1.9657251666137736e-05, + "loss": 1.4204, + "step": 430 + }, + { + "epoch": 0.13569703751693268, + "grad_norm": 0.75390625, + "learning_rate": 1.9654712789590608e-05, + "loss": 1.2865, + "step": 432 + }, + { + "epoch": 0.13632526454247404, + "grad_norm": 0.73828125, + "learning_rate": 1.965217391304348e-05, + "loss": 1.2856, + "step": 434 + }, + { + "epoch": 0.1369534915680154, + "grad_norm": 0.73046875, + "learning_rate": 1.9649635036496354e-05, + "loss": 1.4284, + "step": 436 + }, + { + "epoch": 0.13758171859355675, + "grad_norm": 0.73828125, + "learning_rate": 1.9647096159949225e-05, + "loss": 1.3569, + "step": 438 + }, + { + "epoch": 0.1382099456190981, + "grad_norm": 0.671875, + "learning_rate": 1.9644557283402097e-05, + "loss": 1.3295, + "step": 440 + }, + { + "epoch": 0.13883817264463946, + "grad_norm": 0.7421875, + "learning_rate": 1.9642018406854968e-05, + "loss": 1.2948, + "step": 442 + }, + { + "epoch": 0.13946639967018082, + "grad_norm": 0.79296875, + "learning_rate": 1.963947953030784e-05, + "loss": 1.4097, + "step": 444 + }, + { + "epoch": 0.14009462669572217, + "grad_norm": 1.0703125, + "learning_rate": 1.963694065376071e-05, + "loss": 1.3674, + "step": 446 + }, + { + "epoch": 0.14072285372126353, + "grad_norm": 0.8671875, + "learning_rate": 1.9634401777213586e-05, + "loss": 1.4544, + "step": 448 + }, + { + "epoch": 0.14135108074680489, + "grad_norm": 0.72265625, + "learning_rate": 1.9631862900666457e-05, + "loss": 1.3385, + "step": 450 + }, + { + "epoch": 0.14197930777234624, + "grad_norm": 0.75390625, + "learning_rate": 1.962932402411933e-05, + "loss": 1.3962, + "step": 452 + }, + { + "epoch": 0.1426075347978876, + "grad_norm": 0.80078125, + "learning_rate": 1.96267851475722e-05, + "loss": 1.3257, + "step": 454 + }, + { + "epoch": 0.14323576182342895, + "grad_norm": 0.8359375, + "learning_rate": 1.9624246271025075e-05, + "loss": 1.3572, + "step": 456 + }, + { + "epoch": 0.1438639888489703, + "grad_norm": 0.73046875, + "learning_rate": 1.9621707394477946e-05, + "loss": 1.5115, + "step": 458 + }, + { + "epoch": 0.14449221587451166, + "grad_norm": 0.7578125, + "learning_rate": 1.9619168517930817e-05, + "loss": 1.3532, + "step": 460 + }, + { + "epoch": 0.14512044290005302, + "grad_norm": 0.8046875, + "learning_rate": 1.961662964138369e-05, + "loss": 1.3612, + "step": 462 + }, + { + "epoch": 0.14574866992559435, + "grad_norm": 0.73828125, + "learning_rate": 1.9614090764836564e-05, + "loss": 1.3881, + "step": 464 + }, + { + "epoch": 0.1463768969511357, + "grad_norm": 0.94140625, + "learning_rate": 1.961155188828943e-05, + "loss": 1.534, + "step": 466 + }, + { + "epoch": 0.14700512397667706, + "grad_norm": 0.71484375, + "learning_rate": 1.9609013011742306e-05, + "loss": 1.4607, + "step": 468 + }, + { + "epoch": 0.14763335100221842, + "grad_norm": 0.72265625, + "learning_rate": 1.9606474135195178e-05, + "loss": 1.3466, + "step": 470 + }, + { + "epoch": 0.14826157802775977, + "grad_norm": 0.7109375, + "learning_rate": 1.960393525864805e-05, + "loss": 1.3187, + "step": 472 + }, + { + "epoch": 0.14888980505330113, + "grad_norm": 0.77734375, + "learning_rate": 1.960139638210092e-05, + "loss": 1.36, + "step": 474 + }, + { + "epoch": 0.14951803207884248, + "grad_norm": 0.859375, + "learning_rate": 1.9598857505553795e-05, + "loss": 1.2185, + "step": 476 + }, + { + "epoch": 0.15014625910438384, + "grad_norm": 0.71484375, + "learning_rate": 1.9596318629006667e-05, + "loss": 1.4085, + "step": 478 + }, + { + "epoch": 0.1507744861299252, + "grad_norm": 0.98046875, + "learning_rate": 1.9593779752459538e-05, + "loss": 1.3917, + "step": 480 + }, + { + "epoch": 0.15140271315546655, + "grad_norm": 0.7421875, + "learning_rate": 1.959124087591241e-05, + "loss": 1.3497, + "step": 482 + }, + { + "epoch": 0.1520309401810079, + "grad_norm": 0.76171875, + "learning_rate": 1.9588701999365284e-05, + "loss": 1.3855, + "step": 484 + }, + { + "epoch": 0.15265916720654926, + "grad_norm": 0.81640625, + "learning_rate": 1.9586163122818152e-05, + "loss": 1.4071, + "step": 486 + }, + { + "epoch": 0.15328739423209062, + "grad_norm": 0.80859375, + "learning_rate": 1.9583624246271027e-05, + "loss": 1.2817, + "step": 488 + }, + { + "epoch": 0.15391562125763197, + "grad_norm": 0.75, + "learning_rate": 1.9581085369723898e-05, + "loss": 1.3758, + "step": 490 + }, + { + "epoch": 0.15454384828317333, + "grad_norm": 0.78125, + "learning_rate": 1.957854649317677e-05, + "loss": 1.4021, + "step": 492 + }, + { + "epoch": 0.15517207530871469, + "grad_norm": 0.75390625, + "learning_rate": 1.957600761662964e-05, + "loss": 1.4163, + "step": 494 + }, + { + "epoch": 0.15580030233425604, + "grad_norm": 0.7890625, + "learning_rate": 1.9573468740082516e-05, + "loss": 1.3127, + "step": 496 + }, + { + "epoch": 0.1564285293597974, + "grad_norm": 0.79296875, + "learning_rate": 1.9570929863535387e-05, + "loss": 1.2817, + "step": 498 + }, + { + "epoch": 0.15705675638533875, + "grad_norm": 0.76171875, + "learning_rate": 1.956839098698826e-05, + "loss": 1.3561, + "step": 500 + }, + { + "epoch": 0.1576849834108801, + "grad_norm": 0.76171875, + "learning_rate": 1.956585211044113e-05, + "loss": 1.317, + "step": 502 + }, + { + "epoch": 0.15831321043642146, + "grad_norm": 1.3671875, + "learning_rate": 1.9563313233894005e-05, + "loss": 1.4507, + "step": 504 + }, + { + "epoch": 0.15894143746196282, + "grad_norm": 1.0703125, + "learning_rate": 1.9560774357346876e-05, + "loss": 1.3715, + "step": 506 + }, + { + "epoch": 0.15956966448750418, + "grad_norm": 0.765625, + "learning_rate": 1.9558235480799747e-05, + "loss": 1.3502, + "step": 508 + }, + { + "epoch": 0.16019789151304553, + "grad_norm": 0.72265625, + "learning_rate": 1.9555696604252622e-05, + "loss": 1.4822, + "step": 510 + }, + { + "epoch": 0.1608261185385869, + "grad_norm": 0.75, + "learning_rate": 1.955315772770549e-05, + "loss": 1.2966, + "step": 512 + }, + { + "epoch": 0.16145434556412824, + "grad_norm": 0.80078125, + "learning_rate": 1.9550618851158365e-05, + "loss": 1.2823, + "step": 514 + }, + { + "epoch": 0.1620825725896696, + "grad_norm": 0.87109375, + "learning_rate": 1.9548079974611236e-05, + "loss": 1.3542, + "step": 516 + }, + { + "epoch": 0.16271079961521095, + "grad_norm": 0.72265625, + "learning_rate": 1.9545541098064108e-05, + "loss": 1.2341, + "step": 518 + }, + { + "epoch": 0.1633390266407523, + "grad_norm": 0.90625, + "learning_rate": 1.954300222151698e-05, + "loss": 1.4864, + "step": 520 + }, + { + "epoch": 0.16396725366629367, + "grad_norm": 0.80078125, + "learning_rate": 1.9540463344969854e-05, + "loss": 1.4035, + "step": 522 + }, + { + "epoch": 0.16459548069183502, + "grad_norm": 0.7734375, + "learning_rate": 1.9537924468422725e-05, + "loss": 1.3945, + "step": 524 + }, + { + "epoch": 0.16522370771737638, + "grad_norm": 0.94921875, + "learning_rate": 1.9535385591875597e-05, + "loss": 1.3526, + "step": 526 + }, + { + "epoch": 0.16585193474291773, + "grad_norm": 0.71484375, + "learning_rate": 1.9532846715328468e-05, + "loss": 1.4006, + "step": 528 + }, + { + "epoch": 0.1664801617684591, + "grad_norm": 0.703125, + "learning_rate": 1.9530307838781343e-05, + "loss": 1.3114, + "step": 530 + }, + { + "epoch": 0.16710838879400045, + "grad_norm": 0.75390625, + "learning_rate": 1.9527768962234214e-05, + "loss": 1.4793, + "step": 532 + }, + { + "epoch": 0.16773661581954177, + "grad_norm": 0.7734375, + "learning_rate": 1.9525230085687086e-05, + "loss": 1.3226, + "step": 534 + }, + { + "epoch": 0.16836484284508313, + "grad_norm": 0.90625, + "learning_rate": 1.9522691209139957e-05, + "loss": 1.3562, + "step": 536 + }, + { + "epoch": 0.16899306987062448, + "grad_norm": 0.91796875, + "learning_rate": 1.952015233259283e-05, + "loss": 1.3007, + "step": 538 + }, + { + "epoch": 0.16962129689616584, + "grad_norm": 0.69140625, + "learning_rate": 1.95176134560457e-05, + "loss": 1.4449, + "step": 540 + }, + { + "epoch": 0.1702495239217072, + "grad_norm": 0.74609375, + "learning_rate": 1.9515074579498575e-05, + "loss": 1.3835, + "step": 542 + }, + { + "epoch": 0.17087775094724855, + "grad_norm": 0.74609375, + "learning_rate": 1.9512535702951446e-05, + "loss": 1.4734, + "step": 544 + }, + { + "epoch": 0.1715059779727899, + "grad_norm": 0.7109375, + "learning_rate": 1.9509996826404317e-05, + "loss": 1.4759, + "step": 546 + }, + { + "epoch": 0.17213420499833126, + "grad_norm": 0.6953125, + "learning_rate": 1.950745794985719e-05, + "loss": 1.3622, + "step": 548 + }, + { + "epoch": 0.17276243202387262, + "grad_norm": 0.703125, + "learning_rate": 1.9504919073310063e-05, + "loss": 1.5, + "step": 550 + }, + { + "epoch": 0.17339065904941398, + "grad_norm": 0.765625, + "learning_rate": 1.9502380196762935e-05, + "loss": 1.4584, + "step": 552 + }, + { + "epoch": 0.17401888607495533, + "grad_norm": 0.78125, + "learning_rate": 1.9499841320215806e-05, + "loss": 1.1847, + "step": 554 + }, + { + "epoch": 0.1746471131004967, + "grad_norm": 0.8046875, + "learning_rate": 1.9497302443668678e-05, + "loss": 1.4887, + "step": 556 + }, + { + "epoch": 0.17527534012603804, + "grad_norm": 0.703125, + "learning_rate": 1.9494763567121552e-05, + "loss": 1.3411, + "step": 558 + }, + { + "epoch": 0.1759035671515794, + "grad_norm": 0.8125, + "learning_rate": 1.949222469057442e-05, + "loss": 1.4525, + "step": 560 + }, + { + "epoch": 0.17653179417712075, + "grad_norm": 0.78125, + "learning_rate": 1.9489685814027295e-05, + "loss": 1.2743, + "step": 562 + }, + { + "epoch": 0.1771600212026621, + "grad_norm": 0.77734375, + "learning_rate": 1.9487146937480167e-05, + "loss": 1.385, + "step": 564 + }, + { + "epoch": 0.17778824822820347, + "grad_norm": 0.89453125, + "learning_rate": 1.9484608060933038e-05, + "loss": 1.3988, + "step": 566 + }, + { + "epoch": 0.17841647525374482, + "grad_norm": 0.78125, + "learning_rate": 1.948206918438591e-05, + "loss": 1.2637, + "step": 568 + }, + { + "epoch": 0.17904470227928618, + "grad_norm": 0.73046875, + "learning_rate": 1.9479530307838784e-05, + "loss": 1.3832, + "step": 570 + }, + { + "epoch": 0.17967292930482753, + "grad_norm": 0.83984375, + "learning_rate": 1.9476991431291655e-05, + "loss": 1.367, + "step": 572 + }, + { + "epoch": 0.1803011563303689, + "grad_norm": 0.85546875, + "learning_rate": 1.9474452554744527e-05, + "loss": 1.3174, + "step": 574 + }, + { + "epoch": 0.18092938335591024, + "grad_norm": 0.6875, + "learning_rate": 1.9471913678197398e-05, + "loss": 1.2966, + "step": 576 + }, + { + "epoch": 0.1815576103814516, + "grad_norm": 0.796875, + "learning_rate": 1.9469374801650273e-05, + "loss": 1.4582, + "step": 578 + }, + { + "epoch": 0.18218583740699296, + "grad_norm": 0.69921875, + "learning_rate": 1.946683592510314e-05, + "loss": 1.3229, + "step": 580 + }, + { + "epoch": 0.1828140644325343, + "grad_norm": 0.734375, + "learning_rate": 1.9464297048556016e-05, + "loss": 1.2895, + "step": 582 + }, + { + "epoch": 0.18344229145807567, + "grad_norm": 0.73046875, + "learning_rate": 1.9461758172008887e-05, + "loss": 1.5382, + "step": 584 + }, + { + "epoch": 0.18407051848361702, + "grad_norm": 0.92578125, + "learning_rate": 1.945921929546176e-05, + "loss": 1.4349, + "step": 586 + }, + { + "epoch": 0.18469874550915838, + "grad_norm": 0.828125, + "learning_rate": 1.945668041891463e-05, + "loss": 1.3861, + "step": 588 + }, + { + "epoch": 0.18532697253469974, + "grad_norm": 0.76953125, + "learning_rate": 1.9454141542367505e-05, + "loss": 1.2897, + "step": 590 + }, + { + "epoch": 0.1859551995602411, + "grad_norm": 0.8671875, + "learning_rate": 1.9451602665820376e-05, + "loss": 1.3362, + "step": 592 + }, + { + "epoch": 0.18658342658578245, + "grad_norm": 0.8046875, + "learning_rate": 1.9449063789273247e-05, + "loss": 1.3954, + "step": 594 + }, + { + "epoch": 0.1872116536113238, + "grad_norm": 0.734375, + "learning_rate": 1.9446524912726122e-05, + "loss": 1.3541, + "step": 596 + }, + { + "epoch": 0.18783988063686516, + "grad_norm": 0.80859375, + "learning_rate": 1.9443986036178994e-05, + "loss": 1.4498, + "step": 598 + }, + { + "epoch": 0.18846810766240651, + "grad_norm": 0.83203125, + "learning_rate": 1.9441447159631865e-05, + "loss": 1.3767, + "step": 600 + }, + { + "epoch": 0.18909633468794787, + "grad_norm": 0.9453125, + "learning_rate": 1.9438908283084736e-05, + "loss": 1.245, + "step": 602 + }, + { + "epoch": 0.18972456171348923, + "grad_norm": 0.66015625, + "learning_rate": 1.943636940653761e-05, + "loss": 1.4371, + "step": 604 + }, + { + "epoch": 0.19035278873903055, + "grad_norm": 0.7890625, + "learning_rate": 1.943383052999048e-05, + "loss": 1.3194, + "step": 606 + }, + { + "epoch": 0.1909810157645719, + "grad_norm": 0.7421875, + "learning_rate": 1.9431291653443354e-05, + "loss": 1.3339, + "step": 608 + }, + { + "epoch": 0.19160924279011327, + "grad_norm": 0.7578125, + "learning_rate": 1.9428752776896225e-05, + "loss": 1.3773, + "step": 610 + }, + { + "epoch": 0.19223746981565462, + "grad_norm": 0.7265625, + "learning_rate": 1.9426213900349097e-05, + "loss": 1.3456, + "step": 612 + }, + { + "epoch": 0.19286569684119598, + "grad_norm": 1.015625, + "learning_rate": 1.9423675023801968e-05, + "loss": 1.3713, + "step": 614 + }, + { + "epoch": 0.19349392386673733, + "grad_norm": 0.73828125, + "learning_rate": 1.9421136147254843e-05, + "loss": 1.4541, + "step": 616 + }, + { + "epoch": 0.1941221508922787, + "grad_norm": 0.71484375, + "learning_rate": 1.9418597270707714e-05, + "loss": 1.4132, + "step": 618 + }, + { + "epoch": 0.19475037791782004, + "grad_norm": 0.90234375, + "learning_rate": 1.9416058394160586e-05, + "loss": 1.461, + "step": 620 + }, + { + "epoch": 0.1953786049433614, + "grad_norm": 0.765625, + "learning_rate": 1.9413519517613457e-05, + "loss": 1.2741, + "step": 622 + }, + { + "epoch": 0.19600683196890276, + "grad_norm": 0.75, + "learning_rate": 1.9410980641066332e-05, + "loss": 1.4783, + "step": 624 + }, + { + "epoch": 0.1966350589944441, + "grad_norm": 0.9296875, + "learning_rate": 1.94084417645192e-05, + "loss": 1.3676, + "step": 626 + }, + { + "epoch": 0.19726328601998547, + "grad_norm": 0.83984375, + "learning_rate": 1.9405902887972074e-05, + "loss": 1.2958, + "step": 628 + }, + { + "epoch": 0.19789151304552682, + "grad_norm": 0.8125, + "learning_rate": 1.9403364011424946e-05, + "loss": 1.3159, + "step": 630 + }, + { + "epoch": 0.19851974007106818, + "grad_norm": 0.7578125, + "learning_rate": 1.9400825134877817e-05, + "loss": 1.3835, + "step": 632 + }, + { + "epoch": 0.19914796709660953, + "grad_norm": 0.890625, + "learning_rate": 1.939828625833069e-05, + "loss": 1.2554, + "step": 634 + }, + { + "epoch": 0.1997761941221509, + "grad_norm": 0.671875, + "learning_rate": 1.9395747381783563e-05, + "loss": 1.3815, + "step": 636 + }, + { + "epoch": 0.20040442114769225, + "grad_norm": 0.78125, + "learning_rate": 1.9393208505236435e-05, + "loss": 1.4323, + "step": 638 + }, + { + "epoch": 0.2010326481732336, + "grad_norm": 0.7265625, + "learning_rate": 1.9390669628689306e-05, + "loss": 1.3292, + "step": 640 + }, + { + "epoch": 0.20166087519877496, + "grad_norm": 0.82421875, + "learning_rate": 1.9388130752142178e-05, + "loss": 1.2865, + "step": 642 + }, + { + "epoch": 0.20228910222431631, + "grad_norm": 1.015625, + "learning_rate": 1.9385591875595052e-05, + "loss": 1.3822, + "step": 644 + }, + { + "epoch": 0.20291732924985767, + "grad_norm": 0.75, + "learning_rate": 1.9383052999047924e-05, + "loss": 1.3657, + "step": 646 + }, + { + "epoch": 0.20354555627539903, + "grad_norm": 0.82421875, + "learning_rate": 1.9380514122500795e-05, + "loss": 1.3554, + "step": 648 + }, + { + "epoch": 0.20417378330094038, + "grad_norm": 0.75, + "learning_rate": 1.9377975245953666e-05, + "loss": 1.331, + "step": 650 + }, + { + "epoch": 0.20480201032648174, + "grad_norm": 0.96484375, + "learning_rate": 1.9375436369406538e-05, + "loss": 1.3798, + "step": 652 + }, + { + "epoch": 0.2054302373520231, + "grad_norm": 0.80078125, + "learning_rate": 1.937289749285941e-05, + "loss": 1.4126, + "step": 654 + }, + { + "epoch": 0.20605846437756445, + "grad_norm": 0.7265625, + "learning_rate": 1.9370358616312284e-05, + "loss": 1.5412, + "step": 656 + }, + { + "epoch": 0.2066866914031058, + "grad_norm": 0.6875, + "learning_rate": 1.9367819739765155e-05, + "loss": 1.4367, + "step": 658 + }, + { + "epoch": 0.20731491842864716, + "grad_norm": 0.8828125, + "learning_rate": 1.9365280863218027e-05, + "loss": 1.3944, + "step": 660 + }, + { + "epoch": 0.20794314545418852, + "grad_norm": 0.74609375, + "learning_rate": 1.9362741986670898e-05, + "loss": 1.4311, + "step": 662 + }, + { + "epoch": 0.20857137247972987, + "grad_norm": 0.73046875, + "learning_rate": 1.9360203110123773e-05, + "loss": 1.452, + "step": 664 + }, + { + "epoch": 0.20919959950527123, + "grad_norm": 0.73828125, + "learning_rate": 1.9357664233576644e-05, + "loss": 1.3529, + "step": 666 + }, + { + "epoch": 0.20982782653081258, + "grad_norm": 0.703125, + "learning_rate": 1.9355125357029516e-05, + "loss": 1.3444, + "step": 668 + }, + { + "epoch": 0.21045605355635394, + "grad_norm": 0.70703125, + "learning_rate": 1.9352586480482387e-05, + "loss": 1.352, + "step": 670 + }, + { + "epoch": 0.2110842805818953, + "grad_norm": 0.81640625, + "learning_rate": 1.9350047603935262e-05, + "loss": 1.455, + "step": 672 + }, + { + "epoch": 0.21171250760743665, + "grad_norm": 0.7578125, + "learning_rate": 1.934750872738813e-05, + "loss": 1.2581, + "step": 674 + }, + { + "epoch": 0.21234073463297798, + "grad_norm": 0.8515625, + "learning_rate": 1.9344969850841005e-05, + "loss": 1.3224, + "step": 676 + }, + { + "epoch": 0.21296896165851933, + "grad_norm": 0.6875, + "learning_rate": 1.9342430974293876e-05, + "loss": 1.4604, + "step": 678 + }, + { + "epoch": 0.2135971886840607, + "grad_norm": 0.75, + "learning_rate": 1.9339892097746747e-05, + "loss": 1.2345, + "step": 680 + }, + { + "epoch": 0.21422541570960205, + "grad_norm": 0.70703125, + "learning_rate": 1.9337353221199622e-05, + "loss": 1.4289, + "step": 682 + }, + { + "epoch": 0.2148536427351434, + "grad_norm": 0.875, + "learning_rate": 1.9334814344652494e-05, + "loss": 1.4216, + "step": 684 + }, + { + "epoch": 0.21548186976068476, + "grad_norm": 0.796875, + "learning_rate": 1.9332275468105365e-05, + "loss": 1.4541, + "step": 686 + }, + { + "epoch": 0.2161100967862261, + "grad_norm": 0.83203125, + "learning_rate": 1.9329736591558236e-05, + "loss": 1.3089, + "step": 688 + }, + { + "epoch": 0.21673832381176747, + "grad_norm": 0.8828125, + "learning_rate": 1.932719771501111e-05, + "loss": 1.3822, + "step": 690 + }, + { + "epoch": 0.21736655083730883, + "grad_norm": 0.78125, + "learning_rate": 1.9324658838463982e-05, + "loss": 1.2839, + "step": 692 + }, + { + "epoch": 0.21799477786285018, + "grad_norm": 0.7421875, + "learning_rate": 1.9322119961916854e-05, + "loss": 1.2813, + "step": 694 + }, + { + "epoch": 0.21862300488839154, + "grad_norm": 0.7265625, + "learning_rate": 1.9319581085369725e-05, + "loss": 1.3437, + "step": 696 + }, + { + "epoch": 0.2192512319139329, + "grad_norm": 0.69140625, + "learning_rate": 1.93170422088226e-05, + "loss": 1.3649, + "step": 698 + }, + { + "epoch": 0.21987945893947425, + "grad_norm": 0.70703125, + "learning_rate": 1.9314503332275468e-05, + "loss": 1.3949, + "step": 700 + }, + { + "epoch": 0.2205076859650156, + "grad_norm": 0.9921875, + "learning_rate": 1.9311964455728343e-05, + "loss": 1.3488, + "step": 702 + }, + { + "epoch": 0.22113591299055696, + "grad_norm": 0.8671875, + "learning_rate": 1.9309425579181214e-05, + "loss": 1.268, + "step": 704 + }, + { + "epoch": 0.22176414001609832, + "grad_norm": 0.875, + "learning_rate": 1.9306886702634085e-05, + "loss": 1.3855, + "step": 706 + }, + { + "epoch": 0.22239236704163967, + "grad_norm": 0.765625, + "learning_rate": 1.9304347826086957e-05, + "loss": 1.2877, + "step": 708 + }, + { + "epoch": 0.22302059406718103, + "grad_norm": 0.7734375, + "learning_rate": 1.930180894953983e-05, + "loss": 1.3324, + "step": 710 + }, + { + "epoch": 0.22364882109272238, + "grad_norm": 0.75390625, + "learning_rate": 1.9299270072992703e-05, + "loss": 1.3602, + "step": 712 + }, + { + "epoch": 0.22427704811826374, + "grad_norm": 0.76171875, + "learning_rate": 1.9296731196445574e-05, + "loss": 1.3006, + "step": 714 + }, + { + "epoch": 0.2249052751438051, + "grad_norm": 0.7578125, + "learning_rate": 1.9294192319898446e-05, + "loss": 1.3667, + "step": 716 + }, + { + "epoch": 0.22553350216934645, + "grad_norm": 0.70703125, + "learning_rate": 1.929165344335132e-05, + "loss": 1.3536, + "step": 718 + }, + { + "epoch": 0.2261617291948878, + "grad_norm": 0.73828125, + "learning_rate": 1.928911456680419e-05, + "loss": 1.4343, + "step": 720 + }, + { + "epoch": 0.22678995622042916, + "grad_norm": 0.76171875, + "learning_rate": 1.9286575690257063e-05, + "loss": 1.355, + "step": 722 + }, + { + "epoch": 0.22741818324597052, + "grad_norm": 0.88671875, + "learning_rate": 1.9284036813709935e-05, + "loss": 1.3999, + "step": 724 + }, + { + "epoch": 0.22804641027151187, + "grad_norm": 0.9140625, + "learning_rate": 1.9281497937162806e-05, + "loss": 1.3638, + "step": 726 + }, + { + "epoch": 0.22867463729705323, + "grad_norm": 0.7265625, + "learning_rate": 1.9278959060615677e-05, + "loss": 1.2724, + "step": 728 + }, + { + "epoch": 0.22930286432259459, + "grad_norm": 1.0, + "learning_rate": 1.9276420184068552e-05, + "loss": 1.3783, + "step": 730 + }, + { + "epoch": 0.22993109134813594, + "grad_norm": 0.7578125, + "learning_rate": 1.9273881307521424e-05, + "loss": 1.2429, + "step": 732 + }, + { + "epoch": 0.2305593183736773, + "grad_norm": 0.76171875, + "learning_rate": 1.9271342430974295e-05, + "loss": 1.4618, + "step": 734 + }, + { + "epoch": 0.23118754539921865, + "grad_norm": 0.70703125, + "learning_rate": 1.9268803554427166e-05, + "loss": 1.3145, + "step": 736 + }, + { + "epoch": 0.23181577242476, + "grad_norm": 0.74609375, + "learning_rate": 1.926626467788004e-05, + "loss": 1.3562, + "step": 738 + }, + { + "epoch": 0.23244399945030136, + "grad_norm": 0.7734375, + "learning_rate": 1.9263725801332913e-05, + "loss": 1.3047, + "step": 740 + }, + { + "epoch": 0.23307222647584272, + "grad_norm": 0.765625, + "learning_rate": 1.9261186924785784e-05, + "loss": 1.4534, + "step": 742 + }, + { + "epoch": 0.23370045350138408, + "grad_norm": 0.84375, + "learning_rate": 1.9258648048238655e-05, + "loss": 1.3435, + "step": 744 + }, + { + "epoch": 0.23432868052692543, + "grad_norm": 0.75, + "learning_rate": 1.9256109171691527e-05, + "loss": 1.3951, + "step": 746 + }, + { + "epoch": 0.23495690755246676, + "grad_norm": 0.68359375, + "learning_rate": 1.9253570295144398e-05, + "loss": 1.3799, + "step": 748 + }, + { + "epoch": 0.23558513457800812, + "grad_norm": 0.8046875, + "learning_rate": 1.9251031418597273e-05, + "loss": 1.5794, + "step": 750 + }, + { + "epoch": 0.23621336160354947, + "grad_norm": 0.73828125, + "learning_rate": 1.9248492542050144e-05, + "loss": 1.3543, + "step": 752 + }, + { + "epoch": 0.23684158862909083, + "grad_norm": 0.71484375, + "learning_rate": 1.9245953665503016e-05, + "loss": 1.2956, + "step": 754 + }, + { + "epoch": 0.23746981565463218, + "grad_norm": 0.8359375, + "learning_rate": 1.9243414788955887e-05, + "loss": 1.2537, + "step": 756 + }, + { + "epoch": 0.23809804268017354, + "grad_norm": 0.83203125, + "learning_rate": 1.9240875912408762e-05, + "loss": 1.3696, + "step": 758 + }, + { + "epoch": 0.2387262697057149, + "grad_norm": 0.8359375, + "learning_rate": 1.9238337035861633e-05, + "loss": 1.4097, + "step": 760 + }, + { + "epoch": 0.23935449673125625, + "grad_norm": 1.015625, + "learning_rate": 1.9235798159314505e-05, + "loss": 1.307, + "step": 762 + }, + { + "epoch": 0.2399827237567976, + "grad_norm": 0.91015625, + "learning_rate": 1.923325928276738e-05, + "loss": 1.2294, + "step": 764 + }, + { + "epoch": 0.24061095078233896, + "grad_norm": 0.796875, + "learning_rate": 1.923072040622025e-05, + "loss": 1.3091, + "step": 766 + }, + { + "epoch": 0.24123917780788032, + "grad_norm": 0.859375, + "learning_rate": 1.9228181529673122e-05, + "loss": 1.3432, + "step": 768 + }, + { + "epoch": 0.24186740483342167, + "grad_norm": 0.80078125, + "learning_rate": 1.9225642653125993e-05, + "loss": 1.3201, + "step": 770 + }, + { + "epoch": 0.24249563185896303, + "grad_norm": 0.80078125, + "learning_rate": 1.9223103776578865e-05, + "loss": 1.4521, + "step": 772 + }, + { + "epoch": 0.24312385888450438, + "grad_norm": 0.95703125, + "learning_rate": 1.9220564900031736e-05, + "loss": 1.4254, + "step": 774 + }, + { + "epoch": 0.24375208591004574, + "grad_norm": 0.8125, + "learning_rate": 1.921802602348461e-05, + "loss": 1.3347, + "step": 776 + }, + { + "epoch": 0.2443803129355871, + "grad_norm": 0.7890625, + "learning_rate": 1.9215487146937482e-05, + "loss": 1.2956, + "step": 778 + }, + { + "epoch": 0.24500853996112845, + "grad_norm": 0.7734375, + "learning_rate": 1.9212948270390354e-05, + "loss": 1.3128, + "step": 780 + }, + { + "epoch": 0.2456367669866698, + "grad_norm": 0.84375, + "learning_rate": 1.9210409393843225e-05, + "loss": 1.3065, + "step": 782 + }, + { + "epoch": 0.24626499401221116, + "grad_norm": 1.015625, + "learning_rate": 1.92078705172961e-05, + "loss": 1.2968, + "step": 784 + }, + { + "epoch": 0.24689322103775252, + "grad_norm": 0.87890625, + "learning_rate": 1.920533164074897e-05, + "loss": 1.3041, + "step": 786 + }, + { + "epoch": 0.24752144806329388, + "grad_norm": 0.8046875, + "learning_rate": 1.9202792764201843e-05, + "loss": 1.4266, + "step": 788 + }, + { + "epoch": 0.24814967508883523, + "grad_norm": 0.68359375, + "learning_rate": 1.9200253887654714e-05, + "loss": 1.4958, + "step": 790 + }, + { + "epoch": 0.2487779021143766, + "grad_norm": 0.9375, + "learning_rate": 1.919771501110759e-05, + "loss": 1.4217, + "step": 792 + }, + { + "epoch": 0.24940612913991794, + "grad_norm": 0.70703125, + "learning_rate": 1.9195176134560457e-05, + "loss": 1.3905, + "step": 794 + }, + { + "epoch": 0.25003435616545927, + "grad_norm": 0.8203125, + "learning_rate": 1.919263725801333e-05, + "loss": 1.3715, + "step": 796 + }, + { + "epoch": 0.25066258319100065, + "grad_norm": 0.9296875, + "learning_rate": 1.9190098381466203e-05, + "loss": 1.4086, + "step": 798 + }, + { + "epoch": 0.251290810216542, + "grad_norm": 0.84375, + "learning_rate": 1.9187559504919074e-05, + "loss": 1.4157, + "step": 800 + }, + { + "epoch": 0.25191903724208337, + "grad_norm": 0.7109375, + "learning_rate": 1.9185020628371946e-05, + "loss": 1.2557, + "step": 802 + }, + { + "epoch": 0.2525472642676247, + "grad_norm": 0.7734375, + "learning_rate": 1.918248175182482e-05, + "loss": 1.3713, + "step": 804 + }, + { + "epoch": 0.2531754912931661, + "grad_norm": 0.7265625, + "learning_rate": 1.9179942875277692e-05, + "loss": 1.3549, + "step": 806 + }, + { + "epoch": 0.2538037183187074, + "grad_norm": 0.7734375, + "learning_rate": 1.9177403998730563e-05, + "loss": 1.4184, + "step": 808 + }, + { + "epoch": 0.2544319453442488, + "grad_norm": 0.828125, + "learning_rate": 1.9174865122183435e-05, + "loss": 1.3112, + "step": 810 + }, + { + "epoch": 0.2550601723697901, + "grad_norm": 0.7421875, + "learning_rate": 1.917232624563631e-05, + "loss": 1.3818, + "step": 812 + }, + { + "epoch": 0.2556883993953315, + "grad_norm": 0.796875, + "learning_rate": 1.9169787369089177e-05, + "loss": 1.4245, + "step": 814 + }, + { + "epoch": 0.25631662642087283, + "grad_norm": 0.91015625, + "learning_rate": 1.9167248492542052e-05, + "loss": 1.3986, + "step": 816 + }, + { + "epoch": 0.2569448534464142, + "grad_norm": 0.7421875, + "learning_rate": 1.9164709615994924e-05, + "loss": 1.3054, + "step": 818 + }, + { + "epoch": 0.25757308047195554, + "grad_norm": 0.8046875, + "learning_rate": 1.9162170739447795e-05, + "loss": 1.3303, + "step": 820 + }, + { + "epoch": 0.2582013074974969, + "grad_norm": 0.77734375, + "learning_rate": 1.9159631862900666e-05, + "loss": 1.3877, + "step": 822 + }, + { + "epoch": 0.25882953452303825, + "grad_norm": 0.8359375, + "learning_rate": 1.915709298635354e-05, + "loss": 1.3464, + "step": 824 + }, + { + "epoch": 0.25945776154857964, + "grad_norm": 0.74609375, + "learning_rate": 1.9154554109806412e-05, + "loss": 1.4358, + "step": 826 + }, + { + "epoch": 0.26008598857412096, + "grad_norm": 0.73046875, + "learning_rate": 1.9152015233259284e-05, + "loss": 1.2982, + "step": 828 + }, + { + "epoch": 0.26071421559966235, + "grad_norm": 0.796875, + "learning_rate": 1.9149476356712155e-05, + "loss": 1.398, + "step": 830 + }, + { + "epoch": 0.2613424426252037, + "grad_norm": 0.69921875, + "learning_rate": 1.914693748016503e-05, + "loss": 1.2641, + "step": 832 + }, + { + "epoch": 0.26197066965074506, + "grad_norm": 0.88671875, + "learning_rate": 1.91443986036179e-05, + "loss": 1.3669, + "step": 834 + }, + { + "epoch": 0.2625988966762864, + "grad_norm": 0.796875, + "learning_rate": 1.9141859727070773e-05, + "loss": 1.3182, + "step": 836 + }, + { + "epoch": 0.26322712370182777, + "grad_norm": 0.734375, + "learning_rate": 1.9139320850523644e-05, + "loss": 1.3939, + "step": 838 + }, + { + "epoch": 0.2638553507273691, + "grad_norm": 0.66796875, + "learning_rate": 1.9136781973976516e-05, + "loss": 1.4948, + "step": 840 + }, + { + "epoch": 0.2644835777529105, + "grad_norm": 0.88671875, + "learning_rate": 1.9134243097429387e-05, + "loss": 1.34, + "step": 842 + }, + { + "epoch": 0.2651118047784518, + "grad_norm": 0.890625, + "learning_rate": 1.913170422088226e-05, + "loss": 1.3576, + "step": 844 + }, + { + "epoch": 0.2657400318039932, + "grad_norm": 0.71875, + "learning_rate": 1.9129165344335133e-05, + "loss": 1.3366, + "step": 846 + }, + { + "epoch": 0.2663682588295345, + "grad_norm": 0.8359375, + "learning_rate": 1.9126626467788004e-05, + "loss": 1.4665, + "step": 848 + }, + { + "epoch": 0.2669964858550759, + "grad_norm": 0.69140625, + "learning_rate": 1.912408759124088e-05, + "loss": 1.4036, + "step": 850 + }, + { + "epoch": 0.26762471288061723, + "grad_norm": 0.73046875, + "learning_rate": 1.912154871469375e-05, + "loss": 1.2714, + "step": 852 + }, + { + "epoch": 0.26825293990615856, + "grad_norm": 0.71875, + "learning_rate": 1.9119009838146622e-05, + "loss": 1.3858, + "step": 854 + }, + { + "epoch": 0.26888116693169994, + "grad_norm": 0.734375, + "learning_rate": 1.9116470961599493e-05, + "loss": 1.4882, + "step": 856 + }, + { + "epoch": 0.26950939395724127, + "grad_norm": 0.77734375, + "learning_rate": 1.9113932085052368e-05, + "loss": 1.2592, + "step": 858 + }, + { + "epoch": 0.27013762098278266, + "grad_norm": 0.75, + "learning_rate": 1.911139320850524e-05, + "loss": 1.4349, + "step": 860 + }, + { + "epoch": 0.270765848008324, + "grad_norm": 0.921875, + "learning_rate": 1.910885433195811e-05, + "loss": 1.2003, + "step": 862 + }, + { + "epoch": 0.27139407503386537, + "grad_norm": 0.703125, + "learning_rate": 1.9106315455410982e-05, + "loss": 1.4485, + "step": 864 + }, + { + "epoch": 0.2720223020594067, + "grad_norm": 0.78125, + "learning_rate": 1.9103776578863854e-05, + "loss": 1.2389, + "step": 866 + }, + { + "epoch": 0.2726505290849481, + "grad_norm": 0.75, + "learning_rate": 1.9101237702316725e-05, + "loss": 1.4348, + "step": 868 + }, + { + "epoch": 0.2732787561104894, + "grad_norm": 0.78125, + "learning_rate": 1.90986988257696e-05, + "loss": 1.4559, + "step": 870 + }, + { + "epoch": 0.2739069831360308, + "grad_norm": 0.796875, + "learning_rate": 1.909615994922247e-05, + "loss": 1.4004, + "step": 872 + }, + { + "epoch": 0.2745352101615721, + "grad_norm": 0.8046875, + "learning_rate": 1.9093621072675343e-05, + "loss": 1.3105, + "step": 874 + }, + { + "epoch": 0.2751634371871135, + "grad_norm": 0.78125, + "learning_rate": 1.9091082196128214e-05, + "loss": 1.2796, + "step": 876 + }, + { + "epoch": 0.27579166421265483, + "grad_norm": 0.74609375, + "learning_rate": 1.908854331958109e-05, + "loss": 1.4628, + "step": 878 + }, + { + "epoch": 0.2764198912381962, + "grad_norm": 0.71484375, + "learning_rate": 1.908600444303396e-05, + "loss": 1.3618, + "step": 880 + }, + { + "epoch": 0.27704811826373754, + "grad_norm": 0.73828125, + "learning_rate": 1.908346556648683e-05, + "loss": 1.3635, + "step": 882 + }, + { + "epoch": 0.2776763452892789, + "grad_norm": 0.69921875, + "learning_rate": 1.9080926689939703e-05, + "loss": 1.3921, + "step": 884 + }, + { + "epoch": 0.27830457231482025, + "grad_norm": 0.70703125, + "learning_rate": 1.9078387813392578e-05, + "loss": 1.3431, + "step": 886 + }, + { + "epoch": 0.27893279934036164, + "grad_norm": 0.796875, + "learning_rate": 1.9075848936845446e-05, + "loss": 1.3725, + "step": 888 + }, + { + "epoch": 0.27956102636590296, + "grad_norm": 0.6640625, + "learning_rate": 1.907331006029832e-05, + "loss": 1.2754, + "step": 890 + }, + { + "epoch": 0.28018925339144435, + "grad_norm": 0.99609375, + "learning_rate": 1.9070771183751192e-05, + "loss": 1.1762, + "step": 892 + }, + { + "epoch": 0.2808174804169857, + "grad_norm": 0.80859375, + "learning_rate": 1.9068232307204063e-05, + "loss": 1.301, + "step": 894 + }, + { + "epoch": 0.28144570744252706, + "grad_norm": 0.68359375, + "learning_rate": 1.9065693430656935e-05, + "loss": 1.2999, + "step": 896 + }, + { + "epoch": 0.2820739344680684, + "grad_norm": 0.76171875, + "learning_rate": 1.906315455410981e-05, + "loss": 1.355, + "step": 898 + }, + { + "epoch": 0.28270216149360977, + "grad_norm": 0.71875, + "learning_rate": 1.906061567756268e-05, + "loss": 1.4332, + "step": 900 + }, + { + "epoch": 0.2833303885191511, + "grad_norm": 0.96875, + "learning_rate": 1.9058076801015552e-05, + "loss": 1.4116, + "step": 902 + }, + { + "epoch": 0.2839586155446925, + "grad_norm": 0.8203125, + "learning_rate": 1.9055537924468423e-05, + "loss": 1.3064, + "step": 904 + }, + { + "epoch": 0.2845868425702338, + "grad_norm": 0.81640625, + "learning_rate": 1.9052999047921298e-05, + "loss": 1.5111, + "step": 906 + }, + { + "epoch": 0.2852150695957752, + "grad_norm": 0.8125, + "learning_rate": 1.9050460171374166e-05, + "loss": 1.2457, + "step": 908 + }, + { + "epoch": 0.2858432966213165, + "grad_norm": 0.78125, + "learning_rate": 1.904792129482704e-05, + "loss": 1.346, + "step": 910 + }, + { + "epoch": 0.2864715236468579, + "grad_norm": 0.75390625, + "learning_rate": 1.9045382418279912e-05, + "loss": 1.3722, + "step": 912 + }, + { + "epoch": 0.28709975067239923, + "grad_norm": 0.8203125, + "learning_rate": 1.9042843541732784e-05, + "loss": 1.3245, + "step": 914 + }, + { + "epoch": 0.2877279776979406, + "grad_norm": 0.87109375, + "learning_rate": 1.9040304665185655e-05, + "loss": 1.42, + "step": 916 + }, + { + "epoch": 0.28835620472348195, + "grad_norm": 0.83984375, + "learning_rate": 1.903776578863853e-05, + "loss": 1.405, + "step": 918 + }, + { + "epoch": 0.28898443174902333, + "grad_norm": 0.703125, + "learning_rate": 1.90352269120914e-05, + "loss": 1.3066, + "step": 920 + }, + { + "epoch": 0.28961265877456466, + "grad_norm": 0.8046875, + "learning_rate": 1.9032688035544273e-05, + "loss": 1.3226, + "step": 922 + }, + { + "epoch": 0.29024088580010604, + "grad_norm": 0.875, + "learning_rate": 1.9030149158997144e-05, + "loss": 1.1937, + "step": 924 + }, + { + "epoch": 0.29086911282564737, + "grad_norm": 0.78125, + "learning_rate": 1.902761028245002e-05, + "loss": 1.3474, + "step": 926 + }, + { + "epoch": 0.2914973398511887, + "grad_norm": 0.76171875, + "learning_rate": 1.9025071405902887e-05, + "loss": 1.3306, + "step": 928 + }, + { + "epoch": 0.2921255668767301, + "grad_norm": 0.89453125, + "learning_rate": 1.902253252935576e-05, + "loss": 1.3498, + "step": 930 + }, + { + "epoch": 0.2927537939022714, + "grad_norm": 0.92578125, + "learning_rate": 1.9019993652808633e-05, + "loss": 1.4435, + "step": 932 + }, + { + "epoch": 0.2933820209278128, + "grad_norm": 0.75, + "learning_rate": 1.9017454776261504e-05, + "loss": 1.3682, + "step": 934 + }, + { + "epoch": 0.2940102479533541, + "grad_norm": 0.8203125, + "learning_rate": 1.901491589971438e-05, + "loss": 1.3391, + "step": 936 + }, + { + "epoch": 0.2946384749788955, + "grad_norm": 0.7109375, + "learning_rate": 1.901237702316725e-05, + "loss": 1.5098, + "step": 938 + }, + { + "epoch": 0.29526670200443683, + "grad_norm": 0.7109375, + "learning_rate": 1.9009838146620122e-05, + "loss": 1.4432, + "step": 940 + }, + { + "epoch": 0.2958949290299782, + "grad_norm": 0.703125, + "learning_rate": 1.9007299270072993e-05, + "loss": 1.3789, + "step": 942 + }, + { + "epoch": 0.29652315605551954, + "grad_norm": 0.703125, + "learning_rate": 1.9004760393525868e-05, + "loss": 1.2943, + "step": 944 + }, + { + "epoch": 0.2971513830810609, + "grad_norm": 0.8671875, + "learning_rate": 1.900222151697874e-05, + "loss": 1.3753, + "step": 946 + }, + { + "epoch": 0.29777961010660225, + "grad_norm": 0.70703125, + "learning_rate": 1.899968264043161e-05, + "loss": 1.3704, + "step": 948 + }, + { + "epoch": 0.29840783713214364, + "grad_norm": 0.78515625, + "learning_rate": 1.8997143763884482e-05, + "loss": 1.4176, + "step": 950 + }, + { + "epoch": 0.29903606415768497, + "grad_norm": 0.7890625, + "learning_rate": 1.8994604887337357e-05, + "loss": 1.2448, + "step": 952 + }, + { + "epoch": 0.29966429118322635, + "grad_norm": 0.7421875, + "learning_rate": 1.8992066010790225e-05, + "loss": 1.2357, + "step": 954 + }, + { + "epoch": 0.3002925182087677, + "grad_norm": 0.72265625, + "learning_rate": 1.89895271342431e-05, + "loss": 1.4002, + "step": 956 + }, + { + "epoch": 0.30092074523430906, + "grad_norm": 0.796875, + "learning_rate": 1.898698825769597e-05, + "loss": 1.3756, + "step": 958 + }, + { + "epoch": 0.3015489722598504, + "grad_norm": 0.75390625, + "learning_rate": 1.8984449381148842e-05, + "loss": 1.2851, + "step": 960 + }, + { + "epoch": 0.3021771992853918, + "grad_norm": 0.79296875, + "learning_rate": 1.8981910504601714e-05, + "loss": 1.3339, + "step": 962 + }, + { + "epoch": 0.3028054263109331, + "grad_norm": 0.6953125, + "learning_rate": 1.897937162805459e-05, + "loss": 1.4284, + "step": 964 + }, + { + "epoch": 0.3034336533364745, + "grad_norm": 0.83203125, + "learning_rate": 1.897683275150746e-05, + "loss": 1.3142, + "step": 966 + }, + { + "epoch": 0.3040618803620158, + "grad_norm": 0.76953125, + "learning_rate": 1.897429387496033e-05, + "loss": 1.4217, + "step": 968 + }, + { + "epoch": 0.3046901073875572, + "grad_norm": 0.7890625, + "learning_rate": 1.8971754998413203e-05, + "loss": 1.4308, + "step": 970 + }, + { + "epoch": 0.3053183344130985, + "grad_norm": 0.75390625, + "learning_rate": 1.8969216121866078e-05, + "loss": 1.2463, + "step": 972 + }, + { + "epoch": 0.3059465614386399, + "grad_norm": 0.72265625, + "learning_rate": 1.896667724531895e-05, + "loss": 1.3149, + "step": 974 + }, + { + "epoch": 0.30657478846418124, + "grad_norm": 0.91796875, + "learning_rate": 1.896413836877182e-05, + "loss": 1.3623, + "step": 976 + }, + { + "epoch": 0.3072030154897226, + "grad_norm": 0.69921875, + "learning_rate": 1.8961599492224692e-05, + "loss": 1.5585, + "step": 978 + }, + { + "epoch": 0.30783124251526395, + "grad_norm": 0.71875, + "learning_rate": 1.8959060615677563e-05, + "loss": 1.2155, + "step": 980 + }, + { + "epoch": 0.30845946954080533, + "grad_norm": 0.703125, + "learning_rate": 1.8956521739130434e-05, + "loss": 1.384, + "step": 982 + }, + { + "epoch": 0.30908769656634666, + "grad_norm": 0.890625, + "learning_rate": 1.895398286258331e-05, + "loss": 1.2724, + "step": 984 + }, + { + "epoch": 0.30971592359188804, + "grad_norm": 0.75, + "learning_rate": 1.895144398603618e-05, + "loss": 1.3504, + "step": 986 + }, + { + "epoch": 0.31034415061742937, + "grad_norm": 0.7578125, + "learning_rate": 1.8948905109489052e-05, + "loss": 1.3144, + "step": 988 + }, + { + "epoch": 0.31097237764297075, + "grad_norm": 0.71484375, + "learning_rate": 1.8946366232941923e-05, + "loss": 1.3399, + "step": 990 + }, + { + "epoch": 0.3116006046685121, + "grad_norm": 0.796875, + "learning_rate": 1.8943827356394798e-05, + "loss": 1.3355, + "step": 992 + }, + { + "epoch": 0.31222883169405347, + "grad_norm": 0.78125, + "learning_rate": 1.894128847984767e-05, + "loss": 1.2823, + "step": 994 + }, + { + "epoch": 0.3128570587195948, + "grad_norm": 0.78125, + "learning_rate": 1.893874960330054e-05, + "loss": 1.4969, + "step": 996 + }, + { + "epoch": 0.3134852857451361, + "grad_norm": 0.7890625, + "learning_rate": 1.8936210726753412e-05, + "loss": 1.3046, + "step": 998 + }, + { + "epoch": 0.3141135127706775, + "grad_norm": 0.7109375, + "learning_rate": 1.8933671850206287e-05, + "loss": 1.4317, + "step": 1000 + }, + { + "epoch": 0.31474173979621883, + "grad_norm": 0.71875, + "learning_rate": 1.8931132973659155e-05, + "loss": 1.3786, + "step": 1002 + }, + { + "epoch": 0.3153699668217602, + "grad_norm": 0.73046875, + "learning_rate": 1.892859409711203e-05, + "loss": 1.3259, + "step": 1004 + }, + { + "epoch": 0.31599819384730155, + "grad_norm": 0.72265625, + "learning_rate": 1.89260552205649e-05, + "loss": 1.3619, + "step": 1006 + }, + { + "epoch": 0.31662642087284293, + "grad_norm": 0.7578125, + "learning_rate": 1.8923516344017773e-05, + "loss": 1.4299, + "step": 1008 + }, + { + "epoch": 0.31725464789838426, + "grad_norm": 0.78515625, + "learning_rate": 1.8920977467470644e-05, + "loss": 1.389, + "step": 1010 + }, + { + "epoch": 0.31788287492392564, + "grad_norm": 0.8046875, + "learning_rate": 1.891843859092352e-05, + "loss": 1.3459, + "step": 1012 + }, + { + "epoch": 0.31851110194946697, + "grad_norm": 0.765625, + "learning_rate": 1.891589971437639e-05, + "loss": 1.4309, + "step": 1014 + }, + { + "epoch": 0.31913932897500835, + "grad_norm": 0.76953125, + "learning_rate": 1.891336083782926e-05, + "loss": 1.3712, + "step": 1016 + }, + { + "epoch": 0.3197675560005497, + "grad_norm": 0.80859375, + "learning_rate": 1.8910821961282133e-05, + "loss": 1.3044, + "step": 1018 + }, + { + "epoch": 0.32039578302609106, + "grad_norm": 0.6796875, + "learning_rate": 1.8908283084735008e-05, + "loss": 1.3589, + "step": 1020 + }, + { + "epoch": 0.3210240100516324, + "grad_norm": 0.69140625, + "learning_rate": 1.890574420818788e-05, + "loss": 1.2593, + "step": 1022 + }, + { + "epoch": 0.3216522370771738, + "grad_norm": 0.87109375, + "learning_rate": 1.890320533164075e-05, + "loss": 1.3657, + "step": 1024 + }, + { + "epoch": 0.3222804641027151, + "grad_norm": 0.6796875, + "learning_rate": 1.8900666455093625e-05, + "loss": 1.2129, + "step": 1026 + }, + { + "epoch": 0.3229086911282565, + "grad_norm": 0.71875, + "learning_rate": 1.8898127578546493e-05, + "loss": 1.09, + "step": 1028 + }, + { + "epoch": 0.3235369181537978, + "grad_norm": 0.8671875, + "learning_rate": 1.8895588701999368e-05, + "loss": 1.3569, + "step": 1030 + }, + { + "epoch": 0.3241651451793392, + "grad_norm": 0.78515625, + "learning_rate": 1.889304982545224e-05, + "loss": 1.4419, + "step": 1032 + }, + { + "epoch": 0.3247933722048805, + "grad_norm": 0.7578125, + "learning_rate": 1.889051094890511e-05, + "loss": 1.3802, + "step": 1034 + }, + { + "epoch": 0.3254215992304219, + "grad_norm": 0.75390625, + "learning_rate": 1.8887972072357982e-05, + "loss": 1.312, + "step": 1036 + }, + { + "epoch": 0.32604982625596324, + "grad_norm": 0.74609375, + "learning_rate": 1.8885433195810857e-05, + "loss": 1.4378, + "step": 1038 + }, + { + "epoch": 0.3266780532815046, + "grad_norm": 0.83203125, + "learning_rate": 1.8882894319263728e-05, + "loss": 1.2541, + "step": 1040 + }, + { + "epoch": 0.32730628030704595, + "grad_norm": 0.7421875, + "learning_rate": 1.88803554427166e-05, + "loss": 1.3656, + "step": 1042 + }, + { + "epoch": 0.32793450733258733, + "grad_norm": 0.7578125, + "learning_rate": 1.887781656616947e-05, + "loss": 1.4039, + "step": 1044 + }, + { + "epoch": 0.32856273435812866, + "grad_norm": 0.72265625, + "learning_rate": 1.8875277689622346e-05, + "loss": 1.3563, + "step": 1046 + }, + { + "epoch": 0.32919096138367004, + "grad_norm": 0.88671875, + "learning_rate": 1.8872738813075214e-05, + "loss": 1.285, + "step": 1048 + }, + { + "epoch": 0.3298191884092114, + "grad_norm": 0.84375, + "learning_rate": 1.887019993652809e-05, + "loss": 1.2465, + "step": 1050 + }, + { + "epoch": 0.33044741543475276, + "grad_norm": 0.92578125, + "learning_rate": 1.886766105998096e-05, + "loss": 1.2184, + "step": 1052 + }, + { + "epoch": 0.3310756424602941, + "grad_norm": 0.69921875, + "learning_rate": 1.886512218343383e-05, + "loss": 1.3098, + "step": 1054 + }, + { + "epoch": 0.33170386948583547, + "grad_norm": 0.76171875, + "learning_rate": 1.8862583306886703e-05, + "loss": 1.318, + "step": 1056 + }, + { + "epoch": 0.3323320965113768, + "grad_norm": 0.91015625, + "learning_rate": 1.8860044430339577e-05, + "loss": 1.2984, + "step": 1058 + }, + { + "epoch": 0.3329603235369182, + "grad_norm": 0.78515625, + "learning_rate": 1.885750555379245e-05, + "loss": 1.4075, + "step": 1060 + }, + { + "epoch": 0.3335885505624595, + "grad_norm": 0.94140625, + "learning_rate": 1.885496667724532e-05, + "loss": 1.354, + "step": 1062 + }, + { + "epoch": 0.3342167775880009, + "grad_norm": 0.74609375, + "learning_rate": 1.885242780069819e-05, + "loss": 1.2434, + "step": 1064 + }, + { + "epoch": 0.3348450046135422, + "grad_norm": 0.8359375, + "learning_rate": 1.8849888924151066e-05, + "loss": 1.4308, + "step": 1066 + }, + { + "epoch": 0.33547323163908355, + "grad_norm": 0.8984375, + "learning_rate": 1.8847350047603938e-05, + "loss": 1.2561, + "step": 1068 + }, + { + "epoch": 0.33610145866462493, + "grad_norm": 0.875, + "learning_rate": 1.884481117105681e-05, + "loss": 1.4753, + "step": 1070 + }, + { + "epoch": 0.33672968569016626, + "grad_norm": 0.69921875, + "learning_rate": 1.884227229450968e-05, + "loss": 1.369, + "step": 1072 + }, + { + "epoch": 0.33735791271570764, + "grad_norm": 0.76171875, + "learning_rate": 1.8839733417962552e-05, + "loss": 1.4776, + "step": 1074 + }, + { + "epoch": 0.33798613974124897, + "grad_norm": 0.73046875, + "learning_rate": 1.8837194541415423e-05, + "loss": 1.3619, + "step": 1076 + }, + { + "epoch": 0.33861436676679035, + "grad_norm": 0.77734375, + "learning_rate": 1.8834655664868298e-05, + "loss": 1.2684, + "step": 1078 + }, + { + "epoch": 0.3392425937923317, + "grad_norm": 0.7421875, + "learning_rate": 1.883211678832117e-05, + "loss": 1.4172, + "step": 1080 + }, + { + "epoch": 0.33987082081787306, + "grad_norm": 0.890625, + "learning_rate": 1.882957791177404e-05, + "loss": 1.501, + "step": 1082 + }, + { + "epoch": 0.3404990478434144, + "grad_norm": 0.82421875, + "learning_rate": 1.8827039035226912e-05, + "loss": 1.4823, + "step": 1084 + }, + { + "epoch": 0.3411272748689558, + "grad_norm": 0.8828125, + "learning_rate": 1.8824500158679787e-05, + "loss": 1.2784, + "step": 1086 + }, + { + "epoch": 0.3417555018944971, + "grad_norm": 0.76171875, + "learning_rate": 1.882196128213266e-05, + "loss": 1.359, + "step": 1088 + }, + { + "epoch": 0.3423837289200385, + "grad_norm": 0.79296875, + "learning_rate": 1.881942240558553e-05, + "loss": 1.2725, + "step": 1090 + }, + { + "epoch": 0.3430119559455798, + "grad_norm": 0.76171875, + "learning_rate": 1.88168835290384e-05, + "loss": 1.2185, + "step": 1092 + }, + { + "epoch": 0.3436401829711212, + "grad_norm": 0.703125, + "learning_rate": 1.8814344652491276e-05, + "loss": 1.3709, + "step": 1094 + }, + { + "epoch": 0.3442684099966625, + "grad_norm": 0.79296875, + "learning_rate": 1.8811805775944144e-05, + "loss": 1.4139, + "step": 1096 + }, + { + "epoch": 0.3448966370222039, + "grad_norm": 0.69921875, + "learning_rate": 1.880926689939702e-05, + "loss": 1.5253, + "step": 1098 + }, + { + "epoch": 0.34552486404774524, + "grad_norm": 0.72265625, + "learning_rate": 1.880672802284989e-05, + "loss": 1.2929, + "step": 1100 + }, + { + "epoch": 0.3461530910732866, + "grad_norm": 0.90625, + "learning_rate": 1.880418914630276e-05, + "loss": 1.3314, + "step": 1102 + }, + { + "epoch": 0.34678131809882795, + "grad_norm": 0.70703125, + "learning_rate": 1.8801650269755633e-05, + "loss": 1.1409, + "step": 1104 + }, + { + "epoch": 0.34740954512436933, + "grad_norm": 0.765625, + "learning_rate": 1.8799111393208508e-05, + "loss": 1.4453, + "step": 1106 + }, + { + "epoch": 0.34803777214991066, + "grad_norm": 0.671875, + "learning_rate": 1.879657251666138e-05, + "loss": 1.3495, + "step": 1108 + }, + { + "epoch": 0.34866599917545205, + "grad_norm": 0.77734375, + "learning_rate": 1.879403364011425e-05, + "loss": 1.3406, + "step": 1110 + }, + { + "epoch": 0.3492942262009934, + "grad_norm": 0.85546875, + "learning_rate": 1.8791494763567125e-05, + "loss": 1.2358, + "step": 1112 + }, + { + "epoch": 0.34992245322653476, + "grad_norm": 0.83984375, + "learning_rate": 1.8788955887019997e-05, + "loss": 1.3972, + "step": 1114 + }, + { + "epoch": 0.3505506802520761, + "grad_norm": 0.72265625, + "learning_rate": 1.8786417010472868e-05, + "loss": 1.3597, + "step": 1116 + }, + { + "epoch": 0.35117890727761747, + "grad_norm": 0.66015625, + "learning_rate": 1.878387813392574e-05, + "loss": 1.3003, + "step": 1118 + }, + { + "epoch": 0.3518071343031588, + "grad_norm": 0.86328125, + "learning_rate": 1.8781339257378614e-05, + "loss": 1.2663, + "step": 1120 + }, + { + "epoch": 0.3524353613287002, + "grad_norm": 0.73828125, + "learning_rate": 1.8778800380831482e-05, + "loss": 1.4089, + "step": 1122 + }, + { + "epoch": 0.3530635883542415, + "grad_norm": 0.828125, + "learning_rate": 1.8776261504284357e-05, + "loss": 1.3793, + "step": 1124 + }, + { + "epoch": 0.3536918153797829, + "grad_norm": 0.796875, + "learning_rate": 1.8773722627737228e-05, + "loss": 1.4041, + "step": 1126 + }, + { + "epoch": 0.3543200424053242, + "grad_norm": 0.8046875, + "learning_rate": 1.87711837511901e-05, + "loss": 1.252, + "step": 1128 + }, + { + "epoch": 0.3549482694308656, + "grad_norm": 0.76953125, + "learning_rate": 1.876864487464297e-05, + "loss": 1.3771, + "step": 1130 + }, + { + "epoch": 0.35557649645640693, + "grad_norm": 0.86328125, + "learning_rate": 1.8766105998095846e-05, + "loss": 1.2952, + "step": 1132 + }, + { + "epoch": 0.3562047234819483, + "grad_norm": 0.7734375, + "learning_rate": 1.8763567121548717e-05, + "loss": 1.2377, + "step": 1134 + }, + { + "epoch": 0.35683295050748964, + "grad_norm": 0.78125, + "learning_rate": 1.876102824500159e-05, + "loss": 1.429, + "step": 1136 + }, + { + "epoch": 0.35746117753303097, + "grad_norm": 0.7734375, + "learning_rate": 1.875848936845446e-05, + "loss": 1.3617, + "step": 1138 + }, + { + "epoch": 0.35808940455857236, + "grad_norm": 0.7109375, + "learning_rate": 1.8755950491907335e-05, + "loss": 1.4136, + "step": 1140 + }, + { + "epoch": 0.3587176315841137, + "grad_norm": 0.80859375, + "learning_rate": 1.8753411615360203e-05, + "loss": 1.2859, + "step": 1142 + }, + { + "epoch": 0.35934585860965507, + "grad_norm": 0.6796875, + "learning_rate": 1.8750872738813077e-05, + "loss": 1.2145, + "step": 1144 + }, + { + "epoch": 0.3599740856351964, + "grad_norm": 0.70703125, + "learning_rate": 1.874833386226595e-05, + "loss": 1.294, + "step": 1146 + }, + { + "epoch": 0.3606023126607378, + "grad_norm": 0.8203125, + "learning_rate": 1.874579498571882e-05, + "loss": 1.1749, + "step": 1148 + }, + { + "epoch": 0.3612305396862791, + "grad_norm": 0.75, + "learning_rate": 1.874325610917169e-05, + "loss": 1.2759, + "step": 1150 + }, + { + "epoch": 0.3618587667118205, + "grad_norm": 0.76953125, + "learning_rate": 1.8740717232624566e-05, + "loss": 1.2798, + "step": 1152 + }, + { + "epoch": 0.3624869937373618, + "grad_norm": 0.83203125, + "learning_rate": 1.8738178356077438e-05, + "loss": 1.3493, + "step": 1154 + }, + { + "epoch": 0.3631152207629032, + "grad_norm": 0.76953125, + "learning_rate": 1.873563947953031e-05, + "loss": 1.4311, + "step": 1156 + }, + { + "epoch": 0.36374344778844453, + "grad_norm": 0.765625, + "learning_rate": 1.873310060298318e-05, + "loss": 1.2613, + "step": 1158 + }, + { + "epoch": 0.3643716748139859, + "grad_norm": 1.0078125, + "learning_rate": 1.8730561726436055e-05, + "loss": 1.3474, + "step": 1160 + }, + { + "epoch": 0.36499990183952724, + "grad_norm": 0.7109375, + "learning_rate": 1.8728022849888923e-05, + "loss": 1.4257, + "step": 1162 + }, + { + "epoch": 0.3656281288650686, + "grad_norm": 0.78125, + "learning_rate": 1.8725483973341798e-05, + "loss": 1.3411, + "step": 1164 + }, + { + "epoch": 0.36625635589060995, + "grad_norm": 0.734375, + "learning_rate": 1.872294509679467e-05, + "loss": 1.3309, + "step": 1166 + }, + { + "epoch": 0.36688458291615134, + "grad_norm": 0.8984375, + "learning_rate": 1.872040622024754e-05, + "loss": 1.4467, + "step": 1168 + }, + { + "epoch": 0.36751280994169266, + "grad_norm": 0.8515625, + "learning_rate": 1.8717867343700412e-05, + "loss": 1.2754, + "step": 1170 + }, + { + "epoch": 0.36814103696723405, + "grad_norm": 0.7890625, + "learning_rate": 1.8715328467153287e-05, + "loss": 1.4556, + "step": 1172 + }, + { + "epoch": 0.3687692639927754, + "grad_norm": 0.84375, + "learning_rate": 1.871278959060616e-05, + "loss": 1.3598, + "step": 1174 + }, + { + "epoch": 0.36939749101831676, + "grad_norm": 0.6875, + "learning_rate": 1.871025071405903e-05, + "loss": 1.2428, + "step": 1176 + }, + { + "epoch": 0.3700257180438581, + "grad_norm": 0.8046875, + "learning_rate": 1.87077118375119e-05, + "loss": 1.3761, + "step": 1178 + }, + { + "epoch": 0.37065394506939947, + "grad_norm": 0.78515625, + "learning_rate": 1.8705172960964776e-05, + "loss": 1.3929, + "step": 1180 + }, + { + "epoch": 0.3712821720949408, + "grad_norm": 0.8671875, + "learning_rate": 1.8702634084417647e-05, + "loss": 1.2633, + "step": 1182 + }, + { + "epoch": 0.3719103991204822, + "grad_norm": 0.828125, + "learning_rate": 1.870009520787052e-05, + "loss": 1.4286, + "step": 1184 + }, + { + "epoch": 0.3725386261460235, + "grad_norm": 0.7734375, + "learning_rate": 1.869755633132339e-05, + "loss": 1.2967, + "step": 1186 + }, + { + "epoch": 0.3731668531715649, + "grad_norm": 1.015625, + "learning_rate": 1.869501745477626e-05, + "loss": 1.3566, + "step": 1188 + }, + { + "epoch": 0.3737950801971062, + "grad_norm": 0.71875, + "learning_rate": 1.8692478578229133e-05, + "loss": 1.3837, + "step": 1190 + }, + { + "epoch": 0.3744233072226476, + "grad_norm": 0.9296875, + "learning_rate": 1.8689939701682008e-05, + "loss": 1.369, + "step": 1192 + }, + { + "epoch": 0.37505153424818893, + "grad_norm": 0.71484375, + "learning_rate": 1.868740082513488e-05, + "loss": 1.4206, + "step": 1194 + }, + { + "epoch": 0.3756797612737303, + "grad_norm": 0.75390625, + "learning_rate": 1.868486194858775e-05, + "loss": 1.3345, + "step": 1196 + }, + { + "epoch": 0.37630798829927165, + "grad_norm": 0.84765625, + "learning_rate": 1.8682323072040625e-05, + "loss": 1.3843, + "step": 1198 + }, + { + "epoch": 0.37693621532481303, + "grad_norm": 0.71484375, + "learning_rate": 1.8679784195493496e-05, + "loss": 1.4273, + "step": 1200 + }, + { + "epoch": 0.37756444235035436, + "grad_norm": 0.7734375, + "learning_rate": 1.8677245318946368e-05, + "loss": 1.3729, + "step": 1202 + }, + { + "epoch": 0.37819266937589574, + "grad_norm": 1.15625, + "learning_rate": 1.867470644239924e-05, + "loss": 1.1632, + "step": 1204 + }, + { + "epoch": 0.37882089640143707, + "grad_norm": 0.6796875, + "learning_rate": 1.8672167565852114e-05, + "loss": 1.3493, + "step": 1206 + }, + { + "epoch": 0.37944912342697845, + "grad_norm": 0.7578125, + "learning_rate": 1.8669628689304985e-05, + "loss": 1.3056, + "step": 1208 + }, + { + "epoch": 0.3800773504525198, + "grad_norm": 0.7265625, + "learning_rate": 1.8667089812757857e-05, + "loss": 1.414, + "step": 1210 + }, + { + "epoch": 0.3807055774780611, + "grad_norm": 0.8359375, + "learning_rate": 1.8664550936210728e-05, + "loss": 1.33, + "step": 1212 + }, + { + "epoch": 0.3813338045036025, + "grad_norm": 0.80859375, + "learning_rate": 1.86620120596636e-05, + "loss": 1.378, + "step": 1214 + }, + { + "epoch": 0.3819620315291438, + "grad_norm": 0.95703125, + "learning_rate": 1.865947318311647e-05, + "loss": 1.2628, + "step": 1216 + }, + { + "epoch": 0.3825902585546852, + "grad_norm": 0.73046875, + "learning_rate": 1.8656934306569346e-05, + "loss": 1.2875, + "step": 1218 + }, + { + "epoch": 0.38321848558022653, + "grad_norm": 0.78515625, + "learning_rate": 1.8654395430022217e-05, + "loss": 1.3463, + "step": 1220 + }, + { + "epoch": 0.3838467126057679, + "grad_norm": 0.80078125, + "learning_rate": 1.865185655347509e-05, + "loss": 1.3272, + "step": 1222 + }, + { + "epoch": 0.38447493963130924, + "grad_norm": 0.71484375, + "learning_rate": 1.864931767692796e-05, + "loss": 1.3908, + "step": 1224 + }, + { + "epoch": 0.3851031666568506, + "grad_norm": 0.6796875, + "learning_rate": 1.8646778800380835e-05, + "loss": 1.3235, + "step": 1226 + }, + { + "epoch": 0.38573139368239195, + "grad_norm": 0.74609375, + "learning_rate": 1.8644239923833706e-05, + "loss": 1.2354, + "step": 1228 + }, + { + "epoch": 0.38635962070793334, + "grad_norm": 0.88671875, + "learning_rate": 1.8641701047286577e-05, + "loss": 1.2592, + "step": 1230 + }, + { + "epoch": 0.38698784773347467, + "grad_norm": 0.7265625, + "learning_rate": 1.863916217073945e-05, + "loss": 1.3272, + "step": 1232 + }, + { + "epoch": 0.38761607475901605, + "grad_norm": 0.77734375, + "learning_rate": 1.8636623294192323e-05, + "loss": 1.2147, + "step": 1234 + }, + { + "epoch": 0.3882443017845574, + "grad_norm": 0.7734375, + "learning_rate": 1.863408441764519e-05, + "loss": 1.3168, + "step": 1236 + }, + { + "epoch": 0.38887252881009876, + "grad_norm": 0.73828125, + "learning_rate": 1.8631545541098066e-05, + "loss": 1.2581, + "step": 1238 + }, + { + "epoch": 0.3895007558356401, + "grad_norm": 0.84375, + "learning_rate": 1.8629006664550938e-05, + "loss": 1.404, + "step": 1240 + }, + { + "epoch": 0.3901289828611815, + "grad_norm": 0.79296875, + "learning_rate": 1.862646778800381e-05, + "loss": 1.3546, + "step": 1242 + }, + { + "epoch": 0.3907572098867228, + "grad_norm": 0.74609375, + "learning_rate": 1.862392891145668e-05, + "loss": 1.2896, + "step": 1244 + }, + { + "epoch": 0.3913854369122642, + "grad_norm": 0.74609375, + "learning_rate": 1.8621390034909555e-05, + "loss": 1.3196, + "step": 1246 + }, + { + "epoch": 0.3920136639378055, + "grad_norm": 0.72265625, + "learning_rate": 1.8618851158362427e-05, + "loss": 1.3084, + "step": 1248 + }, + { + "epoch": 0.3926418909633469, + "grad_norm": 0.75390625, + "learning_rate": 1.8616312281815298e-05, + "loss": 1.2459, + "step": 1250 + }, + { + "epoch": 0.3932701179888882, + "grad_norm": 0.73828125, + "learning_rate": 1.861377340526817e-05, + "loss": 1.3642, + "step": 1252 + }, + { + "epoch": 0.3938983450144296, + "grad_norm": 0.9140625, + "learning_rate": 1.8611234528721044e-05, + "loss": 1.2232, + "step": 1254 + }, + { + "epoch": 0.39452657203997094, + "grad_norm": 0.6875, + "learning_rate": 1.8608695652173912e-05, + "loss": 1.2384, + "step": 1256 + }, + { + "epoch": 0.3951547990655123, + "grad_norm": 0.6640625, + "learning_rate": 1.8606156775626787e-05, + "loss": 1.3031, + "step": 1258 + }, + { + "epoch": 0.39578302609105365, + "grad_norm": 0.67578125, + "learning_rate": 1.8603617899079658e-05, + "loss": 1.3142, + "step": 1260 + }, + { + "epoch": 0.39641125311659503, + "grad_norm": 0.875, + "learning_rate": 1.860107902253253e-05, + "loss": 1.2851, + "step": 1262 + }, + { + "epoch": 0.39703948014213636, + "grad_norm": 0.73828125, + "learning_rate": 1.85985401459854e-05, + "loss": 1.3063, + "step": 1264 + }, + { + "epoch": 0.39766770716767774, + "grad_norm": 0.7578125, + "learning_rate": 1.8596001269438276e-05, + "loss": 1.4062, + "step": 1266 + }, + { + "epoch": 0.39829593419321907, + "grad_norm": 0.78125, + "learning_rate": 1.8593462392891147e-05, + "loss": 1.2698, + "step": 1268 + }, + { + "epoch": 0.39892416121876045, + "grad_norm": 0.6796875, + "learning_rate": 1.859092351634402e-05, + "loss": 1.3242, + "step": 1270 + }, + { + "epoch": 0.3995523882443018, + "grad_norm": 0.70703125, + "learning_rate": 1.858838463979689e-05, + "loss": 1.3655, + "step": 1272 + }, + { + "epoch": 0.40018061526984317, + "grad_norm": 0.75, + "learning_rate": 1.8585845763249765e-05, + "loss": 1.259, + "step": 1274 + }, + { + "epoch": 0.4008088422953845, + "grad_norm": 0.8984375, + "learning_rate": 1.8583306886702636e-05, + "loss": 1.2373, + "step": 1276 + }, + { + "epoch": 0.4014370693209259, + "grad_norm": 0.75390625, + "learning_rate": 1.8580768010155507e-05, + "loss": 1.3231, + "step": 1278 + }, + { + "epoch": 0.4020652963464672, + "grad_norm": 0.7421875, + "learning_rate": 1.8578229133608382e-05, + "loss": 1.3715, + "step": 1280 + }, + { + "epoch": 0.40269352337200853, + "grad_norm": 0.91015625, + "learning_rate": 1.857569025706125e-05, + "loss": 1.4227, + "step": 1282 + }, + { + "epoch": 0.4033217503975499, + "grad_norm": 0.72265625, + "learning_rate": 1.8573151380514125e-05, + "loss": 1.4352, + "step": 1284 + }, + { + "epoch": 0.40394997742309124, + "grad_norm": 0.8359375, + "learning_rate": 1.8570612503966996e-05, + "loss": 1.3358, + "step": 1286 + }, + { + "epoch": 0.40457820444863263, + "grad_norm": 0.7734375, + "learning_rate": 1.8568073627419868e-05, + "loss": 1.3508, + "step": 1288 + }, + { + "epoch": 0.40520643147417396, + "grad_norm": 0.94921875, + "learning_rate": 1.856553475087274e-05, + "loss": 1.4527, + "step": 1290 + }, + { + "epoch": 0.40583465849971534, + "grad_norm": 0.68359375, + "learning_rate": 1.8562995874325614e-05, + "loss": 1.4456, + "step": 1292 + }, + { + "epoch": 0.40646288552525667, + "grad_norm": 0.90625, + "learning_rate": 1.8560456997778485e-05, + "loss": 1.3093, + "step": 1294 + }, + { + "epoch": 0.40709111255079805, + "grad_norm": 0.74609375, + "learning_rate": 1.8557918121231357e-05, + "loss": 1.4534, + "step": 1296 + }, + { + "epoch": 0.4077193395763394, + "grad_norm": 0.9609375, + "learning_rate": 1.8555379244684228e-05, + "loss": 1.2337, + "step": 1298 + }, + { + "epoch": 0.40834756660188076, + "grad_norm": 0.71875, + "learning_rate": 1.8552840368137103e-05, + "loss": 1.212, + "step": 1300 + }, + { + "epoch": 0.4089757936274221, + "grad_norm": 0.70703125, + "learning_rate": 1.8550301491589974e-05, + "loss": 1.3673, + "step": 1302 + }, + { + "epoch": 0.4096040206529635, + "grad_norm": 0.6875, + "learning_rate": 1.8547762615042846e-05, + "loss": 1.3345, + "step": 1304 + }, + { + "epoch": 0.4102322476785048, + "grad_norm": 0.70703125, + "learning_rate": 1.8545223738495717e-05, + "loss": 1.3542, + "step": 1306 + }, + { + "epoch": 0.4108604747040462, + "grad_norm": 0.828125, + "learning_rate": 1.854268486194859e-05, + "loss": 1.4953, + "step": 1308 + }, + { + "epoch": 0.4114887017295875, + "grad_norm": 0.7421875, + "learning_rate": 1.854014598540146e-05, + "loss": 1.4254, + "step": 1310 + }, + { + "epoch": 0.4121169287551289, + "grad_norm": 0.71875, + "learning_rate": 1.8537607108854335e-05, + "loss": 1.3089, + "step": 1312 + }, + { + "epoch": 0.4127451557806702, + "grad_norm": 0.73046875, + "learning_rate": 1.8535068232307206e-05, + "loss": 1.3985, + "step": 1314 + }, + { + "epoch": 0.4133733828062116, + "grad_norm": 0.828125, + "learning_rate": 1.8532529355760077e-05, + "loss": 1.45, + "step": 1316 + }, + { + "epoch": 0.41400160983175294, + "grad_norm": 0.71875, + "learning_rate": 1.852999047921295e-05, + "loss": 1.472, + "step": 1318 + }, + { + "epoch": 0.4146298368572943, + "grad_norm": 0.69140625, + "learning_rate": 1.8527451602665823e-05, + "loss": 1.4135, + "step": 1320 + }, + { + "epoch": 0.41525806388283565, + "grad_norm": 0.76171875, + "learning_rate": 1.8524912726118695e-05, + "loss": 1.2985, + "step": 1322 + }, + { + "epoch": 0.41588629090837703, + "grad_norm": 0.84375, + "learning_rate": 1.8522373849571566e-05, + "loss": 1.292, + "step": 1324 + }, + { + "epoch": 0.41651451793391836, + "grad_norm": 0.73046875, + "learning_rate": 1.8519834973024438e-05, + "loss": 1.3459, + "step": 1326 + }, + { + "epoch": 0.41714274495945974, + "grad_norm": 0.72265625, + "learning_rate": 1.8517296096477312e-05, + "loss": 1.3259, + "step": 1328 + }, + { + "epoch": 0.41777097198500107, + "grad_norm": 0.70703125, + "learning_rate": 1.851475721993018e-05, + "loss": 1.3027, + "step": 1330 + }, + { + "epoch": 0.41839919901054246, + "grad_norm": 0.671875, + "learning_rate": 1.8512218343383055e-05, + "loss": 1.3385, + "step": 1332 + }, + { + "epoch": 0.4190274260360838, + "grad_norm": 0.7109375, + "learning_rate": 1.8509679466835926e-05, + "loss": 1.3775, + "step": 1334 + }, + { + "epoch": 0.41965565306162517, + "grad_norm": 0.79296875, + "learning_rate": 1.8507140590288798e-05, + "loss": 1.1561, + "step": 1336 + }, + { + "epoch": 0.4202838800871665, + "grad_norm": 0.8125, + "learning_rate": 1.850460171374167e-05, + "loss": 1.2644, + "step": 1338 + }, + { + "epoch": 0.4209121071127079, + "grad_norm": 0.72265625, + "learning_rate": 1.8502062837194544e-05, + "loss": 1.3686, + "step": 1340 + }, + { + "epoch": 0.4215403341382492, + "grad_norm": 0.79296875, + "learning_rate": 1.8499523960647415e-05, + "loss": 1.4161, + "step": 1342 + }, + { + "epoch": 0.4221685611637906, + "grad_norm": 0.796875, + "learning_rate": 1.8496985084100287e-05, + "loss": 1.3431, + "step": 1344 + }, + { + "epoch": 0.4227967881893319, + "grad_norm": 0.80859375, + "learning_rate": 1.8494446207553158e-05, + "loss": 1.3203, + "step": 1346 + }, + { + "epoch": 0.4234250152148733, + "grad_norm": 0.8359375, + "learning_rate": 1.8491907331006033e-05, + "loss": 1.3866, + "step": 1348 + }, + { + "epoch": 0.42405324224041463, + "grad_norm": 0.73828125, + "learning_rate": 1.84893684544589e-05, + "loss": 1.307, + "step": 1350 + }, + { + "epoch": 0.42468146926595596, + "grad_norm": 0.75390625, + "learning_rate": 1.8486829577911776e-05, + "loss": 1.3054, + "step": 1352 + }, + { + "epoch": 0.42530969629149734, + "grad_norm": 0.73828125, + "learning_rate": 1.8484290701364647e-05, + "loss": 1.263, + "step": 1354 + }, + { + "epoch": 0.42593792331703867, + "grad_norm": 0.7421875, + "learning_rate": 1.848175182481752e-05, + "loss": 1.2961, + "step": 1356 + }, + { + "epoch": 0.42656615034258005, + "grad_norm": 0.70703125, + "learning_rate": 1.847921294827039e-05, + "loss": 1.386, + "step": 1358 + }, + { + "epoch": 0.4271943773681214, + "grad_norm": 0.79296875, + "learning_rate": 1.8476674071723265e-05, + "loss": 1.2587, + "step": 1360 + }, + { + "epoch": 0.42782260439366276, + "grad_norm": 0.80078125, + "learning_rate": 1.8474135195176136e-05, + "loss": 1.3613, + "step": 1362 + }, + { + "epoch": 0.4284508314192041, + "grad_norm": 0.734375, + "learning_rate": 1.8471596318629007e-05, + "loss": 1.4578, + "step": 1364 + }, + { + "epoch": 0.4290790584447455, + "grad_norm": 0.75, + "learning_rate": 1.8469057442081882e-05, + "loss": 1.4915, + "step": 1366 + }, + { + "epoch": 0.4297072854702868, + "grad_norm": 0.984375, + "learning_rate": 1.8466518565534754e-05, + "loss": 1.2513, + "step": 1368 + }, + { + "epoch": 0.4303355124958282, + "grad_norm": 0.78125, + "learning_rate": 1.8463979688987625e-05, + "loss": 1.3317, + "step": 1370 + }, + { + "epoch": 0.4309637395213695, + "grad_norm": 0.76171875, + "learning_rate": 1.8461440812440496e-05, + "loss": 1.3281, + "step": 1372 + }, + { + "epoch": 0.4315919665469109, + "grad_norm": 0.89453125, + "learning_rate": 1.845890193589337e-05, + "loss": 1.2836, + "step": 1374 + }, + { + "epoch": 0.4322201935724522, + "grad_norm": 0.96875, + "learning_rate": 1.845636305934624e-05, + "loss": 1.3258, + "step": 1376 + }, + { + "epoch": 0.4328484205979936, + "grad_norm": 0.703125, + "learning_rate": 1.8453824182799114e-05, + "loss": 1.3192, + "step": 1378 + }, + { + "epoch": 0.43347664762353494, + "grad_norm": 0.7890625, + "learning_rate": 1.8451285306251985e-05, + "loss": 1.2383, + "step": 1380 + }, + { + "epoch": 0.4341048746490763, + "grad_norm": 0.6953125, + "learning_rate": 1.8448746429704857e-05, + "loss": 1.4198, + "step": 1382 + }, + { + "epoch": 0.43473310167461765, + "grad_norm": 0.84375, + "learning_rate": 1.8446207553157728e-05, + "loss": 1.3262, + "step": 1384 + }, + { + "epoch": 0.43536132870015903, + "grad_norm": 0.90234375, + "learning_rate": 1.8443668676610603e-05, + "loss": 1.3783, + "step": 1386 + }, + { + "epoch": 0.43598955572570036, + "grad_norm": 0.8046875, + "learning_rate": 1.8441129800063474e-05, + "loss": 1.3803, + "step": 1388 + }, + { + "epoch": 0.43661778275124175, + "grad_norm": 0.8359375, + "learning_rate": 1.8438590923516346e-05, + "loss": 1.2537, + "step": 1390 + }, + { + "epoch": 0.4372460097767831, + "grad_norm": 0.74609375, + "learning_rate": 1.8436052046969217e-05, + "loss": 1.4251, + "step": 1392 + }, + { + "epoch": 0.43787423680232446, + "grad_norm": 0.80078125, + "learning_rate": 1.843351317042209e-05, + "loss": 1.3708, + "step": 1394 + }, + { + "epoch": 0.4385024638278658, + "grad_norm": 0.81640625, + "learning_rate": 1.8430974293874963e-05, + "loss": 1.3983, + "step": 1396 + }, + { + "epoch": 0.43913069085340717, + "grad_norm": 0.703125, + "learning_rate": 1.8428435417327834e-05, + "loss": 1.3208, + "step": 1398 + }, + { + "epoch": 0.4397589178789485, + "grad_norm": 0.6484375, + "learning_rate": 1.8425896540780706e-05, + "loss": 1.2447, + "step": 1400 + }, + { + "epoch": 0.4403871449044899, + "grad_norm": 0.7265625, + "learning_rate": 1.8423357664233577e-05, + "loss": 1.4995, + "step": 1402 + }, + { + "epoch": 0.4410153719300312, + "grad_norm": 0.69140625, + "learning_rate": 1.842081878768645e-05, + "loss": 1.2333, + "step": 1404 + }, + { + "epoch": 0.4416435989555726, + "grad_norm": 0.72265625, + "learning_rate": 1.8418279911139323e-05, + "loss": 1.438, + "step": 1406 + }, + { + "epoch": 0.4422718259811139, + "grad_norm": 0.6796875, + "learning_rate": 1.8415741034592195e-05, + "loss": 1.3648, + "step": 1408 + }, + { + "epoch": 0.4429000530066553, + "grad_norm": 0.87890625, + "learning_rate": 1.8413202158045066e-05, + "loss": 1.3982, + "step": 1410 + }, + { + "epoch": 0.44352828003219663, + "grad_norm": 0.7734375, + "learning_rate": 1.8410663281497937e-05, + "loss": 1.2714, + "step": 1412 + }, + { + "epoch": 0.444156507057738, + "grad_norm": 0.66015625, + "learning_rate": 1.8408124404950812e-05, + "loss": 1.3464, + "step": 1414 + }, + { + "epoch": 0.44478473408327934, + "grad_norm": 0.671875, + "learning_rate": 1.8405585528403684e-05, + "loss": 1.3379, + "step": 1416 + }, + { + "epoch": 0.4454129611088207, + "grad_norm": 0.73046875, + "learning_rate": 1.8403046651856555e-05, + "loss": 1.3022, + "step": 1418 + }, + { + "epoch": 0.44604118813436205, + "grad_norm": 0.765625, + "learning_rate": 1.8400507775309426e-05, + "loss": 1.3677, + "step": 1420 + }, + { + "epoch": 0.4466694151599034, + "grad_norm": 0.6796875, + "learning_rate": 1.83979688987623e-05, + "loss": 1.3101, + "step": 1422 + }, + { + "epoch": 0.44729764218544477, + "grad_norm": 0.94140625, + "learning_rate": 1.839543002221517e-05, + "loss": 1.2118, + "step": 1424 + }, + { + "epoch": 0.4479258692109861, + "grad_norm": 2.84375, + "learning_rate": 1.8392891145668044e-05, + "loss": 1.2927, + "step": 1426 + }, + { + "epoch": 0.4485540962365275, + "grad_norm": 0.88671875, + "learning_rate": 1.8390352269120915e-05, + "loss": 1.4683, + "step": 1428 + }, + { + "epoch": 0.4491823232620688, + "grad_norm": 0.75, + "learning_rate": 1.8387813392573787e-05, + "loss": 1.2949, + "step": 1430 + }, + { + "epoch": 0.4498105502876102, + "grad_norm": 0.75, + "learning_rate": 1.8385274516026658e-05, + "loss": 1.3789, + "step": 1432 + }, + { + "epoch": 0.4504387773131515, + "grad_norm": 0.7265625, + "learning_rate": 1.8382735639479533e-05, + "loss": 1.3308, + "step": 1434 + }, + { + "epoch": 0.4510670043386929, + "grad_norm": 0.78125, + "learning_rate": 1.8380196762932404e-05, + "loss": 1.3221, + "step": 1436 + }, + { + "epoch": 0.45169523136423423, + "grad_norm": 0.703125, + "learning_rate": 1.8377657886385276e-05, + "loss": 1.353, + "step": 1438 + }, + { + "epoch": 0.4523234583897756, + "grad_norm": 0.84765625, + "learning_rate": 1.8375119009838147e-05, + "loss": 1.2386, + "step": 1440 + }, + { + "epoch": 0.45295168541531694, + "grad_norm": 0.70703125, + "learning_rate": 1.8372580133291022e-05, + "loss": 1.5192, + "step": 1442 + }, + { + "epoch": 0.4535799124408583, + "grad_norm": 0.7890625, + "learning_rate": 1.837004125674389e-05, + "loss": 1.4076, + "step": 1444 + }, + { + "epoch": 0.45420813946639965, + "grad_norm": 0.6953125, + "learning_rate": 1.8367502380196765e-05, + "loss": 1.4394, + "step": 1446 + }, + { + "epoch": 0.45483636649194104, + "grad_norm": 0.96484375, + "learning_rate": 1.836496350364964e-05, + "loss": 1.3238, + "step": 1448 + }, + { + "epoch": 0.45546459351748236, + "grad_norm": 0.75390625, + "learning_rate": 1.8362424627102507e-05, + "loss": 1.2685, + "step": 1450 + }, + { + "epoch": 0.45609282054302375, + "grad_norm": 0.7890625, + "learning_rate": 1.8359885750555382e-05, + "loss": 1.3883, + "step": 1452 + }, + { + "epoch": 0.4567210475685651, + "grad_norm": 0.71484375, + "learning_rate": 1.8357346874008253e-05, + "loss": 1.3735, + "step": 1454 + }, + { + "epoch": 0.45734927459410646, + "grad_norm": 0.7265625, + "learning_rate": 1.8354807997461125e-05, + "loss": 1.4432, + "step": 1456 + }, + { + "epoch": 0.4579775016196478, + "grad_norm": 0.8046875, + "learning_rate": 1.8352269120913996e-05, + "loss": 1.3395, + "step": 1458 + }, + { + "epoch": 0.45860572864518917, + "grad_norm": 0.78515625, + "learning_rate": 1.834973024436687e-05, + "loss": 1.2355, + "step": 1460 + }, + { + "epoch": 0.4592339556707305, + "grad_norm": 0.703125, + "learning_rate": 1.8347191367819742e-05, + "loss": 1.4257, + "step": 1462 + }, + { + "epoch": 0.4598621826962719, + "grad_norm": 0.78515625, + "learning_rate": 1.8344652491272614e-05, + "loss": 1.4014, + "step": 1464 + }, + { + "epoch": 0.4604904097218132, + "grad_norm": 0.66015625, + "learning_rate": 1.8342113614725485e-05, + "loss": 1.4452, + "step": 1466 + }, + { + "epoch": 0.4611186367473546, + "grad_norm": 0.7578125, + "learning_rate": 1.833957473817836e-05, + "loss": 1.2609, + "step": 1468 + }, + { + "epoch": 0.4617468637728959, + "grad_norm": 3.109375, + "learning_rate": 1.8337035861631228e-05, + "loss": 1.3392, + "step": 1470 + }, + { + "epoch": 0.4623750907984373, + "grad_norm": 0.8359375, + "learning_rate": 1.8334496985084103e-05, + "loss": 1.4992, + "step": 1472 + }, + { + "epoch": 0.46300331782397863, + "grad_norm": 0.71875, + "learning_rate": 1.8331958108536974e-05, + "loss": 1.3606, + "step": 1474 + }, + { + "epoch": 0.46363154484952, + "grad_norm": 0.73046875, + "learning_rate": 1.8329419231989845e-05, + "loss": 1.4007, + "step": 1476 + }, + { + "epoch": 0.46425977187506134, + "grad_norm": 0.71484375, + "learning_rate": 1.8326880355442717e-05, + "loss": 1.3096, + "step": 1478 + }, + { + "epoch": 0.46488799890060273, + "grad_norm": 0.75, + "learning_rate": 1.832434147889559e-05, + "loss": 1.3873, + "step": 1480 + }, + { + "epoch": 0.46551622592614406, + "grad_norm": 0.75390625, + "learning_rate": 1.8321802602348463e-05, + "loss": 1.3962, + "step": 1482 + }, + { + "epoch": 0.46614445295168544, + "grad_norm": 1.1171875, + "learning_rate": 1.8319263725801334e-05, + "loss": 1.3512, + "step": 1484 + }, + { + "epoch": 0.46677267997722677, + "grad_norm": 0.671875, + "learning_rate": 1.8316724849254206e-05, + "loss": 1.3546, + "step": 1486 + }, + { + "epoch": 0.46740090700276815, + "grad_norm": 0.76953125, + "learning_rate": 1.831418597270708e-05, + "loss": 1.3739, + "step": 1488 + }, + { + "epoch": 0.4680291340283095, + "grad_norm": 0.71875, + "learning_rate": 1.831164709615995e-05, + "loss": 1.3045, + "step": 1490 + }, + { + "epoch": 0.46865736105385086, + "grad_norm": 0.72265625, + "learning_rate": 1.8309108219612823e-05, + "loss": 1.385, + "step": 1492 + }, + { + "epoch": 0.4692855880793922, + "grad_norm": 0.765625, + "learning_rate": 1.8306569343065695e-05, + "loss": 1.282, + "step": 1494 + }, + { + "epoch": 0.4699138151049335, + "grad_norm": 0.671875, + "learning_rate": 1.8304030466518566e-05, + "loss": 1.5008, + "step": 1496 + }, + { + "epoch": 0.4705420421304749, + "grad_norm": 0.78515625, + "learning_rate": 1.8301491589971437e-05, + "loss": 1.422, + "step": 1498 + }, + { + "epoch": 0.47117026915601623, + "grad_norm": 0.70703125, + "learning_rate": 1.8298952713424312e-05, + "loss": 1.3419, + "step": 1500 + }, + { + "epoch": 0.4717984961815576, + "grad_norm": 0.72265625, + "learning_rate": 1.8296413836877184e-05, + "loss": 1.3878, + "step": 1502 + }, + { + "epoch": 0.47242672320709894, + "grad_norm": 0.7109375, + "learning_rate": 1.8293874960330055e-05, + "loss": 1.4378, + "step": 1504 + }, + { + "epoch": 0.4730549502326403, + "grad_norm": 0.69140625, + "learning_rate": 1.8291336083782926e-05, + "loss": 1.4115, + "step": 1506 + }, + { + "epoch": 0.47368317725818165, + "grad_norm": 0.75, + "learning_rate": 1.82887972072358e-05, + "loss": 1.2909, + "step": 1508 + }, + { + "epoch": 0.47431140428372304, + "grad_norm": 0.74609375, + "learning_rate": 1.8286258330688672e-05, + "loss": 1.3813, + "step": 1510 + }, + { + "epoch": 0.47493963130926437, + "grad_norm": 0.7890625, + "learning_rate": 1.8283719454141544e-05, + "loss": 1.3018, + "step": 1512 + }, + { + "epoch": 0.47556785833480575, + "grad_norm": 0.7109375, + "learning_rate": 1.8281180577594415e-05, + "loss": 1.228, + "step": 1514 + }, + { + "epoch": 0.4761960853603471, + "grad_norm": 0.65625, + "learning_rate": 1.8278641701047287e-05, + "loss": 1.3985, + "step": 1516 + }, + { + "epoch": 0.47682431238588846, + "grad_norm": 0.67578125, + "learning_rate": 1.8276102824500158e-05, + "loss": 1.4065, + "step": 1518 + }, + { + "epoch": 0.4774525394114298, + "grad_norm": 0.7421875, + "learning_rate": 1.8273563947953033e-05, + "loss": 1.34, + "step": 1520 + }, + { + "epoch": 0.47808076643697117, + "grad_norm": 0.73046875, + "learning_rate": 1.8271025071405904e-05, + "loss": 1.3451, + "step": 1522 + }, + { + "epoch": 0.4787089934625125, + "grad_norm": 0.75, + "learning_rate": 1.8268486194858776e-05, + "loss": 1.3477, + "step": 1524 + }, + { + "epoch": 0.4793372204880539, + "grad_norm": 0.734375, + "learning_rate": 1.8265947318311647e-05, + "loss": 1.3247, + "step": 1526 + }, + { + "epoch": 0.4799654475135952, + "grad_norm": 0.73046875, + "learning_rate": 1.8263408441764522e-05, + "loss": 1.21, + "step": 1528 + }, + { + "epoch": 0.4805936745391366, + "grad_norm": 0.71875, + "learning_rate": 1.8260869565217393e-05, + "loss": 1.3398, + "step": 1530 + }, + { + "epoch": 0.4812219015646779, + "grad_norm": 0.734375, + "learning_rate": 1.8258330688670264e-05, + "loss": 1.3262, + "step": 1532 + }, + { + "epoch": 0.4818501285902193, + "grad_norm": 0.75, + "learning_rate": 1.825579181212314e-05, + "loss": 1.4908, + "step": 1534 + }, + { + "epoch": 0.48247835561576063, + "grad_norm": 0.7890625, + "learning_rate": 1.825325293557601e-05, + "loss": 1.3113, + "step": 1536 + }, + { + "epoch": 0.483106582641302, + "grad_norm": 0.6640625, + "learning_rate": 1.8250714059028882e-05, + "loss": 1.2718, + "step": 1538 + }, + { + "epoch": 0.48373480966684335, + "grad_norm": 0.7265625, + "learning_rate": 1.8248175182481753e-05, + "loss": 1.406, + "step": 1540 + }, + { + "epoch": 0.48436303669238473, + "grad_norm": 0.6953125, + "learning_rate": 1.8245636305934625e-05, + "loss": 1.3577, + "step": 1542 + }, + { + "epoch": 0.48499126371792606, + "grad_norm": 0.7421875, + "learning_rate": 1.8243097429387496e-05, + "loss": 1.3054, + "step": 1544 + }, + { + "epoch": 0.48561949074346744, + "grad_norm": 0.78515625, + "learning_rate": 1.824055855284037e-05, + "loss": 1.3842, + "step": 1546 + }, + { + "epoch": 0.48624771776900877, + "grad_norm": 0.76953125, + "learning_rate": 1.8238019676293242e-05, + "loss": 1.3789, + "step": 1548 + }, + { + "epoch": 0.48687594479455015, + "grad_norm": 0.65625, + "learning_rate": 1.8235480799746114e-05, + "loss": 1.3439, + "step": 1550 + }, + { + "epoch": 0.4875041718200915, + "grad_norm": 0.69921875, + "learning_rate": 1.8232941923198985e-05, + "loss": 1.3715, + "step": 1552 + }, + { + "epoch": 0.48813239884563286, + "grad_norm": 0.70703125, + "learning_rate": 1.823040304665186e-05, + "loss": 1.4506, + "step": 1554 + }, + { + "epoch": 0.4887606258711742, + "grad_norm": 0.69140625, + "learning_rate": 1.822786417010473e-05, + "loss": 1.4064, + "step": 1556 + }, + { + "epoch": 0.4893888528967156, + "grad_norm": 0.7578125, + "learning_rate": 1.8225325293557603e-05, + "loss": 1.3322, + "step": 1558 + }, + { + "epoch": 0.4900170799222569, + "grad_norm": 0.76953125, + "learning_rate": 1.8222786417010474e-05, + "loss": 1.3075, + "step": 1560 + }, + { + "epoch": 0.4906453069477983, + "grad_norm": 0.703125, + "learning_rate": 1.822024754046335e-05, + "loss": 1.4187, + "step": 1562 + }, + { + "epoch": 0.4912735339733396, + "grad_norm": 0.8984375, + "learning_rate": 1.8217708663916217e-05, + "loss": 1.3365, + "step": 1564 + }, + { + "epoch": 0.49190176099888094, + "grad_norm": 0.7578125, + "learning_rate": 1.821516978736909e-05, + "loss": 1.3593, + "step": 1566 + }, + { + "epoch": 0.4925299880244223, + "grad_norm": 0.7421875, + "learning_rate": 1.8212630910821963e-05, + "loss": 1.298, + "step": 1568 + }, + { + "epoch": 0.49315821504996366, + "grad_norm": 0.71875, + "learning_rate": 1.8210092034274834e-05, + "loss": 1.4256, + "step": 1570 + }, + { + "epoch": 0.49378644207550504, + "grad_norm": 0.7890625, + "learning_rate": 1.8207553157727706e-05, + "loss": 1.4808, + "step": 1572 + }, + { + "epoch": 0.49441466910104637, + "grad_norm": 0.875, + "learning_rate": 1.820501428118058e-05, + "loss": 1.4233, + "step": 1574 + }, + { + "epoch": 0.49504289612658775, + "grad_norm": 0.7890625, + "learning_rate": 1.8202475404633452e-05, + "loss": 1.3273, + "step": 1576 + }, + { + "epoch": 0.4956711231521291, + "grad_norm": 0.73828125, + "learning_rate": 1.8199936528086323e-05, + "loss": 1.3384, + "step": 1578 + }, + { + "epoch": 0.49629935017767046, + "grad_norm": 0.7421875, + "learning_rate": 1.8197397651539195e-05, + "loss": 1.4455, + "step": 1580 + }, + { + "epoch": 0.4969275772032118, + "grad_norm": 0.7421875, + "learning_rate": 1.819485877499207e-05, + "loss": 1.3235, + "step": 1582 + }, + { + "epoch": 0.4975558042287532, + "grad_norm": 0.73828125, + "learning_rate": 1.8192319898444937e-05, + "loss": 1.3602, + "step": 1584 + }, + { + "epoch": 0.4981840312542945, + "grad_norm": 0.890625, + "learning_rate": 1.8189781021897812e-05, + "loss": 1.2892, + "step": 1586 + }, + { + "epoch": 0.4988122582798359, + "grad_norm": 0.85546875, + "learning_rate": 1.8187242145350684e-05, + "loss": 1.2329, + "step": 1588 + }, + { + "epoch": 0.4994404853053772, + "grad_norm": 0.68359375, + "learning_rate": 1.8184703268803555e-05, + "loss": 1.2801, + "step": 1590 + }, + { + "epoch": 0.5000687123309185, + "grad_norm": 0.76171875, + "learning_rate": 1.8182164392256426e-05, + "loss": 1.3857, + "step": 1592 + }, + { + "epoch": 0.50069693935646, + "grad_norm": 0.8671875, + "learning_rate": 1.81796255157093e-05, + "loss": 1.3113, + "step": 1594 + }, + { + "epoch": 0.5013251663820013, + "grad_norm": 0.78125, + "learning_rate": 1.8177086639162172e-05, + "loss": 1.4523, + "step": 1596 + }, + { + "epoch": 0.5019533934075426, + "grad_norm": 0.77734375, + "learning_rate": 1.8174547762615044e-05, + "loss": 1.3314, + "step": 1598 + }, + { + "epoch": 0.502581620433084, + "grad_norm": 0.796875, + "learning_rate": 1.8172008886067915e-05, + "loss": 1.3618, + "step": 1600 + }, + { + "epoch": 0.5032098474586254, + "grad_norm": 0.8203125, + "learning_rate": 1.816947000952079e-05, + "loss": 1.3553, + "step": 1602 + }, + { + "epoch": 0.5038380744841667, + "grad_norm": 0.671875, + "learning_rate": 1.816693113297366e-05, + "loss": 1.4201, + "step": 1604 + }, + { + "epoch": 0.5044663015097081, + "grad_norm": 0.8125, + "learning_rate": 1.8164392256426533e-05, + "loss": 1.309, + "step": 1606 + }, + { + "epoch": 0.5050945285352494, + "grad_norm": 0.7734375, + "learning_rate": 1.8161853379879404e-05, + "loss": 1.3145, + "step": 1608 + }, + { + "epoch": 0.5057227555607908, + "grad_norm": 0.87890625, + "learning_rate": 1.8159314503332275e-05, + "loss": 1.3546, + "step": 1610 + }, + { + "epoch": 0.5063509825863322, + "grad_norm": 0.7109375, + "learning_rate": 1.8156775626785147e-05, + "loss": 1.2818, + "step": 1612 + }, + { + "epoch": 0.5069792096118735, + "grad_norm": 0.796875, + "learning_rate": 1.815423675023802e-05, + "loss": 1.4176, + "step": 1614 + }, + { + "epoch": 0.5076074366374148, + "grad_norm": 0.734375, + "learning_rate": 1.8151697873690893e-05, + "loss": 1.3501, + "step": 1616 + }, + { + "epoch": 0.5082356636629562, + "grad_norm": 0.74609375, + "learning_rate": 1.8149158997143764e-05, + "loss": 1.3265, + "step": 1618 + }, + { + "epoch": 0.5088638906884976, + "grad_norm": 0.79296875, + "learning_rate": 1.814662012059664e-05, + "loss": 1.333, + "step": 1620 + }, + { + "epoch": 0.5094921177140389, + "grad_norm": 0.7265625, + "learning_rate": 1.814408124404951e-05, + "loss": 1.2086, + "step": 1622 + }, + { + "epoch": 0.5101203447395802, + "grad_norm": 0.8046875, + "learning_rate": 1.8141542367502382e-05, + "loss": 1.2181, + "step": 1624 + }, + { + "epoch": 0.5107485717651217, + "grad_norm": 0.72265625, + "learning_rate": 1.8139003490955253e-05, + "loss": 1.3269, + "step": 1626 + }, + { + "epoch": 0.511376798790663, + "grad_norm": 0.67578125, + "learning_rate": 1.8136464614408128e-05, + "loss": 1.2733, + "step": 1628 + }, + { + "epoch": 0.5120050258162043, + "grad_norm": 0.69921875, + "learning_rate": 1.8133925737861e-05, + "loss": 1.253, + "step": 1630 + }, + { + "epoch": 0.5126332528417457, + "grad_norm": 0.71484375, + "learning_rate": 1.813138686131387e-05, + "loss": 1.511, + "step": 1632 + }, + { + "epoch": 0.5132614798672871, + "grad_norm": 0.671875, + "learning_rate": 1.8128847984766742e-05, + "loss": 1.2451, + "step": 1634 + }, + { + "epoch": 0.5138897068928284, + "grad_norm": 0.66015625, + "learning_rate": 1.8126309108219614e-05, + "loss": 1.2587, + "step": 1636 + }, + { + "epoch": 0.5145179339183698, + "grad_norm": 0.875, + "learning_rate": 1.8123770231672485e-05, + "loss": 1.301, + "step": 1638 + }, + { + "epoch": 0.5151461609439111, + "grad_norm": 0.8046875, + "learning_rate": 1.812123135512536e-05, + "loss": 1.4174, + "step": 1640 + }, + { + "epoch": 0.5157743879694524, + "grad_norm": 0.7265625, + "learning_rate": 1.811869247857823e-05, + "loss": 1.2725, + "step": 1642 + }, + { + "epoch": 0.5164026149949938, + "grad_norm": 0.81640625, + "learning_rate": 1.8116153602031103e-05, + "loss": 1.3744, + "step": 1644 + }, + { + "epoch": 0.5170308420205352, + "grad_norm": 0.734375, + "learning_rate": 1.8113614725483974e-05, + "loss": 1.2455, + "step": 1646 + }, + { + "epoch": 0.5176590690460765, + "grad_norm": 0.68359375, + "learning_rate": 1.811107584893685e-05, + "loss": 1.4318, + "step": 1648 + }, + { + "epoch": 0.5182872960716178, + "grad_norm": 0.80859375, + "learning_rate": 1.810853697238972e-05, + "loss": 1.3426, + "step": 1650 + }, + { + "epoch": 0.5189155230971593, + "grad_norm": 0.73046875, + "learning_rate": 1.810599809584259e-05, + "loss": 1.1767, + "step": 1652 + }, + { + "epoch": 0.5195437501227006, + "grad_norm": 0.73046875, + "learning_rate": 1.8103459219295463e-05, + "loss": 1.2447, + "step": 1654 + }, + { + "epoch": 0.5201719771482419, + "grad_norm": 0.87890625, + "learning_rate": 1.8100920342748338e-05, + "loss": 1.3263, + "step": 1656 + }, + { + "epoch": 0.5208002041737833, + "grad_norm": 0.74609375, + "learning_rate": 1.8098381466201206e-05, + "loss": 1.3857, + "step": 1658 + }, + { + "epoch": 0.5214284311993247, + "grad_norm": 0.7421875, + "learning_rate": 1.809584258965408e-05, + "loss": 1.4577, + "step": 1660 + }, + { + "epoch": 0.522056658224866, + "grad_norm": 0.77734375, + "learning_rate": 1.8093303713106952e-05, + "loss": 1.4192, + "step": 1662 + }, + { + "epoch": 0.5226848852504073, + "grad_norm": 0.72265625, + "learning_rate": 1.8090764836559823e-05, + "loss": 1.2375, + "step": 1664 + }, + { + "epoch": 0.5233131122759487, + "grad_norm": 0.7578125, + "learning_rate": 1.8088225960012695e-05, + "loss": 1.3199, + "step": 1666 + }, + { + "epoch": 0.5239413393014901, + "grad_norm": 0.859375, + "learning_rate": 1.808568708346557e-05, + "loss": 1.3239, + "step": 1668 + }, + { + "epoch": 0.5245695663270314, + "grad_norm": 0.63671875, + "learning_rate": 1.808314820691844e-05, + "loss": 1.4124, + "step": 1670 + }, + { + "epoch": 0.5251977933525728, + "grad_norm": 0.8046875, + "learning_rate": 1.8080609330371312e-05, + "loss": 1.3539, + "step": 1672 + }, + { + "epoch": 0.5258260203781141, + "grad_norm": 0.75, + "learning_rate": 1.8078070453824183e-05, + "loss": 1.2979, + "step": 1674 + }, + { + "epoch": 0.5264542474036555, + "grad_norm": 0.67578125, + "learning_rate": 1.8075531577277058e-05, + "loss": 1.3582, + "step": 1676 + }, + { + "epoch": 0.5270824744291969, + "grad_norm": 0.7890625, + "learning_rate": 1.8072992700729926e-05, + "loss": 1.2866, + "step": 1678 + }, + { + "epoch": 0.5277107014547382, + "grad_norm": 0.83984375, + "learning_rate": 1.80704538241828e-05, + "loss": 1.1929, + "step": 1680 + }, + { + "epoch": 0.5283389284802795, + "grad_norm": 0.6875, + "learning_rate": 1.8067914947635672e-05, + "loss": 1.2081, + "step": 1682 + }, + { + "epoch": 0.528967155505821, + "grad_norm": 0.67578125, + "learning_rate": 1.8065376071088544e-05, + "loss": 1.4058, + "step": 1684 + }, + { + "epoch": 0.5295953825313623, + "grad_norm": 0.69140625, + "learning_rate": 1.8062837194541415e-05, + "loss": 1.3689, + "step": 1686 + }, + { + "epoch": 0.5302236095569036, + "grad_norm": 0.7734375, + "learning_rate": 1.806029831799429e-05, + "loss": 1.2963, + "step": 1688 + }, + { + "epoch": 0.530851836582445, + "grad_norm": 0.76953125, + "learning_rate": 1.805775944144716e-05, + "loss": 1.3622, + "step": 1690 + }, + { + "epoch": 0.5314800636079864, + "grad_norm": 0.765625, + "learning_rate": 1.8055220564900033e-05, + "loss": 1.2601, + "step": 1692 + }, + { + "epoch": 0.5321082906335277, + "grad_norm": 0.78515625, + "learning_rate": 1.8052681688352904e-05, + "loss": 1.2963, + "step": 1694 + }, + { + "epoch": 0.532736517659069, + "grad_norm": 0.7109375, + "learning_rate": 1.805014281180578e-05, + "loss": 1.3702, + "step": 1696 + }, + { + "epoch": 0.5333647446846104, + "grad_norm": 0.734375, + "learning_rate": 1.804760393525865e-05, + "loss": 1.2802, + "step": 1698 + }, + { + "epoch": 0.5339929717101518, + "grad_norm": 1.0078125, + "learning_rate": 1.804506505871152e-05, + "loss": 1.2703, + "step": 1700 + }, + { + "epoch": 0.5346211987356931, + "grad_norm": 0.71875, + "learning_rate": 1.8042526182164393e-05, + "loss": 1.2858, + "step": 1702 + }, + { + "epoch": 0.5352494257612345, + "grad_norm": 1.03125, + "learning_rate": 1.8039987305617264e-05, + "loss": 1.2983, + "step": 1704 + }, + { + "epoch": 0.5358776527867758, + "grad_norm": 0.71484375, + "learning_rate": 1.803744842907014e-05, + "loss": 1.4192, + "step": 1706 + }, + { + "epoch": 0.5365058798123171, + "grad_norm": 0.71484375, + "learning_rate": 1.803490955252301e-05, + "loss": 1.4011, + "step": 1708 + }, + { + "epoch": 0.5371341068378586, + "grad_norm": 0.6796875, + "learning_rate": 1.8032370675975882e-05, + "loss": 1.3894, + "step": 1710 + }, + { + "epoch": 0.5377623338633999, + "grad_norm": 0.75390625, + "learning_rate": 1.8029831799428753e-05, + "loss": 1.5168, + "step": 1712 + }, + { + "epoch": 0.5383905608889412, + "grad_norm": 1.4609375, + "learning_rate": 1.8027292922881628e-05, + "loss": 1.3708, + "step": 1714 + }, + { + "epoch": 0.5390187879144825, + "grad_norm": 0.76953125, + "learning_rate": 1.80247540463345e-05, + "loss": 1.3817, + "step": 1716 + }, + { + "epoch": 0.539647014940024, + "grad_norm": 0.7578125, + "learning_rate": 1.802221516978737e-05, + "loss": 1.3174, + "step": 1718 + }, + { + "epoch": 0.5402752419655653, + "grad_norm": 0.73828125, + "learning_rate": 1.8019676293240242e-05, + "loss": 1.3609, + "step": 1720 + }, + { + "epoch": 0.5409034689911066, + "grad_norm": 0.734375, + "learning_rate": 1.8017137416693117e-05, + "loss": 1.4835, + "step": 1722 + }, + { + "epoch": 0.541531696016648, + "grad_norm": 0.69921875, + "learning_rate": 1.801459854014599e-05, + "loss": 1.5052, + "step": 1724 + }, + { + "epoch": 0.5421599230421894, + "grad_norm": 0.72265625, + "learning_rate": 1.801205966359886e-05, + "loss": 1.3482, + "step": 1726 + }, + { + "epoch": 0.5427881500677307, + "grad_norm": 0.79296875, + "learning_rate": 1.800952078705173e-05, + "loss": 1.21, + "step": 1728 + }, + { + "epoch": 0.5434163770932721, + "grad_norm": 0.75390625, + "learning_rate": 1.8006981910504602e-05, + "loss": 1.3702, + "step": 1730 + }, + { + "epoch": 0.5440446041188134, + "grad_norm": 0.7578125, + "learning_rate": 1.8004443033957474e-05, + "loss": 1.4266, + "step": 1732 + }, + { + "epoch": 0.5446728311443548, + "grad_norm": 0.671875, + "learning_rate": 1.800190415741035e-05, + "loss": 1.339, + "step": 1734 + }, + { + "epoch": 0.5453010581698962, + "grad_norm": 0.74609375, + "learning_rate": 1.799936528086322e-05, + "loss": 1.3851, + "step": 1736 + }, + { + "epoch": 0.5459292851954375, + "grad_norm": 0.69140625, + "learning_rate": 1.799682640431609e-05, + "loss": 1.4017, + "step": 1738 + }, + { + "epoch": 0.5465575122209788, + "grad_norm": 0.734375, + "learning_rate": 1.7994287527768963e-05, + "loss": 1.2933, + "step": 1740 + }, + { + "epoch": 0.5471857392465203, + "grad_norm": 0.74609375, + "learning_rate": 1.7991748651221838e-05, + "loss": 1.3104, + "step": 1742 + }, + { + "epoch": 0.5478139662720616, + "grad_norm": 0.65625, + "learning_rate": 1.798920977467471e-05, + "loss": 1.231, + "step": 1744 + }, + { + "epoch": 0.5484421932976029, + "grad_norm": 0.7578125, + "learning_rate": 1.798667089812758e-05, + "loss": 1.4584, + "step": 1746 + }, + { + "epoch": 0.5490704203231442, + "grad_norm": 0.75390625, + "learning_rate": 1.798413202158045e-05, + "loss": 1.2988, + "step": 1748 + }, + { + "epoch": 0.5496986473486857, + "grad_norm": 0.7578125, + "learning_rate": 1.7981593145033326e-05, + "loss": 1.2553, + "step": 1750 + }, + { + "epoch": 0.550326874374227, + "grad_norm": 0.75, + "learning_rate": 1.7979054268486194e-05, + "loss": 1.3824, + "step": 1752 + }, + { + "epoch": 0.5509551013997683, + "grad_norm": 0.765625, + "learning_rate": 1.797651539193907e-05, + "loss": 1.4831, + "step": 1754 + }, + { + "epoch": 0.5515833284253097, + "grad_norm": 0.8046875, + "learning_rate": 1.797397651539194e-05, + "loss": 1.3839, + "step": 1756 + }, + { + "epoch": 0.5522115554508511, + "grad_norm": 0.7578125, + "learning_rate": 1.7971437638844812e-05, + "loss": 1.4556, + "step": 1758 + }, + { + "epoch": 0.5528397824763924, + "grad_norm": 0.67578125, + "learning_rate": 1.7968898762297683e-05, + "loss": 1.3564, + "step": 1760 + }, + { + "epoch": 0.5534680095019338, + "grad_norm": 0.81640625, + "learning_rate": 1.7966359885750558e-05, + "loss": 1.4027, + "step": 1762 + }, + { + "epoch": 0.5540962365274751, + "grad_norm": 0.93359375, + "learning_rate": 1.796382100920343e-05, + "loss": 1.3738, + "step": 1764 + }, + { + "epoch": 0.5547244635530165, + "grad_norm": 0.8203125, + "learning_rate": 1.79612821326563e-05, + "loss": 1.4116, + "step": 1766 + }, + { + "epoch": 0.5553526905785579, + "grad_norm": 0.9140625, + "learning_rate": 1.7958743256109172e-05, + "loss": 1.4383, + "step": 1768 + }, + { + "epoch": 0.5559809176040992, + "grad_norm": 0.76171875, + "learning_rate": 1.7956204379562047e-05, + "loss": 1.2174, + "step": 1770 + }, + { + "epoch": 0.5566091446296405, + "grad_norm": 0.75390625, + "learning_rate": 1.7953665503014915e-05, + "loss": 1.2893, + "step": 1772 + }, + { + "epoch": 0.557237371655182, + "grad_norm": 0.796875, + "learning_rate": 1.795112662646779e-05, + "loss": 1.291, + "step": 1774 + }, + { + "epoch": 0.5578655986807233, + "grad_norm": 0.82421875, + "learning_rate": 1.794858774992066e-05, + "loss": 1.4798, + "step": 1776 + }, + { + "epoch": 0.5584938257062646, + "grad_norm": 0.9296875, + "learning_rate": 1.7946048873373533e-05, + "loss": 1.2961, + "step": 1778 + }, + { + "epoch": 0.5591220527318059, + "grad_norm": 2.1875, + "learning_rate": 1.7943509996826404e-05, + "loss": 1.3342, + "step": 1780 + }, + { + "epoch": 0.5597502797573473, + "grad_norm": 0.890625, + "learning_rate": 1.794097112027928e-05, + "loss": 1.2582, + "step": 1782 + }, + { + "epoch": 0.5603785067828887, + "grad_norm": 0.66796875, + "learning_rate": 1.793843224373215e-05, + "loss": 1.3106, + "step": 1784 + }, + { + "epoch": 0.56100673380843, + "grad_norm": 0.8125, + "learning_rate": 1.793589336718502e-05, + "loss": 1.3369, + "step": 1786 + }, + { + "epoch": 0.5616349608339714, + "grad_norm": 0.859375, + "learning_rate": 1.7933354490637893e-05, + "loss": 1.2346, + "step": 1788 + }, + { + "epoch": 0.5622631878595127, + "grad_norm": 0.76171875, + "learning_rate": 1.7930815614090768e-05, + "loss": 1.2644, + "step": 1790 + }, + { + "epoch": 0.5628914148850541, + "grad_norm": 0.8359375, + "learning_rate": 1.792827673754364e-05, + "loss": 1.3247, + "step": 1792 + }, + { + "epoch": 0.5635196419105954, + "grad_norm": 0.7734375, + "learning_rate": 1.792573786099651e-05, + "loss": 1.2764, + "step": 1794 + }, + { + "epoch": 0.5641478689361368, + "grad_norm": 0.71484375, + "learning_rate": 1.7923198984449385e-05, + "loss": 1.2428, + "step": 1796 + }, + { + "epoch": 0.5647760959616781, + "grad_norm": 0.80078125, + "learning_rate": 1.7920660107902253e-05, + "loss": 1.4744, + "step": 1798 + }, + { + "epoch": 0.5654043229872195, + "grad_norm": 0.7421875, + "learning_rate": 1.7918121231355128e-05, + "loss": 1.3754, + "step": 1800 + }, + { + "epoch": 0.5660325500127609, + "grad_norm": 0.8828125, + "learning_rate": 1.7915582354808e-05, + "loss": 1.3084, + "step": 1802 + }, + { + "epoch": 0.5666607770383022, + "grad_norm": 0.75, + "learning_rate": 1.791304347826087e-05, + "loss": 1.3269, + "step": 1804 + }, + { + "epoch": 0.5672890040638435, + "grad_norm": 0.7265625, + "learning_rate": 1.7910504601713742e-05, + "loss": 1.3141, + "step": 1806 + }, + { + "epoch": 0.567917231089385, + "grad_norm": 0.90234375, + "learning_rate": 1.7907965725166617e-05, + "loss": 1.1482, + "step": 1808 + }, + { + "epoch": 0.5685454581149263, + "grad_norm": 0.69921875, + "learning_rate": 1.7905426848619488e-05, + "loss": 1.3093, + "step": 1810 + }, + { + "epoch": 0.5691736851404676, + "grad_norm": 0.6640625, + "learning_rate": 1.790288797207236e-05, + "loss": 1.4742, + "step": 1812 + }, + { + "epoch": 0.569801912166009, + "grad_norm": 0.8203125, + "learning_rate": 1.790034909552523e-05, + "loss": 1.3429, + "step": 1814 + }, + { + "epoch": 0.5704301391915504, + "grad_norm": 0.77734375, + "learning_rate": 1.7897810218978106e-05, + "loss": 1.3247, + "step": 1816 + }, + { + "epoch": 0.5710583662170917, + "grad_norm": 0.6875, + "learning_rate": 1.7895271342430974e-05, + "loss": 1.386, + "step": 1818 + }, + { + "epoch": 0.571686593242633, + "grad_norm": 0.6796875, + "learning_rate": 1.789273246588385e-05, + "loss": 1.3501, + "step": 1820 + }, + { + "epoch": 0.5723148202681744, + "grad_norm": 0.73828125, + "learning_rate": 1.789019358933672e-05, + "loss": 1.2759, + "step": 1822 + }, + { + "epoch": 0.5729430472937158, + "grad_norm": 0.78515625, + "learning_rate": 1.788765471278959e-05, + "loss": 1.2834, + "step": 1824 + }, + { + "epoch": 0.5735712743192571, + "grad_norm": 0.765625, + "learning_rate": 1.7885115836242463e-05, + "loss": 1.3764, + "step": 1826 + }, + { + "epoch": 0.5741995013447985, + "grad_norm": 0.80859375, + "learning_rate": 1.7882576959695337e-05, + "loss": 1.2428, + "step": 1828 + }, + { + "epoch": 0.5748277283703398, + "grad_norm": 0.78125, + "learning_rate": 1.788003808314821e-05, + "loss": 1.3577, + "step": 1830 + }, + { + "epoch": 0.5754559553958812, + "grad_norm": 0.94921875, + "learning_rate": 1.787749920660108e-05, + "loss": 1.2091, + "step": 1832 + }, + { + "epoch": 0.5760841824214226, + "grad_norm": 0.83203125, + "learning_rate": 1.787496033005395e-05, + "loss": 1.4, + "step": 1834 + }, + { + "epoch": 0.5767124094469639, + "grad_norm": 0.7109375, + "learning_rate": 1.7872421453506826e-05, + "loss": 1.3621, + "step": 1836 + }, + { + "epoch": 0.5773406364725052, + "grad_norm": 0.828125, + "learning_rate": 1.7869882576959698e-05, + "loss": 1.3756, + "step": 1838 + }, + { + "epoch": 0.5779688634980467, + "grad_norm": 0.68359375, + "learning_rate": 1.786734370041257e-05, + "loss": 1.3658, + "step": 1840 + }, + { + "epoch": 0.578597090523588, + "grad_norm": 0.7109375, + "learning_rate": 1.786480482386544e-05, + "loss": 1.2812, + "step": 1842 + }, + { + "epoch": 0.5792253175491293, + "grad_norm": 0.73828125, + "learning_rate": 1.7862265947318312e-05, + "loss": 1.4921, + "step": 1844 + }, + { + "epoch": 0.5798535445746706, + "grad_norm": 0.77734375, + "learning_rate": 1.7859727070771183e-05, + "loss": 1.3042, + "step": 1846 + }, + { + "epoch": 0.5804817716002121, + "grad_norm": 1.203125, + "learning_rate": 1.7857188194224058e-05, + "loss": 1.1429, + "step": 1848 + }, + { + "epoch": 0.5811099986257534, + "grad_norm": 0.73046875, + "learning_rate": 1.785464931767693e-05, + "loss": 1.4471, + "step": 1850 + }, + { + "epoch": 0.5817382256512947, + "grad_norm": 0.6953125, + "learning_rate": 1.78521104411298e-05, + "loss": 1.3808, + "step": 1852 + }, + { + "epoch": 0.5823664526768361, + "grad_norm": 0.94140625, + "learning_rate": 1.7849571564582672e-05, + "loss": 1.3266, + "step": 1854 + }, + { + "epoch": 0.5829946797023774, + "grad_norm": 0.68359375, + "learning_rate": 1.7847032688035547e-05, + "loss": 1.4399, + "step": 1856 + }, + { + "epoch": 0.5836229067279188, + "grad_norm": 0.75, + "learning_rate": 1.784449381148842e-05, + "loss": 1.2884, + "step": 1858 + }, + { + "epoch": 0.5842511337534602, + "grad_norm": 0.6796875, + "learning_rate": 1.784195493494129e-05, + "loss": 1.3308, + "step": 1860 + }, + { + "epoch": 0.5848793607790015, + "grad_norm": 0.7890625, + "learning_rate": 1.783941605839416e-05, + "loss": 1.3215, + "step": 1862 + }, + { + "epoch": 0.5855075878045428, + "grad_norm": 0.8671875, + "learning_rate": 1.7836877181847036e-05, + "loss": 1.4684, + "step": 1864 + }, + { + "epoch": 0.5861358148300843, + "grad_norm": 0.6875, + "learning_rate": 1.7834338305299904e-05, + "loss": 1.293, + "step": 1866 + }, + { + "epoch": 0.5867640418556256, + "grad_norm": 0.7578125, + "learning_rate": 1.783179942875278e-05, + "loss": 1.2667, + "step": 1868 + }, + { + "epoch": 0.5873922688811669, + "grad_norm": 0.76953125, + "learning_rate": 1.782926055220565e-05, + "loss": 1.3243, + "step": 1870 + }, + { + "epoch": 0.5880204959067082, + "grad_norm": 0.79296875, + "learning_rate": 1.782672167565852e-05, + "loss": 1.2651, + "step": 1872 + }, + { + "epoch": 0.5886487229322497, + "grad_norm": 0.69921875, + "learning_rate": 1.7824182799111393e-05, + "loss": 1.2973, + "step": 1874 + }, + { + "epoch": 0.589276949957791, + "grad_norm": 0.73828125, + "learning_rate": 1.7821643922564268e-05, + "loss": 1.2823, + "step": 1876 + }, + { + "epoch": 0.5899051769833323, + "grad_norm": 0.94921875, + "learning_rate": 1.781910504601714e-05, + "loss": 1.415, + "step": 1878 + }, + { + "epoch": 0.5905334040088737, + "grad_norm": 0.76171875, + "learning_rate": 1.781656616947001e-05, + "loss": 1.2477, + "step": 1880 + }, + { + "epoch": 0.5911616310344151, + "grad_norm": 0.80078125, + "learning_rate": 1.7814027292922885e-05, + "loss": 1.2649, + "step": 1882 + }, + { + "epoch": 0.5917898580599564, + "grad_norm": 0.64453125, + "learning_rate": 1.7811488416375756e-05, + "loss": 1.3797, + "step": 1884 + }, + { + "epoch": 0.5924180850854978, + "grad_norm": 0.75390625, + "learning_rate": 1.7808949539828628e-05, + "loss": 1.3717, + "step": 1886 + }, + { + "epoch": 0.5930463121110391, + "grad_norm": 0.70703125, + "learning_rate": 1.78064106632815e-05, + "loss": 1.2677, + "step": 1888 + }, + { + "epoch": 0.5936745391365805, + "grad_norm": 0.78515625, + "learning_rate": 1.7803871786734374e-05, + "loss": 1.4157, + "step": 1890 + }, + { + "epoch": 0.5943027661621219, + "grad_norm": 0.6875, + "learning_rate": 1.7801332910187242e-05, + "loss": 1.2478, + "step": 1892 + }, + { + "epoch": 0.5949309931876632, + "grad_norm": 0.73046875, + "learning_rate": 1.7798794033640117e-05, + "loss": 1.3108, + "step": 1894 + }, + { + "epoch": 0.5955592202132045, + "grad_norm": 0.75, + "learning_rate": 1.7796255157092988e-05, + "loss": 1.3043, + "step": 1896 + }, + { + "epoch": 0.596187447238746, + "grad_norm": 0.703125, + "learning_rate": 1.779371628054586e-05, + "loss": 1.259, + "step": 1898 + }, + { + "epoch": 0.5968156742642873, + "grad_norm": 0.83203125, + "learning_rate": 1.779117740399873e-05, + "loss": 1.2671, + "step": 1900 + }, + { + "epoch": 0.5974439012898286, + "grad_norm": 0.7734375, + "learning_rate": 1.7788638527451606e-05, + "loss": 1.3808, + "step": 1902 + }, + { + "epoch": 0.5980721283153699, + "grad_norm": 0.8828125, + "learning_rate": 1.7786099650904477e-05, + "loss": 1.3359, + "step": 1904 + }, + { + "epoch": 0.5987003553409114, + "grad_norm": 0.8359375, + "learning_rate": 1.778356077435735e-05, + "loss": 1.3205, + "step": 1906 + }, + { + "epoch": 0.5993285823664527, + "grad_norm": 0.73828125, + "learning_rate": 1.778102189781022e-05, + "loss": 1.3357, + "step": 1908 + }, + { + "epoch": 0.599956809391994, + "grad_norm": 0.76953125, + "learning_rate": 1.7778483021263095e-05, + "loss": 1.3952, + "step": 1910 + }, + { + "epoch": 0.6005850364175354, + "grad_norm": 0.828125, + "learning_rate": 1.7775944144715963e-05, + "loss": 1.2332, + "step": 1912 + }, + { + "epoch": 0.6012132634430768, + "grad_norm": 0.828125, + "learning_rate": 1.7773405268168837e-05, + "loss": 1.3406, + "step": 1914 + }, + { + "epoch": 0.6018414904686181, + "grad_norm": 0.71875, + "learning_rate": 1.777086639162171e-05, + "loss": 1.3423, + "step": 1916 + }, + { + "epoch": 0.6024697174941595, + "grad_norm": 0.74609375, + "learning_rate": 1.776832751507458e-05, + "loss": 1.3578, + "step": 1918 + }, + { + "epoch": 0.6030979445197008, + "grad_norm": 0.65625, + "learning_rate": 1.776578863852745e-05, + "loss": 1.3513, + "step": 1920 + }, + { + "epoch": 0.6037261715452421, + "grad_norm": 0.8203125, + "learning_rate": 1.7763249761980326e-05, + "loss": 1.2171, + "step": 1922 + }, + { + "epoch": 0.6043543985707835, + "grad_norm": 0.72265625, + "learning_rate": 1.7760710885433198e-05, + "loss": 1.3335, + "step": 1924 + }, + { + "epoch": 0.6049826255963249, + "grad_norm": 0.83203125, + "learning_rate": 1.775817200888607e-05, + "loss": 1.3946, + "step": 1926 + }, + { + "epoch": 0.6056108526218662, + "grad_norm": 0.76953125, + "learning_rate": 1.775563313233894e-05, + "loss": 1.2521, + "step": 1928 + }, + { + "epoch": 0.6062390796474075, + "grad_norm": 0.78125, + "learning_rate": 1.7753094255791815e-05, + "loss": 1.4663, + "step": 1930 + }, + { + "epoch": 0.606867306672949, + "grad_norm": 0.71484375, + "learning_rate": 1.7750555379244687e-05, + "loss": 1.1201, + "step": 1932 + }, + { + "epoch": 0.6074955336984903, + "grad_norm": 0.78515625, + "learning_rate": 1.7748016502697558e-05, + "loss": 1.4028, + "step": 1934 + }, + { + "epoch": 0.6081237607240316, + "grad_norm": 0.7734375, + "learning_rate": 1.774547762615043e-05, + "loss": 1.2642, + "step": 1936 + }, + { + "epoch": 0.608751987749573, + "grad_norm": 0.76953125, + "learning_rate": 1.77429387496033e-05, + "loss": 1.2945, + "step": 1938 + }, + { + "epoch": 0.6093802147751144, + "grad_norm": 0.76953125, + "learning_rate": 1.7740399873056172e-05, + "loss": 1.265, + "step": 1940 + }, + { + "epoch": 0.6100084418006557, + "grad_norm": 0.859375, + "learning_rate": 1.7737860996509047e-05, + "loss": 1.4148, + "step": 1942 + }, + { + "epoch": 0.610636668826197, + "grad_norm": 0.66796875, + "learning_rate": 1.7735322119961918e-05, + "loss": 1.2506, + "step": 1944 + }, + { + "epoch": 0.6112648958517384, + "grad_norm": 0.90234375, + "learning_rate": 1.773278324341479e-05, + "loss": 1.3281, + "step": 1946 + }, + { + "epoch": 0.6118931228772798, + "grad_norm": 0.7109375, + "learning_rate": 1.773024436686766e-05, + "loss": 1.273, + "step": 1948 + }, + { + "epoch": 0.6125213499028211, + "grad_norm": 0.75, + "learning_rate": 1.7727705490320536e-05, + "loss": 1.2533, + "step": 1950 + }, + { + "epoch": 0.6131495769283625, + "grad_norm": 0.77734375, + "learning_rate": 1.7725166613773407e-05, + "loss": 1.2262, + "step": 1952 + }, + { + "epoch": 0.6137778039539038, + "grad_norm": 0.72265625, + "learning_rate": 1.772262773722628e-05, + "loss": 1.2834, + "step": 1954 + }, + { + "epoch": 0.6144060309794452, + "grad_norm": 0.6796875, + "learning_rate": 1.772008886067915e-05, + "loss": 1.286, + "step": 1956 + }, + { + "epoch": 0.6150342580049866, + "grad_norm": 0.7265625, + "learning_rate": 1.7717549984132025e-05, + "loss": 1.2474, + "step": 1958 + }, + { + "epoch": 0.6156624850305279, + "grad_norm": 0.71484375, + "learning_rate": 1.7715011107584893e-05, + "loss": 1.345, + "step": 1960 + }, + { + "epoch": 0.6162907120560692, + "grad_norm": 0.79296875, + "learning_rate": 1.7712472231037767e-05, + "loss": 1.2253, + "step": 1962 + }, + { + "epoch": 0.6169189390816107, + "grad_norm": 0.76953125, + "learning_rate": 1.770993335449064e-05, + "loss": 1.3628, + "step": 1964 + }, + { + "epoch": 0.617547166107152, + "grad_norm": 0.76953125, + "learning_rate": 1.770739447794351e-05, + "loss": 1.2676, + "step": 1966 + }, + { + "epoch": 0.6181753931326933, + "grad_norm": 0.72265625, + "learning_rate": 1.7704855601396385e-05, + "loss": 1.2463, + "step": 1968 + }, + { + "epoch": 0.6188036201582346, + "grad_norm": 0.7109375, + "learning_rate": 1.7702316724849256e-05, + "loss": 1.3617, + "step": 1970 + }, + { + "epoch": 0.6194318471837761, + "grad_norm": 0.83984375, + "learning_rate": 1.7699777848302128e-05, + "loss": 1.4785, + "step": 1972 + }, + { + "epoch": 0.6200600742093174, + "grad_norm": 0.76953125, + "learning_rate": 1.7697238971755e-05, + "loss": 1.2933, + "step": 1974 + }, + { + "epoch": 0.6206883012348587, + "grad_norm": 0.70703125, + "learning_rate": 1.7694700095207874e-05, + "loss": 1.4211, + "step": 1976 + }, + { + "epoch": 0.6213165282604001, + "grad_norm": 0.734375, + "learning_rate": 1.7692161218660745e-05, + "loss": 1.4411, + "step": 1978 + }, + { + "epoch": 0.6219447552859415, + "grad_norm": 0.70703125, + "learning_rate": 1.7689622342113617e-05, + "loss": 1.2598, + "step": 1980 + }, + { + "epoch": 0.6225729823114828, + "grad_norm": 0.71875, + "learning_rate": 1.7687083465566488e-05, + "loss": 1.2098, + "step": 1982 + }, + { + "epoch": 0.6232012093370242, + "grad_norm": 0.69921875, + "learning_rate": 1.7684544589019363e-05, + "loss": 1.3236, + "step": 1984 + }, + { + "epoch": 0.6238294363625655, + "grad_norm": 0.73046875, + "learning_rate": 1.768200571247223e-05, + "loss": 1.3541, + "step": 1986 + }, + { + "epoch": 0.6244576633881069, + "grad_norm": 0.84765625, + "learning_rate": 1.7679466835925106e-05, + "loss": 1.2746, + "step": 1988 + }, + { + "epoch": 0.6250858904136483, + "grad_norm": 0.86328125, + "learning_rate": 1.7676927959377977e-05, + "loss": 1.3703, + "step": 1990 + }, + { + "epoch": 0.6257141174391896, + "grad_norm": 0.80859375, + "learning_rate": 1.767438908283085e-05, + "loss": 1.2673, + "step": 1992 + }, + { + "epoch": 0.6263423444647309, + "grad_norm": 0.88671875, + "learning_rate": 1.767185020628372e-05, + "loss": 1.2734, + "step": 1994 + }, + { + "epoch": 0.6269705714902722, + "grad_norm": 0.8125, + "learning_rate": 1.7669311329736595e-05, + "loss": 1.2994, + "step": 1996 + }, + { + "epoch": 0.6275987985158137, + "grad_norm": 0.84765625, + "learning_rate": 1.7666772453189466e-05, + "loss": 1.2314, + "step": 1998 + }, + { + "epoch": 0.628227025541355, + "grad_norm": 0.71875, + "learning_rate": 1.7664233576642337e-05, + "loss": 1.3692, + "step": 2000 + }, + { + "epoch": 0.6288552525668963, + "grad_norm": 0.703125, + "learning_rate": 1.766169470009521e-05, + "loss": 1.1083, + "step": 2002 + }, + { + "epoch": 0.6294834795924377, + "grad_norm": 0.71875, + "learning_rate": 1.7659155823548083e-05, + "loss": 1.3513, + "step": 2004 + }, + { + "epoch": 0.6301117066179791, + "grad_norm": 0.71875, + "learning_rate": 1.765661694700095e-05, + "loss": 1.2768, + "step": 2006 + }, + { + "epoch": 0.6307399336435204, + "grad_norm": 0.77734375, + "learning_rate": 1.7654078070453826e-05, + "loss": 1.399, + "step": 2008 + }, + { + "epoch": 0.6313681606690618, + "grad_norm": 0.7734375, + "learning_rate": 1.7651539193906698e-05, + "loss": 1.3596, + "step": 2010 + }, + { + "epoch": 0.6319963876946031, + "grad_norm": 0.99609375, + "learning_rate": 1.764900031735957e-05, + "loss": 1.3298, + "step": 2012 + }, + { + "epoch": 0.6326246147201445, + "grad_norm": 0.81640625, + "learning_rate": 1.764646144081244e-05, + "loss": 1.3194, + "step": 2014 + }, + { + "epoch": 0.6332528417456859, + "grad_norm": 0.78125, + "learning_rate": 1.7643922564265315e-05, + "loss": 1.2478, + "step": 2016 + }, + { + "epoch": 0.6338810687712272, + "grad_norm": 0.78125, + "learning_rate": 1.7641383687718187e-05, + "loss": 1.285, + "step": 2018 + }, + { + "epoch": 0.6345092957967685, + "grad_norm": 0.75, + "learning_rate": 1.7638844811171058e-05, + "loss": 1.4251, + "step": 2020 + }, + { + "epoch": 0.63513752282231, + "grad_norm": 0.97265625, + "learning_rate": 1.763630593462393e-05, + "loss": 1.281, + "step": 2022 + }, + { + "epoch": 0.6357657498478513, + "grad_norm": 0.859375, + "learning_rate": 1.7633767058076804e-05, + "loss": 1.3546, + "step": 2024 + }, + { + "epoch": 0.6363939768733926, + "grad_norm": 0.6796875, + "learning_rate": 1.7631228181529672e-05, + "loss": 1.2134, + "step": 2026 + }, + { + "epoch": 0.6370222038989339, + "grad_norm": 0.7734375, + "learning_rate": 1.7628689304982547e-05, + "loss": 1.352, + "step": 2028 + }, + { + "epoch": 0.6376504309244754, + "grad_norm": 0.69140625, + "learning_rate": 1.7626150428435418e-05, + "loss": 1.3923, + "step": 2030 + }, + { + "epoch": 0.6382786579500167, + "grad_norm": 0.69140625, + "learning_rate": 1.762361155188829e-05, + "loss": 1.3658, + "step": 2032 + }, + { + "epoch": 0.638906884975558, + "grad_norm": 0.96875, + "learning_rate": 1.762107267534116e-05, + "loss": 1.2421, + "step": 2034 + }, + { + "epoch": 0.6395351120010994, + "grad_norm": 0.71875, + "learning_rate": 1.7618533798794036e-05, + "loss": 1.3427, + "step": 2036 + }, + { + "epoch": 0.6401633390266408, + "grad_norm": 0.8515625, + "learning_rate": 1.7615994922246907e-05, + "loss": 1.3342, + "step": 2038 + }, + { + "epoch": 0.6407915660521821, + "grad_norm": 0.7578125, + "learning_rate": 1.761345604569978e-05, + "loss": 1.4221, + "step": 2040 + }, + { + "epoch": 0.6414197930777235, + "grad_norm": 0.73046875, + "learning_rate": 1.761091716915265e-05, + "loss": 1.3898, + "step": 2042 + }, + { + "epoch": 0.6420480201032648, + "grad_norm": 0.703125, + "learning_rate": 1.7608378292605525e-05, + "loss": 1.5576, + "step": 2044 + }, + { + "epoch": 0.6426762471288062, + "grad_norm": 0.79296875, + "learning_rate": 1.7605839416058396e-05, + "loss": 1.3117, + "step": 2046 + }, + { + "epoch": 0.6433044741543475, + "grad_norm": 0.76171875, + "learning_rate": 1.7603300539511267e-05, + "loss": 1.2932, + "step": 2048 + }, + { + "epoch": 0.6439327011798889, + "grad_norm": 0.7734375, + "learning_rate": 1.7600761662964142e-05, + "loss": 1.2463, + "step": 2050 + }, + { + "epoch": 0.6445609282054302, + "grad_norm": 0.703125, + "learning_rate": 1.759822278641701e-05, + "loss": 1.3657, + "step": 2052 + }, + { + "epoch": 0.6451891552309716, + "grad_norm": 0.7734375, + "learning_rate": 1.7595683909869885e-05, + "loss": 1.4386, + "step": 2054 + }, + { + "epoch": 0.645817382256513, + "grad_norm": 0.80078125, + "learning_rate": 1.7593145033322756e-05, + "loss": 1.3022, + "step": 2056 + }, + { + "epoch": 0.6464456092820543, + "grad_norm": 1.0546875, + "learning_rate": 1.7590606156775628e-05, + "loss": 1.5185, + "step": 2058 + }, + { + "epoch": 0.6470738363075956, + "grad_norm": 0.890625, + "learning_rate": 1.75880672802285e-05, + "loss": 1.1322, + "step": 2060 + }, + { + "epoch": 0.647702063333137, + "grad_norm": 0.671875, + "learning_rate": 1.7585528403681374e-05, + "loss": 1.3653, + "step": 2062 + }, + { + "epoch": 0.6483302903586784, + "grad_norm": 0.81640625, + "learning_rate": 1.7582989527134245e-05, + "loss": 1.2649, + "step": 2064 + }, + { + "epoch": 0.6489585173842197, + "grad_norm": 0.7578125, + "learning_rate": 1.7580450650587117e-05, + "loss": 1.4218, + "step": 2066 + }, + { + "epoch": 0.649586744409761, + "grad_norm": 0.84375, + "learning_rate": 1.7577911774039988e-05, + "loss": 1.42, + "step": 2068 + }, + { + "epoch": 0.6502149714353024, + "grad_norm": 0.83984375, + "learning_rate": 1.7575372897492863e-05, + "loss": 1.2207, + "step": 2070 + }, + { + "epoch": 0.6508431984608438, + "grad_norm": 0.73046875, + "learning_rate": 1.7572834020945734e-05, + "loss": 1.3358, + "step": 2072 + }, + { + "epoch": 0.6514714254863851, + "grad_norm": 0.921875, + "learning_rate": 1.7570295144398606e-05, + "loss": 1.2739, + "step": 2074 + }, + { + "epoch": 0.6520996525119265, + "grad_norm": 0.8125, + "learning_rate": 1.7567756267851477e-05, + "loss": 1.3296, + "step": 2076 + }, + { + "epoch": 0.6527278795374678, + "grad_norm": 0.76171875, + "learning_rate": 1.756521739130435e-05, + "loss": 1.3499, + "step": 2078 + }, + { + "epoch": 0.6533561065630092, + "grad_norm": 0.78515625, + "learning_rate": 1.756267851475722e-05, + "loss": 1.3431, + "step": 2080 + }, + { + "epoch": 0.6539843335885506, + "grad_norm": 0.6796875, + "learning_rate": 1.7560139638210094e-05, + "loss": 1.4784, + "step": 2082 + }, + { + "epoch": 0.6546125606140919, + "grad_norm": 0.7265625, + "learning_rate": 1.7557600761662966e-05, + "loss": 1.2925, + "step": 2084 + }, + { + "epoch": 0.6552407876396332, + "grad_norm": 0.828125, + "learning_rate": 1.7555061885115837e-05, + "loss": 1.2877, + "step": 2086 + }, + { + "epoch": 0.6558690146651747, + "grad_norm": 0.74609375, + "learning_rate": 1.755252300856871e-05, + "loss": 1.3971, + "step": 2088 + }, + { + "epoch": 0.656497241690716, + "grad_norm": 0.69921875, + "learning_rate": 1.7549984132021583e-05, + "loss": 1.2842, + "step": 2090 + }, + { + "epoch": 0.6571254687162573, + "grad_norm": 0.734375, + "learning_rate": 1.7547445255474455e-05, + "loss": 1.1632, + "step": 2092 + }, + { + "epoch": 0.6577536957417986, + "grad_norm": 0.828125, + "learning_rate": 1.7544906378927326e-05, + "loss": 1.3903, + "step": 2094 + }, + { + "epoch": 0.6583819227673401, + "grad_norm": 0.7421875, + "learning_rate": 1.7542367502380198e-05, + "loss": 1.4009, + "step": 2096 + }, + { + "epoch": 0.6590101497928814, + "grad_norm": 0.75390625, + "learning_rate": 1.7539828625833072e-05, + "loss": 1.353, + "step": 2098 + }, + { + "epoch": 0.6596383768184227, + "grad_norm": 0.7421875, + "learning_rate": 1.753728974928594e-05, + "loss": 1.4069, + "step": 2100 + }, + { + "epoch": 0.6602666038439641, + "grad_norm": 0.828125, + "learning_rate": 1.7534750872738815e-05, + "loss": 1.2694, + "step": 2102 + }, + { + "epoch": 0.6608948308695055, + "grad_norm": 0.6796875, + "learning_rate": 1.7532211996191686e-05, + "loss": 1.2923, + "step": 2104 + }, + { + "epoch": 0.6615230578950468, + "grad_norm": 0.75390625, + "learning_rate": 1.7529673119644558e-05, + "loss": 1.2756, + "step": 2106 + }, + { + "epoch": 0.6621512849205882, + "grad_norm": 0.7421875, + "learning_rate": 1.752713424309743e-05, + "loss": 1.4151, + "step": 2108 + }, + { + "epoch": 0.6627795119461295, + "grad_norm": 0.71875, + "learning_rate": 1.7524595366550304e-05, + "loss": 1.3067, + "step": 2110 + }, + { + "epoch": 0.6634077389716709, + "grad_norm": 0.70703125, + "learning_rate": 1.7522056490003175e-05, + "loss": 1.3295, + "step": 2112 + }, + { + "epoch": 0.6640359659972123, + "grad_norm": 0.7421875, + "learning_rate": 1.7519517613456047e-05, + "loss": 1.3994, + "step": 2114 + }, + { + "epoch": 0.6646641930227536, + "grad_norm": 0.79296875, + "learning_rate": 1.7516978736908918e-05, + "loss": 1.3512, + "step": 2116 + }, + { + "epoch": 0.6652924200482949, + "grad_norm": 0.71484375, + "learning_rate": 1.7514439860361793e-05, + "loss": 1.2393, + "step": 2118 + }, + { + "epoch": 0.6659206470738364, + "grad_norm": 0.7734375, + "learning_rate": 1.751190098381466e-05, + "loss": 1.2977, + "step": 2120 + }, + { + "epoch": 0.6665488740993777, + "grad_norm": 0.73828125, + "learning_rate": 1.7509362107267536e-05, + "loss": 1.4039, + "step": 2122 + }, + { + "epoch": 0.667177101124919, + "grad_norm": 0.7265625, + "learning_rate": 1.7506823230720407e-05, + "loss": 1.3294, + "step": 2124 + }, + { + "epoch": 0.6678053281504603, + "grad_norm": 0.69921875, + "learning_rate": 1.750428435417328e-05, + "loss": 1.2816, + "step": 2126 + }, + { + "epoch": 0.6684335551760018, + "grad_norm": 0.75, + "learning_rate": 1.750174547762615e-05, + "loss": 1.3298, + "step": 2128 + }, + { + "epoch": 0.6690617822015431, + "grad_norm": 0.6796875, + "learning_rate": 1.7499206601079025e-05, + "loss": 1.3823, + "step": 2130 + }, + { + "epoch": 0.6696900092270844, + "grad_norm": 0.72265625, + "learning_rate": 1.7496667724531896e-05, + "loss": 1.2973, + "step": 2132 + }, + { + "epoch": 0.6703182362526258, + "grad_norm": 0.67578125, + "learning_rate": 1.7494128847984767e-05, + "loss": 1.3873, + "step": 2134 + }, + { + "epoch": 0.6709464632781671, + "grad_norm": 0.71484375, + "learning_rate": 1.7491589971437642e-05, + "loss": 1.3746, + "step": 2136 + }, + { + "epoch": 0.6715746903037085, + "grad_norm": 0.71875, + "learning_rate": 1.7489051094890514e-05, + "loss": 1.2803, + "step": 2138 + }, + { + "epoch": 0.6722029173292499, + "grad_norm": 0.78515625, + "learning_rate": 1.7486512218343385e-05, + "loss": 1.3632, + "step": 2140 + }, + { + "epoch": 0.6728311443547912, + "grad_norm": 0.75, + "learning_rate": 1.7483973341796256e-05, + "loss": 1.3377, + "step": 2142 + }, + { + "epoch": 0.6734593713803325, + "grad_norm": 0.69921875, + "learning_rate": 1.748143446524913e-05, + "loss": 1.2896, + "step": 2144 + }, + { + "epoch": 0.674087598405874, + "grad_norm": 0.890625, + "learning_rate": 1.7478895588702e-05, + "loss": 1.2543, + "step": 2146 + }, + { + "epoch": 0.6747158254314153, + "grad_norm": 0.87109375, + "learning_rate": 1.7476356712154874e-05, + "loss": 1.2882, + "step": 2148 + }, + { + "epoch": 0.6753440524569566, + "grad_norm": 0.86328125, + "learning_rate": 1.7473817835607745e-05, + "loss": 1.3234, + "step": 2150 + }, + { + "epoch": 0.6759722794824979, + "grad_norm": 0.75390625, + "learning_rate": 1.7471278959060617e-05, + "loss": 1.2965, + "step": 2152 + }, + { + "epoch": 0.6766005065080394, + "grad_norm": 0.67578125, + "learning_rate": 1.7468740082513488e-05, + "loss": 1.4172, + "step": 2154 + }, + { + "epoch": 0.6772287335335807, + "grad_norm": 0.73828125, + "learning_rate": 1.7466201205966363e-05, + "loss": 1.3369, + "step": 2156 + }, + { + "epoch": 0.677856960559122, + "grad_norm": 0.7265625, + "learning_rate": 1.7463662329419234e-05, + "loss": 1.3239, + "step": 2158 + }, + { + "epoch": 0.6784851875846634, + "grad_norm": 0.7265625, + "learning_rate": 1.7461123452872105e-05, + "loss": 1.2926, + "step": 2160 + }, + { + "epoch": 0.6791134146102048, + "grad_norm": 0.83203125, + "learning_rate": 1.7458584576324977e-05, + "loss": 1.3588, + "step": 2162 + }, + { + "epoch": 0.6797416416357461, + "grad_norm": 0.87109375, + "learning_rate": 1.745604569977785e-05, + "loss": 1.1972, + "step": 2164 + }, + { + "epoch": 0.6803698686612875, + "grad_norm": 0.71484375, + "learning_rate": 1.7453506823230723e-05, + "loss": 1.2391, + "step": 2166 + }, + { + "epoch": 0.6809980956868288, + "grad_norm": 0.82421875, + "learning_rate": 1.7450967946683594e-05, + "loss": 1.3438, + "step": 2168 + }, + { + "epoch": 0.6816263227123702, + "grad_norm": 0.72265625, + "learning_rate": 1.7448429070136466e-05, + "loss": 1.4117, + "step": 2170 + }, + { + "epoch": 0.6822545497379116, + "grad_norm": 0.79296875, + "learning_rate": 1.7445890193589337e-05, + "loss": 1.388, + "step": 2172 + }, + { + "epoch": 0.6828827767634529, + "grad_norm": 0.6875, + "learning_rate": 1.744335131704221e-05, + "loss": 1.3602, + "step": 2174 + }, + { + "epoch": 0.6835110037889942, + "grad_norm": 1.0546875, + "learning_rate": 1.7440812440495083e-05, + "loss": 1.2999, + "step": 2176 + }, + { + "epoch": 0.6841392308145356, + "grad_norm": 0.828125, + "learning_rate": 1.7438273563947955e-05, + "loss": 1.3296, + "step": 2178 + }, + { + "epoch": 0.684767457840077, + "grad_norm": 0.78125, + "learning_rate": 1.7435734687400826e-05, + "loss": 1.302, + "step": 2180 + }, + { + "epoch": 0.6853956848656183, + "grad_norm": 0.73046875, + "learning_rate": 1.7433195810853697e-05, + "loss": 1.321, + "step": 2182 + }, + { + "epoch": 0.6860239118911596, + "grad_norm": 0.78515625, + "learning_rate": 1.7430656934306572e-05, + "loss": 1.3628, + "step": 2184 + }, + { + "epoch": 0.6866521389167011, + "grad_norm": 0.8671875, + "learning_rate": 1.7428118057759444e-05, + "loss": 1.3183, + "step": 2186 + }, + { + "epoch": 0.6872803659422424, + "grad_norm": 0.921875, + "learning_rate": 1.7425579181212315e-05, + "loss": 1.4956, + "step": 2188 + }, + { + "epoch": 0.6879085929677837, + "grad_norm": 0.7265625, + "learning_rate": 1.7423040304665186e-05, + "loss": 1.4264, + "step": 2190 + }, + { + "epoch": 0.688536819993325, + "grad_norm": 0.765625, + "learning_rate": 1.742050142811806e-05, + "loss": 1.3176, + "step": 2192 + }, + { + "epoch": 0.6891650470188665, + "grad_norm": 0.78515625, + "learning_rate": 1.741796255157093e-05, + "loss": 1.3268, + "step": 2194 + }, + { + "epoch": 0.6897932740444078, + "grad_norm": 1.046875, + "learning_rate": 1.7415423675023804e-05, + "loss": 1.346, + "step": 2196 + }, + { + "epoch": 0.6904215010699492, + "grad_norm": 0.80078125, + "learning_rate": 1.7412884798476675e-05, + "loss": 1.3614, + "step": 2198 + }, + { + "epoch": 0.6910497280954905, + "grad_norm": 0.7265625, + "learning_rate": 1.7410345921929547e-05, + "loss": 1.2779, + "step": 2200 + }, + { + "epoch": 0.6916779551210319, + "grad_norm": 0.7265625, + "learning_rate": 1.7407807045382418e-05, + "loss": 1.2913, + "step": 2202 + }, + { + "epoch": 0.6923061821465732, + "grad_norm": 0.86328125, + "learning_rate": 1.7405268168835293e-05, + "loss": 1.3669, + "step": 2204 + }, + { + "epoch": 0.6929344091721146, + "grad_norm": 0.71484375, + "learning_rate": 1.7402729292288164e-05, + "loss": 1.2992, + "step": 2206 + }, + { + "epoch": 0.6935626361976559, + "grad_norm": 0.80078125, + "learning_rate": 1.7400190415741036e-05, + "loss": 1.3157, + "step": 2208 + }, + { + "epoch": 0.6941908632231972, + "grad_norm": 0.828125, + "learning_rate": 1.7397651539193907e-05, + "loss": 1.1712, + "step": 2210 + }, + { + "epoch": 0.6948190902487387, + "grad_norm": 0.78515625, + "learning_rate": 1.7395112662646782e-05, + "loss": 1.1813, + "step": 2212 + }, + { + "epoch": 0.69544731727428, + "grad_norm": 0.76171875, + "learning_rate": 1.739257378609965e-05, + "loss": 1.3688, + "step": 2214 + }, + { + "epoch": 0.6960755442998213, + "grad_norm": 0.9453125, + "learning_rate": 1.7390034909552525e-05, + "loss": 1.3554, + "step": 2216 + }, + { + "epoch": 0.6967037713253627, + "grad_norm": 0.87890625, + "learning_rate": 1.7387496033005396e-05, + "loss": 1.3605, + "step": 2218 + }, + { + "epoch": 0.6973319983509041, + "grad_norm": 0.890625, + "learning_rate": 1.7384957156458267e-05, + "loss": 1.2138, + "step": 2220 + }, + { + "epoch": 0.6979602253764454, + "grad_norm": 0.8515625, + "learning_rate": 1.7382418279911142e-05, + "loss": 1.309, + "step": 2222 + }, + { + "epoch": 0.6985884524019867, + "grad_norm": 0.78515625, + "learning_rate": 1.7379879403364013e-05, + "loss": 1.2578, + "step": 2224 + }, + { + "epoch": 0.6992166794275281, + "grad_norm": 0.78515625, + "learning_rate": 1.7377340526816885e-05, + "loss": 1.3457, + "step": 2226 + }, + { + "epoch": 0.6998449064530695, + "grad_norm": 0.7890625, + "learning_rate": 1.7374801650269756e-05, + "loss": 1.2938, + "step": 2228 + }, + { + "epoch": 0.7004731334786108, + "grad_norm": 0.8984375, + "learning_rate": 1.737226277372263e-05, + "loss": 1.284, + "step": 2230 + }, + { + "epoch": 0.7011013605041522, + "grad_norm": 0.72265625, + "learning_rate": 1.7369723897175502e-05, + "loss": 1.415, + "step": 2232 + }, + { + "epoch": 0.7017295875296935, + "grad_norm": 0.94921875, + "learning_rate": 1.7367185020628374e-05, + "loss": 1.2291, + "step": 2234 + }, + { + "epoch": 0.7023578145552349, + "grad_norm": 0.74609375, + "learning_rate": 1.7364646144081245e-05, + "loss": 1.374, + "step": 2236 + }, + { + "epoch": 0.7029860415807763, + "grad_norm": 0.6953125, + "learning_rate": 1.736210726753412e-05, + "loss": 1.4697, + "step": 2238 + }, + { + "epoch": 0.7036142686063176, + "grad_norm": 0.71484375, + "learning_rate": 1.7359568390986988e-05, + "loss": 1.2784, + "step": 2240 + }, + { + "epoch": 0.7042424956318589, + "grad_norm": 0.73828125, + "learning_rate": 1.7357029514439863e-05, + "loss": 1.2381, + "step": 2242 + }, + { + "epoch": 0.7048707226574004, + "grad_norm": 0.78125, + "learning_rate": 1.7354490637892734e-05, + "loss": 1.2173, + "step": 2244 + }, + { + "epoch": 0.7054989496829417, + "grad_norm": 0.77734375, + "learning_rate": 1.7351951761345605e-05, + "loss": 1.2839, + "step": 2246 + }, + { + "epoch": 0.706127176708483, + "grad_norm": 0.6953125, + "learning_rate": 1.7349412884798477e-05, + "loss": 1.3768, + "step": 2248 + }, + { + "epoch": 0.7067554037340243, + "grad_norm": 0.81640625, + "learning_rate": 1.734687400825135e-05, + "loss": 1.3607, + "step": 2250 + }, + { + "epoch": 0.7073836307595658, + "grad_norm": 0.703125, + "learning_rate": 1.7344335131704223e-05, + "loss": 1.3943, + "step": 2252 + }, + { + "epoch": 0.7080118577851071, + "grad_norm": 0.6875, + "learning_rate": 1.7341796255157094e-05, + "loss": 1.4092, + "step": 2254 + }, + { + "epoch": 0.7086400848106484, + "grad_norm": 0.75, + "learning_rate": 1.7339257378609966e-05, + "loss": 1.2429, + "step": 2256 + }, + { + "epoch": 0.7092683118361898, + "grad_norm": 0.7109375, + "learning_rate": 1.733671850206284e-05, + "loss": 1.458, + "step": 2258 + }, + { + "epoch": 0.7098965388617312, + "grad_norm": 0.67578125, + "learning_rate": 1.7334179625515712e-05, + "loss": 1.3227, + "step": 2260 + }, + { + "epoch": 0.7105247658872725, + "grad_norm": 0.79296875, + "learning_rate": 1.7331640748968583e-05, + "loss": 1.4453, + "step": 2262 + }, + { + "epoch": 0.7111529929128139, + "grad_norm": 0.74609375, + "learning_rate": 1.7329101872421455e-05, + "loss": 1.2725, + "step": 2264 + }, + { + "epoch": 0.7117812199383552, + "grad_norm": 0.74609375, + "learning_rate": 1.7326562995874326e-05, + "loss": 1.2165, + "step": 2266 + }, + { + "epoch": 0.7124094469638966, + "grad_norm": 0.76171875, + "learning_rate": 1.7324024119327197e-05, + "loss": 1.4287, + "step": 2268 + }, + { + "epoch": 0.713037673989438, + "grad_norm": 0.6484375, + "learning_rate": 1.7321485242780072e-05, + "loss": 1.2783, + "step": 2270 + }, + { + "epoch": 0.7136659010149793, + "grad_norm": 0.6875, + "learning_rate": 1.7318946366232944e-05, + "loss": 1.3641, + "step": 2272 + }, + { + "epoch": 0.7142941280405206, + "grad_norm": 0.7109375, + "learning_rate": 1.7316407489685815e-05, + "loss": 1.3313, + "step": 2274 + }, + { + "epoch": 0.7149223550660619, + "grad_norm": 0.68359375, + "learning_rate": 1.7313868613138686e-05, + "loss": 1.3461, + "step": 2276 + }, + { + "epoch": 0.7155505820916034, + "grad_norm": 0.6953125, + "learning_rate": 1.731132973659156e-05, + "loss": 1.3159, + "step": 2278 + }, + { + "epoch": 0.7161788091171447, + "grad_norm": 0.765625, + "learning_rate": 1.7308790860044432e-05, + "loss": 1.2575, + "step": 2280 + }, + { + "epoch": 0.716807036142686, + "grad_norm": 0.65625, + "learning_rate": 1.7306251983497304e-05, + "loss": 1.3816, + "step": 2282 + }, + { + "epoch": 0.7174352631682274, + "grad_norm": 0.71875, + "learning_rate": 1.7303713106950175e-05, + "loss": 1.4548, + "step": 2284 + }, + { + "epoch": 0.7180634901937688, + "grad_norm": 0.83984375, + "learning_rate": 1.730117423040305e-05, + "loss": 1.2777, + "step": 2286 + }, + { + "epoch": 0.7186917172193101, + "grad_norm": 0.7421875, + "learning_rate": 1.7298635353855918e-05, + "loss": 1.3142, + "step": 2288 + }, + { + "epoch": 0.7193199442448515, + "grad_norm": 0.7890625, + "learning_rate": 1.7296096477308793e-05, + "loss": 1.2618, + "step": 2290 + }, + { + "epoch": 0.7199481712703928, + "grad_norm": 0.70703125, + "learning_rate": 1.7293557600761664e-05, + "loss": 1.3586, + "step": 2292 + }, + { + "epoch": 0.7205763982959342, + "grad_norm": 0.77734375, + "learning_rate": 1.7291018724214536e-05, + "loss": 1.2284, + "step": 2294 + }, + { + "epoch": 0.7212046253214756, + "grad_norm": 0.76953125, + "learning_rate": 1.7288479847667407e-05, + "loss": 1.3143, + "step": 2296 + }, + { + "epoch": 0.7218328523470169, + "grad_norm": 0.73828125, + "learning_rate": 1.728594097112028e-05, + "loss": 1.2988, + "step": 2298 + }, + { + "epoch": 0.7224610793725582, + "grad_norm": 0.78125, + "learning_rate": 1.7283402094573153e-05, + "loss": 1.3754, + "step": 2300 + }, + { + "epoch": 0.7230893063980997, + "grad_norm": 0.69140625, + "learning_rate": 1.7280863218026024e-05, + "loss": 1.3633, + "step": 2302 + }, + { + "epoch": 0.723717533423641, + "grad_norm": 0.71875, + "learning_rate": 1.7278324341478896e-05, + "loss": 1.4773, + "step": 2304 + }, + { + "epoch": 0.7243457604491823, + "grad_norm": 0.77734375, + "learning_rate": 1.727578546493177e-05, + "loss": 1.351, + "step": 2306 + }, + { + "epoch": 0.7249739874747236, + "grad_norm": 0.73828125, + "learning_rate": 1.7273246588384642e-05, + "loss": 1.3577, + "step": 2308 + }, + { + "epoch": 0.7256022145002651, + "grad_norm": 0.81640625, + "learning_rate": 1.7270707711837513e-05, + "loss": 1.2883, + "step": 2310 + }, + { + "epoch": 0.7262304415258064, + "grad_norm": 0.67578125, + "learning_rate": 1.7268168835290388e-05, + "loss": 1.4049, + "step": 2312 + }, + { + "epoch": 0.7268586685513477, + "grad_norm": 0.6796875, + "learning_rate": 1.7265629958743256e-05, + "loss": 1.3443, + "step": 2314 + }, + { + "epoch": 0.7274868955768891, + "grad_norm": 0.90234375, + "learning_rate": 1.726309108219613e-05, + "loss": 1.2131, + "step": 2316 + }, + { + "epoch": 0.7281151226024305, + "grad_norm": 0.71875, + "learning_rate": 1.7260552205649002e-05, + "loss": 1.353, + "step": 2318 + }, + { + "epoch": 0.7287433496279718, + "grad_norm": 0.73828125, + "learning_rate": 1.7258013329101874e-05, + "loss": 1.2911, + "step": 2320 + }, + { + "epoch": 0.7293715766535132, + "grad_norm": 0.90625, + "learning_rate": 1.7255474452554745e-05, + "loss": 1.3567, + "step": 2322 + }, + { + "epoch": 0.7299998036790545, + "grad_norm": 0.70703125, + "learning_rate": 1.725293557600762e-05, + "loss": 1.3589, + "step": 2324 + }, + { + "epoch": 0.7306280307045959, + "grad_norm": 0.87109375, + "learning_rate": 1.725039669946049e-05, + "loss": 1.3734, + "step": 2326 + }, + { + "epoch": 0.7312562577301372, + "grad_norm": 0.6484375, + "learning_rate": 1.7247857822913363e-05, + "loss": 1.3007, + "step": 2328 + }, + { + "epoch": 0.7318844847556786, + "grad_norm": 0.80859375, + "learning_rate": 1.7245318946366234e-05, + "loss": 1.3652, + "step": 2330 + }, + { + "epoch": 0.7325127117812199, + "grad_norm": 0.72265625, + "learning_rate": 1.724278006981911e-05, + "loss": 1.3929, + "step": 2332 + }, + { + "epoch": 0.7331409388067613, + "grad_norm": 0.6640625, + "learning_rate": 1.7240241193271977e-05, + "loss": 1.2893, + "step": 2334 + }, + { + "epoch": 0.7337691658323027, + "grad_norm": 0.77734375, + "learning_rate": 1.723770231672485e-05, + "loss": 1.4986, + "step": 2336 + }, + { + "epoch": 0.734397392857844, + "grad_norm": 0.6875, + "learning_rate": 1.7235163440177723e-05, + "loss": 1.3224, + "step": 2338 + }, + { + "epoch": 0.7350256198833853, + "grad_norm": 0.77734375, + "learning_rate": 1.7232624563630594e-05, + "loss": 1.422, + "step": 2340 + }, + { + "epoch": 0.7356538469089268, + "grad_norm": 0.703125, + "learning_rate": 1.7230085687083466e-05, + "loss": 1.4021, + "step": 2342 + }, + { + "epoch": 0.7362820739344681, + "grad_norm": 0.67578125, + "learning_rate": 1.722754681053634e-05, + "loss": 1.3948, + "step": 2344 + }, + { + "epoch": 0.7369103009600094, + "grad_norm": 0.73046875, + "learning_rate": 1.7225007933989212e-05, + "loss": 1.2958, + "step": 2346 + }, + { + "epoch": 0.7375385279855508, + "grad_norm": 0.734375, + "learning_rate": 1.7222469057442083e-05, + "loss": 1.2972, + "step": 2348 + }, + { + "epoch": 0.7381667550110921, + "grad_norm": 0.68359375, + "learning_rate": 1.7219930180894955e-05, + "loss": 1.3356, + "step": 2350 + }, + { + "epoch": 0.7387949820366335, + "grad_norm": 0.82421875, + "learning_rate": 1.721739130434783e-05, + "loss": 1.2247, + "step": 2352 + }, + { + "epoch": 0.7394232090621748, + "grad_norm": 0.70703125, + "learning_rate": 1.7214852427800697e-05, + "loss": 1.3243, + "step": 2354 + }, + { + "epoch": 0.7400514360877162, + "grad_norm": 0.7265625, + "learning_rate": 1.7212313551253572e-05, + "loss": 1.4064, + "step": 2356 + }, + { + "epoch": 0.7406796631132575, + "grad_norm": 0.77734375, + "learning_rate": 1.7209774674706443e-05, + "loss": 1.4806, + "step": 2358 + }, + { + "epoch": 0.7413078901387989, + "grad_norm": 0.85546875, + "learning_rate": 1.7207235798159315e-05, + "loss": 1.3769, + "step": 2360 + }, + { + "epoch": 0.7419361171643403, + "grad_norm": 0.71875, + "learning_rate": 1.7204696921612186e-05, + "loss": 1.2256, + "step": 2362 + }, + { + "epoch": 0.7425643441898816, + "grad_norm": 0.78125, + "learning_rate": 1.720215804506506e-05, + "loss": 1.389, + "step": 2364 + }, + { + "epoch": 0.7431925712154229, + "grad_norm": 0.6796875, + "learning_rate": 1.7199619168517932e-05, + "loss": 1.4362, + "step": 2366 + }, + { + "epoch": 0.7438207982409644, + "grad_norm": 0.8984375, + "learning_rate": 1.7197080291970804e-05, + "loss": 1.4191, + "step": 2368 + }, + { + "epoch": 0.7444490252665057, + "grad_norm": 0.7265625, + "learning_rate": 1.7194541415423675e-05, + "loss": 1.3115, + "step": 2370 + }, + { + "epoch": 0.745077252292047, + "grad_norm": 0.7578125, + "learning_rate": 1.719200253887655e-05, + "loss": 1.4019, + "step": 2372 + }, + { + "epoch": 0.7457054793175883, + "grad_norm": 0.734375, + "learning_rate": 1.718946366232942e-05, + "loss": 1.3587, + "step": 2374 + }, + { + "epoch": 0.7463337063431298, + "grad_norm": 0.87109375, + "learning_rate": 1.7186924785782293e-05, + "loss": 1.3749, + "step": 2376 + }, + { + "epoch": 0.7469619333686711, + "grad_norm": 0.6875, + "learning_rate": 1.7184385909235164e-05, + "loss": 1.3042, + "step": 2378 + }, + { + "epoch": 0.7475901603942124, + "grad_norm": 0.73828125, + "learning_rate": 1.7181847032688035e-05, + "loss": 1.2356, + "step": 2380 + }, + { + "epoch": 0.7482183874197538, + "grad_norm": 0.7734375, + "learning_rate": 1.7179308156140907e-05, + "loss": 1.2864, + "step": 2382 + }, + { + "epoch": 0.7488466144452952, + "grad_norm": 0.69921875, + "learning_rate": 1.717676927959378e-05, + "loss": 1.3995, + "step": 2384 + }, + { + "epoch": 0.7494748414708365, + "grad_norm": 0.78125, + "learning_rate": 1.7174230403046653e-05, + "loss": 1.2924, + "step": 2386 + }, + { + "epoch": 0.7501030684963779, + "grad_norm": 0.81640625, + "learning_rate": 1.7171691526499524e-05, + "loss": 1.2801, + "step": 2388 + }, + { + "epoch": 0.7507312955219192, + "grad_norm": 0.7890625, + "learning_rate": 1.7169152649952396e-05, + "loss": 1.2726, + "step": 2390 + }, + { + "epoch": 0.7513595225474606, + "grad_norm": 0.734375, + "learning_rate": 1.716661377340527e-05, + "loss": 1.35, + "step": 2392 + }, + { + "epoch": 0.751987749573002, + "grad_norm": 0.796875, + "learning_rate": 1.7164074896858142e-05, + "loss": 1.2783, + "step": 2394 + }, + { + "epoch": 0.7526159765985433, + "grad_norm": 0.78515625, + "learning_rate": 1.7161536020311013e-05, + "loss": 1.3665, + "step": 2396 + }, + { + "epoch": 0.7532442036240846, + "grad_norm": 0.98046875, + "learning_rate": 1.7158997143763888e-05, + "loss": 1.3679, + "step": 2398 + }, + { + "epoch": 0.7538724306496261, + "grad_norm": 0.78515625, + "learning_rate": 1.715645826721676e-05, + "loss": 1.3874, + "step": 2400 + }, + { + "epoch": 0.7545006576751674, + "grad_norm": 0.75390625, + "learning_rate": 1.715391939066963e-05, + "loss": 1.2631, + "step": 2402 + }, + { + "epoch": 0.7551288847007087, + "grad_norm": 0.796875, + "learning_rate": 1.7151380514122502e-05, + "loss": 1.2159, + "step": 2404 + }, + { + "epoch": 0.75575711172625, + "grad_norm": 0.74609375, + "learning_rate": 1.7148841637575374e-05, + "loss": 1.3067, + "step": 2406 + }, + { + "epoch": 0.7563853387517915, + "grad_norm": 0.7109375, + "learning_rate": 1.7146302761028245e-05, + "loss": 1.3503, + "step": 2408 + }, + { + "epoch": 0.7570135657773328, + "grad_norm": 0.7421875, + "learning_rate": 1.714376388448112e-05, + "loss": 1.369, + "step": 2410 + }, + { + "epoch": 0.7576417928028741, + "grad_norm": 0.88671875, + "learning_rate": 1.714122500793399e-05, + "loss": 1.2634, + "step": 2412 + }, + { + "epoch": 0.7582700198284155, + "grad_norm": 0.75390625, + "learning_rate": 1.7138686131386862e-05, + "loss": 1.2631, + "step": 2414 + }, + { + "epoch": 0.7588982468539569, + "grad_norm": 0.72265625, + "learning_rate": 1.7136147254839734e-05, + "loss": 1.33, + "step": 2416 + }, + { + "epoch": 0.7595264738794982, + "grad_norm": 0.72265625, + "learning_rate": 1.713360837829261e-05, + "loss": 1.3229, + "step": 2418 + }, + { + "epoch": 0.7601547009050396, + "grad_norm": 1.1640625, + "learning_rate": 1.713106950174548e-05, + "loss": 1.2857, + "step": 2420 + }, + { + "epoch": 0.7607829279305809, + "grad_norm": 0.875, + "learning_rate": 1.712853062519835e-05, + "loss": 1.3812, + "step": 2422 + }, + { + "epoch": 0.7614111549561222, + "grad_norm": 0.6953125, + "learning_rate": 1.7125991748651223e-05, + "loss": 1.3809, + "step": 2424 + }, + { + "epoch": 0.7620393819816637, + "grad_norm": 0.7890625, + "learning_rate": 1.7123452872104098e-05, + "loss": 1.3366, + "step": 2426 + }, + { + "epoch": 0.762667609007205, + "grad_norm": 0.77734375, + "learning_rate": 1.7120913995556966e-05, + "loss": 1.3628, + "step": 2428 + }, + { + "epoch": 0.7632958360327463, + "grad_norm": 0.8046875, + "learning_rate": 1.711837511900984e-05, + "loss": 1.3394, + "step": 2430 + }, + { + "epoch": 0.7639240630582876, + "grad_norm": 0.7265625, + "learning_rate": 1.7115836242462712e-05, + "loss": 1.4378, + "step": 2432 + }, + { + "epoch": 0.7645522900838291, + "grad_norm": 0.7890625, + "learning_rate": 1.7113297365915583e-05, + "loss": 1.1978, + "step": 2434 + }, + { + "epoch": 0.7651805171093704, + "grad_norm": 0.75, + "learning_rate": 1.7110758489368454e-05, + "loss": 1.2939, + "step": 2436 + }, + { + "epoch": 0.7658087441349117, + "grad_norm": 0.7109375, + "learning_rate": 1.710821961282133e-05, + "loss": 1.3248, + "step": 2438 + }, + { + "epoch": 0.7664369711604531, + "grad_norm": 0.7578125, + "learning_rate": 1.71056807362742e-05, + "loss": 1.2087, + "step": 2440 + }, + { + "epoch": 0.7670651981859945, + "grad_norm": 0.81640625, + "learning_rate": 1.7103141859727072e-05, + "loss": 1.1633, + "step": 2442 + }, + { + "epoch": 0.7676934252115358, + "grad_norm": 1.078125, + "learning_rate": 1.7100602983179943e-05, + "loss": 1.2432, + "step": 2444 + }, + { + "epoch": 0.7683216522370772, + "grad_norm": 0.75390625, + "learning_rate": 1.7098064106632818e-05, + "loss": 1.3272, + "step": 2446 + }, + { + "epoch": 0.7689498792626185, + "grad_norm": 0.78125, + "learning_rate": 1.7095525230085686e-05, + "loss": 1.2589, + "step": 2448 + }, + { + "epoch": 0.7695781062881599, + "grad_norm": 0.71484375, + "learning_rate": 1.709298635353856e-05, + "loss": 1.375, + "step": 2450 + }, + { + "epoch": 0.7702063333137013, + "grad_norm": 0.72265625, + "learning_rate": 1.7090447476991432e-05, + "loss": 1.2817, + "step": 2452 + }, + { + "epoch": 0.7708345603392426, + "grad_norm": 0.75390625, + "learning_rate": 1.7087908600444304e-05, + "loss": 1.2879, + "step": 2454 + }, + { + "epoch": 0.7714627873647839, + "grad_norm": 1.234375, + "learning_rate": 1.7085369723897175e-05, + "loss": 1.2573, + "step": 2456 + }, + { + "epoch": 0.7720910143903253, + "grad_norm": 0.7890625, + "learning_rate": 1.708283084735005e-05, + "loss": 1.343, + "step": 2458 + }, + { + "epoch": 0.7727192414158667, + "grad_norm": 0.7109375, + "learning_rate": 1.708029197080292e-05, + "loss": 1.357, + "step": 2460 + }, + { + "epoch": 0.773347468441408, + "grad_norm": 0.74609375, + "learning_rate": 1.7077753094255793e-05, + "loss": 1.3493, + "step": 2462 + }, + { + "epoch": 0.7739756954669493, + "grad_norm": 0.78515625, + "learning_rate": 1.7075214217708664e-05, + "loss": 1.2568, + "step": 2464 + }, + { + "epoch": 0.7746039224924908, + "grad_norm": 0.73828125, + "learning_rate": 1.707267534116154e-05, + "loss": 1.3476, + "step": 2466 + }, + { + "epoch": 0.7752321495180321, + "grad_norm": 0.76171875, + "learning_rate": 1.707013646461441e-05, + "loss": 1.2797, + "step": 2468 + }, + { + "epoch": 0.7758603765435734, + "grad_norm": 0.75390625, + "learning_rate": 1.706759758806728e-05, + "loss": 1.3368, + "step": 2470 + }, + { + "epoch": 0.7764886035691148, + "grad_norm": 0.671875, + "learning_rate": 1.7065058711520153e-05, + "loss": 1.2807, + "step": 2472 + }, + { + "epoch": 0.7771168305946562, + "grad_norm": 0.71484375, + "learning_rate": 1.7062519834973024e-05, + "loss": 1.5012, + "step": 2474 + }, + { + "epoch": 0.7777450576201975, + "grad_norm": 0.734375, + "learning_rate": 1.7059980958425896e-05, + "loss": 1.3204, + "step": 2476 + }, + { + "epoch": 0.7783732846457388, + "grad_norm": 0.8046875, + "learning_rate": 1.705744208187877e-05, + "loss": 1.3475, + "step": 2478 + }, + { + "epoch": 0.7790015116712802, + "grad_norm": 0.796875, + "learning_rate": 1.7054903205331642e-05, + "loss": 1.2051, + "step": 2480 + }, + { + "epoch": 0.7796297386968216, + "grad_norm": 0.68359375, + "learning_rate": 1.7052364328784513e-05, + "loss": 1.3502, + "step": 2482 + }, + { + "epoch": 0.780257965722363, + "grad_norm": 0.9140625, + "learning_rate": 1.7049825452237388e-05, + "loss": 1.2337, + "step": 2484 + }, + { + "epoch": 0.7808861927479043, + "grad_norm": 0.77734375, + "learning_rate": 1.704728657569026e-05, + "loss": 1.3524, + "step": 2486 + }, + { + "epoch": 0.7815144197734456, + "grad_norm": 0.82421875, + "learning_rate": 1.704474769914313e-05, + "loss": 1.3843, + "step": 2488 + }, + { + "epoch": 0.7821426467989869, + "grad_norm": 0.6953125, + "learning_rate": 1.7042208822596002e-05, + "loss": 1.3905, + "step": 2490 + }, + { + "epoch": 0.7827708738245284, + "grad_norm": 0.69921875, + "learning_rate": 1.7039669946048877e-05, + "loss": 1.3168, + "step": 2492 + }, + { + "epoch": 0.7833991008500697, + "grad_norm": 0.79296875, + "learning_rate": 1.7037131069501748e-05, + "loss": 1.233, + "step": 2494 + }, + { + "epoch": 0.784027327875611, + "grad_norm": 0.77734375, + "learning_rate": 1.703459219295462e-05, + "loss": 1.3278, + "step": 2496 + }, + { + "epoch": 0.7846555549011524, + "grad_norm": 0.6953125, + "learning_rate": 1.703205331640749e-05, + "loss": 1.2751, + "step": 2498 + }, + { + "epoch": 0.7852837819266938, + "grad_norm": 0.796875, + "learning_rate": 1.7029514439860362e-05, + "loss": 1.3463, + "step": 2500 + }, + { + "epoch": 0.7859120089522351, + "grad_norm": 0.80859375, + "learning_rate": 1.7026975563313234e-05, + "loss": 1.2921, + "step": 2502 + }, + { + "epoch": 0.7865402359777764, + "grad_norm": 0.71484375, + "learning_rate": 1.702443668676611e-05, + "loss": 1.1402, + "step": 2504 + }, + { + "epoch": 0.7871684630033178, + "grad_norm": 1.125, + "learning_rate": 1.702189781021898e-05, + "loss": 1.2382, + "step": 2506 + }, + { + "epoch": 0.7877966900288592, + "grad_norm": 0.63671875, + "learning_rate": 1.701935893367185e-05, + "loss": 1.3848, + "step": 2508 + }, + { + "epoch": 0.7884249170544005, + "grad_norm": 0.7578125, + "learning_rate": 1.7016820057124723e-05, + "loss": 1.2577, + "step": 2510 + }, + { + "epoch": 0.7890531440799419, + "grad_norm": 0.74609375, + "learning_rate": 1.7014281180577597e-05, + "loss": 1.4976, + "step": 2512 + }, + { + "epoch": 0.7896813711054832, + "grad_norm": 0.65234375, + "learning_rate": 1.701174230403047e-05, + "loss": 1.3051, + "step": 2514 + }, + { + "epoch": 0.7903095981310246, + "grad_norm": 0.75, + "learning_rate": 1.700920342748334e-05, + "loss": 1.3637, + "step": 2516 + }, + { + "epoch": 0.790937825156566, + "grad_norm": 0.828125, + "learning_rate": 1.700666455093621e-05, + "loss": 1.2335, + "step": 2518 + }, + { + "epoch": 0.7915660521821073, + "grad_norm": 0.73828125, + "learning_rate": 1.7004125674389086e-05, + "loss": 1.2534, + "step": 2520 + }, + { + "epoch": 0.7921942792076486, + "grad_norm": 0.78515625, + "learning_rate": 1.7001586797841954e-05, + "loss": 1.4272, + "step": 2522 + }, + { + "epoch": 0.7928225062331901, + "grad_norm": 0.66796875, + "learning_rate": 1.699904792129483e-05, + "loss": 1.2296, + "step": 2524 + }, + { + "epoch": 0.7934507332587314, + "grad_norm": 0.765625, + "learning_rate": 1.69965090447477e-05, + "loss": 1.3799, + "step": 2526 + }, + { + "epoch": 0.7940789602842727, + "grad_norm": 0.625, + "learning_rate": 1.6993970168200572e-05, + "loss": 1.4241, + "step": 2528 + }, + { + "epoch": 0.794707187309814, + "grad_norm": 0.8125, + "learning_rate": 1.6991431291653443e-05, + "loss": 1.2411, + "step": 2530 + }, + { + "epoch": 0.7953354143353555, + "grad_norm": 1.078125, + "learning_rate": 1.6988892415106318e-05, + "loss": 1.3962, + "step": 2532 + }, + { + "epoch": 0.7959636413608968, + "grad_norm": 0.8828125, + "learning_rate": 1.698635353855919e-05, + "loss": 1.3154, + "step": 2534 + }, + { + "epoch": 0.7965918683864381, + "grad_norm": 0.62890625, + "learning_rate": 1.698381466201206e-05, + "loss": 1.3236, + "step": 2536 + }, + { + "epoch": 0.7972200954119795, + "grad_norm": 0.80859375, + "learning_rate": 1.6981275785464932e-05, + "loss": 1.2605, + "step": 2538 + }, + { + "epoch": 0.7978483224375209, + "grad_norm": 0.7578125, + "learning_rate": 1.6978736908917807e-05, + "loss": 1.2216, + "step": 2540 + }, + { + "epoch": 0.7984765494630622, + "grad_norm": 0.6875, + "learning_rate": 1.6976198032370675e-05, + "loss": 1.3394, + "step": 2542 + }, + { + "epoch": 0.7991047764886036, + "grad_norm": 0.73828125, + "learning_rate": 1.697365915582355e-05, + "loss": 1.331, + "step": 2544 + }, + { + "epoch": 0.7997330035141449, + "grad_norm": 0.72265625, + "learning_rate": 1.697112027927642e-05, + "loss": 1.3703, + "step": 2546 + }, + { + "epoch": 0.8003612305396863, + "grad_norm": 0.828125, + "learning_rate": 1.6968581402729293e-05, + "loss": 1.3128, + "step": 2548 + }, + { + "epoch": 0.8009894575652277, + "grad_norm": 0.8125, + "learning_rate": 1.6966042526182164e-05, + "loss": 1.278, + "step": 2550 + }, + { + "epoch": 0.801617684590769, + "grad_norm": 0.65625, + "learning_rate": 1.696350364963504e-05, + "loss": 1.3876, + "step": 2552 + }, + { + "epoch": 0.8022459116163103, + "grad_norm": 0.71484375, + "learning_rate": 1.696096477308791e-05, + "loss": 1.2858, + "step": 2554 + }, + { + "epoch": 0.8028741386418518, + "grad_norm": 0.6953125, + "learning_rate": 1.695842589654078e-05, + "loss": 1.412, + "step": 2556 + }, + { + "epoch": 0.8035023656673931, + "grad_norm": 0.7109375, + "learning_rate": 1.6955887019993653e-05, + "loss": 1.4499, + "step": 2558 + }, + { + "epoch": 0.8041305926929344, + "grad_norm": 0.9140625, + "learning_rate": 1.6953348143446528e-05, + "loss": 1.291, + "step": 2560 + }, + { + "epoch": 0.8047588197184757, + "grad_norm": 0.90625, + "learning_rate": 1.69508092668994e-05, + "loss": 1.4154, + "step": 2562 + }, + { + "epoch": 0.8053870467440171, + "grad_norm": 0.82421875, + "learning_rate": 1.694827039035227e-05, + "loss": 1.4474, + "step": 2564 + }, + { + "epoch": 0.8060152737695585, + "grad_norm": 0.79296875, + "learning_rate": 1.6945731513805145e-05, + "loss": 1.3263, + "step": 2566 + }, + { + "epoch": 0.8066435007950998, + "grad_norm": 0.84375, + "learning_rate": 1.6943192637258013e-05, + "loss": 1.3238, + "step": 2568 + }, + { + "epoch": 0.8072717278206412, + "grad_norm": 0.83984375, + "learning_rate": 1.6940653760710888e-05, + "loss": 1.4225, + "step": 2570 + }, + { + "epoch": 0.8078999548461825, + "grad_norm": 0.70703125, + "learning_rate": 1.693811488416376e-05, + "loss": 1.2038, + "step": 2572 + }, + { + "epoch": 0.8085281818717239, + "grad_norm": 0.8359375, + "learning_rate": 1.693557600761663e-05, + "loss": 1.1913, + "step": 2574 + }, + { + "epoch": 0.8091564088972653, + "grad_norm": 0.76953125, + "learning_rate": 1.6933037131069502e-05, + "loss": 1.3431, + "step": 2576 + }, + { + "epoch": 0.8097846359228066, + "grad_norm": 0.87890625, + "learning_rate": 1.6930498254522377e-05, + "loss": 1.3336, + "step": 2578 + }, + { + "epoch": 0.8104128629483479, + "grad_norm": 0.87890625, + "learning_rate": 1.6927959377975248e-05, + "loss": 1.2205, + "step": 2580 + }, + { + "epoch": 0.8110410899738894, + "grad_norm": 0.69921875, + "learning_rate": 1.692542050142812e-05, + "loss": 1.3004, + "step": 2582 + }, + { + "epoch": 0.8116693169994307, + "grad_norm": 0.75390625, + "learning_rate": 1.692288162488099e-05, + "loss": 1.3125, + "step": 2584 + }, + { + "epoch": 0.812297544024972, + "grad_norm": 0.6953125, + "learning_rate": 1.6920342748333866e-05, + "loss": 1.4572, + "step": 2586 + }, + { + "epoch": 0.8129257710505133, + "grad_norm": 0.74609375, + "learning_rate": 1.6917803871786737e-05, + "loss": 1.2809, + "step": 2588 + }, + { + "epoch": 0.8135539980760548, + "grad_norm": 0.66796875, + "learning_rate": 1.691526499523961e-05, + "loss": 1.2979, + "step": 2590 + }, + { + "epoch": 0.8141822251015961, + "grad_norm": 0.890625, + "learning_rate": 1.691272611869248e-05, + "loss": 1.3751, + "step": 2592 + }, + { + "epoch": 0.8148104521271374, + "grad_norm": 0.8125, + "learning_rate": 1.691018724214535e-05, + "loss": 1.3556, + "step": 2594 + }, + { + "epoch": 0.8154386791526788, + "grad_norm": 0.734375, + "learning_rate": 1.6907648365598223e-05, + "loss": 1.2648, + "step": 2596 + }, + { + "epoch": 0.8160669061782202, + "grad_norm": 0.77734375, + "learning_rate": 1.6905109489051097e-05, + "loss": 1.3499, + "step": 2598 + }, + { + "epoch": 0.8166951332037615, + "grad_norm": 0.8359375, + "learning_rate": 1.690257061250397e-05, + "loss": 1.3424, + "step": 2600 + }, + { + "epoch": 0.8173233602293029, + "grad_norm": 0.72265625, + "learning_rate": 1.690003173595684e-05, + "loss": 1.3746, + "step": 2602 + }, + { + "epoch": 0.8179515872548442, + "grad_norm": 0.78515625, + "learning_rate": 1.689749285940971e-05, + "loss": 1.3152, + "step": 2604 + }, + { + "epoch": 0.8185798142803856, + "grad_norm": 0.7109375, + "learning_rate": 1.6894953982862586e-05, + "loss": 1.3755, + "step": 2606 + }, + { + "epoch": 0.819208041305927, + "grad_norm": 0.84765625, + "learning_rate": 1.6892415106315458e-05, + "loss": 1.2247, + "step": 2608 + }, + { + "epoch": 0.8198362683314683, + "grad_norm": 0.69921875, + "learning_rate": 1.688987622976833e-05, + "loss": 1.4328, + "step": 2610 + }, + { + "epoch": 0.8204644953570096, + "grad_norm": 0.6796875, + "learning_rate": 1.68873373532212e-05, + "loss": 1.2965, + "step": 2612 + }, + { + "epoch": 0.821092722382551, + "grad_norm": 0.91015625, + "learning_rate": 1.6884798476674075e-05, + "loss": 1.2175, + "step": 2614 + }, + { + "epoch": 0.8217209494080924, + "grad_norm": 0.8828125, + "learning_rate": 1.6882259600126943e-05, + "loss": 1.1868, + "step": 2616 + }, + { + "epoch": 0.8223491764336337, + "grad_norm": 0.9296875, + "learning_rate": 1.6879720723579818e-05, + "loss": 1.331, + "step": 2618 + }, + { + "epoch": 0.822977403459175, + "grad_norm": 0.69140625, + "learning_rate": 1.687718184703269e-05, + "loss": 1.3342, + "step": 2620 + }, + { + "epoch": 0.8236056304847165, + "grad_norm": 0.68359375, + "learning_rate": 1.687464297048556e-05, + "loss": 1.3036, + "step": 2622 + }, + { + "epoch": 0.8242338575102578, + "grad_norm": 0.75390625, + "learning_rate": 1.6872104093938432e-05, + "loss": 1.2481, + "step": 2624 + }, + { + "epoch": 0.8248620845357991, + "grad_norm": 0.703125, + "learning_rate": 1.6869565217391307e-05, + "loss": 1.3175, + "step": 2626 + }, + { + "epoch": 0.8254903115613405, + "grad_norm": 0.97265625, + "learning_rate": 1.686702634084418e-05, + "loss": 1.3181, + "step": 2628 + }, + { + "epoch": 0.8261185385868819, + "grad_norm": 0.7421875, + "learning_rate": 1.686448746429705e-05, + "loss": 1.3106, + "step": 2630 + }, + { + "epoch": 0.8267467656124232, + "grad_norm": 0.82421875, + "learning_rate": 1.686194858774992e-05, + "loss": 1.3216, + "step": 2632 + }, + { + "epoch": 0.8273749926379645, + "grad_norm": 0.85546875, + "learning_rate": 1.6859409711202796e-05, + "loss": 1.3221, + "step": 2634 + }, + { + "epoch": 0.8280032196635059, + "grad_norm": 0.7734375, + "learning_rate": 1.6856870834655664e-05, + "loss": 1.3614, + "step": 2636 + }, + { + "epoch": 0.8286314466890472, + "grad_norm": 0.7890625, + "learning_rate": 1.685433195810854e-05, + "loss": 1.3956, + "step": 2638 + }, + { + "epoch": 0.8292596737145886, + "grad_norm": 0.6875, + "learning_rate": 1.685179308156141e-05, + "loss": 1.1662, + "step": 2640 + }, + { + "epoch": 0.82988790074013, + "grad_norm": 0.76953125, + "learning_rate": 1.684925420501428e-05, + "loss": 1.2505, + "step": 2642 + }, + { + "epoch": 0.8305161277656713, + "grad_norm": 0.86328125, + "learning_rate": 1.6846715328467153e-05, + "loss": 1.251, + "step": 2644 + }, + { + "epoch": 0.8311443547912126, + "grad_norm": 0.78515625, + "learning_rate": 1.6844176451920028e-05, + "loss": 1.398, + "step": 2646 + }, + { + "epoch": 0.8317725818167541, + "grad_norm": 0.79296875, + "learning_rate": 1.68416375753729e-05, + "loss": 1.2618, + "step": 2648 + }, + { + "epoch": 0.8324008088422954, + "grad_norm": 0.66796875, + "learning_rate": 1.683909869882577e-05, + "loss": 1.3516, + "step": 2650 + }, + { + "epoch": 0.8330290358678367, + "grad_norm": 0.74609375, + "learning_rate": 1.6836559822278645e-05, + "loss": 1.4359, + "step": 2652 + }, + { + "epoch": 0.833657262893378, + "grad_norm": 0.703125, + "learning_rate": 1.6834020945731516e-05, + "loss": 1.3158, + "step": 2654 + }, + { + "epoch": 0.8342854899189195, + "grad_norm": 0.81640625, + "learning_rate": 1.6831482069184388e-05, + "loss": 1.2849, + "step": 2656 + }, + { + "epoch": 0.8349137169444608, + "grad_norm": 0.734375, + "learning_rate": 1.682894319263726e-05, + "loss": 1.4921, + "step": 2658 + }, + { + "epoch": 0.8355419439700021, + "grad_norm": 0.94921875, + "learning_rate": 1.6826404316090134e-05, + "loss": 1.2774, + "step": 2660 + }, + { + "epoch": 0.8361701709955435, + "grad_norm": 0.78125, + "learning_rate": 1.6823865439543002e-05, + "loss": 1.3282, + "step": 2662 + }, + { + "epoch": 0.8367983980210849, + "grad_norm": 0.75, + "learning_rate": 1.6821326562995877e-05, + "loss": 1.2604, + "step": 2664 + }, + { + "epoch": 0.8374266250466262, + "grad_norm": 0.75390625, + "learning_rate": 1.6818787686448748e-05, + "loss": 1.2322, + "step": 2666 + }, + { + "epoch": 0.8380548520721676, + "grad_norm": 0.75, + "learning_rate": 1.681624880990162e-05, + "loss": 1.3847, + "step": 2668 + }, + { + "epoch": 0.8386830790977089, + "grad_norm": 1.0, + "learning_rate": 1.681370993335449e-05, + "loss": 1.2521, + "step": 2670 + }, + { + "epoch": 0.8393113061232503, + "grad_norm": 0.73046875, + "learning_rate": 1.6811171056807366e-05, + "loss": 1.4187, + "step": 2672 + }, + { + "epoch": 0.8399395331487917, + "grad_norm": 0.7109375, + "learning_rate": 1.6808632180260237e-05, + "loss": 1.195, + "step": 2674 + }, + { + "epoch": 0.840567760174333, + "grad_norm": 1.015625, + "learning_rate": 1.680609330371311e-05, + "loss": 1.3454, + "step": 2676 + }, + { + "epoch": 0.8411959871998743, + "grad_norm": 0.78515625, + "learning_rate": 1.680355442716598e-05, + "loss": 1.453, + "step": 2678 + }, + { + "epoch": 0.8418242142254158, + "grad_norm": 0.68359375, + "learning_rate": 1.6801015550618855e-05, + "loss": 1.4218, + "step": 2680 + }, + { + "epoch": 0.8424524412509571, + "grad_norm": 0.85546875, + "learning_rate": 1.6798476674071723e-05, + "loss": 1.4194, + "step": 2682 + }, + { + "epoch": 0.8430806682764984, + "grad_norm": 0.80859375, + "learning_rate": 1.6795937797524597e-05, + "loss": 1.3225, + "step": 2684 + }, + { + "epoch": 0.8437088953020397, + "grad_norm": 0.7578125, + "learning_rate": 1.679339892097747e-05, + "loss": 1.205, + "step": 2686 + }, + { + "epoch": 0.8443371223275812, + "grad_norm": 1.046875, + "learning_rate": 1.679086004443034e-05, + "loss": 1.425, + "step": 2688 + }, + { + "epoch": 0.8449653493531225, + "grad_norm": 0.6875, + "learning_rate": 1.678832116788321e-05, + "loss": 1.3743, + "step": 2690 + }, + { + "epoch": 0.8455935763786638, + "grad_norm": 0.83203125, + "learning_rate": 1.6785782291336086e-05, + "loss": 1.2462, + "step": 2692 + }, + { + "epoch": 0.8462218034042052, + "grad_norm": 0.671875, + "learning_rate": 1.6783243414788958e-05, + "loss": 1.3989, + "step": 2694 + }, + { + "epoch": 0.8468500304297466, + "grad_norm": 0.76953125, + "learning_rate": 1.678070453824183e-05, + "loss": 1.4101, + "step": 2696 + }, + { + "epoch": 0.8474782574552879, + "grad_norm": 0.71484375, + "learning_rate": 1.67781656616947e-05, + "loss": 1.2639, + "step": 2698 + }, + { + "epoch": 0.8481064844808293, + "grad_norm": 0.79296875, + "learning_rate": 1.6775626785147575e-05, + "loss": 1.3388, + "step": 2700 + }, + { + "epoch": 0.8487347115063706, + "grad_norm": 0.78515625, + "learning_rate": 1.6773087908600447e-05, + "loss": 1.363, + "step": 2702 + }, + { + "epoch": 0.8493629385319119, + "grad_norm": 0.828125, + "learning_rate": 1.6770549032053318e-05, + "loss": 1.2831, + "step": 2704 + }, + { + "epoch": 0.8499911655574534, + "grad_norm": 0.7109375, + "learning_rate": 1.676801015550619e-05, + "loss": 1.2638, + "step": 2706 + }, + { + "epoch": 0.8506193925829947, + "grad_norm": 0.6875, + "learning_rate": 1.676547127895906e-05, + "loss": 1.3733, + "step": 2708 + }, + { + "epoch": 0.851247619608536, + "grad_norm": 0.6796875, + "learning_rate": 1.6762932402411932e-05, + "loss": 1.3726, + "step": 2710 + }, + { + "epoch": 0.8518758466340773, + "grad_norm": 0.73828125, + "learning_rate": 1.6760393525864807e-05, + "loss": 1.3406, + "step": 2712 + }, + { + "epoch": 0.8525040736596188, + "grad_norm": 0.69921875, + "learning_rate": 1.6757854649317678e-05, + "loss": 1.4331, + "step": 2714 + }, + { + "epoch": 0.8531323006851601, + "grad_norm": 0.7890625, + "learning_rate": 1.675531577277055e-05, + "loss": 1.302, + "step": 2716 + }, + { + "epoch": 0.8537605277107014, + "grad_norm": 0.79296875, + "learning_rate": 1.675277689622342e-05, + "loss": 1.3428, + "step": 2718 + }, + { + "epoch": 0.8543887547362428, + "grad_norm": 0.765625, + "learning_rate": 1.6750238019676296e-05, + "loss": 1.2827, + "step": 2720 + }, + { + "epoch": 0.8550169817617842, + "grad_norm": 0.67578125, + "learning_rate": 1.6747699143129167e-05, + "loss": 1.2744, + "step": 2722 + }, + { + "epoch": 0.8556452087873255, + "grad_norm": 0.75, + "learning_rate": 1.674516026658204e-05, + "loss": 1.2999, + "step": 2724 + }, + { + "epoch": 0.8562734358128669, + "grad_norm": 0.9765625, + "learning_rate": 1.674262139003491e-05, + "loss": 1.2288, + "step": 2726 + }, + { + "epoch": 0.8569016628384082, + "grad_norm": 0.7109375, + "learning_rate": 1.6740082513487785e-05, + "loss": 1.3101, + "step": 2728 + }, + { + "epoch": 0.8575298898639496, + "grad_norm": 0.71484375, + "learning_rate": 1.6737543636940653e-05, + "loss": 1.309, + "step": 2730 + }, + { + "epoch": 0.858158116889491, + "grad_norm": 0.69921875, + "learning_rate": 1.6735004760393527e-05, + "loss": 1.3683, + "step": 2732 + }, + { + "epoch": 0.8587863439150323, + "grad_norm": 1.015625, + "learning_rate": 1.67324658838464e-05, + "loss": 1.2708, + "step": 2734 + }, + { + "epoch": 0.8594145709405736, + "grad_norm": 0.7578125, + "learning_rate": 1.672992700729927e-05, + "loss": 1.5443, + "step": 2736 + }, + { + "epoch": 0.860042797966115, + "grad_norm": 0.73046875, + "learning_rate": 1.6727388130752145e-05, + "loss": 1.3305, + "step": 2738 + }, + { + "epoch": 0.8606710249916564, + "grad_norm": 0.86328125, + "learning_rate": 1.6724849254205016e-05, + "loss": 1.3512, + "step": 2740 + }, + { + "epoch": 0.8612992520171977, + "grad_norm": 0.73828125, + "learning_rate": 1.6722310377657888e-05, + "loss": 1.3854, + "step": 2742 + }, + { + "epoch": 0.861927479042739, + "grad_norm": 0.75390625, + "learning_rate": 1.671977150111076e-05, + "loss": 1.2901, + "step": 2744 + }, + { + "epoch": 0.8625557060682805, + "grad_norm": 0.68359375, + "learning_rate": 1.6717232624563634e-05, + "loss": 1.3502, + "step": 2746 + }, + { + "epoch": 0.8631839330938218, + "grad_norm": 0.7578125, + "learning_rate": 1.6714693748016505e-05, + "loss": 1.1293, + "step": 2748 + }, + { + "epoch": 0.8638121601193631, + "grad_norm": 0.74609375, + "learning_rate": 1.6712154871469377e-05, + "loss": 1.3325, + "step": 2750 + }, + { + "epoch": 0.8644403871449045, + "grad_norm": 0.7890625, + "learning_rate": 1.6709615994922248e-05, + "loss": 1.4138, + "step": 2752 + }, + { + "epoch": 0.8650686141704459, + "grad_norm": 0.69140625, + "learning_rate": 1.6707077118375123e-05, + "loss": 1.2818, + "step": 2754 + }, + { + "epoch": 0.8656968411959872, + "grad_norm": 0.73046875, + "learning_rate": 1.670453824182799e-05, + "loss": 1.2926, + "step": 2756 + }, + { + "epoch": 0.8663250682215285, + "grad_norm": 0.6953125, + "learning_rate": 1.6701999365280866e-05, + "loss": 1.3686, + "step": 2758 + }, + { + "epoch": 0.8669532952470699, + "grad_norm": 0.8359375, + "learning_rate": 1.6699460488733737e-05, + "loss": 1.2924, + "step": 2760 + }, + { + "epoch": 0.8675815222726113, + "grad_norm": 0.78515625, + "learning_rate": 1.669692161218661e-05, + "loss": 1.4022, + "step": 2762 + }, + { + "epoch": 0.8682097492981526, + "grad_norm": 0.8359375, + "learning_rate": 1.669438273563948e-05, + "loss": 1.429, + "step": 2764 + }, + { + "epoch": 0.868837976323694, + "grad_norm": 0.7890625, + "learning_rate": 1.6691843859092355e-05, + "loss": 1.2911, + "step": 2766 + }, + { + "epoch": 0.8694662033492353, + "grad_norm": 0.73046875, + "learning_rate": 1.6689304982545226e-05, + "loss": 1.4, + "step": 2768 + }, + { + "epoch": 0.8700944303747767, + "grad_norm": 0.88671875, + "learning_rate": 1.6686766105998097e-05, + "loss": 1.3409, + "step": 2770 + }, + { + "epoch": 0.8707226574003181, + "grad_norm": 1.0390625, + "learning_rate": 1.668422722945097e-05, + "loss": 1.2781, + "step": 2772 + }, + { + "epoch": 0.8713508844258594, + "grad_norm": 0.8359375, + "learning_rate": 1.6681688352903843e-05, + "loss": 1.3083, + "step": 2774 + }, + { + "epoch": 0.8719791114514007, + "grad_norm": 0.73046875, + "learning_rate": 1.667914947635671e-05, + "loss": 1.2491, + "step": 2776 + }, + { + "epoch": 0.872607338476942, + "grad_norm": 0.67578125, + "learning_rate": 1.6676610599809586e-05, + "loss": 1.3156, + "step": 2778 + }, + { + "epoch": 0.8732355655024835, + "grad_norm": 0.8515625, + "learning_rate": 1.6674071723262458e-05, + "loss": 1.2403, + "step": 2780 + }, + { + "epoch": 0.8738637925280248, + "grad_norm": 0.74609375, + "learning_rate": 1.667153284671533e-05, + "loss": 1.4226, + "step": 2782 + }, + { + "epoch": 0.8744920195535661, + "grad_norm": 0.84765625, + "learning_rate": 1.66689939701682e-05, + "loss": 1.2981, + "step": 2784 + }, + { + "epoch": 0.8751202465791075, + "grad_norm": 0.6953125, + "learning_rate": 1.6666455093621075e-05, + "loss": 1.256, + "step": 2786 + }, + { + "epoch": 0.8757484736046489, + "grad_norm": 0.734375, + "learning_rate": 1.6663916217073946e-05, + "loss": 1.255, + "step": 2788 + }, + { + "epoch": 0.8763767006301902, + "grad_norm": 0.7265625, + "learning_rate": 1.6661377340526818e-05, + "loss": 1.2185, + "step": 2790 + }, + { + "epoch": 0.8770049276557316, + "grad_norm": 0.6640625, + "learning_rate": 1.665883846397969e-05, + "loss": 1.4315, + "step": 2792 + }, + { + "epoch": 0.8776331546812729, + "grad_norm": 0.703125, + "learning_rate": 1.6656299587432564e-05, + "loss": 1.4531, + "step": 2794 + }, + { + "epoch": 0.8782613817068143, + "grad_norm": 0.8828125, + "learning_rate": 1.6653760710885435e-05, + "loss": 1.2937, + "step": 2796 + }, + { + "epoch": 0.8788896087323557, + "grad_norm": 0.9375, + "learning_rate": 1.6651221834338307e-05, + "loss": 1.2382, + "step": 2798 + }, + { + "epoch": 0.879517835757897, + "grad_norm": 0.8515625, + "learning_rate": 1.6648682957791178e-05, + "loss": 1.2398, + "step": 2800 + }, + { + "epoch": 0.8801460627834383, + "grad_norm": 0.7890625, + "learning_rate": 1.664614408124405e-05, + "loss": 1.3117, + "step": 2802 + }, + { + "epoch": 0.8807742898089798, + "grad_norm": 0.88671875, + "learning_rate": 1.664360520469692e-05, + "loss": 1.35, + "step": 2804 + }, + { + "epoch": 0.8814025168345211, + "grad_norm": 0.8125, + "learning_rate": 1.6641066328149796e-05, + "loss": 1.4186, + "step": 2806 + }, + { + "epoch": 0.8820307438600624, + "grad_norm": 0.67578125, + "learning_rate": 1.6638527451602667e-05, + "loss": 1.2733, + "step": 2808 + }, + { + "epoch": 0.8826589708856037, + "grad_norm": 0.734375, + "learning_rate": 1.663598857505554e-05, + "loss": 1.312, + "step": 2810 + }, + { + "epoch": 0.8832871979111452, + "grad_norm": 0.76171875, + "learning_rate": 1.663344969850841e-05, + "loss": 1.2711, + "step": 2812 + }, + { + "epoch": 0.8839154249366865, + "grad_norm": 0.7265625, + "learning_rate": 1.6630910821961285e-05, + "loss": 1.3649, + "step": 2814 + }, + { + "epoch": 0.8845436519622278, + "grad_norm": 0.734375, + "learning_rate": 1.6628371945414156e-05, + "loss": 1.5247, + "step": 2816 + }, + { + "epoch": 0.8851718789877692, + "grad_norm": 0.76171875, + "learning_rate": 1.6625833068867027e-05, + "loss": 1.2794, + "step": 2818 + }, + { + "epoch": 0.8858001060133106, + "grad_norm": 0.68359375, + "learning_rate": 1.66232941923199e-05, + "loss": 1.2475, + "step": 2820 + }, + { + "epoch": 0.8864283330388519, + "grad_norm": 0.8359375, + "learning_rate": 1.6620755315772774e-05, + "loss": 1.1927, + "step": 2822 + }, + { + "epoch": 0.8870565600643933, + "grad_norm": 0.8828125, + "learning_rate": 1.6618216439225645e-05, + "loss": 1.2817, + "step": 2824 + }, + { + "epoch": 0.8876847870899346, + "grad_norm": 0.72265625, + "learning_rate": 1.6615677562678516e-05, + "loss": 1.3958, + "step": 2826 + }, + { + "epoch": 0.888313014115476, + "grad_norm": 0.69140625, + "learning_rate": 1.6613138686131388e-05, + "loss": 1.4729, + "step": 2828 + }, + { + "epoch": 0.8889412411410174, + "grad_norm": 0.87109375, + "learning_rate": 1.661059980958426e-05, + "loss": 1.2705, + "step": 2830 + }, + { + "epoch": 0.8895694681665587, + "grad_norm": 0.73828125, + "learning_rate": 1.6608060933037134e-05, + "loss": 1.4104, + "step": 2832 + }, + { + "epoch": 0.8901976951921, + "grad_norm": 0.78125, + "learning_rate": 1.6605522056490005e-05, + "loss": 1.3625, + "step": 2834 + }, + { + "epoch": 0.8908259222176415, + "grad_norm": 0.8046875, + "learning_rate": 1.6602983179942877e-05, + "loss": 1.3754, + "step": 2836 + }, + { + "epoch": 0.8914541492431828, + "grad_norm": 0.7578125, + "learning_rate": 1.6600444303395748e-05, + "loss": 1.2763, + "step": 2838 + }, + { + "epoch": 0.8920823762687241, + "grad_norm": 0.6953125, + "learning_rate": 1.6597905426848623e-05, + "loss": 1.1745, + "step": 2840 + }, + { + "epoch": 0.8927106032942654, + "grad_norm": 0.75390625, + "learning_rate": 1.6595366550301494e-05, + "loss": 1.2782, + "step": 2842 + }, + { + "epoch": 0.8933388303198068, + "grad_norm": 0.796875, + "learning_rate": 1.6592827673754366e-05, + "loss": 1.3032, + "step": 2844 + }, + { + "epoch": 0.8939670573453482, + "grad_norm": 0.7265625, + "learning_rate": 1.6590288797207237e-05, + "loss": 1.4176, + "step": 2846 + }, + { + "epoch": 0.8945952843708895, + "grad_norm": 0.71875, + "learning_rate": 1.658774992066011e-05, + "loss": 1.3804, + "step": 2848 + }, + { + "epoch": 0.8952235113964309, + "grad_norm": 0.6796875, + "learning_rate": 1.658521104411298e-05, + "loss": 1.3173, + "step": 2850 + }, + { + "epoch": 0.8958517384219722, + "grad_norm": 0.86328125, + "learning_rate": 1.6582672167565854e-05, + "loss": 1.1673, + "step": 2852 + }, + { + "epoch": 0.8964799654475136, + "grad_norm": 0.69921875, + "learning_rate": 1.6580133291018726e-05, + "loss": 1.3201, + "step": 2854 + }, + { + "epoch": 0.897108192473055, + "grad_norm": 0.6953125, + "learning_rate": 1.6577594414471597e-05, + "loss": 1.3234, + "step": 2856 + }, + { + "epoch": 0.8977364194985963, + "grad_norm": 2.359375, + "learning_rate": 1.657505553792447e-05, + "loss": 1.4672, + "step": 2858 + }, + { + "epoch": 0.8983646465241376, + "grad_norm": 0.78515625, + "learning_rate": 1.6572516661377343e-05, + "loss": 1.3377, + "step": 2860 + }, + { + "epoch": 0.898992873549679, + "grad_norm": 0.71484375, + "learning_rate": 1.6569977784830215e-05, + "loss": 1.2545, + "step": 2862 + }, + { + "epoch": 0.8996211005752204, + "grad_norm": 0.8984375, + "learning_rate": 1.6567438908283086e-05, + "loss": 1.2684, + "step": 2864 + }, + { + "epoch": 0.9002493276007617, + "grad_norm": 0.7421875, + "learning_rate": 1.6564900031735957e-05, + "loss": 1.2329, + "step": 2866 + }, + { + "epoch": 0.900877554626303, + "grad_norm": 0.6796875, + "learning_rate": 1.6562361155188832e-05, + "loss": 1.4101, + "step": 2868 + }, + { + "epoch": 0.9015057816518445, + "grad_norm": 0.7578125, + "learning_rate": 1.65598222786417e-05, + "loss": 1.2965, + "step": 2870 + }, + { + "epoch": 0.9021340086773858, + "grad_norm": 0.90234375, + "learning_rate": 1.6557283402094575e-05, + "loss": 1.331, + "step": 2872 + }, + { + "epoch": 0.9027622357029271, + "grad_norm": 0.765625, + "learning_rate": 1.6554744525547446e-05, + "loss": 1.4061, + "step": 2874 + }, + { + "epoch": 0.9033904627284685, + "grad_norm": 0.76953125, + "learning_rate": 1.6552205649000318e-05, + "loss": 1.3482, + "step": 2876 + }, + { + "epoch": 0.9040186897540099, + "grad_norm": 0.68359375, + "learning_rate": 1.654966677245319e-05, + "loss": 1.3822, + "step": 2878 + }, + { + "epoch": 0.9046469167795512, + "grad_norm": 0.79296875, + "learning_rate": 1.6547127895906064e-05, + "loss": 1.2013, + "step": 2880 + }, + { + "epoch": 0.9052751438050926, + "grad_norm": 0.75390625, + "learning_rate": 1.6544589019358935e-05, + "loss": 1.2415, + "step": 2882 + }, + { + "epoch": 0.9059033708306339, + "grad_norm": 0.8984375, + "learning_rate": 1.6542050142811807e-05, + "loss": 1.3142, + "step": 2884 + }, + { + "epoch": 0.9065315978561753, + "grad_norm": 0.7734375, + "learning_rate": 1.6539511266264678e-05, + "loss": 1.3292, + "step": 2886 + }, + { + "epoch": 0.9071598248817166, + "grad_norm": 0.7421875, + "learning_rate": 1.6536972389717553e-05, + "loss": 1.3243, + "step": 2888 + }, + { + "epoch": 0.907788051907258, + "grad_norm": 0.75, + "learning_rate": 1.6534433513170424e-05, + "loss": 1.2548, + "step": 2890 + }, + { + "epoch": 0.9084162789327993, + "grad_norm": 0.78515625, + "learning_rate": 1.6531894636623296e-05, + "loss": 1.3526, + "step": 2892 + }, + { + "epoch": 0.9090445059583407, + "grad_norm": 0.7890625, + "learning_rate": 1.6529355760076167e-05, + "loss": 1.3198, + "step": 2894 + }, + { + "epoch": 0.9096727329838821, + "grad_norm": 0.6875, + "learning_rate": 1.652681688352904e-05, + "loss": 1.0987, + "step": 2896 + }, + { + "epoch": 0.9103009600094234, + "grad_norm": 0.7890625, + "learning_rate": 1.652427800698191e-05, + "loss": 1.2387, + "step": 2898 + }, + { + "epoch": 0.9109291870349647, + "grad_norm": 0.71484375, + "learning_rate": 1.6521739130434785e-05, + "loss": 1.1774, + "step": 2900 + }, + { + "epoch": 0.9115574140605062, + "grad_norm": 0.78515625, + "learning_rate": 1.6519200253887656e-05, + "loss": 1.2341, + "step": 2902 + }, + { + "epoch": 0.9121856410860475, + "grad_norm": 0.796875, + "learning_rate": 1.6516661377340527e-05, + "loss": 1.2046, + "step": 2904 + }, + { + "epoch": 0.9128138681115888, + "grad_norm": 0.7890625, + "learning_rate": 1.65141225007934e-05, + "loss": 1.477, + "step": 2906 + }, + { + "epoch": 0.9134420951371302, + "grad_norm": 0.7109375, + "learning_rate": 1.6511583624246273e-05, + "loss": 1.4045, + "step": 2908 + }, + { + "epoch": 0.9140703221626716, + "grad_norm": 0.7734375, + "learning_rate": 1.6509044747699145e-05, + "loss": 1.2798, + "step": 2910 + }, + { + "epoch": 0.9146985491882129, + "grad_norm": 0.703125, + "learning_rate": 1.6506505871152016e-05, + "loss": 1.3729, + "step": 2912 + }, + { + "epoch": 0.9153267762137542, + "grad_norm": 0.84375, + "learning_rate": 1.650396699460489e-05, + "loss": 1.3434, + "step": 2914 + }, + { + "epoch": 0.9159550032392956, + "grad_norm": 0.73046875, + "learning_rate": 1.650142811805776e-05, + "loss": 1.3866, + "step": 2916 + }, + { + "epoch": 0.9165832302648369, + "grad_norm": 0.71875, + "learning_rate": 1.6498889241510634e-05, + "loss": 1.2233, + "step": 2918 + }, + { + "epoch": 0.9172114572903783, + "grad_norm": 0.95703125, + "learning_rate": 1.6496350364963505e-05, + "loss": 1.2631, + "step": 2920 + }, + { + "epoch": 0.9178396843159197, + "grad_norm": 0.6640625, + "learning_rate": 1.6493811488416377e-05, + "loss": 1.3768, + "step": 2922 + }, + { + "epoch": 0.918467911341461, + "grad_norm": 0.84375, + "learning_rate": 1.6491272611869248e-05, + "loss": 1.2722, + "step": 2924 + }, + { + "epoch": 0.9190961383670023, + "grad_norm": 0.83203125, + "learning_rate": 1.6488733735322123e-05, + "loss": 1.2799, + "step": 2926 + }, + { + "epoch": 0.9197243653925438, + "grad_norm": 0.859375, + "learning_rate": 1.6486194858774994e-05, + "loss": 1.2571, + "step": 2928 + }, + { + "epoch": 0.9203525924180851, + "grad_norm": 0.71875, + "learning_rate": 1.6483655982227865e-05, + "loss": 1.2148, + "step": 2930 + }, + { + "epoch": 0.9209808194436264, + "grad_norm": 0.74609375, + "learning_rate": 1.6481117105680737e-05, + "loss": 1.3129, + "step": 2932 + }, + { + "epoch": 0.9216090464691677, + "grad_norm": 0.71484375, + "learning_rate": 1.647857822913361e-05, + "loss": 1.2683, + "step": 2934 + }, + { + "epoch": 0.9222372734947092, + "grad_norm": 0.703125, + "learning_rate": 1.6476039352586483e-05, + "loss": 1.356, + "step": 2936 + }, + { + "epoch": 0.9228655005202505, + "grad_norm": 0.74609375, + "learning_rate": 1.6473500476039354e-05, + "loss": 1.2901, + "step": 2938 + }, + { + "epoch": 0.9234937275457918, + "grad_norm": 0.68359375, + "learning_rate": 1.6470961599492226e-05, + "loss": 1.4158, + "step": 2940 + }, + { + "epoch": 0.9241219545713332, + "grad_norm": 0.796875, + "learning_rate": 1.6468422722945097e-05, + "loss": 1.2391, + "step": 2942 + }, + { + "epoch": 0.9247501815968746, + "grad_norm": 0.74609375, + "learning_rate": 1.646588384639797e-05, + "loss": 1.3964, + "step": 2944 + }, + { + "epoch": 0.9253784086224159, + "grad_norm": 0.71875, + "learning_rate": 1.6463344969850843e-05, + "loss": 1.3187, + "step": 2946 + }, + { + "epoch": 0.9260066356479573, + "grad_norm": 0.6640625, + "learning_rate": 1.6460806093303715e-05, + "loss": 1.3794, + "step": 2948 + }, + { + "epoch": 0.9266348626734986, + "grad_norm": 0.69140625, + "learning_rate": 1.6458267216756586e-05, + "loss": 1.3897, + "step": 2950 + }, + { + "epoch": 0.92726308969904, + "grad_norm": 0.73046875, + "learning_rate": 1.6455728340209457e-05, + "loss": 1.2514, + "step": 2952 + }, + { + "epoch": 0.9278913167245814, + "grad_norm": 0.7265625, + "learning_rate": 1.6453189463662332e-05, + "loss": 1.2275, + "step": 2954 + }, + { + "epoch": 0.9285195437501227, + "grad_norm": 0.8828125, + "learning_rate": 1.6450650587115204e-05, + "loss": 1.3661, + "step": 2956 + }, + { + "epoch": 0.929147770775664, + "grad_norm": 0.703125, + "learning_rate": 1.6448111710568075e-05, + "loss": 1.3095, + "step": 2958 + }, + { + "epoch": 0.9297759978012055, + "grad_norm": 0.80859375, + "learning_rate": 1.6445572834020946e-05, + "loss": 1.4244, + "step": 2960 + }, + { + "epoch": 0.9304042248267468, + "grad_norm": 0.69921875, + "learning_rate": 1.644303395747382e-05, + "loss": 1.3683, + "step": 2962 + }, + { + "epoch": 0.9310324518522881, + "grad_norm": 0.71875, + "learning_rate": 1.644049508092669e-05, + "loss": 1.512, + "step": 2964 + }, + { + "epoch": 0.9316606788778294, + "grad_norm": 0.80859375, + "learning_rate": 1.6437956204379564e-05, + "loss": 1.4732, + "step": 2966 + }, + { + "epoch": 0.9322889059033709, + "grad_norm": 0.734375, + "learning_rate": 1.6435417327832435e-05, + "loss": 1.34, + "step": 2968 + }, + { + "epoch": 0.9329171329289122, + "grad_norm": 0.77734375, + "learning_rate": 1.6432878451285307e-05, + "loss": 1.2436, + "step": 2970 + }, + { + "epoch": 0.9335453599544535, + "grad_norm": 0.7421875, + "learning_rate": 1.6430339574738178e-05, + "loss": 1.3719, + "step": 2972 + }, + { + "epoch": 0.9341735869799949, + "grad_norm": 0.79296875, + "learning_rate": 1.6427800698191053e-05, + "loss": 1.3081, + "step": 2974 + }, + { + "epoch": 0.9348018140055363, + "grad_norm": 0.74609375, + "learning_rate": 1.6425261821643924e-05, + "loss": 1.3141, + "step": 2976 + }, + { + "epoch": 0.9354300410310776, + "grad_norm": 0.73828125, + "learning_rate": 1.6422722945096796e-05, + "loss": 1.3385, + "step": 2978 + }, + { + "epoch": 0.936058268056619, + "grad_norm": 0.73046875, + "learning_rate": 1.6420184068549667e-05, + "loss": 1.3452, + "step": 2980 + }, + { + "epoch": 0.9366864950821603, + "grad_norm": 0.80078125, + "learning_rate": 1.6417645192002542e-05, + "loss": 1.2058, + "step": 2982 + }, + { + "epoch": 0.9373147221077017, + "grad_norm": 0.75390625, + "learning_rate": 1.641510631545541e-05, + "loss": 1.3226, + "step": 2984 + }, + { + "epoch": 0.937942949133243, + "grad_norm": 0.7109375, + "learning_rate": 1.6412567438908284e-05, + "loss": 1.362, + "step": 2986 + }, + { + "epoch": 0.9385711761587844, + "grad_norm": 0.734375, + "learning_rate": 1.6410028562361156e-05, + "loss": 1.2904, + "step": 2988 + }, + { + "epoch": 0.9391994031843257, + "grad_norm": 0.875, + "learning_rate": 1.6407489685814027e-05, + "loss": 1.3076, + "step": 2990 + }, + { + "epoch": 0.939827630209867, + "grad_norm": 1.2890625, + "learning_rate": 1.64049508092669e-05, + "loss": 1.2634, + "step": 2992 + }, + { + "epoch": 0.9404558572354085, + "grad_norm": 0.77734375, + "learning_rate": 1.6402411932719773e-05, + "loss": 1.3545, + "step": 2994 + }, + { + "epoch": 0.9410840842609498, + "grad_norm": 0.73046875, + "learning_rate": 1.6399873056172645e-05, + "loss": 1.3319, + "step": 2996 + }, + { + "epoch": 0.9417123112864911, + "grad_norm": 0.75, + "learning_rate": 1.6397334179625516e-05, + "loss": 1.3534, + "step": 2998 + }, + { + "epoch": 0.9423405383120325, + "grad_norm": 0.68359375, + "learning_rate": 1.639479530307839e-05, + "loss": 1.2353, + "step": 3000 + }, + { + "epoch": 0.9429687653375739, + "grad_norm": 0.68359375, + "learning_rate": 1.6392256426531262e-05, + "loss": 1.2108, + "step": 3002 + }, + { + "epoch": 0.9435969923631152, + "grad_norm": 0.71875, + "learning_rate": 1.6389717549984134e-05, + "loss": 1.2961, + "step": 3004 + }, + { + "epoch": 0.9442252193886566, + "grad_norm": 0.734375, + "learning_rate": 1.6387178673437005e-05, + "loss": 1.2746, + "step": 3006 + }, + { + "epoch": 0.9448534464141979, + "grad_norm": 0.72265625, + "learning_rate": 1.638463979688988e-05, + "loss": 1.2231, + "step": 3008 + }, + { + "epoch": 0.9454816734397393, + "grad_norm": 0.85546875, + "learning_rate": 1.6382100920342748e-05, + "loss": 1.4304, + "step": 3010 + }, + { + "epoch": 0.9461099004652807, + "grad_norm": 0.87890625, + "learning_rate": 1.6379562043795623e-05, + "loss": 1.336, + "step": 3012 + }, + { + "epoch": 0.946738127490822, + "grad_norm": 0.78125, + "learning_rate": 1.6377023167248494e-05, + "loss": 1.2532, + "step": 3014 + }, + { + "epoch": 0.9473663545163633, + "grad_norm": 0.73046875, + "learning_rate": 1.6374484290701365e-05, + "loss": 1.3438, + "step": 3016 + }, + { + "epoch": 0.9479945815419047, + "grad_norm": 0.765625, + "learning_rate": 1.6371945414154237e-05, + "loss": 1.3412, + "step": 3018 + }, + { + "epoch": 0.9486228085674461, + "grad_norm": 0.7578125, + "learning_rate": 1.636940653760711e-05, + "loss": 1.2943, + "step": 3020 + }, + { + "epoch": 0.9492510355929874, + "grad_norm": 0.71484375, + "learning_rate": 1.6366867661059983e-05, + "loss": 1.3679, + "step": 3022 + }, + { + "epoch": 0.9498792626185287, + "grad_norm": 0.6953125, + "learning_rate": 1.6364328784512854e-05, + "loss": 1.2899, + "step": 3024 + }, + { + "epoch": 0.9505074896440702, + "grad_norm": 0.87109375, + "learning_rate": 1.6361789907965726e-05, + "loss": 1.4329, + "step": 3026 + }, + { + "epoch": 0.9511357166696115, + "grad_norm": 0.6875, + "learning_rate": 1.63592510314186e-05, + "loss": 1.2883, + "step": 3028 + }, + { + "epoch": 0.9517639436951528, + "grad_norm": 0.66796875, + "learning_rate": 1.6356712154871472e-05, + "loss": 1.2225, + "step": 3030 + }, + { + "epoch": 0.9523921707206942, + "grad_norm": 0.796875, + "learning_rate": 1.6354173278324343e-05, + "loss": 1.3128, + "step": 3032 + }, + { + "epoch": 0.9530203977462356, + "grad_norm": 0.8828125, + "learning_rate": 1.6351634401777215e-05, + "loss": 1.3125, + "step": 3034 + }, + { + "epoch": 0.9536486247717769, + "grad_norm": 0.7265625, + "learning_rate": 1.6349095525230086e-05, + "loss": 1.3294, + "step": 3036 + }, + { + "epoch": 0.9542768517973182, + "grad_norm": 0.79296875, + "learning_rate": 1.6346556648682957e-05, + "loss": 1.2799, + "step": 3038 + }, + { + "epoch": 0.9549050788228596, + "grad_norm": 0.703125, + "learning_rate": 1.6344017772135832e-05, + "loss": 1.3259, + "step": 3040 + }, + { + "epoch": 0.955533305848401, + "grad_norm": 0.66796875, + "learning_rate": 1.6341478895588704e-05, + "loss": 1.2925, + "step": 3042 + }, + { + "epoch": 0.9561615328739423, + "grad_norm": 0.81640625, + "learning_rate": 1.6338940019041575e-05, + "loss": 1.4347, + "step": 3044 + }, + { + "epoch": 0.9567897598994837, + "grad_norm": 0.71875, + "learning_rate": 1.6336401142494446e-05, + "loss": 1.1746, + "step": 3046 + }, + { + "epoch": 0.957417986925025, + "grad_norm": 0.8671875, + "learning_rate": 1.633386226594732e-05, + "loss": 1.3328, + "step": 3048 + }, + { + "epoch": 0.9580462139505664, + "grad_norm": 0.6953125, + "learning_rate": 1.6331323389400192e-05, + "loss": 1.3283, + "step": 3050 + }, + { + "epoch": 0.9586744409761078, + "grad_norm": 0.67578125, + "learning_rate": 1.6328784512853064e-05, + "loss": 1.4036, + "step": 3052 + }, + { + "epoch": 0.9593026680016491, + "grad_norm": 0.83984375, + "learning_rate": 1.6326245636305935e-05, + "loss": 1.2255, + "step": 3054 + }, + { + "epoch": 0.9599308950271904, + "grad_norm": 0.7265625, + "learning_rate": 1.632370675975881e-05, + "loss": 1.2382, + "step": 3056 + }, + { + "epoch": 0.9605591220527318, + "grad_norm": 0.72265625, + "learning_rate": 1.6321167883211678e-05, + "loss": 1.311, + "step": 3058 + }, + { + "epoch": 0.9611873490782732, + "grad_norm": 0.7109375, + "learning_rate": 1.6318629006664553e-05, + "loss": 1.2916, + "step": 3060 + }, + { + "epoch": 0.9618155761038145, + "grad_norm": 0.734375, + "learning_rate": 1.6316090130117424e-05, + "loss": 1.2996, + "step": 3062 + }, + { + "epoch": 0.9624438031293558, + "grad_norm": 0.6796875, + "learning_rate": 1.6313551253570295e-05, + "loss": 1.3253, + "step": 3064 + }, + { + "epoch": 0.9630720301548972, + "grad_norm": 0.66796875, + "learning_rate": 1.6311012377023167e-05, + "loss": 1.3582, + "step": 3066 + }, + { + "epoch": 0.9637002571804386, + "grad_norm": 0.671875, + "learning_rate": 1.630847350047604e-05, + "loss": 1.3637, + "step": 3068 + }, + { + "epoch": 0.9643284842059799, + "grad_norm": 0.80859375, + "learning_rate": 1.6305934623928913e-05, + "loss": 1.2882, + "step": 3070 + }, + { + "epoch": 0.9649567112315213, + "grad_norm": 1.1171875, + "learning_rate": 1.6303395747381784e-05, + "loss": 1.2281, + "step": 3072 + }, + { + "epoch": 0.9655849382570626, + "grad_norm": 0.80078125, + "learning_rate": 1.6300856870834656e-05, + "loss": 1.3915, + "step": 3074 + }, + { + "epoch": 0.966213165282604, + "grad_norm": 0.75, + "learning_rate": 1.629831799428753e-05, + "loss": 1.401, + "step": 3076 + }, + { + "epoch": 0.9668413923081454, + "grad_norm": 0.73828125, + "learning_rate": 1.62957791177404e-05, + "loss": 1.2698, + "step": 3078 + }, + { + "epoch": 0.9674696193336867, + "grad_norm": 0.75, + "learning_rate": 1.6293240241193273e-05, + "loss": 1.3833, + "step": 3080 + }, + { + "epoch": 0.968097846359228, + "grad_norm": 0.84375, + "learning_rate": 1.6290701364646148e-05, + "loss": 1.2167, + "step": 3082 + }, + { + "epoch": 0.9687260733847695, + "grad_norm": 0.6875, + "learning_rate": 1.6288162488099016e-05, + "loss": 1.3872, + "step": 3084 + }, + { + "epoch": 0.9693543004103108, + "grad_norm": 0.65625, + "learning_rate": 1.628562361155189e-05, + "loss": 1.1942, + "step": 3086 + }, + { + "epoch": 0.9699825274358521, + "grad_norm": 0.91015625, + "learning_rate": 1.6283084735004762e-05, + "loss": 1.3442, + "step": 3088 + }, + { + "epoch": 0.9706107544613934, + "grad_norm": 0.68359375, + "learning_rate": 1.6280545858457634e-05, + "loss": 1.1895, + "step": 3090 + }, + { + "epoch": 0.9712389814869349, + "grad_norm": 0.765625, + "learning_rate": 1.6278006981910505e-05, + "loss": 1.2817, + "step": 3092 + }, + { + "epoch": 0.9718672085124762, + "grad_norm": 0.85546875, + "learning_rate": 1.627546810536338e-05, + "loss": 1.3469, + "step": 3094 + }, + { + "epoch": 0.9724954355380175, + "grad_norm": 0.76171875, + "learning_rate": 1.627292922881625e-05, + "loss": 1.3717, + "step": 3096 + }, + { + "epoch": 0.9731236625635589, + "grad_norm": 0.6953125, + "learning_rate": 1.6270390352269123e-05, + "loss": 1.3015, + "step": 3098 + }, + { + "epoch": 0.9737518895891003, + "grad_norm": 0.703125, + "learning_rate": 1.6267851475721994e-05, + "loss": 1.4265, + "step": 3100 + }, + { + "epoch": 0.9743801166146416, + "grad_norm": 0.80859375, + "learning_rate": 1.626531259917487e-05, + "loss": 1.2424, + "step": 3102 + }, + { + "epoch": 0.975008343640183, + "grad_norm": 0.8125, + "learning_rate": 1.6262773722627737e-05, + "loss": 1.3252, + "step": 3104 + }, + { + "epoch": 0.9756365706657243, + "grad_norm": 0.87890625, + "learning_rate": 1.626023484608061e-05, + "loss": 1.3464, + "step": 3106 + }, + { + "epoch": 0.9762647976912657, + "grad_norm": 0.81640625, + "learning_rate": 1.6257695969533483e-05, + "loss": 1.3007, + "step": 3108 + }, + { + "epoch": 0.9768930247168071, + "grad_norm": 0.90234375, + "learning_rate": 1.6255157092986354e-05, + "loss": 1.2549, + "step": 3110 + }, + { + "epoch": 0.9775212517423484, + "grad_norm": 0.76171875, + "learning_rate": 1.6252618216439226e-05, + "loss": 1.3299, + "step": 3112 + }, + { + "epoch": 0.9781494787678897, + "grad_norm": 0.77734375, + "learning_rate": 1.62500793398921e-05, + "loss": 1.3118, + "step": 3114 + }, + { + "epoch": 0.9787777057934312, + "grad_norm": 0.765625, + "learning_rate": 1.6247540463344972e-05, + "loss": 1.3114, + "step": 3116 + }, + { + "epoch": 0.9794059328189725, + "grad_norm": 0.66796875, + "learning_rate": 1.6245001586797843e-05, + "loss": 1.2729, + "step": 3118 + }, + { + "epoch": 0.9800341598445138, + "grad_norm": 0.69921875, + "learning_rate": 1.6242462710250715e-05, + "loss": 1.3769, + "step": 3120 + }, + { + "epoch": 0.9806623868700551, + "grad_norm": 0.65234375, + "learning_rate": 1.623992383370359e-05, + "loss": 1.2581, + "step": 3122 + }, + { + "epoch": 0.9812906138955966, + "grad_norm": 0.73046875, + "learning_rate": 1.623738495715646e-05, + "loss": 1.3578, + "step": 3124 + }, + { + "epoch": 0.9819188409211379, + "grad_norm": 0.67578125, + "learning_rate": 1.6234846080609332e-05, + "loss": 1.3055, + "step": 3126 + }, + { + "epoch": 0.9825470679466792, + "grad_norm": 1.3125, + "learning_rate": 1.6232307204062203e-05, + "loss": 1.2103, + "step": 3128 + }, + { + "epoch": 0.9831752949722206, + "grad_norm": 0.79296875, + "learning_rate": 1.6229768327515075e-05, + "loss": 1.3818, + "step": 3130 + }, + { + "epoch": 0.9838035219977619, + "grad_norm": 0.703125, + "learning_rate": 1.6227229450967946e-05, + "loss": 1.3531, + "step": 3132 + }, + { + "epoch": 0.9844317490233033, + "grad_norm": 0.73046875, + "learning_rate": 1.622469057442082e-05, + "loss": 1.2813, + "step": 3134 + }, + { + "epoch": 0.9850599760488447, + "grad_norm": 0.7578125, + "learning_rate": 1.6222151697873692e-05, + "loss": 1.3331, + "step": 3136 + }, + { + "epoch": 0.985688203074386, + "grad_norm": 0.72265625, + "learning_rate": 1.6219612821326564e-05, + "loss": 1.3681, + "step": 3138 + }, + { + "epoch": 0.9863164300999273, + "grad_norm": 0.7421875, + "learning_rate": 1.6217073944779435e-05, + "loss": 1.3545, + "step": 3140 + }, + { + "epoch": 0.9869446571254687, + "grad_norm": 0.703125, + "learning_rate": 1.621453506823231e-05, + "loss": 1.392, + "step": 3142 + }, + { + "epoch": 0.9875728841510101, + "grad_norm": 0.79296875, + "learning_rate": 1.621199619168518e-05, + "loss": 1.3232, + "step": 3144 + }, + { + "epoch": 0.9882011111765514, + "grad_norm": 0.7578125, + "learning_rate": 1.6209457315138053e-05, + "loss": 1.2144, + "step": 3146 + }, + { + "epoch": 0.9888293382020927, + "grad_norm": 0.6796875, + "learning_rate": 1.6206918438590924e-05, + "loss": 1.3129, + "step": 3148 + }, + { + "epoch": 0.9894575652276342, + "grad_norm": 0.74609375, + "learning_rate": 1.62043795620438e-05, + "loss": 1.3398, + "step": 3150 + }, + { + "epoch": 0.9900857922531755, + "grad_norm": 0.79296875, + "learning_rate": 1.6201840685496667e-05, + "loss": 1.3094, + "step": 3152 + }, + { + "epoch": 0.9907140192787168, + "grad_norm": 0.66796875, + "learning_rate": 1.619930180894954e-05, + "loss": 1.436, + "step": 3154 + }, + { + "epoch": 0.9913422463042582, + "grad_norm": 0.828125, + "learning_rate": 1.6196762932402413e-05, + "loss": 1.4225, + "step": 3156 + }, + { + "epoch": 0.9919704733297996, + "grad_norm": 0.76953125, + "learning_rate": 1.6194224055855284e-05, + "loss": 1.2521, + "step": 3158 + }, + { + "epoch": 0.9925987003553409, + "grad_norm": 0.7265625, + "learning_rate": 1.6191685179308156e-05, + "loss": 1.2926, + "step": 3160 + }, + { + "epoch": 0.9932269273808823, + "grad_norm": 0.66796875, + "learning_rate": 1.618914630276103e-05, + "loss": 1.222, + "step": 3162 + }, + { + "epoch": 0.9938551544064236, + "grad_norm": 0.87890625, + "learning_rate": 1.6186607426213902e-05, + "loss": 1.3083, + "step": 3164 + }, + { + "epoch": 0.994483381431965, + "grad_norm": 0.66015625, + "learning_rate": 1.6184068549666773e-05, + "loss": 1.3349, + "step": 3166 + }, + { + "epoch": 0.9951116084575063, + "grad_norm": 0.66015625, + "learning_rate": 1.6181529673119648e-05, + "loss": 1.306, + "step": 3168 + }, + { + "epoch": 0.9957398354830477, + "grad_norm": 0.796875, + "learning_rate": 1.617899079657252e-05, + "loss": 1.2053, + "step": 3170 + }, + { + "epoch": 0.996368062508589, + "grad_norm": 0.91015625, + "learning_rate": 1.617645192002539e-05, + "loss": 1.2241, + "step": 3172 + }, + { + "epoch": 0.9969962895341304, + "grad_norm": 0.78125, + "learning_rate": 1.6173913043478262e-05, + "loss": 1.1861, + "step": 3174 + }, + { + "epoch": 0.9976245165596718, + "grad_norm": 0.6953125, + "learning_rate": 1.6171374166931137e-05, + "loss": 1.3181, + "step": 3176 + }, + { + "epoch": 0.9982527435852131, + "grad_norm": 0.7265625, + "learning_rate": 1.6168835290384005e-05, + "loss": 1.2654, + "step": 3178 + }, + { + "epoch": 0.9988809706107544, + "grad_norm": 0.77734375, + "learning_rate": 1.616629641383688e-05, + "loss": 1.435, + "step": 3180 + }, + { + "epoch": 0.9995091976362959, + "grad_norm": 0.75, + "learning_rate": 1.616375753728975e-05, + "loss": 1.4596, + "step": 3182 + }, + { + "epoch": 1.000137424661837, + "grad_norm": 0.68359375, + "learning_rate": 1.6161218660742622e-05, + "loss": 1.3658, + "step": 3184 + }, + { + "epoch": 1.0007656516873786, + "grad_norm": 0.67578125, + "learning_rate": 1.6158679784195494e-05, + "loss": 1.278, + "step": 3186 + }, + { + "epoch": 1.00139387871292, + "grad_norm": 0.6640625, + "learning_rate": 1.615614090764837e-05, + "loss": 1.3165, + "step": 3188 + }, + { + "epoch": 1.0020221057384613, + "grad_norm": 0.68359375, + "learning_rate": 1.615360203110124e-05, + "loss": 1.292, + "step": 3190 + }, + { + "epoch": 1.0026503327640026, + "grad_norm": 0.7109375, + "learning_rate": 1.615106315455411e-05, + "loss": 1.2257, + "step": 3192 + }, + { + "epoch": 1.003278559789544, + "grad_norm": 0.78125, + "learning_rate": 1.6148524278006983e-05, + "loss": 1.256, + "step": 3194 + }, + { + "epoch": 1.0039067868150853, + "grad_norm": 0.83984375, + "learning_rate": 1.6145985401459858e-05, + "loss": 1.2126, + "step": 3196 + }, + { + "epoch": 1.0045350138406266, + "grad_norm": 0.75390625, + "learning_rate": 1.6143446524912726e-05, + "loss": 1.2712, + "step": 3198 + }, + { + "epoch": 1.005163240866168, + "grad_norm": 0.76953125, + "learning_rate": 1.61409076483656e-05, + "loss": 1.1845, + "step": 3200 + }, + { + "epoch": 1.0057914678917093, + "grad_norm": 0.796875, + "learning_rate": 1.613836877181847e-05, + "loss": 1.2542, + "step": 3202 + }, + { + "epoch": 1.0064196949172508, + "grad_norm": 0.8125, + "learning_rate": 1.6135829895271343e-05, + "loss": 1.2691, + "step": 3204 + }, + { + "epoch": 1.0070479219427921, + "grad_norm": 0.78125, + "learning_rate": 1.6133291018724214e-05, + "loss": 1.1913, + "step": 3206 + }, + { + "epoch": 1.0076761489683335, + "grad_norm": 0.78125, + "learning_rate": 1.613075214217709e-05, + "loss": 1.1205, + "step": 3208 + }, + { + "epoch": 1.0083043759938748, + "grad_norm": 0.7578125, + "learning_rate": 1.612821326562996e-05, + "loss": 1.2168, + "step": 3210 + }, + { + "epoch": 1.0089326030194161, + "grad_norm": 0.73046875, + "learning_rate": 1.6125674389082832e-05, + "loss": 1.0832, + "step": 3212 + }, + { + "epoch": 1.0095608300449574, + "grad_norm": 0.76953125, + "learning_rate": 1.6123135512535703e-05, + "loss": 1.2185, + "step": 3214 + }, + { + "epoch": 1.0101890570704988, + "grad_norm": 0.78515625, + "learning_rate": 1.6120596635988578e-05, + "loss": 1.3084, + "step": 3216 + }, + { + "epoch": 1.01081728409604, + "grad_norm": 0.82421875, + "learning_rate": 1.6118057759441446e-05, + "loss": 1.1159, + "step": 3218 + }, + { + "epoch": 1.0114455111215817, + "grad_norm": 0.7421875, + "learning_rate": 1.611551888289432e-05, + "loss": 1.165, + "step": 3220 + }, + { + "epoch": 1.012073738147123, + "grad_norm": 0.73046875, + "learning_rate": 1.6112980006347192e-05, + "loss": 1.1485, + "step": 3222 + }, + { + "epoch": 1.0127019651726643, + "grad_norm": 0.90234375, + "learning_rate": 1.6110441129800064e-05, + "loss": 1.23, + "step": 3224 + }, + { + "epoch": 1.0133301921982056, + "grad_norm": 0.6875, + "learning_rate": 1.6107902253252935e-05, + "loss": 1.1168, + "step": 3226 + }, + { + "epoch": 1.013958419223747, + "grad_norm": 0.8359375, + "learning_rate": 1.610536337670581e-05, + "loss": 1.0645, + "step": 3228 + }, + { + "epoch": 1.0145866462492883, + "grad_norm": 0.7890625, + "learning_rate": 1.610282450015868e-05, + "loss": 1.2309, + "step": 3230 + }, + { + "epoch": 1.0152148732748296, + "grad_norm": 0.83203125, + "learning_rate": 1.6100285623611553e-05, + "loss": 1.2314, + "step": 3232 + }, + { + "epoch": 1.015843100300371, + "grad_norm": 0.7734375, + "learning_rate": 1.6097746747064424e-05, + "loss": 1.3229, + "step": 3234 + }, + { + "epoch": 1.0164713273259125, + "grad_norm": 0.73828125, + "learning_rate": 1.60952078705173e-05, + "loss": 1.1837, + "step": 3236 + }, + { + "epoch": 1.0170995543514538, + "grad_norm": 0.8203125, + "learning_rate": 1.609266899397017e-05, + "loss": 1.2404, + "step": 3238 + }, + { + "epoch": 1.0177277813769952, + "grad_norm": 0.765625, + "learning_rate": 1.609013011742304e-05, + "loss": 1.344, + "step": 3240 + }, + { + "epoch": 1.0183560084025365, + "grad_norm": 0.74609375, + "learning_rate": 1.6087591240875913e-05, + "loss": 1.3881, + "step": 3242 + }, + { + "epoch": 1.0189842354280778, + "grad_norm": 0.7890625, + "learning_rate": 1.6085052364328784e-05, + "loss": 1.3539, + "step": 3244 + }, + { + "epoch": 1.0196124624536191, + "grad_norm": 0.72265625, + "learning_rate": 1.6082513487781656e-05, + "loss": 1.2862, + "step": 3246 + }, + { + "epoch": 1.0202406894791605, + "grad_norm": 0.73046875, + "learning_rate": 1.607997461123453e-05, + "loss": 1.3068, + "step": 3248 + }, + { + "epoch": 1.0208689165047018, + "grad_norm": 0.6875, + "learning_rate": 1.6077435734687402e-05, + "loss": 1.2403, + "step": 3250 + }, + { + "epoch": 1.0214971435302433, + "grad_norm": 0.7578125, + "learning_rate": 1.6074896858140273e-05, + "loss": 1.2603, + "step": 3252 + }, + { + "epoch": 1.0221253705557847, + "grad_norm": 0.83203125, + "learning_rate": 1.6072357981593148e-05, + "loss": 1.1728, + "step": 3254 + }, + { + "epoch": 1.022753597581326, + "grad_norm": 0.7734375, + "learning_rate": 1.606981910504602e-05, + "loss": 1.1989, + "step": 3256 + }, + { + "epoch": 1.0233818246068673, + "grad_norm": 0.84765625, + "learning_rate": 1.606728022849889e-05, + "loss": 1.2041, + "step": 3258 + }, + { + "epoch": 1.0240100516324087, + "grad_norm": 0.84375, + "learning_rate": 1.6064741351951762e-05, + "loss": 1.2197, + "step": 3260 + }, + { + "epoch": 1.02463827865795, + "grad_norm": 0.81640625, + "learning_rate": 1.6062202475404637e-05, + "loss": 1.1039, + "step": 3262 + }, + { + "epoch": 1.0252665056834913, + "grad_norm": 0.74609375, + "learning_rate": 1.6059663598857508e-05, + "loss": 1.2697, + "step": 3264 + }, + { + "epoch": 1.0258947327090326, + "grad_norm": 0.8515625, + "learning_rate": 1.605712472231038e-05, + "loss": 1.152, + "step": 3266 + }, + { + "epoch": 1.026522959734574, + "grad_norm": 0.83203125, + "learning_rate": 1.605458584576325e-05, + "loss": 1.2728, + "step": 3268 + }, + { + "epoch": 1.0271511867601155, + "grad_norm": 0.77734375, + "learning_rate": 1.6052046969216122e-05, + "loss": 1.1901, + "step": 3270 + }, + { + "epoch": 1.0277794137856568, + "grad_norm": 0.765625, + "learning_rate": 1.6049508092668994e-05, + "loss": 1.2918, + "step": 3272 + }, + { + "epoch": 1.0284076408111982, + "grad_norm": 0.7890625, + "learning_rate": 1.604696921612187e-05, + "loss": 1.2328, + "step": 3274 + }, + { + "epoch": 1.0290358678367395, + "grad_norm": 0.7578125, + "learning_rate": 1.604443033957474e-05, + "loss": 1.2688, + "step": 3276 + }, + { + "epoch": 1.0296640948622808, + "grad_norm": 0.78125, + "learning_rate": 1.604189146302761e-05, + "loss": 1.1562, + "step": 3278 + }, + { + "epoch": 1.0302923218878222, + "grad_norm": 0.83203125, + "learning_rate": 1.6039352586480483e-05, + "loss": 1.2546, + "step": 3280 + }, + { + "epoch": 1.0309205489133635, + "grad_norm": 0.75, + "learning_rate": 1.6036813709933357e-05, + "loss": 1.1542, + "step": 3282 + }, + { + "epoch": 1.0315487759389048, + "grad_norm": 0.87109375, + "learning_rate": 1.603427483338623e-05, + "loss": 1.2115, + "step": 3284 + }, + { + "epoch": 1.0321770029644464, + "grad_norm": 0.8671875, + "learning_rate": 1.60317359568391e-05, + "loss": 1.3227, + "step": 3286 + }, + { + "epoch": 1.0328052299899877, + "grad_norm": 0.77734375, + "learning_rate": 1.602919708029197e-05, + "loss": 1.2416, + "step": 3288 + }, + { + "epoch": 1.033433457015529, + "grad_norm": 0.8125, + "learning_rate": 1.6026658203744846e-05, + "loss": 1.2146, + "step": 3290 + }, + { + "epoch": 1.0340616840410704, + "grad_norm": 0.75390625, + "learning_rate": 1.6024119327197714e-05, + "loss": 1.2237, + "step": 3292 + }, + { + "epoch": 1.0346899110666117, + "grad_norm": 0.734375, + "learning_rate": 1.602158045065059e-05, + "loss": 1.1345, + "step": 3294 + }, + { + "epoch": 1.035318138092153, + "grad_norm": 0.76171875, + "learning_rate": 1.601904157410346e-05, + "loss": 1.1062, + "step": 3296 + }, + { + "epoch": 1.0359463651176943, + "grad_norm": 0.74609375, + "learning_rate": 1.6016502697556332e-05, + "loss": 1.2563, + "step": 3298 + }, + { + "epoch": 1.0365745921432357, + "grad_norm": 0.7890625, + "learning_rate": 1.6013963821009203e-05, + "loss": 1.1443, + "step": 3300 + }, + { + "epoch": 1.0372028191687772, + "grad_norm": 0.8125, + "learning_rate": 1.6011424944462078e-05, + "loss": 1.3163, + "step": 3302 + }, + { + "epoch": 1.0378310461943185, + "grad_norm": 0.80078125, + "learning_rate": 1.600888606791495e-05, + "loss": 1.2362, + "step": 3304 + }, + { + "epoch": 1.0384592732198599, + "grad_norm": 0.828125, + "learning_rate": 1.600634719136782e-05, + "loss": 1.2361, + "step": 3306 + }, + { + "epoch": 1.0390875002454012, + "grad_norm": 0.859375, + "learning_rate": 1.6003808314820692e-05, + "loss": 1.2125, + "step": 3308 + }, + { + "epoch": 1.0397157272709425, + "grad_norm": 0.9140625, + "learning_rate": 1.6001269438273567e-05, + "loss": 1.1605, + "step": 3310 + }, + { + "epoch": 1.0403439542964839, + "grad_norm": 0.88671875, + "learning_rate": 1.5998730561726435e-05, + "loss": 1.1425, + "step": 3312 + }, + { + "epoch": 1.0409721813220252, + "grad_norm": 0.72265625, + "learning_rate": 1.599619168517931e-05, + "loss": 1.1267, + "step": 3314 + }, + { + "epoch": 1.0416004083475665, + "grad_norm": 0.79296875, + "learning_rate": 1.599365280863218e-05, + "loss": 1.3268, + "step": 3316 + }, + { + "epoch": 1.042228635373108, + "grad_norm": 0.85546875, + "learning_rate": 1.5991113932085053e-05, + "loss": 1.2864, + "step": 3318 + }, + { + "epoch": 1.0428568623986494, + "grad_norm": 0.796875, + "learning_rate": 1.5988575055537924e-05, + "loss": 1.3753, + "step": 3320 + }, + { + "epoch": 1.0434850894241907, + "grad_norm": 0.734375, + "learning_rate": 1.59860361789908e-05, + "loss": 1.279, + "step": 3322 + }, + { + "epoch": 1.044113316449732, + "grad_norm": 0.77734375, + "learning_rate": 1.598349730244367e-05, + "loss": 1.2186, + "step": 3324 + }, + { + "epoch": 1.0447415434752734, + "grad_norm": 0.765625, + "learning_rate": 1.598095842589654e-05, + "loss": 1.2752, + "step": 3326 + }, + { + "epoch": 1.0453697705008147, + "grad_norm": 0.88671875, + "learning_rate": 1.5978419549349413e-05, + "loss": 1.0901, + "step": 3328 + }, + { + "epoch": 1.045997997526356, + "grad_norm": 0.71875, + "learning_rate": 1.5975880672802288e-05, + "loss": 1.2462, + "step": 3330 + }, + { + "epoch": 1.0466262245518974, + "grad_norm": 0.81640625, + "learning_rate": 1.597334179625516e-05, + "loss": 1.2196, + "step": 3332 + }, + { + "epoch": 1.0472544515774387, + "grad_norm": 0.77734375, + "learning_rate": 1.597080291970803e-05, + "loss": 1.181, + "step": 3334 + }, + { + "epoch": 1.0478826786029802, + "grad_norm": 0.828125, + "learning_rate": 1.5968264043160902e-05, + "loss": 1.2889, + "step": 3336 + }, + { + "epoch": 1.0485109056285216, + "grad_norm": 0.79296875, + "learning_rate": 1.5965725166613773e-05, + "loss": 1.2918, + "step": 3338 + }, + { + "epoch": 1.049139132654063, + "grad_norm": 0.8515625, + "learning_rate": 1.5963186290066648e-05, + "loss": 1.2074, + "step": 3340 + }, + { + "epoch": 1.0497673596796042, + "grad_norm": 0.72265625, + "learning_rate": 1.596064741351952e-05, + "loss": 1.0973, + "step": 3342 + }, + { + "epoch": 1.0503955867051455, + "grad_norm": 0.85546875, + "learning_rate": 1.595810853697239e-05, + "loss": 1.2965, + "step": 3344 + }, + { + "epoch": 1.0510238137306869, + "grad_norm": 0.90625, + "learning_rate": 1.5955569660425262e-05, + "loss": 1.1753, + "step": 3346 + }, + { + "epoch": 1.0516520407562282, + "grad_norm": 0.89453125, + "learning_rate": 1.5953030783878137e-05, + "loss": 1.1732, + "step": 3348 + }, + { + "epoch": 1.0522802677817698, + "grad_norm": 0.8125, + "learning_rate": 1.5950491907331008e-05, + "loss": 1.2432, + "step": 3350 + }, + { + "epoch": 1.052908494807311, + "grad_norm": 0.74609375, + "learning_rate": 1.594795303078388e-05, + "loss": 1.3405, + "step": 3352 + }, + { + "epoch": 1.0535367218328524, + "grad_norm": 0.73828125, + "learning_rate": 1.594541415423675e-05, + "loss": 1.2937, + "step": 3354 + }, + { + "epoch": 1.0541649488583937, + "grad_norm": 0.76953125, + "learning_rate": 1.5942875277689626e-05, + "loss": 1.2403, + "step": 3356 + }, + { + "epoch": 1.054793175883935, + "grad_norm": 0.71875, + "learning_rate": 1.5940336401142497e-05, + "loss": 1.1935, + "step": 3358 + }, + { + "epoch": 1.0554214029094764, + "grad_norm": 0.80859375, + "learning_rate": 1.593779752459537e-05, + "loss": 1.1554, + "step": 3360 + }, + { + "epoch": 1.0560496299350177, + "grad_norm": 0.79296875, + "learning_rate": 1.593525864804824e-05, + "loss": 1.2863, + "step": 3362 + }, + { + "epoch": 1.056677856960559, + "grad_norm": 0.76953125, + "learning_rate": 1.593271977150111e-05, + "loss": 1.3232, + "step": 3364 + }, + { + "epoch": 1.0573060839861004, + "grad_norm": 0.87109375, + "learning_rate": 1.5930180894953983e-05, + "loss": 1.165, + "step": 3366 + }, + { + "epoch": 1.057934311011642, + "grad_norm": 0.859375, + "learning_rate": 1.5927642018406857e-05, + "loss": 1.2739, + "step": 3368 + }, + { + "epoch": 1.0585625380371833, + "grad_norm": 0.72265625, + "learning_rate": 1.592510314185973e-05, + "loss": 1.1609, + "step": 3370 + }, + { + "epoch": 1.0591907650627246, + "grad_norm": 0.7421875, + "learning_rate": 1.59225642653126e-05, + "loss": 1.3293, + "step": 3372 + }, + { + "epoch": 1.059818992088266, + "grad_norm": 0.76171875, + "learning_rate": 1.592002538876547e-05, + "loss": 1.2502, + "step": 3374 + }, + { + "epoch": 1.0604472191138072, + "grad_norm": 0.7578125, + "learning_rate": 1.5917486512218346e-05, + "loss": 1.1905, + "step": 3376 + }, + { + "epoch": 1.0610754461393486, + "grad_norm": 0.79296875, + "learning_rate": 1.5914947635671218e-05, + "loss": 1.3572, + "step": 3378 + }, + { + "epoch": 1.06170367316489, + "grad_norm": 0.96875, + "learning_rate": 1.591240875912409e-05, + "loss": 1.2005, + "step": 3380 + }, + { + "epoch": 1.0623319001904312, + "grad_norm": 0.74609375, + "learning_rate": 1.590986988257696e-05, + "loss": 1.2663, + "step": 3382 + }, + { + "epoch": 1.0629601272159728, + "grad_norm": 0.80078125, + "learning_rate": 1.5907331006029835e-05, + "loss": 1.2143, + "step": 3384 + }, + { + "epoch": 1.063588354241514, + "grad_norm": 0.73828125, + "learning_rate": 1.5904792129482703e-05, + "loss": 1.2849, + "step": 3386 + }, + { + "epoch": 1.0642165812670554, + "grad_norm": 0.765625, + "learning_rate": 1.5902253252935578e-05, + "loss": 1.2787, + "step": 3388 + }, + { + "epoch": 1.0648448082925968, + "grad_norm": 0.8046875, + "learning_rate": 1.589971437638845e-05, + "loss": 1.1279, + "step": 3390 + }, + { + "epoch": 1.065473035318138, + "grad_norm": 0.7890625, + "learning_rate": 1.589717549984132e-05, + "loss": 1.2399, + "step": 3392 + }, + { + "epoch": 1.0661012623436794, + "grad_norm": 0.8125, + "learning_rate": 1.5894636623294192e-05, + "loss": 1.2965, + "step": 3394 + }, + { + "epoch": 1.0667294893692207, + "grad_norm": 0.7265625, + "learning_rate": 1.5892097746747067e-05, + "loss": 1.1753, + "step": 3396 + }, + { + "epoch": 1.067357716394762, + "grad_norm": 0.80859375, + "learning_rate": 1.5889558870199938e-05, + "loss": 1.229, + "step": 3398 + }, + { + "epoch": 1.0679859434203034, + "grad_norm": 0.73046875, + "learning_rate": 1.588701999365281e-05, + "loss": 1.2592, + "step": 3400 + }, + { + "epoch": 1.068614170445845, + "grad_norm": 0.78125, + "learning_rate": 1.588448111710568e-05, + "loss": 1.2941, + "step": 3402 + }, + { + "epoch": 1.0692423974713863, + "grad_norm": 0.75390625, + "learning_rate": 1.5881942240558556e-05, + "loss": 1.2962, + "step": 3404 + }, + { + "epoch": 1.0698706244969276, + "grad_norm": 0.796875, + "learning_rate": 1.5879403364011424e-05, + "loss": 1.2558, + "step": 3406 + }, + { + "epoch": 1.070498851522469, + "grad_norm": 0.7578125, + "learning_rate": 1.58768644874643e-05, + "loss": 1.3021, + "step": 3408 + }, + { + "epoch": 1.0711270785480103, + "grad_norm": 0.80859375, + "learning_rate": 1.587432561091717e-05, + "loss": 1.2742, + "step": 3410 + }, + { + "epoch": 1.0717553055735516, + "grad_norm": 0.796875, + "learning_rate": 1.587178673437004e-05, + "loss": 1.2945, + "step": 3412 + }, + { + "epoch": 1.072383532599093, + "grad_norm": 0.83984375, + "learning_rate": 1.5869247857822913e-05, + "loss": 1.323, + "step": 3414 + }, + { + "epoch": 1.0730117596246345, + "grad_norm": 0.76953125, + "learning_rate": 1.5866708981275787e-05, + "loss": 1.2498, + "step": 3416 + }, + { + "epoch": 1.0736399866501758, + "grad_norm": 0.765625, + "learning_rate": 1.586417010472866e-05, + "loss": 1.0935, + "step": 3418 + }, + { + "epoch": 1.0742682136757171, + "grad_norm": 0.77734375, + "learning_rate": 1.586163122818153e-05, + "loss": 1.1667, + "step": 3420 + }, + { + "epoch": 1.0748964407012584, + "grad_norm": 0.75390625, + "learning_rate": 1.58590923516344e-05, + "loss": 1.2579, + "step": 3422 + }, + { + "epoch": 1.0755246677267998, + "grad_norm": 0.79296875, + "learning_rate": 1.5856553475087276e-05, + "loss": 1.1673, + "step": 3424 + }, + { + "epoch": 1.076152894752341, + "grad_norm": 0.73046875, + "learning_rate": 1.5854014598540148e-05, + "loss": 1.411, + "step": 3426 + }, + { + "epoch": 1.0767811217778824, + "grad_norm": 0.82421875, + "learning_rate": 1.585147572199302e-05, + "loss": 1.1398, + "step": 3428 + }, + { + "epoch": 1.0774093488034238, + "grad_norm": 0.76953125, + "learning_rate": 1.5848936845445894e-05, + "loss": 1.2636, + "step": 3430 + }, + { + "epoch": 1.078037575828965, + "grad_norm": 0.71484375, + "learning_rate": 1.5846397968898762e-05, + "loss": 1.2364, + "step": 3432 + }, + { + "epoch": 1.0786658028545066, + "grad_norm": 0.78515625, + "learning_rate": 1.5843859092351637e-05, + "loss": 1.4491, + "step": 3434 + }, + { + "epoch": 1.079294029880048, + "grad_norm": 0.828125, + "learning_rate": 1.5841320215804508e-05, + "loss": 1.1533, + "step": 3436 + }, + { + "epoch": 1.0799222569055893, + "grad_norm": 0.95703125, + "learning_rate": 1.583878133925738e-05, + "loss": 1.229, + "step": 3438 + }, + { + "epoch": 1.0805504839311306, + "grad_norm": 0.76171875, + "learning_rate": 1.583624246271025e-05, + "loss": 1.4361, + "step": 3440 + }, + { + "epoch": 1.081178710956672, + "grad_norm": 0.7890625, + "learning_rate": 1.5833703586163126e-05, + "loss": 1.3107, + "step": 3442 + }, + { + "epoch": 1.0818069379822133, + "grad_norm": 0.84765625, + "learning_rate": 1.5831164709615997e-05, + "loss": 1.1652, + "step": 3444 + }, + { + "epoch": 1.0824351650077546, + "grad_norm": 0.8359375, + "learning_rate": 1.582862583306887e-05, + "loss": 1.2367, + "step": 3446 + }, + { + "epoch": 1.083063392033296, + "grad_norm": 0.76953125, + "learning_rate": 1.582608695652174e-05, + "loss": 1.178, + "step": 3448 + }, + { + "epoch": 1.0836916190588375, + "grad_norm": 0.82421875, + "learning_rate": 1.5823548079974615e-05, + "loss": 1.2211, + "step": 3450 + }, + { + "epoch": 1.0843198460843788, + "grad_norm": 0.9140625, + "learning_rate": 1.5821009203427486e-05, + "loss": 1.2025, + "step": 3452 + }, + { + "epoch": 1.0849480731099201, + "grad_norm": 0.83203125, + "learning_rate": 1.5818470326880357e-05, + "loss": 1.2151, + "step": 3454 + }, + { + "epoch": 1.0855763001354615, + "grad_norm": 0.79296875, + "learning_rate": 1.581593145033323e-05, + "loss": 1.2384, + "step": 3456 + }, + { + "epoch": 1.0862045271610028, + "grad_norm": 0.85546875, + "learning_rate": 1.58133925737861e-05, + "loss": 1.2462, + "step": 3458 + }, + { + "epoch": 1.0868327541865441, + "grad_norm": 0.78515625, + "learning_rate": 1.581085369723897e-05, + "loss": 1.1743, + "step": 3460 + }, + { + "epoch": 1.0874609812120855, + "grad_norm": 0.7890625, + "learning_rate": 1.5808314820691846e-05, + "loss": 1.2484, + "step": 3462 + }, + { + "epoch": 1.0880892082376268, + "grad_norm": 0.8203125, + "learning_rate": 1.5805775944144718e-05, + "loss": 1.1292, + "step": 3464 + }, + { + "epoch": 1.088717435263168, + "grad_norm": 0.7890625, + "learning_rate": 1.580323706759759e-05, + "loss": 1.3436, + "step": 3466 + }, + { + "epoch": 1.0893456622887097, + "grad_norm": 0.79296875, + "learning_rate": 1.580069819105046e-05, + "loss": 1.106, + "step": 3468 + }, + { + "epoch": 1.089973889314251, + "grad_norm": 0.7109375, + "learning_rate": 1.5798159314503335e-05, + "loss": 1.1988, + "step": 3470 + }, + { + "epoch": 1.0906021163397923, + "grad_norm": 0.84765625, + "learning_rate": 1.5795620437956207e-05, + "loss": 1.4247, + "step": 3472 + }, + { + "epoch": 1.0912303433653336, + "grad_norm": 0.79296875, + "learning_rate": 1.5793081561409078e-05, + "loss": 1.2297, + "step": 3474 + }, + { + "epoch": 1.091858570390875, + "grad_norm": 0.78125, + "learning_rate": 1.579054268486195e-05, + "loss": 1.1387, + "step": 3476 + }, + { + "epoch": 1.0924867974164163, + "grad_norm": 0.73046875, + "learning_rate": 1.5788003808314824e-05, + "loss": 1.3101, + "step": 3478 + }, + { + "epoch": 1.0931150244419576, + "grad_norm": 0.98046875, + "learning_rate": 1.5785464931767692e-05, + "loss": 1.2612, + "step": 3480 + }, + { + "epoch": 1.0937432514674992, + "grad_norm": 0.79296875, + "learning_rate": 1.5782926055220567e-05, + "loss": 1.1812, + "step": 3482 + }, + { + "epoch": 1.0943714784930405, + "grad_norm": 0.7265625, + "learning_rate": 1.5780387178673438e-05, + "loss": 1.2531, + "step": 3484 + }, + { + "epoch": 1.0949997055185818, + "grad_norm": 0.91796875, + "learning_rate": 1.577784830212631e-05, + "loss": 1.1636, + "step": 3486 + }, + { + "epoch": 1.0956279325441232, + "grad_norm": 0.75, + "learning_rate": 1.577530942557918e-05, + "loss": 1.2318, + "step": 3488 + }, + { + "epoch": 1.0962561595696645, + "grad_norm": 0.859375, + "learning_rate": 1.5772770549032056e-05, + "loss": 1.337, + "step": 3490 + }, + { + "epoch": 1.0968843865952058, + "grad_norm": 3.765625, + "learning_rate": 1.5770231672484927e-05, + "loss": 1.2689, + "step": 3492 + }, + { + "epoch": 1.0975126136207471, + "grad_norm": 0.80078125, + "learning_rate": 1.57676927959378e-05, + "loss": 1.2185, + "step": 3494 + }, + { + "epoch": 1.0981408406462885, + "grad_norm": 0.8359375, + "learning_rate": 1.576515391939067e-05, + "loss": 1.3053, + "step": 3496 + }, + { + "epoch": 1.0987690676718298, + "grad_norm": 0.76171875, + "learning_rate": 1.5762615042843545e-05, + "loss": 1.1346, + "step": 3498 + }, + { + "epoch": 1.0993972946973714, + "grad_norm": 0.7109375, + "learning_rate": 1.5760076166296413e-05, + "loss": 1.2331, + "step": 3500 + }, + { + "epoch": 1.1000255217229127, + "grad_norm": 0.74609375, + "learning_rate": 1.5757537289749287e-05, + "loss": 1.2669, + "step": 3502 + }, + { + "epoch": 1.100653748748454, + "grad_norm": 0.78515625, + "learning_rate": 1.575499841320216e-05, + "loss": 1.2222, + "step": 3504 + }, + { + "epoch": 1.1012819757739953, + "grad_norm": 0.8046875, + "learning_rate": 1.575245953665503e-05, + "loss": 1.1517, + "step": 3506 + }, + { + "epoch": 1.1019102027995367, + "grad_norm": 0.79296875, + "learning_rate": 1.57499206601079e-05, + "loss": 1.2206, + "step": 3508 + }, + { + "epoch": 1.102538429825078, + "grad_norm": 0.7890625, + "learning_rate": 1.5747381783560776e-05, + "loss": 1.2261, + "step": 3510 + }, + { + "epoch": 1.1031666568506193, + "grad_norm": 2.125, + "learning_rate": 1.5744842907013648e-05, + "loss": 1.0953, + "step": 3512 + }, + { + "epoch": 1.1037948838761606, + "grad_norm": 0.83984375, + "learning_rate": 1.574230403046652e-05, + "loss": 1.3215, + "step": 3514 + }, + { + "epoch": 1.1044231109017022, + "grad_norm": 0.79296875, + "learning_rate": 1.5739765153919394e-05, + "loss": 1.3369, + "step": 3516 + }, + { + "epoch": 1.1050513379272435, + "grad_norm": 0.84375, + "learning_rate": 1.5737226277372265e-05, + "loss": 1.2915, + "step": 3518 + }, + { + "epoch": 1.1056795649527849, + "grad_norm": 0.85546875, + "learning_rate": 1.5734687400825137e-05, + "loss": 1.3005, + "step": 3520 + }, + { + "epoch": 1.1063077919783262, + "grad_norm": 0.83984375, + "learning_rate": 1.5732148524278008e-05, + "loss": 1.2166, + "step": 3522 + }, + { + "epoch": 1.1069360190038675, + "grad_norm": 0.86328125, + "learning_rate": 1.5729609647730883e-05, + "loss": 1.1558, + "step": 3524 + }, + { + "epoch": 1.1075642460294088, + "grad_norm": 0.77734375, + "learning_rate": 1.572707077118375e-05, + "loss": 1.2904, + "step": 3526 + }, + { + "epoch": 1.1081924730549502, + "grad_norm": 0.859375, + "learning_rate": 1.5724531894636626e-05, + "loss": 1.1995, + "step": 3528 + }, + { + "epoch": 1.1088207000804915, + "grad_norm": 0.78515625, + "learning_rate": 1.5721993018089497e-05, + "loss": 1.3063, + "step": 3530 + }, + { + "epoch": 1.109448927106033, + "grad_norm": 0.85546875, + "learning_rate": 1.571945414154237e-05, + "loss": 1.3033, + "step": 3532 + }, + { + "epoch": 1.1100771541315744, + "grad_norm": 0.81640625, + "learning_rate": 1.571691526499524e-05, + "loss": 1.2816, + "step": 3534 + }, + { + "epoch": 1.1107053811571157, + "grad_norm": 0.81640625, + "learning_rate": 1.5714376388448114e-05, + "loss": 1.3709, + "step": 3536 + }, + { + "epoch": 1.111333608182657, + "grad_norm": 0.8125, + "learning_rate": 1.5711837511900986e-05, + "loss": 1.1248, + "step": 3538 + }, + { + "epoch": 1.1119618352081984, + "grad_norm": 0.81640625, + "learning_rate": 1.5709298635353857e-05, + "loss": 1.2373, + "step": 3540 + }, + { + "epoch": 1.1125900622337397, + "grad_norm": 0.8671875, + "learning_rate": 1.570675975880673e-05, + "loss": 1.3827, + "step": 3542 + }, + { + "epoch": 1.113218289259281, + "grad_norm": 0.90234375, + "learning_rate": 1.5704220882259603e-05, + "loss": 1.2102, + "step": 3544 + }, + { + "epoch": 1.1138465162848223, + "grad_norm": 0.80859375, + "learning_rate": 1.570168200571247e-05, + "loss": 1.1883, + "step": 3546 + }, + { + "epoch": 1.114474743310364, + "grad_norm": 0.828125, + "learning_rate": 1.5699143129165346e-05, + "loss": 1.1861, + "step": 3548 + }, + { + "epoch": 1.1151029703359052, + "grad_norm": 0.80078125, + "learning_rate": 1.5696604252618218e-05, + "loss": 1.3095, + "step": 3550 + }, + { + "epoch": 1.1157311973614465, + "grad_norm": 0.76171875, + "learning_rate": 1.569406537607109e-05, + "loss": 1.1794, + "step": 3552 + }, + { + "epoch": 1.1163594243869879, + "grad_norm": 0.84765625, + "learning_rate": 1.569152649952396e-05, + "loss": 1.2195, + "step": 3554 + }, + { + "epoch": 1.1169876514125292, + "grad_norm": 0.7578125, + "learning_rate": 1.5688987622976835e-05, + "loss": 1.3026, + "step": 3556 + }, + { + "epoch": 1.1176158784380705, + "grad_norm": 0.859375, + "learning_rate": 1.5686448746429706e-05, + "loss": 1.308, + "step": 3558 + }, + { + "epoch": 1.1182441054636119, + "grad_norm": 0.734375, + "learning_rate": 1.5683909869882578e-05, + "loss": 1.192, + "step": 3560 + }, + { + "epoch": 1.1188723324891532, + "grad_norm": 0.76953125, + "learning_rate": 1.568137099333545e-05, + "loss": 1.1283, + "step": 3562 + }, + { + "epoch": 1.1195005595146945, + "grad_norm": 0.765625, + "learning_rate": 1.5678832116788324e-05, + "loss": 1.2358, + "step": 3564 + }, + { + "epoch": 1.120128786540236, + "grad_norm": 0.77734375, + "learning_rate": 1.5676293240241195e-05, + "loss": 1.1856, + "step": 3566 + }, + { + "epoch": 1.1207570135657774, + "grad_norm": 0.828125, + "learning_rate": 1.5673754363694067e-05, + "loss": 1.2214, + "step": 3568 + }, + { + "epoch": 1.1213852405913187, + "grad_norm": 0.796875, + "learning_rate": 1.5671215487146938e-05, + "loss": 1.1597, + "step": 3570 + }, + { + "epoch": 1.12201346761686, + "grad_norm": 0.796875, + "learning_rate": 1.566867661059981e-05, + "loss": 1.1691, + "step": 3572 + }, + { + "epoch": 1.1226416946424014, + "grad_norm": 0.78125, + "learning_rate": 1.566613773405268e-05, + "loss": 1.3458, + "step": 3574 + }, + { + "epoch": 1.1232699216679427, + "grad_norm": 0.7890625, + "learning_rate": 1.5663598857505556e-05, + "loss": 1.0832, + "step": 3576 + }, + { + "epoch": 1.123898148693484, + "grad_norm": 0.93359375, + "learning_rate": 1.5661059980958427e-05, + "loss": 1.2571, + "step": 3578 + }, + { + "epoch": 1.1245263757190254, + "grad_norm": 0.78515625, + "learning_rate": 1.56585211044113e-05, + "loss": 1.3284, + "step": 3580 + }, + { + "epoch": 1.125154602744567, + "grad_norm": 0.8984375, + "learning_rate": 1.565598222786417e-05, + "loss": 1.1614, + "step": 3582 + }, + { + "epoch": 1.1257828297701082, + "grad_norm": 0.78125, + "learning_rate": 1.5653443351317045e-05, + "loss": 1.208, + "step": 3584 + }, + { + "epoch": 1.1264110567956496, + "grad_norm": 0.75, + "learning_rate": 1.5650904474769916e-05, + "loss": 1.1761, + "step": 3586 + }, + { + "epoch": 1.127039283821191, + "grad_norm": 0.734375, + "learning_rate": 1.5648365598222787e-05, + "loss": 1.3036, + "step": 3588 + }, + { + "epoch": 1.1276675108467322, + "grad_norm": 0.765625, + "learning_rate": 1.564582672167566e-05, + "loss": 1.1291, + "step": 3590 + }, + { + "epoch": 1.1282957378722736, + "grad_norm": 0.8125, + "learning_rate": 1.5643287845128534e-05, + "loss": 1.2135, + "step": 3592 + }, + { + "epoch": 1.1289239648978149, + "grad_norm": 0.7421875, + "learning_rate": 1.56407489685814e-05, + "loss": 1.2834, + "step": 3594 + }, + { + "epoch": 1.1295521919233562, + "grad_norm": 0.828125, + "learning_rate": 1.5638210092034276e-05, + "loss": 1.328, + "step": 3596 + }, + { + "epoch": 1.1301804189488975, + "grad_norm": 0.79296875, + "learning_rate": 1.5635671215487148e-05, + "loss": 1.346, + "step": 3598 + }, + { + "epoch": 1.130808645974439, + "grad_norm": 0.74609375, + "learning_rate": 1.563313233894002e-05, + "loss": 1.0939, + "step": 3600 + }, + { + "epoch": 1.1314368729999804, + "grad_norm": 0.8046875, + "learning_rate": 1.5630593462392894e-05, + "loss": 1.243, + "step": 3602 + }, + { + "epoch": 1.1320651000255217, + "grad_norm": 0.84765625, + "learning_rate": 1.5628054585845765e-05, + "loss": 1.2313, + "step": 3604 + }, + { + "epoch": 1.132693327051063, + "grad_norm": 0.81640625, + "learning_rate": 1.5625515709298637e-05, + "loss": 1.2229, + "step": 3606 + }, + { + "epoch": 1.1333215540766044, + "grad_norm": 0.890625, + "learning_rate": 1.5622976832751508e-05, + "loss": 1.3329, + "step": 3608 + }, + { + "epoch": 1.1339497811021457, + "grad_norm": 0.96484375, + "learning_rate": 1.5620437956204383e-05, + "loss": 1.1156, + "step": 3610 + }, + { + "epoch": 1.134578008127687, + "grad_norm": 0.828125, + "learning_rate": 1.5617899079657254e-05, + "loss": 1.1354, + "step": 3612 + }, + { + "epoch": 1.1352062351532286, + "grad_norm": 0.85546875, + "learning_rate": 1.5615360203110125e-05, + "loss": 1.2974, + "step": 3614 + }, + { + "epoch": 1.13583446217877, + "grad_norm": 0.76953125, + "learning_rate": 1.5612821326562997e-05, + "loss": 1.3404, + "step": 3616 + }, + { + "epoch": 1.1364626892043113, + "grad_norm": 0.84765625, + "learning_rate": 1.561028245001587e-05, + "loss": 1.2677, + "step": 3618 + }, + { + "epoch": 1.1370909162298526, + "grad_norm": 0.82421875, + "learning_rate": 1.560774357346874e-05, + "loss": 1.1016, + "step": 3620 + }, + { + "epoch": 1.137719143255394, + "grad_norm": 0.703125, + "learning_rate": 1.5605204696921614e-05, + "loss": 1.2943, + "step": 3622 + }, + { + "epoch": 1.1383473702809352, + "grad_norm": 0.796875, + "learning_rate": 1.5602665820374486e-05, + "loss": 1.196, + "step": 3624 + }, + { + "epoch": 1.1389755973064766, + "grad_norm": 0.79296875, + "learning_rate": 1.5600126943827357e-05, + "loss": 1.1846, + "step": 3626 + }, + { + "epoch": 1.139603824332018, + "grad_norm": 0.8046875, + "learning_rate": 1.559758806728023e-05, + "loss": 1.2626, + "step": 3628 + }, + { + "epoch": 1.1402320513575592, + "grad_norm": 0.80078125, + "learning_rate": 1.5595049190733103e-05, + "loss": 1.3086, + "step": 3630 + }, + { + "epoch": 1.1408602783831008, + "grad_norm": 0.9453125, + "learning_rate": 1.5592510314185975e-05, + "loss": 1.2576, + "step": 3632 + }, + { + "epoch": 1.141488505408642, + "grad_norm": 0.8515625, + "learning_rate": 1.5589971437638846e-05, + "loss": 1.3336, + "step": 3634 + }, + { + "epoch": 1.1421167324341834, + "grad_norm": 0.71484375, + "learning_rate": 1.5587432561091717e-05, + "loss": 1.1999, + "step": 3636 + }, + { + "epoch": 1.1427449594597248, + "grad_norm": 0.90625, + "learning_rate": 1.5584893684544592e-05, + "loss": 1.2824, + "step": 3638 + }, + { + "epoch": 1.143373186485266, + "grad_norm": 0.8125, + "learning_rate": 1.558235480799746e-05, + "loss": 1.3522, + "step": 3640 + }, + { + "epoch": 1.1440014135108074, + "grad_norm": 0.96484375, + "learning_rate": 1.5579815931450335e-05, + "loss": 1.2944, + "step": 3642 + }, + { + "epoch": 1.1446296405363487, + "grad_norm": 0.77734375, + "learning_rate": 1.5577277054903206e-05, + "loss": 1.2327, + "step": 3644 + }, + { + "epoch": 1.1452578675618903, + "grad_norm": 0.796875, + "learning_rate": 1.5574738178356078e-05, + "loss": 1.3088, + "step": 3646 + }, + { + "epoch": 1.1458860945874316, + "grad_norm": 0.8046875, + "learning_rate": 1.557219930180895e-05, + "loss": 1.2625, + "step": 3648 + }, + { + "epoch": 1.146514321612973, + "grad_norm": 0.79296875, + "learning_rate": 1.5569660425261824e-05, + "loss": 1.3164, + "step": 3650 + }, + { + "epoch": 1.1471425486385143, + "grad_norm": 0.875, + "learning_rate": 1.5567121548714695e-05, + "loss": 1.1474, + "step": 3652 + }, + { + "epoch": 1.1477707756640556, + "grad_norm": 0.76953125, + "learning_rate": 1.5564582672167567e-05, + "loss": 1.3335, + "step": 3654 + }, + { + "epoch": 1.148399002689597, + "grad_norm": 0.74609375, + "learning_rate": 1.5562043795620438e-05, + "loss": 1.2914, + "step": 3656 + }, + { + "epoch": 1.1490272297151383, + "grad_norm": 0.8125, + "learning_rate": 1.5559504919073313e-05, + "loss": 1.2283, + "step": 3658 + }, + { + "epoch": 1.1496554567406796, + "grad_norm": 0.8203125, + "learning_rate": 1.5556966042526184e-05, + "loss": 1.2828, + "step": 3660 + }, + { + "epoch": 1.150283683766221, + "grad_norm": 0.96875, + "learning_rate": 1.5554427165979056e-05, + "loss": 1.2971, + "step": 3662 + }, + { + "epoch": 1.1509119107917622, + "grad_norm": 0.80859375, + "learning_rate": 1.5551888289431927e-05, + "loss": 1.1939, + "step": 3664 + }, + { + "epoch": 1.1515401378173038, + "grad_norm": 0.72265625, + "learning_rate": 1.55493494128848e-05, + "loss": 1.2101, + "step": 3666 + }, + { + "epoch": 1.1521683648428451, + "grad_norm": 0.79296875, + "learning_rate": 1.554681053633767e-05, + "loss": 1.3979, + "step": 3668 + }, + { + "epoch": 1.1527965918683865, + "grad_norm": 0.84375, + "learning_rate": 1.5544271659790545e-05, + "loss": 1.1002, + "step": 3670 + }, + { + "epoch": 1.1534248188939278, + "grad_norm": 0.80859375, + "learning_rate": 1.5541732783243416e-05, + "loss": 1.2375, + "step": 3672 + }, + { + "epoch": 1.1540530459194691, + "grad_norm": 0.71875, + "learning_rate": 1.5539193906696287e-05, + "loss": 1.315, + "step": 3674 + }, + { + "epoch": 1.1546812729450104, + "grad_norm": 0.74609375, + "learning_rate": 1.553665503014916e-05, + "loss": 1.3209, + "step": 3676 + }, + { + "epoch": 1.1553094999705518, + "grad_norm": 0.7265625, + "learning_rate": 1.5534116153602033e-05, + "loss": 1.2149, + "step": 3678 + }, + { + "epoch": 1.1559377269960933, + "grad_norm": 0.73828125, + "learning_rate": 1.5531577277054905e-05, + "loss": 1.2292, + "step": 3680 + }, + { + "epoch": 1.1565659540216346, + "grad_norm": 0.79296875, + "learning_rate": 1.5529038400507776e-05, + "loss": 1.148, + "step": 3682 + }, + { + "epoch": 1.157194181047176, + "grad_norm": 0.73828125, + "learning_rate": 1.552649952396065e-05, + "loss": 1.2629, + "step": 3684 + }, + { + "epoch": 1.1578224080727173, + "grad_norm": 0.7734375, + "learning_rate": 1.5523960647413522e-05, + "loss": 1.13, + "step": 3686 + }, + { + "epoch": 1.1584506350982586, + "grad_norm": 0.7578125, + "learning_rate": 1.5521421770866394e-05, + "loss": 1.2941, + "step": 3688 + }, + { + "epoch": 1.1590788621238, + "grad_norm": 0.8046875, + "learning_rate": 1.5518882894319265e-05, + "loss": 1.2437, + "step": 3690 + }, + { + "epoch": 1.1597070891493413, + "grad_norm": 0.84765625, + "learning_rate": 1.5516344017772136e-05, + "loss": 1.3109, + "step": 3692 + }, + { + "epoch": 1.1603353161748826, + "grad_norm": 0.8125, + "learning_rate": 1.5513805141225008e-05, + "loss": 1.0916, + "step": 3694 + }, + { + "epoch": 1.160963543200424, + "grad_norm": 0.84375, + "learning_rate": 1.5511266264677883e-05, + "loss": 1.1996, + "step": 3696 + }, + { + "epoch": 1.1615917702259655, + "grad_norm": 0.94140625, + "learning_rate": 1.5508727388130754e-05, + "loss": 1.2492, + "step": 3698 + }, + { + "epoch": 1.1622199972515068, + "grad_norm": 0.77734375, + "learning_rate": 1.5506188511583625e-05, + "loss": 1.2623, + "step": 3700 + }, + { + "epoch": 1.1628482242770481, + "grad_norm": 0.83203125, + "learning_rate": 1.5503649635036497e-05, + "loss": 1.4083, + "step": 3702 + }, + { + "epoch": 1.1634764513025895, + "grad_norm": 0.78515625, + "learning_rate": 1.550111075848937e-05, + "loss": 1.2405, + "step": 3704 + }, + { + "epoch": 1.1641046783281308, + "grad_norm": 0.86328125, + "learning_rate": 1.5498571881942243e-05, + "loss": 1.0913, + "step": 3706 + }, + { + "epoch": 1.1647329053536721, + "grad_norm": 0.83984375, + "learning_rate": 1.5496033005395114e-05, + "loss": 1.1841, + "step": 3708 + }, + { + "epoch": 1.1653611323792135, + "grad_norm": 0.77734375, + "learning_rate": 1.5493494128847986e-05, + "loss": 1.3798, + "step": 3710 + }, + { + "epoch": 1.165989359404755, + "grad_norm": 0.765625, + "learning_rate": 1.549095525230086e-05, + "loss": 1.3218, + "step": 3712 + }, + { + "epoch": 1.1666175864302963, + "grad_norm": 1.0, + "learning_rate": 1.548841637575373e-05, + "loss": 1.3271, + "step": 3714 + }, + { + "epoch": 1.1672458134558377, + "grad_norm": 0.796875, + "learning_rate": 1.5485877499206603e-05, + "loss": 1.3283, + "step": 3716 + }, + { + "epoch": 1.167874040481379, + "grad_norm": 0.88671875, + "learning_rate": 1.5483338622659475e-05, + "loss": 1.1618, + "step": 3718 + }, + { + "epoch": 1.1685022675069203, + "grad_norm": 0.83984375, + "learning_rate": 1.5480799746112346e-05, + "loss": 1.2939, + "step": 3720 + }, + { + "epoch": 1.1691304945324617, + "grad_norm": 0.8203125, + "learning_rate": 1.5478260869565217e-05, + "loss": 1.2001, + "step": 3722 + }, + { + "epoch": 1.169758721558003, + "grad_norm": 0.8125, + "learning_rate": 1.5475721993018092e-05, + "loss": 1.2964, + "step": 3724 + }, + { + "epoch": 1.1703869485835443, + "grad_norm": 0.75390625, + "learning_rate": 1.5473183116470964e-05, + "loss": 1.2945, + "step": 3726 + }, + { + "epoch": 1.1710151756090856, + "grad_norm": 0.7578125, + "learning_rate": 1.5470644239923835e-05, + "loss": 1.3001, + "step": 3728 + }, + { + "epoch": 1.1716434026346272, + "grad_norm": 0.78515625, + "learning_rate": 1.5468105363376706e-05, + "loss": 1.0717, + "step": 3730 + }, + { + "epoch": 1.1722716296601685, + "grad_norm": 0.890625, + "learning_rate": 1.546556648682958e-05, + "loss": 1.1697, + "step": 3732 + }, + { + "epoch": 1.1728998566857098, + "grad_norm": 0.91796875, + "learning_rate": 1.546302761028245e-05, + "loss": 1.1945, + "step": 3734 + }, + { + "epoch": 1.1735280837112512, + "grad_norm": 0.765625, + "learning_rate": 1.5460488733735324e-05, + "loss": 1.1745, + "step": 3736 + }, + { + "epoch": 1.1741563107367925, + "grad_norm": 0.83203125, + "learning_rate": 1.5457949857188195e-05, + "loss": 1.2856, + "step": 3738 + }, + { + "epoch": 1.1747845377623338, + "grad_norm": 0.76953125, + "learning_rate": 1.5455410980641067e-05, + "loss": 1.2289, + "step": 3740 + }, + { + "epoch": 1.1754127647878752, + "grad_norm": 0.8671875, + "learning_rate": 1.5452872104093938e-05, + "loss": 1.1443, + "step": 3742 + }, + { + "epoch": 1.1760409918134165, + "grad_norm": 0.81640625, + "learning_rate": 1.5450333227546813e-05, + "loss": 1.2261, + "step": 3744 + }, + { + "epoch": 1.176669218838958, + "grad_norm": 0.77734375, + "learning_rate": 1.5447794350999684e-05, + "loss": 1.2703, + "step": 3746 + }, + { + "epoch": 1.1772974458644994, + "grad_norm": 0.73828125, + "learning_rate": 1.5445255474452556e-05, + "loss": 1.2568, + "step": 3748 + }, + { + "epoch": 1.1779256728900407, + "grad_norm": 0.7265625, + "learning_rate": 1.5442716597905427e-05, + "loss": 1.2741, + "step": 3750 + }, + { + "epoch": 1.178553899915582, + "grad_norm": 0.796875, + "learning_rate": 1.54401777213583e-05, + "loss": 1.2063, + "step": 3752 + }, + { + "epoch": 1.1791821269411233, + "grad_norm": 0.75390625, + "learning_rate": 1.5437638844811173e-05, + "loss": 1.2187, + "step": 3754 + }, + { + "epoch": 1.1798103539666647, + "grad_norm": 0.77734375, + "learning_rate": 1.5435099968264044e-05, + "loss": 1.2115, + "step": 3756 + }, + { + "epoch": 1.180438580992206, + "grad_norm": 0.84375, + "learning_rate": 1.5432561091716916e-05, + "loss": 1.2467, + "step": 3758 + }, + { + "epoch": 1.1810668080177473, + "grad_norm": 0.76953125, + "learning_rate": 1.5430022215169787e-05, + "loss": 1.3909, + "step": 3760 + }, + { + "epoch": 1.1816950350432887, + "grad_norm": 0.8359375, + "learning_rate": 1.542748333862266e-05, + "loss": 1.3045, + "step": 3762 + }, + { + "epoch": 1.1823232620688302, + "grad_norm": 0.76953125, + "learning_rate": 1.5424944462075533e-05, + "loss": 1.2189, + "step": 3764 + }, + { + "epoch": 1.1829514890943715, + "grad_norm": 0.80078125, + "learning_rate": 1.5422405585528405e-05, + "loss": 1.0881, + "step": 3766 + }, + { + "epoch": 1.1835797161199129, + "grad_norm": 0.74609375, + "learning_rate": 1.5419866708981276e-05, + "loss": 1.2768, + "step": 3768 + }, + { + "epoch": 1.1842079431454542, + "grad_norm": 0.84765625, + "learning_rate": 1.541732783243415e-05, + "loss": 1.3732, + "step": 3770 + }, + { + "epoch": 1.1848361701709955, + "grad_norm": 0.8203125, + "learning_rate": 1.5414788955887022e-05, + "loss": 1.3118, + "step": 3772 + }, + { + "epoch": 1.1854643971965368, + "grad_norm": 0.8359375, + "learning_rate": 1.5412250079339894e-05, + "loss": 1.3224, + "step": 3774 + }, + { + "epoch": 1.1860926242220782, + "grad_norm": 0.828125, + "learning_rate": 1.5409711202792765e-05, + "loss": 1.1572, + "step": 3776 + }, + { + "epoch": 1.1867208512476197, + "grad_norm": 0.73828125, + "learning_rate": 1.540717232624564e-05, + "loss": 1.2863, + "step": 3778 + }, + { + "epoch": 1.187349078273161, + "grad_norm": 0.76171875, + "learning_rate": 1.5404633449698508e-05, + "loss": 1.1624, + "step": 3780 + }, + { + "epoch": 1.1879773052987024, + "grad_norm": 0.78515625, + "learning_rate": 1.5402094573151383e-05, + "loss": 1.3295, + "step": 3782 + }, + { + "epoch": 1.1886055323242437, + "grad_norm": 0.79296875, + "learning_rate": 1.5399555696604254e-05, + "loss": 1.3428, + "step": 3784 + }, + { + "epoch": 1.189233759349785, + "grad_norm": 0.8046875, + "learning_rate": 1.5397016820057125e-05, + "loss": 1.2549, + "step": 3786 + }, + { + "epoch": 1.1898619863753264, + "grad_norm": 0.734375, + "learning_rate": 1.5394477943509997e-05, + "loss": 1.2031, + "step": 3788 + }, + { + "epoch": 1.1904902134008677, + "grad_norm": 0.74609375, + "learning_rate": 1.539193906696287e-05, + "loss": 1.13, + "step": 3790 + }, + { + "epoch": 1.191118440426409, + "grad_norm": 0.82421875, + "learning_rate": 1.5389400190415743e-05, + "loss": 1.2701, + "step": 3792 + }, + { + "epoch": 1.1917466674519503, + "grad_norm": 0.828125, + "learning_rate": 1.5386861313868614e-05, + "loss": 1.208, + "step": 3794 + }, + { + "epoch": 1.192374894477492, + "grad_norm": 0.80078125, + "learning_rate": 1.5384322437321486e-05, + "loss": 1.2884, + "step": 3796 + }, + { + "epoch": 1.1930031215030332, + "grad_norm": 0.78125, + "learning_rate": 1.538178356077436e-05, + "loss": 1.1525, + "step": 3798 + }, + { + "epoch": 1.1936313485285746, + "grad_norm": 0.8515625, + "learning_rate": 1.5379244684227232e-05, + "loss": 1.3432, + "step": 3800 + }, + { + "epoch": 1.1942595755541159, + "grad_norm": 0.73828125, + "learning_rate": 1.5376705807680103e-05, + "loss": 1.1941, + "step": 3802 + }, + { + "epoch": 1.1948878025796572, + "grad_norm": 0.76953125, + "learning_rate": 1.5374166931132975e-05, + "loss": 1.1627, + "step": 3804 + }, + { + "epoch": 1.1955160296051985, + "grad_norm": 0.83984375, + "learning_rate": 1.5371628054585846e-05, + "loss": 1.1927, + "step": 3806 + }, + { + "epoch": 1.1961442566307399, + "grad_norm": 0.73046875, + "learning_rate": 1.5369089178038717e-05, + "loss": 1.2496, + "step": 3808 + }, + { + "epoch": 1.1967724836562812, + "grad_norm": 0.7578125, + "learning_rate": 1.5366550301491592e-05, + "loss": 1.3009, + "step": 3810 + }, + { + "epoch": 1.1974007106818227, + "grad_norm": 0.76953125, + "learning_rate": 1.5364011424944463e-05, + "loss": 1.2321, + "step": 3812 + }, + { + "epoch": 1.198028937707364, + "grad_norm": 0.78515625, + "learning_rate": 1.5361472548397335e-05, + "loss": 1.3292, + "step": 3814 + }, + { + "epoch": 1.1986571647329054, + "grad_norm": 0.8125, + "learning_rate": 1.5358933671850206e-05, + "loss": 1.1956, + "step": 3816 + }, + { + "epoch": 1.1992853917584467, + "grad_norm": 0.77734375, + "learning_rate": 1.535639479530308e-05, + "loss": 1.1833, + "step": 3818 + }, + { + "epoch": 1.199913618783988, + "grad_norm": 0.828125, + "learning_rate": 1.5353855918755952e-05, + "loss": 1.3083, + "step": 3820 + }, + { + "epoch": 1.2005418458095294, + "grad_norm": 0.84375, + "learning_rate": 1.5351317042208824e-05, + "loss": 1.2926, + "step": 3822 + }, + { + "epoch": 1.2011700728350707, + "grad_norm": 0.859375, + "learning_rate": 1.5348778165661695e-05, + "loss": 1.2154, + "step": 3824 + }, + { + "epoch": 1.201798299860612, + "grad_norm": 0.828125, + "learning_rate": 1.534623928911457e-05, + "loss": 1.2833, + "step": 3826 + }, + { + "epoch": 1.2024265268861534, + "grad_norm": 0.84765625, + "learning_rate": 1.5343700412567438e-05, + "loss": 1.2619, + "step": 3828 + }, + { + "epoch": 1.203054753911695, + "grad_norm": 0.89453125, + "learning_rate": 1.5341161536020313e-05, + "loss": 1.1693, + "step": 3830 + }, + { + "epoch": 1.2036829809372362, + "grad_norm": 0.82421875, + "learning_rate": 1.5338622659473184e-05, + "loss": 1.388, + "step": 3832 + }, + { + "epoch": 1.2043112079627776, + "grad_norm": 0.73828125, + "learning_rate": 1.5336083782926055e-05, + "loss": 1.1663, + "step": 3834 + }, + { + "epoch": 1.204939434988319, + "grad_norm": 0.80078125, + "learning_rate": 1.5333544906378927e-05, + "loss": 1.2422, + "step": 3836 + }, + { + "epoch": 1.2055676620138602, + "grad_norm": 0.76953125, + "learning_rate": 1.53310060298318e-05, + "loss": 1.3582, + "step": 3838 + }, + { + "epoch": 1.2061958890394016, + "grad_norm": 0.75, + "learning_rate": 1.5328467153284673e-05, + "loss": 1.279, + "step": 3840 + }, + { + "epoch": 1.2068241160649429, + "grad_norm": 0.796875, + "learning_rate": 1.5325928276737544e-05, + "loss": 1.1224, + "step": 3842 + }, + { + "epoch": 1.2074523430904844, + "grad_norm": 0.75, + "learning_rate": 1.5323389400190416e-05, + "loss": 1.164, + "step": 3844 + }, + { + "epoch": 1.2080805701160258, + "grad_norm": 0.76171875, + "learning_rate": 1.532085052364329e-05, + "loss": 1.2088, + "step": 3846 + }, + { + "epoch": 1.208708797141567, + "grad_norm": 0.80078125, + "learning_rate": 1.531831164709616e-05, + "loss": 1.2329, + "step": 3848 + }, + { + "epoch": 1.2093370241671084, + "grad_norm": 0.7890625, + "learning_rate": 1.5315772770549033e-05, + "loss": 1.251, + "step": 3850 + }, + { + "epoch": 1.2099652511926497, + "grad_norm": 0.88671875, + "learning_rate": 1.5313233894001908e-05, + "loss": 1.2042, + "step": 3852 + }, + { + "epoch": 1.210593478218191, + "grad_norm": 0.8046875, + "learning_rate": 1.5310695017454776e-05, + "loss": 1.143, + "step": 3854 + }, + { + "epoch": 1.2112217052437324, + "grad_norm": 0.7734375, + "learning_rate": 1.530815614090765e-05, + "loss": 1.319, + "step": 3856 + }, + { + "epoch": 1.2118499322692737, + "grad_norm": 0.80078125, + "learning_rate": 1.5305617264360522e-05, + "loss": 1.2545, + "step": 3858 + }, + { + "epoch": 1.212478159294815, + "grad_norm": 0.75390625, + "learning_rate": 1.5303078387813394e-05, + "loss": 1.2631, + "step": 3860 + }, + { + "epoch": 1.2131063863203566, + "grad_norm": 0.84375, + "learning_rate": 1.5300539511266265e-05, + "loss": 1.2877, + "step": 3862 + }, + { + "epoch": 1.213734613345898, + "grad_norm": 0.76953125, + "learning_rate": 1.529800063471914e-05, + "loss": 1.2363, + "step": 3864 + }, + { + "epoch": 1.2143628403714393, + "grad_norm": 0.87890625, + "learning_rate": 1.529546175817201e-05, + "loss": 1.1702, + "step": 3866 + }, + { + "epoch": 1.2149910673969806, + "grad_norm": 0.7734375, + "learning_rate": 1.5292922881624882e-05, + "loss": 1.25, + "step": 3868 + }, + { + "epoch": 1.215619294422522, + "grad_norm": 0.83984375, + "learning_rate": 1.5290384005077754e-05, + "loss": 1.3495, + "step": 3870 + }, + { + "epoch": 1.2162475214480633, + "grad_norm": 0.7734375, + "learning_rate": 1.528784512853063e-05, + "loss": 1.2843, + "step": 3872 + }, + { + "epoch": 1.2168757484736046, + "grad_norm": 0.7109375, + "learning_rate": 1.5285306251983497e-05, + "loss": 1.2542, + "step": 3874 + }, + { + "epoch": 1.217503975499146, + "grad_norm": 0.74609375, + "learning_rate": 1.528276737543637e-05, + "loss": 1.3101, + "step": 3876 + }, + { + "epoch": 1.2181322025246875, + "grad_norm": 0.7734375, + "learning_rate": 1.5280228498889243e-05, + "loss": 1.28, + "step": 3878 + }, + { + "epoch": 1.2187604295502288, + "grad_norm": 0.7734375, + "learning_rate": 1.5277689622342114e-05, + "loss": 1.2766, + "step": 3880 + }, + { + "epoch": 1.2193886565757701, + "grad_norm": 0.796875, + "learning_rate": 1.5275150745794986e-05, + "loss": 1.236, + "step": 3882 + }, + { + "epoch": 1.2200168836013114, + "grad_norm": 0.79296875, + "learning_rate": 1.527261186924786e-05, + "loss": 1.1882, + "step": 3884 + }, + { + "epoch": 1.2206451106268528, + "grad_norm": 0.82421875, + "learning_rate": 1.5270072992700732e-05, + "loss": 1.1035, + "step": 3886 + }, + { + "epoch": 1.221273337652394, + "grad_norm": 0.796875, + "learning_rate": 1.5267534116153603e-05, + "loss": 1.1533, + "step": 3888 + }, + { + "epoch": 1.2219015646779354, + "grad_norm": 0.8125, + "learning_rate": 1.5264995239606474e-05, + "loss": 1.1649, + "step": 3890 + }, + { + "epoch": 1.2225297917034768, + "grad_norm": 0.875, + "learning_rate": 1.526245636305935e-05, + "loss": 1.2608, + "step": 3892 + }, + { + "epoch": 1.223158018729018, + "grad_norm": 0.83984375, + "learning_rate": 1.525991748651222e-05, + "loss": 1.4731, + "step": 3894 + }, + { + "epoch": 1.2237862457545596, + "grad_norm": 0.7578125, + "learning_rate": 1.5257378609965092e-05, + "loss": 1.276, + "step": 3896 + }, + { + "epoch": 1.224414472780101, + "grad_norm": 0.7265625, + "learning_rate": 1.5254839733417963e-05, + "loss": 1.1836, + "step": 3898 + }, + { + "epoch": 1.2250426998056423, + "grad_norm": 0.8515625, + "learning_rate": 1.5252300856870836e-05, + "loss": 1.1804, + "step": 3900 + }, + { + "epoch": 1.2256709268311836, + "grad_norm": 0.765625, + "learning_rate": 1.5249761980323708e-05, + "loss": 1.3125, + "step": 3902 + }, + { + "epoch": 1.226299153856725, + "grad_norm": 0.7421875, + "learning_rate": 1.5247223103776581e-05, + "loss": 1.2143, + "step": 3904 + }, + { + "epoch": 1.2269273808822663, + "grad_norm": 0.80859375, + "learning_rate": 1.5244684227229452e-05, + "loss": 1.2005, + "step": 3906 + }, + { + "epoch": 1.2275556079078076, + "grad_norm": 0.8359375, + "learning_rate": 1.5242145350682325e-05, + "loss": 1.2187, + "step": 3908 + }, + { + "epoch": 1.2281838349333492, + "grad_norm": 0.734375, + "learning_rate": 1.5239606474135195e-05, + "loss": 1.3649, + "step": 3910 + }, + { + "epoch": 1.2288120619588905, + "grad_norm": 0.85546875, + "learning_rate": 1.5237067597588068e-05, + "loss": 1.2188, + "step": 3912 + }, + { + "epoch": 1.2294402889844318, + "grad_norm": 0.80078125, + "learning_rate": 1.523452872104094e-05, + "loss": 1.1949, + "step": 3914 + }, + { + "epoch": 1.2300685160099731, + "grad_norm": 0.796875, + "learning_rate": 1.5231989844493813e-05, + "loss": 1.2312, + "step": 3916 + }, + { + "epoch": 1.2306967430355145, + "grad_norm": 0.765625, + "learning_rate": 1.5229450967946684e-05, + "loss": 1.1944, + "step": 3918 + }, + { + "epoch": 1.2313249700610558, + "grad_norm": 0.81640625, + "learning_rate": 1.5226912091399557e-05, + "loss": 1.2278, + "step": 3920 + }, + { + "epoch": 1.2319531970865971, + "grad_norm": 0.7578125, + "learning_rate": 1.5224373214852428e-05, + "loss": 1.2428, + "step": 3922 + }, + { + "epoch": 1.2325814241121384, + "grad_norm": 0.73828125, + "learning_rate": 1.5221834338305302e-05, + "loss": 1.3336, + "step": 3924 + }, + { + "epoch": 1.2332096511376798, + "grad_norm": 0.76171875, + "learning_rate": 1.5219295461758173e-05, + "loss": 1.3555, + "step": 3926 + }, + { + "epoch": 1.2338378781632213, + "grad_norm": 0.875, + "learning_rate": 1.5216756585211046e-05, + "loss": 1.1514, + "step": 3928 + }, + { + "epoch": 1.2344661051887627, + "grad_norm": 0.70703125, + "learning_rate": 1.5214217708663916e-05, + "loss": 1.3315, + "step": 3930 + }, + { + "epoch": 1.235094332214304, + "grad_norm": 0.7890625, + "learning_rate": 1.521167883211679e-05, + "loss": 1.2199, + "step": 3932 + }, + { + "epoch": 1.2357225592398453, + "grad_norm": 0.84375, + "learning_rate": 1.520913995556966e-05, + "loss": 1.1983, + "step": 3934 + }, + { + "epoch": 1.2363507862653866, + "grad_norm": 0.73828125, + "learning_rate": 1.5206601079022533e-05, + "loss": 1.2382, + "step": 3936 + }, + { + "epoch": 1.236979013290928, + "grad_norm": 0.91015625, + "learning_rate": 1.5204062202475406e-05, + "loss": 1.1452, + "step": 3938 + }, + { + "epoch": 1.2376072403164693, + "grad_norm": 0.84765625, + "learning_rate": 1.5201523325928278e-05, + "loss": 1.2165, + "step": 3940 + }, + { + "epoch": 1.2382354673420108, + "grad_norm": 0.95703125, + "learning_rate": 1.519898444938115e-05, + "loss": 1.1627, + "step": 3942 + }, + { + "epoch": 1.2388636943675522, + "grad_norm": 0.8125, + "learning_rate": 1.5196445572834022e-05, + "loss": 1.1965, + "step": 3944 + }, + { + "epoch": 1.2394919213930935, + "grad_norm": 0.69921875, + "learning_rate": 1.5193906696286895e-05, + "loss": 1.19, + "step": 3946 + }, + { + "epoch": 1.2401201484186348, + "grad_norm": 0.796875, + "learning_rate": 1.5191367819739767e-05, + "loss": 1.2672, + "step": 3948 + }, + { + "epoch": 1.2407483754441762, + "grad_norm": 0.84765625, + "learning_rate": 1.518882894319264e-05, + "loss": 1.2684, + "step": 3950 + }, + { + "epoch": 1.2413766024697175, + "grad_norm": 0.76953125, + "learning_rate": 1.5186290066645511e-05, + "loss": 1.3026, + "step": 3952 + }, + { + "epoch": 1.2420048294952588, + "grad_norm": 0.7421875, + "learning_rate": 1.5183751190098384e-05, + "loss": 1.2142, + "step": 3954 + }, + { + "epoch": 1.2426330565208001, + "grad_norm": 0.7421875, + "learning_rate": 1.5181212313551254e-05, + "loss": 1.2394, + "step": 3956 + }, + { + "epoch": 1.2432612835463415, + "grad_norm": 0.84765625, + "learning_rate": 1.5178673437004129e-05, + "loss": 1.1269, + "step": 3958 + }, + { + "epoch": 1.2438895105718828, + "grad_norm": 0.80078125, + "learning_rate": 1.5176134560456998e-05, + "loss": 1.2404, + "step": 3960 + }, + { + "epoch": 1.2445177375974243, + "grad_norm": 0.8671875, + "learning_rate": 1.5173595683909871e-05, + "loss": 1.2585, + "step": 3962 + }, + { + "epoch": 1.2451459646229657, + "grad_norm": 0.80859375, + "learning_rate": 1.5171056807362743e-05, + "loss": 1.2264, + "step": 3964 + }, + { + "epoch": 1.245774191648507, + "grad_norm": 0.80859375, + "learning_rate": 1.5168517930815616e-05, + "loss": 1.3453, + "step": 3966 + }, + { + "epoch": 1.2464024186740483, + "grad_norm": 0.80859375, + "learning_rate": 1.5165979054268487e-05, + "loss": 1.2773, + "step": 3968 + }, + { + "epoch": 1.2470306456995897, + "grad_norm": 0.8671875, + "learning_rate": 1.516344017772136e-05, + "loss": 1.15, + "step": 3970 + }, + { + "epoch": 1.247658872725131, + "grad_norm": 0.80078125, + "learning_rate": 1.5160901301174232e-05, + "loss": 1.3222, + "step": 3972 + }, + { + "epoch": 1.2482870997506723, + "grad_norm": 0.796875, + "learning_rate": 1.5158362424627105e-05, + "loss": 1.1521, + "step": 3974 + }, + { + "epoch": 1.2489153267762139, + "grad_norm": 0.91015625, + "learning_rate": 1.5155823548079976e-05, + "loss": 1.2704, + "step": 3976 + }, + { + "epoch": 1.2495435538017552, + "grad_norm": 0.76171875, + "learning_rate": 1.515328467153285e-05, + "loss": 1.3046, + "step": 3978 + }, + { + "epoch": 1.2501717808272965, + "grad_norm": 0.75390625, + "learning_rate": 1.5150745794985719e-05, + "loss": 1.1866, + "step": 3980 + }, + { + "epoch": 1.2508000078528378, + "grad_norm": 0.73828125, + "learning_rate": 1.5148206918438592e-05, + "loss": 1.3763, + "step": 3982 + }, + { + "epoch": 1.2514282348783792, + "grad_norm": 0.76171875, + "learning_rate": 1.5145668041891463e-05, + "loss": 1.3093, + "step": 3984 + }, + { + "epoch": 1.2520564619039205, + "grad_norm": 0.79296875, + "learning_rate": 1.5143129165344336e-05, + "loss": 1.1302, + "step": 3986 + }, + { + "epoch": 1.2526846889294618, + "grad_norm": 0.8203125, + "learning_rate": 1.5140590288797208e-05, + "loss": 1.2198, + "step": 3988 + }, + { + "epoch": 1.2533129159550032, + "grad_norm": 0.7421875, + "learning_rate": 1.5138051412250081e-05, + "loss": 1.2666, + "step": 3990 + }, + { + "epoch": 1.2539411429805445, + "grad_norm": 0.921875, + "learning_rate": 1.5135512535702952e-05, + "loss": 1.0958, + "step": 3992 + }, + { + "epoch": 1.2545693700060858, + "grad_norm": 0.8515625, + "learning_rate": 1.5132973659155825e-05, + "loss": 1.2114, + "step": 3994 + }, + { + "epoch": 1.2551975970316274, + "grad_norm": 0.78515625, + "learning_rate": 1.5130434782608697e-05, + "loss": 1.3153, + "step": 3996 + }, + { + "epoch": 1.2558258240571687, + "grad_norm": 0.8125, + "learning_rate": 1.512789590606157e-05, + "loss": 1.2129, + "step": 3998 + }, + { + "epoch": 1.25645405108271, + "grad_norm": 0.7890625, + "learning_rate": 1.512535702951444e-05, + "loss": 1.2922, + "step": 4000 + }, + { + "epoch": 1.2570822781082513, + "grad_norm": 0.7578125, + "learning_rate": 1.5122818152967314e-05, + "loss": 1.1252, + "step": 4002 + }, + { + "epoch": 1.2577105051337927, + "grad_norm": 0.77734375, + "learning_rate": 1.5120279276420184e-05, + "loss": 1.1488, + "step": 4004 + }, + { + "epoch": 1.258338732159334, + "grad_norm": 0.80078125, + "learning_rate": 1.5117740399873057e-05, + "loss": 1.2191, + "step": 4006 + }, + { + "epoch": 1.2589669591848756, + "grad_norm": 0.859375, + "learning_rate": 1.5115201523325928e-05, + "loss": 1.345, + "step": 4008 + }, + { + "epoch": 1.2595951862104169, + "grad_norm": 0.7734375, + "learning_rate": 1.5112662646778801e-05, + "loss": 1.2722, + "step": 4010 + }, + { + "epoch": 1.2602234132359582, + "grad_norm": 0.75390625, + "learning_rate": 1.5110123770231673e-05, + "loss": 1.2438, + "step": 4012 + }, + { + "epoch": 1.2608516402614995, + "grad_norm": 0.7734375, + "learning_rate": 1.5107584893684546e-05, + "loss": 1.2488, + "step": 4014 + }, + { + "epoch": 1.2614798672870409, + "grad_norm": 0.765625, + "learning_rate": 1.5105046017137417e-05, + "loss": 1.1744, + "step": 4016 + }, + { + "epoch": 1.2621080943125822, + "grad_norm": 0.78515625, + "learning_rate": 1.510250714059029e-05, + "loss": 1.2071, + "step": 4018 + }, + { + "epoch": 1.2627363213381235, + "grad_norm": 0.75, + "learning_rate": 1.5099968264043162e-05, + "loss": 1.3156, + "step": 4020 + }, + { + "epoch": 1.2633645483636649, + "grad_norm": 0.890625, + "learning_rate": 1.5097429387496035e-05, + "loss": 1.1231, + "step": 4022 + }, + { + "epoch": 1.2639927753892062, + "grad_norm": 0.796875, + "learning_rate": 1.5094890510948908e-05, + "loss": 1.151, + "step": 4024 + }, + { + "epoch": 1.2646210024147475, + "grad_norm": 0.75, + "learning_rate": 1.5092351634401778e-05, + "loss": 1.3427, + "step": 4026 + }, + { + "epoch": 1.265249229440289, + "grad_norm": 0.796875, + "learning_rate": 1.5089812757854652e-05, + "loss": 1.2093, + "step": 4028 + }, + { + "epoch": 1.2658774564658304, + "grad_norm": 0.765625, + "learning_rate": 1.5087273881307522e-05, + "loss": 1.179, + "step": 4030 + }, + { + "epoch": 1.2665056834913717, + "grad_norm": 0.8125, + "learning_rate": 1.5084735004760395e-05, + "loss": 1.2487, + "step": 4032 + }, + { + "epoch": 1.267133910516913, + "grad_norm": 0.81640625, + "learning_rate": 1.5082196128213267e-05, + "loss": 1.2788, + "step": 4034 + }, + { + "epoch": 1.2677621375424544, + "grad_norm": 0.78515625, + "learning_rate": 1.507965725166614e-05, + "loss": 1.2915, + "step": 4036 + }, + { + "epoch": 1.2683903645679957, + "grad_norm": 0.8046875, + "learning_rate": 1.5077118375119011e-05, + "loss": 1.2832, + "step": 4038 + }, + { + "epoch": 1.2690185915935372, + "grad_norm": 0.84375, + "learning_rate": 1.5074579498571884e-05, + "loss": 1.2789, + "step": 4040 + }, + { + "epoch": 1.2696468186190786, + "grad_norm": 0.74609375, + "learning_rate": 1.5072040622024755e-05, + "loss": 1.0852, + "step": 4042 + }, + { + "epoch": 1.27027504564462, + "grad_norm": 0.8828125, + "learning_rate": 1.5069501745477629e-05, + "loss": 1.2983, + "step": 4044 + }, + { + "epoch": 1.2709032726701612, + "grad_norm": 0.8203125, + "learning_rate": 1.50669628689305e-05, + "loss": 1.2256, + "step": 4046 + }, + { + "epoch": 1.2715314996957026, + "grad_norm": 0.79296875, + "learning_rate": 1.5064423992383373e-05, + "loss": 1.2011, + "step": 4048 + }, + { + "epoch": 1.2721597267212439, + "grad_norm": 0.7109375, + "learning_rate": 1.5061885115836243e-05, + "loss": 1.2055, + "step": 4050 + }, + { + "epoch": 1.2727879537467852, + "grad_norm": 0.828125, + "learning_rate": 1.5059346239289116e-05, + "loss": 1.1325, + "step": 4052 + }, + { + "epoch": 1.2734161807723265, + "grad_norm": 0.828125, + "learning_rate": 1.5056807362741987e-05, + "loss": 1.182, + "step": 4054 + }, + { + "epoch": 1.2740444077978679, + "grad_norm": 1.1796875, + "learning_rate": 1.505426848619486e-05, + "loss": 1.2868, + "step": 4056 + }, + { + "epoch": 1.2746726348234092, + "grad_norm": 0.7578125, + "learning_rate": 1.5051729609647732e-05, + "loss": 1.3885, + "step": 4058 + }, + { + "epoch": 1.2753008618489505, + "grad_norm": 0.7265625, + "learning_rate": 1.5049190733100605e-05, + "loss": 1.2637, + "step": 4060 + }, + { + "epoch": 1.275929088874492, + "grad_norm": 0.7265625, + "learning_rate": 1.5046651856553476e-05, + "loss": 1.2665, + "step": 4062 + }, + { + "epoch": 1.2765573159000334, + "grad_norm": 0.72265625, + "learning_rate": 1.5044112980006349e-05, + "loss": 1.3012, + "step": 4064 + }, + { + "epoch": 1.2771855429255747, + "grad_norm": 0.953125, + "learning_rate": 1.504157410345922e-05, + "loss": 1.2597, + "step": 4066 + }, + { + "epoch": 1.277813769951116, + "grad_norm": 0.7265625, + "learning_rate": 1.5039035226912094e-05, + "loss": 1.2633, + "step": 4068 + }, + { + "epoch": 1.2784419969766574, + "grad_norm": 0.7890625, + "learning_rate": 1.5036496350364965e-05, + "loss": 1.2377, + "step": 4070 + }, + { + "epoch": 1.2790702240021987, + "grad_norm": 0.80859375, + "learning_rate": 1.5033957473817838e-05, + "loss": 1.4411, + "step": 4072 + }, + { + "epoch": 1.2796984510277403, + "grad_norm": 0.75390625, + "learning_rate": 1.5031418597270708e-05, + "loss": 1.1997, + "step": 4074 + }, + { + "epoch": 1.2803266780532816, + "grad_norm": 0.77734375, + "learning_rate": 1.502887972072358e-05, + "loss": 1.2864, + "step": 4076 + }, + { + "epoch": 1.280954905078823, + "grad_norm": 0.8125, + "learning_rate": 1.5026340844176452e-05, + "loss": 1.293, + "step": 4078 + }, + { + "epoch": 1.2815831321043643, + "grad_norm": 0.77734375, + "learning_rate": 1.5023801967629325e-05, + "loss": 1.2866, + "step": 4080 + }, + { + "epoch": 1.2822113591299056, + "grad_norm": 0.73046875, + "learning_rate": 1.5021263091082197e-05, + "loss": 1.2788, + "step": 4082 + }, + { + "epoch": 1.282839586155447, + "grad_norm": 0.828125, + "learning_rate": 1.501872421453507e-05, + "loss": 1.1705, + "step": 4084 + }, + { + "epoch": 1.2834678131809882, + "grad_norm": 0.86328125, + "learning_rate": 1.5016185337987941e-05, + "loss": 1.1257, + "step": 4086 + }, + { + "epoch": 1.2840960402065296, + "grad_norm": 0.76953125, + "learning_rate": 1.5013646461440814e-05, + "loss": 1.2574, + "step": 4088 + }, + { + "epoch": 1.284724267232071, + "grad_norm": 0.6953125, + "learning_rate": 1.5011107584893686e-05, + "loss": 1.2126, + "step": 4090 + }, + { + "epoch": 1.2853524942576122, + "grad_norm": 0.6953125, + "learning_rate": 1.5008568708346559e-05, + "loss": 1.281, + "step": 4092 + }, + { + "epoch": 1.2859807212831538, + "grad_norm": 2.875, + "learning_rate": 1.5006029831799428e-05, + "loss": 1.2376, + "step": 4094 + }, + { + "epoch": 1.286608948308695, + "grad_norm": 0.76171875, + "learning_rate": 1.5003490955252303e-05, + "loss": 1.2656, + "step": 4096 + }, + { + "epoch": 1.2872371753342364, + "grad_norm": 0.83203125, + "learning_rate": 1.5000952078705173e-05, + "loss": 1.3466, + "step": 4098 + }, + { + "epoch": 1.2878654023597778, + "grad_norm": 0.79296875, + "learning_rate": 1.4998413202158046e-05, + "loss": 1.266, + "step": 4100 + }, + { + "epoch": 1.288493629385319, + "grad_norm": 0.78125, + "learning_rate": 1.4995874325610917e-05, + "loss": 1.283, + "step": 4102 + }, + { + "epoch": 1.2891218564108604, + "grad_norm": 0.74609375, + "learning_rate": 1.499333544906379e-05, + "loss": 1.2892, + "step": 4104 + }, + { + "epoch": 1.289750083436402, + "grad_norm": 0.76953125, + "learning_rate": 1.4990796572516662e-05, + "loss": 1.2792, + "step": 4106 + }, + { + "epoch": 1.2903783104619433, + "grad_norm": 0.7421875, + "learning_rate": 1.4988257695969535e-05, + "loss": 1.3327, + "step": 4108 + }, + { + "epoch": 1.2910065374874846, + "grad_norm": 0.859375, + "learning_rate": 1.4985718819422408e-05, + "loss": 1.1793, + "step": 4110 + }, + { + "epoch": 1.291634764513026, + "grad_norm": 0.75, + "learning_rate": 1.498317994287528e-05, + "loss": 1.3966, + "step": 4112 + }, + { + "epoch": 1.2922629915385673, + "grad_norm": 0.78515625, + "learning_rate": 1.4980641066328152e-05, + "loss": 1.235, + "step": 4114 + }, + { + "epoch": 1.2928912185641086, + "grad_norm": 0.8984375, + "learning_rate": 1.4978102189781024e-05, + "loss": 1.1285, + "step": 4116 + }, + { + "epoch": 1.29351944558965, + "grad_norm": 0.7578125, + "learning_rate": 1.4975563313233897e-05, + "loss": 1.1071, + "step": 4118 + }, + { + "epoch": 1.2941476726151913, + "grad_norm": 0.7421875, + "learning_rate": 1.4973024436686766e-05, + "loss": 1.3497, + "step": 4120 + }, + { + "epoch": 1.2947758996407326, + "grad_norm": 0.89453125, + "learning_rate": 1.4970485560139641e-05, + "loss": 1.2041, + "step": 4122 + }, + { + "epoch": 1.295404126666274, + "grad_norm": 0.8203125, + "learning_rate": 1.4967946683592511e-05, + "loss": 1.2132, + "step": 4124 + }, + { + "epoch": 1.2960323536918152, + "grad_norm": 0.8828125, + "learning_rate": 1.4965407807045384e-05, + "loss": 1.2878, + "step": 4126 + }, + { + "epoch": 1.2966605807173568, + "grad_norm": 0.91015625, + "learning_rate": 1.4962868930498255e-05, + "loss": 1.337, + "step": 4128 + }, + { + "epoch": 1.2972888077428981, + "grad_norm": 0.82421875, + "learning_rate": 1.4960330053951128e-05, + "loss": 1.2482, + "step": 4130 + }, + { + "epoch": 1.2979170347684394, + "grad_norm": 0.78515625, + "learning_rate": 1.4957791177404e-05, + "loss": 1.3392, + "step": 4132 + }, + { + "epoch": 1.2985452617939808, + "grad_norm": 0.76953125, + "learning_rate": 1.4955252300856873e-05, + "loss": 1.3088, + "step": 4134 + }, + { + "epoch": 1.299173488819522, + "grad_norm": 0.78125, + "learning_rate": 1.4952713424309744e-05, + "loss": 1.3334, + "step": 4136 + }, + { + "epoch": 1.2998017158450634, + "grad_norm": 0.91796875, + "learning_rate": 1.4950174547762617e-05, + "loss": 1.1723, + "step": 4138 + }, + { + "epoch": 1.300429942870605, + "grad_norm": 0.78515625, + "learning_rate": 1.4947635671215489e-05, + "loss": 1.3257, + "step": 4140 + }, + { + "epoch": 1.3010581698961463, + "grad_norm": 0.8125, + "learning_rate": 1.4945096794668362e-05, + "loss": 1.3229, + "step": 4142 + }, + { + "epoch": 1.3016863969216876, + "grad_norm": 0.76953125, + "learning_rate": 1.4942557918121231e-05, + "loss": 1.3304, + "step": 4144 + }, + { + "epoch": 1.302314623947229, + "grad_norm": 0.80078125, + "learning_rate": 1.4940019041574105e-05, + "loss": 1.3206, + "step": 4146 + }, + { + "epoch": 1.3029428509727703, + "grad_norm": 0.796875, + "learning_rate": 1.4937480165026976e-05, + "loss": 1.1349, + "step": 4148 + }, + { + "epoch": 1.3035710779983116, + "grad_norm": 0.8046875, + "learning_rate": 1.4934941288479849e-05, + "loss": 1.2338, + "step": 4150 + }, + { + "epoch": 1.304199305023853, + "grad_norm": 0.82421875, + "learning_rate": 1.493240241193272e-05, + "loss": 1.1981, + "step": 4152 + }, + { + "epoch": 1.3048275320493943, + "grad_norm": 0.96875, + "learning_rate": 1.4929863535385593e-05, + "loss": 1.2488, + "step": 4154 + }, + { + "epoch": 1.3054557590749356, + "grad_norm": 0.75390625, + "learning_rate": 1.4927324658838465e-05, + "loss": 1.1739, + "step": 4156 + }, + { + "epoch": 1.306083986100477, + "grad_norm": 0.703125, + "learning_rate": 1.4924785782291338e-05, + "loss": 1.1962, + "step": 4158 + }, + { + "epoch": 1.3067122131260185, + "grad_norm": 0.875, + "learning_rate": 1.492224690574421e-05, + "loss": 1.2288, + "step": 4160 + }, + { + "epoch": 1.3073404401515598, + "grad_norm": 0.81640625, + "learning_rate": 1.4919708029197082e-05, + "loss": 1.2643, + "step": 4162 + }, + { + "epoch": 1.3079686671771011, + "grad_norm": 0.859375, + "learning_rate": 1.4917169152649952e-05, + "loss": 1.3054, + "step": 4164 + }, + { + "epoch": 1.3085968942026425, + "grad_norm": 0.7265625, + "learning_rate": 1.4914630276102827e-05, + "loss": 1.135, + "step": 4166 + }, + { + "epoch": 1.3092251212281838, + "grad_norm": 0.92578125, + "learning_rate": 1.4912091399555697e-05, + "loss": 1.1145, + "step": 4168 + }, + { + "epoch": 1.3098533482537251, + "grad_norm": 0.78125, + "learning_rate": 1.490955252300857e-05, + "loss": 1.3165, + "step": 4170 + }, + { + "epoch": 1.3104815752792667, + "grad_norm": 0.8828125, + "learning_rate": 1.4907013646461441e-05, + "loss": 1.2578, + "step": 4172 + }, + { + "epoch": 1.311109802304808, + "grad_norm": 0.84375, + "learning_rate": 1.4904474769914314e-05, + "loss": 1.3692, + "step": 4174 + }, + { + "epoch": 1.3117380293303493, + "grad_norm": 0.8359375, + "learning_rate": 1.4901935893367185e-05, + "loss": 1.3078, + "step": 4176 + }, + { + "epoch": 1.3123662563558907, + "grad_norm": 0.9296875, + "learning_rate": 1.4899397016820059e-05, + "loss": 1.2137, + "step": 4178 + }, + { + "epoch": 1.312994483381432, + "grad_norm": 0.84375, + "learning_rate": 1.489685814027293e-05, + "loss": 1.2295, + "step": 4180 + }, + { + "epoch": 1.3136227104069733, + "grad_norm": 0.85546875, + "learning_rate": 1.4894319263725803e-05, + "loss": 1.1217, + "step": 4182 + }, + { + "epoch": 1.3142509374325146, + "grad_norm": 0.80859375, + "learning_rate": 1.4891780387178674e-05, + "loss": 1.2411, + "step": 4184 + }, + { + "epoch": 1.314879164458056, + "grad_norm": 0.78125, + "learning_rate": 1.4889241510631547e-05, + "loss": 1.3081, + "step": 4186 + }, + { + "epoch": 1.3155073914835973, + "grad_norm": 0.828125, + "learning_rate": 1.4886702634084417e-05, + "loss": 1.3283, + "step": 4188 + }, + { + "epoch": 1.3161356185091386, + "grad_norm": 0.76171875, + "learning_rate": 1.488416375753729e-05, + "loss": 1.2081, + "step": 4190 + }, + { + "epoch": 1.3167638455346802, + "grad_norm": 0.7734375, + "learning_rate": 1.4881624880990162e-05, + "loss": 1.2085, + "step": 4192 + }, + { + "epoch": 1.3173920725602215, + "grad_norm": 0.765625, + "learning_rate": 1.4879086004443035e-05, + "loss": 1.3019, + "step": 4194 + }, + { + "epoch": 1.3180202995857628, + "grad_norm": 0.75390625, + "learning_rate": 1.4876547127895908e-05, + "loss": 1.2424, + "step": 4196 + }, + { + "epoch": 1.3186485266113042, + "grad_norm": 0.76171875, + "learning_rate": 1.4874008251348779e-05, + "loss": 1.2922, + "step": 4198 + }, + { + "epoch": 1.3192767536368455, + "grad_norm": 0.7890625, + "learning_rate": 1.4871469374801652e-05, + "loss": 1.262, + "step": 4200 + }, + { + "epoch": 1.3199049806623868, + "grad_norm": 0.7265625, + "learning_rate": 1.4868930498254524e-05, + "loss": 1.2194, + "step": 4202 + }, + { + "epoch": 1.3205332076879281, + "grad_norm": 0.75, + "learning_rate": 1.4866391621707397e-05, + "loss": 1.1209, + "step": 4204 + }, + { + "epoch": 1.3211614347134697, + "grad_norm": 0.80078125, + "learning_rate": 1.4863852745160268e-05, + "loss": 1.3312, + "step": 4206 + }, + { + "epoch": 1.321789661739011, + "grad_norm": 0.86328125, + "learning_rate": 1.4861313868613141e-05, + "loss": 1.2484, + "step": 4208 + }, + { + "epoch": 1.3224178887645524, + "grad_norm": 0.87890625, + "learning_rate": 1.4858774992066013e-05, + "loss": 1.2436, + "step": 4210 + }, + { + "epoch": 1.3230461157900937, + "grad_norm": 0.7890625, + "learning_rate": 1.4856236115518886e-05, + "loss": 1.2811, + "step": 4212 + }, + { + "epoch": 1.323674342815635, + "grad_norm": 2.375, + "learning_rate": 1.4853697238971755e-05, + "loss": 1.0794, + "step": 4214 + }, + { + "epoch": 1.3243025698411763, + "grad_norm": 0.82421875, + "learning_rate": 1.4851158362424628e-05, + "loss": 1.2972, + "step": 4216 + }, + { + "epoch": 1.3249307968667177, + "grad_norm": 0.7734375, + "learning_rate": 1.48486194858775e-05, + "loss": 1.2615, + "step": 4218 + }, + { + "epoch": 1.325559023892259, + "grad_norm": 0.7734375, + "learning_rate": 1.4846080609330373e-05, + "loss": 1.2853, + "step": 4220 + }, + { + "epoch": 1.3261872509178003, + "grad_norm": 0.83984375, + "learning_rate": 1.4843541732783244e-05, + "loss": 1.2105, + "step": 4222 + }, + { + "epoch": 1.3268154779433416, + "grad_norm": 0.87109375, + "learning_rate": 1.4841002856236117e-05, + "loss": 1.0318, + "step": 4224 + }, + { + "epoch": 1.3274437049688832, + "grad_norm": 0.7578125, + "learning_rate": 1.4838463979688989e-05, + "loss": 1.2111, + "step": 4226 + }, + { + "epoch": 1.3280719319944245, + "grad_norm": 0.86328125, + "learning_rate": 1.4835925103141862e-05, + "loss": 1.0525, + "step": 4228 + }, + { + "epoch": 1.3287001590199659, + "grad_norm": 0.78515625, + "learning_rate": 1.4833386226594733e-05, + "loss": 1.2753, + "step": 4230 + }, + { + "epoch": 1.3293283860455072, + "grad_norm": 0.83203125, + "learning_rate": 1.4830847350047606e-05, + "loss": 1.1766, + "step": 4232 + }, + { + "epoch": 1.3299566130710485, + "grad_norm": 0.765625, + "learning_rate": 1.4828308473500478e-05, + "loss": 1.3159, + "step": 4234 + }, + { + "epoch": 1.3305848400965898, + "grad_norm": 0.7890625, + "learning_rate": 1.482576959695335e-05, + "loss": 1.1587, + "step": 4236 + }, + { + "epoch": 1.3312130671221314, + "grad_norm": 0.8203125, + "learning_rate": 1.482323072040622e-05, + "loss": 1.1362, + "step": 4238 + }, + { + "epoch": 1.3318412941476727, + "grad_norm": 0.9140625, + "learning_rate": 1.4820691843859093e-05, + "loss": 1.2305, + "step": 4240 + }, + { + "epoch": 1.332469521173214, + "grad_norm": 0.77734375, + "learning_rate": 1.4818152967311965e-05, + "loss": 1.163, + "step": 4242 + }, + { + "epoch": 1.3330977481987554, + "grad_norm": 0.80078125, + "learning_rate": 1.4815614090764838e-05, + "loss": 1.2654, + "step": 4244 + }, + { + "epoch": 1.3337259752242967, + "grad_norm": 0.76171875, + "learning_rate": 1.481307521421771e-05, + "loss": 1.1764, + "step": 4246 + }, + { + "epoch": 1.334354202249838, + "grad_norm": 0.7890625, + "learning_rate": 1.4810536337670582e-05, + "loss": 1.2239, + "step": 4248 + }, + { + "epoch": 1.3349824292753794, + "grad_norm": 0.77734375, + "learning_rate": 1.4807997461123454e-05, + "loss": 1.2153, + "step": 4250 + }, + { + "epoch": 1.3356106563009207, + "grad_norm": 0.73046875, + "learning_rate": 1.4805458584576327e-05, + "loss": 1.1689, + "step": 4252 + }, + { + "epoch": 1.336238883326462, + "grad_norm": 0.8671875, + "learning_rate": 1.4802919708029198e-05, + "loss": 1.2603, + "step": 4254 + }, + { + "epoch": 1.3368671103520033, + "grad_norm": 0.8046875, + "learning_rate": 1.4800380831482071e-05, + "loss": 1.1904, + "step": 4256 + }, + { + "epoch": 1.337495337377545, + "grad_norm": 0.7578125, + "learning_rate": 1.4797841954934941e-05, + "loss": 1.4012, + "step": 4258 + }, + { + "epoch": 1.3381235644030862, + "grad_norm": 0.9453125, + "learning_rate": 1.4795303078387816e-05, + "loss": 1.2366, + "step": 4260 + }, + { + "epoch": 1.3387517914286275, + "grad_norm": 0.7734375, + "learning_rate": 1.4792764201840685e-05, + "loss": 1.2833, + "step": 4262 + }, + { + "epoch": 1.3393800184541689, + "grad_norm": 0.7578125, + "learning_rate": 1.4790225325293558e-05, + "loss": 1.3163, + "step": 4264 + }, + { + "epoch": 1.3400082454797102, + "grad_norm": 0.80859375, + "learning_rate": 1.478768644874643e-05, + "loss": 1.2823, + "step": 4266 + }, + { + "epoch": 1.3406364725052515, + "grad_norm": 0.8515625, + "learning_rate": 1.4785147572199303e-05, + "loss": 1.1118, + "step": 4268 + }, + { + "epoch": 1.3412646995307929, + "grad_norm": 0.7578125, + "learning_rate": 1.4782608695652174e-05, + "loss": 1.1947, + "step": 4270 + }, + { + "epoch": 1.3418929265563344, + "grad_norm": 0.71875, + "learning_rate": 1.4780069819105047e-05, + "loss": 1.3296, + "step": 4272 + }, + { + "epoch": 1.3425211535818757, + "grad_norm": 0.7578125, + "learning_rate": 1.4777530942557919e-05, + "loss": 1.3039, + "step": 4274 + }, + { + "epoch": 1.343149380607417, + "grad_norm": 0.9609375, + "learning_rate": 1.4774992066010792e-05, + "loss": 1.1281, + "step": 4276 + }, + { + "epoch": 1.3437776076329584, + "grad_norm": 0.79296875, + "learning_rate": 1.4772453189463663e-05, + "loss": 1.2095, + "step": 4278 + }, + { + "epoch": 1.3444058346584997, + "grad_norm": 0.796875, + "learning_rate": 1.4769914312916536e-05, + "loss": 1.2037, + "step": 4280 + }, + { + "epoch": 1.345034061684041, + "grad_norm": 0.79296875, + "learning_rate": 1.476737543636941e-05, + "loss": 1.2451, + "step": 4282 + }, + { + "epoch": 1.3456622887095824, + "grad_norm": 0.76953125, + "learning_rate": 1.4764836559822279e-05, + "loss": 1.2812, + "step": 4284 + }, + { + "epoch": 1.3462905157351237, + "grad_norm": 0.8125, + "learning_rate": 1.4762297683275154e-05, + "loss": 1.1563, + "step": 4286 + }, + { + "epoch": 1.346918742760665, + "grad_norm": 0.74609375, + "learning_rate": 1.4759758806728024e-05, + "loss": 1.2046, + "step": 4288 + }, + { + "epoch": 1.3475469697862064, + "grad_norm": 0.8046875, + "learning_rate": 1.4757219930180897e-05, + "loss": 1.1475, + "step": 4290 + }, + { + "epoch": 1.348175196811748, + "grad_norm": 0.84375, + "learning_rate": 1.4754681053633768e-05, + "loss": 1.19, + "step": 4292 + }, + { + "epoch": 1.3488034238372892, + "grad_norm": 0.83203125, + "learning_rate": 1.4752142177086641e-05, + "loss": 1.1384, + "step": 4294 + }, + { + "epoch": 1.3494316508628306, + "grad_norm": 0.75390625, + "learning_rate": 1.4749603300539512e-05, + "loss": 1.3681, + "step": 4296 + }, + { + "epoch": 1.350059877888372, + "grad_norm": 0.77734375, + "learning_rate": 1.4747064423992386e-05, + "loss": 1.4912, + "step": 4298 + }, + { + "epoch": 1.3506881049139132, + "grad_norm": 0.796875, + "learning_rate": 1.4744525547445257e-05, + "loss": 1.2292, + "step": 4300 + }, + { + "epoch": 1.3513163319394546, + "grad_norm": 0.80859375, + "learning_rate": 1.474198667089813e-05, + "loss": 1.2261, + "step": 4302 + }, + { + "epoch": 1.351944558964996, + "grad_norm": 0.7578125, + "learning_rate": 1.4739447794351001e-05, + "loss": 1.3621, + "step": 4304 + }, + { + "epoch": 1.3525727859905374, + "grad_norm": 0.8515625, + "learning_rate": 1.4736908917803874e-05, + "loss": 1.1424, + "step": 4306 + }, + { + "epoch": 1.3532010130160788, + "grad_norm": 0.8203125, + "learning_rate": 1.4734370041256744e-05, + "loss": 1.1803, + "step": 4308 + }, + { + "epoch": 1.35382924004162, + "grad_norm": 0.73828125, + "learning_rate": 1.4731831164709617e-05, + "loss": 1.2921, + "step": 4310 + }, + { + "epoch": 1.3544574670671614, + "grad_norm": 0.77734375, + "learning_rate": 1.4729292288162489e-05, + "loss": 1.359, + "step": 4312 + }, + { + "epoch": 1.3550856940927027, + "grad_norm": 0.7578125, + "learning_rate": 1.4726753411615362e-05, + "loss": 1.2099, + "step": 4314 + }, + { + "epoch": 1.355713921118244, + "grad_norm": 0.73046875, + "learning_rate": 1.4724214535068233e-05, + "loss": 1.3183, + "step": 4316 + }, + { + "epoch": 1.3563421481437854, + "grad_norm": 0.80078125, + "learning_rate": 1.4721675658521106e-05, + "loss": 1.3491, + "step": 4318 + }, + { + "epoch": 1.3569703751693267, + "grad_norm": 0.8203125, + "learning_rate": 1.4719136781973978e-05, + "loss": 1.239, + "step": 4320 + }, + { + "epoch": 1.357598602194868, + "grad_norm": 0.80859375, + "learning_rate": 1.471659790542685e-05, + "loss": 1.2861, + "step": 4322 + }, + { + "epoch": 1.3582268292204096, + "grad_norm": 0.79296875, + "learning_rate": 1.4714059028879722e-05, + "loss": 1.2679, + "step": 4324 + }, + { + "epoch": 1.358855056245951, + "grad_norm": 0.72265625, + "learning_rate": 1.4711520152332595e-05, + "loss": 1.2569, + "step": 4326 + }, + { + "epoch": 1.3594832832714923, + "grad_norm": 0.7890625, + "learning_rate": 1.4708981275785465e-05, + "loss": 1.2359, + "step": 4328 + }, + { + "epoch": 1.3601115102970336, + "grad_norm": 0.85546875, + "learning_rate": 1.470644239923834e-05, + "loss": 1.2838, + "step": 4330 + }, + { + "epoch": 1.360739737322575, + "grad_norm": 0.8359375, + "learning_rate": 1.470390352269121e-05, + "loss": 1.302, + "step": 4332 + }, + { + "epoch": 1.3613679643481162, + "grad_norm": 0.83984375, + "learning_rate": 1.4701364646144082e-05, + "loss": 1.3001, + "step": 4334 + }, + { + "epoch": 1.3619961913736578, + "grad_norm": 0.828125, + "learning_rate": 1.4698825769596954e-05, + "loss": 1.2303, + "step": 4336 + }, + { + "epoch": 1.3626244183991991, + "grad_norm": 0.83984375, + "learning_rate": 1.4696286893049827e-05, + "loss": 1.2391, + "step": 4338 + }, + { + "epoch": 1.3632526454247405, + "grad_norm": 0.8515625, + "learning_rate": 1.4693748016502698e-05, + "loss": 1.2481, + "step": 4340 + }, + { + "epoch": 1.3638808724502818, + "grad_norm": 0.76171875, + "learning_rate": 1.4691209139955571e-05, + "loss": 1.2994, + "step": 4342 + }, + { + "epoch": 1.364509099475823, + "grad_norm": 0.96484375, + "learning_rate": 1.4688670263408443e-05, + "loss": 1.392, + "step": 4344 + }, + { + "epoch": 1.3651373265013644, + "grad_norm": 0.83984375, + "learning_rate": 1.4686131386861316e-05, + "loss": 1.3212, + "step": 4346 + }, + { + "epoch": 1.3657655535269058, + "grad_norm": 0.81640625, + "learning_rate": 1.4683592510314187e-05, + "loss": 1.3631, + "step": 4348 + }, + { + "epoch": 1.366393780552447, + "grad_norm": 0.79296875, + "learning_rate": 1.468105363376706e-05, + "loss": 1.2164, + "step": 4350 + }, + { + "epoch": 1.3670220075779884, + "grad_norm": 0.82421875, + "learning_rate": 1.467851475721993e-05, + "loss": 1.1514, + "step": 4352 + }, + { + "epoch": 1.3676502346035297, + "grad_norm": 0.79296875, + "learning_rate": 1.4675975880672803e-05, + "loss": 1.3862, + "step": 4354 + }, + { + "epoch": 1.368278461629071, + "grad_norm": 0.890625, + "learning_rate": 1.4673437004125674e-05, + "loss": 1.2115, + "step": 4356 + }, + { + "epoch": 1.3689066886546126, + "grad_norm": 0.91796875, + "learning_rate": 1.4670898127578547e-05, + "loss": 1.1759, + "step": 4358 + }, + { + "epoch": 1.369534915680154, + "grad_norm": 0.78515625, + "learning_rate": 1.4668359251031419e-05, + "loss": 1.2677, + "step": 4360 + }, + { + "epoch": 1.3701631427056953, + "grad_norm": 0.79296875, + "learning_rate": 1.4665820374484292e-05, + "loss": 1.287, + "step": 4362 + }, + { + "epoch": 1.3707913697312366, + "grad_norm": 0.76171875, + "learning_rate": 1.4663281497937163e-05, + "loss": 1.3592, + "step": 4364 + }, + { + "epoch": 1.371419596756778, + "grad_norm": 0.828125, + "learning_rate": 1.4660742621390036e-05, + "loss": 1.2563, + "step": 4366 + }, + { + "epoch": 1.3720478237823193, + "grad_norm": 0.76953125, + "learning_rate": 1.465820374484291e-05, + "loss": 1.1968, + "step": 4368 + }, + { + "epoch": 1.3726760508078608, + "grad_norm": 0.78125, + "learning_rate": 1.465566486829578e-05, + "loss": 1.3271, + "step": 4370 + }, + { + "epoch": 1.3733042778334021, + "grad_norm": 0.74609375, + "learning_rate": 1.4653125991748654e-05, + "loss": 1.304, + "step": 4372 + }, + { + "epoch": 1.3739325048589435, + "grad_norm": 0.7421875, + "learning_rate": 1.4650587115201525e-05, + "loss": 1.3881, + "step": 4374 + }, + { + "epoch": 1.3745607318844848, + "grad_norm": 0.765625, + "learning_rate": 1.4648048238654398e-05, + "loss": 1.2676, + "step": 4376 + }, + { + "epoch": 1.3751889589100261, + "grad_norm": 0.76171875, + "learning_rate": 1.4645509362107268e-05, + "loss": 1.2412, + "step": 4378 + }, + { + "epoch": 1.3758171859355675, + "grad_norm": 0.921875, + "learning_rate": 1.4642970485560141e-05, + "loss": 1.1142, + "step": 4380 + }, + { + "epoch": 1.3764454129611088, + "grad_norm": 0.78515625, + "learning_rate": 1.4640431609013012e-05, + "loss": 1.2977, + "step": 4382 + }, + { + "epoch": 1.37707363998665, + "grad_norm": 0.76953125, + "learning_rate": 1.4637892732465885e-05, + "loss": 1.1754, + "step": 4384 + }, + { + "epoch": 1.3777018670121914, + "grad_norm": 0.7890625, + "learning_rate": 1.4635353855918757e-05, + "loss": 1.3088, + "step": 4386 + }, + { + "epoch": 1.3783300940377328, + "grad_norm": 0.79296875, + "learning_rate": 1.463281497937163e-05, + "loss": 1.359, + "step": 4388 + }, + { + "epoch": 1.3789583210632743, + "grad_norm": 0.7578125, + "learning_rate": 1.4630276102824501e-05, + "loss": 1.1869, + "step": 4390 + }, + { + "epoch": 1.3795865480888156, + "grad_norm": 0.7265625, + "learning_rate": 1.4627737226277374e-05, + "loss": 1.3206, + "step": 4392 + }, + { + "epoch": 1.380214775114357, + "grad_norm": 0.7109375, + "learning_rate": 1.4625198349730246e-05, + "loss": 1.308, + "step": 4394 + }, + { + "epoch": 1.3808430021398983, + "grad_norm": 0.77734375, + "learning_rate": 1.4622659473183119e-05, + "loss": 1.2614, + "step": 4396 + }, + { + "epoch": 1.3814712291654396, + "grad_norm": 0.8671875, + "learning_rate": 1.462012059663599e-05, + "loss": 1.1872, + "step": 4398 + }, + { + "epoch": 1.382099456190981, + "grad_norm": 0.82421875, + "learning_rate": 1.4617581720088863e-05, + "loss": 1.2658, + "step": 4400 + }, + { + "epoch": 1.3827276832165225, + "grad_norm": 0.82421875, + "learning_rate": 1.4615042843541733e-05, + "loss": 1.3195, + "step": 4402 + }, + { + "epoch": 1.3833559102420638, + "grad_norm": 0.83984375, + "learning_rate": 1.4612503966994606e-05, + "loss": 1.1573, + "step": 4404 + }, + { + "epoch": 1.3839841372676052, + "grad_norm": 0.85546875, + "learning_rate": 1.4609965090447477e-05, + "loss": 1.2768, + "step": 4406 + }, + { + "epoch": 1.3846123642931465, + "grad_norm": 0.83984375, + "learning_rate": 1.460742621390035e-05, + "loss": 1.2157, + "step": 4408 + }, + { + "epoch": 1.3852405913186878, + "grad_norm": 0.76171875, + "learning_rate": 1.4604887337353222e-05, + "loss": 1.1608, + "step": 4410 + }, + { + "epoch": 1.3858688183442291, + "grad_norm": 0.8984375, + "learning_rate": 1.4602348460806095e-05, + "loss": 1.1619, + "step": 4412 + }, + { + "epoch": 1.3864970453697705, + "grad_norm": 0.73828125, + "learning_rate": 1.4599809584258966e-05, + "loss": 1.1692, + "step": 4414 + }, + { + "epoch": 1.3871252723953118, + "grad_norm": 0.76171875, + "learning_rate": 1.459727070771184e-05, + "loss": 1.3201, + "step": 4416 + }, + { + "epoch": 1.3877534994208531, + "grad_norm": 0.890625, + "learning_rate": 1.459473183116471e-05, + "loss": 1.2667, + "step": 4418 + }, + { + "epoch": 1.3883817264463945, + "grad_norm": 0.84375, + "learning_rate": 1.4592192954617584e-05, + "loss": 1.3936, + "step": 4420 + }, + { + "epoch": 1.3890099534719358, + "grad_norm": 0.796875, + "learning_rate": 1.4589654078070454e-05, + "loss": 1.271, + "step": 4422 + }, + { + "epoch": 1.3896381804974773, + "grad_norm": 0.84375, + "learning_rate": 1.4587115201523328e-05, + "loss": 1.2298, + "step": 4424 + }, + { + "epoch": 1.3902664075230187, + "grad_norm": 0.7421875, + "learning_rate": 1.4584576324976198e-05, + "loss": 1.3023, + "step": 4426 + }, + { + "epoch": 1.39089463454856, + "grad_norm": 0.78125, + "learning_rate": 1.4582037448429071e-05, + "loss": 1.0285, + "step": 4428 + }, + { + "epoch": 1.3915228615741013, + "grad_norm": 0.90234375, + "learning_rate": 1.4579498571881942e-05, + "loss": 1.345, + "step": 4430 + }, + { + "epoch": 1.3921510885996426, + "grad_norm": 0.76953125, + "learning_rate": 1.4576959695334816e-05, + "loss": 1.3171, + "step": 4432 + }, + { + "epoch": 1.392779315625184, + "grad_norm": 0.83203125, + "learning_rate": 1.4574420818787687e-05, + "loss": 1.1312, + "step": 4434 + }, + { + "epoch": 1.3934075426507255, + "grad_norm": 0.80078125, + "learning_rate": 1.457188194224056e-05, + "loss": 1.281, + "step": 4436 + }, + { + "epoch": 1.3940357696762669, + "grad_norm": 0.8046875, + "learning_rate": 1.4569343065693431e-05, + "loss": 1.3061, + "step": 4438 + }, + { + "epoch": 1.3946639967018082, + "grad_norm": 0.90625, + "learning_rate": 1.4566804189146304e-05, + "loss": 1.2167, + "step": 4440 + }, + { + "epoch": 1.3952922237273495, + "grad_norm": 0.80859375, + "learning_rate": 1.4564265312599176e-05, + "loss": 1.2837, + "step": 4442 + }, + { + "epoch": 1.3959204507528908, + "grad_norm": 0.9140625, + "learning_rate": 1.4561726436052049e-05, + "loss": 1.1497, + "step": 4444 + }, + { + "epoch": 1.3965486777784322, + "grad_norm": 1.1015625, + "learning_rate": 1.4559187559504919e-05, + "loss": 1.2668, + "step": 4446 + }, + { + "epoch": 1.3971769048039735, + "grad_norm": 0.81640625, + "learning_rate": 1.4556648682957792e-05, + "loss": 1.2495, + "step": 4448 + }, + { + "epoch": 1.3978051318295148, + "grad_norm": 0.73046875, + "learning_rate": 1.4554109806410663e-05, + "loss": 1.3183, + "step": 4450 + }, + { + "epoch": 1.3984333588550562, + "grad_norm": 0.88671875, + "learning_rate": 1.4551570929863536e-05, + "loss": 1.2387, + "step": 4452 + }, + { + "epoch": 1.3990615858805975, + "grad_norm": 0.84375, + "learning_rate": 1.454903205331641e-05, + "loss": 1.3808, + "step": 4454 + }, + { + "epoch": 1.399689812906139, + "grad_norm": 0.796875, + "learning_rate": 1.454649317676928e-05, + "loss": 1.2578, + "step": 4456 + }, + { + "epoch": 1.4003180399316804, + "grad_norm": 0.8125, + "learning_rate": 1.4543954300222154e-05, + "loss": 1.3209, + "step": 4458 + }, + { + "epoch": 1.4009462669572217, + "grad_norm": 0.83984375, + "learning_rate": 1.4541415423675025e-05, + "loss": 1.2369, + "step": 4460 + }, + { + "epoch": 1.401574493982763, + "grad_norm": 0.765625, + "learning_rate": 1.4538876547127898e-05, + "loss": 1.2486, + "step": 4462 + }, + { + "epoch": 1.4022027210083043, + "grad_norm": 0.7421875, + "learning_rate": 1.453633767058077e-05, + "loss": 1.1907, + "step": 4464 + }, + { + "epoch": 1.4028309480338457, + "grad_norm": 0.74609375, + "learning_rate": 1.4533798794033643e-05, + "loss": 1.1616, + "step": 4466 + }, + { + "epoch": 1.4034591750593872, + "grad_norm": 0.84375, + "learning_rate": 1.4531259917486514e-05, + "loss": 1.1429, + "step": 4468 + }, + { + "epoch": 1.4040874020849285, + "grad_norm": 0.8125, + "learning_rate": 1.4528721040939387e-05, + "loss": 1.2914, + "step": 4470 + }, + { + "epoch": 1.4047156291104699, + "grad_norm": 0.89453125, + "learning_rate": 1.4526182164392257e-05, + "loss": 1.3573, + "step": 4472 + }, + { + "epoch": 1.4053438561360112, + "grad_norm": 0.8125, + "learning_rate": 1.452364328784513e-05, + "loss": 1.258, + "step": 4474 + }, + { + "epoch": 1.4059720831615525, + "grad_norm": 0.76171875, + "learning_rate": 1.4521104411298001e-05, + "loss": 1.3615, + "step": 4476 + }, + { + "epoch": 1.4066003101870939, + "grad_norm": 0.734375, + "learning_rate": 1.4518565534750874e-05, + "loss": 1.2793, + "step": 4478 + }, + { + "epoch": 1.4072285372126352, + "grad_norm": 0.75390625, + "learning_rate": 1.4516026658203746e-05, + "loss": 1.2621, + "step": 4480 + }, + { + "epoch": 1.4078567642381765, + "grad_norm": 0.734375, + "learning_rate": 1.4513487781656619e-05, + "loss": 1.2854, + "step": 4482 + }, + { + "epoch": 1.4084849912637178, + "grad_norm": 0.80859375, + "learning_rate": 1.451094890510949e-05, + "loss": 1.2765, + "step": 4484 + }, + { + "epoch": 1.4091132182892592, + "grad_norm": 0.7734375, + "learning_rate": 1.4508410028562363e-05, + "loss": 1.2565, + "step": 4486 + }, + { + "epoch": 1.4097414453148005, + "grad_norm": 0.83203125, + "learning_rate": 1.4505871152015235e-05, + "loss": 1.1619, + "step": 4488 + }, + { + "epoch": 1.410369672340342, + "grad_norm": 0.73828125, + "learning_rate": 1.4503332275468108e-05, + "loss": 1.2787, + "step": 4490 + }, + { + "epoch": 1.4109978993658834, + "grad_norm": 0.7890625, + "learning_rate": 1.4500793398920977e-05, + "loss": 1.1383, + "step": 4492 + }, + { + "epoch": 1.4116261263914247, + "grad_norm": 0.84375, + "learning_rate": 1.4498254522373852e-05, + "loss": 1.2907, + "step": 4494 + }, + { + "epoch": 1.412254353416966, + "grad_norm": 0.7734375, + "learning_rate": 1.4495715645826722e-05, + "loss": 1.2964, + "step": 4496 + }, + { + "epoch": 1.4128825804425074, + "grad_norm": 0.78125, + "learning_rate": 1.4493176769279595e-05, + "loss": 1.4016, + "step": 4498 + }, + { + "epoch": 1.4135108074680487, + "grad_norm": 0.83203125, + "learning_rate": 1.4490637892732466e-05, + "loss": 1.1816, + "step": 4500 + }, + { + "epoch": 1.4141390344935902, + "grad_norm": 0.796875, + "learning_rate": 1.448809901618534e-05, + "loss": 1.2369, + "step": 4502 + }, + { + "epoch": 1.4147672615191316, + "grad_norm": 0.8515625, + "learning_rate": 1.448556013963821e-05, + "loss": 1.2545, + "step": 4504 + }, + { + "epoch": 1.415395488544673, + "grad_norm": 0.76171875, + "learning_rate": 1.4483021263091084e-05, + "loss": 1.3477, + "step": 4506 + }, + { + "epoch": 1.4160237155702142, + "grad_norm": 0.8671875, + "learning_rate": 1.4480482386543955e-05, + "loss": 1.2051, + "step": 4508 + }, + { + "epoch": 1.4166519425957556, + "grad_norm": 0.890625, + "learning_rate": 1.4477943509996828e-05, + "loss": 1.1868, + "step": 4510 + }, + { + "epoch": 1.4172801696212969, + "grad_norm": 0.91015625, + "learning_rate": 1.44754046334497e-05, + "loss": 1.2113, + "step": 4512 + }, + { + "epoch": 1.4179083966468382, + "grad_norm": 0.78125, + "learning_rate": 1.4472865756902573e-05, + "loss": 1.2932, + "step": 4514 + }, + { + "epoch": 1.4185366236723795, + "grad_norm": 0.80859375, + "learning_rate": 1.4470326880355442e-05, + "loss": 1.2502, + "step": 4516 + }, + { + "epoch": 1.4191648506979209, + "grad_norm": 0.8515625, + "learning_rate": 1.4467788003808315e-05, + "loss": 1.3597, + "step": 4518 + }, + { + "epoch": 1.4197930777234622, + "grad_norm": 0.796875, + "learning_rate": 1.4465249127261187e-05, + "loss": 1.2579, + "step": 4520 + }, + { + "epoch": 1.4204213047490037, + "grad_norm": 0.765625, + "learning_rate": 1.446271025071406e-05, + "loss": 1.315, + "step": 4522 + }, + { + "epoch": 1.421049531774545, + "grad_norm": 0.80078125, + "learning_rate": 1.4460171374166931e-05, + "loss": 1.1785, + "step": 4524 + }, + { + "epoch": 1.4216777588000864, + "grad_norm": 0.75, + "learning_rate": 1.4457632497619804e-05, + "loss": 1.2612, + "step": 4526 + }, + { + "epoch": 1.4223059858256277, + "grad_norm": 0.76953125, + "learning_rate": 1.4455093621072676e-05, + "loss": 1.2523, + "step": 4528 + }, + { + "epoch": 1.422934212851169, + "grad_norm": 0.83984375, + "learning_rate": 1.4452554744525549e-05, + "loss": 1.288, + "step": 4530 + }, + { + "epoch": 1.4235624398767104, + "grad_norm": 0.7421875, + "learning_rate": 1.445001586797842e-05, + "loss": 1.2778, + "step": 4532 + }, + { + "epoch": 1.424190666902252, + "grad_norm": 0.8203125, + "learning_rate": 1.4447476991431293e-05, + "loss": 1.3666, + "step": 4534 + }, + { + "epoch": 1.4248188939277933, + "grad_norm": 0.8671875, + "learning_rate": 1.4444938114884165e-05, + "loss": 1.2365, + "step": 4536 + }, + { + "epoch": 1.4254471209533346, + "grad_norm": 0.81640625, + "learning_rate": 1.4442399238337038e-05, + "loss": 1.4104, + "step": 4538 + }, + { + "epoch": 1.426075347978876, + "grad_norm": 0.78125, + "learning_rate": 1.4439860361789911e-05, + "loss": 1.3859, + "step": 4540 + }, + { + "epoch": 1.4267035750044172, + "grad_norm": 0.77734375, + "learning_rate": 1.443732148524278e-05, + "loss": 1.2868, + "step": 4542 + }, + { + "epoch": 1.4273318020299586, + "grad_norm": 0.91796875, + "learning_rate": 1.4434782608695654e-05, + "loss": 1.2102, + "step": 4544 + }, + { + "epoch": 1.4279600290555, + "grad_norm": 0.81640625, + "learning_rate": 1.4432243732148525e-05, + "loss": 1.4068, + "step": 4546 + }, + { + "epoch": 1.4285882560810412, + "grad_norm": 0.75, + "learning_rate": 1.4429704855601398e-05, + "loss": 1.1468, + "step": 4548 + }, + { + "epoch": 1.4292164831065826, + "grad_norm": 0.74609375, + "learning_rate": 1.442716597905427e-05, + "loss": 1.114, + "step": 4550 + }, + { + "epoch": 1.4298447101321239, + "grad_norm": 0.7578125, + "learning_rate": 1.4424627102507143e-05, + "loss": 1.1786, + "step": 4552 + }, + { + "epoch": 1.4304729371576652, + "grad_norm": 0.8515625, + "learning_rate": 1.4422088225960014e-05, + "loss": 1.2882, + "step": 4554 + }, + { + "epoch": 1.4311011641832068, + "grad_norm": 0.875, + "learning_rate": 1.4419549349412887e-05, + "loss": 1.2154, + "step": 4556 + }, + { + "epoch": 1.431729391208748, + "grad_norm": 0.91015625, + "learning_rate": 1.4417010472865758e-05, + "loss": 1.3404, + "step": 4558 + }, + { + "epoch": 1.4323576182342894, + "grad_norm": 0.83203125, + "learning_rate": 1.4414471596318631e-05, + "loss": 1.17, + "step": 4560 + }, + { + "epoch": 1.4329858452598307, + "grad_norm": 0.7578125, + "learning_rate": 1.4411932719771503e-05, + "loss": 1.2197, + "step": 4562 + }, + { + "epoch": 1.433614072285372, + "grad_norm": 0.8515625, + "learning_rate": 1.4409393843224376e-05, + "loss": 1.2704, + "step": 4564 + }, + { + "epoch": 1.4342422993109134, + "grad_norm": 0.73828125, + "learning_rate": 1.4406854966677246e-05, + "loss": 1.217, + "step": 4566 + }, + { + "epoch": 1.434870526336455, + "grad_norm": 0.796875, + "learning_rate": 1.4404316090130119e-05, + "loss": 1.158, + "step": 4568 + }, + { + "epoch": 1.4354987533619963, + "grad_norm": 0.78125, + "learning_rate": 1.440177721358299e-05, + "loss": 1.2378, + "step": 4570 + }, + { + "epoch": 1.4361269803875376, + "grad_norm": 0.7421875, + "learning_rate": 1.4399238337035863e-05, + "loss": 1.1346, + "step": 4572 + }, + { + "epoch": 1.436755207413079, + "grad_norm": 0.83984375, + "learning_rate": 1.4396699460488735e-05, + "loss": 1.3427, + "step": 4574 + }, + { + "epoch": 1.4373834344386203, + "grad_norm": 0.80078125, + "learning_rate": 1.4394160583941608e-05, + "loss": 1.2679, + "step": 4576 + }, + { + "epoch": 1.4380116614641616, + "grad_norm": 0.76953125, + "learning_rate": 1.4391621707394479e-05, + "loss": 1.1892, + "step": 4578 + }, + { + "epoch": 1.438639888489703, + "grad_norm": 0.73828125, + "learning_rate": 1.4389082830847352e-05, + "loss": 1.3267, + "step": 4580 + }, + { + "epoch": 1.4392681155152443, + "grad_norm": 0.734375, + "learning_rate": 1.4386543954300223e-05, + "loss": 1.2524, + "step": 4582 + }, + { + "epoch": 1.4398963425407856, + "grad_norm": 0.734375, + "learning_rate": 1.4384005077753097e-05, + "loss": 1.1367, + "step": 4584 + }, + { + "epoch": 1.440524569566327, + "grad_norm": 0.7578125, + "learning_rate": 1.4381466201205966e-05, + "loss": 1.3201, + "step": 4586 + }, + { + "epoch": 1.4411527965918685, + "grad_norm": 0.75390625, + "learning_rate": 1.437892732465884e-05, + "loss": 1.2344, + "step": 4588 + }, + { + "epoch": 1.4417810236174098, + "grad_norm": 0.8125, + "learning_rate": 1.437638844811171e-05, + "loss": 1.1569, + "step": 4590 + }, + { + "epoch": 1.4424092506429511, + "grad_norm": 0.75, + "learning_rate": 1.4373849571564584e-05, + "loss": 1.1568, + "step": 4592 + }, + { + "epoch": 1.4430374776684924, + "grad_norm": 0.82421875, + "learning_rate": 1.4371310695017455e-05, + "loss": 1.1668, + "step": 4594 + }, + { + "epoch": 1.4436657046940338, + "grad_norm": 0.8046875, + "learning_rate": 1.4368771818470328e-05, + "loss": 1.318, + "step": 4596 + }, + { + "epoch": 1.444293931719575, + "grad_norm": 0.7578125, + "learning_rate": 1.43662329419232e-05, + "loss": 1.3324, + "step": 4598 + }, + { + "epoch": 1.4449221587451166, + "grad_norm": 0.73828125, + "learning_rate": 1.4363694065376073e-05, + "loss": 1.2083, + "step": 4600 + }, + { + "epoch": 1.445550385770658, + "grad_norm": 0.75, + "learning_rate": 1.4361155188828944e-05, + "loss": 1.1944, + "step": 4602 + }, + { + "epoch": 1.4461786127961993, + "grad_norm": 0.84375, + "learning_rate": 1.4358616312281817e-05, + "loss": 1.2482, + "step": 4604 + }, + { + "epoch": 1.4468068398217406, + "grad_norm": 0.75390625, + "learning_rate": 1.4356077435734688e-05, + "loss": 1.162, + "step": 4606 + }, + { + "epoch": 1.447435066847282, + "grad_norm": 0.83984375, + "learning_rate": 1.4353538559187562e-05, + "loss": 1.1867, + "step": 4608 + }, + { + "epoch": 1.4480632938728233, + "grad_norm": 0.828125, + "learning_rate": 1.4350999682640431e-05, + "loss": 1.2864, + "step": 4610 + }, + { + "epoch": 1.4486915208983646, + "grad_norm": 0.95703125, + "learning_rate": 1.4348460806093304e-05, + "loss": 1.1464, + "step": 4612 + }, + { + "epoch": 1.449319747923906, + "grad_norm": 0.8046875, + "learning_rate": 1.4345921929546176e-05, + "loss": 1.1811, + "step": 4614 + }, + { + "epoch": 1.4499479749494473, + "grad_norm": 0.796875, + "learning_rate": 1.4343383052999049e-05, + "loss": 1.2498, + "step": 4616 + }, + { + "epoch": 1.4505762019749886, + "grad_norm": 0.7890625, + "learning_rate": 1.434084417645192e-05, + "loss": 1.192, + "step": 4618 + }, + { + "epoch": 1.4512044290005301, + "grad_norm": 0.78515625, + "learning_rate": 1.4338305299904793e-05, + "loss": 1.3238, + "step": 4620 + }, + { + "epoch": 1.4518326560260715, + "grad_norm": 0.85546875, + "learning_rate": 1.4335766423357665e-05, + "loss": 1.1391, + "step": 4622 + }, + { + "epoch": 1.4524608830516128, + "grad_norm": 0.77734375, + "learning_rate": 1.4333227546810538e-05, + "loss": 1.2083, + "step": 4624 + }, + { + "epoch": 1.4530891100771541, + "grad_norm": 0.84375, + "learning_rate": 1.433068867026341e-05, + "loss": 1.2331, + "step": 4626 + }, + { + "epoch": 1.4537173371026955, + "grad_norm": 0.79296875, + "learning_rate": 1.4328149793716282e-05, + "loss": 1.2771, + "step": 4628 + }, + { + "epoch": 1.4543455641282368, + "grad_norm": 0.8359375, + "learning_rate": 1.4325610917169155e-05, + "loss": 1.1523, + "step": 4630 + }, + { + "epoch": 1.4549737911537781, + "grad_norm": 0.81640625, + "learning_rate": 1.4323072040622027e-05, + "loss": 1.3079, + "step": 4632 + }, + { + "epoch": 1.4556020181793197, + "grad_norm": 0.765625, + "learning_rate": 1.43205331640749e-05, + "loss": 1.1524, + "step": 4634 + }, + { + "epoch": 1.456230245204861, + "grad_norm": 0.78125, + "learning_rate": 1.431799428752777e-05, + "loss": 1.1978, + "step": 4636 + }, + { + "epoch": 1.4568584722304023, + "grad_norm": 0.78125, + "learning_rate": 1.4315455410980642e-05, + "loss": 1.2417, + "step": 4638 + }, + { + "epoch": 1.4574866992559437, + "grad_norm": 0.7734375, + "learning_rate": 1.4312916534433514e-05, + "loss": 1.3005, + "step": 4640 + }, + { + "epoch": 1.458114926281485, + "grad_norm": 0.90234375, + "learning_rate": 1.4310377657886387e-05, + "loss": 1.2385, + "step": 4642 + }, + { + "epoch": 1.4587431533070263, + "grad_norm": 0.76171875, + "learning_rate": 1.4307838781339258e-05, + "loss": 1.324, + "step": 4644 + }, + { + "epoch": 1.4593713803325676, + "grad_norm": 0.74609375, + "learning_rate": 1.4305299904792131e-05, + "loss": 1.3922, + "step": 4646 + }, + { + "epoch": 1.459999607358109, + "grad_norm": 0.91015625, + "learning_rate": 1.4302761028245003e-05, + "loss": 1.3316, + "step": 4648 + }, + { + "epoch": 1.4606278343836503, + "grad_norm": 0.88671875, + "learning_rate": 1.4300222151697876e-05, + "loss": 1.2378, + "step": 4650 + }, + { + "epoch": 1.4612560614091916, + "grad_norm": 0.85546875, + "learning_rate": 1.4297683275150747e-05, + "loss": 1.2244, + "step": 4652 + }, + { + "epoch": 1.4618842884347332, + "grad_norm": 0.7578125, + "learning_rate": 1.429514439860362e-05, + "loss": 1.2384, + "step": 4654 + }, + { + "epoch": 1.4625125154602745, + "grad_norm": 0.76953125, + "learning_rate": 1.429260552205649e-05, + "loss": 1.39, + "step": 4656 + }, + { + "epoch": 1.4631407424858158, + "grad_norm": 0.79296875, + "learning_rate": 1.4290066645509365e-05, + "loss": 1.1775, + "step": 4658 + }, + { + "epoch": 1.4637689695113572, + "grad_norm": 0.75390625, + "learning_rate": 1.4287527768962234e-05, + "loss": 1.2629, + "step": 4660 + }, + { + "epoch": 1.4643971965368985, + "grad_norm": 0.90234375, + "learning_rate": 1.4284988892415108e-05, + "loss": 1.1894, + "step": 4662 + }, + { + "epoch": 1.4650254235624398, + "grad_norm": 0.75, + "learning_rate": 1.4282450015867979e-05, + "loss": 1.4307, + "step": 4664 + }, + { + "epoch": 1.4656536505879814, + "grad_norm": 0.7734375, + "learning_rate": 1.4279911139320852e-05, + "loss": 1.2757, + "step": 4666 + }, + { + "epoch": 1.4662818776135227, + "grad_norm": 0.80078125, + "learning_rate": 1.4277372262773723e-05, + "loss": 1.1575, + "step": 4668 + }, + { + "epoch": 1.466910104639064, + "grad_norm": 0.75, + "learning_rate": 1.4274833386226596e-05, + "loss": 1.2476, + "step": 4670 + }, + { + "epoch": 1.4675383316646053, + "grad_norm": 0.7421875, + "learning_rate": 1.4272294509679468e-05, + "loss": 1.1947, + "step": 4672 + }, + { + "epoch": 1.4681665586901467, + "grad_norm": 0.8046875, + "learning_rate": 1.4269755633132341e-05, + "loss": 1.2767, + "step": 4674 + }, + { + "epoch": 1.468794785715688, + "grad_norm": 0.76953125, + "learning_rate": 1.4267216756585212e-05, + "loss": 1.1294, + "step": 4676 + }, + { + "epoch": 1.4694230127412293, + "grad_norm": 0.88671875, + "learning_rate": 1.4264677880038085e-05, + "loss": 1.1821, + "step": 4678 + }, + { + "epoch": 1.4700512397667707, + "grad_norm": 0.80859375, + "learning_rate": 1.4262139003490955e-05, + "loss": 1.1981, + "step": 4680 + }, + { + "epoch": 1.470679466792312, + "grad_norm": 0.8671875, + "learning_rate": 1.4259600126943828e-05, + "loss": 1.2921, + "step": 4682 + }, + { + "epoch": 1.4713076938178533, + "grad_norm": 0.87109375, + "learning_rate": 1.42570612503967e-05, + "loss": 1.2258, + "step": 4684 + }, + { + "epoch": 1.4719359208433949, + "grad_norm": 0.79296875, + "learning_rate": 1.4254522373849573e-05, + "loss": 1.1364, + "step": 4686 + }, + { + "epoch": 1.4725641478689362, + "grad_norm": 0.8671875, + "learning_rate": 1.4251983497302444e-05, + "loss": 1.2497, + "step": 4688 + }, + { + "epoch": 1.4731923748944775, + "grad_norm": 0.796875, + "learning_rate": 1.4249444620755317e-05, + "loss": 1.1485, + "step": 4690 + }, + { + "epoch": 1.4738206019200188, + "grad_norm": 0.8125, + "learning_rate": 1.4246905744208188e-05, + "loss": 1.1257, + "step": 4692 + }, + { + "epoch": 1.4744488289455602, + "grad_norm": 0.7421875, + "learning_rate": 1.4244366867661061e-05, + "loss": 1.2718, + "step": 4694 + }, + { + "epoch": 1.4750770559711015, + "grad_norm": 0.90234375, + "learning_rate": 1.4241827991113933e-05, + "loss": 1.0906, + "step": 4696 + }, + { + "epoch": 1.4757052829966428, + "grad_norm": 0.7578125, + "learning_rate": 1.4239289114566806e-05, + "loss": 1.2542, + "step": 4698 + }, + { + "epoch": 1.4763335100221844, + "grad_norm": 0.8125, + "learning_rate": 1.4236750238019676e-05, + "loss": 1.2242, + "step": 4700 + }, + { + "epoch": 1.4769617370477257, + "grad_norm": 0.78125, + "learning_rate": 1.423421136147255e-05, + "loss": 1.1202, + "step": 4702 + }, + { + "epoch": 1.477589964073267, + "grad_norm": 0.78515625, + "learning_rate": 1.423167248492542e-05, + "loss": 1.2829, + "step": 4704 + }, + { + "epoch": 1.4782181910988084, + "grad_norm": 0.7421875, + "learning_rate": 1.4229133608378293e-05, + "loss": 1.2276, + "step": 4706 + }, + { + "epoch": 1.4788464181243497, + "grad_norm": 0.82421875, + "learning_rate": 1.4226594731831165e-05, + "loss": 1.0959, + "step": 4708 + }, + { + "epoch": 1.479474645149891, + "grad_norm": 0.91796875, + "learning_rate": 1.4224055855284038e-05, + "loss": 1.2402, + "step": 4710 + }, + { + "epoch": 1.4801028721754323, + "grad_norm": 0.73828125, + "learning_rate": 1.422151697873691e-05, + "loss": 1.41, + "step": 4712 + }, + { + "epoch": 1.4807310992009737, + "grad_norm": 0.8515625, + "learning_rate": 1.4218978102189782e-05, + "loss": 1.2751, + "step": 4714 + }, + { + "epoch": 1.481359326226515, + "grad_norm": 0.7734375, + "learning_rate": 1.4216439225642655e-05, + "loss": 1.1848, + "step": 4716 + }, + { + "epoch": 1.4819875532520563, + "grad_norm": 0.85546875, + "learning_rate": 1.4213900349095527e-05, + "loss": 1.1541, + "step": 4718 + }, + { + "epoch": 1.4826157802775979, + "grad_norm": 0.79296875, + "learning_rate": 1.42113614725484e-05, + "loss": 1.248, + "step": 4720 + }, + { + "epoch": 1.4832440073031392, + "grad_norm": 0.84375, + "learning_rate": 1.4208822596001271e-05, + "loss": 1.0783, + "step": 4722 + }, + { + "epoch": 1.4838722343286805, + "grad_norm": 0.82421875, + "learning_rate": 1.4206283719454144e-05, + "loss": 1.3559, + "step": 4724 + }, + { + "epoch": 1.4845004613542219, + "grad_norm": 0.83984375, + "learning_rate": 1.4203744842907014e-05, + "loss": 1.2525, + "step": 4726 + }, + { + "epoch": 1.4851286883797632, + "grad_norm": 0.83203125, + "learning_rate": 1.4201205966359889e-05, + "loss": 1.3513, + "step": 4728 + }, + { + "epoch": 1.4857569154053045, + "grad_norm": 0.84765625, + "learning_rate": 1.4198667089812758e-05, + "loss": 1.1958, + "step": 4730 + }, + { + "epoch": 1.486385142430846, + "grad_norm": 0.83984375, + "learning_rate": 1.4196128213265631e-05, + "loss": 1.2256, + "step": 4732 + }, + { + "epoch": 1.4870133694563874, + "grad_norm": 0.828125, + "learning_rate": 1.4193589336718503e-05, + "loss": 1.2719, + "step": 4734 + }, + { + "epoch": 1.4876415964819287, + "grad_norm": 0.8984375, + "learning_rate": 1.4191050460171376e-05, + "loss": 1.186, + "step": 4736 + }, + { + "epoch": 1.48826982350747, + "grad_norm": 0.78125, + "learning_rate": 1.4188511583624247e-05, + "loss": 1.3374, + "step": 4738 + }, + { + "epoch": 1.4888980505330114, + "grad_norm": 0.90234375, + "learning_rate": 1.418597270707712e-05, + "loss": 1.139, + "step": 4740 + }, + { + "epoch": 1.4895262775585527, + "grad_norm": 0.84375, + "learning_rate": 1.4183433830529992e-05, + "loss": 1.269, + "step": 4742 + }, + { + "epoch": 1.490154504584094, + "grad_norm": 0.796875, + "learning_rate": 1.4180894953982865e-05, + "loss": 1.1428, + "step": 4744 + }, + { + "epoch": 1.4907827316096354, + "grad_norm": 0.73046875, + "learning_rate": 1.4178356077435736e-05, + "loss": 1.304, + "step": 4746 + }, + { + "epoch": 1.4914109586351767, + "grad_norm": 0.81640625, + "learning_rate": 1.4175817200888609e-05, + "loss": 1.4207, + "step": 4748 + }, + { + "epoch": 1.492039185660718, + "grad_norm": 0.7109375, + "learning_rate": 1.4173278324341479e-05, + "loss": 1.2348, + "step": 4750 + }, + { + "epoch": 1.4926674126862596, + "grad_norm": 0.796875, + "learning_rate": 1.4170739447794352e-05, + "loss": 1.2315, + "step": 4752 + }, + { + "epoch": 1.493295639711801, + "grad_norm": 0.78515625, + "learning_rate": 1.4168200571247223e-05, + "loss": 1.408, + "step": 4754 + }, + { + "epoch": 1.4939238667373422, + "grad_norm": 0.796875, + "learning_rate": 1.4165661694700096e-05, + "loss": 1.2126, + "step": 4756 + }, + { + "epoch": 1.4945520937628836, + "grad_norm": 0.80859375, + "learning_rate": 1.4163122818152968e-05, + "loss": 1.3488, + "step": 4758 + }, + { + "epoch": 1.4951803207884249, + "grad_norm": 0.75, + "learning_rate": 1.416058394160584e-05, + "loss": 1.2919, + "step": 4760 + }, + { + "epoch": 1.4958085478139662, + "grad_norm": 0.79296875, + "learning_rate": 1.4158045065058712e-05, + "loss": 1.2386, + "step": 4762 + }, + { + "epoch": 1.4964367748395075, + "grad_norm": 0.93359375, + "learning_rate": 1.4155506188511585e-05, + "loss": 1.1495, + "step": 4764 + }, + { + "epoch": 1.497065001865049, + "grad_norm": 0.80859375, + "learning_rate": 1.4152967311964457e-05, + "loss": 1.0899, + "step": 4766 + }, + { + "epoch": 1.4976932288905904, + "grad_norm": 0.703125, + "learning_rate": 1.415042843541733e-05, + "loss": 1.1979, + "step": 4768 + }, + { + "epoch": 1.4983214559161318, + "grad_norm": 0.8125, + "learning_rate": 1.4147889558870201e-05, + "loss": 1.2938, + "step": 4770 + }, + { + "epoch": 1.498949682941673, + "grad_norm": 1.6875, + "learning_rate": 1.4145350682323074e-05, + "loss": 1.1939, + "step": 4772 + }, + { + "epoch": 1.4995779099672144, + "grad_norm": 0.76953125, + "learning_rate": 1.4142811805775944e-05, + "loss": 1.2171, + "step": 4774 + }, + { + "epoch": 1.5002061369927557, + "grad_norm": 0.8046875, + "learning_rate": 1.4140272929228817e-05, + "loss": 1.202, + "step": 4776 + }, + { + "epoch": 1.500834364018297, + "grad_norm": 0.90234375, + "learning_rate": 1.4137734052681688e-05, + "loss": 1.2284, + "step": 4778 + }, + { + "epoch": 1.5014625910438384, + "grad_norm": 0.75, + "learning_rate": 1.4135195176134561e-05, + "loss": 1.2907, + "step": 4780 + }, + { + "epoch": 1.5020908180693797, + "grad_norm": 0.87109375, + "learning_rate": 1.4132656299587433e-05, + "loss": 1.304, + "step": 4782 + }, + { + "epoch": 1.502719045094921, + "grad_norm": 0.85546875, + "learning_rate": 1.4130117423040306e-05, + "loss": 1.3084, + "step": 4784 + }, + { + "epoch": 1.5033472721204624, + "grad_norm": 0.7890625, + "learning_rate": 1.4127578546493177e-05, + "loss": 1.277, + "step": 4786 + }, + { + "epoch": 1.503975499146004, + "grad_norm": 0.7734375, + "learning_rate": 1.412503966994605e-05, + "loss": 1.2962, + "step": 4788 + }, + { + "epoch": 1.5046037261715453, + "grad_norm": 0.73046875, + "learning_rate": 1.4122500793398922e-05, + "loss": 1.1253, + "step": 4790 + }, + { + "epoch": 1.5052319531970866, + "grad_norm": 0.95703125, + "learning_rate": 1.4119961916851795e-05, + "loss": 1.1104, + "step": 4792 + }, + { + "epoch": 1.505860180222628, + "grad_norm": 0.7890625, + "learning_rate": 1.4117423040304664e-05, + "loss": 1.2353, + "step": 4794 + }, + { + "epoch": 1.5064884072481695, + "grad_norm": 0.81640625, + "learning_rate": 1.411488416375754e-05, + "loss": 1.2573, + "step": 4796 + }, + { + "epoch": 1.5071166342737108, + "grad_norm": 0.85546875, + "learning_rate": 1.4112345287210412e-05, + "loss": 1.2517, + "step": 4798 + }, + { + "epoch": 1.5077448612992521, + "grad_norm": 0.75390625, + "learning_rate": 1.4109806410663282e-05, + "loss": 1.3036, + "step": 4800 + }, + { + "epoch": 1.5083730883247934, + "grad_norm": 0.8046875, + "learning_rate": 1.4107267534116155e-05, + "loss": 1.364, + "step": 4802 + }, + { + "epoch": 1.5090013153503348, + "grad_norm": 0.890625, + "learning_rate": 1.4104728657569026e-05, + "loss": 1.1784, + "step": 4804 + }, + { + "epoch": 1.509629542375876, + "grad_norm": 0.74609375, + "learning_rate": 1.41021897810219e-05, + "loss": 1.1693, + "step": 4806 + }, + { + "epoch": 1.5102577694014174, + "grad_norm": 0.8203125, + "learning_rate": 1.4099650904474771e-05, + "loss": 1.2911, + "step": 4808 + }, + { + "epoch": 1.5108859964269588, + "grad_norm": 0.796875, + "learning_rate": 1.4097112027927644e-05, + "loss": 1.1448, + "step": 4810 + }, + { + "epoch": 1.5115142234525, + "grad_norm": 0.828125, + "learning_rate": 1.4094573151380515e-05, + "loss": 1.3268, + "step": 4812 + }, + { + "epoch": 1.5121424504780414, + "grad_norm": 0.7890625, + "learning_rate": 1.4092034274833388e-05, + "loss": 1.3592, + "step": 4814 + }, + { + "epoch": 1.5127706775035827, + "grad_norm": 0.8125, + "learning_rate": 1.408949539828626e-05, + "loss": 1.2284, + "step": 4816 + }, + { + "epoch": 1.513398904529124, + "grad_norm": 0.82421875, + "learning_rate": 1.4086956521739133e-05, + "loss": 1.3063, + "step": 4818 + }, + { + "epoch": 1.5140271315546656, + "grad_norm": 0.859375, + "learning_rate": 1.4084417645192003e-05, + "loss": 1.291, + "step": 4820 + }, + { + "epoch": 1.514655358580207, + "grad_norm": 0.8515625, + "learning_rate": 1.4081878768644877e-05, + "loss": 1.1867, + "step": 4822 + }, + { + "epoch": 1.5152835856057483, + "grad_norm": 0.7890625, + "learning_rate": 1.4079339892097747e-05, + "loss": 1.1726, + "step": 4824 + }, + { + "epoch": 1.5159118126312896, + "grad_norm": 0.76171875, + "learning_rate": 1.407680101555062e-05, + "loss": 1.2812, + "step": 4826 + }, + { + "epoch": 1.5165400396568312, + "grad_norm": 0.83984375, + "learning_rate": 1.4074262139003492e-05, + "loss": 1.2503, + "step": 4828 + }, + { + "epoch": 1.5171682666823725, + "grad_norm": 0.8125, + "learning_rate": 1.4071723262456365e-05, + "loss": 1.1713, + "step": 4830 + }, + { + "epoch": 1.5177964937079138, + "grad_norm": 0.7578125, + "learning_rate": 1.4069184385909236e-05, + "loss": 1.2758, + "step": 4832 + }, + { + "epoch": 1.5184247207334551, + "grad_norm": 0.796875, + "learning_rate": 1.4066645509362109e-05, + "loss": 1.4859, + "step": 4834 + }, + { + "epoch": 1.5190529477589965, + "grad_norm": 0.8125, + "learning_rate": 1.406410663281498e-05, + "loss": 1.1555, + "step": 4836 + }, + { + "epoch": 1.5196811747845378, + "grad_norm": 0.953125, + "learning_rate": 1.4061567756267854e-05, + "loss": 1.211, + "step": 4838 + }, + { + "epoch": 1.5203094018100791, + "grad_norm": 0.89453125, + "learning_rate": 1.4059028879720725e-05, + "loss": 1.1481, + "step": 4840 + }, + { + "epoch": 1.5209376288356204, + "grad_norm": 0.84765625, + "learning_rate": 1.4056490003173598e-05, + "loss": 1.1373, + "step": 4842 + }, + { + "epoch": 1.5215658558611618, + "grad_norm": 0.7734375, + "learning_rate": 1.4053951126626468e-05, + "loss": 1.2099, + "step": 4844 + }, + { + "epoch": 1.522194082886703, + "grad_norm": 0.8125, + "learning_rate": 1.405141225007934e-05, + "loss": 1.331, + "step": 4846 + }, + { + "epoch": 1.5228223099122444, + "grad_norm": 0.7578125, + "learning_rate": 1.4048873373532212e-05, + "loss": 1.2244, + "step": 4848 + }, + { + "epoch": 1.5234505369377858, + "grad_norm": 0.7109375, + "learning_rate": 1.4046334496985085e-05, + "loss": 1.301, + "step": 4850 + }, + { + "epoch": 1.524078763963327, + "grad_norm": 0.83984375, + "learning_rate": 1.4043795620437957e-05, + "loss": 1.2133, + "step": 4852 + }, + { + "epoch": 1.5247069909888686, + "grad_norm": 0.828125, + "learning_rate": 1.404125674389083e-05, + "loss": 1.1242, + "step": 4854 + }, + { + "epoch": 1.52533521801441, + "grad_norm": 0.84375, + "learning_rate": 1.4038717867343701e-05, + "loss": 1.2627, + "step": 4856 + }, + { + "epoch": 1.5259634450399513, + "grad_norm": 0.71875, + "learning_rate": 1.4036178990796574e-05, + "loss": 1.3087, + "step": 4858 + }, + { + "epoch": 1.5265916720654926, + "grad_norm": 0.85546875, + "learning_rate": 1.4033640114249446e-05, + "loss": 1.1323, + "step": 4860 + }, + { + "epoch": 1.5272198990910342, + "grad_norm": 0.75390625, + "learning_rate": 1.4031101237702319e-05, + "loss": 1.2379, + "step": 4862 + }, + { + "epoch": 1.5278481261165755, + "grad_norm": 0.765625, + "learning_rate": 1.4028562361155188e-05, + "loss": 1.2671, + "step": 4864 + }, + { + "epoch": 1.5284763531421168, + "grad_norm": 0.765625, + "learning_rate": 1.4026023484608063e-05, + "loss": 1.2881, + "step": 4866 + }, + { + "epoch": 1.5291045801676582, + "grad_norm": 0.85546875, + "learning_rate": 1.4023484608060933e-05, + "loss": 1.2102, + "step": 4868 + }, + { + "epoch": 1.5297328071931995, + "grad_norm": 0.828125, + "learning_rate": 1.4020945731513806e-05, + "loss": 1.2949, + "step": 4870 + }, + { + "epoch": 1.5303610342187408, + "grad_norm": 0.7734375, + "learning_rate": 1.4018406854966677e-05, + "loss": 1.2383, + "step": 4872 + }, + { + "epoch": 1.5309892612442821, + "grad_norm": 0.96484375, + "learning_rate": 1.401586797841955e-05, + "loss": 1.1815, + "step": 4874 + }, + { + "epoch": 1.5316174882698235, + "grad_norm": 0.8125, + "learning_rate": 1.4013329101872422e-05, + "loss": 1.2497, + "step": 4876 + }, + { + "epoch": 1.5322457152953648, + "grad_norm": 0.8125, + "learning_rate": 1.4010790225325295e-05, + "loss": 1.1109, + "step": 4878 + }, + { + "epoch": 1.5328739423209061, + "grad_norm": 0.8359375, + "learning_rate": 1.4008251348778166e-05, + "loss": 1.2732, + "step": 4880 + }, + { + "epoch": 1.5335021693464475, + "grad_norm": 0.81640625, + "learning_rate": 1.400571247223104e-05, + "loss": 1.2428, + "step": 4882 + }, + { + "epoch": 1.5341303963719888, + "grad_norm": 0.87890625, + "learning_rate": 1.4003173595683912e-05, + "loss": 1.2501, + "step": 4884 + }, + { + "epoch": 1.5347586233975303, + "grad_norm": 0.83984375, + "learning_rate": 1.4000634719136784e-05, + "loss": 1.2886, + "step": 4886 + }, + { + "epoch": 1.5353868504230717, + "grad_norm": 0.828125, + "learning_rate": 1.3998095842589657e-05, + "loss": 1.2391, + "step": 4888 + }, + { + "epoch": 1.536015077448613, + "grad_norm": 0.828125, + "learning_rate": 1.3995556966042526e-05, + "loss": 1.2547, + "step": 4890 + }, + { + "epoch": 1.5366433044741543, + "grad_norm": 0.7734375, + "learning_rate": 1.3993018089495401e-05, + "loss": 1.2538, + "step": 4892 + }, + { + "epoch": 1.5372715314996959, + "grad_norm": 0.8671875, + "learning_rate": 1.3990479212948271e-05, + "loss": 1.2798, + "step": 4894 + }, + { + "epoch": 1.5378997585252372, + "grad_norm": 0.7734375, + "learning_rate": 1.3987940336401144e-05, + "loss": 1.2126, + "step": 4896 + }, + { + "epoch": 1.5385279855507785, + "grad_norm": 0.79296875, + "learning_rate": 1.3985401459854015e-05, + "loss": 1.2583, + "step": 4898 + }, + { + "epoch": 1.5391562125763198, + "grad_norm": 0.83203125, + "learning_rate": 1.3982862583306888e-05, + "loss": 1.2099, + "step": 4900 + }, + { + "epoch": 1.5397844396018612, + "grad_norm": 0.859375, + "learning_rate": 1.398032370675976e-05, + "loss": 1.3456, + "step": 4902 + }, + { + "epoch": 1.5404126666274025, + "grad_norm": 0.78125, + "learning_rate": 1.3977784830212633e-05, + "loss": 1.2055, + "step": 4904 + }, + { + "epoch": 1.5410408936529438, + "grad_norm": 0.78515625, + "learning_rate": 1.3975245953665504e-05, + "loss": 1.2495, + "step": 4906 + }, + { + "epoch": 1.5416691206784852, + "grad_norm": 0.78125, + "learning_rate": 1.3972707077118377e-05, + "loss": 1.2034, + "step": 4908 + }, + { + "epoch": 1.5422973477040265, + "grad_norm": 0.734375, + "learning_rate": 1.3970168200571249e-05, + "loss": 1.2081, + "step": 4910 + }, + { + "epoch": 1.5429255747295678, + "grad_norm": 0.79296875, + "learning_rate": 1.3967629324024122e-05, + "loss": 1.0711, + "step": 4912 + }, + { + "epoch": 1.5435538017551091, + "grad_norm": 0.75390625, + "learning_rate": 1.3965090447476991e-05, + "loss": 1.233, + "step": 4914 + }, + { + "epoch": 1.5441820287806505, + "grad_norm": 0.7578125, + "learning_rate": 1.3962551570929865e-05, + "loss": 1.2307, + "step": 4916 + }, + { + "epoch": 1.5448102558061918, + "grad_norm": 0.7890625, + "learning_rate": 1.3960012694382736e-05, + "loss": 1.3425, + "step": 4918 + }, + { + "epoch": 1.5454384828317334, + "grad_norm": 0.7578125, + "learning_rate": 1.3957473817835609e-05, + "loss": 1.2415, + "step": 4920 + }, + { + "epoch": 1.5460667098572747, + "grad_norm": 0.859375, + "learning_rate": 1.395493494128848e-05, + "loss": 1.1973, + "step": 4922 + }, + { + "epoch": 1.546694936882816, + "grad_norm": 0.71875, + "learning_rate": 1.3952396064741353e-05, + "loss": 1.2275, + "step": 4924 + }, + { + "epoch": 1.5473231639083573, + "grad_norm": 0.8125, + "learning_rate": 1.3949857188194225e-05, + "loss": 1.416, + "step": 4926 + }, + { + "epoch": 1.5479513909338989, + "grad_norm": 0.80859375, + "learning_rate": 1.3947318311647098e-05, + "loss": 1.262, + "step": 4928 + }, + { + "epoch": 1.5485796179594402, + "grad_norm": 0.75, + "learning_rate": 1.394477943509997e-05, + "loss": 1.0942, + "step": 4930 + }, + { + "epoch": 1.5492078449849815, + "grad_norm": 0.8203125, + "learning_rate": 1.3942240558552842e-05, + "loss": 1.2883, + "step": 4932 + }, + { + "epoch": 1.5498360720105229, + "grad_norm": 0.84765625, + "learning_rate": 1.3939701682005714e-05, + "loss": 1.2058, + "step": 4934 + }, + { + "epoch": 1.5504642990360642, + "grad_norm": 0.75390625, + "learning_rate": 1.3937162805458587e-05, + "loss": 1.2756, + "step": 4936 + }, + { + "epoch": 1.5510925260616055, + "grad_norm": 0.8359375, + "learning_rate": 1.3934623928911457e-05, + "loss": 1.2265, + "step": 4938 + }, + { + "epoch": 1.5517207530871469, + "grad_norm": 0.82421875, + "learning_rate": 1.393208505236433e-05, + "loss": 1.0057, + "step": 4940 + }, + { + "epoch": 1.5523489801126882, + "grad_norm": 0.77734375, + "learning_rate": 1.3929546175817201e-05, + "loss": 1.1519, + "step": 4942 + }, + { + "epoch": 1.5529772071382295, + "grad_norm": 0.7734375, + "learning_rate": 1.3927007299270074e-05, + "loss": 1.3281, + "step": 4944 + }, + { + "epoch": 1.5536054341637708, + "grad_norm": 0.74609375, + "learning_rate": 1.3924468422722945e-05, + "loss": 1.267, + "step": 4946 + }, + { + "epoch": 1.5542336611893122, + "grad_norm": 0.8046875, + "learning_rate": 1.3921929546175819e-05, + "loss": 1.2482, + "step": 4948 + }, + { + "epoch": 1.5548618882148535, + "grad_norm": 0.7578125, + "learning_rate": 1.391939066962869e-05, + "loss": 1.3847, + "step": 4950 + }, + { + "epoch": 1.555490115240395, + "grad_norm": 0.7890625, + "learning_rate": 1.3916851793081563e-05, + "loss": 1.4112, + "step": 4952 + }, + { + "epoch": 1.5561183422659364, + "grad_norm": 0.85546875, + "learning_rate": 1.3914312916534434e-05, + "loss": 1.2111, + "step": 4954 + }, + { + "epoch": 1.5567465692914777, + "grad_norm": 0.7734375, + "learning_rate": 1.3911774039987307e-05, + "loss": 1.4313, + "step": 4956 + }, + { + "epoch": 1.557374796317019, + "grad_norm": 0.74609375, + "learning_rate": 1.3909235163440177e-05, + "loss": 1.2122, + "step": 4958 + }, + { + "epoch": 1.5580030233425606, + "grad_norm": 0.7578125, + "learning_rate": 1.3906696286893052e-05, + "loss": 1.2285, + "step": 4960 + }, + { + "epoch": 1.558631250368102, + "grad_norm": 0.890625, + "learning_rate": 1.3904157410345922e-05, + "loss": 1.2074, + "step": 4962 + }, + { + "epoch": 1.5592594773936432, + "grad_norm": 0.79296875, + "learning_rate": 1.3901618533798795e-05, + "loss": 1.3096, + "step": 4964 + }, + { + "epoch": 1.5598877044191846, + "grad_norm": 0.859375, + "learning_rate": 1.3899079657251666e-05, + "loss": 1.1488, + "step": 4966 + }, + { + "epoch": 1.560515931444726, + "grad_norm": 0.79296875, + "learning_rate": 1.3896540780704539e-05, + "loss": 1.358, + "step": 4968 + }, + { + "epoch": 1.5611441584702672, + "grad_norm": 0.78515625, + "learning_rate": 1.3894001904157412e-05, + "loss": 1.2134, + "step": 4970 + }, + { + "epoch": 1.5617723854958085, + "grad_norm": 0.7890625, + "learning_rate": 1.3891463027610284e-05, + "loss": 1.3555, + "step": 4972 + }, + { + "epoch": 1.5624006125213499, + "grad_norm": 0.87109375, + "learning_rate": 1.3888924151063157e-05, + "loss": 1.2697, + "step": 4974 + }, + { + "epoch": 1.5630288395468912, + "grad_norm": 0.94140625, + "learning_rate": 1.3886385274516028e-05, + "loss": 1.1357, + "step": 4976 + }, + { + "epoch": 1.5636570665724325, + "grad_norm": 0.8359375, + "learning_rate": 1.3883846397968901e-05, + "loss": 1.1122, + "step": 4978 + }, + { + "epoch": 1.5642852935979739, + "grad_norm": 0.9375, + "learning_rate": 1.3881307521421772e-05, + "loss": 1.3238, + "step": 4980 + }, + { + "epoch": 1.5649135206235152, + "grad_norm": 0.85546875, + "learning_rate": 1.3878768644874646e-05, + "loss": 1.1624, + "step": 4982 + }, + { + "epoch": 1.5655417476490565, + "grad_norm": 0.83203125, + "learning_rate": 1.3876229768327515e-05, + "loss": 1.1995, + "step": 4984 + }, + { + "epoch": 1.566169974674598, + "grad_norm": 0.921875, + "learning_rate": 1.387369089178039e-05, + "loss": 1.433, + "step": 4986 + }, + { + "epoch": 1.5667982017001394, + "grad_norm": 0.828125, + "learning_rate": 1.387115201523326e-05, + "loss": 1.2223, + "step": 4988 + }, + { + "epoch": 1.5674264287256807, + "grad_norm": 0.8203125, + "learning_rate": 1.3868613138686133e-05, + "loss": 1.2335, + "step": 4990 + }, + { + "epoch": 1.568054655751222, + "grad_norm": 0.78515625, + "learning_rate": 1.3866074262139004e-05, + "loss": 1.2387, + "step": 4992 + }, + { + "epoch": 1.5686828827767636, + "grad_norm": 0.74609375, + "learning_rate": 1.3863535385591877e-05, + "loss": 1.2488, + "step": 4994 + }, + { + "epoch": 1.569311109802305, + "grad_norm": 0.765625, + "learning_rate": 1.3860996509044749e-05, + "loss": 1.3808, + "step": 4996 + }, + { + "epoch": 1.5699393368278463, + "grad_norm": 0.73828125, + "learning_rate": 1.3858457632497622e-05, + "loss": 1.4113, + "step": 4998 + }, + { + "epoch": 1.5705675638533876, + "grad_norm": 0.8671875, + "learning_rate": 1.3855918755950493e-05, + "loss": 1.2078, + "step": 5000 + }, + { + "epoch": 1.571195790878929, + "grad_norm": 0.7734375, + "learning_rate": 1.3853379879403366e-05, + "loss": 1.2806, + "step": 5002 + }, + { + "epoch": 1.5718240179044702, + "grad_norm": 0.8515625, + "learning_rate": 1.3850841002856238e-05, + "loss": 1.2592, + "step": 5004 + }, + { + "epoch": 1.5724522449300116, + "grad_norm": 0.76171875, + "learning_rate": 1.384830212630911e-05, + "loss": 1.2786, + "step": 5006 + }, + { + "epoch": 1.573080471955553, + "grad_norm": 0.8515625, + "learning_rate": 1.384576324976198e-05, + "loss": 1.2668, + "step": 5008 + }, + { + "epoch": 1.5737086989810942, + "grad_norm": 0.796875, + "learning_rate": 1.3843224373214853e-05, + "loss": 1.1614, + "step": 5010 + }, + { + "epoch": 1.5743369260066356, + "grad_norm": 0.78515625, + "learning_rate": 1.3840685496667725e-05, + "loss": 1.1552, + "step": 5012 + }, + { + "epoch": 1.5749651530321769, + "grad_norm": 0.79296875, + "learning_rate": 1.3838146620120598e-05, + "loss": 1.295, + "step": 5014 + }, + { + "epoch": 1.5755933800577182, + "grad_norm": 0.8203125, + "learning_rate": 1.383560774357347e-05, + "loss": 1.2958, + "step": 5016 + }, + { + "epoch": 1.5762216070832598, + "grad_norm": 0.73828125, + "learning_rate": 1.3833068867026342e-05, + "loss": 1.3272, + "step": 5018 + }, + { + "epoch": 1.576849834108801, + "grad_norm": 0.8828125, + "learning_rate": 1.3830529990479214e-05, + "loss": 1.2085, + "step": 5020 + }, + { + "epoch": 1.5774780611343424, + "grad_norm": 0.82421875, + "learning_rate": 1.3827991113932087e-05, + "loss": 1.28, + "step": 5022 + }, + { + "epoch": 1.5781062881598837, + "grad_norm": 0.78125, + "learning_rate": 1.3825452237384958e-05, + "loss": 1.2884, + "step": 5024 + }, + { + "epoch": 1.5787345151854253, + "grad_norm": 0.69140625, + "learning_rate": 1.3822913360837831e-05, + "loss": 1.3882, + "step": 5026 + }, + { + "epoch": 1.5793627422109666, + "grad_norm": 0.77734375, + "learning_rate": 1.3820374484290701e-05, + "loss": 1.3267, + "step": 5028 + }, + { + "epoch": 1.579990969236508, + "grad_norm": 0.859375, + "learning_rate": 1.3817835607743576e-05, + "loss": 1.2475, + "step": 5030 + }, + { + "epoch": 1.5806191962620493, + "grad_norm": 0.6953125, + "learning_rate": 1.3815296731196445e-05, + "loss": 1.2973, + "step": 5032 + }, + { + "epoch": 1.5812474232875906, + "grad_norm": 0.80078125, + "learning_rate": 1.3812757854649318e-05, + "loss": 1.1669, + "step": 5034 + }, + { + "epoch": 1.581875650313132, + "grad_norm": 0.78515625, + "learning_rate": 1.381021897810219e-05, + "loss": 1.2783, + "step": 5036 + }, + { + "epoch": 1.5825038773386733, + "grad_norm": 0.765625, + "learning_rate": 1.3807680101555063e-05, + "loss": 1.1044, + "step": 5038 + }, + { + "epoch": 1.5831321043642146, + "grad_norm": 0.71875, + "learning_rate": 1.3805141225007934e-05, + "loss": 1.2039, + "step": 5040 + }, + { + "epoch": 1.583760331389756, + "grad_norm": 0.76171875, + "learning_rate": 1.3802602348460807e-05, + "loss": 1.3272, + "step": 5042 + }, + { + "epoch": 1.5843885584152972, + "grad_norm": 0.859375, + "learning_rate": 1.3800063471913679e-05, + "loss": 1.1171, + "step": 5044 + }, + { + "epoch": 1.5850167854408386, + "grad_norm": 0.83984375, + "learning_rate": 1.3797524595366552e-05, + "loss": 1.085, + "step": 5046 + }, + { + "epoch": 1.58564501246638, + "grad_norm": 1.0703125, + "learning_rate": 1.3794985718819423e-05, + "loss": 1.1656, + "step": 5048 + }, + { + "epoch": 1.5862732394919212, + "grad_norm": 0.8203125, + "learning_rate": 1.3792446842272296e-05, + "loss": 1.2341, + "step": 5050 + }, + { + "epoch": 1.5869014665174628, + "grad_norm": 0.8203125, + "learning_rate": 1.3789907965725166e-05, + "loss": 1.2506, + "step": 5052 + }, + { + "epoch": 1.587529693543004, + "grad_norm": 0.78125, + "learning_rate": 1.3787369089178039e-05, + "loss": 1.2872, + "step": 5054 + }, + { + "epoch": 1.5881579205685454, + "grad_norm": 0.81640625, + "learning_rate": 1.3784830212630914e-05, + "loss": 1.1942, + "step": 5056 + }, + { + "epoch": 1.5887861475940868, + "grad_norm": 0.75390625, + "learning_rate": 1.3782291336083783e-05, + "loss": 1.2892, + "step": 5058 + }, + { + "epoch": 1.5894143746196283, + "grad_norm": 0.796875, + "learning_rate": 1.3779752459536657e-05, + "loss": 1.2254, + "step": 5060 + }, + { + "epoch": 1.5900426016451696, + "grad_norm": 0.765625, + "learning_rate": 1.3777213582989528e-05, + "loss": 1.2721, + "step": 5062 + }, + { + "epoch": 1.590670828670711, + "grad_norm": 0.78515625, + "learning_rate": 1.3774674706442401e-05, + "loss": 1.2791, + "step": 5064 + }, + { + "epoch": 1.5912990556962523, + "grad_norm": 0.76953125, + "learning_rate": 1.3772135829895272e-05, + "loss": 1.2474, + "step": 5066 + }, + { + "epoch": 1.5919272827217936, + "grad_norm": 0.796875, + "learning_rate": 1.3769596953348145e-05, + "loss": 1.2996, + "step": 5068 + }, + { + "epoch": 1.592555509747335, + "grad_norm": 0.796875, + "learning_rate": 1.3767058076801017e-05, + "loss": 1.1624, + "step": 5070 + }, + { + "epoch": 1.5931837367728763, + "grad_norm": 0.79296875, + "learning_rate": 1.376451920025389e-05, + "loss": 1.1621, + "step": 5072 + }, + { + "epoch": 1.5938119637984176, + "grad_norm": 0.8203125, + "learning_rate": 1.3761980323706761e-05, + "loss": 1.1595, + "step": 5074 + }, + { + "epoch": 1.594440190823959, + "grad_norm": 0.75, + "learning_rate": 1.3759441447159634e-05, + "loss": 1.2585, + "step": 5076 + }, + { + "epoch": 1.5950684178495003, + "grad_norm": 0.859375, + "learning_rate": 1.3756902570612504e-05, + "loss": 1.1875, + "step": 5078 + }, + { + "epoch": 1.5956966448750416, + "grad_norm": 0.8359375, + "learning_rate": 1.3754363694065377e-05, + "loss": 1.286, + "step": 5080 + }, + { + "epoch": 1.596324871900583, + "grad_norm": 0.7421875, + "learning_rate": 1.3751824817518249e-05, + "loss": 1.313, + "step": 5082 + }, + { + "epoch": 1.5969530989261245, + "grad_norm": 0.8984375, + "learning_rate": 1.3749285940971122e-05, + "loss": 1.2761, + "step": 5084 + }, + { + "epoch": 1.5975813259516658, + "grad_norm": 0.82421875, + "learning_rate": 1.3746747064423993e-05, + "loss": 1.2133, + "step": 5086 + }, + { + "epoch": 1.5982095529772071, + "grad_norm": 0.7890625, + "learning_rate": 1.3744208187876866e-05, + "loss": 1.2465, + "step": 5088 + }, + { + "epoch": 1.5988377800027485, + "grad_norm": 0.76171875, + "learning_rate": 1.3741669311329737e-05, + "loss": 1.3734, + "step": 5090 + }, + { + "epoch": 1.59946600702829, + "grad_norm": 0.80859375, + "learning_rate": 1.373913043478261e-05, + "loss": 1.4389, + "step": 5092 + }, + { + "epoch": 1.6000942340538313, + "grad_norm": 0.8125, + "learning_rate": 1.3736591558235482e-05, + "loss": 1.2194, + "step": 5094 + }, + { + "epoch": 1.6007224610793727, + "grad_norm": 0.86328125, + "learning_rate": 1.3734052681688355e-05, + "loss": 1.1488, + "step": 5096 + }, + { + "epoch": 1.601350688104914, + "grad_norm": 0.87109375, + "learning_rate": 1.3731513805141226e-05, + "loss": 1.324, + "step": 5098 + }, + { + "epoch": 1.6019789151304553, + "grad_norm": 0.8125, + "learning_rate": 1.37289749285941e-05, + "loss": 1.2289, + "step": 5100 + }, + { + "epoch": 1.6026071421559966, + "grad_norm": 0.76953125, + "learning_rate": 1.3726436052046969e-05, + "loss": 1.1883, + "step": 5102 + }, + { + "epoch": 1.603235369181538, + "grad_norm": 0.83203125, + "learning_rate": 1.3723897175499842e-05, + "loss": 1.2215, + "step": 5104 + }, + { + "epoch": 1.6038635962070793, + "grad_norm": 0.81640625, + "learning_rate": 1.3721358298952714e-05, + "loss": 1.2423, + "step": 5106 + }, + { + "epoch": 1.6044918232326206, + "grad_norm": 0.7890625, + "learning_rate": 1.3718819422405587e-05, + "loss": 1.1615, + "step": 5108 + }, + { + "epoch": 1.605120050258162, + "grad_norm": 0.7578125, + "learning_rate": 1.3716280545858458e-05, + "loss": 1.1432, + "step": 5110 + }, + { + "epoch": 1.6057482772837033, + "grad_norm": 0.8203125, + "learning_rate": 1.3713741669311331e-05, + "loss": 1.3432, + "step": 5112 + }, + { + "epoch": 1.6063765043092446, + "grad_norm": 0.8828125, + "learning_rate": 1.3711202792764203e-05, + "loss": 1.2572, + "step": 5114 + }, + { + "epoch": 1.6070047313347862, + "grad_norm": 0.83984375, + "learning_rate": 1.3708663916217076e-05, + "loss": 1.141, + "step": 5116 + }, + { + "epoch": 1.6076329583603275, + "grad_norm": 0.80078125, + "learning_rate": 1.3706125039669947e-05, + "loss": 1.2724, + "step": 5118 + }, + { + "epoch": 1.6082611853858688, + "grad_norm": 0.98046875, + "learning_rate": 1.370358616312282e-05, + "loss": 1.2636, + "step": 5120 + }, + { + "epoch": 1.6088894124114101, + "grad_norm": 0.89453125, + "learning_rate": 1.370104728657569e-05, + "loss": 1.2074, + "step": 5122 + }, + { + "epoch": 1.6095176394369515, + "grad_norm": 0.81640625, + "learning_rate": 1.3698508410028565e-05, + "loss": 1.3499, + "step": 5124 + }, + { + "epoch": 1.610145866462493, + "grad_norm": 0.76953125, + "learning_rate": 1.3695969533481434e-05, + "loss": 1.3866, + "step": 5126 + }, + { + "epoch": 1.6107740934880344, + "grad_norm": 0.796875, + "learning_rate": 1.3693430656934307e-05, + "loss": 1.2968, + "step": 5128 + }, + { + "epoch": 1.6114023205135757, + "grad_norm": 0.75, + "learning_rate": 1.3690891780387179e-05, + "loss": 1.4032, + "step": 5130 + }, + { + "epoch": 1.612030547539117, + "grad_norm": 0.73046875, + "learning_rate": 1.3688352903840052e-05, + "loss": 1.3089, + "step": 5132 + }, + { + "epoch": 1.6126587745646583, + "grad_norm": 0.7734375, + "learning_rate": 1.3685814027292923e-05, + "loss": 1.332, + "step": 5134 + }, + { + "epoch": 1.6132870015901997, + "grad_norm": 0.79296875, + "learning_rate": 1.3683275150745796e-05, + "loss": 1.1898, + "step": 5136 + }, + { + "epoch": 1.613915228615741, + "grad_norm": 0.87890625, + "learning_rate": 1.3680736274198668e-05, + "loss": 1.2405, + "step": 5138 + }, + { + "epoch": 1.6145434556412823, + "grad_norm": 0.77734375, + "learning_rate": 1.367819739765154e-05, + "loss": 1.2174, + "step": 5140 + }, + { + "epoch": 1.6151716826668236, + "grad_norm": 0.75, + "learning_rate": 1.3675658521104414e-05, + "loss": 1.0467, + "step": 5142 + }, + { + "epoch": 1.615799909692365, + "grad_norm": 0.765625, + "learning_rate": 1.3673119644557285e-05, + "loss": 1.2141, + "step": 5144 + }, + { + "epoch": 1.6164281367179063, + "grad_norm": 0.81640625, + "learning_rate": 1.3670580768010158e-05, + "loss": 1.362, + "step": 5146 + }, + { + "epoch": 1.6170563637434476, + "grad_norm": 0.80078125, + "learning_rate": 1.3668041891463028e-05, + "loss": 1.239, + "step": 5148 + }, + { + "epoch": 1.6176845907689892, + "grad_norm": 0.890625, + "learning_rate": 1.3665503014915903e-05, + "loss": 1.3551, + "step": 5150 + }, + { + "epoch": 1.6183128177945305, + "grad_norm": 0.89453125, + "learning_rate": 1.3662964138368772e-05, + "loss": 1.3027, + "step": 5152 + }, + { + "epoch": 1.6189410448200718, + "grad_norm": 0.70703125, + "learning_rate": 1.3660425261821645e-05, + "loss": 1.262, + "step": 5154 + }, + { + "epoch": 1.6195692718456132, + "grad_norm": 0.81640625, + "learning_rate": 1.3657886385274517e-05, + "loss": 1.3053, + "step": 5156 + }, + { + "epoch": 1.6201974988711547, + "grad_norm": 0.74609375, + "learning_rate": 1.365534750872739e-05, + "loss": 1.3218, + "step": 5158 + }, + { + "epoch": 1.620825725896696, + "grad_norm": 0.765625, + "learning_rate": 1.3652808632180261e-05, + "loss": 1.2845, + "step": 5160 + }, + { + "epoch": 1.6214539529222374, + "grad_norm": 0.953125, + "learning_rate": 1.3650269755633134e-05, + "loss": 1.1991, + "step": 5162 + }, + { + "epoch": 1.6220821799477787, + "grad_norm": 0.734375, + "learning_rate": 1.3647730879086006e-05, + "loss": 1.3062, + "step": 5164 + }, + { + "epoch": 1.62271040697332, + "grad_norm": 0.80859375, + "learning_rate": 1.3645192002538879e-05, + "loss": 1.0548, + "step": 5166 + }, + { + "epoch": 1.6233386339988614, + "grad_norm": 0.80078125, + "learning_rate": 1.364265312599175e-05, + "loss": 1.1821, + "step": 5168 + }, + { + "epoch": 1.6239668610244027, + "grad_norm": 0.78125, + "learning_rate": 1.3640114249444623e-05, + "loss": 1.1822, + "step": 5170 + }, + { + "epoch": 1.624595088049944, + "grad_norm": 0.90234375, + "learning_rate": 1.3637575372897493e-05, + "loss": 1.2382, + "step": 5172 + }, + { + "epoch": 1.6252233150754853, + "grad_norm": 0.828125, + "learning_rate": 1.3635036496350366e-05, + "loss": 1.2857, + "step": 5174 + }, + { + "epoch": 1.6258515421010267, + "grad_norm": 0.87890625, + "learning_rate": 1.3632497619803237e-05, + "loss": 1.0598, + "step": 5176 + }, + { + "epoch": 1.626479769126568, + "grad_norm": 0.765625, + "learning_rate": 1.362995874325611e-05, + "loss": 1.2989, + "step": 5178 + }, + { + "epoch": 1.6271079961521093, + "grad_norm": 0.84375, + "learning_rate": 1.3627419866708982e-05, + "loss": 1.2732, + "step": 5180 + }, + { + "epoch": 1.6277362231776509, + "grad_norm": 0.77734375, + "learning_rate": 1.3624880990161855e-05, + "loss": 1.1808, + "step": 5182 + }, + { + "epoch": 1.6283644502031922, + "grad_norm": 0.79296875, + "learning_rate": 1.3622342113614726e-05, + "loss": 1.2011, + "step": 5184 + }, + { + "epoch": 1.6289926772287335, + "grad_norm": 0.7734375, + "learning_rate": 1.36198032370676e-05, + "loss": 1.2063, + "step": 5186 + }, + { + "epoch": 1.6296209042542749, + "grad_norm": 0.91015625, + "learning_rate": 1.361726436052047e-05, + "loss": 1.243, + "step": 5188 + }, + { + "epoch": 1.6302491312798162, + "grad_norm": 0.90625, + "learning_rate": 1.3614725483973344e-05, + "loss": 1.3291, + "step": 5190 + }, + { + "epoch": 1.6308773583053577, + "grad_norm": 0.76953125, + "learning_rate": 1.3612186607426214e-05, + "loss": 1.2388, + "step": 5192 + }, + { + "epoch": 1.631505585330899, + "grad_norm": 0.8203125, + "learning_rate": 1.3609647730879088e-05, + "loss": 1.2186, + "step": 5194 + }, + { + "epoch": 1.6321338123564404, + "grad_norm": 0.83203125, + "learning_rate": 1.3607108854331958e-05, + "loss": 1.3375, + "step": 5196 + }, + { + "epoch": 1.6327620393819817, + "grad_norm": 0.7890625, + "learning_rate": 1.3604569977784831e-05, + "loss": 1.1837, + "step": 5198 + }, + { + "epoch": 1.633390266407523, + "grad_norm": 0.87890625, + "learning_rate": 1.3602031101237702e-05, + "loss": 1.1972, + "step": 5200 + }, + { + "epoch": 1.6340184934330644, + "grad_norm": 0.80859375, + "learning_rate": 1.3599492224690576e-05, + "loss": 1.303, + "step": 5202 + }, + { + "epoch": 1.6346467204586057, + "grad_norm": 1.015625, + "learning_rate": 1.3596953348143447e-05, + "loss": 1.2091, + "step": 5204 + }, + { + "epoch": 1.635274947484147, + "grad_norm": 0.7734375, + "learning_rate": 1.359441447159632e-05, + "loss": 1.2598, + "step": 5206 + }, + { + "epoch": 1.6359031745096884, + "grad_norm": 0.8671875, + "learning_rate": 1.3591875595049191e-05, + "loss": 1.0553, + "step": 5208 + }, + { + "epoch": 1.6365314015352297, + "grad_norm": 0.8203125, + "learning_rate": 1.3589336718502064e-05, + "loss": 1.2889, + "step": 5210 + }, + { + "epoch": 1.637159628560771, + "grad_norm": 1.2109375, + "learning_rate": 1.3586797841954936e-05, + "loss": 1.1413, + "step": 5212 + }, + { + "epoch": 1.6377878555863123, + "grad_norm": 0.828125, + "learning_rate": 1.3584258965407809e-05, + "loss": 1.1636, + "step": 5214 + }, + { + "epoch": 1.638416082611854, + "grad_norm": 1.0625, + "learning_rate": 1.3581720088860679e-05, + "loss": 1.276, + "step": 5216 + }, + { + "epoch": 1.6390443096373952, + "grad_norm": 0.80859375, + "learning_rate": 1.3579181212313552e-05, + "loss": 1.305, + "step": 5218 + }, + { + "epoch": 1.6396725366629366, + "grad_norm": 0.7734375, + "learning_rate": 1.3576642335766423e-05, + "loss": 1.201, + "step": 5220 + }, + { + "epoch": 1.6403007636884779, + "grad_norm": 0.80078125, + "learning_rate": 1.3574103459219296e-05, + "loss": 1.4048, + "step": 5222 + }, + { + "epoch": 1.6409289907140194, + "grad_norm": 0.80078125, + "learning_rate": 1.3571564582672168e-05, + "loss": 1.2899, + "step": 5224 + }, + { + "epoch": 1.6415572177395608, + "grad_norm": 0.765625, + "learning_rate": 1.356902570612504e-05, + "loss": 1.2957, + "step": 5226 + }, + { + "epoch": 1.642185444765102, + "grad_norm": 0.7734375, + "learning_rate": 1.3566486829577914e-05, + "loss": 1.1528, + "step": 5228 + }, + { + "epoch": 1.6428136717906434, + "grad_norm": 0.79296875, + "learning_rate": 1.3563947953030785e-05, + "loss": 1.3055, + "step": 5230 + }, + { + "epoch": 1.6434418988161847, + "grad_norm": 0.953125, + "learning_rate": 1.3561409076483658e-05, + "loss": 1.0612, + "step": 5232 + }, + { + "epoch": 1.644070125841726, + "grad_norm": 0.80078125, + "learning_rate": 1.355887019993653e-05, + "loss": 1.1704, + "step": 5234 + }, + { + "epoch": 1.6446983528672674, + "grad_norm": 0.89453125, + "learning_rate": 1.3556331323389403e-05, + "loss": 1.1888, + "step": 5236 + }, + { + "epoch": 1.6453265798928087, + "grad_norm": 0.75390625, + "learning_rate": 1.3553792446842274e-05, + "loss": 1.2332, + "step": 5238 + }, + { + "epoch": 1.64595480691835, + "grad_norm": 0.74609375, + "learning_rate": 1.3551253570295147e-05, + "loss": 1.2469, + "step": 5240 + }, + { + "epoch": 1.6465830339438914, + "grad_norm": 0.80859375, + "learning_rate": 1.3548714693748017e-05, + "loss": 1.2556, + "step": 5242 + }, + { + "epoch": 1.6472112609694327, + "grad_norm": 0.80859375, + "learning_rate": 1.354617581720089e-05, + "loss": 1.2354, + "step": 5244 + }, + { + "epoch": 1.647839487994974, + "grad_norm": 0.75390625, + "learning_rate": 1.3543636940653761e-05, + "loss": 1.4014, + "step": 5246 + }, + { + "epoch": 1.6484677150205156, + "grad_norm": 0.75, + "learning_rate": 1.3541098064106634e-05, + "loss": 1.3373, + "step": 5248 + }, + { + "epoch": 1.649095942046057, + "grad_norm": 0.91015625, + "learning_rate": 1.3538559187559506e-05, + "loss": 1.3094, + "step": 5250 + }, + { + "epoch": 1.6497241690715982, + "grad_norm": 0.765625, + "learning_rate": 1.3536020311012379e-05, + "loss": 1.2154, + "step": 5252 + }, + { + "epoch": 1.6503523960971396, + "grad_norm": 0.7890625, + "learning_rate": 1.353348143446525e-05, + "loss": 1.1919, + "step": 5254 + }, + { + "epoch": 1.6509806231226811, + "grad_norm": 0.80859375, + "learning_rate": 1.3530942557918123e-05, + "loss": 1.2142, + "step": 5256 + }, + { + "epoch": 1.6516088501482225, + "grad_norm": 0.78125, + "learning_rate": 1.3528403681370995e-05, + "loss": 1.2202, + "step": 5258 + }, + { + "epoch": 1.6522370771737638, + "grad_norm": 0.78515625, + "learning_rate": 1.3525864804823868e-05, + "loss": 1.3794, + "step": 5260 + }, + { + "epoch": 1.652865304199305, + "grad_norm": 0.765625, + "learning_rate": 1.3523325928276739e-05, + "loss": 1.1983, + "step": 5262 + }, + { + "epoch": 1.6534935312248464, + "grad_norm": 0.8046875, + "learning_rate": 1.3520787051729612e-05, + "loss": 1.3406, + "step": 5264 + }, + { + "epoch": 1.6541217582503878, + "grad_norm": 0.96484375, + "learning_rate": 1.3518248175182482e-05, + "loss": 1.1838, + "step": 5266 + }, + { + "epoch": 1.654749985275929, + "grad_norm": 0.77734375, + "learning_rate": 1.3515709298635355e-05, + "loss": 1.3651, + "step": 5268 + }, + { + "epoch": 1.6553782123014704, + "grad_norm": 0.703125, + "learning_rate": 1.3513170422088226e-05, + "loss": 1.2919, + "step": 5270 + }, + { + "epoch": 1.6560064393270117, + "grad_norm": 0.71875, + "learning_rate": 1.35106315455411e-05, + "loss": 1.3154, + "step": 5272 + }, + { + "epoch": 1.656634666352553, + "grad_norm": 0.8125, + "learning_rate": 1.350809266899397e-05, + "loss": 1.2643, + "step": 5274 + }, + { + "epoch": 1.6572628933780944, + "grad_norm": 0.77734375, + "learning_rate": 1.3505553792446844e-05, + "loss": 1.3239, + "step": 5276 + }, + { + "epoch": 1.6578911204036357, + "grad_norm": 0.9140625, + "learning_rate": 1.3503014915899715e-05, + "loss": 1.2137, + "step": 5278 + }, + { + "epoch": 1.658519347429177, + "grad_norm": 0.82421875, + "learning_rate": 1.3500476039352588e-05, + "loss": 1.2215, + "step": 5280 + }, + { + "epoch": 1.6591475744547186, + "grad_norm": 0.953125, + "learning_rate": 1.349793716280546e-05, + "loss": 1.1606, + "step": 5282 + }, + { + "epoch": 1.65977580148026, + "grad_norm": 0.7734375, + "learning_rate": 1.3495398286258333e-05, + "loss": 1.2496, + "step": 5284 + }, + { + "epoch": 1.6604040285058013, + "grad_norm": 0.73828125, + "learning_rate": 1.3492859409711202e-05, + "loss": 1.1227, + "step": 5286 + }, + { + "epoch": 1.6610322555313426, + "grad_norm": 0.75, + "learning_rate": 1.3490320533164077e-05, + "loss": 1.2112, + "step": 5288 + }, + { + "epoch": 1.6616604825568841, + "grad_norm": 0.79296875, + "learning_rate": 1.3487781656616947e-05, + "loss": 1.206, + "step": 5290 + }, + { + "epoch": 1.6622887095824255, + "grad_norm": 0.90625, + "learning_rate": 1.348524278006982e-05, + "loss": 1.2691, + "step": 5292 + }, + { + "epoch": 1.6629169366079668, + "grad_norm": 0.81640625, + "learning_rate": 1.3482703903522691e-05, + "loss": 1.2359, + "step": 5294 + }, + { + "epoch": 1.6635451636335081, + "grad_norm": 0.765625, + "learning_rate": 1.3480165026975564e-05, + "loss": 1.2491, + "step": 5296 + }, + { + "epoch": 1.6641733906590495, + "grad_norm": 0.8984375, + "learning_rate": 1.3477626150428436e-05, + "loss": 1.1253, + "step": 5298 + }, + { + "epoch": 1.6648016176845908, + "grad_norm": 0.79296875, + "learning_rate": 1.3475087273881309e-05, + "loss": 1.2954, + "step": 5300 + }, + { + "epoch": 1.6654298447101321, + "grad_norm": 0.88671875, + "learning_rate": 1.347254839733418e-05, + "loss": 1.178, + "step": 5302 + }, + { + "epoch": 1.6660580717356734, + "grad_norm": 0.81640625, + "learning_rate": 1.3470009520787053e-05, + "loss": 1.2116, + "step": 5304 + }, + { + "epoch": 1.6666862987612148, + "grad_norm": 0.90234375, + "learning_rate": 1.3467470644239925e-05, + "loss": 1.2565, + "step": 5306 + }, + { + "epoch": 1.667314525786756, + "grad_norm": 0.8984375, + "learning_rate": 1.3464931767692798e-05, + "loss": 1.0026, + "step": 5308 + }, + { + "epoch": 1.6679427528122974, + "grad_norm": 0.83203125, + "learning_rate": 1.3462392891145667e-05, + "loss": 1.292, + "step": 5310 + }, + { + "epoch": 1.6685709798378388, + "grad_norm": 0.86328125, + "learning_rate": 1.345985401459854e-05, + "loss": 1.104, + "step": 5312 + }, + { + "epoch": 1.6691992068633803, + "grad_norm": 0.828125, + "learning_rate": 1.3457315138051414e-05, + "loss": 1.2361, + "step": 5314 + }, + { + "epoch": 1.6698274338889216, + "grad_norm": 0.90234375, + "learning_rate": 1.3454776261504285e-05, + "loss": 1.2665, + "step": 5316 + }, + { + "epoch": 1.670455660914463, + "grad_norm": 0.8125, + "learning_rate": 1.3452237384957158e-05, + "loss": 1.2824, + "step": 5318 + }, + { + "epoch": 1.6710838879400043, + "grad_norm": 0.8515625, + "learning_rate": 1.344969850841003e-05, + "loss": 1.1185, + "step": 5320 + }, + { + "epoch": 1.6717121149655458, + "grad_norm": 0.77734375, + "learning_rate": 1.3447159631862903e-05, + "loss": 1.2738, + "step": 5322 + }, + { + "epoch": 1.6723403419910872, + "grad_norm": 0.77734375, + "learning_rate": 1.3444620755315774e-05, + "loss": 1.2713, + "step": 5324 + }, + { + "epoch": 1.6729685690166285, + "grad_norm": 0.91015625, + "learning_rate": 1.3442081878768647e-05, + "loss": 1.1093, + "step": 5326 + }, + { + "epoch": 1.6735967960421698, + "grad_norm": 0.86328125, + "learning_rate": 1.3439543002221518e-05, + "loss": 1.3263, + "step": 5328 + }, + { + "epoch": 1.6742250230677111, + "grad_norm": 0.76953125, + "learning_rate": 1.3437004125674391e-05, + "loss": 1.3777, + "step": 5330 + }, + { + "epoch": 1.6748532500932525, + "grad_norm": 1.0546875, + "learning_rate": 1.3434465249127263e-05, + "loss": 1.1151, + "step": 5332 + }, + { + "epoch": 1.6754814771187938, + "grad_norm": 0.77734375, + "learning_rate": 1.3431926372580136e-05, + "loss": 1.5733, + "step": 5334 + }, + { + "epoch": 1.6761097041443351, + "grad_norm": 0.796875, + "learning_rate": 1.3429387496033006e-05, + "loss": 1.1432, + "step": 5336 + }, + { + "epoch": 1.6767379311698765, + "grad_norm": 0.8671875, + "learning_rate": 1.3426848619485879e-05, + "loss": 1.3853, + "step": 5338 + }, + { + "epoch": 1.6773661581954178, + "grad_norm": 0.78125, + "learning_rate": 1.342430974293875e-05, + "loss": 1.3414, + "step": 5340 + }, + { + "epoch": 1.6779943852209591, + "grad_norm": 0.85546875, + "learning_rate": 1.3421770866391623e-05, + "loss": 1.2539, + "step": 5342 + }, + { + "epoch": 1.6786226122465004, + "grad_norm": 0.99609375, + "learning_rate": 1.3419231989844494e-05, + "loss": 1.2596, + "step": 5344 + }, + { + "epoch": 1.6792508392720418, + "grad_norm": 0.890625, + "learning_rate": 1.3416693113297368e-05, + "loss": 1.1853, + "step": 5346 + }, + { + "epoch": 1.6798790662975833, + "grad_norm": 0.76953125, + "learning_rate": 1.3414154236750239e-05, + "loss": 1.3223, + "step": 5348 + }, + { + "epoch": 1.6805072933231247, + "grad_norm": 0.83203125, + "learning_rate": 1.3411615360203112e-05, + "loss": 1.3358, + "step": 5350 + }, + { + "epoch": 1.681135520348666, + "grad_norm": 0.796875, + "learning_rate": 1.3409076483655983e-05, + "loss": 1.1666, + "step": 5352 + }, + { + "epoch": 1.6817637473742073, + "grad_norm": 0.9453125, + "learning_rate": 1.3406537607108856e-05, + "loss": 1.2604, + "step": 5354 + }, + { + "epoch": 1.6823919743997489, + "grad_norm": 0.76953125, + "learning_rate": 1.3403998730561726e-05, + "loss": 1.2772, + "step": 5356 + }, + { + "epoch": 1.6830202014252902, + "grad_norm": 0.87109375, + "learning_rate": 1.3401459854014601e-05, + "loss": 1.2507, + "step": 5358 + }, + { + "epoch": 1.6836484284508315, + "grad_norm": 0.82421875, + "learning_rate": 1.339892097746747e-05, + "loss": 1.1198, + "step": 5360 + }, + { + "epoch": 1.6842766554763728, + "grad_norm": 0.79296875, + "learning_rate": 1.3396382100920344e-05, + "loss": 1.3049, + "step": 5362 + }, + { + "epoch": 1.6849048825019142, + "grad_norm": 0.828125, + "learning_rate": 1.3393843224373215e-05, + "loss": 1.1426, + "step": 5364 + }, + { + "epoch": 1.6855331095274555, + "grad_norm": 0.83984375, + "learning_rate": 1.3391304347826088e-05, + "loss": 1.1442, + "step": 5366 + }, + { + "epoch": 1.6861613365529968, + "grad_norm": 0.8359375, + "learning_rate": 1.338876547127896e-05, + "loss": 1.2436, + "step": 5368 + }, + { + "epoch": 1.6867895635785382, + "grad_norm": 0.7734375, + "learning_rate": 1.3386226594731833e-05, + "loss": 1.2264, + "step": 5370 + }, + { + "epoch": 1.6874177906040795, + "grad_norm": 0.828125, + "learning_rate": 1.3383687718184704e-05, + "loss": 1.2157, + "step": 5372 + }, + { + "epoch": 1.6880460176296208, + "grad_norm": 0.77734375, + "learning_rate": 1.3381148841637577e-05, + "loss": 1.2976, + "step": 5374 + }, + { + "epoch": 1.6886742446551621, + "grad_norm": 0.81640625, + "learning_rate": 1.3378609965090448e-05, + "loss": 1.2154, + "step": 5376 + }, + { + "epoch": 1.6893024716807035, + "grad_norm": 0.7265625, + "learning_rate": 1.3376071088543322e-05, + "loss": 1.1889, + "step": 5378 + }, + { + "epoch": 1.689930698706245, + "grad_norm": 0.83203125, + "learning_rate": 1.3373532211996191e-05, + "loss": 1.2453, + "step": 5380 + }, + { + "epoch": 1.6905589257317863, + "grad_norm": 0.796875, + "learning_rate": 1.3370993335449064e-05, + "loss": 1.1871, + "step": 5382 + }, + { + "epoch": 1.6911871527573277, + "grad_norm": 0.74609375, + "learning_rate": 1.3368454458901936e-05, + "loss": 1.3467, + "step": 5384 + }, + { + "epoch": 1.691815379782869, + "grad_norm": 0.77734375, + "learning_rate": 1.3365915582354809e-05, + "loss": 1.3004, + "step": 5386 + }, + { + "epoch": 1.6924436068084106, + "grad_norm": 0.9140625, + "learning_rate": 1.336337670580768e-05, + "loss": 1.3259, + "step": 5388 + }, + { + "epoch": 1.6930718338339519, + "grad_norm": 0.85546875, + "learning_rate": 1.3360837829260553e-05, + "loss": 1.2515, + "step": 5390 + }, + { + "epoch": 1.6937000608594932, + "grad_norm": 0.80859375, + "learning_rate": 1.3358298952713425e-05, + "loss": 1.2712, + "step": 5392 + }, + { + "epoch": 1.6943282878850345, + "grad_norm": 0.76171875, + "learning_rate": 1.3355760076166298e-05, + "loss": 1.2064, + "step": 5394 + }, + { + "epoch": 1.6949565149105759, + "grad_norm": 0.75, + "learning_rate": 1.3353221199619169e-05, + "loss": 1.3884, + "step": 5396 + }, + { + "epoch": 1.6955847419361172, + "grad_norm": 0.796875, + "learning_rate": 1.3350682323072042e-05, + "loss": 1.24, + "step": 5398 + }, + { + "epoch": 1.6962129689616585, + "grad_norm": 0.77734375, + "learning_rate": 1.3348143446524915e-05, + "loss": 1.2544, + "step": 5400 + }, + { + "epoch": 1.6968411959871998, + "grad_norm": 0.83984375, + "learning_rate": 1.3345604569977787e-05, + "loss": 1.2522, + "step": 5402 + }, + { + "epoch": 1.6974694230127412, + "grad_norm": 0.83984375, + "learning_rate": 1.334306569343066e-05, + "loss": 1.2362, + "step": 5404 + }, + { + "epoch": 1.6980976500382825, + "grad_norm": 0.9375, + "learning_rate": 1.334052681688353e-05, + "loss": 1.3574, + "step": 5406 + }, + { + "epoch": 1.6987258770638238, + "grad_norm": 0.765625, + "learning_rate": 1.3337987940336402e-05, + "loss": 1.274, + "step": 5408 + }, + { + "epoch": 1.6993541040893652, + "grad_norm": 0.87890625, + "learning_rate": 1.3335449063789274e-05, + "loss": 1.2817, + "step": 5410 + }, + { + "epoch": 1.6999823311149065, + "grad_norm": 0.765625, + "learning_rate": 1.3332910187242147e-05, + "loss": 1.3324, + "step": 5412 + }, + { + "epoch": 1.700610558140448, + "grad_norm": 0.76953125, + "learning_rate": 1.3330371310695018e-05, + "loss": 1.2094, + "step": 5414 + }, + { + "epoch": 1.7012387851659894, + "grad_norm": 0.8203125, + "learning_rate": 1.3327832434147891e-05, + "loss": 1.1927, + "step": 5416 + }, + { + "epoch": 1.7018670121915307, + "grad_norm": 0.8203125, + "learning_rate": 1.3325293557600763e-05, + "loss": 1.166, + "step": 5418 + }, + { + "epoch": 1.702495239217072, + "grad_norm": 0.859375, + "learning_rate": 1.3322754681053636e-05, + "loss": 1.3582, + "step": 5420 + }, + { + "epoch": 1.7031234662426136, + "grad_norm": 0.80078125, + "learning_rate": 1.3320215804506507e-05, + "loss": 1.3184, + "step": 5422 + }, + { + "epoch": 1.703751693268155, + "grad_norm": 0.8046875, + "learning_rate": 1.331767692795938e-05, + "loss": 1.2835, + "step": 5424 + }, + { + "epoch": 1.7043799202936962, + "grad_norm": 0.80859375, + "learning_rate": 1.3315138051412252e-05, + "loss": 1.2446, + "step": 5426 + }, + { + "epoch": 1.7050081473192376, + "grad_norm": 0.8359375, + "learning_rate": 1.3312599174865125e-05, + "loss": 1.2357, + "step": 5428 + }, + { + "epoch": 1.7056363743447789, + "grad_norm": 0.921875, + "learning_rate": 1.3310060298317994e-05, + "loss": 1.2273, + "step": 5430 + }, + { + "epoch": 1.7062646013703202, + "grad_norm": 0.8203125, + "learning_rate": 1.3307521421770867e-05, + "loss": 1.2684, + "step": 5432 + }, + { + "epoch": 1.7068928283958615, + "grad_norm": 0.85546875, + "learning_rate": 1.3304982545223739e-05, + "loss": 1.2107, + "step": 5434 + }, + { + "epoch": 1.7075210554214029, + "grad_norm": 0.77734375, + "learning_rate": 1.3302443668676612e-05, + "loss": 1.3441, + "step": 5436 + }, + { + "epoch": 1.7081492824469442, + "grad_norm": 0.69921875, + "learning_rate": 1.3299904792129483e-05, + "loss": 1.2601, + "step": 5438 + }, + { + "epoch": 1.7087775094724855, + "grad_norm": 0.78515625, + "learning_rate": 1.3297365915582356e-05, + "loss": 1.2648, + "step": 5440 + }, + { + "epoch": 1.7094057364980269, + "grad_norm": 0.80078125, + "learning_rate": 1.3294827039035228e-05, + "loss": 1.1744, + "step": 5442 + }, + { + "epoch": 1.7100339635235682, + "grad_norm": 0.8203125, + "learning_rate": 1.3292288162488101e-05, + "loss": 1.0949, + "step": 5444 + }, + { + "epoch": 1.7106621905491097, + "grad_norm": 0.8515625, + "learning_rate": 1.3289749285940972e-05, + "loss": 1.1723, + "step": 5446 + }, + { + "epoch": 1.711290417574651, + "grad_norm": 0.86328125, + "learning_rate": 1.3287210409393845e-05, + "loss": 1.2938, + "step": 5448 + }, + { + "epoch": 1.7119186446001924, + "grad_norm": 0.81640625, + "learning_rate": 1.3284671532846715e-05, + "loss": 1.3799, + "step": 5450 + }, + { + "epoch": 1.7125468716257337, + "grad_norm": 0.77734375, + "learning_rate": 1.3282132656299588e-05, + "loss": 1.0393, + "step": 5452 + }, + { + "epoch": 1.7131750986512753, + "grad_norm": 0.75390625, + "learning_rate": 1.327959377975246e-05, + "loss": 1.0883, + "step": 5454 + }, + { + "epoch": 1.7138033256768166, + "grad_norm": 0.82421875, + "learning_rate": 1.3277054903205333e-05, + "loss": 1.1746, + "step": 5456 + }, + { + "epoch": 1.714431552702358, + "grad_norm": 0.8984375, + "learning_rate": 1.3274516026658204e-05, + "loss": 1.2404, + "step": 5458 + }, + { + "epoch": 1.7150597797278992, + "grad_norm": 0.84765625, + "learning_rate": 1.3271977150111077e-05, + "loss": 1.1545, + "step": 5460 + }, + { + "epoch": 1.7156880067534406, + "grad_norm": 0.8828125, + "learning_rate": 1.3269438273563948e-05, + "loss": 1.3406, + "step": 5462 + }, + { + "epoch": 1.716316233778982, + "grad_norm": 0.8125, + "learning_rate": 1.3266899397016821e-05, + "loss": 1.2526, + "step": 5464 + }, + { + "epoch": 1.7169444608045232, + "grad_norm": 0.8046875, + "learning_rate": 1.3264360520469693e-05, + "loss": 1.2369, + "step": 5466 + }, + { + "epoch": 1.7175726878300646, + "grad_norm": 0.84375, + "learning_rate": 1.3261821643922566e-05, + "loss": 1.2515, + "step": 5468 + }, + { + "epoch": 1.7182009148556059, + "grad_norm": 0.79296875, + "learning_rate": 1.3259282767375437e-05, + "loss": 1.1968, + "step": 5470 + }, + { + "epoch": 1.7188291418811472, + "grad_norm": 0.84375, + "learning_rate": 1.325674389082831e-05, + "loss": 1.2784, + "step": 5472 + }, + { + "epoch": 1.7194573689066885, + "grad_norm": 0.8125, + "learning_rate": 1.325420501428118e-05, + "loss": 1.2446, + "step": 5474 + }, + { + "epoch": 1.7200855959322299, + "grad_norm": 1.2421875, + "learning_rate": 1.3251666137734053e-05, + "loss": 1.1592, + "step": 5476 + }, + { + "epoch": 1.7207138229577712, + "grad_norm": 0.80859375, + "learning_rate": 1.3249127261186925e-05, + "loss": 1.2955, + "step": 5478 + }, + { + "epoch": 1.7213420499833127, + "grad_norm": 0.76171875, + "learning_rate": 1.3246588384639798e-05, + "loss": 1.334, + "step": 5480 + }, + { + "epoch": 1.721970277008854, + "grad_norm": 0.8359375, + "learning_rate": 1.3244049508092669e-05, + "loss": 1.114, + "step": 5482 + }, + { + "epoch": 1.7225985040343954, + "grad_norm": 0.859375, + "learning_rate": 1.3241510631545542e-05, + "loss": 1.0863, + "step": 5484 + }, + { + "epoch": 1.7232267310599367, + "grad_norm": 0.78515625, + "learning_rate": 1.3238971754998415e-05, + "loss": 1.3088, + "step": 5486 + }, + { + "epoch": 1.7238549580854783, + "grad_norm": 0.765625, + "learning_rate": 1.3236432878451287e-05, + "loss": 1.0641, + "step": 5488 + }, + { + "epoch": 1.7244831851110196, + "grad_norm": 0.796875, + "learning_rate": 1.323389400190416e-05, + "loss": 1.3067, + "step": 5490 + }, + { + "epoch": 1.725111412136561, + "grad_norm": 0.80859375, + "learning_rate": 1.3231355125357031e-05, + "loss": 1.2372, + "step": 5492 + }, + { + "epoch": 1.7257396391621023, + "grad_norm": 0.90234375, + "learning_rate": 1.3228816248809904e-05, + "loss": 1.0898, + "step": 5494 + }, + { + "epoch": 1.7263678661876436, + "grad_norm": 0.80078125, + "learning_rate": 1.3226277372262775e-05, + "loss": 1.297, + "step": 5496 + }, + { + "epoch": 1.726996093213185, + "grad_norm": 0.72265625, + "learning_rate": 1.3223738495715649e-05, + "loss": 1.2627, + "step": 5498 + }, + { + "epoch": 1.7276243202387263, + "grad_norm": 0.7421875, + "learning_rate": 1.3221199619168518e-05, + "loss": 1.3876, + "step": 5500 + }, + { + "epoch": 1.7282525472642676, + "grad_norm": 0.85546875, + "learning_rate": 1.3218660742621391e-05, + "loss": 1.1941, + "step": 5502 + }, + { + "epoch": 1.728880774289809, + "grad_norm": 0.7578125, + "learning_rate": 1.3216121866074263e-05, + "loss": 1.2344, + "step": 5504 + }, + { + "epoch": 1.7295090013153502, + "grad_norm": 0.86328125, + "learning_rate": 1.3213582989527136e-05, + "loss": 1.1713, + "step": 5506 + }, + { + "epoch": 1.7301372283408916, + "grad_norm": 0.80859375, + "learning_rate": 1.3211044112980007e-05, + "loss": 1.242, + "step": 5508 + }, + { + "epoch": 1.730765455366433, + "grad_norm": 0.74609375, + "learning_rate": 1.320850523643288e-05, + "loss": 1.1983, + "step": 5510 + }, + { + "epoch": 1.7313936823919744, + "grad_norm": 0.80078125, + "learning_rate": 1.3205966359885752e-05, + "loss": 1.2646, + "step": 5512 + }, + { + "epoch": 1.7320219094175158, + "grad_norm": 0.80078125, + "learning_rate": 1.3203427483338625e-05, + "loss": 1.3675, + "step": 5514 + }, + { + "epoch": 1.732650136443057, + "grad_norm": 0.82421875, + "learning_rate": 1.3200888606791496e-05, + "loss": 1.2915, + "step": 5516 + }, + { + "epoch": 1.7332783634685984, + "grad_norm": 0.8046875, + "learning_rate": 1.3198349730244369e-05, + "loss": 1.3231, + "step": 5518 + }, + { + "epoch": 1.73390659049414, + "grad_norm": 0.8203125, + "learning_rate": 1.3195810853697239e-05, + "loss": 1.228, + "step": 5520 + }, + { + "epoch": 1.7345348175196813, + "grad_norm": 0.91796875, + "learning_rate": 1.3193271977150114e-05, + "loss": 1.154, + "step": 5522 + }, + { + "epoch": 1.7351630445452226, + "grad_norm": 0.82421875, + "learning_rate": 1.3190733100602983e-05, + "loss": 1.2426, + "step": 5524 + }, + { + "epoch": 1.735791271570764, + "grad_norm": 0.76953125, + "learning_rate": 1.3188194224055856e-05, + "loss": 1.1921, + "step": 5526 + }, + { + "epoch": 1.7364194985963053, + "grad_norm": 0.90625, + "learning_rate": 1.3185655347508728e-05, + "loss": 1.2466, + "step": 5528 + }, + { + "epoch": 1.7370477256218466, + "grad_norm": 0.7578125, + "learning_rate": 1.31831164709616e-05, + "loss": 1.2553, + "step": 5530 + }, + { + "epoch": 1.737675952647388, + "grad_norm": 0.765625, + "learning_rate": 1.3180577594414472e-05, + "loss": 1.2312, + "step": 5532 + }, + { + "epoch": 1.7383041796729293, + "grad_norm": 0.765625, + "learning_rate": 1.3178038717867345e-05, + "loss": 1.2277, + "step": 5534 + }, + { + "epoch": 1.7389324066984706, + "grad_norm": 0.84375, + "learning_rate": 1.3175499841320217e-05, + "loss": 1.2192, + "step": 5536 + }, + { + "epoch": 1.739560633724012, + "grad_norm": 0.796875, + "learning_rate": 1.317296096477309e-05, + "loss": 1.1536, + "step": 5538 + }, + { + "epoch": 1.7401888607495533, + "grad_norm": 0.890625, + "learning_rate": 1.3170422088225961e-05, + "loss": 1.16, + "step": 5540 + }, + { + "epoch": 1.7408170877750946, + "grad_norm": 0.90625, + "learning_rate": 1.3167883211678834e-05, + "loss": 1.472, + "step": 5542 + }, + { + "epoch": 1.7414453148006361, + "grad_norm": 0.796875, + "learning_rate": 1.3165344335131704e-05, + "loss": 1.3138, + "step": 5544 + }, + { + "epoch": 1.7420735418261775, + "grad_norm": 0.79296875, + "learning_rate": 1.3162805458584577e-05, + "loss": 1.2316, + "step": 5546 + }, + { + "epoch": 1.7427017688517188, + "grad_norm": 0.8359375, + "learning_rate": 1.3160266582037448e-05, + "loss": 1.2371, + "step": 5548 + }, + { + "epoch": 1.7433299958772601, + "grad_norm": 0.8125, + "learning_rate": 1.3157727705490321e-05, + "loss": 1.302, + "step": 5550 + }, + { + "epoch": 1.7439582229028014, + "grad_norm": 0.83203125, + "learning_rate": 1.3155188828943193e-05, + "loss": 1.1977, + "step": 5552 + }, + { + "epoch": 1.744586449928343, + "grad_norm": 0.84765625, + "learning_rate": 1.3152649952396066e-05, + "loss": 1.234, + "step": 5554 + }, + { + "epoch": 1.7452146769538843, + "grad_norm": 0.80859375, + "learning_rate": 1.3150111075848937e-05, + "loss": 1.2929, + "step": 5556 + }, + { + "epoch": 1.7458429039794257, + "grad_norm": 0.77734375, + "learning_rate": 1.314757219930181e-05, + "loss": 1.2919, + "step": 5558 + }, + { + "epoch": 1.746471131004967, + "grad_norm": 0.71875, + "learning_rate": 1.3145033322754682e-05, + "loss": 1.3428, + "step": 5560 + }, + { + "epoch": 1.7470993580305083, + "grad_norm": 0.875, + "learning_rate": 1.3142494446207555e-05, + "loss": 1.1509, + "step": 5562 + }, + { + "epoch": 1.7477275850560496, + "grad_norm": 0.96875, + "learning_rate": 1.3139955569660424e-05, + "loss": 1.1809, + "step": 5564 + }, + { + "epoch": 1.748355812081591, + "grad_norm": 0.78515625, + "learning_rate": 1.31374166931133e-05, + "loss": 1.2189, + "step": 5566 + }, + { + "epoch": 1.7489840391071323, + "grad_norm": 0.85546875, + "learning_rate": 1.3134877816566169e-05, + "loss": 1.1861, + "step": 5568 + }, + { + "epoch": 1.7496122661326736, + "grad_norm": 0.875, + "learning_rate": 1.3132338940019042e-05, + "loss": 1.1364, + "step": 5570 + }, + { + "epoch": 1.750240493158215, + "grad_norm": 0.77734375, + "learning_rate": 1.3129800063471915e-05, + "loss": 1.1878, + "step": 5572 + }, + { + "epoch": 1.7508687201837563, + "grad_norm": 0.76953125, + "learning_rate": 1.3127261186924786e-05, + "loss": 1.1927, + "step": 5574 + }, + { + "epoch": 1.7514969472092976, + "grad_norm": 0.8359375, + "learning_rate": 1.312472231037766e-05, + "loss": 1.1183, + "step": 5576 + }, + { + "epoch": 1.7521251742348392, + "grad_norm": 0.80078125, + "learning_rate": 1.3122183433830531e-05, + "loss": 1.293, + "step": 5578 + }, + { + "epoch": 1.7527534012603805, + "grad_norm": 0.7890625, + "learning_rate": 1.3119644557283404e-05, + "loss": 1.2526, + "step": 5580 + }, + { + "epoch": 1.7533816282859218, + "grad_norm": 0.83203125, + "learning_rate": 1.3117105680736275e-05, + "loss": 1.1872, + "step": 5582 + }, + { + "epoch": 1.7540098553114631, + "grad_norm": 0.88671875, + "learning_rate": 1.3114566804189148e-05, + "loss": 1.2216, + "step": 5584 + }, + { + "epoch": 1.7546380823370047, + "grad_norm": 0.734375, + "learning_rate": 1.311202792764202e-05, + "loss": 1.1968, + "step": 5586 + }, + { + "epoch": 1.755266309362546, + "grad_norm": 0.80078125, + "learning_rate": 1.3109489051094893e-05, + "loss": 1.3127, + "step": 5588 + }, + { + "epoch": 1.7558945363880873, + "grad_norm": 0.81640625, + "learning_rate": 1.3106950174547763e-05, + "loss": 1.143, + "step": 5590 + }, + { + "epoch": 1.7565227634136287, + "grad_norm": 1.0546875, + "learning_rate": 1.3104411298000637e-05, + "loss": 1.2036, + "step": 5592 + }, + { + "epoch": 1.75715099043917, + "grad_norm": 0.875, + "learning_rate": 1.3101872421453507e-05, + "loss": 1.3434, + "step": 5594 + }, + { + "epoch": 1.7577792174647113, + "grad_norm": 0.97265625, + "learning_rate": 1.309933354490638e-05, + "loss": 1.1842, + "step": 5596 + }, + { + "epoch": 1.7584074444902527, + "grad_norm": 0.796875, + "learning_rate": 1.3096794668359251e-05, + "loss": 1.2543, + "step": 5598 + }, + { + "epoch": 1.759035671515794, + "grad_norm": 0.91015625, + "learning_rate": 1.3094255791812125e-05, + "loss": 1.2244, + "step": 5600 + }, + { + "epoch": 1.7596638985413353, + "grad_norm": 0.8046875, + "learning_rate": 1.3091716915264996e-05, + "loss": 1.1647, + "step": 5602 + }, + { + "epoch": 1.7602921255668766, + "grad_norm": 0.79296875, + "learning_rate": 1.3089178038717869e-05, + "loss": 1.3046, + "step": 5604 + }, + { + "epoch": 1.760920352592418, + "grad_norm": 0.765625, + "learning_rate": 1.308663916217074e-05, + "loss": 1.1823, + "step": 5606 + }, + { + "epoch": 1.7615485796179593, + "grad_norm": 0.875, + "learning_rate": 1.3084100285623613e-05, + "loss": 1.3261, + "step": 5608 + }, + { + "epoch": 1.7621768066435008, + "grad_norm": 0.73828125, + "learning_rate": 1.3081561409076485e-05, + "loss": 1.2197, + "step": 5610 + }, + { + "epoch": 1.7628050336690422, + "grad_norm": 0.75, + "learning_rate": 1.3079022532529358e-05, + "loss": 1.3214, + "step": 5612 + }, + { + "epoch": 1.7634332606945835, + "grad_norm": 0.76171875, + "learning_rate": 1.3076483655982228e-05, + "loss": 1.3303, + "step": 5614 + }, + { + "epoch": 1.7640614877201248, + "grad_norm": 0.796875, + "learning_rate": 1.30739447794351e-05, + "loss": 1.2344, + "step": 5616 + }, + { + "epoch": 1.7646897147456662, + "grad_norm": 0.90625, + "learning_rate": 1.3071405902887972e-05, + "loss": 1.2934, + "step": 5618 + }, + { + "epoch": 1.7653179417712077, + "grad_norm": 0.8203125, + "learning_rate": 1.3068867026340845e-05, + "loss": 1.1482, + "step": 5620 + }, + { + "epoch": 1.765946168796749, + "grad_norm": 0.70703125, + "learning_rate": 1.3066328149793717e-05, + "loss": 1.2069, + "step": 5622 + }, + { + "epoch": 1.7665743958222904, + "grad_norm": 0.82421875, + "learning_rate": 1.306378927324659e-05, + "loss": 1.1891, + "step": 5624 + }, + { + "epoch": 1.7672026228478317, + "grad_norm": 0.71484375, + "learning_rate": 1.3061250396699461e-05, + "loss": 1.3099, + "step": 5626 + }, + { + "epoch": 1.767830849873373, + "grad_norm": 0.7578125, + "learning_rate": 1.3058711520152334e-05, + "loss": 1.2631, + "step": 5628 + }, + { + "epoch": 1.7684590768989144, + "grad_norm": 0.8125, + "learning_rate": 1.3056172643605205e-05, + "loss": 1.2516, + "step": 5630 + }, + { + "epoch": 1.7690873039244557, + "grad_norm": 0.8046875, + "learning_rate": 1.3053633767058079e-05, + "loss": 1.2557, + "step": 5632 + }, + { + "epoch": 1.769715530949997, + "grad_norm": 0.92578125, + "learning_rate": 1.305109489051095e-05, + "loss": 1.2499, + "step": 5634 + }, + { + "epoch": 1.7703437579755383, + "grad_norm": 0.765625, + "learning_rate": 1.3048556013963823e-05, + "loss": 1.3002, + "step": 5636 + }, + { + "epoch": 1.7709719850010797, + "grad_norm": 0.82421875, + "learning_rate": 1.3046017137416693e-05, + "loss": 1.2968, + "step": 5638 + }, + { + "epoch": 1.771600212026621, + "grad_norm": 0.8125, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.3062, + "step": 5640 + }, + { + "epoch": 1.7722284390521623, + "grad_norm": 0.765625, + "learning_rate": 1.3040939384322437e-05, + "loss": 1.1898, + "step": 5642 + }, + { + "epoch": 1.7728566660777039, + "grad_norm": 0.8203125, + "learning_rate": 1.303840050777531e-05, + "loss": 1.3012, + "step": 5644 + }, + { + "epoch": 1.7734848931032452, + "grad_norm": 0.82421875, + "learning_rate": 1.3035861631228182e-05, + "loss": 1.1432, + "step": 5646 + }, + { + "epoch": 1.7741131201287865, + "grad_norm": 0.75390625, + "learning_rate": 1.3033322754681055e-05, + "loss": 1.2746, + "step": 5648 + }, + { + "epoch": 1.7747413471543279, + "grad_norm": 0.8359375, + "learning_rate": 1.3030783878133926e-05, + "loss": 1.2069, + "step": 5650 + }, + { + "epoch": 1.7753695741798694, + "grad_norm": 0.8125, + "learning_rate": 1.3028245001586799e-05, + "loss": 1.4756, + "step": 5652 + }, + { + "epoch": 1.7759978012054107, + "grad_norm": 0.79296875, + "learning_rate": 1.302570612503967e-05, + "loss": 1.2794, + "step": 5654 + }, + { + "epoch": 1.776626028230952, + "grad_norm": 0.8125, + "learning_rate": 1.3023167248492544e-05, + "loss": 1.4529, + "step": 5656 + }, + { + "epoch": 1.7772542552564934, + "grad_norm": 0.74609375, + "learning_rate": 1.3020628371945417e-05, + "loss": 1.2728, + "step": 5658 + }, + { + "epoch": 1.7778824822820347, + "grad_norm": 0.796875, + "learning_rate": 1.3018089495398288e-05, + "loss": 1.2532, + "step": 5660 + }, + { + "epoch": 1.778510709307576, + "grad_norm": 0.76171875, + "learning_rate": 1.3015550618851161e-05, + "loss": 1.1706, + "step": 5662 + }, + { + "epoch": 1.7791389363331174, + "grad_norm": 0.78125, + "learning_rate": 1.301301174230403e-05, + "loss": 1.3033, + "step": 5664 + }, + { + "epoch": 1.7797671633586587, + "grad_norm": 0.74609375, + "learning_rate": 1.3010472865756904e-05, + "loss": 1.2695, + "step": 5666 + }, + { + "epoch": 1.7803953903842, + "grad_norm": 0.84375, + "learning_rate": 1.3007933989209775e-05, + "loss": 1.1627, + "step": 5668 + }, + { + "epoch": 1.7810236174097414, + "grad_norm": 0.87109375, + "learning_rate": 1.3005395112662648e-05, + "loss": 1.2485, + "step": 5670 + }, + { + "epoch": 1.7816518444352827, + "grad_norm": 0.77734375, + "learning_rate": 1.300285623611552e-05, + "loss": 1.2374, + "step": 5672 + }, + { + "epoch": 1.782280071460824, + "grad_norm": 0.76953125, + "learning_rate": 1.3000317359568393e-05, + "loss": 1.302, + "step": 5674 + }, + { + "epoch": 1.7829082984863656, + "grad_norm": 0.8828125, + "learning_rate": 1.2997778483021264e-05, + "loss": 1.3329, + "step": 5676 + }, + { + "epoch": 1.783536525511907, + "grad_norm": 0.77734375, + "learning_rate": 1.2995239606474137e-05, + "loss": 1.2451, + "step": 5678 + }, + { + "epoch": 1.7841647525374482, + "grad_norm": 0.81640625, + "learning_rate": 1.2992700729927009e-05, + "loss": 1.2296, + "step": 5680 + }, + { + "epoch": 1.7847929795629895, + "grad_norm": 0.76171875, + "learning_rate": 1.2990161853379882e-05, + "loss": 1.2764, + "step": 5682 + }, + { + "epoch": 1.785421206588531, + "grad_norm": 0.76953125, + "learning_rate": 1.2987622976832751e-05, + "loss": 1.3022, + "step": 5684 + }, + { + "epoch": 1.7860494336140724, + "grad_norm": 0.78515625, + "learning_rate": 1.2985084100285626e-05, + "loss": 1.1862, + "step": 5686 + }, + { + "epoch": 1.7866776606396138, + "grad_norm": 0.9375, + "learning_rate": 1.2982545223738496e-05, + "loss": 1.097, + "step": 5688 + }, + { + "epoch": 1.787305887665155, + "grad_norm": 0.7421875, + "learning_rate": 1.2980006347191369e-05, + "loss": 1.3138, + "step": 5690 + }, + { + "epoch": 1.7879341146906964, + "grad_norm": 0.73828125, + "learning_rate": 1.297746747064424e-05, + "loss": 1.2888, + "step": 5692 + }, + { + "epoch": 1.7885623417162377, + "grad_norm": 0.8671875, + "learning_rate": 1.2974928594097113e-05, + "loss": 1.3605, + "step": 5694 + }, + { + "epoch": 1.789190568741779, + "grad_norm": 0.7734375, + "learning_rate": 1.2972389717549985e-05, + "loss": 1.2925, + "step": 5696 + }, + { + "epoch": 1.7898187957673204, + "grad_norm": 0.89453125, + "learning_rate": 1.2969850841002858e-05, + "loss": 1.0879, + "step": 5698 + }, + { + "epoch": 1.7904470227928617, + "grad_norm": 0.78515625, + "learning_rate": 1.296731196445573e-05, + "loss": 1.1732, + "step": 5700 + }, + { + "epoch": 1.791075249818403, + "grad_norm": 0.71875, + "learning_rate": 1.2964773087908602e-05, + "loss": 1.2151, + "step": 5702 + }, + { + "epoch": 1.7917034768439444, + "grad_norm": 0.7734375, + "learning_rate": 1.2962234211361474e-05, + "loss": 1.1779, + "step": 5704 + }, + { + "epoch": 1.7923317038694857, + "grad_norm": 0.8125, + "learning_rate": 1.2959695334814347e-05, + "loss": 1.2626, + "step": 5706 + }, + { + "epoch": 1.792959930895027, + "grad_norm": 0.7890625, + "learning_rate": 1.2957156458267216e-05, + "loss": 1.1991, + "step": 5708 + }, + { + "epoch": 1.7935881579205686, + "grad_norm": 0.74609375, + "learning_rate": 1.295461758172009e-05, + "loss": 1.3407, + "step": 5710 + }, + { + "epoch": 1.79421638494611, + "grad_norm": 0.78515625, + "learning_rate": 1.2952078705172961e-05, + "loss": 1.2508, + "step": 5712 + }, + { + "epoch": 1.7948446119716512, + "grad_norm": 0.74609375, + "learning_rate": 1.2949539828625834e-05, + "loss": 1.4018, + "step": 5714 + }, + { + "epoch": 1.7954728389971926, + "grad_norm": 0.76953125, + "learning_rate": 1.2947000952078705e-05, + "loss": 1.1721, + "step": 5716 + }, + { + "epoch": 1.7961010660227341, + "grad_norm": 0.79296875, + "learning_rate": 1.2944462075531578e-05, + "loss": 1.2232, + "step": 5718 + }, + { + "epoch": 1.7967292930482754, + "grad_norm": 0.73828125, + "learning_rate": 1.294192319898445e-05, + "loss": 1.2397, + "step": 5720 + }, + { + "epoch": 1.7973575200738168, + "grad_norm": 0.8203125, + "learning_rate": 1.2939384322437323e-05, + "loss": 1.172, + "step": 5722 + }, + { + "epoch": 1.797985747099358, + "grad_norm": 0.73828125, + "learning_rate": 1.2936845445890194e-05, + "loss": 1.3208, + "step": 5724 + }, + { + "epoch": 1.7986139741248994, + "grad_norm": 0.77734375, + "learning_rate": 1.2934306569343067e-05, + "loss": 1.2101, + "step": 5726 + }, + { + "epoch": 1.7992422011504408, + "grad_norm": 0.84375, + "learning_rate": 1.2931767692795937e-05, + "loss": 1.2136, + "step": 5728 + }, + { + "epoch": 1.799870428175982, + "grad_norm": 0.81640625, + "learning_rate": 1.2929228816248812e-05, + "loss": 1.3315, + "step": 5730 + }, + { + "epoch": 1.8004986552015234, + "grad_norm": 0.82421875, + "learning_rate": 1.2926689939701682e-05, + "loss": 1.1851, + "step": 5732 + }, + { + "epoch": 1.8011268822270647, + "grad_norm": 0.83984375, + "learning_rate": 1.2924151063154555e-05, + "loss": 1.1484, + "step": 5734 + }, + { + "epoch": 1.801755109252606, + "grad_norm": 0.78515625, + "learning_rate": 1.2921612186607426e-05, + "loss": 1.1852, + "step": 5736 + }, + { + "epoch": 1.8023833362781474, + "grad_norm": 0.82421875, + "learning_rate": 1.2919073310060299e-05, + "loss": 1.2899, + "step": 5738 + }, + { + "epoch": 1.8030115633036887, + "grad_norm": 0.84375, + "learning_rate": 1.291653443351317e-05, + "loss": 1.2624, + "step": 5740 + }, + { + "epoch": 1.8036397903292303, + "grad_norm": 0.73828125, + "learning_rate": 1.2913995556966044e-05, + "loss": 1.1763, + "step": 5742 + }, + { + "epoch": 1.8042680173547716, + "grad_norm": 0.828125, + "learning_rate": 1.2911456680418917e-05, + "loss": 1.384, + "step": 5744 + }, + { + "epoch": 1.804896244380313, + "grad_norm": 0.83203125, + "learning_rate": 1.2908917803871788e-05, + "loss": 1.241, + "step": 5746 + }, + { + "epoch": 1.8055244714058543, + "grad_norm": 0.83203125, + "learning_rate": 1.2906378927324661e-05, + "loss": 1.3487, + "step": 5748 + }, + { + "epoch": 1.8061526984313958, + "grad_norm": 0.76953125, + "learning_rate": 1.2903840050777532e-05, + "loss": 1.2097, + "step": 5750 + }, + { + "epoch": 1.8067809254569371, + "grad_norm": 0.72265625, + "learning_rate": 1.2901301174230406e-05, + "loss": 1.2788, + "step": 5752 + }, + { + "epoch": 1.8074091524824785, + "grad_norm": 0.7578125, + "learning_rate": 1.2898762297683275e-05, + "loss": 1.2052, + "step": 5754 + }, + { + "epoch": 1.8080373795080198, + "grad_norm": 0.765625, + "learning_rate": 1.289622342113615e-05, + "loss": 1.3531, + "step": 5756 + }, + { + "epoch": 1.8086656065335611, + "grad_norm": 0.83984375, + "learning_rate": 1.289368454458902e-05, + "loss": 1.2512, + "step": 5758 + }, + { + "epoch": 1.8092938335591024, + "grad_norm": 0.765625, + "learning_rate": 1.2891145668041893e-05, + "loss": 1.2617, + "step": 5760 + }, + { + "epoch": 1.8099220605846438, + "grad_norm": 0.80078125, + "learning_rate": 1.2888606791494764e-05, + "loss": 1.2172, + "step": 5762 + }, + { + "epoch": 1.810550287610185, + "grad_norm": 0.80859375, + "learning_rate": 1.2886067914947637e-05, + "loss": 1.3569, + "step": 5764 + }, + { + "epoch": 1.8111785146357264, + "grad_norm": 0.8828125, + "learning_rate": 1.2883529038400509e-05, + "loss": 1.1964, + "step": 5766 + }, + { + "epoch": 1.8118067416612678, + "grad_norm": 0.85546875, + "learning_rate": 1.2880990161853382e-05, + "loss": 1.1786, + "step": 5768 + }, + { + "epoch": 1.812434968686809, + "grad_norm": 0.7578125, + "learning_rate": 1.2878451285306253e-05, + "loss": 1.2148, + "step": 5770 + }, + { + "epoch": 1.8130631957123504, + "grad_norm": 0.90234375, + "learning_rate": 1.2875912408759126e-05, + "loss": 1.1685, + "step": 5772 + }, + { + "epoch": 1.8136914227378917, + "grad_norm": 0.8046875, + "learning_rate": 1.2873373532211998e-05, + "loss": 1.1826, + "step": 5774 + }, + { + "epoch": 1.8143196497634333, + "grad_norm": 0.8828125, + "learning_rate": 1.287083465566487e-05, + "loss": 1.2594, + "step": 5776 + }, + { + "epoch": 1.8149478767889746, + "grad_norm": 0.828125, + "learning_rate": 1.286829577911774e-05, + "loss": 1.209, + "step": 5778 + }, + { + "epoch": 1.815576103814516, + "grad_norm": 0.83984375, + "learning_rate": 1.2865756902570613e-05, + "loss": 1.2065, + "step": 5780 + }, + { + "epoch": 1.8162043308400573, + "grad_norm": 0.78125, + "learning_rate": 1.2863218026023485e-05, + "loss": 1.1886, + "step": 5782 + }, + { + "epoch": 1.8168325578655988, + "grad_norm": 0.79296875, + "learning_rate": 1.2860679149476358e-05, + "loss": 1.2196, + "step": 5784 + }, + { + "epoch": 1.8174607848911402, + "grad_norm": 0.92578125, + "learning_rate": 1.285814027292923e-05, + "loss": 1.1996, + "step": 5786 + }, + { + "epoch": 1.8180890119166815, + "grad_norm": 0.7578125, + "learning_rate": 1.2855601396382102e-05, + "loss": 1.1933, + "step": 5788 + }, + { + "epoch": 1.8187172389422228, + "grad_norm": 0.80859375, + "learning_rate": 1.2853062519834974e-05, + "loss": 1.2438, + "step": 5790 + }, + { + "epoch": 1.8193454659677641, + "grad_norm": 0.8671875, + "learning_rate": 1.2850523643287847e-05, + "loss": 1.2136, + "step": 5792 + }, + { + "epoch": 1.8199736929933055, + "grad_norm": 0.78125, + "learning_rate": 1.2847984766740718e-05, + "loss": 1.3761, + "step": 5794 + }, + { + "epoch": 1.8206019200188468, + "grad_norm": 0.75390625, + "learning_rate": 1.2845445890193591e-05, + "loss": 1.1976, + "step": 5796 + }, + { + "epoch": 1.8212301470443881, + "grad_norm": 0.78125, + "learning_rate": 1.2842907013646463e-05, + "loss": 1.3462, + "step": 5798 + }, + { + "epoch": 1.8218583740699295, + "grad_norm": 0.875, + "learning_rate": 1.2840368137099336e-05, + "loss": 1.2293, + "step": 5800 + }, + { + "epoch": 1.8224866010954708, + "grad_norm": 0.92578125, + "learning_rate": 1.2837829260552205e-05, + "loss": 1.2233, + "step": 5802 + }, + { + "epoch": 1.823114828121012, + "grad_norm": 0.78125, + "learning_rate": 1.2835290384005078e-05, + "loss": 1.2824, + "step": 5804 + }, + { + "epoch": 1.8237430551465534, + "grad_norm": 0.75390625, + "learning_rate": 1.283275150745795e-05, + "loss": 1.2794, + "step": 5806 + }, + { + "epoch": 1.824371282172095, + "grad_norm": 0.78515625, + "learning_rate": 1.2830212630910823e-05, + "loss": 1.1825, + "step": 5808 + }, + { + "epoch": 1.8249995091976363, + "grad_norm": 0.9296875, + "learning_rate": 1.2827673754363694e-05, + "loss": 1.1168, + "step": 5810 + }, + { + "epoch": 1.8256277362231776, + "grad_norm": 0.83984375, + "learning_rate": 1.2825134877816567e-05, + "loss": 1.2021, + "step": 5812 + }, + { + "epoch": 1.826255963248719, + "grad_norm": 0.81640625, + "learning_rate": 1.2822596001269439e-05, + "loss": 1.3024, + "step": 5814 + }, + { + "epoch": 1.8268841902742605, + "grad_norm": 0.76953125, + "learning_rate": 1.2820057124722312e-05, + "loss": 1.2621, + "step": 5816 + }, + { + "epoch": 1.8275124172998019, + "grad_norm": 0.80078125, + "learning_rate": 1.2817518248175183e-05, + "loss": 1.0934, + "step": 5818 + }, + { + "epoch": 1.8281406443253432, + "grad_norm": 0.7890625, + "learning_rate": 1.2814979371628056e-05, + "loss": 1.3914, + "step": 5820 + }, + { + "epoch": 1.8287688713508845, + "grad_norm": 0.8828125, + "learning_rate": 1.2812440495080926e-05, + "loss": 1.1312, + "step": 5822 + }, + { + "epoch": 1.8293970983764258, + "grad_norm": 0.83203125, + "learning_rate": 1.28099016185338e-05, + "loss": 1.1222, + "step": 5824 + }, + { + "epoch": 1.8300253254019672, + "grad_norm": 0.73828125, + "learning_rate": 1.280736274198667e-05, + "loss": 1.2303, + "step": 5826 + }, + { + "epoch": 1.8306535524275085, + "grad_norm": 0.86328125, + "learning_rate": 1.2804823865439543e-05, + "loss": 1.2426, + "step": 5828 + }, + { + "epoch": 1.8312817794530498, + "grad_norm": 0.77734375, + "learning_rate": 1.2802284988892417e-05, + "loss": 1.1024, + "step": 5830 + }, + { + "epoch": 1.8319100064785911, + "grad_norm": 0.8515625, + "learning_rate": 1.2799746112345288e-05, + "loss": 1.3141, + "step": 5832 + }, + { + "epoch": 1.8325382335041325, + "grad_norm": 0.75390625, + "learning_rate": 1.2797207235798161e-05, + "loss": 1.1605, + "step": 5834 + }, + { + "epoch": 1.8331664605296738, + "grad_norm": 0.81640625, + "learning_rate": 1.2794668359251032e-05, + "loss": 1.1718, + "step": 5836 + }, + { + "epoch": 1.8337946875552151, + "grad_norm": 0.7421875, + "learning_rate": 1.2792129482703905e-05, + "loss": 1.1882, + "step": 5838 + }, + { + "epoch": 1.8344229145807565, + "grad_norm": 0.80859375, + "learning_rate": 1.2789590606156777e-05, + "loss": 1.3604, + "step": 5840 + }, + { + "epoch": 1.835051141606298, + "grad_norm": 0.79296875, + "learning_rate": 1.278705172960965e-05, + "loss": 1.1974, + "step": 5842 + }, + { + "epoch": 1.8356793686318393, + "grad_norm": 0.8828125, + "learning_rate": 1.2784512853062521e-05, + "loss": 1.1672, + "step": 5844 + }, + { + "epoch": 1.8363075956573807, + "grad_norm": 0.80859375, + "learning_rate": 1.2781973976515394e-05, + "loss": 1.2177, + "step": 5846 + }, + { + "epoch": 1.836935822682922, + "grad_norm": 0.91796875, + "learning_rate": 1.2779435099968264e-05, + "loss": 1.2313, + "step": 5848 + }, + { + "epoch": 1.8375640497084635, + "grad_norm": 0.76171875, + "learning_rate": 1.2776896223421139e-05, + "loss": 1.3003, + "step": 5850 + }, + { + "epoch": 1.8381922767340049, + "grad_norm": 0.84375, + "learning_rate": 1.2774357346874009e-05, + "loss": 1.192, + "step": 5852 + }, + { + "epoch": 1.8388205037595462, + "grad_norm": 0.96484375, + "learning_rate": 1.2771818470326882e-05, + "loss": 1.2617, + "step": 5854 + }, + { + "epoch": 1.8394487307850875, + "grad_norm": 0.76171875, + "learning_rate": 1.2769279593779753e-05, + "loss": 1.0411, + "step": 5856 + }, + { + "epoch": 1.8400769578106289, + "grad_norm": 0.76953125, + "learning_rate": 1.2766740717232626e-05, + "loss": 1.2904, + "step": 5858 + }, + { + "epoch": 1.8407051848361702, + "grad_norm": 0.79296875, + "learning_rate": 1.2764201840685497e-05, + "loss": 1.3183, + "step": 5860 + }, + { + "epoch": 1.8413334118617115, + "grad_norm": 0.83984375, + "learning_rate": 1.276166296413837e-05, + "loss": 1.3205, + "step": 5862 + }, + { + "epoch": 1.8419616388872528, + "grad_norm": 0.828125, + "learning_rate": 1.2759124087591242e-05, + "loss": 1.2196, + "step": 5864 + }, + { + "epoch": 1.8425898659127942, + "grad_norm": 0.78515625, + "learning_rate": 1.2756585211044115e-05, + "loss": 1.243, + "step": 5866 + }, + { + "epoch": 1.8432180929383355, + "grad_norm": 0.82421875, + "learning_rate": 1.2754046334496986e-05, + "loss": 1.1317, + "step": 5868 + }, + { + "epoch": 1.8438463199638768, + "grad_norm": 0.82421875, + "learning_rate": 1.275150745794986e-05, + "loss": 1.2339, + "step": 5870 + }, + { + "epoch": 1.8444745469894182, + "grad_norm": 0.77734375, + "learning_rate": 1.2748968581402729e-05, + "loss": 1.1604, + "step": 5872 + }, + { + "epoch": 1.8451027740149597, + "grad_norm": 0.79296875, + "learning_rate": 1.2746429704855602e-05, + "loss": 1.1184, + "step": 5874 + }, + { + "epoch": 1.845731001040501, + "grad_norm": 0.78125, + "learning_rate": 1.2743890828308474e-05, + "loss": 1.2737, + "step": 5876 + }, + { + "epoch": 1.8463592280660424, + "grad_norm": 0.78125, + "learning_rate": 1.2741351951761347e-05, + "loss": 1.2273, + "step": 5878 + }, + { + "epoch": 1.8469874550915837, + "grad_norm": 0.79296875, + "learning_rate": 1.2738813075214218e-05, + "loss": 1.2768, + "step": 5880 + }, + { + "epoch": 1.8476156821171252, + "grad_norm": 0.73828125, + "learning_rate": 1.2736274198667091e-05, + "loss": 1.3388, + "step": 5882 + }, + { + "epoch": 1.8482439091426666, + "grad_norm": 0.85546875, + "learning_rate": 1.2733735322119962e-05, + "loss": 1.2175, + "step": 5884 + }, + { + "epoch": 1.848872136168208, + "grad_norm": 0.75390625, + "learning_rate": 1.2731196445572836e-05, + "loss": 1.2659, + "step": 5886 + }, + { + "epoch": 1.8495003631937492, + "grad_norm": 0.84765625, + "learning_rate": 1.2728657569025707e-05, + "loss": 1.2642, + "step": 5888 + }, + { + "epoch": 1.8501285902192905, + "grad_norm": 0.8203125, + "learning_rate": 1.272611869247858e-05, + "loss": 1.0971, + "step": 5890 + }, + { + "epoch": 1.8507568172448319, + "grad_norm": 0.74609375, + "learning_rate": 1.272357981593145e-05, + "loss": 1.3001, + "step": 5892 + }, + { + "epoch": 1.8513850442703732, + "grad_norm": 0.6953125, + "learning_rate": 1.2721040939384324e-05, + "loss": 1.457, + "step": 5894 + }, + { + "epoch": 1.8520132712959145, + "grad_norm": 0.72265625, + "learning_rate": 1.2718502062837194e-05, + "loss": 1.3597, + "step": 5896 + }, + { + "epoch": 1.8526414983214559, + "grad_norm": 0.89453125, + "learning_rate": 1.2715963186290067e-05, + "loss": 1.2853, + "step": 5898 + }, + { + "epoch": 1.8532697253469972, + "grad_norm": 0.7734375, + "learning_rate": 1.2713424309742939e-05, + "loss": 1.2468, + "step": 5900 + }, + { + "epoch": 1.8538979523725385, + "grad_norm": 0.76953125, + "learning_rate": 1.2710885433195812e-05, + "loss": 1.3252, + "step": 5902 + }, + { + "epoch": 1.8545261793980798, + "grad_norm": 0.7421875, + "learning_rate": 1.2708346556648683e-05, + "loss": 1.3121, + "step": 5904 + }, + { + "epoch": 1.8551544064236212, + "grad_norm": 0.81640625, + "learning_rate": 1.2705807680101556e-05, + "loss": 1.1819, + "step": 5906 + }, + { + "epoch": 1.8557826334491627, + "grad_norm": 0.8359375, + "learning_rate": 1.2703268803554428e-05, + "loss": 1.2418, + "step": 5908 + }, + { + "epoch": 1.856410860474704, + "grad_norm": 0.71875, + "learning_rate": 1.27007299270073e-05, + "loss": 1.3177, + "step": 5910 + }, + { + "epoch": 1.8570390875002454, + "grad_norm": 0.78515625, + "learning_rate": 1.2698191050460172e-05, + "loss": 1.2514, + "step": 5912 + }, + { + "epoch": 1.8576673145257867, + "grad_norm": 0.89453125, + "learning_rate": 1.2695652173913045e-05, + "loss": 1.2886, + "step": 5914 + }, + { + "epoch": 1.8582955415513283, + "grad_norm": 0.82421875, + "learning_rate": 1.2693113297365918e-05, + "loss": 1.3111, + "step": 5916 + }, + { + "epoch": 1.8589237685768696, + "grad_norm": 0.90625, + "learning_rate": 1.2690574420818788e-05, + "loss": 1.1563, + "step": 5918 + }, + { + "epoch": 1.859551995602411, + "grad_norm": 2.40625, + "learning_rate": 1.2688035544271663e-05, + "loss": 1.2238, + "step": 5920 + }, + { + "epoch": 1.8601802226279522, + "grad_norm": 0.8515625, + "learning_rate": 1.2685496667724532e-05, + "loss": 1.2487, + "step": 5922 + }, + { + "epoch": 1.8608084496534936, + "grad_norm": 0.80859375, + "learning_rate": 1.2682957791177405e-05, + "loss": 1.2178, + "step": 5924 + }, + { + "epoch": 1.861436676679035, + "grad_norm": 0.8125, + "learning_rate": 1.2680418914630277e-05, + "loss": 1.1426, + "step": 5926 + }, + { + "epoch": 1.8620649037045762, + "grad_norm": 0.80859375, + "learning_rate": 1.267788003808315e-05, + "loss": 1.2623, + "step": 5928 + }, + { + "epoch": 1.8626931307301176, + "grad_norm": 0.87109375, + "learning_rate": 1.2675341161536021e-05, + "loss": 1.0865, + "step": 5930 + }, + { + "epoch": 1.8633213577556589, + "grad_norm": 0.85546875, + "learning_rate": 1.2672802284988894e-05, + "loss": 1.1582, + "step": 5932 + }, + { + "epoch": 1.8639495847812002, + "grad_norm": 0.84765625, + "learning_rate": 1.2670263408441766e-05, + "loss": 0.9822, + "step": 5934 + }, + { + "epoch": 1.8645778118067415, + "grad_norm": 1.0625, + "learning_rate": 1.2667724531894639e-05, + "loss": 1.1975, + "step": 5936 + }, + { + "epoch": 1.8652060388322829, + "grad_norm": 0.87109375, + "learning_rate": 1.266518565534751e-05, + "loss": 1.1873, + "step": 5938 + }, + { + "epoch": 1.8658342658578244, + "grad_norm": 0.8125, + "learning_rate": 1.2662646778800383e-05, + "loss": 1.2006, + "step": 5940 + }, + { + "epoch": 1.8664624928833657, + "grad_norm": 0.78515625, + "learning_rate": 1.2660107902253253e-05, + "loss": 1.2259, + "step": 5942 + }, + { + "epoch": 1.867090719908907, + "grad_norm": 0.8359375, + "learning_rate": 1.2657569025706126e-05, + "loss": 1.1884, + "step": 5944 + }, + { + "epoch": 1.8677189469344484, + "grad_norm": 0.8671875, + "learning_rate": 1.2655030149158997e-05, + "loss": 1.1981, + "step": 5946 + }, + { + "epoch": 1.86834717395999, + "grad_norm": 0.77734375, + "learning_rate": 1.265249127261187e-05, + "loss": 1.1178, + "step": 5948 + }, + { + "epoch": 1.8689754009855313, + "grad_norm": 0.7578125, + "learning_rate": 1.2649952396064742e-05, + "loss": 1.2778, + "step": 5950 + }, + { + "epoch": 1.8696036280110726, + "grad_norm": 0.78125, + "learning_rate": 1.2647413519517615e-05, + "loss": 1.171, + "step": 5952 + }, + { + "epoch": 1.870231855036614, + "grad_norm": 0.7890625, + "learning_rate": 1.2644874642970486e-05, + "loss": 1.2154, + "step": 5954 + }, + { + "epoch": 1.8708600820621553, + "grad_norm": 0.79296875, + "learning_rate": 1.264233576642336e-05, + "loss": 1.2481, + "step": 5956 + }, + { + "epoch": 1.8714883090876966, + "grad_norm": 0.84375, + "learning_rate": 1.263979688987623e-05, + "loss": 1.2464, + "step": 5958 + }, + { + "epoch": 1.872116536113238, + "grad_norm": 0.76171875, + "learning_rate": 1.2637258013329104e-05, + "loss": 1.1656, + "step": 5960 + }, + { + "epoch": 1.8727447631387792, + "grad_norm": 0.73046875, + "learning_rate": 1.2634719136781975e-05, + "loss": 1.3967, + "step": 5962 + }, + { + "epoch": 1.8733729901643206, + "grad_norm": 0.81640625, + "learning_rate": 1.2632180260234848e-05, + "loss": 1.295, + "step": 5964 + }, + { + "epoch": 1.874001217189862, + "grad_norm": 0.71875, + "learning_rate": 1.2629641383687718e-05, + "loss": 1.1594, + "step": 5966 + }, + { + "epoch": 1.8746294442154032, + "grad_norm": 0.86328125, + "learning_rate": 1.2627102507140591e-05, + "loss": 1.1973, + "step": 5968 + }, + { + "epoch": 1.8752576712409446, + "grad_norm": 0.9765625, + "learning_rate": 1.2624563630593462e-05, + "loss": 1.3587, + "step": 5970 + }, + { + "epoch": 1.8758858982664859, + "grad_norm": 0.80078125, + "learning_rate": 1.2622024754046335e-05, + "loss": 1.3327, + "step": 5972 + }, + { + "epoch": 1.8765141252920274, + "grad_norm": 0.8515625, + "learning_rate": 1.2619485877499207e-05, + "loss": 1.1387, + "step": 5974 + }, + { + "epoch": 1.8771423523175688, + "grad_norm": 0.9140625, + "learning_rate": 1.261694700095208e-05, + "loss": 1.1447, + "step": 5976 + }, + { + "epoch": 1.87777057934311, + "grad_norm": 0.75, + "learning_rate": 1.2614408124404951e-05, + "loss": 1.1872, + "step": 5978 + }, + { + "epoch": 1.8783988063686514, + "grad_norm": 0.7890625, + "learning_rate": 1.2611869247857824e-05, + "loss": 1.2919, + "step": 5980 + }, + { + "epoch": 1.879027033394193, + "grad_norm": 0.78125, + "learning_rate": 1.2609330371310696e-05, + "loss": 1.3053, + "step": 5982 + }, + { + "epoch": 1.8796552604197343, + "grad_norm": 0.8125, + "learning_rate": 1.2606791494763569e-05, + "loss": 1.1493, + "step": 5984 + }, + { + "epoch": 1.8802834874452756, + "grad_norm": 0.765625, + "learning_rate": 1.2604252618216439e-05, + "loss": 1.2792, + "step": 5986 + }, + { + "epoch": 1.880911714470817, + "grad_norm": 0.765625, + "learning_rate": 1.2601713741669313e-05, + "loss": 1.3208, + "step": 5988 + }, + { + "epoch": 1.8815399414963583, + "grad_norm": 0.80859375, + "learning_rate": 1.2599174865122183e-05, + "loss": 1.1665, + "step": 5990 + }, + { + "epoch": 1.8821681685218996, + "grad_norm": 0.73828125, + "learning_rate": 1.2596635988575056e-05, + "loss": 1.2419, + "step": 5992 + }, + { + "epoch": 1.882796395547441, + "grad_norm": 0.83203125, + "learning_rate": 1.2594097112027927e-05, + "loss": 1.4174, + "step": 5994 + }, + { + "epoch": 1.8834246225729823, + "grad_norm": 0.8984375, + "learning_rate": 1.25915582354808e-05, + "loss": 1.2457, + "step": 5996 + }, + { + "epoch": 1.8840528495985236, + "grad_norm": 0.80859375, + "learning_rate": 1.2589019358933672e-05, + "loss": 1.2441, + "step": 5998 + }, + { + "epoch": 1.884681076624065, + "grad_norm": 0.8046875, + "learning_rate": 1.2586480482386545e-05, + "loss": 1.2268, + "step": 6000 + }, + { + "epoch": 1.8853093036496062, + "grad_norm": 0.8828125, + "learning_rate": 1.2583941605839418e-05, + "loss": 1.117, + "step": 6002 + }, + { + "epoch": 1.8859375306751476, + "grad_norm": 0.76953125, + "learning_rate": 1.258140272929229e-05, + "loss": 1.2595, + "step": 6004 + }, + { + "epoch": 1.8865657577006891, + "grad_norm": 1.0078125, + "learning_rate": 1.2578863852745163e-05, + "loss": 1.2623, + "step": 6006 + }, + { + "epoch": 1.8871939847262305, + "grad_norm": 0.8671875, + "learning_rate": 1.2576324976198034e-05, + "loss": 1.2805, + "step": 6008 + }, + { + "epoch": 1.8878222117517718, + "grad_norm": 0.8359375, + "learning_rate": 1.2573786099650907e-05, + "loss": 1.1744, + "step": 6010 + }, + { + "epoch": 1.8884504387773131, + "grad_norm": 0.875, + "learning_rate": 1.2571247223103777e-05, + "loss": 1.3327, + "step": 6012 + }, + { + "epoch": 1.8890786658028547, + "grad_norm": 0.73046875, + "learning_rate": 1.2568708346556651e-05, + "loss": 1.1917, + "step": 6014 + }, + { + "epoch": 1.889706892828396, + "grad_norm": 0.75390625, + "learning_rate": 1.2566169470009521e-05, + "loss": 1.4492, + "step": 6016 + }, + { + "epoch": 1.8903351198539373, + "grad_norm": 0.8515625, + "learning_rate": 1.2563630593462394e-05, + "loss": 1.1507, + "step": 6018 + }, + { + "epoch": 1.8909633468794786, + "grad_norm": 0.80859375, + "learning_rate": 1.2561091716915266e-05, + "loss": 1.2665, + "step": 6020 + }, + { + "epoch": 1.89159157390502, + "grad_norm": 0.8671875, + "learning_rate": 1.2558552840368139e-05, + "loss": 1.2222, + "step": 6022 + }, + { + "epoch": 1.8922198009305613, + "grad_norm": 0.81640625, + "learning_rate": 1.255601396382101e-05, + "loss": 1.2067, + "step": 6024 + }, + { + "epoch": 1.8928480279561026, + "grad_norm": 0.82421875, + "learning_rate": 1.2553475087273883e-05, + "loss": 1.1897, + "step": 6026 + }, + { + "epoch": 1.893476254981644, + "grad_norm": 0.9375, + "learning_rate": 1.2550936210726755e-05, + "loss": 1.1622, + "step": 6028 + }, + { + "epoch": 1.8941044820071853, + "grad_norm": 0.91796875, + "learning_rate": 1.2548397334179628e-05, + "loss": 1.3068, + "step": 6030 + }, + { + "epoch": 1.8947327090327266, + "grad_norm": 0.8984375, + "learning_rate": 1.2545858457632499e-05, + "loss": 1.2051, + "step": 6032 + }, + { + "epoch": 1.895360936058268, + "grad_norm": 0.76171875, + "learning_rate": 1.2543319581085372e-05, + "loss": 1.1956, + "step": 6034 + }, + { + "epoch": 1.8959891630838093, + "grad_norm": 0.8125, + "learning_rate": 1.2540780704538242e-05, + "loss": 1.3253, + "step": 6036 + }, + { + "epoch": 1.8966173901093508, + "grad_norm": 0.83203125, + "learning_rate": 1.2538241827991115e-05, + "loss": 1.2558, + "step": 6038 + }, + { + "epoch": 1.8972456171348921, + "grad_norm": 0.80078125, + "learning_rate": 1.2535702951443986e-05, + "loss": 1.2556, + "step": 6040 + }, + { + "epoch": 1.8978738441604335, + "grad_norm": 0.8125, + "learning_rate": 1.253316407489686e-05, + "loss": 1.1596, + "step": 6042 + }, + { + "epoch": 1.8985020711859748, + "grad_norm": 0.80859375, + "learning_rate": 1.253062519834973e-05, + "loss": 1.2663, + "step": 6044 + }, + { + "epoch": 1.8991302982115161, + "grad_norm": 0.81640625, + "learning_rate": 1.2528086321802604e-05, + "loss": 1.2155, + "step": 6046 + }, + { + "epoch": 1.8997585252370577, + "grad_norm": 0.75390625, + "learning_rate": 1.2525547445255475e-05, + "loss": 1.272, + "step": 6048 + }, + { + "epoch": 1.900386752262599, + "grad_norm": 0.78125, + "learning_rate": 1.2523008568708348e-05, + "loss": 1.2012, + "step": 6050 + }, + { + "epoch": 1.9010149792881403, + "grad_norm": 0.76171875, + "learning_rate": 1.252046969216122e-05, + "loss": 1.2281, + "step": 6052 + }, + { + "epoch": 1.9016432063136817, + "grad_norm": 0.81640625, + "learning_rate": 1.2517930815614093e-05, + "loss": 1.2045, + "step": 6054 + }, + { + "epoch": 1.902271433339223, + "grad_norm": 0.79296875, + "learning_rate": 1.2515391939066962e-05, + "loss": 1.2902, + "step": 6056 + }, + { + "epoch": 1.9028996603647643, + "grad_norm": 0.80078125, + "learning_rate": 1.2512853062519837e-05, + "loss": 1.2146, + "step": 6058 + }, + { + "epoch": 1.9035278873903057, + "grad_norm": 0.83203125, + "learning_rate": 1.2510314185972707e-05, + "loss": 1.1062, + "step": 6060 + }, + { + "epoch": 1.904156114415847, + "grad_norm": 0.83984375, + "learning_rate": 1.250777530942558e-05, + "loss": 1.1466, + "step": 6062 + }, + { + "epoch": 1.9047843414413883, + "grad_norm": 0.7890625, + "learning_rate": 1.2505236432878451e-05, + "loss": 1.2591, + "step": 6064 + }, + { + "epoch": 1.9054125684669296, + "grad_norm": 0.8203125, + "learning_rate": 1.2502697556331324e-05, + "loss": 1.2363, + "step": 6066 + }, + { + "epoch": 1.906040795492471, + "grad_norm": 0.7265625, + "learning_rate": 1.2500158679784196e-05, + "loss": 1.3046, + "step": 6068 + }, + { + "epoch": 1.9066690225180123, + "grad_norm": 0.765625, + "learning_rate": 1.2497619803237069e-05, + "loss": 1.3882, + "step": 6070 + }, + { + "epoch": 1.9072972495435538, + "grad_norm": 0.81640625, + "learning_rate": 1.249508092668994e-05, + "loss": 1.2277, + "step": 6072 + }, + { + "epoch": 1.9079254765690952, + "grad_norm": 0.76171875, + "learning_rate": 1.2492542050142813e-05, + "loss": 1.2135, + "step": 6074 + }, + { + "epoch": 1.9085537035946365, + "grad_norm": 0.80859375, + "learning_rate": 1.2490003173595685e-05, + "loss": 1.171, + "step": 6076 + }, + { + "epoch": 1.9091819306201778, + "grad_norm": 0.8359375, + "learning_rate": 1.2487464297048558e-05, + "loss": 1.2015, + "step": 6078 + }, + { + "epoch": 1.9098101576457194, + "grad_norm": 0.78515625, + "learning_rate": 1.2484925420501427e-05, + "loss": 1.3251, + "step": 6080 + }, + { + "epoch": 1.9104383846712607, + "grad_norm": 0.74609375, + "learning_rate": 1.24823865439543e-05, + "loss": 1.3621, + "step": 6082 + }, + { + "epoch": 1.911066611696802, + "grad_norm": 0.71484375, + "learning_rate": 1.2479847667407172e-05, + "loss": 1.2972, + "step": 6084 + }, + { + "epoch": 1.9116948387223434, + "grad_norm": 0.8671875, + "learning_rate": 1.2477308790860045e-05, + "loss": 1.1877, + "step": 6086 + }, + { + "epoch": 1.9123230657478847, + "grad_norm": 0.78125, + "learning_rate": 1.2474769914312918e-05, + "loss": 1.221, + "step": 6088 + }, + { + "epoch": 1.912951292773426, + "grad_norm": 0.85546875, + "learning_rate": 1.247223103776579e-05, + "loss": 1.3329, + "step": 6090 + }, + { + "epoch": 1.9135795197989673, + "grad_norm": 0.84375, + "learning_rate": 1.2469692161218662e-05, + "loss": 1.2478, + "step": 6092 + }, + { + "epoch": 1.9142077468245087, + "grad_norm": 0.859375, + "learning_rate": 1.2467153284671534e-05, + "loss": 1.2821, + "step": 6094 + }, + { + "epoch": 1.91483597385005, + "grad_norm": 0.765625, + "learning_rate": 1.2464614408124407e-05, + "loss": 1.1848, + "step": 6096 + }, + { + "epoch": 1.9154642008755913, + "grad_norm": 0.81640625, + "learning_rate": 1.2462075531577278e-05, + "loss": 1.2984, + "step": 6098 + }, + { + "epoch": 1.9160924279011327, + "grad_norm": 0.69921875, + "learning_rate": 1.2459536655030151e-05, + "loss": 1.1486, + "step": 6100 + }, + { + "epoch": 1.916720654926674, + "grad_norm": 0.74609375, + "learning_rate": 1.2456997778483023e-05, + "loss": 1.3557, + "step": 6102 + }, + { + "epoch": 1.9173488819522155, + "grad_norm": 0.76953125, + "learning_rate": 1.2454458901935896e-05, + "loss": 1.2475, + "step": 6104 + }, + { + "epoch": 1.9179771089777569, + "grad_norm": 0.83984375, + "learning_rate": 1.2451920025388766e-05, + "loss": 1.1533, + "step": 6106 + }, + { + "epoch": 1.9186053360032982, + "grad_norm": 0.796875, + "learning_rate": 1.2449381148841639e-05, + "loss": 1.2381, + "step": 6108 + }, + { + "epoch": 1.9192335630288395, + "grad_norm": 1.0234375, + "learning_rate": 1.244684227229451e-05, + "loss": 1.2304, + "step": 6110 + }, + { + "epoch": 1.919861790054381, + "grad_norm": 0.75, + "learning_rate": 1.2444303395747383e-05, + "loss": 1.3272, + "step": 6112 + }, + { + "epoch": 1.9204900170799224, + "grad_norm": 0.796875, + "learning_rate": 1.2441764519200254e-05, + "loss": 1.1725, + "step": 6114 + }, + { + "epoch": 1.9211182441054637, + "grad_norm": 0.953125, + "learning_rate": 1.2439225642653128e-05, + "loss": 1.0932, + "step": 6116 + }, + { + "epoch": 1.921746471131005, + "grad_norm": 0.75390625, + "learning_rate": 1.2436686766105999e-05, + "loss": 1.2361, + "step": 6118 + }, + { + "epoch": 1.9223746981565464, + "grad_norm": 0.7421875, + "learning_rate": 1.2434147889558872e-05, + "loss": 1.3047, + "step": 6120 + }, + { + "epoch": 1.9230029251820877, + "grad_norm": 0.86328125, + "learning_rate": 1.2431609013011743e-05, + "loss": 1.3639, + "step": 6122 + }, + { + "epoch": 1.923631152207629, + "grad_norm": 0.9140625, + "learning_rate": 1.2429070136464616e-05, + "loss": 1.1849, + "step": 6124 + }, + { + "epoch": 1.9242593792331704, + "grad_norm": 0.77734375, + "learning_rate": 1.2426531259917488e-05, + "loss": 1.2506, + "step": 6126 + }, + { + "epoch": 1.9248876062587117, + "grad_norm": 0.79296875, + "learning_rate": 1.2423992383370361e-05, + "loss": 1.2748, + "step": 6128 + }, + { + "epoch": 1.925515833284253, + "grad_norm": 0.921875, + "learning_rate": 1.242145350682323e-05, + "loss": 1.3513, + "step": 6130 + }, + { + "epoch": 1.9261440603097943, + "grad_norm": 0.75, + "learning_rate": 1.2418914630276104e-05, + "loss": 1.3052, + "step": 6132 + }, + { + "epoch": 1.9267722873353357, + "grad_norm": 0.796875, + "learning_rate": 1.2416375753728975e-05, + "loss": 1.1691, + "step": 6134 + }, + { + "epoch": 1.927400514360877, + "grad_norm": 0.74609375, + "learning_rate": 1.2413836877181848e-05, + "loss": 1.1503, + "step": 6136 + }, + { + "epoch": 1.9280287413864186, + "grad_norm": 0.85546875, + "learning_rate": 1.241129800063472e-05, + "loss": 1.218, + "step": 6138 + }, + { + "epoch": 1.9286569684119599, + "grad_norm": 0.93359375, + "learning_rate": 1.2408759124087593e-05, + "loss": 1.1987, + "step": 6140 + }, + { + "epoch": 1.9292851954375012, + "grad_norm": 0.76171875, + "learning_rate": 1.2406220247540464e-05, + "loss": 1.1148, + "step": 6142 + }, + { + "epoch": 1.9299134224630425, + "grad_norm": 0.80859375, + "learning_rate": 1.2403681370993337e-05, + "loss": 1.3232, + "step": 6144 + }, + { + "epoch": 1.930541649488584, + "grad_norm": 0.7734375, + "learning_rate": 1.2401142494446208e-05, + "loss": 1.2638, + "step": 6146 + }, + { + "epoch": 1.9311698765141254, + "grad_norm": 0.8359375, + "learning_rate": 1.2398603617899081e-05, + "loss": 1.282, + "step": 6148 + }, + { + "epoch": 1.9317981035396667, + "grad_norm": 0.80859375, + "learning_rate": 1.2396064741351951e-05, + "loss": 1.292, + "step": 6150 + }, + { + "epoch": 1.932426330565208, + "grad_norm": 0.796875, + "learning_rate": 1.2393525864804826e-05, + "loss": 1.3647, + "step": 6152 + }, + { + "epoch": 1.9330545575907494, + "grad_norm": 0.828125, + "learning_rate": 1.2390986988257696e-05, + "loss": 1.1766, + "step": 6154 + }, + { + "epoch": 1.9336827846162907, + "grad_norm": 0.7578125, + "learning_rate": 1.2388448111710569e-05, + "loss": 1.2779, + "step": 6156 + }, + { + "epoch": 1.934311011641832, + "grad_norm": 0.8984375, + "learning_rate": 1.238590923516344e-05, + "loss": 1.2979, + "step": 6158 + }, + { + "epoch": 1.9349392386673734, + "grad_norm": 0.7890625, + "learning_rate": 1.2383370358616313e-05, + "loss": 1.3134, + "step": 6160 + }, + { + "epoch": 1.9355674656929147, + "grad_norm": 0.96875, + "learning_rate": 1.2380831482069185e-05, + "loss": 1.1554, + "step": 6162 + }, + { + "epoch": 1.936195692718456, + "grad_norm": 0.8046875, + "learning_rate": 1.2378292605522058e-05, + "loss": 1.3649, + "step": 6164 + }, + { + "epoch": 1.9368239197439974, + "grad_norm": 0.76953125, + "learning_rate": 1.2375753728974929e-05, + "loss": 1.2077, + "step": 6166 + }, + { + "epoch": 1.9374521467695387, + "grad_norm": 0.84765625, + "learning_rate": 1.2373214852427802e-05, + "loss": 1.288, + "step": 6168 + }, + { + "epoch": 1.9380803737950802, + "grad_norm": 0.85546875, + "learning_rate": 1.2370675975880673e-05, + "loss": 1.3333, + "step": 6170 + }, + { + "epoch": 1.9387086008206216, + "grad_norm": 0.84375, + "learning_rate": 1.2368137099333547e-05, + "loss": 1.345, + "step": 6172 + }, + { + "epoch": 1.939336827846163, + "grad_norm": 0.83203125, + "learning_rate": 1.236559822278642e-05, + "loss": 1.3079, + "step": 6174 + }, + { + "epoch": 1.9399650548717042, + "grad_norm": 0.80859375, + "learning_rate": 1.236305934623929e-05, + "loss": 1.2672, + "step": 6176 + }, + { + "epoch": 1.9405932818972458, + "grad_norm": 0.75390625, + "learning_rate": 1.2360520469692162e-05, + "loss": 1.2107, + "step": 6178 + }, + { + "epoch": 1.941221508922787, + "grad_norm": 0.76171875, + "learning_rate": 1.2357981593145034e-05, + "loss": 1.2125, + "step": 6180 + }, + { + "epoch": 1.9418497359483284, + "grad_norm": 0.77734375, + "learning_rate": 1.2355442716597907e-05, + "loss": 1.2236, + "step": 6182 + }, + { + "epoch": 1.9424779629738698, + "grad_norm": 0.8359375, + "learning_rate": 1.2352903840050778e-05, + "loss": 1.1823, + "step": 6184 + }, + { + "epoch": 1.943106189999411, + "grad_norm": 0.9765625, + "learning_rate": 1.2350364963503651e-05, + "loss": 1.2182, + "step": 6186 + }, + { + "epoch": 1.9437344170249524, + "grad_norm": 0.8125, + "learning_rate": 1.2347826086956523e-05, + "loss": 1.2996, + "step": 6188 + }, + { + "epoch": 1.9443626440504937, + "grad_norm": 0.796875, + "learning_rate": 1.2345287210409396e-05, + "loss": 1.1519, + "step": 6190 + }, + { + "epoch": 1.944990871076035, + "grad_norm": 0.7578125, + "learning_rate": 1.2342748333862267e-05, + "loss": 1.125, + "step": 6192 + }, + { + "epoch": 1.9456190981015764, + "grad_norm": 0.8046875, + "learning_rate": 1.234020945731514e-05, + "loss": 1.1524, + "step": 6194 + }, + { + "epoch": 1.9462473251271177, + "grad_norm": 0.765625, + "learning_rate": 1.2337670580768012e-05, + "loss": 1.2544, + "step": 6196 + }, + { + "epoch": 1.946875552152659, + "grad_norm": 0.8125, + "learning_rate": 1.2335131704220885e-05, + "loss": 1.1555, + "step": 6198 + }, + { + "epoch": 1.9475037791782004, + "grad_norm": 0.75390625, + "learning_rate": 1.2332592827673754e-05, + "loss": 1.2127, + "step": 6200 + }, + { + "epoch": 1.9481320062037417, + "grad_norm": 0.984375, + "learning_rate": 1.2330053951126627e-05, + "loss": 1.2985, + "step": 6202 + }, + { + "epoch": 1.9487602332292833, + "grad_norm": 0.83984375, + "learning_rate": 1.2327515074579499e-05, + "loss": 1.3043, + "step": 6204 + }, + { + "epoch": 1.9493884602548246, + "grad_norm": 0.78515625, + "learning_rate": 1.2324976198032372e-05, + "loss": 1.2911, + "step": 6206 + }, + { + "epoch": 1.950016687280366, + "grad_norm": 0.875, + "learning_rate": 1.2322437321485243e-05, + "loss": 1.2217, + "step": 6208 + }, + { + "epoch": 1.9506449143059073, + "grad_norm": 0.7734375, + "learning_rate": 1.2319898444938116e-05, + "loss": 1.2718, + "step": 6210 + }, + { + "epoch": 1.9512731413314488, + "grad_norm": 0.79296875, + "learning_rate": 1.2317359568390988e-05, + "loss": 1.3247, + "step": 6212 + }, + { + "epoch": 1.9519013683569901, + "grad_norm": 0.7890625, + "learning_rate": 1.231482069184386e-05, + "loss": 1.31, + "step": 6214 + }, + { + "epoch": 1.9525295953825315, + "grad_norm": 0.7265625, + "learning_rate": 1.2312281815296732e-05, + "loss": 1.2022, + "step": 6216 + }, + { + "epoch": 1.9531578224080728, + "grad_norm": 0.78515625, + "learning_rate": 1.2309742938749605e-05, + "loss": 1.1422, + "step": 6218 + }, + { + "epoch": 1.9537860494336141, + "grad_norm": 0.78125, + "learning_rate": 1.2307204062202475e-05, + "loss": 1.1487, + "step": 6220 + }, + { + "epoch": 1.9544142764591554, + "grad_norm": 0.9609375, + "learning_rate": 1.230466518565535e-05, + "loss": 1.2259, + "step": 6222 + }, + { + "epoch": 1.9550425034846968, + "grad_norm": 0.859375, + "learning_rate": 1.230212630910822e-05, + "loss": 1.1845, + "step": 6224 + }, + { + "epoch": 1.955670730510238, + "grad_norm": 0.76953125, + "learning_rate": 1.2299587432561093e-05, + "loss": 1.2533, + "step": 6226 + }, + { + "epoch": 1.9562989575357794, + "grad_norm": 0.78515625, + "learning_rate": 1.2297048556013964e-05, + "loss": 1.2527, + "step": 6228 + }, + { + "epoch": 1.9569271845613208, + "grad_norm": 0.78125, + "learning_rate": 1.2294509679466837e-05, + "loss": 1.2083, + "step": 6230 + }, + { + "epoch": 1.957555411586862, + "grad_norm": 0.75, + "learning_rate": 1.2291970802919708e-05, + "loss": 1.2065, + "step": 6232 + }, + { + "epoch": 1.9581836386124034, + "grad_norm": 0.7578125, + "learning_rate": 1.2289431926372581e-05, + "loss": 1.2186, + "step": 6234 + }, + { + "epoch": 1.958811865637945, + "grad_norm": 0.78125, + "learning_rate": 1.2286893049825453e-05, + "loss": 1.2687, + "step": 6236 + }, + { + "epoch": 1.9594400926634863, + "grad_norm": 0.76171875, + "learning_rate": 1.2284354173278326e-05, + "loss": 1.2311, + "step": 6238 + }, + { + "epoch": 1.9600683196890276, + "grad_norm": 0.87890625, + "learning_rate": 1.2281815296731197e-05, + "loss": 1.1847, + "step": 6240 + }, + { + "epoch": 1.960696546714569, + "grad_norm": 0.7578125, + "learning_rate": 1.227927642018407e-05, + "loss": 1.2003, + "step": 6242 + }, + { + "epoch": 1.9613247737401105, + "grad_norm": 0.7890625, + "learning_rate": 1.227673754363694e-05, + "loss": 1.4039, + "step": 6244 + }, + { + "epoch": 1.9619530007656518, + "grad_norm": 0.828125, + "learning_rate": 1.2274198667089813e-05, + "loss": 1.2274, + "step": 6246 + }, + { + "epoch": 1.9625812277911932, + "grad_norm": 0.82421875, + "learning_rate": 1.2271659790542684e-05, + "loss": 1.2743, + "step": 6248 + }, + { + "epoch": 1.9632094548167345, + "grad_norm": 0.734375, + "learning_rate": 1.2269120913995558e-05, + "loss": 1.414, + "step": 6250 + }, + { + "epoch": 1.9638376818422758, + "grad_norm": 0.90625, + "learning_rate": 1.2266582037448429e-05, + "loss": 1.2246, + "step": 6252 + }, + { + "epoch": 1.9644659088678171, + "grad_norm": 0.73046875, + "learning_rate": 1.2264043160901302e-05, + "loss": 1.2005, + "step": 6254 + }, + { + "epoch": 1.9650941358933585, + "grad_norm": 0.91015625, + "learning_rate": 1.2261504284354175e-05, + "loss": 1.2002, + "step": 6256 + }, + { + "epoch": 1.9657223629188998, + "grad_norm": 0.7421875, + "learning_rate": 1.2258965407807046e-05, + "loss": 1.2792, + "step": 6258 + }, + { + "epoch": 1.9663505899444411, + "grad_norm": 0.796875, + "learning_rate": 1.225642653125992e-05, + "loss": 1.0757, + "step": 6260 + }, + { + "epoch": 1.9669788169699824, + "grad_norm": 0.7890625, + "learning_rate": 1.2253887654712791e-05, + "loss": 1.3089, + "step": 6262 + }, + { + "epoch": 1.9676070439955238, + "grad_norm": 0.828125, + "learning_rate": 1.2251348778165664e-05, + "loss": 1.2696, + "step": 6264 + }, + { + "epoch": 1.968235271021065, + "grad_norm": 0.7890625, + "learning_rate": 1.2248809901618535e-05, + "loss": 1.2951, + "step": 6266 + }, + { + "epoch": 1.9688634980466064, + "grad_norm": 0.74609375, + "learning_rate": 1.2246271025071408e-05, + "loss": 1.232, + "step": 6268 + }, + { + "epoch": 1.969491725072148, + "grad_norm": 0.76171875, + "learning_rate": 1.2243732148524278e-05, + "loss": 1.3751, + "step": 6270 + }, + { + "epoch": 1.9701199520976893, + "grad_norm": 0.83203125, + "learning_rate": 1.2241193271977151e-05, + "loss": 1.3549, + "step": 6272 + }, + { + "epoch": 1.9707481791232306, + "grad_norm": 1.046875, + "learning_rate": 1.2238654395430023e-05, + "loss": 1.2652, + "step": 6274 + }, + { + "epoch": 1.971376406148772, + "grad_norm": 0.80859375, + "learning_rate": 1.2236115518882896e-05, + "loss": 1.2687, + "step": 6276 + }, + { + "epoch": 1.9720046331743135, + "grad_norm": 0.7890625, + "learning_rate": 1.2233576642335767e-05, + "loss": 1.4137, + "step": 6278 + }, + { + "epoch": 1.9726328601998548, + "grad_norm": 0.828125, + "learning_rate": 1.223103776578864e-05, + "loss": 1.1847, + "step": 6280 + }, + { + "epoch": 1.9732610872253962, + "grad_norm": 0.78125, + "learning_rate": 1.2228498889241512e-05, + "loss": 1.3383, + "step": 6282 + }, + { + "epoch": 1.9738893142509375, + "grad_norm": 0.8125, + "learning_rate": 1.2225960012694385e-05, + "loss": 1.2893, + "step": 6284 + }, + { + "epoch": 1.9745175412764788, + "grad_norm": 0.765625, + "learning_rate": 1.2223421136147256e-05, + "loss": 1.2479, + "step": 6286 + }, + { + "epoch": 1.9751457683020202, + "grad_norm": 0.8359375, + "learning_rate": 1.2220882259600129e-05, + "loss": 1.2674, + "step": 6288 + }, + { + "epoch": 1.9757739953275615, + "grad_norm": 0.71484375, + "learning_rate": 1.2218343383053e-05, + "loss": 1.332, + "step": 6290 + }, + { + "epoch": 1.9764022223531028, + "grad_norm": 0.8203125, + "learning_rate": 1.2215804506505874e-05, + "loss": 1.3086, + "step": 6292 + }, + { + "epoch": 1.9770304493786441, + "grad_norm": 0.74609375, + "learning_rate": 1.2213265629958743e-05, + "loss": 1.3607, + "step": 6294 + }, + { + "epoch": 1.9776586764041855, + "grad_norm": 0.78515625, + "learning_rate": 1.2210726753411616e-05, + "loss": 1.2653, + "step": 6296 + }, + { + "epoch": 1.9782869034297268, + "grad_norm": 0.77734375, + "learning_rate": 1.2208187876864488e-05, + "loss": 1.3309, + "step": 6298 + }, + { + "epoch": 1.9789151304552681, + "grad_norm": 0.80859375, + "learning_rate": 1.220564900031736e-05, + "loss": 1.3084, + "step": 6300 + }, + { + "epoch": 1.9795433574808097, + "grad_norm": 0.82421875, + "learning_rate": 1.2203110123770232e-05, + "loss": 1.2319, + "step": 6302 + }, + { + "epoch": 1.980171584506351, + "grad_norm": 0.76171875, + "learning_rate": 1.2200571247223105e-05, + "loss": 1.193, + "step": 6304 + }, + { + "epoch": 1.9807998115318923, + "grad_norm": 0.7890625, + "learning_rate": 1.2198032370675977e-05, + "loss": 1.1819, + "step": 6306 + }, + { + "epoch": 1.9814280385574337, + "grad_norm": 0.75390625, + "learning_rate": 1.219549349412885e-05, + "loss": 1.2694, + "step": 6308 + }, + { + "epoch": 1.9820562655829752, + "grad_norm": 0.796875, + "learning_rate": 1.2192954617581721e-05, + "loss": 1.2051, + "step": 6310 + }, + { + "epoch": 1.9826844926085165, + "grad_norm": 0.8125, + "learning_rate": 1.2190415741034594e-05, + "loss": 1.1432, + "step": 6312 + }, + { + "epoch": 1.9833127196340579, + "grad_norm": 0.859375, + "learning_rate": 1.2187876864487464e-05, + "loss": 1.1966, + "step": 6314 + }, + { + "epoch": 1.9839409466595992, + "grad_norm": 0.73046875, + "learning_rate": 1.2185337987940337e-05, + "loss": 1.1952, + "step": 6316 + }, + { + "epoch": 1.9845691736851405, + "grad_norm": 0.796875, + "learning_rate": 1.2182799111393208e-05, + "loss": 1.2597, + "step": 6318 + }, + { + "epoch": 1.9851974007106818, + "grad_norm": 1.2578125, + "learning_rate": 1.2180260234846081e-05, + "loss": 1.1049, + "step": 6320 + }, + { + "epoch": 1.9858256277362232, + "grad_norm": 0.73828125, + "learning_rate": 1.2177721358298953e-05, + "loss": 1.2622, + "step": 6322 + }, + { + "epoch": 1.9864538547617645, + "grad_norm": 0.75390625, + "learning_rate": 1.2175182481751826e-05, + "loss": 1.2554, + "step": 6324 + }, + { + "epoch": 1.9870820817873058, + "grad_norm": 0.8046875, + "learning_rate": 1.2172643605204697e-05, + "loss": 1.4156, + "step": 6326 + }, + { + "epoch": 1.9877103088128472, + "grad_norm": 0.78125, + "learning_rate": 1.217010472865757e-05, + "loss": 1.2517, + "step": 6328 + }, + { + "epoch": 1.9883385358383885, + "grad_norm": 0.83203125, + "learning_rate": 1.2167565852110442e-05, + "loss": 1.3161, + "step": 6330 + }, + { + "epoch": 1.9889667628639298, + "grad_norm": 0.828125, + "learning_rate": 1.2165026975563315e-05, + "loss": 1.2247, + "step": 6332 + }, + { + "epoch": 1.9895949898894711, + "grad_norm": 0.8203125, + "learning_rate": 1.2162488099016186e-05, + "loss": 1.1215, + "step": 6334 + }, + { + "epoch": 1.9902232169150127, + "grad_norm": 0.8046875, + "learning_rate": 1.215994922246906e-05, + "loss": 1.1817, + "step": 6336 + }, + { + "epoch": 1.990851443940554, + "grad_norm": 0.8515625, + "learning_rate": 1.2157410345921929e-05, + "loss": 1.2248, + "step": 6338 + }, + { + "epoch": 1.9914796709660953, + "grad_norm": 0.7734375, + "learning_rate": 1.2154871469374802e-05, + "loss": 1.3265, + "step": 6340 + }, + { + "epoch": 1.9921078979916367, + "grad_norm": 0.81640625, + "learning_rate": 1.2152332592827675e-05, + "loss": 1.2027, + "step": 6342 + }, + { + "epoch": 1.9927361250171782, + "grad_norm": 1.234375, + "learning_rate": 1.2149793716280546e-05, + "loss": 1.1223, + "step": 6344 + }, + { + "epoch": 1.9933643520427196, + "grad_norm": 0.86328125, + "learning_rate": 1.214725483973342e-05, + "loss": 1.2041, + "step": 6346 + }, + { + "epoch": 1.9939925790682609, + "grad_norm": 0.86328125, + "learning_rate": 1.2144715963186291e-05, + "loss": 1.2209, + "step": 6348 + }, + { + "epoch": 1.9946208060938022, + "grad_norm": 0.83984375, + "learning_rate": 1.2142177086639164e-05, + "loss": 1.3876, + "step": 6350 + }, + { + "epoch": 1.9952490331193435, + "grad_norm": 0.83203125, + "learning_rate": 1.2139638210092035e-05, + "loss": 1.341, + "step": 6352 + }, + { + "epoch": 1.9958772601448849, + "grad_norm": 0.83203125, + "learning_rate": 1.2137099333544908e-05, + "loss": 1.2775, + "step": 6354 + }, + { + "epoch": 1.9965054871704262, + "grad_norm": 0.8046875, + "learning_rate": 1.213456045699778e-05, + "loss": 1.2379, + "step": 6356 + }, + { + "epoch": 1.9971337141959675, + "grad_norm": 0.85546875, + "learning_rate": 1.2132021580450653e-05, + "loss": 1.311, + "step": 6358 + }, + { + "epoch": 1.9977619412215089, + "grad_norm": 0.84765625, + "learning_rate": 1.2129482703903524e-05, + "loss": 1.3089, + "step": 6360 + }, + { + "epoch": 1.9983901682470502, + "grad_norm": 0.84765625, + "learning_rate": 1.2126943827356397e-05, + "loss": 1.1843, + "step": 6362 + }, + { + "epoch": 1.9990183952725915, + "grad_norm": 0.7734375, + "learning_rate": 1.2124404950809267e-05, + "loss": 1.1485, + "step": 6364 + }, + { + "epoch": 1.9996466222981328, + "grad_norm": 0.875, + "learning_rate": 1.212186607426214e-05, + "loss": 1.0792, + "step": 6366 + }, + { + "epoch": 2.000274849323674, + "grad_norm": 0.7265625, + "learning_rate": 1.2119327197715011e-05, + "loss": 1.3043, + "step": 6368 + }, + { + "epoch": 2.0009030763492155, + "grad_norm": 0.78515625, + "learning_rate": 1.2116788321167885e-05, + "loss": 1.1913, + "step": 6370 + }, + { + "epoch": 2.0015313033747573, + "grad_norm": 0.8046875, + "learning_rate": 1.2114249444620756e-05, + "loss": 1.1807, + "step": 6372 + }, + { + "epoch": 2.0021595304002986, + "grad_norm": 0.91015625, + "learning_rate": 1.2111710568073629e-05, + "loss": 1.1107, + "step": 6374 + }, + { + "epoch": 2.00278775742584, + "grad_norm": 0.89453125, + "learning_rate": 1.21091716915265e-05, + "loss": 1.2526, + "step": 6376 + }, + { + "epoch": 2.0034159844513812, + "grad_norm": 0.78125, + "learning_rate": 1.2106632814979373e-05, + "loss": 1.1686, + "step": 6378 + }, + { + "epoch": 2.0040442114769226, + "grad_norm": 0.87890625, + "learning_rate": 1.2104093938432245e-05, + "loss": 1.0691, + "step": 6380 + }, + { + "epoch": 2.004672438502464, + "grad_norm": 0.96875, + "learning_rate": 1.2101555061885118e-05, + "loss": 1.199, + "step": 6382 + }, + { + "epoch": 2.0053006655280052, + "grad_norm": 0.875, + "learning_rate": 1.2099016185337988e-05, + "loss": 1.194, + "step": 6384 + }, + { + "epoch": 2.0059288925535466, + "grad_norm": 1.0, + "learning_rate": 1.2096477308790862e-05, + "loss": 1.2757, + "step": 6386 + }, + { + "epoch": 2.006557119579088, + "grad_norm": 0.75, + "learning_rate": 1.2093938432243732e-05, + "loss": 1.2772, + "step": 6388 + }, + { + "epoch": 2.007185346604629, + "grad_norm": 0.87890625, + "learning_rate": 1.2091399555696605e-05, + "loss": 1.2131, + "step": 6390 + }, + { + "epoch": 2.0078135736301705, + "grad_norm": 0.90625, + "learning_rate": 1.2088860679149477e-05, + "loss": 1.0315, + "step": 6392 + }, + { + "epoch": 2.008441800655712, + "grad_norm": 0.8203125, + "learning_rate": 1.208632180260235e-05, + "loss": 1.207, + "step": 6394 + }, + { + "epoch": 2.009070027681253, + "grad_norm": 0.90625, + "learning_rate": 1.2083782926055221e-05, + "loss": 1.1659, + "step": 6396 + }, + { + "epoch": 2.0096982547067945, + "grad_norm": 0.859375, + "learning_rate": 1.2081244049508094e-05, + "loss": 1.1712, + "step": 6398 + }, + { + "epoch": 2.010326481732336, + "grad_norm": 1.0390625, + "learning_rate": 1.2078705172960965e-05, + "loss": 1.235, + "step": 6400 + }, + { + "epoch": 2.010954708757877, + "grad_norm": 0.9296875, + "learning_rate": 1.2076166296413839e-05, + "loss": 1.1337, + "step": 6402 + }, + { + "epoch": 2.0115829357834185, + "grad_norm": 0.8046875, + "learning_rate": 1.207362741986671e-05, + "loss": 1.1306, + "step": 6404 + }, + { + "epoch": 2.0122111628089603, + "grad_norm": 0.8125, + "learning_rate": 1.2071088543319583e-05, + "loss": 1.1945, + "step": 6406 + }, + { + "epoch": 2.0128393898345016, + "grad_norm": 0.859375, + "learning_rate": 1.2068549666772453e-05, + "loss": 1.1251, + "step": 6408 + }, + { + "epoch": 2.013467616860043, + "grad_norm": 0.8359375, + "learning_rate": 1.2066010790225326e-05, + "loss": 1.1349, + "step": 6410 + }, + { + "epoch": 2.0140958438855843, + "grad_norm": 0.8984375, + "learning_rate": 1.2063471913678197e-05, + "loss": 1.0783, + "step": 6412 + }, + { + "epoch": 2.0147240709111256, + "grad_norm": 0.9296875, + "learning_rate": 1.206093303713107e-05, + "loss": 1.0695, + "step": 6414 + }, + { + "epoch": 2.015352297936667, + "grad_norm": 1.0390625, + "learning_rate": 1.2058394160583942e-05, + "loss": 1.1497, + "step": 6416 + }, + { + "epoch": 2.0159805249622083, + "grad_norm": 0.8046875, + "learning_rate": 1.2055855284036815e-05, + "loss": 1.3358, + "step": 6418 + }, + { + "epoch": 2.0166087519877496, + "grad_norm": 0.90625, + "learning_rate": 1.2053316407489686e-05, + "loss": 1.1467, + "step": 6420 + }, + { + "epoch": 2.017236979013291, + "grad_norm": 0.80078125, + "learning_rate": 1.2050777530942559e-05, + "loss": 1.1837, + "step": 6422 + }, + { + "epoch": 2.0178652060388322, + "grad_norm": 0.87109375, + "learning_rate": 1.204823865439543e-05, + "loss": 1.1541, + "step": 6424 + }, + { + "epoch": 2.0184934330643736, + "grad_norm": 0.8828125, + "learning_rate": 1.2045699777848304e-05, + "loss": 1.2496, + "step": 6426 + }, + { + "epoch": 2.019121660089915, + "grad_norm": 0.8203125, + "learning_rate": 1.2043160901301177e-05, + "loss": 1.1149, + "step": 6428 + }, + { + "epoch": 2.019749887115456, + "grad_norm": 0.8359375, + "learning_rate": 1.2040622024754048e-05, + "loss": 1.236, + "step": 6430 + }, + { + "epoch": 2.0203781141409975, + "grad_norm": 0.90625, + "learning_rate": 1.2038083148206921e-05, + "loss": 1.1907, + "step": 6432 + }, + { + "epoch": 2.021006341166539, + "grad_norm": 0.80078125, + "learning_rate": 1.203554427165979e-05, + "loss": 1.1428, + "step": 6434 + }, + { + "epoch": 2.02163456819208, + "grad_norm": 0.8515625, + "learning_rate": 1.2033005395112664e-05, + "loss": 1.2013, + "step": 6436 + }, + { + "epoch": 2.022262795217622, + "grad_norm": 0.921875, + "learning_rate": 1.2030466518565535e-05, + "loss": 1.2182, + "step": 6438 + }, + { + "epoch": 2.0228910222431633, + "grad_norm": 0.85546875, + "learning_rate": 1.2027927642018408e-05, + "loss": 1.1166, + "step": 6440 + }, + { + "epoch": 2.0235192492687046, + "grad_norm": 0.828125, + "learning_rate": 1.202538876547128e-05, + "loss": 1.1307, + "step": 6442 + }, + { + "epoch": 2.024147476294246, + "grad_norm": 0.89453125, + "learning_rate": 1.2022849888924153e-05, + "loss": 0.9865, + "step": 6444 + }, + { + "epoch": 2.0247757033197873, + "grad_norm": 0.84375, + "learning_rate": 1.2020311012377024e-05, + "loss": 1.0833, + "step": 6446 + }, + { + "epoch": 2.0254039303453286, + "grad_norm": 0.89453125, + "learning_rate": 1.2017772135829897e-05, + "loss": 1.1807, + "step": 6448 + }, + { + "epoch": 2.02603215737087, + "grad_norm": 0.94140625, + "learning_rate": 1.2015233259282769e-05, + "loss": 1.0344, + "step": 6450 + }, + { + "epoch": 2.0266603843964113, + "grad_norm": 0.83984375, + "learning_rate": 1.2012694382735642e-05, + "loss": 1.0775, + "step": 6452 + }, + { + "epoch": 2.0272886114219526, + "grad_norm": 0.86328125, + "learning_rate": 1.2010155506188511e-05, + "loss": 1.1049, + "step": 6454 + }, + { + "epoch": 2.027916838447494, + "grad_norm": 0.83984375, + "learning_rate": 1.2007616629641386e-05, + "loss": 1.3446, + "step": 6456 + }, + { + "epoch": 2.0285450654730353, + "grad_norm": 0.91015625, + "learning_rate": 1.2005077753094256e-05, + "loss": 1.3205, + "step": 6458 + }, + { + "epoch": 2.0291732924985766, + "grad_norm": 0.8671875, + "learning_rate": 1.2002538876547129e-05, + "loss": 1.28, + "step": 6460 + }, + { + "epoch": 2.029801519524118, + "grad_norm": 0.828125, + "learning_rate": 1.2e-05, + "loss": 1.157, + "step": 6462 + }, + { + "epoch": 2.0304297465496592, + "grad_norm": 0.8828125, + "learning_rate": 1.1997461123452873e-05, + "loss": 1.132, + "step": 6464 + }, + { + "epoch": 2.0310579735752006, + "grad_norm": 0.8671875, + "learning_rate": 1.1994922246905745e-05, + "loss": 1.1373, + "step": 6466 + }, + { + "epoch": 2.031686200600742, + "grad_norm": 0.89453125, + "learning_rate": 1.1992383370358618e-05, + "loss": 1.2225, + "step": 6468 + }, + { + "epoch": 2.0323144276262832, + "grad_norm": 0.84765625, + "learning_rate": 1.198984449381149e-05, + "loss": 1.21, + "step": 6470 + }, + { + "epoch": 2.032942654651825, + "grad_norm": 0.80859375, + "learning_rate": 1.1987305617264362e-05, + "loss": 1.1602, + "step": 6472 + }, + { + "epoch": 2.0335708816773663, + "grad_norm": 0.8828125, + "learning_rate": 1.1984766740717234e-05, + "loss": 1.1242, + "step": 6474 + }, + { + "epoch": 2.0341991087029077, + "grad_norm": 0.875, + "learning_rate": 1.1982227864170107e-05, + "loss": 1.1352, + "step": 6476 + }, + { + "epoch": 2.034827335728449, + "grad_norm": 0.87890625, + "learning_rate": 1.1979688987622976e-05, + "loss": 1.1541, + "step": 6478 + }, + { + "epoch": 2.0354555627539903, + "grad_norm": 0.8515625, + "learning_rate": 1.197715011107585e-05, + "loss": 1.1356, + "step": 6480 + }, + { + "epoch": 2.0360837897795316, + "grad_norm": 0.84375, + "learning_rate": 1.1974611234528721e-05, + "loss": 1.3641, + "step": 6482 + }, + { + "epoch": 2.036712016805073, + "grad_norm": 1.21875, + "learning_rate": 1.1972072357981594e-05, + "loss": 1.0959, + "step": 6484 + }, + { + "epoch": 2.0373402438306143, + "grad_norm": 0.859375, + "learning_rate": 1.1969533481434465e-05, + "loss": 1.2647, + "step": 6486 + }, + { + "epoch": 2.0379684708561556, + "grad_norm": 0.81640625, + "learning_rate": 1.1966994604887338e-05, + "loss": 1.0983, + "step": 6488 + }, + { + "epoch": 2.038596697881697, + "grad_norm": 0.84375, + "learning_rate": 1.196445572834021e-05, + "loss": 1.1839, + "step": 6490 + }, + { + "epoch": 2.0392249249072383, + "grad_norm": 0.859375, + "learning_rate": 1.1961916851793083e-05, + "loss": 1.2528, + "step": 6492 + }, + { + "epoch": 2.0398531519327796, + "grad_norm": 0.87890625, + "learning_rate": 1.1959377975245954e-05, + "loss": 1.1767, + "step": 6494 + }, + { + "epoch": 2.040481378958321, + "grad_norm": 0.87890625, + "learning_rate": 1.1956839098698827e-05, + "loss": 1.2149, + "step": 6496 + }, + { + "epoch": 2.0411096059838623, + "grad_norm": 0.88671875, + "learning_rate": 1.1954300222151699e-05, + "loss": 1.1772, + "step": 6498 + }, + { + "epoch": 2.0417378330094036, + "grad_norm": 0.91796875, + "learning_rate": 1.1951761345604572e-05, + "loss": 1.1659, + "step": 6500 + }, + { + "epoch": 2.042366060034945, + "grad_norm": 0.9140625, + "learning_rate": 1.1949222469057442e-05, + "loss": 1.1775, + "step": 6502 + }, + { + "epoch": 2.0429942870604867, + "grad_norm": 0.953125, + "learning_rate": 1.1946683592510315e-05, + "loss": 1.2286, + "step": 6504 + }, + { + "epoch": 2.043622514086028, + "grad_norm": 0.8203125, + "learning_rate": 1.1944144715963186e-05, + "loss": 1.265, + "step": 6506 + }, + { + "epoch": 2.0442507411115693, + "grad_norm": 0.89453125, + "learning_rate": 1.1941605839416059e-05, + "loss": 1.2133, + "step": 6508 + }, + { + "epoch": 2.0448789681371107, + "grad_norm": 1.0, + "learning_rate": 1.193906696286893e-05, + "loss": 1.0619, + "step": 6510 + }, + { + "epoch": 2.045507195162652, + "grad_norm": 0.8515625, + "learning_rate": 1.1936528086321803e-05, + "loss": 1.292, + "step": 6512 + }, + { + "epoch": 2.0461354221881933, + "grad_norm": 0.91015625, + "learning_rate": 1.1933989209774677e-05, + "loss": 1.1345, + "step": 6514 + }, + { + "epoch": 2.0467636492137347, + "grad_norm": 0.87109375, + "learning_rate": 1.1931450333227548e-05, + "loss": 1.0742, + "step": 6516 + }, + { + "epoch": 2.047391876239276, + "grad_norm": 0.88671875, + "learning_rate": 1.1928911456680421e-05, + "loss": 1.0563, + "step": 6518 + }, + { + "epoch": 2.0480201032648173, + "grad_norm": 0.84765625, + "learning_rate": 1.1926372580133292e-05, + "loss": 1.0728, + "step": 6520 + }, + { + "epoch": 2.0486483302903586, + "grad_norm": 0.87109375, + "learning_rate": 1.1923833703586165e-05, + "loss": 1.2013, + "step": 6522 + }, + { + "epoch": 2.0492765573159, + "grad_norm": 0.76953125, + "learning_rate": 1.1921294827039037e-05, + "loss": 1.2654, + "step": 6524 + }, + { + "epoch": 2.0499047843414413, + "grad_norm": 0.8984375, + "learning_rate": 1.191875595049191e-05, + "loss": 1.1785, + "step": 6526 + }, + { + "epoch": 2.0505330113669826, + "grad_norm": 0.98046875, + "learning_rate": 1.191621707394478e-05, + "loss": 1.166, + "step": 6528 + }, + { + "epoch": 2.051161238392524, + "grad_norm": 0.8125, + "learning_rate": 1.1913678197397653e-05, + "loss": 1.0234, + "step": 6530 + }, + { + "epoch": 2.0517894654180653, + "grad_norm": 0.8203125, + "learning_rate": 1.1911139320850524e-05, + "loss": 1.1007, + "step": 6532 + }, + { + "epoch": 2.0524176924436066, + "grad_norm": 0.8828125, + "learning_rate": 1.1908600444303397e-05, + "loss": 1.2466, + "step": 6534 + }, + { + "epoch": 2.053045919469148, + "grad_norm": 1.0234375, + "learning_rate": 1.1906061567756269e-05, + "loss": 1.1554, + "step": 6536 + }, + { + "epoch": 2.0536741464946897, + "grad_norm": 0.79296875, + "learning_rate": 1.1903522691209142e-05, + "loss": 1.293, + "step": 6538 + }, + { + "epoch": 2.054302373520231, + "grad_norm": 0.7890625, + "learning_rate": 1.1900983814662013e-05, + "loss": 1.1603, + "step": 6540 + }, + { + "epoch": 2.0549306005457724, + "grad_norm": 0.98046875, + "learning_rate": 1.1898444938114886e-05, + "loss": 1.1157, + "step": 6542 + }, + { + "epoch": 2.0555588275713137, + "grad_norm": 0.84765625, + "learning_rate": 1.1895906061567757e-05, + "loss": 1.102, + "step": 6544 + }, + { + "epoch": 2.056187054596855, + "grad_norm": 0.8046875, + "learning_rate": 1.189336718502063e-05, + "loss": 1.1276, + "step": 6546 + }, + { + "epoch": 2.0568152816223964, + "grad_norm": 0.8125, + "learning_rate": 1.18908283084735e-05, + "loss": 1.3061, + "step": 6548 + }, + { + "epoch": 2.0574435086479377, + "grad_norm": 0.83984375, + "learning_rate": 1.1888289431926375e-05, + "loss": 1.2868, + "step": 6550 + }, + { + "epoch": 2.058071735673479, + "grad_norm": 0.90234375, + "learning_rate": 1.1885750555379245e-05, + "loss": 1.2344, + "step": 6552 + }, + { + "epoch": 2.0586999626990203, + "grad_norm": 0.91015625, + "learning_rate": 1.1883211678832118e-05, + "loss": 1.1805, + "step": 6554 + }, + { + "epoch": 2.0593281897245617, + "grad_norm": 0.8671875, + "learning_rate": 1.1880672802284989e-05, + "loss": 1.1347, + "step": 6556 + }, + { + "epoch": 2.059956416750103, + "grad_norm": 0.8828125, + "learning_rate": 1.1878133925737862e-05, + "loss": 1.1707, + "step": 6558 + }, + { + "epoch": 2.0605846437756443, + "grad_norm": 0.8828125, + "learning_rate": 1.1875595049190734e-05, + "loss": 1.2383, + "step": 6560 + }, + { + "epoch": 2.0612128708011856, + "grad_norm": 0.83203125, + "learning_rate": 1.1873056172643607e-05, + "loss": 1.1534, + "step": 6562 + }, + { + "epoch": 2.061841097826727, + "grad_norm": 0.83984375, + "learning_rate": 1.1870517296096478e-05, + "loss": 1.1963, + "step": 6564 + }, + { + "epoch": 2.0624693248522683, + "grad_norm": 0.828125, + "learning_rate": 1.1867978419549351e-05, + "loss": 1.2344, + "step": 6566 + }, + { + "epoch": 2.0630975518778096, + "grad_norm": 0.8359375, + "learning_rate": 1.1865439543002223e-05, + "loss": 1.158, + "step": 6568 + }, + { + "epoch": 2.0637257789033514, + "grad_norm": 0.8359375, + "learning_rate": 1.1862900666455096e-05, + "loss": 1.2401, + "step": 6570 + }, + { + "epoch": 2.0643540059288927, + "grad_norm": 0.85546875, + "learning_rate": 1.1860361789907965e-05, + "loss": 1.1387, + "step": 6572 + }, + { + "epoch": 2.064982232954434, + "grad_norm": 0.8359375, + "learning_rate": 1.1857822913360838e-05, + "loss": 1.1706, + "step": 6574 + }, + { + "epoch": 2.0656104599799754, + "grad_norm": 0.80859375, + "learning_rate": 1.185528403681371e-05, + "loss": 1.2223, + "step": 6576 + }, + { + "epoch": 2.0662386870055167, + "grad_norm": 0.828125, + "learning_rate": 1.1852745160266583e-05, + "loss": 1.1918, + "step": 6578 + }, + { + "epoch": 2.066866914031058, + "grad_norm": 0.8203125, + "learning_rate": 1.1850206283719454e-05, + "loss": 1.1429, + "step": 6580 + }, + { + "epoch": 2.0674951410565994, + "grad_norm": 0.91796875, + "learning_rate": 1.1847667407172327e-05, + "loss": 1.1263, + "step": 6582 + }, + { + "epoch": 2.0681233680821407, + "grad_norm": 0.84375, + "learning_rate": 1.1845128530625199e-05, + "loss": 1.2082, + "step": 6584 + }, + { + "epoch": 2.068751595107682, + "grad_norm": 0.90625, + "learning_rate": 1.1842589654078072e-05, + "loss": 1.1829, + "step": 6586 + }, + { + "epoch": 2.0693798221332234, + "grad_norm": 0.8046875, + "learning_rate": 1.1840050777530943e-05, + "loss": 1.0186, + "step": 6588 + }, + { + "epoch": 2.0700080491587647, + "grad_norm": 0.828125, + "learning_rate": 1.1837511900983816e-05, + "loss": 1.2465, + "step": 6590 + }, + { + "epoch": 2.070636276184306, + "grad_norm": 0.8671875, + "learning_rate": 1.1834973024436686e-05, + "loss": 1.1994, + "step": 6592 + }, + { + "epoch": 2.0712645032098473, + "grad_norm": 0.8828125, + "learning_rate": 1.183243414788956e-05, + "loss": 1.1866, + "step": 6594 + }, + { + "epoch": 2.0718927302353887, + "grad_norm": 0.89453125, + "learning_rate": 1.182989527134243e-05, + "loss": 1.2716, + "step": 6596 + }, + { + "epoch": 2.07252095726093, + "grad_norm": 0.921875, + "learning_rate": 1.1827356394795303e-05, + "loss": 1.3113, + "step": 6598 + }, + { + "epoch": 2.0731491842864713, + "grad_norm": 0.859375, + "learning_rate": 1.1824817518248176e-05, + "loss": 1.1559, + "step": 6600 + }, + { + "epoch": 2.0737774113120127, + "grad_norm": 0.83984375, + "learning_rate": 1.1822278641701048e-05, + "loss": 1.0895, + "step": 6602 + }, + { + "epoch": 2.0744056383375544, + "grad_norm": 0.87109375, + "learning_rate": 1.1819739765153921e-05, + "loss": 1.2286, + "step": 6604 + }, + { + "epoch": 2.0750338653630958, + "grad_norm": 0.9375, + "learning_rate": 1.1817200888606792e-05, + "loss": 1.1859, + "step": 6606 + }, + { + "epoch": 2.075662092388637, + "grad_norm": 0.87890625, + "learning_rate": 1.1814662012059665e-05, + "loss": 1.0625, + "step": 6608 + }, + { + "epoch": 2.0762903194141784, + "grad_norm": 0.84375, + "learning_rate": 1.1812123135512537e-05, + "loss": 1.1966, + "step": 6610 + }, + { + "epoch": 2.0769185464397197, + "grad_norm": 0.90234375, + "learning_rate": 1.180958425896541e-05, + "loss": 1.2155, + "step": 6612 + }, + { + "epoch": 2.077546773465261, + "grad_norm": 0.828125, + "learning_rate": 1.1807045382418281e-05, + "loss": 1.2382, + "step": 6614 + }, + { + "epoch": 2.0781750004908024, + "grad_norm": 0.875, + "learning_rate": 1.1804506505871154e-05, + "loss": 1.3356, + "step": 6616 + }, + { + "epoch": 2.0788032275163437, + "grad_norm": 0.859375, + "learning_rate": 1.1801967629324024e-05, + "loss": 1.1531, + "step": 6618 + }, + { + "epoch": 2.079431454541885, + "grad_norm": 0.828125, + "learning_rate": 1.1799428752776899e-05, + "loss": 1.1634, + "step": 6620 + }, + { + "epoch": 2.0800596815674264, + "grad_norm": 0.81640625, + "learning_rate": 1.1796889876229768e-05, + "loss": 1.0372, + "step": 6622 + }, + { + "epoch": 2.0806879085929677, + "grad_norm": 0.82421875, + "learning_rate": 1.1794350999682642e-05, + "loss": 1.1603, + "step": 6624 + }, + { + "epoch": 2.081316135618509, + "grad_norm": 0.87890625, + "learning_rate": 1.1791812123135513e-05, + "loss": 1.2412, + "step": 6626 + }, + { + "epoch": 2.0819443626440504, + "grad_norm": 0.765625, + "learning_rate": 1.1789273246588386e-05, + "loss": 1.2592, + "step": 6628 + }, + { + "epoch": 2.0825725896695917, + "grad_norm": 0.82421875, + "learning_rate": 1.1786734370041257e-05, + "loss": 1.197, + "step": 6630 + }, + { + "epoch": 2.083200816695133, + "grad_norm": 0.86328125, + "learning_rate": 1.178419549349413e-05, + "loss": 1.0592, + "step": 6632 + }, + { + "epoch": 2.0838290437206743, + "grad_norm": 0.82421875, + "learning_rate": 1.1781656616947002e-05, + "loss": 1.1009, + "step": 6634 + }, + { + "epoch": 2.084457270746216, + "grad_norm": 0.8515625, + "learning_rate": 1.1779117740399875e-05, + "loss": 1.1818, + "step": 6636 + }, + { + "epoch": 2.0850854977717574, + "grad_norm": 0.875, + "learning_rate": 1.1776578863852746e-05, + "loss": 1.1779, + "step": 6638 + }, + { + "epoch": 2.0857137247972988, + "grad_norm": 0.95703125, + "learning_rate": 1.177403998730562e-05, + "loss": 1.1441, + "step": 6640 + }, + { + "epoch": 2.08634195182284, + "grad_norm": 0.8125, + "learning_rate": 1.1771501110758489e-05, + "loss": 1.0996, + "step": 6642 + }, + { + "epoch": 2.0869701788483814, + "grad_norm": 0.8828125, + "learning_rate": 1.1768962234211362e-05, + "loss": 1.0396, + "step": 6644 + }, + { + "epoch": 2.0875984058739228, + "grad_norm": 0.83203125, + "learning_rate": 1.1766423357664234e-05, + "loss": 1.1889, + "step": 6646 + }, + { + "epoch": 2.088226632899464, + "grad_norm": 0.89453125, + "learning_rate": 1.1763884481117107e-05, + "loss": 1.0357, + "step": 6648 + }, + { + "epoch": 2.0888548599250054, + "grad_norm": 0.8359375, + "learning_rate": 1.1761345604569978e-05, + "loss": 1.2735, + "step": 6650 + }, + { + "epoch": 2.0894830869505467, + "grad_norm": 0.828125, + "learning_rate": 1.1758806728022851e-05, + "loss": 1.0615, + "step": 6652 + }, + { + "epoch": 2.090111313976088, + "grad_norm": 0.80859375, + "learning_rate": 1.1756267851475722e-05, + "loss": 1.2334, + "step": 6654 + }, + { + "epoch": 2.0907395410016294, + "grad_norm": 0.84375, + "learning_rate": 1.1753728974928596e-05, + "loss": 1.1619, + "step": 6656 + }, + { + "epoch": 2.0913677680271707, + "grad_norm": 1.0234375, + "learning_rate": 1.1751190098381467e-05, + "loss": 1.2739, + "step": 6658 + }, + { + "epoch": 2.091995995052712, + "grad_norm": 0.90234375, + "learning_rate": 1.174865122183434e-05, + "loss": 1.1238, + "step": 6660 + }, + { + "epoch": 2.0926242220782534, + "grad_norm": 0.8359375, + "learning_rate": 1.1746112345287211e-05, + "loss": 1.2821, + "step": 6662 + }, + { + "epoch": 2.0932524491037947, + "grad_norm": 0.859375, + "learning_rate": 1.1743573468740084e-05, + "loss": 1.128, + "step": 6664 + }, + { + "epoch": 2.093880676129336, + "grad_norm": 0.875, + "learning_rate": 1.1741034592192954e-05, + "loss": 1.2666, + "step": 6666 + }, + { + "epoch": 2.0945089031548774, + "grad_norm": 1.0, + "learning_rate": 1.1738495715645827e-05, + "loss": 1.2499, + "step": 6668 + }, + { + "epoch": 2.095137130180419, + "grad_norm": 0.81640625, + "learning_rate": 1.1735956839098699e-05, + "loss": 1.1686, + "step": 6670 + }, + { + "epoch": 2.0957653572059605, + "grad_norm": 0.8515625, + "learning_rate": 1.1733417962551572e-05, + "loss": 1.2688, + "step": 6672 + }, + { + "epoch": 2.096393584231502, + "grad_norm": 0.86328125, + "learning_rate": 1.1730879086004443e-05, + "loss": 1.1446, + "step": 6674 + }, + { + "epoch": 2.097021811257043, + "grad_norm": 0.9453125, + "learning_rate": 1.1728340209457316e-05, + "loss": 1.0533, + "step": 6676 + }, + { + "epoch": 2.0976500382825845, + "grad_norm": 0.8203125, + "learning_rate": 1.1725801332910188e-05, + "loss": 1.1993, + "step": 6678 + }, + { + "epoch": 2.098278265308126, + "grad_norm": 0.8515625, + "learning_rate": 1.172326245636306e-05, + "loss": 1.2439, + "step": 6680 + }, + { + "epoch": 2.098906492333667, + "grad_norm": 0.875, + "learning_rate": 1.1720723579815932e-05, + "loss": 1.1796, + "step": 6682 + }, + { + "epoch": 2.0995347193592084, + "grad_norm": 0.86328125, + "learning_rate": 1.1718184703268805e-05, + "loss": 1.1686, + "step": 6684 + }, + { + "epoch": 2.1001629463847498, + "grad_norm": 0.90234375, + "learning_rate": 1.1715645826721678e-05, + "loss": 1.2925, + "step": 6686 + }, + { + "epoch": 2.100791173410291, + "grad_norm": 0.8671875, + "learning_rate": 1.171310695017455e-05, + "loss": 1.1615, + "step": 6688 + }, + { + "epoch": 2.1014194004358324, + "grad_norm": 0.8125, + "learning_rate": 1.1710568073627423e-05, + "loss": 1.1193, + "step": 6690 + }, + { + "epoch": 2.1020476274613737, + "grad_norm": 0.87109375, + "learning_rate": 1.1708029197080292e-05, + "loss": 1.1156, + "step": 6692 + }, + { + "epoch": 2.102675854486915, + "grad_norm": 0.828125, + "learning_rate": 1.1705490320533165e-05, + "loss": 1.3214, + "step": 6694 + }, + { + "epoch": 2.1033040815124564, + "grad_norm": 0.828125, + "learning_rate": 1.1702951443986037e-05, + "loss": 1.3833, + "step": 6696 + }, + { + "epoch": 2.1039323085379977, + "grad_norm": 0.92578125, + "learning_rate": 1.170041256743891e-05, + "loss": 1.1531, + "step": 6698 + }, + { + "epoch": 2.1045605355635395, + "grad_norm": 0.8515625, + "learning_rate": 1.1697873690891781e-05, + "loss": 1.0297, + "step": 6700 + }, + { + "epoch": 2.105188762589081, + "grad_norm": 0.87109375, + "learning_rate": 1.1695334814344654e-05, + "loss": 1.1562, + "step": 6702 + }, + { + "epoch": 2.105816989614622, + "grad_norm": 0.87109375, + "learning_rate": 1.1692795937797526e-05, + "loss": 1.171, + "step": 6704 + }, + { + "epoch": 2.1064452166401635, + "grad_norm": 0.87890625, + "learning_rate": 1.1690257061250399e-05, + "loss": 1.0549, + "step": 6706 + }, + { + "epoch": 2.107073443665705, + "grad_norm": 0.8828125, + "learning_rate": 1.168771818470327e-05, + "loss": 1.094, + "step": 6708 + }, + { + "epoch": 2.107701670691246, + "grad_norm": 0.84375, + "learning_rate": 1.1685179308156143e-05, + "loss": 1.1696, + "step": 6710 + }, + { + "epoch": 2.1083298977167875, + "grad_norm": 0.828125, + "learning_rate": 1.1682640431609013e-05, + "loss": 1.1725, + "step": 6712 + }, + { + "epoch": 2.108958124742329, + "grad_norm": 0.875, + "learning_rate": 1.1680101555061888e-05, + "loss": 1.3241, + "step": 6714 + }, + { + "epoch": 2.10958635176787, + "grad_norm": 0.890625, + "learning_rate": 1.1677562678514757e-05, + "loss": 1.0591, + "step": 6716 + }, + { + "epoch": 2.1102145787934115, + "grad_norm": 0.99609375, + "learning_rate": 1.167502380196763e-05, + "loss": 1.1485, + "step": 6718 + }, + { + "epoch": 2.110842805818953, + "grad_norm": 0.83203125, + "learning_rate": 1.1672484925420502e-05, + "loss": 1.1271, + "step": 6720 + }, + { + "epoch": 2.111471032844494, + "grad_norm": 0.83984375, + "learning_rate": 1.1669946048873375e-05, + "loss": 1.1642, + "step": 6722 + }, + { + "epoch": 2.1120992598700354, + "grad_norm": 0.82421875, + "learning_rate": 1.1667407172326246e-05, + "loss": 1.2039, + "step": 6724 + }, + { + "epoch": 2.1127274868955768, + "grad_norm": 0.87109375, + "learning_rate": 1.166486829577912e-05, + "loss": 1.1125, + "step": 6726 + }, + { + "epoch": 2.113355713921118, + "grad_norm": 0.96484375, + "learning_rate": 1.166232941923199e-05, + "loss": 1.2027, + "step": 6728 + }, + { + "epoch": 2.1139839409466594, + "grad_norm": 0.9375, + "learning_rate": 1.1659790542684864e-05, + "loss": 1.1512, + "step": 6730 + }, + { + "epoch": 2.1146121679722008, + "grad_norm": 0.9453125, + "learning_rate": 1.1657251666137735e-05, + "loss": 1.2524, + "step": 6732 + }, + { + "epoch": 2.1152403949977425, + "grad_norm": 0.8828125, + "learning_rate": 1.1654712789590608e-05, + "loss": 1.1181, + "step": 6734 + }, + { + "epoch": 2.115868622023284, + "grad_norm": 0.890625, + "learning_rate": 1.1652173913043478e-05, + "loss": 1.1363, + "step": 6736 + }, + { + "epoch": 2.116496849048825, + "grad_norm": 0.859375, + "learning_rate": 1.1649635036496351e-05, + "loss": 1.2479, + "step": 6738 + }, + { + "epoch": 2.1171250760743665, + "grad_norm": 0.94140625, + "learning_rate": 1.1647096159949222e-05, + "loss": 1.1432, + "step": 6740 + }, + { + "epoch": 2.117753303099908, + "grad_norm": 0.953125, + "learning_rate": 1.1644557283402095e-05, + "loss": 1.2668, + "step": 6742 + }, + { + "epoch": 2.118381530125449, + "grad_norm": 0.88671875, + "learning_rate": 1.1642018406854967e-05, + "loss": 1.1356, + "step": 6744 + }, + { + "epoch": 2.1190097571509905, + "grad_norm": 0.9140625, + "learning_rate": 1.163947953030784e-05, + "loss": 1.0993, + "step": 6746 + }, + { + "epoch": 2.119637984176532, + "grad_norm": 0.98046875, + "learning_rate": 1.1636940653760711e-05, + "loss": 1.0308, + "step": 6748 + }, + { + "epoch": 2.120266211202073, + "grad_norm": 0.94921875, + "learning_rate": 1.1634401777213584e-05, + "loss": 1.1342, + "step": 6750 + }, + { + "epoch": 2.1208944382276145, + "grad_norm": 0.9765625, + "learning_rate": 1.1631862900666456e-05, + "loss": 1.0846, + "step": 6752 + }, + { + "epoch": 2.121522665253156, + "grad_norm": 0.96484375, + "learning_rate": 1.1629324024119329e-05, + "loss": 1.1746, + "step": 6754 + }, + { + "epoch": 2.122150892278697, + "grad_norm": 0.8359375, + "learning_rate": 1.1626785147572199e-05, + "loss": 1.114, + "step": 6756 + }, + { + "epoch": 2.1227791193042385, + "grad_norm": 0.82421875, + "learning_rate": 1.1624246271025073e-05, + "loss": 1.123, + "step": 6758 + }, + { + "epoch": 2.12340734632978, + "grad_norm": 0.828125, + "learning_rate": 1.1621707394477943e-05, + "loss": 1.2326, + "step": 6760 + }, + { + "epoch": 2.124035573355321, + "grad_norm": 0.84765625, + "learning_rate": 1.1619168517930816e-05, + "loss": 1.2209, + "step": 6762 + }, + { + "epoch": 2.1246638003808624, + "grad_norm": 0.9375, + "learning_rate": 1.1616629641383687e-05, + "loss": 0.9569, + "step": 6764 + }, + { + "epoch": 2.125292027406404, + "grad_norm": 0.83984375, + "learning_rate": 1.161409076483656e-05, + "loss": 1.139, + "step": 6766 + }, + { + "epoch": 2.1259202544319455, + "grad_norm": 0.81640625, + "learning_rate": 1.1611551888289432e-05, + "loss": 1.2186, + "step": 6768 + }, + { + "epoch": 2.126548481457487, + "grad_norm": 0.8828125, + "learning_rate": 1.1609013011742305e-05, + "loss": 1.2514, + "step": 6770 + }, + { + "epoch": 2.127176708483028, + "grad_norm": 0.85546875, + "learning_rate": 1.1606474135195178e-05, + "loss": 1.2252, + "step": 6772 + }, + { + "epoch": 2.1278049355085695, + "grad_norm": 0.8515625, + "learning_rate": 1.160393525864805e-05, + "loss": 1.1557, + "step": 6774 + }, + { + "epoch": 2.128433162534111, + "grad_norm": 0.88671875, + "learning_rate": 1.1601396382100923e-05, + "loss": 1.2549, + "step": 6776 + }, + { + "epoch": 2.129061389559652, + "grad_norm": 0.8984375, + "learning_rate": 1.1598857505553794e-05, + "loss": 1.1201, + "step": 6778 + }, + { + "epoch": 2.1296896165851935, + "grad_norm": 0.96484375, + "learning_rate": 1.1596318629006667e-05, + "loss": 1.2005, + "step": 6780 + }, + { + "epoch": 2.130317843610735, + "grad_norm": 0.80078125, + "learning_rate": 1.1593779752459537e-05, + "loss": 1.1851, + "step": 6782 + }, + { + "epoch": 2.130946070636276, + "grad_norm": 0.8515625, + "learning_rate": 1.1591240875912411e-05, + "loss": 1.0721, + "step": 6784 + }, + { + "epoch": 2.1315742976618175, + "grad_norm": 0.8515625, + "learning_rate": 1.1588701999365281e-05, + "loss": 1.1945, + "step": 6786 + }, + { + "epoch": 2.132202524687359, + "grad_norm": 0.85546875, + "learning_rate": 1.1586163122818154e-05, + "loss": 1.1655, + "step": 6788 + }, + { + "epoch": 2.1328307517129, + "grad_norm": 0.80859375, + "learning_rate": 1.1583624246271026e-05, + "loss": 1.0261, + "step": 6790 + }, + { + "epoch": 2.1334589787384415, + "grad_norm": 0.84375, + "learning_rate": 1.1581085369723899e-05, + "loss": 1.1892, + "step": 6792 + }, + { + "epoch": 2.134087205763983, + "grad_norm": 0.875, + "learning_rate": 1.157854649317677e-05, + "loss": 1.196, + "step": 6794 + }, + { + "epoch": 2.134715432789524, + "grad_norm": 0.9140625, + "learning_rate": 1.1576007616629643e-05, + "loss": 1.2396, + "step": 6796 + }, + { + "epoch": 2.1353436598150655, + "grad_norm": 0.90234375, + "learning_rate": 1.1573468740082514e-05, + "loss": 1.2077, + "step": 6798 + }, + { + "epoch": 2.135971886840607, + "grad_norm": 0.92578125, + "learning_rate": 1.1570929863535388e-05, + "loss": 1.1368, + "step": 6800 + }, + { + "epoch": 2.1366001138661486, + "grad_norm": 0.85546875, + "learning_rate": 1.1568390986988259e-05, + "loss": 1.1799, + "step": 6802 + }, + { + "epoch": 2.13722834089169, + "grad_norm": 0.83984375, + "learning_rate": 1.1565852110441132e-05, + "loss": 1.1639, + "step": 6804 + }, + { + "epoch": 2.137856567917231, + "grad_norm": 0.859375, + "learning_rate": 1.1563313233894002e-05, + "loss": 1.207, + "step": 6806 + }, + { + "epoch": 2.1384847949427725, + "grad_norm": 0.88671875, + "learning_rate": 1.1560774357346875e-05, + "loss": 1.0282, + "step": 6808 + }, + { + "epoch": 2.139113021968314, + "grad_norm": 0.859375, + "learning_rate": 1.1558235480799746e-05, + "loss": 1.095, + "step": 6810 + }, + { + "epoch": 2.139741248993855, + "grad_norm": 0.8515625, + "learning_rate": 1.155569660425262e-05, + "loss": 1.1784, + "step": 6812 + }, + { + "epoch": 2.1403694760193965, + "grad_norm": 0.85546875, + "learning_rate": 1.155315772770549e-05, + "loss": 1.2698, + "step": 6814 + }, + { + "epoch": 2.140997703044938, + "grad_norm": 0.88671875, + "learning_rate": 1.1550618851158364e-05, + "loss": 1.1665, + "step": 6816 + }, + { + "epoch": 2.141625930070479, + "grad_norm": 0.8046875, + "learning_rate": 1.1548079974611235e-05, + "loss": 1.1065, + "step": 6818 + }, + { + "epoch": 2.1422541570960205, + "grad_norm": 0.7890625, + "learning_rate": 1.1545541098064108e-05, + "loss": 1.1438, + "step": 6820 + }, + { + "epoch": 2.142882384121562, + "grad_norm": 0.84375, + "learning_rate": 1.154300222151698e-05, + "loss": 1.1842, + "step": 6822 + }, + { + "epoch": 2.143510611147103, + "grad_norm": 0.875, + "learning_rate": 1.1540463344969853e-05, + "loss": 1.1727, + "step": 6824 + }, + { + "epoch": 2.1441388381726445, + "grad_norm": 0.87109375, + "learning_rate": 1.1537924468422724e-05, + "loss": 1.2433, + "step": 6826 + }, + { + "epoch": 2.144767065198186, + "grad_norm": 0.95703125, + "learning_rate": 1.1535385591875597e-05, + "loss": 1.1275, + "step": 6828 + }, + { + "epoch": 2.145395292223727, + "grad_norm": 0.81640625, + "learning_rate": 1.1532846715328467e-05, + "loss": 1.1126, + "step": 6830 + }, + { + "epoch": 2.146023519249269, + "grad_norm": 1.578125, + "learning_rate": 1.153030783878134e-05, + "loss": 1.0254, + "step": 6832 + }, + { + "epoch": 2.1466517462748103, + "grad_norm": 0.84765625, + "learning_rate": 1.1527768962234211e-05, + "loss": 1.1911, + "step": 6834 + }, + { + "epoch": 2.1472799733003516, + "grad_norm": 0.875, + "learning_rate": 1.1525230085687084e-05, + "loss": 1.2118, + "step": 6836 + }, + { + "epoch": 2.147908200325893, + "grad_norm": 0.93359375, + "learning_rate": 1.1522691209139956e-05, + "loss": 1.081, + "step": 6838 + }, + { + "epoch": 2.1485364273514342, + "grad_norm": 0.9296875, + "learning_rate": 1.1520152332592829e-05, + "loss": 1.2892, + "step": 6840 + }, + { + "epoch": 2.1491646543769756, + "grad_norm": 0.83203125, + "learning_rate": 1.15176134560457e-05, + "loss": 1.0985, + "step": 6842 + }, + { + "epoch": 2.149792881402517, + "grad_norm": 0.890625, + "learning_rate": 1.1515074579498573e-05, + "loss": 1.0704, + "step": 6844 + }, + { + "epoch": 2.1504211084280582, + "grad_norm": 0.875, + "learning_rate": 1.1512535702951445e-05, + "loss": 1.0915, + "step": 6846 + }, + { + "epoch": 2.1510493354535996, + "grad_norm": 0.859375, + "learning_rate": 1.1509996826404318e-05, + "loss": 1.1404, + "step": 6848 + }, + { + "epoch": 2.151677562479141, + "grad_norm": 0.8828125, + "learning_rate": 1.1507457949857187e-05, + "loss": 1.0769, + "step": 6850 + }, + { + "epoch": 2.152305789504682, + "grad_norm": 0.90625, + "learning_rate": 1.1504919073310062e-05, + "loss": 1.0789, + "step": 6852 + }, + { + "epoch": 2.1529340165302235, + "grad_norm": 0.88671875, + "learning_rate": 1.1502380196762932e-05, + "loss": 1.2021, + "step": 6854 + }, + { + "epoch": 2.153562243555765, + "grad_norm": 0.90234375, + "learning_rate": 1.1499841320215805e-05, + "loss": 1.1089, + "step": 6856 + }, + { + "epoch": 2.154190470581306, + "grad_norm": 0.81640625, + "learning_rate": 1.1497302443668678e-05, + "loss": 1.1592, + "step": 6858 + }, + { + "epoch": 2.1548186976068475, + "grad_norm": 0.9296875, + "learning_rate": 1.149476356712155e-05, + "loss": 1.2386, + "step": 6860 + }, + { + "epoch": 2.155446924632389, + "grad_norm": 0.90234375, + "learning_rate": 1.1492224690574422e-05, + "loss": 1.1695, + "step": 6862 + }, + { + "epoch": 2.15607515165793, + "grad_norm": 0.8515625, + "learning_rate": 1.1489685814027294e-05, + "loss": 1.1925, + "step": 6864 + }, + { + "epoch": 2.1567033786834715, + "grad_norm": 0.8515625, + "learning_rate": 1.1487146937480167e-05, + "loss": 1.1248, + "step": 6866 + }, + { + "epoch": 2.1573316057090133, + "grad_norm": 0.8828125, + "learning_rate": 1.1484608060933038e-05, + "loss": 1.1595, + "step": 6868 + }, + { + "epoch": 2.1579598327345546, + "grad_norm": 0.859375, + "learning_rate": 1.1482069184385911e-05, + "loss": 1.2658, + "step": 6870 + }, + { + "epoch": 2.158588059760096, + "grad_norm": 0.8671875, + "learning_rate": 1.1479530307838783e-05, + "loss": 1.1204, + "step": 6872 + }, + { + "epoch": 2.1592162867856373, + "grad_norm": 0.84375, + "learning_rate": 1.1476991431291656e-05, + "loss": 1.2738, + "step": 6874 + }, + { + "epoch": 2.1598445138111786, + "grad_norm": 0.90234375, + "learning_rate": 1.1474452554744525e-05, + "loss": 1.183, + "step": 6876 + }, + { + "epoch": 2.16047274083672, + "grad_norm": 0.8671875, + "learning_rate": 1.14719136781974e-05, + "loss": 1.2339, + "step": 6878 + }, + { + "epoch": 2.1611009678622612, + "grad_norm": 0.8125, + "learning_rate": 1.146937480165027e-05, + "loss": 1.2331, + "step": 6880 + }, + { + "epoch": 2.1617291948878026, + "grad_norm": 0.9296875, + "learning_rate": 1.1466835925103143e-05, + "loss": 1.2599, + "step": 6882 + }, + { + "epoch": 2.162357421913344, + "grad_norm": 0.82421875, + "learning_rate": 1.1464297048556014e-05, + "loss": 1.1194, + "step": 6884 + }, + { + "epoch": 2.1629856489388852, + "grad_norm": 0.8671875, + "learning_rate": 1.1461758172008887e-05, + "loss": 1.1056, + "step": 6886 + }, + { + "epoch": 2.1636138759644266, + "grad_norm": 0.8828125, + "learning_rate": 1.1459219295461759e-05, + "loss": 1.1664, + "step": 6888 + }, + { + "epoch": 2.164242102989968, + "grad_norm": 0.890625, + "learning_rate": 1.1456680418914632e-05, + "loss": 1.3292, + "step": 6890 + }, + { + "epoch": 2.164870330015509, + "grad_norm": 0.8828125, + "learning_rate": 1.1454141542367503e-05, + "loss": 1.1255, + "step": 6892 + }, + { + "epoch": 2.1654985570410505, + "grad_norm": 0.80078125, + "learning_rate": 1.1451602665820376e-05, + "loss": 1.193, + "step": 6894 + }, + { + "epoch": 2.166126784066592, + "grad_norm": 0.88671875, + "learning_rate": 1.1449063789273248e-05, + "loss": 1.1852, + "step": 6896 + }, + { + "epoch": 2.1667550110921336, + "grad_norm": 0.91796875, + "learning_rate": 1.1446524912726121e-05, + "loss": 1.1961, + "step": 6898 + }, + { + "epoch": 2.167383238117675, + "grad_norm": 0.8828125, + "learning_rate": 1.144398603617899e-05, + "loss": 1.0907, + "step": 6900 + }, + { + "epoch": 2.1680114651432163, + "grad_norm": 0.88671875, + "learning_rate": 1.1441447159631864e-05, + "loss": 1.3612, + "step": 6902 + }, + { + "epoch": 2.1686396921687576, + "grad_norm": 0.9140625, + "learning_rate": 1.1438908283084735e-05, + "loss": 1.1934, + "step": 6904 + }, + { + "epoch": 2.169267919194299, + "grad_norm": 0.85546875, + "learning_rate": 1.1436369406537608e-05, + "loss": 1.2283, + "step": 6906 + }, + { + "epoch": 2.1698961462198403, + "grad_norm": 0.83203125, + "learning_rate": 1.143383052999048e-05, + "loss": 1.1552, + "step": 6908 + }, + { + "epoch": 2.1705243732453816, + "grad_norm": 0.82421875, + "learning_rate": 1.1431291653443353e-05, + "loss": 1.173, + "step": 6910 + }, + { + "epoch": 2.171152600270923, + "grad_norm": 0.8515625, + "learning_rate": 1.1428752776896224e-05, + "loss": 1.148, + "step": 6912 + }, + { + "epoch": 2.1717808272964643, + "grad_norm": 0.90625, + "learning_rate": 1.1426213900349097e-05, + "loss": 1.2096, + "step": 6914 + }, + { + "epoch": 2.1724090543220056, + "grad_norm": 0.8671875, + "learning_rate": 1.1423675023801968e-05, + "loss": 1.193, + "step": 6916 + }, + { + "epoch": 2.173037281347547, + "grad_norm": 0.8359375, + "learning_rate": 1.1421136147254841e-05, + "loss": 1.1284, + "step": 6918 + }, + { + "epoch": 2.1736655083730883, + "grad_norm": 0.87109375, + "learning_rate": 1.1418597270707711e-05, + "loss": 1.1559, + "step": 6920 + }, + { + "epoch": 2.1742937353986296, + "grad_norm": 0.87890625, + "learning_rate": 1.1416058394160586e-05, + "loss": 1.3299, + "step": 6922 + }, + { + "epoch": 2.174921962424171, + "grad_norm": 0.84375, + "learning_rate": 1.1413519517613456e-05, + "loss": 1.2832, + "step": 6924 + }, + { + "epoch": 2.1755501894497122, + "grad_norm": 0.875, + "learning_rate": 1.1410980641066329e-05, + "loss": 1.1829, + "step": 6926 + }, + { + "epoch": 2.1761784164752536, + "grad_norm": 0.828125, + "learning_rate": 1.14084417645192e-05, + "loss": 1.2522, + "step": 6928 + }, + { + "epoch": 2.176806643500795, + "grad_norm": 0.828125, + "learning_rate": 1.1405902887972073e-05, + "loss": 1.0834, + "step": 6930 + }, + { + "epoch": 2.177434870526336, + "grad_norm": 0.796875, + "learning_rate": 1.1403364011424945e-05, + "loss": 1.225, + "step": 6932 + }, + { + "epoch": 2.178063097551878, + "grad_norm": 0.95703125, + "learning_rate": 1.1400825134877818e-05, + "loss": 1.0854, + "step": 6934 + }, + { + "epoch": 2.1786913245774193, + "grad_norm": 0.89453125, + "learning_rate": 1.1398286258330689e-05, + "loss": 1.1618, + "step": 6936 + }, + { + "epoch": 2.1793195516029606, + "grad_norm": 0.875, + "learning_rate": 1.1395747381783562e-05, + "loss": 1.2538, + "step": 6938 + }, + { + "epoch": 2.179947778628502, + "grad_norm": 0.90625, + "learning_rate": 1.1393208505236433e-05, + "loss": 1.2129, + "step": 6940 + }, + { + "epoch": 2.1805760056540433, + "grad_norm": 0.84765625, + "learning_rate": 1.1390669628689307e-05, + "loss": 1.1104, + "step": 6942 + }, + { + "epoch": 2.1812042326795846, + "grad_norm": 0.80078125, + "learning_rate": 1.138813075214218e-05, + "loss": 1.1527, + "step": 6944 + }, + { + "epoch": 2.181832459705126, + "grad_norm": 0.91015625, + "learning_rate": 1.138559187559505e-05, + "loss": 1.1983, + "step": 6946 + }, + { + "epoch": 2.1824606867306673, + "grad_norm": 0.81640625, + "learning_rate": 1.1383052999047924e-05, + "loss": 1.1743, + "step": 6948 + }, + { + "epoch": 2.1830889137562086, + "grad_norm": 0.91015625, + "learning_rate": 1.1380514122500794e-05, + "loss": 1.0212, + "step": 6950 + }, + { + "epoch": 2.18371714078175, + "grad_norm": 0.859375, + "learning_rate": 1.1377975245953667e-05, + "loss": 1.1853, + "step": 6952 + }, + { + "epoch": 2.1843453678072913, + "grad_norm": 0.87890625, + "learning_rate": 1.1375436369406538e-05, + "loss": 1.1346, + "step": 6954 + }, + { + "epoch": 2.1849735948328326, + "grad_norm": 0.8828125, + "learning_rate": 1.1372897492859411e-05, + "loss": 1.3125, + "step": 6956 + }, + { + "epoch": 2.185601821858374, + "grad_norm": 0.8359375, + "learning_rate": 1.1370358616312283e-05, + "loss": 1.3227, + "step": 6958 + }, + { + "epoch": 2.1862300488839153, + "grad_norm": 0.8828125, + "learning_rate": 1.1367819739765156e-05, + "loss": 1.2947, + "step": 6960 + }, + { + "epoch": 2.1868582759094566, + "grad_norm": 0.93359375, + "learning_rate": 1.1365280863218027e-05, + "loss": 1.1531, + "step": 6962 + }, + { + "epoch": 2.1874865029349984, + "grad_norm": 0.9375, + "learning_rate": 1.13627419866709e-05, + "loss": 1.0524, + "step": 6964 + }, + { + "epoch": 2.1881147299605397, + "grad_norm": 0.8515625, + "learning_rate": 1.1360203110123772e-05, + "loss": 1.1572, + "step": 6966 + }, + { + "epoch": 2.188742956986081, + "grad_norm": 0.94921875, + "learning_rate": 1.1357664233576645e-05, + "loss": 1.3277, + "step": 6968 + }, + { + "epoch": 2.1893711840116223, + "grad_norm": 0.92578125, + "learning_rate": 1.1355125357029514e-05, + "loss": 1.2082, + "step": 6970 + }, + { + "epoch": 2.1899994110371637, + "grad_norm": 0.8359375, + "learning_rate": 1.1352586480482387e-05, + "loss": 1.2906, + "step": 6972 + }, + { + "epoch": 2.190627638062705, + "grad_norm": 0.88671875, + "learning_rate": 1.1350047603935259e-05, + "loss": 1.1647, + "step": 6974 + }, + { + "epoch": 2.1912558650882463, + "grad_norm": 0.8046875, + "learning_rate": 1.1347508727388132e-05, + "loss": 1.2105, + "step": 6976 + }, + { + "epoch": 2.1918840921137877, + "grad_norm": 0.88671875, + "learning_rate": 1.1344969850841003e-05, + "loss": 1.2325, + "step": 6978 + }, + { + "epoch": 2.192512319139329, + "grad_norm": 0.79296875, + "learning_rate": 1.1342430974293876e-05, + "loss": 1.2232, + "step": 6980 + }, + { + "epoch": 2.1931405461648703, + "grad_norm": 0.87109375, + "learning_rate": 1.1339892097746748e-05, + "loss": 1.2276, + "step": 6982 + }, + { + "epoch": 2.1937687731904116, + "grad_norm": 0.859375, + "learning_rate": 1.133735322119962e-05, + "loss": 1.1263, + "step": 6984 + }, + { + "epoch": 2.194397000215953, + "grad_norm": 0.8515625, + "learning_rate": 1.1334814344652492e-05, + "loss": 1.1551, + "step": 6986 + }, + { + "epoch": 2.1950252272414943, + "grad_norm": 0.875, + "learning_rate": 1.1332275468105365e-05, + "loss": 1.1824, + "step": 6988 + }, + { + "epoch": 2.1956534542670356, + "grad_norm": 0.8828125, + "learning_rate": 1.1329736591558237e-05, + "loss": 1.2166, + "step": 6990 + }, + { + "epoch": 2.196281681292577, + "grad_norm": 0.8359375, + "learning_rate": 1.132719771501111e-05, + "loss": 1.2635, + "step": 6992 + }, + { + "epoch": 2.1969099083181183, + "grad_norm": 0.8984375, + "learning_rate": 1.132465883846398e-05, + "loss": 1.2588, + "step": 6994 + }, + { + "epoch": 2.1975381353436596, + "grad_norm": 0.83203125, + "learning_rate": 1.1322119961916852e-05, + "loss": 1.3271, + "step": 6996 + }, + { + "epoch": 2.1981663623692014, + "grad_norm": 0.8359375, + "learning_rate": 1.1319581085369724e-05, + "loss": 1.1563, + "step": 6998 + }, + { + "epoch": 2.1987945893947427, + "grad_norm": 0.8828125, + "learning_rate": 1.1317042208822597e-05, + "loss": 1.109, + "step": 7000 + }, + { + "epoch": 2.199422816420284, + "grad_norm": 0.80859375, + "learning_rate": 1.1314503332275468e-05, + "loss": 1.144, + "step": 7002 + }, + { + "epoch": 2.2000510434458254, + "grad_norm": 0.83203125, + "learning_rate": 1.1311964455728341e-05, + "loss": 1.2422, + "step": 7004 + }, + { + "epoch": 2.2006792704713667, + "grad_norm": 0.87109375, + "learning_rate": 1.1309425579181213e-05, + "loss": 1.2329, + "step": 7006 + }, + { + "epoch": 2.201307497496908, + "grad_norm": 0.92578125, + "learning_rate": 1.1306886702634086e-05, + "loss": 1.1037, + "step": 7008 + }, + { + "epoch": 2.2019357245224493, + "grad_norm": 0.92578125, + "learning_rate": 1.1304347826086957e-05, + "loss": 1.2786, + "step": 7010 + }, + { + "epoch": 2.2025639515479907, + "grad_norm": 0.8125, + "learning_rate": 1.130180894953983e-05, + "loss": 1.3284, + "step": 7012 + }, + { + "epoch": 2.203192178573532, + "grad_norm": 0.90625, + "learning_rate": 1.12992700729927e-05, + "loss": 1.1047, + "step": 7014 + }, + { + "epoch": 2.2038204055990733, + "grad_norm": 0.88671875, + "learning_rate": 1.1296731196445575e-05, + "loss": 1.2299, + "step": 7016 + }, + { + "epoch": 2.2044486326246147, + "grad_norm": 0.8359375, + "learning_rate": 1.1294192319898444e-05, + "loss": 1.1535, + "step": 7018 + }, + { + "epoch": 2.205076859650156, + "grad_norm": 0.9609375, + "learning_rate": 1.1291653443351318e-05, + "loss": 1.3391, + "step": 7020 + }, + { + "epoch": 2.2057050866756973, + "grad_norm": 0.8828125, + "learning_rate": 1.1289114566804189e-05, + "loss": 1.092, + "step": 7022 + }, + { + "epoch": 2.2063333137012386, + "grad_norm": 0.9140625, + "learning_rate": 1.1286575690257062e-05, + "loss": 1.2479, + "step": 7024 + }, + { + "epoch": 2.20696154072678, + "grad_norm": 1.0078125, + "learning_rate": 1.1284036813709933e-05, + "loss": 1.0299, + "step": 7026 + }, + { + "epoch": 2.2075897677523213, + "grad_norm": 0.91796875, + "learning_rate": 1.1281497937162806e-05, + "loss": 1.1123, + "step": 7028 + }, + { + "epoch": 2.208217994777863, + "grad_norm": 0.86328125, + "learning_rate": 1.127895906061568e-05, + "loss": 1.1984, + "step": 7030 + }, + { + "epoch": 2.2088462218034044, + "grad_norm": 0.84765625, + "learning_rate": 1.1276420184068551e-05, + "loss": 1.1251, + "step": 7032 + }, + { + "epoch": 2.2094744488289457, + "grad_norm": 0.9453125, + "learning_rate": 1.1273881307521424e-05, + "loss": 1.0623, + "step": 7034 + }, + { + "epoch": 2.210102675854487, + "grad_norm": 0.85546875, + "learning_rate": 1.1271342430974295e-05, + "loss": 1.1495, + "step": 7036 + }, + { + "epoch": 2.2107309028800284, + "grad_norm": 0.87890625, + "learning_rate": 1.1268803554427168e-05, + "loss": 1.1898, + "step": 7038 + }, + { + "epoch": 2.2113591299055697, + "grad_norm": 0.97265625, + "learning_rate": 1.1266264677880038e-05, + "loss": 1.1346, + "step": 7040 + }, + { + "epoch": 2.211987356931111, + "grad_norm": 0.8125, + "learning_rate": 1.1263725801332913e-05, + "loss": 1.0782, + "step": 7042 + }, + { + "epoch": 2.2126155839566524, + "grad_norm": 0.953125, + "learning_rate": 1.1261186924785783e-05, + "loss": 1.0988, + "step": 7044 + }, + { + "epoch": 2.2132438109821937, + "grad_norm": 0.8671875, + "learning_rate": 1.1258648048238656e-05, + "loss": 1.2726, + "step": 7046 + }, + { + "epoch": 2.213872038007735, + "grad_norm": 0.89453125, + "learning_rate": 1.1256109171691527e-05, + "loss": 1.3077, + "step": 7048 + }, + { + "epoch": 2.2145002650332763, + "grad_norm": 0.890625, + "learning_rate": 1.12535702951444e-05, + "loss": 1.1586, + "step": 7050 + }, + { + "epoch": 2.2151284920588177, + "grad_norm": 0.87109375, + "learning_rate": 1.1251031418597271e-05, + "loss": 1.1196, + "step": 7052 + }, + { + "epoch": 2.215756719084359, + "grad_norm": 0.87109375, + "learning_rate": 1.1248492542050145e-05, + "loss": 1.3712, + "step": 7054 + }, + { + "epoch": 2.2163849461099003, + "grad_norm": 0.8359375, + "learning_rate": 1.1245953665503016e-05, + "loss": 1.1446, + "step": 7056 + }, + { + "epoch": 2.2170131731354417, + "grad_norm": 0.8515625, + "learning_rate": 1.1243414788955889e-05, + "loss": 1.2527, + "step": 7058 + }, + { + "epoch": 2.217641400160983, + "grad_norm": 0.921875, + "learning_rate": 1.124087591240876e-05, + "loss": 1.1417, + "step": 7060 + }, + { + "epoch": 2.2182696271865243, + "grad_norm": 1.734375, + "learning_rate": 1.1238337035861633e-05, + "loss": 1.1248, + "step": 7062 + }, + { + "epoch": 2.218897854212066, + "grad_norm": 0.9296875, + "learning_rate": 1.1235798159314503e-05, + "loss": 1.1408, + "step": 7064 + }, + { + "epoch": 2.2195260812376074, + "grad_norm": 0.890625, + "learning_rate": 1.1233259282767376e-05, + "loss": 1.1888, + "step": 7066 + }, + { + "epoch": 2.2201543082631487, + "grad_norm": 0.875, + "learning_rate": 1.1230720406220248e-05, + "loss": 1.3227, + "step": 7068 + }, + { + "epoch": 2.22078253528869, + "grad_norm": 0.91796875, + "learning_rate": 1.122818152967312e-05, + "loss": 1.2715, + "step": 7070 + }, + { + "epoch": 2.2214107623142314, + "grad_norm": 0.8125, + "learning_rate": 1.1225642653125992e-05, + "loss": 1.1767, + "step": 7072 + }, + { + "epoch": 2.2220389893397727, + "grad_norm": 0.86328125, + "learning_rate": 1.1223103776578865e-05, + "loss": 1.2829, + "step": 7074 + }, + { + "epoch": 2.222667216365314, + "grad_norm": 0.95703125, + "learning_rate": 1.1220564900031737e-05, + "loss": 1.1629, + "step": 7076 + }, + { + "epoch": 2.2232954433908554, + "grad_norm": 0.9296875, + "learning_rate": 1.121802602348461e-05, + "loss": 1.252, + "step": 7078 + }, + { + "epoch": 2.2239236704163967, + "grad_norm": 0.79296875, + "learning_rate": 1.1215487146937481e-05, + "loss": 1.0698, + "step": 7080 + }, + { + "epoch": 2.224551897441938, + "grad_norm": 0.88671875, + "learning_rate": 1.1212948270390354e-05, + "loss": 1.136, + "step": 7082 + }, + { + "epoch": 2.2251801244674794, + "grad_norm": 0.98046875, + "learning_rate": 1.1210409393843224e-05, + "loss": 1.1405, + "step": 7084 + }, + { + "epoch": 2.2258083514930207, + "grad_norm": 0.86328125, + "learning_rate": 1.1207870517296099e-05, + "loss": 1.2, + "step": 7086 + }, + { + "epoch": 2.226436578518562, + "grad_norm": 0.89453125, + "learning_rate": 1.1205331640748968e-05, + "loss": 1.1154, + "step": 7088 + }, + { + "epoch": 2.2270648055441034, + "grad_norm": 0.85546875, + "learning_rate": 1.1202792764201841e-05, + "loss": 1.1577, + "step": 7090 + }, + { + "epoch": 2.2276930325696447, + "grad_norm": 0.85546875, + "learning_rate": 1.1200253887654713e-05, + "loss": 1.1263, + "step": 7092 + }, + { + "epoch": 2.228321259595186, + "grad_norm": 0.8515625, + "learning_rate": 1.1197715011107586e-05, + "loss": 1.0956, + "step": 7094 + }, + { + "epoch": 2.228949486620728, + "grad_norm": 0.91015625, + "learning_rate": 1.1195176134560457e-05, + "loss": 1.0752, + "step": 7096 + }, + { + "epoch": 2.229577713646269, + "grad_norm": 0.83203125, + "learning_rate": 1.119263725801333e-05, + "loss": 1.08, + "step": 7098 + }, + { + "epoch": 2.2302059406718104, + "grad_norm": 0.7734375, + "learning_rate": 1.1190098381466202e-05, + "loss": 1.206, + "step": 7100 + }, + { + "epoch": 2.2308341676973518, + "grad_norm": 0.875, + "learning_rate": 1.1187559504919075e-05, + "loss": 1.1989, + "step": 7102 + }, + { + "epoch": 2.231462394722893, + "grad_norm": 1.0, + "learning_rate": 1.1185020628371946e-05, + "loss": 1.2279, + "step": 7104 + }, + { + "epoch": 2.2320906217484344, + "grad_norm": 0.8671875, + "learning_rate": 1.1182481751824819e-05, + "loss": 1.2506, + "step": 7106 + }, + { + "epoch": 2.2327188487739758, + "grad_norm": 0.92578125, + "learning_rate": 1.1179942875277689e-05, + "loss": 1.1432, + "step": 7108 + }, + { + "epoch": 2.233347075799517, + "grad_norm": 0.85546875, + "learning_rate": 1.1177403998730562e-05, + "loss": 1.2258, + "step": 7110 + }, + { + "epoch": 2.2339753028250584, + "grad_norm": 0.88671875, + "learning_rate": 1.1174865122183433e-05, + "loss": 1.1279, + "step": 7112 + }, + { + "epoch": 2.2346035298505997, + "grad_norm": 0.87890625, + "learning_rate": 1.1172326245636306e-05, + "loss": 1.2938, + "step": 7114 + }, + { + "epoch": 2.235231756876141, + "grad_norm": 0.84765625, + "learning_rate": 1.116978736908918e-05, + "loss": 1.1135, + "step": 7116 + }, + { + "epoch": 2.2358599839016824, + "grad_norm": 0.8359375, + "learning_rate": 1.116724849254205e-05, + "loss": 1.1685, + "step": 7118 + }, + { + "epoch": 2.2364882109272237, + "grad_norm": 0.85546875, + "learning_rate": 1.1164709615994924e-05, + "loss": 1.1768, + "step": 7120 + }, + { + "epoch": 2.237116437952765, + "grad_norm": 0.921875, + "learning_rate": 1.1162170739447795e-05, + "loss": 1.118, + "step": 7122 + }, + { + "epoch": 2.2377446649783064, + "grad_norm": 0.95703125, + "learning_rate": 1.1159631862900668e-05, + "loss": 1.1579, + "step": 7124 + }, + { + "epoch": 2.2383728920038477, + "grad_norm": 0.8203125, + "learning_rate": 1.115709298635354e-05, + "loss": 1.2208, + "step": 7126 + }, + { + "epoch": 2.239001119029389, + "grad_norm": 0.84765625, + "learning_rate": 1.1154554109806413e-05, + "loss": 1.2217, + "step": 7128 + }, + { + "epoch": 2.239629346054931, + "grad_norm": 0.8515625, + "learning_rate": 1.1152015233259284e-05, + "loss": 1.0771, + "step": 7130 + }, + { + "epoch": 2.240257573080472, + "grad_norm": 0.84765625, + "learning_rate": 1.1149476356712157e-05, + "loss": 1.366, + "step": 7132 + }, + { + "epoch": 2.2408858001060135, + "grad_norm": 0.83984375, + "learning_rate": 1.1146937480165027e-05, + "loss": 1.118, + "step": 7134 + }, + { + "epoch": 2.241514027131555, + "grad_norm": 0.8984375, + "learning_rate": 1.11443986036179e-05, + "loss": 1.1676, + "step": 7136 + }, + { + "epoch": 2.242142254157096, + "grad_norm": 0.8671875, + "learning_rate": 1.1141859727070771e-05, + "loss": 1.2008, + "step": 7138 + }, + { + "epoch": 2.2427704811826374, + "grad_norm": 0.875, + "learning_rate": 1.1139320850523645e-05, + "loss": 1.1776, + "step": 7140 + }, + { + "epoch": 2.2433987082081788, + "grad_norm": 0.8515625, + "learning_rate": 1.1136781973976516e-05, + "loss": 1.2754, + "step": 7142 + }, + { + "epoch": 2.24402693523372, + "grad_norm": 0.80859375, + "learning_rate": 1.1134243097429389e-05, + "loss": 1.2551, + "step": 7144 + }, + { + "epoch": 2.2446551622592614, + "grad_norm": 0.8828125, + "learning_rate": 1.113170422088226e-05, + "loss": 1.1994, + "step": 7146 + }, + { + "epoch": 2.2452833892848028, + "grad_norm": 0.859375, + "learning_rate": 1.1129165344335133e-05, + "loss": 1.2192, + "step": 7148 + }, + { + "epoch": 2.245911616310344, + "grad_norm": 0.9296875, + "learning_rate": 1.1126626467788005e-05, + "loss": 1.2826, + "step": 7150 + }, + { + "epoch": 2.2465398433358854, + "grad_norm": 0.90625, + "learning_rate": 1.1124087591240878e-05, + "loss": 1.2019, + "step": 7152 + }, + { + "epoch": 2.2471680703614267, + "grad_norm": 0.81640625, + "learning_rate": 1.112154871469375e-05, + "loss": 1.1506, + "step": 7154 + }, + { + "epoch": 2.247796297386968, + "grad_norm": 0.9453125, + "learning_rate": 1.1119009838146622e-05, + "loss": 1.0458, + "step": 7156 + }, + { + "epoch": 2.2484245244125094, + "grad_norm": 0.8828125, + "learning_rate": 1.1116470961599492e-05, + "loss": 1.1759, + "step": 7158 + }, + { + "epoch": 2.2490527514380507, + "grad_norm": 0.95703125, + "learning_rate": 1.1113932085052365e-05, + "loss": 1.1225, + "step": 7160 + }, + { + "epoch": 2.2496809784635925, + "grad_norm": 0.87109375, + "learning_rate": 1.1111393208505236e-05, + "loss": 1.1735, + "step": 7162 + }, + { + "epoch": 2.250309205489134, + "grad_norm": 0.9140625, + "learning_rate": 1.110885433195811e-05, + "loss": 1.0964, + "step": 7164 + }, + { + "epoch": 2.250937432514675, + "grad_norm": 0.94140625, + "learning_rate": 1.1106315455410981e-05, + "loss": 1.1575, + "step": 7166 + }, + { + "epoch": 2.2515656595402165, + "grad_norm": 0.828125, + "learning_rate": 1.1103776578863854e-05, + "loss": 1.308, + "step": 7168 + }, + { + "epoch": 2.252193886565758, + "grad_norm": 0.9296875, + "learning_rate": 1.1101237702316725e-05, + "loss": 1.2186, + "step": 7170 + }, + { + "epoch": 2.252822113591299, + "grad_norm": 0.90625, + "learning_rate": 1.1098698825769598e-05, + "loss": 1.2044, + "step": 7172 + }, + { + "epoch": 2.2534503406168405, + "grad_norm": 0.91796875, + "learning_rate": 1.109615994922247e-05, + "loss": 1.1716, + "step": 7174 + }, + { + "epoch": 2.254078567642382, + "grad_norm": 0.8984375, + "learning_rate": 1.1093621072675343e-05, + "loss": 1.1242, + "step": 7176 + }, + { + "epoch": 2.254706794667923, + "grad_norm": 0.79296875, + "learning_rate": 1.1091082196128213e-05, + "loss": 1.254, + "step": 7178 + }, + { + "epoch": 2.2553350216934644, + "grad_norm": 0.95703125, + "learning_rate": 1.1088543319581086e-05, + "loss": 1.2207, + "step": 7180 + }, + { + "epoch": 2.2559632487190058, + "grad_norm": 0.88671875, + "learning_rate": 1.1086004443033957e-05, + "loss": 1.2314, + "step": 7182 + }, + { + "epoch": 2.256591475744547, + "grad_norm": 0.8671875, + "learning_rate": 1.108346556648683e-05, + "loss": 1.1507, + "step": 7184 + }, + { + "epoch": 2.2572197027700884, + "grad_norm": 0.85546875, + "learning_rate": 1.1080926689939702e-05, + "loss": 1.0976, + "step": 7186 + }, + { + "epoch": 2.2578479297956298, + "grad_norm": 0.87109375, + "learning_rate": 1.1078387813392575e-05, + "loss": 1.2232, + "step": 7188 + }, + { + "epoch": 2.258476156821171, + "grad_norm": 0.8515625, + "learning_rate": 1.1075848936845446e-05, + "loss": 1.3585, + "step": 7190 + }, + { + "epoch": 2.2591043838467124, + "grad_norm": 0.93359375, + "learning_rate": 1.1073310060298319e-05, + "loss": 1.2097, + "step": 7192 + }, + { + "epoch": 2.2597326108722537, + "grad_norm": 0.81640625, + "learning_rate": 1.107077118375119e-05, + "loss": 1.1563, + "step": 7194 + }, + { + "epoch": 2.260360837897795, + "grad_norm": 0.94140625, + "learning_rate": 1.1068232307204064e-05, + "loss": 1.1191, + "step": 7196 + }, + { + "epoch": 2.260989064923337, + "grad_norm": 0.98046875, + "learning_rate": 1.1065693430656935e-05, + "loss": 1.1386, + "step": 7198 + }, + { + "epoch": 2.261617291948878, + "grad_norm": 0.9765625, + "learning_rate": 1.1063154554109808e-05, + "loss": 1.0648, + "step": 7200 + }, + { + "epoch": 2.2622455189744195, + "grad_norm": 0.97265625, + "learning_rate": 1.1060615677562681e-05, + "loss": 1.2043, + "step": 7202 + }, + { + "epoch": 2.262873745999961, + "grad_norm": 0.8828125, + "learning_rate": 1.105807680101555e-05, + "loss": 1.1253, + "step": 7204 + }, + { + "epoch": 2.263501973025502, + "grad_norm": 0.83203125, + "learning_rate": 1.1055537924468424e-05, + "loss": 1.2436, + "step": 7206 + }, + { + "epoch": 2.2641302000510435, + "grad_norm": 1.0234375, + "learning_rate": 1.1052999047921295e-05, + "loss": 1.1287, + "step": 7208 + }, + { + "epoch": 2.264758427076585, + "grad_norm": 0.875, + "learning_rate": 1.1050460171374168e-05, + "loss": 1.1132, + "step": 7210 + }, + { + "epoch": 2.265386654102126, + "grad_norm": 0.8515625, + "learning_rate": 1.104792129482704e-05, + "loss": 1.1685, + "step": 7212 + }, + { + "epoch": 2.2660148811276675, + "grad_norm": 1.0546875, + "learning_rate": 1.1045382418279913e-05, + "loss": 1.1164, + "step": 7214 + }, + { + "epoch": 2.266643108153209, + "grad_norm": 0.8515625, + "learning_rate": 1.1042843541732784e-05, + "loss": 1.259, + "step": 7216 + }, + { + "epoch": 2.26727133517875, + "grad_norm": 0.78125, + "learning_rate": 1.1040304665185657e-05, + "loss": 1.2067, + "step": 7218 + }, + { + "epoch": 2.2678995622042915, + "grad_norm": 0.9375, + "learning_rate": 1.1037765788638529e-05, + "loss": 1.2234, + "step": 7220 + }, + { + "epoch": 2.268527789229833, + "grad_norm": 0.8203125, + "learning_rate": 1.1035226912091402e-05, + "loss": 1.2389, + "step": 7222 + }, + { + "epoch": 2.269156016255374, + "grad_norm": 0.81640625, + "learning_rate": 1.1032688035544273e-05, + "loss": 1.307, + "step": 7224 + }, + { + "epoch": 2.269784243280916, + "grad_norm": 0.91015625, + "learning_rate": 1.1030149158997146e-05, + "loss": 1.1486, + "step": 7226 + }, + { + "epoch": 2.270412470306457, + "grad_norm": 0.90234375, + "learning_rate": 1.1027610282450016e-05, + "loss": 1.2867, + "step": 7228 + }, + { + "epoch": 2.2710406973319985, + "grad_norm": 1.0390625, + "learning_rate": 1.1025071405902889e-05, + "loss": 1.1398, + "step": 7230 + }, + { + "epoch": 2.27166892435754, + "grad_norm": 0.875, + "learning_rate": 1.102253252935576e-05, + "loss": 1.1913, + "step": 7232 + }, + { + "epoch": 2.272297151383081, + "grad_norm": 0.828125, + "learning_rate": 1.1019993652808633e-05, + "loss": 1.2196, + "step": 7234 + }, + { + "epoch": 2.2729253784086225, + "grad_norm": 0.87109375, + "learning_rate": 1.1017454776261505e-05, + "loss": 1.209, + "step": 7236 + }, + { + "epoch": 2.273553605434164, + "grad_norm": 0.84765625, + "learning_rate": 1.1014915899714378e-05, + "loss": 1.0517, + "step": 7238 + }, + { + "epoch": 2.274181832459705, + "grad_norm": 0.84765625, + "learning_rate": 1.101237702316725e-05, + "loss": 1.1559, + "step": 7240 + }, + { + "epoch": 2.2748100594852465, + "grad_norm": 0.8828125, + "learning_rate": 1.1009838146620122e-05, + "loss": 1.2096, + "step": 7242 + }, + { + "epoch": 2.275438286510788, + "grad_norm": 0.82421875, + "learning_rate": 1.1007299270072994e-05, + "loss": 1.0206, + "step": 7244 + }, + { + "epoch": 2.276066513536329, + "grad_norm": 0.8671875, + "learning_rate": 1.1004760393525867e-05, + "loss": 1.2358, + "step": 7246 + }, + { + "epoch": 2.2766947405618705, + "grad_norm": 0.87109375, + "learning_rate": 1.1002221516978736e-05, + "loss": 1.1215, + "step": 7248 + }, + { + "epoch": 2.277322967587412, + "grad_norm": 0.8359375, + "learning_rate": 1.0999682640431611e-05, + "loss": 1.313, + "step": 7250 + }, + { + "epoch": 2.277951194612953, + "grad_norm": 0.87890625, + "learning_rate": 1.0997143763884481e-05, + "loss": 1.1596, + "step": 7252 + }, + { + "epoch": 2.2785794216384945, + "grad_norm": 0.8515625, + "learning_rate": 1.0994604887337354e-05, + "loss": 1.1222, + "step": 7254 + }, + { + "epoch": 2.279207648664036, + "grad_norm": 0.8984375, + "learning_rate": 1.0992066010790225e-05, + "loss": 1.116, + "step": 7256 + }, + { + "epoch": 2.279835875689577, + "grad_norm": 0.86328125, + "learning_rate": 1.0989527134243098e-05, + "loss": 1.1827, + "step": 7258 + }, + { + "epoch": 2.2804641027151185, + "grad_norm": 0.91796875, + "learning_rate": 1.098698825769597e-05, + "loss": 1.1286, + "step": 7260 + }, + { + "epoch": 2.28109232974066, + "grad_norm": 0.83203125, + "learning_rate": 1.0984449381148843e-05, + "loss": 0.9997, + "step": 7262 + }, + { + "epoch": 2.2817205567662016, + "grad_norm": 0.875, + "learning_rate": 1.0981910504601714e-05, + "loss": 1.1823, + "step": 7264 + }, + { + "epoch": 2.282348783791743, + "grad_norm": 0.8203125, + "learning_rate": 1.0979371628054587e-05, + "loss": 1.2145, + "step": 7266 + }, + { + "epoch": 2.282977010817284, + "grad_norm": 0.89453125, + "learning_rate": 1.0976832751507459e-05, + "loss": 1.1597, + "step": 7268 + }, + { + "epoch": 2.2836052378428255, + "grad_norm": 0.8359375, + "learning_rate": 1.0974293874960332e-05, + "loss": 1.1206, + "step": 7270 + }, + { + "epoch": 2.284233464868367, + "grad_norm": 0.8984375, + "learning_rate": 1.0971754998413201e-05, + "loss": 1.1522, + "step": 7272 + }, + { + "epoch": 2.284861691893908, + "grad_norm": 0.8515625, + "learning_rate": 1.0969216121866075e-05, + "loss": 1.2065, + "step": 7274 + }, + { + "epoch": 2.2854899189194495, + "grad_norm": 0.9140625, + "learning_rate": 1.0966677245318946e-05, + "loss": 1.2534, + "step": 7276 + }, + { + "epoch": 2.286118145944991, + "grad_norm": 0.84375, + "learning_rate": 1.0964138368771819e-05, + "loss": 1.3111, + "step": 7278 + }, + { + "epoch": 2.286746372970532, + "grad_norm": 0.83203125, + "learning_rate": 1.096159949222469e-05, + "loss": 1.1712, + "step": 7280 + }, + { + "epoch": 2.2873745999960735, + "grad_norm": 0.81640625, + "learning_rate": 1.0959060615677563e-05, + "loss": 1.2978, + "step": 7282 + }, + { + "epoch": 2.288002827021615, + "grad_norm": 1.0078125, + "learning_rate": 1.0956521739130435e-05, + "loss": 1.1384, + "step": 7284 + }, + { + "epoch": 2.288631054047156, + "grad_norm": 0.91796875, + "learning_rate": 1.0953982862583308e-05, + "loss": 1.1677, + "step": 7286 + }, + { + "epoch": 2.2892592810726975, + "grad_norm": 0.84765625, + "learning_rate": 1.0951443986036181e-05, + "loss": 1.2128, + "step": 7288 + }, + { + "epoch": 2.289887508098239, + "grad_norm": 1.0703125, + "learning_rate": 1.0948905109489052e-05, + "loss": 1.274, + "step": 7290 + }, + { + "epoch": 2.2905157351237806, + "grad_norm": 0.890625, + "learning_rate": 1.0946366232941925e-05, + "loss": 1.1172, + "step": 7292 + }, + { + "epoch": 2.291143962149322, + "grad_norm": 0.8671875, + "learning_rate": 1.0943827356394797e-05, + "loss": 1.2049, + "step": 7294 + }, + { + "epoch": 2.2917721891748633, + "grad_norm": 0.859375, + "learning_rate": 1.094128847984767e-05, + "loss": 1.1945, + "step": 7296 + }, + { + "epoch": 2.2924004162004046, + "grad_norm": 0.8671875, + "learning_rate": 1.093874960330054e-05, + "loss": 1.0366, + "step": 7298 + }, + { + "epoch": 2.293028643225946, + "grad_norm": 0.84765625, + "learning_rate": 1.0936210726753413e-05, + "loss": 1.1452, + "step": 7300 + }, + { + "epoch": 2.2936568702514872, + "grad_norm": 0.87109375, + "learning_rate": 1.0933671850206284e-05, + "loss": 1.2821, + "step": 7302 + }, + { + "epoch": 2.2942850972770286, + "grad_norm": 0.88671875, + "learning_rate": 1.0931132973659157e-05, + "loss": 1.0961, + "step": 7304 + }, + { + "epoch": 2.29491332430257, + "grad_norm": 0.78515625, + "learning_rate": 1.0928594097112029e-05, + "loss": 1.2104, + "step": 7306 + }, + { + "epoch": 2.295541551328111, + "grad_norm": 0.8515625, + "learning_rate": 1.0926055220564902e-05, + "loss": 1.29, + "step": 7308 + }, + { + "epoch": 2.2961697783536525, + "grad_norm": 0.8515625, + "learning_rate": 1.0923516344017773e-05, + "loss": 1.2525, + "step": 7310 + }, + { + "epoch": 2.296798005379194, + "grad_norm": 0.8828125, + "learning_rate": 1.0920977467470646e-05, + "loss": 1.2847, + "step": 7312 + }, + { + "epoch": 2.297426232404735, + "grad_norm": 0.90234375, + "learning_rate": 1.0918438590923517e-05, + "loss": 1.1948, + "step": 7314 + }, + { + "epoch": 2.2980544594302765, + "grad_norm": 0.91796875, + "learning_rate": 1.091589971437639e-05, + "loss": 1.1595, + "step": 7316 + }, + { + "epoch": 2.298682686455818, + "grad_norm": 0.84375, + "learning_rate": 1.091336083782926e-05, + "loss": 1.2342, + "step": 7318 + }, + { + "epoch": 2.299310913481359, + "grad_norm": 0.83203125, + "learning_rate": 1.0910821961282135e-05, + "loss": 1.1913, + "step": 7320 + }, + { + "epoch": 2.2999391405069005, + "grad_norm": 0.97265625, + "learning_rate": 1.0908283084735005e-05, + "loss": 1.1333, + "step": 7322 + }, + { + "epoch": 2.300567367532442, + "grad_norm": 0.875, + "learning_rate": 1.0905744208187878e-05, + "loss": 1.1376, + "step": 7324 + }, + { + "epoch": 2.301195594557983, + "grad_norm": 0.88671875, + "learning_rate": 1.0903205331640749e-05, + "loss": 1.1524, + "step": 7326 + }, + { + "epoch": 2.3018238215835245, + "grad_norm": 0.8515625, + "learning_rate": 1.0900666455093622e-05, + "loss": 1.1174, + "step": 7328 + }, + { + "epoch": 2.3024520486090663, + "grad_norm": 0.82421875, + "learning_rate": 1.0898127578546494e-05, + "loss": 1.1047, + "step": 7330 + }, + { + "epoch": 2.3030802756346076, + "grad_norm": 0.8203125, + "learning_rate": 1.0895588701999367e-05, + "loss": 1.1898, + "step": 7332 + }, + { + "epoch": 2.303708502660149, + "grad_norm": 0.8125, + "learning_rate": 1.0893049825452238e-05, + "loss": 1.3392, + "step": 7334 + }, + { + "epoch": 2.3043367296856903, + "grad_norm": 0.8671875, + "learning_rate": 1.0890510948905111e-05, + "loss": 1.2683, + "step": 7336 + }, + { + "epoch": 2.3049649567112316, + "grad_norm": 0.7890625, + "learning_rate": 1.0887972072357982e-05, + "loss": 1.0879, + "step": 7338 + }, + { + "epoch": 2.305593183736773, + "grad_norm": 0.92578125, + "learning_rate": 1.0885433195810856e-05, + "loss": 1.095, + "step": 7340 + }, + { + "epoch": 2.3062214107623142, + "grad_norm": 0.83984375, + "learning_rate": 1.0882894319263725e-05, + "loss": 1.3192, + "step": 7342 + }, + { + "epoch": 2.3068496377878556, + "grad_norm": 0.8828125, + "learning_rate": 1.0880355442716598e-05, + "loss": 1.3008, + "step": 7344 + }, + { + "epoch": 2.307477864813397, + "grad_norm": 0.8203125, + "learning_rate": 1.087781656616947e-05, + "loss": 1.1895, + "step": 7346 + }, + { + "epoch": 2.3081060918389382, + "grad_norm": 0.87109375, + "learning_rate": 1.0875277689622343e-05, + "loss": 1.1833, + "step": 7348 + }, + { + "epoch": 2.3087343188644796, + "grad_norm": 0.8515625, + "learning_rate": 1.0872738813075214e-05, + "loss": 1.1241, + "step": 7350 + }, + { + "epoch": 2.309362545890021, + "grad_norm": 0.921875, + "learning_rate": 1.0870199936528087e-05, + "loss": 1.1805, + "step": 7352 + }, + { + "epoch": 2.309990772915562, + "grad_norm": 0.87109375, + "learning_rate": 1.0867661059980959e-05, + "loss": 1.0791, + "step": 7354 + }, + { + "epoch": 2.3106189999411035, + "grad_norm": 0.8828125, + "learning_rate": 1.0865122183433832e-05, + "loss": 1.1263, + "step": 7356 + }, + { + "epoch": 2.3112472269666453, + "grad_norm": 0.828125, + "learning_rate": 1.0862583306886703e-05, + "loss": 1.1829, + "step": 7358 + }, + { + "epoch": 2.3118754539921866, + "grad_norm": 1.0078125, + "learning_rate": 1.0860044430339576e-05, + "loss": 1.2385, + "step": 7360 + }, + { + "epoch": 2.312503681017728, + "grad_norm": 0.83203125, + "learning_rate": 1.0857505553792448e-05, + "loss": 1.1448, + "step": 7362 + }, + { + "epoch": 2.3131319080432693, + "grad_norm": 1.140625, + "learning_rate": 1.085496667724532e-05, + "loss": 1.2102, + "step": 7364 + }, + { + "epoch": 2.3137601350688106, + "grad_norm": 0.83203125, + "learning_rate": 1.085242780069819e-05, + "loss": 1.2639, + "step": 7366 + }, + { + "epoch": 2.314388362094352, + "grad_norm": 0.8515625, + "learning_rate": 1.0849888924151063e-05, + "loss": 1.1321, + "step": 7368 + }, + { + "epoch": 2.3150165891198933, + "grad_norm": 0.89453125, + "learning_rate": 1.0847350047603935e-05, + "loss": 1.0745, + "step": 7370 + }, + { + "epoch": 2.3156448161454346, + "grad_norm": 0.95703125, + "learning_rate": 1.0844811171056808e-05, + "loss": 1.1193, + "step": 7372 + }, + { + "epoch": 2.316273043170976, + "grad_norm": 0.9140625, + "learning_rate": 1.0842272294509681e-05, + "loss": 1.303, + "step": 7374 + }, + { + "epoch": 2.3169012701965173, + "grad_norm": 0.8359375, + "learning_rate": 1.0839733417962552e-05, + "loss": 1.1937, + "step": 7376 + }, + { + "epoch": 2.3175294972220586, + "grad_norm": 0.8359375, + "learning_rate": 1.0837194541415425e-05, + "loss": 1.229, + "step": 7378 + }, + { + "epoch": 2.3181577242476, + "grad_norm": 1.0234375, + "learning_rate": 1.0834655664868297e-05, + "loss": 1.0703, + "step": 7380 + }, + { + "epoch": 2.3187859512731412, + "grad_norm": 0.7578125, + "learning_rate": 1.083211678832117e-05, + "loss": 1.3122, + "step": 7382 + }, + { + "epoch": 2.3194141782986826, + "grad_norm": 0.875, + "learning_rate": 1.0829577911774041e-05, + "loss": 1.208, + "step": 7384 + }, + { + "epoch": 2.320042405324224, + "grad_norm": 0.8125, + "learning_rate": 1.0827039035226914e-05, + "loss": 1.1552, + "step": 7386 + }, + { + "epoch": 2.3206706323497652, + "grad_norm": 0.96875, + "learning_rate": 1.0824500158679786e-05, + "loss": 1.1077, + "step": 7388 + }, + { + "epoch": 2.3212988593753066, + "grad_norm": 0.81640625, + "learning_rate": 1.0821961282132659e-05, + "loss": 1.2388, + "step": 7390 + }, + { + "epoch": 2.321927086400848, + "grad_norm": 0.91015625, + "learning_rate": 1.0819422405585528e-05, + "loss": 1.2115, + "step": 7392 + }, + { + "epoch": 2.322555313426389, + "grad_norm": 0.87890625, + "learning_rate": 1.0816883529038402e-05, + "loss": 1.1479, + "step": 7394 + }, + { + "epoch": 2.323183540451931, + "grad_norm": 0.8203125, + "learning_rate": 1.0814344652491273e-05, + "loss": 1.2548, + "step": 7396 + }, + { + "epoch": 2.3238117674774723, + "grad_norm": 0.90234375, + "learning_rate": 1.0811805775944146e-05, + "loss": 1.1445, + "step": 7398 + }, + { + "epoch": 2.3244399945030136, + "grad_norm": 0.84765625, + "learning_rate": 1.0809266899397017e-05, + "loss": 1.1688, + "step": 7400 + }, + { + "epoch": 2.325068221528555, + "grad_norm": 0.8203125, + "learning_rate": 1.080672802284989e-05, + "loss": 1.2903, + "step": 7402 + }, + { + "epoch": 2.3256964485540963, + "grad_norm": 0.828125, + "learning_rate": 1.0804189146302762e-05, + "loss": 1.2819, + "step": 7404 + }, + { + "epoch": 2.3263246755796376, + "grad_norm": 0.87890625, + "learning_rate": 1.0801650269755635e-05, + "loss": 1.1805, + "step": 7406 + }, + { + "epoch": 2.326952902605179, + "grad_norm": 0.921875, + "learning_rate": 1.0799111393208506e-05, + "loss": 0.9431, + "step": 7408 + }, + { + "epoch": 2.3275811296307203, + "grad_norm": 0.828125, + "learning_rate": 1.079657251666138e-05, + "loss": 1.0793, + "step": 7410 + }, + { + "epoch": 2.3282093566562616, + "grad_norm": 1.0078125, + "learning_rate": 1.0794033640114249e-05, + "loss": 1.1367, + "step": 7412 + }, + { + "epoch": 2.328837583681803, + "grad_norm": 0.84765625, + "learning_rate": 1.0791494763567124e-05, + "loss": 1.1915, + "step": 7414 + }, + { + "epoch": 2.3294658107073443, + "grad_norm": 0.80859375, + "learning_rate": 1.0788955887019993e-05, + "loss": 1.1777, + "step": 7416 + }, + { + "epoch": 2.3300940377328856, + "grad_norm": 0.86328125, + "learning_rate": 1.0786417010472867e-05, + "loss": 1.1717, + "step": 7418 + }, + { + "epoch": 2.330722264758427, + "grad_norm": 0.83984375, + "learning_rate": 1.0783878133925738e-05, + "loss": 1.229, + "step": 7420 + }, + { + "epoch": 2.3313504917839682, + "grad_norm": 0.86328125, + "learning_rate": 1.0781339257378611e-05, + "loss": 1.1072, + "step": 7422 + }, + { + "epoch": 2.33197871880951, + "grad_norm": 0.90234375, + "learning_rate": 1.0778800380831482e-05, + "loss": 1.2157, + "step": 7424 + }, + { + "epoch": 2.3326069458350513, + "grad_norm": 0.828125, + "learning_rate": 1.0776261504284355e-05, + "loss": 1.1263, + "step": 7426 + }, + { + "epoch": 2.3332351728605927, + "grad_norm": 0.83984375, + "learning_rate": 1.0773722627737227e-05, + "loss": 1.2423, + "step": 7428 + }, + { + "epoch": 2.333863399886134, + "grad_norm": 0.89453125, + "learning_rate": 1.07711837511901e-05, + "loss": 1.2594, + "step": 7430 + }, + { + "epoch": 2.3344916269116753, + "grad_norm": 0.8671875, + "learning_rate": 1.0768644874642971e-05, + "loss": 1.2169, + "step": 7432 + }, + { + "epoch": 2.3351198539372167, + "grad_norm": 0.85546875, + "learning_rate": 1.0766105998095844e-05, + "loss": 1.2165, + "step": 7434 + }, + { + "epoch": 2.335748080962758, + "grad_norm": 0.84765625, + "learning_rate": 1.0763567121548714e-05, + "loss": 1.1919, + "step": 7436 + }, + { + "epoch": 2.3363763079882993, + "grad_norm": 0.91796875, + "learning_rate": 1.0761028245001587e-05, + "loss": 1.1077, + "step": 7438 + }, + { + "epoch": 2.3370045350138406, + "grad_norm": 0.84375, + "learning_rate": 1.0758489368454459e-05, + "loss": 1.1936, + "step": 7440 + }, + { + "epoch": 2.337632762039382, + "grad_norm": 0.8671875, + "learning_rate": 1.0755950491907332e-05, + "loss": 1.088, + "step": 7442 + }, + { + "epoch": 2.3382609890649233, + "grad_norm": 0.92578125, + "learning_rate": 1.0753411615360203e-05, + "loss": 1.2247, + "step": 7444 + }, + { + "epoch": 2.3388892160904646, + "grad_norm": 0.90234375, + "learning_rate": 1.0750872738813076e-05, + "loss": 1.1426, + "step": 7446 + }, + { + "epoch": 2.339517443116006, + "grad_norm": 0.94921875, + "learning_rate": 1.0748333862265947e-05, + "loss": 1.1249, + "step": 7448 + }, + { + "epoch": 2.3401456701415473, + "grad_norm": 0.87109375, + "learning_rate": 1.074579498571882e-05, + "loss": 0.98, + "step": 7450 + }, + { + "epoch": 2.3407738971670886, + "grad_norm": 0.96875, + "learning_rate": 1.0743256109171692e-05, + "loss": 1.1628, + "step": 7452 + }, + { + "epoch": 2.34140212419263, + "grad_norm": 0.91796875, + "learning_rate": 1.0740717232624565e-05, + "loss": 1.067, + "step": 7454 + }, + { + "epoch": 2.3420303512181713, + "grad_norm": 0.890625, + "learning_rate": 1.0738178356077435e-05, + "loss": 1.1918, + "step": 7456 + }, + { + "epoch": 2.3426585782437126, + "grad_norm": 0.859375, + "learning_rate": 1.073563947953031e-05, + "loss": 1.2056, + "step": 7458 + }, + { + "epoch": 2.3432868052692544, + "grad_norm": 0.859375, + "learning_rate": 1.0733100602983183e-05, + "loss": 1.1389, + "step": 7460 + }, + { + "epoch": 2.3439150322947957, + "grad_norm": 0.8125, + "learning_rate": 1.0730561726436052e-05, + "loss": 1.1479, + "step": 7462 + }, + { + "epoch": 2.344543259320337, + "grad_norm": 0.97265625, + "learning_rate": 1.0728022849888925e-05, + "loss": 1.119, + "step": 7464 + }, + { + "epoch": 2.3451714863458784, + "grad_norm": 0.9296875, + "learning_rate": 1.0725483973341797e-05, + "loss": 1.0562, + "step": 7466 + }, + { + "epoch": 2.3457997133714197, + "grad_norm": 0.91015625, + "learning_rate": 1.072294509679467e-05, + "loss": 1.097, + "step": 7468 + }, + { + "epoch": 2.346427940396961, + "grad_norm": 0.88671875, + "learning_rate": 1.0720406220247541e-05, + "loss": 1.2363, + "step": 7470 + }, + { + "epoch": 2.3470561674225023, + "grad_norm": 0.8125, + "learning_rate": 1.0717867343700414e-05, + "loss": 1.168, + "step": 7472 + }, + { + "epoch": 2.3476843944480437, + "grad_norm": 0.9140625, + "learning_rate": 1.0715328467153286e-05, + "loss": 1.0952, + "step": 7474 + }, + { + "epoch": 2.348312621473585, + "grad_norm": 0.88671875, + "learning_rate": 1.0712789590606159e-05, + "loss": 1.2332, + "step": 7476 + }, + { + "epoch": 2.3489408484991263, + "grad_norm": 0.85546875, + "learning_rate": 1.071025071405903e-05, + "loss": 1.1856, + "step": 7478 + }, + { + "epoch": 2.3495690755246676, + "grad_norm": 0.859375, + "learning_rate": 1.0707711837511903e-05, + "loss": 1.2385, + "step": 7480 + }, + { + "epoch": 2.350197302550209, + "grad_norm": 0.96875, + "learning_rate": 1.0705172960964773e-05, + "loss": 1.2637, + "step": 7482 + }, + { + "epoch": 2.3508255295757503, + "grad_norm": 0.94140625, + "learning_rate": 1.0702634084417648e-05, + "loss": 1.0995, + "step": 7484 + }, + { + "epoch": 2.3514537566012916, + "grad_norm": 0.953125, + "learning_rate": 1.0700095207870517e-05, + "loss": 1.2033, + "step": 7486 + }, + { + "epoch": 2.352081983626833, + "grad_norm": 1.0546875, + "learning_rate": 1.069755633132339e-05, + "loss": 1.2124, + "step": 7488 + }, + { + "epoch": 2.3527102106523747, + "grad_norm": 0.9765625, + "learning_rate": 1.0695017454776262e-05, + "loss": 1.2302, + "step": 7490 + }, + { + "epoch": 2.353338437677916, + "grad_norm": 0.8984375, + "learning_rate": 1.0692478578229135e-05, + "loss": 1.2885, + "step": 7492 + }, + { + "epoch": 2.3539666647034574, + "grad_norm": 0.796875, + "learning_rate": 1.0689939701682006e-05, + "loss": 1.1231, + "step": 7494 + }, + { + "epoch": 2.3545948917289987, + "grad_norm": 0.796875, + "learning_rate": 1.068740082513488e-05, + "loss": 1.1761, + "step": 7496 + }, + { + "epoch": 2.35522311875454, + "grad_norm": 0.859375, + "learning_rate": 1.068486194858775e-05, + "loss": 1.2641, + "step": 7498 + }, + { + "epoch": 2.3558513457800814, + "grad_norm": 0.79296875, + "learning_rate": 1.0682323072040624e-05, + "loss": 1.1151, + "step": 7500 + }, + { + "epoch": 2.3564795728056227, + "grad_norm": 0.86328125, + "learning_rate": 1.0679784195493495e-05, + "loss": 1.2444, + "step": 7502 + }, + { + "epoch": 2.357107799831164, + "grad_norm": 0.85546875, + "learning_rate": 1.0677245318946368e-05, + "loss": 1.272, + "step": 7504 + }, + { + "epoch": 2.3577360268567054, + "grad_norm": 0.86328125, + "learning_rate": 1.0674706442399238e-05, + "loss": 1.1819, + "step": 7506 + }, + { + "epoch": 2.3583642538822467, + "grad_norm": 0.859375, + "learning_rate": 1.0672167565852111e-05, + "loss": 1.2951, + "step": 7508 + }, + { + "epoch": 2.358992480907788, + "grad_norm": 0.828125, + "learning_rate": 1.0669628689304982e-05, + "loss": 1.1947, + "step": 7510 + }, + { + "epoch": 2.3596207079333293, + "grad_norm": 0.83984375, + "learning_rate": 1.0667089812757855e-05, + "loss": 1.2004, + "step": 7512 + }, + { + "epoch": 2.3602489349588707, + "grad_norm": 1.0703125, + "learning_rate": 1.0664550936210727e-05, + "loss": 1.2573, + "step": 7514 + }, + { + "epoch": 2.360877161984412, + "grad_norm": 0.83203125, + "learning_rate": 1.06620120596636e-05, + "loss": 1.1788, + "step": 7516 + }, + { + "epoch": 2.3615053890099533, + "grad_norm": 0.83984375, + "learning_rate": 1.0659473183116471e-05, + "loss": 1.3298, + "step": 7518 + }, + { + "epoch": 2.3621336160354947, + "grad_norm": 0.85546875, + "learning_rate": 1.0656934306569344e-05, + "loss": 1.2565, + "step": 7520 + }, + { + "epoch": 2.362761843061036, + "grad_norm": 0.87890625, + "learning_rate": 1.0654395430022216e-05, + "loss": 1.3338, + "step": 7522 + }, + { + "epoch": 2.3633900700865773, + "grad_norm": 0.96875, + "learning_rate": 1.0651856553475089e-05, + "loss": 1.12, + "step": 7524 + }, + { + "epoch": 2.364018297112119, + "grad_norm": 0.97265625, + "learning_rate": 1.064931767692796e-05, + "loss": 0.9746, + "step": 7526 + }, + { + "epoch": 2.3646465241376604, + "grad_norm": 0.859375, + "learning_rate": 1.0646778800380833e-05, + "loss": 1.2273, + "step": 7528 + }, + { + "epoch": 2.3652747511632017, + "grad_norm": 0.84765625, + "learning_rate": 1.0644239923833703e-05, + "loss": 1.2064, + "step": 7530 + }, + { + "epoch": 2.365902978188743, + "grad_norm": 0.9296875, + "learning_rate": 1.0641701047286576e-05, + "loss": 1.118, + "step": 7532 + }, + { + "epoch": 2.3665312052142844, + "grad_norm": 0.99609375, + "learning_rate": 1.0639162170739447e-05, + "loss": 1.1464, + "step": 7534 + }, + { + "epoch": 2.3671594322398257, + "grad_norm": 0.8984375, + "learning_rate": 1.063662329419232e-05, + "loss": 1.2686, + "step": 7536 + }, + { + "epoch": 2.367787659265367, + "grad_norm": 0.8984375, + "learning_rate": 1.0634084417645192e-05, + "loss": 1.0966, + "step": 7538 + }, + { + "epoch": 2.3684158862909084, + "grad_norm": 0.890625, + "learning_rate": 1.0631545541098065e-05, + "loss": 1.0444, + "step": 7540 + }, + { + "epoch": 2.3690441133164497, + "grad_norm": 1.015625, + "learning_rate": 1.0629006664550936e-05, + "loss": 1.1434, + "step": 7542 + }, + { + "epoch": 2.369672340341991, + "grad_norm": 0.875, + "learning_rate": 1.062646778800381e-05, + "loss": 1.2808, + "step": 7544 + }, + { + "epoch": 2.3703005673675324, + "grad_norm": 0.953125, + "learning_rate": 1.0623928911456682e-05, + "loss": 1.207, + "step": 7546 + }, + { + "epoch": 2.3709287943930737, + "grad_norm": 0.9296875, + "learning_rate": 1.0621390034909554e-05, + "loss": 1.2094, + "step": 7548 + }, + { + "epoch": 2.371557021418615, + "grad_norm": 0.89453125, + "learning_rate": 1.0618851158362427e-05, + "loss": 1.1322, + "step": 7550 + }, + { + "epoch": 2.3721852484441563, + "grad_norm": 0.81640625, + "learning_rate": 1.0616312281815298e-05, + "loss": 1.1648, + "step": 7552 + }, + { + "epoch": 2.3728134754696977, + "grad_norm": 0.828125, + "learning_rate": 1.0613773405268171e-05, + "loss": 1.1761, + "step": 7554 + }, + { + "epoch": 2.3734417024952394, + "grad_norm": 0.83203125, + "learning_rate": 1.0611234528721041e-05, + "loss": 1.1397, + "step": 7556 + }, + { + "epoch": 2.3740699295207808, + "grad_norm": 0.87109375, + "learning_rate": 1.0608695652173914e-05, + "loss": 1.0505, + "step": 7558 + }, + { + "epoch": 2.374698156546322, + "grad_norm": 0.859375, + "learning_rate": 1.0606156775626786e-05, + "loss": 1.3071, + "step": 7560 + }, + { + "epoch": 2.3753263835718634, + "grad_norm": 0.76953125, + "learning_rate": 1.0603617899079659e-05, + "loss": 1.1697, + "step": 7562 + }, + { + "epoch": 2.3759546105974048, + "grad_norm": 0.875, + "learning_rate": 1.060107902253253e-05, + "loss": 1.3307, + "step": 7564 + }, + { + "epoch": 2.376582837622946, + "grad_norm": 0.96484375, + "learning_rate": 1.0598540145985403e-05, + "loss": 1.1086, + "step": 7566 + }, + { + "epoch": 2.3772110646484874, + "grad_norm": 0.875, + "learning_rate": 1.0596001269438274e-05, + "loss": 1.3303, + "step": 7568 + }, + { + "epoch": 2.3778392916740287, + "grad_norm": 0.8828125, + "learning_rate": 1.0593462392891148e-05, + "loss": 1.176, + "step": 7570 + }, + { + "epoch": 2.37846751869957, + "grad_norm": 0.81640625, + "learning_rate": 1.0590923516344019e-05, + "loss": 1.3351, + "step": 7572 + }, + { + "epoch": 2.3790957457251114, + "grad_norm": 0.84375, + "learning_rate": 1.0588384639796892e-05, + "loss": 1.1274, + "step": 7574 + }, + { + "epoch": 2.3797239727506527, + "grad_norm": 0.9296875, + "learning_rate": 1.0585845763249762e-05, + "loss": 1.0805, + "step": 7576 + }, + { + "epoch": 2.380352199776194, + "grad_norm": 0.99609375, + "learning_rate": 1.0583306886702636e-05, + "loss": 1.1506, + "step": 7578 + }, + { + "epoch": 2.3809804268017354, + "grad_norm": 0.90234375, + "learning_rate": 1.0580768010155506e-05, + "loss": 1.1659, + "step": 7580 + }, + { + "epoch": 2.3816086538272767, + "grad_norm": 0.82421875, + "learning_rate": 1.057822913360838e-05, + "loss": 1.2277, + "step": 7582 + }, + { + "epoch": 2.382236880852818, + "grad_norm": 0.86328125, + "learning_rate": 1.057569025706125e-05, + "loss": 1.1857, + "step": 7584 + }, + { + "epoch": 2.3828651078783594, + "grad_norm": 0.90625, + "learning_rate": 1.0573151380514124e-05, + "loss": 1.1172, + "step": 7586 + }, + { + "epoch": 2.3834933349039007, + "grad_norm": 0.81640625, + "learning_rate": 1.0570612503966995e-05, + "loss": 1.1305, + "step": 7588 + }, + { + "epoch": 2.384121561929442, + "grad_norm": 0.890625, + "learning_rate": 1.0568073627419868e-05, + "loss": 1.2697, + "step": 7590 + }, + { + "epoch": 2.384749788954984, + "grad_norm": 0.84375, + "learning_rate": 1.056553475087274e-05, + "loss": 1.2452, + "step": 7592 + }, + { + "epoch": 2.385378015980525, + "grad_norm": 0.88671875, + "learning_rate": 1.0562995874325613e-05, + "loss": 1.2094, + "step": 7594 + }, + { + "epoch": 2.3860062430060665, + "grad_norm": 0.89453125, + "learning_rate": 1.0560456997778484e-05, + "loss": 1.1119, + "step": 7596 + }, + { + "epoch": 2.386634470031608, + "grad_norm": 0.93359375, + "learning_rate": 1.0557918121231357e-05, + "loss": 1.2432, + "step": 7598 + }, + { + "epoch": 2.387262697057149, + "grad_norm": 0.96484375, + "learning_rate": 1.0555379244684227e-05, + "loss": 1.35, + "step": 7600 + }, + { + "epoch": 2.3878909240826904, + "grad_norm": 0.82421875, + "learning_rate": 1.05528403681371e-05, + "loss": 1.2152, + "step": 7602 + }, + { + "epoch": 2.3885191511082318, + "grad_norm": 0.890625, + "learning_rate": 1.0550301491589971e-05, + "loss": 1.0723, + "step": 7604 + }, + { + "epoch": 2.389147378133773, + "grad_norm": 0.84375, + "learning_rate": 1.0547762615042844e-05, + "loss": 1.2081, + "step": 7606 + }, + { + "epoch": 2.3897756051593144, + "grad_norm": 0.859375, + "learning_rate": 1.0545223738495716e-05, + "loss": 1.1111, + "step": 7608 + }, + { + "epoch": 2.3904038321848557, + "grad_norm": 0.83984375, + "learning_rate": 1.0542684861948589e-05, + "loss": 1.2082, + "step": 7610 + }, + { + "epoch": 2.391032059210397, + "grad_norm": 0.8828125, + "learning_rate": 1.054014598540146e-05, + "loss": 1.2288, + "step": 7612 + }, + { + "epoch": 2.3916602862359384, + "grad_norm": 0.85546875, + "learning_rate": 1.0537607108854333e-05, + "loss": 1.181, + "step": 7614 + }, + { + "epoch": 2.3922885132614797, + "grad_norm": 0.90625, + "learning_rate": 1.0535068232307205e-05, + "loss": 1.1182, + "step": 7616 + }, + { + "epoch": 2.392916740287021, + "grad_norm": 0.90625, + "learning_rate": 1.0532529355760078e-05, + "loss": 1.2174, + "step": 7618 + }, + { + "epoch": 2.3935449673125624, + "grad_norm": 0.8203125, + "learning_rate": 1.0529990479212947e-05, + "loss": 1.1535, + "step": 7620 + }, + { + "epoch": 2.394173194338104, + "grad_norm": 0.90234375, + "learning_rate": 1.0527451602665822e-05, + "loss": 1.1007, + "step": 7622 + }, + { + "epoch": 2.3948014213636455, + "grad_norm": 0.93359375, + "learning_rate": 1.0524912726118692e-05, + "loss": 1.2071, + "step": 7624 + }, + { + "epoch": 2.395429648389187, + "grad_norm": 0.875, + "learning_rate": 1.0522373849571565e-05, + "loss": 1.2248, + "step": 7626 + }, + { + "epoch": 2.396057875414728, + "grad_norm": 0.83203125, + "learning_rate": 1.0519834973024436e-05, + "loss": 1.1908, + "step": 7628 + }, + { + "epoch": 2.3966861024402695, + "grad_norm": 0.96484375, + "learning_rate": 1.051729609647731e-05, + "loss": 1.2519, + "step": 7630 + }, + { + "epoch": 2.397314329465811, + "grad_norm": 0.90234375, + "learning_rate": 1.0514757219930182e-05, + "loss": 1.1041, + "step": 7632 + }, + { + "epoch": 2.397942556491352, + "grad_norm": 0.8671875, + "learning_rate": 1.0512218343383054e-05, + "loss": 1.1484, + "step": 7634 + }, + { + "epoch": 2.3985707835168935, + "grad_norm": 0.88671875, + "learning_rate": 1.0509679466835927e-05, + "loss": 1.165, + "step": 7636 + }, + { + "epoch": 2.399199010542435, + "grad_norm": 0.83203125, + "learning_rate": 1.0507140590288798e-05, + "loss": 1.2547, + "step": 7638 + }, + { + "epoch": 2.399827237567976, + "grad_norm": 0.93359375, + "learning_rate": 1.0504601713741671e-05, + "loss": 1.3175, + "step": 7640 + }, + { + "epoch": 2.4004554645935174, + "grad_norm": 1.0078125, + "learning_rate": 1.0502062837194543e-05, + "loss": 1.1751, + "step": 7642 + }, + { + "epoch": 2.4010836916190588, + "grad_norm": 0.8203125, + "learning_rate": 1.0499523960647416e-05, + "loss": 1.1984, + "step": 7644 + }, + { + "epoch": 2.4017119186446, + "grad_norm": 0.84765625, + "learning_rate": 1.0496985084100285e-05, + "loss": 1.036, + "step": 7646 + }, + { + "epoch": 2.4023401456701414, + "grad_norm": 0.9375, + "learning_rate": 1.049444620755316e-05, + "loss": 1.3037, + "step": 7648 + }, + { + "epoch": 2.4029683726956828, + "grad_norm": 1.046875, + "learning_rate": 1.049190733100603e-05, + "loss": 1.2453, + "step": 7650 + }, + { + "epoch": 2.403596599721224, + "grad_norm": 0.8828125, + "learning_rate": 1.0489368454458903e-05, + "loss": 1.1922, + "step": 7652 + }, + { + "epoch": 2.4042248267467654, + "grad_norm": 0.8203125, + "learning_rate": 1.0486829577911774e-05, + "loss": 1.282, + "step": 7654 + }, + { + "epoch": 2.4048530537723067, + "grad_norm": 0.890625, + "learning_rate": 1.0484290701364647e-05, + "loss": 1.1515, + "step": 7656 + }, + { + "epoch": 2.4054812807978485, + "grad_norm": 0.84375, + "learning_rate": 1.0481751824817519e-05, + "loss": 1.1316, + "step": 7658 + }, + { + "epoch": 2.40610950782339, + "grad_norm": 0.9296875, + "learning_rate": 1.0479212948270392e-05, + "loss": 1.1016, + "step": 7660 + }, + { + "epoch": 2.406737734848931, + "grad_norm": 0.83203125, + "learning_rate": 1.0476674071723263e-05, + "loss": 1.2394, + "step": 7662 + }, + { + "epoch": 2.4073659618744725, + "grad_norm": 0.91015625, + "learning_rate": 1.0474135195176136e-05, + "loss": 1.2902, + "step": 7664 + }, + { + "epoch": 2.407994188900014, + "grad_norm": 0.8828125, + "learning_rate": 1.0471596318629008e-05, + "loss": 1.1772, + "step": 7666 + }, + { + "epoch": 2.408622415925555, + "grad_norm": 0.9609375, + "learning_rate": 1.046905744208188e-05, + "loss": 1.1476, + "step": 7668 + }, + { + "epoch": 2.4092506429510965, + "grad_norm": 0.8828125, + "learning_rate": 1.046651856553475e-05, + "loss": 1.1821, + "step": 7670 + }, + { + "epoch": 2.409878869976638, + "grad_norm": 0.8515625, + "learning_rate": 1.0463979688987624e-05, + "loss": 1.1742, + "step": 7672 + }, + { + "epoch": 2.410507097002179, + "grad_norm": 0.76953125, + "learning_rate": 1.0461440812440495e-05, + "loss": 1.2042, + "step": 7674 + }, + { + "epoch": 2.4111353240277205, + "grad_norm": 0.7890625, + "learning_rate": 1.0458901935893368e-05, + "loss": 1.1789, + "step": 7676 + }, + { + "epoch": 2.411763551053262, + "grad_norm": 0.87890625, + "learning_rate": 1.045636305934624e-05, + "loss": 1.297, + "step": 7678 + }, + { + "epoch": 2.412391778078803, + "grad_norm": 0.9140625, + "learning_rate": 1.0453824182799113e-05, + "loss": 1.3176, + "step": 7680 + }, + { + "epoch": 2.4130200051043444, + "grad_norm": 0.92578125, + "learning_rate": 1.0451285306251984e-05, + "loss": 1.2168, + "step": 7682 + }, + { + "epoch": 2.4136482321298858, + "grad_norm": 0.86328125, + "learning_rate": 1.0448746429704857e-05, + "loss": 1.1846, + "step": 7684 + }, + { + "epoch": 2.414276459155427, + "grad_norm": 0.9375, + "learning_rate": 1.0446207553157728e-05, + "loss": 1.0846, + "step": 7686 + }, + { + "epoch": 2.414904686180969, + "grad_norm": 0.84375, + "learning_rate": 1.0443668676610601e-05, + "loss": 1.4388, + "step": 7688 + }, + { + "epoch": 2.41553291320651, + "grad_norm": 0.96484375, + "learning_rate": 1.0441129800063473e-05, + "loss": 1.253, + "step": 7690 + }, + { + "epoch": 2.4161611402320515, + "grad_norm": 0.84765625, + "learning_rate": 1.0438590923516346e-05, + "loss": 1.1133, + "step": 7692 + }, + { + "epoch": 2.416789367257593, + "grad_norm": 0.95703125, + "learning_rate": 1.0436052046969216e-05, + "loss": 1.1624, + "step": 7694 + }, + { + "epoch": 2.417417594283134, + "grad_norm": 0.89453125, + "learning_rate": 1.0433513170422089e-05, + "loss": 1.256, + "step": 7696 + }, + { + "epoch": 2.4180458213086755, + "grad_norm": 0.90625, + "learning_rate": 1.043097429387496e-05, + "loss": 1.1515, + "step": 7698 + }, + { + "epoch": 2.418674048334217, + "grad_norm": 0.88671875, + "learning_rate": 1.0428435417327833e-05, + "loss": 1.2425, + "step": 7700 + }, + { + "epoch": 2.419302275359758, + "grad_norm": 0.88671875, + "learning_rate": 1.0425896540780704e-05, + "loss": 1.1807, + "step": 7702 + }, + { + "epoch": 2.4199305023852995, + "grad_norm": 0.9296875, + "learning_rate": 1.0423357664233578e-05, + "loss": 1.1746, + "step": 7704 + }, + { + "epoch": 2.420558729410841, + "grad_norm": 0.8046875, + "learning_rate": 1.0420818787686449e-05, + "loss": 1.2989, + "step": 7706 + }, + { + "epoch": 2.421186956436382, + "grad_norm": 0.90234375, + "learning_rate": 1.0418279911139322e-05, + "loss": 1.1241, + "step": 7708 + }, + { + "epoch": 2.4218151834619235, + "grad_norm": 0.94140625, + "learning_rate": 1.0415741034592193e-05, + "loss": 1.1698, + "step": 7710 + }, + { + "epoch": 2.422443410487465, + "grad_norm": 0.84375, + "learning_rate": 1.0413202158045066e-05, + "loss": 1.2301, + "step": 7712 + }, + { + "epoch": 2.423071637513006, + "grad_norm": 0.83984375, + "learning_rate": 1.0410663281497936e-05, + "loss": 1.2581, + "step": 7714 + }, + { + "epoch": 2.4236998645385475, + "grad_norm": 0.83203125, + "learning_rate": 1.0408124404950811e-05, + "loss": 1.2973, + "step": 7716 + }, + { + "epoch": 2.424328091564089, + "grad_norm": 0.85546875, + "learning_rate": 1.0405585528403684e-05, + "loss": 1.2082, + "step": 7718 + }, + { + "epoch": 2.42495631858963, + "grad_norm": 0.91796875, + "learning_rate": 1.0403046651856554e-05, + "loss": 1.1844, + "step": 7720 + }, + { + "epoch": 2.4255845456151715, + "grad_norm": 0.93359375, + "learning_rate": 1.0400507775309427e-05, + "loss": 1.2662, + "step": 7722 + }, + { + "epoch": 2.4262127726407132, + "grad_norm": 0.94140625, + "learning_rate": 1.0397968898762298e-05, + "loss": 1.2522, + "step": 7724 + }, + { + "epoch": 2.4268409996662546, + "grad_norm": 0.87890625, + "learning_rate": 1.0395430022215171e-05, + "loss": 1.1265, + "step": 7726 + }, + { + "epoch": 2.427469226691796, + "grad_norm": 0.95703125, + "learning_rate": 1.0392891145668043e-05, + "loss": 1.1795, + "step": 7728 + }, + { + "epoch": 2.428097453717337, + "grad_norm": 0.85546875, + "learning_rate": 1.0390352269120916e-05, + "loss": 1.1316, + "step": 7730 + }, + { + "epoch": 2.4287256807428785, + "grad_norm": 1.1171875, + "learning_rate": 1.0387813392573787e-05, + "loss": 1.1675, + "step": 7732 + }, + { + "epoch": 2.42935390776842, + "grad_norm": 0.8671875, + "learning_rate": 1.038527451602666e-05, + "loss": 1.1802, + "step": 7734 + }, + { + "epoch": 2.429982134793961, + "grad_norm": 0.85546875, + "learning_rate": 1.0382735639479532e-05, + "loss": 1.216, + "step": 7736 + }, + { + "epoch": 2.4306103618195025, + "grad_norm": 0.84375, + "learning_rate": 1.0380196762932405e-05, + "loss": 1.059, + "step": 7738 + }, + { + "epoch": 2.431238588845044, + "grad_norm": 0.8828125, + "learning_rate": 1.0377657886385274e-05, + "loss": 1.31, + "step": 7740 + }, + { + "epoch": 2.431866815870585, + "grad_norm": 0.9765625, + "learning_rate": 1.0375119009838149e-05, + "loss": 1.1443, + "step": 7742 + }, + { + "epoch": 2.4324950428961265, + "grad_norm": 0.82421875, + "learning_rate": 1.0372580133291019e-05, + "loss": 1.0858, + "step": 7744 + }, + { + "epoch": 2.433123269921668, + "grad_norm": 0.890625, + "learning_rate": 1.0370041256743892e-05, + "loss": 1.0358, + "step": 7746 + }, + { + "epoch": 2.433751496947209, + "grad_norm": 0.91796875, + "learning_rate": 1.0367502380196763e-05, + "loss": 1.2427, + "step": 7748 + }, + { + "epoch": 2.4343797239727505, + "grad_norm": 0.85546875, + "learning_rate": 1.0364963503649636e-05, + "loss": 1.265, + "step": 7750 + }, + { + "epoch": 2.435007950998292, + "grad_norm": 0.94140625, + "learning_rate": 1.0362424627102508e-05, + "loss": 1.2406, + "step": 7752 + }, + { + "epoch": 2.4356361780238336, + "grad_norm": 0.94921875, + "learning_rate": 1.035988575055538e-05, + "loss": 1.2867, + "step": 7754 + }, + { + "epoch": 2.436264405049375, + "grad_norm": 0.87890625, + "learning_rate": 1.0357346874008252e-05, + "loss": 1.1434, + "step": 7756 + }, + { + "epoch": 2.4368926320749162, + "grad_norm": 0.88671875, + "learning_rate": 1.0354807997461125e-05, + "loss": 1.1873, + "step": 7758 + }, + { + "epoch": 2.4375208591004576, + "grad_norm": 0.83984375, + "learning_rate": 1.0352269120913997e-05, + "loss": 1.1372, + "step": 7760 + }, + { + "epoch": 2.438149086125999, + "grad_norm": 0.87890625, + "learning_rate": 1.034973024436687e-05, + "loss": 1.0095, + "step": 7762 + }, + { + "epoch": 2.4387773131515402, + "grad_norm": 0.91015625, + "learning_rate": 1.034719136781974e-05, + "loss": 1.2339, + "step": 7764 + }, + { + "epoch": 2.4394055401770816, + "grad_norm": 0.88671875, + "learning_rate": 1.0344652491272612e-05, + "loss": 1.1116, + "step": 7766 + }, + { + "epoch": 2.440033767202623, + "grad_norm": 0.9140625, + "learning_rate": 1.0342113614725484e-05, + "loss": 1.3111, + "step": 7768 + }, + { + "epoch": 2.440661994228164, + "grad_norm": 0.82421875, + "learning_rate": 1.0339574738178357e-05, + "loss": 1.119, + "step": 7770 + }, + { + "epoch": 2.4412902212537055, + "grad_norm": 0.89453125, + "learning_rate": 1.0337035861631228e-05, + "loss": 1.1471, + "step": 7772 + }, + { + "epoch": 2.441918448279247, + "grad_norm": 0.95703125, + "learning_rate": 1.0334496985084101e-05, + "loss": 1.1424, + "step": 7774 + }, + { + "epoch": 2.442546675304788, + "grad_norm": 0.9140625, + "learning_rate": 1.0331958108536973e-05, + "loss": 1.2449, + "step": 7776 + }, + { + "epoch": 2.4431749023303295, + "grad_norm": 0.8515625, + "learning_rate": 1.0329419231989846e-05, + "loss": 1.1482, + "step": 7778 + }, + { + "epoch": 2.443803129355871, + "grad_norm": 0.8515625, + "learning_rate": 1.0326880355442717e-05, + "loss": 1.3518, + "step": 7780 + }, + { + "epoch": 2.444431356381412, + "grad_norm": 0.8984375, + "learning_rate": 1.032434147889559e-05, + "loss": 1.2404, + "step": 7782 + }, + { + "epoch": 2.4450595834069535, + "grad_norm": 0.9296875, + "learning_rate": 1.032180260234846e-05, + "loss": 1.1999, + "step": 7784 + }, + { + "epoch": 2.445687810432495, + "grad_norm": 0.796875, + "learning_rate": 1.0319263725801335e-05, + "loss": 1.1479, + "step": 7786 + }, + { + "epoch": 2.446316037458036, + "grad_norm": 0.88671875, + "learning_rate": 1.0316724849254204e-05, + "loss": 1.164, + "step": 7788 + }, + { + "epoch": 2.446944264483578, + "grad_norm": 0.8828125, + "learning_rate": 1.0314185972707077e-05, + "loss": 1.2489, + "step": 7790 + }, + { + "epoch": 2.4475724915091193, + "grad_norm": 0.828125, + "learning_rate": 1.0311647096159949e-05, + "loss": 1.0448, + "step": 7792 + }, + { + "epoch": 2.4482007185346606, + "grad_norm": 0.97265625, + "learning_rate": 1.0309108219612822e-05, + "loss": 1.1342, + "step": 7794 + }, + { + "epoch": 2.448828945560202, + "grad_norm": 0.91796875, + "learning_rate": 1.0306569343065693e-05, + "loss": 1.1211, + "step": 7796 + }, + { + "epoch": 2.4494571725857432, + "grad_norm": 0.9296875, + "learning_rate": 1.0304030466518566e-05, + "loss": 1.1104, + "step": 7798 + }, + { + "epoch": 2.4500853996112846, + "grad_norm": 0.91015625, + "learning_rate": 1.0301491589971438e-05, + "loss": 1.0956, + "step": 7800 + }, + { + "epoch": 2.450713626636826, + "grad_norm": 0.84375, + "learning_rate": 1.0298952713424311e-05, + "loss": 1.2627, + "step": 7802 + }, + { + "epoch": 2.4513418536623672, + "grad_norm": 0.91796875, + "learning_rate": 1.0296413836877184e-05, + "loss": 1.1842, + "step": 7804 + }, + { + "epoch": 2.4519700806879086, + "grad_norm": 0.82421875, + "learning_rate": 1.0293874960330055e-05, + "loss": 1.2095, + "step": 7806 + }, + { + "epoch": 2.45259830771345, + "grad_norm": 0.90625, + "learning_rate": 1.0291336083782928e-05, + "loss": 1.2931, + "step": 7808 + }, + { + "epoch": 2.453226534738991, + "grad_norm": 0.83203125, + "learning_rate": 1.0288797207235798e-05, + "loss": 1.1803, + "step": 7810 + }, + { + "epoch": 2.4538547617645325, + "grad_norm": 0.89453125, + "learning_rate": 1.0286258330688673e-05, + "loss": 1.1717, + "step": 7812 + }, + { + "epoch": 2.454482988790074, + "grad_norm": 0.83203125, + "learning_rate": 1.0283719454141543e-05, + "loss": 0.999, + "step": 7814 + }, + { + "epoch": 2.455111215815615, + "grad_norm": 0.94140625, + "learning_rate": 1.0281180577594416e-05, + "loss": 1.103, + "step": 7816 + }, + { + "epoch": 2.455739442841157, + "grad_norm": 0.87109375, + "learning_rate": 1.0278641701047287e-05, + "loss": 1.2353, + "step": 7818 + }, + { + "epoch": 2.4563676698666983, + "grad_norm": 0.8359375, + "learning_rate": 1.027610282450016e-05, + "loss": 1.1041, + "step": 7820 + }, + { + "epoch": 2.4569958968922396, + "grad_norm": 0.828125, + "learning_rate": 1.0273563947953031e-05, + "loss": 1.2442, + "step": 7822 + }, + { + "epoch": 2.457624123917781, + "grad_norm": 0.87109375, + "learning_rate": 1.0271025071405905e-05, + "loss": 1.1837, + "step": 7824 + }, + { + "epoch": 2.4582523509433223, + "grad_norm": 0.921875, + "learning_rate": 1.0268486194858776e-05, + "loss": 1.0694, + "step": 7826 + }, + { + "epoch": 2.4588805779688636, + "grad_norm": 0.8515625, + "learning_rate": 1.0265947318311649e-05, + "loss": 1.1621, + "step": 7828 + }, + { + "epoch": 2.459508804994405, + "grad_norm": 0.890625, + "learning_rate": 1.026340844176452e-05, + "loss": 1.2203, + "step": 7830 + }, + { + "epoch": 2.4601370320199463, + "grad_norm": 0.84765625, + "learning_rate": 1.0260869565217393e-05, + "loss": 1.2414, + "step": 7832 + }, + { + "epoch": 2.4607652590454876, + "grad_norm": 0.81640625, + "learning_rate": 1.0258330688670263e-05, + "loss": 1.298, + "step": 7834 + }, + { + "epoch": 2.461393486071029, + "grad_norm": 0.921875, + "learning_rate": 1.0255791812123136e-05, + "loss": 1.0458, + "step": 7836 + }, + { + "epoch": 2.4620217130965703, + "grad_norm": 0.8359375, + "learning_rate": 1.0253252935576008e-05, + "loss": 1.2697, + "step": 7838 + }, + { + "epoch": 2.4626499401221116, + "grad_norm": 0.88671875, + "learning_rate": 1.025071405902888e-05, + "loss": 1.1559, + "step": 7840 + }, + { + "epoch": 2.463278167147653, + "grad_norm": 0.90625, + "learning_rate": 1.0248175182481752e-05, + "loss": 1.1766, + "step": 7842 + }, + { + "epoch": 2.4639063941731942, + "grad_norm": 0.859375, + "learning_rate": 1.0245636305934625e-05, + "loss": 1.1575, + "step": 7844 + }, + { + "epoch": 2.4645346211987356, + "grad_norm": 0.85546875, + "learning_rate": 1.0243097429387497e-05, + "loss": 1.1253, + "step": 7846 + }, + { + "epoch": 2.465162848224277, + "grad_norm": 0.8515625, + "learning_rate": 1.024055855284037e-05, + "loss": 1.1804, + "step": 7848 + }, + { + "epoch": 2.465791075249818, + "grad_norm": 0.86328125, + "learning_rate": 1.0238019676293241e-05, + "loss": 1.2267, + "step": 7850 + }, + { + "epoch": 2.4664193022753595, + "grad_norm": 0.796875, + "learning_rate": 1.0235480799746114e-05, + "loss": 1.2627, + "step": 7852 + }, + { + "epoch": 2.467047529300901, + "grad_norm": 0.8515625, + "learning_rate": 1.0232941923198985e-05, + "loss": 1.2447, + "step": 7854 + }, + { + "epoch": 2.4676757563264426, + "grad_norm": 0.83984375, + "learning_rate": 1.0230403046651859e-05, + "loss": 1.1281, + "step": 7856 + }, + { + "epoch": 2.468303983351984, + "grad_norm": 0.8515625, + "learning_rate": 1.0227864170104728e-05, + "loss": 1.1362, + "step": 7858 + }, + { + "epoch": 2.4689322103775253, + "grad_norm": 0.90625, + "learning_rate": 1.0225325293557601e-05, + "loss": 1.1659, + "step": 7860 + }, + { + "epoch": 2.4695604374030666, + "grad_norm": 0.86328125, + "learning_rate": 1.0222786417010473e-05, + "loss": 1.1334, + "step": 7862 + }, + { + "epoch": 2.470188664428608, + "grad_norm": 0.92578125, + "learning_rate": 1.0220247540463346e-05, + "loss": 1.2789, + "step": 7864 + }, + { + "epoch": 2.4708168914541493, + "grad_norm": 0.8515625, + "learning_rate": 1.0217708663916217e-05, + "loss": 1.2958, + "step": 7866 + }, + { + "epoch": 2.4714451184796906, + "grad_norm": 0.90234375, + "learning_rate": 1.021516978736909e-05, + "loss": 1.1327, + "step": 7868 + }, + { + "epoch": 2.472073345505232, + "grad_norm": 0.875, + "learning_rate": 1.0212630910821962e-05, + "loss": 1.2687, + "step": 7870 + }, + { + "epoch": 2.4727015725307733, + "grad_norm": 0.953125, + "learning_rate": 1.0210092034274835e-05, + "loss": 1.2305, + "step": 7872 + }, + { + "epoch": 2.4733297995563146, + "grad_norm": 0.83203125, + "learning_rate": 1.0207553157727706e-05, + "loss": 1.3892, + "step": 7874 + }, + { + "epoch": 2.473958026581856, + "grad_norm": 0.84765625, + "learning_rate": 1.0205014281180579e-05, + "loss": 1.1705, + "step": 7876 + }, + { + "epoch": 2.4745862536073973, + "grad_norm": 0.89453125, + "learning_rate": 1.0202475404633449e-05, + "loss": 1.148, + "step": 7878 + }, + { + "epoch": 2.4752144806329386, + "grad_norm": 0.91015625, + "learning_rate": 1.0199936528086324e-05, + "loss": 1.3282, + "step": 7880 + }, + { + "epoch": 2.47584270765848, + "grad_norm": 0.90625, + "learning_rate": 1.0197397651539193e-05, + "loss": 1.1612, + "step": 7882 + }, + { + "epoch": 2.4764709346840217, + "grad_norm": 0.84375, + "learning_rate": 1.0194858774992066e-05, + "loss": 1.2072, + "step": 7884 + }, + { + "epoch": 2.477099161709563, + "grad_norm": 0.8671875, + "learning_rate": 1.0192319898444938e-05, + "loss": 1.3195, + "step": 7886 + }, + { + "epoch": 2.4777273887351043, + "grad_norm": 0.80859375, + "learning_rate": 1.018978102189781e-05, + "loss": 1.1765, + "step": 7888 + }, + { + "epoch": 2.4783556157606457, + "grad_norm": 0.83984375, + "learning_rate": 1.0187242145350684e-05, + "loss": 1.1609, + "step": 7890 + }, + { + "epoch": 2.478983842786187, + "grad_norm": 1.0, + "learning_rate": 1.0184703268803555e-05, + "loss": 1.1592, + "step": 7892 + }, + { + "epoch": 2.4796120698117283, + "grad_norm": 0.8828125, + "learning_rate": 1.0182164392256428e-05, + "loss": 1.1972, + "step": 7894 + }, + { + "epoch": 2.4802402968372697, + "grad_norm": 0.8984375, + "learning_rate": 1.01796255157093e-05, + "loss": 1.2579, + "step": 7896 + }, + { + "epoch": 2.480868523862811, + "grad_norm": 0.859375, + "learning_rate": 1.0177086639162173e-05, + "loss": 1.1676, + "step": 7898 + }, + { + "epoch": 2.4814967508883523, + "grad_norm": 0.83203125, + "learning_rate": 1.0174547762615044e-05, + "loss": 1.299, + "step": 7900 + }, + { + "epoch": 2.4821249779138936, + "grad_norm": 0.89453125, + "learning_rate": 1.0172008886067917e-05, + "loss": 1.2395, + "step": 7902 + }, + { + "epoch": 2.482753204939435, + "grad_norm": 0.9296875, + "learning_rate": 1.0169470009520787e-05, + "loss": 1.2603, + "step": 7904 + }, + { + "epoch": 2.4833814319649763, + "grad_norm": 0.8515625, + "learning_rate": 1.0166931132973662e-05, + "loss": 1.129, + "step": 7906 + }, + { + "epoch": 2.4840096589905176, + "grad_norm": 0.8203125, + "learning_rate": 1.0164392256426531e-05, + "loss": 1.3505, + "step": 7908 + }, + { + "epoch": 2.484637886016059, + "grad_norm": 0.8359375, + "learning_rate": 1.0161853379879404e-05, + "loss": 1.1911, + "step": 7910 + }, + { + "epoch": 2.4852661130416003, + "grad_norm": 0.85546875, + "learning_rate": 1.0159314503332276e-05, + "loss": 1.2497, + "step": 7912 + }, + { + "epoch": 2.4858943400671416, + "grad_norm": 0.875, + "learning_rate": 1.0156775626785149e-05, + "loss": 1.2195, + "step": 7914 + }, + { + "epoch": 2.486522567092683, + "grad_norm": 0.8984375, + "learning_rate": 1.015423675023802e-05, + "loss": 1.2527, + "step": 7916 + }, + { + "epoch": 2.4871507941182243, + "grad_norm": 0.8046875, + "learning_rate": 1.0151697873690893e-05, + "loss": 1.2442, + "step": 7918 + }, + { + "epoch": 2.4877790211437656, + "grad_norm": 0.83203125, + "learning_rate": 1.0149158997143765e-05, + "loss": 1.2383, + "step": 7920 + }, + { + "epoch": 2.4884072481693074, + "grad_norm": 0.88671875, + "learning_rate": 1.0146620120596638e-05, + "loss": 1.1643, + "step": 7922 + }, + { + "epoch": 2.4890354751948487, + "grad_norm": 0.93359375, + "learning_rate": 1.014408124404951e-05, + "loss": 1.1913, + "step": 7924 + }, + { + "epoch": 2.48966370222039, + "grad_norm": 0.84765625, + "learning_rate": 1.0141542367502382e-05, + "loss": 1.1843, + "step": 7926 + }, + { + "epoch": 2.4902919292459313, + "grad_norm": 0.8359375, + "learning_rate": 1.0139003490955252e-05, + "loss": 1.1378, + "step": 7928 + }, + { + "epoch": 2.4909201562714727, + "grad_norm": 0.98046875, + "learning_rate": 1.0136464614408125e-05, + "loss": 1.1175, + "step": 7930 + }, + { + "epoch": 2.491548383297014, + "grad_norm": 0.90625, + "learning_rate": 1.0133925737860996e-05, + "loss": 1.2275, + "step": 7932 + }, + { + "epoch": 2.4921766103225553, + "grad_norm": 0.80078125, + "learning_rate": 1.013138686131387e-05, + "loss": 1.0809, + "step": 7934 + }, + { + "epoch": 2.4928048373480967, + "grad_norm": 0.86328125, + "learning_rate": 1.0128847984766741e-05, + "loss": 1.223, + "step": 7936 + }, + { + "epoch": 2.493433064373638, + "grad_norm": 0.90234375, + "learning_rate": 1.0126309108219614e-05, + "loss": 1.2318, + "step": 7938 + }, + { + "epoch": 2.4940612913991793, + "grad_norm": 0.80859375, + "learning_rate": 1.0123770231672485e-05, + "loss": 1.1458, + "step": 7940 + }, + { + "epoch": 2.4946895184247206, + "grad_norm": 0.85546875, + "learning_rate": 1.0121231355125358e-05, + "loss": 1.1603, + "step": 7942 + }, + { + "epoch": 2.495317745450262, + "grad_norm": 0.890625, + "learning_rate": 1.011869247857823e-05, + "loss": 1.1126, + "step": 7944 + }, + { + "epoch": 2.4959459724758033, + "grad_norm": 0.8671875, + "learning_rate": 1.0116153602031103e-05, + "loss": 1.2311, + "step": 7946 + }, + { + "epoch": 2.4965741995013446, + "grad_norm": 0.8984375, + "learning_rate": 1.0113614725483973e-05, + "loss": 1.2241, + "step": 7948 + }, + { + "epoch": 2.4972024265268864, + "grad_norm": 0.8671875, + "learning_rate": 1.0111075848936847e-05, + "loss": 1.1663, + "step": 7950 + }, + { + "epoch": 2.4978306535524277, + "grad_norm": 0.8671875, + "learning_rate": 1.0108536972389717e-05, + "loss": 1.1048, + "step": 7952 + }, + { + "epoch": 2.498458880577969, + "grad_norm": 0.85546875, + "learning_rate": 1.010599809584259e-05, + "loss": 1.1416, + "step": 7954 + }, + { + "epoch": 2.4990871076035104, + "grad_norm": 1.0390625, + "learning_rate": 1.0103459219295462e-05, + "loss": 1.2229, + "step": 7956 + }, + { + "epoch": 2.4997153346290517, + "grad_norm": 0.890625, + "learning_rate": 1.0100920342748335e-05, + "loss": 1.0936, + "step": 7958 + }, + { + "epoch": 2.500343561654593, + "grad_norm": 0.859375, + "learning_rate": 1.0098381466201206e-05, + "loss": 1.2353, + "step": 7960 + }, + { + "epoch": 2.5009717886801344, + "grad_norm": 0.828125, + "learning_rate": 1.0095842589654079e-05, + "loss": 1.1868, + "step": 7962 + }, + { + "epoch": 2.5016000157056757, + "grad_norm": 0.87890625, + "learning_rate": 1.009330371310695e-05, + "loss": 1.1775, + "step": 7964 + }, + { + "epoch": 2.502228242731217, + "grad_norm": 0.8671875, + "learning_rate": 1.0090764836559823e-05, + "loss": 1.1873, + "step": 7966 + }, + { + "epoch": 2.5028564697567584, + "grad_norm": 1.0, + "learning_rate": 1.0088225960012695e-05, + "loss": 1.2451, + "step": 7968 + }, + { + "epoch": 2.5034846967822997, + "grad_norm": 0.8828125, + "learning_rate": 1.0085687083465568e-05, + "loss": 1.1386, + "step": 7970 + }, + { + "epoch": 2.504112923807841, + "grad_norm": 0.84765625, + "learning_rate": 1.0083148206918438e-05, + "loss": 1.2973, + "step": 7972 + }, + { + "epoch": 2.5047411508333823, + "grad_norm": 0.96875, + "learning_rate": 1.008060933037131e-05, + "loss": 1.1667, + "step": 7974 + }, + { + "epoch": 2.5053693778589237, + "grad_norm": 0.8828125, + "learning_rate": 1.0078070453824185e-05, + "loss": 1.3141, + "step": 7976 + }, + { + "epoch": 2.505997604884465, + "grad_norm": 0.8828125, + "learning_rate": 1.0075531577277055e-05, + "loss": 1.1125, + "step": 7978 + }, + { + "epoch": 2.5066258319100063, + "grad_norm": 0.9375, + "learning_rate": 1.0072992700729928e-05, + "loss": 1.2697, + "step": 7980 + }, + { + "epoch": 2.5072540589355476, + "grad_norm": 0.93359375, + "learning_rate": 1.00704538241828e-05, + "loss": 1.1035, + "step": 7982 + }, + { + "epoch": 2.507882285961089, + "grad_norm": 0.83984375, + "learning_rate": 1.0067914947635673e-05, + "loss": 1.1884, + "step": 7984 + }, + { + "epoch": 2.5085105129866303, + "grad_norm": 0.921875, + "learning_rate": 1.0065376071088544e-05, + "loss": 1.2257, + "step": 7986 + }, + { + "epoch": 2.5091387400121716, + "grad_norm": 0.90234375, + "learning_rate": 1.0062837194541417e-05, + "loss": 1.2123, + "step": 7988 + }, + { + "epoch": 2.5097669670377134, + "grad_norm": 0.890625, + "learning_rate": 1.0060298317994289e-05, + "loss": 1.2626, + "step": 7990 + }, + { + "epoch": 2.5103951940632547, + "grad_norm": 0.86328125, + "learning_rate": 1.0057759441447162e-05, + "loss": 1.2488, + "step": 7992 + }, + { + "epoch": 2.511023421088796, + "grad_norm": 0.8515625, + "learning_rate": 1.0055220564900033e-05, + "loss": 1.3123, + "step": 7994 + }, + { + "epoch": 2.5116516481143374, + "grad_norm": 0.84765625, + "learning_rate": 1.0052681688352906e-05, + "loss": 1.204, + "step": 7996 + }, + { + "epoch": 2.5122798751398787, + "grad_norm": 0.96875, + "learning_rate": 1.0050142811805776e-05, + "loss": 1.1576, + "step": 7998 + }, + { + "epoch": 2.51290810216542, + "grad_norm": 0.94921875, + "learning_rate": 1.0047603935258649e-05, + "loss": 1.1342, + "step": 8000 + }, + { + "epoch": 2.5135363291909614, + "grad_norm": 0.9375, + "learning_rate": 1.004506505871152e-05, + "loss": 1.1785, + "step": 8002 + }, + { + "epoch": 2.5141645562165027, + "grad_norm": 0.88671875, + "learning_rate": 1.0042526182164393e-05, + "loss": 1.1783, + "step": 8004 + }, + { + "epoch": 2.514792783242044, + "grad_norm": 0.890625, + "learning_rate": 1.0039987305617265e-05, + "loss": 1.2789, + "step": 8006 + }, + { + "epoch": 2.5154210102675854, + "grad_norm": 0.890625, + "learning_rate": 1.0037448429070138e-05, + "loss": 1.3081, + "step": 8008 + }, + { + "epoch": 2.5160492372931267, + "grad_norm": 0.9375, + "learning_rate": 1.0034909552523009e-05, + "loss": 1.1454, + "step": 8010 + }, + { + "epoch": 2.516677464318668, + "grad_norm": 0.921875, + "learning_rate": 1.0032370675975882e-05, + "loss": 1.1284, + "step": 8012 + }, + { + "epoch": 2.51730569134421, + "grad_norm": 0.8984375, + "learning_rate": 1.0029831799428754e-05, + "loss": 1.1488, + "step": 8014 + }, + { + "epoch": 2.517933918369751, + "grad_norm": 0.83203125, + "learning_rate": 1.0027292922881627e-05, + "loss": 1.2606, + "step": 8016 + }, + { + "epoch": 2.5185621453952924, + "grad_norm": 0.8828125, + "learning_rate": 1.0024754046334498e-05, + "loss": 1.2357, + "step": 8018 + }, + { + "epoch": 2.5191903724208338, + "grad_norm": 0.90625, + "learning_rate": 1.0022215169787371e-05, + "loss": 1.1624, + "step": 8020 + }, + { + "epoch": 2.519818599446375, + "grad_norm": 0.9453125, + "learning_rate": 1.001967629324024e-05, + "loss": 1.1266, + "step": 8022 + }, + { + "epoch": 2.5204468264719164, + "grad_norm": 1.0234375, + "learning_rate": 1.0017137416693114e-05, + "loss": 1.2335, + "step": 8024 + }, + { + "epoch": 2.5210750534974578, + "grad_norm": 0.87890625, + "learning_rate": 1.0014598540145985e-05, + "loss": 1.1071, + "step": 8026 + }, + { + "epoch": 2.521703280522999, + "grad_norm": 0.8671875, + "learning_rate": 1.0012059663598858e-05, + "loss": 1.1173, + "step": 8028 + }, + { + "epoch": 2.5223315075485404, + "grad_norm": 0.87109375, + "learning_rate": 1.000952078705173e-05, + "loss": 1.1481, + "step": 8030 + }, + { + "epoch": 2.5229597345740817, + "grad_norm": 0.875, + "learning_rate": 1.0006981910504603e-05, + "loss": 1.2871, + "step": 8032 + }, + { + "epoch": 2.523587961599623, + "grad_norm": 0.98828125, + "learning_rate": 1.0004443033957474e-05, + "loss": 1.1876, + "step": 8034 + }, + { + "epoch": 2.5242161886251644, + "grad_norm": 0.97265625, + "learning_rate": 1.0001904157410347e-05, + "loss": 1.22, + "step": 8036 + }, + { + "epoch": 2.5248444156507057, + "grad_norm": 2.921875, + "learning_rate": 9.999365280863219e-06, + "loss": 1.0271, + "step": 8038 + }, + { + "epoch": 2.525472642676247, + "grad_norm": 0.83984375, + "learning_rate": 9.996826404316092e-06, + "loss": 1.2888, + "step": 8040 + }, + { + "epoch": 2.5261008697017884, + "grad_norm": 0.984375, + "learning_rate": 9.994287527768963e-06, + "loss": 1.3042, + "step": 8042 + }, + { + "epoch": 2.5267290967273297, + "grad_norm": 0.84765625, + "learning_rate": 9.991748651221835e-06, + "loss": 1.2412, + "step": 8044 + }, + { + "epoch": 2.527357323752871, + "grad_norm": 0.921875, + "learning_rate": 9.989209774674708e-06, + "loss": 1.2465, + "step": 8046 + }, + { + "epoch": 2.5279855507784124, + "grad_norm": 1.0546875, + "learning_rate": 9.986670898127579e-06, + "loss": 1.1235, + "step": 8048 + }, + { + "epoch": 2.5286137778039537, + "grad_norm": 0.953125, + "learning_rate": 9.984132021580452e-06, + "loss": 1.1607, + "step": 8050 + }, + { + "epoch": 2.529242004829495, + "grad_norm": 0.83984375, + "learning_rate": 9.981593145033323e-06, + "loss": 1.1748, + "step": 8052 + }, + { + "epoch": 2.5298702318550363, + "grad_norm": 0.89453125, + "learning_rate": 9.979054268486196e-06, + "loss": 1.0597, + "step": 8054 + }, + { + "epoch": 2.530498458880578, + "grad_norm": 0.953125, + "learning_rate": 9.976515391939068e-06, + "loss": 1.0744, + "step": 8056 + }, + { + "epoch": 2.5311266859061194, + "grad_norm": 0.890625, + "learning_rate": 9.97397651539194e-06, + "loss": 1.2989, + "step": 8058 + }, + { + "epoch": 2.5317549129316608, + "grad_norm": 0.921875, + "learning_rate": 9.971437638844812e-06, + "loss": 1.1052, + "step": 8060 + }, + { + "epoch": 2.532383139957202, + "grad_norm": 0.8125, + "learning_rate": 9.968898762297684e-06, + "loss": 1.2326, + "step": 8062 + }, + { + "epoch": 2.5330113669827434, + "grad_norm": 0.890625, + "learning_rate": 9.966359885750557e-06, + "loss": 1.258, + "step": 8064 + }, + { + "epoch": 2.5336395940082848, + "grad_norm": 0.828125, + "learning_rate": 9.963821009203428e-06, + "loss": 1.3359, + "step": 8066 + }, + { + "epoch": 2.534267821033826, + "grad_norm": 0.87890625, + "learning_rate": 9.9612821326563e-06, + "loss": 1.1445, + "step": 8068 + }, + { + "epoch": 2.5348960480593674, + "grad_norm": 0.8515625, + "learning_rate": 9.958743256109173e-06, + "loss": 1.1916, + "step": 8070 + }, + { + "epoch": 2.5355242750849087, + "grad_norm": 0.85546875, + "learning_rate": 9.956204379562044e-06, + "loss": 1.2186, + "step": 8072 + }, + { + "epoch": 2.53615250211045, + "grad_norm": 0.953125, + "learning_rate": 9.953665503014917e-06, + "loss": 1.1839, + "step": 8074 + }, + { + "epoch": 2.5367807291359914, + "grad_norm": 0.81640625, + "learning_rate": 9.951126626467788e-06, + "loss": 1.1174, + "step": 8076 + }, + { + "epoch": 2.5374089561615327, + "grad_norm": 0.85546875, + "learning_rate": 9.94858774992066e-06, + "loss": 1.3167, + "step": 8078 + }, + { + "epoch": 2.5380371831870745, + "grad_norm": 0.8359375, + "learning_rate": 9.946048873373533e-06, + "loss": 1.2239, + "step": 8080 + }, + { + "epoch": 2.538665410212616, + "grad_norm": 0.8515625, + "learning_rate": 9.943509996826404e-06, + "loss": 1.2857, + "step": 8082 + }, + { + "epoch": 2.539293637238157, + "grad_norm": 1.265625, + "learning_rate": 9.940971120279277e-06, + "loss": 1.2533, + "step": 8084 + }, + { + "epoch": 2.5399218642636985, + "grad_norm": 0.87890625, + "learning_rate": 9.938432243732149e-06, + "loss": 1.2453, + "step": 8086 + }, + { + "epoch": 2.54055009128924, + "grad_norm": 1.4375, + "learning_rate": 9.935893367185022e-06, + "loss": 1.2452, + "step": 8088 + }, + { + "epoch": 2.541178318314781, + "grad_norm": 0.93359375, + "learning_rate": 9.933354490637893e-06, + "loss": 1.1738, + "step": 8090 + }, + { + "epoch": 2.5418065453403225, + "grad_norm": 0.96484375, + "learning_rate": 9.930815614090765e-06, + "loss": 1.2676, + "step": 8092 + }, + { + "epoch": 2.542434772365864, + "grad_norm": 0.921875, + "learning_rate": 9.928276737543638e-06, + "loss": 1.1425, + "step": 8094 + }, + { + "epoch": 2.543062999391405, + "grad_norm": 1.03125, + "learning_rate": 9.925737860996509e-06, + "loss": 1.134, + "step": 8096 + }, + { + "epoch": 2.5436912264169464, + "grad_norm": 0.875, + "learning_rate": 9.923198984449382e-06, + "loss": 1.1667, + "step": 8098 + }, + { + "epoch": 2.5443194534424878, + "grad_norm": 0.90234375, + "learning_rate": 9.920660107902254e-06, + "loss": 1.1229, + "step": 8100 + }, + { + "epoch": 2.544947680468029, + "grad_norm": 0.9375, + "learning_rate": 9.918121231355127e-06, + "loss": 1.2614, + "step": 8102 + }, + { + "epoch": 2.5455759074935704, + "grad_norm": 0.86328125, + "learning_rate": 9.915582354807998e-06, + "loss": 1.2031, + "step": 8104 + }, + { + "epoch": 2.5462041345191118, + "grad_norm": 0.8359375, + "learning_rate": 9.913043478260871e-06, + "loss": 1.0952, + "step": 8106 + }, + { + "epoch": 2.546832361544653, + "grad_norm": 0.84375, + "learning_rate": 9.910504601713742e-06, + "loss": 1.3312, + "step": 8108 + }, + { + "epoch": 2.5474605885701944, + "grad_norm": 0.90234375, + "learning_rate": 9.907965725166616e-06, + "loss": 1.1553, + "step": 8110 + }, + { + "epoch": 2.5480888155957357, + "grad_norm": 0.85546875, + "learning_rate": 9.905426848619487e-06, + "loss": 1.1358, + "step": 8112 + }, + { + "epoch": 2.548717042621277, + "grad_norm": 0.890625, + "learning_rate": 9.90288797207236e-06, + "loss": 1.1889, + "step": 8114 + }, + { + "epoch": 2.5493452696468184, + "grad_norm": 0.87109375, + "learning_rate": 9.900349095525231e-06, + "loss": 1.3103, + "step": 8116 + }, + { + "epoch": 2.5499734966723597, + "grad_norm": 0.77734375, + "learning_rate": 9.897810218978103e-06, + "loss": 1.2243, + "step": 8118 + }, + { + "epoch": 2.550601723697901, + "grad_norm": 0.92578125, + "learning_rate": 9.895271342430976e-06, + "loss": 1.1031, + "step": 8120 + }, + { + "epoch": 2.551229950723443, + "grad_norm": 0.89453125, + "learning_rate": 9.892732465883847e-06, + "loss": 1.2186, + "step": 8122 + }, + { + "epoch": 2.551858177748984, + "grad_norm": 0.89453125, + "learning_rate": 9.89019358933672e-06, + "loss": 1.2599, + "step": 8124 + }, + { + "epoch": 2.5524864047745255, + "grad_norm": 0.94921875, + "learning_rate": 9.887654712789592e-06, + "loss": 1.2472, + "step": 8126 + }, + { + "epoch": 2.553114631800067, + "grad_norm": 0.8125, + "learning_rate": 9.885115836242463e-06, + "loss": 1.1494, + "step": 8128 + }, + { + "epoch": 2.553742858825608, + "grad_norm": 1.0390625, + "learning_rate": 9.882576959695336e-06, + "loss": 1.0775, + "step": 8130 + }, + { + "epoch": 2.5543710858511495, + "grad_norm": 0.90625, + "learning_rate": 9.880038083148208e-06, + "loss": 1.2127, + "step": 8132 + }, + { + "epoch": 2.554999312876691, + "grad_norm": 0.91796875, + "learning_rate": 9.87749920660108e-06, + "loss": 1.1725, + "step": 8134 + }, + { + "epoch": 2.555627539902232, + "grad_norm": 0.86328125, + "learning_rate": 9.874960330053952e-06, + "loss": 1.2109, + "step": 8136 + }, + { + "epoch": 2.5562557669277735, + "grad_norm": 0.96875, + "learning_rate": 9.872421453506823e-06, + "loss": 1.1498, + "step": 8138 + }, + { + "epoch": 2.556883993953315, + "grad_norm": 0.91796875, + "learning_rate": 9.869882576959696e-06, + "loss": 1.2461, + "step": 8140 + }, + { + "epoch": 2.557512220978856, + "grad_norm": 0.984375, + "learning_rate": 9.867343700412568e-06, + "loss": 1.1263, + "step": 8142 + }, + { + "epoch": 2.5581404480043974, + "grad_norm": 0.984375, + "learning_rate": 9.864804823865441e-06, + "loss": 1.0624, + "step": 8144 + }, + { + "epoch": 2.558768675029939, + "grad_norm": 0.921875, + "learning_rate": 9.862265947318312e-06, + "loss": 1.1112, + "step": 8146 + }, + { + "epoch": 2.5593969020554805, + "grad_norm": 0.84375, + "learning_rate": 9.859727070771185e-06, + "loss": 1.1871, + "step": 8148 + }, + { + "epoch": 2.560025129081022, + "grad_norm": 0.87890625, + "learning_rate": 9.857188194224057e-06, + "loss": 1.2094, + "step": 8150 + }, + { + "epoch": 2.560653356106563, + "grad_norm": 0.9140625, + "learning_rate": 9.854649317676928e-06, + "loss": 1.0918, + "step": 8152 + }, + { + "epoch": 2.5612815831321045, + "grad_norm": 0.83984375, + "learning_rate": 9.852110441129801e-06, + "loss": 1.1145, + "step": 8154 + }, + { + "epoch": 2.561909810157646, + "grad_norm": 0.84765625, + "learning_rate": 9.849571564582673e-06, + "loss": 1.178, + "step": 8156 + }, + { + "epoch": 2.562538037183187, + "grad_norm": 0.90625, + "learning_rate": 9.847032688035546e-06, + "loss": 1.2077, + "step": 8158 + }, + { + "epoch": 2.5631662642087285, + "grad_norm": 0.87109375, + "learning_rate": 9.844493811488417e-06, + "loss": 1.2177, + "step": 8160 + }, + { + "epoch": 2.56379449123427, + "grad_norm": 0.86328125, + "learning_rate": 9.841954934941288e-06, + "loss": 1.1381, + "step": 8162 + }, + { + "epoch": 2.564422718259811, + "grad_norm": 0.9765625, + "learning_rate": 9.839416058394161e-06, + "loss": 1.1781, + "step": 8164 + }, + { + "epoch": 2.5650509452853525, + "grad_norm": 0.8359375, + "learning_rate": 9.836877181847033e-06, + "loss": 1.1931, + "step": 8166 + }, + { + "epoch": 2.565679172310894, + "grad_norm": 0.79296875, + "learning_rate": 9.834338305299906e-06, + "loss": 1.203, + "step": 8168 + }, + { + "epoch": 2.566307399336435, + "grad_norm": 1.03125, + "learning_rate": 9.831799428752777e-06, + "loss": 1.2147, + "step": 8170 + }, + { + "epoch": 2.5669356263619765, + "grad_norm": 0.8984375, + "learning_rate": 9.829260552205649e-06, + "loss": 1.2632, + "step": 8172 + }, + { + "epoch": 2.567563853387518, + "grad_norm": 0.88671875, + "learning_rate": 9.826721675658522e-06, + "loss": 1.2249, + "step": 8174 + }, + { + "epoch": 2.568192080413059, + "grad_norm": 0.8359375, + "learning_rate": 9.824182799111393e-06, + "loss": 1.1366, + "step": 8176 + }, + { + "epoch": 2.5688203074386005, + "grad_norm": 0.875, + "learning_rate": 9.821643922564266e-06, + "loss": 1.1498, + "step": 8178 + }, + { + "epoch": 2.569448534464142, + "grad_norm": 1.171875, + "learning_rate": 9.819105046017138e-06, + "loss": 1.121, + "step": 8180 + }, + { + "epoch": 2.570076761489683, + "grad_norm": 0.8828125, + "learning_rate": 9.816566169470009e-06, + "loss": 1.1996, + "step": 8182 + }, + { + "epoch": 2.5707049885152244, + "grad_norm": 0.90625, + "learning_rate": 9.814027292922882e-06, + "loss": 1.2784, + "step": 8184 + }, + { + "epoch": 2.5713332155407658, + "grad_norm": 0.90625, + "learning_rate": 9.811488416375753e-06, + "loss": 1.145, + "step": 8186 + }, + { + "epoch": 2.5719614425663075, + "grad_norm": 0.8125, + "learning_rate": 9.808949539828627e-06, + "loss": 1.1618, + "step": 8188 + }, + { + "epoch": 2.572589669591849, + "grad_norm": 0.97265625, + "learning_rate": 9.8064106632815e-06, + "loss": 1.2192, + "step": 8190 + }, + { + "epoch": 2.57321789661739, + "grad_norm": 0.87890625, + "learning_rate": 9.803871786734371e-06, + "loss": 1.166, + "step": 8192 + }, + { + "epoch": 2.5738461236429315, + "grad_norm": 0.83984375, + "learning_rate": 9.801332910187244e-06, + "loss": 1.3588, + "step": 8194 + }, + { + "epoch": 2.574474350668473, + "grad_norm": 0.91796875, + "learning_rate": 9.798794033640115e-06, + "loss": 1.0738, + "step": 8196 + }, + { + "epoch": 2.575102577694014, + "grad_norm": 0.84765625, + "learning_rate": 9.796255157092987e-06, + "loss": 1.3274, + "step": 8198 + }, + { + "epoch": 2.5757308047195555, + "grad_norm": 0.828125, + "learning_rate": 9.79371628054586e-06, + "loss": 1.3462, + "step": 8200 + }, + { + "epoch": 2.576359031745097, + "grad_norm": 0.8984375, + "learning_rate": 9.791177403998731e-06, + "loss": 1.1931, + "step": 8202 + }, + { + "epoch": 2.576987258770638, + "grad_norm": 0.8125, + "learning_rate": 9.788638527451604e-06, + "loss": 1.0762, + "step": 8204 + }, + { + "epoch": 2.5776154857961795, + "grad_norm": 0.8671875, + "learning_rate": 9.786099650904476e-06, + "loss": 1.0787, + "step": 8206 + }, + { + "epoch": 2.578243712821721, + "grad_norm": 0.875, + "learning_rate": 9.783560774357347e-06, + "loss": 1.1584, + "step": 8208 + }, + { + "epoch": 2.578871939847262, + "grad_norm": 0.953125, + "learning_rate": 9.78102189781022e-06, + "loss": 1.1298, + "step": 8210 + }, + { + "epoch": 2.579500166872804, + "grad_norm": 0.9609375, + "learning_rate": 9.778483021263092e-06, + "loss": 1.1622, + "step": 8212 + }, + { + "epoch": 2.5801283938983453, + "grad_norm": 0.90625, + "learning_rate": 9.775944144715965e-06, + "loss": 1.2961, + "step": 8214 + }, + { + "epoch": 2.5807566209238866, + "grad_norm": 0.9453125, + "learning_rate": 9.773405268168836e-06, + "loss": 1.1895, + "step": 8216 + }, + { + "epoch": 2.581384847949428, + "grad_norm": 0.9296875, + "learning_rate": 9.770866391621709e-06, + "loss": 1.1523, + "step": 8218 + }, + { + "epoch": 2.5820130749749692, + "grad_norm": 0.9375, + "learning_rate": 9.76832751507458e-06, + "loss": 1.0955, + "step": 8220 + }, + { + "epoch": 2.5826413020005106, + "grad_norm": 0.890625, + "learning_rate": 9.765788638527452e-06, + "loss": 1.1717, + "step": 8222 + }, + { + "epoch": 2.583269529026052, + "grad_norm": 0.99609375, + "learning_rate": 9.763249761980325e-06, + "loss": 1.1134, + "step": 8224 + }, + { + "epoch": 2.583897756051593, + "grad_norm": 0.86328125, + "learning_rate": 9.760710885433196e-06, + "loss": 1.1845, + "step": 8226 + }, + { + "epoch": 2.5845259830771345, + "grad_norm": 0.91796875, + "learning_rate": 9.75817200888607e-06, + "loss": 1.0873, + "step": 8228 + }, + { + "epoch": 2.585154210102676, + "grad_norm": 0.8515625, + "learning_rate": 9.75563313233894e-06, + "loss": 1.1864, + "step": 8230 + }, + { + "epoch": 2.585782437128217, + "grad_norm": 0.9140625, + "learning_rate": 9.753094255791812e-06, + "loss": 1.0575, + "step": 8232 + }, + { + "epoch": 2.5864106641537585, + "grad_norm": 0.89453125, + "learning_rate": 9.750555379244685e-06, + "loss": 1.2174, + "step": 8234 + }, + { + "epoch": 2.5870388911793, + "grad_norm": 0.91015625, + "learning_rate": 9.748016502697557e-06, + "loss": 1.0819, + "step": 8236 + }, + { + "epoch": 2.587667118204841, + "grad_norm": 0.83984375, + "learning_rate": 9.74547762615043e-06, + "loss": 1.3066, + "step": 8238 + }, + { + "epoch": 2.5882953452303825, + "grad_norm": 0.9296875, + "learning_rate": 9.742938749603301e-06, + "loss": 1.0422, + "step": 8240 + }, + { + "epoch": 2.588923572255924, + "grad_norm": 1.0078125, + "learning_rate": 9.740399873056172e-06, + "loss": 1.315, + "step": 8242 + }, + { + "epoch": 2.589551799281465, + "grad_norm": 0.953125, + "learning_rate": 9.737860996509046e-06, + "loss": 1.1542, + "step": 8244 + }, + { + "epoch": 2.5901800263070065, + "grad_norm": 0.8125, + "learning_rate": 9.735322119961917e-06, + "loss": 1.1142, + "step": 8246 + }, + { + "epoch": 2.590808253332548, + "grad_norm": 0.87890625, + "learning_rate": 9.73278324341479e-06, + "loss": 1.1555, + "step": 8248 + }, + { + "epoch": 2.591436480358089, + "grad_norm": 0.91796875, + "learning_rate": 9.730244366867661e-06, + "loss": 1.1318, + "step": 8250 + }, + { + "epoch": 2.5920647073836305, + "grad_norm": 0.8671875, + "learning_rate": 9.727705490320534e-06, + "loss": 1.1723, + "step": 8252 + }, + { + "epoch": 2.5926929344091723, + "grad_norm": 0.8515625, + "learning_rate": 9.725166613773406e-06, + "loss": 1.1697, + "step": 8254 + }, + { + "epoch": 2.5933211614347136, + "grad_norm": 0.9921875, + "learning_rate": 9.722627737226277e-06, + "loss": 1.142, + "step": 8256 + }, + { + "epoch": 2.593949388460255, + "grad_norm": 0.89453125, + "learning_rate": 9.72008886067915e-06, + "loss": 1.0324, + "step": 8258 + }, + { + "epoch": 2.5945776154857962, + "grad_norm": 0.95703125, + "learning_rate": 9.717549984132022e-06, + "loss": 1.0065, + "step": 8260 + }, + { + "epoch": 2.5952058425113376, + "grad_norm": 0.82421875, + "learning_rate": 9.715011107584895e-06, + "loss": 1.104, + "step": 8262 + }, + { + "epoch": 2.595834069536879, + "grad_norm": 0.890625, + "learning_rate": 9.712472231037766e-06, + "loss": 1.1189, + "step": 8264 + }, + { + "epoch": 2.5964622965624202, + "grad_norm": 0.89453125, + "learning_rate": 9.709933354490638e-06, + "loss": 1.1932, + "step": 8266 + }, + { + "epoch": 2.5970905235879616, + "grad_norm": 0.8828125, + "learning_rate": 9.70739447794351e-06, + "loss": 1.2323, + "step": 8268 + }, + { + "epoch": 2.597718750613503, + "grad_norm": 0.9375, + "learning_rate": 9.704855601396382e-06, + "loss": 1.1692, + "step": 8270 + }, + { + "epoch": 2.598346977639044, + "grad_norm": 0.8359375, + "learning_rate": 9.702316724849255e-06, + "loss": 1.2307, + "step": 8272 + }, + { + "epoch": 2.5989752046645855, + "grad_norm": 0.88671875, + "learning_rate": 9.699777848302128e-06, + "loss": 1.112, + "step": 8274 + }, + { + "epoch": 2.599603431690127, + "grad_norm": 0.8828125, + "learning_rate": 9.697238971755e-06, + "loss": 1.3361, + "step": 8276 + }, + { + "epoch": 2.6002316587156686, + "grad_norm": 0.93359375, + "learning_rate": 9.694700095207873e-06, + "loss": 0.9816, + "step": 8278 + }, + { + "epoch": 2.60085988574121, + "grad_norm": 0.85546875, + "learning_rate": 9.692161218660744e-06, + "loss": 1.2718, + "step": 8280 + }, + { + "epoch": 2.6014881127667513, + "grad_norm": 0.953125, + "learning_rate": 9.689622342113615e-06, + "loss": 1.2261, + "step": 8282 + }, + { + "epoch": 2.6021163397922926, + "grad_norm": 1.109375, + "learning_rate": 9.687083465566488e-06, + "loss": 1.1821, + "step": 8284 + }, + { + "epoch": 2.602744566817834, + "grad_norm": 0.953125, + "learning_rate": 9.68454458901936e-06, + "loss": 1.1777, + "step": 8286 + }, + { + "epoch": 2.6033727938433753, + "grad_norm": 0.875, + "learning_rate": 9.682005712472233e-06, + "loss": 1.1155, + "step": 8288 + }, + { + "epoch": 2.6040010208689166, + "grad_norm": 1.546875, + "learning_rate": 9.679466835925104e-06, + "loss": 1.1201, + "step": 8290 + }, + { + "epoch": 2.604629247894458, + "grad_norm": 0.87109375, + "learning_rate": 9.676927959377976e-06, + "loss": 1.0158, + "step": 8292 + }, + { + "epoch": 2.6052574749199993, + "grad_norm": 0.97265625, + "learning_rate": 9.674389082830849e-06, + "loss": 1.0516, + "step": 8294 + }, + { + "epoch": 2.6058857019455406, + "grad_norm": 0.91015625, + "learning_rate": 9.67185020628372e-06, + "loss": 1.3452, + "step": 8296 + }, + { + "epoch": 2.606513928971082, + "grad_norm": 0.85546875, + "learning_rate": 9.669311329736593e-06, + "loss": 1.2381, + "step": 8298 + }, + { + "epoch": 2.6071421559966232, + "grad_norm": 0.8515625, + "learning_rate": 9.666772453189465e-06, + "loss": 1.1524, + "step": 8300 + }, + { + "epoch": 2.6077703830221646, + "grad_norm": 0.984375, + "learning_rate": 9.664233576642336e-06, + "loss": 1.0884, + "step": 8302 + }, + { + "epoch": 2.608398610047706, + "grad_norm": 1.0390625, + "learning_rate": 9.661694700095209e-06, + "loss": 1.2137, + "step": 8304 + }, + { + "epoch": 2.6090268370732472, + "grad_norm": 0.86328125, + "learning_rate": 9.65915582354808e-06, + "loss": 1.2906, + "step": 8306 + }, + { + "epoch": 2.6096550640987886, + "grad_norm": 0.88671875, + "learning_rate": 9.656616947000954e-06, + "loss": 1.1416, + "step": 8308 + }, + { + "epoch": 2.61028329112433, + "grad_norm": 0.87109375, + "learning_rate": 9.654078070453825e-06, + "loss": 1.2919, + "step": 8310 + }, + { + "epoch": 2.610911518149871, + "grad_norm": 0.82421875, + "learning_rate": 9.651539193906698e-06, + "loss": 1.3006, + "step": 8312 + }, + { + "epoch": 2.6115397451754125, + "grad_norm": 0.87109375, + "learning_rate": 9.64900031735957e-06, + "loss": 1.2853, + "step": 8314 + }, + { + "epoch": 2.612167972200954, + "grad_norm": 0.953125, + "learning_rate": 9.64646144081244e-06, + "loss": 1.1489, + "step": 8316 + }, + { + "epoch": 2.612796199226495, + "grad_norm": 0.99609375, + "learning_rate": 9.643922564265314e-06, + "loss": 1.2045, + "step": 8318 + }, + { + "epoch": 2.613424426252037, + "grad_norm": 0.8359375, + "learning_rate": 9.641383687718185e-06, + "loss": 1.1971, + "step": 8320 + }, + { + "epoch": 2.6140526532775783, + "grad_norm": 0.83203125, + "learning_rate": 9.638844811171058e-06, + "loss": 1.1142, + "step": 8322 + }, + { + "epoch": 2.6146808803031196, + "grad_norm": 0.890625, + "learning_rate": 9.63630593462393e-06, + "loss": 1.3033, + "step": 8324 + }, + { + "epoch": 2.615309107328661, + "grad_norm": 0.84375, + "learning_rate": 9.633767058076801e-06, + "loss": 1.2125, + "step": 8326 + }, + { + "epoch": 2.6159373343542023, + "grad_norm": 0.828125, + "learning_rate": 9.631228181529674e-06, + "loss": 1.0768, + "step": 8328 + }, + { + "epoch": 2.6165655613797436, + "grad_norm": 0.84765625, + "learning_rate": 9.628689304982545e-06, + "loss": 1.3004, + "step": 8330 + }, + { + "epoch": 2.617193788405285, + "grad_norm": 0.921875, + "learning_rate": 9.626150428435419e-06, + "loss": 1.247, + "step": 8332 + }, + { + "epoch": 2.6178220154308263, + "grad_norm": 0.828125, + "learning_rate": 9.62361155188829e-06, + "loss": 1.1625, + "step": 8334 + }, + { + "epoch": 2.6184502424563676, + "grad_norm": 0.953125, + "learning_rate": 9.621072675341161e-06, + "loss": 1.1265, + "step": 8336 + }, + { + "epoch": 2.619078469481909, + "grad_norm": 0.87890625, + "learning_rate": 9.618533798794034e-06, + "loss": 1.2211, + "step": 8338 + }, + { + "epoch": 2.6197066965074502, + "grad_norm": 0.95703125, + "learning_rate": 9.615994922246906e-06, + "loss": 1.1047, + "step": 8340 + }, + { + "epoch": 2.6203349235329916, + "grad_norm": 0.84765625, + "learning_rate": 9.613456045699779e-06, + "loss": 1.2186, + "step": 8342 + }, + { + "epoch": 2.6209631505585333, + "grad_norm": 0.8671875, + "learning_rate": 9.61091716915265e-06, + "loss": 1.0722, + "step": 8344 + }, + { + "epoch": 2.6215913775840747, + "grad_norm": 0.9609375, + "learning_rate": 9.608378292605522e-06, + "loss": 1.2318, + "step": 8346 + }, + { + "epoch": 2.622219604609616, + "grad_norm": 0.85546875, + "learning_rate": 9.605839416058395e-06, + "loss": 1.1319, + "step": 8348 + }, + { + "epoch": 2.6228478316351573, + "grad_norm": 0.85546875, + "learning_rate": 9.603300539511266e-06, + "loss": 1.1703, + "step": 8350 + }, + { + "epoch": 2.6234760586606987, + "grad_norm": 0.87109375, + "learning_rate": 9.60076166296414e-06, + "loss": 1.2391, + "step": 8352 + }, + { + "epoch": 2.62410428568624, + "grad_norm": 0.890625, + "learning_rate": 9.59822278641701e-06, + "loss": 1.2098, + "step": 8354 + }, + { + "epoch": 2.6247325127117813, + "grad_norm": 0.91796875, + "learning_rate": 9.595683909869884e-06, + "loss": 1.239, + "step": 8356 + }, + { + "epoch": 2.6253607397373226, + "grad_norm": 0.84375, + "learning_rate": 9.593145033322755e-06, + "loss": 1.2125, + "step": 8358 + }, + { + "epoch": 2.625988966762864, + "grad_norm": 0.93359375, + "learning_rate": 9.590606156775628e-06, + "loss": 1.2088, + "step": 8360 + }, + { + "epoch": 2.6266171937884053, + "grad_norm": 0.828125, + "learning_rate": 9.5880672802285e-06, + "loss": 1.166, + "step": 8362 + }, + { + "epoch": 2.6272454208139466, + "grad_norm": 0.9453125, + "learning_rate": 9.585528403681373e-06, + "loss": 1.204, + "step": 8364 + }, + { + "epoch": 2.627873647839488, + "grad_norm": 1.03125, + "learning_rate": 9.582989527134244e-06, + "loss": 1.1946, + "step": 8366 + }, + { + "epoch": 2.6285018748650293, + "grad_norm": 0.94140625, + "learning_rate": 9.580450650587117e-06, + "loss": 1.2513, + "step": 8368 + }, + { + "epoch": 2.6291301018905706, + "grad_norm": 0.91796875, + "learning_rate": 9.577911774039988e-06, + "loss": 1.2714, + "step": 8370 + }, + { + "epoch": 2.629758328916112, + "grad_norm": 0.87890625, + "learning_rate": 9.57537289749286e-06, + "loss": 1.3492, + "step": 8372 + }, + { + "epoch": 2.6303865559416533, + "grad_norm": 0.93359375, + "learning_rate": 9.572834020945733e-06, + "loss": 1.2335, + "step": 8374 + }, + { + "epoch": 2.6310147829671946, + "grad_norm": 0.85546875, + "learning_rate": 9.570295144398604e-06, + "loss": 1.3027, + "step": 8376 + }, + { + "epoch": 2.631643009992736, + "grad_norm": 0.9296875, + "learning_rate": 9.567756267851477e-06, + "loss": 1.2272, + "step": 8378 + }, + { + "epoch": 2.6322712370182773, + "grad_norm": 0.9140625, + "learning_rate": 9.565217391304349e-06, + "loss": 1.086, + "step": 8380 + }, + { + "epoch": 2.6328994640438186, + "grad_norm": 0.85546875, + "learning_rate": 9.562678514757222e-06, + "loss": 1.2643, + "step": 8382 + }, + { + "epoch": 2.6335276910693604, + "grad_norm": 0.87109375, + "learning_rate": 9.560139638210093e-06, + "loss": 1.1945, + "step": 8384 + }, + { + "epoch": 2.6341559180949017, + "grad_norm": 0.8515625, + "learning_rate": 9.557600761662965e-06, + "loss": 1.2233, + "step": 8386 + }, + { + "epoch": 2.634784145120443, + "grad_norm": 0.984375, + "learning_rate": 9.555061885115838e-06, + "loss": 1.2933, + "step": 8388 + }, + { + "epoch": 2.6354123721459843, + "grad_norm": 0.93359375, + "learning_rate": 9.552523008568709e-06, + "loss": 1.2637, + "step": 8390 + }, + { + "epoch": 2.6360405991715257, + "grad_norm": 0.90625, + "learning_rate": 9.549984132021582e-06, + "loss": 1.2201, + "step": 8392 + }, + { + "epoch": 2.636668826197067, + "grad_norm": 0.8359375, + "learning_rate": 9.547445255474453e-06, + "loss": 1.1775, + "step": 8394 + }, + { + "epoch": 2.6372970532226083, + "grad_norm": 0.8359375, + "learning_rate": 9.544906378927325e-06, + "loss": 1.1219, + "step": 8396 + }, + { + "epoch": 2.6379252802481497, + "grad_norm": 0.90625, + "learning_rate": 9.542367502380198e-06, + "loss": 1.2112, + "step": 8398 + }, + { + "epoch": 2.638553507273691, + "grad_norm": 0.96484375, + "learning_rate": 9.53982862583307e-06, + "loss": 1.1167, + "step": 8400 + }, + { + "epoch": 2.6391817342992323, + "grad_norm": 0.953125, + "learning_rate": 9.537289749285942e-06, + "loss": 1.2177, + "step": 8402 + }, + { + "epoch": 2.6398099613247736, + "grad_norm": 0.8671875, + "learning_rate": 9.534750872738814e-06, + "loss": 1.2147, + "step": 8404 + }, + { + "epoch": 2.640438188350315, + "grad_norm": 0.8671875, + "learning_rate": 9.532211996191685e-06, + "loss": 1.2503, + "step": 8406 + }, + { + "epoch": 2.6410664153758563, + "grad_norm": 0.8828125, + "learning_rate": 9.529673119644558e-06, + "loss": 1.2971, + "step": 8408 + }, + { + "epoch": 2.641694642401398, + "grad_norm": 0.8671875, + "learning_rate": 9.52713424309743e-06, + "loss": 1.1697, + "step": 8410 + }, + { + "epoch": 2.6423228694269394, + "grad_norm": 0.83984375, + "learning_rate": 9.524595366550303e-06, + "loss": 1.1083, + "step": 8412 + }, + { + "epoch": 2.6429510964524807, + "grad_norm": 0.85546875, + "learning_rate": 9.522056490003174e-06, + "loss": 1.2256, + "step": 8414 + }, + { + "epoch": 2.643579323478022, + "grad_norm": 0.828125, + "learning_rate": 9.519517613456047e-06, + "loss": 1.1345, + "step": 8416 + }, + { + "epoch": 2.6442075505035634, + "grad_norm": 0.97265625, + "learning_rate": 9.516978736908918e-06, + "loss": 1.1583, + "step": 8418 + }, + { + "epoch": 2.6448357775291047, + "grad_norm": 0.91796875, + "learning_rate": 9.51443986036179e-06, + "loss": 1.1522, + "step": 8420 + }, + { + "epoch": 2.645464004554646, + "grad_norm": 0.8125, + "learning_rate": 9.511900983814663e-06, + "loss": 1.1517, + "step": 8422 + }, + { + "epoch": 2.6460922315801874, + "grad_norm": 0.9609375, + "learning_rate": 9.509362107267534e-06, + "loss": 1.0531, + "step": 8424 + }, + { + "epoch": 2.6467204586057287, + "grad_norm": 0.98046875, + "learning_rate": 9.506823230720407e-06, + "loss": 1.177, + "step": 8426 + }, + { + "epoch": 2.64734868563127, + "grad_norm": 0.8671875, + "learning_rate": 9.504284354173279e-06, + "loss": 1.2406, + "step": 8428 + }, + { + "epoch": 2.6479769126568113, + "grad_norm": 0.8671875, + "learning_rate": 9.50174547762615e-06, + "loss": 1.2265, + "step": 8430 + }, + { + "epoch": 2.6486051396823527, + "grad_norm": 0.99609375, + "learning_rate": 9.499206601079023e-06, + "loss": 1.161, + "step": 8432 + }, + { + "epoch": 2.649233366707894, + "grad_norm": 0.94921875, + "learning_rate": 9.496667724531895e-06, + "loss": 1.1494, + "step": 8434 + }, + { + "epoch": 2.6498615937334353, + "grad_norm": 0.859375, + "learning_rate": 9.494128847984768e-06, + "loss": 1.1465, + "step": 8436 + }, + { + "epoch": 2.6504898207589767, + "grad_norm": 0.84375, + "learning_rate": 9.491589971437639e-06, + "loss": 1.1225, + "step": 8438 + }, + { + "epoch": 2.651118047784518, + "grad_norm": 0.8828125, + "learning_rate": 9.48905109489051e-06, + "loss": 1.094, + "step": 8440 + }, + { + "epoch": 2.6517462748100593, + "grad_norm": 0.859375, + "learning_rate": 9.486512218343384e-06, + "loss": 1.0814, + "step": 8442 + }, + { + "epoch": 2.6523745018356006, + "grad_norm": 1.03125, + "learning_rate": 9.483973341796255e-06, + "loss": 1.0285, + "step": 8444 + }, + { + "epoch": 2.653002728861142, + "grad_norm": 0.8515625, + "learning_rate": 9.481434465249128e-06, + "loss": 1.2455, + "step": 8446 + }, + { + "epoch": 2.6536309558866833, + "grad_norm": 0.86328125, + "learning_rate": 9.478895588702001e-06, + "loss": 1.1987, + "step": 8448 + }, + { + "epoch": 2.654259182912225, + "grad_norm": 0.8828125, + "learning_rate": 9.476356712154872e-06, + "loss": 1.2405, + "step": 8450 + }, + { + "epoch": 2.6548874099377664, + "grad_norm": 0.92578125, + "learning_rate": 9.473817835607746e-06, + "loss": 1.0695, + "step": 8452 + }, + { + "epoch": 2.6555156369633077, + "grad_norm": 0.8984375, + "learning_rate": 9.471278959060617e-06, + "loss": 1.0887, + "step": 8454 + }, + { + "epoch": 2.656143863988849, + "grad_norm": 0.90625, + "learning_rate": 9.468740082513488e-06, + "loss": 1.1871, + "step": 8456 + }, + { + "epoch": 2.6567720910143904, + "grad_norm": 0.82421875, + "learning_rate": 9.466201205966361e-06, + "loss": 1.235, + "step": 8458 + }, + { + "epoch": 2.6574003180399317, + "grad_norm": 0.8671875, + "learning_rate": 9.463662329419233e-06, + "loss": 1.1305, + "step": 8460 + }, + { + "epoch": 2.658028545065473, + "grad_norm": 0.83984375, + "learning_rate": 9.461123452872106e-06, + "loss": 1.0736, + "step": 8462 + }, + { + "epoch": 2.6586567720910144, + "grad_norm": 0.9453125, + "learning_rate": 9.458584576324977e-06, + "loss": 1.1682, + "step": 8464 + }, + { + "epoch": 2.6592849991165557, + "grad_norm": 0.9296875, + "learning_rate": 9.456045699777849e-06, + "loss": 1.1639, + "step": 8466 + }, + { + "epoch": 2.659913226142097, + "grad_norm": 0.8671875, + "learning_rate": 9.453506823230722e-06, + "loss": 1.2223, + "step": 8468 + }, + { + "epoch": 2.6605414531676383, + "grad_norm": 0.94921875, + "learning_rate": 9.450967946683593e-06, + "loss": 1.1154, + "step": 8470 + }, + { + "epoch": 2.6611696801931797, + "grad_norm": 0.87890625, + "learning_rate": 9.448429070136466e-06, + "loss": 1.0826, + "step": 8472 + }, + { + "epoch": 2.661797907218721, + "grad_norm": 1.015625, + "learning_rate": 9.445890193589338e-06, + "loss": 1.2074, + "step": 8474 + }, + { + "epoch": 2.6624261342442628, + "grad_norm": 0.890625, + "learning_rate": 9.443351317042209e-06, + "loss": 1.1333, + "step": 8476 + }, + { + "epoch": 2.663054361269804, + "grad_norm": 1.03125, + "learning_rate": 9.440812440495082e-06, + "loss": 1.2202, + "step": 8478 + }, + { + "epoch": 2.6636825882953454, + "grad_norm": 0.87890625, + "learning_rate": 9.438273563947953e-06, + "loss": 1.2403, + "step": 8480 + }, + { + "epoch": 2.6643108153208868, + "grad_norm": 0.84375, + "learning_rate": 9.435734687400826e-06, + "loss": 1.1786, + "step": 8482 + }, + { + "epoch": 2.664939042346428, + "grad_norm": 0.94921875, + "learning_rate": 9.433195810853698e-06, + "loss": 1.1325, + "step": 8484 + }, + { + "epoch": 2.6655672693719694, + "grad_norm": 0.86328125, + "learning_rate": 9.430656934306571e-06, + "loss": 1.0782, + "step": 8486 + }, + { + "epoch": 2.6661954963975107, + "grad_norm": 0.94140625, + "learning_rate": 9.428118057759442e-06, + "loss": 1.217, + "step": 8488 + }, + { + "epoch": 2.666823723423052, + "grad_norm": 0.875, + "learning_rate": 9.425579181212314e-06, + "loss": 1.1414, + "step": 8490 + }, + { + "epoch": 2.6674519504485934, + "grad_norm": 0.83984375, + "learning_rate": 9.423040304665187e-06, + "loss": 1.2201, + "step": 8492 + }, + { + "epoch": 2.6680801774741347, + "grad_norm": 0.83203125, + "learning_rate": 9.420501428118058e-06, + "loss": 1.3012, + "step": 8494 + }, + { + "epoch": 2.668708404499676, + "grad_norm": 0.90234375, + "learning_rate": 9.417962551570931e-06, + "loss": 1.2001, + "step": 8496 + }, + { + "epoch": 2.6693366315252174, + "grad_norm": 0.8515625, + "learning_rate": 9.415423675023803e-06, + "loss": 1.2408, + "step": 8498 + }, + { + "epoch": 2.6699648585507587, + "grad_norm": 0.84375, + "learning_rate": 9.412884798476674e-06, + "loss": 1.129, + "step": 8500 + }, + { + "epoch": 2.6705930855763, + "grad_norm": 0.8515625, + "learning_rate": 9.410345921929547e-06, + "loss": 1.3063, + "step": 8502 + }, + { + "epoch": 2.6712213126018414, + "grad_norm": 1.0234375, + "learning_rate": 9.407807045382418e-06, + "loss": 1.2523, + "step": 8504 + }, + { + "epoch": 2.6718495396273827, + "grad_norm": 0.90625, + "learning_rate": 9.405268168835291e-06, + "loss": 1.1856, + "step": 8506 + }, + { + "epoch": 2.672477766652924, + "grad_norm": 0.984375, + "learning_rate": 9.402729292288163e-06, + "loss": 1.1575, + "step": 8508 + }, + { + "epoch": 2.6731059936784654, + "grad_norm": 0.8515625, + "learning_rate": 9.400190415741034e-06, + "loss": 1.0457, + "step": 8510 + }, + { + "epoch": 2.6737342207040067, + "grad_norm": 0.8828125, + "learning_rate": 9.397651539193907e-06, + "loss": 1.1486, + "step": 8512 + }, + { + "epoch": 2.674362447729548, + "grad_norm": 0.9609375, + "learning_rate": 9.395112662646779e-06, + "loss": 1.094, + "step": 8514 + }, + { + "epoch": 2.67499067475509, + "grad_norm": 0.8671875, + "learning_rate": 9.392573786099652e-06, + "loss": 1.1416, + "step": 8516 + }, + { + "epoch": 2.675618901780631, + "grad_norm": 0.828125, + "learning_rate": 9.390034909552523e-06, + "loss": 1.2959, + "step": 8518 + }, + { + "epoch": 2.6762471288061724, + "grad_norm": 0.87109375, + "learning_rate": 9.387496033005396e-06, + "loss": 1.0791, + "step": 8520 + }, + { + "epoch": 2.6768753558317138, + "grad_norm": 0.8984375, + "learning_rate": 9.384957156458268e-06, + "loss": 1.2186, + "step": 8522 + }, + { + "epoch": 2.677503582857255, + "grad_norm": 0.94140625, + "learning_rate": 9.382418279911139e-06, + "loss": 1.1701, + "step": 8524 + }, + { + "epoch": 2.6781318098827964, + "grad_norm": 0.8984375, + "learning_rate": 9.379879403364012e-06, + "loss": 1.0534, + "step": 8526 + }, + { + "epoch": 2.6787600369083377, + "grad_norm": 0.86328125, + "learning_rate": 9.377340526816883e-06, + "loss": 1.0403, + "step": 8528 + }, + { + "epoch": 2.679388263933879, + "grad_norm": 0.859375, + "learning_rate": 9.374801650269757e-06, + "loss": 1.2593, + "step": 8530 + }, + { + "epoch": 2.6800164909594204, + "grad_norm": 0.87109375, + "learning_rate": 9.37226277372263e-06, + "loss": 1.401, + "step": 8532 + }, + { + "epoch": 2.6806447179849617, + "grad_norm": 0.83203125, + "learning_rate": 9.369723897175501e-06, + "loss": 1.113, + "step": 8534 + }, + { + "epoch": 2.681272945010503, + "grad_norm": 0.859375, + "learning_rate": 9.367185020628372e-06, + "loss": 1.2193, + "step": 8536 + }, + { + "epoch": 2.6819011720360444, + "grad_norm": 0.89453125, + "learning_rate": 9.364646144081245e-06, + "loss": 1.2076, + "step": 8538 + }, + { + "epoch": 2.6825293990615857, + "grad_norm": 0.859375, + "learning_rate": 9.362107267534117e-06, + "loss": 1.2201, + "step": 8540 + }, + { + "epoch": 2.6831576260871275, + "grad_norm": 0.96484375, + "learning_rate": 9.35956839098699e-06, + "loss": 1.2454, + "step": 8542 + }, + { + "epoch": 2.683785853112669, + "grad_norm": 0.84765625, + "learning_rate": 9.357029514439861e-06, + "loss": 1.1856, + "step": 8544 + }, + { + "epoch": 2.68441408013821, + "grad_norm": 1.1640625, + "learning_rate": 9.354490637892734e-06, + "loss": 1.1561, + "step": 8546 + }, + { + "epoch": 2.6850423071637515, + "grad_norm": 0.92578125, + "learning_rate": 9.351951761345606e-06, + "loss": 1.2809, + "step": 8548 + }, + { + "epoch": 2.685670534189293, + "grad_norm": 1.0078125, + "learning_rate": 9.349412884798477e-06, + "loss": 1.1202, + "step": 8550 + }, + { + "epoch": 2.686298761214834, + "grad_norm": 0.8828125, + "learning_rate": 9.34687400825135e-06, + "loss": 1.2268, + "step": 8552 + }, + { + "epoch": 2.6869269882403755, + "grad_norm": 0.859375, + "learning_rate": 9.344335131704222e-06, + "loss": 1.3182, + "step": 8554 + }, + { + "epoch": 2.687555215265917, + "grad_norm": 0.890625, + "learning_rate": 9.341796255157095e-06, + "loss": 1.1671, + "step": 8556 + }, + { + "epoch": 2.688183442291458, + "grad_norm": 0.84375, + "learning_rate": 9.339257378609966e-06, + "loss": 1.1365, + "step": 8558 + }, + { + "epoch": 2.6888116693169994, + "grad_norm": 0.91015625, + "learning_rate": 9.336718502062837e-06, + "loss": 1.3515, + "step": 8560 + }, + { + "epoch": 2.6894398963425408, + "grad_norm": 0.859375, + "learning_rate": 9.33417962551571e-06, + "loss": 1.2687, + "step": 8562 + }, + { + "epoch": 2.690068123368082, + "grad_norm": 0.84375, + "learning_rate": 9.331640748968582e-06, + "loss": 1.3292, + "step": 8564 + }, + { + "epoch": 2.6906963503936234, + "grad_norm": 0.8359375, + "learning_rate": 9.329101872421455e-06, + "loss": 1.1909, + "step": 8566 + }, + { + "epoch": 2.6913245774191648, + "grad_norm": 0.8984375, + "learning_rate": 9.326562995874326e-06, + "loss": 1.1728, + "step": 8568 + }, + { + "epoch": 2.691952804444706, + "grad_norm": 0.95703125, + "learning_rate": 9.324024119327198e-06, + "loss": 1.129, + "step": 8570 + }, + { + "epoch": 2.6925810314702474, + "grad_norm": 0.859375, + "learning_rate": 9.32148524278007e-06, + "loss": 1.1925, + "step": 8572 + }, + { + "epoch": 2.6932092584957887, + "grad_norm": 0.99609375, + "learning_rate": 9.318946366232942e-06, + "loss": 1.2486, + "step": 8574 + }, + { + "epoch": 2.69383748552133, + "grad_norm": 0.84375, + "learning_rate": 9.316407489685815e-06, + "loss": 1.3359, + "step": 8576 + }, + { + "epoch": 2.6944657125468714, + "grad_norm": 0.8125, + "learning_rate": 9.313868613138687e-06, + "loss": 1.1934, + "step": 8578 + }, + { + "epoch": 2.6950939395724127, + "grad_norm": 0.84765625, + "learning_rate": 9.31132973659156e-06, + "loss": 1.1376, + "step": 8580 + }, + { + "epoch": 2.6957221665979545, + "grad_norm": 0.9296875, + "learning_rate": 9.308790860044431e-06, + "loss": 1.1097, + "step": 8582 + }, + { + "epoch": 2.696350393623496, + "grad_norm": 1.09375, + "learning_rate": 9.306251983497303e-06, + "loss": 1.1256, + "step": 8584 + }, + { + "epoch": 2.696978620649037, + "grad_norm": 0.9453125, + "learning_rate": 9.303713106950176e-06, + "loss": 1.1685, + "step": 8586 + }, + { + "epoch": 2.6976068476745785, + "grad_norm": 0.83203125, + "learning_rate": 9.301174230403047e-06, + "loss": 1.2755, + "step": 8588 + }, + { + "epoch": 2.69823507470012, + "grad_norm": 0.9140625, + "learning_rate": 9.29863535385592e-06, + "loss": 1.0006, + "step": 8590 + }, + { + "epoch": 2.698863301725661, + "grad_norm": 0.88671875, + "learning_rate": 9.296096477308791e-06, + "loss": 1.1454, + "step": 8592 + }, + { + "epoch": 2.6994915287512025, + "grad_norm": 0.875, + "learning_rate": 9.293557600761663e-06, + "loss": 1.1855, + "step": 8594 + }, + { + "epoch": 2.700119755776744, + "grad_norm": 0.99609375, + "learning_rate": 9.291018724214536e-06, + "loss": 1.0808, + "step": 8596 + }, + { + "epoch": 2.700747982802285, + "grad_norm": 0.875, + "learning_rate": 9.288479847667407e-06, + "loss": 1.2162, + "step": 8598 + }, + { + "epoch": 2.7013762098278264, + "grad_norm": 1.0625, + "learning_rate": 9.28594097112028e-06, + "loss": 1.252, + "step": 8600 + }, + { + "epoch": 2.7020044368533678, + "grad_norm": 0.86328125, + "learning_rate": 9.283402094573152e-06, + "loss": 1.1132, + "step": 8602 + }, + { + "epoch": 2.702632663878909, + "grad_norm": 0.8984375, + "learning_rate": 9.280863218026023e-06, + "loss": 1.2573, + "step": 8604 + }, + { + "epoch": 2.7032608909044504, + "grad_norm": 0.92578125, + "learning_rate": 9.278324341478896e-06, + "loss": 0.9743, + "step": 8606 + }, + { + "epoch": 2.703889117929992, + "grad_norm": 0.953125, + "learning_rate": 9.275785464931768e-06, + "loss": 1.2334, + "step": 8608 + }, + { + "epoch": 2.7045173449555335, + "grad_norm": 0.875, + "learning_rate": 9.27324658838464e-06, + "loss": 1.2431, + "step": 8610 + }, + { + "epoch": 2.705145571981075, + "grad_norm": 0.88671875, + "learning_rate": 9.270707711837512e-06, + "loss": 1.1449, + "step": 8612 + }, + { + "epoch": 2.705773799006616, + "grad_norm": 0.8828125, + "learning_rate": 9.268168835290383e-06, + "loss": 1.2655, + "step": 8614 + }, + { + "epoch": 2.7064020260321575, + "grad_norm": 0.91015625, + "learning_rate": 9.265629958743256e-06, + "loss": 1.0926, + "step": 8616 + }, + { + "epoch": 2.707030253057699, + "grad_norm": 0.90625, + "learning_rate": 9.26309108219613e-06, + "loss": 1.2308, + "step": 8618 + }, + { + "epoch": 2.70765848008324, + "grad_norm": 0.91015625, + "learning_rate": 9.260552205649001e-06, + "loss": 1.1774, + "step": 8620 + }, + { + "epoch": 2.7082867071087815, + "grad_norm": 0.91796875, + "learning_rate": 9.258013329101874e-06, + "loss": 1.2488, + "step": 8622 + }, + { + "epoch": 2.708914934134323, + "grad_norm": 0.91015625, + "learning_rate": 9.255474452554745e-06, + "loss": 1.1772, + "step": 8624 + }, + { + "epoch": 2.709543161159864, + "grad_norm": 0.921875, + "learning_rate": 9.252935576007618e-06, + "loss": 1.0649, + "step": 8626 + }, + { + "epoch": 2.7101713881854055, + "grad_norm": 0.9375, + "learning_rate": 9.25039669946049e-06, + "loss": 1.1924, + "step": 8628 + }, + { + "epoch": 2.710799615210947, + "grad_norm": 0.88671875, + "learning_rate": 9.247857822913361e-06, + "loss": 1.2255, + "step": 8630 + }, + { + "epoch": 2.711427842236488, + "grad_norm": 0.8828125, + "learning_rate": 9.245318946366234e-06, + "loss": 1.2982, + "step": 8632 + }, + { + "epoch": 2.7120560692620295, + "grad_norm": 0.890625, + "learning_rate": 9.242780069819106e-06, + "loss": 1.242, + "step": 8634 + }, + { + "epoch": 2.712684296287571, + "grad_norm": 0.84765625, + "learning_rate": 9.240241193271979e-06, + "loss": 1.2739, + "step": 8636 + }, + { + "epoch": 2.713312523313112, + "grad_norm": 0.890625, + "learning_rate": 9.23770231672485e-06, + "loss": 1.1578, + "step": 8638 + }, + { + "epoch": 2.7139407503386535, + "grad_norm": 0.90234375, + "learning_rate": 9.235163440177722e-06, + "loss": 1.3106, + "step": 8640 + }, + { + "epoch": 2.714568977364195, + "grad_norm": 0.88671875, + "learning_rate": 9.232624563630595e-06, + "loss": 1.142, + "step": 8642 + }, + { + "epoch": 2.715197204389736, + "grad_norm": 0.91015625, + "learning_rate": 9.230085687083466e-06, + "loss": 1.1661, + "step": 8644 + }, + { + "epoch": 2.7158254314152774, + "grad_norm": 0.87109375, + "learning_rate": 9.227546810536339e-06, + "loss": 1.2508, + "step": 8646 + }, + { + "epoch": 2.716453658440819, + "grad_norm": 0.8984375, + "learning_rate": 9.22500793398921e-06, + "loss": 1.3473, + "step": 8648 + }, + { + "epoch": 2.7170818854663605, + "grad_norm": 0.93359375, + "learning_rate": 9.222469057442084e-06, + "loss": 1.2084, + "step": 8650 + }, + { + "epoch": 2.717710112491902, + "grad_norm": 0.84765625, + "learning_rate": 9.219930180894955e-06, + "loss": 1.1336, + "step": 8652 + }, + { + "epoch": 2.718338339517443, + "grad_norm": 0.859375, + "learning_rate": 9.217391304347826e-06, + "loss": 1.2546, + "step": 8654 + }, + { + "epoch": 2.7189665665429845, + "grad_norm": 0.82421875, + "learning_rate": 9.2148524278007e-06, + "loss": 1.2444, + "step": 8656 + }, + { + "epoch": 2.719594793568526, + "grad_norm": 0.83984375, + "learning_rate": 9.21231355125357e-06, + "loss": 1.1496, + "step": 8658 + }, + { + "epoch": 2.720223020594067, + "grad_norm": 0.90234375, + "learning_rate": 9.209774674706444e-06, + "loss": 1.0408, + "step": 8660 + }, + { + "epoch": 2.7208512476196085, + "grad_norm": 0.87890625, + "learning_rate": 9.207235798159315e-06, + "loss": 1.1055, + "step": 8662 + }, + { + "epoch": 2.72147947464515, + "grad_norm": 0.8828125, + "learning_rate": 9.204696921612187e-06, + "loss": 1.203, + "step": 8664 + }, + { + "epoch": 2.722107701670691, + "grad_norm": 0.921875, + "learning_rate": 9.20215804506506e-06, + "loss": 1.1595, + "step": 8666 + }, + { + "epoch": 2.7227359286962325, + "grad_norm": 0.91796875, + "learning_rate": 9.199619168517931e-06, + "loss": 1.4185, + "step": 8668 + }, + { + "epoch": 2.723364155721774, + "grad_norm": 0.94921875, + "learning_rate": 9.197080291970804e-06, + "loss": 1.1604, + "step": 8670 + }, + { + "epoch": 2.7239923827473156, + "grad_norm": 0.87890625, + "learning_rate": 9.194541415423676e-06, + "loss": 1.1948, + "step": 8672 + }, + { + "epoch": 2.724620609772857, + "grad_norm": 0.8203125, + "learning_rate": 9.192002538876547e-06, + "loss": 1.1164, + "step": 8674 + }, + { + "epoch": 2.7252488367983982, + "grad_norm": 0.890625, + "learning_rate": 9.18946366232942e-06, + "loss": 1.1879, + "step": 8676 + }, + { + "epoch": 2.7258770638239396, + "grad_norm": 0.84375, + "learning_rate": 9.186924785782291e-06, + "loss": 1.2257, + "step": 8678 + }, + { + "epoch": 2.726505290849481, + "grad_norm": 0.81640625, + "learning_rate": 9.184385909235164e-06, + "loss": 1.1763, + "step": 8680 + }, + { + "epoch": 2.7271335178750222, + "grad_norm": 0.9140625, + "learning_rate": 9.181847032688036e-06, + "loss": 1.1286, + "step": 8682 + }, + { + "epoch": 2.7277617449005636, + "grad_norm": 0.8125, + "learning_rate": 9.179308156140909e-06, + "loss": 1.1175, + "step": 8684 + }, + { + "epoch": 2.728389971926105, + "grad_norm": 0.8671875, + "learning_rate": 9.17676927959378e-06, + "loss": 1.2022, + "step": 8686 + }, + { + "epoch": 2.729018198951646, + "grad_norm": 0.77734375, + "learning_rate": 9.174230403046652e-06, + "loss": 1.3269, + "step": 8688 + }, + { + "epoch": 2.7296464259771875, + "grad_norm": 0.859375, + "learning_rate": 9.171691526499525e-06, + "loss": 1.131, + "step": 8690 + }, + { + "epoch": 2.730274653002729, + "grad_norm": 0.96875, + "learning_rate": 9.169152649952396e-06, + "loss": 1.1066, + "step": 8692 + }, + { + "epoch": 2.73090288002827, + "grad_norm": 0.87109375, + "learning_rate": 9.16661377340527e-06, + "loss": 1.1499, + "step": 8694 + }, + { + "epoch": 2.7315311070538115, + "grad_norm": 0.90234375, + "learning_rate": 9.16407489685814e-06, + "loss": 1.1262, + "step": 8696 + }, + { + "epoch": 2.732159334079353, + "grad_norm": 1.0, + "learning_rate": 9.161536020311012e-06, + "loss": 1.1625, + "step": 8698 + }, + { + "epoch": 2.732787561104894, + "grad_norm": 1.0, + "learning_rate": 9.158997143763885e-06, + "loss": 1.1464, + "step": 8700 + }, + { + "epoch": 2.7334157881304355, + "grad_norm": 0.94140625, + "learning_rate": 9.156458267216756e-06, + "loss": 1.2327, + "step": 8702 + }, + { + "epoch": 2.734044015155977, + "grad_norm": 0.8359375, + "learning_rate": 9.15391939066963e-06, + "loss": 1.0969, + "step": 8704 + }, + { + "epoch": 2.734672242181518, + "grad_norm": 0.828125, + "learning_rate": 9.151380514122503e-06, + "loss": 1.1112, + "step": 8706 + }, + { + "epoch": 2.7353004692070595, + "grad_norm": 0.90234375, + "learning_rate": 9.148841637575374e-06, + "loss": 1.3812, + "step": 8708 + }, + { + "epoch": 2.735928696232601, + "grad_norm": 0.83203125, + "learning_rate": 9.146302761028247e-06, + "loss": 1.2572, + "step": 8710 + }, + { + "epoch": 2.736556923258142, + "grad_norm": 0.8984375, + "learning_rate": 9.143763884481118e-06, + "loss": 1.142, + "step": 8712 + }, + { + "epoch": 2.737185150283684, + "grad_norm": 0.8984375, + "learning_rate": 9.14122500793399e-06, + "loss": 1.144, + "step": 8714 + }, + { + "epoch": 2.7378133773092252, + "grad_norm": 0.89453125, + "learning_rate": 9.138686131386863e-06, + "loss": 1.3231, + "step": 8716 + }, + { + "epoch": 2.7384416043347666, + "grad_norm": 0.8828125, + "learning_rate": 9.136147254839734e-06, + "loss": 1.1236, + "step": 8718 + }, + { + "epoch": 2.739069831360308, + "grad_norm": 0.88671875, + "learning_rate": 9.133608378292607e-06, + "loss": 1.1687, + "step": 8720 + }, + { + "epoch": 2.7396980583858492, + "grad_norm": 0.86328125, + "learning_rate": 9.131069501745479e-06, + "loss": 1.2533, + "step": 8722 + }, + { + "epoch": 2.7403262854113906, + "grad_norm": 0.84375, + "learning_rate": 9.12853062519835e-06, + "loss": 1.0806, + "step": 8724 + }, + { + "epoch": 2.740954512436932, + "grad_norm": 0.890625, + "learning_rate": 9.125991748651223e-06, + "loss": 1.2269, + "step": 8726 + }, + { + "epoch": 2.741582739462473, + "grad_norm": 0.90234375, + "learning_rate": 9.123452872104095e-06, + "loss": 1.0659, + "step": 8728 + }, + { + "epoch": 2.7422109664880145, + "grad_norm": 0.83203125, + "learning_rate": 9.120913995556968e-06, + "loss": 1.2846, + "step": 8730 + }, + { + "epoch": 2.742839193513556, + "grad_norm": 0.8828125, + "learning_rate": 9.118375119009839e-06, + "loss": 1.2342, + "step": 8732 + }, + { + "epoch": 2.743467420539097, + "grad_norm": 0.79296875, + "learning_rate": 9.11583624246271e-06, + "loss": 1.2884, + "step": 8734 + }, + { + "epoch": 2.7440956475646385, + "grad_norm": 0.859375, + "learning_rate": 9.113297365915583e-06, + "loss": 1.2428, + "step": 8736 + }, + { + "epoch": 2.7447238745901803, + "grad_norm": 0.8984375, + "learning_rate": 9.110758489368455e-06, + "loss": 1.2098, + "step": 8738 + }, + { + "epoch": 2.7453521016157216, + "grad_norm": 0.828125, + "learning_rate": 9.108219612821328e-06, + "loss": 1.2179, + "step": 8740 + }, + { + "epoch": 2.745980328641263, + "grad_norm": 0.8203125, + "learning_rate": 9.1056807362742e-06, + "loss": 1.3499, + "step": 8742 + }, + { + "epoch": 2.7466085556668043, + "grad_norm": 0.8515625, + "learning_rate": 9.103141859727072e-06, + "loss": 1.0601, + "step": 8744 + }, + { + "epoch": 2.7472367826923456, + "grad_norm": 0.828125, + "learning_rate": 9.100602983179944e-06, + "loss": 1.2922, + "step": 8746 + }, + { + "epoch": 2.747865009717887, + "grad_norm": 0.8515625, + "learning_rate": 9.098064106632815e-06, + "loss": 1.1371, + "step": 8748 + }, + { + "epoch": 2.7484932367434283, + "grad_norm": 0.8828125, + "learning_rate": 9.095525230085688e-06, + "loss": 1.2278, + "step": 8750 + }, + { + "epoch": 2.7491214637689696, + "grad_norm": 0.953125, + "learning_rate": 9.09298635353856e-06, + "loss": 1.1732, + "step": 8752 + }, + { + "epoch": 2.749749690794511, + "grad_norm": 0.859375, + "learning_rate": 9.090447476991433e-06, + "loss": 1.1805, + "step": 8754 + }, + { + "epoch": 2.7503779178200523, + "grad_norm": 1.078125, + "learning_rate": 9.087908600444304e-06, + "loss": 1.0657, + "step": 8756 + }, + { + "epoch": 2.7510061448455936, + "grad_norm": 1.15625, + "learning_rate": 9.085369723897175e-06, + "loss": 1.1152, + "step": 8758 + }, + { + "epoch": 2.751634371871135, + "grad_norm": 0.96875, + "learning_rate": 9.082830847350049e-06, + "loss": 1.1851, + "step": 8760 + }, + { + "epoch": 2.7522625988966762, + "grad_norm": 0.8828125, + "learning_rate": 9.08029197080292e-06, + "loss": 1.1882, + "step": 8762 + }, + { + "epoch": 2.7528908259222176, + "grad_norm": 0.8671875, + "learning_rate": 9.077753094255793e-06, + "loss": 1.1752, + "step": 8764 + }, + { + "epoch": 2.753519052947759, + "grad_norm": 1.625, + "learning_rate": 9.075214217708664e-06, + "loss": 1.2417, + "step": 8766 + }, + { + "epoch": 2.7541472799733, + "grad_norm": 0.84375, + "learning_rate": 9.072675341161536e-06, + "loss": 1.1084, + "step": 8768 + }, + { + "epoch": 2.7547755069988415, + "grad_norm": 0.9140625, + "learning_rate": 9.070136464614409e-06, + "loss": 1.1443, + "step": 8770 + }, + { + "epoch": 2.755403734024383, + "grad_norm": 1.0078125, + "learning_rate": 9.06759758806728e-06, + "loss": 1.193, + "step": 8772 + }, + { + "epoch": 2.756031961049924, + "grad_norm": 0.93359375, + "learning_rate": 9.065058711520153e-06, + "loss": 1.2437, + "step": 8774 + }, + { + "epoch": 2.7566601880754655, + "grad_norm": 0.8359375, + "learning_rate": 9.062519834973025e-06, + "loss": 1.2941, + "step": 8776 + }, + { + "epoch": 2.757288415101007, + "grad_norm": 0.89453125, + "learning_rate": 9.059980958425896e-06, + "loss": 1.0076, + "step": 8778 + }, + { + "epoch": 2.7579166421265486, + "grad_norm": 0.91015625, + "learning_rate": 9.057442081878769e-06, + "loss": 1.1461, + "step": 8780 + }, + { + "epoch": 2.75854486915209, + "grad_norm": 0.93359375, + "learning_rate": 9.05490320533164e-06, + "loss": 1.1561, + "step": 8782 + }, + { + "epoch": 2.7591730961776313, + "grad_norm": 0.90234375, + "learning_rate": 9.052364328784514e-06, + "loss": 1.104, + "step": 8784 + }, + { + "epoch": 2.7598013232031726, + "grad_norm": 0.796875, + "learning_rate": 9.049825452237385e-06, + "loss": 1.1338, + "step": 8786 + }, + { + "epoch": 2.760429550228714, + "grad_norm": 0.859375, + "learning_rate": 9.047286575690258e-06, + "loss": 1.2664, + "step": 8788 + }, + { + "epoch": 2.7610577772542553, + "grad_norm": 0.8984375, + "learning_rate": 9.044747699143131e-06, + "loss": 1.1987, + "step": 8790 + }, + { + "epoch": 2.7616860042797966, + "grad_norm": 0.84765625, + "learning_rate": 9.042208822596002e-06, + "loss": 1.1966, + "step": 8792 + }, + { + "epoch": 2.762314231305338, + "grad_norm": 0.87109375, + "learning_rate": 9.039669946048874e-06, + "loss": 1.3295, + "step": 8794 + }, + { + "epoch": 2.7629424583308793, + "grad_norm": 0.88671875, + "learning_rate": 9.037131069501747e-06, + "loss": 1.2477, + "step": 8796 + }, + { + "epoch": 2.7635706853564206, + "grad_norm": 0.88671875, + "learning_rate": 9.034592192954618e-06, + "loss": 1.1968, + "step": 8798 + }, + { + "epoch": 2.764198912381962, + "grad_norm": 0.921875, + "learning_rate": 9.032053316407491e-06, + "loss": 1.2968, + "step": 8800 + }, + { + "epoch": 2.7648271394075032, + "grad_norm": 0.8671875, + "learning_rate": 9.029514439860363e-06, + "loss": 1.22, + "step": 8802 + }, + { + "epoch": 2.765455366433045, + "grad_norm": 0.98046875, + "learning_rate": 9.026975563313234e-06, + "loss": 1.184, + "step": 8804 + }, + { + "epoch": 2.7660835934585863, + "grad_norm": 0.9296875, + "learning_rate": 9.024436686766107e-06, + "loss": 1.2109, + "step": 8806 + }, + { + "epoch": 2.7667118204841277, + "grad_norm": 0.82421875, + "learning_rate": 9.021897810218979e-06, + "loss": 1.2232, + "step": 8808 + }, + { + "epoch": 2.767340047509669, + "grad_norm": 0.875, + "learning_rate": 9.019358933671852e-06, + "loss": 1.4004, + "step": 8810 + }, + { + "epoch": 2.7679682745352103, + "grad_norm": 0.9765625, + "learning_rate": 9.016820057124723e-06, + "loss": 1.2854, + "step": 8812 + }, + { + "epoch": 2.7685965015607517, + "grad_norm": 0.9140625, + "learning_rate": 9.014281180577596e-06, + "loss": 1.0666, + "step": 8814 + }, + { + "epoch": 2.769224728586293, + "grad_norm": 0.9140625, + "learning_rate": 9.011742304030468e-06, + "loss": 1.241, + "step": 8816 + }, + { + "epoch": 2.7698529556118343, + "grad_norm": 0.93359375, + "learning_rate": 9.009203427483339e-06, + "loss": 1.0892, + "step": 8818 + }, + { + "epoch": 2.7704811826373756, + "grad_norm": 0.9140625, + "learning_rate": 9.006664550936212e-06, + "loss": 1.1606, + "step": 8820 + }, + { + "epoch": 2.771109409662917, + "grad_norm": 0.88671875, + "learning_rate": 9.004125674389083e-06, + "loss": 1.2497, + "step": 8822 + }, + { + "epoch": 2.7717376366884583, + "grad_norm": 0.8828125, + "learning_rate": 9.001586797841956e-06, + "loss": 1.2447, + "step": 8824 + }, + { + "epoch": 2.7723658637139996, + "grad_norm": 0.98046875, + "learning_rate": 8.999047921294828e-06, + "loss": 1.1626, + "step": 8826 + }, + { + "epoch": 2.772994090739541, + "grad_norm": 0.9609375, + "learning_rate": 8.9965090447477e-06, + "loss": 1.0602, + "step": 8828 + }, + { + "epoch": 2.7736223177650823, + "grad_norm": 1.0390625, + "learning_rate": 8.993970168200572e-06, + "loss": 1.1906, + "step": 8830 + }, + { + "epoch": 2.7742505447906236, + "grad_norm": 0.94921875, + "learning_rate": 8.991431291653444e-06, + "loss": 1.1996, + "step": 8832 + }, + { + "epoch": 2.774878771816165, + "grad_norm": 0.8671875, + "learning_rate": 8.988892415106317e-06, + "loss": 1.2612, + "step": 8834 + }, + { + "epoch": 2.7755069988417063, + "grad_norm": 0.92578125, + "learning_rate": 8.986353538559188e-06, + "loss": 1.1974, + "step": 8836 + }, + { + "epoch": 2.7761352258672476, + "grad_norm": 0.83984375, + "learning_rate": 8.98381466201206e-06, + "loss": 1.1045, + "step": 8838 + }, + { + "epoch": 2.776763452892789, + "grad_norm": 0.9609375, + "learning_rate": 8.981275785464933e-06, + "loss": 1.1507, + "step": 8840 + }, + { + "epoch": 2.7773916799183302, + "grad_norm": 0.890625, + "learning_rate": 8.978736908917804e-06, + "loss": 1.2665, + "step": 8842 + }, + { + "epoch": 2.7780199069438716, + "grad_norm": 0.8984375, + "learning_rate": 8.976198032370677e-06, + "loss": 1.1378, + "step": 8844 + }, + { + "epoch": 2.7786481339694133, + "grad_norm": 0.9453125, + "learning_rate": 8.973659155823548e-06, + "loss": 1.0703, + "step": 8846 + }, + { + "epoch": 2.7792763609949547, + "grad_norm": 0.90234375, + "learning_rate": 8.971120279276422e-06, + "loss": 1.0785, + "step": 8848 + }, + { + "epoch": 2.779904588020496, + "grad_norm": 0.796875, + "learning_rate": 8.968581402729293e-06, + "loss": 1.2303, + "step": 8850 + }, + { + "epoch": 2.7805328150460373, + "grad_norm": 0.9609375, + "learning_rate": 8.966042526182164e-06, + "loss": 1.2005, + "step": 8852 + }, + { + "epoch": 2.7811610420715787, + "grad_norm": 0.87109375, + "learning_rate": 8.963503649635037e-06, + "loss": 1.2388, + "step": 8854 + }, + { + "epoch": 2.78178926909712, + "grad_norm": 0.9609375, + "learning_rate": 8.960964773087909e-06, + "loss": 1.1309, + "step": 8856 + }, + { + "epoch": 2.7824174961226613, + "grad_norm": 0.90234375, + "learning_rate": 8.958425896540782e-06, + "loss": 1.2428, + "step": 8858 + }, + { + "epoch": 2.7830457231482026, + "grad_norm": 0.875, + "learning_rate": 8.955887019993653e-06, + "loss": 1.1717, + "step": 8860 + }, + { + "epoch": 2.783673950173744, + "grad_norm": 0.83984375, + "learning_rate": 8.953348143446525e-06, + "loss": 1.2138, + "step": 8862 + }, + { + "epoch": 2.7843021771992853, + "grad_norm": 0.89453125, + "learning_rate": 8.950809266899398e-06, + "loss": 1.0967, + "step": 8864 + }, + { + "epoch": 2.7849304042248266, + "grad_norm": 0.87109375, + "learning_rate": 8.948270390352269e-06, + "loss": 1.164, + "step": 8866 + }, + { + "epoch": 2.785558631250368, + "grad_norm": 0.875, + "learning_rate": 8.945731513805142e-06, + "loss": 1.1851, + "step": 8868 + }, + { + "epoch": 2.7861868582759097, + "grad_norm": 0.85546875, + "learning_rate": 8.943192637258013e-06, + "loss": 1.09, + "step": 8870 + }, + { + "epoch": 2.786815085301451, + "grad_norm": 0.94140625, + "learning_rate": 8.940653760710885e-06, + "loss": 1.2916, + "step": 8872 + }, + { + "epoch": 2.7874433123269924, + "grad_norm": 0.828125, + "learning_rate": 8.938114884163758e-06, + "loss": 1.2003, + "step": 8874 + }, + { + "epoch": 2.7880715393525337, + "grad_norm": 0.890625, + "learning_rate": 8.935576007616631e-06, + "loss": 1.2229, + "step": 8876 + }, + { + "epoch": 2.788699766378075, + "grad_norm": 0.91796875, + "learning_rate": 8.933037131069502e-06, + "loss": 1.2452, + "step": 8878 + }, + { + "epoch": 2.7893279934036164, + "grad_norm": 0.890625, + "learning_rate": 8.930498254522375e-06, + "loss": 1.2213, + "step": 8880 + }, + { + "epoch": 2.7899562204291577, + "grad_norm": 0.94140625, + "learning_rate": 8.927959377975247e-06, + "loss": 1.0626, + "step": 8882 + }, + { + "epoch": 2.790584447454699, + "grad_norm": 0.89453125, + "learning_rate": 8.92542050142812e-06, + "loss": 1.0885, + "step": 8884 + }, + { + "epoch": 2.7912126744802404, + "grad_norm": 0.91796875, + "learning_rate": 8.922881624880991e-06, + "loss": 1.1579, + "step": 8886 + }, + { + "epoch": 2.7918409015057817, + "grad_norm": 0.8671875, + "learning_rate": 8.920342748333863e-06, + "loss": 1.1024, + "step": 8888 + }, + { + "epoch": 2.792469128531323, + "grad_norm": 0.875, + "learning_rate": 8.917803871786736e-06, + "loss": 1.2193, + "step": 8890 + }, + { + "epoch": 2.7930973555568643, + "grad_norm": 0.921875, + "learning_rate": 8.915264995239607e-06, + "loss": 1.0511, + "step": 8892 + }, + { + "epoch": 2.7937255825824057, + "grad_norm": 0.8359375, + "learning_rate": 8.91272611869248e-06, + "loss": 1.1799, + "step": 8894 + }, + { + "epoch": 2.794353809607947, + "grad_norm": 0.91796875, + "learning_rate": 8.910187242145352e-06, + "loss": 1.121, + "step": 8896 + }, + { + "epoch": 2.7949820366334883, + "grad_norm": 0.90234375, + "learning_rate": 8.907648365598223e-06, + "loss": 1.3075, + "step": 8898 + }, + { + "epoch": 2.7956102636590296, + "grad_norm": 0.8984375, + "learning_rate": 8.905109489051096e-06, + "loss": 1.0603, + "step": 8900 + }, + { + "epoch": 2.796238490684571, + "grad_norm": 0.859375, + "learning_rate": 8.902570612503967e-06, + "loss": 1.1152, + "step": 8902 + }, + { + "epoch": 2.7968667177101123, + "grad_norm": 0.8359375, + "learning_rate": 8.90003173595684e-06, + "loss": 1.1665, + "step": 8904 + }, + { + "epoch": 2.7974949447356536, + "grad_norm": 0.92578125, + "learning_rate": 8.897492859409712e-06, + "loss": 1.1832, + "step": 8906 + }, + { + "epoch": 2.798123171761195, + "grad_norm": 1.046875, + "learning_rate": 8.894953982862583e-06, + "loss": 1.162, + "step": 8908 + }, + { + "epoch": 2.7987513987867363, + "grad_norm": 0.83203125, + "learning_rate": 8.892415106315456e-06, + "loss": 1.2164, + "step": 8910 + }, + { + "epoch": 2.799379625812278, + "grad_norm": 0.86328125, + "learning_rate": 8.889876229768328e-06, + "loss": 1.1247, + "step": 8912 + }, + { + "epoch": 2.8000078528378194, + "grad_norm": 0.84375, + "learning_rate": 8.8873373532212e-06, + "loss": 1.0951, + "step": 8914 + }, + { + "epoch": 2.8006360798633607, + "grad_norm": 1.1953125, + "learning_rate": 8.884798476674072e-06, + "loss": 1.1273, + "step": 8916 + }, + { + "epoch": 2.801264306888902, + "grad_norm": 0.859375, + "learning_rate": 8.882259600126945e-06, + "loss": 1.0575, + "step": 8918 + }, + { + "epoch": 2.8018925339144434, + "grad_norm": 1.109375, + "learning_rate": 8.879720723579817e-06, + "loss": 1.2221, + "step": 8920 + }, + { + "epoch": 2.8025207609399847, + "grad_norm": 0.91015625, + "learning_rate": 8.877181847032688e-06, + "loss": 1.2034, + "step": 8922 + }, + { + "epoch": 2.803148987965526, + "grad_norm": 0.84765625, + "learning_rate": 8.874642970485561e-06, + "loss": 1.1343, + "step": 8924 + }, + { + "epoch": 2.8037772149910674, + "grad_norm": 0.9140625, + "learning_rate": 8.872104093938433e-06, + "loss": 1.1435, + "step": 8926 + }, + { + "epoch": 2.8044054420166087, + "grad_norm": 0.97265625, + "learning_rate": 8.869565217391306e-06, + "loss": 1.1574, + "step": 8928 + }, + { + "epoch": 2.80503366904215, + "grad_norm": 0.83984375, + "learning_rate": 8.867026340844177e-06, + "loss": 1.1683, + "step": 8930 + }, + { + "epoch": 2.8056618960676913, + "grad_norm": 0.921875, + "learning_rate": 8.864487464297048e-06, + "loss": 1.1728, + "step": 8932 + }, + { + "epoch": 2.8062901230932327, + "grad_norm": 0.82421875, + "learning_rate": 8.861948587749921e-06, + "loss": 1.3423, + "step": 8934 + }, + { + "epoch": 2.8069183501187744, + "grad_norm": 0.80859375, + "learning_rate": 8.859409711202793e-06, + "loss": 1.1885, + "step": 8936 + }, + { + "epoch": 2.8075465771443158, + "grad_norm": 0.8515625, + "learning_rate": 8.856870834655666e-06, + "loss": 1.2339, + "step": 8938 + }, + { + "epoch": 2.808174804169857, + "grad_norm": 0.859375, + "learning_rate": 8.854331958108537e-06, + "loss": 1.1838, + "step": 8940 + }, + { + "epoch": 2.8088030311953984, + "grad_norm": 0.828125, + "learning_rate": 8.851793081561409e-06, + "loss": 1.1717, + "step": 8942 + }, + { + "epoch": 2.8094312582209398, + "grad_norm": 0.89453125, + "learning_rate": 8.849254205014282e-06, + "loss": 1.1993, + "step": 8944 + }, + { + "epoch": 2.810059485246481, + "grad_norm": 0.8671875, + "learning_rate": 8.846715328467153e-06, + "loss": 1.2121, + "step": 8946 + }, + { + "epoch": 2.8106877122720224, + "grad_norm": 0.9453125, + "learning_rate": 8.844176451920026e-06, + "loss": 1.1136, + "step": 8948 + }, + { + "epoch": 2.8113159392975637, + "grad_norm": 0.8828125, + "learning_rate": 8.841637575372898e-06, + "loss": 1.0336, + "step": 8950 + }, + { + "epoch": 2.811944166323105, + "grad_norm": 0.84375, + "learning_rate": 8.83909869882577e-06, + "loss": 1.2212, + "step": 8952 + }, + { + "epoch": 2.8125723933486464, + "grad_norm": 0.91015625, + "learning_rate": 8.836559822278642e-06, + "loss": 1.1801, + "step": 8954 + }, + { + "epoch": 2.8132006203741877, + "grad_norm": 0.86328125, + "learning_rate": 8.834020945731513e-06, + "loss": 1.2164, + "step": 8956 + }, + { + "epoch": 2.813828847399729, + "grad_norm": 0.90234375, + "learning_rate": 8.831482069184387e-06, + "loss": 1.1291, + "step": 8958 + }, + { + "epoch": 2.8144570744252704, + "grad_norm": 0.8359375, + "learning_rate": 8.828943192637258e-06, + "loss": 1.1178, + "step": 8960 + }, + { + "epoch": 2.8150853014508117, + "grad_norm": 1.0, + "learning_rate": 8.826404316090131e-06, + "loss": 1.0997, + "step": 8962 + }, + { + "epoch": 2.815713528476353, + "grad_norm": 0.85546875, + "learning_rate": 8.823865439543004e-06, + "loss": 1.2184, + "step": 8964 + }, + { + "epoch": 2.8163417555018944, + "grad_norm": 0.94921875, + "learning_rate": 8.821326562995875e-06, + "loss": 1.1807, + "step": 8966 + }, + { + "epoch": 2.8169699825274357, + "grad_norm": 0.90234375, + "learning_rate": 8.818787686448747e-06, + "loss": 1.2236, + "step": 8968 + }, + { + "epoch": 2.817598209552977, + "grad_norm": 0.90234375, + "learning_rate": 8.81624880990162e-06, + "loss": 1.2124, + "step": 8970 + }, + { + "epoch": 2.8182264365785183, + "grad_norm": 0.9296875, + "learning_rate": 8.813709933354491e-06, + "loss": 1.1715, + "step": 8972 + }, + { + "epoch": 2.8188546636040597, + "grad_norm": 0.94140625, + "learning_rate": 8.811171056807364e-06, + "loss": 1.1341, + "step": 8974 + }, + { + "epoch": 2.819482890629601, + "grad_norm": 0.95703125, + "learning_rate": 8.808632180260236e-06, + "loss": 1.1516, + "step": 8976 + }, + { + "epoch": 2.8201111176551428, + "grad_norm": 0.8671875, + "learning_rate": 8.806093303713109e-06, + "loss": 1.2244, + "step": 8978 + }, + { + "epoch": 2.820739344680684, + "grad_norm": 0.92578125, + "learning_rate": 8.80355442716598e-06, + "loss": 1.0713, + "step": 8980 + }, + { + "epoch": 2.8213675717062254, + "grad_norm": 0.890625, + "learning_rate": 8.801015550618852e-06, + "loss": 1.2261, + "step": 8982 + }, + { + "epoch": 2.8219957987317668, + "grad_norm": 0.89453125, + "learning_rate": 8.798476674071725e-06, + "loss": 1.2586, + "step": 8984 + }, + { + "epoch": 2.822624025757308, + "grad_norm": 0.890625, + "learning_rate": 8.795937797524596e-06, + "loss": 1.1711, + "step": 8986 + }, + { + "epoch": 2.8232522527828494, + "grad_norm": 0.859375, + "learning_rate": 8.793398920977469e-06, + "loss": 1.2093, + "step": 8988 + }, + { + "epoch": 2.8238804798083907, + "grad_norm": 0.87890625, + "learning_rate": 8.79086004443034e-06, + "loss": 1.2378, + "step": 8990 + }, + { + "epoch": 2.824508706833932, + "grad_norm": 0.9140625, + "learning_rate": 8.788321167883212e-06, + "loss": 1.1352, + "step": 8992 + }, + { + "epoch": 2.8251369338594734, + "grad_norm": 0.96484375, + "learning_rate": 8.785782291336085e-06, + "loss": 1.2086, + "step": 8994 + }, + { + "epoch": 2.8257651608850147, + "grad_norm": 0.95703125, + "learning_rate": 8.783243414788956e-06, + "loss": 1.1629, + "step": 8996 + }, + { + "epoch": 2.826393387910556, + "grad_norm": 0.89453125, + "learning_rate": 8.78070453824183e-06, + "loss": 1.1229, + "step": 8998 + }, + { + "epoch": 2.8270216149360974, + "grad_norm": 0.9765625, + "learning_rate": 8.7781656616947e-06, + "loss": 1.2158, + "step": 9000 + }, + { + "epoch": 2.827649841961639, + "grad_norm": 0.875, + "learning_rate": 8.775626785147572e-06, + "loss": 1.2396, + "step": 9002 + }, + { + "epoch": 2.8282780689871805, + "grad_norm": 0.78125, + "learning_rate": 8.773087908600445e-06, + "loss": 1.1963, + "step": 9004 + }, + { + "epoch": 2.828906296012722, + "grad_norm": 0.84375, + "learning_rate": 8.770549032053317e-06, + "loss": 1.2174, + "step": 9006 + }, + { + "epoch": 2.829534523038263, + "grad_norm": 0.90625, + "learning_rate": 8.76801015550619e-06, + "loss": 1.1699, + "step": 9008 + }, + { + "epoch": 2.8301627500638045, + "grad_norm": 1.015625, + "learning_rate": 8.765471278959061e-06, + "loss": 1.1734, + "step": 9010 + }, + { + "epoch": 2.830790977089346, + "grad_norm": 0.8671875, + "learning_rate": 8.762932402411934e-06, + "loss": 1.1106, + "step": 9012 + }, + { + "epoch": 2.831419204114887, + "grad_norm": 0.84765625, + "learning_rate": 8.760393525864806e-06, + "loss": 1.228, + "step": 9014 + }, + { + "epoch": 2.8320474311404285, + "grad_norm": 0.91796875, + "learning_rate": 8.757854649317677e-06, + "loss": 1.2233, + "step": 9016 + }, + { + "epoch": 2.83267565816597, + "grad_norm": 0.80078125, + "learning_rate": 8.75531577277055e-06, + "loss": 1.265, + "step": 9018 + }, + { + "epoch": 2.833303885191511, + "grad_norm": 0.88671875, + "learning_rate": 8.752776896223421e-06, + "loss": 1.2456, + "step": 9020 + }, + { + "epoch": 2.8339321122170524, + "grad_norm": 0.96484375, + "learning_rate": 8.750238019676294e-06, + "loss": 1.2257, + "step": 9022 + }, + { + "epoch": 2.8345603392425938, + "grad_norm": 0.890625, + "learning_rate": 8.747699143129166e-06, + "loss": 1.1829, + "step": 9024 + }, + { + "epoch": 2.835188566268135, + "grad_norm": 0.8515625, + "learning_rate": 8.745160266582037e-06, + "loss": 1.2734, + "step": 9026 + }, + { + "epoch": 2.8358167932936764, + "grad_norm": 0.9765625, + "learning_rate": 8.74262139003491e-06, + "loss": 1.1946, + "step": 9028 + }, + { + "epoch": 2.8364450203192177, + "grad_norm": 0.828125, + "learning_rate": 8.740082513487782e-06, + "loss": 1.3757, + "step": 9030 + }, + { + "epoch": 2.837073247344759, + "grad_norm": 0.90625, + "learning_rate": 8.737543636940655e-06, + "loss": 1.2224, + "step": 9032 + }, + { + "epoch": 2.8377014743703004, + "grad_norm": 0.83984375, + "learning_rate": 8.735004760393526e-06, + "loss": 1.1388, + "step": 9034 + }, + { + "epoch": 2.8383297013958417, + "grad_norm": 0.85546875, + "learning_rate": 8.732465883846398e-06, + "loss": 1.1863, + "step": 9036 + }, + { + "epoch": 2.838957928421383, + "grad_norm": 0.87890625, + "learning_rate": 8.72992700729927e-06, + "loss": 1.2225, + "step": 9038 + }, + { + "epoch": 2.8395861554469244, + "grad_norm": 0.87890625, + "learning_rate": 8.727388130752142e-06, + "loss": 1.2286, + "step": 9040 + }, + { + "epoch": 2.8402143824724657, + "grad_norm": 0.8515625, + "learning_rate": 8.724849254205015e-06, + "loss": 1.086, + "step": 9042 + }, + { + "epoch": 2.8408426094980075, + "grad_norm": 0.92578125, + "learning_rate": 8.722310377657886e-06, + "loss": 1.159, + "step": 9044 + }, + { + "epoch": 2.841470836523549, + "grad_norm": 0.90234375, + "learning_rate": 8.719771501110758e-06, + "loss": 1.2525, + "step": 9046 + }, + { + "epoch": 2.84209906354909, + "grad_norm": 0.88671875, + "learning_rate": 8.717232624563633e-06, + "loss": 1.0467, + "step": 9048 + }, + { + "epoch": 2.8427272905746315, + "grad_norm": 0.89453125, + "learning_rate": 8.714693748016504e-06, + "loss": 1.3227, + "step": 9050 + }, + { + "epoch": 2.843355517600173, + "grad_norm": 0.91796875, + "learning_rate": 8.712154871469375e-06, + "loss": 1.0992, + "step": 9052 + }, + { + "epoch": 2.843983744625714, + "grad_norm": 0.859375, + "learning_rate": 8.709615994922248e-06, + "loss": 1.1781, + "step": 9054 + }, + { + "epoch": 2.8446119716512555, + "grad_norm": 0.98828125, + "learning_rate": 8.70707711837512e-06, + "loss": 1.2496, + "step": 9056 + }, + { + "epoch": 2.845240198676797, + "grad_norm": 0.94921875, + "learning_rate": 8.704538241827993e-06, + "loss": 1.2135, + "step": 9058 + }, + { + "epoch": 2.845868425702338, + "grad_norm": 0.9296875, + "learning_rate": 8.701999365280864e-06, + "loss": 1.0489, + "step": 9060 + }, + { + "epoch": 2.8464966527278794, + "grad_norm": 0.97265625, + "learning_rate": 8.699460488733736e-06, + "loss": 1.0931, + "step": 9062 + }, + { + "epoch": 2.8471248797534208, + "grad_norm": 0.87109375, + "learning_rate": 8.696921612186609e-06, + "loss": 1.1408, + "step": 9064 + }, + { + "epoch": 2.847753106778962, + "grad_norm": 0.87890625, + "learning_rate": 8.69438273563948e-06, + "loss": 1.2734, + "step": 9066 + }, + { + "epoch": 2.848381333804504, + "grad_norm": 0.82421875, + "learning_rate": 8.691843859092353e-06, + "loss": 1.1622, + "step": 9068 + }, + { + "epoch": 2.849009560830045, + "grad_norm": 0.87890625, + "learning_rate": 8.689304982545225e-06, + "loss": 1.1151, + "step": 9070 + }, + { + "epoch": 2.8496377878555865, + "grad_norm": 0.83984375, + "learning_rate": 8.686766105998096e-06, + "loss": 1.0864, + "step": 9072 + }, + { + "epoch": 2.850266014881128, + "grad_norm": 0.85546875, + "learning_rate": 8.684227229450969e-06, + "loss": 1.1366, + "step": 9074 + }, + { + "epoch": 2.850894241906669, + "grad_norm": 0.828125, + "learning_rate": 8.68168835290384e-06, + "loss": 1.1638, + "step": 9076 + }, + { + "epoch": 2.8515224689322105, + "grad_norm": 0.87109375, + "learning_rate": 8.679149476356713e-06, + "loss": 1.2531, + "step": 9078 + }, + { + "epoch": 2.852150695957752, + "grad_norm": 0.90234375, + "learning_rate": 8.676610599809585e-06, + "loss": 1.2749, + "step": 9080 + }, + { + "epoch": 2.852778922983293, + "grad_norm": 0.859375, + "learning_rate": 8.674071723262458e-06, + "loss": 1.1556, + "step": 9082 + }, + { + "epoch": 2.8534071500088345, + "grad_norm": 0.8828125, + "learning_rate": 8.67153284671533e-06, + "loss": 1.129, + "step": 9084 + }, + { + "epoch": 2.854035377034376, + "grad_norm": 0.8359375, + "learning_rate": 8.6689939701682e-06, + "loss": 1.2725, + "step": 9086 + }, + { + "epoch": 2.854663604059917, + "grad_norm": 0.9921875, + "learning_rate": 8.666455093621074e-06, + "loss": 1.3089, + "step": 9088 + }, + { + "epoch": 2.8552918310854585, + "grad_norm": 0.91796875, + "learning_rate": 8.663916217073945e-06, + "loss": 1.2405, + "step": 9090 + }, + { + "epoch": 2.855920058111, + "grad_norm": 0.93359375, + "learning_rate": 8.661377340526818e-06, + "loss": 1.2605, + "step": 9092 + }, + { + "epoch": 2.856548285136541, + "grad_norm": 1.0703125, + "learning_rate": 8.65883846397969e-06, + "loss": 1.1111, + "step": 9094 + }, + { + "epoch": 2.8571765121620825, + "grad_norm": 0.89453125, + "learning_rate": 8.656299587432561e-06, + "loss": 1.1104, + "step": 9096 + }, + { + "epoch": 2.857804739187624, + "grad_norm": 0.86328125, + "learning_rate": 8.653760710885434e-06, + "loss": 1.2555, + "step": 9098 + }, + { + "epoch": 2.858432966213165, + "grad_norm": 0.89453125, + "learning_rate": 8.651221834338305e-06, + "loss": 1.2309, + "step": 9100 + }, + { + "epoch": 2.8590611932387064, + "grad_norm": 0.88671875, + "learning_rate": 8.648682957791179e-06, + "loss": 1.1012, + "step": 9102 + }, + { + "epoch": 2.8596894202642478, + "grad_norm": 0.87890625, + "learning_rate": 8.64614408124405e-06, + "loss": 1.1028, + "step": 9104 + }, + { + "epoch": 2.860317647289789, + "grad_norm": 0.921875, + "learning_rate": 8.643605204696921e-06, + "loss": 1.2502, + "step": 9106 + }, + { + "epoch": 2.8609458743153304, + "grad_norm": 0.86328125, + "learning_rate": 8.641066328149794e-06, + "loss": 1.2413, + "step": 9108 + }, + { + "epoch": 2.861574101340872, + "grad_norm": 0.91015625, + "learning_rate": 8.638527451602666e-06, + "loss": 1.1771, + "step": 9110 + }, + { + "epoch": 2.8622023283664135, + "grad_norm": 0.89453125, + "learning_rate": 8.635988575055539e-06, + "loss": 1.2355, + "step": 9112 + }, + { + "epoch": 2.862830555391955, + "grad_norm": 0.81640625, + "learning_rate": 8.63344969850841e-06, + "loss": 1.1476, + "step": 9114 + }, + { + "epoch": 2.863458782417496, + "grad_norm": 0.90625, + "learning_rate": 8.630910821961283e-06, + "loss": 1.1657, + "step": 9116 + }, + { + "epoch": 2.8640870094430375, + "grad_norm": 0.8828125, + "learning_rate": 8.628371945414155e-06, + "loss": 1.2089, + "step": 9118 + }, + { + "epoch": 2.864715236468579, + "grad_norm": 0.96484375, + "learning_rate": 8.625833068867026e-06, + "loss": 1.173, + "step": 9120 + }, + { + "epoch": 2.86534346349412, + "grad_norm": 0.859375, + "learning_rate": 8.623294192319899e-06, + "loss": 1.2437, + "step": 9122 + }, + { + "epoch": 2.8659716905196615, + "grad_norm": 0.89453125, + "learning_rate": 8.62075531577277e-06, + "loss": 1.1699, + "step": 9124 + }, + { + "epoch": 2.866599917545203, + "grad_norm": 0.84765625, + "learning_rate": 8.618216439225644e-06, + "loss": 1.2851, + "step": 9126 + }, + { + "epoch": 2.867228144570744, + "grad_norm": 0.84765625, + "learning_rate": 8.615677562678515e-06, + "loss": 1.0717, + "step": 9128 + }, + { + "epoch": 2.8678563715962855, + "grad_norm": 0.88671875, + "learning_rate": 8.613138686131386e-06, + "loss": 0.9805, + "step": 9130 + }, + { + "epoch": 2.868484598621827, + "grad_norm": 1.078125, + "learning_rate": 8.61059980958426e-06, + "loss": 1.1905, + "step": 9132 + }, + { + "epoch": 2.8691128256473686, + "grad_norm": 0.875, + "learning_rate": 8.608060933037133e-06, + "loss": 1.3264, + "step": 9134 + }, + { + "epoch": 2.86974105267291, + "grad_norm": 0.86328125, + "learning_rate": 8.605522056490004e-06, + "loss": 1.2117, + "step": 9136 + }, + { + "epoch": 2.8703692796984512, + "grad_norm": 0.9765625, + "learning_rate": 8.602983179942877e-06, + "loss": 1.1902, + "step": 9138 + }, + { + "epoch": 2.8709975067239926, + "grad_norm": 0.8203125, + "learning_rate": 8.600444303395748e-06, + "loss": 1.2074, + "step": 9140 + }, + { + "epoch": 2.871625733749534, + "grad_norm": 0.89453125, + "learning_rate": 8.597905426848621e-06, + "loss": 1.2359, + "step": 9142 + }, + { + "epoch": 2.872253960775075, + "grad_norm": 1.140625, + "learning_rate": 8.595366550301493e-06, + "loss": 1.0976, + "step": 9144 + }, + { + "epoch": 2.8728821878006165, + "grad_norm": 0.90234375, + "learning_rate": 8.592827673754364e-06, + "loss": 1.4076, + "step": 9146 + }, + { + "epoch": 2.873510414826158, + "grad_norm": 0.91015625, + "learning_rate": 8.590288797207237e-06, + "loss": 1.2328, + "step": 9148 + }, + { + "epoch": 2.874138641851699, + "grad_norm": 0.94921875, + "learning_rate": 8.587749920660109e-06, + "loss": 1.1426, + "step": 9150 + }, + { + "epoch": 2.8747668688772405, + "grad_norm": 0.95703125, + "learning_rate": 8.585211044112982e-06, + "loss": 1.1638, + "step": 9152 + }, + { + "epoch": 2.875395095902782, + "grad_norm": 0.9140625, + "learning_rate": 8.582672167565853e-06, + "loss": 1.3359, + "step": 9154 + }, + { + "epoch": 2.876023322928323, + "grad_norm": 0.85546875, + "learning_rate": 8.580133291018724e-06, + "loss": 1.0855, + "step": 9156 + }, + { + "epoch": 2.8766515499538645, + "grad_norm": 0.90625, + "learning_rate": 8.577594414471598e-06, + "loss": 1.2933, + "step": 9158 + }, + { + "epoch": 2.877279776979406, + "grad_norm": 0.94140625, + "learning_rate": 8.575055537924469e-06, + "loss": 1.1769, + "step": 9160 + }, + { + "epoch": 2.877908004004947, + "grad_norm": 0.875, + "learning_rate": 8.572516661377342e-06, + "loss": 1.296, + "step": 9162 + }, + { + "epoch": 2.8785362310304885, + "grad_norm": 0.84765625, + "learning_rate": 8.569977784830213e-06, + "loss": 1.2159, + "step": 9164 + }, + { + "epoch": 2.87916445805603, + "grad_norm": 0.86328125, + "learning_rate": 8.567438908283085e-06, + "loss": 1.1394, + "step": 9166 + }, + { + "epoch": 2.879792685081571, + "grad_norm": 0.9765625, + "learning_rate": 8.564900031735958e-06, + "loss": 1.1248, + "step": 9168 + }, + { + "epoch": 2.8804209121071125, + "grad_norm": 0.8125, + "learning_rate": 8.56236115518883e-06, + "loss": 1.2397, + "step": 9170 + }, + { + "epoch": 2.881049139132654, + "grad_norm": 0.875, + "learning_rate": 8.559822278641702e-06, + "loss": 1.2171, + "step": 9172 + }, + { + "epoch": 2.881677366158195, + "grad_norm": 0.88671875, + "learning_rate": 8.557283402094574e-06, + "loss": 1.1612, + "step": 9174 + }, + { + "epoch": 2.882305593183737, + "grad_norm": 0.8359375, + "learning_rate": 8.554744525547447e-06, + "loss": 1.1409, + "step": 9176 + }, + { + "epoch": 2.8829338202092782, + "grad_norm": 0.90234375, + "learning_rate": 8.552205649000318e-06, + "loss": 1.1146, + "step": 9178 + }, + { + "epoch": 2.8835620472348196, + "grad_norm": 0.84375, + "learning_rate": 8.54966677245319e-06, + "loss": 1.1532, + "step": 9180 + }, + { + "epoch": 2.884190274260361, + "grad_norm": 0.88671875, + "learning_rate": 8.547127895906063e-06, + "loss": 1.1232, + "step": 9182 + }, + { + "epoch": 2.8848185012859022, + "grad_norm": 0.8828125, + "learning_rate": 8.544589019358934e-06, + "loss": 1.1765, + "step": 9184 + }, + { + "epoch": 2.8854467283114436, + "grad_norm": 0.8515625, + "learning_rate": 8.542050142811807e-06, + "loss": 1.1835, + "step": 9186 + }, + { + "epoch": 2.886074955336985, + "grad_norm": 0.921875, + "learning_rate": 8.539511266264678e-06, + "loss": 1.1912, + "step": 9188 + }, + { + "epoch": 2.886703182362526, + "grad_norm": 0.87890625, + "learning_rate": 8.53697238971755e-06, + "loss": 1.2664, + "step": 9190 + }, + { + "epoch": 2.8873314093880675, + "grad_norm": 0.9453125, + "learning_rate": 8.534433513170423e-06, + "loss": 1.0205, + "step": 9192 + }, + { + "epoch": 2.887959636413609, + "grad_norm": 0.875, + "learning_rate": 8.531894636623294e-06, + "loss": 1.0527, + "step": 9194 + }, + { + "epoch": 2.88858786343915, + "grad_norm": 0.86328125, + "learning_rate": 8.529355760076167e-06, + "loss": 1.2545, + "step": 9196 + }, + { + "epoch": 2.8892160904646915, + "grad_norm": 0.84765625, + "learning_rate": 8.526816883529039e-06, + "loss": 1.0928, + "step": 9198 + }, + { + "epoch": 2.8898443174902333, + "grad_norm": 0.92578125, + "learning_rate": 8.52427800698191e-06, + "loss": 1.209, + "step": 9200 + }, + { + "epoch": 2.8904725445157746, + "grad_norm": 0.84375, + "learning_rate": 8.521739130434783e-06, + "loss": 1.171, + "step": 9202 + }, + { + "epoch": 2.891100771541316, + "grad_norm": 0.96875, + "learning_rate": 8.519200253887655e-06, + "loss": 1.2062, + "step": 9204 + }, + { + "epoch": 2.8917289985668573, + "grad_norm": 0.8359375, + "learning_rate": 8.516661377340528e-06, + "loss": 1.2552, + "step": 9206 + }, + { + "epoch": 2.8923572255923986, + "grad_norm": 1.65625, + "learning_rate": 8.514122500793399e-06, + "loss": 1.1772, + "step": 9208 + }, + { + "epoch": 2.89298545261794, + "grad_norm": 0.94140625, + "learning_rate": 8.51158362424627e-06, + "loss": 1.2433, + "step": 9210 + }, + { + "epoch": 2.8936136796434813, + "grad_norm": 0.8046875, + "learning_rate": 8.509044747699144e-06, + "loss": 1.1172, + "step": 9212 + }, + { + "epoch": 2.8942419066690226, + "grad_norm": 0.90625, + "learning_rate": 8.506505871152015e-06, + "loss": 1.224, + "step": 9214 + }, + { + "epoch": 2.894870133694564, + "grad_norm": 0.859375, + "learning_rate": 8.503966994604888e-06, + "loss": 1.1046, + "step": 9216 + }, + { + "epoch": 2.8954983607201052, + "grad_norm": 0.8046875, + "learning_rate": 8.50142811805776e-06, + "loss": 1.0652, + "step": 9218 + }, + { + "epoch": 2.8961265877456466, + "grad_norm": 0.96484375, + "learning_rate": 8.498889241510632e-06, + "loss": 1.1309, + "step": 9220 + }, + { + "epoch": 2.896754814771188, + "grad_norm": 0.89453125, + "learning_rate": 8.496350364963506e-06, + "loss": 1.0387, + "step": 9222 + }, + { + "epoch": 2.8973830417967292, + "grad_norm": 0.9375, + "learning_rate": 8.493811488416377e-06, + "loss": 1.0663, + "step": 9224 + }, + { + "epoch": 2.8980112688222706, + "grad_norm": 0.82421875, + "learning_rate": 8.491272611869248e-06, + "loss": 1.2568, + "step": 9226 + }, + { + "epoch": 2.898639495847812, + "grad_norm": 0.8359375, + "learning_rate": 8.488733735322121e-06, + "loss": 1.2742, + "step": 9228 + }, + { + "epoch": 2.899267722873353, + "grad_norm": 0.91796875, + "learning_rate": 8.486194858774993e-06, + "loss": 1.1714, + "step": 9230 + }, + { + "epoch": 2.8998959498988945, + "grad_norm": 1.0703125, + "learning_rate": 8.483655982227866e-06, + "loss": 1.1513, + "step": 9232 + }, + { + "epoch": 2.900524176924436, + "grad_norm": 0.78125, + "learning_rate": 8.481117105680737e-06, + "loss": 1.1691, + "step": 9234 + }, + { + "epoch": 2.901152403949977, + "grad_norm": 0.87890625, + "learning_rate": 8.478578229133609e-06, + "loss": 1.3591, + "step": 9236 + }, + { + "epoch": 2.9017806309755185, + "grad_norm": 0.94921875, + "learning_rate": 8.476039352586482e-06, + "loss": 1.2139, + "step": 9238 + }, + { + "epoch": 2.9024088580010603, + "grad_norm": 0.80078125, + "learning_rate": 8.473500476039353e-06, + "loss": 1.1475, + "step": 9240 + }, + { + "epoch": 2.9030370850266016, + "grad_norm": 0.90234375, + "learning_rate": 8.470961599492226e-06, + "loss": 1.1807, + "step": 9242 + }, + { + "epoch": 2.903665312052143, + "grad_norm": 0.8125, + "learning_rate": 8.468422722945097e-06, + "loss": 1.1678, + "step": 9244 + }, + { + "epoch": 2.9042935390776843, + "grad_norm": 0.8359375, + "learning_rate": 8.46588384639797e-06, + "loss": 1.3143, + "step": 9246 + }, + { + "epoch": 2.9049217661032256, + "grad_norm": 0.86328125, + "learning_rate": 8.463344969850842e-06, + "loss": 1.1935, + "step": 9248 + }, + { + "epoch": 2.905549993128767, + "grad_norm": 0.85546875, + "learning_rate": 8.460806093303713e-06, + "loss": 1.3033, + "step": 9250 + }, + { + "epoch": 2.9061782201543083, + "grad_norm": 0.859375, + "learning_rate": 8.458267216756586e-06, + "loss": 1.2278, + "step": 9252 + }, + { + "epoch": 2.9068064471798496, + "grad_norm": 0.88671875, + "learning_rate": 8.455728340209458e-06, + "loss": 1.2614, + "step": 9254 + }, + { + "epoch": 2.907434674205391, + "grad_norm": 0.82421875, + "learning_rate": 8.453189463662331e-06, + "loss": 1.2318, + "step": 9256 + }, + { + "epoch": 2.9080629012309323, + "grad_norm": 0.92578125, + "learning_rate": 8.450650587115202e-06, + "loss": 1.1713, + "step": 9258 + }, + { + "epoch": 2.9086911282564736, + "grad_norm": 0.859375, + "learning_rate": 8.448111710568074e-06, + "loss": 1.2783, + "step": 9260 + }, + { + "epoch": 2.909319355282015, + "grad_norm": 0.921875, + "learning_rate": 8.445572834020947e-06, + "loss": 1.1782, + "step": 9262 + }, + { + "epoch": 2.9099475823075562, + "grad_norm": 0.828125, + "learning_rate": 8.443033957473818e-06, + "loss": 1.2585, + "step": 9264 + }, + { + "epoch": 2.910575809333098, + "grad_norm": 0.84765625, + "learning_rate": 8.440495080926691e-06, + "loss": 1.0162, + "step": 9266 + }, + { + "epoch": 2.9112040363586393, + "grad_norm": 0.95703125, + "learning_rate": 8.437956204379563e-06, + "loss": 1.1789, + "step": 9268 + }, + { + "epoch": 2.9118322633841807, + "grad_norm": 0.8671875, + "learning_rate": 8.435417327832434e-06, + "loss": 1.1599, + "step": 9270 + }, + { + "epoch": 2.912460490409722, + "grad_norm": 0.83203125, + "learning_rate": 8.432878451285307e-06, + "loss": 1.2784, + "step": 9272 + }, + { + "epoch": 2.9130887174352633, + "grad_norm": 0.90234375, + "learning_rate": 8.430339574738178e-06, + "loss": 1.2503, + "step": 9274 + }, + { + "epoch": 2.9137169444608046, + "grad_norm": 0.890625, + "learning_rate": 8.427800698191051e-06, + "loss": 1.2095, + "step": 9276 + }, + { + "epoch": 2.914345171486346, + "grad_norm": 0.85546875, + "learning_rate": 8.425261821643923e-06, + "loss": 1.1473, + "step": 9278 + }, + { + "epoch": 2.9149733985118873, + "grad_norm": 0.94140625, + "learning_rate": 8.422722945096796e-06, + "loss": 1.2989, + "step": 9280 + }, + { + "epoch": 2.9156016255374286, + "grad_norm": 0.83984375, + "learning_rate": 8.420184068549667e-06, + "loss": 1.1785, + "step": 9282 + }, + { + "epoch": 2.91622985256297, + "grad_norm": 0.859375, + "learning_rate": 8.417645192002539e-06, + "loss": 1.1599, + "step": 9284 + }, + { + "epoch": 2.9168580795885113, + "grad_norm": 0.8984375, + "learning_rate": 8.415106315455412e-06, + "loss": 1.1402, + "step": 9286 + }, + { + "epoch": 2.9174863066140526, + "grad_norm": 0.80859375, + "learning_rate": 8.412567438908283e-06, + "loss": 1.056, + "step": 9288 + }, + { + "epoch": 2.918114533639594, + "grad_norm": 0.98828125, + "learning_rate": 8.410028562361156e-06, + "loss": 1.1068, + "step": 9290 + }, + { + "epoch": 2.9187427606651353, + "grad_norm": 0.88671875, + "learning_rate": 8.407489685814028e-06, + "loss": 1.1869, + "step": 9292 + }, + { + "epoch": 2.9193709876906766, + "grad_norm": 0.98828125, + "learning_rate": 8.404950809266899e-06, + "loss": 1.145, + "step": 9294 + }, + { + "epoch": 2.919999214716218, + "grad_norm": 0.91015625, + "learning_rate": 8.402411932719772e-06, + "loss": 1.1293, + "step": 9296 + }, + { + "epoch": 2.9206274417417593, + "grad_norm": 0.94921875, + "learning_rate": 8.399873056172643e-06, + "loss": 1.1936, + "step": 9298 + }, + { + "epoch": 2.9212556687673006, + "grad_norm": 0.96484375, + "learning_rate": 8.397334179625517e-06, + "loss": 1.1045, + "step": 9300 + }, + { + "epoch": 2.921883895792842, + "grad_norm": 0.8359375, + "learning_rate": 8.394795303078388e-06, + "loss": 1.2715, + "step": 9302 + }, + { + "epoch": 2.9225121228183832, + "grad_norm": 0.96875, + "learning_rate": 8.392256426531261e-06, + "loss": 1.1606, + "step": 9304 + }, + { + "epoch": 2.923140349843925, + "grad_norm": 0.90625, + "learning_rate": 8.389717549984134e-06, + "loss": 1.0997, + "step": 9306 + }, + { + "epoch": 2.9237685768694663, + "grad_norm": 0.92578125, + "learning_rate": 8.387178673437005e-06, + "loss": 1.0984, + "step": 9308 + }, + { + "epoch": 2.9243968038950077, + "grad_norm": 0.8984375, + "learning_rate": 8.384639796889877e-06, + "loss": 1.2038, + "step": 9310 + }, + { + "epoch": 2.925025030920549, + "grad_norm": 0.890625, + "learning_rate": 8.38210092034275e-06, + "loss": 1.0816, + "step": 9312 + }, + { + "epoch": 2.9256532579460903, + "grad_norm": 0.88671875, + "learning_rate": 8.379562043795621e-06, + "loss": 1.2256, + "step": 9314 + }, + { + "epoch": 2.9262814849716317, + "grad_norm": 0.875, + "learning_rate": 8.377023167248494e-06, + "loss": 1.2699, + "step": 9316 + }, + { + "epoch": 2.926909711997173, + "grad_norm": 0.86328125, + "learning_rate": 8.374484290701366e-06, + "loss": 1.1746, + "step": 9318 + }, + { + "epoch": 2.9275379390227143, + "grad_norm": 0.91015625, + "learning_rate": 8.371945414154237e-06, + "loss": 1.1686, + "step": 9320 + }, + { + "epoch": 2.9281661660482556, + "grad_norm": 0.89453125, + "learning_rate": 8.36940653760711e-06, + "loss": 1.1485, + "step": 9322 + }, + { + "epoch": 2.928794393073797, + "grad_norm": 0.8046875, + "learning_rate": 8.366867661059982e-06, + "loss": 1.2353, + "step": 9324 + }, + { + "epoch": 2.9294226200993383, + "grad_norm": 0.91796875, + "learning_rate": 8.364328784512855e-06, + "loss": 1.2038, + "step": 9326 + }, + { + "epoch": 2.9300508471248796, + "grad_norm": 0.94140625, + "learning_rate": 8.361789907965726e-06, + "loss": 1.2617, + "step": 9328 + }, + { + "epoch": 2.930679074150421, + "grad_norm": 0.87890625, + "learning_rate": 8.359251031418597e-06, + "loss": 1.1843, + "step": 9330 + }, + { + "epoch": 2.9313073011759627, + "grad_norm": 0.88671875, + "learning_rate": 8.35671215487147e-06, + "loss": 1.217, + "step": 9332 + }, + { + "epoch": 2.931935528201504, + "grad_norm": 0.859375, + "learning_rate": 8.354173278324342e-06, + "loss": 1.1822, + "step": 9334 + }, + { + "epoch": 2.9325637552270454, + "grad_norm": 0.9296875, + "learning_rate": 8.351634401777215e-06, + "loss": 1.1071, + "step": 9336 + }, + { + "epoch": 2.9331919822525867, + "grad_norm": 0.8515625, + "learning_rate": 8.349095525230086e-06, + "loss": 1.1019, + "step": 9338 + }, + { + "epoch": 2.933820209278128, + "grad_norm": 0.93359375, + "learning_rate": 8.346556648682958e-06, + "loss": 1.1753, + "step": 9340 + }, + { + "epoch": 2.9344484363036694, + "grad_norm": 0.87109375, + "learning_rate": 8.34401777213583e-06, + "loss": 1.1814, + "step": 9342 + }, + { + "epoch": 2.9350766633292107, + "grad_norm": 1.0546875, + "learning_rate": 8.341478895588702e-06, + "loss": 1.1398, + "step": 9344 + }, + { + "epoch": 2.935704890354752, + "grad_norm": 1.015625, + "learning_rate": 8.338940019041575e-06, + "loss": 1.2175, + "step": 9346 + }, + { + "epoch": 2.9363331173802933, + "grad_norm": 0.92578125, + "learning_rate": 8.336401142494447e-06, + "loss": 1.057, + "step": 9348 + }, + { + "epoch": 2.9369613444058347, + "grad_norm": 0.8671875, + "learning_rate": 8.33386226594732e-06, + "loss": 1.2624, + "step": 9350 + }, + { + "epoch": 2.937589571431376, + "grad_norm": 0.85546875, + "learning_rate": 8.331323389400191e-06, + "loss": 1.1828, + "step": 9352 + }, + { + "epoch": 2.9382177984569173, + "grad_norm": 0.8984375, + "learning_rate": 8.328784512853062e-06, + "loss": 1.2289, + "step": 9354 + }, + { + "epoch": 2.9388460254824587, + "grad_norm": 0.96875, + "learning_rate": 8.326245636305936e-06, + "loss": 1.0668, + "step": 9356 + }, + { + "epoch": 2.939474252508, + "grad_norm": 0.90625, + "learning_rate": 8.323706759758807e-06, + "loss": 1.212, + "step": 9358 + }, + { + "epoch": 2.9401024795335413, + "grad_norm": 0.8671875, + "learning_rate": 8.32116788321168e-06, + "loss": 1.1254, + "step": 9360 + }, + { + "epoch": 2.9407307065590826, + "grad_norm": 0.859375, + "learning_rate": 8.318629006664551e-06, + "loss": 1.048, + "step": 9362 + }, + { + "epoch": 2.941358933584624, + "grad_norm": 0.89453125, + "learning_rate": 8.316090130117423e-06, + "loss": 1.0829, + "step": 9364 + }, + { + "epoch": 2.9419871606101653, + "grad_norm": 0.8671875, + "learning_rate": 8.313551253570296e-06, + "loss": 1.163, + "step": 9366 + }, + { + "epoch": 2.9426153876357066, + "grad_norm": 0.87890625, + "learning_rate": 8.311012377023167e-06, + "loss": 1.3302, + "step": 9368 + }, + { + "epoch": 2.943243614661248, + "grad_norm": 0.8828125, + "learning_rate": 8.30847350047604e-06, + "loss": 1.115, + "step": 9370 + }, + { + "epoch": 2.9438718416867897, + "grad_norm": 0.91796875, + "learning_rate": 8.305934623928912e-06, + "loss": 1.218, + "step": 9372 + }, + { + "epoch": 2.944500068712331, + "grad_norm": 0.89453125, + "learning_rate": 8.303395747381783e-06, + "loss": 1.17, + "step": 9374 + }, + { + "epoch": 2.9451282957378724, + "grad_norm": 0.875, + "learning_rate": 8.300856870834656e-06, + "loss": 1.2241, + "step": 9376 + }, + { + "epoch": 2.9457565227634137, + "grad_norm": 1.0234375, + "learning_rate": 8.298317994287528e-06, + "loss": 1.1651, + "step": 9378 + }, + { + "epoch": 2.946384749788955, + "grad_norm": 0.98828125, + "learning_rate": 8.2957791177404e-06, + "loss": 1.1871, + "step": 9380 + }, + { + "epoch": 2.9470129768144964, + "grad_norm": 1.0546875, + "learning_rate": 8.293240241193272e-06, + "loss": 1.2128, + "step": 9382 + }, + { + "epoch": 2.9476412038400377, + "grad_norm": 0.86328125, + "learning_rate": 8.290701364646145e-06, + "loss": 1.2018, + "step": 9384 + }, + { + "epoch": 2.948269430865579, + "grad_norm": 0.85546875, + "learning_rate": 8.288162488099016e-06, + "loss": 1.2859, + "step": 9386 + }, + { + "epoch": 2.9488976578911203, + "grad_norm": 0.8359375, + "learning_rate": 8.285623611551888e-06, + "loss": 1.2422, + "step": 9388 + }, + { + "epoch": 2.9495258849166617, + "grad_norm": 0.8671875, + "learning_rate": 8.283084735004761e-06, + "loss": 1.1708, + "step": 9390 + }, + { + "epoch": 2.950154111942203, + "grad_norm": 0.8046875, + "learning_rate": 8.280545858457634e-06, + "loss": 1.1229, + "step": 9392 + }, + { + "epoch": 2.9507823389677443, + "grad_norm": 0.92578125, + "learning_rate": 8.278006981910505e-06, + "loss": 1.117, + "step": 9394 + }, + { + "epoch": 2.9514105659932857, + "grad_norm": 0.828125, + "learning_rate": 8.275468105363378e-06, + "loss": 1.3889, + "step": 9396 + }, + { + "epoch": 2.9520387930188274, + "grad_norm": 0.828125, + "learning_rate": 8.27292922881625e-06, + "loss": 1.1619, + "step": 9398 + }, + { + "epoch": 2.9526670200443688, + "grad_norm": 0.93359375, + "learning_rate": 8.270390352269121e-06, + "loss": 1.1436, + "step": 9400 + }, + { + "epoch": 2.95329524706991, + "grad_norm": 0.8984375, + "learning_rate": 8.267851475721994e-06, + "loss": 1.1739, + "step": 9402 + }, + { + "epoch": 2.9539234740954514, + "grad_norm": 0.8984375, + "learning_rate": 8.265312599174866e-06, + "loss": 1.1537, + "step": 9404 + }, + { + "epoch": 2.9545517011209927, + "grad_norm": 0.87890625, + "learning_rate": 8.262773722627739e-06, + "loss": 1.1615, + "step": 9406 + }, + { + "epoch": 2.955179928146534, + "grad_norm": 0.87890625, + "learning_rate": 8.26023484608061e-06, + "loss": 1.2431, + "step": 9408 + }, + { + "epoch": 2.9558081551720754, + "grad_norm": 0.9296875, + "learning_rate": 8.257695969533483e-06, + "loss": 1.1621, + "step": 9410 + }, + { + "epoch": 2.9564363821976167, + "grad_norm": 0.890625, + "learning_rate": 8.255157092986355e-06, + "loss": 1.2451, + "step": 9412 + }, + { + "epoch": 2.957064609223158, + "grad_norm": 0.89453125, + "learning_rate": 8.252618216439226e-06, + "loss": 1.1724, + "step": 9414 + }, + { + "epoch": 2.9576928362486994, + "grad_norm": 0.8671875, + "learning_rate": 8.250079339892099e-06, + "loss": 1.095, + "step": 9416 + }, + { + "epoch": 2.9583210632742407, + "grad_norm": 0.9453125, + "learning_rate": 8.24754046334497e-06, + "loss": 1.1788, + "step": 9418 + }, + { + "epoch": 2.958949290299782, + "grad_norm": 0.84375, + "learning_rate": 8.245001586797843e-06, + "loss": 1.1435, + "step": 9420 + }, + { + "epoch": 2.9595775173253234, + "grad_norm": 0.99609375, + "learning_rate": 8.242462710250715e-06, + "loss": 1.2214, + "step": 9422 + }, + { + "epoch": 2.9602057443508647, + "grad_norm": 0.83984375, + "learning_rate": 8.239923833703586e-06, + "loss": 1.1846, + "step": 9424 + }, + { + "epoch": 2.960833971376406, + "grad_norm": 0.89453125, + "learning_rate": 8.23738495715646e-06, + "loss": 1.1564, + "step": 9426 + }, + { + "epoch": 2.9614621984019474, + "grad_norm": 0.875, + "learning_rate": 8.23484608060933e-06, + "loss": 1.1448, + "step": 9428 + }, + { + "epoch": 2.9620904254274887, + "grad_norm": 0.8046875, + "learning_rate": 8.232307204062204e-06, + "loss": 1.3647, + "step": 9430 + }, + { + "epoch": 2.96271865245303, + "grad_norm": 0.890625, + "learning_rate": 8.229768327515075e-06, + "loss": 1.221, + "step": 9432 + }, + { + "epoch": 2.9633468794785713, + "grad_norm": 0.92578125, + "learning_rate": 8.227229450967947e-06, + "loss": 1.2453, + "step": 9434 + }, + { + "epoch": 2.9639751065041127, + "grad_norm": 0.921875, + "learning_rate": 8.22469057442082e-06, + "loss": 1.1119, + "step": 9436 + }, + { + "epoch": 2.9646033335296544, + "grad_norm": 0.87109375, + "learning_rate": 8.222151697873691e-06, + "loss": 1.2253, + "step": 9438 + }, + { + "epoch": 2.9652315605551958, + "grad_norm": 0.91796875, + "learning_rate": 8.219612821326564e-06, + "loss": 1.191, + "step": 9440 + }, + { + "epoch": 2.965859787580737, + "grad_norm": 0.94140625, + "learning_rate": 8.217073944779435e-06, + "loss": 1.1427, + "step": 9442 + }, + { + "epoch": 2.9664880146062784, + "grad_norm": 0.890625, + "learning_rate": 8.214535068232309e-06, + "loss": 1.2054, + "step": 9444 + }, + { + "epoch": 2.9671162416318198, + "grad_norm": 0.91015625, + "learning_rate": 8.21199619168518e-06, + "loss": 1.2363, + "step": 9446 + }, + { + "epoch": 2.967744468657361, + "grad_norm": 0.91796875, + "learning_rate": 8.209457315138051e-06, + "loss": 1.3343, + "step": 9448 + }, + { + "epoch": 2.9683726956829024, + "grad_norm": 0.8671875, + "learning_rate": 8.206918438590924e-06, + "loss": 1.0634, + "step": 9450 + }, + { + "epoch": 2.9690009227084437, + "grad_norm": 0.8125, + "learning_rate": 8.204379562043796e-06, + "loss": 1.1639, + "step": 9452 + }, + { + "epoch": 2.969629149733985, + "grad_norm": 0.828125, + "learning_rate": 8.201840685496669e-06, + "loss": 1.1921, + "step": 9454 + }, + { + "epoch": 2.9702573767595264, + "grad_norm": 0.8828125, + "learning_rate": 8.19930180894954e-06, + "loss": 1.3581, + "step": 9456 + }, + { + "epoch": 2.9708856037850677, + "grad_norm": 0.921875, + "learning_rate": 8.196762932402412e-06, + "loss": 1.2369, + "step": 9458 + }, + { + "epoch": 2.971513830810609, + "grad_norm": 0.94140625, + "learning_rate": 8.194224055855285e-06, + "loss": 1.0998, + "step": 9460 + }, + { + "epoch": 2.9721420578361504, + "grad_norm": 1.375, + "learning_rate": 8.191685179308156e-06, + "loss": 1.0054, + "step": 9462 + }, + { + "epoch": 2.972770284861692, + "grad_norm": 0.8515625, + "learning_rate": 8.189146302761029e-06, + "loss": 1.2289, + "step": 9464 + }, + { + "epoch": 2.9733985118872335, + "grad_norm": 0.8359375, + "learning_rate": 8.1866074262139e-06, + "loss": 1.0944, + "step": 9466 + }, + { + "epoch": 2.974026738912775, + "grad_norm": 0.90625, + "learning_rate": 8.184068549666772e-06, + "loss": 1.168, + "step": 9468 + }, + { + "epoch": 2.974654965938316, + "grad_norm": 0.8203125, + "learning_rate": 8.181529673119645e-06, + "loss": 1.2147, + "step": 9470 + }, + { + "epoch": 2.9752831929638575, + "grad_norm": 0.9609375, + "learning_rate": 8.178990796572516e-06, + "loss": 1.2055, + "step": 9472 + }, + { + "epoch": 2.975911419989399, + "grad_norm": 0.8203125, + "learning_rate": 8.17645192002539e-06, + "loss": 1.2641, + "step": 9474 + }, + { + "epoch": 2.97653964701494, + "grad_norm": 0.88671875, + "learning_rate": 8.173913043478263e-06, + "loss": 1.2016, + "step": 9476 + }, + { + "epoch": 2.9771678740404814, + "grad_norm": 0.93359375, + "learning_rate": 8.171374166931134e-06, + "loss": 1.2389, + "step": 9478 + }, + { + "epoch": 2.9777961010660228, + "grad_norm": 0.83984375, + "learning_rate": 8.168835290384007e-06, + "loss": 1.3398, + "step": 9480 + }, + { + "epoch": 2.978424328091564, + "grad_norm": 0.8203125, + "learning_rate": 8.166296413836878e-06, + "loss": 1.3117, + "step": 9482 + }, + { + "epoch": 2.9790525551171054, + "grad_norm": 0.82421875, + "learning_rate": 8.16375753728975e-06, + "loss": 1.1342, + "step": 9484 + }, + { + "epoch": 2.9796807821426468, + "grad_norm": 0.82421875, + "learning_rate": 8.161218660742623e-06, + "loss": 1.183, + "step": 9486 + }, + { + "epoch": 2.980309009168188, + "grad_norm": 0.8203125, + "learning_rate": 8.158679784195494e-06, + "loss": 1.1521, + "step": 9488 + }, + { + "epoch": 2.9809372361937294, + "grad_norm": 0.91796875, + "learning_rate": 8.156140907648367e-06, + "loss": 1.0698, + "step": 9490 + }, + { + "epoch": 2.9815654632192707, + "grad_norm": 0.8671875, + "learning_rate": 8.153602031101239e-06, + "loss": 1.1782, + "step": 9492 + }, + { + "epoch": 2.982193690244812, + "grad_norm": 0.86328125, + "learning_rate": 8.15106315455411e-06, + "loss": 1.2666, + "step": 9494 + }, + { + "epoch": 2.9828219172703534, + "grad_norm": 0.95703125, + "learning_rate": 8.148524278006983e-06, + "loss": 1.3233, + "step": 9496 + }, + { + "epoch": 2.9834501442958947, + "grad_norm": 0.8828125, + "learning_rate": 8.145985401459855e-06, + "loss": 1.1066, + "step": 9498 + }, + { + "epoch": 2.984078371321436, + "grad_norm": 0.953125, + "learning_rate": 8.143446524912728e-06, + "loss": 1.1686, + "step": 9500 + } + ], + "logging_steps": 2, + "max_steps": 15915, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.572975921220813e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}