diff --git "a/e2.0/trainer_state.json" "b/e2.0/trainer_state.json" deleted file mode 100644--- "a/e2.0/trainer_state.json" +++ /dev/null @@ -1,23133 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.0731491842864713, - "eval_steps": 500, - "global_step": 6600, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.000628227025541355, - "grad_norm": 37.5, - "learning_rate": 2.5000000000000004e-07, - "loss": 1.9002, - "step": 2 - }, - { - "epoch": 0.00125645405108271, - "grad_norm": 5.65625, - "learning_rate": 5.000000000000001e-07, - "loss": 1.6234, - "step": 4 - }, - { - "epoch": 0.001884681076624065, - "grad_norm": 5.15625, - "learning_rate": 7.5e-07, - "loss": 1.6223, - "step": 6 - }, - { - "epoch": 0.00251290810216542, - "grad_norm": 4.8125, - "learning_rate": 1.0000000000000002e-06, - "loss": 1.6607, - "step": 8 - }, - { - "epoch": 0.003141135127706775, - "grad_norm": 5.03125, - "learning_rate": 1.25e-06, - "loss": 1.6644, - "step": 10 - }, - { - "epoch": 0.00376936215324813, - "grad_norm": 5.71875, - "learning_rate": 1.5e-06, - "loss": 1.8307, - "step": 12 - }, - { - "epoch": 0.004397589178789485, - "grad_norm": 5.28125, - "learning_rate": 1.75e-06, - "loss": 1.623, - "step": 14 - }, - { - "epoch": 0.00502581620433084, - "grad_norm": 3.625, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.6915, - "step": 16 - }, - { - "epoch": 0.005654043229872195, - "grad_norm": 3.609375, - "learning_rate": 2.25e-06, - "loss": 1.8222, - "step": 18 - }, - { - "epoch": 0.00628227025541355, - "grad_norm": 3.21875, - "learning_rate": 2.5e-06, - "loss": 1.6702, - "step": 20 - }, - { - "epoch": 0.006910497280954905, - "grad_norm": 2.140625, - "learning_rate": 2.7500000000000004e-06, - "loss": 1.6118, - "step": 22 - }, - { - "epoch": 0.00753872430649626, - "grad_norm": 2.890625, - "learning_rate": 3e-06, - "loss": 1.6499, - "step": 24 - }, - { - "epoch": 0.008166951332037615, - "grad_norm": 2.421875, - "learning_rate": 3.2500000000000002e-06, - "loss": 1.4785, - "step": 26 - }, - { - "epoch": 0.00879517835757897, - "grad_norm": 2.421875, - "learning_rate": 3.5e-06, - "loss": 1.7235, - "step": 28 - }, - { - "epoch": 0.009423405383120325, - "grad_norm": 2.25, - "learning_rate": 3.7500000000000005e-06, - "loss": 1.6245, - "step": 30 - }, - { - "epoch": 0.01005163240866168, - "grad_norm": 1.6640625, - "learning_rate": 4.000000000000001e-06, - "loss": 1.5486, - "step": 32 - }, - { - "epoch": 0.010679859434203035, - "grad_norm": 2.09375, - "learning_rate": 4.25e-06, - "loss": 1.4448, - "step": 34 - }, - { - "epoch": 0.01130808645974439, - "grad_norm": 1.5078125, - "learning_rate": 4.5e-06, - "loss": 1.48, - "step": 36 - }, - { - "epoch": 0.011936313485285744, - "grad_norm": 1.6796875, - "learning_rate": 4.75e-06, - "loss": 1.5128, - "step": 38 - }, - { - "epoch": 0.0125645405108271, - "grad_norm": 1.5234375, - "learning_rate": 5e-06, - "loss": 1.4774, - "step": 40 - }, - { - "epoch": 0.013192767536368456, - "grad_norm": 1.546875, - "learning_rate": 5.2500000000000006e-06, - "loss": 1.3692, - "step": 42 - }, - { - "epoch": 0.01382099456190981, - "grad_norm": 1.34375, - "learning_rate": 5.500000000000001e-06, - "loss": 1.5056, - "step": 44 - }, - { - "epoch": 0.014449221587451166, - "grad_norm": 1.21875, - "learning_rate": 5.75e-06, - "loss": 1.4744, - "step": 46 - }, - { - "epoch": 0.01507744861299252, - "grad_norm": 1.0078125, - "learning_rate": 6e-06, - "loss": 1.5776, - "step": 48 - }, - { - "epoch": 0.015705675638533874, - "grad_norm": 1.1015625, - "learning_rate": 6.25e-06, - "loss": 1.486, - "step": 50 - }, - { - "epoch": 0.01633390266407523, - "grad_norm": 0.85546875, - "learning_rate": 6.5000000000000004e-06, - "loss": 1.5541, - "step": 52 - }, - { - "epoch": 0.016962129689616585, - "grad_norm": 0.984375, - "learning_rate": 6.750000000000001e-06, - "loss": 1.3999, - "step": 54 - }, - { - "epoch": 0.01759035671515794, - "grad_norm": 0.96875, - "learning_rate": 7e-06, - "loss": 1.4455, - "step": 56 - }, - { - "epoch": 0.018218583740699294, - "grad_norm": 1.0546875, - "learning_rate": 7.25e-06, - "loss": 1.5221, - "step": 58 - }, - { - "epoch": 0.01884681076624065, - "grad_norm": 0.88671875, - "learning_rate": 7.500000000000001e-06, - "loss": 1.4798, - "step": 60 - }, - { - "epoch": 0.019475037791782005, - "grad_norm": 0.9140625, - "learning_rate": 7.75e-06, - "loss": 1.4334, - "step": 62 - }, - { - "epoch": 0.02010326481732336, - "grad_norm": 0.98046875, - "learning_rate": 8.000000000000001e-06, - "loss": 1.3293, - "step": 64 - }, - { - "epoch": 0.020731491842864717, - "grad_norm": 0.83984375, - "learning_rate": 8.25e-06, - "loss": 1.4174, - "step": 66 - }, - { - "epoch": 0.02135971886840607, - "grad_norm": 0.859375, - "learning_rate": 8.5e-06, - "loss": 1.4177, - "step": 68 - }, - { - "epoch": 0.021987945893947425, - "grad_norm": 0.84765625, - "learning_rate": 8.750000000000001e-06, - "loss": 1.3708, - "step": 70 - }, - { - "epoch": 0.02261617291948878, - "grad_norm": 0.90234375, - "learning_rate": 9e-06, - "loss": 1.4062, - "step": 72 - }, - { - "epoch": 0.023244399945030136, - "grad_norm": 0.83984375, - "learning_rate": 9.250000000000001e-06, - "loss": 1.3829, - "step": 74 - }, - { - "epoch": 0.02387262697057149, - "grad_norm": 0.9375, - "learning_rate": 9.5e-06, - "loss": 1.3441, - "step": 76 - }, - { - "epoch": 0.024500853996112845, - "grad_norm": 0.90234375, - "learning_rate": 9.75e-06, - "loss": 1.5625, - "step": 78 - }, - { - "epoch": 0.0251290810216542, - "grad_norm": 0.81640625, - "learning_rate": 1e-05, - "loss": 1.4504, - "step": 80 - }, - { - "epoch": 0.025757308047195556, - "grad_norm": 0.83984375, - "learning_rate": 1.025e-05, - "loss": 1.4672, - "step": 82 - }, - { - "epoch": 0.026385535072736912, - "grad_norm": 0.82421875, - "learning_rate": 1.0500000000000001e-05, - "loss": 1.4405, - "step": 84 - }, - { - "epoch": 0.027013762098278264, - "grad_norm": 1.0546875, - "learning_rate": 1.075e-05, - "loss": 1.3557, - "step": 86 - }, - { - "epoch": 0.02764198912381962, - "grad_norm": 0.90234375, - "learning_rate": 1.1000000000000001e-05, - "loss": 1.4309, - "step": 88 - }, - { - "epoch": 0.028270216149360976, - "grad_norm": 0.78125, - "learning_rate": 1.125e-05, - "loss": 1.3528, - "step": 90 - }, - { - "epoch": 0.02889844317490233, - "grad_norm": 0.96875, - "learning_rate": 1.15e-05, - "loss": 1.4093, - "step": 92 - }, - { - "epoch": 0.029526670200443684, - "grad_norm": 0.87890625, - "learning_rate": 1.1750000000000001e-05, - "loss": 1.4324, - "step": 94 - }, - { - "epoch": 0.03015489722598504, - "grad_norm": 0.875, - "learning_rate": 1.2e-05, - "loss": 1.4622, - "step": 96 - }, - { - "epoch": 0.030783124251526395, - "grad_norm": 0.85546875, - "learning_rate": 1.2250000000000001e-05, - "loss": 1.5166, - "step": 98 - }, - { - "epoch": 0.03141135127706775, - "grad_norm": 0.76953125, - "learning_rate": 1.25e-05, - "loss": 1.4729, - "step": 100 - }, - { - "epoch": 0.032039578302609104, - "grad_norm": 0.8828125, - "learning_rate": 1.275e-05, - "loss": 1.4201, - "step": 102 - }, - { - "epoch": 0.03266780532815046, - "grad_norm": 0.97265625, - "learning_rate": 1.3000000000000001e-05, - "loss": 1.3646, - "step": 104 - }, - { - "epoch": 0.033296032353691815, - "grad_norm": 0.859375, - "learning_rate": 1.325e-05, - "loss": 1.3105, - "step": 106 - }, - { - "epoch": 0.03392425937923317, - "grad_norm": 0.78125, - "learning_rate": 1.3500000000000001e-05, - "loss": 1.5302, - "step": 108 - }, - { - "epoch": 0.03455248640477453, - "grad_norm": 0.875, - "learning_rate": 1.375e-05, - "loss": 1.3979, - "step": 110 - }, - { - "epoch": 0.03518071343031588, - "grad_norm": 0.796875, - "learning_rate": 1.4e-05, - "loss": 1.3961, - "step": 112 - }, - { - "epoch": 0.03580894045585724, - "grad_norm": 0.796875, - "learning_rate": 1.425e-05, - "loss": 1.3645, - "step": 114 - }, - { - "epoch": 0.03643716748139859, - "grad_norm": 0.7421875, - "learning_rate": 1.45e-05, - "loss": 1.306, - "step": 116 - }, - { - "epoch": 0.03706539450693994, - "grad_norm": 0.8828125, - "learning_rate": 1.4750000000000003e-05, - "loss": 1.3799, - "step": 118 - }, - { - "epoch": 0.0376936215324813, - "grad_norm": 0.73828125, - "learning_rate": 1.5000000000000002e-05, - "loss": 1.3281, - "step": 120 - }, - { - "epoch": 0.038321848558022654, - "grad_norm": 0.87890625, - "learning_rate": 1.525e-05, - "loss": 1.4052, - "step": 122 - }, - { - "epoch": 0.03895007558356401, - "grad_norm": 0.8203125, - "learning_rate": 1.55e-05, - "loss": 1.4946, - "step": 124 - }, - { - "epoch": 0.039578302609105366, - "grad_norm": 0.80859375, - "learning_rate": 1.575e-05, - "loss": 1.4292, - "step": 126 - }, - { - "epoch": 0.04020652963464672, - "grad_norm": 1.0078125, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.4858, - "step": 128 - }, - { - "epoch": 0.04083475666018808, - "grad_norm": 0.9765625, - "learning_rate": 1.6250000000000002e-05, - "loss": 1.2745, - "step": 130 - }, - { - "epoch": 0.04146298368572943, - "grad_norm": 0.8046875, - "learning_rate": 1.65e-05, - "loss": 1.4684, - "step": 132 - }, - { - "epoch": 0.04209121071127078, - "grad_norm": 0.80859375, - "learning_rate": 1.675e-05, - "loss": 1.4275, - "step": 134 - }, - { - "epoch": 0.04271943773681214, - "grad_norm": 0.90625, - "learning_rate": 1.7e-05, - "loss": 1.2831, - "step": 136 - }, - { - "epoch": 0.043347664762353494, - "grad_norm": 0.953125, - "learning_rate": 1.7250000000000003e-05, - "loss": 1.445, - "step": 138 - }, - { - "epoch": 0.04397589178789485, - "grad_norm": 0.80859375, - "learning_rate": 1.7500000000000002e-05, - "loss": 1.3457, - "step": 140 - }, - { - "epoch": 0.044604118813436205, - "grad_norm": 0.8046875, - "learning_rate": 1.775e-05, - "loss": 1.3961, - "step": 142 - }, - { - "epoch": 0.04523234583897756, - "grad_norm": 0.75, - "learning_rate": 1.8e-05, - "loss": 1.2985, - "step": 144 - }, - { - "epoch": 0.04586057286451892, - "grad_norm": 0.81640625, - "learning_rate": 1.825e-05, - "loss": 1.3075, - "step": 146 - }, - { - "epoch": 0.04648879989006027, - "grad_norm": 0.76953125, - "learning_rate": 1.8500000000000002e-05, - "loss": 1.3602, - "step": 148 - }, - { - "epoch": 0.04711702691560163, - "grad_norm": 0.8828125, - "learning_rate": 1.8750000000000002e-05, - "loss": 1.4481, - "step": 150 - }, - { - "epoch": 0.04774525394114298, - "grad_norm": 0.80078125, - "learning_rate": 1.9e-05, - "loss": 1.409, - "step": 152 - }, - { - "epoch": 0.04837348096668433, - "grad_norm": 0.80859375, - "learning_rate": 1.925e-05, - "loss": 1.357, - "step": 154 - }, - { - "epoch": 0.04900170799222569, - "grad_norm": 0.77734375, - "learning_rate": 1.95e-05, - "loss": 1.2841, - "step": 156 - }, - { - "epoch": 0.049629935017767045, - "grad_norm": 0.7421875, - "learning_rate": 1.9750000000000002e-05, - "loss": 1.4336, - "step": 158 - }, - { - "epoch": 0.0502581620433084, - "grad_norm": 0.9609375, - "learning_rate": 2e-05, - "loss": 1.3853, - "step": 160 - }, - { - "epoch": 0.050886389068849756, - "grad_norm": 0.89453125, - "learning_rate": 1.9997461123452876e-05, - "loss": 1.3465, - "step": 162 - }, - { - "epoch": 0.05151461609439111, - "grad_norm": 0.84375, - "learning_rate": 1.9994922246905744e-05, - "loss": 1.3669, - "step": 164 - }, - { - "epoch": 0.05214284311993247, - "grad_norm": 0.7421875, - "learning_rate": 1.999238337035862e-05, - "loss": 1.6046, - "step": 166 - }, - { - "epoch": 0.052771070145473824, - "grad_norm": 0.81640625, - "learning_rate": 1.998984449381149e-05, - "loss": 1.2993, - "step": 168 - }, - { - "epoch": 0.05339929717101517, - "grad_norm": 0.79296875, - "learning_rate": 1.9987305617264362e-05, - "loss": 1.4495, - "step": 170 - }, - { - "epoch": 0.05402752419655653, - "grad_norm": 0.7734375, - "learning_rate": 1.9984766740717233e-05, - "loss": 1.3141, - "step": 172 - }, - { - "epoch": 0.054655751222097884, - "grad_norm": 0.71484375, - "learning_rate": 1.9982227864170108e-05, - "loss": 1.4852, - "step": 174 - }, - { - "epoch": 0.05528397824763924, - "grad_norm": 0.875, - "learning_rate": 1.997968898762298e-05, - "loss": 1.4228, - "step": 176 - }, - { - "epoch": 0.055912205273180596, - "grad_norm": 0.80078125, - "learning_rate": 1.997715011107585e-05, - "loss": 1.5617, - "step": 178 - }, - { - "epoch": 0.05654043229872195, - "grad_norm": 0.82421875, - "learning_rate": 1.9974611234528722e-05, - "loss": 1.2938, - "step": 180 - }, - { - "epoch": 0.05716865932426331, - "grad_norm": 0.7421875, - "learning_rate": 1.9972072357981597e-05, - "loss": 1.469, - "step": 182 - }, - { - "epoch": 0.05779688634980466, - "grad_norm": 0.82421875, - "learning_rate": 1.9969533481434465e-05, - "loss": 1.41, - "step": 184 - }, - { - "epoch": 0.05842511337534602, - "grad_norm": 0.7265625, - "learning_rate": 1.996699460488734e-05, - "loss": 1.4414, - "step": 186 - }, - { - "epoch": 0.05905334040088737, - "grad_norm": 0.8828125, - "learning_rate": 1.996445572834021e-05, - "loss": 1.2717, - "step": 188 - }, - { - "epoch": 0.059681567426428724, - "grad_norm": 0.7890625, - "learning_rate": 1.9961916851793083e-05, - "loss": 1.3179, - "step": 190 - }, - { - "epoch": 0.06030979445197008, - "grad_norm": 0.88671875, - "learning_rate": 1.9959377975245954e-05, - "loss": 1.4353, - "step": 192 - }, - { - "epoch": 0.060938021477511435, - "grad_norm": 0.79296875, - "learning_rate": 1.995683909869883e-05, - "loss": 1.4721, - "step": 194 - }, - { - "epoch": 0.06156624850305279, - "grad_norm": 0.7890625, - "learning_rate": 1.99543002221517e-05, - "loss": 1.4394, - "step": 196 - }, - { - "epoch": 0.06219447552859415, - "grad_norm": 0.765625, - "learning_rate": 1.995176134560457e-05, - "loss": 1.4004, - "step": 198 - }, - { - "epoch": 0.0628227025541355, - "grad_norm": 0.703125, - "learning_rate": 1.9949222469057443e-05, - "loss": 1.3159, - "step": 200 - }, - { - "epoch": 0.06345092957967685, - "grad_norm": 0.7890625, - "learning_rate": 1.9946683592510318e-05, - "loss": 1.408, - "step": 202 - }, - { - "epoch": 0.06407915660521821, - "grad_norm": 0.77734375, - "learning_rate": 1.994414471596319e-05, - "loss": 1.362, - "step": 204 - }, - { - "epoch": 0.06470738363075956, - "grad_norm": 0.7421875, - "learning_rate": 1.994160583941606e-05, - "loss": 1.321, - "step": 206 - }, - { - "epoch": 0.06533561065630092, - "grad_norm": 0.76953125, - "learning_rate": 1.9939066962868932e-05, - "loss": 1.3576, - "step": 208 - }, - { - "epoch": 0.06596383768184227, - "grad_norm": 0.77734375, - "learning_rate": 1.9936528086321803e-05, - "loss": 1.4552, - "step": 210 - }, - { - "epoch": 0.06659206470738363, - "grad_norm": 0.875, - "learning_rate": 1.9933989209774675e-05, - "loss": 1.3144, - "step": 212 - }, - { - "epoch": 0.06722029173292499, - "grad_norm": 0.8515625, - "learning_rate": 1.993145033322755e-05, - "loss": 1.3575, - "step": 214 - }, - { - "epoch": 0.06784851875846634, - "grad_norm": 0.828125, - "learning_rate": 1.992891145668042e-05, - "loss": 1.25, - "step": 216 - }, - { - "epoch": 0.0684767457840077, - "grad_norm": 0.75, - "learning_rate": 1.9926372580133292e-05, - "loss": 1.4611, - "step": 218 - }, - { - "epoch": 0.06910497280954905, - "grad_norm": 0.73828125, - "learning_rate": 1.9923833703586163e-05, - "loss": 1.2994, - "step": 220 - }, - { - "epoch": 0.06973319983509041, - "grad_norm": 0.9375, - "learning_rate": 1.9921294827039038e-05, - "loss": 1.2697, - "step": 222 - }, - { - "epoch": 0.07036142686063176, - "grad_norm": 0.81640625, - "learning_rate": 1.991875595049191e-05, - "loss": 1.4699, - "step": 224 - }, - { - "epoch": 0.07098965388617312, - "grad_norm": 0.92578125, - "learning_rate": 1.991621707394478e-05, - "loss": 1.4276, - "step": 226 - }, - { - "epoch": 0.07161788091171448, - "grad_norm": 0.74609375, - "learning_rate": 1.9913678197397652e-05, - "loss": 1.4052, - "step": 228 - }, - { - "epoch": 0.07224610793725583, - "grad_norm": 0.8203125, - "learning_rate": 1.9911139320850527e-05, - "loss": 1.3744, - "step": 230 - }, - { - "epoch": 0.07287433496279717, - "grad_norm": 0.87109375, - "learning_rate": 1.9908600444303395e-05, - "loss": 1.363, - "step": 232 - }, - { - "epoch": 0.07350256198833853, - "grad_norm": 0.859375, - "learning_rate": 1.990606156775627e-05, - "loss": 1.421, - "step": 234 - }, - { - "epoch": 0.07413078901387989, - "grad_norm": 0.76171875, - "learning_rate": 1.990352269120914e-05, - "loss": 1.5131, - "step": 236 - }, - { - "epoch": 0.07475901603942124, - "grad_norm": 1.5625, - "learning_rate": 1.9900983814662013e-05, - "loss": 1.3155, - "step": 238 - }, - { - "epoch": 0.0753872430649626, - "grad_norm": 0.78125, - "learning_rate": 1.9898444938114884e-05, - "loss": 1.3595, - "step": 240 - }, - { - "epoch": 0.07601547009050395, - "grad_norm": 0.7890625, - "learning_rate": 1.989590606156776e-05, - "loss": 1.3706, - "step": 242 - }, - { - "epoch": 0.07664369711604531, - "grad_norm": 0.72265625, - "learning_rate": 1.989336718502063e-05, - "loss": 1.3058, - "step": 244 - }, - { - "epoch": 0.07727192414158666, - "grad_norm": 0.71484375, - "learning_rate": 1.98908283084735e-05, - "loss": 1.3404, - "step": 246 - }, - { - "epoch": 0.07790015116712802, - "grad_norm": 0.9453125, - "learning_rate": 1.9888289431926376e-05, - "loss": 1.235, - "step": 248 - }, - { - "epoch": 0.07852837819266938, - "grad_norm": 0.84765625, - "learning_rate": 1.9885750555379248e-05, - "loss": 1.3668, - "step": 250 - }, - { - "epoch": 0.07915660521821073, - "grad_norm": 0.71875, - "learning_rate": 1.988321167883212e-05, - "loss": 1.3602, - "step": 252 - }, - { - "epoch": 0.07978483224375209, - "grad_norm": 0.828125, - "learning_rate": 1.988067280228499e-05, - "loss": 1.3833, - "step": 254 - }, - { - "epoch": 0.08041305926929344, - "grad_norm": 0.796875, - "learning_rate": 1.9878133925737865e-05, - "loss": 1.4476, - "step": 256 - }, - { - "epoch": 0.0810412862948348, - "grad_norm": 0.68359375, - "learning_rate": 1.9875595049190733e-05, - "loss": 1.4111, - "step": 258 - }, - { - "epoch": 0.08166951332037616, - "grad_norm": 0.88671875, - "learning_rate": 1.9873056172643608e-05, - "loss": 1.3636, - "step": 260 - }, - { - "epoch": 0.08229774034591751, - "grad_norm": 0.78515625, - "learning_rate": 1.987051729609648e-05, - "loss": 1.2524, - "step": 262 - }, - { - "epoch": 0.08292596737145887, - "grad_norm": 0.9609375, - "learning_rate": 1.986797841954935e-05, - "loss": 1.4048, - "step": 264 - }, - { - "epoch": 0.08355419439700022, - "grad_norm": 0.7109375, - "learning_rate": 1.9865439543002222e-05, - "loss": 1.3619, - "step": 266 - }, - { - "epoch": 0.08418242142254156, - "grad_norm": 0.796875, - "learning_rate": 1.9862900666455097e-05, - "loss": 1.4125, - "step": 268 - }, - { - "epoch": 0.08481064844808292, - "grad_norm": 0.6875, - "learning_rate": 1.986036178990797e-05, - "loss": 1.4536, - "step": 270 - }, - { - "epoch": 0.08543887547362428, - "grad_norm": 0.8125, - "learning_rate": 1.985782291336084e-05, - "loss": 1.392, - "step": 272 - }, - { - "epoch": 0.08606710249916563, - "grad_norm": 0.77734375, - "learning_rate": 1.985528403681371e-05, - "loss": 1.393, - "step": 274 - }, - { - "epoch": 0.08669532952470699, - "grad_norm": 0.91015625, - "learning_rate": 1.9852745160266586e-05, - "loss": 1.3635, - "step": 276 - }, - { - "epoch": 0.08732355655024834, - "grad_norm": 0.75, - "learning_rate": 1.9850206283719454e-05, - "loss": 1.4626, - "step": 278 - }, - { - "epoch": 0.0879517835757897, - "grad_norm": 0.8671875, - "learning_rate": 1.984766740717233e-05, - "loss": 1.3507, - "step": 280 - }, - { - "epoch": 0.08858001060133106, - "grad_norm": 0.83203125, - "learning_rate": 1.98451285306252e-05, - "loss": 1.4432, - "step": 282 - }, - { - "epoch": 0.08920823762687241, - "grad_norm": 0.83203125, - "learning_rate": 1.984258965407807e-05, - "loss": 1.3932, - "step": 284 - }, - { - "epoch": 0.08983646465241377, - "grad_norm": 0.8203125, - "learning_rate": 1.9840050777530943e-05, - "loss": 1.391, - "step": 286 - }, - { - "epoch": 0.09046469167795512, - "grad_norm": 0.7109375, - "learning_rate": 1.9837511900983818e-05, - "loss": 1.4163, - "step": 288 - }, - { - "epoch": 0.09109291870349648, - "grad_norm": 1.171875, - "learning_rate": 1.983497302443669e-05, - "loss": 1.4135, - "step": 290 - }, - { - "epoch": 0.09172114572903783, - "grad_norm": 0.8515625, - "learning_rate": 1.983243414788956e-05, - "loss": 1.4099, - "step": 292 - }, - { - "epoch": 0.09234937275457919, - "grad_norm": 0.76171875, - "learning_rate": 1.982989527134243e-05, - "loss": 1.2512, - "step": 294 - }, - { - "epoch": 0.09297759978012055, - "grad_norm": 0.734375, - "learning_rate": 1.9827356394795306e-05, - "loss": 1.255, - "step": 296 - }, - { - "epoch": 0.0936058268056619, - "grad_norm": 0.87109375, - "learning_rate": 1.9824817518248174e-05, - "loss": 1.2295, - "step": 298 - }, - { - "epoch": 0.09423405383120326, - "grad_norm": 0.765625, - "learning_rate": 1.982227864170105e-05, - "loss": 1.4514, - "step": 300 - }, - { - "epoch": 0.09486228085674461, - "grad_norm": 0.8828125, - "learning_rate": 1.981973976515392e-05, - "loss": 1.3137, - "step": 302 - }, - { - "epoch": 0.09549050788228595, - "grad_norm": 0.86328125, - "learning_rate": 1.9817200888606792e-05, - "loss": 1.3511, - "step": 304 - }, - { - "epoch": 0.09611873490782731, - "grad_norm": 0.85546875, - "learning_rate": 1.9814662012059663e-05, - "loss": 1.3035, - "step": 306 - }, - { - "epoch": 0.09674696193336867, - "grad_norm": 0.734375, - "learning_rate": 1.9812123135512538e-05, - "loss": 1.4151, - "step": 308 - }, - { - "epoch": 0.09737518895891002, - "grad_norm": 0.79296875, - "learning_rate": 1.980958425896541e-05, - "loss": 1.3819, - "step": 310 - }, - { - "epoch": 0.09800341598445138, - "grad_norm": 0.76953125, - "learning_rate": 1.980704538241828e-05, - "loss": 1.3212, - "step": 312 - }, - { - "epoch": 0.09863164300999273, - "grad_norm": 0.86328125, - "learning_rate": 1.9804506505871152e-05, - "loss": 1.4313, - "step": 314 - }, - { - "epoch": 0.09925987003553409, - "grad_norm": 0.828125, - "learning_rate": 1.9801967629324027e-05, - "loss": 1.4021, - "step": 316 - }, - { - "epoch": 0.09988809706107545, - "grad_norm": 0.7578125, - "learning_rate": 1.97994287527769e-05, - "loss": 1.3191, - "step": 318 - }, - { - "epoch": 0.1005163240866168, - "grad_norm": 0.734375, - "learning_rate": 1.979688987622977e-05, - "loss": 1.3138, - "step": 320 - }, - { - "epoch": 0.10114455111215816, - "grad_norm": 0.78125, - "learning_rate": 1.979435099968264e-05, - "loss": 1.3881, - "step": 322 - }, - { - "epoch": 0.10177277813769951, - "grad_norm": 0.7265625, - "learning_rate": 1.9791812123135513e-05, - "loss": 1.1786, - "step": 324 - }, - { - "epoch": 0.10240100516324087, - "grad_norm": 0.73828125, - "learning_rate": 1.9789273246588384e-05, - "loss": 1.246, - "step": 326 - }, - { - "epoch": 0.10302923218878222, - "grad_norm": 0.8515625, - "learning_rate": 1.978673437004126e-05, - "loss": 1.359, - "step": 328 - }, - { - "epoch": 0.10365745921432358, - "grad_norm": 0.91796875, - "learning_rate": 1.978419549349413e-05, - "loss": 1.271, - "step": 330 - }, - { - "epoch": 0.10428568623986494, - "grad_norm": 0.75, - "learning_rate": 1.9781656616947e-05, - "loss": 1.4137, - "step": 332 - }, - { - "epoch": 0.10491391326540629, - "grad_norm": 0.75390625, - "learning_rate": 1.9779117740399876e-05, - "loss": 1.3471, - "step": 334 - }, - { - "epoch": 0.10554214029094765, - "grad_norm": 0.70703125, - "learning_rate": 1.9776578863852748e-05, - "loss": 1.4421, - "step": 336 - }, - { - "epoch": 0.10617036731648899, - "grad_norm": 0.73828125, - "learning_rate": 1.977403998730562e-05, - "loss": 1.2823, - "step": 338 - }, - { - "epoch": 0.10679859434203035, - "grad_norm": 0.76171875, - "learning_rate": 1.977150111075849e-05, - "loss": 1.463, - "step": 340 - }, - { - "epoch": 0.1074268213675717, - "grad_norm": 0.74609375, - "learning_rate": 1.9768962234211365e-05, - "loss": 1.2987, - "step": 342 - }, - { - "epoch": 0.10805504839311306, - "grad_norm": 0.88671875, - "learning_rate": 1.9766423357664237e-05, - "loss": 1.4113, - "step": 344 - }, - { - "epoch": 0.10868327541865441, - "grad_norm": 0.7578125, - "learning_rate": 1.9763884481117108e-05, - "loss": 1.4153, - "step": 346 - }, - { - "epoch": 0.10931150244419577, - "grad_norm": 0.703125, - "learning_rate": 1.976134560456998e-05, - "loss": 1.3976, - "step": 348 - }, - { - "epoch": 0.10993972946973712, - "grad_norm": 0.78515625, - "learning_rate": 1.975880672802285e-05, - "loss": 1.395, - "step": 350 - }, - { - "epoch": 0.11056795649527848, - "grad_norm": 0.83984375, - "learning_rate": 1.9756267851475722e-05, - "loss": 1.533, - "step": 352 - }, - { - "epoch": 0.11119618352081984, - "grad_norm": 0.796875, - "learning_rate": 1.9753728974928597e-05, - "loss": 1.3265, - "step": 354 - }, - { - "epoch": 0.11182441054636119, - "grad_norm": 0.76171875, - "learning_rate": 1.9751190098381468e-05, - "loss": 1.4088, - "step": 356 - }, - { - "epoch": 0.11245263757190255, - "grad_norm": 0.8671875, - "learning_rate": 1.974865122183434e-05, - "loss": 1.4432, - "step": 358 - }, - { - "epoch": 0.1130808645974439, - "grad_norm": 0.984375, - "learning_rate": 1.974611234528721e-05, - "loss": 1.2292, - "step": 360 - }, - { - "epoch": 0.11370909162298526, - "grad_norm": 0.73828125, - "learning_rate": 1.9743573468740086e-05, - "loss": 1.3708, - "step": 362 - }, - { - "epoch": 0.11433731864852661, - "grad_norm": 0.73046875, - "learning_rate": 1.9741034592192957e-05, - "loss": 1.2918, - "step": 364 - }, - { - "epoch": 0.11496554567406797, - "grad_norm": 0.79296875, - "learning_rate": 1.973849571564583e-05, - "loss": 1.4335, - "step": 366 - }, - { - "epoch": 0.11559377269960933, - "grad_norm": 0.78515625, - "learning_rate": 1.97359568390987e-05, - "loss": 1.2187, - "step": 368 - }, - { - "epoch": 0.11622199972515068, - "grad_norm": 10.625, - "learning_rate": 1.9733417962551575e-05, - "loss": 1.2494, - "step": 370 - }, - { - "epoch": 0.11685022675069204, - "grad_norm": 0.75, - "learning_rate": 1.9730879086004443e-05, - "loss": 1.2348, - "step": 372 - }, - { - "epoch": 0.11747845377623338, - "grad_norm": 0.6953125, - "learning_rate": 1.9728340209457317e-05, - "loss": 1.3933, - "step": 374 - }, - { - "epoch": 0.11810668080177474, - "grad_norm": 0.8515625, - "learning_rate": 1.972580133291019e-05, - "loss": 1.445, - "step": 376 - }, - { - "epoch": 0.11873490782731609, - "grad_norm": 0.8515625, - "learning_rate": 1.972326245636306e-05, - "loss": 1.3521, - "step": 378 - }, - { - "epoch": 0.11936313485285745, - "grad_norm": 0.69921875, - "learning_rate": 1.972072357981593e-05, - "loss": 1.4533, - "step": 380 - }, - { - "epoch": 0.1199913618783988, - "grad_norm": 0.78515625, - "learning_rate": 1.9718184703268806e-05, - "loss": 1.2909, - "step": 382 - }, - { - "epoch": 0.12061958890394016, - "grad_norm": 0.8203125, - "learning_rate": 1.9715645826721678e-05, - "loss": 1.5502, - "step": 384 - }, - { - "epoch": 0.12124781592948151, - "grad_norm": 0.84765625, - "learning_rate": 1.971310695017455e-05, - "loss": 1.3525, - "step": 386 - }, - { - "epoch": 0.12187604295502287, - "grad_norm": 0.796875, - "learning_rate": 1.971056807362742e-05, - "loss": 1.5028, - "step": 388 - }, - { - "epoch": 0.12250426998056423, - "grad_norm": 6.21875, - "learning_rate": 1.9708029197080295e-05, - "loss": 1.3943, - "step": 390 - }, - { - "epoch": 0.12313249700610558, - "grad_norm": 0.765625, - "learning_rate": 1.9705490320533163e-05, - "loss": 1.5042, - "step": 392 - }, - { - "epoch": 0.12376072403164694, - "grad_norm": 0.79296875, - "learning_rate": 1.9702951443986038e-05, - "loss": 1.3527, - "step": 394 - }, - { - "epoch": 0.1243889510571883, - "grad_norm": 0.734375, - "learning_rate": 1.970041256743891e-05, - "loss": 1.5268, - "step": 396 - }, - { - "epoch": 0.12501717808272964, - "grad_norm": 0.77734375, - "learning_rate": 1.969787369089178e-05, - "loss": 1.2923, - "step": 398 - }, - { - "epoch": 0.125645405108271, - "grad_norm": 0.85546875, - "learning_rate": 1.9695334814344652e-05, - "loss": 1.4865, - "step": 400 - }, - { - "epoch": 0.12627363213381235, - "grad_norm": 0.828125, - "learning_rate": 1.9692795937797527e-05, - "loss": 1.2926, - "step": 402 - }, - { - "epoch": 0.1269018591593537, - "grad_norm": 0.796875, - "learning_rate": 1.96902570612504e-05, - "loss": 1.3763, - "step": 404 - }, - { - "epoch": 0.12753008618489506, - "grad_norm": 0.77734375, - "learning_rate": 1.968771818470327e-05, - "loss": 1.4208, - "step": 406 - }, - { - "epoch": 0.12815831321043641, - "grad_norm": 0.8515625, - "learning_rate": 1.968517930815614e-05, - "loss": 1.2802, - "step": 408 - }, - { - "epoch": 0.12878654023597777, - "grad_norm": 0.7578125, - "learning_rate": 1.9682640431609016e-05, - "loss": 1.3137, - "step": 410 - }, - { - "epoch": 0.12941476726151913, - "grad_norm": 0.734375, - "learning_rate": 1.9680101555061887e-05, - "loss": 1.2313, - "step": 412 - }, - { - "epoch": 0.13004299428706048, - "grad_norm": 0.73828125, - "learning_rate": 1.967756267851476e-05, - "loss": 1.3286, - "step": 414 - }, - { - "epoch": 0.13067122131260184, - "grad_norm": 0.86328125, - "learning_rate": 1.967502380196763e-05, - "loss": 1.3544, - "step": 416 - }, - { - "epoch": 0.1312994483381432, - "grad_norm": 0.796875, - "learning_rate": 1.96724849254205e-05, - "loss": 1.4726, - "step": 418 - }, - { - "epoch": 0.13192767536368455, - "grad_norm": 0.71875, - "learning_rate": 1.9669946048873376e-05, - "loss": 1.3215, - "step": 420 - }, - { - "epoch": 0.1325559023892259, - "grad_norm": 0.78515625, - "learning_rate": 1.9667407172326248e-05, - "loss": 1.5521, - "step": 422 - }, - { - "epoch": 0.13318412941476726, - "grad_norm": 0.796875, - "learning_rate": 1.966486829577912e-05, - "loss": 1.3127, - "step": 424 - }, - { - "epoch": 0.13381235644030862, - "grad_norm": 0.8359375, - "learning_rate": 1.966232941923199e-05, - "loss": 1.2696, - "step": 426 - }, - { - "epoch": 0.13444058346584997, - "grad_norm": 0.8046875, - "learning_rate": 1.9659790542684865e-05, - "loss": 1.2138, - "step": 428 - }, - { - "epoch": 0.13506881049139133, - "grad_norm": 0.7890625, - "learning_rate": 1.9657251666137736e-05, - "loss": 1.4204, - "step": 430 - }, - { - "epoch": 0.13569703751693268, - "grad_norm": 0.75390625, - "learning_rate": 1.9654712789590608e-05, - "loss": 1.2865, - "step": 432 - }, - { - "epoch": 0.13632526454247404, - "grad_norm": 0.73828125, - "learning_rate": 1.965217391304348e-05, - "loss": 1.2856, - "step": 434 - }, - { - "epoch": 0.1369534915680154, - "grad_norm": 0.73046875, - "learning_rate": 1.9649635036496354e-05, - "loss": 1.4284, - "step": 436 - }, - { - "epoch": 0.13758171859355675, - "grad_norm": 0.73828125, - "learning_rate": 1.9647096159949225e-05, - "loss": 1.3569, - "step": 438 - }, - { - "epoch": 0.1382099456190981, - "grad_norm": 0.671875, - "learning_rate": 1.9644557283402097e-05, - "loss": 1.3295, - "step": 440 - }, - { - "epoch": 0.13883817264463946, - "grad_norm": 0.7421875, - "learning_rate": 1.9642018406854968e-05, - "loss": 1.2948, - "step": 442 - }, - { - "epoch": 0.13946639967018082, - "grad_norm": 0.79296875, - "learning_rate": 1.963947953030784e-05, - "loss": 1.4097, - "step": 444 - }, - { - "epoch": 0.14009462669572217, - "grad_norm": 1.0703125, - "learning_rate": 1.963694065376071e-05, - "loss": 1.3674, - "step": 446 - }, - { - "epoch": 0.14072285372126353, - "grad_norm": 0.8671875, - "learning_rate": 1.9634401777213586e-05, - "loss": 1.4544, - "step": 448 - }, - { - "epoch": 0.14135108074680489, - "grad_norm": 0.72265625, - "learning_rate": 1.9631862900666457e-05, - "loss": 1.3385, - "step": 450 - }, - { - "epoch": 0.14197930777234624, - "grad_norm": 0.75390625, - "learning_rate": 1.962932402411933e-05, - "loss": 1.3962, - "step": 452 - }, - { - "epoch": 0.1426075347978876, - "grad_norm": 0.80078125, - "learning_rate": 1.96267851475722e-05, - "loss": 1.3257, - "step": 454 - }, - { - "epoch": 0.14323576182342895, - "grad_norm": 0.8359375, - "learning_rate": 1.9624246271025075e-05, - "loss": 1.3572, - "step": 456 - }, - { - "epoch": 0.1438639888489703, - "grad_norm": 0.73046875, - "learning_rate": 1.9621707394477946e-05, - "loss": 1.5115, - "step": 458 - }, - { - "epoch": 0.14449221587451166, - "grad_norm": 0.7578125, - "learning_rate": 1.9619168517930817e-05, - "loss": 1.3532, - "step": 460 - }, - { - "epoch": 0.14512044290005302, - "grad_norm": 0.8046875, - "learning_rate": 1.961662964138369e-05, - "loss": 1.3612, - "step": 462 - }, - { - "epoch": 0.14574866992559435, - "grad_norm": 0.73828125, - "learning_rate": 1.9614090764836564e-05, - "loss": 1.3881, - "step": 464 - }, - { - "epoch": 0.1463768969511357, - "grad_norm": 0.94140625, - "learning_rate": 1.961155188828943e-05, - "loss": 1.534, - "step": 466 - }, - { - "epoch": 0.14700512397667706, - "grad_norm": 0.71484375, - "learning_rate": 1.9609013011742306e-05, - "loss": 1.4607, - "step": 468 - }, - { - "epoch": 0.14763335100221842, - "grad_norm": 0.72265625, - "learning_rate": 1.9606474135195178e-05, - "loss": 1.3466, - "step": 470 - }, - { - "epoch": 0.14826157802775977, - "grad_norm": 0.7109375, - "learning_rate": 1.960393525864805e-05, - "loss": 1.3187, - "step": 472 - }, - { - "epoch": 0.14888980505330113, - "grad_norm": 0.77734375, - "learning_rate": 1.960139638210092e-05, - "loss": 1.36, - "step": 474 - }, - { - "epoch": 0.14951803207884248, - "grad_norm": 0.859375, - "learning_rate": 1.9598857505553795e-05, - "loss": 1.2185, - "step": 476 - }, - { - "epoch": 0.15014625910438384, - "grad_norm": 0.71484375, - "learning_rate": 1.9596318629006667e-05, - "loss": 1.4085, - "step": 478 - }, - { - "epoch": 0.1507744861299252, - "grad_norm": 0.98046875, - "learning_rate": 1.9593779752459538e-05, - "loss": 1.3917, - "step": 480 - }, - { - "epoch": 0.15140271315546655, - "grad_norm": 0.7421875, - "learning_rate": 1.959124087591241e-05, - "loss": 1.3497, - "step": 482 - }, - { - "epoch": 0.1520309401810079, - "grad_norm": 0.76171875, - "learning_rate": 1.9588701999365284e-05, - "loss": 1.3855, - "step": 484 - }, - { - "epoch": 0.15265916720654926, - "grad_norm": 0.81640625, - "learning_rate": 1.9586163122818152e-05, - "loss": 1.4071, - "step": 486 - }, - { - "epoch": 0.15328739423209062, - "grad_norm": 0.80859375, - "learning_rate": 1.9583624246271027e-05, - "loss": 1.2817, - "step": 488 - }, - { - "epoch": 0.15391562125763197, - "grad_norm": 0.75, - "learning_rate": 1.9581085369723898e-05, - "loss": 1.3758, - "step": 490 - }, - { - "epoch": 0.15454384828317333, - "grad_norm": 0.78125, - "learning_rate": 1.957854649317677e-05, - "loss": 1.4021, - "step": 492 - }, - { - "epoch": 0.15517207530871469, - "grad_norm": 0.75390625, - "learning_rate": 1.957600761662964e-05, - "loss": 1.4163, - "step": 494 - }, - { - "epoch": 0.15580030233425604, - "grad_norm": 0.7890625, - "learning_rate": 1.9573468740082516e-05, - "loss": 1.3127, - "step": 496 - }, - { - "epoch": 0.1564285293597974, - "grad_norm": 0.79296875, - "learning_rate": 1.9570929863535387e-05, - "loss": 1.2817, - "step": 498 - }, - { - "epoch": 0.15705675638533875, - "grad_norm": 0.76171875, - "learning_rate": 1.956839098698826e-05, - "loss": 1.3561, - "step": 500 - }, - { - "epoch": 0.1576849834108801, - "grad_norm": 0.76171875, - "learning_rate": 1.956585211044113e-05, - "loss": 1.317, - "step": 502 - }, - { - "epoch": 0.15831321043642146, - "grad_norm": 1.3671875, - "learning_rate": 1.9563313233894005e-05, - "loss": 1.4507, - "step": 504 - }, - { - "epoch": 0.15894143746196282, - "grad_norm": 1.0703125, - "learning_rate": 1.9560774357346876e-05, - "loss": 1.3715, - "step": 506 - }, - { - "epoch": 0.15956966448750418, - "grad_norm": 0.765625, - "learning_rate": 1.9558235480799747e-05, - "loss": 1.3502, - "step": 508 - }, - { - "epoch": 0.16019789151304553, - "grad_norm": 0.72265625, - "learning_rate": 1.9555696604252622e-05, - "loss": 1.4822, - "step": 510 - }, - { - "epoch": 0.1608261185385869, - "grad_norm": 0.75, - "learning_rate": 1.955315772770549e-05, - "loss": 1.2966, - "step": 512 - }, - { - "epoch": 0.16145434556412824, - "grad_norm": 0.80078125, - "learning_rate": 1.9550618851158365e-05, - "loss": 1.2823, - "step": 514 - }, - { - "epoch": 0.1620825725896696, - "grad_norm": 0.87109375, - "learning_rate": 1.9548079974611236e-05, - "loss": 1.3542, - "step": 516 - }, - { - "epoch": 0.16271079961521095, - "grad_norm": 0.72265625, - "learning_rate": 1.9545541098064108e-05, - "loss": 1.2341, - "step": 518 - }, - { - "epoch": 0.1633390266407523, - "grad_norm": 0.90625, - "learning_rate": 1.954300222151698e-05, - "loss": 1.4864, - "step": 520 - }, - { - "epoch": 0.16396725366629367, - "grad_norm": 0.80078125, - "learning_rate": 1.9540463344969854e-05, - "loss": 1.4035, - "step": 522 - }, - { - "epoch": 0.16459548069183502, - "grad_norm": 0.7734375, - "learning_rate": 1.9537924468422725e-05, - "loss": 1.3945, - "step": 524 - }, - { - "epoch": 0.16522370771737638, - "grad_norm": 0.94921875, - "learning_rate": 1.9535385591875597e-05, - "loss": 1.3526, - "step": 526 - }, - { - "epoch": 0.16585193474291773, - "grad_norm": 0.71484375, - "learning_rate": 1.9532846715328468e-05, - "loss": 1.4006, - "step": 528 - }, - { - "epoch": 0.1664801617684591, - "grad_norm": 0.703125, - "learning_rate": 1.9530307838781343e-05, - "loss": 1.3114, - "step": 530 - }, - { - "epoch": 0.16710838879400045, - "grad_norm": 0.75390625, - "learning_rate": 1.9527768962234214e-05, - "loss": 1.4793, - "step": 532 - }, - { - "epoch": 0.16773661581954177, - "grad_norm": 0.7734375, - "learning_rate": 1.9525230085687086e-05, - "loss": 1.3226, - "step": 534 - }, - { - "epoch": 0.16836484284508313, - "grad_norm": 0.90625, - "learning_rate": 1.9522691209139957e-05, - "loss": 1.3562, - "step": 536 - }, - { - "epoch": 0.16899306987062448, - "grad_norm": 0.91796875, - "learning_rate": 1.952015233259283e-05, - "loss": 1.3007, - "step": 538 - }, - { - "epoch": 0.16962129689616584, - "grad_norm": 0.69140625, - "learning_rate": 1.95176134560457e-05, - "loss": 1.4449, - "step": 540 - }, - { - "epoch": 0.1702495239217072, - "grad_norm": 0.74609375, - "learning_rate": 1.9515074579498575e-05, - "loss": 1.3835, - "step": 542 - }, - { - "epoch": 0.17087775094724855, - "grad_norm": 0.74609375, - "learning_rate": 1.9512535702951446e-05, - "loss": 1.4734, - "step": 544 - }, - { - "epoch": 0.1715059779727899, - "grad_norm": 0.7109375, - "learning_rate": 1.9509996826404317e-05, - "loss": 1.4759, - "step": 546 - }, - { - "epoch": 0.17213420499833126, - "grad_norm": 0.6953125, - "learning_rate": 1.950745794985719e-05, - "loss": 1.3622, - "step": 548 - }, - { - "epoch": 0.17276243202387262, - "grad_norm": 0.703125, - "learning_rate": 1.9504919073310063e-05, - "loss": 1.5, - "step": 550 - }, - { - "epoch": 0.17339065904941398, - "grad_norm": 0.765625, - "learning_rate": 1.9502380196762935e-05, - "loss": 1.4584, - "step": 552 - }, - { - "epoch": 0.17401888607495533, - "grad_norm": 0.78125, - "learning_rate": 1.9499841320215806e-05, - "loss": 1.1847, - "step": 554 - }, - { - "epoch": 0.1746471131004967, - "grad_norm": 0.8046875, - "learning_rate": 1.9497302443668678e-05, - "loss": 1.4887, - "step": 556 - }, - { - "epoch": 0.17527534012603804, - "grad_norm": 0.703125, - "learning_rate": 1.9494763567121552e-05, - "loss": 1.3411, - "step": 558 - }, - { - "epoch": 0.1759035671515794, - "grad_norm": 0.8125, - "learning_rate": 1.949222469057442e-05, - "loss": 1.4525, - "step": 560 - }, - { - "epoch": 0.17653179417712075, - "grad_norm": 0.78125, - "learning_rate": 1.9489685814027295e-05, - "loss": 1.2743, - "step": 562 - }, - { - "epoch": 0.1771600212026621, - "grad_norm": 0.77734375, - "learning_rate": 1.9487146937480167e-05, - "loss": 1.385, - "step": 564 - }, - { - "epoch": 0.17778824822820347, - "grad_norm": 0.89453125, - "learning_rate": 1.9484608060933038e-05, - "loss": 1.3988, - "step": 566 - }, - { - "epoch": 0.17841647525374482, - "grad_norm": 0.78125, - "learning_rate": 1.948206918438591e-05, - "loss": 1.2637, - "step": 568 - }, - { - "epoch": 0.17904470227928618, - "grad_norm": 0.73046875, - "learning_rate": 1.9479530307838784e-05, - "loss": 1.3832, - "step": 570 - }, - { - "epoch": 0.17967292930482753, - "grad_norm": 0.83984375, - "learning_rate": 1.9476991431291655e-05, - "loss": 1.367, - "step": 572 - }, - { - "epoch": 0.1803011563303689, - "grad_norm": 0.85546875, - "learning_rate": 1.9474452554744527e-05, - "loss": 1.3174, - "step": 574 - }, - { - "epoch": 0.18092938335591024, - "grad_norm": 0.6875, - "learning_rate": 1.9471913678197398e-05, - "loss": 1.2966, - "step": 576 - }, - { - "epoch": 0.1815576103814516, - "grad_norm": 0.796875, - "learning_rate": 1.9469374801650273e-05, - "loss": 1.4582, - "step": 578 - }, - { - "epoch": 0.18218583740699296, - "grad_norm": 0.69921875, - "learning_rate": 1.946683592510314e-05, - "loss": 1.3229, - "step": 580 - }, - { - "epoch": 0.1828140644325343, - "grad_norm": 0.734375, - "learning_rate": 1.9464297048556016e-05, - "loss": 1.2895, - "step": 582 - }, - { - "epoch": 0.18344229145807567, - "grad_norm": 0.73046875, - "learning_rate": 1.9461758172008887e-05, - "loss": 1.5382, - "step": 584 - }, - { - "epoch": 0.18407051848361702, - "grad_norm": 0.92578125, - "learning_rate": 1.945921929546176e-05, - "loss": 1.4349, - "step": 586 - }, - { - "epoch": 0.18469874550915838, - "grad_norm": 0.828125, - "learning_rate": 1.945668041891463e-05, - "loss": 1.3861, - "step": 588 - }, - { - "epoch": 0.18532697253469974, - "grad_norm": 0.76953125, - "learning_rate": 1.9454141542367505e-05, - "loss": 1.2897, - "step": 590 - }, - { - "epoch": 0.1859551995602411, - "grad_norm": 0.8671875, - "learning_rate": 1.9451602665820376e-05, - "loss": 1.3362, - "step": 592 - }, - { - "epoch": 0.18658342658578245, - "grad_norm": 0.8046875, - "learning_rate": 1.9449063789273247e-05, - "loss": 1.3954, - "step": 594 - }, - { - "epoch": 0.1872116536113238, - "grad_norm": 0.734375, - "learning_rate": 1.9446524912726122e-05, - "loss": 1.3541, - "step": 596 - }, - { - "epoch": 0.18783988063686516, - "grad_norm": 0.80859375, - "learning_rate": 1.9443986036178994e-05, - "loss": 1.4498, - "step": 598 - }, - { - "epoch": 0.18846810766240651, - "grad_norm": 0.83203125, - "learning_rate": 1.9441447159631865e-05, - "loss": 1.3767, - "step": 600 - }, - { - "epoch": 0.18909633468794787, - "grad_norm": 0.9453125, - "learning_rate": 1.9438908283084736e-05, - "loss": 1.245, - "step": 602 - }, - { - "epoch": 0.18972456171348923, - "grad_norm": 0.66015625, - "learning_rate": 1.943636940653761e-05, - "loss": 1.4371, - "step": 604 - }, - { - "epoch": 0.19035278873903055, - "grad_norm": 0.7890625, - "learning_rate": 1.943383052999048e-05, - "loss": 1.3194, - "step": 606 - }, - { - "epoch": 0.1909810157645719, - "grad_norm": 0.7421875, - "learning_rate": 1.9431291653443354e-05, - "loss": 1.3339, - "step": 608 - }, - { - "epoch": 0.19160924279011327, - "grad_norm": 0.7578125, - "learning_rate": 1.9428752776896225e-05, - "loss": 1.3773, - "step": 610 - }, - { - "epoch": 0.19223746981565462, - "grad_norm": 0.7265625, - "learning_rate": 1.9426213900349097e-05, - "loss": 1.3456, - "step": 612 - }, - { - "epoch": 0.19286569684119598, - "grad_norm": 1.015625, - "learning_rate": 1.9423675023801968e-05, - "loss": 1.3713, - "step": 614 - }, - { - "epoch": 0.19349392386673733, - "grad_norm": 0.73828125, - "learning_rate": 1.9421136147254843e-05, - "loss": 1.4541, - "step": 616 - }, - { - "epoch": 0.1941221508922787, - "grad_norm": 0.71484375, - "learning_rate": 1.9418597270707714e-05, - "loss": 1.4132, - "step": 618 - }, - { - "epoch": 0.19475037791782004, - "grad_norm": 0.90234375, - "learning_rate": 1.9416058394160586e-05, - "loss": 1.461, - "step": 620 - }, - { - "epoch": 0.1953786049433614, - "grad_norm": 0.765625, - "learning_rate": 1.9413519517613457e-05, - "loss": 1.2741, - "step": 622 - }, - { - "epoch": 0.19600683196890276, - "grad_norm": 0.75, - "learning_rate": 1.9410980641066332e-05, - "loss": 1.4783, - "step": 624 - }, - { - "epoch": 0.1966350589944441, - "grad_norm": 0.9296875, - "learning_rate": 1.94084417645192e-05, - "loss": 1.3676, - "step": 626 - }, - { - "epoch": 0.19726328601998547, - "grad_norm": 0.83984375, - "learning_rate": 1.9405902887972074e-05, - "loss": 1.2958, - "step": 628 - }, - { - "epoch": 0.19789151304552682, - "grad_norm": 0.8125, - "learning_rate": 1.9403364011424946e-05, - "loss": 1.3159, - "step": 630 - }, - { - "epoch": 0.19851974007106818, - "grad_norm": 0.7578125, - "learning_rate": 1.9400825134877817e-05, - "loss": 1.3835, - "step": 632 - }, - { - "epoch": 0.19914796709660953, - "grad_norm": 0.890625, - "learning_rate": 1.939828625833069e-05, - "loss": 1.2554, - "step": 634 - }, - { - "epoch": 0.1997761941221509, - "grad_norm": 0.671875, - "learning_rate": 1.9395747381783563e-05, - "loss": 1.3815, - "step": 636 - }, - { - "epoch": 0.20040442114769225, - "grad_norm": 0.78125, - "learning_rate": 1.9393208505236435e-05, - "loss": 1.4323, - "step": 638 - }, - { - "epoch": 0.2010326481732336, - "grad_norm": 0.7265625, - "learning_rate": 1.9390669628689306e-05, - "loss": 1.3292, - "step": 640 - }, - { - "epoch": 0.20166087519877496, - "grad_norm": 0.82421875, - "learning_rate": 1.9388130752142178e-05, - "loss": 1.2865, - "step": 642 - }, - { - "epoch": 0.20228910222431631, - "grad_norm": 1.015625, - "learning_rate": 1.9385591875595052e-05, - "loss": 1.3822, - "step": 644 - }, - { - "epoch": 0.20291732924985767, - "grad_norm": 0.75, - "learning_rate": 1.9383052999047924e-05, - "loss": 1.3657, - "step": 646 - }, - { - "epoch": 0.20354555627539903, - "grad_norm": 0.82421875, - "learning_rate": 1.9380514122500795e-05, - "loss": 1.3554, - "step": 648 - }, - { - "epoch": 0.20417378330094038, - "grad_norm": 0.75, - "learning_rate": 1.9377975245953666e-05, - "loss": 1.331, - "step": 650 - }, - { - "epoch": 0.20480201032648174, - "grad_norm": 0.96484375, - "learning_rate": 1.9375436369406538e-05, - "loss": 1.3798, - "step": 652 - }, - { - "epoch": 0.2054302373520231, - "grad_norm": 0.80078125, - "learning_rate": 1.937289749285941e-05, - "loss": 1.4126, - "step": 654 - }, - { - "epoch": 0.20605846437756445, - "grad_norm": 0.7265625, - "learning_rate": 1.9370358616312284e-05, - "loss": 1.5412, - "step": 656 - }, - { - "epoch": 0.2066866914031058, - "grad_norm": 0.6875, - "learning_rate": 1.9367819739765155e-05, - "loss": 1.4367, - "step": 658 - }, - { - "epoch": 0.20731491842864716, - "grad_norm": 0.8828125, - "learning_rate": 1.9365280863218027e-05, - "loss": 1.3944, - "step": 660 - }, - { - "epoch": 0.20794314545418852, - "grad_norm": 0.74609375, - "learning_rate": 1.9362741986670898e-05, - "loss": 1.4311, - "step": 662 - }, - { - "epoch": 0.20857137247972987, - "grad_norm": 0.73046875, - "learning_rate": 1.9360203110123773e-05, - "loss": 1.452, - "step": 664 - }, - { - "epoch": 0.20919959950527123, - "grad_norm": 0.73828125, - "learning_rate": 1.9357664233576644e-05, - "loss": 1.3529, - "step": 666 - }, - { - "epoch": 0.20982782653081258, - "grad_norm": 0.703125, - "learning_rate": 1.9355125357029516e-05, - "loss": 1.3444, - "step": 668 - }, - { - "epoch": 0.21045605355635394, - "grad_norm": 0.70703125, - "learning_rate": 1.9352586480482387e-05, - "loss": 1.352, - "step": 670 - }, - { - "epoch": 0.2110842805818953, - "grad_norm": 0.81640625, - "learning_rate": 1.9350047603935262e-05, - "loss": 1.455, - "step": 672 - }, - { - "epoch": 0.21171250760743665, - "grad_norm": 0.7578125, - "learning_rate": 1.934750872738813e-05, - "loss": 1.2581, - "step": 674 - }, - { - "epoch": 0.21234073463297798, - "grad_norm": 0.8515625, - "learning_rate": 1.9344969850841005e-05, - "loss": 1.3224, - "step": 676 - }, - { - "epoch": 0.21296896165851933, - "grad_norm": 0.6875, - "learning_rate": 1.9342430974293876e-05, - "loss": 1.4604, - "step": 678 - }, - { - "epoch": 0.2135971886840607, - "grad_norm": 0.75, - "learning_rate": 1.9339892097746747e-05, - "loss": 1.2345, - "step": 680 - }, - { - "epoch": 0.21422541570960205, - "grad_norm": 0.70703125, - "learning_rate": 1.9337353221199622e-05, - "loss": 1.4289, - "step": 682 - }, - { - "epoch": 0.2148536427351434, - "grad_norm": 0.875, - "learning_rate": 1.9334814344652494e-05, - "loss": 1.4216, - "step": 684 - }, - { - "epoch": 0.21548186976068476, - "grad_norm": 0.796875, - "learning_rate": 1.9332275468105365e-05, - "loss": 1.4541, - "step": 686 - }, - { - "epoch": 0.2161100967862261, - "grad_norm": 0.83203125, - "learning_rate": 1.9329736591558236e-05, - "loss": 1.3089, - "step": 688 - }, - { - "epoch": 0.21673832381176747, - "grad_norm": 0.8828125, - "learning_rate": 1.932719771501111e-05, - "loss": 1.3822, - "step": 690 - }, - { - "epoch": 0.21736655083730883, - "grad_norm": 0.78125, - "learning_rate": 1.9324658838463982e-05, - "loss": 1.2839, - "step": 692 - }, - { - "epoch": 0.21799477786285018, - "grad_norm": 0.7421875, - "learning_rate": 1.9322119961916854e-05, - "loss": 1.2813, - "step": 694 - }, - { - "epoch": 0.21862300488839154, - "grad_norm": 0.7265625, - "learning_rate": 1.9319581085369725e-05, - "loss": 1.3437, - "step": 696 - }, - { - "epoch": 0.2192512319139329, - "grad_norm": 0.69140625, - "learning_rate": 1.93170422088226e-05, - "loss": 1.3649, - "step": 698 - }, - { - "epoch": 0.21987945893947425, - "grad_norm": 0.70703125, - "learning_rate": 1.9314503332275468e-05, - "loss": 1.3949, - "step": 700 - }, - { - "epoch": 0.2205076859650156, - "grad_norm": 0.9921875, - "learning_rate": 1.9311964455728343e-05, - "loss": 1.3488, - "step": 702 - }, - { - "epoch": 0.22113591299055696, - "grad_norm": 0.8671875, - "learning_rate": 1.9309425579181214e-05, - "loss": 1.268, - "step": 704 - }, - { - "epoch": 0.22176414001609832, - "grad_norm": 0.875, - "learning_rate": 1.9306886702634085e-05, - "loss": 1.3855, - "step": 706 - }, - { - "epoch": 0.22239236704163967, - "grad_norm": 0.765625, - "learning_rate": 1.9304347826086957e-05, - "loss": 1.2877, - "step": 708 - }, - { - "epoch": 0.22302059406718103, - "grad_norm": 0.7734375, - "learning_rate": 1.930180894953983e-05, - "loss": 1.3324, - "step": 710 - }, - { - "epoch": 0.22364882109272238, - "grad_norm": 0.75390625, - "learning_rate": 1.9299270072992703e-05, - "loss": 1.3602, - "step": 712 - }, - { - "epoch": 0.22427704811826374, - "grad_norm": 0.76171875, - "learning_rate": 1.9296731196445574e-05, - "loss": 1.3006, - "step": 714 - }, - { - "epoch": 0.2249052751438051, - "grad_norm": 0.7578125, - "learning_rate": 1.9294192319898446e-05, - "loss": 1.3667, - "step": 716 - }, - { - "epoch": 0.22553350216934645, - "grad_norm": 0.70703125, - "learning_rate": 1.929165344335132e-05, - "loss": 1.3536, - "step": 718 - }, - { - "epoch": 0.2261617291948878, - "grad_norm": 0.73828125, - "learning_rate": 1.928911456680419e-05, - "loss": 1.4343, - "step": 720 - }, - { - "epoch": 0.22678995622042916, - "grad_norm": 0.76171875, - "learning_rate": 1.9286575690257063e-05, - "loss": 1.355, - "step": 722 - }, - { - "epoch": 0.22741818324597052, - "grad_norm": 0.88671875, - "learning_rate": 1.9284036813709935e-05, - "loss": 1.3999, - "step": 724 - }, - { - "epoch": 0.22804641027151187, - "grad_norm": 0.9140625, - "learning_rate": 1.9281497937162806e-05, - "loss": 1.3638, - "step": 726 - }, - { - "epoch": 0.22867463729705323, - "grad_norm": 0.7265625, - "learning_rate": 1.9278959060615677e-05, - "loss": 1.2724, - "step": 728 - }, - { - "epoch": 0.22930286432259459, - "grad_norm": 1.0, - "learning_rate": 1.9276420184068552e-05, - "loss": 1.3783, - "step": 730 - }, - { - "epoch": 0.22993109134813594, - "grad_norm": 0.7578125, - "learning_rate": 1.9273881307521424e-05, - "loss": 1.2429, - "step": 732 - }, - { - "epoch": 0.2305593183736773, - "grad_norm": 0.76171875, - "learning_rate": 1.9271342430974295e-05, - "loss": 1.4618, - "step": 734 - }, - { - "epoch": 0.23118754539921865, - "grad_norm": 0.70703125, - "learning_rate": 1.9268803554427166e-05, - "loss": 1.3145, - "step": 736 - }, - { - "epoch": 0.23181577242476, - "grad_norm": 0.74609375, - "learning_rate": 1.926626467788004e-05, - "loss": 1.3562, - "step": 738 - }, - { - "epoch": 0.23244399945030136, - "grad_norm": 0.7734375, - "learning_rate": 1.9263725801332913e-05, - "loss": 1.3047, - "step": 740 - }, - { - "epoch": 0.23307222647584272, - "grad_norm": 0.765625, - "learning_rate": 1.9261186924785784e-05, - "loss": 1.4534, - "step": 742 - }, - { - "epoch": 0.23370045350138408, - "grad_norm": 0.84375, - "learning_rate": 1.9258648048238655e-05, - "loss": 1.3435, - "step": 744 - }, - { - "epoch": 0.23432868052692543, - "grad_norm": 0.75, - "learning_rate": 1.9256109171691527e-05, - "loss": 1.3951, - "step": 746 - }, - { - "epoch": 0.23495690755246676, - "grad_norm": 0.68359375, - "learning_rate": 1.9253570295144398e-05, - "loss": 1.3799, - "step": 748 - }, - { - "epoch": 0.23558513457800812, - "grad_norm": 0.8046875, - "learning_rate": 1.9251031418597273e-05, - "loss": 1.5794, - "step": 750 - }, - { - "epoch": 0.23621336160354947, - "grad_norm": 0.73828125, - "learning_rate": 1.9248492542050144e-05, - "loss": 1.3543, - "step": 752 - }, - { - "epoch": 0.23684158862909083, - "grad_norm": 0.71484375, - "learning_rate": 1.9245953665503016e-05, - "loss": 1.2956, - "step": 754 - }, - { - "epoch": 0.23746981565463218, - "grad_norm": 0.8359375, - "learning_rate": 1.9243414788955887e-05, - "loss": 1.2537, - "step": 756 - }, - { - "epoch": 0.23809804268017354, - "grad_norm": 0.83203125, - "learning_rate": 1.9240875912408762e-05, - "loss": 1.3696, - "step": 758 - }, - { - "epoch": 0.2387262697057149, - "grad_norm": 0.8359375, - "learning_rate": 1.9238337035861633e-05, - "loss": 1.4097, - "step": 760 - }, - { - "epoch": 0.23935449673125625, - "grad_norm": 1.015625, - "learning_rate": 1.9235798159314505e-05, - "loss": 1.307, - "step": 762 - }, - { - "epoch": 0.2399827237567976, - "grad_norm": 0.91015625, - "learning_rate": 1.923325928276738e-05, - "loss": 1.2294, - "step": 764 - }, - { - "epoch": 0.24061095078233896, - "grad_norm": 0.796875, - "learning_rate": 1.923072040622025e-05, - "loss": 1.3091, - "step": 766 - }, - { - "epoch": 0.24123917780788032, - "grad_norm": 0.859375, - "learning_rate": 1.9228181529673122e-05, - "loss": 1.3432, - "step": 768 - }, - { - "epoch": 0.24186740483342167, - "grad_norm": 0.80078125, - "learning_rate": 1.9225642653125993e-05, - "loss": 1.3201, - "step": 770 - }, - { - "epoch": 0.24249563185896303, - "grad_norm": 0.80078125, - "learning_rate": 1.9223103776578865e-05, - "loss": 1.4521, - "step": 772 - }, - { - "epoch": 0.24312385888450438, - "grad_norm": 0.95703125, - "learning_rate": 1.9220564900031736e-05, - "loss": 1.4254, - "step": 774 - }, - { - "epoch": 0.24375208591004574, - "grad_norm": 0.8125, - "learning_rate": 1.921802602348461e-05, - "loss": 1.3347, - "step": 776 - }, - { - "epoch": 0.2443803129355871, - "grad_norm": 0.7890625, - "learning_rate": 1.9215487146937482e-05, - "loss": 1.2956, - "step": 778 - }, - { - "epoch": 0.24500853996112845, - "grad_norm": 0.7734375, - "learning_rate": 1.9212948270390354e-05, - "loss": 1.3128, - "step": 780 - }, - { - "epoch": 0.2456367669866698, - "grad_norm": 0.84375, - "learning_rate": 1.9210409393843225e-05, - "loss": 1.3065, - "step": 782 - }, - { - "epoch": 0.24626499401221116, - "grad_norm": 1.015625, - "learning_rate": 1.92078705172961e-05, - "loss": 1.2968, - "step": 784 - }, - { - "epoch": 0.24689322103775252, - "grad_norm": 0.87890625, - "learning_rate": 1.920533164074897e-05, - "loss": 1.3041, - "step": 786 - }, - { - "epoch": 0.24752144806329388, - "grad_norm": 0.8046875, - "learning_rate": 1.9202792764201843e-05, - "loss": 1.4266, - "step": 788 - }, - { - "epoch": 0.24814967508883523, - "grad_norm": 0.68359375, - "learning_rate": 1.9200253887654714e-05, - "loss": 1.4958, - "step": 790 - }, - { - "epoch": 0.2487779021143766, - "grad_norm": 0.9375, - "learning_rate": 1.919771501110759e-05, - "loss": 1.4217, - "step": 792 - }, - { - "epoch": 0.24940612913991794, - "grad_norm": 0.70703125, - "learning_rate": 1.9195176134560457e-05, - "loss": 1.3905, - "step": 794 - }, - { - "epoch": 0.25003435616545927, - "grad_norm": 0.8203125, - "learning_rate": 1.919263725801333e-05, - "loss": 1.3715, - "step": 796 - }, - { - "epoch": 0.25066258319100065, - "grad_norm": 0.9296875, - "learning_rate": 1.9190098381466203e-05, - "loss": 1.4086, - "step": 798 - }, - { - "epoch": 0.251290810216542, - "grad_norm": 0.84375, - "learning_rate": 1.9187559504919074e-05, - "loss": 1.4157, - "step": 800 - }, - { - "epoch": 0.25191903724208337, - "grad_norm": 0.7109375, - "learning_rate": 1.9185020628371946e-05, - "loss": 1.2557, - "step": 802 - }, - { - "epoch": 0.2525472642676247, - "grad_norm": 0.7734375, - "learning_rate": 1.918248175182482e-05, - "loss": 1.3713, - "step": 804 - }, - { - "epoch": 0.2531754912931661, - "grad_norm": 0.7265625, - "learning_rate": 1.9179942875277692e-05, - "loss": 1.3549, - "step": 806 - }, - { - "epoch": 0.2538037183187074, - "grad_norm": 0.7734375, - "learning_rate": 1.9177403998730563e-05, - "loss": 1.4184, - "step": 808 - }, - { - "epoch": 0.2544319453442488, - "grad_norm": 0.828125, - "learning_rate": 1.9174865122183435e-05, - "loss": 1.3112, - "step": 810 - }, - { - "epoch": 0.2550601723697901, - "grad_norm": 0.7421875, - "learning_rate": 1.917232624563631e-05, - "loss": 1.3818, - "step": 812 - }, - { - "epoch": 0.2556883993953315, - "grad_norm": 0.796875, - "learning_rate": 1.9169787369089177e-05, - "loss": 1.4245, - "step": 814 - }, - { - "epoch": 0.25631662642087283, - "grad_norm": 0.91015625, - "learning_rate": 1.9167248492542052e-05, - "loss": 1.3986, - "step": 816 - }, - { - "epoch": 0.2569448534464142, - "grad_norm": 0.7421875, - "learning_rate": 1.9164709615994924e-05, - "loss": 1.3054, - "step": 818 - }, - { - "epoch": 0.25757308047195554, - "grad_norm": 0.8046875, - "learning_rate": 1.9162170739447795e-05, - "loss": 1.3303, - "step": 820 - }, - { - "epoch": 0.2582013074974969, - "grad_norm": 0.77734375, - "learning_rate": 1.9159631862900666e-05, - "loss": 1.3877, - "step": 822 - }, - { - "epoch": 0.25882953452303825, - "grad_norm": 0.8359375, - "learning_rate": 1.915709298635354e-05, - "loss": 1.3464, - "step": 824 - }, - { - "epoch": 0.25945776154857964, - "grad_norm": 0.74609375, - "learning_rate": 1.9154554109806412e-05, - "loss": 1.4358, - "step": 826 - }, - { - "epoch": 0.26008598857412096, - "grad_norm": 0.73046875, - "learning_rate": 1.9152015233259284e-05, - "loss": 1.2982, - "step": 828 - }, - { - "epoch": 0.26071421559966235, - "grad_norm": 0.796875, - "learning_rate": 1.9149476356712155e-05, - "loss": 1.398, - "step": 830 - }, - { - "epoch": 0.2613424426252037, - "grad_norm": 0.69921875, - "learning_rate": 1.914693748016503e-05, - "loss": 1.2641, - "step": 832 - }, - { - "epoch": 0.26197066965074506, - "grad_norm": 0.88671875, - "learning_rate": 1.91443986036179e-05, - "loss": 1.3669, - "step": 834 - }, - { - "epoch": 0.2625988966762864, - "grad_norm": 0.796875, - "learning_rate": 1.9141859727070773e-05, - "loss": 1.3182, - "step": 836 - }, - { - "epoch": 0.26322712370182777, - "grad_norm": 0.734375, - "learning_rate": 1.9139320850523644e-05, - "loss": 1.3939, - "step": 838 - }, - { - "epoch": 0.2638553507273691, - "grad_norm": 0.66796875, - "learning_rate": 1.9136781973976516e-05, - "loss": 1.4948, - "step": 840 - }, - { - "epoch": 0.2644835777529105, - "grad_norm": 0.88671875, - "learning_rate": 1.9134243097429387e-05, - "loss": 1.34, - "step": 842 - }, - { - "epoch": 0.2651118047784518, - "grad_norm": 0.890625, - "learning_rate": 1.913170422088226e-05, - "loss": 1.3576, - "step": 844 - }, - { - "epoch": 0.2657400318039932, - "grad_norm": 0.71875, - "learning_rate": 1.9129165344335133e-05, - "loss": 1.3366, - "step": 846 - }, - { - "epoch": 0.2663682588295345, - "grad_norm": 0.8359375, - "learning_rate": 1.9126626467788004e-05, - "loss": 1.4665, - "step": 848 - }, - { - "epoch": 0.2669964858550759, - "grad_norm": 0.69140625, - "learning_rate": 1.912408759124088e-05, - "loss": 1.4036, - "step": 850 - }, - { - "epoch": 0.26762471288061723, - "grad_norm": 0.73046875, - "learning_rate": 1.912154871469375e-05, - "loss": 1.2714, - "step": 852 - }, - { - "epoch": 0.26825293990615856, - "grad_norm": 0.71875, - "learning_rate": 1.9119009838146622e-05, - "loss": 1.3858, - "step": 854 - }, - { - "epoch": 0.26888116693169994, - "grad_norm": 0.734375, - "learning_rate": 1.9116470961599493e-05, - "loss": 1.4882, - "step": 856 - }, - { - "epoch": 0.26950939395724127, - "grad_norm": 0.77734375, - "learning_rate": 1.9113932085052368e-05, - "loss": 1.2592, - "step": 858 - }, - { - "epoch": 0.27013762098278266, - "grad_norm": 0.75, - "learning_rate": 1.911139320850524e-05, - "loss": 1.4349, - "step": 860 - }, - { - "epoch": 0.270765848008324, - "grad_norm": 0.921875, - "learning_rate": 1.910885433195811e-05, - "loss": 1.2003, - "step": 862 - }, - { - "epoch": 0.27139407503386537, - "grad_norm": 0.703125, - "learning_rate": 1.9106315455410982e-05, - "loss": 1.4485, - "step": 864 - }, - { - "epoch": 0.2720223020594067, - "grad_norm": 0.78125, - "learning_rate": 1.9103776578863854e-05, - "loss": 1.2389, - "step": 866 - }, - { - "epoch": 0.2726505290849481, - "grad_norm": 0.75, - "learning_rate": 1.9101237702316725e-05, - "loss": 1.4348, - "step": 868 - }, - { - "epoch": 0.2732787561104894, - "grad_norm": 0.78125, - "learning_rate": 1.90986988257696e-05, - "loss": 1.4559, - "step": 870 - }, - { - "epoch": 0.2739069831360308, - "grad_norm": 0.796875, - "learning_rate": 1.909615994922247e-05, - "loss": 1.4004, - "step": 872 - }, - { - "epoch": 0.2745352101615721, - "grad_norm": 0.8046875, - "learning_rate": 1.9093621072675343e-05, - "loss": 1.3105, - "step": 874 - }, - { - "epoch": 0.2751634371871135, - "grad_norm": 0.78125, - "learning_rate": 1.9091082196128214e-05, - "loss": 1.2796, - "step": 876 - }, - { - "epoch": 0.27579166421265483, - "grad_norm": 0.74609375, - "learning_rate": 1.908854331958109e-05, - "loss": 1.4628, - "step": 878 - }, - { - "epoch": 0.2764198912381962, - "grad_norm": 0.71484375, - "learning_rate": 1.908600444303396e-05, - "loss": 1.3618, - "step": 880 - }, - { - "epoch": 0.27704811826373754, - "grad_norm": 0.73828125, - "learning_rate": 1.908346556648683e-05, - "loss": 1.3635, - "step": 882 - }, - { - "epoch": 0.2776763452892789, - "grad_norm": 0.69921875, - "learning_rate": 1.9080926689939703e-05, - "loss": 1.3921, - "step": 884 - }, - { - "epoch": 0.27830457231482025, - "grad_norm": 0.70703125, - "learning_rate": 1.9078387813392578e-05, - "loss": 1.3431, - "step": 886 - }, - { - "epoch": 0.27893279934036164, - "grad_norm": 0.796875, - "learning_rate": 1.9075848936845446e-05, - "loss": 1.3725, - "step": 888 - }, - { - "epoch": 0.27956102636590296, - "grad_norm": 0.6640625, - "learning_rate": 1.907331006029832e-05, - "loss": 1.2754, - "step": 890 - }, - { - "epoch": 0.28018925339144435, - "grad_norm": 0.99609375, - "learning_rate": 1.9070771183751192e-05, - "loss": 1.1762, - "step": 892 - }, - { - "epoch": 0.2808174804169857, - "grad_norm": 0.80859375, - "learning_rate": 1.9068232307204063e-05, - "loss": 1.301, - "step": 894 - }, - { - "epoch": 0.28144570744252706, - "grad_norm": 0.68359375, - "learning_rate": 1.9065693430656935e-05, - "loss": 1.2999, - "step": 896 - }, - { - "epoch": 0.2820739344680684, - "grad_norm": 0.76171875, - "learning_rate": 1.906315455410981e-05, - "loss": 1.355, - "step": 898 - }, - { - "epoch": 0.28270216149360977, - "grad_norm": 0.71875, - "learning_rate": 1.906061567756268e-05, - "loss": 1.4332, - "step": 900 - }, - { - "epoch": 0.2833303885191511, - "grad_norm": 0.96875, - "learning_rate": 1.9058076801015552e-05, - "loss": 1.4116, - "step": 902 - }, - { - "epoch": 0.2839586155446925, - "grad_norm": 0.8203125, - "learning_rate": 1.9055537924468423e-05, - "loss": 1.3064, - "step": 904 - }, - { - "epoch": 0.2845868425702338, - "grad_norm": 0.81640625, - "learning_rate": 1.9052999047921298e-05, - "loss": 1.5111, - "step": 906 - }, - { - "epoch": 0.2852150695957752, - "grad_norm": 0.8125, - "learning_rate": 1.9050460171374166e-05, - "loss": 1.2457, - "step": 908 - }, - { - "epoch": 0.2858432966213165, - "grad_norm": 0.78125, - "learning_rate": 1.904792129482704e-05, - "loss": 1.346, - "step": 910 - }, - { - "epoch": 0.2864715236468579, - "grad_norm": 0.75390625, - "learning_rate": 1.9045382418279912e-05, - "loss": 1.3722, - "step": 912 - }, - { - "epoch": 0.28709975067239923, - "grad_norm": 0.8203125, - "learning_rate": 1.9042843541732784e-05, - "loss": 1.3245, - "step": 914 - }, - { - "epoch": 0.2877279776979406, - "grad_norm": 0.87109375, - "learning_rate": 1.9040304665185655e-05, - "loss": 1.42, - "step": 916 - }, - { - "epoch": 0.28835620472348195, - "grad_norm": 0.83984375, - "learning_rate": 1.903776578863853e-05, - "loss": 1.405, - "step": 918 - }, - { - "epoch": 0.28898443174902333, - "grad_norm": 0.703125, - "learning_rate": 1.90352269120914e-05, - "loss": 1.3066, - "step": 920 - }, - { - "epoch": 0.28961265877456466, - "grad_norm": 0.8046875, - "learning_rate": 1.9032688035544273e-05, - "loss": 1.3226, - "step": 922 - }, - { - "epoch": 0.29024088580010604, - "grad_norm": 0.875, - "learning_rate": 1.9030149158997144e-05, - "loss": 1.1937, - "step": 924 - }, - { - "epoch": 0.29086911282564737, - "grad_norm": 0.78125, - "learning_rate": 1.902761028245002e-05, - "loss": 1.3474, - "step": 926 - }, - { - "epoch": 0.2914973398511887, - "grad_norm": 0.76171875, - "learning_rate": 1.9025071405902887e-05, - "loss": 1.3306, - "step": 928 - }, - { - "epoch": 0.2921255668767301, - "grad_norm": 0.89453125, - "learning_rate": 1.902253252935576e-05, - "loss": 1.3498, - "step": 930 - }, - { - "epoch": 0.2927537939022714, - "grad_norm": 0.92578125, - "learning_rate": 1.9019993652808633e-05, - "loss": 1.4435, - "step": 932 - }, - { - "epoch": 0.2933820209278128, - "grad_norm": 0.75, - "learning_rate": 1.9017454776261504e-05, - "loss": 1.3682, - "step": 934 - }, - { - "epoch": 0.2940102479533541, - "grad_norm": 0.8203125, - "learning_rate": 1.901491589971438e-05, - "loss": 1.3391, - "step": 936 - }, - { - "epoch": 0.2946384749788955, - "grad_norm": 0.7109375, - "learning_rate": 1.901237702316725e-05, - "loss": 1.5098, - "step": 938 - }, - { - "epoch": 0.29526670200443683, - "grad_norm": 0.7109375, - "learning_rate": 1.9009838146620122e-05, - "loss": 1.4432, - "step": 940 - }, - { - "epoch": 0.2958949290299782, - "grad_norm": 0.703125, - "learning_rate": 1.9007299270072993e-05, - "loss": 1.3789, - "step": 942 - }, - { - "epoch": 0.29652315605551954, - "grad_norm": 0.703125, - "learning_rate": 1.9004760393525868e-05, - "loss": 1.2943, - "step": 944 - }, - { - "epoch": 0.2971513830810609, - "grad_norm": 0.8671875, - "learning_rate": 1.900222151697874e-05, - "loss": 1.3753, - "step": 946 - }, - { - "epoch": 0.29777961010660225, - "grad_norm": 0.70703125, - "learning_rate": 1.899968264043161e-05, - "loss": 1.3704, - "step": 948 - }, - { - "epoch": 0.29840783713214364, - "grad_norm": 0.78515625, - "learning_rate": 1.8997143763884482e-05, - "loss": 1.4176, - "step": 950 - }, - { - "epoch": 0.29903606415768497, - "grad_norm": 0.7890625, - "learning_rate": 1.8994604887337357e-05, - "loss": 1.2448, - "step": 952 - }, - { - "epoch": 0.29966429118322635, - "grad_norm": 0.7421875, - "learning_rate": 1.8992066010790225e-05, - "loss": 1.2357, - "step": 954 - }, - { - "epoch": 0.3002925182087677, - "grad_norm": 0.72265625, - "learning_rate": 1.89895271342431e-05, - "loss": 1.4002, - "step": 956 - }, - { - "epoch": 0.30092074523430906, - "grad_norm": 0.796875, - "learning_rate": 1.898698825769597e-05, - "loss": 1.3756, - "step": 958 - }, - { - "epoch": 0.3015489722598504, - "grad_norm": 0.75390625, - "learning_rate": 1.8984449381148842e-05, - "loss": 1.2851, - "step": 960 - }, - { - "epoch": 0.3021771992853918, - "grad_norm": 0.79296875, - "learning_rate": 1.8981910504601714e-05, - "loss": 1.3339, - "step": 962 - }, - { - "epoch": 0.3028054263109331, - "grad_norm": 0.6953125, - "learning_rate": 1.897937162805459e-05, - "loss": 1.4284, - "step": 964 - }, - { - "epoch": 0.3034336533364745, - "grad_norm": 0.83203125, - "learning_rate": 1.897683275150746e-05, - "loss": 1.3142, - "step": 966 - }, - { - "epoch": 0.3040618803620158, - "grad_norm": 0.76953125, - "learning_rate": 1.897429387496033e-05, - "loss": 1.4217, - "step": 968 - }, - { - "epoch": 0.3046901073875572, - "grad_norm": 0.7890625, - "learning_rate": 1.8971754998413203e-05, - "loss": 1.4308, - "step": 970 - }, - { - "epoch": 0.3053183344130985, - "grad_norm": 0.75390625, - "learning_rate": 1.8969216121866078e-05, - "loss": 1.2463, - "step": 972 - }, - { - "epoch": 0.3059465614386399, - "grad_norm": 0.72265625, - "learning_rate": 1.896667724531895e-05, - "loss": 1.3149, - "step": 974 - }, - { - "epoch": 0.30657478846418124, - "grad_norm": 0.91796875, - "learning_rate": 1.896413836877182e-05, - "loss": 1.3623, - "step": 976 - }, - { - "epoch": 0.3072030154897226, - "grad_norm": 0.69921875, - "learning_rate": 1.8961599492224692e-05, - "loss": 1.5585, - "step": 978 - }, - { - "epoch": 0.30783124251526395, - "grad_norm": 0.71875, - "learning_rate": 1.8959060615677563e-05, - "loss": 1.2155, - "step": 980 - }, - { - "epoch": 0.30845946954080533, - "grad_norm": 0.703125, - "learning_rate": 1.8956521739130434e-05, - "loss": 1.384, - "step": 982 - }, - { - "epoch": 0.30908769656634666, - "grad_norm": 0.890625, - "learning_rate": 1.895398286258331e-05, - "loss": 1.2724, - "step": 984 - }, - { - "epoch": 0.30971592359188804, - "grad_norm": 0.75, - "learning_rate": 1.895144398603618e-05, - "loss": 1.3504, - "step": 986 - }, - { - "epoch": 0.31034415061742937, - "grad_norm": 0.7578125, - "learning_rate": 1.8948905109489052e-05, - "loss": 1.3144, - "step": 988 - }, - { - "epoch": 0.31097237764297075, - "grad_norm": 0.71484375, - "learning_rate": 1.8946366232941923e-05, - "loss": 1.3399, - "step": 990 - }, - { - "epoch": 0.3116006046685121, - "grad_norm": 0.796875, - "learning_rate": 1.8943827356394798e-05, - "loss": 1.3355, - "step": 992 - }, - { - "epoch": 0.31222883169405347, - "grad_norm": 0.78125, - "learning_rate": 1.894128847984767e-05, - "loss": 1.2823, - "step": 994 - }, - { - "epoch": 0.3128570587195948, - "grad_norm": 0.78125, - "learning_rate": 1.893874960330054e-05, - "loss": 1.4969, - "step": 996 - }, - { - "epoch": 0.3134852857451361, - "grad_norm": 0.7890625, - "learning_rate": 1.8936210726753412e-05, - "loss": 1.3046, - "step": 998 - }, - { - "epoch": 0.3141135127706775, - "grad_norm": 0.7109375, - "learning_rate": 1.8933671850206287e-05, - "loss": 1.4317, - "step": 1000 - }, - { - "epoch": 0.31474173979621883, - "grad_norm": 0.71875, - "learning_rate": 1.8931132973659155e-05, - "loss": 1.3786, - "step": 1002 - }, - { - "epoch": 0.3153699668217602, - "grad_norm": 0.73046875, - "learning_rate": 1.892859409711203e-05, - "loss": 1.3259, - "step": 1004 - }, - { - "epoch": 0.31599819384730155, - "grad_norm": 0.72265625, - "learning_rate": 1.89260552205649e-05, - "loss": 1.3619, - "step": 1006 - }, - { - "epoch": 0.31662642087284293, - "grad_norm": 0.7578125, - "learning_rate": 1.8923516344017773e-05, - "loss": 1.4299, - "step": 1008 - }, - { - "epoch": 0.31725464789838426, - "grad_norm": 0.78515625, - "learning_rate": 1.8920977467470644e-05, - "loss": 1.389, - "step": 1010 - }, - { - "epoch": 0.31788287492392564, - "grad_norm": 0.8046875, - "learning_rate": 1.891843859092352e-05, - "loss": 1.3459, - "step": 1012 - }, - { - "epoch": 0.31851110194946697, - "grad_norm": 0.765625, - "learning_rate": 1.891589971437639e-05, - "loss": 1.4309, - "step": 1014 - }, - { - "epoch": 0.31913932897500835, - "grad_norm": 0.76953125, - "learning_rate": 1.891336083782926e-05, - "loss": 1.3712, - "step": 1016 - }, - { - "epoch": 0.3197675560005497, - "grad_norm": 0.80859375, - "learning_rate": 1.8910821961282133e-05, - "loss": 1.3044, - "step": 1018 - }, - { - "epoch": 0.32039578302609106, - "grad_norm": 0.6796875, - "learning_rate": 1.8908283084735008e-05, - "loss": 1.3589, - "step": 1020 - }, - { - "epoch": 0.3210240100516324, - "grad_norm": 0.69140625, - "learning_rate": 1.890574420818788e-05, - "loss": 1.2593, - "step": 1022 - }, - { - "epoch": 0.3216522370771738, - "grad_norm": 0.87109375, - "learning_rate": 1.890320533164075e-05, - "loss": 1.3657, - "step": 1024 - }, - { - "epoch": 0.3222804641027151, - "grad_norm": 0.6796875, - "learning_rate": 1.8900666455093625e-05, - "loss": 1.2129, - "step": 1026 - }, - { - "epoch": 0.3229086911282565, - "grad_norm": 0.71875, - "learning_rate": 1.8898127578546493e-05, - "loss": 1.09, - "step": 1028 - }, - { - "epoch": 0.3235369181537978, - "grad_norm": 0.8671875, - "learning_rate": 1.8895588701999368e-05, - "loss": 1.3569, - "step": 1030 - }, - { - "epoch": 0.3241651451793392, - "grad_norm": 0.78515625, - "learning_rate": 1.889304982545224e-05, - "loss": 1.4419, - "step": 1032 - }, - { - "epoch": 0.3247933722048805, - "grad_norm": 0.7578125, - "learning_rate": 1.889051094890511e-05, - "loss": 1.3802, - "step": 1034 - }, - { - "epoch": 0.3254215992304219, - "grad_norm": 0.75390625, - "learning_rate": 1.8887972072357982e-05, - "loss": 1.312, - "step": 1036 - }, - { - "epoch": 0.32604982625596324, - "grad_norm": 0.74609375, - "learning_rate": 1.8885433195810857e-05, - "loss": 1.4378, - "step": 1038 - }, - { - "epoch": 0.3266780532815046, - "grad_norm": 0.83203125, - "learning_rate": 1.8882894319263728e-05, - "loss": 1.2541, - "step": 1040 - }, - { - "epoch": 0.32730628030704595, - "grad_norm": 0.7421875, - "learning_rate": 1.88803554427166e-05, - "loss": 1.3656, - "step": 1042 - }, - { - "epoch": 0.32793450733258733, - "grad_norm": 0.7578125, - "learning_rate": 1.887781656616947e-05, - "loss": 1.4039, - "step": 1044 - }, - { - "epoch": 0.32856273435812866, - "grad_norm": 0.72265625, - "learning_rate": 1.8875277689622346e-05, - "loss": 1.3563, - "step": 1046 - }, - { - "epoch": 0.32919096138367004, - "grad_norm": 0.88671875, - "learning_rate": 1.8872738813075214e-05, - "loss": 1.285, - "step": 1048 - }, - { - "epoch": 0.3298191884092114, - "grad_norm": 0.84375, - "learning_rate": 1.887019993652809e-05, - "loss": 1.2465, - "step": 1050 - }, - { - "epoch": 0.33044741543475276, - "grad_norm": 0.92578125, - "learning_rate": 1.886766105998096e-05, - "loss": 1.2184, - "step": 1052 - }, - { - "epoch": 0.3310756424602941, - "grad_norm": 0.69921875, - "learning_rate": 1.886512218343383e-05, - "loss": 1.3098, - "step": 1054 - }, - { - "epoch": 0.33170386948583547, - "grad_norm": 0.76171875, - "learning_rate": 1.8862583306886703e-05, - "loss": 1.318, - "step": 1056 - }, - { - "epoch": 0.3323320965113768, - "grad_norm": 0.91015625, - "learning_rate": 1.8860044430339577e-05, - "loss": 1.2984, - "step": 1058 - }, - { - "epoch": 0.3329603235369182, - "grad_norm": 0.78515625, - "learning_rate": 1.885750555379245e-05, - "loss": 1.4075, - "step": 1060 - }, - { - "epoch": 0.3335885505624595, - "grad_norm": 0.94140625, - "learning_rate": 1.885496667724532e-05, - "loss": 1.354, - "step": 1062 - }, - { - "epoch": 0.3342167775880009, - "grad_norm": 0.74609375, - "learning_rate": 1.885242780069819e-05, - "loss": 1.2434, - "step": 1064 - }, - { - "epoch": 0.3348450046135422, - "grad_norm": 0.8359375, - "learning_rate": 1.8849888924151066e-05, - "loss": 1.4308, - "step": 1066 - }, - { - "epoch": 0.33547323163908355, - "grad_norm": 0.8984375, - "learning_rate": 1.8847350047603938e-05, - "loss": 1.2561, - "step": 1068 - }, - { - "epoch": 0.33610145866462493, - "grad_norm": 0.875, - "learning_rate": 1.884481117105681e-05, - "loss": 1.4753, - "step": 1070 - }, - { - "epoch": 0.33672968569016626, - "grad_norm": 0.69921875, - "learning_rate": 1.884227229450968e-05, - "loss": 1.369, - "step": 1072 - }, - { - "epoch": 0.33735791271570764, - "grad_norm": 0.76171875, - "learning_rate": 1.8839733417962552e-05, - "loss": 1.4776, - "step": 1074 - }, - { - "epoch": 0.33798613974124897, - "grad_norm": 0.73046875, - "learning_rate": 1.8837194541415423e-05, - "loss": 1.3619, - "step": 1076 - }, - { - "epoch": 0.33861436676679035, - "grad_norm": 0.77734375, - "learning_rate": 1.8834655664868298e-05, - "loss": 1.2684, - "step": 1078 - }, - { - "epoch": 0.3392425937923317, - "grad_norm": 0.7421875, - "learning_rate": 1.883211678832117e-05, - "loss": 1.4172, - "step": 1080 - }, - { - "epoch": 0.33987082081787306, - "grad_norm": 0.890625, - "learning_rate": 1.882957791177404e-05, - "loss": 1.501, - "step": 1082 - }, - { - "epoch": 0.3404990478434144, - "grad_norm": 0.82421875, - "learning_rate": 1.8827039035226912e-05, - "loss": 1.4823, - "step": 1084 - }, - { - "epoch": 0.3411272748689558, - "grad_norm": 0.8828125, - "learning_rate": 1.8824500158679787e-05, - "loss": 1.2784, - "step": 1086 - }, - { - "epoch": 0.3417555018944971, - "grad_norm": 0.76171875, - "learning_rate": 1.882196128213266e-05, - "loss": 1.359, - "step": 1088 - }, - { - "epoch": 0.3423837289200385, - "grad_norm": 0.79296875, - "learning_rate": 1.881942240558553e-05, - "loss": 1.2725, - "step": 1090 - }, - { - "epoch": 0.3430119559455798, - "grad_norm": 0.76171875, - "learning_rate": 1.88168835290384e-05, - "loss": 1.2185, - "step": 1092 - }, - { - "epoch": 0.3436401829711212, - "grad_norm": 0.703125, - "learning_rate": 1.8814344652491276e-05, - "loss": 1.3709, - "step": 1094 - }, - { - "epoch": 0.3442684099966625, - "grad_norm": 0.79296875, - "learning_rate": 1.8811805775944144e-05, - "loss": 1.4139, - "step": 1096 - }, - { - "epoch": 0.3448966370222039, - "grad_norm": 0.69921875, - "learning_rate": 1.880926689939702e-05, - "loss": 1.5253, - "step": 1098 - }, - { - "epoch": 0.34552486404774524, - "grad_norm": 0.72265625, - "learning_rate": 1.880672802284989e-05, - "loss": 1.2929, - "step": 1100 - }, - { - "epoch": 0.3461530910732866, - "grad_norm": 0.90625, - "learning_rate": 1.880418914630276e-05, - "loss": 1.3314, - "step": 1102 - }, - { - "epoch": 0.34678131809882795, - "grad_norm": 0.70703125, - "learning_rate": 1.8801650269755633e-05, - "loss": 1.1409, - "step": 1104 - }, - { - "epoch": 0.34740954512436933, - "grad_norm": 0.765625, - "learning_rate": 1.8799111393208508e-05, - "loss": 1.4453, - "step": 1106 - }, - { - "epoch": 0.34803777214991066, - "grad_norm": 0.671875, - "learning_rate": 1.879657251666138e-05, - "loss": 1.3495, - "step": 1108 - }, - { - "epoch": 0.34866599917545205, - "grad_norm": 0.77734375, - "learning_rate": 1.879403364011425e-05, - "loss": 1.3406, - "step": 1110 - }, - { - "epoch": 0.3492942262009934, - "grad_norm": 0.85546875, - "learning_rate": 1.8791494763567125e-05, - "loss": 1.2358, - "step": 1112 - }, - { - "epoch": 0.34992245322653476, - "grad_norm": 0.83984375, - "learning_rate": 1.8788955887019997e-05, - "loss": 1.3972, - "step": 1114 - }, - { - "epoch": 0.3505506802520761, - "grad_norm": 0.72265625, - "learning_rate": 1.8786417010472868e-05, - "loss": 1.3597, - "step": 1116 - }, - { - "epoch": 0.35117890727761747, - "grad_norm": 0.66015625, - "learning_rate": 1.878387813392574e-05, - "loss": 1.3003, - "step": 1118 - }, - { - "epoch": 0.3518071343031588, - "grad_norm": 0.86328125, - "learning_rate": 1.8781339257378614e-05, - "loss": 1.2663, - "step": 1120 - }, - { - "epoch": 0.3524353613287002, - "grad_norm": 0.73828125, - "learning_rate": 1.8778800380831482e-05, - "loss": 1.4089, - "step": 1122 - }, - { - "epoch": 0.3530635883542415, - "grad_norm": 0.828125, - "learning_rate": 1.8776261504284357e-05, - "loss": 1.3793, - "step": 1124 - }, - { - "epoch": 0.3536918153797829, - "grad_norm": 0.796875, - "learning_rate": 1.8773722627737228e-05, - "loss": 1.4041, - "step": 1126 - }, - { - "epoch": 0.3543200424053242, - "grad_norm": 0.8046875, - "learning_rate": 1.87711837511901e-05, - "loss": 1.252, - "step": 1128 - }, - { - "epoch": 0.3549482694308656, - "grad_norm": 0.76953125, - "learning_rate": 1.876864487464297e-05, - "loss": 1.3771, - "step": 1130 - }, - { - "epoch": 0.35557649645640693, - "grad_norm": 0.86328125, - "learning_rate": 1.8766105998095846e-05, - "loss": 1.2952, - "step": 1132 - }, - { - "epoch": 0.3562047234819483, - "grad_norm": 0.7734375, - "learning_rate": 1.8763567121548717e-05, - "loss": 1.2377, - "step": 1134 - }, - { - "epoch": 0.35683295050748964, - "grad_norm": 0.78125, - "learning_rate": 1.876102824500159e-05, - "loss": 1.429, - "step": 1136 - }, - { - "epoch": 0.35746117753303097, - "grad_norm": 0.7734375, - "learning_rate": 1.875848936845446e-05, - "loss": 1.3617, - "step": 1138 - }, - { - "epoch": 0.35808940455857236, - "grad_norm": 0.7109375, - "learning_rate": 1.8755950491907335e-05, - "loss": 1.4136, - "step": 1140 - }, - { - "epoch": 0.3587176315841137, - "grad_norm": 0.80859375, - "learning_rate": 1.8753411615360203e-05, - "loss": 1.2859, - "step": 1142 - }, - { - "epoch": 0.35934585860965507, - "grad_norm": 0.6796875, - "learning_rate": 1.8750872738813077e-05, - "loss": 1.2145, - "step": 1144 - }, - { - "epoch": 0.3599740856351964, - "grad_norm": 0.70703125, - "learning_rate": 1.874833386226595e-05, - "loss": 1.294, - "step": 1146 - }, - { - "epoch": 0.3606023126607378, - "grad_norm": 0.8203125, - "learning_rate": 1.874579498571882e-05, - "loss": 1.1749, - "step": 1148 - }, - { - "epoch": 0.3612305396862791, - "grad_norm": 0.75, - "learning_rate": 1.874325610917169e-05, - "loss": 1.2759, - "step": 1150 - }, - { - "epoch": 0.3618587667118205, - "grad_norm": 0.76953125, - "learning_rate": 1.8740717232624566e-05, - "loss": 1.2798, - "step": 1152 - }, - { - "epoch": 0.3624869937373618, - "grad_norm": 0.83203125, - "learning_rate": 1.8738178356077438e-05, - "loss": 1.3493, - "step": 1154 - }, - { - "epoch": 0.3631152207629032, - "grad_norm": 0.76953125, - "learning_rate": 1.873563947953031e-05, - "loss": 1.4311, - "step": 1156 - }, - { - "epoch": 0.36374344778844453, - "grad_norm": 0.765625, - "learning_rate": 1.873310060298318e-05, - "loss": 1.2613, - "step": 1158 - }, - { - "epoch": 0.3643716748139859, - "grad_norm": 1.0078125, - "learning_rate": 1.8730561726436055e-05, - "loss": 1.3474, - "step": 1160 - }, - { - "epoch": 0.36499990183952724, - "grad_norm": 0.7109375, - "learning_rate": 1.8728022849888923e-05, - "loss": 1.4257, - "step": 1162 - }, - { - "epoch": 0.3656281288650686, - "grad_norm": 0.78125, - "learning_rate": 1.8725483973341798e-05, - "loss": 1.3411, - "step": 1164 - }, - { - "epoch": 0.36625635589060995, - "grad_norm": 0.734375, - "learning_rate": 1.872294509679467e-05, - "loss": 1.3309, - "step": 1166 - }, - { - "epoch": 0.36688458291615134, - "grad_norm": 0.8984375, - "learning_rate": 1.872040622024754e-05, - "loss": 1.4467, - "step": 1168 - }, - { - "epoch": 0.36751280994169266, - "grad_norm": 0.8515625, - "learning_rate": 1.8717867343700412e-05, - "loss": 1.2754, - "step": 1170 - }, - { - "epoch": 0.36814103696723405, - "grad_norm": 0.7890625, - "learning_rate": 1.8715328467153287e-05, - "loss": 1.4556, - "step": 1172 - }, - { - "epoch": 0.3687692639927754, - "grad_norm": 0.84375, - "learning_rate": 1.871278959060616e-05, - "loss": 1.3598, - "step": 1174 - }, - { - "epoch": 0.36939749101831676, - "grad_norm": 0.6875, - "learning_rate": 1.871025071405903e-05, - "loss": 1.2428, - "step": 1176 - }, - { - "epoch": 0.3700257180438581, - "grad_norm": 0.8046875, - "learning_rate": 1.87077118375119e-05, - "loss": 1.3761, - "step": 1178 - }, - { - "epoch": 0.37065394506939947, - "grad_norm": 0.78515625, - "learning_rate": 1.8705172960964776e-05, - "loss": 1.3929, - "step": 1180 - }, - { - "epoch": 0.3712821720949408, - "grad_norm": 0.8671875, - "learning_rate": 1.8702634084417647e-05, - "loss": 1.2633, - "step": 1182 - }, - { - "epoch": 0.3719103991204822, - "grad_norm": 0.828125, - "learning_rate": 1.870009520787052e-05, - "loss": 1.4286, - "step": 1184 - }, - { - "epoch": 0.3725386261460235, - "grad_norm": 0.7734375, - "learning_rate": 1.869755633132339e-05, - "loss": 1.2967, - "step": 1186 - }, - { - "epoch": 0.3731668531715649, - "grad_norm": 1.015625, - "learning_rate": 1.869501745477626e-05, - "loss": 1.3566, - "step": 1188 - }, - { - "epoch": 0.3737950801971062, - "grad_norm": 0.71875, - "learning_rate": 1.8692478578229133e-05, - "loss": 1.3837, - "step": 1190 - }, - { - "epoch": 0.3744233072226476, - "grad_norm": 0.9296875, - "learning_rate": 1.8689939701682008e-05, - "loss": 1.369, - "step": 1192 - }, - { - "epoch": 0.37505153424818893, - "grad_norm": 0.71484375, - "learning_rate": 1.868740082513488e-05, - "loss": 1.4206, - "step": 1194 - }, - { - "epoch": 0.3756797612737303, - "grad_norm": 0.75390625, - "learning_rate": 1.868486194858775e-05, - "loss": 1.3345, - "step": 1196 - }, - { - "epoch": 0.37630798829927165, - "grad_norm": 0.84765625, - "learning_rate": 1.8682323072040625e-05, - "loss": 1.3843, - "step": 1198 - }, - { - "epoch": 0.37693621532481303, - "grad_norm": 0.71484375, - "learning_rate": 1.8679784195493496e-05, - "loss": 1.4273, - "step": 1200 - }, - { - "epoch": 0.37756444235035436, - "grad_norm": 0.7734375, - "learning_rate": 1.8677245318946368e-05, - "loss": 1.3729, - "step": 1202 - }, - { - "epoch": 0.37819266937589574, - "grad_norm": 1.15625, - "learning_rate": 1.867470644239924e-05, - "loss": 1.1632, - "step": 1204 - }, - { - "epoch": 0.37882089640143707, - "grad_norm": 0.6796875, - "learning_rate": 1.8672167565852114e-05, - "loss": 1.3493, - "step": 1206 - }, - { - "epoch": 0.37944912342697845, - "grad_norm": 0.7578125, - "learning_rate": 1.8669628689304985e-05, - "loss": 1.3056, - "step": 1208 - }, - { - "epoch": 0.3800773504525198, - "grad_norm": 0.7265625, - "learning_rate": 1.8667089812757857e-05, - "loss": 1.414, - "step": 1210 - }, - { - "epoch": 0.3807055774780611, - "grad_norm": 0.8359375, - "learning_rate": 1.8664550936210728e-05, - "loss": 1.33, - "step": 1212 - }, - { - "epoch": 0.3813338045036025, - "grad_norm": 0.80859375, - "learning_rate": 1.86620120596636e-05, - "loss": 1.378, - "step": 1214 - }, - { - "epoch": 0.3819620315291438, - "grad_norm": 0.95703125, - "learning_rate": 1.865947318311647e-05, - "loss": 1.2628, - "step": 1216 - }, - { - "epoch": 0.3825902585546852, - "grad_norm": 0.73046875, - "learning_rate": 1.8656934306569346e-05, - "loss": 1.2875, - "step": 1218 - }, - { - "epoch": 0.38321848558022653, - "grad_norm": 0.78515625, - "learning_rate": 1.8654395430022217e-05, - "loss": 1.3463, - "step": 1220 - }, - { - "epoch": 0.3838467126057679, - "grad_norm": 0.80078125, - "learning_rate": 1.865185655347509e-05, - "loss": 1.3272, - "step": 1222 - }, - { - "epoch": 0.38447493963130924, - "grad_norm": 0.71484375, - "learning_rate": 1.864931767692796e-05, - "loss": 1.3908, - "step": 1224 - }, - { - "epoch": 0.3851031666568506, - "grad_norm": 0.6796875, - "learning_rate": 1.8646778800380835e-05, - "loss": 1.3235, - "step": 1226 - }, - { - "epoch": 0.38573139368239195, - "grad_norm": 0.74609375, - "learning_rate": 1.8644239923833706e-05, - "loss": 1.2354, - "step": 1228 - }, - { - "epoch": 0.38635962070793334, - "grad_norm": 0.88671875, - "learning_rate": 1.8641701047286577e-05, - "loss": 1.2592, - "step": 1230 - }, - { - "epoch": 0.38698784773347467, - "grad_norm": 0.7265625, - "learning_rate": 1.863916217073945e-05, - "loss": 1.3272, - "step": 1232 - }, - { - "epoch": 0.38761607475901605, - "grad_norm": 0.77734375, - "learning_rate": 1.8636623294192323e-05, - "loss": 1.2147, - "step": 1234 - }, - { - "epoch": 0.3882443017845574, - "grad_norm": 0.7734375, - "learning_rate": 1.863408441764519e-05, - "loss": 1.3168, - "step": 1236 - }, - { - "epoch": 0.38887252881009876, - "grad_norm": 0.73828125, - "learning_rate": 1.8631545541098066e-05, - "loss": 1.2581, - "step": 1238 - }, - { - "epoch": 0.3895007558356401, - "grad_norm": 0.84375, - "learning_rate": 1.8629006664550938e-05, - "loss": 1.404, - "step": 1240 - }, - { - "epoch": 0.3901289828611815, - "grad_norm": 0.79296875, - "learning_rate": 1.862646778800381e-05, - "loss": 1.3546, - "step": 1242 - }, - { - "epoch": 0.3907572098867228, - "grad_norm": 0.74609375, - "learning_rate": 1.862392891145668e-05, - "loss": 1.2896, - "step": 1244 - }, - { - "epoch": 0.3913854369122642, - "grad_norm": 0.74609375, - "learning_rate": 1.8621390034909555e-05, - "loss": 1.3196, - "step": 1246 - }, - { - "epoch": 0.3920136639378055, - "grad_norm": 0.72265625, - "learning_rate": 1.8618851158362427e-05, - "loss": 1.3084, - "step": 1248 - }, - { - "epoch": 0.3926418909633469, - "grad_norm": 0.75390625, - "learning_rate": 1.8616312281815298e-05, - "loss": 1.2459, - "step": 1250 - }, - { - "epoch": 0.3932701179888882, - "grad_norm": 0.73828125, - "learning_rate": 1.861377340526817e-05, - "loss": 1.3642, - "step": 1252 - }, - { - "epoch": 0.3938983450144296, - "grad_norm": 0.9140625, - "learning_rate": 1.8611234528721044e-05, - "loss": 1.2232, - "step": 1254 - }, - { - "epoch": 0.39452657203997094, - "grad_norm": 0.6875, - "learning_rate": 1.8608695652173912e-05, - "loss": 1.2384, - "step": 1256 - }, - { - "epoch": 0.3951547990655123, - "grad_norm": 0.6640625, - "learning_rate": 1.8606156775626787e-05, - "loss": 1.3031, - "step": 1258 - }, - { - "epoch": 0.39578302609105365, - "grad_norm": 0.67578125, - "learning_rate": 1.8603617899079658e-05, - "loss": 1.3142, - "step": 1260 - }, - { - "epoch": 0.39641125311659503, - "grad_norm": 0.875, - "learning_rate": 1.860107902253253e-05, - "loss": 1.2851, - "step": 1262 - }, - { - "epoch": 0.39703948014213636, - "grad_norm": 0.73828125, - "learning_rate": 1.85985401459854e-05, - "loss": 1.3063, - "step": 1264 - }, - { - "epoch": 0.39766770716767774, - "grad_norm": 0.7578125, - "learning_rate": 1.8596001269438276e-05, - "loss": 1.4062, - "step": 1266 - }, - { - "epoch": 0.39829593419321907, - "grad_norm": 0.78125, - "learning_rate": 1.8593462392891147e-05, - "loss": 1.2698, - "step": 1268 - }, - { - "epoch": 0.39892416121876045, - "grad_norm": 0.6796875, - "learning_rate": 1.859092351634402e-05, - "loss": 1.3242, - "step": 1270 - }, - { - "epoch": 0.3995523882443018, - "grad_norm": 0.70703125, - "learning_rate": 1.858838463979689e-05, - "loss": 1.3655, - "step": 1272 - }, - { - "epoch": 0.40018061526984317, - "grad_norm": 0.75, - "learning_rate": 1.8585845763249765e-05, - "loss": 1.259, - "step": 1274 - }, - { - "epoch": 0.4008088422953845, - "grad_norm": 0.8984375, - "learning_rate": 1.8583306886702636e-05, - "loss": 1.2373, - "step": 1276 - }, - { - "epoch": 0.4014370693209259, - "grad_norm": 0.75390625, - "learning_rate": 1.8580768010155507e-05, - "loss": 1.3231, - "step": 1278 - }, - { - "epoch": 0.4020652963464672, - "grad_norm": 0.7421875, - "learning_rate": 1.8578229133608382e-05, - "loss": 1.3715, - "step": 1280 - }, - { - "epoch": 0.40269352337200853, - "grad_norm": 0.91015625, - "learning_rate": 1.857569025706125e-05, - "loss": 1.4227, - "step": 1282 - }, - { - "epoch": 0.4033217503975499, - "grad_norm": 0.72265625, - "learning_rate": 1.8573151380514125e-05, - "loss": 1.4352, - "step": 1284 - }, - { - "epoch": 0.40394997742309124, - "grad_norm": 0.8359375, - "learning_rate": 1.8570612503966996e-05, - "loss": 1.3358, - "step": 1286 - }, - { - "epoch": 0.40457820444863263, - "grad_norm": 0.7734375, - "learning_rate": 1.8568073627419868e-05, - "loss": 1.3508, - "step": 1288 - }, - { - "epoch": 0.40520643147417396, - "grad_norm": 0.94921875, - "learning_rate": 1.856553475087274e-05, - "loss": 1.4527, - "step": 1290 - }, - { - "epoch": 0.40583465849971534, - "grad_norm": 0.68359375, - "learning_rate": 1.8562995874325614e-05, - "loss": 1.4456, - "step": 1292 - }, - { - "epoch": 0.40646288552525667, - "grad_norm": 0.90625, - "learning_rate": 1.8560456997778485e-05, - "loss": 1.3093, - "step": 1294 - }, - { - "epoch": 0.40709111255079805, - "grad_norm": 0.74609375, - "learning_rate": 1.8557918121231357e-05, - "loss": 1.4534, - "step": 1296 - }, - { - "epoch": 0.4077193395763394, - "grad_norm": 0.9609375, - "learning_rate": 1.8555379244684228e-05, - "loss": 1.2337, - "step": 1298 - }, - { - "epoch": 0.40834756660188076, - "grad_norm": 0.71875, - "learning_rate": 1.8552840368137103e-05, - "loss": 1.212, - "step": 1300 - }, - { - "epoch": 0.4089757936274221, - "grad_norm": 0.70703125, - "learning_rate": 1.8550301491589974e-05, - "loss": 1.3673, - "step": 1302 - }, - { - "epoch": 0.4096040206529635, - "grad_norm": 0.6875, - "learning_rate": 1.8547762615042846e-05, - "loss": 1.3345, - "step": 1304 - }, - { - "epoch": 0.4102322476785048, - "grad_norm": 0.70703125, - "learning_rate": 1.8545223738495717e-05, - "loss": 1.3542, - "step": 1306 - }, - { - "epoch": 0.4108604747040462, - "grad_norm": 0.828125, - "learning_rate": 1.854268486194859e-05, - "loss": 1.4953, - "step": 1308 - }, - { - "epoch": 0.4114887017295875, - "grad_norm": 0.7421875, - "learning_rate": 1.854014598540146e-05, - "loss": 1.4254, - "step": 1310 - }, - { - "epoch": 0.4121169287551289, - "grad_norm": 0.71875, - "learning_rate": 1.8537607108854335e-05, - "loss": 1.3089, - "step": 1312 - }, - { - "epoch": 0.4127451557806702, - "grad_norm": 0.73046875, - "learning_rate": 1.8535068232307206e-05, - "loss": 1.3985, - "step": 1314 - }, - { - "epoch": 0.4133733828062116, - "grad_norm": 0.828125, - "learning_rate": 1.8532529355760077e-05, - "loss": 1.45, - "step": 1316 - }, - { - "epoch": 0.41400160983175294, - "grad_norm": 0.71875, - "learning_rate": 1.852999047921295e-05, - "loss": 1.472, - "step": 1318 - }, - { - "epoch": 0.4146298368572943, - "grad_norm": 0.69140625, - "learning_rate": 1.8527451602665823e-05, - "loss": 1.4135, - "step": 1320 - }, - { - "epoch": 0.41525806388283565, - "grad_norm": 0.76171875, - "learning_rate": 1.8524912726118695e-05, - "loss": 1.2985, - "step": 1322 - }, - { - "epoch": 0.41588629090837703, - "grad_norm": 0.84375, - "learning_rate": 1.8522373849571566e-05, - "loss": 1.292, - "step": 1324 - }, - { - "epoch": 0.41651451793391836, - "grad_norm": 0.73046875, - "learning_rate": 1.8519834973024438e-05, - "loss": 1.3459, - "step": 1326 - }, - { - "epoch": 0.41714274495945974, - "grad_norm": 0.72265625, - "learning_rate": 1.8517296096477312e-05, - "loss": 1.3259, - "step": 1328 - }, - { - "epoch": 0.41777097198500107, - "grad_norm": 0.70703125, - "learning_rate": 1.851475721993018e-05, - "loss": 1.3027, - "step": 1330 - }, - { - "epoch": 0.41839919901054246, - "grad_norm": 0.671875, - "learning_rate": 1.8512218343383055e-05, - "loss": 1.3385, - "step": 1332 - }, - { - "epoch": 0.4190274260360838, - "grad_norm": 0.7109375, - "learning_rate": 1.8509679466835926e-05, - "loss": 1.3775, - "step": 1334 - }, - { - "epoch": 0.41965565306162517, - "grad_norm": 0.79296875, - "learning_rate": 1.8507140590288798e-05, - "loss": 1.1561, - "step": 1336 - }, - { - "epoch": 0.4202838800871665, - "grad_norm": 0.8125, - "learning_rate": 1.850460171374167e-05, - "loss": 1.2644, - "step": 1338 - }, - { - "epoch": 0.4209121071127079, - "grad_norm": 0.72265625, - "learning_rate": 1.8502062837194544e-05, - "loss": 1.3686, - "step": 1340 - }, - { - "epoch": 0.4215403341382492, - "grad_norm": 0.79296875, - "learning_rate": 1.8499523960647415e-05, - "loss": 1.4161, - "step": 1342 - }, - { - "epoch": 0.4221685611637906, - "grad_norm": 0.796875, - "learning_rate": 1.8496985084100287e-05, - "loss": 1.3431, - "step": 1344 - }, - { - "epoch": 0.4227967881893319, - "grad_norm": 0.80859375, - "learning_rate": 1.8494446207553158e-05, - "loss": 1.3203, - "step": 1346 - }, - { - "epoch": 0.4234250152148733, - "grad_norm": 0.8359375, - "learning_rate": 1.8491907331006033e-05, - "loss": 1.3866, - "step": 1348 - }, - { - "epoch": 0.42405324224041463, - "grad_norm": 0.73828125, - "learning_rate": 1.84893684544589e-05, - "loss": 1.307, - "step": 1350 - }, - { - "epoch": 0.42468146926595596, - "grad_norm": 0.75390625, - "learning_rate": 1.8486829577911776e-05, - "loss": 1.3054, - "step": 1352 - }, - { - "epoch": 0.42530969629149734, - "grad_norm": 0.73828125, - "learning_rate": 1.8484290701364647e-05, - "loss": 1.263, - "step": 1354 - }, - { - "epoch": 0.42593792331703867, - "grad_norm": 0.7421875, - "learning_rate": 1.848175182481752e-05, - "loss": 1.2961, - "step": 1356 - }, - { - "epoch": 0.42656615034258005, - "grad_norm": 0.70703125, - "learning_rate": 1.847921294827039e-05, - "loss": 1.386, - "step": 1358 - }, - { - "epoch": 0.4271943773681214, - "grad_norm": 0.79296875, - "learning_rate": 1.8476674071723265e-05, - "loss": 1.2587, - "step": 1360 - }, - { - "epoch": 0.42782260439366276, - "grad_norm": 0.80078125, - "learning_rate": 1.8474135195176136e-05, - "loss": 1.3613, - "step": 1362 - }, - { - "epoch": 0.4284508314192041, - "grad_norm": 0.734375, - "learning_rate": 1.8471596318629007e-05, - "loss": 1.4578, - "step": 1364 - }, - { - "epoch": 0.4290790584447455, - "grad_norm": 0.75, - "learning_rate": 1.8469057442081882e-05, - "loss": 1.4915, - "step": 1366 - }, - { - "epoch": 0.4297072854702868, - "grad_norm": 0.984375, - "learning_rate": 1.8466518565534754e-05, - "loss": 1.2513, - "step": 1368 - }, - { - "epoch": 0.4303355124958282, - "grad_norm": 0.78125, - "learning_rate": 1.8463979688987625e-05, - "loss": 1.3317, - "step": 1370 - }, - { - "epoch": 0.4309637395213695, - "grad_norm": 0.76171875, - "learning_rate": 1.8461440812440496e-05, - "loss": 1.3281, - "step": 1372 - }, - { - "epoch": 0.4315919665469109, - "grad_norm": 0.89453125, - "learning_rate": 1.845890193589337e-05, - "loss": 1.2836, - "step": 1374 - }, - { - "epoch": 0.4322201935724522, - "grad_norm": 0.96875, - "learning_rate": 1.845636305934624e-05, - "loss": 1.3258, - "step": 1376 - }, - { - "epoch": 0.4328484205979936, - "grad_norm": 0.703125, - "learning_rate": 1.8453824182799114e-05, - "loss": 1.3192, - "step": 1378 - }, - { - "epoch": 0.43347664762353494, - "grad_norm": 0.7890625, - "learning_rate": 1.8451285306251985e-05, - "loss": 1.2383, - "step": 1380 - }, - { - "epoch": 0.4341048746490763, - "grad_norm": 0.6953125, - "learning_rate": 1.8448746429704857e-05, - "loss": 1.4198, - "step": 1382 - }, - { - "epoch": 0.43473310167461765, - "grad_norm": 0.84375, - "learning_rate": 1.8446207553157728e-05, - "loss": 1.3262, - "step": 1384 - }, - { - "epoch": 0.43536132870015903, - "grad_norm": 0.90234375, - "learning_rate": 1.8443668676610603e-05, - "loss": 1.3783, - "step": 1386 - }, - { - "epoch": 0.43598955572570036, - "grad_norm": 0.8046875, - "learning_rate": 1.8441129800063474e-05, - "loss": 1.3803, - "step": 1388 - }, - { - "epoch": 0.43661778275124175, - "grad_norm": 0.8359375, - "learning_rate": 1.8438590923516346e-05, - "loss": 1.2537, - "step": 1390 - }, - { - "epoch": 0.4372460097767831, - "grad_norm": 0.74609375, - "learning_rate": 1.8436052046969217e-05, - "loss": 1.4251, - "step": 1392 - }, - { - "epoch": 0.43787423680232446, - "grad_norm": 0.80078125, - "learning_rate": 1.843351317042209e-05, - "loss": 1.3708, - "step": 1394 - }, - { - "epoch": 0.4385024638278658, - "grad_norm": 0.81640625, - "learning_rate": 1.8430974293874963e-05, - "loss": 1.3983, - "step": 1396 - }, - { - "epoch": 0.43913069085340717, - "grad_norm": 0.703125, - "learning_rate": 1.8428435417327834e-05, - "loss": 1.3208, - "step": 1398 - }, - { - "epoch": 0.4397589178789485, - "grad_norm": 0.6484375, - "learning_rate": 1.8425896540780706e-05, - "loss": 1.2447, - "step": 1400 - }, - { - "epoch": 0.4403871449044899, - "grad_norm": 0.7265625, - "learning_rate": 1.8423357664233577e-05, - "loss": 1.4995, - "step": 1402 - }, - { - "epoch": 0.4410153719300312, - "grad_norm": 0.69140625, - "learning_rate": 1.842081878768645e-05, - "loss": 1.2333, - "step": 1404 - }, - { - "epoch": 0.4416435989555726, - "grad_norm": 0.72265625, - "learning_rate": 1.8418279911139323e-05, - "loss": 1.438, - "step": 1406 - }, - { - "epoch": 0.4422718259811139, - "grad_norm": 0.6796875, - "learning_rate": 1.8415741034592195e-05, - "loss": 1.3648, - "step": 1408 - }, - { - "epoch": 0.4429000530066553, - "grad_norm": 0.87890625, - "learning_rate": 1.8413202158045066e-05, - "loss": 1.3982, - "step": 1410 - }, - { - "epoch": 0.44352828003219663, - "grad_norm": 0.7734375, - "learning_rate": 1.8410663281497937e-05, - "loss": 1.2714, - "step": 1412 - }, - { - "epoch": 0.444156507057738, - "grad_norm": 0.66015625, - "learning_rate": 1.8408124404950812e-05, - "loss": 1.3464, - "step": 1414 - }, - { - "epoch": 0.44478473408327934, - "grad_norm": 0.671875, - "learning_rate": 1.8405585528403684e-05, - "loss": 1.3379, - "step": 1416 - }, - { - "epoch": 0.4454129611088207, - "grad_norm": 0.73046875, - "learning_rate": 1.8403046651856555e-05, - "loss": 1.3022, - "step": 1418 - }, - { - "epoch": 0.44604118813436205, - "grad_norm": 0.765625, - "learning_rate": 1.8400507775309426e-05, - "loss": 1.3677, - "step": 1420 - }, - { - "epoch": 0.4466694151599034, - "grad_norm": 0.6796875, - "learning_rate": 1.83979688987623e-05, - "loss": 1.3101, - "step": 1422 - }, - { - "epoch": 0.44729764218544477, - "grad_norm": 0.94140625, - "learning_rate": 1.839543002221517e-05, - "loss": 1.2118, - "step": 1424 - }, - { - "epoch": 0.4479258692109861, - "grad_norm": 2.84375, - "learning_rate": 1.8392891145668044e-05, - "loss": 1.2927, - "step": 1426 - }, - { - "epoch": 0.4485540962365275, - "grad_norm": 0.88671875, - "learning_rate": 1.8390352269120915e-05, - "loss": 1.4683, - "step": 1428 - }, - { - "epoch": 0.4491823232620688, - "grad_norm": 0.75, - "learning_rate": 1.8387813392573787e-05, - "loss": 1.2949, - "step": 1430 - }, - { - "epoch": 0.4498105502876102, - "grad_norm": 0.75, - "learning_rate": 1.8385274516026658e-05, - "loss": 1.3789, - "step": 1432 - }, - { - "epoch": 0.4504387773131515, - "grad_norm": 0.7265625, - "learning_rate": 1.8382735639479533e-05, - "loss": 1.3308, - "step": 1434 - }, - { - "epoch": 0.4510670043386929, - "grad_norm": 0.78125, - "learning_rate": 1.8380196762932404e-05, - "loss": 1.3221, - "step": 1436 - }, - { - "epoch": 0.45169523136423423, - "grad_norm": 0.703125, - "learning_rate": 1.8377657886385276e-05, - "loss": 1.353, - "step": 1438 - }, - { - "epoch": 0.4523234583897756, - "grad_norm": 0.84765625, - "learning_rate": 1.8375119009838147e-05, - "loss": 1.2386, - "step": 1440 - }, - { - "epoch": 0.45295168541531694, - "grad_norm": 0.70703125, - "learning_rate": 1.8372580133291022e-05, - "loss": 1.5192, - "step": 1442 - }, - { - "epoch": 0.4535799124408583, - "grad_norm": 0.7890625, - "learning_rate": 1.837004125674389e-05, - "loss": 1.4076, - "step": 1444 - }, - { - "epoch": 0.45420813946639965, - "grad_norm": 0.6953125, - "learning_rate": 1.8367502380196765e-05, - "loss": 1.4394, - "step": 1446 - }, - { - "epoch": 0.45483636649194104, - "grad_norm": 0.96484375, - "learning_rate": 1.836496350364964e-05, - "loss": 1.3238, - "step": 1448 - }, - { - "epoch": 0.45546459351748236, - "grad_norm": 0.75390625, - "learning_rate": 1.8362424627102507e-05, - "loss": 1.2685, - "step": 1450 - }, - { - "epoch": 0.45609282054302375, - "grad_norm": 0.7890625, - "learning_rate": 1.8359885750555382e-05, - "loss": 1.3883, - "step": 1452 - }, - { - "epoch": 0.4567210475685651, - "grad_norm": 0.71484375, - "learning_rate": 1.8357346874008253e-05, - "loss": 1.3735, - "step": 1454 - }, - { - "epoch": 0.45734927459410646, - "grad_norm": 0.7265625, - "learning_rate": 1.8354807997461125e-05, - "loss": 1.4432, - "step": 1456 - }, - { - "epoch": 0.4579775016196478, - "grad_norm": 0.8046875, - "learning_rate": 1.8352269120913996e-05, - "loss": 1.3395, - "step": 1458 - }, - { - "epoch": 0.45860572864518917, - "grad_norm": 0.78515625, - "learning_rate": 1.834973024436687e-05, - "loss": 1.2355, - "step": 1460 - }, - { - "epoch": 0.4592339556707305, - "grad_norm": 0.703125, - "learning_rate": 1.8347191367819742e-05, - "loss": 1.4257, - "step": 1462 - }, - { - "epoch": 0.4598621826962719, - "grad_norm": 0.78515625, - "learning_rate": 1.8344652491272614e-05, - "loss": 1.4014, - "step": 1464 - }, - { - "epoch": 0.4604904097218132, - "grad_norm": 0.66015625, - "learning_rate": 1.8342113614725485e-05, - "loss": 1.4452, - "step": 1466 - }, - { - "epoch": 0.4611186367473546, - "grad_norm": 0.7578125, - "learning_rate": 1.833957473817836e-05, - "loss": 1.2609, - "step": 1468 - }, - { - "epoch": 0.4617468637728959, - "grad_norm": 3.109375, - "learning_rate": 1.8337035861631228e-05, - "loss": 1.3392, - "step": 1470 - }, - { - "epoch": 0.4623750907984373, - "grad_norm": 0.8359375, - "learning_rate": 1.8334496985084103e-05, - "loss": 1.4992, - "step": 1472 - }, - { - "epoch": 0.46300331782397863, - "grad_norm": 0.71875, - "learning_rate": 1.8331958108536974e-05, - "loss": 1.3606, - "step": 1474 - }, - { - "epoch": 0.46363154484952, - "grad_norm": 0.73046875, - "learning_rate": 1.8329419231989845e-05, - "loss": 1.4007, - "step": 1476 - }, - { - "epoch": 0.46425977187506134, - "grad_norm": 0.71484375, - "learning_rate": 1.8326880355442717e-05, - "loss": 1.3096, - "step": 1478 - }, - { - "epoch": 0.46488799890060273, - "grad_norm": 0.75, - "learning_rate": 1.832434147889559e-05, - "loss": 1.3873, - "step": 1480 - }, - { - "epoch": 0.46551622592614406, - "grad_norm": 0.75390625, - "learning_rate": 1.8321802602348463e-05, - "loss": 1.3962, - "step": 1482 - }, - { - "epoch": 0.46614445295168544, - "grad_norm": 1.1171875, - "learning_rate": 1.8319263725801334e-05, - "loss": 1.3512, - "step": 1484 - }, - { - "epoch": 0.46677267997722677, - "grad_norm": 0.671875, - "learning_rate": 1.8316724849254206e-05, - "loss": 1.3546, - "step": 1486 - }, - { - "epoch": 0.46740090700276815, - "grad_norm": 0.76953125, - "learning_rate": 1.831418597270708e-05, - "loss": 1.3739, - "step": 1488 - }, - { - "epoch": 0.4680291340283095, - "grad_norm": 0.71875, - "learning_rate": 1.831164709615995e-05, - "loss": 1.3045, - "step": 1490 - }, - { - "epoch": 0.46865736105385086, - "grad_norm": 0.72265625, - "learning_rate": 1.8309108219612823e-05, - "loss": 1.385, - "step": 1492 - }, - { - "epoch": 0.4692855880793922, - "grad_norm": 0.765625, - "learning_rate": 1.8306569343065695e-05, - "loss": 1.282, - "step": 1494 - }, - { - "epoch": 0.4699138151049335, - "grad_norm": 0.671875, - "learning_rate": 1.8304030466518566e-05, - "loss": 1.5008, - "step": 1496 - }, - { - "epoch": 0.4705420421304749, - "grad_norm": 0.78515625, - "learning_rate": 1.8301491589971437e-05, - "loss": 1.422, - "step": 1498 - }, - { - "epoch": 0.47117026915601623, - "grad_norm": 0.70703125, - "learning_rate": 1.8298952713424312e-05, - "loss": 1.3419, - "step": 1500 - }, - { - "epoch": 0.4717984961815576, - "grad_norm": 0.72265625, - "learning_rate": 1.8296413836877184e-05, - "loss": 1.3878, - "step": 1502 - }, - { - "epoch": 0.47242672320709894, - "grad_norm": 0.7109375, - "learning_rate": 1.8293874960330055e-05, - "loss": 1.4378, - "step": 1504 - }, - { - "epoch": 0.4730549502326403, - "grad_norm": 0.69140625, - "learning_rate": 1.8291336083782926e-05, - "loss": 1.4115, - "step": 1506 - }, - { - "epoch": 0.47368317725818165, - "grad_norm": 0.75, - "learning_rate": 1.82887972072358e-05, - "loss": 1.2909, - "step": 1508 - }, - { - "epoch": 0.47431140428372304, - "grad_norm": 0.74609375, - "learning_rate": 1.8286258330688672e-05, - "loss": 1.3813, - "step": 1510 - }, - { - "epoch": 0.47493963130926437, - "grad_norm": 0.7890625, - "learning_rate": 1.8283719454141544e-05, - "loss": 1.3018, - "step": 1512 - }, - { - "epoch": 0.47556785833480575, - "grad_norm": 0.7109375, - "learning_rate": 1.8281180577594415e-05, - "loss": 1.228, - "step": 1514 - }, - { - "epoch": 0.4761960853603471, - "grad_norm": 0.65625, - "learning_rate": 1.8278641701047287e-05, - "loss": 1.3985, - "step": 1516 - }, - { - "epoch": 0.47682431238588846, - "grad_norm": 0.67578125, - "learning_rate": 1.8276102824500158e-05, - "loss": 1.4065, - "step": 1518 - }, - { - "epoch": 0.4774525394114298, - "grad_norm": 0.7421875, - "learning_rate": 1.8273563947953033e-05, - "loss": 1.34, - "step": 1520 - }, - { - "epoch": 0.47808076643697117, - "grad_norm": 0.73046875, - "learning_rate": 1.8271025071405904e-05, - "loss": 1.3451, - "step": 1522 - }, - { - "epoch": 0.4787089934625125, - "grad_norm": 0.75, - "learning_rate": 1.8268486194858776e-05, - "loss": 1.3477, - "step": 1524 - }, - { - "epoch": 0.4793372204880539, - "grad_norm": 0.734375, - "learning_rate": 1.8265947318311647e-05, - "loss": 1.3247, - "step": 1526 - }, - { - "epoch": 0.4799654475135952, - "grad_norm": 0.73046875, - "learning_rate": 1.8263408441764522e-05, - "loss": 1.21, - "step": 1528 - }, - { - "epoch": 0.4805936745391366, - "grad_norm": 0.71875, - "learning_rate": 1.8260869565217393e-05, - "loss": 1.3398, - "step": 1530 - }, - { - "epoch": 0.4812219015646779, - "grad_norm": 0.734375, - "learning_rate": 1.8258330688670264e-05, - "loss": 1.3262, - "step": 1532 - }, - { - "epoch": 0.4818501285902193, - "grad_norm": 0.75, - "learning_rate": 1.825579181212314e-05, - "loss": 1.4908, - "step": 1534 - }, - { - "epoch": 0.48247835561576063, - "grad_norm": 0.7890625, - "learning_rate": 1.825325293557601e-05, - "loss": 1.3113, - "step": 1536 - }, - { - "epoch": 0.483106582641302, - "grad_norm": 0.6640625, - "learning_rate": 1.8250714059028882e-05, - "loss": 1.2718, - "step": 1538 - }, - { - "epoch": 0.48373480966684335, - "grad_norm": 0.7265625, - "learning_rate": 1.8248175182481753e-05, - "loss": 1.406, - "step": 1540 - }, - { - "epoch": 0.48436303669238473, - "grad_norm": 0.6953125, - "learning_rate": 1.8245636305934625e-05, - "loss": 1.3577, - "step": 1542 - }, - { - "epoch": 0.48499126371792606, - "grad_norm": 0.7421875, - "learning_rate": 1.8243097429387496e-05, - "loss": 1.3054, - "step": 1544 - }, - { - "epoch": 0.48561949074346744, - "grad_norm": 0.78515625, - "learning_rate": 1.824055855284037e-05, - "loss": 1.3842, - "step": 1546 - }, - { - "epoch": 0.48624771776900877, - "grad_norm": 0.76953125, - "learning_rate": 1.8238019676293242e-05, - "loss": 1.3789, - "step": 1548 - }, - { - "epoch": 0.48687594479455015, - "grad_norm": 0.65625, - "learning_rate": 1.8235480799746114e-05, - "loss": 1.3439, - "step": 1550 - }, - { - "epoch": 0.4875041718200915, - "grad_norm": 0.69921875, - "learning_rate": 1.8232941923198985e-05, - "loss": 1.3715, - "step": 1552 - }, - { - "epoch": 0.48813239884563286, - "grad_norm": 0.70703125, - "learning_rate": 1.823040304665186e-05, - "loss": 1.4506, - "step": 1554 - }, - { - "epoch": 0.4887606258711742, - "grad_norm": 0.69140625, - "learning_rate": 1.822786417010473e-05, - "loss": 1.4064, - "step": 1556 - }, - { - "epoch": 0.4893888528967156, - "grad_norm": 0.7578125, - "learning_rate": 1.8225325293557603e-05, - "loss": 1.3322, - "step": 1558 - }, - { - "epoch": 0.4900170799222569, - "grad_norm": 0.76953125, - "learning_rate": 1.8222786417010474e-05, - "loss": 1.3075, - "step": 1560 - }, - { - "epoch": 0.4906453069477983, - "grad_norm": 0.703125, - "learning_rate": 1.822024754046335e-05, - "loss": 1.4187, - "step": 1562 - }, - { - "epoch": 0.4912735339733396, - "grad_norm": 0.8984375, - "learning_rate": 1.8217708663916217e-05, - "loss": 1.3365, - "step": 1564 - }, - { - "epoch": 0.49190176099888094, - "grad_norm": 0.7578125, - "learning_rate": 1.821516978736909e-05, - "loss": 1.3593, - "step": 1566 - }, - { - "epoch": 0.4925299880244223, - "grad_norm": 0.7421875, - "learning_rate": 1.8212630910821963e-05, - "loss": 1.298, - "step": 1568 - }, - { - "epoch": 0.49315821504996366, - "grad_norm": 0.71875, - "learning_rate": 1.8210092034274834e-05, - "loss": 1.4256, - "step": 1570 - }, - { - "epoch": 0.49378644207550504, - "grad_norm": 0.7890625, - "learning_rate": 1.8207553157727706e-05, - "loss": 1.4808, - "step": 1572 - }, - { - "epoch": 0.49441466910104637, - "grad_norm": 0.875, - "learning_rate": 1.820501428118058e-05, - "loss": 1.4233, - "step": 1574 - }, - { - "epoch": 0.49504289612658775, - "grad_norm": 0.7890625, - "learning_rate": 1.8202475404633452e-05, - "loss": 1.3273, - "step": 1576 - }, - { - "epoch": 0.4956711231521291, - "grad_norm": 0.73828125, - "learning_rate": 1.8199936528086323e-05, - "loss": 1.3384, - "step": 1578 - }, - { - "epoch": 0.49629935017767046, - "grad_norm": 0.7421875, - "learning_rate": 1.8197397651539195e-05, - "loss": 1.4455, - "step": 1580 - }, - { - "epoch": 0.4969275772032118, - "grad_norm": 0.7421875, - "learning_rate": 1.819485877499207e-05, - "loss": 1.3235, - "step": 1582 - }, - { - "epoch": 0.4975558042287532, - "grad_norm": 0.73828125, - "learning_rate": 1.8192319898444937e-05, - "loss": 1.3602, - "step": 1584 - }, - { - "epoch": 0.4981840312542945, - "grad_norm": 0.890625, - "learning_rate": 1.8189781021897812e-05, - "loss": 1.2892, - "step": 1586 - }, - { - "epoch": 0.4988122582798359, - "grad_norm": 0.85546875, - "learning_rate": 1.8187242145350684e-05, - "loss": 1.2329, - "step": 1588 - }, - { - "epoch": 0.4994404853053772, - "grad_norm": 0.68359375, - "learning_rate": 1.8184703268803555e-05, - "loss": 1.2801, - "step": 1590 - }, - { - "epoch": 0.5000687123309185, - "grad_norm": 0.76171875, - "learning_rate": 1.8182164392256426e-05, - "loss": 1.3857, - "step": 1592 - }, - { - "epoch": 0.50069693935646, - "grad_norm": 0.8671875, - "learning_rate": 1.81796255157093e-05, - "loss": 1.3113, - "step": 1594 - }, - { - "epoch": 0.5013251663820013, - "grad_norm": 0.78125, - "learning_rate": 1.8177086639162172e-05, - "loss": 1.4523, - "step": 1596 - }, - { - "epoch": 0.5019533934075426, - "grad_norm": 0.77734375, - "learning_rate": 1.8174547762615044e-05, - "loss": 1.3314, - "step": 1598 - }, - { - "epoch": 0.502581620433084, - "grad_norm": 0.796875, - "learning_rate": 1.8172008886067915e-05, - "loss": 1.3618, - "step": 1600 - }, - { - "epoch": 0.5032098474586254, - "grad_norm": 0.8203125, - "learning_rate": 1.816947000952079e-05, - "loss": 1.3553, - "step": 1602 - }, - { - "epoch": 0.5038380744841667, - "grad_norm": 0.671875, - "learning_rate": 1.816693113297366e-05, - "loss": 1.4201, - "step": 1604 - }, - { - "epoch": 0.5044663015097081, - "grad_norm": 0.8125, - "learning_rate": 1.8164392256426533e-05, - "loss": 1.309, - "step": 1606 - }, - { - "epoch": 0.5050945285352494, - "grad_norm": 0.7734375, - "learning_rate": 1.8161853379879404e-05, - "loss": 1.3145, - "step": 1608 - }, - { - "epoch": 0.5057227555607908, - "grad_norm": 0.87890625, - "learning_rate": 1.8159314503332275e-05, - "loss": 1.3546, - "step": 1610 - }, - { - "epoch": 0.5063509825863322, - "grad_norm": 0.7109375, - "learning_rate": 1.8156775626785147e-05, - "loss": 1.2818, - "step": 1612 - }, - { - "epoch": 0.5069792096118735, - "grad_norm": 0.796875, - "learning_rate": 1.815423675023802e-05, - "loss": 1.4176, - "step": 1614 - }, - { - "epoch": 0.5076074366374148, - "grad_norm": 0.734375, - "learning_rate": 1.8151697873690893e-05, - "loss": 1.3501, - "step": 1616 - }, - { - "epoch": 0.5082356636629562, - "grad_norm": 0.74609375, - "learning_rate": 1.8149158997143764e-05, - "loss": 1.3265, - "step": 1618 - }, - { - "epoch": 0.5088638906884976, - "grad_norm": 0.79296875, - "learning_rate": 1.814662012059664e-05, - "loss": 1.333, - "step": 1620 - }, - { - "epoch": 0.5094921177140389, - "grad_norm": 0.7265625, - "learning_rate": 1.814408124404951e-05, - "loss": 1.2086, - "step": 1622 - }, - { - "epoch": 0.5101203447395802, - "grad_norm": 0.8046875, - "learning_rate": 1.8141542367502382e-05, - "loss": 1.2181, - "step": 1624 - }, - { - "epoch": 0.5107485717651217, - "grad_norm": 0.72265625, - "learning_rate": 1.8139003490955253e-05, - "loss": 1.3269, - "step": 1626 - }, - { - "epoch": 0.511376798790663, - "grad_norm": 0.67578125, - "learning_rate": 1.8136464614408128e-05, - "loss": 1.2733, - "step": 1628 - }, - { - "epoch": 0.5120050258162043, - "grad_norm": 0.69921875, - "learning_rate": 1.8133925737861e-05, - "loss": 1.253, - "step": 1630 - }, - { - "epoch": 0.5126332528417457, - "grad_norm": 0.71484375, - "learning_rate": 1.813138686131387e-05, - "loss": 1.511, - "step": 1632 - }, - { - "epoch": 0.5132614798672871, - "grad_norm": 0.671875, - "learning_rate": 1.8128847984766742e-05, - "loss": 1.2451, - "step": 1634 - }, - { - "epoch": 0.5138897068928284, - "grad_norm": 0.66015625, - "learning_rate": 1.8126309108219614e-05, - "loss": 1.2587, - "step": 1636 - }, - { - "epoch": 0.5145179339183698, - "grad_norm": 0.875, - "learning_rate": 1.8123770231672485e-05, - "loss": 1.301, - "step": 1638 - }, - { - "epoch": 0.5151461609439111, - "grad_norm": 0.8046875, - "learning_rate": 1.812123135512536e-05, - "loss": 1.4174, - "step": 1640 - }, - { - "epoch": 0.5157743879694524, - "grad_norm": 0.7265625, - "learning_rate": 1.811869247857823e-05, - "loss": 1.2725, - "step": 1642 - }, - { - "epoch": 0.5164026149949938, - "grad_norm": 0.81640625, - "learning_rate": 1.8116153602031103e-05, - "loss": 1.3744, - "step": 1644 - }, - { - "epoch": 0.5170308420205352, - "grad_norm": 0.734375, - "learning_rate": 1.8113614725483974e-05, - "loss": 1.2455, - "step": 1646 - }, - { - "epoch": 0.5176590690460765, - "grad_norm": 0.68359375, - "learning_rate": 1.811107584893685e-05, - "loss": 1.4318, - "step": 1648 - }, - { - "epoch": 0.5182872960716178, - "grad_norm": 0.80859375, - "learning_rate": 1.810853697238972e-05, - "loss": 1.3426, - "step": 1650 - }, - { - "epoch": 0.5189155230971593, - "grad_norm": 0.73046875, - "learning_rate": 1.810599809584259e-05, - "loss": 1.1767, - "step": 1652 - }, - { - "epoch": 0.5195437501227006, - "grad_norm": 0.73046875, - "learning_rate": 1.8103459219295463e-05, - "loss": 1.2447, - "step": 1654 - }, - { - "epoch": 0.5201719771482419, - "grad_norm": 0.87890625, - "learning_rate": 1.8100920342748338e-05, - "loss": 1.3263, - "step": 1656 - }, - { - "epoch": 0.5208002041737833, - "grad_norm": 0.74609375, - "learning_rate": 1.8098381466201206e-05, - "loss": 1.3857, - "step": 1658 - }, - { - "epoch": 0.5214284311993247, - "grad_norm": 0.7421875, - "learning_rate": 1.809584258965408e-05, - "loss": 1.4577, - "step": 1660 - }, - { - "epoch": 0.522056658224866, - "grad_norm": 0.77734375, - "learning_rate": 1.8093303713106952e-05, - "loss": 1.4192, - "step": 1662 - }, - { - "epoch": 0.5226848852504073, - "grad_norm": 0.72265625, - "learning_rate": 1.8090764836559823e-05, - "loss": 1.2375, - "step": 1664 - }, - { - "epoch": 0.5233131122759487, - "grad_norm": 0.7578125, - "learning_rate": 1.8088225960012695e-05, - "loss": 1.3199, - "step": 1666 - }, - { - "epoch": 0.5239413393014901, - "grad_norm": 0.859375, - "learning_rate": 1.808568708346557e-05, - "loss": 1.3239, - "step": 1668 - }, - { - "epoch": 0.5245695663270314, - "grad_norm": 0.63671875, - "learning_rate": 1.808314820691844e-05, - "loss": 1.4124, - "step": 1670 - }, - { - "epoch": 0.5251977933525728, - "grad_norm": 0.8046875, - "learning_rate": 1.8080609330371312e-05, - "loss": 1.3539, - "step": 1672 - }, - { - "epoch": 0.5258260203781141, - "grad_norm": 0.75, - "learning_rate": 1.8078070453824183e-05, - "loss": 1.2979, - "step": 1674 - }, - { - "epoch": 0.5264542474036555, - "grad_norm": 0.67578125, - "learning_rate": 1.8075531577277058e-05, - "loss": 1.3582, - "step": 1676 - }, - { - "epoch": 0.5270824744291969, - "grad_norm": 0.7890625, - "learning_rate": 1.8072992700729926e-05, - "loss": 1.2866, - "step": 1678 - }, - { - "epoch": 0.5277107014547382, - "grad_norm": 0.83984375, - "learning_rate": 1.80704538241828e-05, - "loss": 1.1929, - "step": 1680 - }, - { - "epoch": 0.5283389284802795, - "grad_norm": 0.6875, - "learning_rate": 1.8067914947635672e-05, - "loss": 1.2081, - "step": 1682 - }, - { - "epoch": 0.528967155505821, - "grad_norm": 0.67578125, - "learning_rate": 1.8065376071088544e-05, - "loss": 1.4058, - "step": 1684 - }, - { - "epoch": 0.5295953825313623, - "grad_norm": 0.69140625, - "learning_rate": 1.8062837194541415e-05, - "loss": 1.3689, - "step": 1686 - }, - { - "epoch": 0.5302236095569036, - "grad_norm": 0.7734375, - "learning_rate": 1.806029831799429e-05, - "loss": 1.2963, - "step": 1688 - }, - { - "epoch": 0.530851836582445, - "grad_norm": 0.76953125, - "learning_rate": 1.805775944144716e-05, - "loss": 1.3622, - "step": 1690 - }, - { - "epoch": 0.5314800636079864, - "grad_norm": 0.765625, - "learning_rate": 1.8055220564900033e-05, - "loss": 1.2601, - "step": 1692 - }, - { - "epoch": 0.5321082906335277, - "grad_norm": 0.78515625, - "learning_rate": 1.8052681688352904e-05, - "loss": 1.2963, - "step": 1694 - }, - { - "epoch": 0.532736517659069, - "grad_norm": 0.7109375, - "learning_rate": 1.805014281180578e-05, - "loss": 1.3702, - "step": 1696 - }, - { - "epoch": 0.5333647446846104, - "grad_norm": 0.734375, - "learning_rate": 1.804760393525865e-05, - "loss": 1.2802, - "step": 1698 - }, - { - "epoch": 0.5339929717101518, - "grad_norm": 1.0078125, - "learning_rate": 1.804506505871152e-05, - "loss": 1.2703, - "step": 1700 - }, - { - "epoch": 0.5346211987356931, - "grad_norm": 0.71875, - "learning_rate": 1.8042526182164393e-05, - "loss": 1.2858, - "step": 1702 - }, - { - "epoch": 0.5352494257612345, - "grad_norm": 1.03125, - "learning_rate": 1.8039987305617264e-05, - "loss": 1.2983, - "step": 1704 - }, - { - "epoch": 0.5358776527867758, - "grad_norm": 0.71484375, - "learning_rate": 1.803744842907014e-05, - "loss": 1.4192, - "step": 1706 - }, - { - "epoch": 0.5365058798123171, - "grad_norm": 0.71484375, - "learning_rate": 1.803490955252301e-05, - "loss": 1.4011, - "step": 1708 - }, - { - "epoch": 0.5371341068378586, - "grad_norm": 0.6796875, - "learning_rate": 1.8032370675975882e-05, - "loss": 1.3894, - "step": 1710 - }, - { - "epoch": 0.5377623338633999, - "grad_norm": 0.75390625, - "learning_rate": 1.8029831799428753e-05, - "loss": 1.5168, - "step": 1712 - }, - { - "epoch": 0.5383905608889412, - "grad_norm": 1.4609375, - "learning_rate": 1.8027292922881628e-05, - "loss": 1.3708, - "step": 1714 - }, - { - "epoch": 0.5390187879144825, - "grad_norm": 0.76953125, - "learning_rate": 1.80247540463345e-05, - "loss": 1.3817, - "step": 1716 - }, - { - "epoch": 0.539647014940024, - "grad_norm": 0.7578125, - "learning_rate": 1.802221516978737e-05, - "loss": 1.3174, - "step": 1718 - }, - { - "epoch": 0.5402752419655653, - "grad_norm": 0.73828125, - "learning_rate": 1.8019676293240242e-05, - "loss": 1.3609, - "step": 1720 - }, - { - "epoch": 0.5409034689911066, - "grad_norm": 0.734375, - "learning_rate": 1.8017137416693117e-05, - "loss": 1.4835, - "step": 1722 - }, - { - "epoch": 0.541531696016648, - "grad_norm": 0.69921875, - "learning_rate": 1.801459854014599e-05, - "loss": 1.5052, - "step": 1724 - }, - { - "epoch": 0.5421599230421894, - "grad_norm": 0.72265625, - "learning_rate": 1.801205966359886e-05, - "loss": 1.3482, - "step": 1726 - }, - { - "epoch": 0.5427881500677307, - "grad_norm": 0.79296875, - "learning_rate": 1.800952078705173e-05, - "loss": 1.21, - "step": 1728 - }, - { - "epoch": 0.5434163770932721, - "grad_norm": 0.75390625, - "learning_rate": 1.8006981910504602e-05, - "loss": 1.3702, - "step": 1730 - }, - { - "epoch": 0.5440446041188134, - "grad_norm": 0.7578125, - "learning_rate": 1.8004443033957474e-05, - "loss": 1.4266, - "step": 1732 - }, - { - "epoch": 0.5446728311443548, - "grad_norm": 0.671875, - "learning_rate": 1.800190415741035e-05, - "loss": 1.339, - "step": 1734 - }, - { - "epoch": 0.5453010581698962, - "grad_norm": 0.74609375, - "learning_rate": 1.799936528086322e-05, - "loss": 1.3851, - "step": 1736 - }, - { - "epoch": 0.5459292851954375, - "grad_norm": 0.69140625, - "learning_rate": 1.799682640431609e-05, - "loss": 1.4017, - "step": 1738 - }, - { - "epoch": 0.5465575122209788, - "grad_norm": 0.734375, - "learning_rate": 1.7994287527768963e-05, - "loss": 1.2933, - "step": 1740 - }, - { - "epoch": 0.5471857392465203, - "grad_norm": 0.74609375, - "learning_rate": 1.7991748651221838e-05, - "loss": 1.3104, - "step": 1742 - }, - { - "epoch": 0.5478139662720616, - "grad_norm": 0.65625, - "learning_rate": 1.798920977467471e-05, - "loss": 1.231, - "step": 1744 - }, - { - "epoch": 0.5484421932976029, - "grad_norm": 0.7578125, - "learning_rate": 1.798667089812758e-05, - "loss": 1.4584, - "step": 1746 - }, - { - "epoch": 0.5490704203231442, - "grad_norm": 0.75390625, - "learning_rate": 1.798413202158045e-05, - "loss": 1.2988, - "step": 1748 - }, - { - "epoch": 0.5496986473486857, - "grad_norm": 0.7578125, - "learning_rate": 1.7981593145033326e-05, - "loss": 1.2553, - "step": 1750 - }, - { - "epoch": 0.550326874374227, - "grad_norm": 0.75, - "learning_rate": 1.7979054268486194e-05, - "loss": 1.3824, - "step": 1752 - }, - { - "epoch": 0.5509551013997683, - "grad_norm": 0.765625, - "learning_rate": 1.797651539193907e-05, - "loss": 1.4831, - "step": 1754 - }, - { - "epoch": 0.5515833284253097, - "grad_norm": 0.8046875, - "learning_rate": 1.797397651539194e-05, - "loss": 1.3839, - "step": 1756 - }, - { - "epoch": 0.5522115554508511, - "grad_norm": 0.7578125, - "learning_rate": 1.7971437638844812e-05, - "loss": 1.4556, - "step": 1758 - }, - { - "epoch": 0.5528397824763924, - "grad_norm": 0.67578125, - "learning_rate": 1.7968898762297683e-05, - "loss": 1.3564, - "step": 1760 - }, - { - "epoch": 0.5534680095019338, - "grad_norm": 0.81640625, - "learning_rate": 1.7966359885750558e-05, - "loss": 1.4027, - "step": 1762 - }, - { - "epoch": 0.5540962365274751, - "grad_norm": 0.93359375, - "learning_rate": 1.796382100920343e-05, - "loss": 1.3738, - "step": 1764 - }, - { - "epoch": 0.5547244635530165, - "grad_norm": 0.8203125, - "learning_rate": 1.79612821326563e-05, - "loss": 1.4116, - "step": 1766 - }, - { - "epoch": 0.5553526905785579, - "grad_norm": 0.9140625, - "learning_rate": 1.7958743256109172e-05, - "loss": 1.4383, - "step": 1768 - }, - { - "epoch": 0.5559809176040992, - "grad_norm": 0.76171875, - "learning_rate": 1.7956204379562047e-05, - "loss": 1.2174, - "step": 1770 - }, - { - "epoch": 0.5566091446296405, - "grad_norm": 0.75390625, - "learning_rate": 1.7953665503014915e-05, - "loss": 1.2893, - "step": 1772 - }, - { - "epoch": 0.557237371655182, - "grad_norm": 0.796875, - "learning_rate": 1.795112662646779e-05, - "loss": 1.291, - "step": 1774 - }, - { - "epoch": 0.5578655986807233, - "grad_norm": 0.82421875, - "learning_rate": 1.794858774992066e-05, - "loss": 1.4798, - "step": 1776 - }, - { - "epoch": 0.5584938257062646, - "grad_norm": 0.9296875, - "learning_rate": 1.7946048873373533e-05, - "loss": 1.2961, - "step": 1778 - }, - { - "epoch": 0.5591220527318059, - "grad_norm": 2.1875, - "learning_rate": 1.7943509996826404e-05, - "loss": 1.3342, - "step": 1780 - }, - { - "epoch": 0.5597502797573473, - "grad_norm": 0.890625, - "learning_rate": 1.794097112027928e-05, - "loss": 1.2582, - "step": 1782 - }, - { - "epoch": 0.5603785067828887, - "grad_norm": 0.66796875, - "learning_rate": 1.793843224373215e-05, - "loss": 1.3106, - "step": 1784 - }, - { - "epoch": 0.56100673380843, - "grad_norm": 0.8125, - "learning_rate": 1.793589336718502e-05, - "loss": 1.3369, - "step": 1786 - }, - { - "epoch": 0.5616349608339714, - "grad_norm": 0.859375, - "learning_rate": 1.7933354490637893e-05, - "loss": 1.2346, - "step": 1788 - }, - { - "epoch": 0.5622631878595127, - "grad_norm": 0.76171875, - "learning_rate": 1.7930815614090768e-05, - "loss": 1.2644, - "step": 1790 - }, - { - "epoch": 0.5628914148850541, - "grad_norm": 0.8359375, - "learning_rate": 1.792827673754364e-05, - "loss": 1.3247, - "step": 1792 - }, - { - "epoch": 0.5635196419105954, - "grad_norm": 0.7734375, - "learning_rate": 1.792573786099651e-05, - "loss": 1.2764, - "step": 1794 - }, - { - "epoch": 0.5641478689361368, - "grad_norm": 0.71484375, - "learning_rate": 1.7923198984449385e-05, - "loss": 1.2428, - "step": 1796 - }, - { - "epoch": 0.5647760959616781, - "grad_norm": 0.80078125, - "learning_rate": 1.7920660107902253e-05, - "loss": 1.4744, - "step": 1798 - }, - { - "epoch": 0.5654043229872195, - "grad_norm": 0.7421875, - "learning_rate": 1.7918121231355128e-05, - "loss": 1.3754, - "step": 1800 - }, - { - "epoch": 0.5660325500127609, - "grad_norm": 0.8828125, - "learning_rate": 1.7915582354808e-05, - "loss": 1.3084, - "step": 1802 - }, - { - "epoch": 0.5666607770383022, - "grad_norm": 0.75, - "learning_rate": 1.791304347826087e-05, - "loss": 1.3269, - "step": 1804 - }, - { - "epoch": 0.5672890040638435, - "grad_norm": 0.7265625, - "learning_rate": 1.7910504601713742e-05, - "loss": 1.3141, - "step": 1806 - }, - { - "epoch": 0.567917231089385, - "grad_norm": 0.90234375, - "learning_rate": 1.7907965725166617e-05, - "loss": 1.1482, - "step": 1808 - }, - { - "epoch": 0.5685454581149263, - "grad_norm": 0.69921875, - "learning_rate": 1.7905426848619488e-05, - "loss": 1.3093, - "step": 1810 - }, - { - "epoch": 0.5691736851404676, - "grad_norm": 0.6640625, - "learning_rate": 1.790288797207236e-05, - "loss": 1.4742, - "step": 1812 - }, - { - "epoch": 0.569801912166009, - "grad_norm": 0.8203125, - "learning_rate": 1.790034909552523e-05, - "loss": 1.3429, - "step": 1814 - }, - { - "epoch": 0.5704301391915504, - "grad_norm": 0.77734375, - "learning_rate": 1.7897810218978106e-05, - "loss": 1.3247, - "step": 1816 - }, - { - "epoch": 0.5710583662170917, - "grad_norm": 0.6875, - "learning_rate": 1.7895271342430974e-05, - "loss": 1.386, - "step": 1818 - }, - { - "epoch": 0.571686593242633, - "grad_norm": 0.6796875, - "learning_rate": 1.789273246588385e-05, - "loss": 1.3501, - "step": 1820 - }, - { - "epoch": 0.5723148202681744, - "grad_norm": 0.73828125, - "learning_rate": 1.789019358933672e-05, - "loss": 1.2759, - "step": 1822 - }, - { - "epoch": 0.5729430472937158, - "grad_norm": 0.78515625, - "learning_rate": 1.788765471278959e-05, - "loss": 1.2834, - "step": 1824 - }, - { - "epoch": 0.5735712743192571, - "grad_norm": 0.765625, - "learning_rate": 1.7885115836242463e-05, - "loss": 1.3764, - "step": 1826 - }, - { - "epoch": 0.5741995013447985, - "grad_norm": 0.80859375, - "learning_rate": 1.7882576959695337e-05, - "loss": 1.2428, - "step": 1828 - }, - { - "epoch": 0.5748277283703398, - "grad_norm": 0.78125, - "learning_rate": 1.788003808314821e-05, - "loss": 1.3577, - "step": 1830 - }, - { - "epoch": 0.5754559553958812, - "grad_norm": 0.94921875, - "learning_rate": 1.787749920660108e-05, - "loss": 1.2091, - "step": 1832 - }, - { - "epoch": 0.5760841824214226, - "grad_norm": 0.83203125, - "learning_rate": 1.787496033005395e-05, - "loss": 1.4, - "step": 1834 - }, - { - "epoch": 0.5767124094469639, - "grad_norm": 0.7109375, - "learning_rate": 1.7872421453506826e-05, - "loss": 1.3621, - "step": 1836 - }, - { - "epoch": 0.5773406364725052, - "grad_norm": 0.828125, - "learning_rate": 1.7869882576959698e-05, - "loss": 1.3756, - "step": 1838 - }, - { - "epoch": 0.5779688634980467, - "grad_norm": 0.68359375, - "learning_rate": 1.786734370041257e-05, - "loss": 1.3658, - "step": 1840 - }, - { - "epoch": 0.578597090523588, - "grad_norm": 0.7109375, - "learning_rate": 1.786480482386544e-05, - "loss": 1.2812, - "step": 1842 - }, - { - "epoch": 0.5792253175491293, - "grad_norm": 0.73828125, - "learning_rate": 1.7862265947318312e-05, - "loss": 1.4921, - "step": 1844 - }, - { - "epoch": 0.5798535445746706, - "grad_norm": 0.77734375, - "learning_rate": 1.7859727070771183e-05, - "loss": 1.3042, - "step": 1846 - }, - { - "epoch": 0.5804817716002121, - "grad_norm": 1.203125, - "learning_rate": 1.7857188194224058e-05, - "loss": 1.1429, - "step": 1848 - }, - { - "epoch": 0.5811099986257534, - "grad_norm": 0.73046875, - "learning_rate": 1.785464931767693e-05, - "loss": 1.4471, - "step": 1850 - }, - { - "epoch": 0.5817382256512947, - "grad_norm": 0.6953125, - "learning_rate": 1.78521104411298e-05, - "loss": 1.3808, - "step": 1852 - }, - { - "epoch": 0.5823664526768361, - "grad_norm": 0.94140625, - "learning_rate": 1.7849571564582672e-05, - "loss": 1.3266, - "step": 1854 - }, - { - "epoch": 0.5829946797023774, - "grad_norm": 0.68359375, - "learning_rate": 1.7847032688035547e-05, - "loss": 1.4399, - "step": 1856 - }, - { - "epoch": 0.5836229067279188, - "grad_norm": 0.75, - "learning_rate": 1.784449381148842e-05, - "loss": 1.2884, - "step": 1858 - }, - { - "epoch": 0.5842511337534602, - "grad_norm": 0.6796875, - "learning_rate": 1.784195493494129e-05, - "loss": 1.3308, - "step": 1860 - }, - { - "epoch": 0.5848793607790015, - "grad_norm": 0.7890625, - "learning_rate": 1.783941605839416e-05, - "loss": 1.3215, - "step": 1862 - }, - { - "epoch": 0.5855075878045428, - "grad_norm": 0.8671875, - "learning_rate": 1.7836877181847036e-05, - "loss": 1.4684, - "step": 1864 - }, - { - "epoch": 0.5861358148300843, - "grad_norm": 0.6875, - "learning_rate": 1.7834338305299904e-05, - "loss": 1.293, - "step": 1866 - }, - { - "epoch": 0.5867640418556256, - "grad_norm": 0.7578125, - "learning_rate": 1.783179942875278e-05, - "loss": 1.2667, - "step": 1868 - }, - { - "epoch": 0.5873922688811669, - "grad_norm": 0.76953125, - "learning_rate": 1.782926055220565e-05, - "loss": 1.3243, - "step": 1870 - }, - { - "epoch": 0.5880204959067082, - "grad_norm": 0.79296875, - "learning_rate": 1.782672167565852e-05, - "loss": 1.2651, - "step": 1872 - }, - { - "epoch": 0.5886487229322497, - "grad_norm": 0.69921875, - "learning_rate": 1.7824182799111393e-05, - "loss": 1.2973, - "step": 1874 - }, - { - "epoch": 0.589276949957791, - "grad_norm": 0.73828125, - "learning_rate": 1.7821643922564268e-05, - "loss": 1.2823, - "step": 1876 - }, - { - "epoch": 0.5899051769833323, - "grad_norm": 0.94921875, - "learning_rate": 1.781910504601714e-05, - "loss": 1.415, - "step": 1878 - }, - { - "epoch": 0.5905334040088737, - "grad_norm": 0.76171875, - "learning_rate": 1.781656616947001e-05, - "loss": 1.2477, - "step": 1880 - }, - { - "epoch": 0.5911616310344151, - "grad_norm": 0.80078125, - "learning_rate": 1.7814027292922885e-05, - "loss": 1.2649, - "step": 1882 - }, - { - "epoch": 0.5917898580599564, - "grad_norm": 0.64453125, - "learning_rate": 1.7811488416375756e-05, - "loss": 1.3797, - "step": 1884 - }, - { - "epoch": 0.5924180850854978, - "grad_norm": 0.75390625, - "learning_rate": 1.7808949539828628e-05, - "loss": 1.3717, - "step": 1886 - }, - { - "epoch": 0.5930463121110391, - "grad_norm": 0.70703125, - "learning_rate": 1.78064106632815e-05, - "loss": 1.2677, - "step": 1888 - }, - { - "epoch": 0.5936745391365805, - "grad_norm": 0.78515625, - "learning_rate": 1.7803871786734374e-05, - "loss": 1.4157, - "step": 1890 - }, - { - "epoch": 0.5943027661621219, - "grad_norm": 0.6875, - "learning_rate": 1.7801332910187242e-05, - "loss": 1.2478, - "step": 1892 - }, - { - "epoch": 0.5949309931876632, - "grad_norm": 0.73046875, - "learning_rate": 1.7798794033640117e-05, - "loss": 1.3108, - "step": 1894 - }, - { - "epoch": 0.5955592202132045, - "grad_norm": 0.75, - "learning_rate": 1.7796255157092988e-05, - "loss": 1.3043, - "step": 1896 - }, - { - "epoch": 0.596187447238746, - "grad_norm": 0.703125, - "learning_rate": 1.779371628054586e-05, - "loss": 1.259, - "step": 1898 - }, - { - "epoch": 0.5968156742642873, - "grad_norm": 0.83203125, - "learning_rate": 1.779117740399873e-05, - "loss": 1.2671, - "step": 1900 - }, - { - "epoch": 0.5974439012898286, - "grad_norm": 0.7734375, - "learning_rate": 1.7788638527451606e-05, - "loss": 1.3808, - "step": 1902 - }, - { - "epoch": 0.5980721283153699, - "grad_norm": 0.8828125, - "learning_rate": 1.7786099650904477e-05, - "loss": 1.3359, - "step": 1904 - }, - { - "epoch": 0.5987003553409114, - "grad_norm": 0.8359375, - "learning_rate": 1.778356077435735e-05, - "loss": 1.3205, - "step": 1906 - }, - { - "epoch": 0.5993285823664527, - "grad_norm": 0.73828125, - "learning_rate": 1.778102189781022e-05, - "loss": 1.3357, - "step": 1908 - }, - { - "epoch": 0.599956809391994, - "grad_norm": 0.76953125, - "learning_rate": 1.7778483021263095e-05, - "loss": 1.3952, - "step": 1910 - }, - { - "epoch": 0.6005850364175354, - "grad_norm": 0.828125, - "learning_rate": 1.7775944144715963e-05, - "loss": 1.2332, - "step": 1912 - }, - { - "epoch": 0.6012132634430768, - "grad_norm": 0.828125, - "learning_rate": 1.7773405268168837e-05, - "loss": 1.3406, - "step": 1914 - }, - { - "epoch": 0.6018414904686181, - "grad_norm": 0.71875, - "learning_rate": 1.777086639162171e-05, - "loss": 1.3423, - "step": 1916 - }, - { - "epoch": 0.6024697174941595, - "grad_norm": 0.74609375, - "learning_rate": 1.776832751507458e-05, - "loss": 1.3578, - "step": 1918 - }, - { - "epoch": 0.6030979445197008, - "grad_norm": 0.65625, - "learning_rate": 1.776578863852745e-05, - "loss": 1.3513, - "step": 1920 - }, - { - "epoch": 0.6037261715452421, - "grad_norm": 0.8203125, - "learning_rate": 1.7763249761980326e-05, - "loss": 1.2171, - "step": 1922 - }, - { - "epoch": 0.6043543985707835, - "grad_norm": 0.72265625, - "learning_rate": 1.7760710885433198e-05, - "loss": 1.3335, - "step": 1924 - }, - { - "epoch": 0.6049826255963249, - "grad_norm": 0.83203125, - "learning_rate": 1.775817200888607e-05, - "loss": 1.3946, - "step": 1926 - }, - { - "epoch": 0.6056108526218662, - "grad_norm": 0.76953125, - "learning_rate": 1.775563313233894e-05, - "loss": 1.2521, - "step": 1928 - }, - { - "epoch": 0.6062390796474075, - "grad_norm": 0.78125, - "learning_rate": 1.7753094255791815e-05, - "loss": 1.4663, - "step": 1930 - }, - { - "epoch": 0.606867306672949, - "grad_norm": 0.71484375, - "learning_rate": 1.7750555379244687e-05, - "loss": 1.1201, - "step": 1932 - }, - { - "epoch": 0.6074955336984903, - "grad_norm": 0.78515625, - "learning_rate": 1.7748016502697558e-05, - "loss": 1.4028, - "step": 1934 - }, - { - "epoch": 0.6081237607240316, - "grad_norm": 0.7734375, - "learning_rate": 1.774547762615043e-05, - "loss": 1.2642, - "step": 1936 - }, - { - "epoch": 0.608751987749573, - "grad_norm": 0.76953125, - "learning_rate": 1.77429387496033e-05, - "loss": 1.2945, - "step": 1938 - }, - { - "epoch": 0.6093802147751144, - "grad_norm": 0.76953125, - "learning_rate": 1.7740399873056172e-05, - "loss": 1.265, - "step": 1940 - }, - { - "epoch": 0.6100084418006557, - "grad_norm": 0.859375, - "learning_rate": 1.7737860996509047e-05, - "loss": 1.4148, - "step": 1942 - }, - { - "epoch": 0.610636668826197, - "grad_norm": 0.66796875, - "learning_rate": 1.7735322119961918e-05, - "loss": 1.2506, - "step": 1944 - }, - { - "epoch": 0.6112648958517384, - "grad_norm": 0.90234375, - "learning_rate": 1.773278324341479e-05, - "loss": 1.3281, - "step": 1946 - }, - { - "epoch": 0.6118931228772798, - "grad_norm": 0.7109375, - "learning_rate": 1.773024436686766e-05, - "loss": 1.273, - "step": 1948 - }, - { - "epoch": 0.6125213499028211, - "grad_norm": 0.75, - "learning_rate": 1.7727705490320536e-05, - "loss": 1.2533, - "step": 1950 - }, - { - "epoch": 0.6131495769283625, - "grad_norm": 0.77734375, - "learning_rate": 1.7725166613773407e-05, - "loss": 1.2262, - "step": 1952 - }, - { - "epoch": 0.6137778039539038, - "grad_norm": 0.72265625, - "learning_rate": 1.772262773722628e-05, - "loss": 1.2834, - "step": 1954 - }, - { - "epoch": 0.6144060309794452, - "grad_norm": 0.6796875, - "learning_rate": 1.772008886067915e-05, - "loss": 1.286, - "step": 1956 - }, - { - "epoch": 0.6150342580049866, - "grad_norm": 0.7265625, - "learning_rate": 1.7717549984132025e-05, - "loss": 1.2474, - "step": 1958 - }, - { - "epoch": 0.6156624850305279, - "grad_norm": 0.71484375, - "learning_rate": 1.7715011107584893e-05, - "loss": 1.345, - "step": 1960 - }, - { - "epoch": 0.6162907120560692, - "grad_norm": 0.79296875, - "learning_rate": 1.7712472231037767e-05, - "loss": 1.2253, - "step": 1962 - }, - { - "epoch": 0.6169189390816107, - "grad_norm": 0.76953125, - "learning_rate": 1.770993335449064e-05, - "loss": 1.3628, - "step": 1964 - }, - { - "epoch": 0.617547166107152, - "grad_norm": 0.76953125, - "learning_rate": 1.770739447794351e-05, - "loss": 1.2676, - "step": 1966 - }, - { - "epoch": 0.6181753931326933, - "grad_norm": 0.72265625, - "learning_rate": 1.7704855601396385e-05, - "loss": 1.2463, - "step": 1968 - }, - { - "epoch": 0.6188036201582346, - "grad_norm": 0.7109375, - "learning_rate": 1.7702316724849256e-05, - "loss": 1.3617, - "step": 1970 - }, - { - "epoch": 0.6194318471837761, - "grad_norm": 0.83984375, - "learning_rate": 1.7699777848302128e-05, - "loss": 1.4785, - "step": 1972 - }, - { - "epoch": 0.6200600742093174, - "grad_norm": 0.76953125, - "learning_rate": 1.7697238971755e-05, - "loss": 1.2933, - "step": 1974 - }, - { - "epoch": 0.6206883012348587, - "grad_norm": 0.70703125, - "learning_rate": 1.7694700095207874e-05, - "loss": 1.4211, - "step": 1976 - }, - { - "epoch": 0.6213165282604001, - "grad_norm": 0.734375, - "learning_rate": 1.7692161218660745e-05, - "loss": 1.4411, - "step": 1978 - }, - { - "epoch": 0.6219447552859415, - "grad_norm": 0.70703125, - "learning_rate": 1.7689622342113617e-05, - "loss": 1.2598, - "step": 1980 - }, - { - "epoch": 0.6225729823114828, - "grad_norm": 0.71875, - "learning_rate": 1.7687083465566488e-05, - "loss": 1.2098, - "step": 1982 - }, - { - "epoch": 0.6232012093370242, - "grad_norm": 0.69921875, - "learning_rate": 1.7684544589019363e-05, - "loss": 1.3236, - "step": 1984 - }, - { - "epoch": 0.6238294363625655, - "grad_norm": 0.73046875, - "learning_rate": 1.768200571247223e-05, - "loss": 1.3541, - "step": 1986 - }, - { - "epoch": 0.6244576633881069, - "grad_norm": 0.84765625, - "learning_rate": 1.7679466835925106e-05, - "loss": 1.2746, - "step": 1988 - }, - { - "epoch": 0.6250858904136483, - "grad_norm": 0.86328125, - "learning_rate": 1.7676927959377977e-05, - "loss": 1.3703, - "step": 1990 - }, - { - "epoch": 0.6257141174391896, - "grad_norm": 0.80859375, - "learning_rate": 1.767438908283085e-05, - "loss": 1.2673, - "step": 1992 - }, - { - "epoch": 0.6263423444647309, - "grad_norm": 0.88671875, - "learning_rate": 1.767185020628372e-05, - "loss": 1.2734, - "step": 1994 - }, - { - "epoch": 0.6269705714902722, - "grad_norm": 0.8125, - "learning_rate": 1.7669311329736595e-05, - "loss": 1.2994, - "step": 1996 - }, - { - "epoch": 0.6275987985158137, - "grad_norm": 0.84765625, - "learning_rate": 1.7666772453189466e-05, - "loss": 1.2314, - "step": 1998 - }, - { - "epoch": 0.628227025541355, - "grad_norm": 0.71875, - "learning_rate": 1.7664233576642337e-05, - "loss": 1.3692, - "step": 2000 - }, - { - "epoch": 0.6288552525668963, - "grad_norm": 0.703125, - "learning_rate": 1.766169470009521e-05, - "loss": 1.1083, - "step": 2002 - }, - { - "epoch": 0.6294834795924377, - "grad_norm": 0.71875, - "learning_rate": 1.7659155823548083e-05, - "loss": 1.3513, - "step": 2004 - }, - { - "epoch": 0.6301117066179791, - "grad_norm": 0.71875, - "learning_rate": 1.765661694700095e-05, - "loss": 1.2768, - "step": 2006 - }, - { - "epoch": 0.6307399336435204, - "grad_norm": 0.77734375, - "learning_rate": 1.7654078070453826e-05, - "loss": 1.399, - "step": 2008 - }, - { - "epoch": 0.6313681606690618, - "grad_norm": 0.7734375, - "learning_rate": 1.7651539193906698e-05, - "loss": 1.3596, - "step": 2010 - }, - { - "epoch": 0.6319963876946031, - "grad_norm": 0.99609375, - "learning_rate": 1.764900031735957e-05, - "loss": 1.3298, - "step": 2012 - }, - { - "epoch": 0.6326246147201445, - "grad_norm": 0.81640625, - "learning_rate": 1.764646144081244e-05, - "loss": 1.3194, - "step": 2014 - }, - { - "epoch": 0.6332528417456859, - "grad_norm": 0.78125, - "learning_rate": 1.7643922564265315e-05, - "loss": 1.2478, - "step": 2016 - }, - { - "epoch": 0.6338810687712272, - "grad_norm": 0.78125, - "learning_rate": 1.7641383687718187e-05, - "loss": 1.285, - "step": 2018 - }, - { - "epoch": 0.6345092957967685, - "grad_norm": 0.75, - "learning_rate": 1.7638844811171058e-05, - "loss": 1.4251, - "step": 2020 - }, - { - "epoch": 0.63513752282231, - "grad_norm": 0.97265625, - "learning_rate": 1.763630593462393e-05, - "loss": 1.281, - "step": 2022 - }, - { - "epoch": 0.6357657498478513, - "grad_norm": 0.859375, - "learning_rate": 1.7633767058076804e-05, - "loss": 1.3546, - "step": 2024 - }, - { - "epoch": 0.6363939768733926, - "grad_norm": 0.6796875, - "learning_rate": 1.7631228181529672e-05, - "loss": 1.2134, - "step": 2026 - }, - { - "epoch": 0.6370222038989339, - "grad_norm": 0.7734375, - "learning_rate": 1.7628689304982547e-05, - "loss": 1.352, - "step": 2028 - }, - { - "epoch": 0.6376504309244754, - "grad_norm": 0.69140625, - "learning_rate": 1.7626150428435418e-05, - "loss": 1.3923, - "step": 2030 - }, - { - "epoch": 0.6382786579500167, - "grad_norm": 0.69140625, - "learning_rate": 1.762361155188829e-05, - "loss": 1.3658, - "step": 2032 - }, - { - "epoch": 0.638906884975558, - "grad_norm": 0.96875, - "learning_rate": 1.762107267534116e-05, - "loss": 1.2421, - "step": 2034 - }, - { - "epoch": 0.6395351120010994, - "grad_norm": 0.71875, - "learning_rate": 1.7618533798794036e-05, - "loss": 1.3427, - "step": 2036 - }, - { - "epoch": 0.6401633390266408, - "grad_norm": 0.8515625, - "learning_rate": 1.7615994922246907e-05, - "loss": 1.3342, - "step": 2038 - }, - { - "epoch": 0.6407915660521821, - "grad_norm": 0.7578125, - "learning_rate": 1.761345604569978e-05, - "loss": 1.4221, - "step": 2040 - }, - { - "epoch": 0.6414197930777235, - "grad_norm": 0.73046875, - "learning_rate": 1.761091716915265e-05, - "loss": 1.3898, - "step": 2042 - }, - { - "epoch": 0.6420480201032648, - "grad_norm": 0.703125, - "learning_rate": 1.7608378292605525e-05, - "loss": 1.5576, - "step": 2044 - }, - { - "epoch": 0.6426762471288062, - "grad_norm": 0.79296875, - "learning_rate": 1.7605839416058396e-05, - "loss": 1.3117, - "step": 2046 - }, - { - "epoch": 0.6433044741543475, - "grad_norm": 0.76171875, - "learning_rate": 1.7603300539511267e-05, - "loss": 1.2932, - "step": 2048 - }, - { - "epoch": 0.6439327011798889, - "grad_norm": 0.7734375, - "learning_rate": 1.7600761662964142e-05, - "loss": 1.2463, - "step": 2050 - }, - { - "epoch": 0.6445609282054302, - "grad_norm": 0.703125, - "learning_rate": 1.759822278641701e-05, - "loss": 1.3657, - "step": 2052 - }, - { - "epoch": 0.6451891552309716, - "grad_norm": 0.7734375, - "learning_rate": 1.7595683909869885e-05, - "loss": 1.4386, - "step": 2054 - }, - { - "epoch": 0.645817382256513, - "grad_norm": 0.80078125, - "learning_rate": 1.7593145033322756e-05, - "loss": 1.3022, - "step": 2056 - }, - { - "epoch": 0.6464456092820543, - "grad_norm": 1.0546875, - "learning_rate": 1.7590606156775628e-05, - "loss": 1.5185, - "step": 2058 - }, - { - "epoch": 0.6470738363075956, - "grad_norm": 0.890625, - "learning_rate": 1.75880672802285e-05, - "loss": 1.1322, - "step": 2060 - }, - { - "epoch": 0.647702063333137, - "grad_norm": 0.671875, - "learning_rate": 1.7585528403681374e-05, - "loss": 1.3653, - "step": 2062 - }, - { - "epoch": 0.6483302903586784, - "grad_norm": 0.81640625, - "learning_rate": 1.7582989527134245e-05, - "loss": 1.2649, - "step": 2064 - }, - { - "epoch": 0.6489585173842197, - "grad_norm": 0.7578125, - "learning_rate": 1.7580450650587117e-05, - "loss": 1.4218, - "step": 2066 - }, - { - "epoch": 0.649586744409761, - "grad_norm": 0.84375, - "learning_rate": 1.7577911774039988e-05, - "loss": 1.42, - "step": 2068 - }, - { - "epoch": 0.6502149714353024, - "grad_norm": 0.83984375, - "learning_rate": 1.7575372897492863e-05, - "loss": 1.2207, - "step": 2070 - }, - { - "epoch": 0.6508431984608438, - "grad_norm": 0.73046875, - "learning_rate": 1.7572834020945734e-05, - "loss": 1.3358, - "step": 2072 - }, - { - "epoch": 0.6514714254863851, - "grad_norm": 0.921875, - "learning_rate": 1.7570295144398606e-05, - "loss": 1.2739, - "step": 2074 - }, - { - "epoch": 0.6520996525119265, - "grad_norm": 0.8125, - "learning_rate": 1.7567756267851477e-05, - "loss": 1.3296, - "step": 2076 - }, - { - "epoch": 0.6527278795374678, - "grad_norm": 0.76171875, - "learning_rate": 1.756521739130435e-05, - "loss": 1.3499, - "step": 2078 - }, - { - "epoch": 0.6533561065630092, - "grad_norm": 0.78515625, - "learning_rate": 1.756267851475722e-05, - "loss": 1.3431, - "step": 2080 - }, - { - "epoch": 0.6539843335885506, - "grad_norm": 0.6796875, - "learning_rate": 1.7560139638210094e-05, - "loss": 1.4784, - "step": 2082 - }, - { - "epoch": 0.6546125606140919, - "grad_norm": 0.7265625, - "learning_rate": 1.7557600761662966e-05, - "loss": 1.2925, - "step": 2084 - }, - { - "epoch": 0.6552407876396332, - "grad_norm": 0.828125, - "learning_rate": 1.7555061885115837e-05, - "loss": 1.2877, - "step": 2086 - }, - { - "epoch": 0.6558690146651747, - "grad_norm": 0.74609375, - "learning_rate": 1.755252300856871e-05, - "loss": 1.3971, - "step": 2088 - }, - { - "epoch": 0.656497241690716, - "grad_norm": 0.69921875, - "learning_rate": 1.7549984132021583e-05, - "loss": 1.2842, - "step": 2090 - }, - { - "epoch": 0.6571254687162573, - "grad_norm": 0.734375, - "learning_rate": 1.7547445255474455e-05, - "loss": 1.1632, - "step": 2092 - }, - { - "epoch": 0.6577536957417986, - "grad_norm": 0.828125, - "learning_rate": 1.7544906378927326e-05, - "loss": 1.3903, - "step": 2094 - }, - { - "epoch": 0.6583819227673401, - "grad_norm": 0.7421875, - "learning_rate": 1.7542367502380198e-05, - "loss": 1.4009, - "step": 2096 - }, - { - "epoch": 0.6590101497928814, - "grad_norm": 0.75390625, - "learning_rate": 1.7539828625833072e-05, - "loss": 1.353, - "step": 2098 - }, - { - "epoch": 0.6596383768184227, - "grad_norm": 0.7421875, - "learning_rate": 1.753728974928594e-05, - "loss": 1.4069, - "step": 2100 - }, - { - "epoch": 0.6602666038439641, - "grad_norm": 0.828125, - "learning_rate": 1.7534750872738815e-05, - "loss": 1.2694, - "step": 2102 - }, - { - "epoch": 0.6608948308695055, - "grad_norm": 0.6796875, - "learning_rate": 1.7532211996191686e-05, - "loss": 1.2923, - "step": 2104 - }, - { - "epoch": 0.6615230578950468, - "grad_norm": 0.75390625, - "learning_rate": 1.7529673119644558e-05, - "loss": 1.2756, - "step": 2106 - }, - { - "epoch": 0.6621512849205882, - "grad_norm": 0.7421875, - "learning_rate": 1.752713424309743e-05, - "loss": 1.4151, - "step": 2108 - }, - { - "epoch": 0.6627795119461295, - "grad_norm": 0.71875, - "learning_rate": 1.7524595366550304e-05, - "loss": 1.3067, - "step": 2110 - }, - { - "epoch": 0.6634077389716709, - "grad_norm": 0.70703125, - "learning_rate": 1.7522056490003175e-05, - "loss": 1.3295, - "step": 2112 - }, - { - "epoch": 0.6640359659972123, - "grad_norm": 0.7421875, - "learning_rate": 1.7519517613456047e-05, - "loss": 1.3994, - "step": 2114 - }, - { - "epoch": 0.6646641930227536, - "grad_norm": 0.79296875, - "learning_rate": 1.7516978736908918e-05, - "loss": 1.3512, - "step": 2116 - }, - { - "epoch": 0.6652924200482949, - "grad_norm": 0.71484375, - "learning_rate": 1.7514439860361793e-05, - "loss": 1.2393, - "step": 2118 - }, - { - "epoch": 0.6659206470738364, - "grad_norm": 0.7734375, - "learning_rate": 1.751190098381466e-05, - "loss": 1.2977, - "step": 2120 - }, - { - "epoch": 0.6665488740993777, - "grad_norm": 0.73828125, - "learning_rate": 1.7509362107267536e-05, - "loss": 1.4039, - "step": 2122 - }, - { - "epoch": 0.667177101124919, - "grad_norm": 0.7265625, - "learning_rate": 1.7506823230720407e-05, - "loss": 1.3294, - "step": 2124 - }, - { - "epoch": 0.6678053281504603, - "grad_norm": 0.69921875, - "learning_rate": 1.750428435417328e-05, - "loss": 1.2816, - "step": 2126 - }, - { - "epoch": 0.6684335551760018, - "grad_norm": 0.75, - "learning_rate": 1.750174547762615e-05, - "loss": 1.3298, - "step": 2128 - }, - { - "epoch": 0.6690617822015431, - "grad_norm": 0.6796875, - "learning_rate": 1.7499206601079025e-05, - "loss": 1.3823, - "step": 2130 - }, - { - "epoch": 0.6696900092270844, - "grad_norm": 0.72265625, - "learning_rate": 1.7496667724531896e-05, - "loss": 1.2973, - "step": 2132 - }, - { - "epoch": 0.6703182362526258, - "grad_norm": 0.67578125, - "learning_rate": 1.7494128847984767e-05, - "loss": 1.3873, - "step": 2134 - }, - { - "epoch": 0.6709464632781671, - "grad_norm": 0.71484375, - "learning_rate": 1.7491589971437642e-05, - "loss": 1.3746, - "step": 2136 - }, - { - "epoch": 0.6715746903037085, - "grad_norm": 0.71875, - "learning_rate": 1.7489051094890514e-05, - "loss": 1.2803, - "step": 2138 - }, - { - "epoch": 0.6722029173292499, - "grad_norm": 0.78515625, - "learning_rate": 1.7486512218343385e-05, - "loss": 1.3632, - "step": 2140 - }, - { - "epoch": 0.6728311443547912, - "grad_norm": 0.75, - "learning_rate": 1.7483973341796256e-05, - "loss": 1.3377, - "step": 2142 - }, - { - "epoch": 0.6734593713803325, - "grad_norm": 0.69921875, - "learning_rate": 1.748143446524913e-05, - "loss": 1.2896, - "step": 2144 - }, - { - "epoch": 0.674087598405874, - "grad_norm": 0.890625, - "learning_rate": 1.7478895588702e-05, - "loss": 1.2543, - "step": 2146 - }, - { - "epoch": 0.6747158254314153, - "grad_norm": 0.87109375, - "learning_rate": 1.7476356712154874e-05, - "loss": 1.2882, - "step": 2148 - }, - { - "epoch": 0.6753440524569566, - "grad_norm": 0.86328125, - "learning_rate": 1.7473817835607745e-05, - "loss": 1.3234, - "step": 2150 - }, - { - "epoch": 0.6759722794824979, - "grad_norm": 0.75390625, - "learning_rate": 1.7471278959060617e-05, - "loss": 1.2965, - "step": 2152 - }, - { - "epoch": 0.6766005065080394, - "grad_norm": 0.67578125, - "learning_rate": 1.7468740082513488e-05, - "loss": 1.4172, - "step": 2154 - }, - { - "epoch": 0.6772287335335807, - "grad_norm": 0.73828125, - "learning_rate": 1.7466201205966363e-05, - "loss": 1.3369, - "step": 2156 - }, - { - "epoch": 0.677856960559122, - "grad_norm": 0.7265625, - "learning_rate": 1.7463662329419234e-05, - "loss": 1.3239, - "step": 2158 - }, - { - "epoch": 0.6784851875846634, - "grad_norm": 0.7265625, - "learning_rate": 1.7461123452872105e-05, - "loss": 1.2926, - "step": 2160 - }, - { - "epoch": 0.6791134146102048, - "grad_norm": 0.83203125, - "learning_rate": 1.7458584576324977e-05, - "loss": 1.3588, - "step": 2162 - }, - { - "epoch": 0.6797416416357461, - "grad_norm": 0.87109375, - "learning_rate": 1.745604569977785e-05, - "loss": 1.1972, - "step": 2164 - }, - { - "epoch": 0.6803698686612875, - "grad_norm": 0.71484375, - "learning_rate": 1.7453506823230723e-05, - "loss": 1.2391, - "step": 2166 - }, - { - "epoch": 0.6809980956868288, - "grad_norm": 0.82421875, - "learning_rate": 1.7450967946683594e-05, - "loss": 1.3438, - "step": 2168 - }, - { - "epoch": 0.6816263227123702, - "grad_norm": 0.72265625, - "learning_rate": 1.7448429070136466e-05, - "loss": 1.4117, - "step": 2170 - }, - { - "epoch": 0.6822545497379116, - "grad_norm": 0.79296875, - "learning_rate": 1.7445890193589337e-05, - "loss": 1.388, - "step": 2172 - }, - { - "epoch": 0.6828827767634529, - "grad_norm": 0.6875, - "learning_rate": 1.744335131704221e-05, - "loss": 1.3602, - "step": 2174 - }, - { - "epoch": 0.6835110037889942, - "grad_norm": 1.0546875, - "learning_rate": 1.7440812440495083e-05, - "loss": 1.2999, - "step": 2176 - }, - { - "epoch": 0.6841392308145356, - "grad_norm": 0.828125, - "learning_rate": 1.7438273563947955e-05, - "loss": 1.3296, - "step": 2178 - }, - { - "epoch": 0.684767457840077, - "grad_norm": 0.78125, - "learning_rate": 1.7435734687400826e-05, - "loss": 1.302, - "step": 2180 - }, - { - "epoch": 0.6853956848656183, - "grad_norm": 0.73046875, - "learning_rate": 1.7433195810853697e-05, - "loss": 1.321, - "step": 2182 - }, - { - "epoch": 0.6860239118911596, - "grad_norm": 0.78515625, - "learning_rate": 1.7430656934306572e-05, - "loss": 1.3628, - "step": 2184 - }, - { - "epoch": 0.6866521389167011, - "grad_norm": 0.8671875, - "learning_rate": 1.7428118057759444e-05, - "loss": 1.3183, - "step": 2186 - }, - { - "epoch": 0.6872803659422424, - "grad_norm": 0.921875, - "learning_rate": 1.7425579181212315e-05, - "loss": 1.4956, - "step": 2188 - }, - { - "epoch": 0.6879085929677837, - "grad_norm": 0.7265625, - "learning_rate": 1.7423040304665186e-05, - "loss": 1.4264, - "step": 2190 - }, - { - "epoch": 0.688536819993325, - "grad_norm": 0.765625, - "learning_rate": 1.742050142811806e-05, - "loss": 1.3176, - "step": 2192 - }, - { - "epoch": 0.6891650470188665, - "grad_norm": 0.78515625, - "learning_rate": 1.741796255157093e-05, - "loss": 1.3268, - "step": 2194 - }, - { - "epoch": 0.6897932740444078, - "grad_norm": 1.046875, - "learning_rate": 1.7415423675023804e-05, - "loss": 1.346, - "step": 2196 - }, - { - "epoch": 0.6904215010699492, - "grad_norm": 0.80078125, - "learning_rate": 1.7412884798476675e-05, - "loss": 1.3614, - "step": 2198 - }, - { - "epoch": 0.6910497280954905, - "grad_norm": 0.7265625, - "learning_rate": 1.7410345921929547e-05, - "loss": 1.2779, - "step": 2200 - }, - { - "epoch": 0.6916779551210319, - "grad_norm": 0.7265625, - "learning_rate": 1.7407807045382418e-05, - "loss": 1.2913, - "step": 2202 - }, - { - "epoch": 0.6923061821465732, - "grad_norm": 0.86328125, - "learning_rate": 1.7405268168835293e-05, - "loss": 1.3669, - "step": 2204 - }, - { - "epoch": 0.6929344091721146, - "grad_norm": 0.71484375, - "learning_rate": 1.7402729292288164e-05, - "loss": 1.2992, - "step": 2206 - }, - { - "epoch": 0.6935626361976559, - "grad_norm": 0.80078125, - "learning_rate": 1.7400190415741036e-05, - "loss": 1.3157, - "step": 2208 - }, - { - "epoch": 0.6941908632231972, - "grad_norm": 0.828125, - "learning_rate": 1.7397651539193907e-05, - "loss": 1.1712, - "step": 2210 - }, - { - "epoch": 0.6948190902487387, - "grad_norm": 0.78515625, - "learning_rate": 1.7395112662646782e-05, - "loss": 1.1813, - "step": 2212 - }, - { - "epoch": 0.69544731727428, - "grad_norm": 0.76171875, - "learning_rate": 1.739257378609965e-05, - "loss": 1.3688, - "step": 2214 - }, - { - "epoch": 0.6960755442998213, - "grad_norm": 0.9453125, - "learning_rate": 1.7390034909552525e-05, - "loss": 1.3554, - "step": 2216 - }, - { - "epoch": 0.6967037713253627, - "grad_norm": 0.87890625, - "learning_rate": 1.7387496033005396e-05, - "loss": 1.3605, - "step": 2218 - }, - { - "epoch": 0.6973319983509041, - "grad_norm": 0.890625, - "learning_rate": 1.7384957156458267e-05, - "loss": 1.2138, - "step": 2220 - }, - { - "epoch": 0.6979602253764454, - "grad_norm": 0.8515625, - "learning_rate": 1.7382418279911142e-05, - "loss": 1.309, - "step": 2222 - }, - { - "epoch": 0.6985884524019867, - "grad_norm": 0.78515625, - "learning_rate": 1.7379879403364013e-05, - "loss": 1.2578, - "step": 2224 - }, - { - "epoch": 0.6992166794275281, - "grad_norm": 0.78515625, - "learning_rate": 1.7377340526816885e-05, - "loss": 1.3457, - "step": 2226 - }, - { - "epoch": 0.6998449064530695, - "grad_norm": 0.7890625, - "learning_rate": 1.7374801650269756e-05, - "loss": 1.2938, - "step": 2228 - }, - { - "epoch": 0.7004731334786108, - "grad_norm": 0.8984375, - "learning_rate": 1.737226277372263e-05, - "loss": 1.284, - "step": 2230 - }, - { - "epoch": 0.7011013605041522, - "grad_norm": 0.72265625, - "learning_rate": 1.7369723897175502e-05, - "loss": 1.415, - "step": 2232 - }, - { - "epoch": 0.7017295875296935, - "grad_norm": 0.94921875, - "learning_rate": 1.7367185020628374e-05, - "loss": 1.2291, - "step": 2234 - }, - { - "epoch": 0.7023578145552349, - "grad_norm": 0.74609375, - "learning_rate": 1.7364646144081245e-05, - "loss": 1.374, - "step": 2236 - }, - { - "epoch": 0.7029860415807763, - "grad_norm": 0.6953125, - "learning_rate": 1.736210726753412e-05, - "loss": 1.4697, - "step": 2238 - }, - { - "epoch": 0.7036142686063176, - "grad_norm": 0.71484375, - "learning_rate": 1.7359568390986988e-05, - "loss": 1.2784, - "step": 2240 - }, - { - "epoch": 0.7042424956318589, - "grad_norm": 0.73828125, - "learning_rate": 1.7357029514439863e-05, - "loss": 1.2381, - "step": 2242 - }, - { - "epoch": 0.7048707226574004, - "grad_norm": 0.78125, - "learning_rate": 1.7354490637892734e-05, - "loss": 1.2173, - "step": 2244 - }, - { - "epoch": 0.7054989496829417, - "grad_norm": 0.77734375, - "learning_rate": 1.7351951761345605e-05, - "loss": 1.2839, - "step": 2246 - }, - { - "epoch": 0.706127176708483, - "grad_norm": 0.6953125, - "learning_rate": 1.7349412884798477e-05, - "loss": 1.3768, - "step": 2248 - }, - { - "epoch": 0.7067554037340243, - "grad_norm": 0.81640625, - "learning_rate": 1.734687400825135e-05, - "loss": 1.3607, - "step": 2250 - }, - { - "epoch": 0.7073836307595658, - "grad_norm": 0.703125, - "learning_rate": 1.7344335131704223e-05, - "loss": 1.3943, - "step": 2252 - }, - { - "epoch": 0.7080118577851071, - "grad_norm": 0.6875, - "learning_rate": 1.7341796255157094e-05, - "loss": 1.4092, - "step": 2254 - }, - { - "epoch": 0.7086400848106484, - "grad_norm": 0.75, - "learning_rate": 1.7339257378609966e-05, - "loss": 1.2429, - "step": 2256 - }, - { - "epoch": 0.7092683118361898, - "grad_norm": 0.7109375, - "learning_rate": 1.733671850206284e-05, - "loss": 1.458, - "step": 2258 - }, - { - "epoch": 0.7098965388617312, - "grad_norm": 0.67578125, - "learning_rate": 1.7334179625515712e-05, - "loss": 1.3227, - "step": 2260 - }, - { - "epoch": 0.7105247658872725, - "grad_norm": 0.79296875, - "learning_rate": 1.7331640748968583e-05, - "loss": 1.4453, - "step": 2262 - }, - { - "epoch": 0.7111529929128139, - "grad_norm": 0.74609375, - "learning_rate": 1.7329101872421455e-05, - "loss": 1.2725, - "step": 2264 - }, - { - "epoch": 0.7117812199383552, - "grad_norm": 0.74609375, - "learning_rate": 1.7326562995874326e-05, - "loss": 1.2165, - "step": 2266 - }, - { - "epoch": 0.7124094469638966, - "grad_norm": 0.76171875, - "learning_rate": 1.7324024119327197e-05, - "loss": 1.4287, - "step": 2268 - }, - { - "epoch": 0.713037673989438, - "grad_norm": 0.6484375, - "learning_rate": 1.7321485242780072e-05, - "loss": 1.2783, - "step": 2270 - }, - { - "epoch": 0.7136659010149793, - "grad_norm": 0.6875, - "learning_rate": 1.7318946366232944e-05, - "loss": 1.3641, - "step": 2272 - }, - { - "epoch": 0.7142941280405206, - "grad_norm": 0.7109375, - "learning_rate": 1.7316407489685815e-05, - "loss": 1.3313, - "step": 2274 - }, - { - "epoch": 0.7149223550660619, - "grad_norm": 0.68359375, - "learning_rate": 1.7313868613138686e-05, - "loss": 1.3461, - "step": 2276 - }, - { - "epoch": 0.7155505820916034, - "grad_norm": 0.6953125, - "learning_rate": 1.731132973659156e-05, - "loss": 1.3159, - "step": 2278 - }, - { - "epoch": 0.7161788091171447, - "grad_norm": 0.765625, - "learning_rate": 1.7308790860044432e-05, - "loss": 1.2575, - "step": 2280 - }, - { - "epoch": 0.716807036142686, - "grad_norm": 0.65625, - "learning_rate": 1.7306251983497304e-05, - "loss": 1.3816, - "step": 2282 - }, - { - "epoch": 0.7174352631682274, - "grad_norm": 0.71875, - "learning_rate": 1.7303713106950175e-05, - "loss": 1.4548, - "step": 2284 - }, - { - "epoch": 0.7180634901937688, - "grad_norm": 0.83984375, - "learning_rate": 1.730117423040305e-05, - "loss": 1.2777, - "step": 2286 - }, - { - "epoch": 0.7186917172193101, - "grad_norm": 0.7421875, - "learning_rate": 1.7298635353855918e-05, - "loss": 1.3142, - "step": 2288 - }, - { - "epoch": 0.7193199442448515, - "grad_norm": 0.7890625, - "learning_rate": 1.7296096477308793e-05, - "loss": 1.2618, - "step": 2290 - }, - { - "epoch": 0.7199481712703928, - "grad_norm": 0.70703125, - "learning_rate": 1.7293557600761664e-05, - "loss": 1.3586, - "step": 2292 - }, - { - "epoch": 0.7205763982959342, - "grad_norm": 0.77734375, - "learning_rate": 1.7291018724214536e-05, - "loss": 1.2284, - "step": 2294 - }, - { - "epoch": 0.7212046253214756, - "grad_norm": 0.76953125, - "learning_rate": 1.7288479847667407e-05, - "loss": 1.3143, - "step": 2296 - }, - { - "epoch": 0.7218328523470169, - "grad_norm": 0.73828125, - "learning_rate": 1.728594097112028e-05, - "loss": 1.2988, - "step": 2298 - }, - { - "epoch": 0.7224610793725582, - "grad_norm": 0.78125, - "learning_rate": 1.7283402094573153e-05, - "loss": 1.3754, - "step": 2300 - }, - { - "epoch": 0.7230893063980997, - "grad_norm": 0.69140625, - "learning_rate": 1.7280863218026024e-05, - "loss": 1.3633, - "step": 2302 - }, - { - "epoch": 0.723717533423641, - "grad_norm": 0.71875, - "learning_rate": 1.7278324341478896e-05, - "loss": 1.4773, - "step": 2304 - }, - { - "epoch": 0.7243457604491823, - "grad_norm": 0.77734375, - "learning_rate": 1.727578546493177e-05, - "loss": 1.351, - "step": 2306 - }, - { - "epoch": 0.7249739874747236, - "grad_norm": 0.73828125, - "learning_rate": 1.7273246588384642e-05, - "loss": 1.3577, - "step": 2308 - }, - { - "epoch": 0.7256022145002651, - "grad_norm": 0.81640625, - "learning_rate": 1.7270707711837513e-05, - "loss": 1.2883, - "step": 2310 - }, - { - "epoch": 0.7262304415258064, - "grad_norm": 0.67578125, - "learning_rate": 1.7268168835290388e-05, - "loss": 1.4049, - "step": 2312 - }, - { - "epoch": 0.7268586685513477, - "grad_norm": 0.6796875, - "learning_rate": 1.7265629958743256e-05, - "loss": 1.3443, - "step": 2314 - }, - { - "epoch": 0.7274868955768891, - "grad_norm": 0.90234375, - "learning_rate": 1.726309108219613e-05, - "loss": 1.2131, - "step": 2316 - }, - { - "epoch": 0.7281151226024305, - "grad_norm": 0.71875, - "learning_rate": 1.7260552205649002e-05, - "loss": 1.353, - "step": 2318 - }, - { - "epoch": 0.7287433496279718, - "grad_norm": 0.73828125, - "learning_rate": 1.7258013329101874e-05, - "loss": 1.2911, - "step": 2320 - }, - { - "epoch": 0.7293715766535132, - "grad_norm": 0.90625, - "learning_rate": 1.7255474452554745e-05, - "loss": 1.3567, - "step": 2322 - }, - { - "epoch": 0.7299998036790545, - "grad_norm": 0.70703125, - "learning_rate": 1.725293557600762e-05, - "loss": 1.3589, - "step": 2324 - }, - { - "epoch": 0.7306280307045959, - "grad_norm": 0.87109375, - "learning_rate": 1.725039669946049e-05, - "loss": 1.3734, - "step": 2326 - }, - { - "epoch": 0.7312562577301372, - "grad_norm": 0.6484375, - "learning_rate": 1.7247857822913363e-05, - "loss": 1.3007, - "step": 2328 - }, - { - "epoch": 0.7318844847556786, - "grad_norm": 0.80859375, - "learning_rate": 1.7245318946366234e-05, - "loss": 1.3652, - "step": 2330 - }, - { - "epoch": 0.7325127117812199, - "grad_norm": 0.72265625, - "learning_rate": 1.724278006981911e-05, - "loss": 1.3929, - "step": 2332 - }, - { - "epoch": 0.7331409388067613, - "grad_norm": 0.6640625, - "learning_rate": 1.7240241193271977e-05, - "loss": 1.2893, - "step": 2334 - }, - { - "epoch": 0.7337691658323027, - "grad_norm": 0.77734375, - "learning_rate": 1.723770231672485e-05, - "loss": 1.4986, - "step": 2336 - }, - { - "epoch": 0.734397392857844, - "grad_norm": 0.6875, - "learning_rate": 1.7235163440177723e-05, - "loss": 1.3224, - "step": 2338 - }, - { - "epoch": 0.7350256198833853, - "grad_norm": 0.77734375, - "learning_rate": 1.7232624563630594e-05, - "loss": 1.422, - "step": 2340 - }, - { - "epoch": 0.7356538469089268, - "grad_norm": 0.703125, - "learning_rate": 1.7230085687083466e-05, - "loss": 1.4021, - "step": 2342 - }, - { - "epoch": 0.7362820739344681, - "grad_norm": 0.67578125, - "learning_rate": 1.722754681053634e-05, - "loss": 1.3948, - "step": 2344 - }, - { - "epoch": 0.7369103009600094, - "grad_norm": 0.73046875, - "learning_rate": 1.7225007933989212e-05, - "loss": 1.2958, - "step": 2346 - }, - { - "epoch": 0.7375385279855508, - "grad_norm": 0.734375, - "learning_rate": 1.7222469057442083e-05, - "loss": 1.2972, - "step": 2348 - }, - { - "epoch": 0.7381667550110921, - "grad_norm": 0.68359375, - "learning_rate": 1.7219930180894955e-05, - "loss": 1.3356, - "step": 2350 - }, - { - "epoch": 0.7387949820366335, - "grad_norm": 0.82421875, - "learning_rate": 1.721739130434783e-05, - "loss": 1.2247, - "step": 2352 - }, - { - "epoch": 0.7394232090621748, - "grad_norm": 0.70703125, - "learning_rate": 1.7214852427800697e-05, - "loss": 1.3243, - "step": 2354 - }, - { - "epoch": 0.7400514360877162, - "grad_norm": 0.7265625, - "learning_rate": 1.7212313551253572e-05, - "loss": 1.4064, - "step": 2356 - }, - { - "epoch": 0.7406796631132575, - "grad_norm": 0.77734375, - "learning_rate": 1.7209774674706443e-05, - "loss": 1.4806, - "step": 2358 - }, - { - "epoch": 0.7413078901387989, - "grad_norm": 0.85546875, - "learning_rate": 1.7207235798159315e-05, - "loss": 1.3769, - "step": 2360 - }, - { - "epoch": 0.7419361171643403, - "grad_norm": 0.71875, - "learning_rate": 1.7204696921612186e-05, - "loss": 1.2256, - "step": 2362 - }, - { - "epoch": 0.7425643441898816, - "grad_norm": 0.78125, - "learning_rate": 1.720215804506506e-05, - "loss": 1.389, - "step": 2364 - }, - { - "epoch": 0.7431925712154229, - "grad_norm": 0.6796875, - "learning_rate": 1.7199619168517932e-05, - "loss": 1.4362, - "step": 2366 - }, - { - "epoch": 0.7438207982409644, - "grad_norm": 0.8984375, - "learning_rate": 1.7197080291970804e-05, - "loss": 1.4191, - "step": 2368 - }, - { - "epoch": 0.7444490252665057, - "grad_norm": 0.7265625, - "learning_rate": 1.7194541415423675e-05, - "loss": 1.3115, - "step": 2370 - }, - { - "epoch": 0.745077252292047, - "grad_norm": 0.7578125, - "learning_rate": 1.719200253887655e-05, - "loss": 1.4019, - "step": 2372 - }, - { - "epoch": 0.7457054793175883, - "grad_norm": 0.734375, - "learning_rate": 1.718946366232942e-05, - "loss": 1.3587, - "step": 2374 - }, - { - "epoch": 0.7463337063431298, - "grad_norm": 0.87109375, - "learning_rate": 1.7186924785782293e-05, - "loss": 1.3749, - "step": 2376 - }, - { - "epoch": 0.7469619333686711, - "grad_norm": 0.6875, - "learning_rate": 1.7184385909235164e-05, - "loss": 1.3042, - "step": 2378 - }, - { - "epoch": 0.7475901603942124, - "grad_norm": 0.73828125, - "learning_rate": 1.7181847032688035e-05, - "loss": 1.2356, - "step": 2380 - }, - { - "epoch": 0.7482183874197538, - "grad_norm": 0.7734375, - "learning_rate": 1.7179308156140907e-05, - "loss": 1.2864, - "step": 2382 - }, - { - "epoch": 0.7488466144452952, - "grad_norm": 0.69921875, - "learning_rate": 1.717676927959378e-05, - "loss": 1.3995, - "step": 2384 - }, - { - "epoch": 0.7494748414708365, - "grad_norm": 0.78125, - "learning_rate": 1.7174230403046653e-05, - "loss": 1.2924, - "step": 2386 - }, - { - "epoch": 0.7501030684963779, - "grad_norm": 0.81640625, - "learning_rate": 1.7171691526499524e-05, - "loss": 1.2801, - "step": 2388 - }, - { - "epoch": 0.7507312955219192, - "grad_norm": 0.7890625, - "learning_rate": 1.7169152649952396e-05, - "loss": 1.2726, - "step": 2390 - }, - { - "epoch": 0.7513595225474606, - "grad_norm": 0.734375, - "learning_rate": 1.716661377340527e-05, - "loss": 1.35, - "step": 2392 - }, - { - "epoch": 0.751987749573002, - "grad_norm": 0.796875, - "learning_rate": 1.7164074896858142e-05, - "loss": 1.2783, - "step": 2394 - }, - { - "epoch": 0.7526159765985433, - "grad_norm": 0.78515625, - "learning_rate": 1.7161536020311013e-05, - "loss": 1.3665, - "step": 2396 - }, - { - "epoch": 0.7532442036240846, - "grad_norm": 0.98046875, - "learning_rate": 1.7158997143763888e-05, - "loss": 1.3679, - "step": 2398 - }, - { - "epoch": 0.7538724306496261, - "grad_norm": 0.78515625, - "learning_rate": 1.715645826721676e-05, - "loss": 1.3874, - "step": 2400 - }, - { - "epoch": 0.7545006576751674, - "grad_norm": 0.75390625, - "learning_rate": 1.715391939066963e-05, - "loss": 1.2631, - "step": 2402 - }, - { - "epoch": 0.7551288847007087, - "grad_norm": 0.796875, - "learning_rate": 1.7151380514122502e-05, - "loss": 1.2159, - "step": 2404 - }, - { - "epoch": 0.75575711172625, - "grad_norm": 0.74609375, - "learning_rate": 1.7148841637575374e-05, - "loss": 1.3067, - "step": 2406 - }, - { - "epoch": 0.7563853387517915, - "grad_norm": 0.7109375, - "learning_rate": 1.7146302761028245e-05, - "loss": 1.3503, - "step": 2408 - }, - { - "epoch": 0.7570135657773328, - "grad_norm": 0.7421875, - "learning_rate": 1.714376388448112e-05, - "loss": 1.369, - "step": 2410 - }, - { - "epoch": 0.7576417928028741, - "grad_norm": 0.88671875, - "learning_rate": 1.714122500793399e-05, - "loss": 1.2634, - "step": 2412 - }, - { - "epoch": 0.7582700198284155, - "grad_norm": 0.75390625, - "learning_rate": 1.7138686131386862e-05, - "loss": 1.2631, - "step": 2414 - }, - { - "epoch": 0.7588982468539569, - "grad_norm": 0.72265625, - "learning_rate": 1.7136147254839734e-05, - "loss": 1.33, - "step": 2416 - }, - { - "epoch": 0.7595264738794982, - "grad_norm": 0.72265625, - "learning_rate": 1.713360837829261e-05, - "loss": 1.3229, - "step": 2418 - }, - { - "epoch": 0.7601547009050396, - "grad_norm": 1.1640625, - "learning_rate": 1.713106950174548e-05, - "loss": 1.2857, - "step": 2420 - }, - { - "epoch": 0.7607829279305809, - "grad_norm": 0.875, - "learning_rate": 1.712853062519835e-05, - "loss": 1.3812, - "step": 2422 - }, - { - "epoch": 0.7614111549561222, - "grad_norm": 0.6953125, - "learning_rate": 1.7125991748651223e-05, - "loss": 1.3809, - "step": 2424 - }, - { - "epoch": 0.7620393819816637, - "grad_norm": 0.7890625, - "learning_rate": 1.7123452872104098e-05, - "loss": 1.3366, - "step": 2426 - }, - { - "epoch": 0.762667609007205, - "grad_norm": 0.77734375, - "learning_rate": 1.7120913995556966e-05, - "loss": 1.3628, - "step": 2428 - }, - { - "epoch": 0.7632958360327463, - "grad_norm": 0.8046875, - "learning_rate": 1.711837511900984e-05, - "loss": 1.3394, - "step": 2430 - }, - { - "epoch": 0.7639240630582876, - "grad_norm": 0.7265625, - "learning_rate": 1.7115836242462712e-05, - "loss": 1.4378, - "step": 2432 - }, - { - "epoch": 0.7645522900838291, - "grad_norm": 0.7890625, - "learning_rate": 1.7113297365915583e-05, - "loss": 1.1978, - "step": 2434 - }, - { - "epoch": 0.7651805171093704, - "grad_norm": 0.75, - "learning_rate": 1.7110758489368454e-05, - "loss": 1.2939, - "step": 2436 - }, - { - "epoch": 0.7658087441349117, - "grad_norm": 0.7109375, - "learning_rate": 1.710821961282133e-05, - "loss": 1.3248, - "step": 2438 - }, - { - "epoch": 0.7664369711604531, - "grad_norm": 0.7578125, - "learning_rate": 1.71056807362742e-05, - "loss": 1.2087, - "step": 2440 - }, - { - "epoch": 0.7670651981859945, - "grad_norm": 0.81640625, - "learning_rate": 1.7103141859727072e-05, - "loss": 1.1633, - "step": 2442 - }, - { - "epoch": 0.7676934252115358, - "grad_norm": 1.078125, - "learning_rate": 1.7100602983179943e-05, - "loss": 1.2432, - "step": 2444 - }, - { - "epoch": 0.7683216522370772, - "grad_norm": 0.75390625, - "learning_rate": 1.7098064106632818e-05, - "loss": 1.3272, - "step": 2446 - }, - { - "epoch": 0.7689498792626185, - "grad_norm": 0.78125, - "learning_rate": 1.7095525230085686e-05, - "loss": 1.2589, - "step": 2448 - }, - { - "epoch": 0.7695781062881599, - "grad_norm": 0.71484375, - "learning_rate": 1.709298635353856e-05, - "loss": 1.375, - "step": 2450 - }, - { - "epoch": 0.7702063333137013, - "grad_norm": 0.72265625, - "learning_rate": 1.7090447476991432e-05, - "loss": 1.2817, - "step": 2452 - }, - { - "epoch": 0.7708345603392426, - "grad_norm": 0.75390625, - "learning_rate": 1.7087908600444304e-05, - "loss": 1.2879, - "step": 2454 - }, - { - "epoch": 0.7714627873647839, - "grad_norm": 1.234375, - "learning_rate": 1.7085369723897175e-05, - "loss": 1.2573, - "step": 2456 - }, - { - "epoch": 0.7720910143903253, - "grad_norm": 0.7890625, - "learning_rate": 1.708283084735005e-05, - "loss": 1.343, - "step": 2458 - }, - { - "epoch": 0.7727192414158667, - "grad_norm": 0.7109375, - "learning_rate": 1.708029197080292e-05, - "loss": 1.357, - "step": 2460 - }, - { - "epoch": 0.773347468441408, - "grad_norm": 0.74609375, - "learning_rate": 1.7077753094255793e-05, - "loss": 1.3493, - "step": 2462 - }, - { - "epoch": 0.7739756954669493, - "grad_norm": 0.78515625, - "learning_rate": 1.7075214217708664e-05, - "loss": 1.2568, - "step": 2464 - }, - { - "epoch": 0.7746039224924908, - "grad_norm": 0.73828125, - "learning_rate": 1.707267534116154e-05, - "loss": 1.3476, - "step": 2466 - }, - { - "epoch": 0.7752321495180321, - "grad_norm": 0.76171875, - "learning_rate": 1.707013646461441e-05, - "loss": 1.2797, - "step": 2468 - }, - { - "epoch": 0.7758603765435734, - "grad_norm": 0.75390625, - "learning_rate": 1.706759758806728e-05, - "loss": 1.3368, - "step": 2470 - }, - { - "epoch": 0.7764886035691148, - "grad_norm": 0.671875, - "learning_rate": 1.7065058711520153e-05, - "loss": 1.2807, - "step": 2472 - }, - { - "epoch": 0.7771168305946562, - "grad_norm": 0.71484375, - "learning_rate": 1.7062519834973024e-05, - "loss": 1.5012, - "step": 2474 - }, - { - "epoch": 0.7777450576201975, - "grad_norm": 0.734375, - "learning_rate": 1.7059980958425896e-05, - "loss": 1.3204, - "step": 2476 - }, - { - "epoch": 0.7783732846457388, - "grad_norm": 0.8046875, - "learning_rate": 1.705744208187877e-05, - "loss": 1.3475, - "step": 2478 - }, - { - "epoch": 0.7790015116712802, - "grad_norm": 0.796875, - "learning_rate": 1.7054903205331642e-05, - "loss": 1.2051, - "step": 2480 - }, - { - "epoch": 0.7796297386968216, - "grad_norm": 0.68359375, - "learning_rate": 1.7052364328784513e-05, - "loss": 1.3502, - "step": 2482 - }, - { - "epoch": 0.780257965722363, - "grad_norm": 0.9140625, - "learning_rate": 1.7049825452237388e-05, - "loss": 1.2337, - "step": 2484 - }, - { - "epoch": 0.7808861927479043, - "grad_norm": 0.77734375, - "learning_rate": 1.704728657569026e-05, - "loss": 1.3524, - "step": 2486 - }, - { - "epoch": 0.7815144197734456, - "grad_norm": 0.82421875, - "learning_rate": 1.704474769914313e-05, - "loss": 1.3843, - "step": 2488 - }, - { - "epoch": 0.7821426467989869, - "grad_norm": 0.6953125, - "learning_rate": 1.7042208822596002e-05, - "loss": 1.3905, - "step": 2490 - }, - { - "epoch": 0.7827708738245284, - "grad_norm": 0.69921875, - "learning_rate": 1.7039669946048877e-05, - "loss": 1.3168, - "step": 2492 - }, - { - "epoch": 0.7833991008500697, - "grad_norm": 0.79296875, - "learning_rate": 1.7037131069501748e-05, - "loss": 1.233, - "step": 2494 - }, - { - "epoch": 0.784027327875611, - "grad_norm": 0.77734375, - "learning_rate": 1.703459219295462e-05, - "loss": 1.3278, - "step": 2496 - }, - { - "epoch": 0.7846555549011524, - "grad_norm": 0.6953125, - "learning_rate": 1.703205331640749e-05, - "loss": 1.2751, - "step": 2498 - }, - { - "epoch": 0.7852837819266938, - "grad_norm": 0.796875, - "learning_rate": 1.7029514439860362e-05, - "loss": 1.3463, - "step": 2500 - }, - { - "epoch": 0.7859120089522351, - "grad_norm": 0.80859375, - "learning_rate": 1.7026975563313234e-05, - "loss": 1.2921, - "step": 2502 - }, - { - "epoch": 0.7865402359777764, - "grad_norm": 0.71484375, - "learning_rate": 1.702443668676611e-05, - "loss": 1.1402, - "step": 2504 - }, - { - "epoch": 0.7871684630033178, - "grad_norm": 1.125, - "learning_rate": 1.702189781021898e-05, - "loss": 1.2382, - "step": 2506 - }, - { - "epoch": 0.7877966900288592, - "grad_norm": 0.63671875, - "learning_rate": 1.701935893367185e-05, - "loss": 1.3848, - "step": 2508 - }, - { - "epoch": 0.7884249170544005, - "grad_norm": 0.7578125, - "learning_rate": 1.7016820057124723e-05, - "loss": 1.2577, - "step": 2510 - }, - { - "epoch": 0.7890531440799419, - "grad_norm": 0.74609375, - "learning_rate": 1.7014281180577597e-05, - "loss": 1.4976, - "step": 2512 - }, - { - "epoch": 0.7896813711054832, - "grad_norm": 0.65234375, - "learning_rate": 1.701174230403047e-05, - "loss": 1.3051, - "step": 2514 - }, - { - "epoch": 0.7903095981310246, - "grad_norm": 0.75, - "learning_rate": 1.700920342748334e-05, - "loss": 1.3637, - "step": 2516 - }, - { - "epoch": 0.790937825156566, - "grad_norm": 0.828125, - "learning_rate": 1.700666455093621e-05, - "loss": 1.2335, - "step": 2518 - }, - { - "epoch": 0.7915660521821073, - "grad_norm": 0.73828125, - "learning_rate": 1.7004125674389086e-05, - "loss": 1.2534, - "step": 2520 - }, - { - "epoch": 0.7921942792076486, - "grad_norm": 0.78515625, - "learning_rate": 1.7001586797841954e-05, - "loss": 1.4272, - "step": 2522 - }, - { - "epoch": 0.7928225062331901, - "grad_norm": 0.66796875, - "learning_rate": 1.699904792129483e-05, - "loss": 1.2296, - "step": 2524 - }, - { - "epoch": 0.7934507332587314, - "grad_norm": 0.765625, - "learning_rate": 1.69965090447477e-05, - "loss": 1.3799, - "step": 2526 - }, - { - "epoch": 0.7940789602842727, - "grad_norm": 0.625, - "learning_rate": 1.6993970168200572e-05, - "loss": 1.4241, - "step": 2528 - }, - { - "epoch": 0.794707187309814, - "grad_norm": 0.8125, - "learning_rate": 1.6991431291653443e-05, - "loss": 1.2411, - "step": 2530 - }, - { - "epoch": 0.7953354143353555, - "grad_norm": 1.078125, - "learning_rate": 1.6988892415106318e-05, - "loss": 1.3962, - "step": 2532 - }, - { - "epoch": 0.7959636413608968, - "grad_norm": 0.8828125, - "learning_rate": 1.698635353855919e-05, - "loss": 1.3154, - "step": 2534 - }, - { - "epoch": 0.7965918683864381, - "grad_norm": 0.62890625, - "learning_rate": 1.698381466201206e-05, - "loss": 1.3236, - "step": 2536 - }, - { - "epoch": 0.7972200954119795, - "grad_norm": 0.80859375, - "learning_rate": 1.6981275785464932e-05, - "loss": 1.2605, - "step": 2538 - }, - { - "epoch": 0.7978483224375209, - "grad_norm": 0.7578125, - "learning_rate": 1.6978736908917807e-05, - "loss": 1.2216, - "step": 2540 - }, - { - "epoch": 0.7984765494630622, - "grad_norm": 0.6875, - "learning_rate": 1.6976198032370675e-05, - "loss": 1.3394, - "step": 2542 - }, - { - "epoch": 0.7991047764886036, - "grad_norm": 0.73828125, - "learning_rate": 1.697365915582355e-05, - "loss": 1.331, - "step": 2544 - }, - { - "epoch": 0.7997330035141449, - "grad_norm": 0.72265625, - "learning_rate": 1.697112027927642e-05, - "loss": 1.3703, - "step": 2546 - }, - { - "epoch": 0.8003612305396863, - "grad_norm": 0.828125, - "learning_rate": 1.6968581402729293e-05, - "loss": 1.3128, - "step": 2548 - }, - { - "epoch": 0.8009894575652277, - "grad_norm": 0.8125, - "learning_rate": 1.6966042526182164e-05, - "loss": 1.278, - "step": 2550 - }, - { - "epoch": 0.801617684590769, - "grad_norm": 0.65625, - "learning_rate": 1.696350364963504e-05, - "loss": 1.3876, - "step": 2552 - }, - { - "epoch": 0.8022459116163103, - "grad_norm": 0.71484375, - "learning_rate": 1.696096477308791e-05, - "loss": 1.2858, - "step": 2554 - }, - { - "epoch": 0.8028741386418518, - "grad_norm": 0.6953125, - "learning_rate": 1.695842589654078e-05, - "loss": 1.412, - "step": 2556 - }, - { - "epoch": 0.8035023656673931, - "grad_norm": 0.7109375, - "learning_rate": 1.6955887019993653e-05, - "loss": 1.4499, - "step": 2558 - }, - { - "epoch": 0.8041305926929344, - "grad_norm": 0.9140625, - "learning_rate": 1.6953348143446528e-05, - "loss": 1.291, - "step": 2560 - }, - { - "epoch": 0.8047588197184757, - "grad_norm": 0.90625, - "learning_rate": 1.69508092668994e-05, - "loss": 1.4154, - "step": 2562 - }, - { - "epoch": 0.8053870467440171, - "grad_norm": 0.82421875, - "learning_rate": 1.694827039035227e-05, - "loss": 1.4474, - "step": 2564 - }, - { - "epoch": 0.8060152737695585, - "grad_norm": 0.79296875, - "learning_rate": 1.6945731513805145e-05, - "loss": 1.3263, - "step": 2566 - }, - { - "epoch": 0.8066435007950998, - "grad_norm": 0.84375, - "learning_rate": 1.6943192637258013e-05, - "loss": 1.3238, - "step": 2568 - }, - { - "epoch": 0.8072717278206412, - "grad_norm": 0.83984375, - "learning_rate": 1.6940653760710888e-05, - "loss": 1.4225, - "step": 2570 - }, - { - "epoch": 0.8078999548461825, - "grad_norm": 0.70703125, - "learning_rate": 1.693811488416376e-05, - "loss": 1.2038, - "step": 2572 - }, - { - "epoch": 0.8085281818717239, - "grad_norm": 0.8359375, - "learning_rate": 1.693557600761663e-05, - "loss": 1.1913, - "step": 2574 - }, - { - "epoch": 0.8091564088972653, - "grad_norm": 0.76953125, - "learning_rate": 1.6933037131069502e-05, - "loss": 1.3431, - "step": 2576 - }, - { - "epoch": 0.8097846359228066, - "grad_norm": 0.87890625, - "learning_rate": 1.6930498254522377e-05, - "loss": 1.3336, - "step": 2578 - }, - { - "epoch": 0.8104128629483479, - "grad_norm": 0.87890625, - "learning_rate": 1.6927959377975248e-05, - "loss": 1.2205, - "step": 2580 - }, - { - "epoch": 0.8110410899738894, - "grad_norm": 0.69921875, - "learning_rate": 1.692542050142812e-05, - "loss": 1.3004, - "step": 2582 - }, - { - "epoch": 0.8116693169994307, - "grad_norm": 0.75390625, - "learning_rate": 1.692288162488099e-05, - "loss": 1.3125, - "step": 2584 - }, - { - "epoch": 0.812297544024972, - "grad_norm": 0.6953125, - "learning_rate": 1.6920342748333866e-05, - "loss": 1.4572, - "step": 2586 - }, - { - "epoch": 0.8129257710505133, - "grad_norm": 0.74609375, - "learning_rate": 1.6917803871786737e-05, - "loss": 1.2809, - "step": 2588 - }, - { - "epoch": 0.8135539980760548, - "grad_norm": 0.66796875, - "learning_rate": 1.691526499523961e-05, - "loss": 1.2979, - "step": 2590 - }, - { - "epoch": 0.8141822251015961, - "grad_norm": 0.890625, - "learning_rate": 1.691272611869248e-05, - "loss": 1.3751, - "step": 2592 - }, - { - "epoch": 0.8148104521271374, - "grad_norm": 0.8125, - "learning_rate": 1.691018724214535e-05, - "loss": 1.3556, - "step": 2594 - }, - { - "epoch": 0.8154386791526788, - "grad_norm": 0.734375, - "learning_rate": 1.6907648365598223e-05, - "loss": 1.2648, - "step": 2596 - }, - { - "epoch": 0.8160669061782202, - "grad_norm": 0.77734375, - "learning_rate": 1.6905109489051097e-05, - "loss": 1.3499, - "step": 2598 - }, - { - "epoch": 0.8166951332037615, - "grad_norm": 0.8359375, - "learning_rate": 1.690257061250397e-05, - "loss": 1.3424, - "step": 2600 - }, - { - "epoch": 0.8173233602293029, - "grad_norm": 0.72265625, - "learning_rate": 1.690003173595684e-05, - "loss": 1.3746, - "step": 2602 - }, - { - "epoch": 0.8179515872548442, - "grad_norm": 0.78515625, - "learning_rate": 1.689749285940971e-05, - "loss": 1.3152, - "step": 2604 - }, - { - "epoch": 0.8185798142803856, - "grad_norm": 0.7109375, - "learning_rate": 1.6894953982862586e-05, - "loss": 1.3755, - "step": 2606 - }, - { - "epoch": 0.819208041305927, - "grad_norm": 0.84765625, - "learning_rate": 1.6892415106315458e-05, - "loss": 1.2247, - "step": 2608 - }, - { - "epoch": 0.8198362683314683, - "grad_norm": 0.69921875, - "learning_rate": 1.688987622976833e-05, - "loss": 1.4328, - "step": 2610 - }, - { - "epoch": 0.8204644953570096, - "grad_norm": 0.6796875, - "learning_rate": 1.68873373532212e-05, - "loss": 1.2965, - "step": 2612 - }, - { - "epoch": 0.821092722382551, - "grad_norm": 0.91015625, - "learning_rate": 1.6884798476674075e-05, - "loss": 1.2175, - "step": 2614 - }, - { - "epoch": 0.8217209494080924, - "grad_norm": 0.8828125, - "learning_rate": 1.6882259600126943e-05, - "loss": 1.1868, - "step": 2616 - }, - { - "epoch": 0.8223491764336337, - "grad_norm": 0.9296875, - "learning_rate": 1.6879720723579818e-05, - "loss": 1.331, - "step": 2618 - }, - { - "epoch": 0.822977403459175, - "grad_norm": 0.69140625, - "learning_rate": 1.687718184703269e-05, - "loss": 1.3342, - "step": 2620 - }, - { - "epoch": 0.8236056304847165, - "grad_norm": 0.68359375, - "learning_rate": 1.687464297048556e-05, - "loss": 1.3036, - "step": 2622 - }, - { - "epoch": 0.8242338575102578, - "grad_norm": 0.75390625, - "learning_rate": 1.6872104093938432e-05, - "loss": 1.2481, - "step": 2624 - }, - { - "epoch": 0.8248620845357991, - "grad_norm": 0.703125, - "learning_rate": 1.6869565217391307e-05, - "loss": 1.3175, - "step": 2626 - }, - { - "epoch": 0.8254903115613405, - "grad_norm": 0.97265625, - "learning_rate": 1.686702634084418e-05, - "loss": 1.3181, - "step": 2628 - }, - { - "epoch": 0.8261185385868819, - "grad_norm": 0.7421875, - "learning_rate": 1.686448746429705e-05, - "loss": 1.3106, - "step": 2630 - }, - { - "epoch": 0.8267467656124232, - "grad_norm": 0.82421875, - "learning_rate": 1.686194858774992e-05, - "loss": 1.3216, - "step": 2632 - }, - { - "epoch": 0.8273749926379645, - "grad_norm": 0.85546875, - "learning_rate": 1.6859409711202796e-05, - "loss": 1.3221, - "step": 2634 - }, - { - "epoch": 0.8280032196635059, - "grad_norm": 0.7734375, - "learning_rate": 1.6856870834655664e-05, - "loss": 1.3614, - "step": 2636 - }, - { - "epoch": 0.8286314466890472, - "grad_norm": 0.7890625, - "learning_rate": 1.685433195810854e-05, - "loss": 1.3956, - "step": 2638 - }, - { - "epoch": 0.8292596737145886, - "grad_norm": 0.6875, - "learning_rate": 1.685179308156141e-05, - "loss": 1.1662, - "step": 2640 - }, - { - "epoch": 0.82988790074013, - "grad_norm": 0.76953125, - "learning_rate": 1.684925420501428e-05, - "loss": 1.2505, - "step": 2642 - }, - { - "epoch": 0.8305161277656713, - "grad_norm": 0.86328125, - "learning_rate": 1.6846715328467153e-05, - "loss": 1.251, - "step": 2644 - }, - { - "epoch": 0.8311443547912126, - "grad_norm": 0.78515625, - "learning_rate": 1.6844176451920028e-05, - "loss": 1.398, - "step": 2646 - }, - { - "epoch": 0.8317725818167541, - "grad_norm": 0.79296875, - "learning_rate": 1.68416375753729e-05, - "loss": 1.2618, - "step": 2648 - }, - { - "epoch": 0.8324008088422954, - "grad_norm": 0.66796875, - "learning_rate": 1.683909869882577e-05, - "loss": 1.3516, - "step": 2650 - }, - { - "epoch": 0.8330290358678367, - "grad_norm": 0.74609375, - "learning_rate": 1.6836559822278645e-05, - "loss": 1.4359, - "step": 2652 - }, - { - "epoch": 0.833657262893378, - "grad_norm": 0.703125, - "learning_rate": 1.6834020945731516e-05, - "loss": 1.3158, - "step": 2654 - }, - { - "epoch": 0.8342854899189195, - "grad_norm": 0.81640625, - "learning_rate": 1.6831482069184388e-05, - "loss": 1.2849, - "step": 2656 - }, - { - "epoch": 0.8349137169444608, - "grad_norm": 0.734375, - "learning_rate": 1.682894319263726e-05, - "loss": 1.4921, - "step": 2658 - }, - { - "epoch": 0.8355419439700021, - "grad_norm": 0.94921875, - "learning_rate": 1.6826404316090134e-05, - "loss": 1.2774, - "step": 2660 - }, - { - "epoch": 0.8361701709955435, - "grad_norm": 0.78125, - "learning_rate": 1.6823865439543002e-05, - "loss": 1.3282, - "step": 2662 - }, - { - "epoch": 0.8367983980210849, - "grad_norm": 0.75, - "learning_rate": 1.6821326562995877e-05, - "loss": 1.2604, - "step": 2664 - }, - { - "epoch": 0.8374266250466262, - "grad_norm": 0.75390625, - "learning_rate": 1.6818787686448748e-05, - "loss": 1.2322, - "step": 2666 - }, - { - "epoch": 0.8380548520721676, - "grad_norm": 0.75, - "learning_rate": 1.681624880990162e-05, - "loss": 1.3847, - "step": 2668 - }, - { - "epoch": 0.8386830790977089, - "grad_norm": 1.0, - "learning_rate": 1.681370993335449e-05, - "loss": 1.2521, - "step": 2670 - }, - { - "epoch": 0.8393113061232503, - "grad_norm": 0.73046875, - "learning_rate": 1.6811171056807366e-05, - "loss": 1.4187, - "step": 2672 - }, - { - "epoch": 0.8399395331487917, - "grad_norm": 0.7109375, - "learning_rate": 1.6808632180260237e-05, - "loss": 1.195, - "step": 2674 - }, - { - "epoch": 0.840567760174333, - "grad_norm": 1.015625, - "learning_rate": 1.680609330371311e-05, - "loss": 1.3454, - "step": 2676 - }, - { - "epoch": 0.8411959871998743, - "grad_norm": 0.78515625, - "learning_rate": 1.680355442716598e-05, - "loss": 1.453, - "step": 2678 - }, - { - "epoch": 0.8418242142254158, - "grad_norm": 0.68359375, - "learning_rate": 1.6801015550618855e-05, - "loss": 1.4218, - "step": 2680 - }, - { - "epoch": 0.8424524412509571, - "grad_norm": 0.85546875, - "learning_rate": 1.6798476674071723e-05, - "loss": 1.4194, - "step": 2682 - }, - { - "epoch": 0.8430806682764984, - "grad_norm": 0.80859375, - "learning_rate": 1.6795937797524597e-05, - "loss": 1.3225, - "step": 2684 - }, - { - "epoch": 0.8437088953020397, - "grad_norm": 0.7578125, - "learning_rate": 1.679339892097747e-05, - "loss": 1.205, - "step": 2686 - }, - { - "epoch": 0.8443371223275812, - "grad_norm": 1.046875, - "learning_rate": 1.679086004443034e-05, - "loss": 1.425, - "step": 2688 - }, - { - "epoch": 0.8449653493531225, - "grad_norm": 0.6875, - "learning_rate": 1.678832116788321e-05, - "loss": 1.3743, - "step": 2690 - }, - { - "epoch": 0.8455935763786638, - "grad_norm": 0.83203125, - "learning_rate": 1.6785782291336086e-05, - "loss": 1.2462, - "step": 2692 - }, - { - "epoch": 0.8462218034042052, - "grad_norm": 0.671875, - "learning_rate": 1.6783243414788958e-05, - "loss": 1.3989, - "step": 2694 - }, - { - "epoch": 0.8468500304297466, - "grad_norm": 0.76953125, - "learning_rate": 1.678070453824183e-05, - "loss": 1.4101, - "step": 2696 - }, - { - "epoch": 0.8474782574552879, - "grad_norm": 0.71484375, - "learning_rate": 1.67781656616947e-05, - "loss": 1.2639, - "step": 2698 - }, - { - "epoch": 0.8481064844808293, - "grad_norm": 0.79296875, - "learning_rate": 1.6775626785147575e-05, - "loss": 1.3388, - "step": 2700 - }, - { - "epoch": 0.8487347115063706, - "grad_norm": 0.78515625, - "learning_rate": 1.6773087908600447e-05, - "loss": 1.363, - "step": 2702 - }, - { - "epoch": 0.8493629385319119, - "grad_norm": 0.828125, - "learning_rate": 1.6770549032053318e-05, - "loss": 1.2831, - "step": 2704 - }, - { - "epoch": 0.8499911655574534, - "grad_norm": 0.7109375, - "learning_rate": 1.676801015550619e-05, - "loss": 1.2638, - "step": 2706 - }, - { - "epoch": 0.8506193925829947, - "grad_norm": 0.6875, - "learning_rate": 1.676547127895906e-05, - "loss": 1.3733, - "step": 2708 - }, - { - "epoch": 0.851247619608536, - "grad_norm": 0.6796875, - "learning_rate": 1.6762932402411932e-05, - "loss": 1.3726, - "step": 2710 - }, - { - "epoch": 0.8518758466340773, - "grad_norm": 0.73828125, - "learning_rate": 1.6760393525864807e-05, - "loss": 1.3406, - "step": 2712 - }, - { - "epoch": 0.8525040736596188, - "grad_norm": 0.69921875, - "learning_rate": 1.6757854649317678e-05, - "loss": 1.4331, - "step": 2714 - }, - { - "epoch": 0.8531323006851601, - "grad_norm": 0.7890625, - "learning_rate": 1.675531577277055e-05, - "loss": 1.302, - "step": 2716 - }, - { - "epoch": 0.8537605277107014, - "grad_norm": 0.79296875, - "learning_rate": 1.675277689622342e-05, - "loss": 1.3428, - "step": 2718 - }, - { - "epoch": 0.8543887547362428, - "grad_norm": 0.765625, - "learning_rate": 1.6750238019676296e-05, - "loss": 1.2827, - "step": 2720 - }, - { - "epoch": 0.8550169817617842, - "grad_norm": 0.67578125, - "learning_rate": 1.6747699143129167e-05, - "loss": 1.2744, - "step": 2722 - }, - { - "epoch": 0.8556452087873255, - "grad_norm": 0.75, - "learning_rate": 1.674516026658204e-05, - "loss": 1.2999, - "step": 2724 - }, - { - "epoch": 0.8562734358128669, - "grad_norm": 0.9765625, - "learning_rate": 1.674262139003491e-05, - "loss": 1.2288, - "step": 2726 - }, - { - "epoch": 0.8569016628384082, - "grad_norm": 0.7109375, - "learning_rate": 1.6740082513487785e-05, - "loss": 1.3101, - "step": 2728 - }, - { - "epoch": 0.8575298898639496, - "grad_norm": 0.71484375, - "learning_rate": 1.6737543636940653e-05, - "loss": 1.309, - "step": 2730 - }, - { - "epoch": 0.858158116889491, - "grad_norm": 0.69921875, - "learning_rate": 1.6735004760393527e-05, - "loss": 1.3683, - "step": 2732 - }, - { - "epoch": 0.8587863439150323, - "grad_norm": 1.015625, - "learning_rate": 1.67324658838464e-05, - "loss": 1.2708, - "step": 2734 - }, - { - "epoch": 0.8594145709405736, - "grad_norm": 0.7578125, - "learning_rate": 1.672992700729927e-05, - "loss": 1.5443, - "step": 2736 - }, - { - "epoch": 0.860042797966115, - "grad_norm": 0.73046875, - "learning_rate": 1.6727388130752145e-05, - "loss": 1.3305, - "step": 2738 - }, - { - "epoch": 0.8606710249916564, - "grad_norm": 0.86328125, - "learning_rate": 1.6724849254205016e-05, - "loss": 1.3512, - "step": 2740 - }, - { - "epoch": 0.8612992520171977, - "grad_norm": 0.73828125, - "learning_rate": 1.6722310377657888e-05, - "loss": 1.3854, - "step": 2742 - }, - { - "epoch": 0.861927479042739, - "grad_norm": 0.75390625, - "learning_rate": 1.671977150111076e-05, - "loss": 1.2901, - "step": 2744 - }, - { - "epoch": 0.8625557060682805, - "grad_norm": 0.68359375, - "learning_rate": 1.6717232624563634e-05, - "loss": 1.3502, - "step": 2746 - }, - { - "epoch": 0.8631839330938218, - "grad_norm": 0.7578125, - "learning_rate": 1.6714693748016505e-05, - "loss": 1.1293, - "step": 2748 - }, - { - "epoch": 0.8638121601193631, - "grad_norm": 0.74609375, - "learning_rate": 1.6712154871469377e-05, - "loss": 1.3325, - "step": 2750 - }, - { - "epoch": 0.8644403871449045, - "grad_norm": 0.7890625, - "learning_rate": 1.6709615994922248e-05, - "loss": 1.4138, - "step": 2752 - }, - { - "epoch": 0.8650686141704459, - "grad_norm": 0.69140625, - "learning_rate": 1.6707077118375123e-05, - "loss": 1.2818, - "step": 2754 - }, - { - "epoch": 0.8656968411959872, - "grad_norm": 0.73046875, - "learning_rate": 1.670453824182799e-05, - "loss": 1.2926, - "step": 2756 - }, - { - "epoch": 0.8663250682215285, - "grad_norm": 0.6953125, - "learning_rate": 1.6701999365280866e-05, - "loss": 1.3686, - "step": 2758 - }, - { - "epoch": 0.8669532952470699, - "grad_norm": 0.8359375, - "learning_rate": 1.6699460488733737e-05, - "loss": 1.2924, - "step": 2760 - }, - { - "epoch": 0.8675815222726113, - "grad_norm": 0.78515625, - "learning_rate": 1.669692161218661e-05, - "loss": 1.4022, - "step": 2762 - }, - { - "epoch": 0.8682097492981526, - "grad_norm": 0.8359375, - "learning_rate": 1.669438273563948e-05, - "loss": 1.429, - "step": 2764 - }, - { - "epoch": 0.868837976323694, - "grad_norm": 0.7890625, - "learning_rate": 1.6691843859092355e-05, - "loss": 1.2911, - "step": 2766 - }, - { - "epoch": 0.8694662033492353, - "grad_norm": 0.73046875, - "learning_rate": 1.6689304982545226e-05, - "loss": 1.4, - "step": 2768 - }, - { - "epoch": 0.8700944303747767, - "grad_norm": 0.88671875, - "learning_rate": 1.6686766105998097e-05, - "loss": 1.3409, - "step": 2770 - }, - { - "epoch": 0.8707226574003181, - "grad_norm": 1.0390625, - "learning_rate": 1.668422722945097e-05, - "loss": 1.2781, - "step": 2772 - }, - { - "epoch": 0.8713508844258594, - "grad_norm": 0.8359375, - "learning_rate": 1.6681688352903843e-05, - "loss": 1.3083, - "step": 2774 - }, - { - "epoch": 0.8719791114514007, - "grad_norm": 0.73046875, - "learning_rate": 1.667914947635671e-05, - "loss": 1.2491, - "step": 2776 - }, - { - "epoch": 0.872607338476942, - "grad_norm": 0.67578125, - "learning_rate": 1.6676610599809586e-05, - "loss": 1.3156, - "step": 2778 - }, - { - "epoch": 0.8732355655024835, - "grad_norm": 0.8515625, - "learning_rate": 1.6674071723262458e-05, - "loss": 1.2403, - "step": 2780 - }, - { - "epoch": 0.8738637925280248, - "grad_norm": 0.74609375, - "learning_rate": 1.667153284671533e-05, - "loss": 1.4226, - "step": 2782 - }, - { - "epoch": 0.8744920195535661, - "grad_norm": 0.84765625, - "learning_rate": 1.66689939701682e-05, - "loss": 1.2981, - "step": 2784 - }, - { - "epoch": 0.8751202465791075, - "grad_norm": 0.6953125, - "learning_rate": 1.6666455093621075e-05, - "loss": 1.256, - "step": 2786 - }, - { - "epoch": 0.8757484736046489, - "grad_norm": 0.734375, - "learning_rate": 1.6663916217073946e-05, - "loss": 1.255, - "step": 2788 - }, - { - "epoch": 0.8763767006301902, - "grad_norm": 0.7265625, - "learning_rate": 1.6661377340526818e-05, - "loss": 1.2185, - "step": 2790 - }, - { - "epoch": 0.8770049276557316, - "grad_norm": 0.6640625, - "learning_rate": 1.665883846397969e-05, - "loss": 1.4315, - "step": 2792 - }, - { - "epoch": 0.8776331546812729, - "grad_norm": 0.703125, - "learning_rate": 1.6656299587432564e-05, - "loss": 1.4531, - "step": 2794 - }, - { - "epoch": 0.8782613817068143, - "grad_norm": 0.8828125, - "learning_rate": 1.6653760710885435e-05, - "loss": 1.2937, - "step": 2796 - }, - { - "epoch": 0.8788896087323557, - "grad_norm": 0.9375, - "learning_rate": 1.6651221834338307e-05, - "loss": 1.2382, - "step": 2798 - }, - { - "epoch": 0.879517835757897, - "grad_norm": 0.8515625, - "learning_rate": 1.6648682957791178e-05, - "loss": 1.2398, - "step": 2800 - }, - { - "epoch": 0.8801460627834383, - "grad_norm": 0.7890625, - "learning_rate": 1.664614408124405e-05, - "loss": 1.3117, - "step": 2802 - }, - { - "epoch": 0.8807742898089798, - "grad_norm": 0.88671875, - "learning_rate": 1.664360520469692e-05, - "loss": 1.35, - "step": 2804 - }, - { - "epoch": 0.8814025168345211, - "grad_norm": 0.8125, - "learning_rate": 1.6641066328149796e-05, - "loss": 1.4186, - "step": 2806 - }, - { - "epoch": 0.8820307438600624, - "grad_norm": 0.67578125, - "learning_rate": 1.6638527451602667e-05, - "loss": 1.2733, - "step": 2808 - }, - { - "epoch": 0.8826589708856037, - "grad_norm": 0.734375, - "learning_rate": 1.663598857505554e-05, - "loss": 1.312, - "step": 2810 - }, - { - "epoch": 0.8832871979111452, - "grad_norm": 0.76171875, - "learning_rate": 1.663344969850841e-05, - "loss": 1.2711, - "step": 2812 - }, - { - "epoch": 0.8839154249366865, - "grad_norm": 0.7265625, - "learning_rate": 1.6630910821961285e-05, - "loss": 1.3649, - "step": 2814 - }, - { - "epoch": 0.8845436519622278, - "grad_norm": 0.734375, - "learning_rate": 1.6628371945414156e-05, - "loss": 1.5247, - "step": 2816 - }, - { - "epoch": 0.8851718789877692, - "grad_norm": 0.76171875, - "learning_rate": 1.6625833068867027e-05, - "loss": 1.2794, - "step": 2818 - }, - { - "epoch": 0.8858001060133106, - "grad_norm": 0.68359375, - "learning_rate": 1.66232941923199e-05, - "loss": 1.2475, - "step": 2820 - }, - { - "epoch": 0.8864283330388519, - "grad_norm": 0.8359375, - "learning_rate": 1.6620755315772774e-05, - "loss": 1.1927, - "step": 2822 - }, - { - "epoch": 0.8870565600643933, - "grad_norm": 0.8828125, - "learning_rate": 1.6618216439225645e-05, - "loss": 1.2817, - "step": 2824 - }, - { - "epoch": 0.8876847870899346, - "grad_norm": 0.72265625, - "learning_rate": 1.6615677562678516e-05, - "loss": 1.3958, - "step": 2826 - }, - { - "epoch": 0.888313014115476, - "grad_norm": 0.69140625, - "learning_rate": 1.6613138686131388e-05, - "loss": 1.4729, - "step": 2828 - }, - { - "epoch": 0.8889412411410174, - "grad_norm": 0.87109375, - "learning_rate": 1.661059980958426e-05, - "loss": 1.2705, - "step": 2830 - }, - { - "epoch": 0.8895694681665587, - "grad_norm": 0.73828125, - "learning_rate": 1.6608060933037134e-05, - "loss": 1.4104, - "step": 2832 - }, - { - "epoch": 0.8901976951921, - "grad_norm": 0.78125, - "learning_rate": 1.6605522056490005e-05, - "loss": 1.3625, - "step": 2834 - }, - { - "epoch": 0.8908259222176415, - "grad_norm": 0.8046875, - "learning_rate": 1.6602983179942877e-05, - "loss": 1.3754, - "step": 2836 - }, - { - "epoch": 0.8914541492431828, - "grad_norm": 0.7578125, - "learning_rate": 1.6600444303395748e-05, - "loss": 1.2763, - "step": 2838 - }, - { - "epoch": 0.8920823762687241, - "grad_norm": 0.6953125, - "learning_rate": 1.6597905426848623e-05, - "loss": 1.1745, - "step": 2840 - }, - { - "epoch": 0.8927106032942654, - "grad_norm": 0.75390625, - "learning_rate": 1.6595366550301494e-05, - "loss": 1.2782, - "step": 2842 - }, - { - "epoch": 0.8933388303198068, - "grad_norm": 0.796875, - "learning_rate": 1.6592827673754366e-05, - "loss": 1.3032, - "step": 2844 - }, - { - "epoch": 0.8939670573453482, - "grad_norm": 0.7265625, - "learning_rate": 1.6590288797207237e-05, - "loss": 1.4176, - "step": 2846 - }, - { - "epoch": 0.8945952843708895, - "grad_norm": 0.71875, - "learning_rate": 1.658774992066011e-05, - "loss": 1.3804, - "step": 2848 - }, - { - "epoch": 0.8952235113964309, - "grad_norm": 0.6796875, - "learning_rate": 1.658521104411298e-05, - "loss": 1.3173, - "step": 2850 - }, - { - "epoch": 0.8958517384219722, - "grad_norm": 0.86328125, - "learning_rate": 1.6582672167565854e-05, - "loss": 1.1673, - "step": 2852 - }, - { - "epoch": 0.8964799654475136, - "grad_norm": 0.69921875, - "learning_rate": 1.6580133291018726e-05, - "loss": 1.3201, - "step": 2854 - }, - { - "epoch": 0.897108192473055, - "grad_norm": 0.6953125, - "learning_rate": 1.6577594414471597e-05, - "loss": 1.3234, - "step": 2856 - }, - { - "epoch": 0.8977364194985963, - "grad_norm": 2.359375, - "learning_rate": 1.657505553792447e-05, - "loss": 1.4672, - "step": 2858 - }, - { - "epoch": 0.8983646465241376, - "grad_norm": 0.78515625, - "learning_rate": 1.6572516661377343e-05, - "loss": 1.3377, - "step": 2860 - }, - { - "epoch": 0.898992873549679, - "grad_norm": 0.71484375, - "learning_rate": 1.6569977784830215e-05, - "loss": 1.2545, - "step": 2862 - }, - { - "epoch": 0.8996211005752204, - "grad_norm": 0.8984375, - "learning_rate": 1.6567438908283086e-05, - "loss": 1.2684, - "step": 2864 - }, - { - "epoch": 0.9002493276007617, - "grad_norm": 0.7421875, - "learning_rate": 1.6564900031735957e-05, - "loss": 1.2329, - "step": 2866 - }, - { - "epoch": 0.900877554626303, - "grad_norm": 0.6796875, - "learning_rate": 1.6562361155188832e-05, - "loss": 1.4101, - "step": 2868 - }, - { - "epoch": 0.9015057816518445, - "grad_norm": 0.7578125, - "learning_rate": 1.65598222786417e-05, - "loss": 1.2965, - "step": 2870 - }, - { - "epoch": 0.9021340086773858, - "grad_norm": 0.90234375, - "learning_rate": 1.6557283402094575e-05, - "loss": 1.331, - "step": 2872 - }, - { - "epoch": 0.9027622357029271, - "grad_norm": 0.765625, - "learning_rate": 1.6554744525547446e-05, - "loss": 1.4061, - "step": 2874 - }, - { - "epoch": 0.9033904627284685, - "grad_norm": 0.76953125, - "learning_rate": 1.6552205649000318e-05, - "loss": 1.3482, - "step": 2876 - }, - { - "epoch": 0.9040186897540099, - "grad_norm": 0.68359375, - "learning_rate": 1.654966677245319e-05, - "loss": 1.3822, - "step": 2878 - }, - { - "epoch": 0.9046469167795512, - "grad_norm": 0.79296875, - "learning_rate": 1.6547127895906064e-05, - "loss": 1.2013, - "step": 2880 - }, - { - "epoch": 0.9052751438050926, - "grad_norm": 0.75390625, - "learning_rate": 1.6544589019358935e-05, - "loss": 1.2415, - "step": 2882 - }, - { - "epoch": 0.9059033708306339, - "grad_norm": 0.8984375, - "learning_rate": 1.6542050142811807e-05, - "loss": 1.3142, - "step": 2884 - }, - { - "epoch": 0.9065315978561753, - "grad_norm": 0.7734375, - "learning_rate": 1.6539511266264678e-05, - "loss": 1.3292, - "step": 2886 - }, - { - "epoch": 0.9071598248817166, - "grad_norm": 0.7421875, - "learning_rate": 1.6536972389717553e-05, - "loss": 1.3243, - "step": 2888 - }, - { - "epoch": 0.907788051907258, - "grad_norm": 0.75, - "learning_rate": 1.6534433513170424e-05, - "loss": 1.2548, - "step": 2890 - }, - { - "epoch": 0.9084162789327993, - "grad_norm": 0.78515625, - "learning_rate": 1.6531894636623296e-05, - "loss": 1.3526, - "step": 2892 - }, - { - "epoch": 0.9090445059583407, - "grad_norm": 0.7890625, - "learning_rate": 1.6529355760076167e-05, - "loss": 1.3198, - "step": 2894 - }, - { - "epoch": 0.9096727329838821, - "grad_norm": 0.6875, - "learning_rate": 1.652681688352904e-05, - "loss": 1.0987, - "step": 2896 - }, - { - "epoch": 0.9103009600094234, - "grad_norm": 0.7890625, - "learning_rate": 1.652427800698191e-05, - "loss": 1.2387, - "step": 2898 - }, - { - "epoch": 0.9109291870349647, - "grad_norm": 0.71484375, - "learning_rate": 1.6521739130434785e-05, - "loss": 1.1774, - "step": 2900 - }, - { - "epoch": 0.9115574140605062, - "grad_norm": 0.78515625, - "learning_rate": 1.6519200253887656e-05, - "loss": 1.2341, - "step": 2902 - }, - { - "epoch": 0.9121856410860475, - "grad_norm": 0.796875, - "learning_rate": 1.6516661377340527e-05, - "loss": 1.2046, - "step": 2904 - }, - { - "epoch": 0.9128138681115888, - "grad_norm": 0.7890625, - "learning_rate": 1.65141225007934e-05, - "loss": 1.477, - "step": 2906 - }, - { - "epoch": 0.9134420951371302, - "grad_norm": 0.7109375, - "learning_rate": 1.6511583624246273e-05, - "loss": 1.4045, - "step": 2908 - }, - { - "epoch": 0.9140703221626716, - "grad_norm": 0.7734375, - "learning_rate": 1.6509044747699145e-05, - "loss": 1.2798, - "step": 2910 - }, - { - "epoch": 0.9146985491882129, - "grad_norm": 0.703125, - "learning_rate": 1.6506505871152016e-05, - "loss": 1.3729, - "step": 2912 - }, - { - "epoch": 0.9153267762137542, - "grad_norm": 0.84375, - "learning_rate": 1.650396699460489e-05, - "loss": 1.3434, - "step": 2914 - }, - { - "epoch": 0.9159550032392956, - "grad_norm": 0.73046875, - "learning_rate": 1.650142811805776e-05, - "loss": 1.3866, - "step": 2916 - }, - { - "epoch": 0.9165832302648369, - "grad_norm": 0.71875, - "learning_rate": 1.6498889241510634e-05, - "loss": 1.2233, - "step": 2918 - }, - { - "epoch": 0.9172114572903783, - "grad_norm": 0.95703125, - "learning_rate": 1.6496350364963505e-05, - "loss": 1.2631, - "step": 2920 - }, - { - "epoch": 0.9178396843159197, - "grad_norm": 0.6640625, - "learning_rate": 1.6493811488416377e-05, - "loss": 1.3768, - "step": 2922 - }, - { - "epoch": 0.918467911341461, - "grad_norm": 0.84375, - "learning_rate": 1.6491272611869248e-05, - "loss": 1.2722, - "step": 2924 - }, - { - "epoch": 0.9190961383670023, - "grad_norm": 0.83203125, - "learning_rate": 1.6488733735322123e-05, - "loss": 1.2799, - "step": 2926 - }, - { - "epoch": 0.9197243653925438, - "grad_norm": 0.859375, - "learning_rate": 1.6486194858774994e-05, - "loss": 1.2571, - "step": 2928 - }, - { - "epoch": 0.9203525924180851, - "grad_norm": 0.71875, - "learning_rate": 1.6483655982227865e-05, - "loss": 1.2148, - "step": 2930 - }, - { - "epoch": 0.9209808194436264, - "grad_norm": 0.74609375, - "learning_rate": 1.6481117105680737e-05, - "loss": 1.3129, - "step": 2932 - }, - { - "epoch": 0.9216090464691677, - "grad_norm": 0.71484375, - "learning_rate": 1.647857822913361e-05, - "loss": 1.2683, - "step": 2934 - }, - { - "epoch": 0.9222372734947092, - "grad_norm": 0.703125, - "learning_rate": 1.6476039352586483e-05, - "loss": 1.356, - "step": 2936 - }, - { - "epoch": 0.9228655005202505, - "grad_norm": 0.74609375, - "learning_rate": 1.6473500476039354e-05, - "loss": 1.2901, - "step": 2938 - }, - { - "epoch": 0.9234937275457918, - "grad_norm": 0.68359375, - "learning_rate": 1.6470961599492226e-05, - "loss": 1.4158, - "step": 2940 - }, - { - "epoch": 0.9241219545713332, - "grad_norm": 0.796875, - "learning_rate": 1.6468422722945097e-05, - "loss": 1.2391, - "step": 2942 - }, - { - "epoch": 0.9247501815968746, - "grad_norm": 0.74609375, - "learning_rate": 1.646588384639797e-05, - "loss": 1.3964, - "step": 2944 - }, - { - "epoch": 0.9253784086224159, - "grad_norm": 0.71875, - "learning_rate": 1.6463344969850843e-05, - "loss": 1.3187, - "step": 2946 - }, - { - "epoch": 0.9260066356479573, - "grad_norm": 0.6640625, - "learning_rate": 1.6460806093303715e-05, - "loss": 1.3794, - "step": 2948 - }, - { - "epoch": 0.9266348626734986, - "grad_norm": 0.69140625, - "learning_rate": 1.6458267216756586e-05, - "loss": 1.3897, - "step": 2950 - }, - { - "epoch": 0.92726308969904, - "grad_norm": 0.73046875, - "learning_rate": 1.6455728340209457e-05, - "loss": 1.2514, - "step": 2952 - }, - { - "epoch": 0.9278913167245814, - "grad_norm": 0.7265625, - "learning_rate": 1.6453189463662332e-05, - "loss": 1.2275, - "step": 2954 - }, - { - "epoch": 0.9285195437501227, - "grad_norm": 0.8828125, - "learning_rate": 1.6450650587115204e-05, - "loss": 1.3661, - "step": 2956 - }, - { - "epoch": 0.929147770775664, - "grad_norm": 0.703125, - "learning_rate": 1.6448111710568075e-05, - "loss": 1.3095, - "step": 2958 - }, - { - "epoch": 0.9297759978012055, - "grad_norm": 0.80859375, - "learning_rate": 1.6445572834020946e-05, - "loss": 1.4244, - "step": 2960 - }, - { - "epoch": 0.9304042248267468, - "grad_norm": 0.69921875, - "learning_rate": 1.644303395747382e-05, - "loss": 1.3683, - "step": 2962 - }, - { - "epoch": 0.9310324518522881, - "grad_norm": 0.71875, - "learning_rate": 1.644049508092669e-05, - "loss": 1.512, - "step": 2964 - }, - { - "epoch": 0.9316606788778294, - "grad_norm": 0.80859375, - "learning_rate": 1.6437956204379564e-05, - "loss": 1.4732, - "step": 2966 - }, - { - "epoch": 0.9322889059033709, - "grad_norm": 0.734375, - "learning_rate": 1.6435417327832435e-05, - "loss": 1.34, - "step": 2968 - }, - { - "epoch": 0.9329171329289122, - "grad_norm": 0.77734375, - "learning_rate": 1.6432878451285307e-05, - "loss": 1.2436, - "step": 2970 - }, - { - "epoch": 0.9335453599544535, - "grad_norm": 0.7421875, - "learning_rate": 1.6430339574738178e-05, - "loss": 1.3719, - "step": 2972 - }, - { - "epoch": 0.9341735869799949, - "grad_norm": 0.79296875, - "learning_rate": 1.6427800698191053e-05, - "loss": 1.3081, - "step": 2974 - }, - { - "epoch": 0.9348018140055363, - "grad_norm": 0.74609375, - "learning_rate": 1.6425261821643924e-05, - "loss": 1.3141, - "step": 2976 - }, - { - "epoch": 0.9354300410310776, - "grad_norm": 0.73828125, - "learning_rate": 1.6422722945096796e-05, - "loss": 1.3385, - "step": 2978 - }, - { - "epoch": 0.936058268056619, - "grad_norm": 0.73046875, - "learning_rate": 1.6420184068549667e-05, - "loss": 1.3452, - "step": 2980 - }, - { - "epoch": 0.9366864950821603, - "grad_norm": 0.80078125, - "learning_rate": 1.6417645192002542e-05, - "loss": 1.2058, - "step": 2982 - }, - { - "epoch": 0.9373147221077017, - "grad_norm": 0.75390625, - "learning_rate": 1.641510631545541e-05, - "loss": 1.3226, - "step": 2984 - }, - { - "epoch": 0.937942949133243, - "grad_norm": 0.7109375, - "learning_rate": 1.6412567438908284e-05, - "loss": 1.362, - "step": 2986 - }, - { - "epoch": 0.9385711761587844, - "grad_norm": 0.734375, - "learning_rate": 1.6410028562361156e-05, - "loss": 1.2904, - "step": 2988 - }, - { - "epoch": 0.9391994031843257, - "grad_norm": 0.875, - "learning_rate": 1.6407489685814027e-05, - "loss": 1.3076, - "step": 2990 - }, - { - "epoch": 0.939827630209867, - "grad_norm": 1.2890625, - "learning_rate": 1.64049508092669e-05, - "loss": 1.2634, - "step": 2992 - }, - { - "epoch": 0.9404558572354085, - "grad_norm": 0.77734375, - "learning_rate": 1.6402411932719773e-05, - "loss": 1.3545, - "step": 2994 - }, - { - "epoch": 0.9410840842609498, - "grad_norm": 0.73046875, - "learning_rate": 1.6399873056172645e-05, - "loss": 1.3319, - "step": 2996 - }, - { - "epoch": 0.9417123112864911, - "grad_norm": 0.75, - "learning_rate": 1.6397334179625516e-05, - "loss": 1.3534, - "step": 2998 - }, - { - "epoch": 0.9423405383120325, - "grad_norm": 0.68359375, - "learning_rate": 1.639479530307839e-05, - "loss": 1.2353, - "step": 3000 - }, - { - "epoch": 0.9429687653375739, - "grad_norm": 0.68359375, - "learning_rate": 1.6392256426531262e-05, - "loss": 1.2108, - "step": 3002 - }, - { - "epoch": 0.9435969923631152, - "grad_norm": 0.71875, - "learning_rate": 1.6389717549984134e-05, - "loss": 1.2961, - "step": 3004 - }, - { - "epoch": 0.9442252193886566, - "grad_norm": 0.734375, - "learning_rate": 1.6387178673437005e-05, - "loss": 1.2746, - "step": 3006 - }, - { - "epoch": 0.9448534464141979, - "grad_norm": 0.72265625, - "learning_rate": 1.638463979688988e-05, - "loss": 1.2231, - "step": 3008 - }, - { - "epoch": 0.9454816734397393, - "grad_norm": 0.85546875, - "learning_rate": 1.6382100920342748e-05, - "loss": 1.4304, - "step": 3010 - }, - { - "epoch": 0.9461099004652807, - "grad_norm": 0.87890625, - "learning_rate": 1.6379562043795623e-05, - "loss": 1.336, - "step": 3012 - }, - { - "epoch": 0.946738127490822, - "grad_norm": 0.78125, - "learning_rate": 1.6377023167248494e-05, - "loss": 1.2532, - "step": 3014 - }, - { - "epoch": 0.9473663545163633, - "grad_norm": 0.73046875, - "learning_rate": 1.6374484290701365e-05, - "loss": 1.3438, - "step": 3016 - }, - { - "epoch": 0.9479945815419047, - "grad_norm": 0.765625, - "learning_rate": 1.6371945414154237e-05, - "loss": 1.3412, - "step": 3018 - }, - { - "epoch": 0.9486228085674461, - "grad_norm": 0.7578125, - "learning_rate": 1.636940653760711e-05, - "loss": 1.2943, - "step": 3020 - }, - { - "epoch": 0.9492510355929874, - "grad_norm": 0.71484375, - "learning_rate": 1.6366867661059983e-05, - "loss": 1.3679, - "step": 3022 - }, - { - "epoch": 0.9498792626185287, - "grad_norm": 0.6953125, - "learning_rate": 1.6364328784512854e-05, - "loss": 1.2899, - "step": 3024 - }, - { - "epoch": 0.9505074896440702, - "grad_norm": 0.87109375, - "learning_rate": 1.6361789907965726e-05, - "loss": 1.4329, - "step": 3026 - }, - { - "epoch": 0.9511357166696115, - "grad_norm": 0.6875, - "learning_rate": 1.63592510314186e-05, - "loss": 1.2883, - "step": 3028 - }, - { - "epoch": 0.9517639436951528, - "grad_norm": 0.66796875, - "learning_rate": 1.6356712154871472e-05, - "loss": 1.2225, - "step": 3030 - }, - { - "epoch": 0.9523921707206942, - "grad_norm": 0.796875, - "learning_rate": 1.6354173278324343e-05, - "loss": 1.3128, - "step": 3032 - }, - { - "epoch": 0.9530203977462356, - "grad_norm": 0.8828125, - "learning_rate": 1.6351634401777215e-05, - "loss": 1.3125, - "step": 3034 - }, - { - "epoch": 0.9536486247717769, - "grad_norm": 0.7265625, - "learning_rate": 1.6349095525230086e-05, - "loss": 1.3294, - "step": 3036 - }, - { - "epoch": 0.9542768517973182, - "grad_norm": 0.79296875, - "learning_rate": 1.6346556648682957e-05, - "loss": 1.2799, - "step": 3038 - }, - { - "epoch": 0.9549050788228596, - "grad_norm": 0.703125, - "learning_rate": 1.6344017772135832e-05, - "loss": 1.3259, - "step": 3040 - }, - { - "epoch": 0.955533305848401, - "grad_norm": 0.66796875, - "learning_rate": 1.6341478895588704e-05, - "loss": 1.2925, - "step": 3042 - }, - { - "epoch": 0.9561615328739423, - "grad_norm": 0.81640625, - "learning_rate": 1.6338940019041575e-05, - "loss": 1.4347, - "step": 3044 - }, - { - "epoch": 0.9567897598994837, - "grad_norm": 0.71875, - "learning_rate": 1.6336401142494446e-05, - "loss": 1.1746, - "step": 3046 - }, - { - "epoch": 0.957417986925025, - "grad_norm": 0.8671875, - "learning_rate": 1.633386226594732e-05, - "loss": 1.3328, - "step": 3048 - }, - { - "epoch": 0.9580462139505664, - "grad_norm": 0.6953125, - "learning_rate": 1.6331323389400192e-05, - "loss": 1.3283, - "step": 3050 - }, - { - "epoch": 0.9586744409761078, - "grad_norm": 0.67578125, - "learning_rate": 1.6328784512853064e-05, - "loss": 1.4036, - "step": 3052 - }, - { - "epoch": 0.9593026680016491, - "grad_norm": 0.83984375, - "learning_rate": 1.6326245636305935e-05, - "loss": 1.2255, - "step": 3054 - }, - { - "epoch": 0.9599308950271904, - "grad_norm": 0.7265625, - "learning_rate": 1.632370675975881e-05, - "loss": 1.2382, - "step": 3056 - }, - { - "epoch": 0.9605591220527318, - "grad_norm": 0.72265625, - "learning_rate": 1.6321167883211678e-05, - "loss": 1.311, - "step": 3058 - }, - { - "epoch": 0.9611873490782732, - "grad_norm": 0.7109375, - "learning_rate": 1.6318629006664553e-05, - "loss": 1.2916, - "step": 3060 - }, - { - "epoch": 0.9618155761038145, - "grad_norm": 0.734375, - "learning_rate": 1.6316090130117424e-05, - "loss": 1.2996, - "step": 3062 - }, - { - "epoch": 0.9624438031293558, - "grad_norm": 0.6796875, - "learning_rate": 1.6313551253570295e-05, - "loss": 1.3253, - "step": 3064 - }, - { - "epoch": 0.9630720301548972, - "grad_norm": 0.66796875, - "learning_rate": 1.6311012377023167e-05, - "loss": 1.3582, - "step": 3066 - }, - { - "epoch": 0.9637002571804386, - "grad_norm": 0.671875, - "learning_rate": 1.630847350047604e-05, - "loss": 1.3637, - "step": 3068 - }, - { - "epoch": 0.9643284842059799, - "grad_norm": 0.80859375, - "learning_rate": 1.6305934623928913e-05, - "loss": 1.2882, - "step": 3070 - }, - { - "epoch": 0.9649567112315213, - "grad_norm": 1.1171875, - "learning_rate": 1.6303395747381784e-05, - "loss": 1.2281, - "step": 3072 - }, - { - "epoch": 0.9655849382570626, - "grad_norm": 0.80078125, - "learning_rate": 1.6300856870834656e-05, - "loss": 1.3915, - "step": 3074 - }, - { - "epoch": 0.966213165282604, - "grad_norm": 0.75, - "learning_rate": 1.629831799428753e-05, - "loss": 1.401, - "step": 3076 - }, - { - "epoch": 0.9668413923081454, - "grad_norm": 0.73828125, - "learning_rate": 1.62957791177404e-05, - "loss": 1.2698, - "step": 3078 - }, - { - "epoch": 0.9674696193336867, - "grad_norm": 0.75, - "learning_rate": 1.6293240241193273e-05, - "loss": 1.3833, - "step": 3080 - }, - { - "epoch": 0.968097846359228, - "grad_norm": 0.84375, - "learning_rate": 1.6290701364646148e-05, - "loss": 1.2167, - "step": 3082 - }, - { - "epoch": 0.9687260733847695, - "grad_norm": 0.6875, - "learning_rate": 1.6288162488099016e-05, - "loss": 1.3872, - "step": 3084 - }, - { - "epoch": 0.9693543004103108, - "grad_norm": 0.65625, - "learning_rate": 1.628562361155189e-05, - "loss": 1.1942, - "step": 3086 - }, - { - "epoch": 0.9699825274358521, - "grad_norm": 0.91015625, - "learning_rate": 1.6283084735004762e-05, - "loss": 1.3442, - "step": 3088 - }, - { - "epoch": 0.9706107544613934, - "grad_norm": 0.68359375, - "learning_rate": 1.6280545858457634e-05, - "loss": 1.1895, - "step": 3090 - }, - { - "epoch": 0.9712389814869349, - "grad_norm": 0.765625, - "learning_rate": 1.6278006981910505e-05, - "loss": 1.2817, - "step": 3092 - }, - { - "epoch": 0.9718672085124762, - "grad_norm": 0.85546875, - "learning_rate": 1.627546810536338e-05, - "loss": 1.3469, - "step": 3094 - }, - { - "epoch": 0.9724954355380175, - "grad_norm": 0.76171875, - "learning_rate": 1.627292922881625e-05, - "loss": 1.3717, - "step": 3096 - }, - { - "epoch": 0.9731236625635589, - "grad_norm": 0.6953125, - "learning_rate": 1.6270390352269123e-05, - "loss": 1.3015, - "step": 3098 - }, - { - "epoch": 0.9737518895891003, - "grad_norm": 0.703125, - "learning_rate": 1.6267851475721994e-05, - "loss": 1.4265, - "step": 3100 - }, - { - "epoch": 0.9743801166146416, - "grad_norm": 0.80859375, - "learning_rate": 1.626531259917487e-05, - "loss": 1.2424, - "step": 3102 - }, - { - "epoch": 0.975008343640183, - "grad_norm": 0.8125, - "learning_rate": 1.6262773722627737e-05, - "loss": 1.3252, - "step": 3104 - }, - { - "epoch": 0.9756365706657243, - "grad_norm": 0.87890625, - "learning_rate": 1.626023484608061e-05, - "loss": 1.3464, - "step": 3106 - }, - { - "epoch": 0.9762647976912657, - "grad_norm": 0.81640625, - "learning_rate": 1.6257695969533483e-05, - "loss": 1.3007, - "step": 3108 - }, - { - "epoch": 0.9768930247168071, - "grad_norm": 0.90234375, - "learning_rate": 1.6255157092986354e-05, - "loss": 1.2549, - "step": 3110 - }, - { - "epoch": 0.9775212517423484, - "grad_norm": 0.76171875, - "learning_rate": 1.6252618216439226e-05, - "loss": 1.3299, - "step": 3112 - }, - { - "epoch": 0.9781494787678897, - "grad_norm": 0.77734375, - "learning_rate": 1.62500793398921e-05, - "loss": 1.3118, - "step": 3114 - }, - { - "epoch": 0.9787777057934312, - "grad_norm": 0.765625, - "learning_rate": 1.6247540463344972e-05, - "loss": 1.3114, - "step": 3116 - }, - { - "epoch": 0.9794059328189725, - "grad_norm": 0.66796875, - "learning_rate": 1.6245001586797843e-05, - "loss": 1.2729, - "step": 3118 - }, - { - "epoch": 0.9800341598445138, - "grad_norm": 0.69921875, - "learning_rate": 1.6242462710250715e-05, - "loss": 1.3769, - "step": 3120 - }, - { - "epoch": 0.9806623868700551, - "grad_norm": 0.65234375, - "learning_rate": 1.623992383370359e-05, - "loss": 1.2581, - "step": 3122 - }, - { - "epoch": 0.9812906138955966, - "grad_norm": 0.73046875, - "learning_rate": 1.623738495715646e-05, - "loss": 1.3578, - "step": 3124 - }, - { - "epoch": 0.9819188409211379, - "grad_norm": 0.67578125, - "learning_rate": 1.6234846080609332e-05, - "loss": 1.3055, - "step": 3126 - }, - { - "epoch": 0.9825470679466792, - "grad_norm": 1.3125, - "learning_rate": 1.6232307204062203e-05, - "loss": 1.2103, - "step": 3128 - }, - { - "epoch": 0.9831752949722206, - "grad_norm": 0.79296875, - "learning_rate": 1.6229768327515075e-05, - "loss": 1.3818, - "step": 3130 - }, - { - "epoch": 0.9838035219977619, - "grad_norm": 0.703125, - "learning_rate": 1.6227229450967946e-05, - "loss": 1.3531, - "step": 3132 - }, - { - "epoch": 0.9844317490233033, - "grad_norm": 0.73046875, - "learning_rate": 1.622469057442082e-05, - "loss": 1.2813, - "step": 3134 - }, - { - "epoch": 0.9850599760488447, - "grad_norm": 0.7578125, - "learning_rate": 1.6222151697873692e-05, - "loss": 1.3331, - "step": 3136 - }, - { - "epoch": 0.985688203074386, - "grad_norm": 0.72265625, - "learning_rate": 1.6219612821326564e-05, - "loss": 1.3681, - "step": 3138 - }, - { - "epoch": 0.9863164300999273, - "grad_norm": 0.7421875, - "learning_rate": 1.6217073944779435e-05, - "loss": 1.3545, - "step": 3140 - }, - { - "epoch": 0.9869446571254687, - "grad_norm": 0.703125, - "learning_rate": 1.621453506823231e-05, - "loss": 1.392, - "step": 3142 - }, - { - "epoch": 0.9875728841510101, - "grad_norm": 0.79296875, - "learning_rate": 1.621199619168518e-05, - "loss": 1.3232, - "step": 3144 - }, - { - "epoch": 0.9882011111765514, - "grad_norm": 0.7578125, - "learning_rate": 1.6209457315138053e-05, - "loss": 1.2144, - "step": 3146 - }, - { - "epoch": 0.9888293382020927, - "grad_norm": 0.6796875, - "learning_rate": 1.6206918438590924e-05, - "loss": 1.3129, - "step": 3148 - }, - { - "epoch": 0.9894575652276342, - "grad_norm": 0.74609375, - "learning_rate": 1.62043795620438e-05, - "loss": 1.3398, - "step": 3150 - }, - { - "epoch": 0.9900857922531755, - "grad_norm": 0.79296875, - "learning_rate": 1.6201840685496667e-05, - "loss": 1.3094, - "step": 3152 - }, - { - "epoch": 0.9907140192787168, - "grad_norm": 0.66796875, - "learning_rate": 1.619930180894954e-05, - "loss": 1.436, - "step": 3154 - }, - { - "epoch": 0.9913422463042582, - "grad_norm": 0.828125, - "learning_rate": 1.6196762932402413e-05, - "loss": 1.4225, - "step": 3156 - }, - { - "epoch": 0.9919704733297996, - "grad_norm": 0.76953125, - "learning_rate": 1.6194224055855284e-05, - "loss": 1.2521, - "step": 3158 - }, - { - "epoch": 0.9925987003553409, - "grad_norm": 0.7265625, - "learning_rate": 1.6191685179308156e-05, - "loss": 1.2926, - "step": 3160 - }, - { - "epoch": 0.9932269273808823, - "grad_norm": 0.66796875, - "learning_rate": 1.618914630276103e-05, - "loss": 1.222, - "step": 3162 - }, - { - "epoch": 0.9938551544064236, - "grad_norm": 0.87890625, - "learning_rate": 1.6186607426213902e-05, - "loss": 1.3083, - "step": 3164 - }, - { - "epoch": 0.994483381431965, - "grad_norm": 0.66015625, - "learning_rate": 1.6184068549666773e-05, - "loss": 1.3349, - "step": 3166 - }, - { - "epoch": 0.9951116084575063, - "grad_norm": 0.66015625, - "learning_rate": 1.6181529673119648e-05, - "loss": 1.306, - "step": 3168 - }, - { - "epoch": 0.9957398354830477, - "grad_norm": 0.796875, - "learning_rate": 1.617899079657252e-05, - "loss": 1.2053, - "step": 3170 - }, - { - "epoch": 0.996368062508589, - "grad_norm": 0.91015625, - "learning_rate": 1.617645192002539e-05, - "loss": 1.2241, - "step": 3172 - }, - { - "epoch": 0.9969962895341304, - "grad_norm": 0.78125, - "learning_rate": 1.6173913043478262e-05, - "loss": 1.1861, - "step": 3174 - }, - { - "epoch": 0.9976245165596718, - "grad_norm": 0.6953125, - "learning_rate": 1.6171374166931137e-05, - "loss": 1.3181, - "step": 3176 - }, - { - "epoch": 0.9982527435852131, - "grad_norm": 0.7265625, - "learning_rate": 1.6168835290384005e-05, - "loss": 1.2654, - "step": 3178 - }, - { - "epoch": 0.9988809706107544, - "grad_norm": 0.77734375, - "learning_rate": 1.616629641383688e-05, - "loss": 1.435, - "step": 3180 - }, - { - "epoch": 0.9995091976362959, - "grad_norm": 0.75, - "learning_rate": 1.616375753728975e-05, - "loss": 1.4596, - "step": 3182 - }, - { - "epoch": 1.000137424661837, - "grad_norm": 0.68359375, - "learning_rate": 1.6161218660742622e-05, - "loss": 1.3658, - "step": 3184 - }, - { - "epoch": 1.0007656516873786, - "grad_norm": 0.67578125, - "learning_rate": 1.6158679784195494e-05, - "loss": 1.278, - "step": 3186 - }, - { - "epoch": 1.00139387871292, - "grad_norm": 0.6640625, - "learning_rate": 1.615614090764837e-05, - "loss": 1.3165, - "step": 3188 - }, - { - "epoch": 1.0020221057384613, - "grad_norm": 0.68359375, - "learning_rate": 1.615360203110124e-05, - "loss": 1.292, - "step": 3190 - }, - { - "epoch": 1.0026503327640026, - "grad_norm": 0.7109375, - "learning_rate": 1.615106315455411e-05, - "loss": 1.2257, - "step": 3192 - }, - { - "epoch": 1.003278559789544, - "grad_norm": 0.78125, - "learning_rate": 1.6148524278006983e-05, - "loss": 1.256, - "step": 3194 - }, - { - "epoch": 1.0039067868150853, - "grad_norm": 0.83984375, - "learning_rate": 1.6145985401459858e-05, - "loss": 1.2126, - "step": 3196 - }, - { - "epoch": 1.0045350138406266, - "grad_norm": 0.75390625, - "learning_rate": 1.6143446524912726e-05, - "loss": 1.2712, - "step": 3198 - }, - { - "epoch": 1.005163240866168, - "grad_norm": 0.76953125, - "learning_rate": 1.61409076483656e-05, - "loss": 1.1845, - "step": 3200 - }, - { - "epoch": 1.0057914678917093, - "grad_norm": 0.796875, - "learning_rate": 1.613836877181847e-05, - "loss": 1.2542, - "step": 3202 - }, - { - "epoch": 1.0064196949172508, - "grad_norm": 0.8125, - "learning_rate": 1.6135829895271343e-05, - "loss": 1.2691, - "step": 3204 - }, - { - "epoch": 1.0070479219427921, - "grad_norm": 0.78125, - "learning_rate": 1.6133291018724214e-05, - "loss": 1.1913, - "step": 3206 - }, - { - "epoch": 1.0076761489683335, - "grad_norm": 0.78125, - "learning_rate": 1.613075214217709e-05, - "loss": 1.1205, - "step": 3208 - }, - { - "epoch": 1.0083043759938748, - "grad_norm": 0.7578125, - "learning_rate": 1.612821326562996e-05, - "loss": 1.2168, - "step": 3210 - }, - { - "epoch": 1.0089326030194161, - "grad_norm": 0.73046875, - "learning_rate": 1.6125674389082832e-05, - "loss": 1.0832, - "step": 3212 - }, - { - "epoch": 1.0095608300449574, - "grad_norm": 0.76953125, - "learning_rate": 1.6123135512535703e-05, - "loss": 1.2185, - "step": 3214 - }, - { - "epoch": 1.0101890570704988, - "grad_norm": 0.78515625, - "learning_rate": 1.6120596635988578e-05, - "loss": 1.3084, - "step": 3216 - }, - { - "epoch": 1.01081728409604, - "grad_norm": 0.82421875, - "learning_rate": 1.6118057759441446e-05, - "loss": 1.1159, - "step": 3218 - }, - { - "epoch": 1.0114455111215817, - "grad_norm": 0.7421875, - "learning_rate": 1.611551888289432e-05, - "loss": 1.165, - "step": 3220 - }, - { - "epoch": 1.012073738147123, - "grad_norm": 0.73046875, - "learning_rate": 1.6112980006347192e-05, - "loss": 1.1485, - "step": 3222 - }, - { - "epoch": 1.0127019651726643, - "grad_norm": 0.90234375, - "learning_rate": 1.6110441129800064e-05, - "loss": 1.23, - "step": 3224 - }, - { - "epoch": 1.0133301921982056, - "grad_norm": 0.6875, - "learning_rate": 1.6107902253252935e-05, - "loss": 1.1168, - "step": 3226 - }, - { - "epoch": 1.013958419223747, - "grad_norm": 0.8359375, - "learning_rate": 1.610536337670581e-05, - "loss": 1.0645, - "step": 3228 - }, - { - "epoch": 1.0145866462492883, - "grad_norm": 0.7890625, - "learning_rate": 1.610282450015868e-05, - "loss": 1.2309, - "step": 3230 - }, - { - "epoch": 1.0152148732748296, - "grad_norm": 0.83203125, - "learning_rate": 1.6100285623611553e-05, - "loss": 1.2314, - "step": 3232 - }, - { - "epoch": 1.015843100300371, - "grad_norm": 0.7734375, - "learning_rate": 1.6097746747064424e-05, - "loss": 1.3229, - "step": 3234 - }, - { - "epoch": 1.0164713273259125, - "grad_norm": 0.73828125, - "learning_rate": 1.60952078705173e-05, - "loss": 1.1837, - "step": 3236 - }, - { - "epoch": 1.0170995543514538, - "grad_norm": 0.8203125, - "learning_rate": 1.609266899397017e-05, - "loss": 1.2404, - "step": 3238 - }, - { - "epoch": 1.0177277813769952, - "grad_norm": 0.765625, - "learning_rate": 1.609013011742304e-05, - "loss": 1.344, - "step": 3240 - }, - { - "epoch": 1.0183560084025365, - "grad_norm": 0.74609375, - "learning_rate": 1.6087591240875913e-05, - "loss": 1.3881, - "step": 3242 - }, - { - "epoch": 1.0189842354280778, - "grad_norm": 0.7890625, - "learning_rate": 1.6085052364328784e-05, - "loss": 1.3539, - "step": 3244 - }, - { - "epoch": 1.0196124624536191, - "grad_norm": 0.72265625, - "learning_rate": 1.6082513487781656e-05, - "loss": 1.2862, - "step": 3246 - }, - { - "epoch": 1.0202406894791605, - "grad_norm": 0.73046875, - "learning_rate": 1.607997461123453e-05, - "loss": 1.3068, - "step": 3248 - }, - { - "epoch": 1.0208689165047018, - "grad_norm": 0.6875, - "learning_rate": 1.6077435734687402e-05, - "loss": 1.2403, - "step": 3250 - }, - { - "epoch": 1.0214971435302433, - "grad_norm": 0.7578125, - "learning_rate": 1.6074896858140273e-05, - "loss": 1.2603, - "step": 3252 - }, - { - "epoch": 1.0221253705557847, - "grad_norm": 0.83203125, - "learning_rate": 1.6072357981593148e-05, - "loss": 1.1728, - "step": 3254 - }, - { - "epoch": 1.022753597581326, - "grad_norm": 0.7734375, - "learning_rate": 1.606981910504602e-05, - "loss": 1.1989, - "step": 3256 - }, - { - "epoch": 1.0233818246068673, - "grad_norm": 0.84765625, - "learning_rate": 1.606728022849889e-05, - "loss": 1.2041, - "step": 3258 - }, - { - "epoch": 1.0240100516324087, - "grad_norm": 0.84375, - "learning_rate": 1.6064741351951762e-05, - "loss": 1.2197, - "step": 3260 - }, - { - "epoch": 1.02463827865795, - "grad_norm": 0.81640625, - "learning_rate": 1.6062202475404637e-05, - "loss": 1.1039, - "step": 3262 - }, - { - "epoch": 1.0252665056834913, - "grad_norm": 0.74609375, - "learning_rate": 1.6059663598857508e-05, - "loss": 1.2697, - "step": 3264 - }, - { - "epoch": 1.0258947327090326, - "grad_norm": 0.8515625, - "learning_rate": 1.605712472231038e-05, - "loss": 1.152, - "step": 3266 - }, - { - "epoch": 1.026522959734574, - "grad_norm": 0.83203125, - "learning_rate": 1.605458584576325e-05, - "loss": 1.2728, - "step": 3268 - }, - { - "epoch": 1.0271511867601155, - "grad_norm": 0.77734375, - "learning_rate": 1.6052046969216122e-05, - "loss": 1.1901, - "step": 3270 - }, - { - "epoch": 1.0277794137856568, - "grad_norm": 0.765625, - "learning_rate": 1.6049508092668994e-05, - "loss": 1.2918, - "step": 3272 - }, - { - "epoch": 1.0284076408111982, - "grad_norm": 0.7890625, - "learning_rate": 1.604696921612187e-05, - "loss": 1.2328, - "step": 3274 - }, - { - "epoch": 1.0290358678367395, - "grad_norm": 0.7578125, - "learning_rate": 1.604443033957474e-05, - "loss": 1.2688, - "step": 3276 - }, - { - "epoch": 1.0296640948622808, - "grad_norm": 0.78125, - "learning_rate": 1.604189146302761e-05, - "loss": 1.1562, - "step": 3278 - }, - { - "epoch": 1.0302923218878222, - "grad_norm": 0.83203125, - "learning_rate": 1.6039352586480483e-05, - "loss": 1.2546, - "step": 3280 - }, - { - "epoch": 1.0309205489133635, - "grad_norm": 0.75, - "learning_rate": 1.6036813709933357e-05, - "loss": 1.1542, - "step": 3282 - }, - { - "epoch": 1.0315487759389048, - "grad_norm": 0.87109375, - "learning_rate": 1.603427483338623e-05, - "loss": 1.2115, - "step": 3284 - }, - { - "epoch": 1.0321770029644464, - "grad_norm": 0.8671875, - "learning_rate": 1.60317359568391e-05, - "loss": 1.3227, - "step": 3286 - }, - { - "epoch": 1.0328052299899877, - "grad_norm": 0.77734375, - "learning_rate": 1.602919708029197e-05, - "loss": 1.2416, - "step": 3288 - }, - { - "epoch": 1.033433457015529, - "grad_norm": 0.8125, - "learning_rate": 1.6026658203744846e-05, - "loss": 1.2146, - "step": 3290 - }, - { - "epoch": 1.0340616840410704, - "grad_norm": 0.75390625, - "learning_rate": 1.6024119327197714e-05, - "loss": 1.2237, - "step": 3292 - }, - { - "epoch": 1.0346899110666117, - "grad_norm": 0.734375, - "learning_rate": 1.602158045065059e-05, - "loss": 1.1345, - "step": 3294 - }, - { - "epoch": 1.035318138092153, - "grad_norm": 0.76171875, - "learning_rate": 1.601904157410346e-05, - "loss": 1.1062, - "step": 3296 - }, - { - "epoch": 1.0359463651176943, - "grad_norm": 0.74609375, - "learning_rate": 1.6016502697556332e-05, - "loss": 1.2563, - "step": 3298 - }, - { - "epoch": 1.0365745921432357, - "grad_norm": 0.7890625, - "learning_rate": 1.6013963821009203e-05, - "loss": 1.1443, - "step": 3300 - }, - { - "epoch": 1.0372028191687772, - "grad_norm": 0.8125, - "learning_rate": 1.6011424944462078e-05, - "loss": 1.3163, - "step": 3302 - }, - { - "epoch": 1.0378310461943185, - "grad_norm": 0.80078125, - "learning_rate": 1.600888606791495e-05, - "loss": 1.2362, - "step": 3304 - }, - { - "epoch": 1.0384592732198599, - "grad_norm": 0.828125, - "learning_rate": 1.600634719136782e-05, - "loss": 1.2361, - "step": 3306 - }, - { - "epoch": 1.0390875002454012, - "grad_norm": 0.859375, - "learning_rate": 1.6003808314820692e-05, - "loss": 1.2125, - "step": 3308 - }, - { - "epoch": 1.0397157272709425, - "grad_norm": 0.9140625, - "learning_rate": 1.6001269438273567e-05, - "loss": 1.1605, - "step": 3310 - }, - { - "epoch": 1.0403439542964839, - "grad_norm": 0.88671875, - "learning_rate": 1.5998730561726435e-05, - "loss": 1.1425, - "step": 3312 - }, - { - "epoch": 1.0409721813220252, - "grad_norm": 0.72265625, - "learning_rate": 1.599619168517931e-05, - "loss": 1.1267, - "step": 3314 - }, - { - "epoch": 1.0416004083475665, - "grad_norm": 0.79296875, - "learning_rate": 1.599365280863218e-05, - "loss": 1.3268, - "step": 3316 - }, - { - "epoch": 1.042228635373108, - "grad_norm": 0.85546875, - "learning_rate": 1.5991113932085053e-05, - "loss": 1.2864, - "step": 3318 - }, - { - "epoch": 1.0428568623986494, - "grad_norm": 0.796875, - "learning_rate": 1.5988575055537924e-05, - "loss": 1.3753, - "step": 3320 - }, - { - "epoch": 1.0434850894241907, - "grad_norm": 0.734375, - "learning_rate": 1.59860361789908e-05, - "loss": 1.279, - "step": 3322 - }, - { - "epoch": 1.044113316449732, - "grad_norm": 0.77734375, - "learning_rate": 1.598349730244367e-05, - "loss": 1.2186, - "step": 3324 - }, - { - "epoch": 1.0447415434752734, - "grad_norm": 0.765625, - "learning_rate": 1.598095842589654e-05, - "loss": 1.2752, - "step": 3326 - }, - { - "epoch": 1.0453697705008147, - "grad_norm": 0.88671875, - "learning_rate": 1.5978419549349413e-05, - "loss": 1.0901, - "step": 3328 - }, - { - "epoch": 1.045997997526356, - "grad_norm": 0.71875, - "learning_rate": 1.5975880672802288e-05, - "loss": 1.2462, - "step": 3330 - }, - { - "epoch": 1.0466262245518974, - "grad_norm": 0.81640625, - "learning_rate": 1.597334179625516e-05, - "loss": 1.2196, - "step": 3332 - }, - { - "epoch": 1.0472544515774387, - "grad_norm": 0.77734375, - "learning_rate": 1.597080291970803e-05, - "loss": 1.181, - "step": 3334 - }, - { - "epoch": 1.0478826786029802, - "grad_norm": 0.828125, - "learning_rate": 1.5968264043160902e-05, - "loss": 1.2889, - "step": 3336 - }, - { - "epoch": 1.0485109056285216, - "grad_norm": 0.79296875, - "learning_rate": 1.5965725166613773e-05, - "loss": 1.2918, - "step": 3338 - }, - { - "epoch": 1.049139132654063, - "grad_norm": 0.8515625, - "learning_rate": 1.5963186290066648e-05, - "loss": 1.2074, - "step": 3340 - }, - { - "epoch": 1.0497673596796042, - "grad_norm": 0.72265625, - "learning_rate": 1.596064741351952e-05, - "loss": 1.0973, - "step": 3342 - }, - { - "epoch": 1.0503955867051455, - "grad_norm": 0.85546875, - "learning_rate": 1.595810853697239e-05, - "loss": 1.2965, - "step": 3344 - }, - { - "epoch": 1.0510238137306869, - "grad_norm": 0.90625, - "learning_rate": 1.5955569660425262e-05, - "loss": 1.1753, - "step": 3346 - }, - { - "epoch": 1.0516520407562282, - "grad_norm": 0.89453125, - "learning_rate": 1.5953030783878137e-05, - "loss": 1.1732, - "step": 3348 - }, - { - "epoch": 1.0522802677817698, - "grad_norm": 0.8125, - "learning_rate": 1.5950491907331008e-05, - "loss": 1.2432, - "step": 3350 - }, - { - "epoch": 1.052908494807311, - "grad_norm": 0.74609375, - "learning_rate": 1.594795303078388e-05, - "loss": 1.3405, - "step": 3352 - }, - { - "epoch": 1.0535367218328524, - "grad_norm": 0.73828125, - "learning_rate": 1.594541415423675e-05, - "loss": 1.2937, - "step": 3354 - }, - { - "epoch": 1.0541649488583937, - "grad_norm": 0.76953125, - "learning_rate": 1.5942875277689626e-05, - "loss": 1.2403, - "step": 3356 - }, - { - "epoch": 1.054793175883935, - "grad_norm": 0.71875, - "learning_rate": 1.5940336401142497e-05, - "loss": 1.1935, - "step": 3358 - }, - { - "epoch": 1.0554214029094764, - "grad_norm": 0.80859375, - "learning_rate": 1.593779752459537e-05, - "loss": 1.1554, - "step": 3360 - }, - { - "epoch": 1.0560496299350177, - "grad_norm": 0.79296875, - "learning_rate": 1.593525864804824e-05, - "loss": 1.2863, - "step": 3362 - }, - { - "epoch": 1.056677856960559, - "grad_norm": 0.76953125, - "learning_rate": 1.593271977150111e-05, - "loss": 1.3232, - "step": 3364 - }, - { - "epoch": 1.0573060839861004, - "grad_norm": 0.87109375, - "learning_rate": 1.5930180894953983e-05, - "loss": 1.165, - "step": 3366 - }, - { - "epoch": 1.057934311011642, - "grad_norm": 0.859375, - "learning_rate": 1.5927642018406857e-05, - "loss": 1.2739, - "step": 3368 - }, - { - "epoch": 1.0585625380371833, - "grad_norm": 0.72265625, - "learning_rate": 1.592510314185973e-05, - "loss": 1.1609, - "step": 3370 - }, - { - "epoch": 1.0591907650627246, - "grad_norm": 0.7421875, - "learning_rate": 1.59225642653126e-05, - "loss": 1.3293, - "step": 3372 - }, - { - "epoch": 1.059818992088266, - "grad_norm": 0.76171875, - "learning_rate": 1.592002538876547e-05, - "loss": 1.2502, - "step": 3374 - }, - { - "epoch": 1.0604472191138072, - "grad_norm": 0.7578125, - "learning_rate": 1.5917486512218346e-05, - "loss": 1.1905, - "step": 3376 - }, - { - "epoch": 1.0610754461393486, - "grad_norm": 0.79296875, - "learning_rate": 1.5914947635671218e-05, - "loss": 1.3572, - "step": 3378 - }, - { - "epoch": 1.06170367316489, - "grad_norm": 0.96875, - "learning_rate": 1.591240875912409e-05, - "loss": 1.2005, - "step": 3380 - }, - { - "epoch": 1.0623319001904312, - "grad_norm": 0.74609375, - "learning_rate": 1.590986988257696e-05, - "loss": 1.2663, - "step": 3382 - }, - { - "epoch": 1.0629601272159728, - "grad_norm": 0.80078125, - "learning_rate": 1.5907331006029835e-05, - "loss": 1.2143, - "step": 3384 - }, - { - "epoch": 1.063588354241514, - "grad_norm": 0.73828125, - "learning_rate": 1.5904792129482703e-05, - "loss": 1.2849, - "step": 3386 - }, - { - "epoch": 1.0642165812670554, - "grad_norm": 0.765625, - "learning_rate": 1.5902253252935578e-05, - "loss": 1.2787, - "step": 3388 - }, - { - "epoch": 1.0648448082925968, - "grad_norm": 0.8046875, - "learning_rate": 1.589971437638845e-05, - "loss": 1.1279, - "step": 3390 - }, - { - "epoch": 1.065473035318138, - "grad_norm": 0.7890625, - "learning_rate": 1.589717549984132e-05, - "loss": 1.2399, - "step": 3392 - }, - { - "epoch": 1.0661012623436794, - "grad_norm": 0.8125, - "learning_rate": 1.5894636623294192e-05, - "loss": 1.2965, - "step": 3394 - }, - { - "epoch": 1.0667294893692207, - "grad_norm": 0.7265625, - "learning_rate": 1.5892097746747067e-05, - "loss": 1.1753, - "step": 3396 - }, - { - "epoch": 1.067357716394762, - "grad_norm": 0.80859375, - "learning_rate": 1.5889558870199938e-05, - "loss": 1.229, - "step": 3398 - }, - { - "epoch": 1.0679859434203034, - "grad_norm": 0.73046875, - "learning_rate": 1.588701999365281e-05, - "loss": 1.2592, - "step": 3400 - }, - { - "epoch": 1.068614170445845, - "grad_norm": 0.78125, - "learning_rate": 1.588448111710568e-05, - "loss": 1.2941, - "step": 3402 - }, - { - "epoch": 1.0692423974713863, - "grad_norm": 0.75390625, - "learning_rate": 1.5881942240558556e-05, - "loss": 1.2962, - "step": 3404 - }, - { - "epoch": 1.0698706244969276, - "grad_norm": 0.796875, - "learning_rate": 1.5879403364011424e-05, - "loss": 1.2558, - "step": 3406 - }, - { - "epoch": 1.070498851522469, - "grad_norm": 0.7578125, - "learning_rate": 1.58768644874643e-05, - "loss": 1.3021, - "step": 3408 - }, - { - "epoch": 1.0711270785480103, - "grad_norm": 0.80859375, - "learning_rate": 1.587432561091717e-05, - "loss": 1.2742, - "step": 3410 - }, - { - "epoch": 1.0717553055735516, - "grad_norm": 0.796875, - "learning_rate": 1.587178673437004e-05, - "loss": 1.2945, - "step": 3412 - }, - { - "epoch": 1.072383532599093, - "grad_norm": 0.83984375, - "learning_rate": 1.5869247857822913e-05, - "loss": 1.323, - "step": 3414 - }, - { - "epoch": 1.0730117596246345, - "grad_norm": 0.76953125, - "learning_rate": 1.5866708981275787e-05, - "loss": 1.2498, - "step": 3416 - }, - { - "epoch": 1.0736399866501758, - "grad_norm": 0.765625, - "learning_rate": 1.586417010472866e-05, - "loss": 1.0935, - "step": 3418 - }, - { - "epoch": 1.0742682136757171, - "grad_norm": 0.77734375, - "learning_rate": 1.586163122818153e-05, - "loss": 1.1667, - "step": 3420 - }, - { - "epoch": 1.0748964407012584, - "grad_norm": 0.75390625, - "learning_rate": 1.58590923516344e-05, - "loss": 1.2579, - "step": 3422 - }, - { - "epoch": 1.0755246677267998, - "grad_norm": 0.79296875, - "learning_rate": 1.5856553475087276e-05, - "loss": 1.1673, - "step": 3424 - }, - { - "epoch": 1.076152894752341, - "grad_norm": 0.73046875, - "learning_rate": 1.5854014598540148e-05, - "loss": 1.411, - "step": 3426 - }, - { - "epoch": 1.0767811217778824, - "grad_norm": 0.82421875, - "learning_rate": 1.585147572199302e-05, - "loss": 1.1398, - "step": 3428 - }, - { - "epoch": 1.0774093488034238, - "grad_norm": 0.76953125, - "learning_rate": 1.5848936845445894e-05, - "loss": 1.2636, - "step": 3430 - }, - { - "epoch": 1.078037575828965, - "grad_norm": 0.71484375, - "learning_rate": 1.5846397968898762e-05, - "loss": 1.2364, - "step": 3432 - }, - { - "epoch": 1.0786658028545066, - "grad_norm": 0.78515625, - "learning_rate": 1.5843859092351637e-05, - "loss": 1.4491, - "step": 3434 - }, - { - "epoch": 1.079294029880048, - "grad_norm": 0.828125, - "learning_rate": 1.5841320215804508e-05, - "loss": 1.1533, - "step": 3436 - }, - { - "epoch": 1.0799222569055893, - "grad_norm": 0.95703125, - "learning_rate": 1.583878133925738e-05, - "loss": 1.229, - "step": 3438 - }, - { - "epoch": 1.0805504839311306, - "grad_norm": 0.76171875, - "learning_rate": 1.583624246271025e-05, - "loss": 1.4361, - "step": 3440 - }, - { - "epoch": 1.081178710956672, - "grad_norm": 0.7890625, - "learning_rate": 1.5833703586163126e-05, - "loss": 1.3107, - "step": 3442 - }, - { - "epoch": 1.0818069379822133, - "grad_norm": 0.84765625, - "learning_rate": 1.5831164709615997e-05, - "loss": 1.1652, - "step": 3444 - }, - { - "epoch": 1.0824351650077546, - "grad_norm": 0.8359375, - "learning_rate": 1.582862583306887e-05, - "loss": 1.2367, - "step": 3446 - }, - { - "epoch": 1.083063392033296, - "grad_norm": 0.76953125, - "learning_rate": 1.582608695652174e-05, - "loss": 1.178, - "step": 3448 - }, - { - "epoch": 1.0836916190588375, - "grad_norm": 0.82421875, - "learning_rate": 1.5823548079974615e-05, - "loss": 1.2211, - "step": 3450 - }, - { - "epoch": 1.0843198460843788, - "grad_norm": 0.9140625, - "learning_rate": 1.5821009203427486e-05, - "loss": 1.2025, - "step": 3452 - }, - { - "epoch": 1.0849480731099201, - "grad_norm": 0.83203125, - "learning_rate": 1.5818470326880357e-05, - "loss": 1.2151, - "step": 3454 - }, - { - "epoch": 1.0855763001354615, - "grad_norm": 0.79296875, - "learning_rate": 1.581593145033323e-05, - "loss": 1.2384, - "step": 3456 - }, - { - "epoch": 1.0862045271610028, - "grad_norm": 0.85546875, - "learning_rate": 1.58133925737861e-05, - "loss": 1.2462, - "step": 3458 - }, - { - "epoch": 1.0868327541865441, - "grad_norm": 0.78515625, - "learning_rate": 1.581085369723897e-05, - "loss": 1.1743, - "step": 3460 - }, - { - "epoch": 1.0874609812120855, - "grad_norm": 0.7890625, - "learning_rate": 1.5808314820691846e-05, - "loss": 1.2484, - "step": 3462 - }, - { - "epoch": 1.0880892082376268, - "grad_norm": 0.8203125, - "learning_rate": 1.5805775944144718e-05, - "loss": 1.1292, - "step": 3464 - }, - { - "epoch": 1.088717435263168, - "grad_norm": 0.7890625, - "learning_rate": 1.580323706759759e-05, - "loss": 1.3436, - "step": 3466 - }, - { - "epoch": 1.0893456622887097, - "grad_norm": 0.79296875, - "learning_rate": 1.580069819105046e-05, - "loss": 1.106, - "step": 3468 - }, - { - "epoch": 1.089973889314251, - "grad_norm": 0.7109375, - "learning_rate": 1.5798159314503335e-05, - "loss": 1.1988, - "step": 3470 - }, - { - "epoch": 1.0906021163397923, - "grad_norm": 0.84765625, - "learning_rate": 1.5795620437956207e-05, - "loss": 1.4247, - "step": 3472 - }, - { - "epoch": 1.0912303433653336, - "grad_norm": 0.79296875, - "learning_rate": 1.5793081561409078e-05, - "loss": 1.2297, - "step": 3474 - }, - { - "epoch": 1.091858570390875, - "grad_norm": 0.78125, - "learning_rate": 1.579054268486195e-05, - "loss": 1.1387, - "step": 3476 - }, - { - "epoch": 1.0924867974164163, - "grad_norm": 0.73046875, - "learning_rate": 1.5788003808314824e-05, - "loss": 1.3101, - "step": 3478 - }, - { - "epoch": 1.0931150244419576, - "grad_norm": 0.98046875, - "learning_rate": 1.5785464931767692e-05, - "loss": 1.2612, - "step": 3480 - }, - { - "epoch": 1.0937432514674992, - "grad_norm": 0.79296875, - "learning_rate": 1.5782926055220567e-05, - "loss": 1.1812, - "step": 3482 - }, - { - "epoch": 1.0943714784930405, - "grad_norm": 0.7265625, - "learning_rate": 1.5780387178673438e-05, - "loss": 1.2531, - "step": 3484 - }, - { - "epoch": 1.0949997055185818, - "grad_norm": 0.91796875, - "learning_rate": 1.577784830212631e-05, - "loss": 1.1636, - "step": 3486 - }, - { - "epoch": 1.0956279325441232, - "grad_norm": 0.75, - "learning_rate": 1.577530942557918e-05, - "loss": 1.2318, - "step": 3488 - }, - { - "epoch": 1.0962561595696645, - "grad_norm": 0.859375, - "learning_rate": 1.5772770549032056e-05, - "loss": 1.337, - "step": 3490 - }, - { - "epoch": 1.0968843865952058, - "grad_norm": 3.765625, - "learning_rate": 1.5770231672484927e-05, - "loss": 1.2689, - "step": 3492 - }, - { - "epoch": 1.0975126136207471, - "grad_norm": 0.80078125, - "learning_rate": 1.57676927959378e-05, - "loss": 1.2185, - "step": 3494 - }, - { - "epoch": 1.0981408406462885, - "grad_norm": 0.8359375, - "learning_rate": 1.576515391939067e-05, - "loss": 1.3053, - "step": 3496 - }, - { - "epoch": 1.0987690676718298, - "grad_norm": 0.76171875, - "learning_rate": 1.5762615042843545e-05, - "loss": 1.1346, - "step": 3498 - }, - { - "epoch": 1.0993972946973714, - "grad_norm": 0.7109375, - "learning_rate": 1.5760076166296413e-05, - "loss": 1.2331, - "step": 3500 - }, - { - "epoch": 1.1000255217229127, - "grad_norm": 0.74609375, - "learning_rate": 1.5757537289749287e-05, - "loss": 1.2669, - "step": 3502 - }, - { - "epoch": 1.100653748748454, - "grad_norm": 0.78515625, - "learning_rate": 1.575499841320216e-05, - "loss": 1.2222, - "step": 3504 - }, - { - "epoch": 1.1012819757739953, - "grad_norm": 0.8046875, - "learning_rate": 1.575245953665503e-05, - "loss": 1.1517, - "step": 3506 - }, - { - "epoch": 1.1019102027995367, - "grad_norm": 0.79296875, - "learning_rate": 1.57499206601079e-05, - "loss": 1.2206, - "step": 3508 - }, - { - "epoch": 1.102538429825078, - "grad_norm": 0.7890625, - "learning_rate": 1.5747381783560776e-05, - "loss": 1.2261, - "step": 3510 - }, - { - "epoch": 1.1031666568506193, - "grad_norm": 2.125, - "learning_rate": 1.5744842907013648e-05, - "loss": 1.0953, - "step": 3512 - }, - { - "epoch": 1.1037948838761606, - "grad_norm": 0.83984375, - "learning_rate": 1.574230403046652e-05, - "loss": 1.3215, - "step": 3514 - }, - { - "epoch": 1.1044231109017022, - "grad_norm": 0.79296875, - "learning_rate": 1.5739765153919394e-05, - "loss": 1.3369, - "step": 3516 - }, - { - "epoch": 1.1050513379272435, - "grad_norm": 0.84375, - "learning_rate": 1.5737226277372265e-05, - "loss": 1.2915, - "step": 3518 - }, - { - "epoch": 1.1056795649527849, - "grad_norm": 0.85546875, - "learning_rate": 1.5734687400825137e-05, - "loss": 1.3005, - "step": 3520 - }, - { - "epoch": 1.1063077919783262, - "grad_norm": 0.83984375, - "learning_rate": 1.5732148524278008e-05, - "loss": 1.2166, - "step": 3522 - }, - { - "epoch": 1.1069360190038675, - "grad_norm": 0.86328125, - "learning_rate": 1.5729609647730883e-05, - "loss": 1.1558, - "step": 3524 - }, - { - "epoch": 1.1075642460294088, - "grad_norm": 0.77734375, - "learning_rate": 1.572707077118375e-05, - "loss": 1.2904, - "step": 3526 - }, - { - "epoch": 1.1081924730549502, - "grad_norm": 0.859375, - "learning_rate": 1.5724531894636626e-05, - "loss": 1.1995, - "step": 3528 - }, - { - "epoch": 1.1088207000804915, - "grad_norm": 0.78515625, - "learning_rate": 1.5721993018089497e-05, - "loss": 1.3063, - "step": 3530 - }, - { - "epoch": 1.109448927106033, - "grad_norm": 0.85546875, - "learning_rate": 1.571945414154237e-05, - "loss": 1.3033, - "step": 3532 - }, - { - "epoch": 1.1100771541315744, - "grad_norm": 0.81640625, - "learning_rate": 1.571691526499524e-05, - "loss": 1.2816, - "step": 3534 - }, - { - "epoch": 1.1107053811571157, - "grad_norm": 0.81640625, - "learning_rate": 1.5714376388448114e-05, - "loss": 1.3709, - "step": 3536 - }, - { - "epoch": 1.111333608182657, - "grad_norm": 0.8125, - "learning_rate": 1.5711837511900986e-05, - "loss": 1.1248, - "step": 3538 - }, - { - "epoch": 1.1119618352081984, - "grad_norm": 0.81640625, - "learning_rate": 1.5709298635353857e-05, - "loss": 1.2373, - "step": 3540 - }, - { - "epoch": 1.1125900622337397, - "grad_norm": 0.8671875, - "learning_rate": 1.570675975880673e-05, - "loss": 1.3827, - "step": 3542 - }, - { - "epoch": 1.113218289259281, - "grad_norm": 0.90234375, - "learning_rate": 1.5704220882259603e-05, - "loss": 1.2102, - "step": 3544 - }, - { - "epoch": 1.1138465162848223, - "grad_norm": 0.80859375, - "learning_rate": 1.570168200571247e-05, - "loss": 1.1883, - "step": 3546 - }, - { - "epoch": 1.114474743310364, - "grad_norm": 0.828125, - "learning_rate": 1.5699143129165346e-05, - "loss": 1.1861, - "step": 3548 - }, - { - "epoch": 1.1151029703359052, - "grad_norm": 0.80078125, - "learning_rate": 1.5696604252618218e-05, - "loss": 1.3095, - "step": 3550 - }, - { - "epoch": 1.1157311973614465, - "grad_norm": 0.76171875, - "learning_rate": 1.569406537607109e-05, - "loss": 1.1794, - "step": 3552 - }, - { - "epoch": 1.1163594243869879, - "grad_norm": 0.84765625, - "learning_rate": 1.569152649952396e-05, - "loss": 1.2195, - "step": 3554 - }, - { - "epoch": 1.1169876514125292, - "grad_norm": 0.7578125, - "learning_rate": 1.5688987622976835e-05, - "loss": 1.3026, - "step": 3556 - }, - { - "epoch": 1.1176158784380705, - "grad_norm": 0.859375, - "learning_rate": 1.5686448746429706e-05, - "loss": 1.308, - "step": 3558 - }, - { - "epoch": 1.1182441054636119, - "grad_norm": 0.734375, - "learning_rate": 1.5683909869882578e-05, - "loss": 1.192, - "step": 3560 - }, - { - "epoch": 1.1188723324891532, - "grad_norm": 0.76953125, - "learning_rate": 1.568137099333545e-05, - "loss": 1.1283, - "step": 3562 - }, - { - "epoch": 1.1195005595146945, - "grad_norm": 0.765625, - "learning_rate": 1.5678832116788324e-05, - "loss": 1.2358, - "step": 3564 - }, - { - "epoch": 1.120128786540236, - "grad_norm": 0.77734375, - "learning_rate": 1.5676293240241195e-05, - "loss": 1.1856, - "step": 3566 - }, - { - "epoch": 1.1207570135657774, - "grad_norm": 0.828125, - "learning_rate": 1.5673754363694067e-05, - "loss": 1.2214, - "step": 3568 - }, - { - "epoch": 1.1213852405913187, - "grad_norm": 0.796875, - "learning_rate": 1.5671215487146938e-05, - "loss": 1.1597, - "step": 3570 - }, - { - "epoch": 1.12201346761686, - "grad_norm": 0.796875, - "learning_rate": 1.566867661059981e-05, - "loss": 1.1691, - "step": 3572 - }, - { - "epoch": 1.1226416946424014, - "grad_norm": 0.78125, - "learning_rate": 1.566613773405268e-05, - "loss": 1.3458, - "step": 3574 - }, - { - "epoch": 1.1232699216679427, - "grad_norm": 0.7890625, - "learning_rate": 1.5663598857505556e-05, - "loss": 1.0832, - "step": 3576 - }, - { - "epoch": 1.123898148693484, - "grad_norm": 0.93359375, - "learning_rate": 1.5661059980958427e-05, - "loss": 1.2571, - "step": 3578 - }, - { - "epoch": 1.1245263757190254, - "grad_norm": 0.78515625, - "learning_rate": 1.56585211044113e-05, - "loss": 1.3284, - "step": 3580 - }, - { - "epoch": 1.125154602744567, - "grad_norm": 0.8984375, - "learning_rate": 1.565598222786417e-05, - "loss": 1.1614, - "step": 3582 - }, - { - "epoch": 1.1257828297701082, - "grad_norm": 0.78125, - "learning_rate": 1.5653443351317045e-05, - "loss": 1.208, - "step": 3584 - }, - { - "epoch": 1.1264110567956496, - "grad_norm": 0.75, - "learning_rate": 1.5650904474769916e-05, - "loss": 1.1761, - "step": 3586 - }, - { - "epoch": 1.127039283821191, - "grad_norm": 0.734375, - "learning_rate": 1.5648365598222787e-05, - "loss": 1.3036, - "step": 3588 - }, - { - "epoch": 1.1276675108467322, - "grad_norm": 0.765625, - "learning_rate": 1.564582672167566e-05, - "loss": 1.1291, - "step": 3590 - }, - { - "epoch": 1.1282957378722736, - "grad_norm": 0.8125, - "learning_rate": 1.5643287845128534e-05, - "loss": 1.2135, - "step": 3592 - }, - { - "epoch": 1.1289239648978149, - "grad_norm": 0.7421875, - "learning_rate": 1.56407489685814e-05, - "loss": 1.2834, - "step": 3594 - }, - { - "epoch": 1.1295521919233562, - "grad_norm": 0.828125, - "learning_rate": 1.5638210092034276e-05, - "loss": 1.328, - "step": 3596 - }, - { - "epoch": 1.1301804189488975, - "grad_norm": 0.79296875, - "learning_rate": 1.5635671215487148e-05, - "loss": 1.346, - "step": 3598 - }, - { - "epoch": 1.130808645974439, - "grad_norm": 0.74609375, - "learning_rate": 1.563313233894002e-05, - "loss": 1.0939, - "step": 3600 - }, - { - "epoch": 1.1314368729999804, - "grad_norm": 0.8046875, - "learning_rate": 1.5630593462392894e-05, - "loss": 1.243, - "step": 3602 - }, - { - "epoch": 1.1320651000255217, - "grad_norm": 0.84765625, - "learning_rate": 1.5628054585845765e-05, - "loss": 1.2313, - "step": 3604 - }, - { - "epoch": 1.132693327051063, - "grad_norm": 0.81640625, - "learning_rate": 1.5625515709298637e-05, - "loss": 1.2229, - "step": 3606 - }, - { - "epoch": 1.1333215540766044, - "grad_norm": 0.890625, - "learning_rate": 1.5622976832751508e-05, - "loss": 1.3329, - "step": 3608 - }, - { - "epoch": 1.1339497811021457, - "grad_norm": 0.96484375, - "learning_rate": 1.5620437956204383e-05, - "loss": 1.1156, - "step": 3610 - }, - { - "epoch": 1.134578008127687, - "grad_norm": 0.828125, - "learning_rate": 1.5617899079657254e-05, - "loss": 1.1354, - "step": 3612 - }, - { - "epoch": 1.1352062351532286, - "grad_norm": 0.85546875, - "learning_rate": 1.5615360203110125e-05, - "loss": 1.2974, - "step": 3614 - }, - { - "epoch": 1.13583446217877, - "grad_norm": 0.76953125, - "learning_rate": 1.5612821326562997e-05, - "loss": 1.3404, - "step": 3616 - }, - { - "epoch": 1.1364626892043113, - "grad_norm": 0.84765625, - "learning_rate": 1.561028245001587e-05, - "loss": 1.2677, - "step": 3618 - }, - { - "epoch": 1.1370909162298526, - "grad_norm": 0.82421875, - "learning_rate": 1.560774357346874e-05, - "loss": 1.1016, - "step": 3620 - }, - { - "epoch": 1.137719143255394, - "grad_norm": 0.703125, - "learning_rate": 1.5605204696921614e-05, - "loss": 1.2943, - "step": 3622 - }, - { - "epoch": 1.1383473702809352, - "grad_norm": 0.796875, - "learning_rate": 1.5602665820374486e-05, - "loss": 1.196, - "step": 3624 - }, - { - "epoch": 1.1389755973064766, - "grad_norm": 0.79296875, - "learning_rate": 1.5600126943827357e-05, - "loss": 1.1846, - "step": 3626 - }, - { - "epoch": 1.139603824332018, - "grad_norm": 0.8046875, - "learning_rate": 1.559758806728023e-05, - "loss": 1.2626, - "step": 3628 - }, - { - "epoch": 1.1402320513575592, - "grad_norm": 0.80078125, - "learning_rate": 1.5595049190733103e-05, - "loss": 1.3086, - "step": 3630 - }, - { - "epoch": 1.1408602783831008, - "grad_norm": 0.9453125, - "learning_rate": 1.5592510314185975e-05, - "loss": 1.2576, - "step": 3632 - }, - { - "epoch": 1.141488505408642, - "grad_norm": 0.8515625, - "learning_rate": 1.5589971437638846e-05, - "loss": 1.3336, - "step": 3634 - }, - { - "epoch": 1.1421167324341834, - "grad_norm": 0.71484375, - "learning_rate": 1.5587432561091717e-05, - "loss": 1.1999, - "step": 3636 - }, - { - "epoch": 1.1427449594597248, - "grad_norm": 0.90625, - "learning_rate": 1.5584893684544592e-05, - "loss": 1.2824, - "step": 3638 - }, - { - "epoch": 1.143373186485266, - "grad_norm": 0.8125, - "learning_rate": 1.558235480799746e-05, - "loss": 1.3522, - "step": 3640 - }, - { - "epoch": 1.1440014135108074, - "grad_norm": 0.96484375, - "learning_rate": 1.5579815931450335e-05, - "loss": 1.2944, - "step": 3642 - }, - { - "epoch": 1.1446296405363487, - "grad_norm": 0.77734375, - "learning_rate": 1.5577277054903206e-05, - "loss": 1.2327, - "step": 3644 - }, - { - "epoch": 1.1452578675618903, - "grad_norm": 0.796875, - "learning_rate": 1.5574738178356078e-05, - "loss": 1.3088, - "step": 3646 - }, - { - "epoch": 1.1458860945874316, - "grad_norm": 0.8046875, - "learning_rate": 1.557219930180895e-05, - "loss": 1.2625, - "step": 3648 - }, - { - "epoch": 1.146514321612973, - "grad_norm": 0.79296875, - "learning_rate": 1.5569660425261824e-05, - "loss": 1.3164, - "step": 3650 - }, - { - "epoch": 1.1471425486385143, - "grad_norm": 0.875, - "learning_rate": 1.5567121548714695e-05, - "loss": 1.1474, - "step": 3652 - }, - { - "epoch": 1.1477707756640556, - "grad_norm": 0.76953125, - "learning_rate": 1.5564582672167567e-05, - "loss": 1.3335, - "step": 3654 - }, - { - "epoch": 1.148399002689597, - "grad_norm": 0.74609375, - "learning_rate": 1.5562043795620438e-05, - "loss": 1.2914, - "step": 3656 - }, - { - "epoch": 1.1490272297151383, - "grad_norm": 0.8125, - "learning_rate": 1.5559504919073313e-05, - "loss": 1.2283, - "step": 3658 - }, - { - "epoch": 1.1496554567406796, - "grad_norm": 0.8203125, - "learning_rate": 1.5556966042526184e-05, - "loss": 1.2828, - "step": 3660 - }, - { - "epoch": 1.150283683766221, - "grad_norm": 0.96875, - "learning_rate": 1.5554427165979056e-05, - "loss": 1.2971, - "step": 3662 - }, - { - "epoch": 1.1509119107917622, - "grad_norm": 0.80859375, - "learning_rate": 1.5551888289431927e-05, - "loss": 1.1939, - "step": 3664 - }, - { - "epoch": 1.1515401378173038, - "grad_norm": 0.72265625, - "learning_rate": 1.55493494128848e-05, - "loss": 1.2101, - "step": 3666 - }, - { - "epoch": 1.1521683648428451, - "grad_norm": 0.79296875, - "learning_rate": 1.554681053633767e-05, - "loss": 1.3979, - "step": 3668 - }, - { - "epoch": 1.1527965918683865, - "grad_norm": 0.84375, - "learning_rate": 1.5544271659790545e-05, - "loss": 1.1002, - "step": 3670 - }, - { - "epoch": 1.1534248188939278, - "grad_norm": 0.80859375, - "learning_rate": 1.5541732783243416e-05, - "loss": 1.2375, - "step": 3672 - }, - { - "epoch": 1.1540530459194691, - "grad_norm": 0.71875, - "learning_rate": 1.5539193906696287e-05, - "loss": 1.315, - "step": 3674 - }, - { - "epoch": 1.1546812729450104, - "grad_norm": 0.74609375, - "learning_rate": 1.553665503014916e-05, - "loss": 1.3209, - "step": 3676 - }, - { - "epoch": 1.1553094999705518, - "grad_norm": 0.7265625, - "learning_rate": 1.5534116153602033e-05, - "loss": 1.2149, - "step": 3678 - }, - { - "epoch": 1.1559377269960933, - "grad_norm": 0.73828125, - "learning_rate": 1.5531577277054905e-05, - "loss": 1.2292, - "step": 3680 - }, - { - "epoch": 1.1565659540216346, - "grad_norm": 0.79296875, - "learning_rate": 1.5529038400507776e-05, - "loss": 1.148, - "step": 3682 - }, - { - "epoch": 1.157194181047176, - "grad_norm": 0.73828125, - "learning_rate": 1.552649952396065e-05, - "loss": 1.2629, - "step": 3684 - }, - { - "epoch": 1.1578224080727173, - "grad_norm": 0.7734375, - "learning_rate": 1.5523960647413522e-05, - "loss": 1.13, - "step": 3686 - }, - { - "epoch": 1.1584506350982586, - "grad_norm": 0.7578125, - "learning_rate": 1.5521421770866394e-05, - "loss": 1.2941, - "step": 3688 - }, - { - "epoch": 1.1590788621238, - "grad_norm": 0.8046875, - "learning_rate": 1.5518882894319265e-05, - "loss": 1.2437, - "step": 3690 - }, - { - "epoch": 1.1597070891493413, - "grad_norm": 0.84765625, - "learning_rate": 1.5516344017772136e-05, - "loss": 1.3109, - "step": 3692 - }, - { - "epoch": 1.1603353161748826, - "grad_norm": 0.8125, - "learning_rate": 1.5513805141225008e-05, - "loss": 1.0916, - "step": 3694 - }, - { - "epoch": 1.160963543200424, - "grad_norm": 0.84375, - "learning_rate": 1.5511266264677883e-05, - "loss": 1.1996, - "step": 3696 - }, - { - "epoch": 1.1615917702259655, - "grad_norm": 0.94140625, - "learning_rate": 1.5508727388130754e-05, - "loss": 1.2492, - "step": 3698 - }, - { - "epoch": 1.1622199972515068, - "grad_norm": 0.77734375, - "learning_rate": 1.5506188511583625e-05, - "loss": 1.2623, - "step": 3700 - }, - { - "epoch": 1.1628482242770481, - "grad_norm": 0.83203125, - "learning_rate": 1.5503649635036497e-05, - "loss": 1.4083, - "step": 3702 - }, - { - "epoch": 1.1634764513025895, - "grad_norm": 0.78515625, - "learning_rate": 1.550111075848937e-05, - "loss": 1.2405, - "step": 3704 - }, - { - "epoch": 1.1641046783281308, - "grad_norm": 0.86328125, - "learning_rate": 1.5498571881942243e-05, - "loss": 1.0913, - "step": 3706 - }, - { - "epoch": 1.1647329053536721, - "grad_norm": 0.83984375, - "learning_rate": 1.5496033005395114e-05, - "loss": 1.1841, - "step": 3708 - }, - { - "epoch": 1.1653611323792135, - "grad_norm": 0.77734375, - "learning_rate": 1.5493494128847986e-05, - "loss": 1.3798, - "step": 3710 - }, - { - "epoch": 1.165989359404755, - "grad_norm": 0.765625, - "learning_rate": 1.549095525230086e-05, - "loss": 1.3218, - "step": 3712 - }, - { - "epoch": 1.1666175864302963, - "grad_norm": 1.0, - "learning_rate": 1.548841637575373e-05, - "loss": 1.3271, - "step": 3714 - }, - { - "epoch": 1.1672458134558377, - "grad_norm": 0.796875, - "learning_rate": 1.5485877499206603e-05, - "loss": 1.3283, - "step": 3716 - }, - { - "epoch": 1.167874040481379, - "grad_norm": 0.88671875, - "learning_rate": 1.5483338622659475e-05, - "loss": 1.1618, - "step": 3718 - }, - { - "epoch": 1.1685022675069203, - "grad_norm": 0.83984375, - "learning_rate": 1.5480799746112346e-05, - "loss": 1.2939, - "step": 3720 - }, - { - "epoch": 1.1691304945324617, - "grad_norm": 0.8203125, - "learning_rate": 1.5478260869565217e-05, - "loss": 1.2001, - "step": 3722 - }, - { - "epoch": 1.169758721558003, - "grad_norm": 0.8125, - "learning_rate": 1.5475721993018092e-05, - "loss": 1.2964, - "step": 3724 - }, - { - "epoch": 1.1703869485835443, - "grad_norm": 0.75390625, - "learning_rate": 1.5473183116470964e-05, - "loss": 1.2945, - "step": 3726 - }, - { - "epoch": 1.1710151756090856, - "grad_norm": 0.7578125, - "learning_rate": 1.5470644239923835e-05, - "loss": 1.3001, - "step": 3728 - }, - { - "epoch": 1.1716434026346272, - "grad_norm": 0.78515625, - "learning_rate": 1.5468105363376706e-05, - "loss": 1.0717, - "step": 3730 - }, - { - "epoch": 1.1722716296601685, - "grad_norm": 0.890625, - "learning_rate": 1.546556648682958e-05, - "loss": 1.1697, - "step": 3732 - }, - { - "epoch": 1.1728998566857098, - "grad_norm": 0.91796875, - "learning_rate": 1.546302761028245e-05, - "loss": 1.1945, - "step": 3734 - }, - { - "epoch": 1.1735280837112512, - "grad_norm": 0.765625, - "learning_rate": 1.5460488733735324e-05, - "loss": 1.1745, - "step": 3736 - }, - { - "epoch": 1.1741563107367925, - "grad_norm": 0.83203125, - "learning_rate": 1.5457949857188195e-05, - "loss": 1.2856, - "step": 3738 - }, - { - "epoch": 1.1747845377623338, - "grad_norm": 0.76953125, - "learning_rate": 1.5455410980641067e-05, - "loss": 1.2289, - "step": 3740 - }, - { - "epoch": 1.1754127647878752, - "grad_norm": 0.8671875, - "learning_rate": 1.5452872104093938e-05, - "loss": 1.1443, - "step": 3742 - }, - { - "epoch": 1.1760409918134165, - "grad_norm": 0.81640625, - "learning_rate": 1.5450333227546813e-05, - "loss": 1.2261, - "step": 3744 - }, - { - "epoch": 1.176669218838958, - "grad_norm": 0.77734375, - "learning_rate": 1.5447794350999684e-05, - "loss": 1.2703, - "step": 3746 - }, - { - "epoch": 1.1772974458644994, - "grad_norm": 0.73828125, - "learning_rate": 1.5445255474452556e-05, - "loss": 1.2568, - "step": 3748 - }, - { - "epoch": 1.1779256728900407, - "grad_norm": 0.7265625, - "learning_rate": 1.5442716597905427e-05, - "loss": 1.2741, - "step": 3750 - }, - { - "epoch": 1.178553899915582, - "grad_norm": 0.796875, - "learning_rate": 1.54401777213583e-05, - "loss": 1.2063, - "step": 3752 - }, - { - "epoch": 1.1791821269411233, - "grad_norm": 0.75390625, - "learning_rate": 1.5437638844811173e-05, - "loss": 1.2187, - "step": 3754 - }, - { - "epoch": 1.1798103539666647, - "grad_norm": 0.77734375, - "learning_rate": 1.5435099968264044e-05, - "loss": 1.2115, - "step": 3756 - }, - { - "epoch": 1.180438580992206, - "grad_norm": 0.84375, - "learning_rate": 1.5432561091716916e-05, - "loss": 1.2467, - "step": 3758 - }, - { - "epoch": 1.1810668080177473, - "grad_norm": 0.76953125, - "learning_rate": 1.5430022215169787e-05, - "loss": 1.3909, - "step": 3760 - }, - { - "epoch": 1.1816950350432887, - "grad_norm": 0.8359375, - "learning_rate": 1.542748333862266e-05, - "loss": 1.3045, - "step": 3762 - }, - { - "epoch": 1.1823232620688302, - "grad_norm": 0.76953125, - "learning_rate": 1.5424944462075533e-05, - "loss": 1.2189, - "step": 3764 - }, - { - "epoch": 1.1829514890943715, - "grad_norm": 0.80078125, - "learning_rate": 1.5422405585528405e-05, - "loss": 1.0881, - "step": 3766 - }, - { - "epoch": 1.1835797161199129, - "grad_norm": 0.74609375, - "learning_rate": 1.5419866708981276e-05, - "loss": 1.2768, - "step": 3768 - }, - { - "epoch": 1.1842079431454542, - "grad_norm": 0.84765625, - "learning_rate": 1.541732783243415e-05, - "loss": 1.3732, - "step": 3770 - }, - { - "epoch": 1.1848361701709955, - "grad_norm": 0.8203125, - "learning_rate": 1.5414788955887022e-05, - "loss": 1.3118, - "step": 3772 - }, - { - "epoch": 1.1854643971965368, - "grad_norm": 0.8359375, - "learning_rate": 1.5412250079339894e-05, - "loss": 1.3224, - "step": 3774 - }, - { - "epoch": 1.1860926242220782, - "grad_norm": 0.828125, - "learning_rate": 1.5409711202792765e-05, - "loss": 1.1572, - "step": 3776 - }, - { - "epoch": 1.1867208512476197, - "grad_norm": 0.73828125, - "learning_rate": 1.540717232624564e-05, - "loss": 1.2863, - "step": 3778 - }, - { - "epoch": 1.187349078273161, - "grad_norm": 0.76171875, - "learning_rate": 1.5404633449698508e-05, - "loss": 1.1624, - "step": 3780 - }, - { - "epoch": 1.1879773052987024, - "grad_norm": 0.78515625, - "learning_rate": 1.5402094573151383e-05, - "loss": 1.3295, - "step": 3782 - }, - { - "epoch": 1.1886055323242437, - "grad_norm": 0.79296875, - "learning_rate": 1.5399555696604254e-05, - "loss": 1.3428, - "step": 3784 - }, - { - "epoch": 1.189233759349785, - "grad_norm": 0.8046875, - "learning_rate": 1.5397016820057125e-05, - "loss": 1.2549, - "step": 3786 - }, - { - "epoch": 1.1898619863753264, - "grad_norm": 0.734375, - "learning_rate": 1.5394477943509997e-05, - "loss": 1.2031, - "step": 3788 - }, - { - "epoch": 1.1904902134008677, - "grad_norm": 0.74609375, - "learning_rate": 1.539193906696287e-05, - "loss": 1.13, - "step": 3790 - }, - { - "epoch": 1.191118440426409, - "grad_norm": 0.82421875, - "learning_rate": 1.5389400190415743e-05, - "loss": 1.2701, - "step": 3792 - }, - { - "epoch": 1.1917466674519503, - "grad_norm": 0.828125, - "learning_rate": 1.5386861313868614e-05, - "loss": 1.208, - "step": 3794 - }, - { - "epoch": 1.192374894477492, - "grad_norm": 0.80078125, - "learning_rate": 1.5384322437321486e-05, - "loss": 1.2884, - "step": 3796 - }, - { - "epoch": 1.1930031215030332, - "grad_norm": 0.78125, - "learning_rate": 1.538178356077436e-05, - "loss": 1.1525, - "step": 3798 - }, - { - "epoch": 1.1936313485285746, - "grad_norm": 0.8515625, - "learning_rate": 1.5379244684227232e-05, - "loss": 1.3432, - "step": 3800 - }, - { - "epoch": 1.1942595755541159, - "grad_norm": 0.73828125, - "learning_rate": 1.5376705807680103e-05, - "loss": 1.1941, - "step": 3802 - }, - { - "epoch": 1.1948878025796572, - "grad_norm": 0.76953125, - "learning_rate": 1.5374166931132975e-05, - "loss": 1.1627, - "step": 3804 - }, - { - "epoch": 1.1955160296051985, - "grad_norm": 0.83984375, - "learning_rate": 1.5371628054585846e-05, - "loss": 1.1927, - "step": 3806 - }, - { - "epoch": 1.1961442566307399, - "grad_norm": 0.73046875, - "learning_rate": 1.5369089178038717e-05, - "loss": 1.2496, - "step": 3808 - }, - { - "epoch": 1.1967724836562812, - "grad_norm": 0.7578125, - "learning_rate": 1.5366550301491592e-05, - "loss": 1.3009, - "step": 3810 - }, - { - "epoch": 1.1974007106818227, - "grad_norm": 0.76953125, - "learning_rate": 1.5364011424944463e-05, - "loss": 1.2321, - "step": 3812 - }, - { - "epoch": 1.198028937707364, - "grad_norm": 0.78515625, - "learning_rate": 1.5361472548397335e-05, - "loss": 1.3292, - "step": 3814 - }, - { - "epoch": 1.1986571647329054, - "grad_norm": 0.8125, - "learning_rate": 1.5358933671850206e-05, - "loss": 1.1956, - "step": 3816 - }, - { - "epoch": 1.1992853917584467, - "grad_norm": 0.77734375, - "learning_rate": 1.535639479530308e-05, - "loss": 1.1833, - "step": 3818 - }, - { - "epoch": 1.199913618783988, - "grad_norm": 0.828125, - "learning_rate": 1.5353855918755952e-05, - "loss": 1.3083, - "step": 3820 - }, - { - "epoch": 1.2005418458095294, - "grad_norm": 0.84375, - "learning_rate": 1.5351317042208824e-05, - "loss": 1.2926, - "step": 3822 - }, - { - "epoch": 1.2011700728350707, - "grad_norm": 0.859375, - "learning_rate": 1.5348778165661695e-05, - "loss": 1.2154, - "step": 3824 - }, - { - "epoch": 1.201798299860612, - "grad_norm": 0.828125, - "learning_rate": 1.534623928911457e-05, - "loss": 1.2833, - "step": 3826 - }, - { - "epoch": 1.2024265268861534, - "grad_norm": 0.84765625, - "learning_rate": 1.5343700412567438e-05, - "loss": 1.2619, - "step": 3828 - }, - { - "epoch": 1.203054753911695, - "grad_norm": 0.89453125, - "learning_rate": 1.5341161536020313e-05, - "loss": 1.1693, - "step": 3830 - }, - { - "epoch": 1.2036829809372362, - "grad_norm": 0.82421875, - "learning_rate": 1.5338622659473184e-05, - "loss": 1.388, - "step": 3832 - }, - { - "epoch": 1.2043112079627776, - "grad_norm": 0.73828125, - "learning_rate": 1.5336083782926055e-05, - "loss": 1.1663, - "step": 3834 - }, - { - "epoch": 1.204939434988319, - "grad_norm": 0.80078125, - "learning_rate": 1.5333544906378927e-05, - "loss": 1.2422, - "step": 3836 - }, - { - "epoch": 1.2055676620138602, - "grad_norm": 0.76953125, - "learning_rate": 1.53310060298318e-05, - "loss": 1.3582, - "step": 3838 - }, - { - "epoch": 1.2061958890394016, - "grad_norm": 0.75, - "learning_rate": 1.5328467153284673e-05, - "loss": 1.279, - "step": 3840 - }, - { - "epoch": 1.2068241160649429, - "grad_norm": 0.796875, - "learning_rate": 1.5325928276737544e-05, - "loss": 1.1224, - "step": 3842 - }, - { - "epoch": 1.2074523430904844, - "grad_norm": 0.75, - "learning_rate": 1.5323389400190416e-05, - "loss": 1.164, - "step": 3844 - }, - { - "epoch": 1.2080805701160258, - "grad_norm": 0.76171875, - "learning_rate": 1.532085052364329e-05, - "loss": 1.2088, - "step": 3846 - }, - { - "epoch": 1.208708797141567, - "grad_norm": 0.80078125, - "learning_rate": 1.531831164709616e-05, - "loss": 1.2329, - "step": 3848 - }, - { - "epoch": 1.2093370241671084, - "grad_norm": 0.7890625, - "learning_rate": 1.5315772770549033e-05, - "loss": 1.251, - "step": 3850 - }, - { - "epoch": 1.2099652511926497, - "grad_norm": 0.88671875, - "learning_rate": 1.5313233894001908e-05, - "loss": 1.2042, - "step": 3852 - }, - { - "epoch": 1.210593478218191, - "grad_norm": 0.8046875, - "learning_rate": 1.5310695017454776e-05, - "loss": 1.143, - "step": 3854 - }, - { - "epoch": 1.2112217052437324, - "grad_norm": 0.7734375, - "learning_rate": 1.530815614090765e-05, - "loss": 1.319, - "step": 3856 - }, - { - "epoch": 1.2118499322692737, - "grad_norm": 0.80078125, - "learning_rate": 1.5305617264360522e-05, - "loss": 1.2545, - "step": 3858 - }, - { - "epoch": 1.212478159294815, - "grad_norm": 0.75390625, - "learning_rate": 1.5303078387813394e-05, - "loss": 1.2631, - "step": 3860 - }, - { - "epoch": 1.2131063863203566, - "grad_norm": 0.84375, - "learning_rate": 1.5300539511266265e-05, - "loss": 1.2877, - "step": 3862 - }, - { - "epoch": 1.213734613345898, - "grad_norm": 0.76953125, - "learning_rate": 1.529800063471914e-05, - "loss": 1.2363, - "step": 3864 - }, - { - "epoch": 1.2143628403714393, - "grad_norm": 0.87890625, - "learning_rate": 1.529546175817201e-05, - "loss": 1.1702, - "step": 3866 - }, - { - "epoch": 1.2149910673969806, - "grad_norm": 0.7734375, - "learning_rate": 1.5292922881624882e-05, - "loss": 1.25, - "step": 3868 - }, - { - "epoch": 1.215619294422522, - "grad_norm": 0.83984375, - "learning_rate": 1.5290384005077754e-05, - "loss": 1.3495, - "step": 3870 - }, - { - "epoch": 1.2162475214480633, - "grad_norm": 0.7734375, - "learning_rate": 1.528784512853063e-05, - "loss": 1.2843, - "step": 3872 - }, - { - "epoch": 1.2168757484736046, - "grad_norm": 0.7109375, - "learning_rate": 1.5285306251983497e-05, - "loss": 1.2542, - "step": 3874 - }, - { - "epoch": 1.217503975499146, - "grad_norm": 0.74609375, - "learning_rate": 1.528276737543637e-05, - "loss": 1.3101, - "step": 3876 - }, - { - "epoch": 1.2181322025246875, - "grad_norm": 0.7734375, - "learning_rate": 1.5280228498889243e-05, - "loss": 1.28, - "step": 3878 - }, - { - "epoch": 1.2187604295502288, - "grad_norm": 0.7734375, - "learning_rate": 1.5277689622342114e-05, - "loss": 1.2766, - "step": 3880 - }, - { - "epoch": 1.2193886565757701, - "grad_norm": 0.796875, - "learning_rate": 1.5275150745794986e-05, - "loss": 1.236, - "step": 3882 - }, - { - "epoch": 1.2200168836013114, - "grad_norm": 0.79296875, - "learning_rate": 1.527261186924786e-05, - "loss": 1.1882, - "step": 3884 - }, - { - "epoch": 1.2206451106268528, - "grad_norm": 0.82421875, - "learning_rate": 1.5270072992700732e-05, - "loss": 1.1035, - "step": 3886 - }, - { - "epoch": 1.221273337652394, - "grad_norm": 0.796875, - "learning_rate": 1.5267534116153603e-05, - "loss": 1.1533, - "step": 3888 - }, - { - "epoch": 1.2219015646779354, - "grad_norm": 0.8125, - "learning_rate": 1.5264995239606474e-05, - "loss": 1.1649, - "step": 3890 - }, - { - "epoch": 1.2225297917034768, - "grad_norm": 0.875, - "learning_rate": 1.526245636305935e-05, - "loss": 1.2608, - "step": 3892 - }, - { - "epoch": 1.223158018729018, - "grad_norm": 0.83984375, - "learning_rate": 1.525991748651222e-05, - "loss": 1.4731, - "step": 3894 - }, - { - "epoch": 1.2237862457545596, - "grad_norm": 0.7578125, - "learning_rate": 1.5257378609965092e-05, - "loss": 1.276, - "step": 3896 - }, - { - "epoch": 1.224414472780101, - "grad_norm": 0.7265625, - "learning_rate": 1.5254839733417963e-05, - "loss": 1.1836, - "step": 3898 - }, - { - "epoch": 1.2250426998056423, - "grad_norm": 0.8515625, - "learning_rate": 1.5252300856870836e-05, - "loss": 1.1804, - "step": 3900 - }, - { - "epoch": 1.2256709268311836, - "grad_norm": 0.765625, - "learning_rate": 1.5249761980323708e-05, - "loss": 1.3125, - "step": 3902 - }, - { - "epoch": 1.226299153856725, - "grad_norm": 0.7421875, - "learning_rate": 1.5247223103776581e-05, - "loss": 1.2143, - "step": 3904 - }, - { - "epoch": 1.2269273808822663, - "grad_norm": 0.80859375, - "learning_rate": 1.5244684227229452e-05, - "loss": 1.2005, - "step": 3906 - }, - { - "epoch": 1.2275556079078076, - "grad_norm": 0.8359375, - "learning_rate": 1.5242145350682325e-05, - "loss": 1.2187, - "step": 3908 - }, - { - "epoch": 1.2281838349333492, - "grad_norm": 0.734375, - "learning_rate": 1.5239606474135195e-05, - "loss": 1.3649, - "step": 3910 - }, - { - "epoch": 1.2288120619588905, - "grad_norm": 0.85546875, - "learning_rate": 1.5237067597588068e-05, - "loss": 1.2188, - "step": 3912 - }, - { - "epoch": 1.2294402889844318, - "grad_norm": 0.80078125, - "learning_rate": 1.523452872104094e-05, - "loss": 1.1949, - "step": 3914 - }, - { - "epoch": 1.2300685160099731, - "grad_norm": 0.796875, - "learning_rate": 1.5231989844493813e-05, - "loss": 1.2312, - "step": 3916 - }, - { - "epoch": 1.2306967430355145, - "grad_norm": 0.765625, - "learning_rate": 1.5229450967946684e-05, - "loss": 1.1944, - "step": 3918 - }, - { - "epoch": 1.2313249700610558, - "grad_norm": 0.81640625, - "learning_rate": 1.5226912091399557e-05, - "loss": 1.2278, - "step": 3920 - }, - { - "epoch": 1.2319531970865971, - "grad_norm": 0.7578125, - "learning_rate": 1.5224373214852428e-05, - "loss": 1.2428, - "step": 3922 - }, - { - "epoch": 1.2325814241121384, - "grad_norm": 0.73828125, - "learning_rate": 1.5221834338305302e-05, - "loss": 1.3336, - "step": 3924 - }, - { - "epoch": 1.2332096511376798, - "grad_norm": 0.76171875, - "learning_rate": 1.5219295461758173e-05, - "loss": 1.3555, - "step": 3926 - }, - { - "epoch": 1.2338378781632213, - "grad_norm": 0.875, - "learning_rate": 1.5216756585211046e-05, - "loss": 1.1514, - "step": 3928 - }, - { - "epoch": 1.2344661051887627, - "grad_norm": 0.70703125, - "learning_rate": 1.5214217708663916e-05, - "loss": 1.3315, - "step": 3930 - }, - { - "epoch": 1.235094332214304, - "grad_norm": 0.7890625, - "learning_rate": 1.521167883211679e-05, - "loss": 1.2199, - "step": 3932 - }, - { - "epoch": 1.2357225592398453, - "grad_norm": 0.84375, - "learning_rate": 1.520913995556966e-05, - "loss": 1.1983, - "step": 3934 - }, - { - "epoch": 1.2363507862653866, - "grad_norm": 0.73828125, - "learning_rate": 1.5206601079022533e-05, - "loss": 1.2382, - "step": 3936 - }, - { - "epoch": 1.236979013290928, - "grad_norm": 0.91015625, - "learning_rate": 1.5204062202475406e-05, - "loss": 1.1452, - "step": 3938 - }, - { - "epoch": 1.2376072403164693, - "grad_norm": 0.84765625, - "learning_rate": 1.5201523325928278e-05, - "loss": 1.2165, - "step": 3940 - }, - { - "epoch": 1.2382354673420108, - "grad_norm": 0.95703125, - "learning_rate": 1.519898444938115e-05, - "loss": 1.1627, - "step": 3942 - }, - { - "epoch": 1.2388636943675522, - "grad_norm": 0.8125, - "learning_rate": 1.5196445572834022e-05, - "loss": 1.1965, - "step": 3944 - }, - { - "epoch": 1.2394919213930935, - "grad_norm": 0.69921875, - "learning_rate": 1.5193906696286895e-05, - "loss": 1.19, - "step": 3946 - }, - { - "epoch": 1.2401201484186348, - "grad_norm": 0.796875, - "learning_rate": 1.5191367819739767e-05, - "loss": 1.2672, - "step": 3948 - }, - { - "epoch": 1.2407483754441762, - "grad_norm": 0.84765625, - "learning_rate": 1.518882894319264e-05, - "loss": 1.2684, - "step": 3950 - }, - { - "epoch": 1.2413766024697175, - "grad_norm": 0.76953125, - "learning_rate": 1.5186290066645511e-05, - "loss": 1.3026, - "step": 3952 - }, - { - "epoch": 1.2420048294952588, - "grad_norm": 0.7421875, - "learning_rate": 1.5183751190098384e-05, - "loss": 1.2142, - "step": 3954 - }, - { - "epoch": 1.2426330565208001, - "grad_norm": 0.7421875, - "learning_rate": 1.5181212313551254e-05, - "loss": 1.2394, - "step": 3956 - }, - { - "epoch": 1.2432612835463415, - "grad_norm": 0.84765625, - "learning_rate": 1.5178673437004129e-05, - "loss": 1.1269, - "step": 3958 - }, - { - "epoch": 1.2438895105718828, - "grad_norm": 0.80078125, - "learning_rate": 1.5176134560456998e-05, - "loss": 1.2404, - "step": 3960 - }, - { - "epoch": 1.2445177375974243, - "grad_norm": 0.8671875, - "learning_rate": 1.5173595683909871e-05, - "loss": 1.2585, - "step": 3962 - }, - { - "epoch": 1.2451459646229657, - "grad_norm": 0.80859375, - "learning_rate": 1.5171056807362743e-05, - "loss": 1.2264, - "step": 3964 - }, - { - "epoch": 1.245774191648507, - "grad_norm": 0.80859375, - "learning_rate": 1.5168517930815616e-05, - "loss": 1.3453, - "step": 3966 - }, - { - "epoch": 1.2464024186740483, - "grad_norm": 0.80859375, - "learning_rate": 1.5165979054268487e-05, - "loss": 1.2773, - "step": 3968 - }, - { - "epoch": 1.2470306456995897, - "grad_norm": 0.8671875, - "learning_rate": 1.516344017772136e-05, - "loss": 1.15, - "step": 3970 - }, - { - "epoch": 1.247658872725131, - "grad_norm": 0.80078125, - "learning_rate": 1.5160901301174232e-05, - "loss": 1.3222, - "step": 3972 - }, - { - "epoch": 1.2482870997506723, - "grad_norm": 0.796875, - "learning_rate": 1.5158362424627105e-05, - "loss": 1.1521, - "step": 3974 - }, - { - "epoch": 1.2489153267762139, - "grad_norm": 0.91015625, - "learning_rate": 1.5155823548079976e-05, - "loss": 1.2704, - "step": 3976 - }, - { - "epoch": 1.2495435538017552, - "grad_norm": 0.76171875, - "learning_rate": 1.515328467153285e-05, - "loss": 1.3046, - "step": 3978 - }, - { - "epoch": 1.2501717808272965, - "grad_norm": 0.75390625, - "learning_rate": 1.5150745794985719e-05, - "loss": 1.1866, - "step": 3980 - }, - { - "epoch": 1.2508000078528378, - "grad_norm": 0.73828125, - "learning_rate": 1.5148206918438592e-05, - "loss": 1.3763, - "step": 3982 - }, - { - "epoch": 1.2514282348783792, - "grad_norm": 0.76171875, - "learning_rate": 1.5145668041891463e-05, - "loss": 1.3093, - "step": 3984 - }, - { - "epoch": 1.2520564619039205, - "grad_norm": 0.79296875, - "learning_rate": 1.5143129165344336e-05, - "loss": 1.1302, - "step": 3986 - }, - { - "epoch": 1.2526846889294618, - "grad_norm": 0.8203125, - "learning_rate": 1.5140590288797208e-05, - "loss": 1.2198, - "step": 3988 - }, - { - "epoch": 1.2533129159550032, - "grad_norm": 0.7421875, - "learning_rate": 1.5138051412250081e-05, - "loss": 1.2666, - "step": 3990 - }, - { - "epoch": 1.2539411429805445, - "grad_norm": 0.921875, - "learning_rate": 1.5135512535702952e-05, - "loss": 1.0958, - "step": 3992 - }, - { - "epoch": 1.2545693700060858, - "grad_norm": 0.8515625, - "learning_rate": 1.5132973659155825e-05, - "loss": 1.2114, - "step": 3994 - }, - { - "epoch": 1.2551975970316274, - "grad_norm": 0.78515625, - "learning_rate": 1.5130434782608697e-05, - "loss": 1.3153, - "step": 3996 - }, - { - "epoch": 1.2558258240571687, - "grad_norm": 0.8125, - "learning_rate": 1.512789590606157e-05, - "loss": 1.2129, - "step": 3998 - }, - { - "epoch": 1.25645405108271, - "grad_norm": 0.7890625, - "learning_rate": 1.512535702951444e-05, - "loss": 1.2922, - "step": 4000 - }, - { - "epoch": 1.2570822781082513, - "grad_norm": 0.7578125, - "learning_rate": 1.5122818152967314e-05, - "loss": 1.1252, - "step": 4002 - }, - { - "epoch": 1.2577105051337927, - "grad_norm": 0.77734375, - "learning_rate": 1.5120279276420184e-05, - "loss": 1.1488, - "step": 4004 - }, - { - "epoch": 1.258338732159334, - "grad_norm": 0.80078125, - "learning_rate": 1.5117740399873057e-05, - "loss": 1.2191, - "step": 4006 - }, - { - "epoch": 1.2589669591848756, - "grad_norm": 0.859375, - "learning_rate": 1.5115201523325928e-05, - "loss": 1.345, - "step": 4008 - }, - { - "epoch": 1.2595951862104169, - "grad_norm": 0.7734375, - "learning_rate": 1.5112662646778801e-05, - "loss": 1.2722, - "step": 4010 - }, - { - "epoch": 1.2602234132359582, - "grad_norm": 0.75390625, - "learning_rate": 1.5110123770231673e-05, - "loss": 1.2438, - "step": 4012 - }, - { - "epoch": 1.2608516402614995, - "grad_norm": 0.7734375, - "learning_rate": 1.5107584893684546e-05, - "loss": 1.2488, - "step": 4014 - }, - { - "epoch": 1.2614798672870409, - "grad_norm": 0.765625, - "learning_rate": 1.5105046017137417e-05, - "loss": 1.1744, - "step": 4016 - }, - { - "epoch": 1.2621080943125822, - "grad_norm": 0.78515625, - "learning_rate": 1.510250714059029e-05, - "loss": 1.2071, - "step": 4018 - }, - { - "epoch": 1.2627363213381235, - "grad_norm": 0.75, - "learning_rate": 1.5099968264043162e-05, - "loss": 1.3156, - "step": 4020 - }, - { - "epoch": 1.2633645483636649, - "grad_norm": 0.890625, - "learning_rate": 1.5097429387496035e-05, - "loss": 1.1231, - "step": 4022 - }, - { - "epoch": 1.2639927753892062, - "grad_norm": 0.796875, - "learning_rate": 1.5094890510948908e-05, - "loss": 1.151, - "step": 4024 - }, - { - "epoch": 1.2646210024147475, - "grad_norm": 0.75, - "learning_rate": 1.5092351634401778e-05, - "loss": 1.3427, - "step": 4026 - }, - { - "epoch": 1.265249229440289, - "grad_norm": 0.796875, - "learning_rate": 1.5089812757854652e-05, - "loss": 1.2093, - "step": 4028 - }, - { - "epoch": 1.2658774564658304, - "grad_norm": 0.765625, - "learning_rate": 1.5087273881307522e-05, - "loss": 1.179, - "step": 4030 - }, - { - "epoch": 1.2665056834913717, - "grad_norm": 0.8125, - "learning_rate": 1.5084735004760395e-05, - "loss": 1.2487, - "step": 4032 - }, - { - "epoch": 1.267133910516913, - "grad_norm": 0.81640625, - "learning_rate": 1.5082196128213267e-05, - "loss": 1.2788, - "step": 4034 - }, - { - "epoch": 1.2677621375424544, - "grad_norm": 0.78515625, - "learning_rate": 1.507965725166614e-05, - "loss": 1.2915, - "step": 4036 - }, - { - "epoch": 1.2683903645679957, - "grad_norm": 0.8046875, - "learning_rate": 1.5077118375119011e-05, - "loss": 1.2832, - "step": 4038 - }, - { - "epoch": 1.2690185915935372, - "grad_norm": 0.84375, - "learning_rate": 1.5074579498571884e-05, - "loss": 1.2789, - "step": 4040 - }, - { - "epoch": 1.2696468186190786, - "grad_norm": 0.74609375, - "learning_rate": 1.5072040622024755e-05, - "loss": 1.0852, - "step": 4042 - }, - { - "epoch": 1.27027504564462, - "grad_norm": 0.8828125, - "learning_rate": 1.5069501745477629e-05, - "loss": 1.2983, - "step": 4044 - }, - { - "epoch": 1.2709032726701612, - "grad_norm": 0.8203125, - "learning_rate": 1.50669628689305e-05, - "loss": 1.2256, - "step": 4046 - }, - { - "epoch": 1.2715314996957026, - "grad_norm": 0.79296875, - "learning_rate": 1.5064423992383373e-05, - "loss": 1.2011, - "step": 4048 - }, - { - "epoch": 1.2721597267212439, - "grad_norm": 0.7109375, - "learning_rate": 1.5061885115836243e-05, - "loss": 1.2055, - "step": 4050 - }, - { - "epoch": 1.2727879537467852, - "grad_norm": 0.828125, - "learning_rate": 1.5059346239289116e-05, - "loss": 1.1325, - "step": 4052 - }, - { - "epoch": 1.2734161807723265, - "grad_norm": 0.828125, - "learning_rate": 1.5056807362741987e-05, - "loss": 1.182, - "step": 4054 - }, - { - "epoch": 1.2740444077978679, - "grad_norm": 1.1796875, - "learning_rate": 1.505426848619486e-05, - "loss": 1.2868, - "step": 4056 - }, - { - "epoch": 1.2746726348234092, - "grad_norm": 0.7578125, - "learning_rate": 1.5051729609647732e-05, - "loss": 1.3885, - "step": 4058 - }, - { - "epoch": 1.2753008618489505, - "grad_norm": 0.7265625, - "learning_rate": 1.5049190733100605e-05, - "loss": 1.2637, - "step": 4060 - }, - { - "epoch": 1.275929088874492, - "grad_norm": 0.7265625, - "learning_rate": 1.5046651856553476e-05, - "loss": 1.2665, - "step": 4062 - }, - { - "epoch": 1.2765573159000334, - "grad_norm": 0.72265625, - "learning_rate": 1.5044112980006349e-05, - "loss": 1.3012, - "step": 4064 - }, - { - "epoch": 1.2771855429255747, - "grad_norm": 0.953125, - "learning_rate": 1.504157410345922e-05, - "loss": 1.2597, - "step": 4066 - }, - { - "epoch": 1.277813769951116, - "grad_norm": 0.7265625, - "learning_rate": 1.5039035226912094e-05, - "loss": 1.2633, - "step": 4068 - }, - { - "epoch": 1.2784419969766574, - "grad_norm": 0.7890625, - "learning_rate": 1.5036496350364965e-05, - "loss": 1.2377, - "step": 4070 - }, - { - "epoch": 1.2790702240021987, - "grad_norm": 0.80859375, - "learning_rate": 1.5033957473817838e-05, - "loss": 1.4411, - "step": 4072 - }, - { - "epoch": 1.2796984510277403, - "grad_norm": 0.75390625, - "learning_rate": 1.5031418597270708e-05, - "loss": 1.1997, - "step": 4074 - }, - { - "epoch": 1.2803266780532816, - "grad_norm": 0.77734375, - "learning_rate": 1.502887972072358e-05, - "loss": 1.2864, - "step": 4076 - }, - { - "epoch": 1.280954905078823, - "grad_norm": 0.8125, - "learning_rate": 1.5026340844176452e-05, - "loss": 1.293, - "step": 4078 - }, - { - "epoch": 1.2815831321043643, - "grad_norm": 0.77734375, - "learning_rate": 1.5023801967629325e-05, - "loss": 1.2866, - "step": 4080 - }, - { - "epoch": 1.2822113591299056, - "grad_norm": 0.73046875, - "learning_rate": 1.5021263091082197e-05, - "loss": 1.2788, - "step": 4082 - }, - { - "epoch": 1.282839586155447, - "grad_norm": 0.828125, - "learning_rate": 1.501872421453507e-05, - "loss": 1.1705, - "step": 4084 - }, - { - "epoch": 1.2834678131809882, - "grad_norm": 0.86328125, - "learning_rate": 1.5016185337987941e-05, - "loss": 1.1257, - "step": 4086 - }, - { - "epoch": 1.2840960402065296, - "grad_norm": 0.76953125, - "learning_rate": 1.5013646461440814e-05, - "loss": 1.2574, - "step": 4088 - }, - { - "epoch": 1.284724267232071, - "grad_norm": 0.6953125, - "learning_rate": 1.5011107584893686e-05, - "loss": 1.2126, - "step": 4090 - }, - { - "epoch": 1.2853524942576122, - "grad_norm": 0.6953125, - "learning_rate": 1.5008568708346559e-05, - "loss": 1.281, - "step": 4092 - }, - { - "epoch": 1.2859807212831538, - "grad_norm": 2.875, - "learning_rate": 1.5006029831799428e-05, - "loss": 1.2376, - "step": 4094 - }, - { - "epoch": 1.286608948308695, - "grad_norm": 0.76171875, - "learning_rate": 1.5003490955252303e-05, - "loss": 1.2656, - "step": 4096 - }, - { - "epoch": 1.2872371753342364, - "grad_norm": 0.83203125, - "learning_rate": 1.5000952078705173e-05, - "loss": 1.3466, - "step": 4098 - }, - { - "epoch": 1.2878654023597778, - "grad_norm": 0.79296875, - "learning_rate": 1.4998413202158046e-05, - "loss": 1.266, - "step": 4100 - }, - { - "epoch": 1.288493629385319, - "grad_norm": 0.78125, - "learning_rate": 1.4995874325610917e-05, - "loss": 1.283, - "step": 4102 - }, - { - "epoch": 1.2891218564108604, - "grad_norm": 0.74609375, - "learning_rate": 1.499333544906379e-05, - "loss": 1.2892, - "step": 4104 - }, - { - "epoch": 1.289750083436402, - "grad_norm": 0.76953125, - "learning_rate": 1.4990796572516662e-05, - "loss": 1.2792, - "step": 4106 - }, - { - "epoch": 1.2903783104619433, - "grad_norm": 0.7421875, - "learning_rate": 1.4988257695969535e-05, - "loss": 1.3327, - "step": 4108 - }, - { - "epoch": 1.2910065374874846, - "grad_norm": 0.859375, - "learning_rate": 1.4985718819422408e-05, - "loss": 1.1793, - "step": 4110 - }, - { - "epoch": 1.291634764513026, - "grad_norm": 0.75, - "learning_rate": 1.498317994287528e-05, - "loss": 1.3966, - "step": 4112 - }, - { - "epoch": 1.2922629915385673, - "grad_norm": 0.78515625, - "learning_rate": 1.4980641066328152e-05, - "loss": 1.235, - "step": 4114 - }, - { - "epoch": 1.2928912185641086, - "grad_norm": 0.8984375, - "learning_rate": 1.4978102189781024e-05, - "loss": 1.1285, - "step": 4116 - }, - { - "epoch": 1.29351944558965, - "grad_norm": 0.7578125, - "learning_rate": 1.4975563313233897e-05, - "loss": 1.1071, - "step": 4118 - }, - { - "epoch": 1.2941476726151913, - "grad_norm": 0.7421875, - "learning_rate": 1.4973024436686766e-05, - "loss": 1.3497, - "step": 4120 - }, - { - "epoch": 1.2947758996407326, - "grad_norm": 0.89453125, - "learning_rate": 1.4970485560139641e-05, - "loss": 1.2041, - "step": 4122 - }, - { - "epoch": 1.295404126666274, - "grad_norm": 0.8203125, - "learning_rate": 1.4967946683592511e-05, - "loss": 1.2132, - "step": 4124 - }, - { - "epoch": 1.2960323536918152, - "grad_norm": 0.8828125, - "learning_rate": 1.4965407807045384e-05, - "loss": 1.2878, - "step": 4126 - }, - { - "epoch": 1.2966605807173568, - "grad_norm": 0.91015625, - "learning_rate": 1.4962868930498255e-05, - "loss": 1.337, - "step": 4128 - }, - { - "epoch": 1.2972888077428981, - "grad_norm": 0.82421875, - "learning_rate": 1.4960330053951128e-05, - "loss": 1.2482, - "step": 4130 - }, - { - "epoch": 1.2979170347684394, - "grad_norm": 0.78515625, - "learning_rate": 1.4957791177404e-05, - "loss": 1.3392, - "step": 4132 - }, - { - "epoch": 1.2985452617939808, - "grad_norm": 0.76953125, - "learning_rate": 1.4955252300856873e-05, - "loss": 1.3088, - "step": 4134 - }, - { - "epoch": 1.299173488819522, - "grad_norm": 0.78125, - "learning_rate": 1.4952713424309744e-05, - "loss": 1.3334, - "step": 4136 - }, - { - "epoch": 1.2998017158450634, - "grad_norm": 0.91796875, - "learning_rate": 1.4950174547762617e-05, - "loss": 1.1723, - "step": 4138 - }, - { - "epoch": 1.300429942870605, - "grad_norm": 0.78515625, - "learning_rate": 1.4947635671215489e-05, - "loss": 1.3257, - "step": 4140 - }, - { - "epoch": 1.3010581698961463, - "grad_norm": 0.8125, - "learning_rate": 1.4945096794668362e-05, - "loss": 1.3229, - "step": 4142 - }, - { - "epoch": 1.3016863969216876, - "grad_norm": 0.76953125, - "learning_rate": 1.4942557918121231e-05, - "loss": 1.3304, - "step": 4144 - }, - { - "epoch": 1.302314623947229, - "grad_norm": 0.80078125, - "learning_rate": 1.4940019041574105e-05, - "loss": 1.3206, - "step": 4146 - }, - { - "epoch": 1.3029428509727703, - "grad_norm": 0.796875, - "learning_rate": 1.4937480165026976e-05, - "loss": 1.1349, - "step": 4148 - }, - { - "epoch": 1.3035710779983116, - "grad_norm": 0.8046875, - "learning_rate": 1.4934941288479849e-05, - "loss": 1.2338, - "step": 4150 - }, - { - "epoch": 1.304199305023853, - "grad_norm": 0.82421875, - "learning_rate": 1.493240241193272e-05, - "loss": 1.1981, - "step": 4152 - }, - { - "epoch": 1.3048275320493943, - "grad_norm": 0.96875, - "learning_rate": 1.4929863535385593e-05, - "loss": 1.2488, - "step": 4154 - }, - { - "epoch": 1.3054557590749356, - "grad_norm": 0.75390625, - "learning_rate": 1.4927324658838465e-05, - "loss": 1.1739, - "step": 4156 - }, - { - "epoch": 1.306083986100477, - "grad_norm": 0.703125, - "learning_rate": 1.4924785782291338e-05, - "loss": 1.1962, - "step": 4158 - }, - { - "epoch": 1.3067122131260185, - "grad_norm": 0.875, - "learning_rate": 1.492224690574421e-05, - "loss": 1.2288, - "step": 4160 - }, - { - "epoch": 1.3073404401515598, - "grad_norm": 0.81640625, - "learning_rate": 1.4919708029197082e-05, - "loss": 1.2643, - "step": 4162 - }, - { - "epoch": 1.3079686671771011, - "grad_norm": 0.859375, - "learning_rate": 1.4917169152649952e-05, - "loss": 1.3054, - "step": 4164 - }, - { - "epoch": 1.3085968942026425, - "grad_norm": 0.7265625, - "learning_rate": 1.4914630276102827e-05, - "loss": 1.135, - "step": 4166 - }, - { - "epoch": 1.3092251212281838, - "grad_norm": 0.92578125, - "learning_rate": 1.4912091399555697e-05, - "loss": 1.1145, - "step": 4168 - }, - { - "epoch": 1.3098533482537251, - "grad_norm": 0.78125, - "learning_rate": 1.490955252300857e-05, - "loss": 1.3165, - "step": 4170 - }, - { - "epoch": 1.3104815752792667, - "grad_norm": 0.8828125, - "learning_rate": 1.4907013646461441e-05, - "loss": 1.2578, - "step": 4172 - }, - { - "epoch": 1.311109802304808, - "grad_norm": 0.84375, - "learning_rate": 1.4904474769914314e-05, - "loss": 1.3692, - "step": 4174 - }, - { - "epoch": 1.3117380293303493, - "grad_norm": 0.8359375, - "learning_rate": 1.4901935893367185e-05, - "loss": 1.3078, - "step": 4176 - }, - { - "epoch": 1.3123662563558907, - "grad_norm": 0.9296875, - "learning_rate": 1.4899397016820059e-05, - "loss": 1.2137, - "step": 4178 - }, - { - "epoch": 1.312994483381432, - "grad_norm": 0.84375, - "learning_rate": 1.489685814027293e-05, - "loss": 1.2295, - "step": 4180 - }, - { - "epoch": 1.3136227104069733, - "grad_norm": 0.85546875, - "learning_rate": 1.4894319263725803e-05, - "loss": 1.1217, - "step": 4182 - }, - { - "epoch": 1.3142509374325146, - "grad_norm": 0.80859375, - "learning_rate": 1.4891780387178674e-05, - "loss": 1.2411, - "step": 4184 - }, - { - "epoch": 1.314879164458056, - "grad_norm": 0.78125, - "learning_rate": 1.4889241510631547e-05, - "loss": 1.3081, - "step": 4186 - }, - { - "epoch": 1.3155073914835973, - "grad_norm": 0.828125, - "learning_rate": 1.4886702634084417e-05, - "loss": 1.3283, - "step": 4188 - }, - { - "epoch": 1.3161356185091386, - "grad_norm": 0.76171875, - "learning_rate": 1.488416375753729e-05, - "loss": 1.2081, - "step": 4190 - }, - { - "epoch": 1.3167638455346802, - "grad_norm": 0.7734375, - "learning_rate": 1.4881624880990162e-05, - "loss": 1.2085, - "step": 4192 - }, - { - "epoch": 1.3173920725602215, - "grad_norm": 0.765625, - "learning_rate": 1.4879086004443035e-05, - "loss": 1.3019, - "step": 4194 - }, - { - "epoch": 1.3180202995857628, - "grad_norm": 0.75390625, - "learning_rate": 1.4876547127895908e-05, - "loss": 1.2424, - "step": 4196 - }, - { - "epoch": 1.3186485266113042, - "grad_norm": 0.76171875, - "learning_rate": 1.4874008251348779e-05, - "loss": 1.2922, - "step": 4198 - }, - { - "epoch": 1.3192767536368455, - "grad_norm": 0.7890625, - "learning_rate": 1.4871469374801652e-05, - "loss": 1.262, - "step": 4200 - }, - { - "epoch": 1.3199049806623868, - "grad_norm": 0.7265625, - "learning_rate": 1.4868930498254524e-05, - "loss": 1.2194, - "step": 4202 - }, - { - "epoch": 1.3205332076879281, - "grad_norm": 0.75, - "learning_rate": 1.4866391621707397e-05, - "loss": 1.1209, - "step": 4204 - }, - { - "epoch": 1.3211614347134697, - "grad_norm": 0.80078125, - "learning_rate": 1.4863852745160268e-05, - "loss": 1.3312, - "step": 4206 - }, - { - "epoch": 1.321789661739011, - "grad_norm": 0.86328125, - "learning_rate": 1.4861313868613141e-05, - "loss": 1.2484, - "step": 4208 - }, - { - "epoch": 1.3224178887645524, - "grad_norm": 0.87890625, - "learning_rate": 1.4858774992066013e-05, - "loss": 1.2436, - "step": 4210 - }, - { - "epoch": 1.3230461157900937, - "grad_norm": 0.7890625, - "learning_rate": 1.4856236115518886e-05, - "loss": 1.2811, - "step": 4212 - }, - { - "epoch": 1.323674342815635, - "grad_norm": 2.375, - "learning_rate": 1.4853697238971755e-05, - "loss": 1.0794, - "step": 4214 - }, - { - "epoch": 1.3243025698411763, - "grad_norm": 0.82421875, - "learning_rate": 1.4851158362424628e-05, - "loss": 1.2972, - "step": 4216 - }, - { - "epoch": 1.3249307968667177, - "grad_norm": 0.7734375, - "learning_rate": 1.48486194858775e-05, - "loss": 1.2615, - "step": 4218 - }, - { - "epoch": 1.325559023892259, - "grad_norm": 0.7734375, - "learning_rate": 1.4846080609330373e-05, - "loss": 1.2853, - "step": 4220 - }, - { - "epoch": 1.3261872509178003, - "grad_norm": 0.83984375, - "learning_rate": 1.4843541732783244e-05, - "loss": 1.2105, - "step": 4222 - }, - { - "epoch": 1.3268154779433416, - "grad_norm": 0.87109375, - "learning_rate": 1.4841002856236117e-05, - "loss": 1.0318, - "step": 4224 - }, - { - "epoch": 1.3274437049688832, - "grad_norm": 0.7578125, - "learning_rate": 1.4838463979688989e-05, - "loss": 1.2111, - "step": 4226 - }, - { - "epoch": 1.3280719319944245, - "grad_norm": 0.86328125, - "learning_rate": 1.4835925103141862e-05, - "loss": 1.0525, - "step": 4228 - }, - { - "epoch": 1.3287001590199659, - "grad_norm": 0.78515625, - "learning_rate": 1.4833386226594733e-05, - "loss": 1.2753, - "step": 4230 - }, - { - "epoch": 1.3293283860455072, - "grad_norm": 0.83203125, - "learning_rate": 1.4830847350047606e-05, - "loss": 1.1766, - "step": 4232 - }, - { - "epoch": 1.3299566130710485, - "grad_norm": 0.765625, - "learning_rate": 1.4828308473500478e-05, - "loss": 1.3159, - "step": 4234 - }, - { - "epoch": 1.3305848400965898, - "grad_norm": 0.7890625, - "learning_rate": 1.482576959695335e-05, - "loss": 1.1587, - "step": 4236 - }, - { - "epoch": 1.3312130671221314, - "grad_norm": 0.8203125, - "learning_rate": 1.482323072040622e-05, - "loss": 1.1362, - "step": 4238 - }, - { - "epoch": 1.3318412941476727, - "grad_norm": 0.9140625, - "learning_rate": 1.4820691843859093e-05, - "loss": 1.2305, - "step": 4240 - }, - { - "epoch": 1.332469521173214, - "grad_norm": 0.77734375, - "learning_rate": 1.4818152967311965e-05, - "loss": 1.163, - "step": 4242 - }, - { - "epoch": 1.3330977481987554, - "grad_norm": 0.80078125, - "learning_rate": 1.4815614090764838e-05, - "loss": 1.2654, - "step": 4244 - }, - { - "epoch": 1.3337259752242967, - "grad_norm": 0.76171875, - "learning_rate": 1.481307521421771e-05, - "loss": 1.1764, - "step": 4246 - }, - { - "epoch": 1.334354202249838, - "grad_norm": 0.7890625, - "learning_rate": 1.4810536337670582e-05, - "loss": 1.2239, - "step": 4248 - }, - { - "epoch": 1.3349824292753794, - "grad_norm": 0.77734375, - "learning_rate": 1.4807997461123454e-05, - "loss": 1.2153, - "step": 4250 - }, - { - "epoch": 1.3356106563009207, - "grad_norm": 0.73046875, - "learning_rate": 1.4805458584576327e-05, - "loss": 1.1689, - "step": 4252 - }, - { - "epoch": 1.336238883326462, - "grad_norm": 0.8671875, - "learning_rate": 1.4802919708029198e-05, - "loss": 1.2603, - "step": 4254 - }, - { - "epoch": 1.3368671103520033, - "grad_norm": 0.8046875, - "learning_rate": 1.4800380831482071e-05, - "loss": 1.1904, - "step": 4256 - }, - { - "epoch": 1.337495337377545, - "grad_norm": 0.7578125, - "learning_rate": 1.4797841954934941e-05, - "loss": 1.4012, - "step": 4258 - }, - { - "epoch": 1.3381235644030862, - "grad_norm": 0.9453125, - "learning_rate": 1.4795303078387816e-05, - "loss": 1.2366, - "step": 4260 - }, - { - "epoch": 1.3387517914286275, - "grad_norm": 0.7734375, - "learning_rate": 1.4792764201840685e-05, - "loss": 1.2833, - "step": 4262 - }, - { - "epoch": 1.3393800184541689, - "grad_norm": 0.7578125, - "learning_rate": 1.4790225325293558e-05, - "loss": 1.3163, - "step": 4264 - }, - { - "epoch": 1.3400082454797102, - "grad_norm": 0.80859375, - "learning_rate": 1.478768644874643e-05, - "loss": 1.2823, - "step": 4266 - }, - { - "epoch": 1.3406364725052515, - "grad_norm": 0.8515625, - "learning_rate": 1.4785147572199303e-05, - "loss": 1.1118, - "step": 4268 - }, - { - "epoch": 1.3412646995307929, - "grad_norm": 0.7578125, - "learning_rate": 1.4782608695652174e-05, - "loss": 1.1947, - "step": 4270 - }, - { - "epoch": 1.3418929265563344, - "grad_norm": 0.71875, - "learning_rate": 1.4780069819105047e-05, - "loss": 1.3296, - "step": 4272 - }, - { - "epoch": 1.3425211535818757, - "grad_norm": 0.7578125, - "learning_rate": 1.4777530942557919e-05, - "loss": 1.3039, - "step": 4274 - }, - { - "epoch": 1.343149380607417, - "grad_norm": 0.9609375, - "learning_rate": 1.4774992066010792e-05, - "loss": 1.1281, - "step": 4276 - }, - { - "epoch": 1.3437776076329584, - "grad_norm": 0.79296875, - "learning_rate": 1.4772453189463663e-05, - "loss": 1.2095, - "step": 4278 - }, - { - "epoch": 1.3444058346584997, - "grad_norm": 0.796875, - "learning_rate": 1.4769914312916536e-05, - "loss": 1.2037, - "step": 4280 - }, - { - "epoch": 1.345034061684041, - "grad_norm": 0.79296875, - "learning_rate": 1.476737543636941e-05, - "loss": 1.2451, - "step": 4282 - }, - { - "epoch": 1.3456622887095824, - "grad_norm": 0.76953125, - "learning_rate": 1.4764836559822279e-05, - "loss": 1.2812, - "step": 4284 - }, - { - "epoch": 1.3462905157351237, - "grad_norm": 0.8125, - "learning_rate": 1.4762297683275154e-05, - "loss": 1.1563, - "step": 4286 - }, - { - "epoch": 1.346918742760665, - "grad_norm": 0.74609375, - "learning_rate": 1.4759758806728024e-05, - "loss": 1.2046, - "step": 4288 - }, - { - "epoch": 1.3475469697862064, - "grad_norm": 0.8046875, - "learning_rate": 1.4757219930180897e-05, - "loss": 1.1475, - "step": 4290 - }, - { - "epoch": 1.348175196811748, - "grad_norm": 0.84375, - "learning_rate": 1.4754681053633768e-05, - "loss": 1.19, - "step": 4292 - }, - { - "epoch": 1.3488034238372892, - "grad_norm": 0.83203125, - "learning_rate": 1.4752142177086641e-05, - "loss": 1.1384, - "step": 4294 - }, - { - "epoch": 1.3494316508628306, - "grad_norm": 0.75390625, - "learning_rate": 1.4749603300539512e-05, - "loss": 1.3681, - "step": 4296 - }, - { - "epoch": 1.350059877888372, - "grad_norm": 0.77734375, - "learning_rate": 1.4747064423992386e-05, - "loss": 1.4912, - "step": 4298 - }, - { - "epoch": 1.3506881049139132, - "grad_norm": 0.796875, - "learning_rate": 1.4744525547445257e-05, - "loss": 1.2292, - "step": 4300 - }, - { - "epoch": 1.3513163319394546, - "grad_norm": 0.80859375, - "learning_rate": 1.474198667089813e-05, - "loss": 1.2261, - "step": 4302 - }, - { - "epoch": 1.351944558964996, - "grad_norm": 0.7578125, - "learning_rate": 1.4739447794351001e-05, - "loss": 1.3621, - "step": 4304 - }, - { - "epoch": 1.3525727859905374, - "grad_norm": 0.8515625, - "learning_rate": 1.4736908917803874e-05, - "loss": 1.1424, - "step": 4306 - }, - { - "epoch": 1.3532010130160788, - "grad_norm": 0.8203125, - "learning_rate": 1.4734370041256744e-05, - "loss": 1.1803, - "step": 4308 - }, - { - "epoch": 1.35382924004162, - "grad_norm": 0.73828125, - "learning_rate": 1.4731831164709617e-05, - "loss": 1.2921, - "step": 4310 - }, - { - "epoch": 1.3544574670671614, - "grad_norm": 0.77734375, - "learning_rate": 1.4729292288162489e-05, - "loss": 1.359, - "step": 4312 - }, - { - "epoch": 1.3550856940927027, - "grad_norm": 0.7578125, - "learning_rate": 1.4726753411615362e-05, - "loss": 1.2099, - "step": 4314 - }, - { - "epoch": 1.355713921118244, - "grad_norm": 0.73046875, - "learning_rate": 1.4724214535068233e-05, - "loss": 1.3183, - "step": 4316 - }, - { - "epoch": 1.3563421481437854, - "grad_norm": 0.80078125, - "learning_rate": 1.4721675658521106e-05, - "loss": 1.3491, - "step": 4318 - }, - { - "epoch": 1.3569703751693267, - "grad_norm": 0.8203125, - "learning_rate": 1.4719136781973978e-05, - "loss": 1.239, - "step": 4320 - }, - { - "epoch": 1.357598602194868, - "grad_norm": 0.80859375, - "learning_rate": 1.471659790542685e-05, - "loss": 1.2861, - "step": 4322 - }, - { - "epoch": 1.3582268292204096, - "grad_norm": 0.79296875, - "learning_rate": 1.4714059028879722e-05, - "loss": 1.2679, - "step": 4324 - }, - { - "epoch": 1.358855056245951, - "grad_norm": 0.72265625, - "learning_rate": 1.4711520152332595e-05, - "loss": 1.2569, - "step": 4326 - }, - { - "epoch": 1.3594832832714923, - "grad_norm": 0.7890625, - "learning_rate": 1.4708981275785465e-05, - "loss": 1.2359, - "step": 4328 - }, - { - "epoch": 1.3601115102970336, - "grad_norm": 0.85546875, - "learning_rate": 1.470644239923834e-05, - "loss": 1.2838, - "step": 4330 - }, - { - "epoch": 1.360739737322575, - "grad_norm": 0.8359375, - "learning_rate": 1.470390352269121e-05, - "loss": 1.302, - "step": 4332 - }, - { - "epoch": 1.3613679643481162, - "grad_norm": 0.83984375, - "learning_rate": 1.4701364646144082e-05, - "loss": 1.3001, - "step": 4334 - }, - { - "epoch": 1.3619961913736578, - "grad_norm": 0.828125, - "learning_rate": 1.4698825769596954e-05, - "loss": 1.2303, - "step": 4336 - }, - { - "epoch": 1.3626244183991991, - "grad_norm": 0.83984375, - "learning_rate": 1.4696286893049827e-05, - "loss": 1.2391, - "step": 4338 - }, - { - "epoch": 1.3632526454247405, - "grad_norm": 0.8515625, - "learning_rate": 1.4693748016502698e-05, - "loss": 1.2481, - "step": 4340 - }, - { - "epoch": 1.3638808724502818, - "grad_norm": 0.76171875, - "learning_rate": 1.4691209139955571e-05, - "loss": 1.2994, - "step": 4342 - }, - { - "epoch": 1.364509099475823, - "grad_norm": 0.96484375, - "learning_rate": 1.4688670263408443e-05, - "loss": 1.392, - "step": 4344 - }, - { - "epoch": 1.3651373265013644, - "grad_norm": 0.83984375, - "learning_rate": 1.4686131386861316e-05, - "loss": 1.3212, - "step": 4346 - }, - { - "epoch": 1.3657655535269058, - "grad_norm": 0.81640625, - "learning_rate": 1.4683592510314187e-05, - "loss": 1.3631, - "step": 4348 - }, - { - "epoch": 1.366393780552447, - "grad_norm": 0.79296875, - "learning_rate": 1.468105363376706e-05, - "loss": 1.2164, - "step": 4350 - }, - { - "epoch": 1.3670220075779884, - "grad_norm": 0.82421875, - "learning_rate": 1.467851475721993e-05, - "loss": 1.1514, - "step": 4352 - }, - { - "epoch": 1.3676502346035297, - "grad_norm": 0.79296875, - "learning_rate": 1.4675975880672803e-05, - "loss": 1.3862, - "step": 4354 - }, - { - "epoch": 1.368278461629071, - "grad_norm": 0.890625, - "learning_rate": 1.4673437004125674e-05, - "loss": 1.2115, - "step": 4356 - }, - { - "epoch": 1.3689066886546126, - "grad_norm": 0.91796875, - "learning_rate": 1.4670898127578547e-05, - "loss": 1.1759, - "step": 4358 - }, - { - "epoch": 1.369534915680154, - "grad_norm": 0.78515625, - "learning_rate": 1.4668359251031419e-05, - "loss": 1.2677, - "step": 4360 - }, - { - "epoch": 1.3701631427056953, - "grad_norm": 0.79296875, - "learning_rate": 1.4665820374484292e-05, - "loss": 1.287, - "step": 4362 - }, - { - "epoch": 1.3707913697312366, - "grad_norm": 0.76171875, - "learning_rate": 1.4663281497937163e-05, - "loss": 1.3592, - "step": 4364 - }, - { - "epoch": 1.371419596756778, - "grad_norm": 0.828125, - "learning_rate": 1.4660742621390036e-05, - "loss": 1.2563, - "step": 4366 - }, - { - "epoch": 1.3720478237823193, - "grad_norm": 0.76953125, - "learning_rate": 1.465820374484291e-05, - "loss": 1.1968, - "step": 4368 - }, - { - "epoch": 1.3726760508078608, - "grad_norm": 0.78125, - "learning_rate": 1.465566486829578e-05, - "loss": 1.3271, - "step": 4370 - }, - { - "epoch": 1.3733042778334021, - "grad_norm": 0.74609375, - "learning_rate": 1.4653125991748654e-05, - "loss": 1.304, - "step": 4372 - }, - { - "epoch": 1.3739325048589435, - "grad_norm": 0.7421875, - "learning_rate": 1.4650587115201525e-05, - "loss": 1.3881, - "step": 4374 - }, - { - "epoch": 1.3745607318844848, - "grad_norm": 0.765625, - "learning_rate": 1.4648048238654398e-05, - "loss": 1.2676, - "step": 4376 - }, - { - "epoch": 1.3751889589100261, - "grad_norm": 0.76171875, - "learning_rate": 1.4645509362107268e-05, - "loss": 1.2412, - "step": 4378 - }, - { - "epoch": 1.3758171859355675, - "grad_norm": 0.921875, - "learning_rate": 1.4642970485560141e-05, - "loss": 1.1142, - "step": 4380 - }, - { - "epoch": 1.3764454129611088, - "grad_norm": 0.78515625, - "learning_rate": 1.4640431609013012e-05, - "loss": 1.2977, - "step": 4382 - }, - { - "epoch": 1.37707363998665, - "grad_norm": 0.76953125, - "learning_rate": 1.4637892732465885e-05, - "loss": 1.1754, - "step": 4384 - }, - { - "epoch": 1.3777018670121914, - "grad_norm": 0.7890625, - "learning_rate": 1.4635353855918757e-05, - "loss": 1.3088, - "step": 4386 - }, - { - "epoch": 1.3783300940377328, - "grad_norm": 0.79296875, - "learning_rate": 1.463281497937163e-05, - "loss": 1.359, - "step": 4388 - }, - { - "epoch": 1.3789583210632743, - "grad_norm": 0.7578125, - "learning_rate": 1.4630276102824501e-05, - "loss": 1.1869, - "step": 4390 - }, - { - "epoch": 1.3795865480888156, - "grad_norm": 0.7265625, - "learning_rate": 1.4627737226277374e-05, - "loss": 1.3206, - "step": 4392 - }, - { - "epoch": 1.380214775114357, - "grad_norm": 0.7109375, - "learning_rate": 1.4625198349730246e-05, - "loss": 1.308, - "step": 4394 - }, - { - "epoch": 1.3808430021398983, - "grad_norm": 0.77734375, - "learning_rate": 1.4622659473183119e-05, - "loss": 1.2614, - "step": 4396 - }, - { - "epoch": 1.3814712291654396, - "grad_norm": 0.8671875, - "learning_rate": 1.462012059663599e-05, - "loss": 1.1872, - "step": 4398 - }, - { - "epoch": 1.382099456190981, - "grad_norm": 0.82421875, - "learning_rate": 1.4617581720088863e-05, - "loss": 1.2658, - "step": 4400 - }, - { - "epoch": 1.3827276832165225, - "grad_norm": 0.82421875, - "learning_rate": 1.4615042843541733e-05, - "loss": 1.3195, - "step": 4402 - }, - { - "epoch": 1.3833559102420638, - "grad_norm": 0.83984375, - "learning_rate": 1.4612503966994606e-05, - "loss": 1.1573, - "step": 4404 - }, - { - "epoch": 1.3839841372676052, - "grad_norm": 0.85546875, - "learning_rate": 1.4609965090447477e-05, - "loss": 1.2768, - "step": 4406 - }, - { - "epoch": 1.3846123642931465, - "grad_norm": 0.83984375, - "learning_rate": 1.460742621390035e-05, - "loss": 1.2157, - "step": 4408 - }, - { - "epoch": 1.3852405913186878, - "grad_norm": 0.76171875, - "learning_rate": 1.4604887337353222e-05, - "loss": 1.1608, - "step": 4410 - }, - { - "epoch": 1.3858688183442291, - "grad_norm": 0.8984375, - "learning_rate": 1.4602348460806095e-05, - "loss": 1.1619, - "step": 4412 - }, - { - "epoch": 1.3864970453697705, - "grad_norm": 0.73828125, - "learning_rate": 1.4599809584258966e-05, - "loss": 1.1692, - "step": 4414 - }, - { - "epoch": 1.3871252723953118, - "grad_norm": 0.76171875, - "learning_rate": 1.459727070771184e-05, - "loss": 1.3201, - "step": 4416 - }, - { - "epoch": 1.3877534994208531, - "grad_norm": 0.890625, - "learning_rate": 1.459473183116471e-05, - "loss": 1.2667, - "step": 4418 - }, - { - "epoch": 1.3883817264463945, - "grad_norm": 0.84375, - "learning_rate": 1.4592192954617584e-05, - "loss": 1.3936, - "step": 4420 - }, - { - "epoch": 1.3890099534719358, - "grad_norm": 0.796875, - "learning_rate": 1.4589654078070454e-05, - "loss": 1.271, - "step": 4422 - }, - { - "epoch": 1.3896381804974773, - "grad_norm": 0.84375, - "learning_rate": 1.4587115201523328e-05, - "loss": 1.2298, - "step": 4424 - }, - { - "epoch": 1.3902664075230187, - "grad_norm": 0.7421875, - "learning_rate": 1.4584576324976198e-05, - "loss": 1.3023, - "step": 4426 - }, - { - "epoch": 1.39089463454856, - "grad_norm": 0.78125, - "learning_rate": 1.4582037448429071e-05, - "loss": 1.0285, - "step": 4428 - }, - { - "epoch": 1.3915228615741013, - "grad_norm": 0.90234375, - "learning_rate": 1.4579498571881942e-05, - "loss": 1.345, - "step": 4430 - }, - { - "epoch": 1.3921510885996426, - "grad_norm": 0.76953125, - "learning_rate": 1.4576959695334816e-05, - "loss": 1.3171, - "step": 4432 - }, - { - "epoch": 1.392779315625184, - "grad_norm": 0.83203125, - "learning_rate": 1.4574420818787687e-05, - "loss": 1.1312, - "step": 4434 - }, - { - "epoch": 1.3934075426507255, - "grad_norm": 0.80078125, - "learning_rate": 1.457188194224056e-05, - "loss": 1.281, - "step": 4436 - }, - { - "epoch": 1.3940357696762669, - "grad_norm": 0.8046875, - "learning_rate": 1.4569343065693431e-05, - "loss": 1.3061, - "step": 4438 - }, - { - "epoch": 1.3946639967018082, - "grad_norm": 0.90625, - "learning_rate": 1.4566804189146304e-05, - "loss": 1.2167, - "step": 4440 - }, - { - "epoch": 1.3952922237273495, - "grad_norm": 0.80859375, - "learning_rate": 1.4564265312599176e-05, - "loss": 1.2837, - "step": 4442 - }, - { - "epoch": 1.3959204507528908, - "grad_norm": 0.9140625, - "learning_rate": 1.4561726436052049e-05, - "loss": 1.1497, - "step": 4444 - }, - { - "epoch": 1.3965486777784322, - "grad_norm": 1.1015625, - "learning_rate": 1.4559187559504919e-05, - "loss": 1.2668, - "step": 4446 - }, - { - "epoch": 1.3971769048039735, - "grad_norm": 0.81640625, - "learning_rate": 1.4556648682957792e-05, - "loss": 1.2495, - "step": 4448 - }, - { - "epoch": 1.3978051318295148, - "grad_norm": 0.73046875, - "learning_rate": 1.4554109806410663e-05, - "loss": 1.3183, - "step": 4450 - }, - { - "epoch": 1.3984333588550562, - "grad_norm": 0.88671875, - "learning_rate": 1.4551570929863536e-05, - "loss": 1.2387, - "step": 4452 - }, - { - "epoch": 1.3990615858805975, - "grad_norm": 0.84375, - "learning_rate": 1.454903205331641e-05, - "loss": 1.3808, - "step": 4454 - }, - { - "epoch": 1.399689812906139, - "grad_norm": 0.796875, - "learning_rate": 1.454649317676928e-05, - "loss": 1.2578, - "step": 4456 - }, - { - "epoch": 1.4003180399316804, - "grad_norm": 0.8125, - "learning_rate": 1.4543954300222154e-05, - "loss": 1.3209, - "step": 4458 - }, - { - "epoch": 1.4009462669572217, - "grad_norm": 0.83984375, - "learning_rate": 1.4541415423675025e-05, - "loss": 1.2369, - "step": 4460 - }, - { - "epoch": 1.401574493982763, - "grad_norm": 0.765625, - "learning_rate": 1.4538876547127898e-05, - "loss": 1.2486, - "step": 4462 - }, - { - "epoch": 1.4022027210083043, - "grad_norm": 0.7421875, - "learning_rate": 1.453633767058077e-05, - "loss": 1.1907, - "step": 4464 - }, - { - "epoch": 1.4028309480338457, - "grad_norm": 0.74609375, - "learning_rate": 1.4533798794033643e-05, - "loss": 1.1616, - "step": 4466 - }, - { - "epoch": 1.4034591750593872, - "grad_norm": 0.84375, - "learning_rate": 1.4531259917486514e-05, - "loss": 1.1429, - "step": 4468 - }, - { - "epoch": 1.4040874020849285, - "grad_norm": 0.8125, - "learning_rate": 1.4528721040939387e-05, - "loss": 1.2914, - "step": 4470 - }, - { - "epoch": 1.4047156291104699, - "grad_norm": 0.89453125, - "learning_rate": 1.4526182164392257e-05, - "loss": 1.3573, - "step": 4472 - }, - { - "epoch": 1.4053438561360112, - "grad_norm": 0.8125, - "learning_rate": 1.452364328784513e-05, - "loss": 1.258, - "step": 4474 - }, - { - "epoch": 1.4059720831615525, - "grad_norm": 0.76171875, - "learning_rate": 1.4521104411298001e-05, - "loss": 1.3615, - "step": 4476 - }, - { - "epoch": 1.4066003101870939, - "grad_norm": 0.734375, - "learning_rate": 1.4518565534750874e-05, - "loss": 1.2793, - "step": 4478 - }, - { - "epoch": 1.4072285372126352, - "grad_norm": 0.75390625, - "learning_rate": 1.4516026658203746e-05, - "loss": 1.2621, - "step": 4480 - }, - { - "epoch": 1.4078567642381765, - "grad_norm": 0.734375, - "learning_rate": 1.4513487781656619e-05, - "loss": 1.2854, - "step": 4482 - }, - { - "epoch": 1.4084849912637178, - "grad_norm": 0.80859375, - "learning_rate": 1.451094890510949e-05, - "loss": 1.2765, - "step": 4484 - }, - { - "epoch": 1.4091132182892592, - "grad_norm": 0.7734375, - "learning_rate": 1.4508410028562363e-05, - "loss": 1.2565, - "step": 4486 - }, - { - "epoch": 1.4097414453148005, - "grad_norm": 0.83203125, - "learning_rate": 1.4505871152015235e-05, - "loss": 1.1619, - "step": 4488 - }, - { - "epoch": 1.410369672340342, - "grad_norm": 0.73828125, - "learning_rate": 1.4503332275468108e-05, - "loss": 1.2787, - "step": 4490 - }, - { - "epoch": 1.4109978993658834, - "grad_norm": 0.7890625, - "learning_rate": 1.4500793398920977e-05, - "loss": 1.1383, - "step": 4492 - }, - { - "epoch": 1.4116261263914247, - "grad_norm": 0.84375, - "learning_rate": 1.4498254522373852e-05, - "loss": 1.2907, - "step": 4494 - }, - { - "epoch": 1.412254353416966, - "grad_norm": 0.7734375, - "learning_rate": 1.4495715645826722e-05, - "loss": 1.2964, - "step": 4496 - }, - { - "epoch": 1.4128825804425074, - "grad_norm": 0.78125, - "learning_rate": 1.4493176769279595e-05, - "loss": 1.4016, - "step": 4498 - }, - { - "epoch": 1.4135108074680487, - "grad_norm": 0.83203125, - "learning_rate": 1.4490637892732466e-05, - "loss": 1.1816, - "step": 4500 - }, - { - "epoch": 1.4141390344935902, - "grad_norm": 0.796875, - "learning_rate": 1.448809901618534e-05, - "loss": 1.2369, - "step": 4502 - }, - { - "epoch": 1.4147672615191316, - "grad_norm": 0.8515625, - "learning_rate": 1.448556013963821e-05, - "loss": 1.2545, - "step": 4504 - }, - { - "epoch": 1.415395488544673, - "grad_norm": 0.76171875, - "learning_rate": 1.4483021263091084e-05, - "loss": 1.3477, - "step": 4506 - }, - { - "epoch": 1.4160237155702142, - "grad_norm": 0.8671875, - "learning_rate": 1.4480482386543955e-05, - "loss": 1.2051, - "step": 4508 - }, - { - "epoch": 1.4166519425957556, - "grad_norm": 0.890625, - "learning_rate": 1.4477943509996828e-05, - "loss": 1.1868, - "step": 4510 - }, - { - "epoch": 1.4172801696212969, - "grad_norm": 0.91015625, - "learning_rate": 1.44754046334497e-05, - "loss": 1.2113, - "step": 4512 - }, - { - "epoch": 1.4179083966468382, - "grad_norm": 0.78125, - "learning_rate": 1.4472865756902573e-05, - "loss": 1.2932, - "step": 4514 - }, - { - "epoch": 1.4185366236723795, - "grad_norm": 0.80859375, - "learning_rate": 1.4470326880355442e-05, - "loss": 1.2502, - "step": 4516 - }, - { - "epoch": 1.4191648506979209, - "grad_norm": 0.8515625, - "learning_rate": 1.4467788003808315e-05, - "loss": 1.3597, - "step": 4518 - }, - { - "epoch": 1.4197930777234622, - "grad_norm": 0.796875, - "learning_rate": 1.4465249127261187e-05, - "loss": 1.2579, - "step": 4520 - }, - { - "epoch": 1.4204213047490037, - "grad_norm": 0.765625, - "learning_rate": 1.446271025071406e-05, - "loss": 1.315, - "step": 4522 - }, - { - "epoch": 1.421049531774545, - "grad_norm": 0.80078125, - "learning_rate": 1.4460171374166931e-05, - "loss": 1.1785, - "step": 4524 - }, - { - "epoch": 1.4216777588000864, - "grad_norm": 0.75, - "learning_rate": 1.4457632497619804e-05, - "loss": 1.2612, - "step": 4526 - }, - { - "epoch": 1.4223059858256277, - "grad_norm": 0.76953125, - "learning_rate": 1.4455093621072676e-05, - "loss": 1.2523, - "step": 4528 - }, - { - "epoch": 1.422934212851169, - "grad_norm": 0.83984375, - "learning_rate": 1.4452554744525549e-05, - "loss": 1.288, - "step": 4530 - }, - { - "epoch": 1.4235624398767104, - "grad_norm": 0.7421875, - "learning_rate": 1.445001586797842e-05, - "loss": 1.2778, - "step": 4532 - }, - { - "epoch": 1.424190666902252, - "grad_norm": 0.8203125, - "learning_rate": 1.4447476991431293e-05, - "loss": 1.3666, - "step": 4534 - }, - { - "epoch": 1.4248188939277933, - "grad_norm": 0.8671875, - "learning_rate": 1.4444938114884165e-05, - "loss": 1.2365, - "step": 4536 - }, - { - "epoch": 1.4254471209533346, - "grad_norm": 0.81640625, - "learning_rate": 1.4442399238337038e-05, - "loss": 1.4104, - "step": 4538 - }, - { - "epoch": 1.426075347978876, - "grad_norm": 0.78125, - "learning_rate": 1.4439860361789911e-05, - "loss": 1.3859, - "step": 4540 - }, - { - "epoch": 1.4267035750044172, - "grad_norm": 0.77734375, - "learning_rate": 1.443732148524278e-05, - "loss": 1.2868, - "step": 4542 - }, - { - "epoch": 1.4273318020299586, - "grad_norm": 0.91796875, - "learning_rate": 1.4434782608695654e-05, - "loss": 1.2102, - "step": 4544 - }, - { - "epoch": 1.4279600290555, - "grad_norm": 0.81640625, - "learning_rate": 1.4432243732148525e-05, - "loss": 1.4068, - "step": 4546 - }, - { - "epoch": 1.4285882560810412, - "grad_norm": 0.75, - "learning_rate": 1.4429704855601398e-05, - "loss": 1.1468, - "step": 4548 - }, - { - "epoch": 1.4292164831065826, - "grad_norm": 0.74609375, - "learning_rate": 1.442716597905427e-05, - "loss": 1.114, - "step": 4550 - }, - { - "epoch": 1.4298447101321239, - "grad_norm": 0.7578125, - "learning_rate": 1.4424627102507143e-05, - "loss": 1.1786, - "step": 4552 - }, - { - "epoch": 1.4304729371576652, - "grad_norm": 0.8515625, - "learning_rate": 1.4422088225960014e-05, - "loss": 1.2882, - "step": 4554 - }, - { - "epoch": 1.4311011641832068, - "grad_norm": 0.875, - "learning_rate": 1.4419549349412887e-05, - "loss": 1.2154, - "step": 4556 - }, - { - "epoch": 1.431729391208748, - "grad_norm": 0.91015625, - "learning_rate": 1.4417010472865758e-05, - "loss": 1.3404, - "step": 4558 - }, - { - "epoch": 1.4323576182342894, - "grad_norm": 0.83203125, - "learning_rate": 1.4414471596318631e-05, - "loss": 1.17, - "step": 4560 - }, - { - "epoch": 1.4329858452598307, - "grad_norm": 0.7578125, - "learning_rate": 1.4411932719771503e-05, - "loss": 1.2197, - "step": 4562 - }, - { - "epoch": 1.433614072285372, - "grad_norm": 0.8515625, - "learning_rate": 1.4409393843224376e-05, - "loss": 1.2704, - "step": 4564 - }, - { - "epoch": 1.4342422993109134, - "grad_norm": 0.73828125, - "learning_rate": 1.4406854966677246e-05, - "loss": 1.217, - "step": 4566 - }, - { - "epoch": 1.434870526336455, - "grad_norm": 0.796875, - "learning_rate": 1.4404316090130119e-05, - "loss": 1.158, - "step": 4568 - }, - { - "epoch": 1.4354987533619963, - "grad_norm": 0.78125, - "learning_rate": 1.440177721358299e-05, - "loss": 1.2378, - "step": 4570 - }, - { - "epoch": 1.4361269803875376, - "grad_norm": 0.7421875, - "learning_rate": 1.4399238337035863e-05, - "loss": 1.1346, - "step": 4572 - }, - { - "epoch": 1.436755207413079, - "grad_norm": 0.83984375, - "learning_rate": 1.4396699460488735e-05, - "loss": 1.3427, - "step": 4574 - }, - { - "epoch": 1.4373834344386203, - "grad_norm": 0.80078125, - "learning_rate": 1.4394160583941608e-05, - "loss": 1.2679, - "step": 4576 - }, - { - "epoch": 1.4380116614641616, - "grad_norm": 0.76953125, - "learning_rate": 1.4391621707394479e-05, - "loss": 1.1892, - "step": 4578 - }, - { - "epoch": 1.438639888489703, - "grad_norm": 0.73828125, - "learning_rate": 1.4389082830847352e-05, - "loss": 1.3267, - "step": 4580 - }, - { - "epoch": 1.4392681155152443, - "grad_norm": 0.734375, - "learning_rate": 1.4386543954300223e-05, - "loss": 1.2524, - "step": 4582 - }, - { - "epoch": 1.4398963425407856, - "grad_norm": 0.734375, - "learning_rate": 1.4384005077753097e-05, - "loss": 1.1367, - "step": 4584 - }, - { - "epoch": 1.440524569566327, - "grad_norm": 0.7578125, - "learning_rate": 1.4381466201205966e-05, - "loss": 1.3201, - "step": 4586 - }, - { - "epoch": 1.4411527965918685, - "grad_norm": 0.75390625, - "learning_rate": 1.437892732465884e-05, - "loss": 1.2344, - "step": 4588 - }, - { - "epoch": 1.4417810236174098, - "grad_norm": 0.8125, - "learning_rate": 1.437638844811171e-05, - "loss": 1.1569, - "step": 4590 - }, - { - "epoch": 1.4424092506429511, - "grad_norm": 0.75, - "learning_rate": 1.4373849571564584e-05, - "loss": 1.1568, - "step": 4592 - }, - { - "epoch": 1.4430374776684924, - "grad_norm": 0.82421875, - "learning_rate": 1.4371310695017455e-05, - "loss": 1.1668, - "step": 4594 - }, - { - "epoch": 1.4436657046940338, - "grad_norm": 0.8046875, - "learning_rate": 1.4368771818470328e-05, - "loss": 1.318, - "step": 4596 - }, - { - "epoch": 1.444293931719575, - "grad_norm": 0.7578125, - "learning_rate": 1.43662329419232e-05, - "loss": 1.3324, - "step": 4598 - }, - { - "epoch": 1.4449221587451166, - "grad_norm": 0.73828125, - "learning_rate": 1.4363694065376073e-05, - "loss": 1.2083, - "step": 4600 - }, - { - "epoch": 1.445550385770658, - "grad_norm": 0.75, - "learning_rate": 1.4361155188828944e-05, - "loss": 1.1944, - "step": 4602 - }, - { - "epoch": 1.4461786127961993, - "grad_norm": 0.84375, - "learning_rate": 1.4358616312281817e-05, - "loss": 1.2482, - "step": 4604 - }, - { - "epoch": 1.4468068398217406, - "grad_norm": 0.75390625, - "learning_rate": 1.4356077435734688e-05, - "loss": 1.162, - "step": 4606 - }, - { - "epoch": 1.447435066847282, - "grad_norm": 0.83984375, - "learning_rate": 1.4353538559187562e-05, - "loss": 1.1867, - "step": 4608 - }, - { - "epoch": 1.4480632938728233, - "grad_norm": 0.828125, - "learning_rate": 1.4350999682640431e-05, - "loss": 1.2864, - "step": 4610 - }, - { - "epoch": 1.4486915208983646, - "grad_norm": 0.95703125, - "learning_rate": 1.4348460806093304e-05, - "loss": 1.1464, - "step": 4612 - }, - { - "epoch": 1.449319747923906, - "grad_norm": 0.8046875, - "learning_rate": 1.4345921929546176e-05, - "loss": 1.1811, - "step": 4614 - }, - { - "epoch": 1.4499479749494473, - "grad_norm": 0.796875, - "learning_rate": 1.4343383052999049e-05, - "loss": 1.2498, - "step": 4616 - }, - { - "epoch": 1.4505762019749886, - "grad_norm": 0.7890625, - "learning_rate": 1.434084417645192e-05, - "loss": 1.192, - "step": 4618 - }, - { - "epoch": 1.4512044290005301, - "grad_norm": 0.78515625, - "learning_rate": 1.4338305299904793e-05, - "loss": 1.3238, - "step": 4620 - }, - { - "epoch": 1.4518326560260715, - "grad_norm": 0.85546875, - "learning_rate": 1.4335766423357665e-05, - "loss": 1.1391, - "step": 4622 - }, - { - "epoch": 1.4524608830516128, - "grad_norm": 0.77734375, - "learning_rate": 1.4333227546810538e-05, - "loss": 1.2083, - "step": 4624 - }, - { - "epoch": 1.4530891100771541, - "grad_norm": 0.84375, - "learning_rate": 1.433068867026341e-05, - "loss": 1.2331, - "step": 4626 - }, - { - "epoch": 1.4537173371026955, - "grad_norm": 0.79296875, - "learning_rate": 1.4328149793716282e-05, - "loss": 1.2771, - "step": 4628 - }, - { - "epoch": 1.4543455641282368, - "grad_norm": 0.8359375, - "learning_rate": 1.4325610917169155e-05, - "loss": 1.1523, - "step": 4630 - }, - { - "epoch": 1.4549737911537781, - "grad_norm": 0.81640625, - "learning_rate": 1.4323072040622027e-05, - "loss": 1.3079, - "step": 4632 - }, - { - "epoch": 1.4556020181793197, - "grad_norm": 0.765625, - "learning_rate": 1.43205331640749e-05, - "loss": 1.1524, - "step": 4634 - }, - { - "epoch": 1.456230245204861, - "grad_norm": 0.78125, - "learning_rate": 1.431799428752777e-05, - "loss": 1.1978, - "step": 4636 - }, - { - "epoch": 1.4568584722304023, - "grad_norm": 0.78125, - "learning_rate": 1.4315455410980642e-05, - "loss": 1.2417, - "step": 4638 - }, - { - "epoch": 1.4574866992559437, - "grad_norm": 0.7734375, - "learning_rate": 1.4312916534433514e-05, - "loss": 1.3005, - "step": 4640 - }, - { - "epoch": 1.458114926281485, - "grad_norm": 0.90234375, - "learning_rate": 1.4310377657886387e-05, - "loss": 1.2385, - "step": 4642 - }, - { - "epoch": 1.4587431533070263, - "grad_norm": 0.76171875, - "learning_rate": 1.4307838781339258e-05, - "loss": 1.324, - "step": 4644 - }, - { - "epoch": 1.4593713803325676, - "grad_norm": 0.74609375, - "learning_rate": 1.4305299904792131e-05, - "loss": 1.3922, - "step": 4646 - }, - { - "epoch": 1.459999607358109, - "grad_norm": 0.91015625, - "learning_rate": 1.4302761028245003e-05, - "loss": 1.3316, - "step": 4648 - }, - { - "epoch": 1.4606278343836503, - "grad_norm": 0.88671875, - "learning_rate": 1.4300222151697876e-05, - "loss": 1.2378, - "step": 4650 - }, - { - "epoch": 1.4612560614091916, - "grad_norm": 0.85546875, - "learning_rate": 1.4297683275150747e-05, - "loss": 1.2244, - "step": 4652 - }, - { - "epoch": 1.4618842884347332, - "grad_norm": 0.7578125, - "learning_rate": 1.429514439860362e-05, - "loss": 1.2384, - "step": 4654 - }, - { - "epoch": 1.4625125154602745, - "grad_norm": 0.76953125, - "learning_rate": 1.429260552205649e-05, - "loss": 1.39, - "step": 4656 - }, - { - "epoch": 1.4631407424858158, - "grad_norm": 0.79296875, - "learning_rate": 1.4290066645509365e-05, - "loss": 1.1775, - "step": 4658 - }, - { - "epoch": 1.4637689695113572, - "grad_norm": 0.75390625, - "learning_rate": 1.4287527768962234e-05, - "loss": 1.2629, - "step": 4660 - }, - { - "epoch": 1.4643971965368985, - "grad_norm": 0.90234375, - "learning_rate": 1.4284988892415108e-05, - "loss": 1.1894, - "step": 4662 - }, - { - "epoch": 1.4650254235624398, - "grad_norm": 0.75, - "learning_rate": 1.4282450015867979e-05, - "loss": 1.4307, - "step": 4664 - }, - { - "epoch": 1.4656536505879814, - "grad_norm": 0.7734375, - "learning_rate": 1.4279911139320852e-05, - "loss": 1.2757, - "step": 4666 - }, - { - "epoch": 1.4662818776135227, - "grad_norm": 0.80078125, - "learning_rate": 1.4277372262773723e-05, - "loss": 1.1575, - "step": 4668 - }, - { - "epoch": 1.466910104639064, - "grad_norm": 0.75, - "learning_rate": 1.4274833386226596e-05, - "loss": 1.2476, - "step": 4670 - }, - { - "epoch": 1.4675383316646053, - "grad_norm": 0.7421875, - "learning_rate": 1.4272294509679468e-05, - "loss": 1.1947, - "step": 4672 - }, - { - "epoch": 1.4681665586901467, - "grad_norm": 0.8046875, - "learning_rate": 1.4269755633132341e-05, - "loss": 1.2767, - "step": 4674 - }, - { - "epoch": 1.468794785715688, - "grad_norm": 0.76953125, - "learning_rate": 1.4267216756585212e-05, - "loss": 1.1294, - "step": 4676 - }, - { - "epoch": 1.4694230127412293, - "grad_norm": 0.88671875, - "learning_rate": 1.4264677880038085e-05, - "loss": 1.1821, - "step": 4678 - }, - { - "epoch": 1.4700512397667707, - "grad_norm": 0.80859375, - "learning_rate": 1.4262139003490955e-05, - "loss": 1.1981, - "step": 4680 - }, - { - "epoch": 1.470679466792312, - "grad_norm": 0.8671875, - "learning_rate": 1.4259600126943828e-05, - "loss": 1.2921, - "step": 4682 - }, - { - "epoch": 1.4713076938178533, - "grad_norm": 0.87109375, - "learning_rate": 1.42570612503967e-05, - "loss": 1.2258, - "step": 4684 - }, - { - "epoch": 1.4719359208433949, - "grad_norm": 0.79296875, - "learning_rate": 1.4254522373849573e-05, - "loss": 1.1364, - "step": 4686 - }, - { - "epoch": 1.4725641478689362, - "grad_norm": 0.8671875, - "learning_rate": 1.4251983497302444e-05, - "loss": 1.2497, - "step": 4688 - }, - { - "epoch": 1.4731923748944775, - "grad_norm": 0.796875, - "learning_rate": 1.4249444620755317e-05, - "loss": 1.1485, - "step": 4690 - }, - { - "epoch": 1.4738206019200188, - "grad_norm": 0.8125, - "learning_rate": 1.4246905744208188e-05, - "loss": 1.1257, - "step": 4692 - }, - { - "epoch": 1.4744488289455602, - "grad_norm": 0.7421875, - "learning_rate": 1.4244366867661061e-05, - "loss": 1.2718, - "step": 4694 - }, - { - "epoch": 1.4750770559711015, - "grad_norm": 0.90234375, - "learning_rate": 1.4241827991113933e-05, - "loss": 1.0906, - "step": 4696 - }, - { - "epoch": 1.4757052829966428, - "grad_norm": 0.7578125, - "learning_rate": 1.4239289114566806e-05, - "loss": 1.2542, - "step": 4698 - }, - { - "epoch": 1.4763335100221844, - "grad_norm": 0.8125, - "learning_rate": 1.4236750238019676e-05, - "loss": 1.2242, - "step": 4700 - }, - { - "epoch": 1.4769617370477257, - "grad_norm": 0.78125, - "learning_rate": 1.423421136147255e-05, - "loss": 1.1202, - "step": 4702 - }, - { - "epoch": 1.477589964073267, - "grad_norm": 0.78515625, - "learning_rate": 1.423167248492542e-05, - "loss": 1.2829, - "step": 4704 - }, - { - "epoch": 1.4782181910988084, - "grad_norm": 0.7421875, - "learning_rate": 1.4229133608378293e-05, - "loss": 1.2276, - "step": 4706 - }, - { - "epoch": 1.4788464181243497, - "grad_norm": 0.82421875, - "learning_rate": 1.4226594731831165e-05, - "loss": 1.0959, - "step": 4708 - }, - { - "epoch": 1.479474645149891, - "grad_norm": 0.91796875, - "learning_rate": 1.4224055855284038e-05, - "loss": 1.2402, - "step": 4710 - }, - { - "epoch": 1.4801028721754323, - "grad_norm": 0.73828125, - "learning_rate": 1.422151697873691e-05, - "loss": 1.41, - "step": 4712 - }, - { - "epoch": 1.4807310992009737, - "grad_norm": 0.8515625, - "learning_rate": 1.4218978102189782e-05, - "loss": 1.2751, - "step": 4714 - }, - { - "epoch": 1.481359326226515, - "grad_norm": 0.7734375, - "learning_rate": 1.4216439225642655e-05, - "loss": 1.1848, - "step": 4716 - }, - { - "epoch": 1.4819875532520563, - "grad_norm": 0.85546875, - "learning_rate": 1.4213900349095527e-05, - "loss": 1.1541, - "step": 4718 - }, - { - "epoch": 1.4826157802775979, - "grad_norm": 0.79296875, - "learning_rate": 1.42113614725484e-05, - "loss": 1.248, - "step": 4720 - }, - { - "epoch": 1.4832440073031392, - "grad_norm": 0.84375, - "learning_rate": 1.4208822596001271e-05, - "loss": 1.0783, - "step": 4722 - }, - { - "epoch": 1.4838722343286805, - "grad_norm": 0.82421875, - "learning_rate": 1.4206283719454144e-05, - "loss": 1.3559, - "step": 4724 - }, - { - "epoch": 1.4845004613542219, - "grad_norm": 0.83984375, - "learning_rate": 1.4203744842907014e-05, - "loss": 1.2525, - "step": 4726 - }, - { - "epoch": 1.4851286883797632, - "grad_norm": 0.83203125, - "learning_rate": 1.4201205966359889e-05, - "loss": 1.3513, - "step": 4728 - }, - { - "epoch": 1.4857569154053045, - "grad_norm": 0.84765625, - "learning_rate": 1.4198667089812758e-05, - "loss": 1.1958, - "step": 4730 - }, - { - "epoch": 1.486385142430846, - "grad_norm": 0.83984375, - "learning_rate": 1.4196128213265631e-05, - "loss": 1.2256, - "step": 4732 - }, - { - "epoch": 1.4870133694563874, - "grad_norm": 0.828125, - "learning_rate": 1.4193589336718503e-05, - "loss": 1.2719, - "step": 4734 - }, - { - "epoch": 1.4876415964819287, - "grad_norm": 0.8984375, - "learning_rate": 1.4191050460171376e-05, - "loss": 1.186, - "step": 4736 - }, - { - "epoch": 1.48826982350747, - "grad_norm": 0.78125, - "learning_rate": 1.4188511583624247e-05, - "loss": 1.3374, - "step": 4738 - }, - { - "epoch": 1.4888980505330114, - "grad_norm": 0.90234375, - "learning_rate": 1.418597270707712e-05, - "loss": 1.139, - "step": 4740 - }, - { - "epoch": 1.4895262775585527, - "grad_norm": 0.84375, - "learning_rate": 1.4183433830529992e-05, - "loss": 1.269, - "step": 4742 - }, - { - "epoch": 1.490154504584094, - "grad_norm": 0.796875, - "learning_rate": 1.4180894953982865e-05, - "loss": 1.1428, - "step": 4744 - }, - { - "epoch": 1.4907827316096354, - "grad_norm": 0.73046875, - "learning_rate": 1.4178356077435736e-05, - "loss": 1.304, - "step": 4746 - }, - { - "epoch": 1.4914109586351767, - "grad_norm": 0.81640625, - "learning_rate": 1.4175817200888609e-05, - "loss": 1.4207, - "step": 4748 - }, - { - "epoch": 1.492039185660718, - "grad_norm": 0.7109375, - "learning_rate": 1.4173278324341479e-05, - "loss": 1.2348, - "step": 4750 - }, - { - "epoch": 1.4926674126862596, - "grad_norm": 0.796875, - "learning_rate": 1.4170739447794352e-05, - "loss": 1.2315, - "step": 4752 - }, - { - "epoch": 1.493295639711801, - "grad_norm": 0.78515625, - "learning_rate": 1.4168200571247223e-05, - "loss": 1.408, - "step": 4754 - }, - { - "epoch": 1.4939238667373422, - "grad_norm": 0.796875, - "learning_rate": 1.4165661694700096e-05, - "loss": 1.2126, - "step": 4756 - }, - { - "epoch": 1.4945520937628836, - "grad_norm": 0.80859375, - "learning_rate": 1.4163122818152968e-05, - "loss": 1.3488, - "step": 4758 - }, - { - "epoch": 1.4951803207884249, - "grad_norm": 0.75, - "learning_rate": 1.416058394160584e-05, - "loss": 1.2919, - "step": 4760 - }, - { - "epoch": 1.4958085478139662, - "grad_norm": 0.79296875, - "learning_rate": 1.4158045065058712e-05, - "loss": 1.2386, - "step": 4762 - }, - { - "epoch": 1.4964367748395075, - "grad_norm": 0.93359375, - "learning_rate": 1.4155506188511585e-05, - "loss": 1.1495, - "step": 4764 - }, - { - "epoch": 1.497065001865049, - "grad_norm": 0.80859375, - "learning_rate": 1.4152967311964457e-05, - "loss": 1.0899, - "step": 4766 - }, - { - "epoch": 1.4976932288905904, - "grad_norm": 0.703125, - "learning_rate": 1.415042843541733e-05, - "loss": 1.1979, - "step": 4768 - }, - { - "epoch": 1.4983214559161318, - "grad_norm": 0.8125, - "learning_rate": 1.4147889558870201e-05, - "loss": 1.2938, - "step": 4770 - }, - { - "epoch": 1.498949682941673, - "grad_norm": 1.6875, - "learning_rate": 1.4145350682323074e-05, - "loss": 1.1939, - "step": 4772 - }, - { - "epoch": 1.4995779099672144, - "grad_norm": 0.76953125, - "learning_rate": 1.4142811805775944e-05, - "loss": 1.2171, - "step": 4774 - }, - { - "epoch": 1.5002061369927557, - "grad_norm": 0.8046875, - "learning_rate": 1.4140272929228817e-05, - "loss": 1.202, - "step": 4776 - }, - { - "epoch": 1.500834364018297, - "grad_norm": 0.90234375, - "learning_rate": 1.4137734052681688e-05, - "loss": 1.2284, - "step": 4778 - }, - { - "epoch": 1.5014625910438384, - "grad_norm": 0.75, - "learning_rate": 1.4135195176134561e-05, - "loss": 1.2907, - "step": 4780 - }, - { - "epoch": 1.5020908180693797, - "grad_norm": 0.87109375, - "learning_rate": 1.4132656299587433e-05, - "loss": 1.304, - "step": 4782 - }, - { - "epoch": 1.502719045094921, - "grad_norm": 0.85546875, - "learning_rate": 1.4130117423040306e-05, - "loss": 1.3084, - "step": 4784 - }, - { - "epoch": 1.5033472721204624, - "grad_norm": 0.7890625, - "learning_rate": 1.4127578546493177e-05, - "loss": 1.277, - "step": 4786 - }, - { - "epoch": 1.503975499146004, - "grad_norm": 0.7734375, - "learning_rate": 1.412503966994605e-05, - "loss": 1.2962, - "step": 4788 - }, - { - "epoch": 1.5046037261715453, - "grad_norm": 0.73046875, - "learning_rate": 1.4122500793398922e-05, - "loss": 1.1253, - "step": 4790 - }, - { - "epoch": 1.5052319531970866, - "grad_norm": 0.95703125, - "learning_rate": 1.4119961916851795e-05, - "loss": 1.1104, - "step": 4792 - }, - { - "epoch": 1.505860180222628, - "grad_norm": 0.7890625, - "learning_rate": 1.4117423040304664e-05, - "loss": 1.2353, - "step": 4794 - }, - { - "epoch": 1.5064884072481695, - "grad_norm": 0.81640625, - "learning_rate": 1.411488416375754e-05, - "loss": 1.2573, - "step": 4796 - }, - { - "epoch": 1.5071166342737108, - "grad_norm": 0.85546875, - "learning_rate": 1.4112345287210412e-05, - "loss": 1.2517, - "step": 4798 - }, - { - "epoch": 1.5077448612992521, - "grad_norm": 0.75390625, - "learning_rate": 1.4109806410663282e-05, - "loss": 1.3036, - "step": 4800 - }, - { - "epoch": 1.5083730883247934, - "grad_norm": 0.8046875, - "learning_rate": 1.4107267534116155e-05, - "loss": 1.364, - "step": 4802 - }, - { - "epoch": 1.5090013153503348, - "grad_norm": 0.890625, - "learning_rate": 1.4104728657569026e-05, - "loss": 1.1784, - "step": 4804 - }, - { - "epoch": 1.509629542375876, - "grad_norm": 0.74609375, - "learning_rate": 1.41021897810219e-05, - "loss": 1.1693, - "step": 4806 - }, - { - "epoch": 1.5102577694014174, - "grad_norm": 0.8203125, - "learning_rate": 1.4099650904474771e-05, - "loss": 1.2911, - "step": 4808 - }, - { - "epoch": 1.5108859964269588, - "grad_norm": 0.796875, - "learning_rate": 1.4097112027927644e-05, - "loss": 1.1448, - "step": 4810 - }, - { - "epoch": 1.5115142234525, - "grad_norm": 0.828125, - "learning_rate": 1.4094573151380515e-05, - "loss": 1.3268, - "step": 4812 - }, - { - "epoch": 1.5121424504780414, - "grad_norm": 0.7890625, - "learning_rate": 1.4092034274833388e-05, - "loss": 1.3592, - "step": 4814 - }, - { - "epoch": 1.5127706775035827, - "grad_norm": 0.8125, - "learning_rate": 1.408949539828626e-05, - "loss": 1.2284, - "step": 4816 - }, - { - "epoch": 1.513398904529124, - "grad_norm": 0.82421875, - "learning_rate": 1.4086956521739133e-05, - "loss": 1.3063, - "step": 4818 - }, - { - "epoch": 1.5140271315546656, - "grad_norm": 0.859375, - "learning_rate": 1.4084417645192003e-05, - "loss": 1.291, - "step": 4820 - }, - { - "epoch": 1.514655358580207, - "grad_norm": 0.8515625, - "learning_rate": 1.4081878768644877e-05, - "loss": 1.1867, - "step": 4822 - }, - { - "epoch": 1.5152835856057483, - "grad_norm": 0.7890625, - "learning_rate": 1.4079339892097747e-05, - "loss": 1.1726, - "step": 4824 - }, - { - "epoch": 1.5159118126312896, - "grad_norm": 0.76171875, - "learning_rate": 1.407680101555062e-05, - "loss": 1.2812, - "step": 4826 - }, - { - "epoch": 1.5165400396568312, - "grad_norm": 0.83984375, - "learning_rate": 1.4074262139003492e-05, - "loss": 1.2503, - "step": 4828 - }, - { - "epoch": 1.5171682666823725, - "grad_norm": 0.8125, - "learning_rate": 1.4071723262456365e-05, - "loss": 1.1713, - "step": 4830 - }, - { - "epoch": 1.5177964937079138, - "grad_norm": 0.7578125, - "learning_rate": 1.4069184385909236e-05, - "loss": 1.2758, - "step": 4832 - }, - { - "epoch": 1.5184247207334551, - "grad_norm": 0.796875, - "learning_rate": 1.4066645509362109e-05, - "loss": 1.4859, - "step": 4834 - }, - { - "epoch": 1.5190529477589965, - "grad_norm": 0.8125, - "learning_rate": 1.406410663281498e-05, - "loss": 1.1555, - "step": 4836 - }, - { - "epoch": 1.5196811747845378, - "grad_norm": 0.953125, - "learning_rate": 1.4061567756267854e-05, - "loss": 1.211, - "step": 4838 - }, - { - "epoch": 1.5203094018100791, - "grad_norm": 0.89453125, - "learning_rate": 1.4059028879720725e-05, - "loss": 1.1481, - "step": 4840 - }, - { - "epoch": 1.5209376288356204, - "grad_norm": 0.84765625, - "learning_rate": 1.4056490003173598e-05, - "loss": 1.1373, - "step": 4842 - }, - { - "epoch": 1.5215658558611618, - "grad_norm": 0.7734375, - "learning_rate": 1.4053951126626468e-05, - "loss": 1.2099, - "step": 4844 - }, - { - "epoch": 1.522194082886703, - "grad_norm": 0.8125, - "learning_rate": 1.405141225007934e-05, - "loss": 1.331, - "step": 4846 - }, - { - "epoch": 1.5228223099122444, - "grad_norm": 0.7578125, - "learning_rate": 1.4048873373532212e-05, - "loss": 1.2244, - "step": 4848 - }, - { - "epoch": 1.5234505369377858, - "grad_norm": 0.7109375, - "learning_rate": 1.4046334496985085e-05, - "loss": 1.301, - "step": 4850 - }, - { - "epoch": 1.524078763963327, - "grad_norm": 0.83984375, - "learning_rate": 1.4043795620437957e-05, - "loss": 1.2133, - "step": 4852 - }, - { - "epoch": 1.5247069909888686, - "grad_norm": 0.828125, - "learning_rate": 1.404125674389083e-05, - "loss": 1.1242, - "step": 4854 - }, - { - "epoch": 1.52533521801441, - "grad_norm": 0.84375, - "learning_rate": 1.4038717867343701e-05, - "loss": 1.2627, - "step": 4856 - }, - { - "epoch": 1.5259634450399513, - "grad_norm": 0.71875, - "learning_rate": 1.4036178990796574e-05, - "loss": 1.3087, - "step": 4858 - }, - { - "epoch": 1.5265916720654926, - "grad_norm": 0.85546875, - "learning_rate": 1.4033640114249446e-05, - "loss": 1.1323, - "step": 4860 - }, - { - "epoch": 1.5272198990910342, - "grad_norm": 0.75390625, - "learning_rate": 1.4031101237702319e-05, - "loss": 1.2379, - "step": 4862 - }, - { - "epoch": 1.5278481261165755, - "grad_norm": 0.765625, - "learning_rate": 1.4028562361155188e-05, - "loss": 1.2671, - "step": 4864 - }, - { - "epoch": 1.5284763531421168, - "grad_norm": 0.765625, - "learning_rate": 1.4026023484608063e-05, - "loss": 1.2881, - "step": 4866 - }, - { - "epoch": 1.5291045801676582, - "grad_norm": 0.85546875, - "learning_rate": 1.4023484608060933e-05, - "loss": 1.2102, - "step": 4868 - }, - { - "epoch": 1.5297328071931995, - "grad_norm": 0.828125, - "learning_rate": 1.4020945731513806e-05, - "loss": 1.2949, - "step": 4870 - }, - { - "epoch": 1.5303610342187408, - "grad_norm": 0.7734375, - "learning_rate": 1.4018406854966677e-05, - "loss": 1.2383, - "step": 4872 - }, - { - "epoch": 1.5309892612442821, - "grad_norm": 0.96484375, - "learning_rate": 1.401586797841955e-05, - "loss": 1.1815, - "step": 4874 - }, - { - "epoch": 1.5316174882698235, - "grad_norm": 0.8125, - "learning_rate": 1.4013329101872422e-05, - "loss": 1.2497, - "step": 4876 - }, - { - "epoch": 1.5322457152953648, - "grad_norm": 0.8125, - "learning_rate": 1.4010790225325295e-05, - "loss": 1.1109, - "step": 4878 - }, - { - "epoch": 1.5328739423209061, - "grad_norm": 0.8359375, - "learning_rate": 1.4008251348778166e-05, - "loss": 1.2732, - "step": 4880 - }, - { - "epoch": 1.5335021693464475, - "grad_norm": 0.81640625, - "learning_rate": 1.400571247223104e-05, - "loss": 1.2428, - "step": 4882 - }, - { - "epoch": 1.5341303963719888, - "grad_norm": 0.87890625, - "learning_rate": 1.4003173595683912e-05, - "loss": 1.2501, - "step": 4884 - }, - { - "epoch": 1.5347586233975303, - "grad_norm": 0.83984375, - "learning_rate": 1.4000634719136784e-05, - "loss": 1.2886, - "step": 4886 - }, - { - "epoch": 1.5353868504230717, - "grad_norm": 0.828125, - "learning_rate": 1.3998095842589657e-05, - "loss": 1.2391, - "step": 4888 - }, - { - "epoch": 1.536015077448613, - "grad_norm": 0.828125, - "learning_rate": 1.3995556966042526e-05, - "loss": 1.2547, - "step": 4890 - }, - { - "epoch": 1.5366433044741543, - "grad_norm": 0.7734375, - "learning_rate": 1.3993018089495401e-05, - "loss": 1.2538, - "step": 4892 - }, - { - "epoch": 1.5372715314996959, - "grad_norm": 0.8671875, - "learning_rate": 1.3990479212948271e-05, - "loss": 1.2798, - "step": 4894 - }, - { - "epoch": 1.5378997585252372, - "grad_norm": 0.7734375, - "learning_rate": 1.3987940336401144e-05, - "loss": 1.2126, - "step": 4896 - }, - { - "epoch": 1.5385279855507785, - "grad_norm": 0.79296875, - "learning_rate": 1.3985401459854015e-05, - "loss": 1.2583, - "step": 4898 - }, - { - "epoch": 1.5391562125763198, - "grad_norm": 0.83203125, - "learning_rate": 1.3982862583306888e-05, - "loss": 1.2099, - "step": 4900 - }, - { - "epoch": 1.5397844396018612, - "grad_norm": 0.859375, - "learning_rate": 1.398032370675976e-05, - "loss": 1.3456, - "step": 4902 - }, - { - "epoch": 1.5404126666274025, - "grad_norm": 0.78125, - "learning_rate": 1.3977784830212633e-05, - "loss": 1.2055, - "step": 4904 - }, - { - "epoch": 1.5410408936529438, - "grad_norm": 0.78515625, - "learning_rate": 1.3975245953665504e-05, - "loss": 1.2495, - "step": 4906 - }, - { - "epoch": 1.5416691206784852, - "grad_norm": 0.78125, - "learning_rate": 1.3972707077118377e-05, - "loss": 1.2034, - "step": 4908 - }, - { - "epoch": 1.5422973477040265, - "grad_norm": 0.734375, - "learning_rate": 1.3970168200571249e-05, - "loss": 1.2081, - "step": 4910 - }, - { - "epoch": 1.5429255747295678, - "grad_norm": 0.79296875, - "learning_rate": 1.3967629324024122e-05, - "loss": 1.0711, - "step": 4912 - }, - { - "epoch": 1.5435538017551091, - "grad_norm": 0.75390625, - "learning_rate": 1.3965090447476991e-05, - "loss": 1.233, - "step": 4914 - }, - { - "epoch": 1.5441820287806505, - "grad_norm": 0.7578125, - "learning_rate": 1.3962551570929865e-05, - "loss": 1.2307, - "step": 4916 - }, - { - "epoch": 1.5448102558061918, - "grad_norm": 0.7890625, - "learning_rate": 1.3960012694382736e-05, - "loss": 1.3425, - "step": 4918 - }, - { - "epoch": 1.5454384828317334, - "grad_norm": 0.7578125, - "learning_rate": 1.3957473817835609e-05, - "loss": 1.2415, - "step": 4920 - }, - { - "epoch": 1.5460667098572747, - "grad_norm": 0.859375, - "learning_rate": 1.395493494128848e-05, - "loss": 1.1973, - "step": 4922 - }, - { - "epoch": 1.546694936882816, - "grad_norm": 0.71875, - "learning_rate": 1.3952396064741353e-05, - "loss": 1.2275, - "step": 4924 - }, - { - "epoch": 1.5473231639083573, - "grad_norm": 0.8125, - "learning_rate": 1.3949857188194225e-05, - "loss": 1.416, - "step": 4926 - }, - { - "epoch": 1.5479513909338989, - "grad_norm": 0.80859375, - "learning_rate": 1.3947318311647098e-05, - "loss": 1.262, - "step": 4928 - }, - { - "epoch": 1.5485796179594402, - "grad_norm": 0.75, - "learning_rate": 1.394477943509997e-05, - "loss": 1.0942, - "step": 4930 - }, - { - "epoch": 1.5492078449849815, - "grad_norm": 0.8203125, - "learning_rate": 1.3942240558552842e-05, - "loss": 1.2883, - "step": 4932 - }, - { - "epoch": 1.5498360720105229, - "grad_norm": 0.84765625, - "learning_rate": 1.3939701682005714e-05, - "loss": 1.2058, - "step": 4934 - }, - { - "epoch": 1.5504642990360642, - "grad_norm": 0.75390625, - "learning_rate": 1.3937162805458587e-05, - "loss": 1.2756, - "step": 4936 - }, - { - "epoch": 1.5510925260616055, - "grad_norm": 0.8359375, - "learning_rate": 1.3934623928911457e-05, - "loss": 1.2265, - "step": 4938 - }, - { - "epoch": 1.5517207530871469, - "grad_norm": 0.82421875, - "learning_rate": 1.393208505236433e-05, - "loss": 1.0057, - "step": 4940 - }, - { - "epoch": 1.5523489801126882, - "grad_norm": 0.77734375, - "learning_rate": 1.3929546175817201e-05, - "loss": 1.1519, - "step": 4942 - }, - { - "epoch": 1.5529772071382295, - "grad_norm": 0.7734375, - "learning_rate": 1.3927007299270074e-05, - "loss": 1.3281, - "step": 4944 - }, - { - "epoch": 1.5536054341637708, - "grad_norm": 0.74609375, - "learning_rate": 1.3924468422722945e-05, - "loss": 1.267, - "step": 4946 - }, - { - "epoch": 1.5542336611893122, - "grad_norm": 0.8046875, - "learning_rate": 1.3921929546175819e-05, - "loss": 1.2482, - "step": 4948 - }, - { - "epoch": 1.5548618882148535, - "grad_norm": 0.7578125, - "learning_rate": 1.391939066962869e-05, - "loss": 1.3847, - "step": 4950 - }, - { - "epoch": 1.555490115240395, - "grad_norm": 0.7890625, - "learning_rate": 1.3916851793081563e-05, - "loss": 1.4112, - "step": 4952 - }, - { - "epoch": 1.5561183422659364, - "grad_norm": 0.85546875, - "learning_rate": 1.3914312916534434e-05, - "loss": 1.2111, - "step": 4954 - }, - { - "epoch": 1.5567465692914777, - "grad_norm": 0.7734375, - "learning_rate": 1.3911774039987307e-05, - "loss": 1.4313, - "step": 4956 - }, - { - "epoch": 1.557374796317019, - "grad_norm": 0.74609375, - "learning_rate": 1.3909235163440177e-05, - "loss": 1.2122, - "step": 4958 - }, - { - "epoch": 1.5580030233425606, - "grad_norm": 0.7578125, - "learning_rate": 1.3906696286893052e-05, - "loss": 1.2285, - "step": 4960 - }, - { - "epoch": 1.558631250368102, - "grad_norm": 0.890625, - "learning_rate": 1.3904157410345922e-05, - "loss": 1.2074, - "step": 4962 - }, - { - "epoch": 1.5592594773936432, - "grad_norm": 0.79296875, - "learning_rate": 1.3901618533798795e-05, - "loss": 1.3096, - "step": 4964 - }, - { - "epoch": 1.5598877044191846, - "grad_norm": 0.859375, - "learning_rate": 1.3899079657251666e-05, - "loss": 1.1488, - "step": 4966 - }, - { - "epoch": 1.560515931444726, - "grad_norm": 0.79296875, - "learning_rate": 1.3896540780704539e-05, - "loss": 1.358, - "step": 4968 - }, - { - "epoch": 1.5611441584702672, - "grad_norm": 0.78515625, - "learning_rate": 1.3894001904157412e-05, - "loss": 1.2134, - "step": 4970 - }, - { - "epoch": 1.5617723854958085, - "grad_norm": 0.7890625, - "learning_rate": 1.3891463027610284e-05, - "loss": 1.3555, - "step": 4972 - }, - { - "epoch": 1.5624006125213499, - "grad_norm": 0.87109375, - "learning_rate": 1.3888924151063157e-05, - "loss": 1.2697, - "step": 4974 - }, - { - "epoch": 1.5630288395468912, - "grad_norm": 0.94140625, - "learning_rate": 1.3886385274516028e-05, - "loss": 1.1357, - "step": 4976 - }, - { - "epoch": 1.5636570665724325, - "grad_norm": 0.8359375, - "learning_rate": 1.3883846397968901e-05, - "loss": 1.1122, - "step": 4978 - }, - { - "epoch": 1.5642852935979739, - "grad_norm": 0.9375, - "learning_rate": 1.3881307521421772e-05, - "loss": 1.3238, - "step": 4980 - }, - { - "epoch": 1.5649135206235152, - "grad_norm": 0.85546875, - "learning_rate": 1.3878768644874646e-05, - "loss": 1.1624, - "step": 4982 - }, - { - "epoch": 1.5655417476490565, - "grad_norm": 0.83203125, - "learning_rate": 1.3876229768327515e-05, - "loss": 1.1995, - "step": 4984 - }, - { - "epoch": 1.566169974674598, - "grad_norm": 0.921875, - "learning_rate": 1.387369089178039e-05, - "loss": 1.433, - "step": 4986 - }, - { - "epoch": 1.5667982017001394, - "grad_norm": 0.828125, - "learning_rate": 1.387115201523326e-05, - "loss": 1.2223, - "step": 4988 - }, - { - "epoch": 1.5674264287256807, - "grad_norm": 0.8203125, - "learning_rate": 1.3868613138686133e-05, - "loss": 1.2335, - "step": 4990 - }, - { - "epoch": 1.568054655751222, - "grad_norm": 0.78515625, - "learning_rate": 1.3866074262139004e-05, - "loss": 1.2387, - "step": 4992 - }, - { - "epoch": 1.5686828827767636, - "grad_norm": 0.74609375, - "learning_rate": 1.3863535385591877e-05, - "loss": 1.2488, - "step": 4994 - }, - { - "epoch": 1.569311109802305, - "grad_norm": 0.765625, - "learning_rate": 1.3860996509044749e-05, - "loss": 1.3808, - "step": 4996 - }, - { - "epoch": 1.5699393368278463, - "grad_norm": 0.73828125, - "learning_rate": 1.3858457632497622e-05, - "loss": 1.4113, - "step": 4998 - }, - { - "epoch": 1.5705675638533876, - "grad_norm": 0.8671875, - "learning_rate": 1.3855918755950493e-05, - "loss": 1.2078, - "step": 5000 - }, - { - "epoch": 1.571195790878929, - "grad_norm": 0.7734375, - "learning_rate": 1.3853379879403366e-05, - "loss": 1.2806, - "step": 5002 - }, - { - "epoch": 1.5718240179044702, - "grad_norm": 0.8515625, - "learning_rate": 1.3850841002856238e-05, - "loss": 1.2592, - "step": 5004 - }, - { - "epoch": 1.5724522449300116, - "grad_norm": 0.76171875, - "learning_rate": 1.384830212630911e-05, - "loss": 1.2786, - "step": 5006 - }, - { - "epoch": 1.573080471955553, - "grad_norm": 0.8515625, - "learning_rate": 1.384576324976198e-05, - "loss": 1.2668, - "step": 5008 - }, - { - "epoch": 1.5737086989810942, - "grad_norm": 0.796875, - "learning_rate": 1.3843224373214853e-05, - "loss": 1.1614, - "step": 5010 - }, - { - "epoch": 1.5743369260066356, - "grad_norm": 0.78515625, - "learning_rate": 1.3840685496667725e-05, - "loss": 1.1552, - "step": 5012 - }, - { - "epoch": 1.5749651530321769, - "grad_norm": 0.79296875, - "learning_rate": 1.3838146620120598e-05, - "loss": 1.295, - "step": 5014 - }, - { - "epoch": 1.5755933800577182, - "grad_norm": 0.8203125, - "learning_rate": 1.383560774357347e-05, - "loss": 1.2958, - "step": 5016 - }, - { - "epoch": 1.5762216070832598, - "grad_norm": 0.73828125, - "learning_rate": 1.3833068867026342e-05, - "loss": 1.3272, - "step": 5018 - }, - { - "epoch": 1.576849834108801, - "grad_norm": 0.8828125, - "learning_rate": 1.3830529990479214e-05, - "loss": 1.2085, - "step": 5020 - }, - { - "epoch": 1.5774780611343424, - "grad_norm": 0.82421875, - "learning_rate": 1.3827991113932087e-05, - "loss": 1.28, - "step": 5022 - }, - { - "epoch": 1.5781062881598837, - "grad_norm": 0.78125, - "learning_rate": 1.3825452237384958e-05, - "loss": 1.2884, - "step": 5024 - }, - { - "epoch": 1.5787345151854253, - "grad_norm": 0.69140625, - "learning_rate": 1.3822913360837831e-05, - "loss": 1.3882, - "step": 5026 - }, - { - "epoch": 1.5793627422109666, - "grad_norm": 0.77734375, - "learning_rate": 1.3820374484290701e-05, - "loss": 1.3267, - "step": 5028 - }, - { - "epoch": 1.579990969236508, - "grad_norm": 0.859375, - "learning_rate": 1.3817835607743576e-05, - "loss": 1.2475, - "step": 5030 - }, - { - "epoch": 1.5806191962620493, - "grad_norm": 0.6953125, - "learning_rate": 1.3815296731196445e-05, - "loss": 1.2973, - "step": 5032 - }, - { - "epoch": 1.5812474232875906, - "grad_norm": 0.80078125, - "learning_rate": 1.3812757854649318e-05, - "loss": 1.1669, - "step": 5034 - }, - { - "epoch": 1.581875650313132, - "grad_norm": 0.78515625, - "learning_rate": 1.381021897810219e-05, - "loss": 1.2783, - "step": 5036 - }, - { - "epoch": 1.5825038773386733, - "grad_norm": 0.765625, - "learning_rate": 1.3807680101555063e-05, - "loss": 1.1044, - "step": 5038 - }, - { - "epoch": 1.5831321043642146, - "grad_norm": 0.71875, - "learning_rate": 1.3805141225007934e-05, - "loss": 1.2039, - "step": 5040 - }, - { - "epoch": 1.583760331389756, - "grad_norm": 0.76171875, - "learning_rate": 1.3802602348460807e-05, - "loss": 1.3272, - "step": 5042 - }, - { - "epoch": 1.5843885584152972, - "grad_norm": 0.859375, - "learning_rate": 1.3800063471913679e-05, - "loss": 1.1171, - "step": 5044 - }, - { - "epoch": 1.5850167854408386, - "grad_norm": 0.83984375, - "learning_rate": 1.3797524595366552e-05, - "loss": 1.085, - "step": 5046 - }, - { - "epoch": 1.58564501246638, - "grad_norm": 1.0703125, - "learning_rate": 1.3794985718819423e-05, - "loss": 1.1656, - "step": 5048 - }, - { - "epoch": 1.5862732394919212, - "grad_norm": 0.8203125, - "learning_rate": 1.3792446842272296e-05, - "loss": 1.2341, - "step": 5050 - }, - { - "epoch": 1.5869014665174628, - "grad_norm": 0.8203125, - "learning_rate": 1.3789907965725166e-05, - "loss": 1.2506, - "step": 5052 - }, - { - "epoch": 1.587529693543004, - "grad_norm": 0.78125, - "learning_rate": 1.3787369089178039e-05, - "loss": 1.2872, - "step": 5054 - }, - { - "epoch": 1.5881579205685454, - "grad_norm": 0.81640625, - "learning_rate": 1.3784830212630914e-05, - "loss": 1.1942, - "step": 5056 - }, - { - "epoch": 1.5887861475940868, - "grad_norm": 0.75390625, - "learning_rate": 1.3782291336083783e-05, - "loss": 1.2892, - "step": 5058 - }, - { - "epoch": 1.5894143746196283, - "grad_norm": 0.796875, - "learning_rate": 1.3779752459536657e-05, - "loss": 1.2254, - "step": 5060 - }, - { - "epoch": 1.5900426016451696, - "grad_norm": 0.765625, - "learning_rate": 1.3777213582989528e-05, - "loss": 1.2721, - "step": 5062 - }, - { - "epoch": 1.590670828670711, - "grad_norm": 0.78515625, - "learning_rate": 1.3774674706442401e-05, - "loss": 1.2791, - "step": 5064 - }, - { - "epoch": 1.5912990556962523, - "grad_norm": 0.76953125, - "learning_rate": 1.3772135829895272e-05, - "loss": 1.2474, - "step": 5066 - }, - { - "epoch": 1.5919272827217936, - "grad_norm": 0.796875, - "learning_rate": 1.3769596953348145e-05, - "loss": 1.2996, - "step": 5068 - }, - { - "epoch": 1.592555509747335, - "grad_norm": 0.796875, - "learning_rate": 1.3767058076801017e-05, - "loss": 1.1624, - "step": 5070 - }, - { - "epoch": 1.5931837367728763, - "grad_norm": 0.79296875, - "learning_rate": 1.376451920025389e-05, - "loss": 1.1621, - "step": 5072 - }, - { - "epoch": 1.5938119637984176, - "grad_norm": 0.8203125, - "learning_rate": 1.3761980323706761e-05, - "loss": 1.1595, - "step": 5074 - }, - { - "epoch": 1.594440190823959, - "grad_norm": 0.75, - "learning_rate": 1.3759441447159634e-05, - "loss": 1.2585, - "step": 5076 - }, - { - "epoch": 1.5950684178495003, - "grad_norm": 0.859375, - "learning_rate": 1.3756902570612504e-05, - "loss": 1.1875, - "step": 5078 - }, - { - "epoch": 1.5956966448750416, - "grad_norm": 0.8359375, - "learning_rate": 1.3754363694065377e-05, - "loss": 1.286, - "step": 5080 - }, - { - "epoch": 1.596324871900583, - "grad_norm": 0.7421875, - "learning_rate": 1.3751824817518249e-05, - "loss": 1.313, - "step": 5082 - }, - { - "epoch": 1.5969530989261245, - "grad_norm": 0.8984375, - "learning_rate": 1.3749285940971122e-05, - "loss": 1.2761, - "step": 5084 - }, - { - "epoch": 1.5975813259516658, - "grad_norm": 0.82421875, - "learning_rate": 1.3746747064423993e-05, - "loss": 1.2133, - "step": 5086 - }, - { - "epoch": 1.5982095529772071, - "grad_norm": 0.7890625, - "learning_rate": 1.3744208187876866e-05, - "loss": 1.2465, - "step": 5088 - }, - { - "epoch": 1.5988377800027485, - "grad_norm": 0.76171875, - "learning_rate": 1.3741669311329737e-05, - "loss": 1.3734, - "step": 5090 - }, - { - "epoch": 1.59946600702829, - "grad_norm": 0.80859375, - "learning_rate": 1.373913043478261e-05, - "loss": 1.4389, - "step": 5092 - }, - { - "epoch": 1.6000942340538313, - "grad_norm": 0.8125, - "learning_rate": 1.3736591558235482e-05, - "loss": 1.2194, - "step": 5094 - }, - { - "epoch": 1.6007224610793727, - "grad_norm": 0.86328125, - "learning_rate": 1.3734052681688355e-05, - "loss": 1.1488, - "step": 5096 - }, - { - "epoch": 1.601350688104914, - "grad_norm": 0.87109375, - "learning_rate": 1.3731513805141226e-05, - "loss": 1.324, - "step": 5098 - }, - { - "epoch": 1.6019789151304553, - "grad_norm": 0.8125, - "learning_rate": 1.37289749285941e-05, - "loss": 1.2289, - "step": 5100 - }, - { - "epoch": 1.6026071421559966, - "grad_norm": 0.76953125, - "learning_rate": 1.3726436052046969e-05, - "loss": 1.1883, - "step": 5102 - }, - { - "epoch": 1.603235369181538, - "grad_norm": 0.83203125, - "learning_rate": 1.3723897175499842e-05, - "loss": 1.2215, - "step": 5104 - }, - { - "epoch": 1.6038635962070793, - "grad_norm": 0.81640625, - "learning_rate": 1.3721358298952714e-05, - "loss": 1.2423, - "step": 5106 - }, - { - "epoch": 1.6044918232326206, - "grad_norm": 0.7890625, - "learning_rate": 1.3718819422405587e-05, - "loss": 1.1615, - "step": 5108 - }, - { - "epoch": 1.605120050258162, - "grad_norm": 0.7578125, - "learning_rate": 1.3716280545858458e-05, - "loss": 1.1432, - "step": 5110 - }, - { - "epoch": 1.6057482772837033, - "grad_norm": 0.8203125, - "learning_rate": 1.3713741669311331e-05, - "loss": 1.3432, - "step": 5112 - }, - { - "epoch": 1.6063765043092446, - "grad_norm": 0.8828125, - "learning_rate": 1.3711202792764203e-05, - "loss": 1.2572, - "step": 5114 - }, - { - "epoch": 1.6070047313347862, - "grad_norm": 0.83984375, - "learning_rate": 1.3708663916217076e-05, - "loss": 1.141, - "step": 5116 - }, - { - "epoch": 1.6076329583603275, - "grad_norm": 0.80078125, - "learning_rate": 1.3706125039669947e-05, - "loss": 1.2724, - "step": 5118 - }, - { - "epoch": 1.6082611853858688, - "grad_norm": 0.98046875, - "learning_rate": 1.370358616312282e-05, - "loss": 1.2636, - "step": 5120 - }, - { - "epoch": 1.6088894124114101, - "grad_norm": 0.89453125, - "learning_rate": 1.370104728657569e-05, - "loss": 1.2074, - "step": 5122 - }, - { - "epoch": 1.6095176394369515, - "grad_norm": 0.81640625, - "learning_rate": 1.3698508410028565e-05, - "loss": 1.3499, - "step": 5124 - }, - { - "epoch": 1.610145866462493, - "grad_norm": 0.76953125, - "learning_rate": 1.3695969533481434e-05, - "loss": 1.3866, - "step": 5126 - }, - { - "epoch": 1.6107740934880344, - "grad_norm": 0.796875, - "learning_rate": 1.3693430656934307e-05, - "loss": 1.2968, - "step": 5128 - }, - { - "epoch": 1.6114023205135757, - "grad_norm": 0.75, - "learning_rate": 1.3690891780387179e-05, - "loss": 1.4032, - "step": 5130 - }, - { - "epoch": 1.612030547539117, - "grad_norm": 0.73046875, - "learning_rate": 1.3688352903840052e-05, - "loss": 1.3089, - "step": 5132 - }, - { - "epoch": 1.6126587745646583, - "grad_norm": 0.7734375, - "learning_rate": 1.3685814027292923e-05, - "loss": 1.332, - "step": 5134 - }, - { - "epoch": 1.6132870015901997, - "grad_norm": 0.79296875, - "learning_rate": 1.3683275150745796e-05, - "loss": 1.1898, - "step": 5136 - }, - { - "epoch": 1.613915228615741, - "grad_norm": 0.87890625, - "learning_rate": 1.3680736274198668e-05, - "loss": 1.2405, - "step": 5138 - }, - { - "epoch": 1.6145434556412823, - "grad_norm": 0.77734375, - "learning_rate": 1.367819739765154e-05, - "loss": 1.2174, - "step": 5140 - }, - { - "epoch": 1.6151716826668236, - "grad_norm": 0.75, - "learning_rate": 1.3675658521104414e-05, - "loss": 1.0467, - "step": 5142 - }, - { - "epoch": 1.615799909692365, - "grad_norm": 0.765625, - "learning_rate": 1.3673119644557285e-05, - "loss": 1.2141, - "step": 5144 - }, - { - "epoch": 1.6164281367179063, - "grad_norm": 0.81640625, - "learning_rate": 1.3670580768010158e-05, - "loss": 1.362, - "step": 5146 - }, - { - "epoch": 1.6170563637434476, - "grad_norm": 0.80078125, - "learning_rate": 1.3668041891463028e-05, - "loss": 1.239, - "step": 5148 - }, - { - "epoch": 1.6176845907689892, - "grad_norm": 0.890625, - "learning_rate": 1.3665503014915903e-05, - "loss": 1.3551, - "step": 5150 - }, - { - "epoch": 1.6183128177945305, - "grad_norm": 0.89453125, - "learning_rate": 1.3662964138368772e-05, - "loss": 1.3027, - "step": 5152 - }, - { - "epoch": 1.6189410448200718, - "grad_norm": 0.70703125, - "learning_rate": 1.3660425261821645e-05, - "loss": 1.262, - "step": 5154 - }, - { - "epoch": 1.6195692718456132, - "grad_norm": 0.81640625, - "learning_rate": 1.3657886385274517e-05, - "loss": 1.3053, - "step": 5156 - }, - { - "epoch": 1.6201974988711547, - "grad_norm": 0.74609375, - "learning_rate": 1.365534750872739e-05, - "loss": 1.3218, - "step": 5158 - }, - { - "epoch": 1.620825725896696, - "grad_norm": 0.765625, - "learning_rate": 1.3652808632180261e-05, - "loss": 1.2845, - "step": 5160 - }, - { - "epoch": 1.6214539529222374, - "grad_norm": 0.953125, - "learning_rate": 1.3650269755633134e-05, - "loss": 1.1991, - "step": 5162 - }, - { - "epoch": 1.6220821799477787, - "grad_norm": 0.734375, - "learning_rate": 1.3647730879086006e-05, - "loss": 1.3062, - "step": 5164 - }, - { - "epoch": 1.62271040697332, - "grad_norm": 0.80859375, - "learning_rate": 1.3645192002538879e-05, - "loss": 1.0548, - "step": 5166 - }, - { - "epoch": 1.6233386339988614, - "grad_norm": 0.80078125, - "learning_rate": 1.364265312599175e-05, - "loss": 1.1821, - "step": 5168 - }, - { - "epoch": 1.6239668610244027, - "grad_norm": 0.78125, - "learning_rate": 1.3640114249444623e-05, - "loss": 1.1822, - "step": 5170 - }, - { - "epoch": 1.624595088049944, - "grad_norm": 0.90234375, - "learning_rate": 1.3637575372897493e-05, - "loss": 1.2382, - "step": 5172 - }, - { - "epoch": 1.6252233150754853, - "grad_norm": 0.828125, - "learning_rate": 1.3635036496350366e-05, - "loss": 1.2857, - "step": 5174 - }, - { - "epoch": 1.6258515421010267, - "grad_norm": 0.87890625, - "learning_rate": 1.3632497619803237e-05, - "loss": 1.0598, - "step": 5176 - }, - { - "epoch": 1.626479769126568, - "grad_norm": 0.765625, - "learning_rate": 1.362995874325611e-05, - "loss": 1.2989, - "step": 5178 - }, - { - "epoch": 1.6271079961521093, - "grad_norm": 0.84375, - "learning_rate": 1.3627419866708982e-05, - "loss": 1.2732, - "step": 5180 - }, - { - "epoch": 1.6277362231776509, - "grad_norm": 0.77734375, - "learning_rate": 1.3624880990161855e-05, - "loss": 1.1808, - "step": 5182 - }, - { - "epoch": 1.6283644502031922, - "grad_norm": 0.79296875, - "learning_rate": 1.3622342113614726e-05, - "loss": 1.2011, - "step": 5184 - }, - { - "epoch": 1.6289926772287335, - "grad_norm": 0.7734375, - "learning_rate": 1.36198032370676e-05, - "loss": 1.2063, - "step": 5186 - }, - { - "epoch": 1.6296209042542749, - "grad_norm": 0.91015625, - "learning_rate": 1.361726436052047e-05, - "loss": 1.243, - "step": 5188 - }, - { - "epoch": 1.6302491312798162, - "grad_norm": 0.90625, - "learning_rate": 1.3614725483973344e-05, - "loss": 1.3291, - "step": 5190 - }, - { - "epoch": 1.6308773583053577, - "grad_norm": 0.76953125, - "learning_rate": 1.3612186607426214e-05, - "loss": 1.2388, - "step": 5192 - }, - { - "epoch": 1.631505585330899, - "grad_norm": 0.8203125, - "learning_rate": 1.3609647730879088e-05, - "loss": 1.2186, - "step": 5194 - }, - { - "epoch": 1.6321338123564404, - "grad_norm": 0.83203125, - "learning_rate": 1.3607108854331958e-05, - "loss": 1.3375, - "step": 5196 - }, - { - "epoch": 1.6327620393819817, - "grad_norm": 0.7890625, - "learning_rate": 1.3604569977784831e-05, - "loss": 1.1837, - "step": 5198 - }, - { - "epoch": 1.633390266407523, - "grad_norm": 0.87890625, - "learning_rate": 1.3602031101237702e-05, - "loss": 1.1972, - "step": 5200 - }, - { - "epoch": 1.6340184934330644, - "grad_norm": 0.80859375, - "learning_rate": 1.3599492224690576e-05, - "loss": 1.303, - "step": 5202 - }, - { - "epoch": 1.6346467204586057, - "grad_norm": 1.015625, - "learning_rate": 1.3596953348143447e-05, - "loss": 1.2091, - "step": 5204 - }, - { - "epoch": 1.635274947484147, - "grad_norm": 0.7734375, - "learning_rate": 1.359441447159632e-05, - "loss": 1.2598, - "step": 5206 - }, - { - "epoch": 1.6359031745096884, - "grad_norm": 0.8671875, - "learning_rate": 1.3591875595049191e-05, - "loss": 1.0553, - "step": 5208 - }, - { - "epoch": 1.6365314015352297, - "grad_norm": 0.8203125, - "learning_rate": 1.3589336718502064e-05, - "loss": 1.2889, - "step": 5210 - }, - { - "epoch": 1.637159628560771, - "grad_norm": 1.2109375, - "learning_rate": 1.3586797841954936e-05, - "loss": 1.1413, - "step": 5212 - }, - { - "epoch": 1.6377878555863123, - "grad_norm": 0.828125, - "learning_rate": 1.3584258965407809e-05, - "loss": 1.1636, - "step": 5214 - }, - { - "epoch": 1.638416082611854, - "grad_norm": 1.0625, - "learning_rate": 1.3581720088860679e-05, - "loss": 1.276, - "step": 5216 - }, - { - "epoch": 1.6390443096373952, - "grad_norm": 0.80859375, - "learning_rate": 1.3579181212313552e-05, - "loss": 1.305, - "step": 5218 - }, - { - "epoch": 1.6396725366629366, - "grad_norm": 0.7734375, - "learning_rate": 1.3576642335766423e-05, - "loss": 1.201, - "step": 5220 - }, - { - "epoch": 1.6403007636884779, - "grad_norm": 0.80078125, - "learning_rate": 1.3574103459219296e-05, - "loss": 1.4048, - "step": 5222 - }, - { - "epoch": 1.6409289907140194, - "grad_norm": 0.80078125, - "learning_rate": 1.3571564582672168e-05, - "loss": 1.2899, - "step": 5224 - }, - { - "epoch": 1.6415572177395608, - "grad_norm": 0.765625, - "learning_rate": 1.356902570612504e-05, - "loss": 1.2957, - "step": 5226 - }, - { - "epoch": 1.642185444765102, - "grad_norm": 0.7734375, - "learning_rate": 1.3566486829577914e-05, - "loss": 1.1528, - "step": 5228 - }, - { - "epoch": 1.6428136717906434, - "grad_norm": 0.79296875, - "learning_rate": 1.3563947953030785e-05, - "loss": 1.3055, - "step": 5230 - }, - { - "epoch": 1.6434418988161847, - "grad_norm": 0.953125, - "learning_rate": 1.3561409076483658e-05, - "loss": 1.0612, - "step": 5232 - }, - { - "epoch": 1.644070125841726, - "grad_norm": 0.80078125, - "learning_rate": 1.355887019993653e-05, - "loss": 1.1704, - "step": 5234 - }, - { - "epoch": 1.6446983528672674, - "grad_norm": 0.89453125, - "learning_rate": 1.3556331323389403e-05, - "loss": 1.1888, - "step": 5236 - }, - { - "epoch": 1.6453265798928087, - "grad_norm": 0.75390625, - "learning_rate": 1.3553792446842274e-05, - "loss": 1.2332, - "step": 5238 - }, - { - "epoch": 1.64595480691835, - "grad_norm": 0.74609375, - "learning_rate": 1.3551253570295147e-05, - "loss": 1.2469, - "step": 5240 - }, - { - "epoch": 1.6465830339438914, - "grad_norm": 0.80859375, - "learning_rate": 1.3548714693748017e-05, - "loss": 1.2556, - "step": 5242 - }, - { - "epoch": 1.6472112609694327, - "grad_norm": 0.80859375, - "learning_rate": 1.354617581720089e-05, - "loss": 1.2354, - "step": 5244 - }, - { - "epoch": 1.647839487994974, - "grad_norm": 0.75390625, - "learning_rate": 1.3543636940653761e-05, - "loss": 1.4014, - "step": 5246 - }, - { - "epoch": 1.6484677150205156, - "grad_norm": 0.75, - "learning_rate": 1.3541098064106634e-05, - "loss": 1.3373, - "step": 5248 - }, - { - "epoch": 1.649095942046057, - "grad_norm": 0.91015625, - "learning_rate": 1.3538559187559506e-05, - "loss": 1.3094, - "step": 5250 - }, - { - "epoch": 1.6497241690715982, - "grad_norm": 0.765625, - "learning_rate": 1.3536020311012379e-05, - "loss": 1.2154, - "step": 5252 - }, - { - "epoch": 1.6503523960971396, - "grad_norm": 0.7890625, - "learning_rate": 1.353348143446525e-05, - "loss": 1.1919, - "step": 5254 - }, - { - "epoch": 1.6509806231226811, - "grad_norm": 0.80859375, - "learning_rate": 1.3530942557918123e-05, - "loss": 1.2142, - "step": 5256 - }, - { - "epoch": 1.6516088501482225, - "grad_norm": 0.78125, - "learning_rate": 1.3528403681370995e-05, - "loss": 1.2202, - "step": 5258 - }, - { - "epoch": 1.6522370771737638, - "grad_norm": 0.78515625, - "learning_rate": 1.3525864804823868e-05, - "loss": 1.3794, - "step": 5260 - }, - { - "epoch": 1.652865304199305, - "grad_norm": 0.765625, - "learning_rate": 1.3523325928276739e-05, - "loss": 1.1983, - "step": 5262 - }, - { - "epoch": 1.6534935312248464, - "grad_norm": 0.8046875, - "learning_rate": 1.3520787051729612e-05, - "loss": 1.3406, - "step": 5264 - }, - { - "epoch": 1.6541217582503878, - "grad_norm": 0.96484375, - "learning_rate": 1.3518248175182482e-05, - "loss": 1.1838, - "step": 5266 - }, - { - "epoch": 1.654749985275929, - "grad_norm": 0.77734375, - "learning_rate": 1.3515709298635355e-05, - "loss": 1.3651, - "step": 5268 - }, - { - "epoch": 1.6553782123014704, - "grad_norm": 0.703125, - "learning_rate": 1.3513170422088226e-05, - "loss": 1.2919, - "step": 5270 - }, - { - "epoch": 1.6560064393270117, - "grad_norm": 0.71875, - "learning_rate": 1.35106315455411e-05, - "loss": 1.3154, - "step": 5272 - }, - { - "epoch": 1.656634666352553, - "grad_norm": 0.8125, - "learning_rate": 1.350809266899397e-05, - "loss": 1.2643, - "step": 5274 - }, - { - "epoch": 1.6572628933780944, - "grad_norm": 0.77734375, - "learning_rate": 1.3505553792446844e-05, - "loss": 1.3239, - "step": 5276 - }, - { - "epoch": 1.6578911204036357, - "grad_norm": 0.9140625, - "learning_rate": 1.3503014915899715e-05, - "loss": 1.2137, - "step": 5278 - }, - { - "epoch": 1.658519347429177, - "grad_norm": 0.82421875, - "learning_rate": 1.3500476039352588e-05, - "loss": 1.2215, - "step": 5280 - }, - { - "epoch": 1.6591475744547186, - "grad_norm": 0.953125, - "learning_rate": 1.349793716280546e-05, - "loss": 1.1606, - "step": 5282 - }, - { - "epoch": 1.65977580148026, - "grad_norm": 0.7734375, - "learning_rate": 1.3495398286258333e-05, - "loss": 1.2496, - "step": 5284 - }, - { - "epoch": 1.6604040285058013, - "grad_norm": 0.73828125, - "learning_rate": 1.3492859409711202e-05, - "loss": 1.1227, - "step": 5286 - }, - { - "epoch": 1.6610322555313426, - "grad_norm": 0.75, - "learning_rate": 1.3490320533164077e-05, - "loss": 1.2112, - "step": 5288 - }, - { - "epoch": 1.6616604825568841, - "grad_norm": 0.79296875, - "learning_rate": 1.3487781656616947e-05, - "loss": 1.206, - "step": 5290 - }, - { - "epoch": 1.6622887095824255, - "grad_norm": 0.90625, - "learning_rate": 1.348524278006982e-05, - "loss": 1.2691, - "step": 5292 - }, - { - "epoch": 1.6629169366079668, - "grad_norm": 0.81640625, - "learning_rate": 1.3482703903522691e-05, - "loss": 1.2359, - "step": 5294 - }, - { - "epoch": 1.6635451636335081, - "grad_norm": 0.765625, - "learning_rate": 1.3480165026975564e-05, - "loss": 1.2491, - "step": 5296 - }, - { - "epoch": 1.6641733906590495, - "grad_norm": 0.8984375, - "learning_rate": 1.3477626150428436e-05, - "loss": 1.1253, - "step": 5298 - }, - { - "epoch": 1.6648016176845908, - "grad_norm": 0.79296875, - "learning_rate": 1.3475087273881309e-05, - "loss": 1.2954, - "step": 5300 - }, - { - "epoch": 1.6654298447101321, - "grad_norm": 0.88671875, - "learning_rate": 1.347254839733418e-05, - "loss": 1.178, - "step": 5302 - }, - { - "epoch": 1.6660580717356734, - "grad_norm": 0.81640625, - "learning_rate": 1.3470009520787053e-05, - "loss": 1.2116, - "step": 5304 - }, - { - "epoch": 1.6666862987612148, - "grad_norm": 0.90234375, - "learning_rate": 1.3467470644239925e-05, - "loss": 1.2565, - "step": 5306 - }, - { - "epoch": 1.667314525786756, - "grad_norm": 0.8984375, - "learning_rate": 1.3464931767692798e-05, - "loss": 1.0026, - "step": 5308 - }, - { - "epoch": 1.6679427528122974, - "grad_norm": 0.83203125, - "learning_rate": 1.3462392891145667e-05, - "loss": 1.292, - "step": 5310 - }, - { - "epoch": 1.6685709798378388, - "grad_norm": 0.86328125, - "learning_rate": 1.345985401459854e-05, - "loss": 1.104, - "step": 5312 - }, - { - "epoch": 1.6691992068633803, - "grad_norm": 0.828125, - "learning_rate": 1.3457315138051414e-05, - "loss": 1.2361, - "step": 5314 - }, - { - "epoch": 1.6698274338889216, - "grad_norm": 0.90234375, - "learning_rate": 1.3454776261504285e-05, - "loss": 1.2665, - "step": 5316 - }, - { - "epoch": 1.670455660914463, - "grad_norm": 0.8125, - "learning_rate": 1.3452237384957158e-05, - "loss": 1.2824, - "step": 5318 - }, - { - "epoch": 1.6710838879400043, - "grad_norm": 0.8515625, - "learning_rate": 1.344969850841003e-05, - "loss": 1.1185, - "step": 5320 - }, - { - "epoch": 1.6717121149655458, - "grad_norm": 0.77734375, - "learning_rate": 1.3447159631862903e-05, - "loss": 1.2738, - "step": 5322 - }, - { - "epoch": 1.6723403419910872, - "grad_norm": 0.77734375, - "learning_rate": 1.3444620755315774e-05, - "loss": 1.2713, - "step": 5324 - }, - { - "epoch": 1.6729685690166285, - "grad_norm": 0.91015625, - "learning_rate": 1.3442081878768647e-05, - "loss": 1.1093, - "step": 5326 - }, - { - "epoch": 1.6735967960421698, - "grad_norm": 0.86328125, - "learning_rate": 1.3439543002221518e-05, - "loss": 1.3263, - "step": 5328 - }, - { - "epoch": 1.6742250230677111, - "grad_norm": 0.76953125, - "learning_rate": 1.3437004125674391e-05, - "loss": 1.3777, - "step": 5330 - }, - { - "epoch": 1.6748532500932525, - "grad_norm": 1.0546875, - "learning_rate": 1.3434465249127263e-05, - "loss": 1.1151, - "step": 5332 - }, - { - "epoch": 1.6754814771187938, - "grad_norm": 0.77734375, - "learning_rate": 1.3431926372580136e-05, - "loss": 1.5733, - "step": 5334 - }, - { - "epoch": 1.6761097041443351, - "grad_norm": 0.796875, - "learning_rate": 1.3429387496033006e-05, - "loss": 1.1432, - "step": 5336 - }, - { - "epoch": 1.6767379311698765, - "grad_norm": 0.8671875, - "learning_rate": 1.3426848619485879e-05, - "loss": 1.3853, - "step": 5338 - }, - { - "epoch": 1.6773661581954178, - "grad_norm": 0.78125, - "learning_rate": 1.342430974293875e-05, - "loss": 1.3414, - "step": 5340 - }, - { - "epoch": 1.6779943852209591, - "grad_norm": 0.85546875, - "learning_rate": 1.3421770866391623e-05, - "loss": 1.2539, - "step": 5342 - }, - { - "epoch": 1.6786226122465004, - "grad_norm": 0.99609375, - "learning_rate": 1.3419231989844494e-05, - "loss": 1.2596, - "step": 5344 - }, - { - "epoch": 1.6792508392720418, - "grad_norm": 0.890625, - "learning_rate": 1.3416693113297368e-05, - "loss": 1.1853, - "step": 5346 - }, - { - "epoch": 1.6798790662975833, - "grad_norm": 0.76953125, - "learning_rate": 1.3414154236750239e-05, - "loss": 1.3223, - "step": 5348 - }, - { - "epoch": 1.6805072933231247, - "grad_norm": 0.83203125, - "learning_rate": 1.3411615360203112e-05, - "loss": 1.3358, - "step": 5350 - }, - { - "epoch": 1.681135520348666, - "grad_norm": 0.796875, - "learning_rate": 1.3409076483655983e-05, - "loss": 1.1666, - "step": 5352 - }, - { - "epoch": 1.6817637473742073, - "grad_norm": 0.9453125, - "learning_rate": 1.3406537607108856e-05, - "loss": 1.2604, - "step": 5354 - }, - { - "epoch": 1.6823919743997489, - "grad_norm": 0.76953125, - "learning_rate": 1.3403998730561726e-05, - "loss": 1.2772, - "step": 5356 - }, - { - "epoch": 1.6830202014252902, - "grad_norm": 0.87109375, - "learning_rate": 1.3401459854014601e-05, - "loss": 1.2507, - "step": 5358 - }, - { - "epoch": 1.6836484284508315, - "grad_norm": 0.82421875, - "learning_rate": 1.339892097746747e-05, - "loss": 1.1198, - "step": 5360 - }, - { - "epoch": 1.6842766554763728, - "grad_norm": 0.79296875, - "learning_rate": 1.3396382100920344e-05, - "loss": 1.3049, - "step": 5362 - }, - { - "epoch": 1.6849048825019142, - "grad_norm": 0.828125, - "learning_rate": 1.3393843224373215e-05, - "loss": 1.1426, - "step": 5364 - }, - { - "epoch": 1.6855331095274555, - "grad_norm": 0.83984375, - "learning_rate": 1.3391304347826088e-05, - "loss": 1.1442, - "step": 5366 - }, - { - "epoch": 1.6861613365529968, - "grad_norm": 0.8359375, - "learning_rate": 1.338876547127896e-05, - "loss": 1.2436, - "step": 5368 - }, - { - "epoch": 1.6867895635785382, - "grad_norm": 0.7734375, - "learning_rate": 1.3386226594731833e-05, - "loss": 1.2264, - "step": 5370 - }, - { - "epoch": 1.6874177906040795, - "grad_norm": 0.828125, - "learning_rate": 1.3383687718184704e-05, - "loss": 1.2157, - "step": 5372 - }, - { - "epoch": 1.6880460176296208, - "grad_norm": 0.77734375, - "learning_rate": 1.3381148841637577e-05, - "loss": 1.2976, - "step": 5374 - }, - { - "epoch": 1.6886742446551621, - "grad_norm": 0.81640625, - "learning_rate": 1.3378609965090448e-05, - "loss": 1.2154, - "step": 5376 - }, - { - "epoch": 1.6893024716807035, - "grad_norm": 0.7265625, - "learning_rate": 1.3376071088543322e-05, - "loss": 1.1889, - "step": 5378 - }, - { - "epoch": 1.689930698706245, - "grad_norm": 0.83203125, - "learning_rate": 1.3373532211996191e-05, - "loss": 1.2453, - "step": 5380 - }, - { - "epoch": 1.6905589257317863, - "grad_norm": 0.796875, - "learning_rate": 1.3370993335449064e-05, - "loss": 1.1871, - "step": 5382 - }, - { - "epoch": 1.6911871527573277, - "grad_norm": 0.74609375, - "learning_rate": 1.3368454458901936e-05, - "loss": 1.3467, - "step": 5384 - }, - { - "epoch": 1.691815379782869, - "grad_norm": 0.77734375, - "learning_rate": 1.3365915582354809e-05, - "loss": 1.3004, - "step": 5386 - }, - { - "epoch": 1.6924436068084106, - "grad_norm": 0.9140625, - "learning_rate": 1.336337670580768e-05, - "loss": 1.3259, - "step": 5388 - }, - { - "epoch": 1.6930718338339519, - "grad_norm": 0.85546875, - "learning_rate": 1.3360837829260553e-05, - "loss": 1.2515, - "step": 5390 - }, - { - "epoch": 1.6937000608594932, - "grad_norm": 0.80859375, - "learning_rate": 1.3358298952713425e-05, - "loss": 1.2712, - "step": 5392 - }, - { - "epoch": 1.6943282878850345, - "grad_norm": 0.76171875, - "learning_rate": 1.3355760076166298e-05, - "loss": 1.2064, - "step": 5394 - }, - { - "epoch": 1.6949565149105759, - "grad_norm": 0.75, - "learning_rate": 1.3353221199619169e-05, - "loss": 1.3884, - "step": 5396 - }, - { - "epoch": 1.6955847419361172, - "grad_norm": 0.796875, - "learning_rate": 1.3350682323072042e-05, - "loss": 1.24, - "step": 5398 - }, - { - "epoch": 1.6962129689616585, - "grad_norm": 0.77734375, - "learning_rate": 1.3348143446524915e-05, - "loss": 1.2544, - "step": 5400 - }, - { - "epoch": 1.6968411959871998, - "grad_norm": 0.83984375, - "learning_rate": 1.3345604569977787e-05, - "loss": 1.2522, - "step": 5402 - }, - { - "epoch": 1.6974694230127412, - "grad_norm": 0.83984375, - "learning_rate": 1.334306569343066e-05, - "loss": 1.2362, - "step": 5404 - }, - { - "epoch": 1.6980976500382825, - "grad_norm": 0.9375, - "learning_rate": 1.334052681688353e-05, - "loss": 1.3574, - "step": 5406 - }, - { - "epoch": 1.6987258770638238, - "grad_norm": 0.765625, - "learning_rate": 1.3337987940336402e-05, - "loss": 1.274, - "step": 5408 - }, - { - "epoch": 1.6993541040893652, - "grad_norm": 0.87890625, - "learning_rate": 1.3335449063789274e-05, - "loss": 1.2817, - "step": 5410 - }, - { - "epoch": 1.6999823311149065, - "grad_norm": 0.765625, - "learning_rate": 1.3332910187242147e-05, - "loss": 1.3324, - "step": 5412 - }, - { - "epoch": 1.700610558140448, - "grad_norm": 0.76953125, - "learning_rate": 1.3330371310695018e-05, - "loss": 1.2094, - "step": 5414 - }, - { - "epoch": 1.7012387851659894, - "grad_norm": 0.8203125, - "learning_rate": 1.3327832434147891e-05, - "loss": 1.1927, - "step": 5416 - }, - { - "epoch": 1.7018670121915307, - "grad_norm": 0.8203125, - "learning_rate": 1.3325293557600763e-05, - "loss": 1.166, - "step": 5418 - }, - { - "epoch": 1.702495239217072, - "grad_norm": 0.859375, - "learning_rate": 1.3322754681053636e-05, - "loss": 1.3582, - "step": 5420 - }, - { - "epoch": 1.7031234662426136, - "grad_norm": 0.80078125, - "learning_rate": 1.3320215804506507e-05, - "loss": 1.3184, - "step": 5422 - }, - { - "epoch": 1.703751693268155, - "grad_norm": 0.8046875, - "learning_rate": 1.331767692795938e-05, - "loss": 1.2835, - "step": 5424 - }, - { - "epoch": 1.7043799202936962, - "grad_norm": 0.80859375, - "learning_rate": 1.3315138051412252e-05, - "loss": 1.2446, - "step": 5426 - }, - { - "epoch": 1.7050081473192376, - "grad_norm": 0.8359375, - "learning_rate": 1.3312599174865125e-05, - "loss": 1.2357, - "step": 5428 - }, - { - "epoch": 1.7056363743447789, - "grad_norm": 0.921875, - "learning_rate": 1.3310060298317994e-05, - "loss": 1.2273, - "step": 5430 - }, - { - "epoch": 1.7062646013703202, - "grad_norm": 0.8203125, - "learning_rate": 1.3307521421770867e-05, - "loss": 1.2684, - "step": 5432 - }, - { - "epoch": 1.7068928283958615, - "grad_norm": 0.85546875, - "learning_rate": 1.3304982545223739e-05, - "loss": 1.2107, - "step": 5434 - }, - { - "epoch": 1.7075210554214029, - "grad_norm": 0.77734375, - "learning_rate": 1.3302443668676612e-05, - "loss": 1.3441, - "step": 5436 - }, - { - "epoch": 1.7081492824469442, - "grad_norm": 0.69921875, - "learning_rate": 1.3299904792129483e-05, - "loss": 1.2601, - "step": 5438 - }, - { - "epoch": 1.7087775094724855, - "grad_norm": 0.78515625, - "learning_rate": 1.3297365915582356e-05, - "loss": 1.2648, - "step": 5440 - }, - { - "epoch": 1.7094057364980269, - "grad_norm": 0.80078125, - "learning_rate": 1.3294827039035228e-05, - "loss": 1.1744, - "step": 5442 - }, - { - "epoch": 1.7100339635235682, - "grad_norm": 0.8203125, - "learning_rate": 1.3292288162488101e-05, - "loss": 1.0949, - "step": 5444 - }, - { - "epoch": 1.7106621905491097, - "grad_norm": 0.8515625, - "learning_rate": 1.3289749285940972e-05, - "loss": 1.1723, - "step": 5446 - }, - { - "epoch": 1.711290417574651, - "grad_norm": 0.86328125, - "learning_rate": 1.3287210409393845e-05, - "loss": 1.2938, - "step": 5448 - }, - { - "epoch": 1.7119186446001924, - "grad_norm": 0.81640625, - "learning_rate": 1.3284671532846715e-05, - "loss": 1.3799, - "step": 5450 - }, - { - "epoch": 1.7125468716257337, - "grad_norm": 0.77734375, - "learning_rate": 1.3282132656299588e-05, - "loss": 1.0393, - "step": 5452 - }, - { - "epoch": 1.7131750986512753, - "grad_norm": 0.75390625, - "learning_rate": 1.327959377975246e-05, - "loss": 1.0883, - "step": 5454 - }, - { - "epoch": 1.7138033256768166, - "grad_norm": 0.82421875, - "learning_rate": 1.3277054903205333e-05, - "loss": 1.1746, - "step": 5456 - }, - { - "epoch": 1.714431552702358, - "grad_norm": 0.8984375, - "learning_rate": 1.3274516026658204e-05, - "loss": 1.2404, - "step": 5458 - }, - { - "epoch": 1.7150597797278992, - "grad_norm": 0.84765625, - "learning_rate": 1.3271977150111077e-05, - "loss": 1.1545, - "step": 5460 - }, - { - "epoch": 1.7156880067534406, - "grad_norm": 0.8828125, - "learning_rate": 1.3269438273563948e-05, - "loss": 1.3406, - "step": 5462 - }, - { - "epoch": 1.716316233778982, - "grad_norm": 0.8125, - "learning_rate": 1.3266899397016821e-05, - "loss": 1.2526, - "step": 5464 - }, - { - "epoch": 1.7169444608045232, - "grad_norm": 0.8046875, - "learning_rate": 1.3264360520469693e-05, - "loss": 1.2369, - "step": 5466 - }, - { - "epoch": 1.7175726878300646, - "grad_norm": 0.84375, - "learning_rate": 1.3261821643922566e-05, - "loss": 1.2515, - "step": 5468 - }, - { - "epoch": 1.7182009148556059, - "grad_norm": 0.79296875, - "learning_rate": 1.3259282767375437e-05, - "loss": 1.1968, - "step": 5470 - }, - { - "epoch": 1.7188291418811472, - "grad_norm": 0.84375, - "learning_rate": 1.325674389082831e-05, - "loss": 1.2784, - "step": 5472 - }, - { - "epoch": 1.7194573689066885, - "grad_norm": 0.8125, - "learning_rate": 1.325420501428118e-05, - "loss": 1.2446, - "step": 5474 - }, - { - "epoch": 1.7200855959322299, - "grad_norm": 1.2421875, - "learning_rate": 1.3251666137734053e-05, - "loss": 1.1592, - "step": 5476 - }, - { - "epoch": 1.7207138229577712, - "grad_norm": 0.80859375, - "learning_rate": 1.3249127261186925e-05, - "loss": 1.2955, - "step": 5478 - }, - { - "epoch": 1.7213420499833127, - "grad_norm": 0.76171875, - "learning_rate": 1.3246588384639798e-05, - "loss": 1.334, - "step": 5480 - }, - { - "epoch": 1.721970277008854, - "grad_norm": 0.8359375, - "learning_rate": 1.3244049508092669e-05, - "loss": 1.114, - "step": 5482 - }, - { - "epoch": 1.7225985040343954, - "grad_norm": 0.859375, - "learning_rate": 1.3241510631545542e-05, - "loss": 1.0863, - "step": 5484 - }, - { - "epoch": 1.7232267310599367, - "grad_norm": 0.78515625, - "learning_rate": 1.3238971754998415e-05, - "loss": 1.3088, - "step": 5486 - }, - { - "epoch": 1.7238549580854783, - "grad_norm": 0.765625, - "learning_rate": 1.3236432878451287e-05, - "loss": 1.0641, - "step": 5488 - }, - { - "epoch": 1.7244831851110196, - "grad_norm": 0.796875, - "learning_rate": 1.323389400190416e-05, - "loss": 1.3067, - "step": 5490 - }, - { - "epoch": 1.725111412136561, - "grad_norm": 0.80859375, - "learning_rate": 1.3231355125357031e-05, - "loss": 1.2372, - "step": 5492 - }, - { - "epoch": 1.7257396391621023, - "grad_norm": 0.90234375, - "learning_rate": 1.3228816248809904e-05, - "loss": 1.0898, - "step": 5494 - }, - { - "epoch": 1.7263678661876436, - "grad_norm": 0.80078125, - "learning_rate": 1.3226277372262775e-05, - "loss": 1.297, - "step": 5496 - }, - { - "epoch": 1.726996093213185, - "grad_norm": 0.72265625, - "learning_rate": 1.3223738495715649e-05, - "loss": 1.2627, - "step": 5498 - }, - { - "epoch": 1.7276243202387263, - "grad_norm": 0.7421875, - "learning_rate": 1.3221199619168518e-05, - "loss": 1.3876, - "step": 5500 - }, - { - "epoch": 1.7282525472642676, - "grad_norm": 0.85546875, - "learning_rate": 1.3218660742621391e-05, - "loss": 1.1941, - "step": 5502 - }, - { - "epoch": 1.728880774289809, - "grad_norm": 0.7578125, - "learning_rate": 1.3216121866074263e-05, - "loss": 1.2344, - "step": 5504 - }, - { - "epoch": 1.7295090013153502, - "grad_norm": 0.86328125, - "learning_rate": 1.3213582989527136e-05, - "loss": 1.1713, - "step": 5506 - }, - { - "epoch": 1.7301372283408916, - "grad_norm": 0.80859375, - "learning_rate": 1.3211044112980007e-05, - "loss": 1.242, - "step": 5508 - }, - { - "epoch": 1.730765455366433, - "grad_norm": 0.74609375, - "learning_rate": 1.320850523643288e-05, - "loss": 1.1983, - "step": 5510 - }, - { - "epoch": 1.7313936823919744, - "grad_norm": 0.80078125, - "learning_rate": 1.3205966359885752e-05, - "loss": 1.2646, - "step": 5512 - }, - { - "epoch": 1.7320219094175158, - "grad_norm": 0.80078125, - "learning_rate": 1.3203427483338625e-05, - "loss": 1.3675, - "step": 5514 - }, - { - "epoch": 1.732650136443057, - "grad_norm": 0.82421875, - "learning_rate": 1.3200888606791496e-05, - "loss": 1.2915, - "step": 5516 - }, - { - "epoch": 1.7332783634685984, - "grad_norm": 0.8046875, - "learning_rate": 1.3198349730244369e-05, - "loss": 1.3231, - "step": 5518 - }, - { - "epoch": 1.73390659049414, - "grad_norm": 0.8203125, - "learning_rate": 1.3195810853697239e-05, - "loss": 1.228, - "step": 5520 - }, - { - "epoch": 1.7345348175196813, - "grad_norm": 0.91796875, - "learning_rate": 1.3193271977150114e-05, - "loss": 1.154, - "step": 5522 - }, - { - "epoch": 1.7351630445452226, - "grad_norm": 0.82421875, - "learning_rate": 1.3190733100602983e-05, - "loss": 1.2426, - "step": 5524 - }, - { - "epoch": 1.735791271570764, - "grad_norm": 0.76953125, - "learning_rate": 1.3188194224055856e-05, - "loss": 1.1921, - "step": 5526 - }, - { - "epoch": 1.7364194985963053, - "grad_norm": 0.90625, - "learning_rate": 1.3185655347508728e-05, - "loss": 1.2466, - "step": 5528 - }, - { - "epoch": 1.7370477256218466, - "grad_norm": 0.7578125, - "learning_rate": 1.31831164709616e-05, - "loss": 1.2553, - "step": 5530 - }, - { - "epoch": 1.737675952647388, - "grad_norm": 0.765625, - "learning_rate": 1.3180577594414472e-05, - "loss": 1.2312, - "step": 5532 - }, - { - "epoch": 1.7383041796729293, - "grad_norm": 0.765625, - "learning_rate": 1.3178038717867345e-05, - "loss": 1.2277, - "step": 5534 - }, - { - "epoch": 1.7389324066984706, - "grad_norm": 0.84375, - "learning_rate": 1.3175499841320217e-05, - "loss": 1.2192, - "step": 5536 - }, - { - "epoch": 1.739560633724012, - "grad_norm": 0.796875, - "learning_rate": 1.317296096477309e-05, - "loss": 1.1536, - "step": 5538 - }, - { - "epoch": 1.7401888607495533, - "grad_norm": 0.890625, - "learning_rate": 1.3170422088225961e-05, - "loss": 1.16, - "step": 5540 - }, - { - "epoch": 1.7408170877750946, - "grad_norm": 0.90625, - "learning_rate": 1.3167883211678834e-05, - "loss": 1.472, - "step": 5542 - }, - { - "epoch": 1.7414453148006361, - "grad_norm": 0.796875, - "learning_rate": 1.3165344335131704e-05, - "loss": 1.3138, - "step": 5544 - }, - { - "epoch": 1.7420735418261775, - "grad_norm": 0.79296875, - "learning_rate": 1.3162805458584577e-05, - "loss": 1.2316, - "step": 5546 - }, - { - "epoch": 1.7427017688517188, - "grad_norm": 0.8359375, - "learning_rate": 1.3160266582037448e-05, - "loss": 1.2371, - "step": 5548 - }, - { - "epoch": 1.7433299958772601, - "grad_norm": 0.8125, - "learning_rate": 1.3157727705490321e-05, - "loss": 1.302, - "step": 5550 - }, - { - "epoch": 1.7439582229028014, - "grad_norm": 0.83203125, - "learning_rate": 1.3155188828943193e-05, - "loss": 1.1977, - "step": 5552 - }, - { - "epoch": 1.744586449928343, - "grad_norm": 0.84765625, - "learning_rate": 1.3152649952396066e-05, - "loss": 1.234, - "step": 5554 - }, - { - "epoch": 1.7452146769538843, - "grad_norm": 0.80859375, - "learning_rate": 1.3150111075848937e-05, - "loss": 1.2929, - "step": 5556 - }, - { - "epoch": 1.7458429039794257, - "grad_norm": 0.77734375, - "learning_rate": 1.314757219930181e-05, - "loss": 1.2919, - "step": 5558 - }, - { - "epoch": 1.746471131004967, - "grad_norm": 0.71875, - "learning_rate": 1.3145033322754682e-05, - "loss": 1.3428, - "step": 5560 - }, - { - "epoch": 1.7470993580305083, - "grad_norm": 0.875, - "learning_rate": 1.3142494446207555e-05, - "loss": 1.1509, - "step": 5562 - }, - { - "epoch": 1.7477275850560496, - "grad_norm": 0.96875, - "learning_rate": 1.3139955569660424e-05, - "loss": 1.1809, - "step": 5564 - }, - { - "epoch": 1.748355812081591, - "grad_norm": 0.78515625, - "learning_rate": 1.31374166931133e-05, - "loss": 1.2189, - "step": 5566 - }, - { - "epoch": 1.7489840391071323, - "grad_norm": 0.85546875, - "learning_rate": 1.3134877816566169e-05, - "loss": 1.1861, - "step": 5568 - }, - { - "epoch": 1.7496122661326736, - "grad_norm": 0.875, - "learning_rate": 1.3132338940019042e-05, - "loss": 1.1364, - "step": 5570 - }, - { - "epoch": 1.750240493158215, - "grad_norm": 0.77734375, - "learning_rate": 1.3129800063471915e-05, - "loss": 1.1878, - "step": 5572 - }, - { - "epoch": 1.7508687201837563, - "grad_norm": 0.76953125, - "learning_rate": 1.3127261186924786e-05, - "loss": 1.1927, - "step": 5574 - }, - { - "epoch": 1.7514969472092976, - "grad_norm": 0.8359375, - "learning_rate": 1.312472231037766e-05, - "loss": 1.1183, - "step": 5576 - }, - { - "epoch": 1.7521251742348392, - "grad_norm": 0.80078125, - "learning_rate": 1.3122183433830531e-05, - "loss": 1.293, - "step": 5578 - }, - { - "epoch": 1.7527534012603805, - "grad_norm": 0.7890625, - "learning_rate": 1.3119644557283404e-05, - "loss": 1.2526, - "step": 5580 - }, - { - "epoch": 1.7533816282859218, - "grad_norm": 0.83203125, - "learning_rate": 1.3117105680736275e-05, - "loss": 1.1872, - "step": 5582 - }, - { - "epoch": 1.7540098553114631, - "grad_norm": 0.88671875, - "learning_rate": 1.3114566804189148e-05, - "loss": 1.2216, - "step": 5584 - }, - { - "epoch": 1.7546380823370047, - "grad_norm": 0.734375, - "learning_rate": 1.311202792764202e-05, - "loss": 1.1968, - "step": 5586 - }, - { - "epoch": 1.755266309362546, - "grad_norm": 0.80078125, - "learning_rate": 1.3109489051094893e-05, - "loss": 1.3127, - "step": 5588 - }, - { - "epoch": 1.7558945363880873, - "grad_norm": 0.81640625, - "learning_rate": 1.3106950174547763e-05, - "loss": 1.143, - "step": 5590 - }, - { - "epoch": 1.7565227634136287, - "grad_norm": 1.0546875, - "learning_rate": 1.3104411298000637e-05, - "loss": 1.2036, - "step": 5592 - }, - { - "epoch": 1.75715099043917, - "grad_norm": 0.875, - "learning_rate": 1.3101872421453507e-05, - "loss": 1.3434, - "step": 5594 - }, - { - "epoch": 1.7577792174647113, - "grad_norm": 0.97265625, - "learning_rate": 1.309933354490638e-05, - "loss": 1.1842, - "step": 5596 - }, - { - "epoch": 1.7584074444902527, - "grad_norm": 0.796875, - "learning_rate": 1.3096794668359251e-05, - "loss": 1.2543, - "step": 5598 - }, - { - "epoch": 1.759035671515794, - "grad_norm": 0.91015625, - "learning_rate": 1.3094255791812125e-05, - "loss": 1.2244, - "step": 5600 - }, - { - "epoch": 1.7596638985413353, - "grad_norm": 0.8046875, - "learning_rate": 1.3091716915264996e-05, - "loss": 1.1647, - "step": 5602 - }, - { - "epoch": 1.7602921255668766, - "grad_norm": 0.79296875, - "learning_rate": 1.3089178038717869e-05, - "loss": 1.3046, - "step": 5604 - }, - { - "epoch": 1.760920352592418, - "grad_norm": 0.765625, - "learning_rate": 1.308663916217074e-05, - "loss": 1.1823, - "step": 5606 - }, - { - "epoch": 1.7615485796179593, - "grad_norm": 0.875, - "learning_rate": 1.3084100285623613e-05, - "loss": 1.3261, - "step": 5608 - }, - { - "epoch": 1.7621768066435008, - "grad_norm": 0.73828125, - "learning_rate": 1.3081561409076485e-05, - "loss": 1.2197, - "step": 5610 - }, - { - "epoch": 1.7628050336690422, - "grad_norm": 0.75, - "learning_rate": 1.3079022532529358e-05, - "loss": 1.3214, - "step": 5612 - }, - { - "epoch": 1.7634332606945835, - "grad_norm": 0.76171875, - "learning_rate": 1.3076483655982228e-05, - "loss": 1.3303, - "step": 5614 - }, - { - "epoch": 1.7640614877201248, - "grad_norm": 0.796875, - "learning_rate": 1.30739447794351e-05, - "loss": 1.2344, - "step": 5616 - }, - { - "epoch": 1.7646897147456662, - "grad_norm": 0.90625, - "learning_rate": 1.3071405902887972e-05, - "loss": 1.2934, - "step": 5618 - }, - { - "epoch": 1.7653179417712077, - "grad_norm": 0.8203125, - "learning_rate": 1.3068867026340845e-05, - "loss": 1.1482, - "step": 5620 - }, - { - "epoch": 1.765946168796749, - "grad_norm": 0.70703125, - "learning_rate": 1.3066328149793717e-05, - "loss": 1.2069, - "step": 5622 - }, - { - "epoch": 1.7665743958222904, - "grad_norm": 0.82421875, - "learning_rate": 1.306378927324659e-05, - "loss": 1.1891, - "step": 5624 - }, - { - "epoch": 1.7672026228478317, - "grad_norm": 0.71484375, - "learning_rate": 1.3061250396699461e-05, - "loss": 1.3099, - "step": 5626 - }, - { - "epoch": 1.767830849873373, - "grad_norm": 0.7578125, - "learning_rate": 1.3058711520152334e-05, - "loss": 1.2631, - "step": 5628 - }, - { - "epoch": 1.7684590768989144, - "grad_norm": 0.8125, - "learning_rate": 1.3056172643605205e-05, - "loss": 1.2516, - "step": 5630 - }, - { - "epoch": 1.7690873039244557, - "grad_norm": 0.8046875, - "learning_rate": 1.3053633767058079e-05, - "loss": 1.2557, - "step": 5632 - }, - { - "epoch": 1.769715530949997, - "grad_norm": 0.92578125, - "learning_rate": 1.305109489051095e-05, - "loss": 1.2499, - "step": 5634 - }, - { - "epoch": 1.7703437579755383, - "grad_norm": 0.765625, - "learning_rate": 1.3048556013963823e-05, - "loss": 1.3002, - "step": 5636 - }, - { - "epoch": 1.7709719850010797, - "grad_norm": 0.82421875, - "learning_rate": 1.3046017137416693e-05, - "loss": 1.2968, - "step": 5638 - }, - { - "epoch": 1.771600212026621, - "grad_norm": 0.8125, - "learning_rate": 1.3043478260869566e-05, - "loss": 1.3062, - "step": 5640 - }, - { - "epoch": 1.7722284390521623, - "grad_norm": 0.765625, - "learning_rate": 1.3040939384322437e-05, - "loss": 1.1898, - "step": 5642 - }, - { - "epoch": 1.7728566660777039, - "grad_norm": 0.8203125, - "learning_rate": 1.303840050777531e-05, - "loss": 1.3012, - "step": 5644 - }, - { - "epoch": 1.7734848931032452, - "grad_norm": 0.82421875, - "learning_rate": 1.3035861631228182e-05, - "loss": 1.1432, - "step": 5646 - }, - { - "epoch": 1.7741131201287865, - "grad_norm": 0.75390625, - "learning_rate": 1.3033322754681055e-05, - "loss": 1.2746, - "step": 5648 - }, - { - "epoch": 1.7747413471543279, - "grad_norm": 0.8359375, - "learning_rate": 1.3030783878133926e-05, - "loss": 1.2069, - "step": 5650 - }, - { - "epoch": 1.7753695741798694, - "grad_norm": 0.8125, - "learning_rate": 1.3028245001586799e-05, - "loss": 1.4756, - "step": 5652 - }, - { - "epoch": 1.7759978012054107, - "grad_norm": 0.79296875, - "learning_rate": 1.302570612503967e-05, - "loss": 1.2794, - "step": 5654 - }, - { - "epoch": 1.776626028230952, - "grad_norm": 0.8125, - "learning_rate": 1.3023167248492544e-05, - "loss": 1.4529, - "step": 5656 - }, - { - "epoch": 1.7772542552564934, - "grad_norm": 0.74609375, - "learning_rate": 1.3020628371945417e-05, - "loss": 1.2728, - "step": 5658 - }, - { - "epoch": 1.7778824822820347, - "grad_norm": 0.796875, - "learning_rate": 1.3018089495398288e-05, - "loss": 1.2532, - "step": 5660 - }, - { - "epoch": 1.778510709307576, - "grad_norm": 0.76171875, - "learning_rate": 1.3015550618851161e-05, - "loss": 1.1706, - "step": 5662 - }, - { - "epoch": 1.7791389363331174, - "grad_norm": 0.78125, - "learning_rate": 1.301301174230403e-05, - "loss": 1.3033, - "step": 5664 - }, - { - "epoch": 1.7797671633586587, - "grad_norm": 0.74609375, - "learning_rate": 1.3010472865756904e-05, - "loss": 1.2695, - "step": 5666 - }, - { - "epoch": 1.7803953903842, - "grad_norm": 0.84375, - "learning_rate": 1.3007933989209775e-05, - "loss": 1.1627, - "step": 5668 - }, - { - "epoch": 1.7810236174097414, - "grad_norm": 0.87109375, - "learning_rate": 1.3005395112662648e-05, - "loss": 1.2485, - "step": 5670 - }, - { - "epoch": 1.7816518444352827, - "grad_norm": 0.77734375, - "learning_rate": 1.300285623611552e-05, - "loss": 1.2374, - "step": 5672 - }, - { - "epoch": 1.782280071460824, - "grad_norm": 0.76953125, - "learning_rate": 1.3000317359568393e-05, - "loss": 1.302, - "step": 5674 - }, - { - "epoch": 1.7829082984863656, - "grad_norm": 0.8828125, - "learning_rate": 1.2997778483021264e-05, - "loss": 1.3329, - "step": 5676 - }, - { - "epoch": 1.783536525511907, - "grad_norm": 0.77734375, - "learning_rate": 1.2995239606474137e-05, - "loss": 1.2451, - "step": 5678 - }, - { - "epoch": 1.7841647525374482, - "grad_norm": 0.81640625, - "learning_rate": 1.2992700729927009e-05, - "loss": 1.2296, - "step": 5680 - }, - { - "epoch": 1.7847929795629895, - "grad_norm": 0.76171875, - "learning_rate": 1.2990161853379882e-05, - "loss": 1.2764, - "step": 5682 - }, - { - "epoch": 1.785421206588531, - "grad_norm": 0.76953125, - "learning_rate": 1.2987622976832751e-05, - "loss": 1.3022, - "step": 5684 - }, - { - "epoch": 1.7860494336140724, - "grad_norm": 0.78515625, - "learning_rate": 1.2985084100285626e-05, - "loss": 1.1862, - "step": 5686 - }, - { - "epoch": 1.7866776606396138, - "grad_norm": 0.9375, - "learning_rate": 1.2982545223738496e-05, - "loss": 1.097, - "step": 5688 - }, - { - "epoch": 1.787305887665155, - "grad_norm": 0.7421875, - "learning_rate": 1.2980006347191369e-05, - "loss": 1.3138, - "step": 5690 - }, - { - "epoch": 1.7879341146906964, - "grad_norm": 0.73828125, - "learning_rate": 1.297746747064424e-05, - "loss": 1.2888, - "step": 5692 - }, - { - "epoch": 1.7885623417162377, - "grad_norm": 0.8671875, - "learning_rate": 1.2974928594097113e-05, - "loss": 1.3605, - "step": 5694 - }, - { - "epoch": 1.789190568741779, - "grad_norm": 0.7734375, - "learning_rate": 1.2972389717549985e-05, - "loss": 1.2925, - "step": 5696 - }, - { - "epoch": 1.7898187957673204, - "grad_norm": 0.89453125, - "learning_rate": 1.2969850841002858e-05, - "loss": 1.0879, - "step": 5698 - }, - { - "epoch": 1.7904470227928617, - "grad_norm": 0.78515625, - "learning_rate": 1.296731196445573e-05, - "loss": 1.1732, - "step": 5700 - }, - { - "epoch": 1.791075249818403, - "grad_norm": 0.71875, - "learning_rate": 1.2964773087908602e-05, - "loss": 1.2151, - "step": 5702 - }, - { - "epoch": 1.7917034768439444, - "grad_norm": 0.7734375, - "learning_rate": 1.2962234211361474e-05, - "loss": 1.1779, - "step": 5704 - }, - { - "epoch": 1.7923317038694857, - "grad_norm": 0.8125, - "learning_rate": 1.2959695334814347e-05, - "loss": 1.2626, - "step": 5706 - }, - { - "epoch": 1.792959930895027, - "grad_norm": 0.7890625, - "learning_rate": 1.2957156458267216e-05, - "loss": 1.1991, - "step": 5708 - }, - { - "epoch": 1.7935881579205686, - "grad_norm": 0.74609375, - "learning_rate": 1.295461758172009e-05, - "loss": 1.3407, - "step": 5710 - }, - { - "epoch": 1.79421638494611, - "grad_norm": 0.78515625, - "learning_rate": 1.2952078705172961e-05, - "loss": 1.2508, - "step": 5712 - }, - { - "epoch": 1.7948446119716512, - "grad_norm": 0.74609375, - "learning_rate": 1.2949539828625834e-05, - "loss": 1.4018, - "step": 5714 - }, - { - "epoch": 1.7954728389971926, - "grad_norm": 0.76953125, - "learning_rate": 1.2947000952078705e-05, - "loss": 1.1721, - "step": 5716 - }, - { - "epoch": 1.7961010660227341, - "grad_norm": 0.79296875, - "learning_rate": 1.2944462075531578e-05, - "loss": 1.2232, - "step": 5718 - }, - { - "epoch": 1.7967292930482754, - "grad_norm": 0.73828125, - "learning_rate": 1.294192319898445e-05, - "loss": 1.2397, - "step": 5720 - }, - { - "epoch": 1.7973575200738168, - "grad_norm": 0.8203125, - "learning_rate": 1.2939384322437323e-05, - "loss": 1.172, - "step": 5722 - }, - { - "epoch": 1.797985747099358, - "grad_norm": 0.73828125, - "learning_rate": 1.2936845445890194e-05, - "loss": 1.3208, - "step": 5724 - }, - { - "epoch": 1.7986139741248994, - "grad_norm": 0.77734375, - "learning_rate": 1.2934306569343067e-05, - "loss": 1.2101, - "step": 5726 - }, - { - "epoch": 1.7992422011504408, - "grad_norm": 0.84375, - "learning_rate": 1.2931767692795937e-05, - "loss": 1.2136, - "step": 5728 - }, - { - "epoch": 1.799870428175982, - "grad_norm": 0.81640625, - "learning_rate": 1.2929228816248812e-05, - "loss": 1.3315, - "step": 5730 - }, - { - "epoch": 1.8004986552015234, - "grad_norm": 0.82421875, - "learning_rate": 1.2926689939701682e-05, - "loss": 1.1851, - "step": 5732 - }, - { - "epoch": 1.8011268822270647, - "grad_norm": 0.83984375, - "learning_rate": 1.2924151063154555e-05, - "loss": 1.1484, - "step": 5734 - }, - { - "epoch": 1.801755109252606, - "grad_norm": 0.78515625, - "learning_rate": 1.2921612186607426e-05, - "loss": 1.1852, - "step": 5736 - }, - { - "epoch": 1.8023833362781474, - "grad_norm": 0.82421875, - "learning_rate": 1.2919073310060299e-05, - "loss": 1.2899, - "step": 5738 - }, - { - "epoch": 1.8030115633036887, - "grad_norm": 0.84375, - "learning_rate": 1.291653443351317e-05, - "loss": 1.2624, - "step": 5740 - }, - { - "epoch": 1.8036397903292303, - "grad_norm": 0.73828125, - "learning_rate": 1.2913995556966044e-05, - "loss": 1.1763, - "step": 5742 - }, - { - "epoch": 1.8042680173547716, - "grad_norm": 0.828125, - "learning_rate": 1.2911456680418917e-05, - "loss": 1.384, - "step": 5744 - }, - { - "epoch": 1.804896244380313, - "grad_norm": 0.83203125, - "learning_rate": 1.2908917803871788e-05, - "loss": 1.241, - "step": 5746 - }, - { - "epoch": 1.8055244714058543, - "grad_norm": 0.83203125, - "learning_rate": 1.2906378927324661e-05, - "loss": 1.3487, - "step": 5748 - }, - { - "epoch": 1.8061526984313958, - "grad_norm": 0.76953125, - "learning_rate": 1.2903840050777532e-05, - "loss": 1.2097, - "step": 5750 - }, - { - "epoch": 1.8067809254569371, - "grad_norm": 0.72265625, - "learning_rate": 1.2901301174230406e-05, - "loss": 1.2788, - "step": 5752 - }, - { - "epoch": 1.8074091524824785, - "grad_norm": 0.7578125, - "learning_rate": 1.2898762297683275e-05, - "loss": 1.2052, - "step": 5754 - }, - { - "epoch": 1.8080373795080198, - "grad_norm": 0.765625, - "learning_rate": 1.289622342113615e-05, - "loss": 1.3531, - "step": 5756 - }, - { - "epoch": 1.8086656065335611, - "grad_norm": 0.83984375, - "learning_rate": 1.289368454458902e-05, - "loss": 1.2512, - "step": 5758 - }, - { - "epoch": 1.8092938335591024, - "grad_norm": 0.765625, - "learning_rate": 1.2891145668041893e-05, - "loss": 1.2617, - "step": 5760 - }, - { - "epoch": 1.8099220605846438, - "grad_norm": 0.80078125, - "learning_rate": 1.2888606791494764e-05, - "loss": 1.2172, - "step": 5762 - }, - { - "epoch": 1.810550287610185, - "grad_norm": 0.80859375, - "learning_rate": 1.2886067914947637e-05, - "loss": 1.3569, - "step": 5764 - }, - { - "epoch": 1.8111785146357264, - "grad_norm": 0.8828125, - "learning_rate": 1.2883529038400509e-05, - "loss": 1.1964, - "step": 5766 - }, - { - "epoch": 1.8118067416612678, - "grad_norm": 0.85546875, - "learning_rate": 1.2880990161853382e-05, - "loss": 1.1786, - "step": 5768 - }, - { - "epoch": 1.812434968686809, - "grad_norm": 0.7578125, - "learning_rate": 1.2878451285306253e-05, - "loss": 1.2148, - "step": 5770 - }, - { - "epoch": 1.8130631957123504, - "grad_norm": 0.90234375, - "learning_rate": 1.2875912408759126e-05, - "loss": 1.1685, - "step": 5772 - }, - { - "epoch": 1.8136914227378917, - "grad_norm": 0.8046875, - "learning_rate": 1.2873373532211998e-05, - "loss": 1.1826, - "step": 5774 - }, - { - "epoch": 1.8143196497634333, - "grad_norm": 0.8828125, - "learning_rate": 1.287083465566487e-05, - "loss": 1.2594, - "step": 5776 - }, - { - "epoch": 1.8149478767889746, - "grad_norm": 0.828125, - "learning_rate": 1.286829577911774e-05, - "loss": 1.209, - "step": 5778 - }, - { - "epoch": 1.815576103814516, - "grad_norm": 0.83984375, - "learning_rate": 1.2865756902570613e-05, - "loss": 1.2065, - "step": 5780 - }, - { - "epoch": 1.8162043308400573, - "grad_norm": 0.78125, - "learning_rate": 1.2863218026023485e-05, - "loss": 1.1886, - "step": 5782 - }, - { - "epoch": 1.8168325578655988, - "grad_norm": 0.79296875, - "learning_rate": 1.2860679149476358e-05, - "loss": 1.2196, - "step": 5784 - }, - { - "epoch": 1.8174607848911402, - "grad_norm": 0.92578125, - "learning_rate": 1.285814027292923e-05, - "loss": 1.1996, - "step": 5786 - }, - { - "epoch": 1.8180890119166815, - "grad_norm": 0.7578125, - "learning_rate": 1.2855601396382102e-05, - "loss": 1.1933, - "step": 5788 - }, - { - "epoch": 1.8187172389422228, - "grad_norm": 0.80859375, - "learning_rate": 1.2853062519834974e-05, - "loss": 1.2438, - "step": 5790 - }, - { - "epoch": 1.8193454659677641, - "grad_norm": 0.8671875, - "learning_rate": 1.2850523643287847e-05, - "loss": 1.2136, - "step": 5792 - }, - { - "epoch": 1.8199736929933055, - "grad_norm": 0.78125, - "learning_rate": 1.2847984766740718e-05, - "loss": 1.3761, - "step": 5794 - }, - { - "epoch": 1.8206019200188468, - "grad_norm": 0.75390625, - "learning_rate": 1.2845445890193591e-05, - "loss": 1.1976, - "step": 5796 - }, - { - "epoch": 1.8212301470443881, - "grad_norm": 0.78125, - "learning_rate": 1.2842907013646463e-05, - "loss": 1.3462, - "step": 5798 - }, - { - "epoch": 1.8218583740699295, - "grad_norm": 0.875, - "learning_rate": 1.2840368137099336e-05, - "loss": 1.2293, - "step": 5800 - }, - { - "epoch": 1.8224866010954708, - "grad_norm": 0.92578125, - "learning_rate": 1.2837829260552205e-05, - "loss": 1.2233, - "step": 5802 - }, - { - "epoch": 1.823114828121012, - "grad_norm": 0.78125, - "learning_rate": 1.2835290384005078e-05, - "loss": 1.2824, - "step": 5804 - }, - { - "epoch": 1.8237430551465534, - "grad_norm": 0.75390625, - "learning_rate": 1.283275150745795e-05, - "loss": 1.2794, - "step": 5806 - }, - { - "epoch": 1.824371282172095, - "grad_norm": 0.78515625, - "learning_rate": 1.2830212630910823e-05, - "loss": 1.1825, - "step": 5808 - }, - { - "epoch": 1.8249995091976363, - "grad_norm": 0.9296875, - "learning_rate": 1.2827673754363694e-05, - "loss": 1.1168, - "step": 5810 - }, - { - "epoch": 1.8256277362231776, - "grad_norm": 0.83984375, - "learning_rate": 1.2825134877816567e-05, - "loss": 1.2021, - "step": 5812 - }, - { - "epoch": 1.826255963248719, - "grad_norm": 0.81640625, - "learning_rate": 1.2822596001269439e-05, - "loss": 1.3024, - "step": 5814 - }, - { - "epoch": 1.8268841902742605, - "grad_norm": 0.76953125, - "learning_rate": 1.2820057124722312e-05, - "loss": 1.2621, - "step": 5816 - }, - { - "epoch": 1.8275124172998019, - "grad_norm": 0.80078125, - "learning_rate": 1.2817518248175183e-05, - "loss": 1.0934, - "step": 5818 - }, - { - "epoch": 1.8281406443253432, - "grad_norm": 0.7890625, - "learning_rate": 1.2814979371628056e-05, - "loss": 1.3914, - "step": 5820 - }, - { - "epoch": 1.8287688713508845, - "grad_norm": 0.8828125, - "learning_rate": 1.2812440495080926e-05, - "loss": 1.1312, - "step": 5822 - }, - { - "epoch": 1.8293970983764258, - "grad_norm": 0.83203125, - "learning_rate": 1.28099016185338e-05, - "loss": 1.1222, - "step": 5824 - }, - { - "epoch": 1.8300253254019672, - "grad_norm": 0.73828125, - "learning_rate": 1.280736274198667e-05, - "loss": 1.2303, - "step": 5826 - }, - { - "epoch": 1.8306535524275085, - "grad_norm": 0.86328125, - "learning_rate": 1.2804823865439543e-05, - "loss": 1.2426, - "step": 5828 - }, - { - "epoch": 1.8312817794530498, - "grad_norm": 0.77734375, - "learning_rate": 1.2802284988892417e-05, - "loss": 1.1024, - "step": 5830 - }, - { - "epoch": 1.8319100064785911, - "grad_norm": 0.8515625, - "learning_rate": 1.2799746112345288e-05, - "loss": 1.3141, - "step": 5832 - }, - { - "epoch": 1.8325382335041325, - "grad_norm": 0.75390625, - "learning_rate": 1.2797207235798161e-05, - "loss": 1.1605, - "step": 5834 - }, - { - "epoch": 1.8331664605296738, - "grad_norm": 0.81640625, - "learning_rate": 1.2794668359251032e-05, - "loss": 1.1718, - "step": 5836 - }, - { - "epoch": 1.8337946875552151, - "grad_norm": 0.7421875, - "learning_rate": 1.2792129482703905e-05, - "loss": 1.1882, - "step": 5838 - }, - { - "epoch": 1.8344229145807565, - "grad_norm": 0.80859375, - "learning_rate": 1.2789590606156777e-05, - "loss": 1.3604, - "step": 5840 - }, - { - "epoch": 1.835051141606298, - "grad_norm": 0.79296875, - "learning_rate": 1.278705172960965e-05, - "loss": 1.1974, - "step": 5842 - }, - { - "epoch": 1.8356793686318393, - "grad_norm": 0.8828125, - "learning_rate": 1.2784512853062521e-05, - "loss": 1.1672, - "step": 5844 - }, - { - "epoch": 1.8363075956573807, - "grad_norm": 0.80859375, - "learning_rate": 1.2781973976515394e-05, - "loss": 1.2177, - "step": 5846 - }, - { - "epoch": 1.836935822682922, - "grad_norm": 0.91796875, - "learning_rate": 1.2779435099968264e-05, - "loss": 1.2313, - "step": 5848 - }, - { - "epoch": 1.8375640497084635, - "grad_norm": 0.76171875, - "learning_rate": 1.2776896223421139e-05, - "loss": 1.3003, - "step": 5850 - }, - { - "epoch": 1.8381922767340049, - "grad_norm": 0.84375, - "learning_rate": 1.2774357346874009e-05, - "loss": 1.192, - "step": 5852 - }, - { - "epoch": 1.8388205037595462, - "grad_norm": 0.96484375, - "learning_rate": 1.2771818470326882e-05, - "loss": 1.2617, - "step": 5854 - }, - { - "epoch": 1.8394487307850875, - "grad_norm": 0.76171875, - "learning_rate": 1.2769279593779753e-05, - "loss": 1.0411, - "step": 5856 - }, - { - "epoch": 1.8400769578106289, - "grad_norm": 0.76953125, - "learning_rate": 1.2766740717232626e-05, - "loss": 1.2904, - "step": 5858 - }, - { - "epoch": 1.8407051848361702, - "grad_norm": 0.79296875, - "learning_rate": 1.2764201840685497e-05, - "loss": 1.3183, - "step": 5860 - }, - { - "epoch": 1.8413334118617115, - "grad_norm": 0.83984375, - "learning_rate": 1.276166296413837e-05, - "loss": 1.3205, - "step": 5862 - }, - { - "epoch": 1.8419616388872528, - "grad_norm": 0.828125, - "learning_rate": 1.2759124087591242e-05, - "loss": 1.2196, - "step": 5864 - }, - { - "epoch": 1.8425898659127942, - "grad_norm": 0.78515625, - "learning_rate": 1.2756585211044115e-05, - "loss": 1.243, - "step": 5866 - }, - { - "epoch": 1.8432180929383355, - "grad_norm": 0.82421875, - "learning_rate": 1.2754046334496986e-05, - "loss": 1.1317, - "step": 5868 - }, - { - "epoch": 1.8438463199638768, - "grad_norm": 0.82421875, - "learning_rate": 1.275150745794986e-05, - "loss": 1.2339, - "step": 5870 - }, - { - "epoch": 1.8444745469894182, - "grad_norm": 0.77734375, - "learning_rate": 1.2748968581402729e-05, - "loss": 1.1604, - "step": 5872 - }, - { - "epoch": 1.8451027740149597, - "grad_norm": 0.79296875, - "learning_rate": 1.2746429704855602e-05, - "loss": 1.1184, - "step": 5874 - }, - { - "epoch": 1.845731001040501, - "grad_norm": 0.78125, - "learning_rate": 1.2743890828308474e-05, - "loss": 1.2737, - "step": 5876 - }, - { - "epoch": 1.8463592280660424, - "grad_norm": 0.78125, - "learning_rate": 1.2741351951761347e-05, - "loss": 1.2273, - "step": 5878 - }, - { - "epoch": 1.8469874550915837, - "grad_norm": 0.79296875, - "learning_rate": 1.2738813075214218e-05, - "loss": 1.2768, - "step": 5880 - }, - { - "epoch": 1.8476156821171252, - "grad_norm": 0.73828125, - "learning_rate": 1.2736274198667091e-05, - "loss": 1.3388, - "step": 5882 - }, - { - "epoch": 1.8482439091426666, - "grad_norm": 0.85546875, - "learning_rate": 1.2733735322119962e-05, - "loss": 1.2175, - "step": 5884 - }, - { - "epoch": 1.848872136168208, - "grad_norm": 0.75390625, - "learning_rate": 1.2731196445572836e-05, - "loss": 1.2659, - "step": 5886 - }, - { - "epoch": 1.8495003631937492, - "grad_norm": 0.84765625, - "learning_rate": 1.2728657569025707e-05, - "loss": 1.2642, - "step": 5888 - }, - { - "epoch": 1.8501285902192905, - "grad_norm": 0.8203125, - "learning_rate": 1.272611869247858e-05, - "loss": 1.0971, - "step": 5890 - }, - { - "epoch": 1.8507568172448319, - "grad_norm": 0.74609375, - "learning_rate": 1.272357981593145e-05, - "loss": 1.3001, - "step": 5892 - }, - { - "epoch": 1.8513850442703732, - "grad_norm": 0.6953125, - "learning_rate": 1.2721040939384324e-05, - "loss": 1.457, - "step": 5894 - }, - { - "epoch": 1.8520132712959145, - "grad_norm": 0.72265625, - "learning_rate": 1.2718502062837194e-05, - "loss": 1.3597, - "step": 5896 - }, - { - "epoch": 1.8526414983214559, - "grad_norm": 0.89453125, - "learning_rate": 1.2715963186290067e-05, - "loss": 1.2853, - "step": 5898 - }, - { - "epoch": 1.8532697253469972, - "grad_norm": 0.7734375, - "learning_rate": 1.2713424309742939e-05, - "loss": 1.2468, - "step": 5900 - }, - { - "epoch": 1.8538979523725385, - "grad_norm": 0.76953125, - "learning_rate": 1.2710885433195812e-05, - "loss": 1.3252, - "step": 5902 - }, - { - "epoch": 1.8545261793980798, - "grad_norm": 0.7421875, - "learning_rate": 1.2708346556648683e-05, - "loss": 1.3121, - "step": 5904 - }, - { - "epoch": 1.8551544064236212, - "grad_norm": 0.81640625, - "learning_rate": 1.2705807680101556e-05, - "loss": 1.1819, - "step": 5906 - }, - { - "epoch": 1.8557826334491627, - "grad_norm": 0.8359375, - "learning_rate": 1.2703268803554428e-05, - "loss": 1.2418, - "step": 5908 - }, - { - "epoch": 1.856410860474704, - "grad_norm": 0.71875, - "learning_rate": 1.27007299270073e-05, - "loss": 1.3177, - "step": 5910 - }, - { - "epoch": 1.8570390875002454, - "grad_norm": 0.78515625, - "learning_rate": 1.2698191050460172e-05, - "loss": 1.2514, - "step": 5912 - }, - { - "epoch": 1.8576673145257867, - "grad_norm": 0.89453125, - "learning_rate": 1.2695652173913045e-05, - "loss": 1.2886, - "step": 5914 - }, - { - "epoch": 1.8582955415513283, - "grad_norm": 0.82421875, - "learning_rate": 1.2693113297365918e-05, - "loss": 1.3111, - "step": 5916 - }, - { - "epoch": 1.8589237685768696, - "grad_norm": 0.90625, - "learning_rate": 1.2690574420818788e-05, - "loss": 1.1563, - "step": 5918 - }, - { - "epoch": 1.859551995602411, - "grad_norm": 2.40625, - "learning_rate": 1.2688035544271663e-05, - "loss": 1.2238, - "step": 5920 - }, - { - "epoch": 1.8601802226279522, - "grad_norm": 0.8515625, - "learning_rate": 1.2685496667724532e-05, - "loss": 1.2487, - "step": 5922 - }, - { - "epoch": 1.8608084496534936, - "grad_norm": 0.80859375, - "learning_rate": 1.2682957791177405e-05, - "loss": 1.2178, - "step": 5924 - }, - { - "epoch": 1.861436676679035, - "grad_norm": 0.8125, - "learning_rate": 1.2680418914630277e-05, - "loss": 1.1426, - "step": 5926 - }, - { - "epoch": 1.8620649037045762, - "grad_norm": 0.80859375, - "learning_rate": 1.267788003808315e-05, - "loss": 1.2623, - "step": 5928 - }, - { - "epoch": 1.8626931307301176, - "grad_norm": 0.87109375, - "learning_rate": 1.2675341161536021e-05, - "loss": 1.0865, - "step": 5930 - }, - { - "epoch": 1.8633213577556589, - "grad_norm": 0.85546875, - "learning_rate": 1.2672802284988894e-05, - "loss": 1.1582, - "step": 5932 - }, - { - "epoch": 1.8639495847812002, - "grad_norm": 0.84765625, - "learning_rate": 1.2670263408441766e-05, - "loss": 0.9822, - "step": 5934 - }, - { - "epoch": 1.8645778118067415, - "grad_norm": 1.0625, - "learning_rate": 1.2667724531894639e-05, - "loss": 1.1975, - "step": 5936 - }, - { - "epoch": 1.8652060388322829, - "grad_norm": 0.87109375, - "learning_rate": 1.266518565534751e-05, - "loss": 1.1873, - "step": 5938 - }, - { - "epoch": 1.8658342658578244, - "grad_norm": 0.8125, - "learning_rate": 1.2662646778800383e-05, - "loss": 1.2006, - "step": 5940 - }, - { - "epoch": 1.8664624928833657, - "grad_norm": 0.78515625, - "learning_rate": 1.2660107902253253e-05, - "loss": 1.2259, - "step": 5942 - }, - { - "epoch": 1.867090719908907, - "grad_norm": 0.8359375, - "learning_rate": 1.2657569025706126e-05, - "loss": 1.1884, - "step": 5944 - }, - { - "epoch": 1.8677189469344484, - "grad_norm": 0.8671875, - "learning_rate": 1.2655030149158997e-05, - "loss": 1.1981, - "step": 5946 - }, - { - "epoch": 1.86834717395999, - "grad_norm": 0.77734375, - "learning_rate": 1.265249127261187e-05, - "loss": 1.1178, - "step": 5948 - }, - { - "epoch": 1.8689754009855313, - "grad_norm": 0.7578125, - "learning_rate": 1.2649952396064742e-05, - "loss": 1.2778, - "step": 5950 - }, - { - "epoch": 1.8696036280110726, - "grad_norm": 0.78125, - "learning_rate": 1.2647413519517615e-05, - "loss": 1.171, - "step": 5952 - }, - { - "epoch": 1.870231855036614, - "grad_norm": 0.7890625, - "learning_rate": 1.2644874642970486e-05, - "loss": 1.2154, - "step": 5954 - }, - { - "epoch": 1.8708600820621553, - "grad_norm": 0.79296875, - "learning_rate": 1.264233576642336e-05, - "loss": 1.2481, - "step": 5956 - }, - { - "epoch": 1.8714883090876966, - "grad_norm": 0.84375, - "learning_rate": 1.263979688987623e-05, - "loss": 1.2464, - "step": 5958 - }, - { - "epoch": 1.872116536113238, - "grad_norm": 0.76171875, - "learning_rate": 1.2637258013329104e-05, - "loss": 1.1656, - "step": 5960 - }, - { - "epoch": 1.8727447631387792, - "grad_norm": 0.73046875, - "learning_rate": 1.2634719136781975e-05, - "loss": 1.3967, - "step": 5962 - }, - { - "epoch": 1.8733729901643206, - "grad_norm": 0.81640625, - "learning_rate": 1.2632180260234848e-05, - "loss": 1.295, - "step": 5964 - }, - { - "epoch": 1.874001217189862, - "grad_norm": 0.71875, - "learning_rate": 1.2629641383687718e-05, - "loss": 1.1594, - "step": 5966 - }, - { - "epoch": 1.8746294442154032, - "grad_norm": 0.86328125, - "learning_rate": 1.2627102507140591e-05, - "loss": 1.1973, - "step": 5968 - }, - { - "epoch": 1.8752576712409446, - "grad_norm": 0.9765625, - "learning_rate": 1.2624563630593462e-05, - "loss": 1.3587, - "step": 5970 - }, - { - "epoch": 1.8758858982664859, - "grad_norm": 0.80078125, - "learning_rate": 1.2622024754046335e-05, - "loss": 1.3327, - "step": 5972 - }, - { - "epoch": 1.8765141252920274, - "grad_norm": 0.8515625, - "learning_rate": 1.2619485877499207e-05, - "loss": 1.1387, - "step": 5974 - }, - { - "epoch": 1.8771423523175688, - "grad_norm": 0.9140625, - "learning_rate": 1.261694700095208e-05, - "loss": 1.1447, - "step": 5976 - }, - { - "epoch": 1.87777057934311, - "grad_norm": 0.75, - "learning_rate": 1.2614408124404951e-05, - "loss": 1.1872, - "step": 5978 - }, - { - "epoch": 1.8783988063686514, - "grad_norm": 0.7890625, - "learning_rate": 1.2611869247857824e-05, - "loss": 1.2919, - "step": 5980 - }, - { - "epoch": 1.879027033394193, - "grad_norm": 0.78125, - "learning_rate": 1.2609330371310696e-05, - "loss": 1.3053, - "step": 5982 - }, - { - "epoch": 1.8796552604197343, - "grad_norm": 0.8125, - "learning_rate": 1.2606791494763569e-05, - "loss": 1.1493, - "step": 5984 - }, - { - "epoch": 1.8802834874452756, - "grad_norm": 0.765625, - "learning_rate": 1.2604252618216439e-05, - "loss": 1.2792, - "step": 5986 - }, - { - "epoch": 1.880911714470817, - "grad_norm": 0.765625, - "learning_rate": 1.2601713741669313e-05, - "loss": 1.3208, - "step": 5988 - }, - { - "epoch": 1.8815399414963583, - "grad_norm": 0.80859375, - "learning_rate": 1.2599174865122183e-05, - "loss": 1.1665, - "step": 5990 - }, - { - "epoch": 1.8821681685218996, - "grad_norm": 0.73828125, - "learning_rate": 1.2596635988575056e-05, - "loss": 1.2419, - "step": 5992 - }, - { - "epoch": 1.882796395547441, - "grad_norm": 0.83203125, - "learning_rate": 1.2594097112027927e-05, - "loss": 1.4174, - "step": 5994 - }, - { - "epoch": 1.8834246225729823, - "grad_norm": 0.8984375, - "learning_rate": 1.25915582354808e-05, - "loss": 1.2457, - "step": 5996 - }, - { - "epoch": 1.8840528495985236, - "grad_norm": 0.80859375, - "learning_rate": 1.2589019358933672e-05, - "loss": 1.2441, - "step": 5998 - }, - { - "epoch": 1.884681076624065, - "grad_norm": 0.8046875, - "learning_rate": 1.2586480482386545e-05, - "loss": 1.2268, - "step": 6000 - }, - { - "epoch": 1.8853093036496062, - "grad_norm": 0.8828125, - "learning_rate": 1.2583941605839418e-05, - "loss": 1.117, - "step": 6002 - }, - { - "epoch": 1.8859375306751476, - "grad_norm": 0.76953125, - "learning_rate": 1.258140272929229e-05, - "loss": 1.2595, - "step": 6004 - }, - { - "epoch": 1.8865657577006891, - "grad_norm": 1.0078125, - "learning_rate": 1.2578863852745163e-05, - "loss": 1.2623, - "step": 6006 - }, - { - "epoch": 1.8871939847262305, - "grad_norm": 0.8671875, - "learning_rate": 1.2576324976198034e-05, - "loss": 1.2805, - "step": 6008 - }, - { - "epoch": 1.8878222117517718, - "grad_norm": 0.8359375, - "learning_rate": 1.2573786099650907e-05, - "loss": 1.1744, - "step": 6010 - }, - { - "epoch": 1.8884504387773131, - "grad_norm": 0.875, - "learning_rate": 1.2571247223103777e-05, - "loss": 1.3327, - "step": 6012 - }, - { - "epoch": 1.8890786658028547, - "grad_norm": 0.73046875, - "learning_rate": 1.2568708346556651e-05, - "loss": 1.1917, - "step": 6014 - }, - { - "epoch": 1.889706892828396, - "grad_norm": 0.75390625, - "learning_rate": 1.2566169470009521e-05, - "loss": 1.4492, - "step": 6016 - }, - { - "epoch": 1.8903351198539373, - "grad_norm": 0.8515625, - "learning_rate": 1.2563630593462394e-05, - "loss": 1.1507, - "step": 6018 - }, - { - "epoch": 1.8909633468794786, - "grad_norm": 0.80859375, - "learning_rate": 1.2561091716915266e-05, - "loss": 1.2665, - "step": 6020 - }, - { - "epoch": 1.89159157390502, - "grad_norm": 0.8671875, - "learning_rate": 1.2558552840368139e-05, - "loss": 1.2222, - "step": 6022 - }, - { - "epoch": 1.8922198009305613, - "grad_norm": 0.81640625, - "learning_rate": 1.255601396382101e-05, - "loss": 1.2067, - "step": 6024 - }, - { - "epoch": 1.8928480279561026, - "grad_norm": 0.82421875, - "learning_rate": 1.2553475087273883e-05, - "loss": 1.1897, - "step": 6026 - }, - { - "epoch": 1.893476254981644, - "grad_norm": 0.9375, - "learning_rate": 1.2550936210726755e-05, - "loss": 1.1622, - "step": 6028 - }, - { - "epoch": 1.8941044820071853, - "grad_norm": 0.91796875, - "learning_rate": 1.2548397334179628e-05, - "loss": 1.3068, - "step": 6030 - }, - { - "epoch": 1.8947327090327266, - "grad_norm": 0.8984375, - "learning_rate": 1.2545858457632499e-05, - "loss": 1.2051, - "step": 6032 - }, - { - "epoch": 1.895360936058268, - "grad_norm": 0.76171875, - "learning_rate": 1.2543319581085372e-05, - "loss": 1.1956, - "step": 6034 - }, - { - "epoch": 1.8959891630838093, - "grad_norm": 0.8125, - "learning_rate": 1.2540780704538242e-05, - "loss": 1.3253, - "step": 6036 - }, - { - "epoch": 1.8966173901093508, - "grad_norm": 0.83203125, - "learning_rate": 1.2538241827991115e-05, - "loss": 1.2558, - "step": 6038 - }, - { - "epoch": 1.8972456171348921, - "grad_norm": 0.80078125, - "learning_rate": 1.2535702951443986e-05, - "loss": 1.2556, - "step": 6040 - }, - { - "epoch": 1.8978738441604335, - "grad_norm": 0.8125, - "learning_rate": 1.253316407489686e-05, - "loss": 1.1596, - "step": 6042 - }, - { - "epoch": 1.8985020711859748, - "grad_norm": 0.80859375, - "learning_rate": 1.253062519834973e-05, - "loss": 1.2663, - "step": 6044 - }, - { - "epoch": 1.8991302982115161, - "grad_norm": 0.81640625, - "learning_rate": 1.2528086321802604e-05, - "loss": 1.2155, - "step": 6046 - }, - { - "epoch": 1.8997585252370577, - "grad_norm": 0.75390625, - "learning_rate": 1.2525547445255475e-05, - "loss": 1.272, - "step": 6048 - }, - { - "epoch": 1.900386752262599, - "grad_norm": 0.78125, - "learning_rate": 1.2523008568708348e-05, - "loss": 1.2012, - "step": 6050 - }, - { - "epoch": 1.9010149792881403, - "grad_norm": 0.76171875, - "learning_rate": 1.252046969216122e-05, - "loss": 1.2281, - "step": 6052 - }, - { - "epoch": 1.9016432063136817, - "grad_norm": 0.81640625, - "learning_rate": 1.2517930815614093e-05, - "loss": 1.2045, - "step": 6054 - }, - { - "epoch": 1.902271433339223, - "grad_norm": 0.79296875, - "learning_rate": 1.2515391939066962e-05, - "loss": 1.2902, - "step": 6056 - }, - { - "epoch": 1.9028996603647643, - "grad_norm": 0.80078125, - "learning_rate": 1.2512853062519837e-05, - "loss": 1.2146, - "step": 6058 - }, - { - "epoch": 1.9035278873903057, - "grad_norm": 0.83203125, - "learning_rate": 1.2510314185972707e-05, - "loss": 1.1062, - "step": 6060 - }, - { - "epoch": 1.904156114415847, - "grad_norm": 0.83984375, - "learning_rate": 1.250777530942558e-05, - "loss": 1.1466, - "step": 6062 - }, - { - "epoch": 1.9047843414413883, - "grad_norm": 0.7890625, - "learning_rate": 1.2505236432878451e-05, - "loss": 1.2591, - "step": 6064 - }, - { - "epoch": 1.9054125684669296, - "grad_norm": 0.8203125, - "learning_rate": 1.2502697556331324e-05, - "loss": 1.2363, - "step": 6066 - }, - { - "epoch": 1.906040795492471, - "grad_norm": 0.7265625, - "learning_rate": 1.2500158679784196e-05, - "loss": 1.3046, - "step": 6068 - }, - { - "epoch": 1.9066690225180123, - "grad_norm": 0.765625, - "learning_rate": 1.2497619803237069e-05, - "loss": 1.3882, - "step": 6070 - }, - { - "epoch": 1.9072972495435538, - "grad_norm": 0.81640625, - "learning_rate": 1.249508092668994e-05, - "loss": 1.2277, - "step": 6072 - }, - { - "epoch": 1.9079254765690952, - "grad_norm": 0.76171875, - "learning_rate": 1.2492542050142813e-05, - "loss": 1.2135, - "step": 6074 - }, - { - "epoch": 1.9085537035946365, - "grad_norm": 0.80859375, - "learning_rate": 1.2490003173595685e-05, - "loss": 1.171, - "step": 6076 - }, - { - "epoch": 1.9091819306201778, - "grad_norm": 0.8359375, - "learning_rate": 1.2487464297048558e-05, - "loss": 1.2015, - "step": 6078 - }, - { - "epoch": 1.9098101576457194, - "grad_norm": 0.78515625, - "learning_rate": 1.2484925420501427e-05, - "loss": 1.3251, - "step": 6080 - }, - { - "epoch": 1.9104383846712607, - "grad_norm": 0.74609375, - "learning_rate": 1.24823865439543e-05, - "loss": 1.3621, - "step": 6082 - }, - { - "epoch": 1.911066611696802, - "grad_norm": 0.71484375, - "learning_rate": 1.2479847667407172e-05, - "loss": 1.2972, - "step": 6084 - }, - { - "epoch": 1.9116948387223434, - "grad_norm": 0.8671875, - "learning_rate": 1.2477308790860045e-05, - "loss": 1.1877, - "step": 6086 - }, - { - "epoch": 1.9123230657478847, - "grad_norm": 0.78125, - "learning_rate": 1.2474769914312918e-05, - "loss": 1.221, - "step": 6088 - }, - { - "epoch": 1.912951292773426, - "grad_norm": 0.85546875, - "learning_rate": 1.247223103776579e-05, - "loss": 1.3329, - "step": 6090 - }, - { - "epoch": 1.9135795197989673, - "grad_norm": 0.84375, - "learning_rate": 1.2469692161218662e-05, - "loss": 1.2478, - "step": 6092 - }, - { - "epoch": 1.9142077468245087, - "grad_norm": 0.859375, - "learning_rate": 1.2467153284671534e-05, - "loss": 1.2821, - "step": 6094 - }, - { - "epoch": 1.91483597385005, - "grad_norm": 0.765625, - "learning_rate": 1.2464614408124407e-05, - "loss": 1.1848, - "step": 6096 - }, - { - "epoch": 1.9154642008755913, - "grad_norm": 0.81640625, - "learning_rate": 1.2462075531577278e-05, - "loss": 1.2984, - "step": 6098 - }, - { - "epoch": 1.9160924279011327, - "grad_norm": 0.69921875, - "learning_rate": 1.2459536655030151e-05, - "loss": 1.1486, - "step": 6100 - }, - { - "epoch": 1.916720654926674, - "grad_norm": 0.74609375, - "learning_rate": 1.2456997778483023e-05, - "loss": 1.3557, - "step": 6102 - }, - { - "epoch": 1.9173488819522155, - "grad_norm": 0.76953125, - "learning_rate": 1.2454458901935896e-05, - "loss": 1.2475, - "step": 6104 - }, - { - "epoch": 1.9179771089777569, - "grad_norm": 0.83984375, - "learning_rate": 1.2451920025388766e-05, - "loss": 1.1533, - "step": 6106 - }, - { - "epoch": 1.9186053360032982, - "grad_norm": 0.796875, - "learning_rate": 1.2449381148841639e-05, - "loss": 1.2381, - "step": 6108 - }, - { - "epoch": 1.9192335630288395, - "grad_norm": 1.0234375, - "learning_rate": 1.244684227229451e-05, - "loss": 1.2304, - "step": 6110 - }, - { - "epoch": 1.919861790054381, - "grad_norm": 0.75, - "learning_rate": 1.2444303395747383e-05, - "loss": 1.3272, - "step": 6112 - }, - { - "epoch": 1.9204900170799224, - "grad_norm": 0.796875, - "learning_rate": 1.2441764519200254e-05, - "loss": 1.1725, - "step": 6114 - }, - { - "epoch": 1.9211182441054637, - "grad_norm": 0.953125, - "learning_rate": 1.2439225642653128e-05, - "loss": 1.0932, - "step": 6116 - }, - { - "epoch": 1.921746471131005, - "grad_norm": 0.75390625, - "learning_rate": 1.2436686766105999e-05, - "loss": 1.2361, - "step": 6118 - }, - { - "epoch": 1.9223746981565464, - "grad_norm": 0.7421875, - "learning_rate": 1.2434147889558872e-05, - "loss": 1.3047, - "step": 6120 - }, - { - "epoch": 1.9230029251820877, - "grad_norm": 0.86328125, - "learning_rate": 1.2431609013011743e-05, - "loss": 1.3639, - "step": 6122 - }, - { - "epoch": 1.923631152207629, - "grad_norm": 0.9140625, - "learning_rate": 1.2429070136464616e-05, - "loss": 1.1849, - "step": 6124 - }, - { - "epoch": 1.9242593792331704, - "grad_norm": 0.77734375, - "learning_rate": 1.2426531259917488e-05, - "loss": 1.2506, - "step": 6126 - }, - { - "epoch": 1.9248876062587117, - "grad_norm": 0.79296875, - "learning_rate": 1.2423992383370361e-05, - "loss": 1.2748, - "step": 6128 - }, - { - "epoch": 1.925515833284253, - "grad_norm": 0.921875, - "learning_rate": 1.242145350682323e-05, - "loss": 1.3513, - "step": 6130 - }, - { - "epoch": 1.9261440603097943, - "grad_norm": 0.75, - "learning_rate": 1.2418914630276104e-05, - "loss": 1.3052, - "step": 6132 - }, - { - "epoch": 1.9267722873353357, - "grad_norm": 0.796875, - "learning_rate": 1.2416375753728975e-05, - "loss": 1.1691, - "step": 6134 - }, - { - "epoch": 1.927400514360877, - "grad_norm": 0.74609375, - "learning_rate": 1.2413836877181848e-05, - "loss": 1.1503, - "step": 6136 - }, - { - "epoch": 1.9280287413864186, - "grad_norm": 0.85546875, - "learning_rate": 1.241129800063472e-05, - "loss": 1.218, - "step": 6138 - }, - { - "epoch": 1.9286569684119599, - "grad_norm": 0.93359375, - "learning_rate": 1.2408759124087593e-05, - "loss": 1.1987, - "step": 6140 - }, - { - "epoch": 1.9292851954375012, - "grad_norm": 0.76171875, - "learning_rate": 1.2406220247540464e-05, - "loss": 1.1148, - "step": 6142 - }, - { - "epoch": 1.9299134224630425, - "grad_norm": 0.80859375, - "learning_rate": 1.2403681370993337e-05, - "loss": 1.3232, - "step": 6144 - }, - { - "epoch": 1.930541649488584, - "grad_norm": 0.7734375, - "learning_rate": 1.2401142494446208e-05, - "loss": 1.2638, - "step": 6146 - }, - { - "epoch": 1.9311698765141254, - "grad_norm": 0.8359375, - "learning_rate": 1.2398603617899081e-05, - "loss": 1.282, - "step": 6148 - }, - { - "epoch": 1.9317981035396667, - "grad_norm": 0.80859375, - "learning_rate": 1.2396064741351951e-05, - "loss": 1.292, - "step": 6150 - }, - { - "epoch": 1.932426330565208, - "grad_norm": 0.796875, - "learning_rate": 1.2393525864804826e-05, - "loss": 1.3647, - "step": 6152 - }, - { - "epoch": 1.9330545575907494, - "grad_norm": 0.828125, - "learning_rate": 1.2390986988257696e-05, - "loss": 1.1766, - "step": 6154 - }, - { - "epoch": 1.9336827846162907, - "grad_norm": 0.7578125, - "learning_rate": 1.2388448111710569e-05, - "loss": 1.2779, - "step": 6156 - }, - { - "epoch": 1.934311011641832, - "grad_norm": 0.8984375, - "learning_rate": 1.238590923516344e-05, - "loss": 1.2979, - "step": 6158 - }, - { - "epoch": 1.9349392386673734, - "grad_norm": 0.7890625, - "learning_rate": 1.2383370358616313e-05, - "loss": 1.3134, - "step": 6160 - }, - { - "epoch": 1.9355674656929147, - "grad_norm": 0.96875, - "learning_rate": 1.2380831482069185e-05, - "loss": 1.1554, - "step": 6162 - }, - { - "epoch": 1.936195692718456, - "grad_norm": 0.8046875, - "learning_rate": 1.2378292605522058e-05, - "loss": 1.3649, - "step": 6164 - }, - { - "epoch": 1.9368239197439974, - "grad_norm": 0.76953125, - "learning_rate": 1.2375753728974929e-05, - "loss": 1.2077, - "step": 6166 - }, - { - "epoch": 1.9374521467695387, - "grad_norm": 0.84765625, - "learning_rate": 1.2373214852427802e-05, - "loss": 1.288, - "step": 6168 - }, - { - "epoch": 1.9380803737950802, - "grad_norm": 0.85546875, - "learning_rate": 1.2370675975880673e-05, - "loss": 1.3333, - "step": 6170 - }, - { - "epoch": 1.9387086008206216, - "grad_norm": 0.84375, - "learning_rate": 1.2368137099333547e-05, - "loss": 1.345, - "step": 6172 - }, - { - "epoch": 1.939336827846163, - "grad_norm": 0.83203125, - "learning_rate": 1.236559822278642e-05, - "loss": 1.3079, - "step": 6174 - }, - { - "epoch": 1.9399650548717042, - "grad_norm": 0.80859375, - "learning_rate": 1.236305934623929e-05, - "loss": 1.2672, - "step": 6176 - }, - { - "epoch": 1.9405932818972458, - "grad_norm": 0.75390625, - "learning_rate": 1.2360520469692162e-05, - "loss": 1.2107, - "step": 6178 - }, - { - "epoch": 1.941221508922787, - "grad_norm": 0.76171875, - "learning_rate": 1.2357981593145034e-05, - "loss": 1.2125, - "step": 6180 - }, - { - "epoch": 1.9418497359483284, - "grad_norm": 0.77734375, - "learning_rate": 1.2355442716597907e-05, - "loss": 1.2236, - "step": 6182 - }, - { - "epoch": 1.9424779629738698, - "grad_norm": 0.8359375, - "learning_rate": 1.2352903840050778e-05, - "loss": 1.1823, - "step": 6184 - }, - { - "epoch": 1.943106189999411, - "grad_norm": 0.9765625, - "learning_rate": 1.2350364963503651e-05, - "loss": 1.2182, - "step": 6186 - }, - { - "epoch": 1.9437344170249524, - "grad_norm": 0.8125, - "learning_rate": 1.2347826086956523e-05, - "loss": 1.2996, - "step": 6188 - }, - { - "epoch": 1.9443626440504937, - "grad_norm": 0.796875, - "learning_rate": 1.2345287210409396e-05, - "loss": 1.1519, - "step": 6190 - }, - { - "epoch": 1.944990871076035, - "grad_norm": 0.7578125, - "learning_rate": 1.2342748333862267e-05, - "loss": 1.125, - "step": 6192 - }, - { - "epoch": 1.9456190981015764, - "grad_norm": 0.8046875, - "learning_rate": 1.234020945731514e-05, - "loss": 1.1524, - "step": 6194 - }, - { - "epoch": 1.9462473251271177, - "grad_norm": 0.765625, - "learning_rate": 1.2337670580768012e-05, - "loss": 1.2544, - "step": 6196 - }, - { - "epoch": 1.946875552152659, - "grad_norm": 0.8125, - "learning_rate": 1.2335131704220885e-05, - "loss": 1.1555, - "step": 6198 - }, - { - "epoch": 1.9475037791782004, - "grad_norm": 0.75390625, - "learning_rate": 1.2332592827673754e-05, - "loss": 1.2127, - "step": 6200 - }, - { - "epoch": 1.9481320062037417, - "grad_norm": 0.984375, - "learning_rate": 1.2330053951126627e-05, - "loss": 1.2985, - "step": 6202 - }, - { - "epoch": 1.9487602332292833, - "grad_norm": 0.83984375, - "learning_rate": 1.2327515074579499e-05, - "loss": 1.3043, - "step": 6204 - }, - { - "epoch": 1.9493884602548246, - "grad_norm": 0.78515625, - "learning_rate": 1.2324976198032372e-05, - "loss": 1.2911, - "step": 6206 - }, - { - "epoch": 1.950016687280366, - "grad_norm": 0.875, - "learning_rate": 1.2322437321485243e-05, - "loss": 1.2217, - "step": 6208 - }, - { - "epoch": 1.9506449143059073, - "grad_norm": 0.7734375, - "learning_rate": 1.2319898444938116e-05, - "loss": 1.2718, - "step": 6210 - }, - { - "epoch": 1.9512731413314488, - "grad_norm": 0.79296875, - "learning_rate": 1.2317359568390988e-05, - "loss": 1.3247, - "step": 6212 - }, - { - "epoch": 1.9519013683569901, - "grad_norm": 0.7890625, - "learning_rate": 1.231482069184386e-05, - "loss": 1.31, - "step": 6214 - }, - { - "epoch": 1.9525295953825315, - "grad_norm": 0.7265625, - "learning_rate": 1.2312281815296732e-05, - "loss": 1.2022, - "step": 6216 - }, - { - "epoch": 1.9531578224080728, - "grad_norm": 0.78515625, - "learning_rate": 1.2309742938749605e-05, - "loss": 1.1422, - "step": 6218 - }, - { - "epoch": 1.9537860494336141, - "grad_norm": 0.78125, - "learning_rate": 1.2307204062202475e-05, - "loss": 1.1487, - "step": 6220 - }, - { - "epoch": 1.9544142764591554, - "grad_norm": 0.9609375, - "learning_rate": 1.230466518565535e-05, - "loss": 1.2259, - "step": 6222 - }, - { - "epoch": 1.9550425034846968, - "grad_norm": 0.859375, - "learning_rate": 1.230212630910822e-05, - "loss": 1.1845, - "step": 6224 - }, - { - "epoch": 1.955670730510238, - "grad_norm": 0.76953125, - "learning_rate": 1.2299587432561093e-05, - "loss": 1.2533, - "step": 6226 - }, - { - "epoch": 1.9562989575357794, - "grad_norm": 0.78515625, - "learning_rate": 1.2297048556013964e-05, - "loss": 1.2527, - "step": 6228 - }, - { - "epoch": 1.9569271845613208, - "grad_norm": 0.78125, - "learning_rate": 1.2294509679466837e-05, - "loss": 1.2083, - "step": 6230 - }, - { - "epoch": 1.957555411586862, - "grad_norm": 0.75, - "learning_rate": 1.2291970802919708e-05, - "loss": 1.2065, - "step": 6232 - }, - { - "epoch": 1.9581836386124034, - "grad_norm": 0.7578125, - "learning_rate": 1.2289431926372581e-05, - "loss": 1.2186, - "step": 6234 - }, - { - "epoch": 1.958811865637945, - "grad_norm": 0.78125, - "learning_rate": 1.2286893049825453e-05, - "loss": 1.2687, - "step": 6236 - }, - { - "epoch": 1.9594400926634863, - "grad_norm": 0.76171875, - "learning_rate": 1.2284354173278326e-05, - "loss": 1.2311, - "step": 6238 - }, - { - "epoch": 1.9600683196890276, - "grad_norm": 0.87890625, - "learning_rate": 1.2281815296731197e-05, - "loss": 1.1847, - "step": 6240 - }, - { - "epoch": 1.960696546714569, - "grad_norm": 0.7578125, - "learning_rate": 1.227927642018407e-05, - "loss": 1.2003, - "step": 6242 - }, - { - "epoch": 1.9613247737401105, - "grad_norm": 0.7890625, - "learning_rate": 1.227673754363694e-05, - "loss": 1.4039, - "step": 6244 - }, - { - "epoch": 1.9619530007656518, - "grad_norm": 0.828125, - "learning_rate": 1.2274198667089813e-05, - "loss": 1.2274, - "step": 6246 - }, - { - "epoch": 1.9625812277911932, - "grad_norm": 0.82421875, - "learning_rate": 1.2271659790542684e-05, - "loss": 1.2743, - "step": 6248 - }, - { - "epoch": 1.9632094548167345, - "grad_norm": 0.734375, - "learning_rate": 1.2269120913995558e-05, - "loss": 1.414, - "step": 6250 - }, - { - "epoch": 1.9638376818422758, - "grad_norm": 0.90625, - "learning_rate": 1.2266582037448429e-05, - "loss": 1.2246, - "step": 6252 - }, - { - "epoch": 1.9644659088678171, - "grad_norm": 0.73046875, - "learning_rate": 1.2264043160901302e-05, - "loss": 1.2005, - "step": 6254 - }, - { - "epoch": 1.9650941358933585, - "grad_norm": 0.91015625, - "learning_rate": 1.2261504284354175e-05, - "loss": 1.2002, - "step": 6256 - }, - { - "epoch": 1.9657223629188998, - "grad_norm": 0.7421875, - "learning_rate": 1.2258965407807046e-05, - "loss": 1.2792, - "step": 6258 - }, - { - "epoch": 1.9663505899444411, - "grad_norm": 0.796875, - "learning_rate": 1.225642653125992e-05, - "loss": 1.0757, - "step": 6260 - }, - { - "epoch": 1.9669788169699824, - "grad_norm": 0.7890625, - "learning_rate": 1.2253887654712791e-05, - "loss": 1.3089, - "step": 6262 - }, - { - "epoch": 1.9676070439955238, - "grad_norm": 0.828125, - "learning_rate": 1.2251348778165664e-05, - "loss": 1.2696, - "step": 6264 - }, - { - "epoch": 1.968235271021065, - "grad_norm": 0.7890625, - "learning_rate": 1.2248809901618535e-05, - "loss": 1.2951, - "step": 6266 - }, - { - "epoch": 1.9688634980466064, - "grad_norm": 0.74609375, - "learning_rate": 1.2246271025071408e-05, - "loss": 1.232, - "step": 6268 - }, - { - "epoch": 1.969491725072148, - "grad_norm": 0.76171875, - "learning_rate": 1.2243732148524278e-05, - "loss": 1.3751, - "step": 6270 - }, - { - "epoch": 1.9701199520976893, - "grad_norm": 0.83203125, - "learning_rate": 1.2241193271977151e-05, - "loss": 1.3549, - "step": 6272 - }, - { - "epoch": 1.9707481791232306, - "grad_norm": 1.046875, - "learning_rate": 1.2238654395430023e-05, - "loss": 1.2652, - "step": 6274 - }, - { - "epoch": 1.971376406148772, - "grad_norm": 0.80859375, - "learning_rate": 1.2236115518882896e-05, - "loss": 1.2687, - "step": 6276 - }, - { - "epoch": 1.9720046331743135, - "grad_norm": 0.7890625, - "learning_rate": 1.2233576642335767e-05, - "loss": 1.4137, - "step": 6278 - }, - { - "epoch": 1.9726328601998548, - "grad_norm": 0.828125, - "learning_rate": 1.223103776578864e-05, - "loss": 1.1847, - "step": 6280 - }, - { - "epoch": 1.9732610872253962, - "grad_norm": 0.78125, - "learning_rate": 1.2228498889241512e-05, - "loss": 1.3383, - "step": 6282 - }, - { - "epoch": 1.9738893142509375, - "grad_norm": 0.8125, - "learning_rate": 1.2225960012694385e-05, - "loss": 1.2893, - "step": 6284 - }, - { - "epoch": 1.9745175412764788, - "grad_norm": 0.765625, - "learning_rate": 1.2223421136147256e-05, - "loss": 1.2479, - "step": 6286 - }, - { - "epoch": 1.9751457683020202, - "grad_norm": 0.8359375, - "learning_rate": 1.2220882259600129e-05, - "loss": 1.2674, - "step": 6288 - }, - { - "epoch": 1.9757739953275615, - "grad_norm": 0.71484375, - "learning_rate": 1.2218343383053e-05, - "loss": 1.332, - "step": 6290 - }, - { - "epoch": 1.9764022223531028, - "grad_norm": 0.8203125, - "learning_rate": 1.2215804506505874e-05, - "loss": 1.3086, - "step": 6292 - }, - { - "epoch": 1.9770304493786441, - "grad_norm": 0.74609375, - "learning_rate": 1.2213265629958743e-05, - "loss": 1.3607, - "step": 6294 - }, - { - "epoch": 1.9776586764041855, - "grad_norm": 0.78515625, - "learning_rate": 1.2210726753411616e-05, - "loss": 1.2653, - "step": 6296 - }, - { - "epoch": 1.9782869034297268, - "grad_norm": 0.77734375, - "learning_rate": 1.2208187876864488e-05, - "loss": 1.3309, - "step": 6298 - }, - { - "epoch": 1.9789151304552681, - "grad_norm": 0.80859375, - "learning_rate": 1.220564900031736e-05, - "loss": 1.3084, - "step": 6300 - }, - { - "epoch": 1.9795433574808097, - "grad_norm": 0.82421875, - "learning_rate": 1.2203110123770232e-05, - "loss": 1.2319, - "step": 6302 - }, - { - "epoch": 1.980171584506351, - "grad_norm": 0.76171875, - "learning_rate": 1.2200571247223105e-05, - "loss": 1.193, - "step": 6304 - }, - { - "epoch": 1.9807998115318923, - "grad_norm": 0.7890625, - "learning_rate": 1.2198032370675977e-05, - "loss": 1.1819, - "step": 6306 - }, - { - "epoch": 1.9814280385574337, - "grad_norm": 0.75390625, - "learning_rate": 1.219549349412885e-05, - "loss": 1.2694, - "step": 6308 - }, - { - "epoch": 1.9820562655829752, - "grad_norm": 0.796875, - "learning_rate": 1.2192954617581721e-05, - "loss": 1.2051, - "step": 6310 - }, - { - "epoch": 1.9826844926085165, - "grad_norm": 0.8125, - "learning_rate": 1.2190415741034594e-05, - "loss": 1.1432, - "step": 6312 - }, - { - "epoch": 1.9833127196340579, - "grad_norm": 0.859375, - "learning_rate": 1.2187876864487464e-05, - "loss": 1.1966, - "step": 6314 - }, - { - "epoch": 1.9839409466595992, - "grad_norm": 0.73046875, - "learning_rate": 1.2185337987940337e-05, - "loss": 1.1952, - "step": 6316 - }, - { - "epoch": 1.9845691736851405, - "grad_norm": 0.796875, - "learning_rate": 1.2182799111393208e-05, - "loss": 1.2597, - "step": 6318 - }, - { - "epoch": 1.9851974007106818, - "grad_norm": 1.2578125, - "learning_rate": 1.2180260234846081e-05, - "loss": 1.1049, - "step": 6320 - }, - { - "epoch": 1.9858256277362232, - "grad_norm": 0.73828125, - "learning_rate": 1.2177721358298953e-05, - "loss": 1.2622, - "step": 6322 - }, - { - "epoch": 1.9864538547617645, - "grad_norm": 0.75390625, - "learning_rate": 1.2175182481751826e-05, - "loss": 1.2554, - "step": 6324 - }, - { - "epoch": 1.9870820817873058, - "grad_norm": 0.8046875, - "learning_rate": 1.2172643605204697e-05, - "loss": 1.4156, - "step": 6326 - }, - { - "epoch": 1.9877103088128472, - "grad_norm": 0.78125, - "learning_rate": 1.217010472865757e-05, - "loss": 1.2517, - "step": 6328 - }, - { - "epoch": 1.9883385358383885, - "grad_norm": 0.83203125, - "learning_rate": 1.2167565852110442e-05, - "loss": 1.3161, - "step": 6330 - }, - { - "epoch": 1.9889667628639298, - "grad_norm": 0.828125, - "learning_rate": 1.2165026975563315e-05, - "loss": 1.2247, - "step": 6332 - }, - { - "epoch": 1.9895949898894711, - "grad_norm": 0.8203125, - "learning_rate": 1.2162488099016186e-05, - "loss": 1.1215, - "step": 6334 - }, - { - "epoch": 1.9902232169150127, - "grad_norm": 0.8046875, - "learning_rate": 1.215994922246906e-05, - "loss": 1.1817, - "step": 6336 - }, - { - "epoch": 1.990851443940554, - "grad_norm": 0.8515625, - "learning_rate": 1.2157410345921929e-05, - "loss": 1.2248, - "step": 6338 - }, - { - "epoch": 1.9914796709660953, - "grad_norm": 0.7734375, - "learning_rate": 1.2154871469374802e-05, - "loss": 1.3265, - "step": 6340 - }, - { - "epoch": 1.9921078979916367, - "grad_norm": 0.81640625, - "learning_rate": 1.2152332592827675e-05, - "loss": 1.2027, - "step": 6342 - }, - { - "epoch": 1.9927361250171782, - "grad_norm": 1.234375, - "learning_rate": 1.2149793716280546e-05, - "loss": 1.1223, - "step": 6344 - }, - { - "epoch": 1.9933643520427196, - "grad_norm": 0.86328125, - "learning_rate": 1.214725483973342e-05, - "loss": 1.2041, - "step": 6346 - }, - { - "epoch": 1.9939925790682609, - "grad_norm": 0.86328125, - "learning_rate": 1.2144715963186291e-05, - "loss": 1.2209, - "step": 6348 - }, - { - "epoch": 1.9946208060938022, - "grad_norm": 0.83984375, - "learning_rate": 1.2142177086639164e-05, - "loss": 1.3876, - "step": 6350 - }, - { - "epoch": 1.9952490331193435, - "grad_norm": 0.83203125, - "learning_rate": 1.2139638210092035e-05, - "loss": 1.341, - "step": 6352 - }, - { - "epoch": 1.9958772601448849, - "grad_norm": 0.83203125, - "learning_rate": 1.2137099333544908e-05, - "loss": 1.2775, - "step": 6354 - }, - { - "epoch": 1.9965054871704262, - "grad_norm": 0.8046875, - "learning_rate": 1.213456045699778e-05, - "loss": 1.2379, - "step": 6356 - }, - { - "epoch": 1.9971337141959675, - "grad_norm": 0.85546875, - "learning_rate": 1.2132021580450653e-05, - "loss": 1.311, - "step": 6358 - }, - { - "epoch": 1.9977619412215089, - "grad_norm": 0.84765625, - "learning_rate": 1.2129482703903524e-05, - "loss": 1.3089, - "step": 6360 - }, - { - "epoch": 1.9983901682470502, - "grad_norm": 0.84765625, - "learning_rate": 1.2126943827356397e-05, - "loss": 1.1843, - "step": 6362 - }, - { - "epoch": 1.9990183952725915, - "grad_norm": 0.7734375, - "learning_rate": 1.2124404950809267e-05, - "loss": 1.1485, - "step": 6364 - }, - { - "epoch": 1.9996466222981328, - "grad_norm": 0.875, - "learning_rate": 1.212186607426214e-05, - "loss": 1.0792, - "step": 6366 - }, - { - "epoch": 2.000274849323674, - "grad_norm": 0.7265625, - "learning_rate": 1.2119327197715011e-05, - "loss": 1.3043, - "step": 6368 - }, - { - "epoch": 2.0009030763492155, - "grad_norm": 0.78515625, - "learning_rate": 1.2116788321167885e-05, - "loss": 1.1913, - "step": 6370 - }, - { - "epoch": 2.0015313033747573, - "grad_norm": 0.8046875, - "learning_rate": 1.2114249444620756e-05, - "loss": 1.1807, - "step": 6372 - }, - { - "epoch": 2.0021595304002986, - "grad_norm": 0.91015625, - "learning_rate": 1.2111710568073629e-05, - "loss": 1.1107, - "step": 6374 - }, - { - "epoch": 2.00278775742584, - "grad_norm": 0.89453125, - "learning_rate": 1.21091716915265e-05, - "loss": 1.2526, - "step": 6376 - }, - { - "epoch": 2.0034159844513812, - "grad_norm": 0.78125, - "learning_rate": 1.2106632814979373e-05, - "loss": 1.1686, - "step": 6378 - }, - { - "epoch": 2.0040442114769226, - "grad_norm": 0.87890625, - "learning_rate": 1.2104093938432245e-05, - "loss": 1.0691, - "step": 6380 - }, - { - "epoch": 2.004672438502464, - "grad_norm": 0.96875, - "learning_rate": 1.2101555061885118e-05, - "loss": 1.199, - "step": 6382 - }, - { - "epoch": 2.0053006655280052, - "grad_norm": 0.875, - "learning_rate": 1.2099016185337988e-05, - "loss": 1.194, - "step": 6384 - }, - { - "epoch": 2.0059288925535466, - "grad_norm": 1.0, - "learning_rate": 1.2096477308790862e-05, - "loss": 1.2757, - "step": 6386 - }, - { - "epoch": 2.006557119579088, - "grad_norm": 0.75, - "learning_rate": 1.2093938432243732e-05, - "loss": 1.2772, - "step": 6388 - }, - { - "epoch": 2.007185346604629, - "grad_norm": 0.87890625, - "learning_rate": 1.2091399555696605e-05, - "loss": 1.2131, - "step": 6390 - }, - { - "epoch": 2.0078135736301705, - "grad_norm": 0.90625, - "learning_rate": 1.2088860679149477e-05, - "loss": 1.0315, - "step": 6392 - }, - { - "epoch": 2.008441800655712, - "grad_norm": 0.8203125, - "learning_rate": 1.208632180260235e-05, - "loss": 1.207, - "step": 6394 - }, - { - "epoch": 2.009070027681253, - "grad_norm": 0.90625, - "learning_rate": 1.2083782926055221e-05, - "loss": 1.1659, - "step": 6396 - }, - { - "epoch": 2.0096982547067945, - "grad_norm": 0.859375, - "learning_rate": 1.2081244049508094e-05, - "loss": 1.1712, - "step": 6398 - }, - { - "epoch": 2.010326481732336, - "grad_norm": 1.0390625, - "learning_rate": 1.2078705172960965e-05, - "loss": 1.235, - "step": 6400 - }, - { - "epoch": 2.010954708757877, - "grad_norm": 0.9296875, - "learning_rate": 1.2076166296413839e-05, - "loss": 1.1337, - "step": 6402 - }, - { - "epoch": 2.0115829357834185, - "grad_norm": 0.8046875, - "learning_rate": 1.207362741986671e-05, - "loss": 1.1306, - "step": 6404 - }, - { - "epoch": 2.0122111628089603, - "grad_norm": 0.8125, - "learning_rate": 1.2071088543319583e-05, - "loss": 1.1945, - "step": 6406 - }, - { - "epoch": 2.0128393898345016, - "grad_norm": 0.859375, - "learning_rate": 1.2068549666772453e-05, - "loss": 1.1251, - "step": 6408 - }, - { - "epoch": 2.013467616860043, - "grad_norm": 0.8359375, - "learning_rate": 1.2066010790225326e-05, - "loss": 1.1349, - "step": 6410 - }, - { - "epoch": 2.0140958438855843, - "grad_norm": 0.8984375, - "learning_rate": 1.2063471913678197e-05, - "loss": 1.0783, - "step": 6412 - }, - { - "epoch": 2.0147240709111256, - "grad_norm": 0.9296875, - "learning_rate": 1.206093303713107e-05, - "loss": 1.0695, - "step": 6414 - }, - { - "epoch": 2.015352297936667, - "grad_norm": 1.0390625, - "learning_rate": 1.2058394160583942e-05, - "loss": 1.1497, - "step": 6416 - }, - { - "epoch": 2.0159805249622083, - "grad_norm": 0.8046875, - "learning_rate": 1.2055855284036815e-05, - "loss": 1.3358, - "step": 6418 - }, - { - "epoch": 2.0166087519877496, - "grad_norm": 0.90625, - "learning_rate": 1.2053316407489686e-05, - "loss": 1.1467, - "step": 6420 - }, - { - "epoch": 2.017236979013291, - "grad_norm": 0.80078125, - "learning_rate": 1.2050777530942559e-05, - "loss": 1.1837, - "step": 6422 - }, - { - "epoch": 2.0178652060388322, - "grad_norm": 0.87109375, - "learning_rate": 1.204823865439543e-05, - "loss": 1.1541, - "step": 6424 - }, - { - "epoch": 2.0184934330643736, - "grad_norm": 0.8828125, - "learning_rate": 1.2045699777848304e-05, - "loss": 1.2496, - "step": 6426 - }, - { - "epoch": 2.019121660089915, - "grad_norm": 0.8203125, - "learning_rate": 1.2043160901301177e-05, - "loss": 1.1149, - "step": 6428 - }, - { - "epoch": 2.019749887115456, - "grad_norm": 0.8359375, - "learning_rate": 1.2040622024754048e-05, - "loss": 1.236, - "step": 6430 - }, - { - "epoch": 2.0203781141409975, - "grad_norm": 0.90625, - "learning_rate": 1.2038083148206921e-05, - "loss": 1.1907, - "step": 6432 - }, - { - "epoch": 2.021006341166539, - "grad_norm": 0.80078125, - "learning_rate": 1.203554427165979e-05, - "loss": 1.1428, - "step": 6434 - }, - { - "epoch": 2.02163456819208, - "grad_norm": 0.8515625, - "learning_rate": 1.2033005395112664e-05, - "loss": 1.2013, - "step": 6436 - }, - { - "epoch": 2.022262795217622, - "grad_norm": 0.921875, - "learning_rate": 1.2030466518565535e-05, - "loss": 1.2182, - "step": 6438 - }, - { - "epoch": 2.0228910222431633, - "grad_norm": 0.85546875, - "learning_rate": 1.2027927642018408e-05, - "loss": 1.1166, - "step": 6440 - }, - { - "epoch": 2.0235192492687046, - "grad_norm": 0.828125, - "learning_rate": 1.202538876547128e-05, - "loss": 1.1307, - "step": 6442 - }, - { - "epoch": 2.024147476294246, - "grad_norm": 0.89453125, - "learning_rate": 1.2022849888924153e-05, - "loss": 0.9865, - "step": 6444 - }, - { - "epoch": 2.0247757033197873, - "grad_norm": 0.84375, - "learning_rate": 1.2020311012377024e-05, - "loss": 1.0833, - "step": 6446 - }, - { - "epoch": 2.0254039303453286, - "grad_norm": 0.89453125, - "learning_rate": 1.2017772135829897e-05, - "loss": 1.1807, - "step": 6448 - }, - { - "epoch": 2.02603215737087, - "grad_norm": 0.94140625, - "learning_rate": 1.2015233259282769e-05, - "loss": 1.0344, - "step": 6450 - }, - { - "epoch": 2.0266603843964113, - "grad_norm": 0.83984375, - "learning_rate": 1.2012694382735642e-05, - "loss": 1.0775, - "step": 6452 - }, - { - "epoch": 2.0272886114219526, - "grad_norm": 0.86328125, - "learning_rate": 1.2010155506188511e-05, - "loss": 1.1049, - "step": 6454 - }, - { - "epoch": 2.027916838447494, - "grad_norm": 0.83984375, - "learning_rate": 1.2007616629641386e-05, - "loss": 1.3446, - "step": 6456 - }, - { - "epoch": 2.0285450654730353, - "grad_norm": 0.91015625, - "learning_rate": 1.2005077753094256e-05, - "loss": 1.3205, - "step": 6458 - }, - { - "epoch": 2.0291732924985766, - "grad_norm": 0.8671875, - "learning_rate": 1.2002538876547129e-05, - "loss": 1.28, - "step": 6460 - }, - { - "epoch": 2.029801519524118, - "grad_norm": 0.828125, - "learning_rate": 1.2e-05, - "loss": 1.157, - "step": 6462 - }, - { - "epoch": 2.0304297465496592, - "grad_norm": 0.8828125, - "learning_rate": 1.1997461123452873e-05, - "loss": 1.132, - "step": 6464 - }, - { - "epoch": 2.0310579735752006, - "grad_norm": 0.8671875, - "learning_rate": 1.1994922246905745e-05, - "loss": 1.1373, - "step": 6466 - }, - { - "epoch": 2.031686200600742, - "grad_norm": 0.89453125, - "learning_rate": 1.1992383370358618e-05, - "loss": 1.2225, - "step": 6468 - }, - { - "epoch": 2.0323144276262832, - "grad_norm": 0.84765625, - "learning_rate": 1.198984449381149e-05, - "loss": 1.21, - "step": 6470 - }, - { - "epoch": 2.032942654651825, - "grad_norm": 0.80859375, - "learning_rate": 1.1987305617264362e-05, - "loss": 1.1602, - "step": 6472 - }, - { - "epoch": 2.0335708816773663, - "grad_norm": 0.8828125, - "learning_rate": 1.1984766740717234e-05, - "loss": 1.1242, - "step": 6474 - }, - { - "epoch": 2.0341991087029077, - "grad_norm": 0.875, - "learning_rate": 1.1982227864170107e-05, - "loss": 1.1352, - "step": 6476 - }, - { - "epoch": 2.034827335728449, - "grad_norm": 0.87890625, - "learning_rate": 1.1979688987622976e-05, - "loss": 1.1541, - "step": 6478 - }, - { - "epoch": 2.0354555627539903, - "grad_norm": 0.8515625, - "learning_rate": 1.197715011107585e-05, - "loss": 1.1356, - "step": 6480 - }, - { - "epoch": 2.0360837897795316, - "grad_norm": 0.84375, - "learning_rate": 1.1974611234528721e-05, - "loss": 1.3641, - "step": 6482 - }, - { - "epoch": 2.036712016805073, - "grad_norm": 1.21875, - "learning_rate": 1.1972072357981594e-05, - "loss": 1.0959, - "step": 6484 - }, - { - "epoch": 2.0373402438306143, - "grad_norm": 0.859375, - "learning_rate": 1.1969533481434465e-05, - "loss": 1.2647, - "step": 6486 - }, - { - "epoch": 2.0379684708561556, - "grad_norm": 0.81640625, - "learning_rate": 1.1966994604887338e-05, - "loss": 1.0983, - "step": 6488 - }, - { - "epoch": 2.038596697881697, - "grad_norm": 0.84375, - "learning_rate": 1.196445572834021e-05, - "loss": 1.1839, - "step": 6490 - }, - { - "epoch": 2.0392249249072383, - "grad_norm": 0.859375, - "learning_rate": 1.1961916851793083e-05, - "loss": 1.2528, - "step": 6492 - }, - { - "epoch": 2.0398531519327796, - "grad_norm": 0.87890625, - "learning_rate": 1.1959377975245954e-05, - "loss": 1.1767, - "step": 6494 - }, - { - "epoch": 2.040481378958321, - "grad_norm": 0.87890625, - "learning_rate": 1.1956839098698827e-05, - "loss": 1.2149, - "step": 6496 - }, - { - "epoch": 2.0411096059838623, - "grad_norm": 0.88671875, - "learning_rate": 1.1954300222151699e-05, - "loss": 1.1772, - "step": 6498 - }, - { - "epoch": 2.0417378330094036, - "grad_norm": 0.91796875, - "learning_rate": 1.1951761345604572e-05, - "loss": 1.1659, - "step": 6500 - }, - { - "epoch": 2.042366060034945, - "grad_norm": 0.9140625, - "learning_rate": 1.1949222469057442e-05, - "loss": 1.1775, - "step": 6502 - }, - { - "epoch": 2.0429942870604867, - "grad_norm": 0.953125, - "learning_rate": 1.1946683592510315e-05, - "loss": 1.2286, - "step": 6504 - }, - { - "epoch": 2.043622514086028, - "grad_norm": 0.8203125, - "learning_rate": 1.1944144715963186e-05, - "loss": 1.265, - "step": 6506 - }, - { - "epoch": 2.0442507411115693, - "grad_norm": 0.89453125, - "learning_rate": 1.1941605839416059e-05, - "loss": 1.2133, - "step": 6508 - }, - { - "epoch": 2.0448789681371107, - "grad_norm": 1.0, - "learning_rate": 1.193906696286893e-05, - "loss": 1.0619, - "step": 6510 - }, - { - "epoch": 2.045507195162652, - "grad_norm": 0.8515625, - "learning_rate": 1.1936528086321803e-05, - "loss": 1.292, - "step": 6512 - }, - { - "epoch": 2.0461354221881933, - "grad_norm": 0.91015625, - "learning_rate": 1.1933989209774677e-05, - "loss": 1.1345, - "step": 6514 - }, - { - "epoch": 2.0467636492137347, - "grad_norm": 0.87109375, - "learning_rate": 1.1931450333227548e-05, - "loss": 1.0742, - "step": 6516 - }, - { - "epoch": 2.047391876239276, - "grad_norm": 0.88671875, - "learning_rate": 1.1928911456680421e-05, - "loss": 1.0563, - "step": 6518 - }, - { - "epoch": 2.0480201032648173, - "grad_norm": 0.84765625, - "learning_rate": 1.1926372580133292e-05, - "loss": 1.0728, - "step": 6520 - }, - { - "epoch": 2.0486483302903586, - "grad_norm": 0.87109375, - "learning_rate": 1.1923833703586165e-05, - "loss": 1.2013, - "step": 6522 - }, - { - "epoch": 2.0492765573159, - "grad_norm": 0.76953125, - "learning_rate": 1.1921294827039037e-05, - "loss": 1.2654, - "step": 6524 - }, - { - "epoch": 2.0499047843414413, - "grad_norm": 0.8984375, - "learning_rate": 1.191875595049191e-05, - "loss": 1.1785, - "step": 6526 - }, - { - "epoch": 2.0505330113669826, - "grad_norm": 0.98046875, - "learning_rate": 1.191621707394478e-05, - "loss": 1.166, - "step": 6528 - }, - { - "epoch": 2.051161238392524, - "grad_norm": 0.8125, - "learning_rate": 1.1913678197397653e-05, - "loss": 1.0234, - "step": 6530 - }, - { - "epoch": 2.0517894654180653, - "grad_norm": 0.8203125, - "learning_rate": 1.1911139320850524e-05, - "loss": 1.1007, - "step": 6532 - }, - { - "epoch": 2.0524176924436066, - "grad_norm": 0.8828125, - "learning_rate": 1.1908600444303397e-05, - "loss": 1.2466, - "step": 6534 - }, - { - "epoch": 2.053045919469148, - "grad_norm": 1.0234375, - "learning_rate": 1.1906061567756269e-05, - "loss": 1.1554, - "step": 6536 - }, - { - "epoch": 2.0536741464946897, - "grad_norm": 0.79296875, - "learning_rate": 1.1903522691209142e-05, - "loss": 1.293, - "step": 6538 - }, - { - "epoch": 2.054302373520231, - "grad_norm": 0.7890625, - "learning_rate": 1.1900983814662013e-05, - "loss": 1.1603, - "step": 6540 - }, - { - "epoch": 2.0549306005457724, - "grad_norm": 0.98046875, - "learning_rate": 1.1898444938114886e-05, - "loss": 1.1157, - "step": 6542 - }, - { - "epoch": 2.0555588275713137, - "grad_norm": 0.84765625, - "learning_rate": 1.1895906061567757e-05, - "loss": 1.102, - "step": 6544 - }, - { - "epoch": 2.056187054596855, - "grad_norm": 0.8046875, - "learning_rate": 1.189336718502063e-05, - "loss": 1.1276, - "step": 6546 - }, - { - "epoch": 2.0568152816223964, - "grad_norm": 0.8125, - "learning_rate": 1.18908283084735e-05, - "loss": 1.3061, - "step": 6548 - }, - { - "epoch": 2.0574435086479377, - "grad_norm": 0.83984375, - "learning_rate": 1.1888289431926375e-05, - "loss": 1.2868, - "step": 6550 - }, - { - "epoch": 2.058071735673479, - "grad_norm": 0.90234375, - "learning_rate": 1.1885750555379245e-05, - "loss": 1.2344, - "step": 6552 - }, - { - "epoch": 2.0586999626990203, - "grad_norm": 0.91015625, - "learning_rate": 1.1883211678832118e-05, - "loss": 1.1805, - "step": 6554 - }, - { - "epoch": 2.0593281897245617, - "grad_norm": 0.8671875, - "learning_rate": 1.1880672802284989e-05, - "loss": 1.1347, - "step": 6556 - }, - { - "epoch": 2.059956416750103, - "grad_norm": 0.8828125, - "learning_rate": 1.1878133925737862e-05, - "loss": 1.1707, - "step": 6558 - }, - { - "epoch": 2.0605846437756443, - "grad_norm": 0.8828125, - "learning_rate": 1.1875595049190734e-05, - "loss": 1.2383, - "step": 6560 - }, - { - "epoch": 2.0612128708011856, - "grad_norm": 0.83203125, - "learning_rate": 1.1873056172643607e-05, - "loss": 1.1534, - "step": 6562 - }, - { - "epoch": 2.061841097826727, - "grad_norm": 0.83984375, - "learning_rate": 1.1870517296096478e-05, - "loss": 1.1963, - "step": 6564 - }, - { - "epoch": 2.0624693248522683, - "grad_norm": 0.828125, - "learning_rate": 1.1867978419549351e-05, - "loss": 1.2344, - "step": 6566 - }, - { - "epoch": 2.0630975518778096, - "grad_norm": 0.8359375, - "learning_rate": 1.1865439543002223e-05, - "loss": 1.158, - "step": 6568 - }, - { - "epoch": 2.0637257789033514, - "grad_norm": 0.8359375, - "learning_rate": 1.1862900666455096e-05, - "loss": 1.2401, - "step": 6570 - }, - { - "epoch": 2.0643540059288927, - "grad_norm": 0.85546875, - "learning_rate": 1.1860361789907965e-05, - "loss": 1.1387, - "step": 6572 - }, - { - "epoch": 2.064982232954434, - "grad_norm": 0.8359375, - "learning_rate": 1.1857822913360838e-05, - "loss": 1.1706, - "step": 6574 - }, - { - "epoch": 2.0656104599799754, - "grad_norm": 0.80859375, - "learning_rate": 1.185528403681371e-05, - "loss": 1.2223, - "step": 6576 - }, - { - "epoch": 2.0662386870055167, - "grad_norm": 0.828125, - "learning_rate": 1.1852745160266583e-05, - "loss": 1.1918, - "step": 6578 - }, - { - "epoch": 2.066866914031058, - "grad_norm": 0.8203125, - "learning_rate": 1.1850206283719454e-05, - "loss": 1.1429, - "step": 6580 - }, - { - "epoch": 2.0674951410565994, - "grad_norm": 0.91796875, - "learning_rate": 1.1847667407172327e-05, - "loss": 1.1263, - "step": 6582 - }, - { - "epoch": 2.0681233680821407, - "grad_norm": 0.84375, - "learning_rate": 1.1845128530625199e-05, - "loss": 1.2082, - "step": 6584 - }, - { - "epoch": 2.068751595107682, - "grad_norm": 0.90625, - "learning_rate": 1.1842589654078072e-05, - "loss": 1.1829, - "step": 6586 - }, - { - "epoch": 2.0693798221332234, - "grad_norm": 0.8046875, - "learning_rate": 1.1840050777530943e-05, - "loss": 1.0186, - "step": 6588 - }, - { - "epoch": 2.0700080491587647, - "grad_norm": 0.828125, - "learning_rate": 1.1837511900983816e-05, - "loss": 1.2465, - "step": 6590 - }, - { - "epoch": 2.070636276184306, - "grad_norm": 0.8671875, - "learning_rate": 1.1834973024436686e-05, - "loss": 1.1994, - "step": 6592 - }, - { - "epoch": 2.0712645032098473, - "grad_norm": 0.8828125, - "learning_rate": 1.183243414788956e-05, - "loss": 1.1866, - "step": 6594 - }, - { - "epoch": 2.0718927302353887, - "grad_norm": 0.89453125, - "learning_rate": 1.182989527134243e-05, - "loss": 1.2716, - "step": 6596 - }, - { - "epoch": 2.07252095726093, - "grad_norm": 0.921875, - "learning_rate": 1.1827356394795303e-05, - "loss": 1.3113, - "step": 6598 - }, - { - "epoch": 2.0731491842864713, - "grad_norm": 0.859375, - "learning_rate": 1.1824817518248176e-05, - "loss": 1.1559, - "step": 6600 - } - ], - "logging_steps": 2, - "max_steps": 15915, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.7875411663218278e+19, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}