diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21500 +1,1810 @@ { - "best_metric": 0.9729424715042114, - "best_model_checkpoint": "runs/deepseek_lora_20240421-183352/checkpoint-25000", - "epoch": 0.694577089374707, + "best_metric": 0.986103355884552, + "best_model_checkpoint": "runs/deepseek_lora_20240422-095359/checkpoint-2500", + "epoch": 0.05788142411455891, "eval_steps": 500, - "global_step": 30000, + "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 8.893424987792969, + "grad_norm": 11.34415054321289, "learning_rate": 4.0000000000000003e-07, - "loss": 2.1665, + "loss": 1.2669, "step": 10 }, { "epoch": 0.0, - "grad_norm": 7.90573787689209, + "grad_norm": 7.290144443511963, "learning_rate": 8.000000000000001e-07, - "loss": 2.2972, + "loss": 1.3878, "step": 20 }, { "epoch": 0.0, - "grad_norm": 4.421755313873291, + "grad_norm": 5.349151134490967, "learning_rate": 1.2000000000000002e-06, - "loss": 2.0517, + "loss": 1.1855, "step": 30 }, { "epoch": 0.0, - "grad_norm": 4.259636402130127, + "grad_norm": 14.503132820129395, "learning_rate": 1.6000000000000001e-06, - "loss": 2.1163, + "loss": 1.0839, "step": 40 }, { "epoch": 0.0, - "grad_norm": 4.228359222412109, + "grad_norm": 10.504097938537598, "learning_rate": 2.0000000000000003e-06, - "loss": 2.2697, + "loss": 1.3778, "step": 50 }, { "epoch": 0.0, - "grad_norm": 8.439579963684082, + "grad_norm": 10.759912490844727, "learning_rate": 2.4000000000000003e-06, - "loss": 2.1731, + "loss": 1.2921, "step": 60 }, { "epoch": 0.0, - "grad_norm": 2.426865339279175, + "grad_norm": 10.052595138549805, "learning_rate": 2.8000000000000003e-06, - "loss": 2.2052, + "loss": 1.2835, "step": 70 }, { "epoch": 0.0, - "grad_norm": 4.410788059234619, + "grad_norm": 4.416482925415039, "learning_rate": 3.2000000000000003e-06, - "loss": 2.0863, + "loss": 1.1876, "step": 80 }, { "epoch": 0.0, - "grad_norm": 5.553213596343994, + "grad_norm": 8.467272758483887, "learning_rate": 3.6000000000000003e-06, - "loss": 1.9664, + "loss": 1.0513, "step": 90 }, { "epoch": 0.0, - "grad_norm": 6.600622177124023, + "grad_norm": 7.2040510177612305, "learning_rate": 4.000000000000001e-06, - "loss": 2.0656, + "loss": 1.1429, "step": 100 }, { "epoch": 0.0, - "grad_norm": 4.6402153968811035, + "grad_norm": 22.72906494140625, "learning_rate": 4.4e-06, - "loss": 2.0541, + "loss": 1.344, "step": 110 }, { "epoch": 0.0, - "grad_norm": 3.2395169734954834, + "grad_norm": 2.6358766555786133, "learning_rate": 4.800000000000001e-06, - "loss": 1.8353, + "loss": 1.1027, "step": 120 }, { "epoch": 0.0, - "grad_norm": 2.8458945751190186, + "grad_norm": 2.7037551403045654, "learning_rate": 5.2e-06, - "loss": 1.8408, + "loss": 1.1483, "step": 130 }, { "epoch": 0.0, - "grad_norm": 5.223249912261963, + "grad_norm": 7.809167861938477, "learning_rate": 5.600000000000001e-06, - "loss": 1.905, + "loss": 1.2556, "step": 140 }, { "epoch": 0.0, - "grad_norm": 3.8996775150299072, + "grad_norm": 10.522435188293457, "learning_rate": 6e-06, - "loss": 1.9277, + "loss": 1.3272, "step": 150 }, { "epoch": 0.0, - "grad_norm": 3.8909013271331787, + "grad_norm": 12.723409652709961, "learning_rate": 6.4000000000000006e-06, - "loss": 1.777, + "loss": 1.1342, "step": 160 }, { "epoch": 0.0, - "grad_norm": 2.76326060295105, + "grad_norm": 9.65882682800293, "learning_rate": 6.800000000000001e-06, - "loss": 1.7483, + "loss": 1.1509, "step": 170 }, { "epoch": 0.0, - "grad_norm": 3.6061465740203857, + "grad_norm": 7.01952600479126, "learning_rate": 7.2000000000000005e-06, - "loss": 1.79, + "loss": 1.3069, "step": 180 }, { "epoch": 0.0, - "grad_norm": 4.046599388122559, + "grad_norm": 3.8367834091186523, "learning_rate": 7.600000000000001e-06, - "loss": 1.7282, + "loss": 1.1551, "step": 190 }, { "epoch": 0.0, - "grad_norm": 3.0414559841156006, + "grad_norm": 6.172680854797363, "learning_rate": 8.000000000000001e-06, - "loss": 1.8936, + "loss": 1.4156, "step": 200 }, { "epoch": 0.0, - "grad_norm": 6.000649452209473, + "grad_norm": 13.904732704162598, "learning_rate": 8.400000000000001e-06, - "loss": 1.7291, + "loss": 1.2325, "step": 210 }, { "epoch": 0.01, - "grad_norm": 5.2610554695129395, + "grad_norm": 7.113356590270996, "learning_rate": 8.8e-06, - "loss": 1.5919, + "loss": 1.142, "step": 220 }, { "epoch": 0.01, - "grad_norm": 4.346323490142822, + "grad_norm": 14.426301956176758, "learning_rate": 9.200000000000002e-06, - "loss": 1.7002, + "loss": 1.2648, "step": 230 }, { "epoch": 0.01, - "grad_norm": 6.625606060028076, + "grad_norm": 11.00722885131836, "learning_rate": 9.600000000000001e-06, - "loss": 1.7204, + "loss": 1.3083, "step": 240 }, { "epoch": 0.01, - "grad_norm": 5.263577938079834, + "grad_norm": 5.72845983505249, "learning_rate": 1e-05, - "loss": 1.6544, + "loss": 1.1742, "step": 250 }, { "epoch": 0.01, - "grad_norm": 5.526005268096924, + "grad_norm": 2.7809460163116455, "learning_rate": 1.04e-05, - "loss": 1.6343, + "loss": 1.2009, "step": 260 }, { "epoch": 0.01, - "grad_norm": 6.9661431312561035, + "grad_norm": 11.464003562927246, "learning_rate": 1.0800000000000002e-05, - "loss": 1.6069, + "loss": 1.1236, "step": 270 }, { "epoch": 0.01, - "grad_norm": 2.983661651611328, + "grad_norm": 4.139829158782959, "learning_rate": 1.1200000000000001e-05, - "loss": 1.718, + "loss": 1.412, "step": 280 }, { "epoch": 0.01, - "grad_norm": 7.731842517852783, + "grad_norm": 11.483891487121582, "learning_rate": 1.16e-05, - "loss": 1.6998, + "loss": 1.2858, "step": 290 }, { "epoch": 0.01, - "grad_norm": 6.424485683441162, + "grad_norm": 5.432833671569824, "learning_rate": 1.2e-05, - "loss": 1.7128, + "loss": 1.2636, "step": 300 }, { "epoch": 0.01, - "grad_norm": 2.6413464546203613, + "grad_norm": 8.610489845275879, "learning_rate": 1.2400000000000002e-05, - "loss": 1.7114, + "loss": 1.3628, "step": 310 }, { "epoch": 0.01, - "grad_norm": 6.612671375274658, + "grad_norm": 5.597244739532471, "learning_rate": 1.2800000000000001e-05, - "loss": 1.717, + "loss": 1.3585, "step": 320 }, { "epoch": 0.01, - "grad_norm": 3.804788827896118, + "grad_norm": 11.667290687561035, "learning_rate": 1.3200000000000002e-05, - "loss": 1.7916, + "loss": 1.4112, "step": 330 }, { "epoch": 0.01, - "grad_norm": 2.907038688659668, + "grad_norm": 4.102590084075928, "learning_rate": 1.3600000000000002e-05, - "loss": 1.6078, + "loss": 1.2255, "step": 340 }, { "epoch": 0.01, - "grad_norm": 2.003390312194824, + "grad_norm": 15.342700958251953, "learning_rate": 1.4e-05, - "loss": 1.7339, + "loss": 1.2915, "step": 350 }, { "epoch": 0.01, - "grad_norm": 3.2724103927612305, + "grad_norm": 13.78890323638916, "learning_rate": 1.4400000000000001e-05, - "loss": 1.8393, + "loss": 1.5371, "step": 360 }, { "epoch": 0.01, - "grad_norm": 3.7622690200805664, + "grad_norm": 13.456836700439453, "learning_rate": 1.48e-05, - "loss": 1.5091, + "loss": 1.0586, "step": 370 }, { "epoch": 0.01, - "grad_norm": 2.663651943206787, + "grad_norm": 2.98771071434021, "learning_rate": 1.5200000000000002e-05, - "loss": 1.639, + "loss": 1.3475, "step": 380 }, { "epoch": 0.01, - "grad_norm": 3.8204166889190674, + "grad_norm": 6.61796236038208, "learning_rate": 1.5600000000000003e-05, - "loss": 1.4148, + "loss": 1.1562, "step": 390 }, { "epoch": 0.01, - "grad_norm": 4.282427787780762, + "grad_norm": 5.871325492858887, "learning_rate": 1.6000000000000003e-05, - "loss": 1.7065, + "loss": 1.3575, "step": 400 }, { "epoch": 0.01, - "grad_norm": 9.151147842407227, + "grad_norm": 6.282336235046387, "learning_rate": 1.64e-05, - "loss": 1.5874, + "loss": 1.3261, "step": 410 }, { "epoch": 0.01, - "grad_norm": 4.359065532684326, + "grad_norm": 5.444363594055176, "learning_rate": 1.6800000000000002e-05, - "loss": 1.4316, + "loss": 1.0091, "step": 420 }, { "epoch": 0.01, - "grad_norm": 3.6506259441375732, + "grad_norm": 1.7336666584014893, "learning_rate": 1.72e-05, - "loss": 1.5907, + "loss": 1.3492, "step": 430 }, { "epoch": 0.01, - "grad_norm": 2.2175769805908203, + "grad_norm": 5.654507637023926, "learning_rate": 1.76e-05, - "loss": 1.6386, + "loss": 1.2701, "step": 440 }, { "epoch": 0.01, - "grad_norm": 6.790574550628662, + "grad_norm": 3.83292555809021, "learning_rate": 1.8e-05, - "loss": 1.4903, + "loss": 1.1073, "step": 450 }, { "epoch": 0.01, - "grad_norm": 3.407963514328003, + "grad_norm": 5.379516124725342, "learning_rate": 1.8400000000000003e-05, - "loss": 1.5074, + "loss": 1.2015, "step": 460 }, { "epoch": 0.01, - "grad_norm": 4.586065769195557, + "grad_norm": 3.365577220916748, "learning_rate": 1.88e-05, - "loss": 1.4186, + "loss": 1.1174, "step": 470 }, { "epoch": 0.01, - "grad_norm": 3.2595417499542236, + "grad_norm": 11.620206832885742, "learning_rate": 1.9200000000000003e-05, - "loss": 1.5489, + "loss": 1.3091, "step": 480 }, { "epoch": 0.01, - "grad_norm": 3.8598222732543945, + "grad_norm": 10.400999069213867, "learning_rate": 1.9600000000000002e-05, - "loss": 1.314, + "loss": 0.98, "step": 490 }, { "epoch": 0.01, - "grad_norm": 5.427089214324951, + "grad_norm": 5.732550144195557, "learning_rate": 2e-05, - "loss": 1.7116, + "loss": 1.3638, "step": 500 }, { "epoch": 0.01, - "eval_loss": 1.4248766899108887, - "eval_runtime": 67.1505, - "eval_samples_per_second": 14.892, - "eval_steps_per_second": 14.892, + "eval_loss": 1.0330229997634888, + "eval_runtime": 66.9668, + "eval_samples_per_second": 14.933, + "eval_steps_per_second": 14.933, "step": 500 }, { "epoch": 0.01, - "grad_norm": 3.126067876815796, - "learning_rate": 1.999322033898305e-05, - "loss": 1.6259, + "grad_norm": 3.0168514251708984, + "learning_rate": 1.9955555555555557e-05, + "loss": 1.2762, "step": 510 }, { "epoch": 0.01, - "grad_norm": 4.224739074707031, - "learning_rate": 1.9986440677966104e-05, - "loss": 1.6205, + "grad_norm": 3.4872841835021973, + "learning_rate": 1.9911111111111112e-05, + "loss": 1.3287, "step": 520 }, { "epoch": 0.01, - "grad_norm": 7.176899433135986, - "learning_rate": 1.9979661016949154e-05, - "loss": 1.5216, + "grad_norm": 5.1740336418151855, + "learning_rate": 1.9866666666666667e-05, + "loss": 1.2577, "step": 530 }, { "epoch": 0.01, - "grad_norm": 2.984959840774536, - "learning_rate": 1.9972881355932204e-05, - "loss": 1.3944, + "grad_norm": 2.3551251888275146, + "learning_rate": 1.9822222222222226e-05, + "loss": 1.1244, "step": 540 }, { "epoch": 0.01, - "grad_norm": 3.6579527854919434, - "learning_rate": 1.9966101694915257e-05, - "loss": 1.5515, + "grad_norm": 2.230109691619873, + "learning_rate": 1.977777777777778e-05, + "loss": 1.2021, "step": 550 }, { "epoch": 0.01, - "grad_norm": 3.6600215435028076, - "learning_rate": 1.9959322033898307e-05, - "loss": 1.5275, + "grad_norm": 2.284055709838867, + "learning_rate": 1.9733333333333336e-05, + "loss": 1.2919, "step": 560 }, { "epoch": 0.01, - "grad_norm": 5.994769096374512, - "learning_rate": 1.995254237288136e-05, - "loss": 1.6558, + "grad_norm": 2.5889816284179688, + "learning_rate": 1.968888888888889e-05, + "loss": 1.3648, "step": 570 }, { "epoch": 0.01, - "grad_norm": 5.50462532043457, - "learning_rate": 1.994576271186441e-05, - "loss": 1.641, + "grad_norm": 4.799360752105713, + "learning_rate": 1.9644444444444447e-05, + "loss": 1.396, "step": 580 }, { "epoch": 0.01, - "grad_norm": 3.41587233543396, - "learning_rate": 1.993898305084746e-05, - "loss": 1.461, + "grad_norm": 2.2582507133483887, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.2263, "step": 590 }, { "epoch": 0.01, - "grad_norm": 3.1005308628082275, - "learning_rate": 1.9932203389830512e-05, - "loss": 1.4009, + "grad_norm": 9.519213676452637, + "learning_rate": 1.9555555555555557e-05, + "loss": 1.1475, "step": 600 }, { "epoch": 0.01, - "grad_norm": 3.533121347427368, - "learning_rate": 1.992542372881356e-05, - "loss": 1.7164, + "grad_norm": 1.0547032356262207, + "learning_rate": 1.9511111111111113e-05, + "loss": 1.4278, "step": 610 }, { "epoch": 0.01, - "grad_norm": 4.897855758666992, - "learning_rate": 1.991864406779661e-05, - "loss": 1.5889, + "grad_norm": 9.448994636535645, + "learning_rate": 1.9466666666666668e-05, + "loss": 1.3394, "step": 620 }, { "epoch": 0.01, - "grad_norm": 2.890028953552246, - "learning_rate": 1.991186440677966e-05, - "loss": 1.4549, + "grad_norm": 6.21259069442749, + "learning_rate": 1.9422222222222223e-05, + "loss": 1.206, "step": 630 }, { "epoch": 0.01, - "grad_norm": 5.191209316253662, - "learning_rate": 1.990508474576271e-05, - "loss": 1.4711, + "grad_norm": 3.4361753463745117, + "learning_rate": 1.9377777777777778e-05, + "loss": 1.2472, "step": 640 }, { "epoch": 0.02, - "grad_norm": 3.568540334701538, - "learning_rate": 1.9898305084745764e-05, - "loss": 1.3507, + "grad_norm": 2.0469722747802734, + "learning_rate": 1.9333333333333333e-05, + "loss": 1.1283, "step": 650 }, { "epoch": 0.02, - "grad_norm": 1.9555094242095947, - "learning_rate": 1.9891525423728814e-05, - "loss": 1.4569, + "grad_norm": 3.071639060974121, + "learning_rate": 1.928888888888889e-05, + "loss": 1.2191, "step": 660 }, { "epoch": 0.02, - "grad_norm": 7.103305339813232, - "learning_rate": 1.9884745762711867e-05, - "loss": 1.6679, + "grad_norm": 3.428431272506714, + "learning_rate": 1.9244444444444444e-05, + "loss": 1.3863, "step": 670 }, { "epoch": 0.02, - "grad_norm": 5.208203315734863, - "learning_rate": 1.9877966101694917e-05, - "loss": 1.3169, + "grad_norm": 7.4005656242370605, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.0794, "step": 680 }, { "epoch": 0.02, - "grad_norm": 7.401998996734619, - "learning_rate": 1.9871186440677966e-05, - "loss": 1.642, + "grad_norm": 2.636923313140869, + "learning_rate": 1.9155555555555558e-05, + "loss": 1.3761, "step": 690 }, { "epoch": 0.02, - "grad_norm": 6.582605361938477, - "learning_rate": 1.986440677966102e-05, - "loss": 1.5996, + "grad_norm": 5.060346603393555, + "learning_rate": 1.9111111111111113e-05, + "loss": 1.4087, "step": 700 }, { "epoch": 0.02, - "grad_norm": 3.554516315460205, - "learning_rate": 1.985762711864407e-05, - "loss": 1.5291, + "grad_norm": 6.77576208114624, + "learning_rate": 1.9066666666666668e-05, + "loss": 1.2442, "step": 710 }, { "epoch": 0.02, - "grad_norm": 2.563401460647583, - "learning_rate": 1.985084745762712e-05, - "loss": 1.4608, + "grad_norm": 3.3169615268707275, + "learning_rate": 1.9022222222222223e-05, + "loss": 1.2498, "step": 720 }, { "epoch": 0.02, - "grad_norm": 5.061882972717285, - "learning_rate": 1.9844067796610172e-05, - "loss": 1.6631, + "grad_norm": 3.9390623569488525, + "learning_rate": 1.897777777777778e-05, + "loss": 1.4098, "step": 730 }, { "epoch": 0.02, - "grad_norm": 1.9863511323928833, - "learning_rate": 1.9837288135593222e-05, - "loss": 1.5619, + "grad_norm": 5.928336143493652, + "learning_rate": 1.8933333333333334e-05, + "loss": 1.235, "step": 740 }, { "epoch": 0.02, - "grad_norm": 1.3460112810134888, - "learning_rate": 1.9830508474576275e-05, - "loss": 1.3738, + "grad_norm": 9.329615592956543, + "learning_rate": 1.888888888888889e-05, + "loss": 1.2166, "step": 750 }, { "epoch": 0.02, - "grad_norm": 5.177835464477539, - "learning_rate": 1.9823728813559324e-05, - "loss": 1.6424, + "grad_norm": 6.106197357177734, + "learning_rate": 1.8844444444444444e-05, + "loss": 1.3728, "step": 760 }, { "epoch": 0.02, - "grad_norm": 1.877689242362976, - "learning_rate": 1.9816949152542374e-05, - "loss": 1.4095, + "grad_norm": 4.729337215423584, + "learning_rate": 1.88e-05, + "loss": 1.2131, "step": 770 }, { "epoch": 0.02, - "grad_norm": 2.7550783157348633, - "learning_rate": 1.9810169491525427e-05, - "loss": 1.5935, + "grad_norm": 3.116116762161255, + "learning_rate": 1.8755555555555558e-05, + "loss": 1.4169, "step": 780 }, { "epoch": 0.02, - "grad_norm": 1.4884017705917358, - "learning_rate": 1.9803389830508477e-05, - "loss": 1.4442, + "grad_norm": 8.869202613830566, + "learning_rate": 1.8711111111111113e-05, + "loss": 1.2377, "step": 790 }, { "epoch": 0.02, - "grad_norm": 2.309656858444214, - "learning_rate": 1.9796610169491527e-05, - "loss": 1.5698, + "grad_norm": 4.858852863311768, + "learning_rate": 1.866666666666667e-05, + "loss": 1.3863, "step": 800 }, { "epoch": 0.02, - "grad_norm": 2.5358872413635254, - "learning_rate": 1.978983050847458e-05, - "loss": 1.2542, + "grad_norm": 10.197395324707031, + "learning_rate": 1.8622222222222224e-05, + "loss": 1.1053, "step": 810 }, { "epoch": 0.02, - "grad_norm": 3.623551368713379, - "learning_rate": 1.9783050847457626e-05, - "loss": 1.663, + "grad_norm": 2.7740931510925293, + "learning_rate": 1.857777777777778e-05, + "loss": 1.41, "step": 820 }, { "epoch": 0.02, - "grad_norm": 4.512385845184326, - "learning_rate": 1.977627118644068e-05, - "loss": 1.5636, + "grad_norm": 8.306866645812988, + "learning_rate": 1.8533333333333334e-05, + "loss": 1.2661, "step": 830 }, { "epoch": 0.02, - "grad_norm": 3.330457925796509, - "learning_rate": 1.976949152542373e-05, - "loss": 1.3357, + "grad_norm": 5.462616920471191, + "learning_rate": 1.848888888888889e-05, + "loss": 1.1217, "step": 840 }, { "epoch": 0.02, - "grad_norm": 1.6951920986175537, - "learning_rate": 1.9762711864406782e-05, - "loss": 1.5766, + "grad_norm": 1.6351518630981445, + "learning_rate": 1.8444444444444448e-05, + "loss": 1.2512, "step": 850 }, { "epoch": 0.02, - "grad_norm": 2.8462748527526855, - "learning_rate": 1.9755932203389832e-05, - "loss": 1.5554, + "grad_norm": 4.930731773376465, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.3136, "step": 860 }, { "epoch": 0.02, - "grad_norm": 1.8278895616531372, - "learning_rate": 1.974915254237288e-05, - "loss": 1.3479, + "grad_norm": 6.907737731933594, + "learning_rate": 1.835555555555556e-05, + "loss": 1.1728, "step": 870 }, { "epoch": 0.02, - "grad_norm": 3.1370766162872314, - "learning_rate": 1.9742372881355935e-05, - "loss": 1.5584, + "grad_norm": 2.2834455966949463, + "learning_rate": 1.8311111111111114e-05, + "loss": 1.3566, "step": 880 }, { "epoch": 0.02, - "grad_norm": 8.17900276184082, - "learning_rate": 1.9735593220338984e-05, - "loss": 1.5172, + "grad_norm": 6.938192367553711, + "learning_rate": 1.826666666666667e-05, + "loss": 1.2879, "step": 890 }, { "epoch": 0.02, - "grad_norm": 7.5510640144348145, - "learning_rate": 1.9728813559322034e-05, - "loss": 1.4574, + "grad_norm": 3.4376509189605713, + "learning_rate": 1.8222222222222224e-05, + "loss": 1.1971, "step": 900 }, { "epoch": 0.02, - "grad_norm": 2.811005115509033, - "learning_rate": 1.9722033898305087e-05, - "loss": 1.576, + "grad_norm": 4.437848091125488, + "learning_rate": 1.817777777777778e-05, + "loss": 1.3583, "step": 910 }, { "epoch": 0.02, - "grad_norm": 4.8449907302856445, - "learning_rate": 1.9715254237288137e-05, - "loss": 1.4886, + "grad_norm": 6.843893051147461, + "learning_rate": 1.8133333333333335e-05, + "loss": 1.2323, "step": 920 }, { "epoch": 0.02, - "grad_norm": 3.2932374477386475, - "learning_rate": 1.970847457627119e-05, - "loss": 1.4845, + "grad_norm": 3.2527034282684326, + "learning_rate": 1.808888888888889e-05, + "loss": 1.2691, "step": 930 }, { "epoch": 0.02, - "grad_norm": 2.189581871032715, - "learning_rate": 1.970169491525424e-05, - "loss": 1.4143, + "grad_norm": 2.6528022289276123, + "learning_rate": 1.8044444444444445e-05, + "loss": 1.2525, "step": 940 }, { "epoch": 0.02, - "grad_norm": 1.5806180238723755, - "learning_rate": 1.969491525423729e-05, - "loss": 1.4859, + "grad_norm": 2.8804101943969727, + "learning_rate": 1.8e-05, + "loss": 1.3387, "step": 950 }, { "epoch": 0.02, - "grad_norm": 5.293037414550781, - "learning_rate": 1.9688135593220342e-05, - "loss": 1.4887, + "grad_norm": 9.754573822021484, + "learning_rate": 1.7955555555555556e-05, + "loss": 1.265, "step": 960 }, { "epoch": 0.02, - "grad_norm": 5.328525543212891, - "learning_rate": 1.9681355932203392e-05, - "loss": 1.2715, + "grad_norm": 2.54309344291687, + "learning_rate": 1.791111111111111e-05, + "loss": 1.0998, "step": 970 }, { "epoch": 0.02, - "grad_norm": 2.8140487670898438, - "learning_rate": 1.9674576271186442e-05, - "loss": 1.4281, + "grad_norm": 9.926447868347168, + "learning_rate": 1.7866666666666666e-05, + "loss": 1.1927, "step": 980 }, { "epoch": 0.02, - "grad_norm": 5.055774688720703, - "learning_rate": 1.9667796610169495e-05, - "loss": 1.2676, + "grad_norm": 6.4870805740356445, + "learning_rate": 1.782222222222222e-05, + "loss": 1.0565, "step": 990 }, { "epoch": 0.02, - "grad_norm": 1.9871106147766113, - "learning_rate": 1.9661016949152545e-05, - "loss": 1.4242, + "grad_norm": 3.2183382511138916, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.2572, "step": 1000 }, { "epoch": 0.02, - "eval_loss": 1.2329587936401367, - "eval_runtime": 67.1625, - "eval_samples_per_second": 14.889, - "eval_steps_per_second": 14.889, + "eval_loss": 1.019097089767456, + "eval_runtime": 66.8787, + "eval_samples_per_second": 14.952, + "eval_steps_per_second": 14.952, "step": 1000 }, { "epoch": 0.02, - "grad_norm": 9.538952827453613, - "learning_rate": 1.9654237288135594e-05, - "loss": 1.664, + "grad_norm": 9.61516284942627, + "learning_rate": 1.7733333333333335e-05, + "loss": 1.4306, "step": 1010 }, { "epoch": 0.02, - "grad_norm": 7.894416809082031, - "learning_rate": 1.9647457627118644e-05, - "loss": 1.5421, + "grad_norm": 10.755435943603516, + "learning_rate": 1.768888888888889e-05, + "loss": 1.3042, "step": 1020 }, { "epoch": 0.02, - "grad_norm": 15.61988353729248, - "learning_rate": 1.9640677966101697e-05, - "loss": 1.5037, + "grad_norm": 11.427221298217773, + "learning_rate": 1.7644444444444446e-05, + "loss": 1.3201, "step": 1030 }, { "epoch": 0.02, - "grad_norm": 1.5587459802627563, - "learning_rate": 1.9633898305084747e-05, - "loss": 1.4768, + "grad_norm": 9.656847953796387, + "learning_rate": 1.76e-05, + "loss": 1.2579, "step": 1040 }, { "epoch": 0.02, - "grad_norm": 2.0465331077575684, - "learning_rate": 1.9627118644067796e-05, - "loss": 1.6342, + "grad_norm": 4.030166149139404, + "learning_rate": 1.7555555555555556e-05, + "loss": 1.4462, "step": 1050 }, { "epoch": 0.02, - "grad_norm": 2.6867032051086426, - "learning_rate": 1.962033898305085e-05, - "loss": 1.6395, + "grad_norm": 6.84343957901001, + "learning_rate": 1.751111111111111e-05, + "loss": 1.4213, "step": 1060 }, { "epoch": 0.02, - "grad_norm": 3.0958399772644043, - "learning_rate": 1.96135593220339e-05, - "loss": 1.3742, + "grad_norm": 2.4579355716705322, + "learning_rate": 1.7466666666666667e-05, + "loss": 1.2673, "step": 1070 }, { "epoch": 0.03, - "grad_norm": 3.064396858215332, - "learning_rate": 1.960677966101695e-05, - "loss": 1.258, + "grad_norm": 6.692587375640869, + "learning_rate": 1.7422222222222222e-05, + "loss": 1.1579, "step": 1080 }, { "epoch": 0.03, - "grad_norm": 3.9529526233673096, - "learning_rate": 1.9600000000000002e-05, - "loss": 1.6082, + "grad_norm": 4.579308032989502, + "learning_rate": 1.737777777777778e-05, + "loss": 1.3278, "step": 1090 }, { "epoch": 0.03, - "grad_norm": 8.01111125946045, - "learning_rate": 1.9593220338983052e-05, - "loss": 1.5338, + "grad_norm": 7.755845069885254, + "learning_rate": 1.7333333333333336e-05, + "loss": 1.3471, "step": 1100 }, { "epoch": 0.03, - "grad_norm": 2.7759506702423096, - "learning_rate": 1.95864406779661e-05, - "loss": 1.4728, + "grad_norm": 2.065462112426758, + "learning_rate": 1.728888888888889e-05, + "loss": 1.2811, "step": 1110 }, { "epoch": 0.03, - "grad_norm": 2.339688777923584, - "learning_rate": 1.9579661016949155e-05, - "loss": 1.5947, + "grad_norm": 1.6691714525222778, + "learning_rate": 1.7244444444444446e-05, + "loss": 1.4433, "step": 1120 }, { "epoch": 0.03, - "grad_norm": 3.9827146530151367, - "learning_rate": 1.9572881355932204e-05, - "loss": 1.4338, + "grad_norm": 6.87007474899292, + "learning_rate": 1.72e-05, + "loss": 1.1916, "step": 1130 }, { "epoch": 0.03, - "grad_norm": 2.902629852294922, - "learning_rate": 1.9566101694915257e-05, - "loss": 1.5609, + "grad_norm": 1.663476586341858, + "learning_rate": 1.7155555555555557e-05, + "loss": 1.3006, "step": 1140 }, { "epoch": 0.03, - "grad_norm": 2.317248582839966, - "learning_rate": 1.9559322033898307e-05, - "loss": 1.5503, + "grad_norm": 2.493046522140503, + "learning_rate": 1.7111111111111112e-05, + "loss": 1.3099, "step": 1150 }, { "epoch": 0.03, - "grad_norm": 1.522884726524353, - "learning_rate": 1.9552542372881357e-05, - "loss": 1.3661, + "grad_norm": 3.2659084796905518, + "learning_rate": 1.706666666666667e-05, + "loss": 1.1553, "step": 1160 }, { "epoch": 0.03, - "grad_norm": 2.1419677734375, - "learning_rate": 1.954576271186441e-05, - "loss": 1.4839, + "grad_norm": 3.07832670211792, + "learning_rate": 1.7022222222222226e-05, + "loss": 1.2604, "step": 1170 }, { "epoch": 0.03, - "grad_norm": 2.2150516510009766, - "learning_rate": 1.953898305084746e-05, - "loss": 1.1754, + "grad_norm": 2.0068790912628174, + "learning_rate": 1.697777777777778e-05, + "loss": 0.9679, "step": 1180 }, { "epoch": 0.03, - "grad_norm": 4.25466775894165, - "learning_rate": 1.953220338983051e-05, - "loss": 1.4118, + "grad_norm": 14.066789627075195, + "learning_rate": 1.6933333333333336e-05, + "loss": 1.2291, "step": 1190 }, { "epoch": 0.03, - "grad_norm": 2.261260986328125, - "learning_rate": 1.9525423728813562e-05, - "loss": 1.4779, + "grad_norm": 5.058719158172607, + "learning_rate": 1.688888888888889e-05, + "loss": 1.3224, "step": 1200 }, { "epoch": 0.03, - "grad_norm": 4.751726150512695, - "learning_rate": 1.9518644067796612e-05, - "loss": 1.3143, + "grad_norm": 6.309176445007324, + "learning_rate": 1.6844444444444447e-05, + "loss": 1.1184, "step": 1210 }, { "epoch": 0.03, - "grad_norm": 3.0561487674713135, - "learning_rate": 1.9511864406779665e-05, - "loss": 1.5102, + "grad_norm": 9.05258846282959, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.345, "step": 1220 }, { "epoch": 0.03, - "grad_norm": 4.0936279296875, - "learning_rate": 1.950508474576271e-05, - "loss": 1.4763, + "grad_norm": 4.345346927642822, + "learning_rate": 1.6755555555555557e-05, + "loss": 1.2663, "step": 1230 }, { "epoch": 0.03, - "grad_norm": 2.0847556591033936, - "learning_rate": 1.9498305084745765e-05, - "loss": 1.3039, + "grad_norm": 4.119368076324463, + "learning_rate": 1.6711111111111112e-05, + "loss": 1.1441, "step": 1240 }, { "epoch": 0.03, - "grad_norm": 4.303075313568115, - "learning_rate": 1.9491525423728814e-05, - "loss": 1.4491, + "grad_norm": 5.0422563552856445, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.2199, "step": 1250 }, { "epoch": 0.03, - "grad_norm": 1.7520992755889893, - "learning_rate": 1.9484745762711864e-05, - "loss": 1.3379, + "grad_norm": 11.27535343170166, + "learning_rate": 1.6622222222222223e-05, + "loss": 1.1876, "step": 1260 }, { "epoch": 0.03, - "grad_norm": 6.233397006988525, - "learning_rate": 1.9477966101694917e-05, - "loss": 1.5582, + "grad_norm": 6.767408847808838, + "learning_rate": 1.6577777777777778e-05, + "loss": 1.3928, "step": 1270 }, { "epoch": 0.03, - "grad_norm": 2.988858461380005, - "learning_rate": 1.9471186440677967e-05, - "loss": 1.4801, + "grad_norm": 6.1706862449646, + "learning_rate": 1.6533333333333333e-05, + "loss": 1.2268, "step": 1280 }, { "epoch": 0.03, - "grad_norm": 5.172889232635498, - "learning_rate": 1.9464406779661017e-05, - "loss": 1.5481, + "grad_norm": 6.185644149780273, + "learning_rate": 1.648888888888889e-05, + "loss": 1.3086, "step": 1290 }, { "epoch": 0.03, - "grad_norm": 2.5870141983032227, - "learning_rate": 1.945762711864407e-05, - "loss": 1.4008, + "grad_norm": 9.57487678527832, + "learning_rate": 1.6444444444444444e-05, + "loss": 1.2079, "step": 1300 }, { "epoch": 0.03, - "grad_norm": 4.538797378540039, - "learning_rate": 1.945084745762712e-05, - "loss": 1.4145, + "grad_norm": 4.104099273681641, + "learning_rate": 1.64e-05, + "loss": 1.2012, "step": 1310 }, { "epoch": 0.03, - "grad_norm": 4.275447368621826, - "learning_rate": 1.9444067796610172e-05, - "loss": 1.4524, + "grad_norm": 6.703191757202148, + "learning_rate": 1.6355555555555557e-05, + "loss": 1.2567, "step": 1320 }, { "epoch": 0.03, - "grad_norm": 3.357238531112671, - "learning_rate": 1.9437288135593222e-05, - "loss": 1.487, + "grad_norm": 4.194169521331787, + "learning_rate": 1.6311111111111113e-05, + "loss": 1.3687, "step": 1330 }, { "epoch": 0.03, - "grad_norm": 2.1907289028167725, - "learning_rate": 1.9430508474576272e-05, - "loss": 1.3633, + "grad_norm": 4.2333269119262695, + "learning_rate": 1.6266666666666668e-05, + "loss": 1.1547, "step": 1340 }, { "epoch": 0.03, - "grad_norm": 1.7265887260437012, - "learning_rate": 1.9423728813559325e-05, - "loss": 1.3923, + "grad_norm": 3.007706880569458, + "learning_rate": 1.6222222222222223e-05, + "loss": 1.2031, "step": 1350 }, { "epoch": 0.03, - "grad_norm": 10.415843963623047, - "learning_rate": 1.9416949152542375e-05, - "loss": 1.3302, + "grad_norm": 22.12256622314453, + "learning_rate": 1.617777777777778e-05, + "loss": 1.1117, "step": 1360 }, { "epoch": 0.03, - "grad_norm": 6.544619560241699, - "learning_rate": 1.9410169491525424e-05, - "loss": 1.2906, + "grad_norm": 5.559662818908691, + "learning_rate": 1.6133333333333334e-05, + "loss": 1.0796, "step": 1370 }, { "epoch": 0.03, - "grad_norm": 2.8395652770996094, - "learning_rate": 1.9403389830508477e-05, - "loss": 1.4636, + "grad_norm": 4.618852615356445, + "learning_rate": 1.608888888888889e-05, + "loss": 1.3308, "step": 1380 }, { "epoch": 0.03, - "grad_norm": 3.81257963180542, - "learning_rate": 1.9396610169491527e-05, - "loss": 1.408, + "grad_norm": 3.6262142658233643, + "learning_rate": 1.6044444444444444e-05, + "loss": 1.1913, "step": 1390 }, { "epoch": 0.03, - "grad_norm": 2.445089817047119, - "learning_rate": 1.938983050847458e-05, - "loss": 1.5095, + "grad_norm": 8.0167875289917, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.341, "step": 1400 }, { "epoch": 0.03, - "grad_norm": 3.918297290802002, - "learning_rate": 1.938305084745763e-05, - "loss": 1.154, + "grad_norm": 5.142825603485107, + "learning_rate": 1.5955555555555558e-05, + "loss": 1.03, "step": 1410 }, { "epoch": 0.03, - "grad_norm": 1.7477134466171265, - "learning_rate": 1.937627118644068e-05, - "loss": 1.4866, + "grad_norm": 3.186448335647583, + "learning_rate": 1.5911111111111113e-05, + "loss": 1.2788, "step": 1420 }, { "epoch": 0.03, - "grad_norm": 2.768603801727295, - "learning_rate": 1.9369491525423733e-05, - "loss": 1.5642, + "grad_norm": 8.11291217803955, + "learning_rate": 1.586666666666667e-05, + "loss": 1.4419, "step": 1430 }, { "epoch": 0.03, - "grad_norm": 2.055950164794922, - "learning_rate": 1.936271186440678e-05, - "loss": 1.4019, + "grad_norm": 6.920192718505859, + "learning_rate": 1.5822222222222224e-05, + "loss": 1.3127, "step": 1440 }, { "epoch": 0.03, - "grad_norm": 9.180340766906738, - "learning_rate": 1.9355932203389832e-05, - "loss": 1.509, + "grad_norm": 4.357940196990967, + "learning_rate": 1.577777777777778e-05, + "loss": 1.3184, "step": 1450 }, { "epoch": 0.03, - "grad_norm": 4.526889801025391, - "learning_rate": 1.9349152542372882e-05, - "loss": 1.5197, + "grad_norm": 8.83785343170166, + "learning_rate": 1.5733333333333334e-05, + "loss": 1.382, "step": 1460 }, { "epoch": 0.03, - "grad_norm": 2.020207166671753, - "learning_rate": 1.934237288135593e-05, - "loss": 1.6292, + "grad_norm": 2.565369129180908, + "learning_rate": 1.5688888888888893e-05, + "loss": 1.4148, "step": 1470 }, { "epoch": 0.03, - "grad_norm": 3.9965012073516846, - "learning_rate": 1.9335593220338985e-05, - "loss": 1.6703, + "grad_norm": 3.819150686264038, + "learning_rate": 1.5644444444444448e-05, + "loss": 1.4428, "step": 1480 }, { "epoch": 0.03, - "grad_norm": 0.9574942588806152, - "learning_rate": 1.9328813559322034e-05, - "loss": 1.3525, + "grad_norm": 7.107534408569336, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.2188, "step": 1490 }, { "epoch": 0.03, - "grad_norm": 2.8527791500091553, - "learning_rate": 1.9322033898305087e-05, - "loss": 1.5142, + "grad_norm": 9.566027641296387, + "learning_rate": 1.555555555555556e-05, + "loss": 1.2406, "step": 1500 }, { "epoch": 0.03, - "eval_loss": 1.1832261085510254, - "eval_runtime": 67.0686, - "eval_samples_per_second": 14.91, - "eval_steps_per_second": 14.91, + "eval_loss": 1.0252078771591187, + "eval_runtime": 66.9753, + "eval_samples_per_second": 14.931, + "eval_steps_per_second": 14.931, "step": 1500 }, { "epoch": 0.03, - "grad_norm": 5.690788745880127, - "learning_rate": 1.9315254237288137e-05, - "loss": 1.4003, + "grad_norm": 8.054425239562988, + "learning_rate": 1.5511111111111114e-05, + "loss": 1.2767, "step": 1510 }, { "epoch": 0.04, - "grad_norm": 7.6338605880737305, - "learning_rate": 1.9308474576271187e-05, - "loss": 1.4324, + "grad_norm": 2.8494319915771484, + "learning_rate": 1.546666666666667e-05, + "loss": 1.1632, "step": 1520 }, { "epoch": 0.04, - "grad_norm": 3.098454713821411, - "learning_rate": 1.930169491525424e-05, - "loss": 1.5234, + "grad_norm": 2.9305810928344727, + "learning_rate": 1.5422222222222224e-05, + "loss": 1.2075, "step": 1530 }, { "epoch": 0.04, - "grad_norm": 4.322094440460205, - "learning_rate": 1.929491525423729e-05, - "loss": 1.2201, + "grad_norm": 11.442655563354492, + "learning_rate": 1.537777777777778e-05, + "loss": 1.0231, "step": 1540 }, { "epoch": 0.04, - "grad_norm": 4.089012622833252, - "learning_rate": 1.928813559322034e-05, - "loss": 1.4444, + "grad_norm": 1.735372543334961, + "learning_rate": 1.5333333333333334e-05, + "loss": 1.282, "step": 1550 }, { "epoch": 0.04, - "grad_norm": 3.2624104022979736, - "learning_rate": 1.9281355932203392e-05, - "loss": 1.4144, + "grad_norm": 11.126168251037598, + "learning_rate": 1.528888888888889e-05, + "loss": 1.2106, "step": 1560 }, { "epoch": 0.04, - "grad_norm": 2.4324264526367188, - "learning_rate": 1.9274576271186442e-05, - "loss": 1.3715, + "grad_norm": 5.569930076599121, + "learning_rate": 1.5244444444444447e-05, + "loss": 1.2016, "step": 1570 }, { "epoch": 0.04, - "grad_norm": 3.596635341644287, - "learning_rate": 1.9267796610169492e-05, - "loss": 1.366, + "grad_norm": 4.002272605895996, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.206, "step": 1580 }, { "epoch": 0.04, - "grad_norm": 2.6570637226104736, - "learning_rate": 1.9261016949152545e-05, - "loss": 1.5426, + "grad_norm": 11.26425838470459, + "learning_rate": 1.5155555555555557e-05, + "loss": 1.3145, "step": 1590 }, { "epoch": 0.04, - "grad_norm": 2.935263156890869, - "learning_rate": 1.9254237288135595e-05, - "loss": 1.2968, + "grad_norm": 6.265772819519043, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.105, "step": 1600 }, { "epoch": 0.04, - "grad_norm": 4.188492298126221, - "learning_rate": 1.9247457627118648e-05, - "loss": 1.396, + "grad_norm": 6.139275550842285, + "learning_rate": 1.5066666666666668e-05, + "loss": 1.2016, "step": 1610 }, { "epoch": 0.04, - "grad_norm": 4.421194553375244, - "learning_rate": 1.9240677966101698e-05, - "loss": 1.4905, + "grad_norm": 4.753066539764404, + "learning_rate": 1.5022222222222223e-05, + "loss": 1.3433, "step": 1620 }, { "epoch": 0.04, - "grad_norm": 1.735612154006958, - "learning_rate": 1.9233898305084747e-05, - "loss": 1.4508, + "grad_norm": 8.761942863464355, + "learning_rate": 1.497777777777778e-05, + "loss": 1.3153, "step": 1630 }, { "epoch": 0.04, - "grad_norm": 1.989076018333435, - "learning_rate": 1.92271186440678e-05, - "loss": 1.4195, + "grad_norm": 3.3448381423950195, + "learning_rate": 1.4933333333333335e-05, + "loss": 1.24, "step": 1640 }, { "epoch": 0.04, - "grad_norm": 1.4910849332809448, - "learning_rate": 1.9220338983050847e-05, - "loss": 1.374, + "grad_norm": 2.818711519241333, + "learning_rate": 1.488888888888889e-05, + "loss": 1.2067, "step": 1650 }, { "epoch": 0.04, - "grad_norm": 1.7809735536575317, - "learning_rate": 1.92135593220339e-05, - "loss": 1.5008, + "grad_norm": 11.795276641845703, + "learning_rate": 1.4844444444444445e-05, + "loss": 1.3926, "step": 1660 }, { "epoch": 0.04, - "grad_norm": 3.1804401874542236, - "learning_rate": 1.920677966101695e-05, - "loss": 1.7274, + "grad_norm": 4.47786283493042, + "learning_rate": 1.48e-05, + "loss": 1.5349, "step": 1670 }, { "epoch": 0.04, - "grad_norm": 1.3297713994979858, - "learning_rate": 1.9200000000000003e-05, - "loss": 1.3982, + "grad_norm": 4.560647010803223, + "learning_rate": 1.4755555555555556e-05, + "loss": 1.2318, "step": 1680 }, { "epoch": 0.04, - "grad_norm": 2.2256460189819336, - "learning_rate": 1.9193220338983052e-05, - "loss": 1.3758, + "grad_norm": 5.564818382263184, + "learning_rate": 1.4711111111111111e-05, + "loss": 1.2044, "step": 1690 }, { "epoch": 0.04, - "grad_norm": 1.3911705017089844, - "learning_rate": 1.9186440677966102e-05, - "loss": 1.5054, + "grad_norm": 1.4382556676864624, + "learning_rate": 1.4666666666666666e-05, + "loss": 1.3367, "step": 1700 }, { "epoch": 0.04, - "grad_norm": 1.7374446392059326, - "learning_rate": 1.9179661016949155e-05, - "loss": 1.3796, + "grad_norm": 2.7976467609405518, + "learning_rate": 1.4622222222222225e-05, + "loss": 1.2304, "step": 1710 }, { "epoch": 0.04, - "grad_norm": 2.8097071647644043, - "learning_rate": 1.9172881355932205e-05, - "loss": 1.2966, + "grad_norm": 3.021933078765869, + "learning_rate": 1.457777777777778e-05, + "loss": 1.1339, "step": 1720 }, { "epoch": 0.04, - "grad_norm": 2.590428352355957, - "learning_rate": 1.9166101694915254e-05, - "loss": 1.493, + "grad_norm": 1.9026000499725342, + "learning_rate": 1.4533333333333335e-05, + "loss": 1.3733, "step": 1730 }, { "epoch": 0.04, - "grad_norm": 3.458738327026367, - "learning_rate": 1.9159322033898308e-05, - "loss": 1.3179, + "grad_norm": 4.237542152404785, + "learning_rate": 1.448888888888889e-05, + "loss": 1.0956, "step": 1740 }, { "epoch": 0.04, - "grad_norm": 4.689072132110596, - "learning_rate": 1.9152542372881357e-05, - "loss": 1.2673, + "grad_norm": 5.288679599761963, + "learning_rate": 1.4444444444444446e-05, + "loss": 1.0542, "step": 1750 }, { "epoch": 0.04, - "grad_norm": 2.915776252746582, - "learning_rate": 1.9145762711864407e-05, - "loss": 1.5096, + "grad_norm": 4.956302642822266, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.326, "step": 1760 }, { "epoch": 0.04, - "grad_norm": 4.790291786193848, - "learning_rate": 1.913898305084746e-05, - "loss": 1.2601, + "grad_norm": 4.096738338470459, + "learning_rate": 1.4355555555555556e-05, + "loss": 1.1285, "step": 1770 }, { "epoch": 0.04, - "grad_norm": 2.455094337463379, - "learning_rate": 1.913220338983051e-05, - "loss": 1.5662, + "grad_norm": 7.757137298583984, + "learning_rate": 1.4311111111111111e-05, + "loss": 1.3607, "step": 1780 }, { "epoch": 0.04, - "grad_norm": 3.6922054290771484, - "learning_rate": 1.9125423728813563e-05, - "loss": 1.1819, + "grad_norm": 5.415702819824219, + "learning_rate": 1.4266666666666668e-05, + "loss": 1.0, "step": 1790 }, { "epoch": 0.04, - "grad_norm": 4.807013511657715, - "learning_rate": 1.9118644067796613e-05, - "loss": 1.4572, + "grad_norm": 9.445133209228516, + "learning_rate": 1.4222222222222224e-05, + "loss": 1.2187, "step": 1800 }, { "epoch": 0.04, - "grad_norm": 4.976742267608643, - "learning_rate": 1.9111864406779662e-05, - "loss": 1.4976, + "grad_norm": 5.406456470489502, + "learning_rate": 1.4177777777777779e-05, + "loss": 1.3459, "step": 1810 }, { "epoch": 0.04, - "grad_norm": 5.053407669067383, - "learning_rate": 1.9105084745762715e-05, - "loss": 1.4944, + "grad_norm": 1.619770884513855, + "learning_rate": 1.4133333333333334e-05, + "loss": 1.2368, "step": 1820 }, { "epoch": 0.04, - "grad_norm": 4.595452308654785, - "learning_rate": 1.9098305084745765e-05, - "loss": 1.3392, + "grad_norm": 4.307522296905518, + "learning_rate": 1.408888888888889e-05, + "loss": 1.0995, "step": 1830 }, { "epoch": 0.04, - "grad_norm": 1.3358185291290283, - "learning_rate": 1.9091525423728815e-05, - "loss": 1.3567, + "grad_norm": 6.3472185134887695, + "learning_rate": 1.4044444444444445e-05, + "loss": 1.2563, "step": 1840 }, { "epoch": 0.04, - "grad_norm": 2.6956140995025635, - "learning_rate": 1.9084745762711868e-05, - "loss": 1.4298, + "grad_norm": 2.04168701171875, + "learning_rate": 1.4e-05, + "loss": 1.2858, "step": 1850 }, { "epoch": 0.04, - "grad_norm": 2.6025402545928955, - "learning_rate": 1.9077966101694914e-05, - "loss": 1.3674, + "grad_norm": 2.180267810821533, + "learning_rate": 1.3955555555555558e-05, + "loss": 1.123, "step": 1860 }, { "epoch": 0.04, - "grad_norm": 1.4478222131729126, - "learning_rate": 1.9071186440677967e-05, - "loss": 1.5232, + "grad_norm": 2.560042381286621, + "learning_rate": 1.3911111111111114e-05, + "loss": 1.3962, "step": 1870 }, { "epoch": 0.04, - "grad_norm": 2.7879269123077393, - "learning_rate": 1.9064406779661017e-05, - "loss": 1.2088, + "grad_norm": 5.683982849121094, + "learning_rate": 1.3866666666666669e-05, + "loss": 1.0984, "step": 1880 }, { "epoch": 0.04, - "grad_norm": 4.009925365447998, - "learning_rate": 1.905762711864407e-05, - "loss": 1.306, + "grad_norm": 3.1994190216064453, + "learning_rate": 1.3822222222222224e-05, + "loss": 1.1746, "step": 1890 }, { "epoch": 0.04, - "grad_norm": 2.8789501190185547, - "learning_rate": 1.905084745762712e-05, - "loss": 1.6352, + "grad_norm": 8.851926803588867, + "learning_rate": 1.377777777777778e-05, + "loss": 1.4551, "step": 1900 }, { "epoch": 0.04, - "grad_norm": 1.9612170457839966, - "learning_rate": 1.904406779661017e-05, - "loss": 1.4313, + "grad_norm": 2.670786142349243, + "learning_rate": 1.3733333333333335e-05, + "loss": 1.2446, "step": 1910 }, { "epoch": 0.04, - "grad_norm": 3.5845580101013184, - "learning_rate": 1.9037288135593223e-05, - "loss": 1.464, + "grad_norm": 5.134795188903809, + "learning_rate": 1.368888888888889e-05, + "loss": 1.2128, "step": 1920 }, { "epoch": 0.04, - "grad_norm": 5.5218095779418945, - "learning_rate": 1.9030508474576272e-05, - "loss": 1.3589, + "grad_norm": 4.486630916595459, + "learning_rate": 1.3644444444444445e-05, + "loss": 1.1951, "step": 1930 }, { "epoch": 0.04, - "grad_norm": 3.3145925998687744, - "learning_rate": 1.9023728813559322e-05, - "loss": 1.5245, + "grad_norm": 4.025615215301514, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.3628, "step": 1940 }, { "epoch": 0.05, - "grad_norm": 1.0829366445541382, - "learning_rate": 1.9016949152542375e-05, - "loss": 1.5265, + "grad_norm": 4.232900619506836, + "learning_rate": 1.3555555555555557e-05, + "loss": 1.3367, "step": 1950 }, { "epoch": 0.05, - "grad_norm": 2.3161795139312744, - "learning_rate": 1.9010169491525425e-05, - "loss": 1.3759, + "grad_norm": 1.6469148397445679, + "learning_rate": 1.3511111111111112e-05, + "loss": 1.2144, "step": 1960 }, { "epoch": 0.05, - "grad_norm": 4.037906169891357, - "learning_rate": 1.9003389830508478e-05, - "loss": 1.576, + "grad_norm": 8.560945510864258, + "learning_rate": 1.3466666666666668e-05, + "loss": 1.4125, "step": 1970 }, { "epoch": 0.05, - "grad_norm": 2.638181447982788, - "learning_rate": 1.8996610169491528e-05, - "loss": 1.3606, + "grad_norm": 3.06697416305542, + "learning_rate": 1.3422222222222223e-05, + "loss": 1.1769, "step": 1980 }, { "epoch": 0.05, - "grad_norm": 1.6348967552185059, - "learning_rate": 1.8989830508474577e-05, - "loss": 1.4664, + "grad_norm": 2.721186399459839, + "learning_rate": 1.3377777777777778e-05, + "loss": 1.3266, "step": 1990 }, { "epoch": 0.05, - "grad_norm": 2.778348684310913, - "learning_rate": 1.898305084745763e-05, - "loss": 1.308, + "grad_norm": 4.427524566650391, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.1697, "step": 2000 }, { "epoch": 0.05, - "eval_loss": 1.1971772909164429, - "eval_runtime": 67.1893, - "eval_samples_per_second": 14.883, - "eval_steps_per_second": 14.883, + "eval_loss": 1.042366623878479, + "eval_runtime": 67.0143, + "eval_samples_per_second": 14.922, + "eval_steps_per_second": 14.922, "step": 2000 }, { "epoch": 0.05, - "grad_norm": 3.655930757522583, - "learning_rate": 1.897627118644068e-05, - "loss": 1.5353, + "grad_norm": 1.322222113609314, + "learning_rate": 1.3288888888888889e-05, + "loss": 1.3173, "step": 2010 }, { "epoch": 0.05, - "grad_norm": 7.265784740447998, - "learning_rate": 1.896949152542373e-05, - "loss": 1.4901, + "grad_norm": 5.731060028076172, + "learning_rate": 1.3244444444444447e-05, + "loss": 1.2578, "step": 2020 }, { "epoch": 0.05, - "grad_norm": 1.902195930480957, - "learning_rate": 1.8962711864406783e-05, - "loss": 1.3619, + "grad_norm": 0.8411041498184204, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.1796, "step": 2030 }, { "epoch": 0.05, - "grad_norm": 2.685209035873413, - "learning_rate": 1.8955932203389833e-05, - "loss": 1.4934, + "grad_norm": 5.4170026779174805, + "learning_rate": 1.3155555555555558e-05, + "loss": 1.3608, "step": 2040 }, { "epoch": 0.05, - "grad_norm": 6.6489458084106445, - "learning_rate": 1.8949152542372882e-05, - "loss": 1.4807, + "grad_norm": 4.58616304397583, + "learning_rate": 1.3111111111111113e-05, + "loss": 1.2726, "step": 2050 }, { "epoch": 0.05, - "grad_norm": 4.4234724044799805, - "learning_rate": 1.8942372881355932e-05, - "loss": 1.4953, + "grad_norm": 3.936751365661621, + "learning_rate": 1.3066666666666668e-05, + "loss": 1.3327, "step": 2060 }, { "epoch": 0.05, - "grad_norm": 2.339203119277954, - "learning_rate": 1.8935593220338985e-05, - "loss": 1.4583, + "grad_norm": 4.1074042320251465, + "learning_rate": 1.3022222222222223e-05, + "loss": 1.2446, "step": 2070 }, { "epoch": 0.05, - "grad_norm": 2.0084259510040283, - "learning_rate": 1.8928813559322035e-05, - "loss": 1.1904, + "grad_norm": 2.657953977584839, + "learning_rate": 1.2977777777777779e-05, + "loss": 0.9989, "step": 2080 }, { "epoch": 0.05, - "grad_norm": 2.666440963745117, - "learning_rate": 1.8922033898305085e-05, - "loss": 1.3025, + "grad_norm": 5.181591987609863, + "learning_rate": 1.2933333333333334e-05, + "loss": 1.1732, "step": 2090 }, { "epoch": 0.05, - "grad_norm": 1.9041775465011597, - "learning_rate": 1.8915254237288138e-05, - "loss": 1.3873, + "grad_norm": 5.981390953063965, + "learning_rate": 1.288888888888889e-05, + "loss": 1.1463, "step": 2100 }, { "epoch": 0.05, - "grad_norm": 3.1416661739349365, - "learning_rate": 1.8908474576271187e-05, - "loss": 1.3741, + "grad_norm": 6.05696964263916, + "learning_rate": 1.2844444444444446e-05, + "loss": 1.2061, "step": 2110 }, { "epoch": 0.05, - "grad_norm": 3.8044958114624023, - "learning_rate": 1.8901694915254237e-05, - "loss": 1.5686, + "grad_norm": 6.697308540344238, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.3052, "step": 2120 }, { "epoch": 0.05, - "grad_norm": 3.8283112049102783, - "learning_rate": 1.889491525423729e-05, - "loss": 1.3731, + "grad_norm": 14.833478927612305, + "learning_rate": 1.2755555555555556e-05, + "loss": 1.1987, "step": 2130 }, { "epoch": 0.05, - "grad_norm": 3.2208967208862305, - "learning_rate": 1.888813559322034e-05, - "loss": 1.5184, + "grad_norm": 4.479732990264893, + "learning_rate": 1.2711111111111112e-05, + "loss": 1.337, "step": 2140 }, { "epoch": 0.05, - "grad_norm": 1.5146480798721313, - "learning_rate": 1.8881355932203393e-05, - "loss": 1.3791, + "grad_norm": 3.746943235397339, + "learning_rate": 1.2666666666666667e-05, + "loss": 1.3082, "step": 2150 }, { "epoch": 0.05, - "grad_norm": 2.624804973602295, - "learning_rate": 1.8874576271186443e-05, - "loss": 1.3399, + "grad_norm": 4.7201828956604, + "learning_rate": 1.2622222222222222e-05, + "loss": 1.1393, "step": 2160 }, { "epoch": 0.05, - "grad_norm": 4.085561752319336, - "learning_rate": 1.8867796610169492e-05, - "loss": 1.5298, + "grad_norm": 5.082671642303467, + "learning_rate": 1.257777777777778e-05, + "loss": 1.3047, "step": 2170 }, { "epoch": 0.05, - "grad_norm": 4.3589959144592285, - "learning_rate": 1.8861016949152545e-05, - "loss": 1.2609, + "grad_norm": 3.5268049240112305, + "learning_rate": 1.2533333333333336e-05, + "loss": 1.1255, "step": 2180 }, { "epoch": 0.05, - "grad_norm": 5.165348529815674, - "learning_rate": 1.8854237288135595e-05, - "loss": 1.5708, + "grad_norm": 3.8856544494628906, + "learning_rate": 1.2488888888888891e-05, + "loss": 1.4022, "step": 2190 }, { "epoch": 0.05, - "grad_norm": 2.508254051208496, - "learning_rate": 1.8847457627118645e-05, - "loss": 1.5046, + "grad_norm": 5.552175521850586, + "learning_rate": 1.2444444444444446e-05, + "loss": 1.3444, "step": 2200 }, { "epoch": 0.05, - "grad_norm": 4.811107635498047, - "learning_rate": 1.8840677966101698e-05, - "loss": 1.5798, + "grad_norm": 2.772660255432129, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.4274, "step": 2210 }, { "epoch": 0.05, - "grad_norm": 4.915175914764404, - "learning_rate": 1.8833898305084748e-05, - "loss": 1.3742, + "grad_norm": 6.9137773513793945, + "learning_rate": 1.2355555555555557e-05, + "loss": 1.1356, "step": 2220 }, { "epoch": 0.05, - "grad_norm": 4.212732791900635, - "learning_rate": 1.8827118644067797e-05, - "loss": 1.4748, + "grad_norm": 2.5068724155426025, + "learning_rate": 1.2311111111111112e-05, + "loss": 1.25, "step": 2230 }, { "epoch": 0.05, - "grad_norm": 2.4692771434783936, - "learning_rate": 1.882033898305085e-05, - "loss": 1.1476, + "grad_norm": 0.8587338924407959, + "learning_rate": 1.2266666666666667e-05, + "loss": 0.9182, "step": 2240 }, { "epoch": 0.05, - "grad_norm": 3.057826042175293, - "learning_rate": 1.88135593220339e-05, - "loss": 1.4235, + "grad_norm": 5.356514930725098, + "learning_rate": 1.2222222222222224e-05, + "loss": 1.2882, "step": 2250 }, { "epoch": 0.05, - "grad_norm": 1.5246427059173584, - "learning_rate": 1.8806779661016953e-05, - "loss": 1.4705, + "grad_norm": 8.728981018066406, + "learning_rate": 1.217777777777778e-05, + "loss": 1.2974, "step": 2260 }, { "epoch": 0.05, - "grad_norm": 1.8235859870910645, - "learning_rate": 1.88e-05, - "loss": 1.4017, + "grad_norm": 2.5229454040527344, + "learning_rate": 1.2133333333333335e-05, + "loss": 1.2368, "step": 2270 }, { "epoch": 0.05, - "grad_norm": 2.3737335205078125, - "learning_rate": 1.8793220338983053e-05, - "loss": 1.2918, + "grad_norm": 2.760233163833618, + "learning_rate": 1.208888888888889e-05, + "loss": 1.0504, "step": 2280 }, { "epoch": 0.05, - "grad_norm": 1.6644233465194702, - "learning_rate": 1.8786440677966102e-05, - "loss": 1.5225, + "grad_norm": 1.7355753183364868, + "learning_rate": 1.2044444444444445e-05, + "loss": 1.3547, "step": 2290 }, { "epoch": 0.05, - "grad_norm": 2.259829044342041, - "learning_rate": 1.8779661016949152e-05, - "loss": 1.428, + "grad_norm": 2.2375967502593994, + "learning_rate": 1.2e-05, + "loss": 1.3058, "step": 2300 }, { "epoch": 0.05, - "grad_norm": 2.356344223022461, - "learning_rate": 1.8772881355932205e-05, - "loss": 1.4652, + "grad_norm": 4.5727386474609375, + "learning_rate": 1.1955555555555556e-05, + "loss": 1.3097, "step": 2310 }, { "epoch": 0.05, - "grad_norm": 4.862521171569824, - "learning_rate": 1.8766101694915255e-05, - "loss": 1.3431, + "grad_norm": 7.387458801269531, + "learning_rate": 1.191111111111111e-05, + "loss": 1.2058, "step": 2320 }, { "epoch": 0.05, - "grad_norm": 2.1847054958343506, - "learning_rate": 1.8759322033898305e-05, - "loss": 1.3383, + "grad_norm": 3.593174934387207, + "learning_rate": 1.186666666666667e-05, + "loss": 1.1711, "step": 2330 }, { "epoch": 0.05, - "grad_norm": 5.384062767028809, - "learning_rate": 1.8752542372881358e-05, - "loss": 1.3222, + "grad_norm": 3.3077690601348877, + "learning_rate": 1.1822222222222225e-05, + "loss": 1.1175, "step": 2340 }, { "epoch": 0.05, - "grad_norm": 2.504513740539551, - "learning_rate": 1.8745762711864407e-05, - "loss": 1.4033, + "grad_norm": 4.785140514373779, + "learning_rate": 1.177777777777778e-05, + "loss": 1.2618, "step": 2350 }, { "epoch": 0.05, - "grad_norm": 2.035881757736206, - "learning_rate": 1.873898305084746e-05, - "loss": 1.4297, + "grad_norm": 1.8957252502441406, + "learning_rate": 1.1733333333333335e-05, + "loss": 1.1641, "step": 2360 }, { "epoch": 0.05, - "grad_norm": 3.0843729972839355, - "learning_rate": 1.873220338983051e-05, - "loss": 1.5548, + "grad_norm": 7.033701419830322, + "learning_rate": 1.168888888888889e-05, + "loss": 1.3563, "step": 2370 }, { "epoch": 0.06, - "grad_norm": 4.0240912437438965, - "learning_rate": 1.872542372881356e-05, - "loss": 1.5772, + "grad_norm": 2.6566953659057617, + "learning_rate": 1.1644444444444446e-05, + "loss": 1.3673, "step": 2380 }, { "epoch": 0.06, - "grad_norm": 5.927667140960693, - "learning_rate": 1.8718644067796613e-05, - "loss": 1.471, + "grad_norm": 10.329083442687988, + "learning_rate": 1.16e-05, + "loss": 1.2953, "step": 2390 }, { "epoch": 0.06, - "grad_norm": 4.345820903778076, - "learning_rate": 1.8711864406779663e-05, - "loss": 1.5337, + "grad_norm": 3.5499980449676514, + "learning_rate": 1.1555555555555556e-05, + "loss": 1.3136, "step": 2400 }, { "epoch": 0.06, - "grad_norm": 4.0480122566223145, - "learning_rate": 1.8705084745762712e-05, - "loss": 1.5327, + "grad_norm": 2.051661968231201, + "learning_rate": 1.1511111111111113e-05, + "loss": 1.3235, "step": 2410 }, { "epoch": 0.06, - "grad_norm": 2.2904655933380127, - "learning_rate": 1.8698305084745765e-05, - "loss": 1.3821, + "grad_norm": 2.9961462020874023, + "learning_rate": 1.1466666666666668e-05, + "loss": 1.1637, "step": 2420 }, { "epoch": 0.06, - "grad_norm": 5.9938225746154785, - "learning_rate": 1.8691525423728815e-05, - "loss": 1.2987, + "grad_norm": 8.53834056854248, + "learning_rate": 1.1422222222222223e-05, + "loss": 1.1066, "step": 2430 }, { "epoch": 0.06, - "grad_norm": 2.0605130195617676, - "learning_rate": 1.8684745762711868e-05, - "loss": 1.7052, + "grad_norm": 5.684910297393799, + "learning_rate": 1.1377777777777779e-05, + "loss": 1.4476, "step": 2440 }, { "epoch": 0.06, - "grad_norm": 1.4659353494644165, - "learning_rate": 1.8677966101694918e-05, - "loss": 1.5719, + "grad_norm": 16.79145622253418, + "learning_rate": 1.1333333333333334e-05, + "loss": 1.3978, "step": 2450 }, { "epoch": 0.06, - "grad_norm": 1.8421217203140259, - "learning_rate": 1.8671186440677968e-05, - "loss": 1.3397, + "grad_norm": 3.3043346405029297, + "learning_rate": 1.1288888888888889e-05, + "loss": 1.1598, "step": 2460 }, { "epoch": 0.06, - "grad_norm": 1.0998902320861816, - "learning_rate": 1.866440677966102e-05, - "loss": 1.4649, + "grad_norm": 3.7516391277313232, + "learning_rate": 1.1244444444444444e-05, + "loss": 1.3534, "step": 2470 }, { "epoch": 0.06, - "grad_norm": 3.088279962539673, - "learning_rate": 1.8657627118644067e-05, - "loss": 1.4717, + "grad_norm": 2.0643539428710938, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.2924, "step": 2480 }, { "epoch": 0.06, - "grad_norm": 8.840229034423828, - "learning_rate": 1.865084745762712e-05, - "loss": 1.4184, + "grad_norm": 4.207937717437744, + "learning_rate": 1.1155555555555556e-05, + "loss": 1.2699, "step": 2490 }, { "epoch": 0.06, - "grad_norm": 2.5428450107574463, - "learning_rate": 1.864406779661017e-05, - "loss": 1.5584, + "grad_norm": 3.3195173740386963, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.379, "step": 2500 }, { "epoch": 0.06, - "eval_loss": 1.1482951641082764, - "eval_runtime": 67.0978, - "eval_samples_per_second": 14.904, - "eval_steps_per_second": 14.904, + "eval_loss": 0.986103355884552, + "eval_runtime": 67.0933, + "eval_samples_per_second": 14.905, + "eval_steps_per_second": 14.905, "step": 2500 - }, - { - "epoch": 0.06, - "grad_norm": 5.455661773681641, - "learning_rate": 1.863728813559322e-05, - "loss": 1.5017, - "step": 2510 - }, - { - "epoch": 0.06, - "grad_norm": 4.0317816734313965, - "learning_rate": 1.8630508474576273e-05, - "loss": 1.5774, - "step": 2520 - }, - { - "epoch": 0.06, - "grad_norm": 3.15238618850708, - "learning_rate": 1.8623728813559322e-05, - "loss": 1.3854, - "step": 2530 - }, - { - "epoch": 0.06, - "grad_norm": 2.9443018436431885, - "learning_rate": 1.8616949152542376e-05, - "loss": 1.5316, - "step": 2540 - }, - { - "epoch": 0.06, - "grad_norm": 3.2570126056671143, - "learning_rate": 1.8610169491525425e-05, - "loss": 1.4141, - "step": 2550 - }, - { - "epoch": 0.06, - "grad_norm": 1.6843924522399902, - "learning_rate": 1.8603389830508475e-05, - "loss": 1.4986, - "step": 2560 - }, - { - "epoch": 0.06, - "grad_norm": 1.697768211364746, - "learning_rate": 1.8596610169491528e-05, - "loss": 1.3707, - "step": 2570 - }, - { - "epoch": 0.06, - "grad_norm": 5.394255638122559, - "learning_rate": 1.8589830508474578e-05, - "loss": 1.4716, - "step": 2580 - }, - { - "epoch": 0.06, - "grad_norm": 5.255263805389404, - "learning_rate": 1.8583050847457627e-05, - "loss": 1.5351, - "step": 2590 - }, - { - "epoch": 0.06, - "grad_norm": 4.587831974029541, - "learning_rate": 1.857627118644068e-05, - "loss": 1.3261, - "step": 2600 - }, - { - "epoch": 0.06, - "grad_norm": 8.405659675598145, - "learning_rate": 1.856949152542373e-05, - "loss": 1.4404, - "step": 2610 - }, - { - "epoch": 0.06, - "grad_norm": 4.0395121574401855, - "learning_rate": 1.856271186440678e-05, - "loss": 1.368, - "step": 2620 - }, - { - "epoch": 0.06, - "grad_norm": 6.0600972175598145, - "learning_rate": 1.8555932203389833e-05, - "loss": 1.4306, - "step": 2630 - }, - { - "epoch": 0.06, - "grad_norm": 3.951476573944092, - "learning_rate": 1.8549152542372883e-05, - "loss": 1.4079, - "step": 2640 - }, - { - "epoch": 0.06, - "grad_norm": 2.6316113471984863, - "learning_rate": 1.8542372881355936e-05, - "loss": 1.4675, - "step": 2650 - }, - { - "epoch": 0.06, - "grad_norm": 2.2290899753570557, - "learning_rate": 1.8535593220338986e-05, - "loss": 1.462, - "step": 2660 - }, - { - "epoch": 0.06, - "grad_norm": 1.8435947895050049, - "learning_rate": 1.8528813559322035e-05, - "loss": 1.0381, - "step": 2670 - }, - { - "epoch": 0.06, - "grad_norm": 2.033623456954956, - "learning_rate": 1.852203389830509e-05, - "loss": 1.5768, - "step": 2680 - }, - { - "epoch": 0.06, - "grad_norm": 2.9724245071411133, - "learning_rate": 1.8515254237288135e-05, - "loss": 1.2659, - "step": 2690 - }, - { - "epoch": 0.06, - "grad_norm": 3.107775926589966, - "learning_rate": 1.8508474576271188e-05, - "loss": 1.1751, - "step": 2700 - }, - { - "epoch": 0.06, - "grad_norm": 4.55193567276001, - "learning_rate": 1.8501694915254237e-05, - "loss": 1.5424, - "step": 2710 - }, - { - "epoch": 0.06, - "grad_norm": 1.6639865636825562, - "learning_rate": 1.849491525423729e-05, - "loss": 1.391, - "step": 2720 - }, - { - "epoch": 0.06, - "grad_norm": 1.7521861791610718, - "learning_rate": 1.848813559322034e-05, - "loss": 1.5969, - "step": 2730 - }, - { - "epoch": 0.06, - "grad_norm": 1.5757704973220825, - "learning_rate": 1.848135593220339e-05, - "loss": 1.4257, - "step": 2740 - }, - { - "epoch": 0.06, - "grad_norm": 3.4172585010528564, - "learning_rate": 1.8474576271186443e-05, - "loss": 1.4596, - "step": 2750 - }, - { - "epoch": 0.06, - "grad_norm": 3.5575098991394043, - "learning_rate": 1.8467796610169493e-05, - "loss": 1.5946, - "step": 2760 - }, - { - "epoch": 0.06, - "grad_norm": 3.385815382003784, - "learning_rate": 1.8461016949152542e-05, - "loss": 1.3725, - "step": 2770 - }, - { - "epoch": 0.06, - "grad_norm": 3.108947992324829, - "learning_rate": 1.8454237288135596e-05, - "loss": 1.3995, - "step": 2780 - }, - { - "epoch": 0.06, - "grad_norm": 1.180427074432373, - "learning_rate": 1.8447457627118645e-05, - "loss": 1.7337, - "step": 2790 - }, - { - "epoch": 0.06, - "grad_norm": 1.628450632095337, - "learning_rate": 1.8440677966101695e-05, - "loss": 1.3329, - "step": 2800 - }, - { - "epoch": 0.07, - "grad_norm": 1.4316961765289307, - "learning_rate": 1.8433898305084748e-05, - "loss": 1.5717, - "step": 2810 - }, - { - "epoch": 0.07, - "grad_norm": 1.428252100944519, - "learning_rate": 1.8427118644067798e-05, - "loss": 1.3912, - "step": 2820 - }, - { - "epoch": 0.07, - "grad_norm": 2.6684863567352295, - "learning_rate": 1.842033898305085e-05, - "loss": 1.4674, - "step": 2830 - }, - { - "epoch": 0.07, - "grad_norm": 1.7176380157470703, - "learning_rate": 1.84135593220339e-05, - "loss": 1.2351, - "step": 2840 - }, - { - "epoch": 0.07, - "grad_norm": 2.1771388053894043, - "learning_rate": 1.840677966101695e-05, - "loss": 1.6447, - "step": 2850 - }, - { - "epoch": 0.07, - "grad_norm": 4.724181652069092, - "learning_rate": 1.8400000000000003e-05, - "loss": 1.4167, - "step": 2860 - }, - { - "epoch": 0.07, - "grad_norm": 2.6035029888153076, - "learning_rate": 1.8393220338983053e-05, - "loss": 1.3391, - "step": 2870 - }, - { - "epoch": 0.07, - "grad_norm": 2.2763707637786865, - "learning_rate": 1.8386440677966103e-05, - "loss": 1.5107, - "step": 2880 - }, - { - "epoch": 0.07, - "grad_norm": 1.8805983066558838, - "learning_rate": 1.8379661016949153e-05, - "loss": 1.6482, - "step": 2890 - }, - { - "epoch": 0.07, - "grad_norm": 1.5126351118087769, - "learning_rate": 1.8372881355932202e-05, - "loss": 1.4343, - "step": 2900 - }, - { - "epoch": 0.07, - "grad_norm": 7.735413074493408, - "learning_rate": 1.8366101694915255e-05, - "loss": 1.2867, - "step": 2910 - }, - { - "epoch": 0.07, - "grad_norm": 3.7856109142303467, - "learning_rate": 1.8359322033898305e-05, - "loss": 1.3506, - "step": 2920 - }, - { - "epoch": 0.07, - "grad_norm": 2.2521424293518066, - "learning_rate": 1.8352542372881358e-05, - "loss": 1.3935, - "step": 2930 - }, - { - "epoch": 0.07, - "grad_norm": 2.2285022735595703, - "learning_rate": 1.8345762711864408e-05, - "loss": 1.5142, - "step": 2940 - }, - { - "epoch": 0.07, - "grad_norm": 2.304668664932251, - "learning_rate": 1.8338983050847458e-05, - "loss": 1.5006, - "step": 2950 - }, - { - "epoch": 0.07, - "grad_norm": 1.3826477527618408, - "learning_rate": 1.833220338983051e-05, - "loss": 1.1512, - "step": 2960 - }, - { - "epoch": 0.07, - "grad_norm": 1.6900218725204468, - "learning_rate": 1.832542372881356e-05, - "loss": 1.4017, - "step": 2970 - }, - { - "epoch": 0.07, - "grad_norm": 3.0282979011535645, - "learning_rate": 1.831864406779661e-05, - "loss": 1.3722, - "step": 2980 - }, - { - "epoch": 0.07, - "grad_norm": 4.85763692855835, - "learning_rate": 1.8311864406779663e-05, - "loss": 1.3328, - "step": 2990 - }, - { - "epoch": 0.07, - "grad_norm": 2.2919251918792725, - "learning_rate": 1.8305084745762713e-05, - "loss": 1.4177, - "step": 3000 - }, - { - "epoch": 0.07, - "eval_loss": 1.2010964155197144, - "eval_runtime": 67.1519, - "eval_samples_per_second": 14.892, - "eval_steps_per_second": 14.892, - "step": 3000 - }, - { - "epoch": 0.07, - "grad_norm": 2.3012161254882812, - "learning_rate": 1.8298305084745766e-05, - "loss": 1.2741, - "step": 3010 - }, - { - "epoch": 0.07, - "grad_norm": 1.728154182434082, - "learning_rate": 1.8291525423728816e-05, - "loss": 1.495, - "step": 3020 - }, - { - "epoch": 0.07, - "grad_norm": 2.277280569076538, - "learning_rate": 1.8284745762711865e-05, - "loss": 1.4877, - "step": 3030 - }, - { - "epoch": 0.07, - "grad_norm": 2.6222431659698486, - "learning_rate": 1.827796610169492e-05, - "loss": 1.3664, - "step": 3040 - }, - { - "epoch": 0.07, - "grad_norm": 2.7156178951263428, - "learning_rate": 1.8271186440677968e-05, - "loss": 1.3912, - "step": 3050 - }, - { - "epoch": 0.07, - "grad_norm": 0.9766186475753784, - "learning_rate": 1.8264406779661018e-05, - "loss": 1.3334, - "step": 3060 - }, - { - "epoch": 0.07, - "grad_norm": 11.627836227416992, - "learning_rate": 1.825762711864407e-05, - "loss": 1.6583, - "step": 3070 - }, - { - "epoch": 0.07, - "grad_norm": 2.1804165840148926, - "learning_rate": 1.825084745762712e-05, - "loss": 1.2931, - "step": 3080 - }, - { - "epoch": 0.07, - "grad_norm": 12.77298641204834, - "learning_rate": 1.824406779661017e-05, - "loss": 1.3071, - "step": 3090 - }, - { - "epoch": 0.07, - "grad_norm": 3.189802646636963, - "learning_rate": 1.823728813559322e-05, - "loss": 1.508, - "step": 3100 - }, - { - "epoch": 0.07, - "grad_norm": 2.3131775856018066, - "learning_rate": 1.8230508474576273e-05, - "loss": 1.4604, - "step": 3110 - }, - { - "epoch": 0.07, - "grad_norm": 4.224339962005615, - "learning_rate": 1.8223728813559323e-05, - "loss": 1.2131, - "step": 3120 - }, - { - "epoch": 0.07, - "grad_norm": 4.142259120941162, - "learning_rate": 1.8216949152542373e-05, - "loss": 1.409, - "step": 3130 - }, - { - "epoch": 0.07, - "grad_norm": 2.409524917602539, - "learning_rate": 1.8210169491525426e-05, - "loss": 1.5591, - "step": 3140 - }, - { - "epoch": 0.07, - "grad_norm": 2.3798465728759766, - "learning_rate": 1.8203389830508475e-05, - "loss": 1.4358, - "step": 3150 - }, - { - "epoch": 0.07, - "grad_norm": 0.9663469195365906, - "learning_rate": 1.8196610169491525e-05, - "loss": 1.4432, - "step": 3160 - }, - { - "epoch": 0.07, - "grad_norm": 2.4938082695007324, - "learning_rate": 1.8189830508474578e-05, - "loss": 1.2837, - "step": 3170 - }, - { - "epoch": 0.07, - "grad_norm": 2.094170093536377, - "learning_rate": 1.8183050847457628e-05, - "loss": 1.3925, - "step": 3180 - }, - { - "epoch": 0.07, - "grad_norm": 4.124843120574951, - "learning_rate": 1.817627118644068e-05, - "loss": 1.5111, - "step": 3190 - }, - { - "epoch": 0.07, - "grad_norm": 1.090320348739624, - "learning_rate": 1.816949152542373e-05, - "loss": 1.4229, - "step": 3200 - }, - { - "epoch": 0.07, - "grad_norm": 2.482816696166992, - "learning_rate": 1.816271186440678e-05, - "loss": 1.3102, - "step": 3210 - }, - { - "epoch": 0.07, - "grad_norm": 3.5619702339172363, - "learning_rate": 1.8155932203389833e-05, - "loss": 1.2051, - "step": 3220 - }, - { - "epoch": 0.07, - "grad_norm": 7.419604301452637, - "learning_rate": 1.8149152542372883e-05, - "loss": 1.3915, - "step": 3230 - }, - { - "epoch": 0.08, - "grad_norm": 4.109569072723389, - "learning_rate": 1.8142372881355933e-05, - "loss": 1.4158, - "step": 3240 - }, - { - "epoch": 0.08, - "grad_norm": 2.4256246089935303, - "learning_rate": 1.8135593220338986e-05, - "loss": 1.4494, - "step": 3250 - }, - { - "epoch": 0.08, - "grad_norm": 1.4669333696365356, - "learning_rate": 1.8128813559322036e-05, - "loss": 1.6379, - "step": 3260 - }, - { - "epoch": 0.08, - "grad_norm": 2.800262689590454, - "learning_rate": 1.8122033898305085e-05, - "loss": 1.5004, - "step": 3270 - }, - { - "epoch": 0.08, - "grad_norm": 2.3720695972442627, - "learning_rate": 1.811525423728814e-05, - "loss": 1.425, - "step": 3280 - }, - { - "epoch": 0.08, - "grad_norm": 1.8338054418563843, - "learning_rate": 1.8108474576271188e-05, - "loss": 1.4227, - "step": 3290 - }, - { - "epoch": 0.08, - "grad_norm": 4.969144821166992, - "learning_rate": 1.810169491525424e-05, - "loss": 1.3806, - "step": 3300 - }, - { - "epoch": 0.08, - "grad_norm": 3.558366537094116, - "learning_rate": 1.8094915254237288e-05, - "loss": 1.4254, - "step": 3310 - }, - { - "epoch": 0.08, - "grad_norm": 1.669677972793579, - "learning_rate": 1.808813559322034e-05, - "loss": 1.4065, - "step": 3320 - }, - { - "epoch": 0.08, - "grad_norm": 1.3998602628707886, - "learning_rate": 1.808135593220339e-05, - "loss": 1.7619, - "step": 3330 - }, - { - "epoch": 0.08, - "grad_norm": 2.9056684970855713, - "learning_rate": 1.807457627118644e-05, - "loss": 1.4177, - "step": 3340 - }, - { - "epoch": 0.08, - "grad_norm": 2.942318916320801, - "learning_rate": 1.8067796610169493e-05, - "loss": 1.4779, - "step": 3350 - }, - { - "epoch": 0.08, - "grad_norm": 3.1195051670074463, - "learning_rate": 1.8061016949152543e-05, - "loss": 1.4865, - "step": 3360 - }, - { - "epoch": 0.08, - "grad_norm": 4.495617866516113, - "learning_rate": 1.8054237288135593e-05, - "loss": 1.3127, - "step": 3370 - }, - { - "epoch": 0.08, - "grad_norm": 2.5328311920166016, - "learning_rate": 1.8047457627118646e-05, - "loss": 1.4624, - "step": 3380 - }, - { - "epoch": 0.08, - "grad_norm": 3.0216922760009766, - "learning_rate": 1.8040677966101695e-05, - "loss": 1.4197, - "step": 3390 - }, - { - "epoch": 0.08, - "grad_norm": 1.578646183013916, - "learning_rate": 1.803389830508475e-05, - "loss": 1.2317, - "step": 3400 - }, - { - "epoch": 0.08, - "grad_norm": 3.8747899532318115, - "learning_rate": 1.8027118644067798e-05, - "loss": 1.5859, - "step": 3410 - }, - { - "epoch": 0.08, - "grad_norm": 4.799411773681641, - "learning_rate": 1.8020338983050848e-05, - "loss": 1.5663, - "step": 3420 - }, - { - "epoch": 0.08, - "grad_norm": 3.006511688232422, - "learning_rate": 1.80135593220339e-05, - "loss": 1.4501, - "step": 3430 - }, - { - "epoch": 0.08, - "grad_norm": 1.5541688203811646, - "learning_rate": 1.800677966101695e-05, - "loss": 1.6072, - "step": 3440 - }, - { - "epoch": 0.08, - "grad_norm": 1.2471415996551514, - "learning_rate": 1.8e-05, - "loss": 1.4034, - "step": 3450 - }, - { - "epoch": 0.08, - "grad_norm": 2.43544602394104, - "learning_rate": 1.7993220338983054e-05, - "loss": 1.3181, - "step": 3460 - }, - { - "epoch": 0.08, - "grad_norm": 6.56860876083374, - "learning_rate": 1.7986440677966103e-05, - "loss": 1.308, - "step": 3470 - }, - { - "epoch": 0.08, - "grad_norm": 1.6072529554367065, - "learning_rate": 1.7979661016949156e-05, - "loss": 1.3989, - "step": 3480 - }, - { - "epoch": 0.08, - "grad_norm": 1.9302940368652344, - "learning_rate": 1.7972881355932206e-05, - "loss": 1.5623, - "step": 3490 - }, - { - "epoch": 0.08, - "grad_norm": 5.61619758605957, - "learning_rate": 1.7966101694915256e-05, - "loss": 1.4135, - "step": 3500 - }, - { - "epoch": 0.08, - "eval_loss": 1.1386640071868896, - "eval_runtime": 67.1064, - "eval_samples_per_second": 14.902, - "eval_steps_per_second": 14.902, - "step": 3500 - }, - { - "epoch": 0.08, - "grad_norm": 2.315586566925049, - "learning_rate": 1.795932203389831e-05, - "loss": 1.5134, - "step": 3510 - }, - { - "epoch": 0.08, - "grad_norm": 6.156167507171631, - "learning_rate": 1.7952542372881355e-05, - "loss": 1.4425, - "step": 3520 - }, - { - "epoch": 0.08, - "grad_norm": 1.9159029722213745, - "learning_rate": 1.7945762711864408e-05, - "loss": 1.3159, - "step": 3530 - }, - { - "epoch": 0.08, - "grad_norm": 3.3781180381774902, - "learning_rate": 1.7938983050847458e-05, - "loss": 1.3683, - "step": 3540 - }, - { - "epoch": 0.08, - "grad_norm": 1.8861943483352661, - "learning_rate": 1.7932203389830508e-05, - "loss": 1.486, - "step": 3550 - }, - { - "epoch": 0.08, - "grad_norm": 3.9574244022369385, - "learning_rate": 1.792542372881356e-05, - "loss": 1.4858, - "step": 3560 - }, - { - "epoch": 0.08, - "grad_norm": 5.585728168487549, - "learning_rate": 1.791864406779661e-05, - "loss": 1.4388, - "step": 3570 - }, - { - "epoch": 0.08, - "grad_norm": 6.258697986602783, - "learning_rate": 1.7911864406779664e-05, - "loss": 1.5302, - "step": 3580 - }, - { - "epoch": 0.08, - "grad_norm": 4.292418479919434, - "learning_rate": 1.7905084745762713e-05, - "loss": 1.3228, - "step": 3590 - }, - { - "epoch": 0.08, - "grad_norm": 4.342001438140869, - "learning_rate": 1.7898305084745763e-05, - "loss": 1.1876, - "step": 3600 - }, - { - "epoch": 0.08, - "grad_norm": 7.243982315063477, - "learning_rate": 1.7891525423728816e-05, - "loss": 1.4241, - "step": 3610 - }, - { - "epoch": 0.08, - "grad_norm": 8.561788558959961, - "learning_rate": 1.7884745762711866e-05, - "loss": 1.4567, - "step": 3620 - }, - { - "epoch": 0.08, - "grad_norm": 2.6268651485443115, - "learning_rate": 1.7877966101694916e-05, - "loss": 1.2928, - "step": 3630 - }, - { - "epoch": 0.08, - "grad_norm": 3.3656156063079834, - "learning_rate": 1.787118644067797e-05, - "loss": 1.2128, - "step": 3640 - }, - { - "epoch": 0.08, - "grad_norm": 3.3748042583465576, - "learning_rate": 1.7864406779661018e-05, - "loss": 1.472, - "step": 3650 - }, - { - "epoch": 0.08, - "grad_norm": 4.713076591491699, - "learning_rate": 1.785762711864407e-05, - "loss": 1.4151, - "step": 3660 - }, - { - "epoch": 0.08, - "grad_norm": 1.744603157043457, - "learning_rate": 1.785084745762712e-05, - "loss": 1.3556, - "step": 3670 - }, - { - "epoch": 0.09, - "grad_norm": 5.440206050872803, - "learning_rate": 1.784406779661017e-05, - "loss": 1.5536, - "step": 3680 - }, - { - "epoch": 0.09, - "grad_norm": 1.1328643560409546, - "learning_rate": 1.7837288135593224e-05, - "loss": 1.3039, - "step": 3690 - }, - { - "epoch": 0.09, - "grad_norm": 2.336589813232422, - "learning_rate": 1.7830508474576274e-05, - "loss": 1.4308, - "step": 3700 - }, - { - "epoch": 0.09, - "grad_norm": 2.2741591930389404, - "learning_rate": 1.7823728813559323e-05, - "loss": 1.3202, - "step": 3710 - }, - { - "epoch": 0.09, - "grad_norm": 4.327836990356445, - "learning_rate": 1.7816949152542376e-05, - "loss": 1.5221, - "step": 3720 - }, - { - "epoch": 0.09, - "grad_norm": 1.930452823638916, - "learning_rate": 1.7810169491525423e-05, - "loss": 1.3796, - "step": 3730 - }, - { - "epoch": 0.09, - "grad_norm": 4.003491401672363, - "learning_rate": 1.7803389830508476e-05, - "loss": 1.5096, - "step": 3740 - }, - { - "epoch": 0.09, - "grad_norm": 1.0040380954742432, - "learning_rate": 1.7796610169491526e-05, - "loss": 1.5853, - "step": 3750 - }, - { - "epoch": 0.09, - "grad_norm": 2.6716127395629883, - "learning_rate": 1.778983050847458e-05, - "loss": 1.4564, - "step": 3760 - }, - { - "epoch": 0.09, - "grad_norm": 5.1395158767700195, - "learning_rate": 1.778305084745763e-05, - "loss": 1.3919, - "step": 3770 - }, - { - "epoch": 0.09, - "grad_norm": 1.6375240087509155, - "learning_rate": 1.7776271186440678e-05, - "loss": 1.2678, - "step": 3780 - }, - { - "epoch": 0.09, - "grad_norm": 11.120715141296387, - "learning_rate": 1.776949152542373e-05, - "loss": 1.5301, - "step": 3790 - }, - { - "epoch": 0.09, - "grad_norm": 1.9304801225662231, - "learning_rate": 1.776271186440678e-05, - "loss": 1.3043, - "step": 3800 - }, - { - "epoch": 0.09, - "grad_norm": 3.231403112411499, - "learning_rate": 1.775593220338983e-05, - "loss": 1.3634, - "step": 3810 - }, - { - "epoch": 0.09, - "grad_norm": 2.287160873413086, - "learning_rate": 1.7749152542372884e-05, - "loss": 1.306, - "step": 3820 - }, - { - "epoch": 0.09, - "grad_norm": 4.768348217010498, - "learning_rate": 1.7742372881355933e-05, - "loss": 1.479, - "step": 3830 - }, - { - "epoch": 0.09, - "grad_norm": 3.244331121444702, - "learning_rate": 1.7735593220338983e-05, - "loss": 1.2446, - "step": 3840 - }, - { - "epoch": 0.09, - "grad_norm": 3.9224987030029297, - "learning_rate": 1.7728813559322036e-05, - "loss": 1.4043, - "step": 3850 - }, - { - "epoch": 0.09, - "grad_norm": 3.9674978256225586, - "learning_rate": 1.7722033898305086e-05, - "loss": 1.3901, - "step": 3860 - }, - { - "epoch": 0.09, - "grad_norm": 7.087489604949951, - "learning_rate": 1.771525423728814e-05, - "loss": 1.4567, - "step": 3870 - }, - { - "epoch": 0.09, - "grad_norm": 1.8183858394622803, - "learning_rate": 1.770847457627119e-05, - "loss": 1.6078, - "step": 3880 - }, - { - "epoch": 0.09, - "grad_norm": 1.5410759449005127, - "learning_rate": 1.770169491525424e-05, - "loss": 1.3961, - "step": 3890 - }, - { - "epoch": 0.09, - "grad_norm": 2.4553983211517334, - "learning_rate": 1.769491525423729e-05, - "loss": 1.405, - "step": 3900 - }, - { - "epoch": 0.09, - "grad_norm": 5.181529998779297, - "learning_rate": 1.768813559322034e-05, - "loss": 1.4944, - "step": 3910 - }, - { - "epoch": 0.09, - "grad_norm": 2.054699420928955, - "learning_rate": 1.768135593220339e-05, - "loss": 1.5309, - "step": 3920 - }, - { - "epoch": 0.09, - "grad_norm": 3.4803507328033447, - "learning_rate": 1.767457627118644e-05, - "loss": 1.2931, - "step": 3930 - }, - { - "epoch": 0.09, - "grad_norm": 1.9554686546325684, - "learning_rate": 1.7667796610169494e-05, - "loss": 1.3651, - "step": 3940 - }, - { - "epoch": 0.09, - "grad_norm": 1.0756142139434814, - "learning_rate": 1.7661016949152543e-05, - "loss": 1.4476, - "step": 3950 - }, - { - "epoch": 0.09, - "grad_norm": 2.552859306335449, - "learning_rate": 1.7654237288135593e-05, - "loss": 1.4417, - "step": 3960 - }, - { - "epoch": 0.09, - "grad_norm": 2.7441787719726562, - "learning_rate": 1.7647457627118646e-05, - "loss": 1.3769, - "step": 3970 - }, - { - "epoch": 0.09, - "grad_norm": 2.9613826274871826, - "learning_rate": 1.7640677966101696e-05, - "loss": 1.4381, - "step": 3980 - }, - { - "epoch": 0.09, - "grad_norm": 4.84116268157959, - "learning_rate": 1.7633898305084746e-05, - "loss": 1.3539, - "step": 3990 - }, - { - "epoch": 0.09, - "grad_norm": 1.5078232288360596, - "learning_rate": 1.76271186440678e-05, - "loss": 1.4277, - "step": 4000 - }, - { - "epoch": 0.09, - "eval_loss": 1.158501386642456, - "eval_runtime": 67.1002, - "eval_samples_per_second": 14.903, - "eval_steps_per_second": 14.903, - "step": 4000 - }, - { - "epoch": 0.09, - "grad_norm": 2.6108415126800537, - "learning_rate": 1.762033898305085e-05, - "loss": 1.5502, - "step": 4010 - }, - { - "epoch": 0.09, - "grad_norm": 4.075037479400635, - "learning_rate": 1.7613559322033898e-05, - "loss": 1.3346, - "step": 4020 - }, - { - "epoch": 0.09, - "grad_norm": 2.969217300415039, - "learning_rate": 1.760677966101695e-05, - "loss": 1.3709, - "step": 4030 - }, - { - "epoch": 0.09, - "grad_norm": 3.885554313659668, - "learning_rate": 1.76e-05, - "loss": 1.4961, - "step": 4040 - }, - { - "epoch": 0.09, - "grad_norm": 1.5712426900863647, - "learning_rate": 1.7593220338983054e-05, - "loss": 1.3091, - "step": 4050 - }, - { - "epoch": 0.09, - "grad_norm": 2.020577907562256, - "learning_rate": 1.7586440677966104e-05, - "loss": 1.3308, - "step": 4060 - }, - { - "epoch": 0.09, - "grad_norm": 4.234301567077637, - "learning_rate": 1.7579661016949153e-05, - "loss": 1.5322, - "step": 4070 - }, - { - "epoch": 0.09, - "grad_norm": 1.667493224143982, - "learning_rate": 1.7572881355932206e-05, - "loss": 1.3986, - "step": 4080 - }, - { - "epoch": 0.09, - "grad_norm": 1.5255711078643799, - "learning_rate": 1.7566101694915256e-05, - "loss": 1.1095, - "step": 4090 - }, - { - "epoch": 0.09, - "grad_norm": 2.342149257659912, - "learning_rate": 1.7559322033898306e-05, - "loss": 1.468, - "step": 4100 - }, - { - "epoch": 0.1, - "grad_norm": 3.444767951965332, - "learning_rate": 1.755254237288136e-05, - "loss": 1.3548, - "step": 4110 - }, - { - "epoch": 0.1, - "grad_norm": 2.0781197547912598, - "learning_rate": 1.754576271186441e-05, - "loss": 1.4868, - "step": 4120 - }, - { - "epoch": 0.1, - "grad_norm": 3.5227110385894775, - "learning_rate": 1.753898305084746e-05, - "loss": 1.2477, - "step": 4130 - }, - { - "epoch": 0.1, - "grad_norm": 2.230503797531128, - "learning_rate": 1.7532203389830508e-05, - "loss": 1.5486, - "step": 4140 - }, - { - "epoch": 0.1, - "grad_norm": 2.484776496887207, - "learning_rate": 1.752542372881356e-05, - "loss": 1.4854, - "step": 4150 - }, - { - "epoch": 0.1, - "grad_norm": 2.312047004699707, - "learning_rate": 1.751864406779661e-05, - "loss": 1.45, - "step": 4160 - }, - { - "epoch": 0.1, - "grad_norm": 4.45994758605957, - "learning_rate": 1.751186440677966e-05, - "loss": 1.2472, - "step": 4170 - }, - { - "epoch": 0.1, - "grad_norm": 1.4740896224975586, - "learning_rate": 1.7505084745762714e-05, - "loss": 1.1978, - "step": 4180 - }, - { - "epoch": 0.1, - "grad_norm": 2.2780067920684814, - "learning_rate": 1.7498305084745763e-05, - "loss": 1.4335, - "step": 4190 - }, - { - "epoch": 0.1, - "grad_norm": 2.937509298324585, - "learning_rate": 1.7491525423728813e-05, - "loss": 1.4036, - "step": 4200 - }, - { - "epoch": 0.1, - "grad_norm": 5.987236022949219, - "learning_rate": 1.7484745762711866e-05, - "loss": 1.3751, - "step": 4210 - }, - { - "epoch": 0.1, - "grad_norm": 1.929296851158142, - "learning_rate": 1.7477966101694916e-05, - "loss": 1.294, - "step": 4220 - }, - { - "epoch": 0.1, - "grad_norm": 2.1039040088653564, - "learning_rate": 1.747118644067797e-05, - "loss": 1.3019, - "step": 4230 - }, - { - "epoch": 0.1, - "grad_norm": 6.1424946784973145, - "learning_rate": 1.746440677966102e-05, - "loss": 1.2913, - "step": 4240 - }, - { - "epoch": 0.1, - "grad_norm": 0.9339830279350281, - "learning_rate": 1.745762711864407e-05, - "loss": 1.3142, - "step": 4250 - }, - { - "epoch": 0.1, - "grad_norm": 2.47251296043396, - "learning_rate": 1.745084745762712e-05, - "loss": 1.4085, - "step": 4260 - }, - { - "epoch": 0.1, - "grad_norm": 1.0523934364318848, - "learning_rate": 1.744406779661017e-05, - "loss": 1.3264, - "step": 4270 - }, - { - "epoch": 0.1, - "grad_norm": 2.680063009262085, - "learning_rate": 1.743728813559322e-05, - "loss": 1.5524, - "step": 4280 - }, - { - "epoch": 0.1, - "grad_norm": 4.143898010253906, - "learning_rate": 1.7430508474576274e-05, - "loss": 1.5795, - "step": 4290 - }, - { - "epoch": 0.1, - "grad_norm": 3.3456549644470215, - "learning_rate": 1.7423728813559324e-05, - "loss": 1.3907, - "step": 4300 - }, - { - "epoch": 0.1, - "grad_norm": 3.776109457015991, - "learning_rate": 1.7416949152542373e-05, - "loss": 1.2968, - "step": 4310 - }, - { - "epoch": 0.1, - "grad_norm": 4.241484642028809, - "learning_rate": 1.7410169491525427e-05, - "loss": 1.305, - "step": 4320 - }, - { - "epoch": 0.1, - "grad_norm": 2.8671717643737793, - "learning_rate": 1.7403389830508476e-05, - "loss": 1.5132, - "step": 4330 - }, - { - "epoch": 0.1, - "grad_norm": 3.0062782764434814, - "learning_rate": 1.739661016949153e-05, - "loss": 1.2456, - "step": 4340 - }, - { - "epoch": 0.1, - "grad_norm": 4.044645309448242, - "learning_rate": 1.7389830508474576e-05, - "loss": 1.4838, - "step": 4350 - }, - { - "epoch": 0.1, - "grad_norm": 2.291123867034912, - "learning_rate": 1.738305084745763e-05, - "loss": 1.4495, - "step": 4360 - }, - { - "epoch": 0.1, - "grad_norm": 3.096938133239746, - "learning_rate": 1.737627118644068e-05, - "loss": 1.2715, - "step": 4370 - }, - { - "epoch": 0.1, - "grad_norm": 2.5189833641052246, - "learning_rate": 1.7369491525423728e-05, - "loss": 1.1358, - "step": 4380 - }, - { - "epoch": 0.1, - "grad_norm": 3.956369638442993, - "learning_rate": 1.736271186440678e-05, - "loss": 1.1939, - "step": 4390 - }, - { - "epoch": 0.1, - "grad_norm": 8.631299018859863, - "learning_rate": 1.735593220338983e-05, - "loss": 1.5566, - "step": 4400 - }, - { - "epoch": 0.1, - "grad_norm": 1.6200116872787476, - "learning_rate": 1.734915254237288e-05, - "loss": 1.5092, - "step": 4410 - }, - { - "epoch": 0.1, - "grad_norm": 4.132010459899902, - "learning_rate": 1.7342372881355934e-05, - "loss": 1.4241, - "step": 4420 - }, - { - "epoch": 0.1, - "grad_norm": 3.7074265480041504, - "learning_rate": 1.7335593220338983e-05, - "loss": 1.4407, - "step": 4430 - }, - { - "epoch": 0.1, - "grad_norm": 2.0132627487182617, - "learning_rate": 1.7328813559322037e-05, - "loss": 1.3255, - "step": 4440 - }, - { - "epoch": 0.1, - "grad_norm": 3.750293254852295, - "learning_rate": 1.7322033898305086e-05, - "loss": 1.5557, - "step": 4450 - }, - { - "epoch": 0.1, - "grad_norm": 1.0939888954162598, - "learning_rate": 1.7315254237288136e-05, - "loss": 1.3632, - "step": 4460 - }, - { - "epoch": 0.1, - "grad_norm": 2.476027727127075, - "learning_rate": 1.730847457627119e-05, - "loss": 1.3655, - "step": 4470 - }, - { - "epoch": 0.1, - "grad_norm": 3.4436848163604736, - "learning_rate": 1.730169491525424e-05, - "loss": 1.3026, - "step": 4480 - }, - { - "epoch": 0.1, - "grad_norm": 5.511134147644043, - "learning_rate": 1.729491525423729e-05, - "loss": 1.5307, - "step": 4490 - }, - { - "epoch": 0.1, - "grad_norm": 1.5898433923721313, - "learning_rate": 1.728813559322034e-05, - "loss": 1.4483, - "step": 4500 - }, - { - "epoch": 0.1, - "eval_loss": 1.1334296464920044, - "eval_runtime": 67.0278, - "eval_samples_per_second": 14.919, - "eval_steps_per_second": 14.919, - "step": 4500 - }, - { - "epoch": 0.1, - "grad_norm": 1.7626523971557617, - "learning_rate": 1.728135593220339e-05, - "loss": 1.3683, - "step": 4510 - }, - { - "epoch": 0.1, - "grad_norm": 3.1556930541992188, - "learning_rate": 1.7274576271186444e-05, - "loss": 1.4022, - "step": 4520 - }, - { - "epoch": 0.1, - "grad_norm": 3.2186577320098877, - "learning_rate": 1.7267796610169494e-05, - "loss": 1.41, - "step": 4530 - }, - { - "epoch": 0.11, - "grad_norm": 6.2501068115234375, - "learning_rate": 1.7261016949152544e-05, - "loss": 1.4536, - "step": 4540 - }, - { - "epoch": 0.11, - "grad_norm": 4.240054130554199, - "learning_rate": 1.7254237288135597e-05, - "loss": 1.4073, - "step": 4550 - }, - { - "epoch": 0.11, - "grad_norm": 3.387373208999634, - "learning_rate": 1.7247457627118643e-05, - "loss": 1.613, - "step": 4560 - }, - { - "epoch": 0.11, - "grad_norm": 4.530575752258301, - "learning_rate": 1.7240677966101696e-05, - "loss": 1.2231, - "step": 4570 - }, - { - "epoch": 0.11, - "grad_norm": 2.739079236984253, - "learning_rate": 1.7233898305084746e-05, - "loss": 1.4286, - "step": 4580 - }, - { - "epoch": 0.11, - "grad_norm": 4.380158424377441, - "learning_rate": 1.7227118644067796e-05, - "loss": 1.3608, - "step": 4590 - }, - { - "epoch": 0.11, - "grad_norm": 5.204037189483643, - "learning_rate": 1.722033898305085e-05, - "loss": 1.2801, - "step": 4600 - }, - { - "epoch": 0.11, - "grad_norm": 1.6450468301773071, - "learning_rate": 1.72135593220339e-05, - "loss": 1.29, - "step": 4610 - }, - { - "epoch": 0.11, - "grad_norm": 2.5391757488250732, - "learning_rate": 1.720677966101695e-05, - "loss": 1.6189, - "step": 4620 - }, - { - "epoch": 0.11, - "grad_norm": 3.872121810913086, - "learning_rate": 1.72e-05, - "loss": 1.2915, - "step": 4630 - }, - { - "epoch": 0.11, - "grad_norm": 2.8644561767578125, - "learning_rate": 1.719322033898305e-05, - "loss": 1.5074, - "step": 4640 - }, - { - "epoch": 0.11, - "grad_norm": 3.774881601333618, - "learning_rate": 1.7186440677966104e-05, - "loss": 1.3268, - "step": 4650 - }, - { - "epoch": 0.11, - "grad_norm": 2.5650038719177246, - "learning_rate": 1.7179661016949154e-05, - "loss": 1.3892, - "step": 4660 - }, - { - "epoch": 0.11, - "grad_norm": 1.5609863996505737, - "learning_rate": 1.7172881355932204e-05, - "loss": 1.3675, - "step": 4670 - }, - { - "epoch": 0.11, - "grad_norm": 8.063512802124023, - "learning_rate": 1.7166101694915257e-05, - "loss": 1.3671, - "step": 4680 - }, - { - "epoch": 0.11, - "grad_norm": 1.6324501037597656, - "learning_rate": 1.7159322033898306e-05, - "loss": 1.3284, - "step": 4690 - }, - { - "epoch": 0.11, - "grad_norm": 2.6894187927246094, - "learning_rate": 1.715254237288136e-05, - "loss": 1.3032, - "step": 4700 - }, - { - "epoch": 0.11, - "grad_norm": 2.1557717323303223, - "learning_rate": 1.714576271186441e-05, - "loss": 1.366, - "step": 4710 - }, - { - "epoch": 0.11, - "grad_norm": 2.8717806339263916, - "learning_rate": 1.713898305084746e-05, - "loss": 1.5634, - "step": 4720 - }, - { - "epoch": 0.11, - "grad_norm": 1.2488980293273926, - "learning_rate": 1.7132203389830512e-05, - "loss": 1.4814, - "step": 4730 - }, - { - "epoch": 0.11, - "grad_norm": 3.677203893661499, - "learning_rate": 1.712542372881356e-05, - "loss": 1.2079, - "step": 4740 - }, - { - "epoch": 0.11, - "grad_norm": 1.8588718175888062, - "learning_rate": 1.711864406779661e-05, - "loss": 1.3525, - "step": 4750 - }, - { - "epoch": 0.11, - "grad_norm": 2.2451605796813965, - "learning_rate": 1.711186440677966e-05, - "loss": 1.4148, - "step": 4760 - }, - { - "epoch": 0.11, - "grad_norm": 1.5242979526519775, - "learning_rate": 1.710508474576271e-05, - "loss": 1.2413, - "step": 4770 - }, - { - "epoch": 0.11, - "grad_norm": 2.1980478763580322, - "learning_rate": 1.7098305084745764e-05, - "loss": 1.4558, - "step": 4780 - }, - { - "epoch": 0.11, - "grad_norm": 6.478370666503906, - "learning_rate": 1.7091525423728814e-05, - "loss": 1.3639, - "step": 4790 - }, - { - "epoch": 0.11, - "grad_norm": 3.1923305988311768, - "learning_rate": 1.7084745762711867e-05, - "loss": 1.5741, - "step": 4800 - }, - { - "epoch": 0.11, - "grad_norm": 2.293412923812866, - "learning_rate": 1.7077966101694916e-05, - "loss": 1.2903, - "step": 4810 - }, - { - "epoch": 0.11, - "grad_norm": 2.4575860500335693, - "learning_rate": 1.7071186440677966e-05, - "loss": 1.4585, - "step": 4820 - }, - { - "epoch": 0.11, - "grad_norm": 3.9733614921569824, - "learning_rate": 1.706440677966102e-05, - "loss": 1.1767, - "step": 4830 - }, - { - "epoch": 0.11, - "grad_norm": 1.9043350219726562, - "learning_rate": 1.705762711864407e-05, - "loss": 1.4218, - "step": 4840 - }, - { - "epoch": 0.11, - "grad_norm": 2.5711846351623535, - "learning_rate": 1.705084745762712e-05, - "loss": 1.2186, - "step": 4850 - }, - { - "epoch": 0.11, - "grad_norm": 3.8665926456451416, - "learning_rate": 1.704406779661017e-05, - "loss": 1.3747, - "step": 4860 - }, - { - "epoch": 0.11, - "grad_norm": 3.0901689529418945, - "learning_rate": 1.703728813559322e-05, - "loss": 1.3356, - "step": 4870 - }, - { - "epoch": 0.11, - "grad_norm": 2.0309970378875732, - "learning_rate": 1.703050847457627e-05, - "loss": 1.4572, - "step": 4880 - }, - { - "epoch": 0.11, - "grad_norm": 2.9482085704803467, - "learning_rate": 1.7023728813559324e-05, - "loss": 1.3365, - "step": 4890 - }, - { - "epoch": 0.11, - "grad_norm": 2.5880916118621826, - "learning_rate": 1.7016949152542374e-05, - "loss": 1.4305, - "step": 4900 - }, - { - "epoch": 0.11, - "grad_norm": 5.12321138381958, - "learning_rate": 1.7010169491525427e-05, - "loss": 1.5018, - "step": 4910 - }, - { - "epoch": 0.11, - "grad_norm": 2.508700370788574, - "learning_rate": 1.7003389830508477e-05, - "loss": 1.237, - "step": 4920 - }, - { - "epoch": 0.11, - "grad_norm": 1.8924113512039185, - "learning_rate": 1.6996610169491526e-05, - "loss": 1.3583, - "step": 4930 - }, - { - "epoch": 0.11, - "grad_norm": 1.551520824432373, - "learning_rate": 1.698983050847458e-05, - "loss": 1.2687, - "step": 4940 - }, - { - "epoch": 0.11, - "grad_norm": 2.142587184906006, - "learning_rate": 1.698305084745763e-05, - "loss": 1.4377, - "step": 4950 - }, - { - "epoch": 0.11, - "grad_norm": 1.7592850923538208, - "learning_rate": 1.697627118644068e-05, - "loss": 1.5202, - "step": 4960 - }, - { - "epoch": 0.12, - "grad_norm": 4.212038040161133, - "learning_rate": 1.696949152542373e-05, - "loss": 1.3286, - "step": 4970 - }, - { - "epoch": 0.12, - "grad_norm": 4.211546897888184, - "learning_rate": 1.6962711864406782e-05, - "loss": 1.5126, - "step": 4980 - }, - { - "epoch": 0.12, - "grad_norm": 6.877481460571289, - "learning_rate": 1.695593220338983e-05, - "loss": 1.4129, - "step": 4990 - }, - { - "epoch": 0.12, - "grad_norm": 2.20015811920166, - "learning_rate": 1.694915254237288e-05, - "loss": 1.2721, - "step": 5000 - }, - { - "epoch": 0.12, - "eval_loss": 1.1188498735427856, - "eval_runtime": 67.1151, - "eval_samples_per_second": 14.9, - "eval_steps_per_second": 14.9, - "step": 5000 - }, - { - "epoch": 0.12, - "grad_norm": 2.303971290588379, - "learning_rate": 1.6942372881355934e-05, - "loss": 1.3757, - "step": 5010 - }, - { - "epoch": 0.12, - "grad_norm": 3.5342512130737305, - "learning_rate": 1.6935593220338984e-05, - "loss": 1.2597, - "step": 5020 - }, - { - "epoch": 0.12, - "grad_norm": 4.022207260131836, - "learning_rate": 1.6928813559322034e-05, - "loss": 1.4939, - "step": 5030 - }, - { - "epoch": 0.12, - "grad_norm": 2.4725759029388428, - "learning_rate": 1.6922033898305087e-05, - "loss": 1.2841, - "step": 5040 - }, - { - "epoch": 0.12, - "grad_norm": 3.944225788116455, - "learning_rate": 1.6915254237288136e-05, - "loss": 1.3808, - "step": 5050 - }, - { - "epoch": 0.12, - "grad_norm": 1.0974010229110718, - "learning_rate": 1.6908474576271186e-05, - "loss": 1.2744, - "step": 5060 - }, - { - "epoch": 0.12, - "grad_norm": 2.6920721530914307, - "learning_rate": 1.690169491525424e-05, - "loss": 1.5257, - "step": 5070 - }, - { - "epoch": 0.12, - "grad_norm": 3.715402126312256, - "learning_rate": 1.689491525423729e-05, - "loss": 1.5197, - "step": 5080 - }, - { - "epoch": 0.12, - "grad_norm": 2.5713796615600586, - "learning_rate": 1.6888135593220342e-05, - "loss": 1.3383, - "step": 5090 - }, - { - "epoch": 0.12, - "grad_norm": 3.9672067165374756, - "learning_rate": 1.6881355932203392e-05, - "loss": 1.2505, - "step": 5100 - }, - { - "epoch": 0.12, - "grad_norm": 4.107706069946289, - "learning_rate": 1.687457627118644e-05, - "loss": 1.3978, - "step": 5110 - }, - { - "epoch": 0.12, - "grad_norm": 3.694401979446411, - "learning_rate": 1.6867796610169495e-05, - "loss": 1.5616, - "step": 5120 - }, - { - "epoch": 0.12, - "grad_norm": 1.1937147378921509, - "learning_rate": 1.6861016949152544e-05, - "loss": 1.4247, - "step": 5130 - }, - { - "epoch": 0.12, - "grad_norm": 1.7874356508255005, - "learning_rate": 1.6854237288135594e-05, - "loss": 1.1953, - "step": 5140 - }, - { - "epoch": 0.12, - "grad_norm": 3.081228256225586, - "learning_rate": 1.6847457627118647e-05, - "loss": 1.3952, - "step": 5150 - }, - { - "epoch": 0.12, - "grad_norm": 4.792527198791504, - "learning_rate": 1.6840677966101697e-05, - "loss": 1.3015, - "step": 5160 - }, - { - "epoch": 0.12, - "grad_norm": 2.482060194015503, - "learning_rate": 1.683389830508475e-05, - "loss": 1.1953, - "step": 5170 - }, - { - "epoch": 0.12, - "grad_norm": 4.224519729614258, - "learning_rate": 1.6827118644067796e-05, - "loss": 1.3729, - "step": 5180 - }, - { - "epoch": 0.12, - "grad_norm": 2.609736680984497, - "learning_rate": 1.682033898305085e-05, - "loss": 1.4468, - "step": 5190 - }, - { - "epoch": 0.12, - "grad_norm": 2.786303997039795, - "learning_rate": 1.68135593220339e-05, - "loss": 1.4248, - "step": 5200 - }, - { - "epoch": 0.12, - "grad_norm": 4.584510326385498, - "learning_rate": 1.680677966101695e-05, - "loss": 1.4896, - "step": 5210 - }, - { - "epoch": 0.12, - "grad_norm": 7.068190574645996, - "learning_rate": 1.6800000000000002e-05, - "loss": 1.3613, - "step": 5220 - }, - { - "epoch": 0.12, - "grad_norm": 2.357131242752075, - "learning_rate": 1.679322033898305e-05, - "loss": 1.1566, - "step": 5230 - }, - { - "epoch": 0.12, - "grad_norm": 4.547120571136475, - "learning_rate": 1.67864406779661e-05, - "loss": 1.3428, - "step": 5240 - }, - { - "epoch": 0.12, - "grad_norm": 3.5816683769226074, - "learning_rate": 1.6779661016949154e-05, - "loss": 1.3112, - "step": 5250 - }, - { - "epoch": 0.12, - "grad_norm": 4.817299842834473, - "learning_rate": 1.6772881355932204e-05, - "loss": 1.4183, - "step": 5260 - }, - { - "epoch": 0.12, - "grad_norm": 2.906155586242676, - "learning_rate": 1.6766101694915257e-05, - "loss": 1.4456, - "step": 5270 - }, - { - "epoch": 0.12, - "grad_norm": 8.34229850769043, - "learning_rate": 1.6759322033898307e-05, - "loss": 1.3624, - "step": 5280 - }, - { - "epoch": 0.12, - "grad_norm": 2.7926371097564697, - "learning_rate": 1.6752542372881357e-05, - "loss": 1.4581, - "step": 5290 - }, - { - "epoch": 0.12, - "grad_norm": 3.89165997505188, - "learning_rate": 1.674576271186441e-05, - "loss": 1.5302, - "step": 5300 - }, - { - "epoch": 0.12, - "grad_norm": 1.7644487619400024, - "learning_rate": 1.673898305084746e-05, - "loss": 1.2788, - "step": 5310 - }, - { - "epoch": 0.12, - "grad_norm": 3.700753927230835, - "learning_rate": 1.673220338983051e-05, - "loss": 1.4187, - "step": 5320 - }, - { - "epoch": 0.12, - "grad_norm": 2.538086175918579, - "learning_rate": 1.6725423728813562e-05, - "loss": 1.4627, - "step": 5330 - }, - { - "epoch": 0.12, - "grad_norm": 2.7824482917785645, - "learning_rate": 1.6718644067796612e-05, - "loss": 1.3147, - "step": 5340 - }, - { - "epoch": 0.12, - "grad_norm": 3.426088571548462, - "learning_rate": 1.671186440677966e-05, - "loss": 1.4254, - "step": 5350 - }, - { - "epoch": 0.12, - "grad_norm": 2.100320339202881, - "learning_rate": 1.6705084745762715e-05, - "loss": 1.2516, - "step": 5360 - }, - { - "epoch": 0.12, - "grad_norm": 2.2726573944091797, - "learning_rate": 1.6698305084745764e-05, - "loss": 1.2826, - "step": 5370 - }, - { - "epoch": 0.12, - "grad_norm": 5.182670593261719, - "learning_rate": 1.6691525423728817e-05, - "loss": 1.468, - "step": 5380 - }, - { - "epoch": 0.12, - "grad_norm": 2.9416656494140625, - "learning_rate": 1.6684745762711864e-05, - "loss": 1.4455, - "step": 5390 - }, - { - "epoch": 0.13, - "grad_norm": 1.5433812141418457, - "learning_rate": 1.6677966101694917e-05, - "loss": 1.4634, - "step": 5400 - }, - { - "epoch": 0.13, - "grad_norm": 4.191746234893799, - "learning_rate": 1.6671186440677967e-05, - "loss": 1.3533, - "step": 5410 - }, - { - "epoch": 0.13, - "grad_norm": 1.96868097782135, - "learning_rate": 1.6664406779661016e-05, - "loss": 1.3821, - "step": 5420 - }, - { - "epoch": 0.13, - "grad_norm": 5.689413070678711, - "learning_rate": 1.665762711864407e-05, - "loss": 1.4411, - "step": 5430 - }, - { - "epoch": 0.13, - "grad_norm": 3.58811354637146, - "learning_rate": 1.665084745762712e-05, - "loss": 1.3856, - "step": 5440 - }, - { - "epoch": 0.13, - "grad_norm": 2.642380952835083, - "learning_rate": 1.6644067796610172e-05, - "loss": 1.4348, - "step": 5450 - }, - { - "epoch": 0.13, - "grad_norm": 2.2928035259246826, - "learning_rate": 1.6637288135593222e-05, - "loss": 1.5305, - "step": 5460 - }, - { - "epoch": 0.13, - "grad_norm": 5.261194705963135, - "learning_rate": 1.663050847457627e-05, - "loss": 1.2432, - "step": 5470 - }, - { - "epoch": 0.13, - "grad_norm": 1.4704355001449585, - "learning_rate": 1.6623728813559325e-05, - "loss": 1.5543, - "step": 5480 - }, - { - "epoch": 0.13, - "grad_norm": 2.8535382747650146, - "learning_rate": 1.6616949152542374e-05, - "loss": 1.3296, - "step": 5490 - }, - { - "epoch": 0.13, - "grad_norm": 2.3717048168182373, - "learning_rate": 1.6610169491525424e-05, - "loss": 1.5835, - "step": 5500 - }, - { - "epoch": 0.13, - "eval_loss": 1.0649362802505493, - "eval_runtime": 66.9134, - "eval_samples_per_second": 14.945, - "eval_steps_per_second": 14.945, - "step": 5500 - }, - { - "epoch": 0.13, - "grad_norm": 3.3180487155914307, - "learning_rate": 1.6603389830508477e-05, - "loss": 1.3602, - "step": 5510 - }, - { - "epoch": 0.13, - "grad_norm": 3.428536891937256, - "learning_rate": 1.6596610169491527e-05, - "loss": 1.4902, - "step": 5520 - }, - { - "epoch": 0.13, - "grad_norm": 1.3657978773117065, - "learning_rate": 1.6589830508474577e-05, - "loss": 1.3019, - "step": 5530 - }, - { - "epoch": 0.13, - "grad_norm": 1.775023341178894, - "learning_rate": 1.658305084745763e-05, - "loss": 1.309, - "step": 5540 - }, - { - "epoch": 0.13, - "grad_norm": 2.047219753265381, - "learning_rate": 1.657627118644068e-05, - "loss": 1.463, - "step": 5550 - }, - { - "epoch": 0.13, - "grad_norm": 1.6937634944915771, - "learning_rate": 1.6569491525423732e-05, - "loss": 1.4777, - "step": 5560 - }, - { - "epoch": 0.13, - "grad_norm": 2.0562212467193604, - "learning_rate": 1.6562711864406782e-05, - "loss": 1.4766, - "step": 5570 - }, - { - "epoch": 0.13, - "grad_norm": 1.577533483505249, - "learning_rate": 1.6555932203389832e-05, - "loss": 1.3865, - "step": 5580 - }, - { - "epoch": 0.13, - "grad_norm": 3.4926679134368896, - "learning_rate": 1.654915254237288e-05, - "loss": 1.3764, - "step": 5590 - }, - { - "epoch": 0.13, - "grad_norm": 2.904751777648926, - "learning_rate": 1.654237288135593e-05, - "loss": 1.2022, - "step": 5600 - }, - { - "epoch": 0.13, - "grad_norm": 4.692943096160889, - "learning_rate": 1.6535593220338984e-05, - "loss": 1.2504, - "step": 5610 - }, - { - "epoch": 0.13, - "grad_norm": 1.7578885555267334, - "learning_rate": 1.6528813559322034e-05, - "loss": 1.3805, - "step": 5620 - }, - { - "epoch": 0.13, - "grad_norm": 6.6114630699157715, - "learning_rate": 1.6522033898305084e-05, - "loss": 1.3391, - "step": 5630 - }, - { - "epoch": 0.13, - "grad_norm": 2.5723536014556885, - "learning_rate": 1.6515254237288137e-05, - "loss": 1.2655, - "step": 5640 - }, - { - "epoch": 0.13, - "grad_norm": 3.141366720199585, - "learning_rate": 1.6508474576271187e-05, - "loss": 1.4299, - "step": 5650 - }, - { - "epoch": 0.13, - "grad_norm": 2.970369577407837, - "learning_rate": 1.650169491525424e-05, - "loss": 1.4145, - "step": 5660 - }, - { - "epoch": 0.13, - "grad_norm": 4.518489837646484, - "learning_rate": 1.649491525423729e-05, - "loss": 1.419, - "step": 5670 - }, - { - "epoch": 0.13, - "grad_norm": 2.4335029125213623, - "learning_rate": 1.648813559322034e-05, - "loss": 1.3781, - "step": 5680 - }, - { - "epoch": 0.13, - "grad_norm": 3.025974988937378, - "learning_rate": 1.6481355932203392e-05, - "loss": 1.456, - "step": 5690 - }, - { - "epoch": 0.13, - "grad_norm": 1.4364434480667114, - "learning_rate": 1.6474576271186442e-05, - "loss": 1.4295, - "step": 5700 - }, - { - "epoch": 0.13, - "grad_norm": 3.17189359664917, - "learning_rate": 1.646779661016949e-05, - "loss": 1.3155, - "step": 5710 - }, - { - "epoch": 0.13, - "grad_norm": 4.905094146728516, - "learning_rate": 1.6461016949152545e-05, - "loss": 1.3161, - "step": 5720 - }, - { - "epoch": 0.13, - "grad_norm": 2.6364047527313232, - "learning_rate": 1.6454237288135594e-05, - "loss": 1.378, - "step": 5730 - }, - { - "epoch": 0.13, - "grad_norm": 1.8336093425750732, - "learning_rate": 1.6447457627118648e-05, - "loss": 1.2784, - "step": 5740 - }, - { - "epoch": 0.13, - "grad_norm": 4.631802558898926, - "learning_rate": 1.6440677966101697e-05, - "loss": 1.2603, - "step": 5750 - }, - { - "epoch": 0.13, - "grad_norm": 9.11829948425293, - "learning_rate": 1.6433898305084747e-05, - "loss": 1.3563, - "step": 5760 - }, - { - "epoch": 0.13, - "grad_norm": 3.024040460586548, - "learning_rate": 1.64271186440678e-05, - "loss": 1.3491, - "step": 5770 - }, - { - "epoch": 0.13, - "grad_norm": 3.558547019958496, - "learning_rate": 1.642033898305085e-05, - "loss": 1.3992, - "step": 5780 - }, - { - "epoch": 0.13, - "grad_norm": 3.277186870574951, - "learning_rate": 1.64135593220339e-05, - "loss": 1.3636, - "step": 5790 - }, - { - "epoch": 0.13, - "grad_norm": 3.6262595653533936, - "learning_rate": 1.640677966101695e-05, - "loss": 1.2908, - "step": 5800 - }, - { - "epoch": 0.13, - "grad_norm": 1.5076159238815308, - "learning_rate": 1.64e-05, - "loss": 1.4525, - "step": 5810 - }, - { - "epoch": 0.13, - "grad_norm": 1.863429069519043, - "learning_rate": 1.6393220338983052e-05, - "loss": 1.5479, - "step": 5820 - }, - { - "epoch": 0.13, - "grad_norm": 1.3503921031951904, - "learning_rate": 1.63864406779661e-05, - "loss": 1.2465, - "step": 5830 - }, - { - "epoch": 0.14, - "grad_norm": 2.307507276535034, - "learning_rate": 1.6379661016949155e-05, - "loss": 1.3346, - "step": 5840 - }, - { - "epoch": 0.14, - "grad_norm": 3.301724910736084, - "learning_rate": 1.6372881355932204e-05, - "loss": 1.4078, - "step": 5850 - }, - { - "epoch": 0.14, - "grad_norm": 3.821153163909912, - "learning_rate": 1.6366101694915254e-05, - "loss": 1.4262, - "step": 5860 - }, - { - "epoch": 0.14, - "grad_norm": 1.4718036651611328, - "learning_rate": 1.6359322033898307e-05, - "loss": 1.5426, - "step": 5870 - }, - { - "epoch": 0.14, - "grad_norm": 4.044488430023193, - "learning_rate": 1.6352542372881357e-05, - "loss": 1.3703, - "step": 5880 - }, - { - "epoch": 0.14, - "grad_norm": 2.0791540145874023, - "learning_rate": 1.6345762711864407e-05, - "loss": 1.3436, - "step": 5890 - }, - { - "epoch": 0.14, - "grad_norm": 2.2117247581481934, - "learning_rate": 1.633898305084746e-05, - "loss": 1.428, - "step": 5900 - }, - { - "epoch": 0.14, - "grad_norm": 3.8183112144470215, - "learning_rate": 1.633220338983051e-05, - "loss": 1.6166, - "step": 5910 - }, - { - "epoch": 0.14, - "grad_norm": 2.566955089569092, - "learning_rate": 1.6325423728813563e-05, - "loss": 1.4127, - "step": 5920 - }, - { - "epoch": 0.14, - "grad_norm": 4.455868721008301, - "learning_rate": 1.6318644067796612e-05, - "loss": 1.2684, - "step": 5930 - }, - { - "epoch": 0.14, - "grad_norm": 1.872295618057251, - "learning_rate": 1.6311864406779662e-05, - "loss": 1.1753, - "step": 5940 - }, - { - "epoch": 0.14, - "grad_norm": 2.5006000995635986, - "learning_rate": 1.6305084745762715e-05, - "loss": 1.1916, - "step": 5950 - }, - { - "epoch": 0.14, - "grad_norm": 2.872570514678955, - "learning_rate": 1.6298305084745765e-05, - "loss": 1.1692, - "step": 5960 - }, - { - "epoch": 0.14, - "grad_norm": 3.5257492065429688, - "learning_rate": 1.6291525423728814e-05, - "loss": 1.3611, - "step": 5970 - }, - { - "epoch": 0.14, - "grad_norm": 3.483407497406006, - "learning_rate": 1.6284745762711868e-05, - "loss": 1.4315, - "step": 5980 - }, - { - "epoch": 0.14, - "grad_norm": 1.8382426500320435, - "learning_rate": 1.6277966101694917e-05, - "loss": 1.4698, - "step": 5990 - }, - { - "epoch": 0.14, - "grad_norm": 4.766387939453125, - "learning_rate": 1.6271186440677967e-05, - "loss": 1.361, - "step": 6000 - }, - { - "epoch": 0.14, - "eval_loss": 1.1337964534759521, - "eval_runtime": 67.0656, - "eval_samples_per_second": 14.911, - "eval_steps_per_second": 14.911, - "step": 6000 - }, - { - "epoch": 0.14, - "grad_norm": 2.5609889030456543, - "learning_rate": 1.6264406779661017e-05, - "loss": 1.4736, - "step": 6010 - }, - { - "epoch": 0.14, - "grad_norm": 3.4243054389953613, - "learning_rate": 1.625762711864407e-05, - "loss": 1.3073, - "step": 6020 - }, - { - "epoch": 0.14, - "grad_norm": 1.1757571697235107, - "learning_rate": 1.625084745762712e-05, - "loss": 1.5282, - "step": 6030 - }, - { - "epoch": 0.14, - "grad_norm": 2.4322659969329834, - "learning_rate": 1.624406779661017e-05, - "loss": 1.534, - "step": 6040 - }, - { - "epoch": 0.14, - "grad_norm": 2.9377753734588623, - "learning_rate": 1.6237288135593222e-05, - "loss": 1.354, - "step": 6050 - }, - { - "epoch": 0.14, - "grad_norm": 4.534852981567383, - "learning_rate": 1.6230508474576272e-05, - "loss": 1.2613, - "step": 6060 - }, - { - "epoch": 0.14, - "grad_norm": 1.6617101430892944, - "learning_rate": 1.6223728813559322e-05, - "loss": 1.4414, - "step": 6070 - }, - { - "epoch": 0.14, - "grad_norm": 4.800033092498779, - "learning_rate": 1.6216949152542375e-05, - "loss": 1.3286, - "step": 6080 - }, - { - "epoch": 0.14, - "grad_norm": 3.045539379119873, - "learning_rate": 1.6210169491525424e-05, - "loss": 1.4305, - "step": 6090 - }, - { - "epoch": 0.14, - "grad_norm": 3.9078855514526367, - "learning_rate": 1.6203389830508474e-05, - "loss": 1.3735, - "step": 6100 - }, - { - "epoch": 0.14, - "grad_norm": 3.0546069145202637, - "learning_rate": 1.6196610169491527e-05, - "loss": 1.3062, - "step": 6110 - }, - { - "epoch": 0.14, - "grad_norm": 3.7809054851531982, - "learning_rate": 1.6189830508474577e-05, - "loss": 1.2676, - "step": 6120 - }, - { - "epoch": 0.14, - "grad_norm": 2.5306100845336914, - "learning_rate": 1.618305084745763e-05, - "loss": 1.4725, - "step": 6130 - }, - { - "epoch": 0.14, - "grad_norm": 3.504068374633789, - "learning_rate": 1.617627118644068e-05, - "loss": 1.3162, - "step": 6140 - }, - { - "epoch": 0.14, - "grad_norm": 5.180143356323242, - "learning_rate": 1.616949152542373e-05, - "loss": 1.3681, - "step": 6150 - }, - { - "epoch": 0.14, - "grad_norm": 4.692683696746826, - "learning_rate": 1.6162711864406783e-05, - "loss": 1.2732, - "step": 6160 - }, - { - "epoch": 0.14, - "grad_norm": 1.6309053897857666, - "learning_rate": 1.6155932203389832e-05, - "loss": 1.212, - "step": 6170 - }, - { - "epoch": 0.14, - "grad_norm": 4.247219562530518, - "learning_rate": 1.6149152542372882e-05, - "loss": 1.3263, - "step": 6180 - }, - { - "epoch": 0.14, - "grad_norm": 1.420581579208374, - "learning_rate": 1.6142372881355935e-05, - "loss": 1.2392, - "step": 6190 - }, - { - "epoch": 0.14, - "grad_norm": 3.2501585483551025, - "learning_rate": 1.6135593220338985e-05, - "loss": 1.3958, - "step": 6200 - }, - { - "epoch": 0.14, - "grad_norm": 5.111384868621826, - "learning_rate": 1.6128813559322038e-05, - "loss": 1.4578, - "step": 6210 - }, - { - "epoch": 0.14, - "grad_norm": 2.0279738903045654, - "learning_rate": 1.6122033898305084e-05, - "loss": 1.4083, - "step": 6220 - }, - { - "epoch": 0.14, - "grad_norm": 0.9996086955070496, - "learning_rate": 1.6115254237288137e-05, - "loss": 1.2599, - "step": 6230 - }, - { - "epoch": 0.14, - "grad_norm": 2.6425893306732178, - "learning_rate": 1.6108474576271187e-05, - "loss": 1.3794, - "step": 6240 - }, - { - "epoch": 0.14, - "grad_norm": 4.84988260269165, - "learning_rate": 1.6101694915254237e-05, - "loss": 1.3801, - "step": 6250 - }, - { - "epoch": 0.14, - "grad_norm": 7.806826591491699, - "learning_rate": 1.609491525423729e-05, - "loss": 1.3006, - "step": 6260 - }, - { - "epoch": 0.15, - "grad_norm": 6.427366733551025, - "learning_rate": 1.608813559322034e-05, - "loss": 1.4588, - "step": 6270 - }, - { - "epoch": 0.15, - "grad_norm": 2.51039719581604, - "learning_rate": 1.608135593220339e-05, - "loss": 1.4209, - "step": 6280 - }, - { - "epoch": 0.15, - "grad_norm": 2.638364553451538, - "learning_rate": 1.6074576271186442e-05, - "loss": 1.3952, - "step": 6290 - }, - { - "epoch": 0.15, - "grad_norm": 1.9240260124206543, - "learning_rate": 1.6067796610169492e-05, - "loss": 1.3194, - "step": 6300 - }, - { - "epoch": 0.15, - "grad_norm": 2.798541784286499, - "learning_rate": 1.6061016949152545e-05, - "loss": 1.2272, - "step": 6310 - }, - { - "epoch": 0.15, - "grad_norm": 1.5487680435180664, - "learning_rate": 1.6054237288135595e-05, - "loss": 1.4795, - "step": 6320 - }, - { - "epoch": 0.15, - "grad_norm": 3.1405465602874756, - "learning_rate": 1.6047457627118645e-05, - "loss": 1.3574, - "step": 6330 - }, - { - "epoch": 0.15, - "grad_norm": 1.4945203065872192, - "learning_rate": 1.6040677966101698e-05, - "loss": 1.3875, - "step": 6340 - }, - { - "epoch": 0.15, - "grad_norm": 2.4741933345794678, - "learning_rate": 1.6033898305084747e-05, - "loss": 1.3757, - "step": 6350 - }, - { - "epoch": 0.15, - "grad_norm": 2.7438557147979736, - "learning_rate": 1.6027118644067797e-05, - "loss": 1.4486, - "step": 6360 - }, - { - "epoch": 0.15, - "grad_norm": 3.901851177215576, - "learning_rate": 1.602033898305085e-05, - "loss": 1.3954, - "step": 6370 - }, - { - "epoch": 0.15, - "grad_norm": 1.6728180646896362, - "learning_rate": 1.60135593220339e-05, - "loss": 1.311, - "step": 6380 - }, - { - "epoch": 0.15, - "grad_norm": 2.970463752746582, - "learning_rate": 1.600677966101695e-05, - "loss": 1.4011, - "step": 6390 - }, - { - "epoch": 0.15, - "grad_norm": 12.278678894042969, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.4062, - "step": 6400 - }, - { - "epoch": 0.15, - "grad_norm": 2.6090126037597656, - "learning_rate": 1.5993220338983052e-05, - "loss": 1.3565, - "step": 6410 - }, - { - "epoch": 0.15, - "grad_norm": 6.103749752044678, - "learning_rate": 1.5986440677966105e-05, - "loss": 1.4804, - "step": 6420 - }, - { - "epoch": 0.15, - "grad_norm": 3.822219133377075, - "learning_rate": 1.5979661016949152e-05, - "loss": 1.4567, - "step": 6430 - }, - { - "epoch": 0.15, - "grad_norm": 4.869916915893555, - "learning_rate": 1.5972881355932205e-05, - "loss": 1.3093, - "step": 6440 - }, - { - "epoch": 0.15, - "grad_norm": 2.4998152256011963, - "learning_rate": 1.5966101694915255e-05, - "loss": 1.5148, - "step": 6450 - }, - { - "epoch": 0.15, - "grad_norm": 3.6436197757720947, - "learning_rate": 1.5959322033898304e-05, - "loss": 1.519, - "step": 6460 - }, - { - "epoch": 0.15, - "grad_norm": 2.9514076709747314, - "learning_rate": 1.5952542372881357e-05, - "loss": 1.2673, - "step": 6470 - }, - { - "epoch": 0.15, - "grad_norm": 2.19160795211792, - "learning_rate": 1.5945762711864407e-05, - "loss": 1.2964, - "step": 6480 - }, - { - "epoch": 0.15, - "grad_norm": 3.302166700363159, - "learning_rate": 1.593898305084746e-05, - "loss": 1.5311, - "step": 6490 - }, - { - "epoch": 0.15, - "grad_norm": 3.648465156555176, - "learning_rate": 1.593220338983051e-05, - "loss": 1.5642, - "step": 6500 - }, - { - "epoch": 0.15, - "eval_loss": 1.1224900484085083, - "eval_runtime": 67.0535, - "eval_samples_per_second": 14.913, - "eval_steps_per_second": 14.913, - "step": 6500 - }, - { - "epoch": 0.15, - "grad_norm": 4.6730732917785645, - "learning_rate": 1.592542372881356e-05, - "loss": 1.3842, - "step": 6510 - }, - { - "epoch": 0.15, - "grad_norm": 2.7324059009552, - "learning_rate": 1.5918644067796613e-05, - "loss": 1.4838, - "step": 6520 - }, - { - "epoch": 0.15, - "grad_norm": 1.5643513202667236, - "learning_rate": 1.5911864406779662e-05, - "loss": 1.326, - "step": 6530 - }, - { - "epoch": 0.15, - "grad_norm": 5.981233596801758, - "learning_rate": 1.5905084745762712e-05, - "loss": 1.1575, - "step": 6540 - }, - { - "epoch": 0.15, - "grad_norm": 1.4779703617095947, - "learning_rate": 1.5898305084745765e-05, - "loss": 1.3118, - "step": 6550 - }, - { - "epoch": 0.15, - "grad_norm": 5.831774711608887, - "learning_rate": 1.5891525423728815e-05, - "loss": 1.3681, - "step": 6560 - }, - { - "epoch": 0.15, - "grad_norm": 4.515669345855713, - "learning_rate": 1.5884745762711865e-05, - "loss": 1.2631, - "step": 6570 - }, - { - "epoch": 0.15, - "grad_norm": 1.7888094186782837, - "learning_rate": 1.5877966101694918e-05, - "loss": 1.4389, - "step": 6580 - }, - { - "epoch": 0.15, - "grad_norm": 2.9246420860290527, - "learning_rate": 1.5871186440677967e-05, - "loss": 1.2706, - "step": 6590 - }, - { - "epoch": 0.15, - "grad_norm": 2.629147529602051, - "learning_rate": 1.586440677966102e-05, - "loss": 1.4099, - "step": 6600 - }, - { - "epoch": 0.15, - "grad_norm": 1.5322067737579346, - "learning_rate": 1.585762711864407e-05, - "loss": 1.3107, - "step": 6610 - }, - { - "epoch": 0.15, - "grad_norm": 1.1020938158035278, - "learning_rate": 1.585084745762712e-05, - "loss": 1.3415, - "step": 6620 - }, - { - "epoch": 0.15, - "grad_norm": 5.216763496398926, - "learning_rate": 1.584406779661017e-05, - "loss": 1.3664, - "step": 6630 - }, - { - "epoch": 0.15, - "grad_norm": 3.0716798305511475, - "learning_rate": 1.583728813559322e-05, - "loss": 1.1287, - "step": 6640 - }, - { - "epoch": 0.15, - "grad_norm": 2.659613609313965, - "learning_rate": 1.5830508474576272e-05, - "loss": 1.4241, - "step": 6650 - }, - { - "epoch": 0.15, - "grad_norm": 7.536604881286621, - "learning_rate": 1.5823728813559322e-05, - "loss": 1.426, - "step": 6660 - }, - { - "epoch": 0.15, - "grad_norm": 3.213017702102661, - "learning_rate": 1.5816949152542372e-05, - "loss": 1.3564, - "step": 6670 - }, - { - "epoch": 0.15, - "grad_norm": 2.929147481918335, - "learning_rate": 1.5810169491525425e-05, - "loss": 1.5176, - "step": 6680 - }, - { - "epoch": 0.15, - "grad_norm": 3.5247678756713867, - "learning_rate": 1.5803389830508475e-05, - "loss": 1.4608, - "step": 6690 - }, - { - "epoch": 0.16, - "grad_norm": 2.313474178314209, - "learning_rate": 1.5796610169491528e-05, - "loss": 1.411, - "step": 6700 - }, - { - "epoch": 0.16, - "grad_norm": 2.516223907470703, - "learning_rate": 1.5789830508474577e-05, - "loss": 1.2478, - "step": 6710 - }, - { - "epoch": 0.16, - "grad_norm": 3.387552499771118, - "learning_rate": 1.5783050847457627e-05, - "loss": 1.3225, - "step": 6720 - }, - { - "epoch": 0.16, - "grad_norm": 1.9858660697937012, - "learning_rate": 1.577627118644068e-05, - "loss": 1.3289, - "step": 6730 - }, - { - "epoch": 0.16, - "grad_norm": 2.3014678955078125, - "learning_rate": 1.576949152542373e-05, - "loss": 1.457, - "step": 6740 - }, - { - "epoch": 0.16, - "grad_norm": 10.245163917541504, - "learning_rate": 1.576271186440678e-05, - "loss": 1.4311, - "step": 6750 - }, - { - "epoch": 0.16, - "grad_norm": 2.805487871170044, - "learning_rate": 1.5755932203389833e-05, - "loss": 1.3857, - "step": 6760 - }, - { - "epoch": 0.16, - "grad_norm": 1.2822433710098267, - "learning_rate": 1.5749152542372882e-05, - "loss": 1.4489, - "step": 6770 - }, - { - "epoch": 0.16, - "grad_norm": 2.254396915435791, - "learning_rate": 1.5742372881355936e-05, - "loss": 1.3914, - "step": 6780 - }, - { - "epoch": 0.16, - "grad_norm": 1.3311165571212769, - "learning_rate": 1.5735593220338985e-05, - "loss": 1.243, - "step": 6790 - }, - { - "epoch": 0.16, - "grad_norm": 1.7225605249404907, - "learning_rate": 1.5728813559322035e-05, - "loss": 1.425, - "step": 6800 - }, - { - "epoch": 0.16, - "grad_norm": 3.585598945617676, - "learning_rate": 1.5722033898305088e-05, - "loss": 1.414, - "step": 6810 - }, - { - "epoch": 0.16, - "grad_norm": 2.3213350772857666, - "learning_rate": 1.5715254237288138e-05, - "loss": 1.3233, - "step": 6820 - }, - { - "epoch": 0.16, - "grad_norm": 1.5480399131774902, - "learning_rate": 1.5708474576271187e-05, - "loss": 1.3096, - "step": 6830 - }, - { - "epoch": 0.16, - "grad_norm": 2.9461381435394287, - "learning_rate": 1.5701694915254237e-05, - "loss": 1.2209, - "step": 6840 - }, - { - "epoch": 0.16, - "grad_norm": 1.8784675598144531, - "learning_rate": 1.5694915254237287e-05, - "loss": 1.2154, - "step": 6850 - }, - { - "epoch": 0.16, - "grad_norm": 2.964296817779541, - "learning_rate": 1.568813559322034e-05, - "loss": 1.3913, - "step": 6860 - }, - { - "epoch": 0.16, - "grad_norm": 2.013964891433716, - "learning_rate": 1.568135593220339e-05, - "loss": 1.4206, - "step": 6870 - }, - { - "epoch": 0.16, - "grad_norm": 2.359316110610962, - "learning_rate": 1.5674576271186443e-05, - "loss": 1.3806, - "step": 6880 - }, - { - "epoch": 0.16, - "grad_norm": 5.8272175788879395, - "learning_rate": 1.5667796610169492e-05, - "loss": 1.2084, - "step": 6890 - }, - { - "epoch": 0.16, - "grad_norm": 1.1895947456359863, - "learning_rate": 1.5661016949152542e-05, - "loss": 1.5326, - "step": 6900 - }, - { - "epoch": 0.16, - "grad_norm": 3.105100631713867, - "learning_rate": 1.5654237288135595e-05, - "loss": 1.3586, - "step": 6910 - }, - { - "epoch": 0.16, - "grad_norm": 2.031907081604004, - "learning_rate": 1.5647457627118645e-05, - "loss": 1.0995, - "step": 6920 - }, - { - "epoch": 0.16, - "grad_norm": 15.673041343688965, - "learning_rate": 1.5640677966101695e-05, - "loss": 1.5304, - "step": 6930 - }, - { - "epoch": 0.16, - "grad_norm": 6.542800426483154, - "learning_rate": 1.5633898305084748e-05, - "loss": 1.4612, - "step": 6940 - }, - { - "epoch": 0.16, - "grad_norm": 4.4466352462768555, - "learning_rate": 1.5627118644067798e-05, - "loss": 1.4641, - "step": 6950 - }, - { - "epoch": 0.16, - "grad_norm": 2.0358104705810547, - "learning_rate": 1.562033898305085e-05, - "loss": 1.5169, - "step": 6960 - }, - { - "epoch": 0.16, - "grad_norm": 3.3914594650268555, - "learning_rate": 1.56135593220339e-05, - "loss": 1.3483, - "step": 6970 - }, - { - "epoch": 0.16, - "grad_norm": 2.626600503921509, - "learning_rate": 1.560677966101695e-05, - "loss": 1.3114, - "step": 6980 - }, - { - "epoch": 0.16, - "grad_norm": 5.424988746643066, - "learning_rate": 1.5600000000000003e-05, - "loss": 1.2768, - "step": 6990 - }, - { - "epoch": 0.16, - "grad_norm": 7.747529029846191, - "learning_rate": 1.5593220338983053e-05, - "loss": 1.1494, - "step": 7000 - }, - { - "epoch": 0.16, - "eval_loss": 1.0665773153305054, - "eval_runtime": 67.1469, - "eval_samples_per_second": 14.893, - "eval_steps_per_second": 14.893, - "step": 7000 - }, - { - "epoch": 0.16, - "grad_norm": 0.9942423701286316, - "learning_rate": 1.5586440677966103e-05, - "loss": 1.3625, - "step": 7010 - }, - { - "epoch": 0.16, - "grad_norm": 2.6136345863342285, - "learning_rate": 1.5579661016949156e-05, - "loss": 1.2317, - "step": 7020 - }, - { - "epoch": 0.16, - "grad_norm": 2.3277318477630615, - "learning_rate": 1.5572881355932205e-05, - "loss": 1.5961, - "step": 7030 - }, - { - "epoch": 0.16, - "grad_norm": 4.660468578338623, - "learning_rate": 1.5566101694915255e-05, - "loss": 1.3753, - "step": 7040 - }, - { - "epoch": 0.16, - "grad_norm": 2.3660237789154053, - "learning_rate": 1.5559322033898305e-05, - "loss": 1.311, - "step": 7050 - }, - { - "epoch": 0.16, - "grad_norm": 5.831724643707275, - "learning_rate": 1.5552542372881358e-05, - "loss": 1.3895, - "step": 7060 - }, - { - "epoch": 0.16, - "grad_norm": 4.309049606323242, - "learning_rate": 1.5545762711864408e-05, - "loss": 1.325, - "step": 7070 - }, - { - "epoch": 0.16, - "grad_norm": 2.5130770206451416, - "learning_rate": 1.5538983050847457e-05, - "loss": 1.3108, - "step": 7080 - }, - { - "epoch": 0.16, - "grad_norm": 11.385069847106934, - "learning_rate": 1.553220338983051e-05, - "loss": 1.3483, - "step": 7090 - }, - { - "epoch": 0.16, - "grad_norm": 2.305030107498169, - "learning_rate": 1.552542372881356e-05, - "loss": 1.12, - "step": 7100 - }, - { - "epoch": 0.16, - "grad_norm": 1.7694568634033203, - "learning_rate": 1.551864406779661e-05, - "loss": 1.4892, - "step": 7110 - }, - { - "epoch": 0.16, - "grad_norm": 3.4342284202575684, - "learning_rate": 1.5511864406779663e-05, - "loss": 1.3841, - "step": 7120 - }, - { - "epoch": 0.17, - "grad_norm": 8.617453575134277, - "learning_rate": 1.5505084745762713e-05, - "loss": 1.2856, - "step": 7130 - }, - { - "epoch": 0.17, - "grad_norm": 4.346908092498779, - "learning_rate": 1.5498305084745762e-05, - "loss": 1.2906, - "step": 7140 - }, - { - "epoch": 0.17, - "grad_norm": 3.7418603897094727, - "learning_rate": 1.5491525423728815e-05, - "loss": 1.5261, - "step": 7150 - }, - { - "epoch": 0.17, - "grad_norm": 1.1197525262832642, - "learning_rate": 1.5484745762711865e-05, - "loss": 1.2972, - "step": 7160 - }, - { - "epoch": 0.17, - "grad_norm": 3.5833466053009033, - "learning_rate": 1.5477966101694918e-05, - "loss": 1.3393, - "step": 7170 - }, - { - "epoch": 0.17, - "grad_norm": 3.046830892562866, - "learning_rate": 1.5471186440677968e-05, - "loss": 1.4726, - "step": 7180 - }, - { - "epoch": 0.17, - "grad_norm": 10.847610473632812, - "learning_rate": 1.5464406779661018e-05, - "loss": 1.4872, - "step": 7190 - }, - { - "epoch": 0.17, - "grad_norm": 1.379472255706787, - "learning_rate": 1.545762711864407e-05, - "loss": 1.3873, - "step": 7200 - }, - { - "epoch": 0.17, - "grad_norm": 2.1769838333129883, - "learning_rate": 1.545084745762712e-05, - "loss": 1.432, - "step": 7210 - }, - { - "epoch": 0.17, - "grad_norm": 2.547250747680664, - "learning_rate": 1.544406779661017e-05, - "loss": 1.5696, - "step": 7220 - }, - { - "epoch": 0.17, - "grad_norm": 8.801671981811523, - "learning_rate": 1.5437288135593223e-05, - "loss": 1.4942, - "step": 7230 - }, - { - "epoch": 0.17, - "grad_norm": 2.674645185470581, - "learning_rate": 1.5430508474576273e-05, - "loss": 1.2567, - "step": 7240 - }, - { - "epoch": 0.17, - "grad_norm": 3.047281503677368, - "learning_rate": 1.5423728813559326e-05, - "loss": 1.1286, - "step": 7250 - }, - { - "epoch": 0.17, - "grad_norm": 1.422094464302063, - "learning_rate": 1.5416949152542372e-05, - "loss": 1.4765, - "step": 7260 - }, - { - "epoch": 0.17, - "grad_norm": 3.8606622219085693, - "learning_rate": 1.5410169491525425e-05, - "loss": 1.2492, - "step": 7270 - }, - { - "epoch": 0.17, - "grad_norm": 2.5654449462890625, - "learning_rate": 1.5403389830508475e-05, - "loss": 1.2715, - "step": 7280 - }, - { - "epoch": 0.17, - "grad_norm": 2.5264976024627686, - "learning_rate": 1.5396610169491525e-05, - "loss": 1.4896, - "step": 7290 - }, - { - "epoch": 0.17, - "grad_norm": 5.585784435272217, - "learning_rate": 1.5389830508474578e-05, - "loss": 1.3929, - "step": 7300 - }, - { - "epoch": 0.17, - "grad_norm": 1.7968121767044067, - "learning_rate": 1.5383050847457628e-05, - "loss": 1.3018, - "step": 7310 - }, - { - "epoch": 0.17, - "grad_norm": 2.1171720027923584, - "learning_rate": 1.5376271186440677e-05, - "loss": 1.4814, - "step": 7320 - }, - { - "epoch": 0.17, - "grad_norm": 5.140902519226074, - "learning_rate": 1.536949152542373e-05, - "loss": 1.454, - "step": 7330 - }, - { - "epoch": 0.17, - "grad_norm": 8.208806037902832, - "learning_rate": 1.536271186440678e-05, - "loss": 1.3176, - "step": 7340 - }, - { - "epoch": 0.17, - "grad_norm": 4.932653903961182, - "learning_rate": 1.5355932203389833e-05, - "loss": 1.3674, - "step": 7350 - }, - { - "epoch": 0.17, - "grad_norm": 0.9559627175331116, - "learning_rate": 1.5349152542372883e-05, - "loss": 1.3701, - "step": 7360 - }, - { - "epoch": 0.17, - "grad_norm": 4.199779033660889, - "learning_rate": 1.5342372881355933e-05, - "loss": 1.2293, - "step": 7370 - }, - { - "epoch": 0.17, - "grad_norm": 4.100563049316406, - "learning_rate": 1.5335593220338986e-05, - "loss": 1.3448, - "step": 7380 - }, - { - "epoch": 0.17, - "grad_norm": 2.8607914447784424, - "learning_rate": 1.5328813559322035e-05, - "loss": 1.4457, - "step": 7390 - }, - { - "epoch": 0.17, - "grad_norm": 6.619777202606201, - "learning_rate": 1.5322033898305085e-05, - "loss": 1.5492, - "step": 7400 - }, - { - "epoch": 0.17, - "grad_norm": 5.614234447479248, - "learning_rate": 1.5315254237288138e-05, - "loss": 1.2983, - "step": 7410 - }, - { - "epoch": 0.17, - "grad_norm": 2.3404347896575928, - "learning_rate": 1.5308474576271188e-05, - "loss": 1.3885, - "step": 7420 - }, - { - "epoch": 0.17, - "grad_norm": 2.3918747901916504, - "learning_rate": 1.530169491525424e-05, - "loss": 1.2908, - "step": 7430 - }, - { - "epoch": 0.17, - "grad_norm": 2.5815324783325195, - "learning_rate": 1.529491525423729e-05, - "loss": 1.2632, - "step": 7440 - }, - { - "epoch": 0.17, - "grad_norm": 5.126127243041992, - "learning_rate": 1.528813559322034e-05, - "loss": 1.6003, - "step": 7450 - }, - { - "epoch": 0.17, - "grad_norm": 4.3783955574035645, - "learning_rate": 1.528135593220339e-05, - "loss": 1.3901, - "step": 7460 - }, - { - "epoch": 0.17, - "grad_norm": 2.654667615890503, - "learning_rate": 1.527457627118644e-05, - "loss": 1.2822, - "step": 7470 - }, - { - "epoch": 0.17, - "grad_norm": 7.821676731109619, - "learning_rate": 1.5267796610169493e-05, - "loss": 1.396, - "step": 7480 - }, - { - "epoch": 0.17, - "grad_norm": 2.0116024017333984, - "learning_rate": 1.5261016949152543e-05, - "loss": 1.4361, - "step": 7490 - }, - { - "epoch": 0.17, - "grad_norm": 2.1018917560577393, - "learning_rate": 1.5254237288135594e-05, - "loss": 1.3302, - "step": 7500 - }, - { - "epoch": 0.17, - "eval_loss": 1.1425343751907349, - "eval_runtime": 67.2314, - "eval_samples_per_second": 14.874, - "eval_steps_per_second": 14.874, - "step": 7500 - }, - { - "epoch": 0.17, - "grad_norm": 4.509432315826416, - "learning_rate": 1.5247457627118645e-05, - "loss": 1.3979, - "step": 7510 - }, - { - "epoch": 0.17, - "grad_norm": 2.7782204151153564, - "learning_rate": 1.5240677966101695e-05, - "loss": 1.4353, - "step": 7520 - }, - { - "epoch": 0.17, - "grad_norm": 1.9378540515899658, - "learning_rate": 1.5233898305084747e-05, - "loss": 1.2803, - "step": 7530 - }, - { - "epoch": 0.17, - "grad_norm": 3.195469379425049, - "learning_rate": 1.5227118644067798e-05, - "loss": 1.3325, - "step": 7540 - }, - { - "epoch": 0.17, - "grad_norm": 3.075035810470581, - "learning_rate": 1.522033898305085e-05, - "loss": 1.2988, - "step": 7550 - }, - { - "epoch": 0.18, - "grad_norm": 2.2794687747955322, - "learning_rate": 1.5213559322033899e-05, - "loss": 1.3588, - "step": 7560 - }, - { - "epoch": 0.18, - "grad_norm": 1.5682082176208496, - "learning_rate": 1.520677966101695e-05, - "loss": 1.2074, - "step": 7570 - }, - { - "epoch": 0.18, - "grad_norm": 1.2680002450942993, - "learning_rate": 1.5200000000000002e-05, - "loss": 1.3791, - "step": 7580 - }, - { - "epoch": 0.18, - "grad_norm": 4.0036725997924805, - "learning_rate": 1.5193220338983052e-05, - "loss": 1.4726, - "step": 7590 - }, - { - "epoch": 0.18, - "grad_norm": 2.5019006729125977, - "learning_rate": 1.5186440677966103e-05, - "loss": 1.418, - "step": 7600 - }, - { - "epoch": 0.18, - "grad_norm": 2.3398947715759277, - "learning_rate": 1.5179661016949154e-05, - "loss": 1.2933, - "step": 7610 - }, - { - "epoch": 0.18, - "grad_norm": 2.528242826461792, - "learning_rate": 1.5172881355932206e-05, - "loss": 1.1812, - "step": 7620 - }, - { - "epoch": 0.18, - "grad_norm": 1.616392970085144, - "learning_rate": 1.5166101694915255e-05, - "loss": 1.5172, - "step": 7630 - }, - { - "epoch": 0.18, - "grad_norm": 3.7177088260650635, - "learning_rate": 1.5159322033898307e-05, - "loss": 1.251, - "step": 7640 - }, - { - "epoch": 0.18, - "grad_norm": 1.7318824529647827, - "learning_rate": 1.5152542372881358e-05, - "loss": 1.4652, - "step": 7650 - }, - { - "epoch": 0.18, - "grad_norm": 9.107917785644531, - "learning_rate": 1.514576271186441e-05, - "loss": 1.2738, - "step": 7660 - }, - { - "epoch": 0.18, - "grad_norm": 2.2745494842529297, - "learning_rate": 1.5138983050847458e-05, - "loss": 1.3997, - "step": 7670 - }, - { - "epoch": 0.18, - "grad_norm": 2.564849376678467, - "learning_rate": 1.5132203389830509e-05, - "loss": 1.3054, - "step": 7680 - }, - { - "epoch": 0.18, - "grad_norm": 5.596649169921875, - "learning_rate": 1.512542372881356e-05, - "loss": 1.2581, - "step": 7690 - }, - { - "epoch": 0.18, - "grad_norm": 1.4178537130355835, - "learning_rate": 1.511864406779661e-05, - "loss": 1.3319, - "step": 7700 - }, - { - "epoch": 0.18, - "grad_norm": 3.1207597255706787, - "learning_rate": 1.5111864406779662e-05, - "loss": 1.3919, - "step": 7710 - }, - { - "epoch": 0.18, - "grad_norm": 1.1499109268188477, - "learning_rate": 1.5105084745762713e-05, - "loss": 1.2123, - "step": 7720 - }, - { - "epoch": 0.18, - "grad_norm": 7.321339130401611, - "learning_rate": 1.5098305084745763e-05, - "loss": 1.3294, - "step": 7730 - }, - { - "epoch": 0.18, - "grad_norm": 4.1394243240356445, - "learning_rate": 1.5091525423728814e-05, - "loss": 1.4105, - "step": 7740 - }, - { - "epoch": 0.18, - "grad_norm": 4.41034460067749, - "learning_rate": 1.5084745762711865e-05, - "loss": 1.4582, - "step": 7750 - }, - { - "epoch": 0.18, - "grad_norm": 5.8325324058532715, - "learning_rate": 1.5077966101694917e-05, - "loss": 1.1756, - "step": 7760 - }, - { - "epoch": 0.18, - "grad_norm": 3.059936761856079, - "learning_rate": 1.5071186440677967e-05, - "loss": 1.4276, - "step": 7770 - }, - { - "epoch": 0.18, - "grad_norm": 1.1993743181228638, - "learning_rate": 1.5064406779661018e-05, - "loss": 1.1909, - "step": 7780 - }, - { - "epoch": 0.18, - "grad_norm": 6.689935207366943, - "learning_rate": 1.505762711864407e-05, - "loss": 1.3381, - "step": 7790 - }, - { - "epoch": 0.18, - "grad_norm": 4.1810150146484375, - "learning_rate": 1.505084745762712e-05, - "loss": 1.4885, - "step": 7800 - }, - { - "epoch": 0.18, - "grad_norm": 4.501943588256836, - "learning_rate": 1.504406779661017e-05, - "loss": 1.3556, - "step": 7810 - }, - { - "epoch": 0.18, - "grad_norm": 7.109510898590088, - "learning_rate": 1.5037288135593222e-05, - "loss": 1.3156, - "step": 7820 - }, - { - "epoch": 0.18, - "grad_norm": 4.330848693847656, - "learning_rate": 1.5030508474576273e-05, - "loss": 1.508, - "step": 7830 - }, - { - "epoch": 0.18, - "grad_norm": 3.391294240951538, - "learning_rate": 1.5023728813559325e-05, - "loss": 1.3651, - "step": 7840 - }, - { - "epoch": 0.18, - "grad_norm": 1.8204346895217896, - "learning_rate": 1.5016949152542374e-05, - "loss": 1.3925, - "step": 7850 - }, - { - "epoch": 0.18, - "grad_norm": 1.8383281230926514, - "learning_rate": 1.5010169491525426e-05, - "loss": 1.4486, - "step": 7860 - }, - { - "epoch": 0.18, - "grad_norm": 7.470884323120117, - "learning_rate": 1.5003389830508477e-05, - "loss": 1.4738, - "step": 7870 - }, - { - "epoch": 0.18, - "grad_norm": 5.963416576385498, - "learning_rate": 1.4996610169491525e-05, - "loss": 1.3832, - "step": 7880 - }, - { - "epoch": 0.18, - "grad_norm": 5.30115270614624, - "learning_rate": 1.4989830508474577e-05, - "loss": 1.4412, - "step": 7890 - }, - { - "epoch": 0.18, - "grad_norm": 1.7669206857681274, - "learning_rate": 1.4983050847457628e-05, - "loss": 1.3016, - "step": 7900 - }, - { - "epoch": 0.18, - "grad_norm": 1.1460033655166626, - "learning_rate": 1.4976271186440678e-05, - "loss": 1.3938, - "step": 7910 - }, - { - "epoch": 0.18, - "grad_norm": 3.047388792037964, - "learning_rate": 1.4969491525423729e-05, - "loss": 1.352, - "step": 7920 - }, - { - "epoch": 0.18, - "grad_norm": 2.9169957637786865, - "learning_rate": 1.496271186440678e-05, - "loss": 1.274, - "step": 7930 - }, - { - "epoch": 0.18, - "grad_norm": 2.4173505306243896, - "learning_rate": 1.4955932203389832e-05, - "loss": 1.5062, - "step": 7940 - }, - { - "epoch": 0.18, - "grad_norm": 2.8410110473632812, - "learning_rate": 1.4949152542372882e-05, - "loss": 1.3504, - "step": 7950 - }, - { - "epoch": 0.18, - "grad_norm": 4.969642162322998, - "learning_rate": 1.4942372881355933e-05, - "loss": 1.4739, - "step": 7960 - }, - { - "epoch": 0.18, - "grad_norm": 3.8513638973236084, - "learning_rate": 1.4935593220338984e-05, - "loss": 1.4283, - "step": 7970 - }, - { - "epoch": 0.18, - "grad_norm": 1.2723006010055542, - "learning_rate": 1.4928813559322036e-05, - "loss": 1.5301, - "step": 7980 - }, - { - "epoch": 0.18, - "grad_norm": 1.1805778741836548, - "learning_rate": 1.4922033898305086e-05, - "loss": 1.3213, - "step": 7990 - }, - { - "epoch": 0.19, - "grad_norm": 3.0252439975738525, - "learning_rate": 1.4915254237288137e-05, - "loss": 1.2692, - "step": 8000 - }, - { - "epoch": 0.19, - "eval_loss": 1.098508358001709, - "eval_runtime": 67.0444, - "eval_samples_per_second": 14.915, - "eval_steps_per_second": 14.915, - "step": 8000 - }, - { - "epoch": 0.19, - "grad_norm": 2.2070083618164062, - "learning_rate": 1.4908474576271188e-05, - "loss": 1.3754, - "step": 8010 - }, - { - "epoch": 0.19, - "grad_norm": 2.910412549972534, - "learning_rate": 1.490169491525424e-05, - "loss": 1.5372, - "step": 8020 - }, - { - "epoch": 0.19, - "grad_norm": 2.161473274230957, - "learning_rate": 1.489491525423729e-05, - "loss": 1.4059, - "step": 8030 - }, - { - "epoch": 0.19, - "grad_norm": 2.312554121017456, - "learning_rate": 1.4888135593220341e-05, - "loss": 1.3982, - "step": 8040 - }, - { - "epoch": 0.19, - "grad_norm": 2.2093117237091064, - "learning_rate": 1.4881355932203392e-05, - "loss": 1.386, - "step": 8050 - }, - { - "epoch": 0.19, - "grad_norm": 1.6620709896087646, - "learning_rate": 1.4874576271186442e-05, - "loss": 1.2786, - "step": 8060 - }, - { - "epoch": 0.19, - "grad_norm": 2.2062294483184814, - "learning_rate": 1.4867796610169493e-05, - "loss": 1.419, - "step": 8070 - }, - { - "epoch": 0.19, - "grad_norm": 4.173048973083496, - "learning_rate": 1.4861016949152545e-05, - "loss": 1.3723, - "step": 8080 - }, - { - "epoch": 0.19, - "grad_norm": 4.707441329956055, - "learning_rate": 1.4854237288135593e-05, - "loss": 1.1348, - "step": 8090 - }, - { - "epoch": 0.19, - "grad_norm": 3.876202344894409, - "learning_rate": 1.4847457627118644e-05, - "loss": 1.1759, - "step": 8100 - }, - { - "epoch": 0.19, - "grad_norm": 1.0642389059066772, - "learning_rate": 1.4840677966101696e-05, - "loss": 1.2566, - "step": 8110 - }, - { - "epoch": 0.19, - "grad_norm": 2.49448823928833, - "learning_rate": 1.4833898305084747e-05, - "loss": 1.4826, - "step": 8120 - }, - { - "epoch": 0.19, - "grad_norm": 2.88454008102417, - "learning_rate": 1.4827118644067797e-05, - "loss": 1.3342, - "step": 8130 - }, - { - "epoch": 0.19, - "grad_norm": 14.040616035461426, - "learning_rate": 1.4820338983050848e-05, - "loss": 1.293, - "step": 8140 - }, - { - "epoch": 0.19, - "grad_norm": 1.384743094444275, - "learning_rate": 1.48135593220339e-05, - "loss": 1.5205, - "step": 8150 - }, - { - "epoch": 0.19, - "grad_norm": 3.8649702072143555, - "learning_rate": 1.4806779661016951e-05, - "loss": 1.4878, - "step": 8160 - }, - { - "epoch": 0.19, - "grad_norm": 2.677607774734497, - "learning_rate": 1.48e-05, - "loss": 1.3551, - "step": 8170 - }, - { - "epoch": 0.19, - "grad_norm": 2.7539191246032715, - "learning_rate": 1.4793220338983052e-05, - "loss": 1.6182, - "step": 8180 - }, - { - "epoch": 0.19, - "grad_norm": 1.3938488960266113, - "learning_rate": 1.4786440677966103e-05, - "loss": 1.3727, - "step": 8190 - }, - { - "epoch": 0.19, - "grad_norm": 2.8780012130737305, - "learning_rate": 1.4779661016949153e-05, - "loss": 1.3845, - "step": 8200 - }, - { - "epoch": 0.19, - "grad_norm": 2.8616294860839844, - "learning_rate": 1.4772881355932205e-05, - "loss": 1.4509, - "step": 8210 - }, - { - "epoch": 0.19, - "grad_norm": 1.9469797611236572, - "learning_rate": 1.4766101694915256e-05, - "loss": 1.2653, - "step": 8220 - }, - { - "epoch": 0.19, - "grad_norm": 3.5003085136413574, - "learning_rate": 1.4759322033898307e-05, - "loss": 1.4115, - "step": 8230 - }, - { - "epoch": 0.19, - "grad_norm": 5.820629596710205, - "learning_rate": 1.4752542372881357e-05, - "loss": 1.4058, - "step": 8240 - }, - { - "epoch": 0.19, - "grad_norm": 8.11052417755127, - "learning_rate": 1.4745762711864408e-05, - "loss": 1.3686, - "step": 8250 - }, - { - "epoch": 0.19, - "grad_norm": 5.473447799682617, - "learning_rate": 1.473898305084746e-05, - "loss": 1.2394, - "step": 8260 - }, - { - "epoch": 0.19, - "grad_norm": 8.85197925567627, - "learning_rate": 1.4732203389830511e-05, - "loss": 1.3407, - "step": 8270 - }, - { - "epoch": 0.19, - "grad_norm": 3.4516103267669678, - "learning_rate": 1.4725423728813561e-05, - "loss": 1.5529, - "step": 8280 - }, - { - "epoch": 0.19, - "grad_norm": 2.373441219329834, - "learning_rate": 1.4718644067796612e-05, - "loss": 1.3036, - "step": 8290 - }, - { - "epoch": 0.19, - "grad_norm": 4.667367458343506, - "learning_rate": 1.4711864406779662e-05, - "loss": 1.219, - "step": 8300 - }, - { - "epoch": 0.19, - "grad_norm": 1.8208861351013184, - "learning_rate": 1.4705084745762712e-05, - "loss": 1.2244, - "step": 8310 - }, - { - "epoch": 0.19, - "grad_norm": 9.953441619873047, - "learning_rate": 1.4698305084745763e-05, - "loss": 1.2222, - "step": 8320 - }, - { - "epoch": 0.19, - "grad_norm": 2.542130708694458, - "learning_rate": 1.4691525423728815e-05, - "loss": 1.4311, - "step": 8330 - }, - { - "epoch": 0.19, - "grad_norm": 4.040639400482178, - "learning_rate": 1.4684745762711864e-05, - "loss": 1.3049, - "step": 8340 - }, - { - "epoch": 0.19, - "grad_norm": 3.6556200981140137, - "learning_rate": 1.4677966101694916e-05, - "loss": 1.3456, - "step": 8350 - }, - { - "epoch": 0.19, - "grad_norm": 3.1826813220977783, - "learning_rate": 1.4671186440677967e-05, - "loss": 1.3355, - "step": 8360 - }, - { - "epoch": 0.19, - "grad_norm": 1.7379783391952515, - "learning_rate": 1.4664406779661018e-05, - "loss": 1.2498, - "step": 8370 - }, - { - "epoch": 0.19, - "grad_norm": 4.671259880065918, - "learning_rate": 1.4657627118644068e-05, - "loss": 1.4722, - "step": 8380 - }, - { - "epoch": 0.19, - "grad_norm": 1.5242054462432861, - "learning_rate": 1.465084745762712e-05, - "loss": 1.4148, - "step": 8390 - }, - { - "epoch": 0.19, - "grad_norm": 1.3360844850540161, - "learning_rate": 1.4644067796610171e-05, - "loss": 1.2182, - "step": 8400 - }, - { - "epoch": 0.19, - "grad_norm": 3.401080369949341, - "learning_rate": 1.4637288135593222e-05, - "loss": 1.2632, - "step": 8410 - }, - { - "epoch": 0.19, - "grad_norm": 3.884335517883301, - "learning_rate": 1.4630508474576272e-05, - "loss": 1.2798, - "step": 8420 - }, - { - "epoch": 0.2, - "grad_norm": 3.928900718688965, - "learning_rate": 1.4623728813559323e-05, - "loss": 1.3342, - "step": 8430 - }, - { - "epoch": 0.2, - "grad_norm": 3.478438377380371, - "learning_rate": 1.4616949152542375e-05, - "loss": 1.4387, - "step": 8440 - }, - { - "epoch": 0.2, - "grad_norm": 2.839172124862671, - "learning_rate": 1.4610169491525426e-05, - "loss": 1.4165, - "step": 8450 - }, - { - "epoch": 0.2, - "grad_norm": 8.968944549560547, - "learning_rate": 1.4603389830508476e-05, - "loss": 1.3139, - "step": 8460 - }, - { - "epoch": 0.2, - "grad_norm": 2.869877338409424, - "learning_rate": 1.4596610169491527e-05, - "loss": 1.4249, - "step": 8470 - }, - { - "epoch": 0.2, - "grad_norm": 3.0457489490509033, - "learning_rate": 1.4589830508474579e-05, - "loss": 1.2988, - "step": 8480 - }, - { - "epoch": 0.2, - "grad_norm": 5.064341068267822, - "learning_rate": 1.458305084745763e-05, - "loss": 1.337, - "step": 8490 - }, - { - "epoch": 0.2, - "grad_norm": 2.481555700302124, - "learning_rate": 1.4576271186440678e-05, - "loss": 1.2976, - "step": 8500 - }, - { - "epoch": 0.2, - "eval_loss": 1.0718435049057007, - "eval_runtime": 67.009, - "eval_samples_per_second": 14.923, - "eval_steps_per_second": 14.923, - "step": 8500 - }, - { - "epoch": 0.2, - "grad_norm": 1.9016735553741455, - "learning_rate": 1.456949152542373e-05, - "loss": 1.3663, - "step": 8510 - }, - { - "epoch": 0.2, - "grad_norm": 1.9207221269607544, - "learning_rate": 1.456271186440678e-05, - "loss": 1.2997, - "step": 8520 - }, - { - "epoch": 0.2, - "grad_norm": 2.0371081829071045, - "learning_rate": 1.455593220338983e-05, - "loss": 1.3997, - "step": 8530 - }, - { - "epoch": 0.2, - "grad_norm": 2.698136806488037, - "learning_rate": 1.4549152542372882e-05, - "loss": 1.4582, - "step": 8540 - }, - { - "epoch": 0.2, - "grad_norm": 5.272952556610107, - "learning_rate": 1.4542372881355933e-05, - "loss": 1.2139, - "step": 8550 - }, - { - "epoch": 0.2, - "grad_norm": 3.3560380935668945, - "learning_rate": 1.4535593220338983e-05, - "loss": 1.5006, - "step": 8560 - }, - { - "epoch": 0.2, - "grad_norm": 4.281118869781494, - "learning_rate": 1.4528813559322035e-05, - "loss": 1.3091, - "step": 8570 - }, - { - "epoch": 0.2, - "grad_norm": 5.719073295593262, - "learning_rate": 1.4522033898305086e-05, - "loss": 1.5326, - "step": 8580 - }, - { - "epoch": 0.2, - "grad_norm": 2.063105821609497, - "learning_rate": 1.4515254237288137e-05, - "loss": 1.3689, - "step": 8590 - }, - { - "epoch": 0.2, - "grad_norm": 1.946047306060791, - "learning_rate": 1.4508474576271187e-05, - "loss": 1.337, - "step": 8600 - }, - { - "epoch": 0.2, - "grad_norm": 0.704504668712616, - "learning_rate": 1.4501694915254239e-05, - "loss": 1.2556, - "step": 8610 - }, - { - "epoch": 0.2, - "grad_norm": 4.635929584503174, - "learning_rate": 1.449491525423729e-05, - "loss": 1.2115, - "step": 8620 - }, - { - "epoch": 0.2, - "grad_norm": 1.6211448907852173, - "learning_rate": 1.4488135593220341e-05, - "loss": 1.4003, - "step": 8630 - }, - { - "epoch": 0.2, - "grad_norm": 3.5351459980010986, - "learning_rate": 1.4481355932203391e-05, - "loss": 1.3165, - "step": 8640 - }, - { - "epoch": 0.2, - "grad_norm": 3.730940580368042, - "learning_rate": 1.4474576271186442e-05, - "loss": 1.1764, - "step": 8650 - }, - { - "epoch": 0.2, - "grad_norm": 2.209855794906616, - "learning_rate": 1.4467796610169494e-05, - "loss": 1.5537, - "step": 8660 - }, - { - "epoch": 0.2, - "grad_norm": 5.097927570343018, - "learning_rate": 1.4461016949152544e-05, - "loss": 1.2082, - "step": 8670 - }, - { - "epoch": 0.2, - "grad_norm": 2.625061273574829, - "learning_rate": 1.4454237288135595e-05, - "loss": 1.4377, - "step": 8680 - }, - { - "epoch": 0.2, - "grad_norm": 3.1008729934692383, - "learning_rate": 1.4447457627118646e-05, - "loss": 1.3205, - "step": 8690 - }, - { - "epoch": 0.2, - "grad_norm": 2.726665735244751, - "learning_rate": 1.4440677966101698e-05, - "loss": 1.5016, - "step": 8700 - }, - { - "epoch": 0.2, - "grad_norm": 2.084998369216919, - "learning_rate": 1.4433898305084746e-05, - "loss": 1.3151, - "step": 8710 - }, - { - "epoch": 0.2, - "grad_norm": 4.455216884613037, - "learning_rate": 1.4427118644067797e-05, - "loss": 1.2526, - "step": 8720 - }, - { - "epoch": 0.2, - "grad_norm": 3.0726282596588135, - "learning_rate": 1.4420338983050849e-05, - "loss": 1.3079, - "step": 8730 - }, - { - "epoch": 0.2, - "grad_norm": 6.472900867462158, - "learning_rate": 1.4413559322033898e-05, - "loss": 1.4239, - "step": 8740 - }, - { - "epoch": 0.2, - "grad_norm": 2.418191909790039, - "learning_rate": 1.440677966101695e-05, - "loss": 1.1407, - "step": 8750 - }, - { - "epoch": 0.2, - "grad_norm": 6.4082489013671875, - "learning_rate": 1.4400000000000001e-05, - "loss": 1.2607, - "step": 8760 - }, - { - "epoch": 0.2, - "grad_norm": 2.1980984210968018, - "learning_rate": 1.4393220338983052e-05, - "loss": 1.4132, - "step": 8770 - }, - { - "epoch": 0.2, - "grad_norm": 6.397686958312988, - "learning_rate": 1.4386440677966102e-05, - "loss": 1.4096, - "step": 8780 - }, - { - "epoch": 0.2, - "grad_norm": 1.1709277629852295, - "learning_rate": 1.4379661016949154e-05, - "loss": 1.3932, - "step": 8790 - }, - { - "epoch": 0.2, - "grad_norm": 1.7701913118362427, - "learning_rate": 1.4372881355932205e-05, - "loss": 1.4644, - "step": 8800 - }, - { - "epoch": 0.2, - "grad_norm": 4.566888809204102, - "learning_rate": 1.4366101694915255e-05, - "loss": 1.4883, - "step": 8810 - }, - { - "epoch": 0.2, - "grad_norm": 5.88577127456665, - "learning_rate": 1.4359322033898306e-05, - "loss": 1.4677, - "step": 8820 - }, - { - "epoch": 0.2, - "grad_norm": 3.0660550594329834, - "learning_rate": 1.4352542372881357e-05, - "loss": 1.3425, - "step": 8830 - }, - { - "epoch": 0.2, - "grad_norm": 5.181571006774902, - "learning_rate": 1.4345762711864409e-05, - "loss": 1.4117, - "step": 8840 - }, - { - "epoch": 0.2, - "grad_norm": 2.3248679637908936, - "learning_rate": 1.4338983050847459e-05, - "loss": 1.3787, - "step": 8850 - }, - { - "epoch": 0.21, - "grad_norm": 3.677826404571533, - "learning_rate": 1.433220338983051e-05, - "loss": 1.3205, - "step": 8860 - }, - { - "epoch": 0.21, - "grad_norm": 1.778449296951294, - "learning_rate": 1.4325423728813561e-05, - "loss": 1.183, - "step": 8870 - }, - { - "epoch": 0.21, - "grad_norm": 2.0881848335266113, - "learning_rate": 1.4318644067796613e-05, - "loss": 1.4995, - "step": 8880 - }, - { - "epoch": 0.21, - "grad_norm": 10.81136703491211, - "learning_rate": 1.4311864406779662e-05, - "loss": 1.3487, - "step": 8890 - }, - { - "epoch": 0.21, - "grad_norm": 8.108224868774414, - "learning_rate": 1.4305084745762714e-05, - "loss": 1.492, - "step": 8900 - }, - { - "epoch": 0.21, - "grad_norm": 3.3358426094055176, - "learning_rate": 1.4298305084745765e-05, - "loss": 1.4723, - "step": 8910 - }, - { - "epoch": 0.21, - "grad_norm": 1.7977956533432007, - "learning_rate": 1.4291525423728813e-05, - "loss": 1.4662, - "step": 8920 - }, - { - "epoch": 0.21, - "grad_norm": 5.2564921379089355, - "learning_rate": 1.4284745762711865e-05, - "loss": 1.2359, - "step": 8930 - }, - { - "epoch": 0.21, - "grad_norm": 6.835478782653809, - "learning_rate": 1.4277966101694916e-05, - "loss": 1.5774, - "step": 8940 - }, - { - "epoch": 0.21, - "grad_norm": 10.185200691223145, - "learning_rate": 1.4271186440677966e-05, - "loss": 1.5078, - "step": 8950 - }, - { - "epoch": 0.21, - "grad_norm": 3.049248218536377, - "learning_rate": 1.4264406779661017e-05, - "loss": 1.3978, - "step": 8960 - }, - { - "epoch": 0.21, - "grad_norm": 3.1511282920837402, - "learning_rate": 1.4257627118644069e-05, - "loss": 1.4465, - "step": 8970 - }, - { - "epoch": 0.21, - "grad_norm": 4.345871925354004, - "learning_rate": 1.425084745762712e-05, - "loss": 1.401, - "step": 8980 - }, - { - "epoch": 0.21, - "grad_norm": 4.231874942779541, - "learning_rate": 1.424406779661017e-05, - "loss": 1.2993, - "step": 8990 - }, - { - "epoch": 0.21, - "grad_norm": 2.336160182952881, - "learning_rate": 1.4237288135593221e-05, - "loss": 1.3631, - "step": 9000 - }, - { - "epoch": 0.21, - "eval_loss": 1.0496437549591064, - "eval_runtime": 67.0488, - "eval_samples_per_second": 14.915, - "eval_steps_per_second": 14.915, - "step": 9000 - }, - { - "epoch": 0.21, - "grad_norm": 2.6596415042877197, - "learning_rate": 1.4230508474576273e-05, - "loss": 1.3761, - "step": 9010 - }, - { - "epoch": 0.21, - "grad_norm": 3.467890501022339, - "learning_rate": 1.4223728813559324e-05, - "loss": 1.459, - "step": 9020 - }, - { - "epoch": 0.21, - "grad_norm": 2.3155555725097656, - "learning_rate": 1.4216949152542374e-05, - "loss": 1.3093, - "step": 9030 - }, - { - "epoch": 0.21, - "grad_norm": 2.9326798915863037, - "learning_rate": 1.4210169491525425e-05, - "loss": 1.3302, - "step": 9040 - }, - { - "epoch": 0.21, - "grad_norm": 3.875458002090454, - "learning_rate": 1.4203389830508476e-05, - "loss": 1.2721, - "step": 9050 - }, - { - "epoch": 0.21, - "grad_norm": 3.327162742614746, - "learning_rate": 1.4196610169491528e-05, - "loss": 1.3188, - "step": 9060 - }, - { - "epoch": 0.21, - "grad_norm": 6.240946292877197, - "learning_rate": 1.4189830508474578e-05, - "loss": 1.2654, - "step": 9070 - }, - { - "epoch": 0.21, - "grad_norm": 3.747529983520508, - "learning_rate": 1.4183050847457629e-05, - "loss": 1.3903, - "step": 9080 - }, - { - "epoch": 0.21, - "grad_norm": 3.1977028846740723, - "learning_rate": 1.417627118644068e-05, - "loss": 1.484, - "step": 9090 - }, - { - "epoch": 0.21, - "grad_norm": 2.0423030853271484, - "learning_rate": 1.416949152542373e-05, - "loss": 1.2111, - "step": 9100 - }, - { - "epoch": 0.21, - "grad_norm": 2.0299501419067383, - "learning_rate": 1.4162711864406781e-05, - "loss": 1.4146, - "step": 9110 - }, - { - "epoch": 0.21, - "grad_norm": 1.8823587894439697, - "learning_rate": 1.4155932203389833e-05, - "loss": 1.3893, - "step": 9120 - }, - { - "epoch": 0.21, - "grad_norm": 1.748755693435669, - "learning_rate": 1.414915254237288e-05, - "loss": 1.2844, - "step": 9130 - }, - { - "epoch": 0.21, - "grad_norm": 7.603834629058838, - "learning_rate": 1.4142372881355932e-05, - "loss": 1.4416, - "step": 9140 - }, - { - "epoch": 0.21, - "grad_norm": 4.06264591217041, - "learning_rate": 1.4135593220338984e-05, - "loss": 1.2758, - "step": 9150 - }, - { - "epoch": 0.21, - "grad_norm": 1.5511293411254883, - "learning_rate": 1.4128813559322035e-05, - "loss": 1.3112, - "step": 9160 - }, - { - "epoch": 0.21, - "grad_norm": 2.303243637084961, - "learning_rate": 1.4122033898305085e-05, - "loss": 1.2676, - "step": 9170 - }, - { - "epoch": 0.21, - "grad_norm": 6.051310062408447, - "learning_rate": 1.4115254237288136e-05, - "loss": 1.2643, - "step": 9180 - }, - { - "epoch": 0.21, - "grad_norm": 7.211860179901123, - "learning_rate": 1.4108474576271188e-05, - "loss": 1.3993, - "step": 9190 - }, - { - "epoch": 0.21, - "grad_norm": 1.2717708349227905, - "learning_rate": 1.4101694915254239e-05, - "loss": 1.21, - "step": 9200 - }, - { - "epoch": 0.21, - "grad_norm": 4.1961989402771, - "learning_rate": 1.4094915254237289e-05, - "loss": 1.1805, - "step": 9210 - }, - { - "epoch": 0.21, - "grad_norm": 2.593219041824341, - "learning_rate": 1.408813559322034e-05, - "loss": 1.3931, - "step": 9220 - }, - { - "epoch": 0.21, - "grad_norm": 2.4971346855163574, - "learning_rate": 1.4081355932203391e-05, - "loss": 1.4354, - "step": 9230 - }, - { - "epoch": 0.21, - "grad_norm": 2.813567876815796, - "learning_rate": 1.4074576271186441e-05, - "loss": 1.1847, - "step": 9240 - }, - { - "epoch": 0.21, - "grad_norm": 1.8391880989074707, - "learning_rate": 1.4067796610169493e-05, - "loss": 1.4731, - "step": 9250 - }, - { - "epoch": 0.21, - "grad_norm": 6.610534191131592, - "learning_rate": 1.4061016949152544e-05, - "loss": 1.4042, - "step": 9260 - }, - { - "epoch": 0.21, - "grad_norm": 1.301313042640686, - "learning_rate": 1.4054237288135595e-05, - "loss": 1.4343, - "step": 9270 - }, - { - "epoch": 0.21, - "grad_norm": 3.5929203033447266, - "learning_rate": 1.4047457627118645e-05, - "loss": 1.3178, - "step": 9280 - }, - { - "epoch": 0.22, - "grad_norm": 4.077080249786377, - "learning_rate": 1.4040677966101696e-05, - "loss": 1.2344, - "step": 9290 - }, - { - "epoch": 0.22, - "grad_norm": 4.020028591156006, - "learning_rate": 1.4033898305084748e-05, - "loss": 1.3188, - "step": 9300 - }, - { - "epoch": 0.22, - "grad_norm": 1.7268164157867432, - "learning_rate": 1.40271186440678e-05, - "loss": 1.3322, - "step": 9310 - }, - { - "epoch": 0.22, - "grad_norm": 3.74446177482605, - "learning_rate": 1.4020338983050849e-05, - "loss": 1.3143, - "step": 9320 - }, - { - "epoch": 0.22, - "grad_norm": 8.375783920288086, - "learning_rate": 1.4013559322033899e-05, - "loss": 1.3217, - "step": 9330 - }, - { - "epoch": 0.22, - "grad_norm": 1.9759814739227295, - "learning_rate": 1.400677966101695e-05, - "loss": 1.464, - "step": 9340 - }, - { - "epoch": 0.22, - "grad_norm": 7.948102951049805, - "learning_rate": 1.4e-05, - "loss": 1.5633, - "step": 9350 - }, - { - "epoch": 0.22, - "grad_norm": 1.1538234949111938, - "learning_rate": 1.3993220338983051e-05, - "loss": 1.2872, - "step": 9360 - }, - { - "epoch": 0.22, - "grad_norm": 4.251887798309326, - "learning_rate": 1.3986440677966103e-05, - "loss": 1.3319, - "step": 9370 - }, - { - "epoch": 0.22, - "grad_norm": 5.3007283210754395, - "learning_rate": 1.3979661016949152e-05, - "loss": 1.4477, - "step": 9380 - }, - { - "epoch": 0.22, - "grad_norm": 4.742395877838135, - "learning_rate": 1.3972881355932204e-05, - "loss": 1.3017, - "step": 9390 - }, - { - "epoch": 0.22, - "grad_norm": 3.886890411376953, - "learning_rate": 1.3966101694915255e-05, - "loss": 1.4781, - "step": 9400 - }, - { - "epoch": 0.22, - "grad_norm": 4.440849781036377, - "learning_rate": 1.3959322033898306e-05, - "loss": 1.5151, - "step": 9410 - }, - { - "epoch": 0.22, - "grad_norm": 1.388344168663025, - "learning_rate": 1.3952542372881356e-05, - "loss": 1.1664, - "step": 9420 - }, - { - "epoch": 0.22, - "grad_norm": 3.4817326068878174, - "learning_rate": 1.3945762711864408e-05, - "loss": 1.3492, - "step": 9430 - }, - { - "epoch": 0.22, - "grad_norm": 3.128695011138916, - "learning_rate": 1.3938983050847459e-05, - "loss": 1.3101, - "step": 9440 - }, - { - "epoch": 0.22, - "grad_norm": 5.121028900146484, - "learning_rate": 1.393220338983051e-05, - "loss": 1.4188, - "step": 9450 - }, - { - "epoch": 0.22, - "grad_norm": 3.124645471572876, - "learning_rate": 1.392542372881356e-05, - "loss": 1.1935, - "step": 9460 - }, - { - "epoch": 0.22, - "grad_norm": 2.997901439666748, - "learning_rate": 1.3918644067796612e-05, - "loss": 1.2564, - "step": 9470 - }, - { - "epoch": 0.22, - "grad_norm": 4.793147087097168, - "learning_rate": 1.3911864406779663e-05, - "loss": 1.402, - "step": 9480 - }, - { - "epoch": 0.22, - "grad_norm": 2.9552319049835205, - "learning_rate": 1.3905084745762714e-05, - "loss": 1.5437, - "step": 9490 - }, - { - "epoch": 0.22, - "grad_norm": 5.294174671173096, - "learning_rate": 1.3898305084745764e-05, - "loss": 1.4665, - "step": 9500 - }, - { - "epoch": 0.22, - "eval_loss": 1.102941632270813, - "eval_runtime": 67.0366, - "eval_samples_per_second": 14.917, - "eval_steps_per_second": 14.917, - "step": 9500 - }, - { - "epoch": 0.22, - "grad_norm": 0.7909783124923706, - "learning_rate": 1.3891525423728815e-05, - "loss": 1.3756, - "step": 9510 - }, - { - "epoch": 0.22, - "grad_norm": 3.4454197883605957, - "learning_rate": 1.3884745762711867e-05, - "loss": 1.3005, - "step": 9520 - }, - { - "epoch": 0.22, - "grad_norm": 4.054439067840576, - "learning_rate": 1.3877966101694918e-05, - "loss": 1.4208, - "step": 9530 - }, - { - "epoch": 0.22, - "grad_norm": 8.244361877441406, - "learning_rate": 1.3871186440677966e-05, - "loss": 1.3597, - "step": 9540 - }, - { - "epoch": 0.22, - "grad_norm": 2.499704360961914, - "learning_rate": 1.3864406779661018e-05, - "loss": 1.3397, - "step": 9550 - }, - { - "epoch": 0.22, - "grad_norm": 4.15438985824585, - "learning_rate": 1.3857627118644067e-05, - "loss": 1.3366, - "step": 9560 - }, - { - "epoch": 0.22, - "grad_norm": 1.504563808441162, - "learning_rate": 1.3850847457627119e-05, - "loss": 1.4157, - "step": 9570 - }, - { - "epoch": 0.22, - "grad_norm": 5.045544147491455, - "learning_rate": 1.384406779661017e-05, - "loss": 1.4529, - "step": 9580 - }, - { - "epoch": 0.22, - "grad_norm": 2.8790090084075928, - "learning_rate": 1.3837288135593222e-05, - "loss": 1.2581, - "step": 9590 - }, - { - "epoch": 0.22, - "grad_norm": 4.442860126495361, - "learning_rate": 1.3830508474576271e-05, - "loss": 1.3185, - "step": 9600 - }, - { - "epoch": 0.22, - "grad_norm": 2.301625967025757, - "learning_rate": 1.3823728813559323e-05, - "loss": 1.519, - "step": 9610 - }, - { - "epoch": 0.22, - "grad_norm": 3.222405195236206, - "learning_rate": 1.3816949152542374e-05, - "loss": 1.4471, - "step": 9620 - }, - { - "epoch": 0.22, - "grad_norm": 6.543327808380127, - "learning_rate": 1.3810169491525425e-05, - "loss": 1.3623, - "step": 9630 - }, - { - "epoch": 0.22, - "grad_norm": 7.1420392990112305, - "learning_rate": 1.3803389830508475e-05, - "loss": 1.1868, - "step": 9640 - }, - { - "epoch": 0.22, - "grad_norm": 6.8577094078063965, - "learning_rate": 1.3796610169491527e-05, - "loss": 1.3238, - "step": 9650 - }, - { - "epoch": 0.22, - "grad_norm": 3.2370662689208984, - "learning_rate": 1.3789830508474578e-05, - "loss": 1.2522, - "step": 9660 - }, - { - "epoch": 0.22, - "grad_norm": 2.8609619140625, - "learning_rate": 1.378305084745763e-05, - "loss": 1.3918, - "step": 9670 - }, - { - "epoch": 0.22, - "grad_norm": 2.962815523147583, - "learning_rate": 1.3776271186440679e-05, - "loss": 1.3291, - "step": 9680 - }, - { - "epoch": 0.22, - "grad_norm": 1.649778962135315, - "learning_rate": 1.376949152542373e-05, - "loss": 1.4385, - "step": 9690 - }, - { - "epoch": 0.22, - "grad_norm": 2.7426583766937256, - "learning_rate": 1.3762711864406782e-05, - "loss": 1.2947, - "step": 9700 - }, - { - "epoch": 0.22, - "grad_norm": 2.934476613998413, - "learning_rate": 1.3755932203389832e-05, - "loss": 1.4168, - "step": 9710 - }, - { - "epoch": 0.23, - "grad_norm": 5.4620747566223145, - "learning_rate": 1.3749152542372883e-05, - "loss": 1.3825, - "step": 9720 - }, - { - "epoch": 0.23, - "grad_norm": 2.6670947074890137, - "learning_rate": 1.3742372881355934e-05, - "loss": 1.1969, - "step": 9730 - }, - { - "epoch": 0.23, - "grad_norm": 5.469496726989746, - "learning_rate": 1.3735593220338986e-05, - "loss": 1.0666, - "step": 9740 - }, - { - "epoch": 0.23, - "grad_norm": 1.448007583618164, - "learning_rate": 1.3728813559322034e-05, - "loss": 1.4138, - "step": 9750 - }, - { - "epoch": 0.23, - "grad_norm": 2.5976903438568115, - "learning_rate": 1.3722033898305085e-05, - "loss": 1.4822, - "step": 9760 - }, - { - "epoch": 0.23, - "grad_norm": 1.9508200883865356, - "learning_rate": 1.3715254237288137e-05, - "loss": 1.2946, - "step": 9770 - }, - { - "epoch": 0.23, - "grad_norm": 2.993997812271118, - "learning_rate": 1.3708474576271186e-05, - "loss": 1.4221, - "step": 9780 - }, - { - "epoch": 0.23, - "grad_norm": 2.5066683292388916, - "learning_rate": 1.3701694915254238e-05, - "loss": 1.4399, - "step": 9790 - }, - { - "epoch": 0.23, - "grad_norm": 2.8072290420532227, - "learning_rate": 1.3694915254237289e-05, - "loss": 1.2055, - "step": 9800 - }, - { - "epoch": 0.23, - "grad_norm": 5.655754566192627, - "learning_rate": 1.368813559322034e-05, - "loss": 1.3866, - "step": 9810 - }, - { - "epoch": 0.23, - "grad_norm": 15.120744705200195, - "learning_rate": 1.368135593220339e-05, - "loss": 1.1179, - "step": 9820 - }, - { - "epoch": 0.23, - "grad_norm": 2.2080466747283936, - "learning_rate": 1.3674576271186442e-05, - "loss": 1.412, - "step": 9830 - }, - { - "epoch": 0.23, - "grad_norm": 2.3344991207122803, - "learning_rate": 1.3667796610169493e-05, - "loss": 1.3185, - "step": 9840 - }, - { - "epoch": 0.23, - "grad_norm": 2.8925857543945312, - "learning_rate": 1.3661016949152543e-05, - "loss": 1.3284, - "step": 9850 - }, - { - "epoch": 0.23, - "grad_norm": 6.113729476928711, - "learning_rate": 1.3654237288135594e-05, - "loss": 1.3287, - "step": 9860 - }, - { - "epoch": 0.23, - "grad_norm": 8.377769470214844, - "learning_rate": 1.3647457627118646e-05, - "loss": 1.4654, - "step": 9870 - }, - { - "epoch": 0.23, - "grad_norm": 3.898707866668701, - "learning_rate": 1.3640677966101697e-05, - "loss": 1.3557, - "step": 9880 - }, - { - "epoch": 0.23, - "grad_norm": 3.4954209327697754, - "learning_rate": 1.3633898305084747e-05, - "loss": 1.2589, - "step": 9890 - }, - { - "epoch": 0.23, - "grad_norm": 2.97265362739563, - "learning_rate": 1.3627118644067798e-05, - "loss": 1.4468, - "step": 9900 - }, - { - "epoch": 0.23, - "grad_norm": 4.927505970001221, - "learning_rate": 1.362033898305085e-05, - "loss": 1.3386, - "step": 9910 - }, - { - "epoch": 0.23, - "grad_norm": 5.02236795425415, - "learning_rate": 1.36135593220339e-05, - "loss": 1.6745, - "step": 9920 - }, - { - "epoch": 0.23, - "grad_norm": 3.2489023208618164, - "learning_rate": 1.360677966101695e-05, - "loss": 1.3722, - "step": 9930 - }, - { - "epoch": 0.23, - "grad_norm": 4.043548583984375, - "learning_rate": 1.3600000000000002e-05, - "loss": 1.3996, - "step": 9940 - }, - { - "epoch": 0.23, - "grad_norm": 1.7545967102050781, - "learning_rate": 1.3593220338983053e-05, - "loss": 1.3781, - "step": 9950 - }, - { - "epoch": 0.23, - "grad_norm": 2.4097864627838135, - "learning_rate": 1.3586440677966101e-05, - "loss": 1.3649, - "step": 9960 - }, - { - "epoch": 0.23, - "grad_norm": 2.1898608207702637, - "learning_rate": 1.3579661016949153e-05, - "loss": 1.4705, - "step": 9970 - }, - { - "epoch": 0.23, - "grad_norm": 5.296695232391357, - "learning_rate": 1.3572881355932204e-05, - "loss": 1.2701, - "step": 9980 - }, - { - "epoch": 0.23, - "grad_norm": 1.6690765619277954, - "learning_rate": 1.3566101694915254e-05, - "loss": 1.2846, - "step": 9990 - }, - { - "epoch": 0.23, - "grad_norm": 1.7802351713180542, - "learning_rate": 1.3559322033898305e-05, - "loss": 1.3004, - "step": 10000 - }, - { - "epoch": 0.23, - "eval_loss": 1.0849580764770508, - "eval_runtime": 67.0014, - "eval_samples_per_second": 14.925, - "eval_steps_per_second": 14.925, - "step": 10000 - }, - { - "epoch": 0.23, - "grad_norm": 2.575228214263916, - "learning_rate": 1.3552542372881357e-05, - "loss": 1.2472, - "step": 10010 - }, - { - "epoch": 0.23, - "grad_norm": 3.6964104175567627, - "learning_rate": 1.3545762711864408e-05, - "loss": 1.376, - "step": 10020 - }, - { - "epoch": 0.23, - "grad_norm": 1.1188055276870728, - "learning_rate": 1.3538983050847458e-05, - "loss": 1.1798, - "step": 10030 - }, - { - "epoch": 0.23, - "grad_norm": 6.469828128814697, - "learning_rate": 1.353220338983051e-05, - "loss": 1.3331, - "step": 10040 - }, - { - "epoch": 0.23, - "grad_norm": 6.409282207489014, - "learning_rate": 1.352542372881356e-05, - "loss": 1.3283, - "step": 10050 - }, - { - "epoch": 0.23, - "grad_norm": 3.001148223876953, - "learning_rate": 1.3518644067796612e-05, - "loss": 1.4138, - "step": 10060 - }, - { - "epoch": 0.23, - "grad_norm": 2.1178245544433594, - "learning_rate": 1.3511864406779662e-05, - "loss": 1.0088, - "step": 10070 - }, - { - "epoch": 0.23, - "grad_norm": 2.2544403076171875, - "learning_rate": 1.3505084745762713e-05, - "loss": 1.2236, - "step": 10080 - }, - { - "epoch": 0.23, - "grad_norm": 4.784693241119385, - "learning_rate": 1.3498305084745764e-05, - "loss": 1.3831, - "step": 10090 - }, - { - "epoch": 0.23, - "grad_norm": 2.8957157135009766, - "learning_rate": 1.3491525423728816e-05, - "loss": 1.4073, - "step": 10100 - }, - { - "epoch": 0.23, - "grad_norm": 1.957875370979309, - "learning_rate": 1.3484745762711866e-05, - "loss": 1.4168, - "step": 10110 - }, - { - "epoch": 0.23, - "grad_norm": 3.314802885055542, - "learning_rate": 1.3477966101694917e-05, - "loss": 1.2289, - "step": 10120 - }, - { - "epoch": 0.23, - "grad_norm": 3.6212656497955322, - "learning_rate": 1.3471186440677968e-05, - "loss": 1.5181, - "step": 10130 - }, - { - "epoch": 0.23, - "grad_norm": 1.864043116569519, - "learning_rate": 1.346440677966102e-05, - "loss": 1.4954, - "step": 10140 - }, - { - "epoch": 0.23, - "grad_norm": 4.504016876220703, - "learning_rate": 1.345762711864407e-05, - "loss": 1.2792, - "step": 10150 - }, - { - "epoch": 0.24, - "grad_norm": 4.482726573944092, - "learning_rate": 1.3450847457627121e-05, - "loss": 1.0474, - "step": 10160 - }, - { - "epoch": 0.24, - "grad_norm": 2.6446313858032227, - "learning_rate": 1.3444067796610169e-05, - "loss": 1.4683, - "step": 10170 - }, - { - "epoch": 0.24, - "grad_norm": 1.929124116897583, - "learning_rate": 1.343728813559322e-05, - "loss": 1.4199, - "step": 10180 - }, - { - "epoch": 0.24, - "grad_norm": 2.283059597015381, - "learning_rate": 1.3430508474576272e-05, - "loss": 1.2784, - "step": 10190 - }, - { - "epoch": 0.24, - "grad_norm": 2.409111738204956, - "learning_rate": 1.3423728813559323e-05, - "loss": 1.393, - "step": 10200 - }, - { - "epoch": 0.24, - "grad_norm": 6.90030574798584, - "learning_rate": 1.3416949152542373e-05, - "loss": 1.2889, - "step": 10210 - }, - { - "epoch": 0.24, - "grad_norm": 2.19612455368042, - "learning_rate": 1.3410169491525424e-05, - "loss": 1.646, - "step": 10220 - }, - { - "epoch": 0.24, - "grad_norm": 7.8987650871276855, - "learning_rate": 1.3403389830508476e-05, - "loss": 1.2646, - "step": 10230 - }, - { - "epoch": 0.24, - "grad_norm": 2.225478172302246, - "learning_rate": 1.3396610169491527e-05, - "loss": 1.2869, - "step": 10240 - }, - { - "epoch": 0.24, - "grad_norm": 3.281136989593506, - "learning_rate": 1.3389830508474577e-05, - "loss": 1.3977, - "step": 10250 - }, - { - "epoch": 0.24, - "grad_norm": 2.4851903915405273, - "learning_rate": 1.3383050847457628e-05, - "loss": 1.0693, - "step": 10260 - }, - { - "epoch": 0.24, - "grad_norm": 3.3886020183563232, - "learning_rate": 1.337627118644068e-05, - "loss": 1.3299, - "step": 10270 - }, - { - "epoch": 0.24, - "grad_norm": 5.062967300415039, - "learning_rate": 1.3369491525423731e-05, - "loss": 1.4647, - "step": 10280 - }, - { - "epoch": 0.24, - "grad_norm": 5.308380126953125, - "learning_rate": 1.336271186440678e-05, - "loss": 1.3072, - "step": 10290 - }, - { - "epoch": 0.24, - "grad_norm": 2.1562750339508057, - "learning_rate": 1.3355932203389832e-05, - "loss": 1.4808, - "step": 10300 - }, - { - "epoch": 0.24, - "grad_norm": 0.7641376852989197, - "learning_rate": 1.3349152542372883e-05, - "loss": 1.2411, - "step": 10310 - }, - { - "epoch": 0.24, - "grad_norm": 4.490609169006348, - "learning_rate": 1.3342372881355933e-05, - "loss": 1.4156, - "step": 10320 - }, - { - "epoch": 0.24, - "grad_norm": 1.7860184907913208, - "learning_rate": 1.3335593220338985e-05, - "loss": 1.35, - "step": 10330 - }, - { - "epoch": 0.24, - "grad_norm": 5.641195297241211, - "learning_rate": 1.3328813559322036e-05, - "loss": 1.3174, - "step": 10340 - }, - { - "epoch": 0.24, - "grad_norm": 2.737027406692505, - "learning_rate": 1.3322033898305087e-05, - "loss": 1.2823, - "step": 10350 - }, - { - "epoch": 0.24, - "grad_norm": 5.195732116699219, - "learning_rate": 1.3315254237288137e-05, - "loss": 1.2259, - "step": 10360 - }, - { - "epoch": 0.24, - "grad_norm": 2.1460018157958984, - "learning_rate": 1.3308474576271187e-05, - "loss": 1.0539, - "step": 10370 - }, - { - "epoch": 0.24, - "grad_norm": 4.896897315979004, - "learning_rate": 1.3301694915254238e-05, - "loss": 1.2379, - "step": 10380 - }, - { - "epoch": 0.24, - "grad_norm": 4.282651901245117, - "learning_rate": 1.3294915254237288e-05, - "loss": 1.4351, - "step": 10390 - }, - { - "epoch": 0.24, - "grad_norm": 2.8991894721984863, - "learning_rate": 1.328813559322034e-05, - "loss": 1.3508, - "step": 10400 - }, - { - "epoch": 0.24, - "grad_norm": 4.306196212768555, - "learning_rate": 1.328135593220339e-05, - "loss": 1.3804, - "step": 10410 - }, - { - "epoch": 0.24, - "grad_norm": 2.230379104614258, - "learning_rate": 1.3274576271186442e-05, - "loss": 1.2072, - "step": 10420 - }, - { - "epoch": 0.24, - "grad_norm": 1.0265578031539917, - "learning_rate": 1.3267796610169492e-05, - "loss": 1.3691, - "step": 10430 - }, - { - "epoch": 0.24, - "grad_norm": 1.1750489473342896, - "learning_rate": 1.3261016949152543e-05, - "loss": 1.2741, - "step": 10440 - }, - { - "epoch": 0.24, - "grad_norm": 6.429545879364014, - "learning_rate": 1.3254237288135595e-05, - "loss": 1.4234, - "step": 10450 - }, - { - "epoch": 0.24, - "grad_norm": 8.749063491821289, - "learning_rate": 1.3247457627118644e-05, - "loss": 1.3652, - "step": 10460 - }, - { - "epoch": 0.24, - "grad_norm": 3.783092498779297, - "learning_rate": 1.3240677966101696e-05, - "loss": 1.2662, - "step": 10470 - }, - { - "epoch": 0.24, - "grad_norm": 7.148987293243408, - "learning_rate": 1.3233898305084747e-05, - "loss": 1.2553, - "step": 10480 - }, - { - "epoch": 0.24, - "grad_norm": 4.071252822875977, - "learning_rate": 1.3227118644067798e-05, - "loss": 1.1767, - "step": 10490 - }, - { - "epoch": 0.24, - "grad_norm": 4.911563396453857, - "learning_rate": 1.3220338983050848e-05, - "loss": 1.4, - "step": 10500 - }, - { - "epoch": 0.24, - "eval_loss": 1.056018590927124, - "eval_runtime": 67.0592, - "eval_samples_per_second": 14.912, - "eval_steps_per_second": 14.912, - "step": 10500 - }, - { - "epoch": 0.24, - "grad_norm": 2.0508599281311035, - "learning_rate": 1.32135593220339e-05, - "loss": 1.3227, - "step": 10510 - }, - { - "epoch": 0.24, - "grad_norm": 2.7030413150787354, - "learning_rate": 1.3206779661016951e-05, - "loss": 1.4168, - "step": 10520 - }, - { - "epoch": 0.24, - "grad_norm": 8.267769813537598, - "learning_rate": 1.3200000000000002e-05, - "loss": 1.4354, - "step": 10530 - }, - { - "epoch": 0.24, - "grad_norm": 3.2510886192321777, - "learning_rate": 1.3193220338983052e-05, - "loss": 1.3922, - "step": 10540 - }, - { - "epoch": 0.24, - "grad_norm": 10.665681838989258, - "learning_rate": 1.3186440677966103e-05, - "loss": 1.3282, - "step": 10550 - }, - { - "epoch": 0.24, - "grad_norm": 1.7952899932861328, - "learning_rate": 1.3179661016949155e-05, - "loss": 1.2216, - "step": 10560 - }, - { - "epoch": 0.24, - "grad_norm": 3.8615293502807617, - "learning_rate": 1.3172881355932206e-05, - "loss": 1.3105, - "step": 10570 - }, - { - "epoch": 0.24, - "grad_norm": 5.162840366363525, - "learning_rate": 1.3166101694915254e-05, - "loss": 1.3709, - "step": 10580 - }, - { - "epoch": 0.25, - "grad_norm": 2.7145211696624756, - "learning_rate": 1.3159322033898306e-05, - "loss": 1.3356, - "step": 10590 - }, - { - "epoch": 0.25, - "grad_norm": 2.6916394233703613, - "learning_rate": 1.3152542372881355e-05, - "loss": 1.4327, - "step": 10600 - }, - { - "epoch": 0.25, - "grad_norm": 1.5745512247085571, - "learning_rate": 1.3145762711864407e-05, - "loss": 1.4339, - "step": 10610 - }, - { - "epoch": 0.25, - "grad_norm": 1.7034305334091187, - "learning_rate": 1.3138983050847458e-05, - "loss": 1.3206, - "step": 10620 - }, - { - "epoch": 0.25, - "grad_norm": 3.4381961822509766, - "learning_rate": 1.313220338983051e-05, - "loss": 1.3502, - "step": 10630 - }, - { - "epoch": 0.25, - "grad_norm": 3.2712607383728027, - "learning_rate": 1.312542372881356e-05, - "loss": 1.3906, - "step": 10640 - }, - { - "epoch": 0.25, - "grad_norm": 4.805257320404053, - "learning_rate": 1.311864406779661e-05, - "loss": 1.407, - "step": 10650 - }, - { - "epoch": 0.25, - "grad_norm": 1.6435729265213013, - "learning_rate": 1.3111864406779662e-05, - "loss": 1.4471, - "step": 10660 - }, - { - "epoch": 0.25, - "grad_norm": 1.9431121349334717, - "learning_rate": 1.3105084745762714e-05, - "loss": 1.2493, - "step": 10670 - }, - { - "epoch": 0.25, - "grad_norm": 1.9565777778625488, - "learning_rate": 1.3098305084745763e-05, - "loss": 1.4396, - "step": 10680 - }, - { - "epoch": 0.25, - "grad_norm": 5.754490852355957, - "learning_rate": 1.3091525423728815e-05, - "loss": 1.3714, - "step": 10690 - }, - { - "epoch": 0.25, - "grad_norm": 3.628016233444214, - "learning_rate": 1.3084745762711866e-05, - "loss": 1.2535, - "step": 10700 - }, - { - "epoch": 0.25, - "grad_norm": 3.5335326194763184, - "learning_rate": 1.3077966101694917e-05, - "loss": 1.4952, - "step": 10710 - }, - { - "epoch": 0.25, - "grad_norm": 4.725300312042236, - "learning_rate": 1.3071186440677967e-05, - "loss": 1.2693, - "step": 10720 - }, - { - "epoch": 0.25, - "grad_norm": 1.5443134307861328, - "learning_rate": 1.3064406779661019e-05, - "loss": 1.2728, - "step": 10730 - }, - { - "epoch": 0.25, - "grad_norm": 1.495453119277954, - "learning_rate": 1.305762711864407e-05, - "loss": 1.4771, - "step": 10740 - }, - { - "epoch": 0.25, - "grad_norm": 3.3322789669036865, - "learning_rate": 1.305084745762712e-05, - "loss": 1.4256, - "step": 10750 - }, - { - "epoch": 0.25, - "grad_norm": 7.982933521270752, - "learning_rate": 1.3044067796610171e-05, - "loss": 1.2098, - "step": 10760 - }, - { - "epoch": 0.25, - "grad_norm": 3.701568603515625, - "learning_rate": 1.3037288135593222e-05, - "loss": 1.0726, - "step": 10770 - }, - { - "epoch": 0.25, - "grad_norm": 1.6933153867721558, - "learning_rate": 1.3030508474576274e-05, - "loss": 1.5174, - "step": 10780 - }, - { - "epoch": 0.25, - "grad_norm": 3.9348666667938232, - "learning_rate": 1.3023728813559322e-05, - "loss": 1.3662, - "step": 10790 - }, - { - "epoch": 0.25, - "grad_norm": 7.25183629989624, - "learning_rate": 1.3016949152542373e-05, - "loss": 1.2264, - "step": 10800 - }, - { - "epoch": 0.25, - "grad_norm": 4.727850914001465, - "learning_rate": 1.3010169491525425e-05, - "loss": 1.2664, - "step": 10810 - }, - { - "epoch": 0.25, - "grad_norm": 3.9714324474334717, - "learning_rate": 1.3003389830508474e-05, - "loss": 1.316, - "step": 10820 - }, - { - "epoch": 0.25, - "grad_norm": 1.8657814264297485, - "learning_rate": 1.2996610169491526e-05, - "loss": 1.5116, - "step": 10830 - }, - { - "epoch": 0.25, - "grad_norm": 4.508814811706543, - "learning_rate": 1.2989830508474577e-05, - "loss": 1.2307, - "step": 10840 - }, - { - "epoch": 0.25, - "grad_norm": 3.6838784217834473, - "learning_rate": 1.2983050847457629e-05, - "loss": 1.1877, - "step": 10850 - }, - { - "epoch": 0.25, - "grad_norm": 4.849525451660156, - "learning_rate": 1.2976271186440678e-05, - "loss": 1.2414, - "step": 10860 - }, - { - "epoch": 0.25, - "grad_norm": 3.3128011226654053, - "learning_rate": 1.296949152542373e-05, - "loss": 1.3896, - "step": 10870 - }, - { - "epoch": 0.25, - "grad_norm": 2.422858238220215, - "learning_rate": 1.2962711864406781e-05, - "loss": 1.2159, - "step": 10880 - }, - { - "epoch": 0.25, - "grad_norm": 4.698125839233398, - "learning_rate": 1.295593220338983e-05, - "loss": 1.276, - "step": 10890 - }, - { - "epoch": 0.25, - "grad_norm": 5.102774143218994, - "learning_rate": 1.2949152542372882e-05, - "loss": 1.2338, - "step": 10900 - }, - { - "epoch": 0.25, - "grad_norm": 2.2019145488739014, - "learning_rate": 1.2942372881355934e-05, - "loss": 1.2739, - "step": 10910 - }, - { - "epoch": 0.25, - "grad_norm": 9.765484809875488, - "learning_rate": 1.2935593220338985e-05, - "loss": 1.3403, - "step": 10920 - }, - { - "epoch": 0.25, - "grad_norm": 3.763749837875366, - "learning_rate": 1.2928813559322035e-05, - "loss": 1.3235, - "step": 10930 - }, - { - "epoch": 0.25, - "grad_norm": 2.33292818069458, - "learning_rate": 1.2922033898305086e-05, - "loss": 1.4525, - "step": 10940 - }, - { - "epoch": 0.25, - "grad_norm": 5.402053356170654, - "learning_rate": 1.2915254237288137e-05, - "loss": 1.2736, - "step": 10950 - }, - { - "epoch": 0.25, - "grad_norm": 8.462284088134766, - "learning_rate": 1.2908474576271189e-05, - "loss": 1.302, - "step": 10960 - }, - { - "epoch": 0.25, - "grad_norm": 2.865405321121216, - "learning_rate": 1.2901694915254239e-05, - "loss": 1.3355, - "step": 10970 - }, - { - "epoch": 0.25, - "grad_norm": 1.8566632270812988, - "learning_rate": 1.289491525423729e-05, - "loss": 1.0694, - "step": 10980 - }, - { - "epoch": 0.25, - "grad_norm": 4.951323986053467, - "learning_rate": 1.2888135593220341e-05, - "loss": 1.3189, - "step": 10990 - }, - { - "epoch": 0.25, - "grad_norm": 2.248414993286133, - "learning_rate": 1.288135593220339e-05, - "loss": 1.3578, - "step": 11000 - }, - { - "epoch": 0.25, - "eval_loss": 1.1179373264312744, - "eval_runtime": 67.0257, - "eval_samples_per_second": 14.92, - "eval_steps_per_second": 14.92, - "step": 11000 - }, - { - "epoch": 0.25, - "grad_norm": 1.7238454818725586, - "learning_rate": 1.287457627118644e-05, - "loss": 1.2867, - "step": 11010 - }, - { - "epoch": 0.26, - "grad_norm": 3.1348774433135986, - "learning_rate": 1.2867796610169492e-05, - "loss": 1.3616, - "step": 11020 - }, - { - "epoch": 0.26, - "grad_norm": 5.413891315460205, - "learning_rate": 1.2861016949152542e-05, - "loss": 1.358, - "step": 11030 - }, - { - "epoch": 0.26, - "grad_norm": 2.0301661491394043, - "learning_rate": 1.2854237288135593e-05, - "loss": 1.4766, - "step": 11040 - }, - { - "epoch": 0.26, - "grad_norm": 2.3625426292419434, - "learning_rate": 1.2847457627118645e-05, - "loss": 1.2889, - "step": 11050 - }, - { - "epoch": 0.26, - "grad_norm": 6.010834217071533, - "learning_rate": 1.2840677966101696e-05, - "loss": 1.3095, - "step": 11060 - }, - { - "epoch": 0.26, - "grad_norm": 1.5633188486099243, - "learning_rate": 1.2833898305084746e-05, - "loss": 1.3397, - "step": 11070 - }, - { - "epoch": 0.26, - "grad_norm": 2.8885929584503174, - "learning_rate": 1.2827118644067797e-05, - "loss": 1.3388, - "step": 11080 - }, - { - "epoch": 0.26, - "grad_norm": 4.556473255157471, - "learning_rate": 1.2820338983050849e-05, - "loss": 1.2801, - "step": 11090 - }, - { - "epoch": 0.26, - "grad_norm": 3.5699386596679688, - "learning_rate": 1.28135593220339e-05, - "loss": 1.2587, - "step": 11100 - }, - { - "epoch": 0.26, - "grad_norm": 2.842324733734131, - "learning_rate": 1.280677966101695e-05, - "loss": 1.2892, - "step": 11110 - }, - { - "epoch": 0.26, - "grad_norm": 1.6393495798110962, - "learning_rate": 1.2800000000000001e-05, - "loss": 1.2888, - "step": 11120 - }, - { - "epoch": 0.26, - "grad_norm": 4.156044960021973, - "learning_rate": 1.2793220338983053e-05, - "loss": 1.4945, - "step": 11130 - }, - { - "epoch": 0.26, - "grad_norm": 5.133569717407227, - "learning_rate": 1.2786440677966104e-05, - "loss": 1.2537, - "step": 11140 - }, - { - "epoch": 0.26, - "grad_norm": 6.207439422607422, - "learning_rate": 1.2779661016949154e-05, - "loss": 1.2773, - "step": 11150 - }, - { - "epoch": 0.26, - "grad_norm": 7.413145542144775, - "learning_rate": 1.2772881355932205e-05, - "loss": 1.3265, - "step": 11160 - }, - { - "epoch": 0.26, - "grad_norm": 3.2502923011779785, - "learning_rate": 1.2766101694915256e-05, - "loss": 1.1874, - "step": 11170 - }, - { - "epoch": 0.26, - "grad_norm": 2.847926139831543, - "learning_rate": 1.2759322033898308e-05, - "loss": 1.2997, - "step": 11180 - }, - { - "epoch": 0.26, - "grad_norm": 5.950617790222168, - "learning_rate": 1.2752542372881358e-05, - "loss": 1.2431, - "step": 11190 - }, - { - "epoch": 0.26, - "grad_norm": 3.7104105949401855, - "learning_rate": 1.2745762711864407e-05, - "loss": 1.2631, - "step": 11200 - }, - { - "epoch": 0.26, - "grad_norm": 3.444988965988159, - "learning_rate": 1.2738983050847457e-05, - "loss": 1.3001, - "step": 11210 - }, - { - "epoch": 0.26, - "grad_norm": 2.375910520553589, - "learning_rate": 1.2732203389830508e-05, - "loss": 1.4155, - "step": 11220 - }, - { - "epoch": 0.26, - "grad_norm": 6.275600433349609, - "learning_rate": 1.272542372881356e-05, - "loss": 1.2824, - "step": 11230 - }, - { - "epoch": 0.26, - "grad_norm": 5.417289733886719, - "learning_rate": 1.2718644067796611e-05, - "loss": 1.4936, - "step": 11240 - }, - { - "epoch": 0.26, - "grad_norm": 2.149083375930786, - "learning_rate": 1.2711864406779661e-05, - "loss": 1.4582, - "step": 11250 - }, - { - "epoch": 0.26, - "grad_norm": 1.4166737794876099, - "learning_rate": 1.2705084745762712e-05, - "loss": 1.274, - "step": 11260 - }, - { - "epoch": 0.26, - "grad_norm": 4.054012775421143, - "learning_rate": 1.2698305084745764e-05, - "loss": 1.4788, - "step": 11270 - }, - { - "epoch": 0.26, - "grad_norm": 3.4033005237579346, - "learning_rate": 1.2691525423728815e-05, - "loss": 1.2238, - "step": 11280 - }, - { - "epoch": 0.26, - "grad_norm": 2.9126157760620117, - "learning_rate": 1.2684745762711865e-05, - "loss": 1.283, - "step": 11290 - }, - { - "epoch": 0.26, - "grad_norm": 4.059444427490234, - "learning_rate": 1.2677966101694916e-05, - "loss": 1.3338, - "step": 11300 - }, - { - "epoch": 0.26, - "grad_norm": 9.896828651428223, - "learning_rate": 1.2671186440677968e-05, - "loss": 1.5215, - "step": 11310 - }, - { - "epoch": 0.26, - "grad_norm": 3.37363862991333, - "learning_rate": 1.2664406779661019e-05, - "loss": 1.4575, - "step": 11320 - }, - { - "epoch": 0.26, - "grad_norm": 4.463009834289551, - "learning_rate": 1.2657627118644069e-05, - "loss": 1.3001, - "step": 11330 - }, - { - "epoch": 0.26, - "grad_norm": 5.098323345184326, - "learning_rate": 1.265084745762712e-05, - "loss": 1.3776, - "step": 11340 - }, - { - "epoch": 0.26, - "grad_norm": 8.601290702819824, - "learning_rate": 1.2644067796610171e-05, - "loss": 1.3683, - "step": 11350 - }, - { - "epoch": 0.26, - "grad_norm": 7.4985880851745605, - "learning_rate": 1.2637288135593221e-05, - "loss": 1.3786, - "step": 11360 - }, - { - "epoch": 0.26, - "grad_norm": 2.4713215827941895, - "learning_rate": 1.2630508474576273e-05, - "loss": 1.4187, - "step": 11370 - }, - { - "epoch": 0.26, - "grad_norm": 2.570042610168457, - "learning_rate": 1.2623728813559324e-05, - "loss": 1.3496, - "step": 11380 - }, - { - "epoch": 0.26, - "grad_norm": 2.476733446121216, - "learning_rate": 1.2616949152542375e-05, - "loss": 1.3347, - "step": 11390 - }, - { - "epoch": 0.26, - "grad_norm": 4.002094745635986, - "learning_rate": 1.2610169491525425e-05, - "loss": 1.4025, - "step": 11400 - }, - { - "epoch": 0.26, - "grad_norm": 7.079920768737793, - "learning_rate": 1.2603389830508475e-05, - "loss": 1.4474, - "step": 11410 - }, - { - "epoch": 0.26, - "grad_norm": 3.0040886402130127, - "learning_rate": 1.2596610169491526e-05, - "loss": 1.3848, - "step": 11420 - }, - { - "epoch": 0.26, - "grad_norm": 5.5370612144470215, - "learning_rate": 1.2589830508474576e-05, - "loss": 1.3444, - "step": 11430 - }, - { - "epoch": 0.26, - "grad_norm": 1.4202508926391602, - "learning_rate": 1.2583050847457627e-05, - "loss": 1.2744, - "step": 11440 - }, - { - "epoch": 0.27, - "grad_norm": 5.522201061248779, - "learning_rate": 1.2576271186440679e-05, - "loss": 1.1479, - "step": 11450 - }, - { - "epoch": 0.27, - "grad_norm": 4.666895866394043, - "learning_rate": 1.256949152542373e-05, - "loss": 1.3711, - "step": 11460 - }, - { - "epoch": 0.27, - "grad_norm": 4.038332462310791, - "learning_rate": 1.256271186440678e-05, - "loss": 1.0849, - "step": 11470 - }, - { - "epoch": 0.27, - "grad_norm": 1.2697900533676147, - "learning_rate": 1.2555932203389831e-05, - "loss": 1.2352, - "step": 11480 - }, - { - "epoch": 0.27, - "grad_norm": 1.8700276613235474, - "learning_rate": 1.2549152542372883e-05, - "loss": 1.2237, - "step": 11490 - }, - { - "epoch": 0.27, - "grad_norm": 4.803176403045654, - "learning_rate": 1.2542372881355932e-05, - "loss": 1.3036, - "step": 11500 - }, - { - "epoch": 0.27, - "eval_loss": 1.0648910999298096, - "eval_runtime": 66.9985, - "eval_samples_per_second": 14.926, - "eval_steps_per_second": 14.926, - "step": 11500 - }, - { - "epoch": 0.27, - "grad_norm": 2.106827974319458, - "learning_rate": 1.2535593220338984e-05, - "loss": 1.3212, - "step": 11510 - }, - { - "epoch": 0.27, - "grad_norm": 2.4768242835998535, - "learning_rate": 1.2528813559322035e-05, - "loss": 1.4306, - "step": 11520 - }, - { - "epoch": 0.27, - "grad_norm": 17.52560043334961, - "learning_rate": 1.2522033898305087e-05, - "loss": 1.5017, - "step": 11530 - }, - { - "epoch": 0.27, - "grad_norm": 5.7506866455078125, - "learning_rate": 1.2515254237288136e-05, - "loss": 1.2379, - "step": 11540 - }, - { - "epoch": 0.27, - "grad_norm": 2.74631929397583, - "learning_rate": 1.2508474576271188e-05, - "loss": 1.3201, - "step": 11550 - }, - { - "epoch": 0.27, - "grad_norm": 1.5436533689498901, - "learning_rate": 1.2501694915254239e-05, - "loss": 1.3991, - "step": 11560 - }, - { - "epoch": 0.27, - "grad_norm": 3.3251969814300537, - "learning_rate": 1.249491525423729e-05, - "loss": 1.5298, - "step": 11570 - }, - { - "epoch": 0.27, - "grad_norm": 2.3390953540802, - "learning_rate": 1.248813559322034e-05, - "loss": 1.272, - "step": 11580 - }, - { - "epoch": 0.27, - "grad_norm": 3.70845627784729, - "learning_rate": 1.2481355932203392e-05, - "loss": 1.235, - "step": 11590 - }, - { - "epoch": 0.27, - "grad_norm": 6.170680999755859, - "learning_rate": 1.2474576271186443e-05, - "loss": 1.3834, - "step": 11600 - }, - { - "epoch": 0.27, - "grad_norm": 1.4609490633010864, - "learning_rate": 1.2467796610169494e-05, - "loss": 1.2213, - "step": 11610 - }, - { - "epoch": 0.27, - "grad_norm": 2.1652746200561523, - "learning_rate": 1.2461016949152542e-05, - "loss": 1.4787, - "step": 11620 - }, - { - "epoch": 0.27, - "grad_norm": 7.743651866912842, - "learning_rate": 1.2454237288135594e-05, - "loss": 1.476, - "step": 11630 - }, - { - "epoch": 0.27, - "grad_norm": 2.615380048751831, - "learning_rate": 1.2447457627118643e-05, - "loss": 1.2527, - "step": 11640 - }, - { - "epoch": 0.27, - "grad_norm": 2.593022346496582, - "learning_rate": 1.2440677966101695e-05, - "loss": 1.3468, - "step": 11650 - }, - { - "epoch": 0.27, - "grad_norm": 1.5045115947723389, - "learning_rate": 1.2433898305084746e-05, - "loss": 1.4336, - "step": 11660 - }, - { - "epoch": 0.27, - "grad_norm": 6.107186317443848, - "learning_rate": 1.2427118644067798e-05, - "loss": 1.4119, - "step": 11670 - }, - { - "epoch": 0.27, - "grad_norm": 3.435987710952759, - "learning_rate": 1.2420338983050847e-05, - "loss": 1.1495, - "step": 11680 - }, - { - "epoch": 0.27, - "grad_norm": 5.024228096008301, - "learning_rate": 1.2413559322033899e-05, - "loss": 1.3461, - "step": 11690 - }, - { - "epoch": 0.27, - "grad_norm": 3.844212293624878, - "learning_rate": 1.240677966101695e-05, - "loss": 1.2638, - "step": 11700 - }, - { - "epoch": 0.27, - "grad_norm": 3.6562612056732178, - "learning_rate": 1.2400000000000002e-05, - "loss": 1.1978, - "step": 11710 - }, - { - "epoch": 0.27, - "grad_norm": 5.006361484527588, - "learning_rate": 1.2393220338983051e-05, - "loss": 1.3342, - "step": 11720 - }, - { - "epoch": 0.27, - "grad_norm": 2.448744297027588, - "learning_rate": 1.2386440677966103e-05, - "loss": 1.3128, - "step": 11730 - }, - { - "epoch": 0.27, - "grad_norm": 3.1291112899780273, - "learning_rate": 1.2379661016949154e-05, - "loss": 1.1585, - "step": 11740 - }, - { - "epoch": 0.27, - "grad_norm": 1.8178492784500122, - "learning_rate": 1.2372881355932205e-05, - "loss": 1.4148, - "step": 11750 - }, - { - "epoch": 0.27, - "grad_norm": 9.099708557128906, - "learning_rate": 1.2366101694915255e-05, - "loss": 1.275, - "step": 11760 - }, - { - "epoch": 0.27, - "grad_norm": 4.819830894470215, - "learning_rate": 1.2359322033898307e-05, - "loss": 1.2382, - "step": 11770 - }, - { - "epoch": 0.27, - "grad_norm": 3.805961847305298, - "learning_rate": 1.2352542372881358e-05, - "loss": 1.2934, - "step": 11780 - }, - { - "epoch": 0.27, - "grad_norm": 5.781748294830322, - "learning_rate": 1.234576271186441e-05, - "loss": 1.2673, - "step": 11790 - }, - { - "epoch": 0.27, - "grad_norm": 5.577550411224365, - "learning_rate": 1.2338983050847459e-05, - "loss": 1.4464, - "step": 11800 - }, - { - "epoch": 0.27, - "grad_norm": 2.494887351989746, - "learning_rate": 1.233220338983051e-05, - "loss": 1.5913, - "step": 11810 - }, - { - "epoch": 0.27, - "grad_norm": 5.071074962615967, - "learning_rate": 1.2325423728813562e-05, - "loss": 1.1268, - "step": 11820 - }, - { - "epoch": 0.27, - "grad_norm": 2.8962059020996094, - "learning_rate": 1.231864406779661e-05, - "loss": 1.2039, - "step": 11830 - }, - { - "epoch": 0.27, - "grad_norm": 5.128442287445068, - "learning_rate": 1.2311864406779661e-05, - "loss": 1.3777, - "step": 11840 - }, - { - "epoch": 0.27, - "grad_norm": 3.5109148025512695, - "learning_rate": 1.2305084745762713e-05, - "loss": 1.3644, - "step": 11850 - }, - { - "epoch": 0.27, - "grad_norm": 3.0351719856262207, - "learning_rate": 1.2298305084745762e-05, - "loss": 1.3146, - "step": 11860 - }, - { - "epoch": 0.27, - "grad_norm": 4.8273749351501465, - "learning_rate": 1.2291525423728814e-05, - "loss": 1.278, - "step": 11870 - }, - { - "epoch": 0.28, - "grad_norm": 2.8249599933624268, - "learning_rate": 1.2284745762711865e-05, - "loss": 1.2473, - "step": 11880 - }, - { - "epoch": 0.28, - "grad_norm": 3.006662607192993, - "learning_rate": 1.2277966101694917e-05, - "loss": 1.2185, - "step": 11890 - }, - { - "epoch": 0.28, - "grad_norm": 5.592995643615723, - "learning_rate": 1.2271186440677966e-05, - "loss": 1.3326, - "step": 11900 - }, - { - "epoch": 0.28, - "grad_norm": 3.6979246139526367, - "learning_rate": 1.2264406779661018e-05, - "loss": 1.2077, - "step": 11910 - }, - { - "epoch": 0.28, - "grad_norm": 7.2336812019348145, - "learning_rate": 1.2257627118644069e-05, - "loss": 1.3997, - "step": 11920 - }, - { - "epoch": 0.28, - "grad_norm": 11.921634674072266, - "learning_rate": 1.225084745762712e-05, - "loss": 1.3697, - "step": 11930 - }, - { - "epoch": 0.28, - "grad_norm": 2.800546646118164, - "learning_rate": 1.224406779661017e-05, - "loss": 1.3643, - "step": 11940 - }, - { - "epoch": 0.28, - "grad_norm": 8.160839080810547, - "learning_rate": 1.2237288135593222e-05, - "loss": 1.3598, - "step": 11950 - }, - { - "epoch": 0.28, - "grad_norm": 3.9122064113616943, - "learning_rate": 1.2230508474576273e-05, - "loss": 1.4034, - "step": 11960 - }, - { - "epoch": 0.28, - "grad_norm": 3.0154528617858887, - "learning_rate": 1.2223728813559323e-05, - "loss": 1.4547, - "step": 11970 - }, - { - "epoch": 0.28, - "grad_norm": 3.0988502502441406, - "learning_rate": 1.2216949152542374e-05, - "loss": 1.2566, - "step": 11980 - }, - { - "epoch": 0.28, - "grad_norm": 4.406126976013184, - "learning_rate": 1.2210169491525426e-05, - "loss": 1.2769, - "step": 11990 - }, - { - "epoch": 0.28, - "grad_norm": 2.9631588459014893, - "learning_rate": 1.2203389830508477e-05, - "loss": 1.2008, - "step": 12000 - }, - { - "epoch": 0.28, - "eval_loss": 1.0481234788894653, - "eval_runtime": 67.0601, - "eval_samples_per_second": 14.912, - "eval_steps_per_second": 14.912, - "step": 12000 - }, - { - "epoch": 0.28, - "grad_norm": 1.6879490613937378, - "learning_rate": 1.2196610169491527e-05, - "loss": 1.318, - "step": 12010 - }, - { - "epoch": 0.28, - "grad_norm": 1.431481957435608, - "learning_rate": 1.2189830508474578e-05, - "loss": 1.1515, - "step": 12020 - }, - { - "epoch": 0.28, - "grad_norm": 3.4427268505096436, - "learning_rate": 1.2183050847457628e-05, - "loss": 1.3034, - "step": 12030 - }, - { - "epoch": 0.28, - "grad_norm": 2.564251184463501, - "learning_rate": 1.2176271186440677e-05, - "loss": 1.4187, - "step": 12040 - }, - { - "epoch": 0.28, - "grad_norm": 4.561717510223389, - "learning_rate": 1.2169491525423729e-05, - "loss": 1.2476, - "step": 12050 - }, - { - "epoch": 0.28, - "grad_norm": 2.120867967605591, - "learning_rate": 1.216271186440678e-05, - "loss": 1.3806, - "step": 12060 - }, - { - "epoch": 0.28, - "grad_norm": 2.209594964981079, - "learning_rate": 1.2155932203389832e-05, - "loss": 1.384, - "step": 12070 - }, - { - "epoch": 0.28, - "grad_norm": 3.614448070526123, - "learning_rate": 1.2149152542372881e-05, - "loss": 1.3525, - "step": 12080 - }, - { - "epoch": 0.28, - "grad_norm": 5.756226062774658, - "learning_rate": 1.2142372881355933e-05, - "loss": 1.2727, - "step": 12090 - }, - { - "epoch": 0.28, - "grad_norm": 3.3497910499572754, - "learning_rate": 1.2135593220338984e-05, - "loss": 1.2477, - "step": 12100 - }, - { - "epoch": 0.28, - "grad_norm": 4.628206729888916, - "learning_rate": 1.2128813559322034e-05, - "loss": 1.2596, - "step": 12110 - }, - { - "epoch": 0.28, - "grad_norm": 3.101430654525757, - "learning_rate": 1.2122033898305085e-05, - "loss": 1.4617, - "step": 12120 - }, - { - "epoch": 0.28, - "grad_norm": 6.928171157836914, - "learning_rate": 1.2115254237288137e-05, - "loss": 1.1119, - "step": 12130 - }, - { - "epoch": 0.28, - "grad_norm": 8.110835075378418, - "learning_rate": 1.2108474576271188e-05, - "loss": 1.365, - "step": 12140 - }, - { - "epoch": 0.28, - "grad_norm": 2.325059652328491, - "learning_rate": 1.2101694915254238e-05, - "loss": 1.1705, - "step": 12150 - }, - { - "epoch": 0.28, - "grad_norm": 6.017369747161865, - "learning_rate": 1.209491525423729e-05, - "loss": 1.1434, - "step": 12160 - }, - { - "epoch": 0.28, - "grad_norm": 3.4774413108825684, - "learning_rate": 1.208813559322034e-05, - "loss": 1.4273, - "step": 12170 - }, - { - "epoch": 0.28, - "grad_norm": 3.168473482131958, - "learning_rate": 1.2081355932203392e-05, - "loss": 1.2839, - "step": 12180 - }, - { - "epoch": 0.28, - "grad_norm": 7.533470153808594, - "learning_rate": 1.2074576271186442e-05, - "loss": 1.3452, - "step": 12190 - }, - { - "epoch": 0.28, - "grad_norm": 5.115807056427002, - "learning_rate": 1.2067796610169493e-05, - "loss": 1.3136, - "step": 12200 - }, - { - "epoch": 0.28, - "grad_norm": 6.920198917388916, - "learning_rate": 1.2061016949152544e-05, - "loss": 1.3263, - "step": 12210 - }, - { - "epoch": 0.28, - "grad_norm": 4.812027454376221, - "learning_rate": 1.2054237288135596e-05, - "loss": 1.1978, - "step": 12220 - }, - { - "epoch": 0.28, - "grad_norm": 4.1168107986450195, - "learning_rate": 1.2047457627118646e-05, - "loss": 1.3051, - "step": 12230 - }, - { - "epoch": 0.28, - "grad_norm": 2.500709056854248, - "learning_rate": 1.2040677966101695e-05, - "loss": 1.1895, - "step": 12240 - }, - { - "epoch": 0.28, - "grad_norm": 3.5548219680786133, - "learning_rate": 1.2033898305084745e-05, - "loss": 1.4415, - "step": 12250 - }, - { - "epoch": 0.28, - "grad_norm": 4.0832295417785645, - "learning_rate": 1.2027118644067796e-05, - "loss": 1.2931, - "step": 12260 - }, - { - "epoch": 0.28, - "grad_norm": 3.2954070568084717, - "learning_rate": 1.2020338983050848e-05, - "loss": 1.052, - "step": 12270 - }, - { - "epoch": 0.28, - "grad_norm": 2.6456809043884277, - "learning_rate": 1.20135593220339e-05, - "loss": 1.3214, - "step": 12280 - }, - { - "epoch": 0.28, - "grad_norm": 3.9517974853515625, - "learning_rate": 1.2006779661016949e-05, - "loss": 1.3797, - "step": 12290 - }, - { - "epoch": 0.28, - "grad_norm": 6.248586654663086, - "learning_rate": 1.2e-05, - "loss": 1.3371, - "step": 12300 - }, - { - "epoch": 0.29, - "grad_norm": 3.673175811767578, - "learning_rate": 1.1993220338983052e-05, - "loss": 1.3891, - "step": 12310 - }, - { - "epoch": 0.29, - "grad_norm": 2.9075310230255127, - "learning_rate": 1.1986440677966103e-05, - "loss": 1.3943, - "step": 12320 - }, - { - "epoch": 0.29, - "grad_norm": 6.2103705406188965, - "learning_rate": 1.1979661016949153e-05, - "loss": 1.3813, - "step": 12330 - }, - { - "epoch": 0.29, - "grad_norm": 3.8567051887512207, - "learning_rate": 1.1972881355932204e-05, - "loss": 1.4039, - "step": 12340 - }, - { - "epoch": 0.29, - "grad_norm": 4.756994724273682, - "learning_rate": 1.1966101694915256e-05, - "loss": 1.2371, - "step": 12350 - }, - { - "epoch": 0.29, - "grad_norm": 4.874370574951172, - "learning_rate": 1.1959322033898307e-05, - "loss": 1.3543, - "step": 12360 - }, - { - "epoch": 0.29, - "grad_norm": 2.5366311073303223, - "learning_rate": 1.1952542372881357e-05, - "loss": 1.2105, - "step": 12370 - }, - { - "epoch": 0.29, - "grad_norm": 1.0801111459732056, - "learning_rate": 1.1945762711864408e-05, - "loss": 1.46, - "step": 12380 - }, - { - "epoch": 0.29, - "grad_norm": 4.35442590713501, - "learning_rate": 1.193898305084746e-05, - "loss": 1.3463, - "step": 12390 - }, - { - "epoch": 0.29, - "grad_norm": 1.817285180091858, - "learning_rate": 1.1932203389830511e-05, - "loss": 1.1985, - "step": 12400 - }, - { - "epoch": 0.29, - "grad_norm": 1.607805848121643, - "learning_rate": 1.192542372881356e-05, - "loss": 1.2979, - "step": 12410 - }, - { - "epoch": 0.29, - "grad_norm": 0.8578438758850098, - "learning_rate": 1.1918644067796612e-05, - "loss": 1.1953, - "step": 12420 - }, - { - "epoch": 0.29, - "grad_norm": 4.037782669067383, - "learning_rate": 1.1911864406779663e-05, - "loss": 1.4013, - "step": 12430 - }, - { - "epoch": 0.29, - "grad_norm": 2.66656231880188, - "learning_rate": 1.1905084745762713e-05, - "loss": 1.1451, - "step": 12440 - }, - { - "epoch": 0.29, - "grad_norm": 6.166327953338623, - "learning_rate": 1.1898305084745763e-05, - "loss": 1.2794, - "step": 12450 - }, - { - "epoch": 0.29, - "grad_norm": 6.4938483238220215, - "learning_rate": 1.1891525423728814e-05, - "loss": 1.2848, - "step": 12460 - }, - { - "epoch": 0.29, - "grad_norm": 4.013676166534424, - "learning_rate": 1.1884745762711864e-05, - "loss": 1.3793, - "step": 12470 - }, - { - "epoch": 0.29, - "grad_norm": 1.673925757408142, - "learning_rate": 1.1877966101694915e-05, - "loss": 1.1985, - "step": 12480 - }, - { - "epoch": 0.29, - "grad_norm": 3.0690205097198486, - "learning_rate": 1.1871186440677967e-05, - "loss": 1.3797, - "step": 12490 - }, - { - "epoch": 0.29, - "grad_norm": 2.671661853790283, - "learning_rate": 1.1864406779661018e-05, - "loss": 1.3697, - "step": 12500 - }, - { - "epoch": 0.29, - "eval_loss": 1.0424882173538208, - "eval_runtime": 67.1094, - "eval_samples_per_second": 14.901, - "eval_steps_per_second": 14.901, - "step": 12500 - }, - { - "epoch": 0.29, - "grad_norm": 4.892620086669922, - "learning_rate": 1.1857627118644068e-05, - "loss": 1.3287, - "step": 12510 - }, - { - "epoch": 0.29, - "grad_norm": 7.4180145263671875, - "learning_rate": 1.185084745762712e-05, - "loss": 1.4554, - "step": 12520 - }, - { - "epoch": 0.29, - "grad_norm": 6.7615966796875, - "learning_rate": 1.184406779661017e-05, - "loss": 1.3294, - "step": 12530 - }, - { - "epoch": 0.29, - "grad_norm": 5.470348358154297, - "learning_rate": 1.183728813559322e-05, - "loss": 1.4311, - "step": 12540 - }, - { - "epoch": 0.29, - "grad_norm": 3.9983062744140625, - "learning_rate": 1.1830508474576272e-05, - "loss": 1.313, - "step": 12550 - }, - { - "epoch": 0.29, - "grad_norm": 7.445504188537598, - "learning_rate": 1.1823728813559323e-05, - "loss": 1.2363, - "step": 12560 - }, - { - "epoch": 0.29, - "grad_norm": 4.365860462188721, - "learning_rate": 1.1816949152542375e-05, - "loss": 1.3424, - "step": 12570 - }, - { - "epoch": 0.29, - "grad_norm": 2.8447556495666504, - "learning_rate": 1.1810169491525424e-05, - "loss": 1.3638, - "step": 12580 - }, - { - "epoch": 0.29, - "grad_norm": 4.913205623626709, - "learning_rate": 1.1803389830508476e-05, - "loss": 1.3654, - "step": 12590 - }, - { - "epoch": 0.29, - "grad_norm": 6.282583713531494, - "learning_rate": 1.1796610169491527e-05, - "loss": 1.386, - "step": 12600 - }, - { - "epoch": 0.29, - "grad_norm": 4.249077320098877, - "learning_rate": 1.1789830508474578e-05, - "loss": 1.2501, - "step": 12610 - }, - { - "epoch": 0.29, - "grad_norm": 5.649593353271484, - "learning_rate": 1.1783050847457628e-05, - "loss": 1.3802, - "step": 12620 - }, - { - "epoch": 0.29, - "grad_norm": 4.310429096221924, - "learning_rate": 1.177627118644068e-05, - "loss": 1.3718, - "step": 12630 - }, - { - "epoch": 0.29, - "grad_norm": 8.635117530822754, - "learning_rate": 1.1769491525423731e-05, - "loss": 1.1369, - "step": 12640 - }, - { - "epoch": 0.29, - "grad_norm": 2.3350889682769775, - "learning_rate": 1.1762711864406782e-05, - "loss": 1.5591, - "step": 12650 - }, - { - "epoch": 0.29, - "grad_norm": 3.3547542095184326, - "learning_rate": 1.175593220338983e-05, - "loss": 1.0917, - "step": 12660 - }, - { - "epoch": 0.29, - "grad_norm": 6.9904937744140625, - "learning_rate": 1.1749152542372882e-05, - "loss": 1.3379, - "step": 12670 - }, - { - "epoch": 0.29, - "grad_norm": 3.06915020942688, - "learning_rate": 1.1742372881355931e-05, - "loss": 1.3451, - "step": 12680 - }, - { - "epoch": 0.29, - "grad_norm": 6.47864294052124, - "learning_rate": 1.1735593220338983e-05, - "loss": 1.2393, - "step": 12690 - }, - { - "epoch": 0.29, - "grad_norm": 4.751437664031982, - "learning_rate": 1.1728813559322034e-05, - "loss": 1.2097, - "step": 12700 - }, - { - "epoch": 0.29, - "grad_norm": 5.026716232299805, - "learning_rate": 1.1722033898305086e-05, - "loss": 1.3033, - "step": 12710 - }, - { - "epoch": 0.29, - "grad_norm": 2.250887393951416, - "learning_rate": 1.1715254237288135e-05, - "loss": 1.3686, - "step": 12720 - }, - { - "epoch": 0.29, - "grad_norm": 2.050124406814575, - "learning_rate": 1.1708474576271187e-05, - "loss": 1.3823, - "step": 12730 - }, - { - "epoch": 0.29, - "grad_norm": 3.651585817337036, - "learning_rate": 1.1701694915254238e-05, - "loss": 1.3815, - "step": 12740 - }, - { - "epoch": 0.3, - "grad_norm": 1.1647934913635254, - "learning_rate": 1.169491525423729e-05, - "loss": 1.346, - "step": 12750 - }, - { - "epoch": 0.3, - "grad_norm": 2.0802316665649414, - "learning_rate": 1.168813559322034e-05, - "loss": 1.2188, - "step": 12760 - }, - { - "epoch": 0.3, - "grad_norm": 6.935558319091797, - "learning_rate": 1.168135593220339e-05, - "loss": 1.2733, - "step": 12770 - }, - { - "epoch": 0.3, - "grad_norm": 3.3272249698638916, - "learning_rate": 1.1674576271186442e-05, - "loss": 1.4055, - "step": 12780 - }, - { - "epoch": 0.3, - "grad_norm": 7.0462517738342285, - "learning_rate": 1.1667796610169494e-05, - "loss": 1.3269, - "step": 12790 - }, - { - "epoch": 0.3, - "grad_norm": 2.325636148452759, - "learning_rate": 1.1661016949152543e-05, - "loss": 1.4648, - "step": 12800 - }, - { - "epoch": 0.3, - "grad_norm": 1.3131448030471802, - "learning_rate": 1.1654237288135595e-05, - "loss": 1.3613, - "step": 12810 - }, - { - "epoch": 0.3, - "grad_norm": 5.341305732727051, - "learning_rate": 1.1647457627118646e-05, - "loss": 1.2867, - "step": 12820 - }, - { - "epoch": 0.3, - "grad_norm": 5.281243324279785, - "learning_rate": 1.1640677966101697e-05, - "loss": 1.2868, - "step": 12830 - }, - { - "epoch": 0.3, - "grad_norm": 3.2200121879577637, - "learning_rate": 1.1633898305084747e-05, - "loss": 1.5082, - "step": 12840 - }, - { - "epoch": 0.3, - "grad_norm": 7.734021186828613, - "learning_rate": 1.1627118644067799e-05, - "loss": 1.1832, - "step": 12850 - }, - { - "epoch": 0.3, - "grad_norm": 2.2634496688842773, - "learning_rate": 1.162033898305085e-05, - "loss": 1.3862, - "step": 12860 - }, - { - "epoch": 0.3, - "grad_norm": 3.978215456008911, - "learning_rate": 1.1613559322033898e-05, - "loss": 1.19, - "step": 12870 - }, - { - "epoch": 0.3, - "grad_norm": 1.9776360988616943, - "learning_rate": 1.160677966101695e-05, - "loss": 1.3434, - "step": 12880 - }, - { - "epoch": 0.3, - "grad_norm": 9.741606712341309, - "learning_rate": 1.16e-05, - "loss": 1.5127, - "step": 12890 - }, - { - "epoch": 0.3, - "grad_norm": 1.8154492378234863, - "learning_rate": 1.159322033898305e-05, - "loss": 1.1796, - "step": 12900 - }, - { - "epoch": 0.3, - "grad_norm": 2.0180463790893555, - "learning_rate": 1.1586440677966102e-05, - "loss": 1.3409, - "step": 12910 - }, - { - "epoch": 0.3, - "grad_norm": 2.2689106464385986, - "learning_rate": 1.1579661016949153e-05, - "loss": 1.279, - "step": 12920 - }, - { - "epoch": 0.3, - "grad_norm": 3.992006778717041, - "learning_rate": 1.1572881355932205e-05, - "loss": 1.0891, - "step": 12930 - }, - { - "epoch": 0.3, - "grad_norm": 7.110446453094482, - "learning_rate": 1.1566101694915254e-05, - "loss": 1.2186, - "step": 12940 - }, - { - "epoch": 0.3, - "grad_norm": 2.776698350906372, - "learning_rate": 1.1559322033898306e-05, - "loss": 1.505, - "step": 12950 - }, - { - "epoch": 0.3, - "grad_norm": 4.205793857574463, - "learning_rate": 1.1552542372881357e-05, - "loss": 1.4795, - "step": 12960 - }, - { - "epoch": 0.3, - "grad_norm": 5.062299728393555, - "learning_rate": 1.1545762711864409e-05, - "loss": 1.3921, - "step": 12970 - }, - { - "epoch": 0.3, - "grad_norm": 14.015942573547363, - "learning_rate": 1.1538983050847458e-05, - "loss": 1.3975, - "step": 12980 - }, - { - "epoch": 0.3, - "grad_norm": 1.4116904735565186, - "learning_rate": 1.153220338983051e-05, - "loss": 1.1743, - "step": 12990 - }, - { - "epoch": 0.3, - "grad_norm": 2.194725513458252, - "learning_rate": 1.1525423728813561e-05, - "loss": 1.4538, - "step": 13000 - }, - { - "epoch": 0.3, - "eval_loss": 1.071112036705017, - "eval_runtime": 67.075, - "eval_samples_per_second": 14.909, - "eval_steps_per_second": 14.909, - "step": 13000 - }, - { - "epoch": 0.3, - "grad_norm": 2.108156204223633, - "learning_rate": 1.151864406779661e-05, - "loss": 1.4626, - "step": 13010 - }, - { - "epoch": 0.3, - "grad_norm": 0.9883029460906982, - "learning_rate": 1.1511864406779662e-05, - "loss": 1.2526, - "step": 13020 - }, - { - "epoch": 0.3, - "grad_norm": 3.394761085510254, - "learning_rate": 1.1505084745762714e-05, - "loss": 1.35, - "step": 13030 - }, - { - "epoch": 0.3, - "grad_norm": 2.971503973007202, - "learning_rate": 1.1498305084745765e-05, - "loss": 1.3228, - "step": 13040 - }, - { - "epoch": 0.3, - "grad_norm": 3.9637510776519775, - "learning_rate": 1.1491525423728815e-05, - "loss": 1.3027, - "step": 13050 - }, - { - "epoch": 0.3, - "grad_norm": 2.263624906539917, - "learning_rate": 1.1484745762711866e-05, - "loss": 1.2493, - "step": 13060 - }, - { - "epoch": 0.3, - "grad_norm": 2.722733497619629, - "learning_rate": 1.1477966101694916e-05, - "loss": 1.2733, - "step": 13070 - }, - { - "epoch": 0.3, - "grad_norm": 3.7181248664855957, - "learning_rate": 1.1471186440677965e-05, - "loss": 1.1724, - "step": 13080 - }, - { - "epoch": 0.3, - "grad_norm": 3.551340341567993, - "learning_rate": 1.1464406779661017e-05, - "loss": 1.1791, - "step": 13090 - }, - { - "epoch": 0.3, - "grad_norm": 9.841154098510742, - "learning_rate": 1.1457627118644068e-05, - "loss": 1.2801, - "step": 13100 - }, - { - "epoch": 0.3, - "grad_norm": 5.1563191413879395, - "learning_rate": 1.145084745762712e-05, - "loss": 1.4498, - "step": 13110 - }, - { - "epoch": 0.3, - "grad_norm": 5.396986961364746, - "learning_rate": 1.144406779661017e-05, - "loss": 1.3006, - "step": 13120 - }, - { - "epoch": 0.3, - "grad_norm": 1.691427230834961, - "learning_rate": 1.143728813559322e-05, - "loss": 1.3966, - "step": 13130 - }, - { - "epoch": 0.3, - "grad_norm": 3.0465362071990967, - "learning_rate": 1.1430508474576272e-05, - "loss": 1.4104, - "step": 13140 - }, - { - "epoch": 0.3, - "grad_norm": 5.615993976593018, - "learning_rate": 1.1423728813559322e-05, - "loss": 1.231, - "step": 13150 - }, - { - "epoch": 0.3, - "grad_norm": 2.800468683242798, - "learning_rate": 1.1416949152542373e-05, - "loss": 1.1893, - "step": 13160 - }, - { - "epoch": 0.3, - "grad_norm": 5.876188278198242, - "learning_rate": 1.1410169491525425e-05, - "loss": 1.1381, - "step": 13170 - }, - { - "epoch": 0.31, - "grad_norm": 2.2849178314208984, - "learning_rate": 1.1403389830508476e-05, - "loss": 1.2292, - "step": 13180 - }, - { - "epoch": 0.31, - "grad_norm": 5.406613826751709, - "learning_rate": 1.1396610169491526e-05, - "loss": 1.1515, - "step": 13190 - }, - { - "epoch": 0.31, - "grad_norm": 5.904646873474121, - "learning_rate": 1.1389830508474577e-05, - "loss": 1.5528, - "step": 13200 - }, - { - "epoch": 0.31, - "grad_norm": 2.9938790798187256, - "learning_rate": 1.1383050847457629e-05, - "loss": 1.1436, - "step": 13210 - }, - { - "epoch": 0.31, - "grad_norm": 3.544445037841797, - "learning_rate": 1.137627118644068e-05, - "loss": 1.3652, - "step": 13220 - }, - { - "epoch": 0.31, - "grad_norm": 5.070495128631592, - "learning_rate": 1.136949152542373e-05, - "loss": 1.2352, - "step": 13230 - }, - { - "epoch": 0.31, - "grad_norm": 2.612300395965576, - "learning_rate": 1.1362711864406781e-05, - "loss": 1.3513, - "step": 13240 - }, - { - "epoch": 0.31, - "grad_norm": 3.960466146469116, - "learning_rate": 1.1355932203389833e-05, - "loss": 1.0844, - "step": 13250 - }, - { - "epoch": 0.31, - "grad_norm": 2.9677114486694336, - "learning_rate": 1.1349152542372884e-05, - "loss": 1.4191, - "step": 13260 - }, - { - "epoch": 0.31, - "grad_norm": 3.220777988433838, - "learning_rate": 1.1342372881355934e-05, - "loss": 1.3462, - "step": 13270 - }, - { - "epoch": 0.31, - "grad_norm": 4.4845452308654785, - "learning_rate": 1.1335593220338983e-05, - "loss": 1.1878, - "step": 13280 - }, - { - "epoch": 0.31, - "grad_norm": 3.976458787918091, - "learning_rate": 1.1328813559322033e-05, - "loss": 1.1003, - "step": 13290 - }, - { - "epoch": 0.31, - "grad_norm": 2.404944658279419, - "learning_rate": 1.1322033898305084e-05, - "loss": 1.4279, - "step": 13300 - }, - { - "epoch": 0.31, - "grad_norm": 5.025093078613281, - "learning_rate": 1.1315254237288136e-05, - "loss": 1.2617, - "step": 13310 - }, - { - "epoch": 0.31, - "grad_norm": 1.922884225845337, - "learning_rate": 1.1308474576271187e-05, - "loss": 1.4879, - "step": 13320 - }, - { - "epoch": 0.31, - "grad_norm": 5.648707866668701, - "learning_rate": 1.1301694915254237e-05, - "loss": 1.4279, - "step": 13330 - }, - { - "epoch": 0.31, - "grad_norm": 2.952711820602417, - "learning_rate": 1.1294915254237288e-05, - "loss": 1.294, - "step": 13340 - }, - { - "epoch": 0.31, - "grad_norm": 3.5280747413635254, - "learning_rate": 1.128813559322034e-05, - "loss": 1.3706, - "step": 13350 - }, - { - "epoch": 0.31, - "grad_norm": 2.169260025024414, - "learning_rate": 1.1281355932203391e-05, - "loss": 1.3188, - "step": 13360 - }, - { - "epoch": 0.31, - "grad_norm": 2.602459192276001, - "learning_rate": 1.1274576271186441e-05, - "loss": 1.1604, - "step": 13370 - }, - { - "epoch": 0.31, - "grad_norm": 2.6066842079162598, - "learning_rate": 1.1267796610169492e-05, - "loss": 1.3923, - "step": 13380 - }, - { - "epoch": 0.31, - "grad_norm": 2.369154930114746, - "learning_rate": 1.1261016949152544e-05, - "loss": 1.3249, - "step": 13390 - }, - { - "epoch": 0.31, - "grad_norm": 1.7909421920776367, - "learning_rate": 1.1254237288135595e-05, - "loss": 1.3519, - "step": 13400 - }, - { - "epoch": 0.31, - "grad_norm": 3.983210802078247, - "learning_rate": 1.1247457627118645e-05, - "loss": 1.2492, - "step": 13410 - }, - { - "epoch": 0.31, - "grad_norm": 2.9838898181915283, - "learning_rate": 1.1240677966101696e-05, - "loss": 1.4542, - "step": 13420 - }, - { - "epoch": 0.31, - "grad_norm": 6.088125705718994, - "learning_rate": 1.1233898305084748e-05, - "loss": 1.2546, - "step": 13430 - }, - { - "epoch": 0.31, - "grad_norm": 2.840747833251953, - "learning_rate": 1.1227118644067799e-05, - "loss": 1.2426, - "step": 13440 - }, - { - "epoch": 0.31, - "grad_norm": 4.945519924163818, - "learning_rate": 1.1220338983050849e-05, - "loss": 1.3064, - "step": 13450 - }, - { - "epoch": 0.31, - "grad_norm": 4.030840873718262, - "learning_rate": 1.12135593220339e-05, - "loss": 1.1405, - "step": 13460 - }, - { - "epoch": 0.31, - "grad_norm": 4.2843194007873535, - "learning_rate": 1.1206779661016951e-05, - "loss": 1.3749, - "step": 13470 - }, - { - "epoch": 0.31, - "grad_norm": 8.288750648498535, - "learning_rate": 1.1200000000000001e-05, - "loss": 1.37, - "step": 13480 - }, - { - "epoch": 0.31, - "grad_norm": 3.233605146408081, - "learning_rate": 1.1193220338983051e-05, - "loss": 1.1979, - "step": 13490 - }, - { - "epoch": 0.31, - "grad_norm": 3.9739513397216797, - "learning_rate": 1.1186440677966102e-05, - "loss": 1.0539, - "step": 13500 - }, - { - "epoch": 0.31, - "eval_loss": 1.068673849105835, - "eval_runtime": 67.158, - "eval_samples_per_second": 14.89, - "eval_steps_per_second": 14.89, - "step": 13500 - }, - { - "epoch": 0.31, - "grad_norm": 2.8984007835388184, - "learning_rate": 1.1179661016949152e-05, - "loss": 1.1189, - "step": 13510 - }, - { - "epoch": 0.31, - "grad_norm": 5.570311069488525, - "learning_rate": 1.1172881355932203e-05, - "loss": 1.2069, - "step": 13520 - }, - { - "epoch": 0.31, - "grad_norm": 3.0752551555633545, - "learning_rate": 1.1166101694915255e-05, - "loss": 1.304, - "step": 13530 - }, - { - "epoch": 0.31, - "grad_norm": 4.809720039367676, - "learning_rate": 1.1159322033898306e-05, - "loss": 1.3403, - "step": 13540 - }, - { - "epoch": 0.31, - "grad_norm": 6.9752655029296875, - "learning_rate": 1.1152542372881356e-05, - "loss": 1.4062, - "step": 13550 - }, - { - "epoch": 0.31, - "grad_norm": 3.2357780933380127, - "learning_rate": 1.1145762711864407e-05, - "loss": 1.409, - "step": 13560 - }, - { - "epoch": 0.31, - "grad_norm": 3.0999038219451904, - "learning_rate": 1.1138983050847459e-05, - "loss": 1.3877, - "step": 13570 - }, - { - "epoch": 0.31, - "grad_norm": 5.735909461975098, - "learning_rate": 1.113220338983051e-05, - "loss": 1.3124, - "step": 13580 - }, - { - "epoch": 0.31, - "grad_norm": 2.736616849899292, - "learning_rate": 1.112542372881356e-05, - "loss": 1.3948, - "step": 13590 - }, - { - "epoch": 0.31, - "grad_norm": 3.9741382598876953, - "learning_rate": 1.1118644067796611e-05, - "loss": 1.3462, - "step": 13600 - }, - { - "epoch": 0.32, - "grad_norm": 2.8991127014160156, - "learning_rate": 1.1111864406779663e-05, - "loss": 1.3715, - "step": 13610 - }, - { - "epoch": 0.32, - "grad_norm": 1.8136656284332275, - "learning_rate": 1.1105084745762712e-05, - "loss": 1.1655, - "step": 13620 - }, - { - "epoch": 0.32, - "grad_norm": 5.556140899658203, - "learning_rate": 1.1098305084745764e-05, - "loss": 1.1872, - "step": 13630 - }, - { - "epoch": 0.32, - "grad_norm": 4.87309455871582, - "learning_rate": 1.1091525423728815e-05, - "loss": 1.2085, - "step": 13640 - }, - { - "epoch": 0.32, - "grad_norm": 4.111629486083984, - "learning_rate": 1.1084745762711867e-05, - "loss": 1.2432, - "step": 13650 - }, - { - "epoch": 0.32, - "grad_norm": 3.4309825897216797, - "learning_rate": 1.1077966101694916e-05, - "loss": 1.291, - "step": 13660 - }, - { - "epoch": 0.32, - "grad_norm": 4.081984043121338, - "learning_rate": 1.1071186440677968e-05, - "loss": 1.2944, - "step": 13670 - }, - { - "epoch": 0.32, - "grad_norm": 3.980055093765259, - "learning_rate": 1.1064406779661019e-05, - "loss": 1.4403, - "step": 13680 - }, - { - "epoch": 0.32, - "grad_norm": 6.602412223815918, - "learning_rate": 1.105762711864407e-05, - "loss": 1.2835, - "step": 13690 - }, - { - "epoch": 0.32, - "grad_norm": 6.045549392700195, - "learning_rate": 1.1050847457627118e-05, - "loss": 1.2792, - "step": 13700 - }, - { - "epoch": 0.32, - "grad_norm": 8.093133926391602, - "learning_rate": 1.104406779661017e-05, - "loss": 1.3566, - "step": 13710 - }, - { - "epoch": 0.32, - "grad_norm": 3.0017004013061523, - "learning_rate": 1.1037288135593221e-05, - "loss": 1.0925, - "step": 13720 - }, - { - "epoch": 0.32, - "grad_norm": 4.844403266906738, - "learning_rate": 1.1030508474576271e-05, - "loss": 1.3142, - "step": 13730 - }, - { - "epoch": 0.32, - "grad_norm": 4.788667678833008, - "learning_rate": 1.1023728813559322e-05, - "loss": 1.4304, - "step": 13740 - }, - { - "epoch": 0.32, - "grad_norm": 12.615195274353027, - "learning_rate": 1.1016949152542374e-05, - "loss": 1.4411, - "step": 13750 - }, - { - "epoch": 0.32, - "grad_norm": 5.6674323081970215, - "learning_rate": 1.1010169491525423e-05, - "loss": 1.2066, - "step": 13760 - }, - { - "epoch": 0.32, - "grad_norm": 10.92747688293457, - "learning_rate": 1.1003389830508475e-05, - "loss": 1.4857, - "step": 13770 - }, - { - "epoch": 0.32, - "grad_norm": 2.42288875579834, - "learning_rate": 1.0996610169491526e-05, - "loss": 1.2415, - "step": 13780 - }, - { - "epoch": 0.32, - "grad_norm": 9.594301223754883, - "learning_rate": 1.0989830508474578e-05, - "loss": 1.3813, - "step": 13790 - }, - { - "epoch": 0.32, - "grad_norm": 3.701327323913574, - "learning_rate": 1.0983050847457627e-05, - "loss": 1.1592, - "step": 13800 - }, - { - "epoch": 0.32, - "grad_norm": 3.5337729454040527, - "learning_rate": 1.0976271186440679e-05, - "loss": 1.3177, - "step": 13810 - }, - { - "epoch": 0.32, - "grad_norm": 14.992568016052246, - "learning_rate": 1.096949152542373e-05, - "loss": 1.1037, - "step": 13820 - }, - { - "epoch": 0.32, - "grad_norm": 12.353404998779297, - "learning_rate": 1.0962711864406782e-05, - "loss": 1.3257, - "step": 13830 - }, - { - "epoch": 0.32, - "grad_norm": 3.713555335998535, - "learning_rate": 1.0955932203389831e-05, - "loss": 1.2935, - "step": 13840 - }, - { - "epoch": 0.32, - "grad_norm": 5.5155348777771, - "learning_rate": 1.0949152542372883e-05, - "loss": 1.4259, - "step": 13850 - }, - { - "epoch": 0.32, - "grad_norm": 2.6537179946899414, - "learning_rate": 1.0942372881355934e-05, - "loss": 1.333, - "step": 13860 - }, - { - "epoch": 0.32, - "grad_norm": 3.3852736949920654, - "learning_rate": 1.0935593220338985e-05, - "loss": 1.2526, - "step": 13870 - }, - { - "epoch": 0.32, - "grad_norm": 3.6579887866973877, - "learning_rate": 1.0928813559322035e-05, - "loss": 1.4214, - "step": 13880 - }, - { - "epoch": 0.32, - "grad_norm": 3.4810287952423096, - "learning_rate": 1.0922033898305087e-05, - "loss": 1.4154, - "step": 13890 - }, - { - "epoch": 0.32, - "grad_norm": 4.413066387176514, - "learning_rate": 1.0915254237288135e-05, - "loss": 1.4967, - "step": 13900 - }, - { - "epoch": 0.32, - "grad_norm": 2.1642134189605713, - "learning_rate": 1.0908474576271186e-05, - "loss": 1.2179, - "step": 13910 - }, - { - "epoch": 0.32, - "grad_norm": 1.1288875341415405, - "learning_rate": 1.0901694915254237e-05, - "loss": 1.2011, - "step": 13920 - }, - { - "epoch": 0.32, - "grad_norm": 2.4285356998443604, - "learning_rate": 1.0894915254237289e-05, - "loss": 1.3319, - "step": 13930 - }, - { - "epoch": 0.32, - "grad_norm": 2.1330859661102295, - "learning_rate": 1.0888135593220339e-05, - "loss": 1.2524, - "step": 13940 - }, - { - "epoch": 0.32, - "grad_norm": 2.8436052799224854, - "learning_rate": 1.088135593220339e-05, - "loss": 1.2021, - "step": 13950 - }, - { - "epoch": 0.32, - "grad_norm": 1.7708642482757568, - "learning_rate": 1.0874576271186441e-05, - "loss": 1.1705, - "step": 13960 - }, - { - "epoch": 0.32, - "grad_norm": 5.702758312225342, - "learning_rate": 1.0867796610169493e-05, - "loss": 1.3999, - "step": 13970 - }, - { - "epoch": 0.32, - "grad_norm": 1.627833604812622, - "learning_rate": 1.0861016949152542e-05, - "loss": 1.497, - "step": 13980 - }, - { - "epoch": 0.32, - "grad_norm": 2.4980685710906982, - "learning_rate": 1.0854237288135594e-05, - "loss": 1.2449, - "step": 13990 - }, - { - "epoch": 0.32, - "grad_norm": 3.2824714183807373, - "learning_rate": 1.0847457627118645e-05, - "loss": 1.1941, - "step": 14000 - }, - { - "epoch": 0.32, - "eval_loss": 1.097359299659729, - "eval_runtime": 67.0566, - "eval_samples_per_second": 14.913, - "eval_steps_per_second": 14.913, - "step": 14000 - }, - { - "epoch": 0.32, - "grad_norm": 3.3165223598480225, - "learning_rate": 1.0840677966101697e-05, - "loss": 1.2049, - "step": 14010 - }, - { - "epoch": 0.32, - "grad_norm": 3.123730182647705, - "learning_rate": 1.0833898305084746e-05, - "loss": 1.2982, - "step": 14020 - }, - { - "epoch": 0.32, - "grad_norm": 1.46307373046875, - "learning_rate": 1.0827118644067798e-05, - "loss": 1.175, - "step": 14030 - }, - { - "epoch": 0.33, - "grad_norm": 2.6726436614990234, - "learning_rate": 1.0820338983050849e-05, - "loss": 1.1683, - "step": 14040 - }, - { - "epoch": 0.33, - "grad_norm": 2.585521936416626, - "learning_rate": 1.08135593220339e-05, - "loss": 1.4859, - "step": 14050 - }, - { - "epoch": 0.33, - "grad_norm": 2.8152098655700684, - "learning_rate": 1.080677966101695e-05, - "loss": 1.2218, - "step": 14060 - }, - { - "epoch": 0.33, - "grad_norm": 5.557183742523193, - "learning_rate": 1.0800000000000002e-05, - "loss": 1.1304, - "step": 14070 - }, - { - "epoch": 0.33, - "grad_norm": 5.458992958068848, - "learning_rate": 1.0793220338983053e-05, - "loss": 1.2276, - "step": 14080 - }, - { - "epoch": 0.33, - "grad_norm": 2.8788580894470215, - "learning_rate": 1.0786440677966103e-05, - "loss": 1.3762, - "step": 14090 - }, - { - "epoch": 0.33, - "grad_norm": 7.669591426849365, - "learning_rate": 1.0779661016949154e-05, - "loss": 1.2418, - "step": 14100 - }, - { - "epoch": 0.33, - "grad_norm": 5.8121442794799805, - "learning_rate": 1.0772881355932204e-05, - "loss": 1.2151, - "step": 14110 - }, - { - "epoch": 0.33, - "grad_norm": 4.32166051864624, - "learning_rate": 1.0766101694915254e-05, - "loss": 1.3599, - "step": 14120 - }, - { - "epoch": 0.33, - "grad_norm": 2.139389991760254, - "learning_rate": 1.0759322033898305e-05, - "loss": 1.1879, - "step": 14130 - }, - { - "epoch": 0.33, - "grad_norm": 1.7812507152557373, - "learning_rate": 1.0752542372881356e-05, - "loss": 1.3147, - "step": 14140 - }, - { - "epoch": 0.33, - "grad_norm": 3.0108046531677246, - "learning_rate": 1.0745762711864408e-05, - "loss": 1.1607, - "step": 14150 - }, - { - "epoch": 0.33, - "grad_norm": 6.003909587860107, - "learning_rate": 1.0738983050847457e-05, - "loss": 1.3212, - "step": 14160 - }, - { - "epoch": 0.33, - "grad_norm": 6.925515174865723, - "learning_rate": 1.0732203389830509e-05, - "loss": 1.342, - "step": 14170 - }, - { - "epoch": 0.33, - "grad_norm": 5.450997352600098, - "learning_rate": 1.072542372881356e-05, - "loss": 1.327, - "step": 14180 - }, - { - "epoch": 0.33, - "grad_norm": 4.291378974914551, - "learning_rate": 1.0718644067796612e-05, - "loss": 1.2563, - "step": 14190 - }, - { - "epoch": 0.33, - "grad_norm": 2.7784173488616943, - "learning_rate": 1.0711864406779661e-05, - "loss": 1.5323, - "step": 14200 - }, - { - "epoch": 0.33, - "grad_norm": 2.37923526763916, - "learning_rate": 1.0705084745762713e-05, - "loss": 1.3377, - "step": 14210 - }, - { - "epoch": 0.33, - "grad_norm": 1.3921860456466675, - "learning_rate": 1.0698305084745764e-05, - "loss": 1.2814, - "step": 14220 - }, - { - "epoch": 0.33, - "grad_norm": 4.000369548797607, - "learning_rate": 1.0691525423728814e-05, - "loss": 1.1886, - "step": 14230 - }, - { - "epoch": 0.33, - "grad_norm": 3.731414318084717, - "learning_rate": 1.0684745762711865e-05, - "loss": 1.5797, - "step": 14240 - }, - { - "epoch": 0.33, - "grad_norm": 3.6613595485687256, - "learning_rate": 1.0677966101694917e-05, - "loss": 1.2735, - "step": 14250 - }, - { - "epoch": 0.33, - "grad_norm": 1.467409610748291, - "learning_rate": 1.0671186440677968e-05, - "loss": 1.4635, - "step": 14260 - }, - { - "epoch": 0.33, - "grad_norm": 1.406770944595337, - "learning_rate": 1.0664406779661018e-05, - "loss": 1.1051, - "step": 14270 - }, - { - "epoch": 0.33, - "grad_norm": 1.697908878326416, - "learning_rate": 1.065762711864407e-05, - "loss": 1.2916, - "step": 14280 - }, - { - "epoch": 0.33, - "grad_norm": 3.9356634616851807, - "learning_rate": 1.065084745762712e-05, - "loss": 1.2845, - "step": 14290 - }, - { - "epoch": 0.33, - "grad_norm": 4.0369553565979, - "learning_rate": 1.0644067796610172e-05, - "loss": 1.1785, - "step": 14300 - }, - { - "epoch": 0.33, - "grad_norm": 7.373144149780273, - "learning_rate": 1.0637288135593222e-05, - "loss": 1.2085, - "step": 14310 - }, - { - "epoch": 0.33, - "grad_norm": 3.261367082595825, - "learning_rate": 1.0630508474576271e-05, - "loss": 1.2291, - "step": 14320 - }, - { - "epoch": 0.33, - "grad_norm": 2.6595752239227295, - "learning_rate": 1.0623728813559323e-05, - "loss": 1.4075, - "step": 14330 - }, - { - "epoch": 0.33, - "grad_norm": 3.082679510116577, - "learning_rate": 1.0616949152542373e-05, - "loss": 1.2423, - "step": 14340 - }, - { - "epoch": 0.33, - "grad_norm": 2.0270230770111084, - "learning_rate": 1.0610169491525424e-05, - "loss": 1.3866, - "step": 14350 - }, - { - "epoch": 0.33, - "grad_norm": 2.6601712703704834, - "learning_rate": 1.0603389830508475e-05, - "loss": 1.5351, - "step": 14360 - }, - { - "epoch": 0.33, - "grad_norm": 3.8816442489624023, - "learning_rate": 1.0596610169491525e-05, - "loss": 1.4206, - "step": 14370 - }, - { - "epoch": 0.33, - "grad_norm": 2.6769402027130127, - "learning_rate": 1.0589830508474576e-05, - "loss": 1.3384, - "step": 14380 - }, - { - "epoch": 0.33, - "grad_norm": 7.779047966003418, - "learning_rate": 1.0583050847457628e-05, - "loss": 1.2507, - "step": 14390 - }, - { - "epoch": 0.33, - "grad_norm": 1.7086968421936035, - "learning_rate": 1.057627118644068e-05, - "loss": 1.4237, - "step": 14400 - }, - { - "epoch": 0.33, - "grad_norm": 6.948585033416748, - "learning_rate": 1.0569491525423729e-05, - "loss": 1.4241, - "step": 14410 - }, - { - "epoch": 0.33, - "grad_norm": 4.5062103271484375, - "learning_rate": 1.056271186440678e-05, - "loss": 1.1971, - "step": 14420 - }, - { - "epoch": 0.33, - "grad_norm": 3.1329903602600098, - "learning_rate": 1.0555932203389832e-05, - "loss": 1.2855, - "step": 14430 - }, - { - "epoch": 0.33, - "grad_norm": 4.693443298339844, - "learning_rate": 1.0549152542372883e-05, - "loss": 1.4162, - "step": 14440 - }, - { - "epoch": 0.33, - "grad_norm": 6.092784404754639, - "learning_rate": 1.0542372881355933e-05, - "loss": 1.2929, - "step": 14450 - }, - { - "epoch": 0.33, - "grad_norm": 5.795193195343018, - "learning_rate": 1.0535593220338984e-05, - "loss": 1.1753, - "step": 14460 - }, - { - "epoch": 0.34, - "grad_norm": 3.002300500869751, - "learning_rate": 1.0528813559322036e-05, - "loss": 1.2696, - "step": 14470 - }, - { - "epoch": 0.34, - "grad_norm": 8.367115020751953, - "learning_rate": 1.0522033898305087e-05, - "loss": 1.4971, - "step": 14480 - }, - { - "epoch": 0.34, - "grad_norm": 7.178134918212891, - "learning_rate": 1.0515254237288137e-05, - "loss": 1.2025, - "step": 14490 - }, - { - "epoch": 0.34, - "grad_norm": 6.813628673553467, - "learning_rate": 1.0508474576271188e-05, - "loss": 1.2135, - "step": 14500 - }, - { - "epoch": 0.34, - "eval_loss": 1.056099772453308, - "eval_runtime": 67.1376, - "eval_samples_per_second": 14.895, - "eval_steps_per_second": 14.895, - "step": 14500 - }, - { - "epoch": 0.34, - "grad_norm": 6.232717990875244, - "learning_rate": 1.050169491525424e-05, - "loss": 1.2396, - "step": 14510 - }, - { - "epoch": 0.34, - "grad_norm": 6.143510341644287, - "learning_rate": 1.049491525423729e-05, - "loss": 1.503, - "step": 14520 - }, - { - "epoch": 0.34, - "grad_norm": 4.227885723114014, - "learning_rate": 1.0488135593220339e-05, - "loss": 1.2606, - "step": 14530 - }, - { - "epoch": 0.34, - "grad_norm": 3.4114978313446045, - "learning_rate": 1.048135593220339e-05, - "loss": 1.3879, - "step": 14540 - }, - { - "epoch": 0.34, - "grad_norm": 4.572566509246826, - "learning_rate": 1.047457627118644e-05, - "loss": 1.3413, - "step": 14550 - }, - { - "epoch": 0.34, - "grad_norm": 5.934830188751221, - "learning_rate": 1.0467796610169491e-05, - "loss": 1.2421, - "step": 14560 - }, - { - "epoch": 0.34, - "grad_norm": 3.879978656768799, - "learning_rate": 1.0461016949152543e-05, - "loss": 1.2392, - "step": 14570 - }, - { - "epoch": 0.34, - "grad_norm": 2.335127353668213, - "learning_rate": 1.0454237288135594e-05, - "loss": 1.1258, - "step": 14580 - }, - { - "epoch": 0.34, - "grad_norm": 9.824913024902344, - "learning_rate": 1.0447457627118644e-05, - "loss": 1.2003, - "step": 14590 - }, - { - "epoch": 0.34, - "grad_norm": 2.6796653270721436, - "learning_rate": 1.0440677966101695e-05, - "loss": 1.3512, - "step": 14600 - }, - { - "epoch": 0.34, - "grad_norm": 2.872577667236328, - "learning_rate": 1.0433898305084747e-05, - "loss": 1.1608, - "step": 14610 - }, - { - "epoch": 0.34, - "grad_norm": 5.358550071716309, - "learning_rate": 1.0427118644067798e-05, - "loss": 1.3207, - "step": 14620 - }, - { - "epoch": 0.34, - "grad_norm": 5.672595500946045, - "learning_rate": 1.0420338983050848e-05, - "loss": 1.5645, - "step": 14630 - }, - { - "epoch": 0.34, - "grad_norm": 6.134851455688477, - "learning_rate": 1.04135593220339e-05, - "loss": 1.4636, - "step": 14640 - }, - { - "epoch": 0.34, - "grad_norm": 3.763167381286621, - "learning_rate": 1.040677966101695e-05, - "loss": 1.3488, - "step": 14650 - }, - { - "epoch": 0.34, - "grad_norm": 4.057564735412598, - "learning_rate": 1.04e-05, - "loss": 1.1286, - "step": 14660 - }, - { - "epoch": 0.34, - "grad_norm": 5.70720100402832, - "learning_rate": 1.0393220338983052e-05, - "loss": 1.2291, - "step": 14670 - }, - { - "epoch": 0.34, - "grad_norm": 4.895830154418945, - "learning_rate": 1.0386440677966103e-05, - "loss": 1.3372, - "step": 14680 - }, - { - "epoch": 0.34, - "grad_norm": 4.813084125518799, - "learning_rate": 1.0379661016949155e-05, - "loss": 1.388, - "step": 14690 - }, - { - "epoch": 0.34, - "grad_norm": 3.9301507472991943, - "learning_rate": 1.0372881355932204e-05, - "loss": 1.3411, - "step": 14700 - }, - { - "epoch": 0.34, - "grad_norm": 1.1061105728149414, - "learning_rate": 1.0366101694915256e-05, - "loss": 1.1152, - "step": 14710 - }, - { - "epoch": 0.34, - "grad_norm": 2.1552724838256836, - "learning_rate": 1.0359322033898307e-05, - "loss": 1.3387, - "step": 14720 - }, - { - "epoch": 0.34, - "grad_norm": 4.779531002044678, - "learning_rate": 1.0352542372881358e-05, - "loss": 1.2235, - "step": 14730 - }, - { - "epoch": 0.34, - "grad_norm": 5.468095779418945, - "learning_rate": 1.0345762711864406e-05, - "loss": 1.458, - "step": 14740 - }, - { - "epoch": 0.34, - "grad_norm": 3.9823062419891357, - "learning_rate": 1.0338983050847458e-05, - "loss": 1.2354, - "step": 14750 - }, - { - "epoch": 0.34, - "grad_norm": 5.236638069152832, - "learning_rate": 1.033220338983051e-05, - "loss": 1.2741, - "step": 14760 - }, - { - "epoch": 0.34, - "grad_norm": 4.5503621101379395, - "learning_rate": 1.0325423728813559e-05, - "loss": 1.1736, - "step": 14770 - }, - { - "epoch": 0.34, - "grad_norm": 8.381211280822754, - "learning_rate": 1.031864406779661e-05, - "loss": 1.1965, - "step": 14780 - }, - { - "epoch": 0.34, - "grad_norm": 6.995142459869385, - "learning_rate": 1.0311864406779662e-05, - "loss": 1.2316, - "step": 14790 - }, - { - "epoch": 0.34, - "grad_norm": 8.22109317779541, - "learning_rate": 1.0305084745762712e-05, - "loss": 1.3987, - "step": 14800 - }, - { - "epoch": 0.34, - "grad_norm": 1.7374868392944336, - "learning_rate": 1.0298305084745763e-05, - "loss": 1.2053, - "step": 14810 - }, - { - "epoch": 0.34, - "grad_norm": 3.8697991371154785, - "learning_rate": 1.0291525423728814e-05, - "loss": 1.1784, - "step": 14820 - }, - { - "epoch": 0.34, - "grad_norm": 9.522156715393066, - "learning_rate": 1.0284745762711866e-05, - "loss": 1.4226, - "step": 14830 - }, - { - "epoch": 0.34, - "grad_norm": 5.162941932678223, - "learning_rate": 1.0277966101694915e-05, - "loss": 1.3565, - "step": 14840 - }, - { - "epoch": 0.34, - "grad_norm": 4.80945348739624, - "learning_rate": 1.0271186440677967e-05, - "loss": 1.3013, - "step": 14850 - }, - { - "epoch": 0.34, - "grad_norm": 3.0665078163146973, - "learning_rate": 1.0264406779661018e-05, - "loss": 1.4093, - "step": 14860 - }, - { - "epoch": 0.34, - "grad_norm": 2.34722900390625, - "learning_rate": 1.025762711864407e-05, - "loss": 1.3063, - "step": 14870 - }, - { - "epoch": 0.34, - "grad_norm": 5.438226699829102, - "learning_rate": 1.025084745762712e-05, - "loss": 1.1209, - "step": 14880 - }, - { - "epoch": 0.34, - "grad_norm": 2.0597119331359863, - "learning_rate": 1.024406779661017e-05, - "loss": 1.3328, - "step": 14890 - }, - { - "epoch": 0.34, - "grad_norm": 1.7713210582733154, - "learning_rate": 1.0237288135593222e-05, - "loss": 1.2971, - "step": 14900 - }, - { - "epoch": 0.35, - "grad_norm": 1.6047179698944092, - "learning_rate": 1.0230508474576274e-05, - "loss": 1.226, - "step": 14910 - }, - { - "epoch": 0.35, - "grad_norm": 3.7487828731536865, - "learning_rate": 1.0223728813559323e-05, - "loss": 1.0162, - "step": 14920 - }, - { - "epoch": 0.35, - "grad_norm": 3.8491158485412598, - "learning_rate": 1.0216949152542375e-05, - "loss": 1.3324, - "step": 14930 - }, - { - "epoch": 0.35, - "grad_norm": 3.6983745098114014, - "learning_rate": 1.0210169491525423e-05, - "loss": 1.3416, - "step": 14940 - }, - { - "epoch": 0.35, - "grad_norm": 7.2397589683532715, - "learning_rate": 1.0203389830508474e-05, - "loss": 1.4135, - "step": 14950 - }, - { - "epoch": 0.35, - "grad_norm": 6.375013828277588, - "learning_rate": 1.0196610169491525e-05, - "loss": 1.2063, - "step": 14960 - }, - { - "epoch": 0.35, - "grad_norm": 7.71971321105957, - "learning_rate": 1.0189830508474577e-05, - "loss": 1.2969, - "step": 14970 - }, - { - "epoch": 0.35, - "grad_norm": 5.973591327667236, - "learning_rate": 1.0183050847457627e-05, - "loss": 1.179, - "step": 14980 - }, - { - "epoch": 0.35, - "grad_norm": 6.778249263763428, - "learning_rate": 1.0176271186440678e-05, - "loss": 1.2906, - "step": 14990 - }, - { - "epoch": 0.35, - "grad_norm": 3.3302714824676514, - "learning_rate": 1.016949152542373e-05, - "loss": 1.2764, - "step": 15000 - }, - { - "epoch": 0.35, - "eval_loss": 1.0367546081542969, - "eval_runtime": 67.0312, - "eval_samples_per_second": 14.918, - "eval_steps_per_second": 14.918, - "step": 15000 - }, - { - "epoch": 0.35, - "grad_norm": 4.099465847015381, - "learning_rate": 1.016271186440678e-05, - "loss": 1.2667, - "step": 15010 - }, - { - "epoch": 0.35, - "grad_norm": 3.9707233905792236, - "learning_rate": 1.015593220338983e-05, - "loss": 1.1102, - "step": 15020 - }, - { - "epoch": 0.35, - "grad_norm": 4.409958362579346, - "learning_rate": 1.0149152542372882e-05, - "loss": 1.406, - "step": 15030 - }, - { - "epoch": 0.35, - "grad_norm": 6.402803421020508, - "learning_rate": 1.0142372881355933e-05, - "loss": 1.3003, - "step": 15040 - }, - { - "epoch": 0.35, - "grad_norm": 1.084704041481018, - "learning_rate": 1.0135593220338985e-05, - "loss": 1.2643, - "step": 15050 - }, - { - "epoch": 0.35, - "grad_norm": 7.1039910316467285, - "learning_rate": 1.0128813559322034e-05, - "loss": 1.3562, - "step": 15060 - }, - { - "epoch": 0.35, - "grad_norm": 4.218326568603516, - "learning_rate": 1.0122033898305086e-05, - "loss": 1.3977, - "step": 15070 - }, - { - "epoch": 0.35, - "grad_norm": 5.44041109085083, - "learning_rate": 1.0115254237288137e-05, - "loss": 1.3783, - "step": 15080 - }, - { - "epoch": 0.35, - "grad_norm": 3.026660442352295, - "learning_rate": 1.0108474576271189e-05, - "loss": 1.0689, - "step": 15090 - }, - { - "epoch": 0.35, - "grad_norm": 4.522537708282471, - "learning_rate": 1.0101694915254238e-05, - "loss": 1.2961, - "step": 15100 - }, - { - "epoch": 0.35, - "grad_norm": 2.490846872329712, - "learning_rate": 1.009491525423729e-05, - "loss": 1.4791, - "step": 15110 - }, - { - "epoch": 0.35, - "grad_norm": 6.137222766876221, - "learning_rate": 1.0088135593220341e-05, - "loss": 1.4384, - "step": 15120 - }, - { - "epoch": 0.35, - "grad_norm": 2.501014232635498, - "learning_rate": 1.008135593220339e-05, - "loss": 1.4224, - "step": 15130 - }, - { - "epoch": 0.35, - "grad_norm": 4.077551364898682, - "learning_rate": 1.0074576271186442e-05, - "loss": 1.4564, - "step": 15140 - }, - { - "epoch": 0.35, - "grad_norm": 3.0064074993133545, - "learning_rate": 1.0067796610169492e-05, - "loss": 1.3707, - "step": 15150 - }, - { - "epoch": 0.35, - "grad_norm": 6.761713027954102, - "learning_rate": 1.0061016949152542e-05, - "loss": 1.2507, - "step": 15160 - }, - { - "epoch": 0.35, - "grad_norm": 2.3495090007781982, - "learning_rate": 1.0054237288135593e-05, - "loss": 1.3328, - "step": 15170 - }, - { - "epoch": 0.35, - "grad_norm": 2.105854034423828, - "learning_rate": 1.0047457627118644e-05, - "loss": 1.3227, - "step": 15180 - }, - { - "epoch": 0.35, - "grad_norm": 2.0211234092712402, - "learning_rate": 1.0040677966101696e-05, - "loss": 1.3816, - "step": 15190 - }, - { - "epoch": 0.35, - "grad_norm": 3.667546272277832, - "learning_rate": 1.0033898305084746e-05, - "loss": 1.1994, - "step": 15200 - }, - { - "epoch": 0.35, - "grad_norm": 3.69437837600708, - "learning_rate": 1.0027118644067797e-05, - "loss": 1.3234, - "step": 15210 - }, - { - "epoch": 0.35, - "grad_norm": 6.547877311706543, - "learning_rate": 1.0020338983050848e-05, - "loss": 1.4976, - "step": 15220 - }, - { - "epoch": 0.35, - "grad_norm": 4.0074992179870605, - "learning_rate": 1.00135593220339e-05, - "loss": 1.141, - "step": 15230 - }, - { - "epoch": 0.35, - "grad_norm": 6.511413097381592, - "learning_rate": 1.000677966101695e-05, - "loss": 1.447, - "step": 15240 - }, - { - "epoch": 0.35, - "grad_norm": 5.6213603019714355, - "learning_rate": 1e-05, - "loss": 1.3133, - "step": 15250 - }, - { - "epoch": 0.35, - "grad_norm": 5.2933831214904785, - "learning_rate": 9.993220338983052e-06, - "loss": 1.5486, - "step": 15260 - }, - { - "epoch": 0.35, - "grad_norm": 7.773339748382568, - "learning_rate": 9.986440677966102e-06, - "loss": 1.192, - "step": 15270 - }, - { - "epoch": 0.35, - "grad_norm": 2.686492443084717, - "learning_rate": 9.979661016949153e-06, - "loss": 1.3922, - "step": 15280 - }, - { - "epoch": 0.35, - "grad_norm": 4.0056610107421875, - "learning_rate": 9.972881355932205e-06, - "loss": 1.3592, - "step": 15290 - }, - { - "epoch": 0.35, - "grad_norm": 4.592884063720703, - "learning_rate": 9.966101694915256e-06, - "loss": 1.3173, - "step": 15300 - }, - { - "epoch": 0.35, - "grad_norm": 5.68196964263916, - "learning_rate": 9.959322033898306e-06, - "loss": 1.3826, - "step": 15310 - }, - { - "epoch": 0.35, - "grad_norm": 8.197639465332031, - "learning_rate": 9.952542372881356e-06, - "loss": 1.1825, - "step": 15320 - }, - { - "epoch": 0.35, - "grad_norm": 9.864594459533691, - "learning_rate": 9.945762711864407e-06, - "loss": 1.1862, - "step": 15330 - }, - { - "epoch": 0.36, - "grad_norm": 7.495543003082275, - "learning_rate": 9.938983050847458e-06, - "loss": 1.2566, - "step": 15340 - }, - { - "epoch": 0.36, - "grad_norm": 6.1277899742126465, - "learning_rate": 9.93220338983051e-06, - "loss": 1.3178, - "step": 15350 - }, - { - "epoch": 0.36, - "grad_norm": 8.173718452453613, - "learning_rate": 9.92542372881356e-06, - "loss": 1.3092, - "step": 15360 - }, - { - "epoch": 0.36, - "grad_norm": 3.474973678588867, - "learning_rate": 9.918644067796611e-06, - "loss": 1.3152, - "step": 15370 - }, - { - "epoch": 0.36, - "grad_norm": 3.8019461631774902, - "learning_rate": 9.911864406779662e-06, - "loss": 1.4228, - "step": 15380 - }, - { - "epoch": 0.36, - "grad_norm": 3.221346139907837, - "learning_rate": 9.905084745762714e-06, - "loss": 1.4204, - "step": 15390 - }, - { - "epoch": 0.36, - "grad_norm": 2.06070613861084, - "learning_rate": 9.898305084745763e-06, - "loss": 1.3482, - "step": 15400 - }, - { - "epoch": 0.36, - "grad_norm": 4.18265962600708, - "learning_rate": 9.891525423728813e-06, - "loss": 1.1693, - "step": 15410 - }, - { - "epoch": 0.36, - "grad_norm": 1.873125433921814, - "learning_rate": 9.884745762711864e-06, - "loss": 1.3536, - "step": 15420 - }, - { - "epoch": 0.36, - "grad_norm": 2.7039427757263184, - "learning_rate": 9.877966101694916e-06, - "loss": 1.3127, - "step": 15430 - }, - { - "epoch": 0.36, - "grad_norm": 1.9308912754058838, - "learning_rate": 9.871186440677967e-06, - "loss": 1.2807, - "step": 15440 - }, - { - "epoch": 0.36, - "grad_norm": 4.275124549865723, - "learning_rate": 9.864406779661017e-06, - "loss": 1.2304, - "step": 15450 - }, - { - "epoch": 0.36, - "grad_norm": 15.040656089782715, - "learning_rate": 9.857627118644068e-06, - "loss": 1.1825, - "step": 15460 - }, - { - "epoch": 0.36, - "grad_norm": 6.073838710784912, - "learning_rate": 9.85084745762712e-06, - "loss": 1.1309, - "step": 15470 - }, - { - "epoch": 0.36, - "grad_norm": 6.772380828857422, - "learning_rate": 9.844067796610171e-06, - "loss": 1.4216, - "step": 15480 - }, - { - "epoch": 0.36, - "grad_norm": 2.349153995513916, - "learning_rate": 9.837288135593221e-06, - "loss": 1.2706, - "step": 15490 - }, - { - "epoch": 0.36, - "grad_norm": 4.798342227935791, - "learning_rate": 9.830508474576272e-06, - "loss": 1.3675, - "step": 15500 - }, - { - "epoch": 0.36, - "eval_loss": 1.0479532480239868, - "eval_runtime": 66.9728, - "eval_samples_per_second": 14.931, - "eval_steps_per_second": 14.931, - "step": 15500 - }, - { - "epoch": 0.36, - "grad_norm": 3.6128780841827393, - "learning_rate": 9.823728813559322e-06, - "loss": 1.2832, - "step": 15510 - }, - { - "epoch": 0.36, - "grad_norm": 3.2983522415161133, - "learning_rate": 9.816949152542373e-06, - "loss": 1.4409, - "step": 15520 - }, - { - "epoch": 0.36, - "grad_norm": 2.769399881362915, - "learning_rate": 9.810169491525425e-06, - "loss": 1.2814, - "step": 15530 - }, - { - "epoch": 0.36, - "grad_norm": 1.404224157333374, - "learning_rate": 9.803389830508474e-06, - "loss": 1.5151, - "step": 15540 - }, - { - "epoch": 0.36, - "grad_norm": 1.0403872728347778, - "learning_rate": 9.796610169491526e-06, - "loss": 1.1822, - "step": 15550 - }, - { - "epoch": 0.36, - "grad_norm": 8.834808349609375, - "learning_rate": 9.789830508474577e-06, - "loss": 1.2262, - "step": 15560 - }, - { - "epoch": 0.36, - "grad_norm": 4.028998851776123, - "learning_rate": 9.783050847457629e-06, - "loss": 1.1775, - "step": 15570 - }, - { - "epoch": 0.36, - "grad_norm": 6.36124849319458, - "learning_rate": 9.776271186440678e-06, - "loss": 1.2234, - "step": 15580 - }, - { - "epoch": 0.36, - "grad_norm": 6.729506969451904, - "learning_rate": 9.76949152542373e-06, - "loss": 1.5841, - "step": 15590 - }, - { - "epoch": 0.36, - "grad_norm": 1.4375039339065552, - "learning_rate": 9.762711864406781e-06, - "loss": 1.2468, - "step": 15600 - }, - { - "epoch": 0.36, - "grad_norm": 5.384178161621094, - "learning_rate": 9.755932203389833e-06, - "loss": 1.572, - "step": 15610 - }, - { - "epoch": 0.36, - "grad_norm": 6.532320499420166, - "learning_rate": 9.749152542372882e-06, - "loss": 1.2631, - "step": 15620 - }, - { - "epoch": 0.36, - "grad_norm": 6.689794540405273, - "learning_rate": 9.742372881355932e-06, - "loss": 1.3595, - "step": 15630 - }, - { - "epoch": 0.36, - "grad_norm": 3.815664768218994, - "learning_rate": 9.735593220338983e-06, - "loss": 1.3499, - "step": 15640 - }, - { - "epoch": 0.36, - "grad_norm": 1.9858266115188599, - "learning_rate": 9.728813559322035e-06, - "loss": 1.2567, - "step": 15650 - }, - { - "epoch": 0.36, - "grad_norm": 4.2085723876953125, - "learning_rate": 9.722033898305086e-06, - "loss": 1.3345, - "step": 15660 - }, - { - "epoch": 0.36, - "grad_norm": 4.255419731140137, - "learning_rate": 9.715254237288136e-06, - "loss": 1.3081, - "step": 15670 - }, - { - "epoch": 0.36, - "grad_norm": 2.9408528804779053, - "learning_rate": 9.708474576271187e-06, - "loss": 1.3258, - "step": 15680 - }, - { - "epoch": 0.36, - "grad_norm": 7.384323596954346, - "learning_rate": 9.701694915254239e-06, - "loss": 1.2384, - "step": 15690 - }, - { - "epoch": 0.36, - "grad_norm": 2.5653786659240723, - "learning_rate": 9.69491525423729e-06, - "loss": 1.3809, - "step": 15700 - }, - { - "epoch": 0.36, - "grad_norm": 5.427888870239258, - "learning_rate": 9.68813559322034e-06, - "loss": 1.1215, - "step": 15710 - }, - { - "epoch": 0.36, - "grad_norm": 5.703902244567871, - "learning_rate": 9.68135593220339e-06, - "loss": 1.3512, - "step": 15720 - }, - { - "epoch": 0.36, - "grad_norm": 4.905301570892334, - "learning_rate": 9.674576271186441e-06, - "loss": 1.3938, - "step": 15730 - }, - { - "epoch": 0.36, - "grad_norm": 2.5882115364074707, - "learning_rate": 9.667796610169492e-06, - "loss": 1.2503, - "step": 15740 - }, - { - "epoch": 0.36, - "grad_norm": 2.825324535369873, - "learning_rate": 9.661016949152544e-06, - "loss": 1.2018, - "step": 15750 - }, - { - "epoch": 0.36, - "grad_norm": 4.857571601867676, - "learning_rate": 9.654237288135593e-06, - "loss": 1.5004, - "step": 15760 - }, - { - "epoch": 0.37, - "grad_norm": 3.466376543045044, - "learning_rate": 9.647457627118645e-06, - "loss": 1.3539, - "step": 15770 - }, - { - "epoch": 0.37, - "grad_norm": 6.780067443847656, - "learning_rate": 9.640677966101696e-06, - "loss": 1.2378, - "step": 15780 - }, - { - "epoch": 0.37, - "grad_norm": 6.265717506408691, - "learning_rate": 9.633898305084746e-06, - "loss": 1.3072, - "step": 15790 - }, - { - "epoch": 0.37, - "grad_norm": 2.7451460361480713, - "learning_rate": 9.627118644067797e-06, - "loss": 1.1819, - "step": 15800 - }, - { - "epoch": 0.37, - "grad_norm": 4.156853675842285, - "learning_rate": 9.620338983050849e-06, - "loss": 1.3481, - "step": 15810 - }, - { - "epoch": 0.37, - "grad_norm": 3.0684568881988525, - "learning_rate": 9.6135593220339e-06, - "loss": 1.2366, - "step": 15820 - }, - { - "epoch": 0.37, - "grad_norm": 8.240663528442383, - "learning_rate": 9.60677966101695e-06, - "loss": 1.101, - "step": 15830 - }, - { - "epoch": 0.37, - "grad_norm": 1.4690364599227905, - "learning_rate": 9.600000000000001e-06, - "loss": 1.2979, - "step": 15840 - }, - { - "epoch": 0.37, - "grad_norm": 5.775753021240234, - "learning_rate": 9.593220338983051e-06, - "loss": 1.2432, - "step": 15850 - }, - { - "epoch": 0.37, - "grad_norm": 6.069706439971924, - "learning_rate": 9.586440677966102e-06, - "loss": 1.0817, - "step": 15860 - }, - { - "epoch": 0.37, - "grad_norm": 3.6312060356140137, - "learning_rate": 9.579661016949154e-06, - "loss": 1.3273, - "step": 15870 - }, - { - "epoch": 0.37, - "grad_norm": 2.591268539428711, - "learning_rate": 9.572881355932203e-06, - "loss": 1.3013, - "step": 15880 - }, - { - "epoch": 0.37, - "grad_norm": 9.177107810974121, - "learning_rate": 9.566101694915255e-06, - "loss": 1.4208, - "step": 15890 - }, - { - "epoch": 0.37, - "grad_norm": 4.9722490310668945, - "learning_rate": 9.559322033898306e-06, - "loss": 1.3375, - "step": 15900 - }, - { - "epoch": 0.37, - "grad_norm": 5.919930458068848, - "learning_rate": 9.552542372881358e-06, - "loss": 1.3232, - "step": 15910 - }, - { - "epoch": 0.37, - "grad_norm": 3.5664663314819336, - "learning_rate": 9.545762711864407e-06, - "loss": 1.3428, - "step": 15920 - }, - { - "epoch": 0.37, - "grad_norm": 2.6872286796569824, - "learning_rate": 9.538983050847457e-06, - "loss": 1.4764, - "step": 15930 - }, - { - "epoch": 0.37, - "grad_norm": 1.3224186897277832, - "learning_rate": 9.532203389830508e-06, - "loss": 1.2986, - "step": 15940 - }, - { - "epoch": 0.37, - "grad_norm": 3.2773120403289795, - "learning_rate": 9.52542372881356e-06, - "loss": 1.2231, - "step": 15950 - }, - { - "epoch": 0.37, - "grad_norm": 3.5411736965179443, - "learning_rate": 9.518644067796611e-06, - "loss": 1.3613, - "step": 15960 - }, - { - "epoch": 0.37, - "grad_norm": 2.742453098297119, - "learning_rate": 9.511864406779661e-06, - "loss": 1.3298, - "step": 15970 - }, - { - "epoch": 0.37, - "grad_norm": 5.865776538848877, - "learning_rate": 9.505084745762712e-06, - "loss": 1.3194, - "step": 15980 - }, - { - "epoch": 0.37, - "grad_norm": 4.6556715965271, - "learning_rate": 9.498305084745764e-06, - "loss": 1.33, - "step": 15990 - }, - { - "epoch": 0.37, - "grad_norm": 8.305102348327637, - "learning_rate": 9.491525423728815e-06, - "loss": 1.3663, - "step": 16000 - }, - { - "epoch": 0.37, - "eval_loss": 1.01907479763031, - "eval_runtime": 67.0887, - "eval_samples_per_second": 14.906, - "eval_steps_per_second": 14.906, - "step": 16000 - }, - { - "epoch": 0.37, - "grad_norm": 5.403275012969971, - "learning_rate": 9.484745762711865e-06, - "loss": 1.3266, - "step": 16010 - }, - { - "epoch": 0.37, - "grad_norm": 7.849989891052246, - "learning_rate": 9.477966101694916e-06, - "loss": 1.2395, - "step": 16020 - }, - { - "epoch": 0.37, - "grad_norm": 10.609501838684082, - "learning_rate": 9.471186440677966e-06, - "loss": 1.3162, - "step": 16030 - }, - { - "epoch": 0.37, - "grad_norm": 4.230147361755371, - "learning_rate": 9.464406779661017e-06, - "loss": 1.4996, - "step": 16040 - }, - { - "epoch": 0.37, - "grad_norm": 2.444842576980591, - "learning_rate": 9.457627118644069e-06, - "loss": 1.3524, - "step": 16050 - }, - { - "epoch": 0.37, - "grad_norm": 3.158411741256714, - "learning_rate": 9.450847457627119e-06, - "loss": 1.4404, - "step": 16060 - }, - { - "epoch": 0.37, - "grad_norm": 4.415703773498535, - "learning_rate": 9.44406779661017e-06, - "loss": 1.2824, - "step": 16070 - }, - { - "epoch": 0.37, - "grad_norm": 3.985328435897827, - "learning_rate": 9.437288135593221e-06, - "loss": 1.1726, - "step": 16080 - }, - { - "epoch": 0.37, - "grad_norm": 6.863582134246826, - "learning_rate": 9.430508474576273e-06, - "loss": 1.2735, - "step": 16090 - }, - { - "epoch": 0.37, - "grad_norm": 4.635952472686768, - "learning_rate": 9.423728813559322e-06, - "loss": 1.1791, - "step": 16100 - }, - { - "epoch": 0.37, - "grad_norm": 0.7141504287719727, - "learning_rate": 9.416949152542374e-06, - "loss": 1.2523, - "step": 16110 - }, - { - "epoch": 0.37, - "grad_norm": 3.634265661239624, - "learning_rate": 9.410169491525425e-06, - "loss": 1.2288, - "step": 16120 - }, - { - "epoch": 0.37, - "grad_norm": 2.960615634918213, - "learning_rate": 9.403389830508477e-06, - "loss": 1.1966, - "step": 16130 - }, - { - "epoch": 0.37, - "grad_norm": 4.205376148223877, - "learning_rate": 9.396610169491526e-06, - "loss": 1.2102, - "step": 16140 - }, - { - "epoch": 0.37, - "grad_norm": 3.364990472793579, - "learning_rate": 9.389830508474576e-06, - "loss": 1.2442, - "step": 16150 - }, - { - "epoch": 0.37, - "grad_norm": 6.015055179595947, - "learning_rate": 9.383050847457627e-06, - "loss": 1.3283, - "step": 16160 - }, - { - "epoch": 0.37, - "grad_norm": 3.9518237113952637, - "learning_rate": 9.376271186440679e-06, - "loss": 1.4116, - "step": 16170 - }, - { - "epoch": 0.37, - "grad_norm": 7.87483024597168, - "learning_rate": 9.36949152542373e-06, - "loss": 1.4171, - "step": 16180 - }, - { - "epoch": 0.37, - "grad_norm": 1.998564600944519, - "learning_rate": 9.36271186440678e-06, - "loss": 1.275, - "step": 16190 - }, - { - "epoch": 0.38, - "grad_norm": 1.7648671865463257, - "learning_rate": 9.355932203389831e-06, - "loss": 1.155, - "step": 16200 - }, - { - "epoch": 0.38, - "grad_norm": 3.3635053634643555, - "learning_rate": 9.349152542372883e-06, - "loss": 1.245, - "step": 16210 - }, - { - "epoch": 0.38, - "grad_norm": 5.173689842224121, - "learning_rate": 9.342372881355934e-06, - "loss": 1.1604, - "step": 16220 - }, - { - "epoch": 0.38, - "grad_norm": 7.293972492218018, - "learning_rate": 9.335593220338984e-06, - "loss": 1.3044, - "step": 16230 - }, - { - "epoch": 0.38, - "grad_norm": 2.6143198013305664, - "learning_rate": 9.328813559322034e-06, - "loss": 1.0515, - "step": 16240 - }, - { - "epoch": 0.38, - "grad_norm": 10.650688171386719, - "learning_rate": 9.322033898305085e-06, - "loss": 1.0784, - "step": 16250 - }, - { - "epoch": 0.38, - "grad_norm": 3.8562324047088623, - "learning_rate": 9.315254237288136e-06, - "loss": 1.1731, - "step": 16260 - }, - { - "epoch": 0.38, - "grad_norm": 7.339954853057861, - "learning_rate": 9.308474576271188e-06, - "loss": 1.3072, - "step": 16270 - }, - { - "epoch": 0.38, - "grad_norm": 11.781778335571289, - "learning_rate": 9.301694915254237e-06, - "loss": 1.369, - "step": 16280 - }, - { - "epoch": 0.38, - "grad_norm": 2.4666757583618164, - "learning_rate": 9.294915254237289e-06, - "loss": 1.3628, - "step": 16290 - }, - { - "epoch": 0.38, - "grad_norm": 3.2646923065185547, - "learning_rate": 9.28813559322034e-06, - "loss": 1.33, - "step": 16300 - }, - { - "epoch": 0.38, - "grad_norm": 2.234771251678467, - "learning_rate": 9.28135593220339e-06, - "loss": 1.2017, - "step": 16310 - }, - { - "epoch": 0.38, - "grad_norm": 10.070908546447754, - "learning_rate": 9.274576271186441e-06, - "loss": 1.019, - "step": 16320 - }, - { - "epoch": 0.38, - "grad_norm": 2.7779605388641357, - "learning_rate": 9.267796610169493e-06, - "loss": 1.3126, - "step": 16330 - }, - { - "epoch": 0.38, - "grad_norm": 0.898559033870697, - "learning_rate": 9.261016949152544e-06, - "loss": 1.1969, - "step": 16340 - }, - { - "epoch": 0.38, - "grad_norm": 6.065027236938477, - "learning_rate": 9.254237288135594e-06, - "loss": 1.3629, - "step": 16350 - }, - { - "epoch": 0.38, - "grad_norm": 18.8370361328125, - "learning_rate": 9.247457627118645e-06, - "loss": 1.4472, - "step": 16360 - }, - { - "epoch": 0.38, - "grad_norm": 4.5132598876953125, - "learning_rate": 9.240677966101695e-06, - "loss": 1.219, - "step": 16370 - }, - { - "epoch": 0.38, - "grad_norm": 5.263208389282227, - "learning_rate": 9.233898305084746e-06, - "loss": 1.2699, - "step": 16380 - }, - { - "epoch": 0.38, - "grad_norm": 2.9755587577819824, - "learning_rate": 9.227118644067798e-06, - "loss": 1.3508, - "step": 16390 - }, - { - "epoch": 0.38, - "grad_norm": 2.9797961711883545, - "learning_rate": 9.220338983050847e-06, - "loss": 1.0866, - "step": 16400 - }, - { - "epoch": 0.38, - "grad_norm": 5.462897777557373, - "learning_rate": 9.213559322033899e-06, - "loss": 1.1406, - "step": 16410 - }, - { - "epoch": 0.38, - "grad_norm": 7.175684928894043, - "learning_rate": 9.20677966101695e-06, - "loss": 1.1563, - "step": 16420 - }, - { - "epoch": 0.38, - "grad_norm": 5.566606044769287, - "learning_rate": 9.200000000000002e-06, - "loss": 1.4053, - "step": 16430 - }, - { - "epoch": 0.38, - "grad_norm": 5.2373762130737305, - "learning_rate": 9.193220338983051e-06, - "loss": 1.3666, - "step": 16440 - }, - { - "epoch": 0.38, - "grad_norm": 2.5914816856384277, - "learning_rate": 9.186440677966101e-06, - "loss": 1.3011, - "step": 16450 - }, - { - "epoch": 0.38, - "grad_norm": 7.714232921600342, - "learning_rate": 9.179661016949153e-06, - "loss": 1.3538, - "step": 16460 - }, - { - "epoch": 0.38, - "grad_norm": 6.387141227722168, - "learning_rate": 9.172881355932204e-06, - "loss": 1.2126, - "step": 16470 - }, - { - "epoch": 0.38, - "grad_norm": 12.002847671508789, - "learning_rate": 9.166101694915255e-06, - "loss": 1.3061, - "step": 16480 - }, - { - "epoch": 0.38, - "grad_norm": 4.265731334686279, - "learning_rate": 9.159322033898305e-06, - "loss": 1.3956, - "step": 16490 - }, - { - "epoch": 0.38, - "grad_norm": 7.408833980560303, - "learning_rate": 9.152542372881356e-06, - "loss": 1.2394, - "step": 16500 - }, - { - "epoch": 0.38, - "eval_loss": 1.063765048980713, - "eval_runtime": 65.9613, - "eval_samples_per_second": 15.16, - "eval_steps_per_second": 15.16, - "step": 16500 - }, - { - "epoch": 0.38, - "grad_norm": 1.7360392808914185, - "learning_rate": 9.145762711864408e-06, - "loss": 1.2752, - "step": 16510 - }, - { - "epoch": 0.38, - "grad_norm": 4.30725622177124, - "learning_rate": 9.13898305084746e-06, - "loss": 1.4356, - "step": 16520 - }, - { - "epoch": 0.38, - "grad_norm": 3.4266469478607178, - "learning_rate": 9.132203389830509e-06, - "loss": 1.2648, - "step": 16530 - }, - { - "epoch": 0.38, - "grad_norm": 4.342678070068359, - "learning_rate": 9.12542372881356e-06, - "loss": 1.452, - "step": 16540 - }, - { - "epoch": 0.38, - "grad_norm": 3.7193539142608643, - "learning_rate": 9.11864406779661e-06, - "loss": 1.2244, - "step": 16550 - }, - { - "epoch": 0.38, - "grad_norm": 4.744646072387695, - "learning_rate": 9.111864406779661e-06, - "loss": 1.1743, - "step": 16560 - }, - { - "epoch": 0.38, - "grad_norm": 6.784905910491943, - "learning_rate": 9.105084745762713e-06, - "loss": 1.3432, - "step": 16570 - }, - { - "epoch": 0.38, - "grad_norm": 3.6726789474487305, - "learning_rate": 9.098305084745763e-06, - "loss": 1.4602, - "step": 16580 - }, - { - "epoch": 0.38, - "grad_norm": 2.33778715133667, - "learning_rate": 9.091525423728814e-06, - "loss": 1.4069, - "step": 16590 - }, - { - "epoch": 0.38, - "grad_norm": 9.637640953063965, - "learning_rate": 9.084745762711865e-06, - "loss": 1.408, - "step": 16600 - }, - { - "epoch": 0.38, - "grad_norm": 2.0369181632995605, - "learning_rate": 9.077966101694917e-06, - "loss": 1.3524, - "step": 16610 - }, - { - "epoch": 0.38, - "grad_norm": 4.292532920837402, - "learning_rate": 9.071186440677966e-06, - "loss": 1.1809, - "step": 16620 - }, - { - "epoch": 0.39, - "grad_norm": 7.0022735595703125, - "learning_rate": 9.064406779661018e-06, - "loss": 1.2977, - "step": 16630 - }, - { - "epoch": 0.39, - "grad_norm": 10.691664695739746, - "learning_rate": 9.05762711864407e-06, - "loss": 1.2714, - "step": 16640 - }, - { - "epoch": 0.39, - "grad_norm": 11.661876678466797, - "learning_rate": 9.05084745762712e-06, - "loss": 1.3014, - "step": 16650 - }, - { - "epoch": 0.39, - "grad_norm": 11.593371391296387, - "learning_rate": 9.04406779661017e-06, - "loss": 1.2261, - "step": 16660 - }, - { - "epoch": 0.39, - "grad_norm": 4.483781337738037, - "learning_rate": 9.03728813559322e-06, - "loss": 1.3501, - "step": 16670 - }, - { - "epoch": 0.39, - "grad_norm": 4.500730037689209, - "learning_rate": 9.030508474576271e-06, - "loss": 1.2256, - "step": 16680 - }, - { - "epoch": 0.39, - "grad_norm": 3.4255197048187256, - "learning_rate": 9.023728813559323e-06, - "loss": 1.2245, - "step": 16690 - }, - { - "epoch": 0.39, - "grad_norm": 5.569160461425781, - "learning_rate": 9.016949152542374e-06, - "loss": 1.4078, - "step": 16700 - }, - { - "epoch": 0.39, - "grad_norm": 1.7080422639846802, - "learning_rate": 9.010169491525424e-06, - "loss": 1.4236, - "step": 16710 - }, - { - "epoch": 0.39, - "grad_norm": 4.914741516113281, - "learning_rate": 9.003389830508475e-06, - "loss": 1.2597, - "step": 16720 - }, - { - "epoch": 0.39, - "grad_norm": 4.742468357086182, - "learning_rate": 8.996610169491527e-06, - "loss": 1.1909, - "step": 16730 - }, - { - "epoch": 0.39, - "grad_norm": 1.7503118515014648, - "learning_rate": 8.989830508474578e-06, - "loss": 1.3589, - "step": 16740 - }, - { - "epoch": 0.39, - "grad_norm": 4.5486040115356445, - "learning_rate": 8.983050847457628e-06, - "loss": 1.3431, - "step": 16750 - }, - { - "epoch": 0.39, - "grad_norm": 1.0202205181121826, - "learning_rate": 8.976271186440678e-06, - "loss": 1.3625, - "step": 16760 - }, - { - "epoch": 0.39, - "grad_norm": 7.154101371765137, - "learning_rate": 8.969491525423729e-06, - "loss": 1.1777, - "step": 16770 - }, - { - "epoch": 0.39, - "grad_norm": 2.6144134998321533, - "learning_rate": 8.96271186440678e-06, - "loss": 1.2143, - "step": 16780 - }, - { - "epoch": 0.39, - "grad_norm": 3.3913540840148926, - "learning_rate": 8.955932203389832e-06, - "loss": 1.2127, - "step": 16790 - }, - { - "epoch": 0.39, - "grad_norm": 3.7216756343841553, - "learning_rate": 8.949152542372881e-06, - "loss": 1.1532, - "step": 16800 - }, - { - "epoch": 0.39, - "grad_norm": 8.174360275268555, - "learning_rate": 8.942372881355933e-06, - "loss": 1.2403, - "step": 16810 - }, - { - "epoch": 0.39, - "grad_norm": 4.247581958770752, - "learning_rate": 8.935593220338984e-06, - "loss": 1.2231, - "step": 16820 - }, - { - "epoch": 0.39, - "grad_norm": 2.393038749694824, - "learning_rate": 8.928813559322036e-06, - "loss": 1.3235, - "step": 16830 - }, - { - "epoch": 0.39, - "grad_norm": 1.777172327041626, - "learning_rate": 8.922033898305085e-06, - "loss": 1.2097, - "step": 16840 - }, - { - "epoch": 0.39, - "grad_norm": 4.403905868530273, - "learning_rate": 8.915254237288137e-06, - "loss": 1.1161, - "step": 16850 - }, - { - "epoch": 0.39, - "grad_norm": 3.6407907009124756, - "learning_rate": 8.908474576271188e-06, - "loss": 1.3792, - "step": 16860 - }, - { - "epoch": 0.39, - "grad_norm": 4.673400402069092, - "learning_rate": 8.901694915254238e-06, - "loss": 1.2941, - "step": 16870 - }, - { - "epoch": 0.39, - "grad_norm": 5.5751848220825195, - "learning_rate": 8.89491525423729e-06, - "loss": 1.2355, - "step": 16880 - }, - { - "epoch": 0.39, - "grad_norm": 4.737181186676025, - "learning_rate": 8.888135593220339e-06, - "loss": 1.1943, - "step": 16890 - }, - { - "epoch": 0.39, - "grad_norm": 4.271194934844971, - "learning_rate": 8.88135593220339e-06, - "loss": 1.2646, - "step": 16900 - }, - { - "epoch": 0.39, - "grad_norm": 1.9672961235046387, - "learning_rate": 8.874576271186442e-06, - "loss": 1.3767, - "step": 16910 - }, - { - "epoch": 0.39, - "grad_norm": 3.734816074371338, - "learning_rate": 8.867796610169492e-06, - "loss": 1.2552, - "step": 16920 - }, - { - "epoch": 0.39, - "grad_norm": 4.603008270263672, - "learning_rate": 8.861016949152543e-06, - "loss": 1.428, - "step": 16930 - }, - { - "epoch": 0.39, - "grad_norm": 9.842779159545898, - "learning_rate": 8.854237288135594e-06, - "loss": 1.2608, - "step": 16940 - }, - { - "epoch": 0.39, - "grad_norm": 3.2394957542419434, - "learning_rate": 8.847457627118646e-06, - "loss": 1.1189, - "step": 16950 - }, - { - "epoch": 0.39, - "grad_norm": 4.726215362548828, - "learning_rate": 8.840677966101695e-06, - "loss": 1.3895, - "step": 16960 - }, - { - "epoch": 0.39, - "grad_norm": 4.315942287445068, - "learning_rate": 8.833898305084747e-06, - "loss": 1.2961, - "step": 16970 - }, - { - "epoch": 0.39, - "grad_norm": 2.8540353775024414, - "learning_rate": 8.827118644067797e-06, - "loss": 1.2502, - "step": 16980 - }, - { - "epoch": 0.39, - "grad_norm": 5.723995685577393, - "learning_rate": 8.820338983050848e-06, - "loss": 1.3241, - "step": 16990 - }, - { - "epoch": 0.39, - "grad_norm": 4.132941246032715, - "learning_rate": 8.8135593220339e-06, - "loss": 1.2777, - "step": 17000 - }, - { - "epoch": 0.39, - "eval_loss": 1.0359538793563843, - "eval_runtime": 66.4953, - "eval_samples_per_second": 15.039, - "eval_steps_per_second": 15.039, - "step": 17000 - }, - { - "epoch": 0.39, - "grad_norm": 3.6101505756378174, - "learning_rate": 8.806779661016949e-06, - "loss": 1.302, - "step": 17010 - }, - { - "epoch": 0.39, - "grad_norm": 2.8257572650909424, - "learning_rate": 8.8e-06, - "loss": 1.3581, - "step": 17020 - }, - { - "epoch": 0.39, - "grad_norm": 2.660344123840332, - "learning_rate": 8.793220338983052e-06, - "loss": 1.2804, - "step": 17030 - }, - { - "epoch": 0.39, - "grad_norm": 9.050156593322754, - "learning_rate": 8.786440677966103e-06, - "loss": 1.2465, - "step": 17040 - }, - { - "epoch": 0.39, - "grad_norm": 3.4612698554992676, - "learning_rate": 8.779661016949153e-06, - "loss": 1.4898, - "step": 17050 - }, - { - "epoch": 0.39, - "grad_norm": 10.567662239074707, - "learning_rate": 8.772881355932204e-06, - "loss": 1.1548, - "step": 17060 - }, - { - "epoch": 0.4, - "grad_norm": 5.1014275550842285, - "learning_rate": 8.766101694915254e-06, - "loss": 1.3642, - "step": 17070 - }, - { - "epoch": 0.4, - "grad_norm": 7.804174423217773, - "learning_rate": 8.759322033898305e-06, - "loss": 1.2271, - "step": 17080 - }, - { - "epoch": 0.4, - "grad_norm": 1.9340236186981201, - "learning_rate": 8.752542372881357e-06, - "loss": 1.3143, - "step": 17090 - }, - { - "epoch": 0.4, - "grad_norm": 7.017419815063477, - "learning_rate": 8.745762711864407e-06, - "loss": 1.189, - "step": 17100 - }, - { - "epoch": 0.4, - "grad_norm": 7.2211995124816895, - "learning_rate": 8.738983050847458e-06, - "loss": 1.3399, - "step": 17110 - }, - { - "epoch": 0.4, - "grad_norm": 4.3273162841796875, - "learning_rate": 8.73220338983051e-06, - "loss": 1.3636, - "step": 17120 - }, - { - "epoch": 0.4, - "grad_norm": 4.173155784606934, - "learning_rate": 8.72542372881356e-06, - "loss": 1.2755, - "step": 17130 - }, - { - "epoch": 0.4, - "grad_norm": 3.535107374191284, - "learning_rate": 8.71864406779661e-06, - "loss": 1.2036, - "step": 17140 - }, - { - "epoch": 0.4, - "grad_norm": 5.383171081542969, - "learning_rate": 8.711864406779662e-06, - "loss": 1.2716, - "step": 17150 - }, - { - "epoch": 0.4, - "grad_norm": 3.0449705123901367, - "learning_rate": 8.705084745762713e-06, - "loss": 1.287, - "step": 17160 - }, - { - "epoch": 0.4, - "grad_norm": 6.901495933532715, - "learning_rate": 8.698305084745765e-06, - "loss": 1.4108, - "step": 17170 - }, - { - "epoch": 0.4, - "grad_norm": 3.3814964294433594, - "learning_rate": 8.691525423728814e-06, - "loss": 1.3023, - "step": 17180 - }, - { - "epoch": 0.4, - "grad_norm": 2.271169424057007, - "learning_rate": 8.684745762711864e-06, - "loss": 1.2989, - "step": 17190 - }, - { - "epoch": 0.4, - "grad_norm": 3.1464428901672363, - "learning_rate": 8.677966101694915e-06, - "loss": 1.2671, - "step": 17200 - }, - { - "epoch": 0.4, - "grad_norm": 3.007338047027588, - "learning_rate": 8.671186440677967e-06, - "loss": 1.3452, - "step": 17210 - }, - { - "epoch": 0.4, - "grad_norm": 5.046366214752197, - "learning_rate": 8.664406779661018e-06, - "loss": 1.3612, - "step": 17220 - }, - { - "epoch": 0.4, - "grad_norm": 2.2831978797912598, - "learning_rate": 8.657627118644068e-06, - "loss": 1.2544, - "step": 17230 - }, - { - "epoch": 0.4, - "grad_norm": 2.5876657962799072, - "learning_rate": 8.65084745762712e-06, - "loss": 1.1922, - "step": 17240 - }, - { - "epoch": 0.4, - "grad_norm": 3.5324642658233643, - "learning_rate": 8.64406779661017e-06, - "loss": 1.2893, - "step": 17250 - }, - { - "epoch": 0.4, - "grad_norm": 3.1740715503692627, - "learning_rate": 8.637288135593222e-06, - "loss": 1.3322, - "step": 17260 - }, - { - "epoch": 0.4, - "grad_norm": 6.979104995727539, - "learning_rate": 8.630508474576272e-06, - "loss": 1.0987, - "step": 17270 - }, - { - "epoch": 0.4, - "grad_norm": 2.1378564834594727, - "learning_rate": 8.623728813559322e-06, - "loss": 1.2972, - "step": 17280 - }, - { - "epoch": 0.4, - "grad_norm": 3.754072666168213, - "learning_rate": 8.616949152542373e-06, - "loss": 1.1845, - "step": 17290 - }, - { - "epoch": 0.4, - "grad_norm": 5.770691394805908, - "learning_rate": 8.610169491525424e-06, - "loss": 1.2964, - "step": 17300 - }, - { - "epoch": 0.4, - "grad_norm": 1.7626854181289673, - "learning_rate": 8.603389830508476e-06, - "loss": 1.2309, - "step": 17310 - }, - { - "epoch": 0.4, - "grad_norm": 4.450747966766357, - "learning_rate": 8.596610169491526e-06, - "loss": 1.3676, - "step": 17320 - }, - { - "epoch": 0.4, - "grad_norm": 4.775254249572754, - "learning_rate": 8.589830508474577e-06, - "loss": 1.3344, - "step": 17330 - }, - { - "epoch": 0.4, - "grad_norm": 8.103263854980469, - "learning_rate": 8.583050847457628e-06, - "loss": 1.3521, - "step": 17340 - }, - { - "epoch": 0.4, - "grad_norm": 6.5187201499938965, - "learning_rate": 8.57627118644068e-06, - "loss": 1.323, - "step": 17350 - }, - { - "epoch": 0.4, - "grad_norm": 3.5734620094299316, - "learning_rate": 8.56949152542373e-06, - "loss": 1.3845, - "step": 17360 - }, - { - "epoch": 0.4, - "grad_norm": 6.281393527984619, - "learning_rate": 8.56271186440678e-06, - "loss": 1.2092, - "step": 17370 - }, - { - "epoch": 0.4, - "grad_norm": 3.1913156509399414, - "learning_rate": 8.55593220338983e-06, - "loss": 1.2908, - "step": 17380 - }, - { - "epoch": 0.4, - "grad_norm": 12.576632499694824, - "learning_rate": 8.549152542372882e-06, - "loss": 1.2864, - "step": 17390 - }, - { - "epoch": 0.4, - "grad_norm": 4.168999195098877, - "learning_rate": 8.542372881355933e-06, - "loss": 1.3254, - "step": 17400 - }, - { - "epoch": 0.4, - "grad_norm": 4.167811870574951, - "learning_rate": 8.535593220338983e-06, - "loss": 1.2052, - "step": 17410 - }, - { - "epoch": 0.4, - "grad_norm": 6.4759135246276855, - "learning_rate": 8.528813559322034e-06, - "loss": 1.2101, - "step": 17420 - }, - { - "epoch": 0.4, - "grad_norm": 4.099415302276611, - "learning_rate": 8.522033898305086e-06, - "loss": 1.2088, - "step": 17430 - }, - { - "epoch": 0.4, - "grad_norm": 5.991481781005859, - "learning_rate": 8.515254237288136e-06, - "loss": 1.3606, - "step": 17440 - }, - { - "epoch": 0.4, - "grad_norm": 5.722553253173828, - "learning_rate": 8.508474576271187e-06, - "loss": 1.2074, - "step": 17450 - }, - { - "epoch": 0.4, - "grad_norm": 5.729592800140381, - "learning_rate": 8.501694915254238e-06, - "loss": 1.2838, - "step": 17460 - }, - { - "epoch": 0.4, - "grad_norm": 3.647024393081665, - "learning_rate": 8.49491525423729e-06, - "loss": 1.2778, - "step": 17470 - }, - { - "epoch": 0.4, - "grad_norm": 4.270313262939453, - "learning_rate": 8.48813559322034e-06, - "loss": 1.3161, - "step": 17480 - }, - { - "epoch": 0.4, - "grad_norm": 8.540990829467773, - "learning_rate": 8.481355932203391e-06, - "loss": 1.1557, - "step": 17490 - }, - { - "epoch": 0.41, - "grad_norm": 3.2297253608703613, - "learning_rate": 8.47457627118644e-06, - "loss": 1.4842, - "step": 17500 - }, - { - "epoch": 0.41, - "eval_loss": 1.0164830684661865, - "eval_runtime": 67.3932, - "eval_samples_per_second": 14.838, - "eval_steps_per_second": 14.838, - "step": 17500 - }, - { - "epoch": 0.41, - "grad_norm": 1.271462082862854, - "learning_rate": 8.467796610169492e-06, - "loss": 1.3579, - "step": 17510 - }, - { - "epoch": 0.41, - "grad_norm": 3.716254711151123, - "learning_rate": 8.461016949152543e-06, - "loss": 1.3783, - "step": 17520 - }, - { - "epoch": 0.41, - "grad_norm": 4.856709957122803, - "learning_rate": 8.454237288135593e-06, - "loss": 1.2537, - "step": 17530 - }, - { - "epoch": 0.41, - "grad_norm": 11.454696655273438, - "learning_rate": 8.447457627118644e-06, - "loss": 1.1546, - "step": 17540 - }, - { - "epoch": 0.41, - "grad_norm": 3.388411045074463, - "learning_rate": 8.440677966101696e-06, - "loss": 1.3001, - "step": 17550 - }, - { - "epoch": 0.41, - "grad_norm": 1.902894139289856, - "learning_rate": 8.433898305084747e-06, - "loss": 1.1925, - "step": 17560 - }, - { - "epoch": 0.41, - "grad_norm": 2.1532018184661865, - "learning_rate": 8.427118644067797e-06, - "loss": 1.3065, - "step": 17570 - }, - { - "epoch": 0.41, - "grad_norm": 3.9372098445892334, - "learning_rate": 8.420338983050848e-06, - "loss": 1.3958, - "step": 17580 - }, - { - "epoch": 0.41, - "grad_norm": 7.873089790344238, - "learning_rate": 8.413559322033898e-06, - "loss": 1.1727, - "step": 17590 - }, - { - "epoch": 0.41, - "grad_norm": 3.4659693241119385, - "learning_rate": 8.40677966101695e-06, - "loss": 1.355, - "step": 17600 - }, - { - "epoch": 0.41, - "grad_norm": 18.613733291625977, - "learning_rate": 8.400000000000001e-06, - "loss": 1.3742, - "step": 17610 - }, - { - "epoch": 0.41, - "grad_norm": 3.247617721557617, - "learning_rate": 8.39322033898305e-06, - "loss": 1.1606, - "step": 17620 - }, - { - "epoch": 0.41, - "grad_norm": 2.1550047397613525, - "learning_rate": 8.386440677966102e-06, - "loss": 1.3164, - "step": 17630 - }, - { - "epoch": 0.41, - "grad_norm": 6.266103744506836, - "learning_rate": 8.379661016949153e-06, - "loss": 1.3522, - "step": 17640 - }, - { - "epoch": 0.41, - "grad_norm": 2.7962443828582764, - "learning_rate": 8.372881355932205e-06, - "loss": 1.4491, - "step": 17650 - }, - { - "epoch": 0.41, - "grad_norm": 5.119645118713379, - "learning_rate": 8.366101694915255e-06, - "loss": 1.447, - "step": 17660 - }, - { - "epoch": 0.41, - "grad_norm": 6.110286712646484, - "learning_rate": 8.359322033898306e-06, - "loss": 1.2631, - "step": 17670 - }, - { - "epoch": 0.41, - "grad_norm": 9.99429702758789, - "learning_rate": 8.352542372881357e-06, - "loss": 1.3755, - "step": 17680 - }, - { - "epoch": 0.41, - "grad_norm": 2.3583624362945557, - "learning_rate": 8.345762711864409e-06, - "loss": 1.4084, - "step": 17690 - }, - { - "epoch": 0.41, - "grad_norm": 3.867703437805176, - "learning_rate": 8.338983050847458e-06, - "loss": 1.3138, - "step": 17700 - }, - { - "epoch": 0.41, - "grad_norm": 8.489445686340332, - "learning_rate": 8.332203389830508e-06, - "loss": 1.2486, - "step": 17710 - }, - { - "epoch": 0.41, - "grad_norm": 5.573801040649414, - "learning_rate": 8.32542372881356e-06, - "loss": 1.2657, - "step": 17720 - }, - { - "epoch": 0.41, - "grad_norm": 5.751363754272461, - "learning_rate": 8.318644067796611e-06, - "loss": 1.2009, - "step": 17730 - }, - { - "epoch": 0.41, - "grad_norm": 3.761434316635132, - "learning_rate": 8.311864406779662e-06, - "loss": 1.3223, - "step": 17740 - }, - { - "epoch": 0.41, - "grad_norm": 3.0040767192840576, - "learning_rate": 8.305084745762712e-06, - "loss": 1.2842, - "step": 17750 - }, - { - "epoch": 0.41, - "grad_norm": 4.172005653381348, - "learning_rate": 8.298305084745763e-06, - "loss": 1.2445, - "step": 17760 - }, - { - "epoch": 0.41, - "grad_norm": 6.599450588226318, - "learning_rate": 8.291525423728815e-06, - "loss": 1.0738, - "step": 17770 - }, - { - "epoch": 0.41, - "grad_norm": 7.292091369628906, - "learning_rate": 8.284745762711866e-06, - "loss": 1.3037, - "step": 17780 - }, - { - "epoch": 0.41, - "grad_norm": 2.842538595199585, - "learning_rate": 8.277966101694916e-06, - "loss": 1.3071, - "step": 17790 - }, - { - "epoch": 0.41, - "grad_norm": 4.334558010101318, - "learning_rate": 8.271186440677966e-06, - "loss": 1.5332, - "step": 17800 - }, - { - "epoch": 0.41, - "grad_norm": 3.017799139022827, - "learning_rate": 8.264406779661017e-06, - "loss": 1.3143, - "step": 17810 - }, - { - "epoch": 0.41, - "grad_norm": 8.827996253967285, - "learning_rate": 8.257627118644068e-06, - "loss": 1.0828, - "step": 17820 - }, - { - "epoch": 0.41, - "grad_norm": 10.552117347717285, - "learning_rate": 8.25084745762712e-06, - "loss": 1.2198, - "step": 17830 - }, - { - "epoch": 0.41, - "grad_norm": 4.006974697113037, - "learning_rate": 8.24406779661017e-06, - "loss": 1.2621, - "step": 17840 - }, - { - "epoch": 0.41, - "grad_norm": 4.397966384887695, - "learning_rate": 8.237288135593221e-06, - "loss": 1.1562, - "step": 17850 - }, - { - "epoch": 0.41, - "grad_norm": 7.467845916748047, - "learning_rate": 8.230508474576272e-06, - "loss": 1.3113, - "step": 17860 - }, - { - "epoch": 0.41, - "grad_norm": 3.8428890705108643, - "learning_rate": 8.223728813559324e-06, - "loss": 0.9949, - "step": 17870 - }, - { - "epoch": 0.41, - "grad_norm": 3.2165286540985107, - "learning_rate": 8.216949152542373e-06, - "loss": 1.3019, - "step": 17880 - }, - { - "epoch": 0.41, - "grad_norm": 9.67989444732666, - "learning_rate": 8.210169491525425e-06, - "loss": 1.2628, - "step": 17890 - }, - { - "epoch": 0.41, - "grad_norm": 0.9366636276245117, - "learning_rate": 8.203389830508475e-06, - "loss": 1.0139, - "step": 17900 - }, - { - "epoch": 0.41, - "grad_norm": 6.279294967651367, - "learning_rate": 8.196610169491526e-06, - "loss": 1.2977, - "step": 17910 - }, - { - "epoch": 0.41, - "grad_norm": 5.070333003997803, - "learning_rate": 8.189830508474577e-06, - "loss": 1.3068, - "step": 17920 - }, - { - "epoch": 0.42, - "grad_norm": 3.6785717010498047, - "learning_rate": 8.183050847457627e-06, - "loss": 1.2096, - "step": 17930 - }, - { - "epoch": 0.42, - "grad_norm": 2.7473363876342773, - "learning_rate": 8.176271186440678e-06, - "loss": 1.2947, - "step": 17940 - }, - { - "epoch": 0.42, - "grad_norm": 8.139301300048828, - "learning_rate": 8.16949152542373e-06, - "loss": 1.3725, - "step": 17950 - }, - { - "epoch": 0.42, - "grad_norm": 5.076933860778809, - "learning_rate": 8.162711864406781e-06, - "loss": 1.3476, - "step": 17960 - }, - { - "epoch": 0.42, - "grad_norm": 35.75548553466797, - "learning_rate": 8.155932203389831e-06, - "loss": 1.2893, - "step": 17970 - }, - { - "epoch": 0.42, - "grad_norm": 5.612383842468262, - "learning_rate": 8.149152542372882e-06, - "loss": 1.278, - "step": 17980 - }, - { - "epoch": 0.42, - "grad_norm": 4.277181148529053, - "learning_rate": 8.142372881355934e-06, - "loss": 1.2526, - "step": 17990 - }, - { - "epoch": 0.42, - "grad_norm": 3.2299575805664062, - "learning_rate": 8.135593220338983e-06, - "loss": 1.2866, - "step": 18000 - }, - { - "epoch": 0.42, - "eval_loss": 1.0518407821655273, - "eval_runtime": 67.5281, - "eval_samples_per_second": 14.809, - "eval_steps_per_second": 14.809, - "step": 18000 - }, - { - "epoch": 0.42, - "grad_norm": 3.5516388416290283, - "learning_rate": 8.128813559322035e-06, - "loss": 1.3295, - "step": 18010 - }, - { - "epoch": 0.42, - "grad_norm": 5.150609970092773, - "learning_rate": 8.122033898305085e-06, - "loss": 1.1718, - "step": 18020 - }, - { - "epoch": 0.42, - "grad_norm": 9.092333793640137, - "learning_rate": 8.115254237288136e-06, - "loss": 1.2794, - "step": 18030 - }, - { - "epoch": 0.42, - "grad_norm": 4.030599117279053, - "learning_rate": 8.108474576271187e-06, - "loss": 1.3244, - "step": 18040 - }, - { - "epoch": 0.42, - "grad_norm": 2.527414083480835, - "learning_rate": 8.101694915254237e-06, - "loss": 1.3997, - "step": 18050 - }, - { - "epoch": 0.42, - "grad_norm": 3.05289626121521, - "learning_rate": 8.094915254237289e-06, - "loss": 1.4377, - "step": 18060 - }, - { - "epoch": 0.42, - "grad_norm": 1.4701480865478516, - "learning_rate": 8.08813559322034e-06, - "loss": 1.2932, - "step": 18070 - }, - { - "epoch": 0.42, - "grad_norm": 7.444494247436523, - "learning_rate": 8.081355932203391e-06, - "loss": 1.171, - "step": 18080 - }, - { - "epoch": 0.42, - "grad_norm": 3.2663989067077637, - "learning_rate": 8.074576271186441e-06, - "loss": 1.4609, - "step": 18090 - }, - { - "epoch": 0.42, - "grad_norm": 3.3425662517547607, - "learning_rate": 8.067796610169492e-06, - "loss": 1.2233, - "step": 18100 - }, - { - "epoch": 0.42, - "grad_norm": 4.549169063568115, - "learning_rate": 8.061016949152542e-06, - "loss": 1.3096, - "step": 18110 - }, - { - "epoch": 0.42, - "grad_norm": 7.133522033691406, - "learning_rate": 8.054237288135594e-06, - "loss": 1.3431, - "step": 18120 - }, - { - "epoch": 0.42, - "grad_norm": 3.107785224914551, - "learning_rate": 8.047457627118645e-06, - "loss": 1.2448, - "step": 18130 - }, - { - "epoch": 0.42, - "grad_norm": 3.182917356491089, - "learning_rate": 8.040677966101695e-06, - "loss": 1.2348, - "step": 18140 - }, - { - "epoch": 0.42, - "grad_norm": 10.907735824584961, - "learning_rate": 8.033898305084746e-06, - "loss": 1.102, - "step": 18150 - }, - { - "epoch": 0.42, - "grad_norm": 4.007683753967285, - "learning_rate": 8.027118644067797e-06, - "loss": 1.3091, - "step": 18160 - }, - { - "epoch": 0.42, - "grad_norm": 7.087812900543213, - "learning_rate": 8.020338983050849e-06, - "loss": 1.3511, - "step": 18170 - }, - { - "epoch": 0.42, - "grad_norm": 3.0910608768463135, - "learning_rate": 8.013559322033899e-06, - "loss": 1.2003, - "step": 18180 - }, - { - "epoch": 0.42, - "grad_norm": 8.238131523132324, - "learning_rate": 8.00677966101695e-06, - "loss": 1.0435, - "step": 18190 - }, - { - "epoch": 0.42, - "grad_norm": 1.3896546363830566, - "learning_rate": 8.000000000000001e-06, - "loss": 1.3478, - "step": 18200 - }, - { - "epoch": 0.42, - "grad_norm": 2.614786148071289, - "learning_rate": 7.993220338983053e-06, - "loss": 1.4047, - "step": 18210 - }, - { - "epoch": 0.42, - "grad_norm": 5.2004714012146, - "learning_rate": 7.986440677966102e-06, - "loss": 1.2459, - "step": 18220 - }, - { - "epoch": 0.42, - "grad_norm": 8.22817325592041, - "learning_rate": 7.979661016949152e-06, - "loss": 1.3499, - "step": 18230 - }, - { - "epoch": 0.42, - "grad_norm": 3.287627696990967, - "learning_rate": 7.972881355932204e-06, - "loss": 1.1953, - "step": 18240 - }, - { - "epoch": 0.42, - "grad_norm": 7.237671852111816, - "learning_rate": 7.966101694915255e-06, - "loss": 1.254, - "step": 18250 - }, - { - "epoch": 0.42, - "grad_norm": 2.2025198936462402, - "learning_rate": 7.959322033898306e-06, - "loss": 1.2611, - "step": 18260 - }, - { - "epoch": 0.42, - "grad_norm": 3.426156520843506, - "learning_rate": 7.952542372881356e-06, - "loss": 1.2885, - "step": 18270 - }, - { - "epoch": 0.42, - "grad_norm": 1.2702093124389648, - "learning_rate": 7.945762711864407e-06, - "loss": 1.1662, - "step": 18280 - }, - { - "epoch": 0.42, - "grad_norm": 4.970961570739746, - "learning_rate": 7.938983050847459e-06, - "loss": 1.3333, - "step": 18290 - }, - { - "epoch": 0.42, - "grad_norm": 11.826562881469727, - "learning_rate": 7.93220338983051e-06, - "loss": 1.2809, - "step": 18300 - }, - { - "epoch": 0.42, - "grad_norm": 5.391317367553711, - "learning_rate": 7.92542372881356e-06, - "loss": 1.2781, - "step": 18310 - }, - { - "epoch": 0.42, - "grad_norm": 4.817717552185059, - "learning_rate": 7.91864406779661e-06, - "loss": 1.2691, - "step": 18320 - }, - { - "epoch": 0.42, - "grad_norm": 7.846217632293701, - "learning_rate": 7.911864406779661e-06, - "loss": 1.3397, - "step": 18330 - }, - { - "epoch": 0.42, - "grad_norm": 2.251600742340088, - "learning_rate": 7.905084745762712e-06, - "loss": 1.1359, - "step": 18340 - }, - { - "epoch": 0.42, - "grad_norm": 2.6260218620300293, - "learning_rate": 7.898305084745764e-06, - "loss": 1.2603, - "step": 18350 - }, - { - "epoch": 0.43, - "grad_norm": 1.5109496116638184, - "learning_rate": 7.891525423728814e-06, - "loss": 1.1711, - "step": 18360 - }, - { - "epoch": 0.43, - "grad_norm": 4.166934013366699, - "learning_rate": 7.884745762711865e-06, - "loss": 1.2943, - "step": 18370 - }, - { - "epoch": 0.43, - "grad_norm": 2.4080865383148193, - "learning_rate": 7.877966101694916e-06, - "loss": 1.3551, - "step": 18380 - }, - { - "epoch": 0.43, - "grad_norm": 2.820498466491699, - "learning_rate": 7.871186440677968e-06, - "loss": 1.2382, - "step": 18390 - }, - { - "epoch": 0.43, - "grad_norm": 7.695006847381592, - "learning_rate": 7.864406779661017e-06, - "loss": 1.295, - "step": 18400 - }, - { - "epoch": 0.43, - "grad_norm": 9.21014404296875, - "learning_rate": 7.857627118644069e-06, - "loss": 1.0621, - "step": 18410 - }, - { - "epoch": 0.43, - "grad_norm": 2.656609535217285, - "learning_rate": 7.850847457627119e-06, - "loss": 1.2838, - "step": 18420 - }, - { - "epoch": 0.43, - "grad_norm": 7.399263381958008, - "learning_rate": 7.84406779661017e-06, - "loss": 1.2595, - "step": 18430 - }, - { - "epoch": 0.43, - "grad_norm": 2.139892101287842, - "learning_rate": 7.837288135593221e-06, - "loss": 1.2488, - "step": 18440 - }, - { - "epoch": 0.43, - "grad_norm": 3.8797223567962646, - "learning_rate": 7.830508474576271e-06, - "loss": 1.1638, - "step": 18450 - }, - { - "epoch": 0.43, - "grad_norm": 3.705308198928833, - "learning_rate": 7.823728813559322e-06, - "loss": 1.22, - "step": 18460 - }, - { - "epoch": 0.43, - "grad_norm": 7.386727333068848, - "learning_rate": 7.816949152542374e-06, - "loss": 1.3104, - "step": 18470 - }, - { - "epoch": 0.43, - "grad_norm": 3.4528744220733643, - "learning_rate": 7.810169491525425e-06, - "loss": 1.3548, - "step": 18480 - }, - { - "epoch": 0.43, - "grad_norm": 5.502497673034668, - "learning_rate": 7.803389830508475e-06, - "loss": 1.1125, - "step": 18490 - }, - { - "epoch": 0.43, - "grad_norm": 6.493344783782959, - "learning_rate": 7.796610169491526e-06, - "loss": 1.5306, - "step": 18500 - }, - { - "epoch": 0.43, - "eval_loss": 1.0274684429168701, - "eval_runtime": 68.6334, - "eval_samples_per_second": 14.57, - "eval_steps_per_second": 14.57, - "step": 18500 - }, - { - "epoch": 0.43, - "grad_norm": 8.007162094116211, - "learning_rate": 7.789830508474578e-06, - "loss": 1.4575, - "step": 18510 - }, - { - "epoch": 0.43, - "grad_norm": 4.510526657104492, - "learning_rate": 7.783050847457628e-06, - "loss": 1.3597, - "step": 18520 - }, - { - "epoch": 0.43, - "grad_norm": 6.399461269378662, - "learning_rate": 7.776271186440679e-06, - "loss": 1.249, - "step": 18530 - }, - { - "epoch": 0.43, - "grad_norm": 2.6798839569091797, - "learning_rate": 7.769491525423729e-06, - "loss": 1.1667, - "step": 18540 - }, - { - "epoch": 0.43, - "grad_norm": 3.981032133102417, - "learning_rate": 7.76271186440678e-06, - "loss": 1.4405, - "step": 18550 - }, - { - "epoch": 0.43, - "grad_norm": 3.538968086242676, - "learning_rate": 7.755932203389831e-06, - "loss": 1.2578, - "step": 18560 - }, - { - "epoch": 0.43, - "grad_norm": 1.8838539123535156, - "learning_rate": 7.749152542372881e-06, - "loss": 1.1573, - "step": 18570 - }, - { - "epoch": 0.43, - "grad_norm": 2.920963764190674, - "learning_rate": 7.742372881355933e-06, - "loss": 1.3485, - "step": 18580 - }, - { - "epoch": 0.43, - "grad_norm": 1.1590783596038818, - "learning_rate": 7.735593220338984e-06, - "loss": 1.28, - "step": 18590 - }, - { - "epoch": 0.43, - "grad_norm": 7.310429573059082, - "learning_rate": 7.728813559322035e-06, - "loss": 1.5065, - "step": 18600 - }, - { - "epoch": 0.43, - "grad_norm": 3.0086493492126465, - "learning_rate": 7.722033898305085e-06, - "loss": 1.4836, - "step": 18610 - }, - { - "epoch": 0.43, - "grad_norm": 3.2605090141296387, - "learning_rate": 7.715254237288136e-06, - "loss": 1.3881, - "step": 18620 - }, - { - "epoch": 0.43, - "grad_norm": 2.652811288833618, - "learning_rate": 7.708474576271186e-06, - "loss": 1.218, - "step": 18630 - }, - { - "epoch": 0.43, - "grad_norm": 1.251134991645813, - "learning_rate": 7.701694915254238e-06, - "loss": 1.3565, - "step": 18640 - }, - { - "epoch": 0.43, - "grad_norm": 4.377309322357178, - "learning_rate": 7.694915254237289e-06, - "loss": 1.1737, - "step": 18650 - }, - { - "epoch": 0.43, - "grad_norm": 6.178280353546143, - "learning_rate": 7.688135593220339e-06, - "loss": 1.241, - "step": 18660 - }, - { - "epoch": 0.43, - "grad_norm": 1.76272451877594, - "learning_rate": 7.68135593220339e-06, - "loss": 1.1855, - "step": 18670 - }, - { - "epoch": 0.43, - "grad_norm": 5.491523742675781, - "learning_rate": 7.674576271186441e-06, - "loss": 1.3062, - "step": 18680 - }, - { - "epoch": 0.43, - "grad_norm": 3.912921190261841, - "learning_rate": 7.667796610169493e-06, - "loss": 1.2213, - "step": 18690 - }, - { - "epoch": 0.43, - "grad_norm": 2.2884957790374756, - "learning_rate": 7.661016949152543e-06, - "loss": 1.2815, - "step": 18700 - }, - { - "epoch": 0.43, - "grad_norm": 2.6772258281707764, - "learning_rate": 7.654237288135594e-06, - "loss": 1.109, - "step": 18710 - }, - { - "epoch": 0.43, - "grad_norm": 5.152644157409668, - "learning_rate": 7.647457627118645e-06, - "loss": 1.2494, - "step": 18720 - }, - { - "epoch": 0.43, - "grad_norm": 6.88754940032959, - "learning_rate": 7.640677966101695e-06, - "loss": 1.3743, - "step": 18730 - }, - { - "epoch": 0.43, - "grad_norm": 5.835016250610352, - "learning_rate": 7.633898305084746e-06, - "loss": 1.4158, - "step": 18740 - }, - { - "epoch": 0.43, - "grad_norm": 5.073338508605957, - "learning_rate": 7.627118644067797e-06, - "loss": 1.1477, - "step": 18750 - }, - { - "epoch": 0.43, - "grad_norm": 3.2228505611419678, - "learning_rate": 7.6203389830508476e-06, - "loss": 0.979, - "step": 18760 - }, - { - "epoch": 0.43, - "grad_norm": 6.220423698425293, - "learning_rate": 7.613559322033899e-06, - "loss": 1.2679, - "step": 18770 - }, - { - "epoch": 0.43, - "grad_norm": 5.256744861602783, - "learning_rate": 7.6067796610169495e-06, - "loss": 1.1012, - "step": 18780 - }, - { - "epoch": 0.44, - "grad_norm": 7.843850612640381, - "learning_rate": 7.600000000000001e-06, - "loss": 1.2904, - "step": 18790 - }, - { - "epoch": 0.44, - "grad_norm": 7.586652755737305, - "learning_rate": 7.5932203389830515e-06, - "loss": 1.3106, - "step": 18800 - }, - { - "epoch": 0.44, - "grad_norm": 8.580673217773438, - "learning_rate": 7.586440677966103e-06, - "loss": 1.0683, - "step": 18810 - }, - { - "epoch": 0.44, - "grad_norm": 9.023046493530273, - "learning_rate": 7.5796610169491534e-06, - "loss": 1.3849, - "step": 18820 - }, - { - "epoch": 0.44, - "grad_norm": 8.47220516204834, - "learning_rate": 7.572881355932205e-06, - "loss": 1.3014, - "step": 18830 - }, - { - "epoch": 0.44, - "grad_norm": 4.808041572570801, - "learning_rate": 7.5661016949152545e-06, - "loss": 1.4718, - "step": 18840 - }, - { - "epoch": 0.44, - "grad_norm": 4.828659534454346, - "learning_rate": 7.559322033898305e-06, - "loss": 1.226, - "step": 18850 - }, - { - "epoch": 0.44, - "grad_norm": 3.637918710708618, - "learning_rate": 7.5525423728813565e-06, - "loss": 1.4644, - "step": 18860 - }, - { - "epoch": 0.44, - "grad_norm": 5.678231716156006, - "learning_rate": 7.545762711864407e-06, - "loss": 1.3399, - "step": 18870 - }, - { - "epoch": 0.44, - "grad_norm": 2.2599503993988037, - "learning_rate": 7.5389830508474584e-06, - "loss": 1.395, - "step": 18880 - }, - { - "epoch": 0.44, - "grad_norm": 1.7738438844680786, - "learning_rate": 7.532203389830509e-06, - "loss": 1.2668, - "step": 18890 - }, - { - "epoch": 0.44, - "grad_norm": 7.916548252105713, - "learning_rate": 7.52542372881356e-06, - "loss": 1.3632, - "step": 18900 - }, - { - "epoch": 0.44, - "grad_norm": 9.459602355957031, - "learning_rate": 7.518644067796611e-06, - "loss": 1.4194, - "step": 18910 - }, - { - "epoch": 0.44, - "grad_norm": 6.116416931152344, - "learning_rate": 7.511864406779662e-06, - "loss": 1.3118, - "step": 18920 - }, - { - "epoch": 0.44, - "grad_norm": 4.6954731941223145, - "learning_rate": 7.505084745762713e-06, - "loss": 1.1006, - "step": 18930 - }, - { - "epoch": 0.44, - "grad_norm": 2.977074384689331, - "learning_rate": 7.498305084745763e-06, - "loss": 1.0664, - "step": 18940 - }, - { - "epoch": 0.44, - "grad_norm": 4.856612682342529, - "learning_rate": 7.491525423728814e-06, - "loss": 1.2166, - "step": 18950 - }, - { - "epoch": 0.44, - "grad_norm": 4.739044189453125, - "learning_rate": 7.4847457627118646e-06, - "loss": 1.1675, - "step": 18960 - }, - { - "epoch": 0.44, - "grad_norm": 2.360950231552124, - "learning_rate": 7.477966101694916e-06, - "loss": 1.3736, - "step": 18970 - }, - { - "epoch": 0.44, - "grad_norm": 5.020655155181885, - "learning_rate": 7.4711864406779665e-06, - "loss": 1.4255, - "step": 18980 - }, - { - "epoch": 0.44, - "grad_norm": 10.082642555236816, - "learning_rate": 7.464406779661018e-06, - "loss": 1.2732, - "step": 18990 - }, - { - "epoch": 0.44, - "grad_norm": 4.41738748550415, - "learning_rate": 7.4576271186440685e-06, - "loss": 1.4897, - "step": 19000 - }, - { - "epoch": 0.44, - "eval_loss": 1.0319198369979858, - "eval_runtime": 66.943, - "eval_samples_per_second": 14.938, - "eval_steps_per_second": 14.938, - "step": 19000 - }, - { - "epoch": 0.44, - "grad_norm": 3.4302687644958496, - "learning_rate": 7.45084745762712e-06, - "loss": 1.1379, - "step": 19010 - }, - { - "epoch": 0.44, - "grad_norm": 4.5467529296875, - "learning_rate": 7.4440677966101704e-06, - "loss": 1.1605, - "step": 19020 - }, - { - "epoch": 0.44, - "grad_norm": 3.840532064437866, - "learning_rate": 7.437288135593221e-06, - "loss": 1.318, - "step": 19030 - }, - { - "epoch": 0.44, - "grad_norm": 8.278573989868164, - "learning_rate": 7.430508474576272e-06, - "loss": 1.4069, - "step": 19040 - }, - { - "epoch": 0.44, - "grad_norm": 8.6283597946167, - "learning_rate": 7.423728813559322e-06, - "loss": 1.2437, - "step": 19050 - }, - { - "epoch": 0.44, - "grad_norm": 5.682373523712158, - "learning_rate": 7.4169491525423735e-06, - "loss": 1.2322, - "step": 19060 - }, - { - "epoch": 0.44, - "grad_norm": 5.24190616607666, - "learning_rate": 7.410169491525424e-06, - "loss": 1.137, - "step": 19070 - }, - { - "epoch": 0.44, - "grad_norm": 5.345545291900635, - "learning_rate": 7.4033898305084754e-06, - "loss": 1.2763, - "step": 19080 - }, - { - "epoch": 0.44, - "grad_norm": 6.952364444732666, - "learning_rate": 7.396610169491526e-06, - "loss": 1.3103, - "step": 19090 - }, - { - "epoch": 0.44, - "grad_norm": 4.714378356933594, - "learning_rate": 7.3898305084745766e-06, - "loss": 1.3069, - "step": 19100 - }, - { - "epoch": 0.44, - "grad_norm": 1.7550067901611328, - "learning_rate": 7.383050847457628e-06, - "loss": 1.3664, - "step": 19110 - }, - { - "epoch": 0.44, - "grad_norm": 5.599852561950684, - "learning_rate": 7.3762711864406785e-06, - "loss": 1.2475, - "step": 19120 - }, - { - "epoch": 0.44, - "grad_norm": 7.171874046325684, - "learning_rate": 7.36949152542373e-06, - "loss": 1.1256, - "step": 19130 - }, - { - "epoch": 0.44, - "grad_norm": 2.445876359939575, - "learning_rate": 7.3627118644067805e-06, - "loss": 1.3442, - "step": 19140 - }, - { - "epoch": 0.44, - "grad_norm": 3.083585500717163, - "learning_rate": 7.355932203389831e-06, - "loss": 1.3258, - "step": 19150 - }, - { - "epoch": 0.44, - "grad_norm": 5.623142719268799, - "learning_rate": 7.3491525423728816e-06, - "loss": 1.1306, - "step": 19160 - }, - { - "epoch": 0.44, - "grad_norm": 7.293304920196533, - "learning_rate": 7.342372881355932e-06, - "loss": 1.2778, - "step": 19170 - }, - { - "epoch": 0.44, - "grad_norm": 2.301374673843384, - "learning_rate": 7.3355932203389835e-06, - "loss": 1.3221, - "step": 19180 - }, - { - "epoch": 0.44, - "grad_norm": 5.147269248962402, - "learning_rate": 7.328813559322034e-06, - "loss": 1.1004, - "step": 19190 - }, - { - "epoch": 0.44, - "grad_norm": 3.936070203781128, - "learning_rate": 7.3220338983050855e-06, - "loss": 1.3795, - "step": 19200 - }, - { - "epoch": 0.44, - "grad_norm": 1.6208925247192383, - "learning_rate": 7.315254237288136e-06, - "loss": 1.2793, - "step": 19210 - }, - { - "epoch": 0.44, - "grad_norm": 6.317408084869385, - "learning_rate": 7.3084745762711874e-06, - "loss": 1.3759, - "step": 19220 - }, - { - "epoch": 0.45, - "grad_norm": 3.9492716789245605, - "learning_rate": 7.301694915254238e-06, - "loss": 1.1688, - "step": 19230 - }, - { - "epoch": 0.45, - "grad_norm": 5.192453384399414, - "learning_rate": 7.294915254237289e-06, - "loss": 1.2945, - "step": 19240 - }, - { - "epoch": 0.45, - "grad_norm": 3.642240524291992, - "learning_rate": 7.288135593220339e-06, - "loss": 1.3392, - "step": 19250 - }, - { - "epoch": 0.45, - "grad_norm": 1.0650874376296997, - "learning_rate": 7.28135593220339e-06, - "loss": 1.202, - "step": 19260 - }, - { - "epoch": 0.45, - "grad_norm": 4.307705402374268, - "learning_rate": 7.274576271186441e-06, - "loss": 1.3651, - "step": 19270 - }, - { - "epoch": 0.45, - "grad_norm": 6.180150985717773, - "learning_rate": 7.267796610169492e-06, - "loss": 1.3021, - "step": 19280 - }, - { - "epoch": 0.45, - "grad_norm": 5.6004438400268555, - "learning_rate": 7.261016949152543e-06, - "loss": 1.2091, - "step": 19290 - }, - { - "epoch": 0.45, - "grad_norm": 1.1322782039642334, - "learning_rate": 7.2542372881355936e-06, - "loss": 1.1944, - "step": 19300 - }, - { - "epoch": 0.45, - "grad_norm": 4.901897430419922, - "learning_rate": 7.247457627118645e-06, - "loss": 1.2975, - "step": 19310 - }, - { - "epoch": 0.45, - "grad_norm": 4.5151190757751465, - "learning_rate": 7.2406779661016955e-06, - "loss": 1.2506, - "step": 19320 - }, - { - "epoch": 0.45, - "grad_norm": 3.7965264320373535, - "learning_rate": 7.233898305084747e-06, - "loss": 1.2635, - "step": 19330 - }, - { - "epoch": 0.45, - "grad_norm": 2.9827957153320312, - "learning_rate": 7.2271186440677975e-06, - "loss": 1.3371, - "step": 19340 - }, - { - "epoch": 0.45, - "grad_norm": 5.417064666748047, - "learning_rate": 7.220338983050849e-06, - "loss": 1.2549, - "step": 19350 - }, - { - "epoch": 0.45, - "grad_norm": 6.065445423126221, - "learning_rate": 7.2135593220338986e-06, - "loss": 1.1856, - "step": 19360 - }, - { - "epoch": 0.45, - "grad_norm": 5.525112628936768, - "learning_rate": 7.206779661016949e-06, - "loss": 1.0842, - "step": 19370 - }, - { - "epoch": 0.45, - "grad_norm": 9.134572982788086, - "learning_rate": 7.2000000000000005e-06, - "loss": 1.1286, - "step": 19380 - }, - { - "epoch": 0.45, - "grad_norm": 1.2406967878341675, - "learning_rate": 7.193220338983051e-06, - "loss": 1.0884, - "step": 19390 - }, - { - "epoch": 0.45, - "grad_norm": 3.5600786209106445, - "learning_rate": 7.1864406779661025e-06, - "loss": 1.299, - "step": 19400 - }, - { - "epoch": 0.45, - "grad_norm": 7.953360557556152, - "learning_rate": 7.179661016949153e-06, - "loss": 1.3167, - "step": 19410 - }, - { - "epoch": 0.45, - "grad_norm": 6.8465986251831055, - "learning_rate": 7.1728813559322044e-06, - "loss": 1.2951, - "step": 19420 - }, - { - "epoch": 0.45, - "grad_norm": 4.177648544311523, - "learning_rate": 7.166101694915255e-06, - "loss": 1.3032, - "step": 19430 - }, - { - "epoch": 0.45, - "grad_norm": 6.590419292449951, - "learning_rate": 7.159322033898306e-06, - "loss": 1.3326, - "step": 19440 - }, - { - "epoch": 0.45, - "grad_norm": 10.273443222045898, - "learning_rate": 7.152542372881357e-06, - "loss": 1.1746, - "step": 19450 - }, - { - "epoch": 0.45, - "grad_norm": 4.114923000335693, - "learning_rate": 7.145762711864407e-06, - "loss": 1.5382, - "step": 19460 - }, - { - "epoch": 0.45, - "grad_norm": 6.737642765045166, - "learning_rate": 7.138983050847458e-06, - "loss": 1.3364, - "step": 19470 - }, - { - "epoch": 0.45, - "grad_norm": 4.186859607696533, - "learning_rate": 7.132203389830509e-06, - "loss": 1.2651, - "step": 19480 - }, - { - "epoch": 0.45, - "grad_norm": 4.74080753326416, - "learning_rate": 7.12542372881356e-06, - "loss": 1.2772, - "step": 19490 - }, - { - "epoch": 0.45, - "grad_norm": 5.5528883934021, - "learning_rate": 7.1186440677966106e-06, - "loss": 1.2557, - "step": 19500 - }, - { - "epoch": 0.45, - "eval_loss": 1.0282971858978271, - "eval_runtime": 67.939, - "eval_samples_per_second": 14.719, - "eval_steps_per_second": 14.719, - "step": 19500 - }, - { - "epoch": 0.45, - "grad_norm": 3.818643093109131, - "learning_rate": 7.111864406779662e-06, - "loss": 1.3204, - "step": 19510 - }, - { - "epoch": 0.45, - "grad_norm": 5.886016845703125, - "learning_rate": 7.1050847457627125e-06, - "loss": 1.4398, - "step": 19520 - }, - { - "epoch": 0.45, - "grad_norm": 1.2431572675704956, - "learning_rate": 7.098305084745764e-06, - "loss": 1.1902, - "step": 19530 - }, - { - "epoch": 0.45, - "grad_norm": 11.248309135437012, - "learning_rate": 7.0915254237288145e-06, - "loss": 1.2207, - "step": 19540 - }, - { - "epoch": 0.45, - "grad_norm": 11.708429336547852, - "learning_rate": 7.084745762711865e-06, - "loss": 1.3756, - "step": 19550 - }, - { - "epoch": 0.45, - "grad_norm": 2.923466920852661, - "learning_rate": 7.077966101694916e-06, - "loss": 1.4096, - "step": 19560 - }, - { - "epoch": 0.45, - "grad_norm": 8.958571434020996, - "learning_rate": 7.071186440677966e-06, - "loss": 1.2582, - "step": 19570 - }, - { - "epoch": 0.45, - "grad_norm": 6.289533615112305, - "learning_rate": 7.0644067796610175e-06, - "loss": 1.1581, - "step": 19580 - }, - { - "epoch": 0.45, - "grad_norm": 2.9920499324798584, - "learning_rate": 7.057627118644068e-06, - "loss": 1.2995, - "step": 19590 - }, - { - "epoch": 0.45, - "grad_norm": 4.625074863433838, - "learning_rate": 7.0508474576271195e-06, - "loss": 1.211, - "step": 19600 - }, - { - "epoch": 0.45, - "grad_norm": 15.013193130493164, - "learning_rate": 7.04406779661017e-06, - "loss": 1.302, - "step": 19610 - }, - { - "epoch": 0.45, - "grad_norm": 4.741778373718262, - "learning_rate": 7.037288135593221e-06, - "loss": 1.3025, - "step": 19620 - }, - { - "epoch": 0.45, - "grad_norm": 9.325716972351074, - "learning_rate": 7.030508474576272e-06, - "loss": 1.2179, - "step": 19630 - }, - { - "epoch": 0.45, - "grad_norm": 5.192024230957031, - "learning_rate": 7.0237288135593225e-06, - "loss": 1.1417, - "step": 19640 - }, - { - "epoch": 0.45, - "grad_norm": 5.244385719299316, - "learning_rate": 7.016949152542374e-06, - "loss": 1.3456, - "step": 19650 - }, - { - "epoch": 0.46, - "grad_norm": 3.3191256523132324, - "learning_rate": 7.0101694915254245e-06, - "loss": 1.307, - "step": 19660 - }, - { - "epoch": 0.46, - "grad_norm": 6.4485039710998535, - "learning_rate": 7.003389830508475e-06, - "loss": 1.238, - "step": 19670 - }, - { - "epoch": 0.46, - "grad_norm": 8.261616706848145, - "learning_rate": 6.996610169491526e-06, - "loss": 1.1738, - "step": 19680 - }, - { - "epoch": 0.46, - "grad_norm": 6.185995101928711, - "learning_rate": 6.989830508474576e-06, - "loss": 1.3407, - "step": 19690 - }, - { - "epoch": 0.46, - "grad_norm": 9.428878784179688, - "learning_rate": 6.9830508474576275e-06, - "loss": 1.2947, - "step": 19700 - }, - { - "epoch": 0.46, - "grad_norm": 1.5929450988769531, - "learning_rate": 6.976271186440678e-06, - "loss": 1.025, - "step": 19710 - }, - { - "epoch": 0.46, - "grad_norm": 4.876119136810303, - "learning_rate": 6.9694915254237295e-06, - "loss": 1.4673, - "step": 19720 - }, - { - "epoch": 0.46, - "grad_norm": 2.259251356124878, - "learning_rate": 6.96271186440678e-06, - "loss": 1.5241, - "step": 19730 - }, - { - "epoch": 0.46, - "grad_norm": 4.431079864501953, - "learning_rate": 6.9559322033898315e-06, - "loss": 1.0948, - "step": 19740 - }, - { - "epoch": 0.46, - "grad_norm": 5.089422225952148, - "learning_rate": 6.949152542372882e-06, - "loss": 1.2419, - "step": 19750 - }, - { - "epoch": 0.46, - "grad_norm": 3.312706708908081, - "learning_rate": 6.942372881355933e-06, - "loss": 1.3715, - "step": 19760 - }, - { - "epoch": 0.46, - "grad_norm": 9.536430358886719, - "learning_rate": 6.935593220338983e-06, - "loss": 1.3488, - "step": 19770 - }, - { - "epoch": 0.46, - "grad_norm": 3.4993770122528076, - "learning_rate": 6.928813559322034e-06, - "loss": 1.2832, - "step": 19780 - }, - { - "epoch": 0.46, - "grad_norm": 3.5873186588287354, - "learning_rate": 6.922033898305085e-06, - "loss": 1.2135, - "step": 19790 - }, - { - "epoch": 0.46, - "grad_norm": 15.397970199584961, - "learning_rate": 6.915254237288136e-06, - "loss": 1.3824, - "step": 19800 - }, - { - "epoch": 0.46, - "grad_norm": 5.0459794998168945, - "learning_rate": 6.908474576271187e-06, - "loss": 1.298, - "step": 19810 - }, - { - "epoch": 0.46, - "grad_norm": 2.630267858505249, - "learning_rate": 6.901694915254238e-06, - "loss": 1.3083, - "step": 19820 - }, - { - "epoch": 0.46, - "grad_norm": 7.766806602478027, - "learning_rate": 6.894915254237289e-06, - "loss": 1.2522, - "step": 19830 - }, - { - "epoch": 0.46, - "grad_norm": 2.8050918579101562, - "learning_rate": 6.8881355932203395e-06, - "loss": 1.2368, - "step": 19840 - }, - { - "epoch": 0.46, - "grad_norm": 2.825176239013672, - "learning_rate": 6.881355932203391e-06, - "loss": 1.2529, - "step": 19850 - }, - { - "epoch": 0.46, - "grad_norm": 2.7948811054229736, - "learning_rate": 6.8745762711864415e-06, - "loss": 1.1087, - "step": 19860 - }, - { - "epoch": 0.46, - "grad_norm": 5.275668144226074, - "learning_rate": 6.867796610169493e-06, - "loss": 1.3271, - "step": 19870 - }, - { - "epoch": 0.46, - "grad_norm": 4.819080352783203, - "learning_rate": 6.861016949152543e-06, - "loss": 1.2839, - "step": 19880 - }, - { - "epoch": 0.46, - "grad_norm": 6.651522159576416, - "learning_rate": 6.854237288135593e-06, - "loss": 1.1781, - "step": 19890 - }, - { - "epoch": 0.46, - "grad_norm": 4.455416202545166, - "learning_rate": 6.8474576271186445e-06, - "loss": 1.3672, - "step": 19900 - }, - { - "epoch": 0.46, - "grad_norm": 5.660415172576904, - "learning_rate": 6.840677966101695e-06, - "loss": 1.1597, - "step": 19910 - }, - { - "epoch": 0.46, - "grad_norm": 4.537483215332031, - "learning_rate": 6.8338983050847465e-06, - "loss": 1.0678, - "step": 19920 - }, - { - "epoch": 0.46, - "grad_norm": 8.044699668884277, - "learning_rate": 6.827118644067797e-06, - "loss": 1.1595, - "step": 19930 - }, - { - "epoch": 0.46, - "grad_norm": 6.171726226806641, - "learning_rate": 6.8203389830508485e-06, - "loss": 1.3844, - "step": 19940 - }, - { - "epoch": 0.46, - "grad_norm": 7.469864368438721, - "learning_rate": 6.813559322033899e-06, - "loss": 1.3641, - "step": 19950 - }, - { - "epoch": 0.46, - "grad_norm": 1.8413645029067993, - "learning_rate": 6.80677966101695e-06, - "loss": 1.3229, - "step": 19960 - }, - { - "epoch": 0.46, - "grad_norm": 7.607746124267578, - "learning_rate": 6.800000000000001e-06, - "loss": 1.3169, - "step": 19970 - }, - { - "epoch": 0.46, - "grad_norm": 6.271143436431885, - "learning_rate": 6.793220338983051e-06, - "loss": 1.1564, - "step": 19980 - }, - { - "epoch": 0.46, - "grad_norm": 6.224188804626465, - "learning_rate": 6.786440677966102e-06, - "loss": 1.426, - "step": 19990 - }, - { - "epoch": 0.46, - "grad_norm": 5.205558776855469, - "learning_rate": 6.779661016949153e-06, - "loss": 1.0772, - "step": 20000 - }, - { - "epoch": 0.46, - "eval_loss": 0.9983803629875183, - "eval_runtime": 67.8281, - "eval_samples_per_second": 14.743, - "eval_steps_per_second": 14.743, - "step": 20000 - }, - { - "epoch": 0.46, - "grad_norm": 2.1063528060913086, - "learning_rate": 6.772881355932204e-06, - "loss": 1.2372, - "step": 20010 - }, - { - "epoch": 0.46, - "grad_norm": 8.878252983093262, - "learning_rate": 6.766101694915255e-06, - "loss": 1.3002, - "step": 20020 - }, - { - "epoch": 0.46, - "grad_norm": 4.35091495513916, - "learning_rate": 6.759322033898306e-06, - "loss": 1.2132, - "step": 20030 - }, - { - "epoch": 0.46, - "grad_norm": 3.8765366077423096, - "learning_rate": 6.7525423728813565e-06, - "loss": 1.2695, - "step": 20040 - }, - { - "epoch": 0.46, - "grad_norm": 3.4200284481048584, - "learning_rate": 6.745762711864408e-06, - "loss": 1.4985, - "step": 20050 - }, - { - "epoch": 0.46, - "grad_norm": 4.551451683044434, - "learning_rate": 6.7389830508474585e-06, - "loss": 1.303, - "step": 20060 - }, - { - "epoch": 0.46, - "grad_norm": 5.922582149505615, - "learning_rate": 6.73220338983051e-06, - "loss": 1.2245, - "step": 20070 - }, - { - "epoch": 0.46, - "grad_norm": 7.12042760848999, - "learning_rate": 6.7254237288135604e-06, - "loss": 1.1991, - "step": 20080 - }, - { - "epoch": 0.47, - "grad_norm": 3.4062719345092773, - "learning_rate": 6.71864406779661e-06, - "loss": 1.236, - "step": 20090 - }, - { - "epoch": 0.47, - "grad_norm": 8.163664817810059, - "learning_rate": 6.7118644067796615e-06, - "loss": 1.3316, - "step": 20100 - }, - { - "epoch": 0.47, - "grad_norm": 5.4666876792907715, - "learning_rate": 6.705084745762712e-06, - "loss": 1.2533, - "step": 20110 - }, - { - "epoch": 0.47, - "grad_norm": 2.7013297080993652, - "learning_rate": 6.6983050847457635e-06, - "loss": 1.3016, - "step": 20120 - }, - { - "epoch": 0.47, - "grad_norm": 5.204336166381836, - "learning_rate": 6.691525423728814e-06, - "loss": 1.2233, - "step": 20130 - }, - { - "epoch": 0.47, - "grad_norm": 4.580149173736572, - "learning_rate": 6.6847457627118655e-06, - "loss": 1.1783, - "step": 20140 - }, - { - "epoch": 0.47, - "grad_norm": 6.327539920806885, - "learning_rate": 6.677966101694916e-06, - "loss": 1.1495, - "step": 20150 - }, - { - "epoch": 0.47, - "grad_norm": 12.739646911621094, - "learning_rate": 6.6711864406779666e-06, - "loss": 1.4032, - "step": 20160 - }, - { - "epoch": 0.47, - "grad_norm": 2.9852492809295654, - "learning_rate": 6.664406779661018e-06, - "loss": 1.4179, - "step": 20170 - }, - { - "epoch": 0.47, - "grad_norm": 5.05040979385376, - "learning_rate": 6.6576271186440685e-06, - "loss": 1.2684, - "step": 20180 - }, - { - "epoch": 0.47, - "grad_norm": 7.304352760314941, - "learning_rate": 6.650847457627119e-06, - "loss": 1.3162, - "step": 20190 - }, - { - "epoch": 0.47, - "grad_norm": 1.9470860958099365, - "learning_rate": 6.64406779661017e-06, - "loss": 1.3961, - "step": 20200 - }, - { - "epoch": 0.47, - "grad_norm": 22.234329223632812, - "learning_rate": 6.637288135593221e-06, - "loss": 1.4659, - "step": 20210 - }, - { - "epoch": 0.47, - "grad_norm": 5.601120948791504, - "learning_rate": 6.6305084745762716e-06, - "loss": 1.3222, - "step": 20220 - }, - { - "epoch": 0.47, - "grad_norm": 4.113243579864502, - "learning_rate": 6.623728813559322e-06, - "loss": 1.2246, - "step": 20230 - }, - { - "epoch": 0.47, - "grad_norm": 2.9469447135925293, - "learning_rate": 6.6169491525423735e-06, - "loss": 1.3428, - "step": 20240 - }, - { - "epoch": 0.47, - "grad_norm": 3.2708497047424316, - "learning_rate": 6.610169491525424e-06, - "loss": 1.2096, - "step": 20250 - }, - { - "epoch": 0.47, - "grad_norm": 2.1987311840057373, - "learning_rate": 6.6033898305084755e-06, - "loss": 1.2681, - "step": 20260 - }, - { - "epoch": 0.47, - "grad_norm": 7.16361665725708, - "learning_rate": 6.596610169491526e-06, - "loss": 1.2586, - "step": 20270 - }, - { - "epoch": 0.47, - "grad_norm": 4.537783622741699, - "learning_rate": 6.5898305084745774e-06, - "loss": 1.2015, - "step": 20280 - }, - { - "epoch": 0.47, - "grad_norm": 6.448202610015869, - "learning_rate": 6.583050847457627e-06, - "loss": 1.1919, - "step": 20290 - }, - { - "epoch": 0.47, - "grad_norm": 0.4741743206977844, - "learning_rate": 6.576271186440678e-06, - "loss": 1.4343, - "step": 20300 - }, - { - "epoch": 0.47, - "grad_norm": 7.066291332244873, - "learning_rate": 6.569491525423729e-06, - "loss": 1.2109, - "step": 20310 - }, - { - "epoch": 0.47, - "grad_norm": 3.167917490005493, - "learning_rate": 6.56271186440678e-06, - "loss": 1.2345, - "step": 20320 - }, - { - "epoch": 0.47, - "grad_norm": 4.370973110198975, - "learning_rate": 6.555932203389831e-06, - "loss": 1.1774, - "step": 20330 - }, - { - "epoch": 0.47, - "grad_norm": 3.6259400844573975, - "learning_rate": 6.549152542372882e-06, - "loss": 1.338, - "step": 20340 - }, - { - "epoch": 0.47, - "grad_norm": 2.601332664489746, - "learning_rate": 6.542372881355933e-06, - "loss": 1.3791, - "step": 20350 - }, - { - "epoch": 0.47, - "grad_norm": 9.787617683410645, - "learning_rate": 6.5355932203389836e-06, - "loss": 1.2684, - "step": 20360 - }, - { - "epoch": 0.47, - "grad_norm": 1.965387225151062, - "learning_rate": 6.528813559322035e-06, - "loss": 1.2012, - "step": 20370 - }, - { - "epoch": 0.47, - "grad_norm": 7.392675399780273, - "learning_rate": 6.5220338983050855e-06, - "loss": 1.0166, - "step": 20380 - }, - { - "epoch": 0.47, - "grad_norm": 6.649271011352539, - "learning_rate": 6.515254237288137e-06, - "loss": 1.1422, - "step": 20390 - }, - { - "epoch": 0.47, - "grad_norm": 6.833590984344482, - "learning_rate": 6.508474576271187e-06, - "loss": 1.3592, - "step": 20400 - }, - { - "epoch": 0.47, - "grad_norm": 1.1567243337631226, - "learning_rate": 6.501694915254237e-06, - "loss": 1.4019, - "step": 20410 - }, - { - "epoch": 0.47, - "grad_norm": 1.9526944160461426, - "learning_rate": 6.4949152542372886e-06, - "loss": 1.3748, - "step": 20420 - }, - { - "epoch": 0.47, - "grad_norm": 9.20057201385498, - "learning_rate": 6.488135593220339e-06, - "loss": 1.1884, - "step": 20430 - }, - { - "epoch": 0.47, - "grad_norm": 5.133617401123047, - "learning_rate": 6.4813559322033905e-06, - "loss": 1.2444, - "step": 20440 - }, - { - "epoch": 0.47, - "grad_norm": 4.195832252502441, - "learning_rate": 6.474576271186441e-06, - "loss": 1.3187, - "step": 20450 - }, - { - "epoch": 0.47, - "grad_norm": 8.69914722442627, - "learning_rate": 6.4677966101694925e-06, - "loss": 1.2932, - "step": 20460 - }, - { - "epoch": 0.47, - "grad_norm": 3.7510390281677246, - "learning_rate": 6.461016949152543e-06, - "loss": 1.1252, - "step": 20470 - }, - { - "epoch": 0.47, - "grad_norm": 5.250388145446777, - "learning_rate": 6.4542372881355944e-06, - "loss": 1.2828, - "step": 20480 - }, - { - "epoch": 0.47, - "grad_norm": 8.589373588562012, - "learning_rate": 6.447457627118645e-06, - "loss": 1.2028, - "step": 20490 - }, - { - "epoch": 0.47, - "grad_norm": 7.695883274078369, - "learning_rate": 6.440677966101695e-06, - "loss": 1.3743, - "step": 20500 - }, - { - "epoch": 0.47, - "eval_loss": 1.0409547090530396, - "eval_runtime": 67.4208, - "eval_samples_per_second": 14.832, - "eval_steps_per_second": 14.832, - "step": 20500 - }, - { - "epoch": 0.47, - "grad_norm": 10.85682487487793, - "learning_rate": 6.433898305084746e-06, - "loss": 1.4262, - "step": 20510 - }, - { - "epoch": 0.48, - "grad_norm": 6.717245101928711, - "learning_rate": 6.427118644067797e-06, - "loss": 1.3546, - "step": 20520 - }, - { - "epoch": 0.48, - "grad_norm": 9.486088752746582, - "learning_rate": 6.420338983050848e-06, - "loss": 1.2189, - "step": 20530 - }, - { - "epoch": 0.48, - "grad_norm": 4.002749919891357, - "learning_rate": 6.413559322033899e-06, - "loss": 1.3337, - "step": 20540 - }, - { - "epoch": 0.48, - "grad_norm": 8.079071044921875, - "learning_rate": 6.40677966101695e-06, - "loss": 1.3489, - "step": 20550 - }, - { - "epoch": 0.48, - "grad_norm": 6.364570617675781, - "learning_rate": 6.4000000000000006e-06, - "loss": 1.0267, - "step": 20560 - }, - { - "epoch": 0.48, - "grad_norm": 4.548829555511475, - "learning_rate": 6.393220338983052e-06, - "loss": 1.3025, - "step": 20570 - }, - { - "epoch": 0.48, - "grad_norm": 8.967122077941895, - "learning_rate": 6.3864406779661025e-06, - "loss": 1.2183, - "step": 20580 - }, - { - "epoch": 0.48, - "grad_norm": 8.529352188110352, - "learning_rate": 6.379661016949154e-06, - "loss": 1.2688, - "step": 20590 - }, - { - "epoch": 0.48, - "grad_norm": 7.445988178253174, - "learning_rate": 6.372881355932204e-06, - "loss": 1.3147, - "step": 20600 - }, - { - "epoch": 0.48, - "grad_norm": 8.145844459533691, - "learning_rate": 6.366101694915254e-06, - "loss": 1.3358, - "step": 20610 - }, - { - "epoch": 0.48, - "grad_norm": 3.0077157020568848, - "learning_rate": 6.3593220338983056e-06, - "loss": 1.4507, - "step": 20620 - }, - { - "epoch": 0.48, - "grad_norm": 7.429611682891846, - "learning_rate": 6.352542372881356e-06, - "loss": 1.0919, - "step": 20630 - }, - { - "epoch": 0.48, - "grad_norm": 6.513698101043701, - "learning_rate": 6.3457627118644075e-06, - "loss": 1.392, - "step": 20640 - }, - { - "epoch": 0.48, - "grad_norm": 10.298941612243652, - "learning_rate": 6.338983050847458e-06, - "loss": 1.1327, - "step": 20650 - }, - { - "epoch": 0.48, - "grad_norm": 3.2288424968719482, - "learning_rate": 6.3322033898305095e-06, - "loss": 1.1693, - "step": 20660 - }, - { - "epoch": 0.48, - "grad_norm": 4.072999000549316, - "learning_rate": 6.32542372881356e-06, - "loss": 1.1615, - "step": 20670 - }, - { - "epoch": 0.48, - "grad_norm": 2.6640424728393555, - "learning_rate": 6.318644067796611e-06, - "loss": 1.1804, - "step": 20680 - }, - { - "epoch": 0.48, - "grad_norm": 7.22818660736084, - "learning_rate": 6.311864406779662e-06, - "loss": 1.3203, - "step": 20690 - }, - { - "epoch": 0.48, - "grad_norm": 3.37728214263916, - "learning_rate": 6.3050847457627125e-06, - "loss": 1.1533, - "step": 20700 - }, - { - "epoch": 0.48, - "grad_norm": 2.2169246673583984, - "learning_rate": 6.298305084745763e-06, - "loss": 1.3152, - "step": 20710 - }, - { - "epoch": 0.48, - "grad_norm": 8.210066795349121, - "learning_rate": 6.291525423728814e-06, - "loss": 1.292, - "step": 20720 - }, - { - "epoch": 0.48, - "grad_norm": 8.596230506896973, - "learning_rate": 6.284745762711865e-06, - "loss": 1.2931, - "step": 20730 - }, - { - "epoch": 0.48, - "grad_norm": 4.568326950073242, - "learning_rate": 6.277966101694916e-06, - "loss": 1.3412, - "step": 20740 - }, - { - "epoch": 0.48, - "grad_norm": 7.391994953155518, - "learning_rate": 6.271186440677966e-06, - "loss": 1.1354, - "step": 20750 - }, - { - "epoch": 0.48, - "grad_norm": 5.019871711730957, - "learning_rate": 6.2644067796610176e-06, - "loss": 1.3048, - "step": 20760 - }, - { - "epoch": 0.48, - "grad_norm": 2.394231081008911, - "learning_rate": 6.257627118644068e-06, - "loss": 1.372, - "step": 20770 - }, - { - "epoch": 0.48, - "grad_norm": 13.933137893676758, - "learning_rate": 6.2508474576271195e-06, - "loss": 1.2364, - "step": 20780 - }, - { - "epoch": 0.48, - "grad_norm": 0.63595050573349, - "learning_rate": 6.24406779661017e-06, - "loss": 1.3786, - "step": 20790 - }, - { - "epoch": 0.48, - "grad_norm": 4.854935646057129, - "learning_rate": 6.2372881355932215e-06, - "loss": 1.2068, - "step": 20800 - }, - { - "epoch": 0.48, - "grad_norm": 8.855067253112793, - "learning_rate": 6.230508474576271e-06, - "loss": 1.1401, - "step": 20810 - }, - { - "epoch": 0.48, - "grad_norm": 2.5127904415130615, - "learning_rate": 6.223728813559322e-06, - "loss": 1.3908, - "step": 20820 - }, - { - "epoch": 0.48, - "grad_norm": 4.830283164978027, - "learning_rate": 6.216949152542373e-06, - "loss": 1.2906, - "step": 20830 - }, - { - "epoch": 0.48, - "grad_norm": 5.127600193023682, - "learning_rate": 6.210169491525424e-06, - "loss": 1.2737, - "step": 20840 - }, - { - "epoch": 0.48, - "grad_norm": 2.2829642295837402, - "learning_rate": 6.203389830508475e-06, - "loss": 1.1188, - "step": 20850 - }, - { - "epoch": 0.48, - "grad_norm": 3.748267889022827, - "learning_rate": 6.196610169491526e-06, - "loss": 1.2071, - "step": 20860 - }, - { - "epoch": 0.48, - "grad_norm": 5.889699935913086, - "learning_rate": 6.189830508474577e-06, - "loss": 1.2175, - "step": 20870 - }, - { - "epoch": 0.48, - "grad_norm": 2.2203211784362793, - "learning_rate": 6.183050847457628e-06, - "loss": 1.4178, - "step": 20880 - }, - { - "epoch": 0.48, - "grad_norm": 9.717460632324219, - "learning_rate": 6.176271186440679e-06, - "loss": 1.2213, - "step": 20890 - }, - { - "epoch": 0.48, - "grad_norm": 4.968123912811279, - "learning_rate": 6.1694915254237295e-06, - "loss": 1.2122, - "step": 20900 - }, - { - "epoch": 0.48, - "grad_norm": 11.27860164642334, - "learning_rate": 6.162711864406781e-06, - "loss": 1.2388, - "step": 20910 - }, - { - "epoch": 0.48, - "grad_norm": 7.691869735717773, - "learning_rate": 6.155932203389831e-06, - "loss": 1.3148, - "step": 20920 - }, - { - "epoch": 0.48, - "grad_norm": 3.1556854248046875, - "learning_rate": 6.149152542372881e-06, - "loss": 1.0672, - "step": 20930 - }, - { - "epoch": 0.48, - "grad_norm": 11.499942779541016, - "learning_rate": 6.142372881355933e-06, - "loss": 1.2098, - "step": 20940 - }, - { - "epoch": 0.49, - "grad_norm": 2.9171266555786133, - "learning_rate": 6.135593220338983e-06, - "loss": 1.2329, - "step": 20950 - }, - { - "epoch": 0.49, - "grad_norm": 12.616515159606934, - "learning_rate": 6.1288135593220346e-06, - "loss": 1.1922, - "step": 20960 - }, - { - "epoch": 0.49, - "grad_norm": 4.825931072235107, - "learning_rate": 6.122033898305085e-06, - "loss": 1.2403, - "step": 20970 - }, - { - "epoch": 0.49, - "grad_norm": 9.974098205566406, - "learning_rate": 6.1152542372881365e-06, - "loss": 1.4907, - "step": 20980 - }, - { - "epoch": 0.49, - "grad_norm": 3.1674904823303223, - "learning_rate": 6.108474576271187e-06, - "loss": 1.1076, - "step": 20990 - }, - { - "epoch": 0.49, - "grad_norm": 5.067592144012451, - "learning_rate": 6.1016949152542385e-06, - "loss": 1.3289, - "step": 21000 - }, - { - "epoch": 0.49, - "eval_loss": 1.0621670484542847, - "eval_runtime": 67.2686, - "eval_samples_per_second": 14.866, - "eval_steps_per_second": 14.866, - "step": 21000 - }, - { - "epoch": 0.49, - "grad_norm": 2.247994899749756, - "learning_rate": 6.094915254237289e-06, - "loss": 0.9261, - "step": 21010 - }, - { - "epoch": 0.49, - "grad_norm": 8.553903579711914, - "learning_rate": 6.088135593220339e-06, - "loss": 1.3122, - "step": 21020 - }, - { - "epoch": 0.49, - "grad_norm": 3.4144701957702637, - "learning_rate": 6.08135593220339e-06, - "loss": 1.5104, - "step": 21030 - }, - { - "epoch": 0.49, - "grad_norm": 3.242537021636963, - "learning_rate": 6.074576271186441e-06, - "loss": 1.3842, - "step": 21040 - }, - { - "epoch": 0.49, - "grad_norm": 2.6115124225616455, - "learning_rate": 6.067796610169492e-06, - "loss": 1.4534, - "step": 21050 - }, - { - "epoch": 0.49, - "grad_norm": 3.6139254570007324, - "learning_rate": 6.061016949152543e-06, - "loss": 1.1461, - "step": 21060 - }, - { - "epoch": 0.49, - "grad_norm": 8.1109619140625, - "learning_rate": 6.054237288135594e-06, - "loss": 1.2601, - "step": 21070 - }, - { - "epoch": 0.49, - "grad_norm": 7.504269123077393, - "learning_rate": 6.047457627118645e-06, - "loss": 0.9487, - "step": 21080 - }, - { - "epoch": 0.49, - "grad_norm": 3.8124566078186035, - "learning_rate": 6.040677966101696e-06, - "loss": 1.1965, - "step": 21090 - }, - { - "epoch": 0.49, - "grad_norm": 6.297351837158203, - "learning_rate": 6.0338983050847465e-06, - "loss": 1.4083, - "step": 21100 - }, - { - "epoch": 0.49, - "grad_norm": 7.318817138671875, - "learning_rate": 6.027118644067798e-06, - "loss": 1.3782, - "step": 21110 - }, - { - "epoch": 0.49, - "grad_norm": 3.4005706310272217, - "learning_rate": 6.020338983050848e-06, - "loss": 1.3072, - "step": 21120 - }, - { - "epoch": 0.49, - "grad_norm": 6.140954971313477, - "learning_rate": 6.013559322033898e-06, - "loss": 1.3371, - "step": 21130 - }, - { - "epoch": 0.49, - "grad_norm": 9.956124305725098, - "learning_rate": 6.00677966101695e-06, - "loss": 1.3825, - "step": 21140 - }, - { - "epoch": 0.49, - "grad_norm": 5.303738117218018, - "learning_rate": 6e-06, - "loss": 1.2292, - "step": 21150 - }, - { - "epoch": 0.49, - "grad_norm": 7.743120193481445, - "learning_rate": 5.9932203389830516e-06, - "loss": 1.1453, - "step": 21160 - }, - { - "epoch": 0.49, - "grad_norm": 9.754213333129883, - "learning_rate": 5.986440677966102e-06, - "loss": 1.2849, - "step": 21170 - }, - { - "epoch": 0.49, - "grad_norm": 4.077347755432129, - "learning_rate": 5.9796610169491535e-06, - "loss": 1.3983, - "step": 21180 - }, - { - "epoch": 0.49, - "grad_norm": 7.309597969055176, - "learning_rate": 5.972881355932204e-06, - "loss": 1.3914, - "step": 21190 - }, - { - "epoch": 0.49, - "grad_norm": 8.950671195983887, - "learning_rate": 5.9661016949152555e-06, - "loss": 1.3262, - "step": 21200 - }, - { - "epoch": 0.49, - "grad_norm": 4.134212493896484, - "learning_rate": 5.959322033898306e-06, - "loss": 1.2166, - "step": 21210 - }, - { - "epoch": 0.49, - "grad_norm": 2.0232081413269043, - "learning_rate": 5.9525423728813566e-06, - "loss": 1.2959, - "step": 21220 - }, - { - "epoch": 0.49, - "grad_norm": 2.094144582748413, - "learning_rate": 5.945762711864407e-06, - "loss": 1.2989, - "step": 21230 - }, - { - "epoch": 0.49, - "grad_norm": 7.6865925788879395, - "learning_rate": 5.938983050847458e-06, - "loss": 1.3427, - "step": 21240 - }, - { - "epoch": 0.49, - "grad_norm": 2.456575870513916, - "learning_rate": 5.932203389830509e-06, - "loss": 1.1175, - "step": 21250 - }, - { - "epoch": 0.49, - "grad_norm": 5.038335800170898, - "learning_rate": 5.92542372881356e-06, - "loss": 1.1995, - "step": 21260 - }, - { - "epoch": 0.49, - "grad_norm": 6.489980697631836, - "learning_rate": 5.91864406779661e-06, - "loss": 1.2752, - "step": 21270 - }, - { - "epoch": 0.49, - "grad_norm": 4.00714111328125, - "learning_rate": 5.911864406779662e-06, - "loss": 1.3853, - "step": 21280 - }, - { - "epoch": 0.49, - "grad_norm": 2.6164281368255615, - "learning_rate": 5.905084745762712e-06, - "loss": 1.0804, - "step": 21290 - }, - { - "epoch": 0.49, - "grad_norm": 3.9665138721466064, - "learning_rate": 5.8983050847457635e-06, - "loss": 1.1759, - "step": 21300 - }, - { - "epoch": 0.49, - "grad_norm": 2.67433500289917, - "learning_rate": 5.891525423728814e-06, - "loss": 1.393, - "step": 21310 - }, - { - "epoch": 0.49, - "grad_norm": 7.603160381317139, - "learning_rate": 5.8847457627118655e-06, - "loss": 1.5365, - "step": 21320 - }, - { - "epoch": 0.49, - "grad_norm": 7.298281192779541, - "learning_rate": 5.877966101694915e-06, - "loss": 1.3866, - "step": 21330 - }, - { - "epoch": 0.49, - "grad_norm": 2.494446039199829, - "learning_rate": 5.871186440677966e-06, - "loss": 1.2081, - "step": 21340 - }, - { - "epoch": 0.49, - "grad_norm": 8.670098304748535, - "learning_rate": 5.864406779661017e-06, - "loss": 1.2346, - "step": 21350 - }, - { - "epoch": 0.49, - "grad_norm": 8.977124214172363, - "learning_rate": 5.857627118644068e-06, - "loss": 1.0865, - "step": 21360 - }, - { - "epoch": 0.49, - "grad_norm": 7.3538384437561035, - "learning_rate": 5.850847457627119e-06, - "loss": 1.0902, - "step": 21370 - }, - { - "epoch": 0.5, - "grad_norm": 5.1297783851623535, - "learning_rate": 5.84406779661017e-06, - "loss": 1.532, - "step": 21380 - }, - { - "epoch": 0.5, - "grad_norm": 9.002881050109863, - "learning_rate": 5.837288135593221e-06, - "loss": 1.2216, - "step": 21390 - }, - { - "epoch": 0.5, - "grad_norm": 7.277677536010742, - "learning_rate": 5.830508474576272e-06, - "loss": 1.1974, - "step": 21400 - }, - { - "epoch": 0.5, - "grad_norm": 3.383936643600464, - "learning_rate": 5.823728813559323e-06, - "loss": 1.2097, - "step": 21410 - }, - { - "epoch": 0.5, - "grad_norm": 9.847234725952148, - "learning_rate": 5.8169491525423736e-06, - "loss": 1.225, - "step": 21420 - }, - { - "epoch": 0.5, - "grad_norm": 9.401296615600586, - "learning_rate": 5.810169491525425e-06, - "loss": 1.4399, - "step": 21430 - }, - { - "epoch": 0.5, - "grad_norm": 6.085318088531494, - "learning_rate": 5.803389830508475e-06, - "loss": 1.2921, - "step": 21440 - }, - { - "epoch": 0.5, - "grad_norm": 1.833952784538269, - "learning_rate": 5.796610169491525e-06, - "loss": 1.3274, - "step": 21450 - }, - { - "epoch": 0.5, - "grad_norm": 4.7563090324401855, - "learning_rate": 5.789830508474577e-06, - "loss": 1.2354, - "step": 21460 - }, - { - "epoch": 0.5, - "grad_norm": 2.760796546936035, - "learning_rate": 5.783050847457627e-06, - "loss": 1.2845, - "step": 21470 - }, - { - "epoch": 0.5, - "grad_norm": 8.02619743347168, - "learning_rate": 5.776271186440679e-06, - "loss": 1.3089, - "step": 21480 - }, - { - "epoch": 0.5, - "grad_norm": 7.648224830627441, - "learning_rate": 5.769491525423729e-06, - "loss": 1.3202, - "step": 21490 - }, - { - "epoch": 0.5, - "grad_norm": 3.990171194076538, - "learning_rate": 5.7627118644067805e-06, - "loss": 1.3624, - "step": 21500 - }, - { - "epoch": 0.5, - "eval_loss": 1.033817172050476, - "eval_runtime": 68.2554, - "eval_samples_per_second": 14.651, - "eval_steps_per_second": 14.651, - "step": 21500 - }, - { - "epoch": 0.5, - "grad_norm": 5.392577171325684, - "learning_rate": 5.755932203389831e-06, - "loss": 1.1602, - "step": 21510 - }, - { - "epoch": 0.5, - "grad_norm": 2.582357883453369, - "learning_rate": 5.7491525423728825e-06, - "loss": 1.0574, - "step": 21520 - }, - { - "epoch": 0.5, - "grad_norm": 4.663832187652588, - "learning_rate": 5.742372881355933e-06, - "loss": 1.1496, - "step": 21530 - }, - { - "epoch": 0.5, - "grad_norm": 8.770710945129395, - "learning_rate": 5.735593220338983e-06, - "loss": 1.2921, - "step": 21540 - }, - { - "epoch": 0.5, - "grad_norm": 9.296341896057129, - "learning_rate": 5.728813559322034e-06, - "loss": 1.284, - "step": 21550 - }, - { - "epoch": 0.5, - "grad_norm": 9.320971488952637, - "learning_rate": 5.722033898305085e-06, - "loss": 1.3229, - "step": 21560 - }, - { - "epoch": 0.5, - "grad_norm": 5.163783073425293, - "learning_rate": 5.715254237288136e-06, - "loss": 1.3256, - "step": 21570 - }, - { - "epoch": 0.5, - "grad_norm": 7.684600830078125, - "learning_rate": 5.708474576271187e-06, - "loss": 1.2054, - "step": 21580 - }, - { - "epoch": 0.5, - "grad_norm": 11.25024700164795, - "learning_rate": 5.701694915254238e-06, - "loss": 1.3583, - "step": 21590 - }, - { - "epoch": 0.5, - "grad_norm": 1.7152315378189087, - "learning_rate": 5.694915254237289e-06, - "loss": 1.2261, - "step": 21600 - }, - { - "epoch": 0.5, - "grad_norm": 4.451650619506836, - "learning_rate": 5.68813559322034e-06, - "loss": 1.2716, - "step": 21610 - }, - { - "epoch": 0.5, - "grad_norm": 1.9379420280456543, - "learning_rate": 5.6813559322033906e-06, - "loss": 1.2794, - "step": 21620 - }, - { - "epoch": 0.5, - "grad_norm": 2.325803756713867, - "learning_rate": 5.674576271186442e-06, - "loss": 1.3336, - "step": 21630 - }, - { - "epoch": 0.5, - "grad_norm": 5.5600457191467285, - "learning_rate": 5.667796610169492e-06, - "loss": 1.2249, - "step": 21640 - }, - { - "epoch": 0.5, - "grad_norm": 6.445628643035889, - "learning_rate": 5.661016949152542e-06, - "loss": 1.1567, - "step": 21650 - }, - { - "epoch": 0.5, - "grad_norm": 4.747307300567627, - "learning_rate": 5.654237288135594e-06, - "loss": 1.1082, - "step": 21660 - }, - { - "epoch": 0.5, - "grad_norm": 4.780632495880127, - "learning_rate": 5.647457627118644e-06, - "loss": 1.2751, - "step": 21670 - }, - { - "epoch": 0.5, - "grad_norm": 6.126337051391602, - "learning_rate": 5.640677966101696e-06, - "loss": 1.2164, - "step": 21680 - }, - { - "epoch": 0.5, - "grad_norm": 7.235857963562012, - "learning_rate": 5.633898305084746e-06, - "loss": 1.5172, - "step": 21690 - }, - { - "epoch": 0.5, - "grad_norm": 5.676464080810547, - "learning_rate": 5.6271186440677975e-06, - "loss": 1.1409, - "step": 21700 - }, - { - "epoch": 0.5, - "grad_norm": 4.40902853012085, - "learning_rate": 5.620338983050848e-06, - "loss": 1.2341, - "step": 21710 - }, - { - "epoch": 0.5, - "grad_norm": 2.4780306816101074, - "learning_rate": 5.6135593220338995e-06, - "loss": 1.2982, - "step": 21720 - }, - { - "epoch": 0.5, - "grad_norm": 4.747418403625488, - "learning_rate": 5.60677966101695e-06, - "loss": 1.1433, - "step": 21730 - }, - { - "epoch": 0.5, - "grad_norm": 1.319555401802063, - "learning_rate": 5.600000000000001e-06, - "loss": 1.1173, - "step": 21740 - }, - { - "epoch": 0.5, - "grad_norm": 6.180632591247559, - "learning_rate": 5.593220338983051e-06, - "loss": 1.3304, - "step": 21750 - }, - { - "epoch": 0.5, - "grad_norm": 17.385282516479492, - "learning_rate": 5.586440677966102e-06, - "loss": 1.3295, - "step": 21760 - }, - { - "epoch": 0.5, - "grad_norm": 1.6295877695083618, - "learning_rate": 5.579661016949153e-06, - "loss": 1.329, - "step": 21770 - }, - { - "epoch": 0.5, - "grad_norm": 7.083805561065674, - "learning_rate": 5.572881355932204e-06, - "loss": 1.337, - "step": 21780 - }, - { - "epoch": 0.5, - "grad_norm": 9.662304878234863, - "learning_rate": 5.566101694915255e-06, - "loss": 1.2301, - "step": 21790 - }, - { - "epoch": 0.5, - "grad_norm": 7.854396820068359, - "learning_rate": 5.559322033898306e-06, - "loss": 1.3312, - "step": 21800 - }, - { - "epoch": 0.5, - "grad_norm": 4.626899242401123, - "learning_rate": 5.552542372881356e-06, - "loss": 1.1979, - "step": 21810 - }, - { - "epoch": 0.51, - "grad_norm": 7.825695037841797, - "learning_rate": 5.5457627118644076e-06, - "loss": 1.2997, - "step": 21820 - }, - { - "epoch": 0.51, - "grad_norm": 4.459179401397705, - "learning_rate": 5.538983050847458e-06, - "loss": 1.4772, - "step": 21830 - }, - { - "epoch": 0.51, - "grad_norm": 5.008397579193115, - "learning_rate": 5.5322033898305095e-06, - "loss": 1.2599, - "step": 21840 - }, - { - "epoch": 0.51, - "grad_norm": 5.558854103088379, - "learning_rate": 5.525423728813559e-06, - "loss": 1.3258, - "step": 21850 - }, - { - "epoch": 0.51, - "grad_norm": 2.846191883087158, - "learning_rate": 5.518644067796611e-06, - "loss": 1.344, - "step": 21860 - }, - { - "epoch": 0.51, - "grad_norm": 3.7226576805114746, - "learning_rate": 5.511864406779661e-06, - "loss": 1.1642, - "step": 21870 - }, - { - "epoch": 0.51, - "grad_norm": 8.011185646057129, - "learning_rate": 5.505084745762712e-06, - "loss": 1.1073, - "step": 21880 - }, - { - "epoch": 0.51, - "grad_norm": 8.683090209960938, - "learning_rate": 5.498305084745763e-06, - "loss": 1.2852, - "step": 21890 - }, - { - "epoch": 0.51, - "grad_norm": 3.06953763961792, - "learning_rate": 5.491525423728814e-06, - "loss": 1.2416, - "step": 21900 - }, - { - "epoch": 0.51, - "grad_norm": 3.721527576446533, - "learning_rate": 5.484745762711865e-06, - "loss": 1.3548, - "step": 21910 - }, - { - "epoch": 0.51, - "grad_norm": 4.444101810455322, - "learning_rate": 5.477966101694916e-06, - "loss": 1.3911, - "step": 21920 - }, - { - "epoch": 0.51, - "grad_norm": 9.347046852111816, - "learning_rate": 5.471186440677967e-06, - "loss": 1.237, - "step": 21930 - }, - { - "epoch": 0.51, - "grad_norm": 5.472478866577148, - "learning_rate": 5.464406779661018e-06, - "loss": 1.1721, - "step": 21940 - }, - { - "epoch": 0.51, - "grad_norm": 3.854987382888794, - "learning_rate": 5.457627118644067e-06, - "loss": 1.245, - "step": 21950 - }, - { - "epoch": 0.51, - "grad_norm": 8.591747283935547, - "learning_rate": 5.450847457627119e-06, - "loss": 1.2754, - "step": 21960 - }, - { - "epoch": 0.51, - "grad_norm": 2.2374930381774902, - "learning_rate": 5.444067796610169e-06, - "loss": 1.248, - "step": 21970 - }, - { - "epoch": 0.51, - "grad_norm": 3.7026805877685547, - "learning_rate": 5.437288135593221e-06, - "loss": 1.2337, - "step": 21980 - }, - { - "epoch": 0.51, - "grad_norm": 3.4988057613372803, - "learning_rate": 5.430508474576271e-06, - "loss": 1.2272, - "step": 21990 - }, - { - "epoch": 0.51, - "grad_norm": 3.3364341259002686, - "learning_rate": 5.423728813559323e-06, - "loss": 1.2301, - "step": 22000 - }, - { - "epoch": 0.51, - "eval_loss": 0.9484620690345764, - "eval_runtime": 67.8436, - "eval_samples_per_second": 14.74, - "eval_steps_per_second": 14.74, - "step": 22000 - }, - { - "epoch": 0.51, - "grad_norm": 8.392367362976074, - "learning_rate": 5.416949152542373e-06, - "loss": 1.3843, - "step": 22010 - }, - { - "epoch": 0.51, - "grad_norm": 7.365686416625977, - "learning_rate": 5.4101694915254246e-06, - "loss": 1.2589, - "step": 22020 - }, - { - "epoch": 0.51, - "grad_norm": 2.972006320953369, - "learning_rate": 5.403389830508475e-06, - "loss": 1.428, - "step": 22030 - }, - { - "epoch": 0.51, - "grad_norm": 3.8066885471343994, - "learning_rate": 5.3966101694915265e-06, - "loss": 1.2202, - "step": 22040 - }, - { - "epoch": 0.51, - "grad_norm": 5.83513879776001, - "learning_rate": 5.389830508474577e-06, - "loss": 1.3555, - "step": 22050 - }, - { - "epoch": 0.51, - "grad_norm": 10.071037292480469, - "learning_rate": 5.383050847457627e-06, - "loss": 1.4515, - "step": 22060 - }, - { - "epoch": 0.51, - "grad_norm": 5.605524063110352, - "learning_rate": 5.376271186440678e-06, - "loss": 1.3172, - "step": 22070 - }, - { - "epoch": 0.51, - "grad_norm": 7.423837184906006, - "learning_rate": 5.369491525423729e-06, - "loss": 1.2454, - "step": 22080 - }, - { - "epoch": 0.51, - "grad_norm": 7.91333532333374, - "learning_rate": 5.36271186440678e-06, - "loss": 0.9647, - "step": 22090 - }, - { - "epoch": 0.51, - "grad_norm": 5.196317672729492, - "learning_rate": 5.355932203389831e-06, - "loss": 1.3829, - "step": 22100 - }, - { - "epoch": 0.51, - "grad_norm": 1.629678726196289, - "learning_rate": 5.349152542372882e-06, - "loss": 1.306, - "step": 22110 - }, - { - "epoch": 0.51, - "grad_norm": 3.215089797973633, - "learning_rate": 5.342372881355933e-06, - "loss": 1.3034, - "step": 22120 - }, - { - "epoch": 0.51, - "grad_norm": 5.362987995147705, - "learning_rate": 5.335593220338984e-06, - "loss": 1.3381, - "step": 22130 - }, - { - "epoch": 0.51, - "grad_norm": 3.5946292877197266, - "learning_rate": 5.328813559322035e-06, - "loss": 1.2814, - "step": 22140 - }, - { - "epoch": 0.51, - "grad_norm": 5.043945789337158, - "learning_rate": 5.322033898305086e-06, - "loss": 1.1752, - "step": 22150 - }, - { - "epoch": 0.51, - "grad_norm": 6.917520046234131, - "learning_rate": 5.315254237288136e-06, - "loss": 1.3277, - "step": 22160 - }, - { - "epoch": 0.51, - "grad_norm": 10.332216262817383, - "learning_rate": 5.308474576271186e-06, - "loss": 1.2168, - "step": 22170 - }, - { - "epoch": 0.51, - "grad_norm": 7.612697124481201, - "learning_rate": 5.301694915254238e-06, - "loss": 1.2865, - "step": 22180 - }, - { - "epoch": 0.51, - "grad_norm": 7.4925360679626465, - "learning_rate": 5.294915254237288e-06, - "loss": 1.1815, - "step": 22190 - }, - { - "epoch": 0.51, - "grad_norm": 1.8448129892349243, - "learning_rate": 5.28813559322034e-06, - "loss": 1.2843, - "step": 22200 - }, - { - "epoch": 0.51, - "grad_norm": 5.239171981811523, - "learning_rate": 5.28135593220339e-06, - "loss": 1.2266, - "step": 22210 - }, - { - "epoch": 0.51, - "grad_norm": 3.959794759750366, - "learning_rate": 5.2745762711864416e-06, - "loss": 1.1932, - "step": 22220 - }, - { - "epoch": 0.51, - "grad_norm": 4.557924270629883, - "learning_rate": 5.267796610169492e-06, - "loss": 1.386, - "step": 22230 - }, - { - "epoch": 0.51, - "grad_norm": 3.9078621864318848, - "learning_rate": 5.2610169491525435e-06, - "loss": 1.2638, - "step": 22240 - }, - { - "epoch": 0.52, - "grad_norm": 0.628909707069397, - "learning_rate": 5.254237288135594e-06, - "loss": 1.1575, - "step": 22250 - }, - { - "epoch": 0.52, - "grad_norm": 7.980692386627197, - "learning_rate": 5.247457627118645e-06, - "loss": 1.2014, - "step": 22260 - }, - { - "epoch": 0.52, - "grad_norm": 13.897843360900879, - "learning_rate": 5.240677966101695e-06, - "loss": 1.3383, - "step": 22270 - }, - { - "epoch": 0.52, - "grad_norm": 7.637335300445557, - "learning_rate": 5.233898305084746e-06, - "loss": 1.1604, - "step": 22280 - }, - { - "epoch": 0.52, - "grad_norm": 8.646268844604492, - "learning_rate": 5.227118644067797e-06, - "loss": 1.3669, - "step": 22290 - }, - { - "epoch": 0.52, - "grad_norm": 10.057101249694824, - "learning_rate": 5.220338983050848e-06, - "loss": 1.1849, - "step": 22300 - }, - { - "epoch": 0.52, - "grad_norm": 4.541333198547363, - "learning_rate": 5.213559322033899e-06, - "loss": 1.116, - "step": 22310 - }, - { - "epoch": 0.52, - "grad_norm": 16.73844337463379, - "learning_rate": 5.20677966101695e-06, - "loss": 1.3564, - "step": 22320 - }, - { - "epoch": 0.52, - "grad_norm": 13.738208770751953, - "learning_rate": 5.2e-06, - "loss": 1.365, - "step": 22330 - }, - { - "epoch": 0.52, - "grad_norm": 4.143723011016846, - "learning_rate": 5.193220338983052e-06, - "loss": 1.2913, - "step": 22340 - }, - { - "epoch": 0.52, - "grad_norm": 7.556609153747559, - "learning_rate": 5.186440677966102e-06, - "loss": 1.2513, - "step": 22350 - }, - { - "epoch": 0.52, - "grad_norm": 5.468164920806885, - "learning_rate": 5.1796610169491535e-06, - "loss": 1.2641, - "step": 22360 - }, - { - "epoch": 0.52, - "grad_norm": 6.362597942352295, - "learning_rate": 5.172881355932203e-06, - "loss": 1.4793, - "step": 22370 - }, - { - "epoch": 0.52, - "grad_norm": 3.212599992752075, - "learning_rate": 5.166101694915255e-06, - "loss": 1.4001, - "step": 22380 - }, - { - "epoch": 0.52, - "grad_norm": 7.666508197784424, - "learning_rate": 5.159322033898305e-06, - "loss": 1.3046, - "step": 22390 - }, - { - "epoch": 0.52, - "grad_norm": 7.109092712402344, - "learning_rate": 5.152542372881356e-06, - "loss": 1.2948, - "step": 22400 - }, - { - "epoch": 0.52, - "grad_norm": 6.648116588592529, - "learning_rate": 5.145762711864407e-06, - "loss": 1.1994, - "step": 22410 - }, - { - "epoch": 0.52, - "grad_norm": 4.890117645263672, - "learning_rate": 5.138983050847458e-06, - "loss": 1.2836, - "step": 22420 - }, - { - "epoch": 0.52, - "grad_norm": 10.657286643981934, - "learning_rate": 5.132203389830509e-06, - "loss": 1.0558, - "step": 22430 - }, - { - "epoch": 0.52, - "grad_norm": 2.1854753494262695, - "learning_rate": 5.12542372881356e-06, - "loss": 1.3231, - "step": 22440 - }, - { - "epoch": 0.52, - "grad_norm": 5.85990571975708, - "learning_rate": 5.118644067796611e-06, - "loss": 1.0908, - "step": 22450 - }, - { - "epoch": 0.52, - "grad_norm": 6.751573085784912, - "learning_rate": 5.111864406779662e-06, - "loss": 1.1804, - "step": 22460 - }, - { - "epoch": 0.52, - "grad_norm": 8.723662376403809, - "learning_rate": 5.105084745762711e-06, - "loss": 1.3736, - "step": 22470 - }, - { - "epoch": 0.52, - "grad_norm": 4.84275484085083, - "learning_rate": 5.098305084745763e-06, - "loss": 1.3529, - "step": 22480 - }, - { - "epoch": 0.52, - "grad_norm": 3.9603848457336426, - "learning_rate": 5.091525423728813e-06, - "loss": 1.1711, - "step": 22490 - }, - { - "epoch": 0.52, - "grad_norm": 1.6602897644042969, - "learning_rate": 5.084745762711865e-06, - "loss": 1.2808, - "step": 22500 - }, - { - "epoch": 0.52, - "eval_loss": 1.0503261089324951, - "eval_runtime": 68.1322, - "eval_samples_per_second": 14.677, - "eval_steps_per_second": 14.677, - "step": 22500 - }, - { - "epoch": 0.52, - "grad_norm": 4.92212438583374, - "learning_rate": 5.077966101694915e-06, - "loss": 1.0928, - "step": 22510 - }, - { - "epoch": 0.52, - "grad_norm": 11.511126518249512, - "learning_rate": 5.071186440677967e-06, - "loss": 1.3047, - "step": 22520 - }, - { - "epoch": 0.52, - "grad_norm": 4.2978596687316895, - "learning_rate": 5.064406779661017e-06, - "loss": 1.3928, - "step": 22530 - }, - { - "epoch": 0.52, - "grad_norm": 4.02852201461792, - "learning_rate": 5.057627118644069e-06, - "loss": 1.3077, - "step": 22540 - }, - { - "epoch": 0.52, - "grad_norm": 4.235876560211182, - "learning_rate": 5.050847457627119e-06, - "loss": 1.1435, - "step": 22550 - }, - { - "epoch": 0.52, - "grad_norm": 3.8855769634246826, - "learning_rate": 5.0440677966101705e-06, - "loss": 1.2596, - "step": 22560 - }, - { - "epoch": 0.52, - "grad_norm": 10.052867889404297, - "learning_rate": 5.037288135593221e-06, - "loss": 1.1957, - "step": 22570 - }, - { - "epoch": 0.52, - "grad_norm": 6.440583229064941, - "learning_rate": 5.030508474576271e-06, - "loss": 1.1874, - "step": 22580 - }, - { - "epoch": 0.52, - "grad_norm": 4.625791072845459, - "learning_rate": 5.023728813559322e-06, - "loss": 1.4208, - "step": 22590 - }, - { - "epoch": 0.52, - "grad_norm": 2.446409225463867, - "learning_rate": 5.016949152542373e-06, - "loss": 1.142, - "step": 22600 - }, - { - "epoch": 0.52, - "grad_norm": 10.698454856872559, - "learning_rate": 5.010169491525424e-06, - "loss": 1.1436, - "step": 22610 - }, - { - "epoch": 0.52, - "grad_norm": 1.8976472616195679, - "learning_rate": 5.003389830508475e-06, - "loss": 1.1139, - "step": 22620 - }, - { - "epoch": 0.52, - "grad_norm": 4.049407958984375, - "learning_rate": 4.996610169491526e-06, - "loss": 1.2085, - "step": 22630 - }, - { - "epoch": 0.52, - "grad_norm": 14.139172554016113, - "learning_rate": 4.989830508474577e-06, - "loss": 1.192, - "step": 22640 - }, - { - "epoch": 0.52, - "grad_norm": 12.04554271697998, - "learning_rate": 4.983050847457628e-06, - "loss": 1.2503, - "step": 22650 - }, - { - "epoch": 0.52, - "grad_norm": 4.248260498046875, - "learning_rate": 4.976271186440678e-06, - "loss": 0.9741, - "step": 22660 - }, - { - "epoch": 0.52, - "grad_norm": 8.9169282913208, - "learning_rate": 4.969491525423729e-06, - "loss": 1.4536, - "step": 22670 - }, - { - "epoch": 0.53, - "grad_norm": 2.942535877227783, - "learning_rate": 4.96271186440678e-06, - "loss": 1.1215, - "step": 22680 - }, - { - "epoch": 0.53, - "grad_norm": 3.7272868156433105, - "learning_rate": 4.955932203389831e-06, - "loss": 1.2022, - "step": 22690 - }, - { - "epoch": 0.53, - "grad_norm": 6.489871501922607, - "learning_rate": 4.949152542372882e-06, - "loss": 1.1204, - "step": 22700 - }, - { - "epoch": 0.53, - "grad_norm": 5.630454063415527, - "learning_rate": 4.942372881355932e-06, - "loss": 1.2873, - "step": 22710 - }, - { - "epoch": 0.53, - "grad_norm": 2.8798165321350098, - "learning_rate": 4.935593220338984e-06, - "loss": 1.3039, - "step": 22720 - }, - { - "epoch": 0.53, - "grad_norm": 5.791500568389893, - "learning_rate": 4.928813559322034e-06, - "loss": 1.5419, - "step": 22730 - }, - { - "epoch": 0.53, - "grad_norm": 4.488035202026367, - "learning_rate": 4.922033898305086e-06, - "loss": 1.0108, - "step": 22740 - }, - { - "epoch": 0.53, - "grad_norm": 2.9141738414764404, - "learning_rate": 4.915254237288136e-06, - "loss": 1.0819, - "step": 22750 - }, - { - "epoch": 0.53, - "grad_norm": 7.7986626625061035, - "learning_rate": 4.908474576271187e-06, - "loss": 1.1406, - "step": 22760 - }, - { - "epoch": 0.53, - "grad_norm": 5.593525409698486, - "learning_rate": 4.901694915254237e-06, - "loss": 1.1587, - "step": 22770 - }, - { - "epoch": 0.53, - "grad_norm": 6.104484558105469, - "learning_rate": 4.894915254237289e-06, - "loss": 1.3049, - "step": 22780 - }, - { - "epoch": 0.53, - "grad_norm": 10.20970630645752, - "learning_rate": 4.888135593220339e-06, - "loss": 1.1083, - "step": 22790 - }, - { - "epoch": 0.53, - "grad_norm": 6.432941913604736, - "learning_rate": 4.881355932203391e-06, - "loss": 1.2579, - "step": 22800 - }, - { - "epoch": 0.53, - "grad_norm": 5.444655895233154, - "learning_rate": 4.874576271186441e-06, - "loss": 1.1385, - "step": 22810 - }, - { - "epoch": 0.53, - "grad_norm": 6.204164028167725, - "learning_rate": 4.867796610169492e-06, - "loss": 1.2969, - "step": 22820 - }, - { - "epoch": 0.53, - "grad_norm": 2.851778984069824, - "learning_rate": 4.861016949152543e-06, - "loss": 1.3118, - "step": 22830 - }, - { - "epoch": 0.53, - "grad_norm": 4.721585750579834, - "learning_rate": 4.854237288135594e-06, - "loss": 1.0812, - "step": 22840 - }, - { - "epoch": 0.53, - "grad_norm": 7.1442646980285645, - "learning_rate": 4.847457627118645e-06, - "loss": 1.258, - "step": 22850 - }, - { - "epoch": 0.53, - "grad_norm": 3.9416871070861816, - "learning_rate": 4.840677966101695e-06, - "loss": 1.0249, - "step": 22860 - }, - { - "epoch": 0.53, - "grad_norm": 4.044296741485596, - "learning_rate": 4.833898305084746e-06, - "loss": 1.2126, - "step": 22870 - }, - { - "epoch": 0.53, - "grad_norm": 2.5377376079559326, - "learning_rate": 4.827118644067797e-06, - "loss": 1.4611, - "step": 22880 - }, - { - "epoch": 0.53, - "grad_norm": 3.37980055809021, - "learning_rate": 4.820338983050848e-06, - "loss": 1.2157, - "step": 22890 - }, - { - "epoch": 0.53, - "grad_norm": 7.17388916015625, - "learning_rate": 4.813559322033899e-06, - "loss": 1.252, - "step": 22900 - }, - { - "epoch": 0.53, - "grad_norm": 5.8747124671936035, - "learning_rate": 4.80677966101695e-06, - "loss": 1.3771, - "step": 22910 - }, - { - "epoch": 0.53, - "grad_norm": 8.872361183166504, - "learning_rate": 4.800000000000001e-06, - "loss": 1.3137, - "step": 22920 - }, - { - "epoch": 0.53, - "grad_norm": 6.978908061981201, - "learning_rate": 4.793220338983051e-06, - "loss": 1.1414, - "step": 22930 - }, - { - "epoch": 0.53, - "grad_norm": 1.8234301805496216, - "learning_rate": 4.786440677966102e-06, - "loss": 1.2421, - "step": 22940 - }, - { - "epoch": 0.53, - "grad_norm": 9.105183601379395, - "learning_rate": 4.779661016949153e-06, - "loss": 1.4391, - "step": 22950 - }, - { - "epoch": 0.53, - "grad_norm": 27.357824325561523, - "learning_rate": 4.772881355932204e-06, - "loss": 1.1089, - "step": 22960 - }, - { - "epoch": 0.53, - "grad_norm": 5.128312110900879, - "learning_rate": 4.766101694915254e-06, - "loss": 1.0676, - "step": 22970 - }, - { - "epoch": 0.53, - "grad_norm": 4.661288261413574, - "learning_rate": 4.759322033898306e-06, - "loss": 1.2435, - "step": 22980 - }, - { - "epoch": 0.53, - "grad_norm": 3.2815043926239014, - "learning_rate": 4.752542372881356e-06, - "loss": 1.1472, - "step": 22990 - }, - { - "epoch": 0.53, - "grad_norm": 5.986001014709473, - "learning_rate": 4.745762711864408e-06, - "loss": 1.3882, - "step": 23000 - }, - { - "epoch": 0.53, - "eval_loss": 0.9571949243545532, - "eval_runtime": 68.0527, - "eval_samples_per_second": 14.694, - "eval_steps_per_second": 14.694, - "step": 23000 - }, - { - "epoch": 0.53, - "grad_norm": 2.8159420490264893, - "learning_rate": 4.738983050847458e-06, - "loss": 1.193, - "step": 23010 - }, - { - "epoch": 0.53, - "grad_norm": 4.794797897338867, - "learning_rate": 4.732203389830509e-06, - "loss": 1.3156, - "step": 23020 - }, - { - "epoch": 0.53, - "grad_norm": 3.945427656173706, - "learning_rate": 4.725423728813559e-06, - "loss": 1.2774, - "step": 23030 - }, - { - "epoch": 0.53, - "grad_norm": 9.099642753601074, - "learning_rate": 4.718644067796611e-06, - "loss": 1.2536, - "step": 23040 - }, - { - "epoch": 0.53, - "grad_norm": 6.1287455558776855, - "learning_rate": 4.711864406779661e-06, - "loss": 1.225, - "step": 23050 - }, - { - "epoch": 0.53, - "grad_norm": 4.507009506225586, - "learning_rate": 4.705084745762713e-06, - "loss": 1.3274, - "step": 23060 - }, - { - "epoch": 0.53, - "grad_norm": 14.352507591247559, - "learning_rate": 4.698305084745763e-06, - "loss": 1.3935, - "step": 23070 - }, - { - "epoch": 0.53, - "grad_norm": 2.108365297317505, - "learning_rate": 4.691525423728814e-06, - "loss": 1.294, - "step": 23080 - }, - { - "epoch": 0.53, - "grad_norm": 6.877602577209473, - "learning_rate": 4.684745762711865e-06, - "loss": 1.3154, - "step": 23090 - }, - { - "epoch": 0.53, - "grad_norm": 12.653403282165527, - "learning_rate": 4.677966101694916e-06, - "loss": 1.1215, - "step": 23100 - }, - { - "epoch": 0.54, - "grad_norm": 6.832087993621826, - "learning_rate": 4.671186440677967e-06, - "loss": 1.3124, - "step": 23110 - }, - { - "epoch": 0.54, - "grad_norm": 8.196941375732422, - "learning_rate": 4.664406779661017e-06, - "loss": 1.357, - "step": 23120 - }, - { - "epoch": 0.54, - "grad_norm": 1.573502540588379, - "learning_rate": 4.657627118644068e-06, - "loss": 1.3047, - "step": 23130 - }, - { - "epoch": 0.54, - "grad_norm": 6.144334316253662, - "learning_rate": 4.650847457627119e-06, - "loss": 1.2824, - "step": 23140 - }, - { - "epoch": 0.54, - "grad_norm": 2.6499195098876953, - "learning_rate": 4.64406779661017e-06, - "loss": 1.3157, - "step": 23150 - }, - { - "epoch": 0.54, - "grad_norm": 3.0066230297088623, - "learning_rate": 4.637288135593221e-06, - "loss": 1.2067, - "step": 23160 - }, - { - "epoch": 0.54, - "grad_norm": 2.056504726409912, - "learning_rate": 4.630508474576272e-06, - "loss": 1.3069, - "step": 23170 - }, - { - "epoch": 0.54, - "grad_norm": 5.741653919219971, - "learning_rate": 4.623728813559323e-06, - "loss": 1.2342, - "step": 23180 - }, - { - "epoch": 0.54, - "grad_norm": 14.528912544250488, - "learning_rate": 4.616949152542373e-06, - "loss": 1.1091, - "step": 23190 - }, - { - "epoch": 0.54, - "grad_norm": 2.9248435497283936, - "learning_rate": 4.610169491525424e-06, - "loss": 1.0753, - "step": 23200 - }, - { - "epoch": 0.54, - "grad_norm": 7.365577697753906, - "learning_rate": 4.603389830508475e-06, - "loss": 1.2809, - "step": 23210 - }, - { - "epoch": 0.54, - "grad_norm": 16.456893920898438, - "learning_rate": 4.596610169491526e-06, - "loss": 1.0776, - "step": 23220 - }, - { - "epoch": 0.54, - "grad_norm": 2.7638049125671387, - "learning_rate": 4.589830508474576e-06, - "loss": 1.3565, - "step": 23230 - }, - { - "epoch": 0.54, - "grad_norm": 3.463066816329956, - "learning_rate": 4.583050847457628e-06, - "loss": 1.2449, - "step": 23240 - }, - { - "epoch": 0.54, - "grad_norm": 5.465227127075195, - "learning_rate": 4.576271186440678e-06, - "loss": 1.2915, - "step": 23250 - }, - { - "epoch": 0.54, - "grad_norm": 4.380557060241699, - "learning_rate": 4.56949152542373e-06, - "loss": 1.3565, - "step": 23260 - }, - { - "epoch": 0.54, - "grad_norm": 4.977199554443359, - "learning_rate": 4.56271186440678e-06, - "loss": 1.3312, - "step": 23270 - }, - { - "epoch": 0.54, - "grad_norm": 4.714416027069092, - "learning_rate": 4.555932203389831e-06, - "loss": 1.318, - "step": 23280 - }, - { - "epoch": 0.54, - "grad_norm": 5.0472092628479, - "learning_rate": 4.549152542372881e-06, - "loss": 1.3783, - "step": 23290 - }, - { - "epoch": 0.54, - "grad_norm": 10.963289260864258, - "learning_rate": 4.542372881355933e-06, - "loss": 1.3035, - "step": 23300 - }, - { - "epoch": 0.54, - "grad_norm": 2.592956066131592, - "learning_rate": 4.535593220338983e-06, - "loss": 1.3272, - "step": 23310 - }, - { - "epoch": 0.54, - "grad_norm": 11.702526092529297, - "learning_rate": 4.528813559322035e-06, - "loss": 1.1363, - "step": 23320 - }, - { - "epoch": 0.54, - "grad_norm": 4.2335710525512695, - "learning_rate": 4.522033898305085e-06, - "loss": 1.3878, - "step": 23330 - }, - { - "epoch": 0.54, - "grad_norm": 7.734435558319092, - "learning_rate": 4.515254237288136e-06, - "loss": 1.3184, - "step": 23340 - }, - { - "epoch": 0.54, - "grad_norm": 3.9147768020629883, - "learning_rate": 4.508474576271187e-06, - "loss": 1.2077, - "step": 23350 - }, - { - "epoch": 0.54, - "grad_norm": 3.1659061908721924, - "learning_rate": 4.501694915254238e-06, - "loss": 1.3058, - "step": 23360 - }, - { - "epoch": 0.54, - "grad_norm": 3.9177017211914062, - "learning_rate": 4.494915254237289e-06, - "loss": 1.1948, - "step": 23370 - }, - { - "epoch": 0.54, - "grad_norm": 6.0111870765686035, - "learning_rate": 4.488135593220339e-06, - "loss": 1.1927, - "step": 23380 - }, - { - "epoch": 0.54, - "grad_norm": 5.929579257965088, - "learning_rate": 4.48135593220339e-06, - "loss": 1.299, - "step": 23390 - }, - { - "epoch": 0.54, - "grad_norm": 10.30470085144043, - "learning_rate": 4.474576271186441e-06, - "loss": 1.3172, - "step": 23400 - }, - { - "epoch": 0.54, - "grad_norm": 2.541947841644287, - "learning_rate": 4.467796610169492e-06, - "loss": 1.1529, - "step": 23410 - }, - { - "epoch": 0.54, - "grad_norm": 9.664104461669922, - "learning_rate": 4.461016949152543e-06, - "loss": 1.008, - "step": 23420 - }, - { - "epoch": 0.54, - "grad_norm": 3.3850762844085693, - "learning_rate": 4.454237288135594e-06, - "loss": 1.2801, - "step": 23430 - }, - { - "epoch": 0.54, - "grad_norm": 4.204078197479248, - "learning_rate": 4.447457627118645e-06, - "loss": 1.3791, - "step": 23440 - }, - { - "epoch": 0.54, - "grad_norm": 1.8552970886230469, - "learning_rate": 4.440677966101695e-06, - "loss": 1.1793, - "step": 23450 - }, - { - "epoch": 0.54, - "grad_norm": 9.347552299499512, - "learning_rate": 4.433898305084746e-06, - "loss": 1.2106, - "step": 23460 - }, - { - "epoch": 0.54, - "grad_norm": 3.1846823692321777, - "learning_rate": 4.427118644067797e-06, - "loss": 1.5138, - "step": 23470 - }, - { - "epoch": 0.54, - "grad_norm": 7.087679862976074, - "learning_rate": 4.420338983050848e-06, - "loss": 1.1433, - "step": 23480 - }, - { - "epoch": 0.54, - "grad_norm": 6.835491180419922, - "learning_rate": 4.413559322033898e-06, - "loss": 1.221, - "step": 23490 - }, - { - "epoch": 0.54, - "grad_norm": 5.99943733215332, - "learning_rate": 4.40677966101695e-06, - "loss": 1.4262, - "step": 23500 - }, - { - "epoch": 0.54, - "eval_loss": 1.024583339691162, - "eval_runtime": 67.2037, - "eval_samples_per_second": 14.88, - "eval_steps_per_second": 14.88, - "step": 23500 - }, - { - "epoch": 0.54, - "grad_norm": 11.216558456420898, - "learning_rate": 4.4e-06, - "loss": 1.2844, - "step": 23510 - }, - { - "epoch": 0.54, - "grad_norm": 8.619065284729004, - "learning_rate": 4.393220338983052e-06, - "loss": 1.4356, - "step": 23520 - }, - { - "epoch": 0.54, - "grad_norm": 11.311079978942871, - "learning_rate": 4.386440677966102e-06, - "loss": 1.1927, - "step": 23530 - }, - { - "epoch": 0.55, - "grad_norm": 4.765082836151123, - "learning_rate": 4.379661016949153e-06, - "loss": 1.3196, - "step": 23540 - }, - { - "epoch": 0.55, - "grad_norm": 3.4813079833984375, - "learning_rate": 4.372881355932203e-06, - "loss": 1.3019, - "step": 23550 - }, - { - "epoch": 0.55, - "grad_norm": 2.4694595336914062, - "learning_rate": 4.366101694915255e-06, - "loss": 1.0386, - "step": 23560 - }, - { - "epoch": 0.55, - "grad_norm": 5.6850481033325195, - "learning_rate": 4.359322033898305e-06, - "loss": 1.3539, - "step": 23570 - }, - { - "epoch": 0.55, - "grad_norm": 4.677857398986816, - "learning_rate": 4.352542372881357e-06, - "loss": 1.3064, - "step": 23580 - }, - { - "epoch": 0.55, - "grad_norm": 7.017549514770508, - "learning_rate": 4.345762711864407e-06, - "loss": 1.2506, - "step": 23590 - }, - { - "epoch": 0.55, - "grad_norm": 7.754286766052246, - "learning_rate": 4.338983050847458e-06, - "loss": 1.2139, - "step": 23600 - }, - { - "epoch": 0.55, - "grad_norm": 8.794230461120605, - "learning_rate": 4.332203389830509e-06, - "loss": 1.2354, - "step": 23610 - }, - { - "epoch": 0.55, - "grad_norm": 8.946222305297852, - "learning_rate": 4.32542372881356e-06, - "loss": 1.1265, - "step": 23620 - }, - { - "epoch": 0.55, - "grad_norm": 6.819628715515137, - "learning_rate": 4.318644067796611e-06, - "loss": 1.2775, - "step": 23630 - }, - { - "epoch": 0.55, - "grad_norm": 5.190433502197266, - "learning_rate": 4.311864406779661e-06, - "loss": 1.3512, - "step": 23640 - }, - { - "epoch": 0.55, - "grad_norm": 3.929642677307129, - "learning_rate": 4.305084745762712e-06, - "loss": 1.2403, - "step": 23650 - }, - { - "epoch": 0.55, - "grad_norm": 8.023259162902832, - "learning_rate": 4.298305084745763e-06, - "loss": 1.336, - "step": 23660 - }, - { - "epoch": 0.55, - "grad_norm": 5.015733242034912, - "learning_rate": 4.291525423728814e-06, - "loss": 1.2565, - "step": 23670 - }, - { - "epoch": 0.55, - "grad_norm": 7.108937740325928, - "learning_rate": 4.284745762711865e-06, - "loss": 1.2121, - "step": 23680 - }, - { - "epoch": 0.55, - "grad_norm": 10.912981033325195, - "learning_rate": 4.277966101694915e-06, - "loss": 1.353, - "step": 23690 - }, - { - "epoch": 0.55, - "grad_norm": 2.0632991790771484, - "learning_rate": 4.271186440677967e-06, - "loss": 1.2225, - "step": 23700 - }, - { - "epoch": 0.55, - "grad_norm": 9.637491226196289, - "learning_rate": 4.264406779661017e-06, - "loss": 1.1805, - "step": 23710 - }, - { - "epoch": 0.55, - "grad_norm": 8.959497451782227, - "learning_rate": 4.257627118644068e-06, - "loss": 1.2289, - "step": 23720 - }, - { - "epoch": 0.55, - "grad_norm": 9.83276081085205, - "learning_rate": 4.250847457627119e-06, - "loss": 1.0274, - "step": 23730 - }, - { - "epoch": 0.55, - "grad_norm": 9.423148155212402, - "learning_rate": 4.24406779661017e-06, - "loss": 1.2569, - "step": 23740 - }, - { - "epoch": 0.55, - "grad_norm": 4.011594772338867, - "learning_rate": 4.23728813559322e-06, - "loss": 1.1647, - "step": 23750 - }, - { - "epoch": 0.55, - "grad_norm": 6.655368328094482, - "learning_rate": 4.230508474576272e-06, - "loss": 1.4665, - "step": 23760 - }, - { - "epoch": 0.55, - "grad_norm": 3.5101451873779297, - "learning_rate": 4.223728813559322e-06, - "loss": 1.1459, - "step": 23770 - }, - { - "epoch": 0.55, - "grad_norm": 20.52288818359375, - "learning_rate": 4.216949152542374e-06, - "loss": 1.1491, - "step": 23780 - }, - { - "epoch": 0.55, - "grad_norm": 9.20742130279541, - "learning_rate": 4.210169491525424e-06, - "loss": 1.2733, - "step": 23790 - }, - { - "epoch": 0.55, - "grad_norm": 6.216460704803467, - "learning_rate": 4.203389830508475e-06, - "loss": 1.2745, - "step": 23800 - }, - { - "epoch": 0.55, - "grad_norm": 2.417329788208008, - "learning_rate": 4.196610169491525e-06, - "loss": 1.035, - "step": 23810 - }, - { - "epoch": 0.55, - "grad_norm": 10.275775909423828, - "learning_rate": 4.189830508474577e-06, - "loss": 1.2926, - "step": 23820 - }, - { - "epoch": 0.55, - "grad_norm": 3.9145314693450928, - "learning_rate": 4.183050847457627e-06, - "loss": 1.2843, - "step": 23830 - }, - { - "epoch": 0.55, - "grad_norm": 15.721736907958984, - "learning_rate": 4.176271186440679e-06, - "loss": 1.2988, - "step": 23840 - }, - { - "epoch": 0.55, - "grad_norm": 3.7596094608306885, - "learning_rate": 4.169491525423729e-06, - "loss": 1.3231, - "step": 23850 - }, - { - "epoch": 0.55, - "grad_norm": 1.9014712572097778, - "learning_rate": 4.16271186440678e-06, - "loss": 1.3156, - "step": 23860 - }, - { - "epoch": 0.55, - "grad_norm": 3.3489229679107666, - "learning_rate": 4.155932203389831e-06, - "loss": 1.3292, - "step": 23870 - }, - { - "epoch": 0.55, - "grad_norm": 4.741728782653809, - "learning_rate": 4.149152542372882e-06, - "loss": 1.317, - "step": 23880 - }, - { - "epoch": 0.55, - "grad_norm": 8.91144847869873, - "learning_rate": 4.142372881355933e-06, - "loss": 1.3414, - "step": 23890 - }, - { - "epoch": 0.55, - "grad_norm": 1.4547756910324097, - "learning_rate": 4.135593220338983e-06, - "loss": 1.2971, - "step": 23900 - }, - { - "epoch": 0.55, - "grad_norm": 8.662474632263184, - "learning_rate": 4.128813559322034e-06, - "loss": 1.1533, - "step": 23910 - }, - { - "epoch": 0.55, - "grad_norm": 3.9637677669525146, - "learning_rate": 4.122033898305085e-06, - "loss": 1.2564, - "step": 23920 - }, - { - "epoch": 0.55, - "grad_norm": 4.98095703125, - "learning_rate": 4.115254237288136e-06, - "loss": 1.0988, - "step": 23930 - }, - { - "epoch": 0.55, - "grad_norm": 6.112912654876709, - "learning_rate": 4.108474576271187e-06, - "loss": 1.058, - "step": 23940 - }, - { - "epoch": 0.55, - "grad_norm": 3.396317720413208, - "learning_rate": 4.101694915254237e-06, - "loss": 1.0615, - "step": 23950 - }, - { - "epoch": 0.55, - "grad_norm": 2.1811277866363525, - "learning_rate": 4.094915254237289e-06, - "loss": 1.0981, - "step": 23960 - }, - { - "epoch": 0.55, - "grad_norm": 7.639281749725342, - "learning_rate": 4.088135593220339e-06, - "loss": 1.39, - "step": 23970 - }, - { - "epoch": 0.56, - "grad_norm": 6.836162567138672, - "learning_rate": 4.081355932203391e-06, - "loss": 1.3098, - "step": 23980 - }, - { - "epoch": 0.56, - "grad_norm": 4.436983108520508, - "learning_rate": 4.074576271186441e-06, - "loss": 1.2947, - "step": 23990 - }, - { - "epoch": 0.56, - "grad_norm": 2.344386339187622, - "learning_rate": 4.067796610169492e-06, - "loss": 1.23, - "step": 24000 - }, - { - "epoch": 0.56, - "eval_loss": 1.038734793663025, - "eval_runtime": 68.6728, - "eval_samples_per_second": 14.562, - "eval_steps_per_second": 14.562, - "step": 24000 - }, - { - "epoch": 0.56, - "grad_norm": 2.359607219696045, - "learning_rate": 4.061016949152542e-06, - "loss": 1.4914, - "step": 24010 - }, - { - "epoch": 0.56, - "grad_norm": 3.044067621231079, - "learning_rate": 4.054237288135594e-06, - "loss": 1.0596, - "step": 24020 - }, - { - "epoch": 0.56, - "grad_norm": 6.635824203491211, - "learning_rate": 4.047457627118644e-06, - "loss": 1.1495, - "step": 24030 - }, - { - "epoch": 0.56, - "grad_norm": 2.152034044265747, - "learning_rate": 4.040677966101696e-06, - "loss": 1.2823, - "step": 24040 - }, - { - "epoch": 0.56, - "grad_norm": 6.193742752075195, - "learning_rate": 4.033898305084746e-06, - "loss": 1.2369, - "step": 24050 - }, - { - "epoch": 0.56, - "grad_norm": 5.2856245040893555, - "learning_rate": 4.027118644067797e-06, - "loss": 1.3229, - "step": 24060 - }, - { - "epoch": 0.56, - "grad_norm": 4.423139572143555, - "learning_rate": 4.020338983050847e-06, - "loss": 1.4499, - "step": 24070 - }, - { - "epoch": 0.56, - "grad_norm": 1.6255598068237305, - "learning_rate": 4.013559322033899e-06, - "loss": 1.1325, - "step": 24080 - }, - { - "epoch": 0.56, - "grad_norm": 1.646109700202942, - "learning_rate": 4.006779661016949e-06, - "loss": 1.2737, - "step": 24090 - }, - { - "epoch": 0.56, - "grad_norm": 5.978227615356445, - "learning_rate": 4.000000000000001e-06, - "loss": 1.1537, - "step": 24100 - }, - { - "epoch": 0.56, - "grad_norm": 9.052789688110352, - "learning_rate": 3.993220338983051e-06, - "loss": 1.4989, - "step": 24110 - }, - { - "epoch": 0.56, - "grad_norm": 5.04679012298584, - "learning_rate": 3.986440677966102e-06, - "loss": 1.3025, - "step": 24120 - }, - { - "epoch": 0.56, - "grad_norm": 4.390671730041504, - "learning_rate": 3.979661016949153e-06, - "loss": 1.3109, - "step": 24130 - }, - { - "epoch": 0.56, - "grad_norm": 11.515645027160645, - "learning_rate": 3.972881355932204e-06, - "loss": 1.3073, - "step": 24140 - }, - { - "epoch": 0.56, - "grad_norm": 8.161099433898926, - "learning_rate": 3.966101694915255e-06, - "loss": 1.0133, - "step": 24150 - }, - { - "epoch": 0.56, - "grad_norm": 5.4128851890563965, - "learning_rate": 3.959322033898305e-06, - "loss": 1.3091, - "step": 24160 - }, - { - "epoch": 0.56, - "grad_norm": 1.8805668354034424, - "learning_rate": 3.952542372881356e-06, - "loss": 1.4176, - "step": 24170 - }, - { - "epoch": 0.56, - "grad_norm": 7.201099395751953, - "learning_rate": 3.945762711864407e-06, - "loss": 1.0447, - "step": 24180 - }, - { - "epoch": 0.56, - "grad_norm": 2.0222816467285156, - "learning_rate": 3.938983050847458e-06, - "loss": 1.2018, - "step": 24190 - }, - { - "epoch": 0.56, - "grad_norm": 4.226831912994385, - "learning_rate": 3.932203389830509e-06, - "loss": 1.1199, - "step": 24200 - }, - { - "epoch": 0.56, - "grad_norm": 6.030657768249512, - "learning_rate": 3.925423728813559e-06, - "loss": 1.2632, - "step": 24210 - }, - { - "epoch": 0.56, - "grad_norm": 2.487889528274536, - "learning_rate": 3.918644067796611e-06, - "loss": 1.3595, - "step": 24220 - }, - { - "epoch": 0.56, - "grad_norm": 2.413583755493164, - "learning_rate": 3.911864406779661e-06, - "loss": 1.2903, - "step": 24230 - }, - { - "epoch": 0.56, - "grad_norm": 8.041131019592285, - "learning_rate": 3.905084745762713e-06, - "loss": 1.1923, - "step": 24240 - }, - { - "epoch": 0.56, - "grad_norm": 7.538460731506348, - "learning_rate": 3.898305084745763e-06, - "loss": 1.3482, - "step": 24250 - }, - { - "epoch": 0.56, - "grad_norm": 3.833472728729248, - "learning_rate": 3.891525423728814e-06, - "loss": 1.2592, - "step": 24260 - }, - { - "epoch": 0.56, - "grad_norm": 3.3971173763275146, - "learning_rate": 3.884745762711864e-06, - "loss": 1.2468, - "step": 24270 - }, - { - "epoch": 0.56, - "grad_norm": 10.157732009887695, - "learning_rate": 3.877966101694916e-06, - "loss": 1.3728, - "step": 24280 - }, - { - "epoch": 0.56, - "grad_norm": 3.008604049682617, - "learning_rate": 3.871186440677966e-06, - "loss": 1.0438, - "step": 24290 - }, - { - "epoch": 0.56, - "grad_norm": 9.531476020812988, - "learning_rate": 3.864406779661018e-06, - "loss": 1.3266, - "step": 24300 - }, - { - "epoch": 0.56, - "grad_norm": 6.8482818603515625, - "learning_rate": 3.857627118644068e-06, - "loss": 1.3832, - "step": 24310 - }, - { - "epoch": 0.56, - "grad_norm": 2.2738068103790283, - "learning_rate": 3.850847457627119e-06, - "loss": 1.3126, - "step": 24320 - }, - { - "epoch": 0.56, - "grad_norm": 9.141778945922852, - "learning_rate": 3.844067796610169e-06, - "loss": 1.1799, - "step": 24330 - }, - { - "epoch": 0.56, - "grad_norm": 4.968504905700684, - "learning_rate": 3.837288135593221e-06, - "loss": 1.2729, - "step": 24340 - }, - { - "epoch": 0.56, - "grad_norm": 1.8136253356933594, - "learning_rate": 3.830508474576271e-06, - "loss": 1.2373, - "step": 24350 - }, - { - "epoch": 0.56, - "grad_norm": 10.53274917602539, - "learning_rate": 3.823728813559323e-06, - "loss": 1.1203, - "step": 24360 - }, - { - "epoch": 0.56, - "grad_norm": 4.648149013519287, - "learning_rate": 3.816949152542373e-06, - "loss": 1.1483, - "step": 24370 - }, - { - "epoch": 0.56, - "grad_norm": 8.629014015197754, - "learning_rate": 3.8101694915254238e-06, - "loss": 1.3082, - "step": 24380 - }, - { - "epoch": 0.56, - "grad_norm": 6.339970588684082, - "learning_rate": 3.8033898305084748e-06, - "loss": 1.4333, - "step": 24390 - }, - { - "epoch": 0.56, - "grad_norm": 5.55668830871582, - "learning_rate": 3.7966101694915257e-06, - "loss": 1.4178, - "step": 24400 - }, - { - "epoch": 0.57, - "grad_norm": 6.1273884773254395, - "learning_rate": 3.7898305084745767e-06, - "loss": 1.1529, - "step": 24410 - }, - { - "epoch": 0.57, - "grad_norm": 3.123335361480713, - "learning_rate": 3.7830508474576273e-06, - "loss": 1.1294, - "step": 24420 - }, - { - "epoch": 0.57, - "grad_norm": 4.093799591064453, - "learning_rate": 3.7762711864406782e-06, - "loss": 1.263, - "step": 24430 - }, - { - "epoch": 0.57, - "grad_norm": 7.780729293823242, - "learning_rate": 3.7694915254237292e-06, - "loss": 1.1628, - "step": 24440 - }, - { - "epoch": 0.57, - "grad_norm": 5.5196614265441895, - "learning_rate": 3.76271186440678e-06, - "loss": 1.204, - "step": 24450 - }, - { - "epoch": 0.57, - "grad_norm": 4.023863315582275, - "learning_rate": 3.755932203389831e-06, - "loss": 1.2161, - "step": 24460 - }, - { - "epoch": 0.57, - "grad_norm": 12.464215278625488, - "learning_rate": 3.7491525423728813e-06, - "loss": 1.1825, - "step": 24470 - }, - { - "epoch": 0.57, - "grad_norm": 3.583104372024536, - "learning_rate": 3.7423728813559323e-06, - "loss": 1.2815, - "step": 24480 - }, - { - "epoch": 0.57, - "grad_norm": 2.9160475730895996, - "learning_rate": 3.7355932203389833e-06, - "loss": 1.2902, - "step": 24490 - }, - { - "epoch": 0.57, - "grad_norm": 5.577888488769531, - "learning_rate": 3.7288135593220342e-06, - "loss": 1.2927, - "step": 24500 - }, - { - "epoch": 0.57, - "eval_loss": 1.001274585723877, - "eval_runtime": 69.0094, - "eval_samples_per_second": 14.491, - "eval_steps_per_second": 14.491, - "step": 24500 - }, - { - "epoch": 0.57, - "grad_norm": 26.491436004638672, - "learning_rate": 3.7220338983050852e-06, - "loss": 1.3748, - "step": 24510 - }, - { - "epoch": 0.57, - "grad_norm": 3.675736904144287, - "learning_rate": 3.715254237288136e-06, - "loss": 1.3274, - "step": 24520 - }, - { - "epoch": 0.57, - "grad_norm": 0.7855976223945618, - "learning_rate": 3.7084745762711867e-06, - "loss": 1.1811, - "step": 24530 - }, - { - "epoch": 0.57, - "grad_norm": 16.270326614379883, - "learning_rate": 3.7016949152542377e-06, - "loss": 1.2293, - "step": 24540 - }, - { - "epoch": 0.57, - "grad_norm": 3.6461169719696045, - "learning_rate": 3.6949152542372883e-06, - "loss": 1.2937, - "step": 24550 - }, - { - "epoch": 0.57, - "grad_norm": 5.9607954025268555, - "learning_rate": 3.6881355932203393e-06, - "loss": 1.2017, - "step": 24560 - }, - { - "epoch": 0.57, - "grad_norm": 8.822928428649902, - "learning_rate": 3.6813559322033902e-06, - "loss": 1.2073, - "step": 24570 - }, - { - "epoch": 0.57, - "grad_norm": 3.9838669300079346, - "learning_rate": 3.6745762711864408e-06, - "loss": 1.0297, - "step": 24580 - }, - { - "epoch": 0.57, - "grad_norm": 3.180518865585327, - "learning_rate": 3.6677966101694918e-06, - "loss": 1.2039, - "step": 24590 - }, - { - "epoch": 0.57, - "grad_norm": 3.557624340057373, - "learning_rate": 3.6610169491525427e-06, - "loss": 1.2882, - "step": 24600 - }, - { - "epoch": 0.57, - "grad_norm": 2.1503684520721436, - "learning_rate": 3.6542372881355937e-06, - "loss": 1.2697, - "step": 24610 - }, - { - "epoch": 0.57, - "grad_norm": 4.290503978729248, - "learning_rate": 3.6474576271186447e-06, - "loss": 1.3283, - "step": 24620 - }, - { - "epoch": 0.57, - "grad_norm": 12.416634559631348, - "learning_rate": 3.640677966101695e-06, - "loss": 1.2059, - "step": 24630 - }, - { - "epoch": 0.57, - "grad_norm": 3.7786366939544678, - "learning_rate": 3.633898305084746e-06, - "loss": 1.1957, - "step": 24640 - }, - { - "epoch": 0.57, - "grad_norm": 6.75616455078125, - "learning_rate": 3.6271186440677968e-06, - "loss": 1.3586, - "step": 24650 - }, - { - "epoch": 0.57, - "grad_norm": 8.630069732666016, - "learning_rate": 3.6203389830508478e-06, - "loss": 1.1611, - "step": 24660 - }, - { - "epoch": 0.57, - "grad_norm": 3.7713470458984375, - "learning_rate": 3.6135593220338987e-06, - "loss": 1.1613, - "step": 24670 - }, - { - "epoch": 0.57, - "grad_norm": 8.666251182556152, - "learning_rate": 3.6067796610169493e-06, - "loss": 1.3801, - "step": 24680 - }, - { - "epoch": 0.57, - "grad_norm": 13.79195785522461, - "learning_rate": 3.6000000000000003e-06, - "loss": 1.3715, - "step": 24690 - }, - { - "epoch": 0.57, - "grad_norm": 1.1425895690917969, - "learning_rate": 3.5932203389830512e-06, - "loss": 1.2213, - "step": 24700 - }, - { - "epoch": 0.57, - "grad_norm": 4.308348655700684, - "learning_rate": 3.5864406779661022e-06, - "loss": 1.2354, - "step": 24710 - }, - { - "epoch": 0.57, - "grad_norm": 7.868862152099609, - "learning_rate": 3.579661016949153e-06, - "loss": 1.2111, - "step": 24720 - }, - { - "epoch": 0.57, - "grad_norm": 8.803450584411621, - "learning_rate": 3.5728813559322033e-06, - "loss": 1.2107, - "step": 24730 - }, - { - "epoch": 0.57, - "grad_norm": 5.428405284881592, - "learning_rate": 3.5661016949152543e-06, - "loss": 1.3327, - "step": 24740 - }, - { - "epoch": 0.57, - "grad_norm": 4.115501880645752, - "learning_rate": 3.5593220338983053e-06, - "loss": 1.167, - "step": 24750 - }, - { - "epoch": 0.57, - "grad_norm": 19.842674255371094, - "learning_rate": 3.5525423728813563e-06, - "loss": 1.2822, - "step": 24760 - }, - { - "epoch": 0.57, - "grad_norm": 17.216901779174805, - "learning_rate": 3.5457627118644072e-06, - "loss": 1.1838, - "step": 24770 - }, - { - "epoch": 0.57, - "grad_norm": 3.8147220611572266, - "learning_rate": 3.538983050847458e-06, - "loss": 1.3505, - "step": 24780 - }, - { - "epoch": 0.57, - "grad_norm": 6.949516773223877, - "learning_rate": 3.5322033898305088e-06, - "loss": 1.3103, - "step": 24790 - }, - { - "epoch": 0.57, - "grad_norm": 3.2688727378845215, - "learning_rate": 3.5254237288135597e-06, - "loss": 1.1356, - "step": 24800 - }, - { - "epoch": 0.57, - "grad_norm": 9.062920570373535, - "learning_rate": 3.5186440677966103e-06, - "loss": 1.182, - "step": 24810 - }, - { - "epoch": 0.57, - "grad_norm": 5.235373497009277, - "learning_rate": 3.5118644067796613e-06, - "loss": 1.3181, - "step": 24820 - }, - { - "epoch": 0.57, - "grad_norm": 6.707292079925537, - "learning_rate": 3.5050847457627122e-06, - "loss": 1.204, - "step": 24830 - }, - { - "epoch": 0.58, - "grad_norm": 3.9347891807556152, - "learning_rate": 3.498305084745763e-06, - "loss": 1.2906, - "step": 24840 - }, - { - "epoch": 0.58, - "grad_norm": 4.399641513824463, - "learning_rate": 3.4915254237288138e-06, - "loss": 1.3014, - "step": 24850 - }, - { - "epoch": 0.58, - "grad_norm": 2.8803553581237793, - "learning_rate": 3.4847457627118648e-06, - "loss": 1.2676, - "step": 24860 - }, - { - "epoch": 0.58, - "grad_norm": 8.522972106933594, - "learning_rate": 3.4779661016949157e-06, - "loss": 1.194, - "step": 24870 - }, - { - "epoch": 0.58, - "grad_norm": 6.1468825340271, - "learning_rate": 3.4711864406779667e-06, - "loss": 1.172, - "step": 24880 - }, - { - "epoch": 0.58, - "grad_norm": 4.32474422454834, - "learning_rate": 3.464406779661017e-06, - "loss": 1.2484, - "step": 24890 - }, - { - "epoch": 0.58, - "grad_norm": 11.452080726623535, - "learning_rate": 3.457627118644068e-06, - "loss": 1.291, - "step": 24900 - }, - { - "epoch": 0.58, - "grad_norm": 1.0818170309066772, - "learning_rate": 3.450847457627119e-06, - "loss": 1.1327, - "step": 24910 - }, - { - "epoch": 0.58, - "grad_norm": 5.037398815155029, - "learning_rate": 3.4440677966101698e-06, - "loss": 1.3494, - "step": 24920 - }, - { - "epoch": 0.58, - "grad_norm": 3.8455469608306885, - "learning_rate": 3.4372881355932207e-06, - "loss": 1.2283, - "step": 24930 - }, - { - "epoch": 0.58, - "grad_norm": 5.2556681632995605, - "learning_rate": 3.4305084745762713e-06, - "loss": 1.1897, - "step": 24940 - }, - { - "epoch": 0.58, - "grad_norm": 8.240093231201172, - "learning_rate": 3.4237288135593223e-06, - "loss": 1.3777, - "step": 24950 - }, - { - "epoch": 0.58, - "grad_norm": 2.0133845806121826, - "learning_rate": 3.4169491525423733e-06, - "loss": 1.2694, - "step": 24960 - }, - { - "epoch": 0.58, - "grad_norm": 1.9996819496154785, - "learning_rate": 3.4101694915254242e-06, - "loss": 1.2311, - "step": 24970 - }, - { - "epoch": 0.58, - "grad_norm": 3.0491392612457275, - "learning_rate": 3.403389830508475e-06, - "loss": 1.2316, - "step": 24980 - }, - { - "epoch": 0.58, - "grad_norm": 21.063587188720703, - "learning_rate": 3.3966101694915253e-06, - "loss": 1.2148, - "step": 24990 - }, - { - "epoch": 0.58, - "grad_norm": 13.847090721130371, - "learning_rate": 3.3898305084745763e-06, - "loss": 1.2155, - "step": 25000 - }, - { - "epoch": 0.58, - "eval_loss": 0.9729424715042114, - "eval_runtime": 68.3419, - "eval_samples_per_second": 14.632, - "eval_steps_per_second": 14.632, - "step": 25000 - }, - { - "epoch": 0.58, - "grad_norm": 5.260663032531738, - "learning_rate": 3.3830508474576273e-06, - "loss": 1.2702, - "step": 25010 - }, - { - "epoch": 0.58, - "grad_norm": 5.188079357147217, - "learning_rate": 3.3762711864406783e-06, - "loss": 1.2995, - "step": 25020 - }, - { - "epoch": 0.58, - "grad_norm": 7.982115745544434, - "learning_rate": 3.3694915254237292e-06, - "loss": 1.2687, - "step": 25030 - }, - { - "epoch": 0.58, - "grad_norm": 3.1299421787261963, - "learning_rate": 3.3627118644067802e-06, - "loss": 1.3372, - "step": 25040 - }, - { - "epoch": 0.58, - "grad_norm": 5.5412726402282715, - "learning_rate": 3.3559322033898308e-06, - "loss": 1.2002, - "step": 25050 - }, - { - "epoch": 0.58, - "grad_norm": 6.219215393066406, - "learning_rate": 3.3491525423728817e-06, - "loss": 1.1117, - "step": 25060 - }, - { - "epoch": 0.58, - "grad_norm": 3.6740598678588867, - "learning_rate": 3.3423728813559327e-06, - "loss": 1.2222, - "step": 25070 - }, - { - "epoch": 0.58, - "grad_norm": 5.168256759643555, - "learning_rate": 3.3355932203389833e-06, - "loss": 1.2059, - "step": 25080 - }, - { - "epoch": 0.58, - "grad_norm": 4.997997283935547, - "learning_rate": 3.3288135593220343e-06, - "loss": 1.3613, - "step": 25090 - }, - { - "epoch": 0.58, - "grad_norm": 10.958406448364258, - "learning_rate": 3.322033898305085e-06, - "loss": 1.2508, - "step": 25100 - }, - { - "epoch": 0.58, - "grad_norm": 7.793730735778809, - "learning_rate": 3.3152542372881358e-06, - "loss": 1.1418, - "step": 25110 - }, - { - "epoch": 0.58, - "grad_norm": 6.706384181976318, - "learning_rate": 3.3084745762711868e-06, - "loss": 1.1661, - "step": 25120 - }, - { - "epoch": 0.58, - "grad_norm": 9.17857551574707, - "learning_rate": 3.3016949152542377e-06, - "loss": 1.1569, - "step": 25130 - }, - { - "epoch": 0.58, - "grad_norm": 1.5717544555664062, - "learning_rate": 3.2949152542372887e-06, - "loss": 1.0838, - "step": 25140 - }, - { - "epoch": 0.58, - "grad_norm": 7.292511940002441, - "learning_rate": 3.288135593220339e-06, - "loss": 1.3231, - "step": 25150 - }, - { - "epoch": 0.58, - "grad_norm": 6.030792236328125, - "learning_rate": 3.28135593220339e-06, - "loss": 1.1247, - "step": 25160 - }, - { - "epoch": 0.58, - "grad_norm": 2.3071470260620117, - "learning_rate": 3.274576271186441e-06, - "loss": 1.113, - "step": 25170 - }, - { - "epoch": 0.58, - "grad_norm": 5.088674545288086, - "learning_rate": 3.2677966101694918e-06, - "loss": 1.0984, - "step": 25180 - }, - { - "epoch": 0.58, - "grad_norm": 13.216058731079102, - "learning_rate": 3.2610169491525428e-06, - "loss": 1.3596, - "step": 25190 - }, - { - "epoch": 0.58, - "grad_norm": 5.061989784240723, - "learning_rate": 3.2542372881355933e-06, - "loss": 1.1028, - "step": 25200 - }, - { - "epoch": 0.58, - "grad_norm": 5.740258693695068, - "learning_rate": 3.2474576271186443e-06, - "loss": 1.14, - "step": 25210 - }, - { - "epoch": 0.58, - "grad_norm": 5.7218098640441895, - "learning_rate": 3.2406779661016953e-06, - "loss": 1.3268, - "step": 25220 - }, - { - "epoch": 0.58, - "grad_norm": 10.579949378967285, - "learning_rate": 3.2338983050847462e-06, - "loss": 1.1409, - "step": 25230 - }, - { - "epoch": 0.58, - "grad_norm": 10.984354019165039, - "learning_rate": 3.2271186440677972e-06, - "loss": 1.3797, - "step": 25240 - }, - { - "epoch": 0.58, - "grad_norm": 8.028718948364258, - "learning_rate": 3.2203389830508473e-06, - "loss": 1.2275, - "step": 25250 - }, - { - "epoch": 0.58, - "grad_norm": 2.102750301361084, - "learning_rate": 3.2135593220338983e-06, - "loss": 1.3638, - "step": 25260 - }, - { - "epoch": 0.59, - "grad_norm": 12.67302417755127, - "learning_rate": 3.2067796610169493e-06, - "loss": 1.3955, - "step": 25270 - }, - { - "epoch": 0.59, - "grad_norm": 9.690311431884766, - "learning_rate": 3.2000000000000003e-06, - "loss": 1.178, - "step": 25280 - }, - { - "epoch": 0.59, - "grad_norm": 9.31050968170166, - "learning_rate": 3.1932203389830513e-06, - "loss": 1.3644, - "step": 25290 - }, - { - "epoch": 0.59, - "grad_norm": 6.5038933753967285, - "learning_rate": 3.186440677966102e-06, - "loss": 1.4397, - "step": 25300 - }, - { - "epoch": 0.59, - "grad_norm": 4.691766262054443, - "learning_rate": 3.1796610169491528e-06, - "loss": 1.3062, - "step": 25310 - }, - { - "epoch": 0.59, - "grad_norm": 9.108213424682617, - "learning_rate": 3.1728813559322038e-06, - "loss": 1.1835, - "step": 25320 - }, - { - "epoch": 0.59, - "grad_norm": 6.10239315032959, - "learning_rate": 3.1661016949152547e-06, - "loss": 1.3249, - "step": 25330 - }, - { - "epoch": 0.59, - "grad_norm": 3.791691780090332, - "learning_rate": 3.1593220338983053e-06, - "loss": 1.2023, - "step": 25340 - }, - { - "epoch": 0.59, - "grad_norm": 5.800852298736572, - "learning_rate": 3.1525423728813563e-06, - "loss": 1.352, - "step": 25350 - }, - { - "epoch": 0.59, - "grad_norm": 6.795603275299072, - "learning_rate": 3.145762711864407e-06, - "loss": 1.3013, - "step": 25360 - }, - { - "epoch": 0.59, - "grad_norm": 8.434121131896973, - "learning_rate": 3.138983050847458e-06, - "loss": 1.1872, - "step": 25370 - }, - { - "epoch": 0.59, - "grad_norm": 4.576048851013184, - "learning_rate": 3.1322033898305088e-06, - "loss": 1.2377, - "step": 25380 - }, - { - "epoch": 0.59, - "grad_norm": 11.319803237915039, - "learning_rate": 3.1254237288135598e-06, - "loss": 1.1306, - "step": 25390 - }, - { - "epoch": 0.59, - "grad_norm": 11.858842849731445, - "learning_rate": 3.1186440677966107e-06, - "loss": 1.3123, - "step": 25400 - }, - { - "epoch": 0.59, - "grad_norm": 10.17744255065918, - "learning_rate": 3.111864406779661e-06, - "loss": 1.2951, - "step": 25410 - }, - { - "epoch": 0.59, - "grad_norm": 5.539567947387695, - "learning_rate": 3.105084745762712e-06, - "loss": 1.369, - "step": 25420 - }, - { - "epoch": 0.59, - "grad_norm": 6.66847038269043, - "learning_rate": 3.098305084745763e-06, - "loss": 1.1254, - "step": 25430 - }, - { - "epoch": 0.59, - "grad_norm": 2.0820562839508057, - "learning_rate": 3.091525423728814e-06, - "loss": 1.0718, - "step": 25440 - }, - { - "epoch": 0.59, - "grad_norm": 1.1296765804290771, - "learning_rate": 3.0847457627118648e-06, - "loss": 1.3201, - "step": 25450 - }, - { - "epoch": 0.59, - "grad_norm": 6.302907943725586, - "learning_rate": 3.0779661016949153e-06, - "loss": 1.3956, - "step": 25460 - }, - { - "epoch": 0.59, - "grad_norm": 7.594703674316406, - "learning_rate": 3.0711864406779663e-06, - "loss": 1.109, - "step": 25470 - }, - { - "epoch": 0.59, - "grad_norm": 4.60818338394165, - "learning_rate": 3.0644067796610173e-06, - "loss": 1.194, - "step": 25480 - }, - { - "epoch": 0.59, - "grad_norm": 9.426482200622559, - "learning_rate": 3.0576271186440683e-06, - "loss": 1.1175, - "step": 25490 - }, - { - "epoch": 0.59, - "grad_norm": 1.6626375913619995, - "learning_rate": 3.0508474576271192e-06, - "loss": 1.241, - "step": 25500 - }, - { - "epoch": 0.59, - "eval_loss": 0.998432993888855, - "eval_runtime": 66.8613, - "eval_samples_per_second": 14.956, - "eval_steps_per_second": 14.956, - "step": 25500 - }, - { - "epoch": 0.59, - "grad_norm": 5.7481255531311035, - "learning_rate": 3.0440677966101694e-06, - "loss": 1.0914, - "step": 25510 - }, - { - "epoch": 0.59, - "grad_norm": 6.295405864715576, - "learning_rate": 3.0372881355932203e-06, - "loss": 1.2265, - "step": 25520 - }, - { - "epoch": 0.59, - "grad_norm": 7.302733898162842, - "learning_rate": 3.0305084745762713e-06, - "loss": 1.2384, - "step": 25530 - }, - { - "epoch": 0.59, - "grad_norm": 6.079695224761963, - "learning_rate": 3.0237288135593223e-06, - "loss": 1.3285, - "step": 25540 - }, - { - "epoch": 0.59, - "grad_norm": 3.6032302379608154, - "learning_rate": 3.0169491525423733e-06, - "loss": 1.3156, - "step": 25550 - }, - { - "epoch": 0.59, - "grad_norm": 4.823906421661377, - "learning_rate": 3.010169491525424e-06, - "loss": 0.8886, - "step": 25560 - }, - { - "epoch": 0.59, - "grad_norm": 3.6529293060302734, - "learning_rate": 3.003389830508475e-06, - "loss": 1.2771, - "step": 25570 - }, - { - "epoch": 0.59, - "grad_norm": 8.288748741149902, - "learning_rate": 2.9966101694915258e-06, - "loss": 1.1765, - "step": 25580 - }, - { - "epoch": 0.59, - "grad_norm": 17.39524269104004, - "learning_rate": 2.9898305084745768e-06, - "loss": 1.3095, - "step": 25590 - }, - { - "epoch": 0.59, - "grad_norm": 6.149013042449951, - "learning_rate": 2.9830508474576277e-06, - "loss": 0.9663, - "step": 25600 - }, - { - "epoch": 0.59, - "grad_norm": 9.429570198059082, - "learning_rate": 2.9762711864406783e-06, - "loss": 1.1291, - "step": 25610 - }, - { - "epoch": 0.59, - "grad_norm": 7.571442127227783, - "learning_rate": 2.969491525423729e-06, - "loss": 1.0891, - "step": 25620 - }, - { - "epoch": 0.59, - "grad_norm": 8.435641288757324, - "learning_rate": 2.96271186440678e-06, - "loss": 1.207, - "step": 25630 - }, - { - "epoch": 0.59, - "grad_norm": 5.694164276123047, - "learning_rate": 2.955932203389831e-06, - "loss": 1.2288, - "step": 25640 - }, - { - "epoch": 0.59, - "grad_norm": 7.887094974517822, - "learning_rate": 2.9491525423728818e-06, - "loss": 1.2683, - "step": 25650 - }, - { - "epoch": 0.59, - "grad_norm": 1.4776153564453125, - "learning_rate": 2.9423728813559327e-06, - "loss": 1.2045, - "step": 25660 - }, - { - "epoch": 0.59, - "grad_norm": 22.860342025756836, - "learning_rate": 2.935593220338983e-06, - "loss": 1.1633, - "step": 25670 - }, - { - "epoch": 0.59, - "grad_norm": 5.612551689147949, - "learning_rate": 2.928813559322034e-06, - "loss": 1.2065, - "step": 25680 - }, - { - "epoch": 0.59, - "grad_norm": 4.532423973083496, - "learning_rate": 2.922033898305085e-06, - "loss": 1.2672, - "step": 25690 - }, - { - "epoch": 0.6, - "grad_norm": 8.843605041503906, - "learning_rate": 2.915254237288136e-06, - "loss": 1.2333, - "step": 25700 - }, - { - "epoch": 0.6, - "grad_norm": 7.5249481201171875, - "learning_rate": 2.9084745762711868e-06, - "loss": 1.2351, - "step": 25710 - }, - { - "epoch": 0.6, - "grad_norm": 4.021324634552002, - "learning_rate": 2.9016949152542373e-06, - "loss": 1.302, - "step": 25720 - }, - { - "epoch": 0.6, - "grad_norm": 12.174166679382324, - "learning_rate": 2.8949152542372883e-06, - "loss": 1.1858, - "step": 25730 - }, - { - "epoch": 0.6, - "grad_norm": 9.587007522583008, - "learning_rate": 2.8881355932203393e-06, - "loss": 1.2249, - "step": 25740 - }, - { - "epoch": 0.6, - "grad_norm": 9.73293399810791, - "learning_rate": 2.8813559322033903e-06, - "loss": 1.1867, - "step": 25750 - }, - { - "epoch": 0.6, - "grad_norm": 17.562885284423828, - "learning_rate": 2.8745762711864412e-06, - "loss": 1.2162, - "step": 25760 - }, - { - "epoch": 0.6, - "grad_norm": 2.855794668197632, - "learning_rate": 2.8677966101694914e-06, - "loss": 1.2218, - "step": 25770 - }, - { - "epoch": 0.6, - "grad_norm": 8.89600658416748, - "learning_rate": 2.8610169491525424e-06, - "loss": 1.288, - "step": 25780 - }, - { - "epoch": 0.6, - "grad_norm": 9.732436180114746, - "learning_rate": 2.8542372881355933e-06, - "loss": 1.4694, - "step": 25790 - }, - { - "epoch": 0.6, - "grad_norm": 7.497152805328369, - "learning_rate": 2.8474576271186443e-06, - "loss": 1.1525, - "step": 25800 - }, - { - "epoch": 0.6, - "grad_norm": 1.6418439149856567, - "learning_rate": 2.8406779661016953e-06, - "loss": 1.1873, - "step": 25810 - }, - { - "epoch": 0.6, - "grad_norm": 9.75865650177002, - "learning_rate": 2.833898305084746e-06, - "loss": 1.3353, - "step": 25820 - }, - { - "epoch": 0.6, - "grad_norm": 9.915677070617676, - "learning_rate": 2.827118644067797e-06, - "loss": 1.2384, - "step": 25830 - }, - { - "epoch": 0.6, - "grad_norm": 11.455119132995605, - "learning_rate": 2.820338983050848e-06, - "loss": 1.0484, - "step": 25840 - }, - { - "epoch": 0.6, - "grad_norm": 4.330808162689209, - "learning_rate": 2.8135593220338988e-06, - "loss": 1.1905, - "step": 25850 - }, - { - "epoch": 0.6, - "grad_norm": 7.7616376876831055, - "learning_rate": 2.8067796610169497e-06, - "loss": 1.2721, - "step": 25860 - }, - { - "epoch": 0.6, - "grad_norm": 9.885458946228027, - "learning_rate": 2.8000000000000003e-06, - "loss": 0.9887, - "step": 25870 - }, - { - "epoch": 0.6, - "grad_norm": 7.448554039001465, - "learning_rate": 2.793220338983051e-06, - "loss": 1.1294, - "step": 25880 - }, - { - "epoch": 0.6, - "grad_norm": 3.008898973464966, - "learning_rate": 2.786440677966102e-06, - "loss": 1.2489, - "step": 25890 - }, - { - "epoch": 0.6, - "grad_norm": 8.211142539978027, - "learning_rate": 2.779661016949153e-06, - "loss": 1.2185, - "step": 25900 - }, - { - "epoch": 0.6, - "grad_norm": 7.966844081878662, - "learning_rate": 2.7728813559322038e-06, - "loss": 1.3666, - "step": 25910 - }, - { - "epoch": 0.6, - "grad_norm": 7.830656051635742, - "learning_rate": 2.7661016949152548e-06, - "loss": 1.3537, - "step": 25920 - }, - { - "epoch": 0.6, - "grad_norm": 4.714284420013428, - "learning_rate": 2.7593220338983053e-06, - "loss": 1.1768, - "step": 25930 - }, - { - "epoch": 0.6, - "grad_norm": 11.090384483337402, - "learning_rate": 2.752542372881356e-06, - "loss": 1.3252, - "step": 25940 - }, - { - "epoch": 0.6, - "grad_norm": 7.201011657714844, - "learning_rate": 2.745762711864407e-06, - "loss": 1.3252, - "step": 25950 - }, - { - "epoch": 0.6, - "grad_norm": 3.7255921363830566, - "learning_rate": 2.738983050847458e-06, - "loss": 1.2302, - "step": 25960 - }, - { - "epoch": 0.6, - "grad_norm": 9.323339462280273, - "learning_rate": 2.732203389830509e-06, - "loss": 1.2951, - "step": 25970 - }, - { - "epoch": 0.6, - "grad_norm": 24.529966354370117, - "learning_rate": 2.7254237288135593e-06, - "loss": 1.2912, - "step": 25980 - }, - { - "epoch": 0.6, - "grad_norm": 4.739007472991943, - "learning_rate": 2.7186440677966103e-06, - "loss": 0.9912, - "step": 25990 - }, - { - "epoch": 0.6, - "grad_norm": 8.324442863464355, - "learning_rate": 2.7118644067796613e-06, - "loss": 1.1363, - "step": 26000 - }, - { - "epoch": 0.6, - "eval_loss": 0.9869400858879089, - "eval_runtime": 66.847, - "eval_samples_per_second": 14.96, - "eval_steps_per_second": 14.96, - "step": 26000 - }, - { - "epoch": 0.6, - "grad_norm": 8.513010025024414, - "learning_rate": 2.7050847457627123e-06, - "loss": 1.1538, - "step": 26010 - }, - { - "epoch": 0.6, - "grad_norm": 3.9168806076049805, - "learning_rate": 2.6983050847457633e-06, - "loss": 1.0728, - "step": 26020 - }, - { - "epoch": 0.6, - "grad_norm": 5.090580463409424, - "learning_rate": 2.6915254237288134e-06, - "loss": 1.2442, - "step": 26030 - }, - { - "epoch": 0.6, - "grad_norm": 5.569304466247559, - "learning_rate": 2.6847457627118644e-06, - "loss": 0.9942, - "step": 26040 - }, - { - "epoch": 0.6, - "grad_norm": 6.856686115264893, - "learning_rate": 2.6779661016949153e-06, - "loss": 1.1926, - "step": 26050 - }, - { - "epoch": 0.6, - "grad_norm": 4.85548210144043, - "learning_rate": 2.6711864406779663e-06, - "loss": 1.1911, - "step": 26060 - }, - { - "epoch": 0.6, - "grad_norm": 5.3112335205078125, - "learning_rate": 2.6644067796610173e-06, - "loss": 1.1381, - "step": 26070 - }, - { - "epoch": 0.6, - "grad_norm": 5.764985084533691, - "learning_rate": 2.657627118644068e-06, - "loss": 1.1531, - "step": 26080 - }, - { - "epoch": 0.6, - "grad_norm": 6.8243184089660645, - "learning_rate": 2.650847457627119e-06, - "loss": 1.1892, - "step": 26090 - }, - { - "epoch": 0.6, - "grad_norm": 6.5991692543029785, - "learning_rate": 2.64406779661017e-06, - "loss": 1.0998, - "step": 26100 - }, - { - "epoch": 0.6, - "grad_norm": 9.651345252990723, - "learning_rate": 2.6372881355932208e-06, - "loss": 1.2483, - "step": 26110 - }, - { - "epoch": 0.6, - "grad_norm": 3.886197328567505, - "learning_rate": 2.6305084745762718e-06, - "loss": 1.1655, - "step": 26120 - }, - { - "epoch": 0.6, - "grad_norm": 4.277401924133301, - "learning_rate": 2.6237288135593223e-06, - "loss": 1.2908, - "step": 26130 - }, - { - "epoch": 0.61, - "grad_norm": 4.072823524475098, - "learning_rate": 2.616949152542373e-06, - "loss": 1.0785, - "step": 26140 - }, - { - "epoch": 0.61, - "grad_norm": 4.426355838775635, - "learning_rate": 2.610169491525424e-06, - "loss": 1.157, - "step": 26150 - }, - { - "epoch": 0.61, - "grad_norm": 1.601050853729248, - "learning_rate": 2.603389830508475e-06, - "loss": 1.3465, - "step": 26160 - }, - { - "epoch": 0.61, - "grad_norm": 22.35835075378418, - "learning_rate": 2.596610169491526e-06, - "loss": 1.3287, - "step": 26170 - }, - { - "epoch": 0.61, - "grad_norm": 4.574925422668457, - "learning_rate": 2.5898305084745768e-06, - "loss": 1.1298, - "step": 26180 - }, - { - "epoch": 0.61, - "grad_norm": 6.630609512329102, - "learning_rate": 2.5830508474576273e-06, - "loss": 1.1522, - "step": 26190 - }, - { - "epoch": 0.61, - "grad_norm": 5.038731098175049, - "learning_rate": 2.576271186440678e-06, - "loss": 1.1709, - "step": 26200 - }, - { - "epoch": 0.61, - "grad_norm": 9.418252944946289, - "learning_rate": 2.569491525423729e-06, - "loss": 1.2216, - "step": 26210 - }, - { - "epoch": 0.61, - "grad_norm": 16.634244918823242, - "learning_rate": 2.56271186440678e-06, - "loss": 1.2476, - "step": 26220 - }, - { - "epoch": 0.61, - "grad_norm": 6.329607009887695, - "learning_rate": 2.555932203389831e-06, - "loss": 1.3251, - "step": 26230 - }, - { - "epoch": 0.61, - "grad_norm": 7.770134449005127, - "learning_rate": 2.5491525423728814e-06, - "loss": 1.3357, - "step": 26240 - }, - { - "epoch": 0.61, - "grad_norm": 8.234681129455566, - "learning_rate": 2.5423728813559323e-06, - "loss": 1.3181, - "step": 26250 - }, - { - "epoch": 0.61, - "grad_norm": 4.855250358581543, - "learning_rate": 2.5355932203389833e-06, - "loss": 1.3914, - "step": 26260 - }, - { - "epoch": 0.61, - "grad_norm": 9.435198783874512, - "learning_rate": 2.5288135593220343e-06, - "loss": 1.225, - "step": 26270 - }, - { - "epoch": 0.61, - "grad_norm": 2.1608779430389404, - "learning_rate": 2.5220338983050853e-06, - "loss": 1.4329, - "step": 26280 - }, - { - "epoch": 0.61, - "grad_norm": 5.846624851226807, - "learning_rate": 2.5152542372881354e-06, - "loss": 1.2809, - "step": 26290 - }, - { - "epoch": 0.61, - "grad_norm": 2.737983465194702, - "learning_rate": 2.5084745762711864e-06, - "loss": 1.1873, - "step": 26300 - }, - { - "epoch": 0.61, - "grad_norm": 2.908550977706909, - "learning_rate": 2.5016949152542374e-06, - "loss": 1.1736, - "step": 26310 - }, - { - "epoch": 0.61, - "grad_norm": 36.61166000366211, - "learning_rate": 2.4949152542372883e-06, - "loss": 1.1875, - "step": 26320 - }, - { - "epoch": 0.61, - "grad_norm": 20.149581909179688, - "learning_rate": 2.488135593220339e-06, - "loss": 1.3924, - "step": 26330 - }, - { - "epoch": 0.61, - "grad_norm": 8.97331428527832, - "learning_rate": 2.48135593220339e-06, - "loss": 1.0913, - "step": 26340 - }, - { - "epoch": 0.61, - "grad_norm": 6.3798441886901855, - "learning_rate": 2.474576271186441e-06, - "loss": 1.0683, - "step": 26350 - }, - { - "epoch": 0.61, - "grad_norm": 13.193538665771484, - "learning_rate": 2.467796610169492e-06, - "loss": 1.0535, - "step": 26360 - }, - { - "epoch": 0.61, - "grad_norm": 14.773574829101562, - "learning_rate": 2.461016949152543e-06, - "loss": 1.0665, - "step": 26370 - }, - { - "epoch": 0.61, - "grad_norm": 3.1088144779205322, - "learning_rate": 2.4542372881355933e-06, - "loss": 1.3309, - "step": 26380 - }, - { - "epoch": 0.61, - "grad_norm": 3.1904797554016113, - "learning_rate": 2.4474576271186443e-06, - "loss": 1.2429, - "step": 26390 - }, - { - "epoch": 0.61, - "grad_norm": 3.5016331672668457, - "learning_rate": 2.4406779661016953e-06, - "loss": 1.2189, - "step": 26400 - }, - { - "epoch": 0.61, - "grad_norm": 6.315566539764404, - "learning_rate": 2.433898305084746e-06, - "loss": 1.2291, - "step": 26410 - }, - { - "epoch": 0.61, - "grad_norm": 14.041393280029297, - "learning_rate": 2.427118644067797e-06, - "loss": 1.1644, - "step": 26420 - }, - { - "epoch": 0.61, - "grad_norm": 20.225662231445312, - "learning_rate": 2.4203389830508474e-06, - "loss": 1.2319, - "step": 26430 - }, - { - "epoch": 0.61, - "grad_norm": 3.7186009883880615, - "learning_rate": 2.4135593220338984e-06, - "loss": 1.2453, - "step": 26440 - }, - { - "epoch": 0.61, - "grad_norm": 2.4279704093933105, - "learning_rate": 2.4067796610169493e-06, - "loss": 1.0585, - "step": 26450 - }, - { - "epoch": 0.61, - "grad_norm": 4.853421211242676, - "learning_rate": 2.4000000000000003e-06, - "loss": 1.2464, - "step": 26460 - }, - { - "epoch": 0.61, - "grad_norm": 4.436877250671387, - "learning_rate": 2.393220338983051e-06, - "loss": 1.3915, - "step": 26470 - }, - { - "epoch": 0.61, - "grad_norm": 3.7520599365234375, - "learning_rate": 2.386440677966102e-06, - "loss": 1.3439, - "step": 26480 - }, - { - "epoch": 0.61, - "grad_norm": 1.7208768129348755, - "learning_rate": 2.379661016949153e-06, - "loss": 1.2509, - "step": 26490 - }, - { - "epoch": 0.61, - "grad_norm": 2.465243101119995, - "learning_rate": 2.372881355932204e-06, - "loss": 1.2785, - "step": 26500 - }, - { - "epoch": 0.61, - "eval_loss": 0.9529283046722412, - "eval_runtime": 66.4596, - "eval_samples_per_second": 15.047, - "eval_steps_per_second": 15.047, - "step": 26500 - }, - { - "epoch": 0.61, - "grad_norm": 16.25847816467285, - "learning_rate": 2.3661016949152544e-06, - "loss": 1.2935, - "step": 26510 - }, - { - "epoch": 0.61, - "grad_norm": 5.584811687469482, - "learning_rate": 2.3593220338983053e-06, - "loss": 1.1677, - "step": 26520 - }, - { - "epoch": 0.61, - "grad_norm": 5.604238033294678, - "learning_rate": 2.3525423728813563e-06, - "loss": 1.1095, - "step": 26530 - }, - { - "epoch": 0.61, - "grad_norm": 7.715309143066406, - "learning_rate": 2.345762711864407e-06, - "loss": 1.3044, - "step": 26540 - }, - { - "epoch": 0.61, - "grad_norm": 1.3220744132995605, - "learning_rate": 2.338983050847458e-06, - "loss": 1.1298, - "step": 26550 - }, - { - "epoch": 0.61, - "grad_norm": 2.8409392833709717, - "learning_rate": 2.3322033898305084e-06, - "loss": 1.2565, - "step": 26560 - }, - { - "epoch": 0.62, - "grad_norm": 3.3840484619140625, - "learning_rate": 2.3254237288135594e-06, - "loss": 1.2529, - "step": 26570 - }, - { - "epoch": 0.62, - "grad_norm": 2.952516555786133, - "learning_rate": 2.3186440677966103e-06, - "loss": 1.3135, - "step": 26580 - }, - { - "epoch": 0.62, - "grad_norm": 6.298173904418945, - "learning_rate": 2.3118644067796613e-06, - "loss": 1.2517, - "step": 26590 - }, - { - "epoch": 0.62, - "grad_norm": 17.91318702697754, - "learning_rate": 2.305084745762712e-06, - "loss": 1.1782, - "step": 26600 - }, - { - "epoch": 0.62, - "grad_norm": 9.400506973266602, - "learning_rate": 2.298305084745763e-06, - "loss": 1.3047, - "step": 26610 - }, - { - "epoch": 0.62, - "grad_norm": 8.402453422546387, - "learning_rate": 2.291525423728814e-06, - "loss": 1.3617, - "step": 26620 - }, - { - "epoch": 0.62, - "grad_norm": 6.365287780761719, - "learning_rate": 2.284745762711865e-06, - "loss": 1.2257, - "step": 26630 - }, - { - "epoch": 0.62, - "grad_norm": 2.8725926876068115, - "learning_rate": 2.2779661016949154e-06, - "loss": 1.0295, - "step": 26640 - }, - { - "epoch": 0.62, - "grad_norm": 13.41898250579834, - "learning_rate": 2.2711864406779663e-06, - "loss": 1.1358, - "step": 26650 - }, - { - "epoch": 0.62, - "grad_norm": 3.4462170600891113, - "learning_rate": 2.2644067796610173e-06, - "loss": 1.2295, - "step": 26660 - }, - { - "epoch": 0.62, - "grad_norm": 1.85061514377594, - "learning_rate": 2.257627118644068e-06, - "loss": 1.2474, - "step": 26670 - }, - { - "epoch": 0.62, - "grad_norm": 4.921010971069336, - "learning_rate": 2.250847457627119e-06, - "loss": 1.4324, - "step": 26680 - }, - { - "epoch": 0.62, - "grad_norm": 5.649210453033447, - "learning_rate": 2.2440677966101694e-06, - "loss": 1.2167, - "step": 26690 - }, - { - "epoch": 0.62, - "grad_norm": 13.460455894470215, - "learning_rate": 2.2372881355932204e-06, - "loss": 1.1124, - "step": 26700 - }, - { - "epoch": 0.62, - "grad_norm": 4.873124599456787, - "learning_rate": 2.2305084745762714e-06, - "loss": 1.2533, - "step": 26710 - }, - { - "epoch": 0.62, - "grad_norm": 4.260035514831543, - "learning_rate": 2.2237288135593223e-06, - "loss": 1.1214, - "step": 26720 - }, - { - "epoch": 0.62, - "grad_norm": 9.646221160888672, - "learning_rate": 2.216949152542373e-06, - "loss": 1.3115, - "step": 26730 - }, - { - "epoch": 0.62, - "grad_norm": 2.029663324356079, - "learning_rate": 2.210169491525424e-06, - "loss": 1.1277, - "step": 26740 - }, - { - "epoch": 0.62, - "grad_norm": 2.2031261920928955, - "learning_rate": 2.203389830508475e-06, - "loss": 1.275, - "step": 26750 - }, - { - "epoch": 0.62, - "grad_norm": 7.809500217437744, - "learning_rate": 2.196610169491526e-06, - "loss": 1.4312, - "step": 26760 - }, - { - "epoch": 0.62, - "grad_norm": 3.175096273422241, - "learning_rate": 2.1898305084745764e-06, - "loss": 1.3533, - "step": 26770 - }, - { - "epoch": 0.62, - "grad_norm": 7.439695835113525, - "learning_rate": 2.1830508474576273e-06, - "loss": 1.2436, - "step": 26780 - }, - { - "epoch": 0.62, - "grad_norm": 6.698858737945557, - "learning_rate": 2.1762711864406783e-06, - "loss": 1.3145, - "step": 26790 - }, - { - "epoch": 0.62, - "grad_norm": 4.412804126739502, - "learning_rate": 2.169491525423729e-06, - "loss": 1.185, - "step": 26800 - }, - { - "epoch": 0.62, - "grad_norm": 9.170161247253418, - "learning_rate": 2.16271186440678e-06, - "loss": 1.1868, - "step": 26810 - }, - { - "epoch": 0.62, - "grad_norm": 21.11345672607422, - "learning_rate": 2.1559322033898304e-06, - "loss": 1.2738, - "step": 26820 - }, - { - "epoch": 0.62, - "grad_norm": 52.31527328491211, - "learning_rate": 2.1491525423728814e-06, - "loss": 1.1694, - "step": 26830 - }, - { - "epoch": 0.62, - "grad_norm": 4.136544704437256, - "learning_rate": 2.1423728813559324e-06, - "loss": 1.0897, - "step": 26840 - }, - { - "epoch": 0.62, - "grad_norm": 3.128509759902954, - "learning_rate": 2.1355932203389833e-06, - "loss": 1.1818, - "step": 26850 - }, - { - "epoch": 0.62, - "grad_norm": 12.374292373657227, - "learning_rate": 2.128813559322034e-06, - "loss": 1.3163, - "step": 26860 - }, - { - "epoch": 0.62, - "grad_norm": 8.347363471984863, - "learning_rate": 2.122033898305085e-06, - "loss": 1.1177, - "step": 26870 - }, - { - "epoch": 0.62, - "grad_norm": 17.513408660888672, - "learning_rate": 2.115254237288136e-06, - "loss": 1.2128, - "step": 26880 - }, - { - "epoch": 0.62, - "grad_norm": 5.053252220153809, - "learning_rate": 2.108474576271187e-06, - "loss": 1.4111, - "step": 26890 - }, - { - "epoch": 0.62, - "grad_norm": 2.5421042442321777, - "learning_rate": 2.1016949152542374e-06, - "loss": 1.3909, - "step": 26900 - }, - { - "epoch": 0.62, - "grad_norm": 14.27386474609375, - "learning_rate": 2.0949152542372883e-06, - "loss": 1.3056, - "step": 26910 - }, - { - "epoch": 0.62, - "grad_norm": 3.5711681842803955, - "learning_rate": 2.0881355932203393e-06, - "loss": 1.1021, - "step": 26920 - }, - { - "epoch": 0.62, - "grad_norm": 7.276656627655029, - "learning_rate": 2.08135593220339e-06, - "loss": 1.0432, - "step": 26930 - }, - { - "epoch": 0.62, - "grad_norm": 2.4546101093292236, - "learning_rate": 2.074576271186441e-06, - "loss": 1.3391, - "step": 26940 - }, - { - "epoch": 0.62, - "grad_norm": 6.01200008392334, - "learning_rate": 2.0677966101694914e-06, - "loss": 1.1382, - "step": 26950 - }, - { - "epoch": 0.62, - "grad_norm": 6.168879508972168, - "learning_rate": 2.0610169491525424e-06, - "loss": 1.1858, - "step": 26960 - }, - { - "epoch": 0.62, - "grad_norm": 8.381854057312012, - "learning_rate": 2.0542372881355934e-06, - "loss": 1.2046, - "step": 26970 - }, - { - "epoch": 0.62, - "grad_norm": 3.6406259536743164, - "learning_rate": 2.0474576271186443e-06, - "loss": 1.2098, - "step": 26980 - }, - { - "epoch": 0.62, - "grad_norm": 7.411496162414551, - "learning_rate": 2.0406779661016953e-06, - "loss": 1.2822, - "step": 26990 - }, - { - "epoch": 0.63, - "grad_norm": 4.259427547454834, - "learning_rate": 2.033898305084746e-06, - "loss": 1.0367, - "step": 27000 - }, - { - "epoch": 0.63, - "eval_loss": 1.0261023044586182, - "eval_runtime": 66.6287, - "eval_samples_per_second": 15.009, - "eval_steps_per_second": 15.009, - "step": 27000 - }, - { - "epoch": 0.63, - "grad_norm": 1.8957105875015259, - "learning_rate": 2.027118644067797e-06, - "loss": 1.1367, - "step": 27010 - }, - { - "epoch": 0.63, - "grad_norm": 8.313997268676758, - "learning_rate": 2.020338983050848e-06, - "loss": 1.1379, - "step": 27020 - }, - { - "epoch": 0.63, - "grad_norm": 5.363570690155029, - "learning_rate": 2.0135593220338984e-06, - "loss": 1.1295, - "step": 27030 - }, - { - "epoch": 0.63, - "grad_norm": 15.122271537780762, - "learning_rate": 2.0067796610169494e-06, - "loss": 1.1451, - "step": 27040 - }, - { - "epoch": 0.63, - "grad_norm": 4.8829755783081055, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.411, - "step": 27050 - }, - { - "epoch": 0.63, - "grad_norm": 9.953533172607422, - "learning_rate": 1.993220338983051e-06, - "loss": 1.0443, - "step": 27060 - }, - { - "epoch": 0.63, - "grad_norm": 6.28991174697876, - "learning_rate": 1.986440677966102e-06, - "loss": 1.0607, - "step": 27070 - }, - { - "epoch": 0.63, - "grad_norm": 8.279200553894043, - "learning_rate": 1.9796610169491524e-06, - "loss": 1.3226, - "step": 27080 - }, - { - "epoch": 0.63, - "grad_norm": 6.716332912445068, - "learning_rate": 1.9728813559322034e-06, - "loss": 1.1337, - "step": 27090 - }, - { - "epoch": 0.63, - "grad_norm": 8.114790916442871, - "learning_rate": 1.9661016949152544e-06, - "loss": 1.1927, - "step": 27100 - }, - { - "epoch": 0.63, - "grad_norm": 8.28928279876709, - "learning_rate": 1.9593220338983053e-06, - "loss": 1.2793, - "step": 27110 - }, - { - "epoch": 0.63, - "grad_norm": 5.299505710601807, - "learning_rate": 1.9525423728813563e-06, - "loss": 1.1913, - "step": 27120 - }, - { - "epoch": 0.63, - "grad_norm": 5.9343061447143555, - "learning_rate": 1.945762711864407e-06, - "loss": 1.162, - "step": 27130 - }, - { - "epoch": 0.63, - "grad_norm": 5.81511926651001, - "learning_rate": 1.938983050847458e-06, - "loss": 1.3911, - "step": 27140 - }, - { - "epoch": 0.63, - "grad_norm": 2.0106678009033203, - "learning_rate": 1.932203389830509e-06, - "loss": 1.3768, - "step": 27150 - }, - { - "epoch": 0.63, - "grad_norm": 5.970021724700928, - "learning_rate": 1.9254237288135594e-06, - "loss": 1.2548, - "step": 27160 - }, - { - "epoch": 0.63, - "grad_norm": 12.892632484436035, - "learning_rate": 1.9186440677966104e-06, - "loss": 1.3527, - "step": 27170 - }, - { - "epoch": 0.63, - "grad_norm": 12.3954496383667, - "learning_rate": 1.9118644067796613e-06, - "loss": 1.2633, - "step": 27180 - }, - { - "epoch": 0.63, - "grad_norm": 9.317601203918457, - "learning_rate": 1.9050847457627119e-06, - "loss": 1.1907, - "step": 27190 - }, - { - "epoch": 0.63, - "grad_norm": 2.2695984840393066, - "learning_rate": 1.8983050847457629e-06, - "loss": 1.2513, - "step": 27200 - }, - { - "epoch": 0.63, - "grad_norm": 5.8285908699035645, - "learning_rate": 1.8915254237288136e-06, - "loss": 1.2707, - "step": 27210 - }, - { - "epoch": 0.63, - "grad_norm": 1.861101508140564, - "learning_rate": 1.8847457627118646e-06, - "loss": 0.9353, - "step": 27220 - }, - { - "epoch": 0.63, - "grad_norm": 5.153651714324951, - "learning_rate": 1.8779661016949156e-06, - "loss": 1.2232, - "step": 27230 - }, - { - "epoch": 0.63, - "grad_norm": 4.55015230178833, - "learning_rate": 1.8711864406779661e-06, - "loss": 1.2731, - "step": 27240 - }, - { - "epoch": 0.63, - "grad_norm": 2.4303572177886963, - "learning_rate": 1.8644067796610171e-06, - "loss": 1.3679, - "step": 27250 - }, - { - "epoch": 0.63, - "grad_norm": 7.1049017906188965, - "learning_rate": 1.857627118644068e-06, - "loss": 1.2542, - "step": 27260 - }, - { - "epoch": 0.63, - "grad_norm": 7.2281036376953125, - "learning_rate": 1.8508474576271189e-06, - "loss": 1.197, - "step": 27270 - }, - { - "epoch": 0.63, - "grad_norm": 7.088107109069824, - "learning_rate": 1.8440677966101696e-06, - "loss": 1.3486, - "step": 27280 - }, - { - "epoch": 0.63, - "grad_norm": 5.903868675231934, - "learning_rate": 1.8372881355932204e-06, - "loss": 1.2879, - "step": 27290 - }, - { - "epoch": 0.63, - "grad_norm": 1.4684103727340698, - "learning_rate": 1.8305084745762714e-06, - "loss": 1.2303, - "step": 27300 - }, - { - "epoch": 0.63, - "grad_norm": 10.257400512695312, - "learning_rate": 1.8237288135593223e-06, - "loss": 1.2068, - "step": 27310 - }, - { - "epoch": 0.63, - "grad_norm": 4.519632816314697, - "learning_rate": 1.816949152542373e-06, - "loss": 1.1828, - "step": 27320 - }, - { - "epoch": 0.63, - "grad_norm": 2.953141689300537, - "learning_rate": 1.8101694915254239e-06, - "loss": 1.2411, - "step": 27330 - }, - { - "epoch": 0.63, - "grad_norm": 3.631997585296631, - "learning_rate": 1.8033898305084746e-06, - "loss": 1.267, - "step": 27340 - }, - { - "epoch": 0.63, - "grad_norm": 4.540164947509766, - "learning_rate": 1.7966101694915256e-06, - "loss": 1.1731, - "step": 27350 - }, - { - "epoch": 0.63, - "grad_norm": 5.641757965087891, - "learning_rate": 1.7898305084745766e-06, - "loss": 1.1819, - "step": 27360 - }, - { - "epoch": 0.63, - "grad_norm": 10.886664390563965, - "learning_rate": 1.7830508474576271e-06, - "loss": 1.1085, - "step": 27370 - }, - { - "epoch": 0.63, - "grad_norm": 8.04320240020752, - "learning_rate": 1.7762711864406781e-06, - "loss": 1.2514, - "step": 27380 - }, - { - "epoch": 0.63, - "grad_norm": 8.10466480255127, - "learning_rate": 1.769491525423729e-06, - "loss": 1.3978, - "step": 27390 - }, - { - "epoch": 0.63, - "grad_norm": 8.447578430175781, - "learning_rate": 1.7627118644067799e-06, - "loss": 1.1422, - "step": 27400 - }, - { - "epoch": 0.63, - "grad_norm": 15.72360610961914, - "learning_rate": 1.7559322033898306e-06, - "loss": 1.2554, - "step": 27410 - }, - { - "epoch": 0.63, - "grad_norm": 4.728666305541992, - "learning_rate": 1.7491525423728814e-06, - "loss": 1.0925, - "step": 27420 - }, - { - "epoch": 0.64, - "grad_norm": 6.482224464416504, - "learning_rate": 1.7423728813559324e-06, - "loss": 1.2841, - "step": 27430 - }, - { - "epoch": 0.64, - "grad_norm": 17.51344871520996, - "learning_rate": 1.7355932203389834e-06, - "loss": 1.2075, - "step": 27440 - }, - { - "epoch": 0.64, - "grad_norm": 4.776230335235596, - "learning_rate": 1.728813559322034e-06, - "loss": 1.2672, - "step": 27450 - }, - { - "epoch": 0.64, - "grad_norm": 11.152587890625, - "learning_rate": 1.7220338983050849e-06, - "loss": 1.3695, - "step": 27460 - }, - { - "epoch": 0.64, - "grad_norm": 4.942652702331543, - "learning_rate": 1.7152542372881356e-06, - "loss": 1.4412, - "step": 27470 - }, - { - "epoch": 0.64, - "grad_norm": 7.248074054718018, - "learning_rate": 1.7084745762711866e-06, - "loss": 1.2882, - "step": 27480 - }, - { - "epoch": 0.64, - "grad_norm": 3.3746461868286133, - "learning_rate": 1.7016949152542376e-06, - "loss": 1.2192, - "step": 27490 - }, - { - "epoch": 0.64, - "grad_norm": 0.75871741771698, - "learning_rate": 1.6949152542372882e-06, - "loss": 1.1612, - "step": 27500 - }, - { - "epoch": 0.64, - "eval_loss": 0.9824415445327759, - "eval_runtime": 66.6076, - "eval_samples_per_second": 15.013, - "eval_steps_per_second": 15.013, - "step": 27500 - }, - { - "epoch": 0.64, - "grad_norm": 8.995759963989258, - "learning_rate": 1.6881355932203391e-06, - "loss": 1.1534, - "step": 27510 - }, - { - "epoch": 0.64, - "grad_norm": 4.333745002746582, - "learning_rate": 1.6813559322033901e-06, - "loss": 1.3447, - "step": 27520 - }, - { - "epoch": 0.64, - "grad_norm": 9.009721755981445, - "learning_rate": 1.6745762711864409e-06, - "loss": 1.102, - "step": 27530 - }, - { - "epoch": 0.64, - "grad_norm": 1.8660717010498047, - "learning_rate": 1.6677966101694916e-06, - "loss": 1.3257, - "step": 27540 - }, - { - "epoch": 0.64, - "grad_norm": 8.743804931640625, - "learning_rate": 1.6610169491525424e-06, - "loss": 1.3621, - "step": 27550 - }, - { - "epoch": 0.64, - "grad_norm": 4.872500419616699, - "learning_rate": 1.6542372881355934e-06, - "loss": 1.348, - "step": 27560 - }, - { - "epoch": 0.64, - "grad_norm": 6.630684852600098, - "learning_rate": 1.6474576271186444e-06, - "loss": 1.1863, - "step": 27570 - }, - { - "epoch": 0.64, - "grad_norm": 3.588062286376953, - "learning_rate": 1.640677966101695e-06, - "loss": 1.1248, - "step": 27580 - }, - { - "epoch": 0.64, - "grad_norm": 9.038145065307617, - "learning_rate": 1.6338983050847459e-06, - "loss": 1.2381, - "step": 27590 - }, - { - "epoch": 0.64, - "grad_norm": 2.703775644302368, - "learning_rate": 1.6271186440677967e-06, - "loss": 1.2388, - "step": 27600 - }, - { - "epoch": 0.64, - "grad_norm": 6.438141345977783, - "learning_rate": 1.6203389830508476e-06, - "loss": 1.4026, - "step": 27610 - }, - { - "epoch": 0.64, - "grad_norm": 23.630895614624023, - "learning_rate": 1.6135593220338986e-06, - "loss": 1.2498, - "step": 27620 - }, - { - "epoch": 0.64, - "grad_norm": 3.4313204288482666, - "learning_rate": 1.6067796610169492e-06, - "loss": 1.275, - "step": 27630 - }, - { - "epoch": 0.64, - "grad_norm": 9.49851131439209, - "learning_rate": 1.6000000000000001e-06, - "loss": 1.2096, - "step": 27640 - }, - { - "epoch": 0.64, - "grad_norm": 3.8425252437591553, - "learning_rate": 1.593220338983051e-06, - "loss": 1.225, - "step": 27650 - }, - { - "epoch": 0.64, - "grad_norm": 4.10614538192749, - "learning_rate": 1.5864406779661019e-06, - "loss": 1.182, - "step": 27660 - }, - { - "epoch": 0.64, - "grad_norm": 6.613473892211914, - "learning_rate": 1.5796610169491526e-06, - "loss": 1.316, - "step": 27670 - }, - { - "epoch": 0.64, - "grad_norm": 3.006357431411743, - "learning_rate": 1.5728813559322034e-06, - "loss": 0.991, - "step": 27680 - }, - { - "epoch": 0.64, - "grad_norm": 6.98958158493042, - "learning_rate": 1.5661016949152544e-06, - "loss": 1.2518, - "step": 27690 - }, - { - "epoch": 0.64, - "grad_norm": 18.53824806213379, - "learning_rate": 1.5593220338983054e-06, - "loss": 1.2037, - "step": 27700 - }, - { - "epoch": 0.64, - "grad_norm": 9.595442771911621, - "learning_rate": 1.552542372881356e-06, - "loss": 1.1781, - "step": 27710 - }, - { - "epoch": 0.64, - "grad_norm": 8.525481224060059, - "learning_rate": 1.545762711864407e-06, - "loss": 1.0589, - "step": 27720 - }, - { - "epoch": 0.64, - "grad_norm": 7.7480010986328125, - "learning_rate": 1.5389830508474577e-06, - "loss": 1.2321, - "step": 27730 - }, - { - "epoch": 0.64, - "grad_norm": 4.403713703155518, - "learning_rate": 1.5322033898305086e-06, - "loss": 1.2394, - "step": 27740 - }, - { - "epoch": 0.64, - "grad_norm": 8.966909408569336, - "learning_rate": 1.5254237288135596e-06, - "loss": 1.1507, - "step": 27750 - }, - { - "epoch": 0.64, - "grad_norm": 3.3880155086517334, - "learning_rate": 1.5186440677966102e-06, - "loss": 1.3728, - "step": 27760 - }, - { - "epoch": 0.64, - "grad_norm": 5.387197971343994, - "learning_rate": 1.5118644067796611e-06, - "loss": 1.3463, - "step": 27770 - }, - { - "epoch": 0.64, - "grad_norm": 9.160344123840332, - "learning_rate": 1.505084745762712e-06, - "loss": 1.2807, - "step": 27780 - }, - { - "epoch": 0.64, - "grad_norm": 9.067780494689941, - "learning_rate": 1.4983050847457629e-06, - "loss": 1.2772, - "step": 27790 - }, - { - "epoch": 0.64, - "grad_norm": 11.345379829406738, - "learning_rate": 1.4915254237288139e-06, - "loss": 1.2251, - "step": 27800 - }, - { - "epoch": 0.64, - "grad_norm": 1.177361249923706, - "learning_rate": 1.4847457627118644e-06, - "loss": 1.2393, - "step": 27810 - }, - { - "epoch": 0.64, - "grad_norm": 10.291409492492676, - "learning_rate": 1.4779661016949154e-06, - "loss": 1.1844, - "step": 27820 - }, - { - "epoch": 0.64, - "grad_norm": 3.2554609775543213, - "learning_rate": 1.4711864406779664e-06, - "loss": 1.4098, - "step": 27830 - }, - { - "epoch": 0.64, - "grad_norm": 10.152715682983398, - "learning_rate": 1.464406779661017e-06, - "loss": 1.1902, - "step": 27840 - }, - { - "epoch": 0.64, - "grad_norm": 7.060266494750977, - "learning_rate": 1.457627118644068e-06, - "loss": 1.255, - "step": 27850 - }, - { - "epoch": 0.65, - "grad_norm": 1.495127558708191, - "learning_rate": 1.4508474576271187e-06, - "loss": 1.2665, - "step": 27860 - }, - { - "epoch": 0.65, - "grad_norm": 4.548370838165283, - "learning_rate": 1.4440677966101696e-06, - "loss": 1.301, - "step": 27870 - }, - { - "epoch": 0.65, - "grad_norm": 7.4621734619140625, - "learning_rate": 1.4372881355932206e-06, - "loss": 1.2617, - "step": 27880 - }, - { - "epoch": 0.65, - "grad_norm": 7.851536273956299, - "learning_rate": 1.4305084745762712e-06, - "loss": 1.2416, - "step": 27890 - }, - { - "epoch": 0.65, - "grad_norm": 8.992350578308105, - "learning_rate": 1.4237288135593222e-06, - "loss": 1.1459, - "step": 27900 - }, - { - "epoch": 0.65, - "grad_norm": 4.336245059967041, - "learning_rate": 1.416949152542373e-06, - "loss": 1.4332, - "step": 27910 - }, - { - "epoch": 0.65, - "grad_norm": 11.527148246765137, - "learning_rate": 1.410169491525424e-06, - "loss": 1.1412, - "step": 27920 - }, - { - "epoch": 0.65, - "grad_norm": 3.6670548915863037, - "learning_rate": 1.4033898305084749e-06, - "loss": 1.3735, - "step": 27930 - }, - { - "epoch": 0.65, - "grad_norm": 10.903753280639648, - "learning_rate": 1.3966101694915254e-06, - "loss": 1.2655, - "step": 27940 - }, - { - "epoch": 0.65, - "grad_norm": 17.927316665649414, - "learning_rate": 1.3898305084745764e-06, - "loss": 1.2136, - "step": 27950 - }, - { - "epoch": 0.65, - "grad_norm": 8.938633918762207, - "learning_rate": 1.3830508474576274e-06, - "loss": 1.4995, - "step": 27960 - }, - { - "epoch": 0.65, - "grad_norm": 8.094979286193848, - "learning_rate": 1.376271186440678e-06, - "loss": 1.2659, - "step": 27970 - }, - { - "epoch": 0.65, - "grad_norm": 9.596714973449707, - "learning_rate": 1.369491525423729e-06, - "loss": 1.1559, - "step": 27980 - }, - { - "epoch": 0.65, - "grad_norm": 10.698532104492188, - "learning_rate": 1.3627118644067797e-06, - "loss": 1.3288, - "step": 27990 - }, - { - "epoch": 0.65, - "grad_norm": 2.6723885536193848, - "learning_rate": 1.3559322033898307e-06, - "loss": 1.2338, - "step": 28000 - }, - { - "epoch": 0.65, - "eval_loss": 1.0027884244918823, - "eval_runtime": 67.3345, - "eval_samples_per_second": 14.851, - "eval_steps_per_second": 14.851, - "step": 28000 - }, - { - "epoch": 0.65, - "grad_norm": 7.515227794647217, - "learning_rate": 1.3491525423728816e-06, - "loss": 1.365, - "step": 28010 - }, - { - "epoch": 0.65, - "grad_norm": 3.3241326808929443, - "learning_rate": 1.3423728813559322e-06, - "loss": 1.2401, - "step": 28020 - }, - { - "epoch": 0.65, - "grad_norm": 8.155590057373047, - "learning_rate": 1.3355932203389832e-06, - "loss": 1.3003, - "step": 28030 - }, - { - "epoch": 0.65, - "grad_norm": 2.6986613273620605, - "learning_rate": 1.328813559322034e-06, - "loss": 1.3645, - "step": 28040 - }, - { - "epoch": 0.65, - "grad_norm": 3.558117151260376, - "learning_rate": 1.322033898305085e-06, - "loss": 1.2466, - "step": 28050 - }, - { - "epoch": 0.65, - "grad_norm": 2.6389589309692383, - "learning_rate": 1.3152542372881359e-06, - "loss": 1.4448, - "step": 28060 - }, - { - "epoch": 0.65, - "grad_norm": 4.357456684112549, - "learning_rate": 1.3084745762711864e-06, - "loss": 1.0606, - "step": 28070 - }, - { - "epoch": 0.65, - "grad_norm": 3.8627960681915283, - "learning_rate": 1.3016949152542374e-06, - "loss": 1.2686, - "step": 28080 - }, - { - "epoch": 0.65, - "grad_norm": 6.192730903625488, - "learning_rate": 1.2949152542372884e-06, - "loss": 1.4787, - "step": 28090 - }, - { - "epoch": 0.65, - "grad_norm": 4.1315131187438965, - "learning_rate": 1.288135593220339e-06, - "loss": 1.2422, - "step": 28100 - }, - { - "epoch": 0.65, - "grad_norm": 13.30528450012207, - "learning_rate": 1.28135593220339e-06, - "loss": 1.2519, - "step": 28110 - }, - { - "epoch": 0.65, - "grad_norm": 6.925759315490723, - "learning_rate": 1.2745762711864407e-06, - "loss": 1.388, - "step": 28120 - }, - { - "epoch": 0.65, - "grad_norm": 5.491391181945801, - "learning_rate": 1.2677966101694917e-06, - "loss": 1.2848, - "step": 28130 - }, - { - "epoch": 0.65, - "grad_norm": 6.054429054260254, - "learning_rate": 1.2610169491525426e-06, - "loss": 1.1429, - "step": 28140 - }, - { - "epoch": 0.65, - "grad_norm": 13.854817390441895, - "learning_rate": 1.2542372881355932e-06, - "loss": 1.207, - "step": 28150 - }, - { - "epoch": 0.65, - "grad_norm": 10.493091583251953, - "learning_rate": 1.2474576271186442e-06, - "loss": 1.0886, - "step": 28160 - }, - { - "epoch": 0.65, - "grad_norm": 2.1718876361846924, - "learning_rate": 1.240677966101695e-06, - "loss": 1.2488, - "step": 28170 - }, - { - "epoch": 0.65, - "grad_norm": 2.642390727996826, - "learning_rate": 1.233898305084746e-06, - "loss": 1.1109, - "step": 28180 - }, - { - "epoch": 0.65, - "grad_norm": 7.8057332038879395, - "learning_rate": 1.2271186440677967e-06, - "loss": 1.121, - "step": 28190 - }, - { - "epoch": 0.65, - "grad_norm": 4.232807636260986, - "learning_rate": 1.2203389830508477e-06, - "loss": 1.193, - "step": 28200 - }, - { - "epoch": 0.65, - "grad_norm": 7.1039862632751465, - "learning_rate": 1.2135593220338984e-06, - "loss": 1.142, - "step": 28210 - }, - { - "epoch": 0.65, - "grad_norm": 4.6590752601623535, - "learning_rate": 1.2067796610169492e-06, - "loss": 1.2522, - "step": 28220 - }, - { - "epoch": 0.65, - "grad_norm": 6.117308139801025, - "learning_rate": 1.2000000000000002e-06, - "loss": 1.3453, - "step": 28230 - }, - { - "epoch": 0.65, - "grad_norm": 6.451774597167969, - "learning_rate": 1.193220338983051e-06, - "loss": 1.297, - "step": 28240 - }, - { - "epoch": 0.65, - "grad_norm": 4.05746603012085, - "learning_rate": 1.186440677966102e-06, - "loss": 1.2008, - "step": 28250 - }, - { - "epoch": 0.65, - "grad_norm": 12.00547981262207, - "learning_rate": 1.1796610169491527e-06, - "loss": 1.1584, - "step": 28260 - }, - { - "epoch": 0.65, - "grad_norm": 5.004361152648926, - "learning_rate": 1.1728813559322034e-06, - "loss": 1.1504, - "step": 28270 - }, - { - "epoch": 0.65, - "grad_norm": 5.276272296905518, - "learning_rate": 1.1661016949152542e-06, - "loss": 1.1542, - "step": 28280 - }, - { - "epoch": 0.65, - "grad_norm": 6.736107349395752, - "learning_rate": 1.1593220338983052e-06, - "loss": 1.2236, - "step": 28290 - }, - { - "epoch": 0.66, - "grad_norm": 3.8124780654907227, - "learning_rate": 1.152542372881356e-06, - "loss": 1.2236, - "step": 28300 - }, - { - "epoch": 0.66, - "grad_norm": 7.842329025268555, - "learning_rate": 1.145762711864407e-06, - "loss": 1.3622, - "step": 28310 - }, - { - "epoch": 0.66, - "grad_norm": 6.599586486816406, - "learning_rate": 1.1389830508474577e-06, - "loss": 1.1218, - "step": 28320 - }, - { - "epoch": 0.66, - "grad_norm": 4.077427387237549, - "learning_rate": 1.1322033898305087e-06, - "loss": 1.2038, - "step": 28330 - }, - { - "epoch": 0.66, - "grad_norm": 8.3840970993042, - "learning_rate": 1.1254237288135594e-06, - "loss": 1.3661, - "step": 28340 - }, - { - "epoch": 0.66, - "grad_norm": 8.212132453918457, - "learning_rate": 1.1186440677966102e-06, - "loss": 1.1966, - "step": 28350 - }, - { - "epoch": 0.66, - "grad_norm": 7.084817886352539, - "learning_rate": 1.1118644067796612e-06, - "loss": 1.2721, - "step": 28360 - }, - { - "epoch": 0.66, - "grad_norm": 9.409709930419922, - "learning_rate": 1.105084745762712e-06, - "loss": 1.2941, - "step": 28370 - }, - { - "epoch": 0.66, - "grad_norm": 4.141493320465088, - "learning_rate": 1.098305084745763e-06, - "loss": 1.4355, - "step": 28380 - }, - { - "epoch": 0.66, - "grad_norm": 8.694832801818848, - "learning_rate": 1.0915254237288137e-06, - "loss": 1.2153, - "step": 28390 - }, - { - "epoch": 0.66, - "grad_norm": 12.472442626953125, - "learning_rate": 1.0847457627118644e-06, - "loss": 1.1019, - "step": 28400 - }, - { - "epoch": 0.66, - "grad_norm": 6.5215163230896, - "learning_rate": 1.0779661016949152e-06, - "loss": 1.0885, - "step": 28410 - }, - { - "epoch": 0.66, - "grad_norm": 5.509222030639648, - "learning_rate": 1.0711864406779662e-06, - "loss": 1.2676, - "step": 28420 - }, - { - "epoch": 0.66, - "grad_norm": 9.21153736114502, - "learning_rate": 1.064406779661017e-06, - "loss": 1.3133, - "step": 28430 - }, - { - "epoch": 0.66, - "grad_norm": 5.743494987487793, - "learning_rate": 1.057627118644068e-06, - "loss": 1.292, - "step": 28440 - }, - { - "epoch": 0.66, - "grad_norm": 8.526168823242188, - "learning_rate": 1.0508474576271187e-06, - "loss": 1.0585, - "step": 28450 - }, - { - "epoch": 0.66, - "grad_norm": 3.5986156463623047, - "learning_rate": 1.0440677966101697e-06, - "loss": 1.1234, - "step": 28460 - }, - { - "epoch": 0.66, - "grad_norm": 13.26816463470459, - "learning_rate": 1.0372881355932204e-06, - "loss": 1.1792, - "step": 28470 - }, - { - "epoch": 0.66, - "grad_norm": 3.407280445098877, - "learning_rate": 1.0305084745762712e-06, - "loss": 1.1652, - "step": 28480 - }, - { - "epoch": 0.66, - "grad_norm": 8.368290901184082, - "learning_rate": 1.0237288135593222e-06, - "loss": 1.3459, - "step": 28490 - }, - { - "epoch": 0.66, - "grad_norm": 8.060779571533203, - "learning_rate": 1.016949152542373e-06, - "loss": 1.3085, - "step": 28500 - }, - { - "epoch": 0.66, - "eval_loss": 1.0383493900299072, - "eval_runtime": 68.471, - "eval_samples_per_second": 14.605, - "eval_steps_per_second": 14.605, - "step": 28500 - }, - { - "epoch": 0.66, - "grad_norm": 2.879181385040283, - "learning_rate": 1.010169491525424e-06, - "loss": 1.2482, - "step": 28510 - }, - { - "epoch": 0.66, - "grad_norm": 10.214177131652832, - "learning_rate": 1.0033898305084747e-06, - "loss": 1.2781, - "step": 28520 - }, - { - "epoch": 0.66, - "grad_norm": 17.13050079345703, - "learning_rate": 9.966101694915254e-07, - "loss": 1.0837, - "step": 28530 - }, - { - "epoch": 0.66, - "grad_norm": 15.841142654418945, - "learning_rate": 9.898305084745762e-07, - "loss": 1.2962, - "step": 28540 - }, - { - "epoch": 0.66, - "grad_norm": 7.079030990600586, - "learning_rate": 9.830508474576272e-07, - "loss": 0.9777, - "step": 28550 - }, - { - "epoch": 0.66, - "grad_norm": 3.9758317470550537, - "learning_rate": 9.762711864406782e-07, - "loss": 1.1584, - "step": 28560 - }, - { - "epoch": 0.66, - "grad_norm": 4.3860392570495605, - "learning_rate": 9.69491525423729e-07, - "loss": 1.1247, - "step": 28570 - }, - { - "epoch": 0.66, - "grad_norm": 3.759638786315918, - "learning_rate": 9.627118644067797e-07, - "loss": 1.2665, - "step": 28580 - }, - { - "epoch": 0.66, - "grad_norm": 16.677642822265625, - "learning_rate": 9.559322033898307e-07, - "loss": 1.4559, - "step": 28590 - }, - { - "epoch": 0.66, - "grad_norm": 7.994900703430176, - "learning_rate": 9.491525423728814e-07, - "loss": 1.0231, - "step": 28600 - }, - { - "epoch": 0.66, - "grad_norm": 5.5230841636657715, - "learning_rate": 9.423728813559323e-07, - "loss": 1.1959, - "step": 28610 - }, - { - "epoch": 0.66, - "grad_norm": 1.7889764308929443, - "learning_rate": 9.355932203389831e-07, - "loss": 1.3042, - "step": 28620 - }, - { - "epoch": 0.66, - "grad_norm": 1.4875215291976929, - "learning_rate": 9.28813559322034e-07, - "loss": 1.1124, - "step": 28630 - }, - { - "epoch": 0.66, - "grad_norm": 4.865635395050049, - "learning_rate": 9.220338983050848e-07, - "loss": 1.2639, - "step": 28640 - }, - { - "epoch": 0.66, - "grad_norm": 3.1794333457946777, - "learning_rate": 9.152542372881357e-07, - "loss": 1.2244, - "step": 28650 - }, - { - "epoch": 0.66, - "grad_norm": 9.40485954284668, - "learning_rate": 9.084745762711864e-07, - "loss": 1.244, - "step": 28660 - }, - { - "epoch": 0.66, - "grad_norm": 8.51891803741455, - "learning_rate": 9.016949152542373e-07, - "loss": 1.2456, - "step": 28670 - }, - { - "epoch": 0.66, - "grad_norm": 1.864579200744629, - "learning_rate": 8.949152542372883e-07, - "loss": 1.1113, - "step": 28680 - }, - { - "epoch": 0.66, - "grad_norm": 12.539873123168945, - "learning_rate": 8.881355932203391e-07, - "loss": 1.1996, - "step": 28690 - }, - { - "epoch": 0.66, - "grad_norm": 6.042193412780762, - "learning_rate": 8.813559322033899e-07, - "loss": 1.2614, - "step": 28700 - }, - { - "epoch": 0.66, - "grad_norm": 5.701644420623779, - "learning_rate": 8.745762711864407e-07, - "loss": 1.1356, - "step": 28710 - }, - { - "epoch": 0.66, - "grad_norm": 1.5281639099121094, - "learning_rate": 8.677966101694917e-07, - "loss": 1.244, - "step": 28720 - }, - { - "epoch": 0.67, - "grad_norm": 11.545940399169922, - "learning_rate": 8.610169491525424e-07, - "loss": 1.2447, - "step": 28730 - }, - { - "epoch": 0.67, - "grad_norm": 3.2043371200561523, - "learning_rate": 8.542372881355933e-07, - "loss": 1.2653, - "step": 28740 - }, - { - "epoch": 0.67, - "grad_norm": 9.815803527832031, - "learning_rate": 8.474576271186441e-07, - "loss": 1.2609, - "step": 28750 - }, - { - "epoch": 0.67, - "grad_norm": 2.5789248943328857, - "learning_rate": 8.406779661016951e-07, - "loss": 1.3839, - "step": 28760 - }, - { - "epoch": 0.67, - "grad_norm": 6.396823883056641, - "learning_rate": 8.338983050847458e-07, - "loss": 1.2267, - "step": 28770 - }, - { - "epoch": 0.67, - "grad_norm": 4.092123985290527, - "learning_rate": 8.271186440677967e-07, - "loss": 1.16, - "step": 28780 - }, - { - "epoch": 0.67, - "grad_norm": 1.5519795417785645, - "learning_rate": 8.203389830508475e-07, - "loss": 1.1852, - "step": 28790 - }, - { - "epoch": 0.67, - "grad_norm": 10.254398345947266, - "learning_rate": 8.135593220338983e-07, - "loss": 1.3109, - "step": 28800 - }, - { - "epoch": 0.67, - "grad_norm": 5.28373384475708, - "learning_rate": 8.067796610169493e-07, - "loss": 1.2634, - "step": 28810 - }, - { - "epoch": 0.67, - "grad_norm": 5.046334266662598, - "learning_rate": 8.000000000000001e-07, - "loss": 1.3029, - "step": 28820 - }, - { - "epoch": 0.67, - "grad_norm": 7.71566104888916, - "learning_rate": 7.932203389830509e-07, - "loss": 1.1061, - "step": 28830 - }, - { - "epoch": 0.67, - "grad_norm": 2.412494659423828, - "learning_rate": 7.864406779661017e-07, - "loss": 1.4086, - "step": 28840 - }, - { - "epoch": 0.67, - "grad_norm": 6.458393096923828, - "learning_rate": 7.796610169491527e-07, - "loss": 1.238, - "step": 28850 - }, - { - "epoch": 0.67, - "grad_norm": 4.167256832122803, - "learning_rate": 7.728813559322034e-07, - "loss": 1.24, - "step": 28860 - }, - { - "epoch": 0.67, - "grad_norm": 7.2174177169799805, - "learning_rate": 7.661016949152543e-07, - "loss": 0.9788, - "step": 28870 - }, - { - "epoch": 0.67, - "grad_norm": 8.519877433776855, - "learning_rate": 7.593220338983051e-07, - "loss": 1.3748, - "step": 28880 - }, - { - "epoch": 0.67, - "grad_norm": 5.170273303985596, - "learning_rate": 7.52542372881356e-07, - "loss": 1.2798, - "step": 28890 - }, - { - "epoch": 0.67, - "grad_norm": 9.433271408081055, - "learning_rate": 7.457627118644069e-07, - "loss": 1.2496, - "step": 28900 - }, - { - "epoch": 0.67, - "grad_norm": 3.0811305046081543, - "learning_rate": 7.389830508474577e-07, - "loss": 1.3467, - "step": 28910 - }, - { - "epoch": 0.67, - "grad_norm": 3.777050495147705, - "learning_rate": 7.322033898305085e-07, - "loss": 1.3679, - "step": 28920 - }, - { - "epoch": 0.67, - "grad_norm": 2.2685747146606445, - "learning_rate": 7.254237288135593e-07, - "loss": 1.3076, - "step": 28930 - }, - { - "epoch": 0.67, - "grad_norm": 6.493512153625488, - "learning_rate": 7.186440677966103e-07, - "loss": 1.1433, - "step": 28940 - }, - { - "epoch": 0.67, - "grad_norm": 3.230098009109497, - "learning_rate": 7.118644067796611e-07, - "loss": 1.3191, - "step": 28950 - }, - { - "epoch": 0.67, - "grad_norm": 1.8938180208206177, - "learning_rate": 7.05084745762712e-07, - "loss": 1.138, - "step": 28960 - }, - { - "epoch": 0.67, - "grad_norm": 7.385470867156982, - "learning_rate": 6.983050847457627e-07, - "loss": 1.1831, - "step": 28970 - }, - { - "epoch": 0.67, - "grad_norm": 4.667689800262451, - "learning_rate": 6.915254237288137e-07, - "loss": 1.3282, - "step": 28980 - }, - { - "epoch": 0.67, - "grad_norm": 7.002562999725342, - "learning_rate": 6.847457627118645e-07, - "loss": 1.3384, - "step": 28990 - }, - { - "epoch": 0.67, - "grad_norm": 7.729028701782227, - "learning_rate": 6.779661016949153e-07, - "loss": 1.2527, - "step": 29000 - }, - { - "epoch": 0.67, - "eval_loss": 1.036794900894165, - "eval_runtime": 67.0053, - "eval_samples_per_second": 14.924, - "eval_steps_per_second": 14.924, - "step": 29000 - }, - { - "epoch": 0.67, - "grad_norm": 5.627978801727295, - "learning_rate": 6.711864406779661e-07, - "loss": 1.2501, - "step": 29010 - }, - { - "epoch": 0.67, - "grad_norm": 18.444440841674805, - "learning_rate": 6.64406779661017e-07, - "loss": 1.0848, - "step": 29020 - }, - { - "epoch": 0.67, - "grad_norm": 5.111781120300293, - "learning_rate": 6.576271186440679e-07, - "loss": 1.1205, - "step": 29030 - }, - { - "epoch": 0.67, - "grad_norm": 10.785350799560547, - "learning_rate": 6.508474576271187e-07, - "loss": 1.0386, - "step": 29040 - }, - { - "epoch": 0.67, - "grad_norm": 5.114529609680176, - "learning_rate": 6.440677966101695e-07, - "loss": 1.2195, - "step": 29050 - }, - { - "epoch": 0.67, - "grad_norm": 9.179306983947754, - "learning_rate": 6.372881355932203e-07, - "loss": 1.2742, - "step": 29060 - }, - { - "epoch": 0.67, - "grad_norm": 3.526697874069214, - "learning_rate": 6.305084745762713e-07, - "loss": 1.2442, - "step": 29070 - }, - { - "epoch": 0.67, - "grad_norm": 2.499582052230835, - "learning_rate": 6.237288135593221e-07, - "loss": 1.1093, - "step": 29080 - }, - { - "epoch": 0.67, - "grad_norm": 7.2823638916015625, - "learning_rate": 6.16949152542373e-07, - "loss": 1.3122, - "step": 29090 - }, - { - "epoch": 0.67, - "grad_norm": 4.526163578033447, - "learning_rate": 6.101694915254238e-07, - "loss": 1.3623, - "step": 29100 - }, - { - "epoch": 0.67, - "grad_norm": 3.685267686843872, - "learning_rate": 6.033898305084746e-07, - "loss": 1.0836, - "step": 29110 - }, - { - "epoch": 0.67, - "grad_norm": 5.807446479797363, - "learning_rate": 5.966101694915255e-07, - "loss": 1.4172, - "step": 29120 - }, - { - "epoch": 0.67, - "grad_norm": 7.100170135498047, - "learning_rate": 5.898305084745763e-07, - "loss": 1.424, - "step": 29130 - }, - { - "epoch": 0.67, - "grad_norm": 2.293673276901245, - "learning_rate": 5.830508474576271e-07, - "loss": 1.2277, - "step": 29140 - }, - { - "epoch": 0.67, - "grad_norm": 5.1463165283203125, - "learning_rate": 5.76271186440678e-07, - "loss": 1.0603, - "step": 29150 - }, - { - "epoch": 0.68, - "grad_norm": 2.3667409420013428, - "learning_rate": 5.694915254237288e-07, - "loss": 1.235, - "step": 29160 - }, - { - "epoch": 0.68, - "grad_norm": 12.005767822265625, - "learning_rate": 5.627118644067797e-07, - "loss": 1.2028, - "step": 29170 - }, - { - "epoch": 0.68, - "grad_norm": 6.225417613983154, - "learning_rate": 5.559322033898306e-07, - "loss": 1.0662, - "step": 29180 - }, - { - "epoch": 0.68, - "grad_norm": 3.9254140853881836, - "learning_rate": 5.491525423728815e-07, - "loss": 1.209, - "step": 29190 - }, - { - "epoch": 0.68, - "grad_norm": 2.4858005046844482, - "learning_rate": 5.423728813559322e-07, - "loss": 1.1661, - "step": 29200 - }, - { - "epoch": 0.68, - "grad_norm": 5.9373884201049805, - "learning_rate": 5.355932203389831e-07, - "loss": 1.1282, - "step": 29210 - }, - { - "epoch": 0.68, - "grad_norm": 6.60464334487915, - "learning_rate": 5.28813559322034e-07, - "loss": 1.3383, - "step": 29220 - }, - { - "epoch": 0.68, - "grad_norm": 4.6242828369140625, - "learning_rate": 5.220338983050848e-07, - "loss": 1.3385, - "step": 29230 - }, - { - "epoch": 0.68, - "grad_norm": 7.414794445037842, - "learning_rate": 5.152542372881356e-07, - "loss": 1.2059, - "step": 29240 - }, - { - "epoch": 0.68, - "grad_norm": 3.5053508281707764, - "learning_rate": 5.084745762711865e-07, - "loss": 1.2981, - "step": 29250 - }, - { - "epoch": 0.68, - "grad_norm": 5.528652191162109, - "learning_rate": 5.016949152542373e-07, - "loss": 1.2069, - "step": 29260 - }, - { - "epoch": 0.68, - "grad_norm": 7.826475143432617, - "learning_rate": 4.949152542372881e-07, - "loss": 1.286, - "step": 29270 - }, - { - "epoch": 0.68, - "grad_norm": 2.1104512214660645, - "learning_rate": 4.881355932203391e-07, - "loss": 1.0453, - "step": 29280 - }, - { - "epoch": 0.68, - "grad_norm": 3.8154537677764893, - "learning_rate": 4.813559322033898e-07, - "loss": 1.265, - "step": 29290 - }, - { - "epoch": 0.68, - "grad_norm": 8.516002655029297, - "learning_rate": 4.745762711864407e-07, - "loss": 1.2142, - "step": 29300 - }, - { - "epoch": 0.68, - "grad_norm": 3.6356046199798584, - "learning_rate": 4.6779661016949154e-07, - "loss": 1.3932, - "step": 29310 - }, - { - "epoch": 0.68, - "grad_norm": 5.065585613250732, - "learning_rate": 4.610169491525424e-07, - "loss": 1.1825, - "step": 29320 - }, - { - "epoch": 0.68, - "grad_norm": 3.2396178245544434, - "learning_rate": 4.542372881355932e-07, - "loss": 1.269, - "step": 29330 - }, - { - "epoch": 0.68, - "grad_norm": 3.066288471221924, - "learning_rate": 4.4745762711864415e-07, - "loss": 1.1986, - "step": 29340 - }, - { - "epoch": 0.68, - "grad_norm": 8.957977294921875, - "learning_rate": 4.4067796610169497e-07, - "loss": 1.2721, - "step": 29350 - }, - { - "epoch": 0.68, - "grad_norm": 6.242004871368408, - "learning_rate": 4.3389830508474584e-07, - "loss": 1.1724, - "step": 29360 - }, - { - "epoch": 0.68, - "grad_norm": 10.986865997314453, - "learning_rate": 4.2711864406779666e-07, - "loss": 1.3121, - "step": 29370 - }, - { - "epoch": 0.68, - "grad_norm": 7.176580905914307, - "learning_rate": 4.2033898305084753e-07, - "loss": 1.0941, - "step": 29380 - }, - { - "epoch": 0.68, - "grad_norm": 8.999109268188477, - "learning_rate": 4.1355932203389835e-07, - "loss": 1.2917, - "step": 29390 - }, - { - "epoch": 0.68, - "grad_norm": 3.9739575386047363, - "learning_rate": 4.0677966101694916e-07, - "loss": 1.1372, - "step": 29400 - }, - { - "epoch": 0.68, - "grad_norm": 7.752053737640381, - "learning_rate": 4.0000000000000003e-07, - "loss": 1.229, - "step": 29410 - }, - { - "epoch": 0.68, - "grad_norm": 15.617894172668457, - "learning_rate": 3.9322033898305085e-07, - "loss": 1.2178, - "step": 29420 - }, - { - "epoch": 0.68, - "grad_norm": 5.103687763214111, - "learning_rate": 3.864406779661017e-07, - "loss": 1.1923, - "step": 29430 - }, - { - "epoch": 0.68, - "grad_norm": 13.210460662841797, - "learning_rate": 3.7966101694915254e-07, - "loss": 1.3562, - "step": 29440 - }, - { - "epoch": 0.68, - "grad_norm": 14.239701271057129, - "learning_rate": 3.7288135593220347e-07, - "loss": 1.2422, - "step": 29450 - }, - { - "epoch": 0.68, - "grad_norm": 15.124138832092285, - "learning_rate": 3.6610169491525423e-07, - "loss": 1.3159, - "step": 29460 - }, - { - "epoch": 0.68, - "grad_norm": 10.07345199584961, - "learning_rate": 3.5932203389830516e-07, - "loss": 1.0358, - "step": 29470 - }, - { - "epoch": 0.68, - "grad_norm": 8.566622734069824, - "learning_rate": 3.52542372881356e-07, - "loss": 1.0562, - "step": 29480 - }, - { - "epoch": 0.68, - "grad_norm": 13.473173141479492, - "learning_rate": 3.4576271186440684e-07, - "loss": 1.1538, - "step": 29490 - }, - { - "epoch": 0.68, - "grad_norm": 3.2546353340148926, - "learning_rate": 3.3898305084745766e-07, - "loss": 1.3963, - "step": 29500 - }, - { - "epoch": 0.68, - "eval_loss": 0.9776638150215149, - "eval_runtime": 67.0335, - "eval_samples_per_second": 14.918, - "eval_steps_per_second": 14.918, - "step": 29500 - }, - { - "epoch": 0.68, - "grad_norm": 8.819803237915039, - "learning_rate": 3.322033898305085e-07, - "loss": 1.3515, - "step": 29510 - }, - { - "epoch": 0.68, - "grad_norm": 26.38185691833496, - "learning_rate": 3.2542372881355935e-07, - "loss": 1.1068, - "step": 29520 - }, - { - "epoch": 0.68, - "grad_norm": 5.794632434844971, - "learning_rate": 3.1864406779661017e-07, - "loss": 1.2475, - "step": 29530 - }, - { - "epoch": 0.68, - "grad_norm": 9.786145210266113, - "learning_rate": 3.1186440677966104e-07, - "loss": 1.2613, - "step": 29540 - }, - { - "epoch": 0.68, - "grad_norm": 8.388969421386719, - "learning_rate": 3.050847457627119e-07, - "loss": 1.4074, - "step": 29550 - }, - { - "epoch": 0.68, - "grad_norm": 3.0043601989746094, - "learning_rate": 2.9830508474576273e-07, - "loss": 1.4106, - "step": 29560 - }, - { - "epoch": 0.68, - "grad_norm": 12.958208084106445, - "learning_rate": 2.9152542372881355e-07, - "loss": 1.2264, - "step": 29570 - }, - { - "epoch": 0.68, - "grad_norm": 6.8235063552856445, - "learning_rate": 2.847457627118644e-07, - "loss": 1.1727, - "step": 29580 - }, - { - "epoch": 0.69, - "grad_norm": 4.284834384918213, - "learning_rate": 2.779661016949153e-07, - "loss": 1.2897, - "step": 29590 - }, - { - "epoch": 0.69, - "grad_norm": 3.338212251663208, - "learning_rate": 2.711864406779661e-07, - "loss": 1.2449, - "step": 29600 - }, - { - "epoch": 0.69, - "grad_norm": 23.881393432617188, - "learning_rate": 2.64406779661017e-07, - "loss": 1.2986, - "step": 29610 - }, - { - "epoch": 0.69, - "grad_norm": 9.715893745422363, - "learning_rate": 2.576271186440678e-07, - "loss": 1.2566, - "step": 29620 - }, - { - "epoch": 0.69, - "grad_norm": 2.027327060699463, - "learning_rate": 2.5084745762711867e-07, - "loss": 1.2739, - "step": 29630 - }, - { - "epoch": 0.69, - "grad_norm": 6.084362030029297, - "learning_rate": 2.4406779661016954e-07, - "loss": 1.1787, - "step": 29640 - }, - { - "epoch": 0.69, - "grad_norm": 9.463132858276367, - "learning_rate": 2.3728813559322036e-07, - "loss": 1.1894, - "step": 29650 - }, - { - "epoch": 0.69, - "grad_norm": 12.049840927124023, - "learning_rate": 2.305084745762712e-07, - "loss": 1.2903, - "step": 29660 - }, - { - "epoch": 0.69, - "grad_norm": 4.152366638183594, - "learning_rate": 2.2372881355932207e-07, - "loss": 1.096, - "step": 29670 - }, - { - "epoch": 0.69, - "grad_norm": 9.684791564941406, - "learning_rate": 2.1694915254237292e-07, - "loss": 1.2017, - "step": 29680 - }, - { - "epoch": 0.69, - "grad_norm": 7.867280960083008, - "learning_rate": 2.1016949152542376e-07, - "loss": 1.1625, - "step": 29690 - }, - { - "epoch": 0.69, - "grad_norm": 1.4374995231628418, - "learning_rate": 2.0338983050847458e-07, - "loss": 1.4949, - "step": 29700 - }, - { - "epoch": 0.69, - "grad_norm": 4.6095428466796875, - "learning_rate": 1.9661016949152543e-07, - "loss": 1.3103, - "step": 29710 - }, - { - "epoch": 0.69, - "grad_norm": 5.289913177490234, - "learning_rate": 1.8983050847457627e-07, - "loss": 1.3265, - "step": 29720 - }, - { - "epoch": 0.69, - "grad_norm": 9.2072172164917, - "learning_rate": 1.8305084745762712e-07, - "loss": 1.1148, - "step": 29730 - }, - { - "epoch": 0.69, - "grad_norm": 4.103106498718262, - "learning_rate": 1.76271186440678e-07, - "loss": 1.3842, - "step": 29740 - }, - { - "epoch": 0.69, - "grad_norm": 15.820337295532227, - "learning_rate": 1.6949152542372883e-07, - "loss": 1.2755, - "step": 29750 - }, - { - "epoch": 0.69, - "grad_norm": 4.733695030212402, - "learning_rate": 1.6271186440677968e-07, - "loss": 1.462, - "step": 29760 - }, - { - "epoch": 0.69, - "grad_norm": 8.206982612609863, - "learning_rate": 1.5593220338983052e-07, - "loss": 1.235, - "step": 29770 - }, - { - "epoch": 0.69, - "grad_norm": 7.3087663650512695, - "learning_rate": 1.4915254237288137e-07, - "loss": 1.1845, - "step": 29780 - }, - { - "epoch": 0.69, - "grad_norm": 15.122031211853027, - "learning_rate": 1.423728813559322e-07, - "loss": 1.1609, - "step": 29790 - }, - { - "epoch": 0.69, - "grad_norm": 5.786062717437744, - "learning_rate": 1.3559322033898305e-07, - "loss": 1.0769, - "step": 29800 - }, - { - "epoch": 0.69, - "grad_norm": 1.0411622524261475, - "learning_rate": 1.288135593220339e-07, - "loss": 1.2727, - "step": 29810 - }, - { - "epoch": 0.69, - "grad_norm": 3.1188580989837646, - "learning_rate": 1.2203389830508477e-07, - "loss": 1.2504, - "step": 29820 - }, - { - "epoch": 0.69, - "grad_norm": 1.6318976879119873, - "learning_rate": 1.152542372881356e-07, - "loss": 1.2839, - "step": 29830 - }, - { - "epoch": 0.69, - "grad_norm": 7.041867733001709, - "learning_rate": 1.0847457627118646e-07, - "loss": 1.0513, - "step": 29840 - }, - { - "epoch": 0.69, - "grad_norm": 12.029023170471191, - "learning_rate": 1.0169491525423729e-07, - "loss": 1.132, - "step": 29850 - }, - { - "epoch": 0.69, - "grad_norm": 5.014979839324951, - "learning_rate": 9.491525423728814e-08, - "loss": 1.2018, - "step": 29860 - }, - { - "epoch": 0.69, - "grad_norm": 2.5008723735809326, - "learning_rate": 8.8135593220339e-08, - "loss": 1.2404, - "step": 29870 - }, - { - "epoch": 0.69, - "grad_norm": 10.014387130737305, - "learning_rate": 8.135593220338984e-08, - "loss": 1.2046, - "step": 29880 - }, - { - "epoch": 0.69, - "grad_norm": 12.765832901000977, - "learning_rate": 7.457627118644068e-08, - "loss": 1.0579, - "step": 29890 - }, - { - "epoch": 0.69, - "grad_norm": 4.352108001708984, - "learning_rate": 6.779661016949153e-08, - "loss": 1.3139, - "step": 29900 - }, - { - "epoch": 0.69, - "grad_norm": 10.630574226379395, - "learning_rate": 6.101694915254239e-08, - "loss": 1.3174, - "step": 29910 - }, - { - "epoch": 0.69, - "grad_norm": 11.868256568908691, - "learning_rate": 5.423728813559323e-08, - "loss": 0.9875, - "step": 29920 - }, - { - "epoch": 0.69, - "grad_norm": 3.4866442680358887, - "learning_rate": 4.745762711864407e-08, - "loss": 1.3073, - "step": 29930 - }, - { - "epoch": 0.69, - "grad_norm": 9.97262191772461, - "learning_rate": 4.067796610169492e-08, - "loss": 1.276, - "step": 29940 - }, - { - "epoch": 0.69, - "grad_norm": 3.1585495471954346, - "learning_rate": 3.3898305084745764e-08, - "loss": 1.3528, - "step": 29950 - }, - { - "epoch": 0.69, - "grad_norm": 14.819723129272461, - "learning_rate": 2.7118644067796615e-08, - "loss": 1.1333, - "step": 29960 - }, - { - "epoch": 0.69, - "grad_norm": 4.731696128845215, - "learning_rate": 2.033898305084746e-08, - "loss": 1.5296, - "step": 29970 - }, - { - "epoch": 0.69, - "grad_norm": 9.690890312194824, - "learning_rate": 1.3559322033898307e-08, - "loss": 1.2169, - "step": 29980 - }, - { - "epoch": 0.69, - "grad_norm": 3.7671804428100586, - "learning_rate": 6.779661016949154e-09, - "loss": 1.304, - "step": 29990 - }, - { - "epoch": 0.69, - "grad_norm": 13.029277801513672, - "learning_rate": 0.0, - "loss": 1.0771, - "step": 30000 - }, - { - "epoch": 0.69, - "eval_loss": 1.0159568786621094, - "eval_runtime": 67.116, - "eval_samples_per_second": 14.9, - "eval_steps_per_second": 14.9, - "step": 30000 } ], "logging_steps": 10, - "max_steps": 30000, + "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, - "total_flos": 4.8306377981952e+17, + "total_flos": 4.025531498496e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null